File size: 4,877 Bytes
ec6ad2f
 
 
 
 
 
 
943db10
ec6ad2f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
943db10
 
ec6ad2f
 
943db10
 
ec6ad2f
943db10
ec6ad2f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145

import json
import os
from typing import List, Optional
from dataclasses import dataclass
import numpy as np

from .config import Config, get_text_cood_file_path

@dataclass
class TextDetection:
    """A single piece of text located in an image.

    Attributes:
        bbox: Axis-aligned box stored as ``[x_min, y_min, x_max, y_max]``.
        text: The recognised string for this region.
        confidence: Recognition score reported for the region.
        id: Optional identifier; ``None`` until one is assigned.
    """
    bbox: List[int]
    text: str
    confidence: float
    id: Optional[int] = None

class TextDetector:
    """Detect text in a comic image with EasyOCR and group it into bubbles.

    The OCR reader is created lazily by :meth:`load` and released by
    :meth:`cleanup`. Instances can be used as context managers; leaving the
    ``with`` block calls :meth:`cleanup`.
    """

    def __init__(self, config: Config):
        """Store *config*; the OCR reader is not created until :meth:`load`."""
        self.config = config
        self.reader = None

    def load(self):
        """Create the EasyOCR reader on first use; no-op if already loaded."""
        if self.reader is None:
            # Imported here rather than at module level, so easyocr is only
            # required once OCR is actually requested.
            import easyocr
            self.reader = easyocr.Reader(['en'])

    def detect_text(self) -> List[TextDetection]:
        """Run OCR on ``config.input_path``.

        Returns:
            One :class:`TextDetection` per raw OCR result, with the text
            stripped of surrounding whitespace and ``id`` left unset.
        """
        self.load()
        results = self.reader.readtext(self.config.input_path)
        print(f"EasyOCR found {len(results)} raw detections")

        return [
            TextDetection(
                bbox=self._normalize_bbox(box),
                text=text.strip(),
                confidence=float(confidence),
            )
            for box, text, confidence in results
        ]

    def _normalize_bbox(self, box: List[List[int]]) -> List[int]:
        """Collapse a list of ``[x, y]`` points into ``[x_min, y_min, x_max, y_max]``."""
        xs = [point[0] for point in box]
        ys = [point[1] for point in box]
        return [min(xs), min(ys), max(xs), max(ys)]

    @staticmethod
    def calculate_distance(bbox1: List[int], bbox2: List[int]) -> float:
        """Return the Euclidean distance between the centers of two boxes.

        Both boxes are ``[x_min, y_min, x_max, y_max]``.
        """
        center1 = [(bbox1[0] + bbox1[2]) / 2, (bbox1[1] + bbox1[3]) / 2]
        center2 = [(bbox2[0] + bbox2[2]) / 2, (bbox2[1] + bbox2[3]) / 2]
        # np.linalg.norm yields a NumPy scalar; convert so the declared
        # return type (a plain float) is what callers actually get.
        return float(np.linalg.norm(np.subtract(center1, center2)))

    def group_text_regions(self, detections: List[TextDetection]) -> List[TextDetection]:
        """Group nearby text regions into speech bubbles.

        Detections whose stripped text is shorter than
        ``config.min_text_length`` are dropped. The rest are visited from top
        to bottom (by ``bbox[1]``); each one is merged into the first existing
        group whose center is closer than ``config.distance_threshold``, or
        starts a new group otherwise.

        Args:
            detections: Raw detections. Neither the list nor its elements are
                modified.

        Returns:
            New group objects sorted by ``bbox[1]`` with ``id`` numbered from
            1. A group keeps the ``confidence`` of the detection that
            started it.
        """
        import copy  # stdlib; only needed by this method

        min_length = self.config.min_text_length
        threshold = self.config.distance_threshold

        # Drop detections that are too short, then order top to bottom.
        # sorted() is stable, so ties keep their incoming order.
        candidates = sorted(
            (det for det in detections if len(det.text.strip()) >= min_length),
            key=lambda det: det.bbox[1],
        )

        groups = []
        for detection in candidates:
            for group in groups:
                if self.calculate_distance(detection.bbox, group.bbox) < threshold:
                    self._merge_detections(group, detection)
                    break
            else:
                # Seed the group from a copy so that later merges and the ID
                # assignment below never alter the caller's detection.
                seed = copy.copy(detection)
                seed.bbox = list(seed.bbox)
                groups.append(seed)

        # Merging can move a group's top edge, so re-sort before numbering.
        groups.sort(key=lambda g: g.bbox[1])
        for idx, group in enumerate(groups, start=1):
            group.id = idx

        return groups

    def _merge_detections(self, group: TextDetection, detection: TextDetection):
        """Fold *detection* into *group* in place.

        The text is appended after a single space and the group's box grows
        to the union of both boxes. ``group.confidence`` is left unchanged.
        """
        group.text += " " + detection.text
        group.bbox = [
            min(group.bbox[0], detection.bbox[0]),
            min(group.bbox[1], detection.bbox[1]),
            max(group.bbox[2], detection.bbox[2]),
            max(group.bbox[3], detection.bbox[3]),
        ]

    def detect_and_group_text(self) -> str:
        """Detect and group text, writing the result to a JSON file.

        If the output file already exists, detection is skipped and the
        existing file is reused.

        Returns:
            The path given by ``get_text_cood_file_path(self.config)``.
        """
        text_coord_path = get_text_cood_file_path(self.config)
        if not os.path.exists(text_coord_path):
            detections = self.detect_text()
            groups = self.group_text_regions(detections)
            self._save_groups_to_json(groups, text_coord_path)
            print(f"Grouped bubbles saved: {text_coord_path}")

        return text_coord_path

    def _save_groups_to_json(self, groups: List[TextDetection], output_path: str):
        """Write *groups* to *output_path* as a UTF-8 JSON array of objects."""
        groups_data = [
            {
                "id": group.id,
                # Coordinates are cast to int before serialising.
                "bbox": [int(x) for x in group.bbox],
                "text": group.text,
                "confidence": group.confidence,
            }
            for group in groups
        ]

        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(groups_data, f, indent=2, ensure_ascii=False)

    def cleanup(self):
        """Drop the reference to the OCR reader.

        Safe to call repeatedly, and on an instance whose ``__init__`` never
        completed. The attribute is reset to ``None`` rather than deleted so
        :meth:`load` can build a fresh reader afterwards.
        """
        self.reader = None

    def __enter__(self):
        """Return the detector itself for use in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Release the reader; exceptions from the ``with`` body propagate."""
        self.cleanup()

    def __del__(self):
        """Release the reader when the detector is garbage-collected."""
        self.cleanup()