diff --git a/manga-translator.py b/manga-translator.py index fc53f22..941cc3d 100644 --- a/manga-translator.py +++ b/manga-translator.py @@ -60,16 +60,13 @@ def is_valid_language(text: str, source_lang: str) -> bool: clean_text = re.sub(r'[^\w]', '', text) if not clean_text: return False - lang = source_lang.lower() - if lang in ['en', 'english', 'es', 'spanish', 'fr', 'french', 'it', 'italian', 'ca', 'catalan', 'de', 'german']: foreign_chars = len(re.findall( r'[\u0600-\u06FF\u0750-\u077F\u3040-\u30FF' r'\u3400-\u4DBF\u4E00-\u9FFF\uAC00-\uD7AF\u1100-\u11FF]', - clean_text - )) + clean_text)) if foreign_chars > 0: return False latin_chars = len(re.findall(r'[a-zA-ZÀ-ÿ]', clean_text)) @@ -79,25 +76,21 @@ def is_valid_language(text: str, source_lang: str) -> bool: if total <= 6: return (latin_chars / total) >= 0.55 return (latin_chars / total) >= 0.45 - elif lang in ['ja', 'japanese']: ja_chars = len(re.findall(r'[\u3040-\u30FF\u3400-\u4DBF\u4E00-\u9FFF]', clean_text)) if len(clean_text) <= 3: return ja_chars >= 1 return (ja_chars / len(clean_text)) >= 0.4 - elif lang in ['ko', 'korean']: ko_chars = len(re.findall(r'[\uAC00-\uD7AF\u1100-\u11FF]', clean_text)) if len(clean_text) <= 3: return ko_chars >= 1 return (ko_chars / len(clean_text)) >= 0.4 - elif lang in ['zh', 'chinese']: zh_chars = len(re.findall(r'[\u4E00-\u9FFF\u3400-\u4DBF]', clean_text)) if len(clean_text) <= 3: return zh_chars >= 1 return (zh_chars / len(clean_text)) >= 0.4 - return True @@ -122,6 +115,7 @@ _MANGA_INTERJECTIONS = { 'OK', 'OK!', 'OKAY', 'EEEEP', 'EEEP', 'OMIGOSH', + 'BECKY', 'BECKY!', 'HMM', 'HMM...', 'TSK', 'TCH', 'GRRR','I','A', @@ -139,30 +133,25 @@ def is_meaningful_text(text: str, source_lang: str, min_alpha_chars: int = 2) -> t_alpha_only = re.sub(r'[^A-Za-zÀ-ÿ]', '', t_upper) if t_upper in _MANGA_INTERJECTIONS or t_alpha_only in _MANGA_INTERJECTIONS: return True - alpha_count = sum(c.isalpha() for c in t) if alpha_count < min_alpha_chars: return False if t_upper in _NOISE_TOKENS: 
return False - lang = source_lang.lower() if lang in ['en', 'english', 'es', 'spanish', 'fr', 'french', 'it', 'italian', 'ca', 'catalan', 'de', 'german']: non_alpha = sum(not c.isalpha() for c in t) if len(t) > 0 and (non_alpha / len(t)) > 0.60: return False - if len(t) >= 3 and len(set(t_upper)) == 1: return False - if lang in ['en', 'english', 'es', 'spanish', 'fr', 'french', 'it', 'italian', 'ca', 'catalan', 'de', 'german']: if len(t) > 4: vowels = len(re.findall(r'[AEIOUaeiouÀ-ÿ]', t)) if vowels == 0: return False - return True @@ -195,7 +184,8 @@ def xyxy_to_xywh(b): if b is None: return None x1, y1, x2, y2 = b - return {"x": int(x1), "y": int(y1), "w": int(max(0, x2 - x1)), "h": int(max(0, y2 - y1))} + return {"x": int(x1), "y": int(y1), + "w": int(max(0, x2 - x1)), "h": int(max(0, y2 - y1))} def overlap_or_near(a, b, gap=0): ax1, ay1, ax2, ay2 = a @@ -205,13 +195,10 @@ def overlap_or_near(a, b, gap=0): return gap_x <= gap and gap_y <= gap def boxes_iou(a, b): - """Intersection over Union for two xyxy boxes.""" ax1, ay1, ax2, ay2 = a bx1, by1, bx2, by2 = b - ix1 = max(ax1, bx1) - iy1 = max(ay1, by1) - ix2 = min(ax2, bx2) - iy2 = min(ay2, by2) + ix1, iy1 = max(ax1, bx1), max(ay1, by1) + ix2, iy2 = min(ax2, bx2), min(ay2, by2) inter = max(0, ix2 - ix1) * max(0, iy2 - iy1) if inter == 0: return 0.0 @@ -223,10 +210,8 @@ def boxes_overlap_ratio(a, b): """Ratio of intersection to the SMALLER box area.""" ax1, ay1, ax2, ay2 = a bx1, by1, bx2, by2 = b - ix1 = max(ax1, bx1) - iy1 = max(ay1, by1) - ix2 = min(ax2, bx2) - iy2 = min(ay2, by2) + ix1, iy1 = max(ax1, bx1), max(ay1, by1) + ix2, iy2 = min(ax2, bx2), min(ay2, by2) inter = max(0, ix2 - ix1) * max(0, iy2 - iy1) if inter == 0: return 0.0 @@ -241,11 +226,11 @@ def ocr_candidate_score(text: str) -> float: n = len(t) if n == 0: return 0.0 - alpha = sum(c.isalpha() for c in t) / n - spaces = sum(c.isspace() for c in t) / n + alpha = sum(c.isalpha() for c in t) / n + spaces = sum(c.isspace() for c in t) / n punct_ok = 
sum(c in ".,!?'-:;()[]\"¡¿" for c in t) / n - bad = len(re.findall(r"[^\w\s\.\,\!\?\-\'\:\;\(\)\[\]\"¡¿]", t)) / n - penalty = 0.0 + bad = len(re.findall(r"[^\w\s\.\,\!\?\-\'\:\;\(\)\[\]\"¡¿]", t)) / n + penalty = 0.0 if re.search(r"\b[A-Z]\b", t): penalty += 0.05 if re.search(r"[0-9]{2,}", t): @@ -255,15 +240,11 @@ def ocr_candidate_score(text: str) -> float: def quad_is_horizontal(quad, ratio_threshold=1.5) -> bool: x1, y1, x2, y2 = quad_bbox(quad) - w = max(1, x2 - x1) - h = max(1, y2 - y1) - return (w / h) >= ratio_threshold + return (max(1, x2 - x1) / max(1, y2 - y1)) >= ratio_threshold def quad_is_vertical(quad, ratio_threshold=1.5) -> bool: x1, y1, x2, y2 = quad_bbox(quad) - w = max(1, x2 - x1) - h = max(1, y2 - y1) - return (h / w) >= ratio_threshold + return (max(1, y2 - y1) / max(1, x2 - x1)) >= ratio_threshold # ============================================================ @@ -273,34 +254,35 @@ def enhance_image_for_ocr(image_bgr, upscale_factor=2.5): h, w = image_bgr.shape[:2] upscaled = cv2.resize(image_bgr, (int(w * upscale_factor), int(h * upscale_factor)), interpolation=cv2.INTER_CUBIC) - gray = cv2.cvtColor(upscaled, cv2.COLOR_BGR2GRAY) - denoised = cv2.fastNlMeansDenoising(gray, None, h=10, templateWindowSize=7, searchWindowSize=21) - clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8)) + gray = cv2.cvtColor(upscaled, cv2.COLOR_BGR2GRAY) + denoised = cv2.fastNlMeansDenoising(gray, None, h=10, + templateWindowSize=7, searchWindowSize=21) + clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8)) enhanced = clahe.apply(denoised) - kernel_sharpen = np.array([[-1,-1,-1], [-1, 9,-1], [-1,-1,-1]]) - sharpened = cv2.filter2D(enhanced, -1, kernel_sharpen) - binary = cv2.adaptiveThreshold(sharpened, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, - cv2.THRESH_BINARY, 11, 2) - kernel = np.ones((2, 2), np.uint8) - cleaned = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel) + sharpened = cv2.filter2D(enhanced, -1, + np.array([[-1,-1,-1],[-1,9,-1],[-1,-1,-1]])) 
+ binary = cv2.adaptiveThreshold(sharpened, 255, + cv2.ADAPTIVE_THRESH_GAUSSIAN_C, + cv2.THRESH_BINARY, 11, 2) + cleaned = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, np.ones((2, 2), np.uint8)) return cv2.cvtColor(cleaned, cv2.COLOR_GRAY2BGR) def detect_small_text_regions(image_bgr, existing_quads): - gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY) - mask = np.zeros(gray.shape, dtype=np.uint8) + gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY) + mask = np.zeros(gray.shape, dtype=np.uint8) for quad in existing_quads: - pts = np.array(quad, dtype=np.int32) - cv2.fillPoly(mask, [pts], 255) - mask_inv = cv2.bitwise_not(mask) - _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU) + cv2.fillPoly(mask, [np.array(quad, dtype=np.int32)], 255) + mask_inv = cv2.bitwise_not(mask) + _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU) binary_masked = cv2.bitwise_and(binary, binary, mask=mask_inv) - contours, _ = cv2.findContours(binary_masked, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + contours, _ = cv2.findContours(binary_masked, cv2.RETR_EXTERNAL, + cv2.CHAIN_APPROX_SIMPLE) text_regions = [] for contour in contours: x, y, w, h = cv2.boundingRect(contour) area = w * h - if 50 < area < 5000 and 0.1 < h/max(w, 1) < 10: - text_regions.append((x, y, x+w, y+h)) + if 50 < area < 5000 and 0.1 < h / max(w, 1) < 10: + text_regions.append((x, y, x + w, y + h)) return text_regions @@ -308,7 +290,7 @@ def detect_small_text_regions(image_bgr, existing_quads): # SPEECH BUBBLE DETECTION # ============================================================ def detect_speech_bubbles(image_bgr: np.ndarray) -> List[np.ndarray]: - gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY) + gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY) thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 2) contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) @@ -322,10 +304,9 @@ def 
is_quad_in_bubble(quad_bbox_xyxy, bubble_contour, tolerance=5): def split_indices_by_bubble(indices, ocr, bubble_contours): if not indices: return [] - bubble_groups = {} - outside_group = [] + bubble_groups, outside_group = {}, [] for idx in indices: - bbox = quad_bbox(ocr[idx][0]) + bbox = quad_bbox(ocr[idx][0]) found = False for bidx, bubble in enumerate(bubble_contours): if is_quad_in_bubble(bbox, bubble): @@ -342,7 +323,8 @@ def split_indices_by_bubble(indices, ocr, bubble_contours): def check_vertical_alignment_split(indices, ocr, threshold=20): if len(indices) <= 1: return [indices] - items = sorted([(idx, quad_bbox(ocr[idx][0])) for idx in indices], key=lambda x: x[1][1]) + items = sorted([(idx, quad_bbox(ocr[idx][0])) for idx in indices], + key=lambda x: x[1][1]) groups, current_group = [], [items[0][0]] for i in range(1, len(items)): if items[i][1][1] - items[i-1][1][3] > threshold: @@ -366,9 +348,9 @@ def is_quad_oversized(quad, median_height, width_threshold=8.0): def split_oversized_quad_by_content(image_bgr, quad, text, conf, median_height): x1, y1, x2, y2 = quad_bbox(quad) w, h = x2 - x1, max(1, y2 - y1) - pad = 2 - roi = image_bgr[max(0,y1-pad):min(image_bgr.shape[0],y2+pad), - max(0,x1):min(image_bgr.shape[1],x2)] + pad = 2 + roi = image_bgr[max(0,y1-pad):min(image_bgr.shape[0],y2+pad), + max(0,x1):min(image_bgr.shape[1],x2)] if roi.size == 0: return [(quad, text, conf)] gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY) @@ -390,15 +372,15 @@ def split_oversized_quad_by_content(image_bgr, quad, text, conf, median_height): gaps.sort(key=lambda g: g[1], reverse=True) split_x_abs = max(0, x1) + gaps[0][0] if ' ' in text: - char_w = w / max(1, len(text)) + char_w = w / max(1, len(text)) split_idx = int((split_x_abs - x1) / max(1e-6, char_w)) - spaces = [i for i, c in enumerate(text) if c == ' '] + spaces = [i for i, c in enumerate(text) if c == ' '] if spaces: split_idx = min(spaces, key=lambda i: abs(i - split_idx)) tl, tr = text[:split_idx].strip(), 
text[split_idx:].strip() else: split_idx = int(len(text) * (split_x_abs - x1) / w) - tl, tr = text[:split_idx].strip(), text[split_idx:].strip() + tl, tr = text[:split_idx].strip(), text[split_idx:].strip() if tl and tr: return [([[x1,y1],[split_x_abs,y1],[split_x_abs,y2],[x1,y2]], tl, conf), ([[split_x_abs,y1],[x2,y1],[x2,y2],[split_x_abs,y2]], tr, conf)] @@ -407,7 +389,7 @@ def split_oversized_quad_by_content(image_bgr, quad, text, conf, median_height): def validate_and_split_oversized_quads(image_bgr, filtered_ocr): if not filtered_ocr: return filtered_ocr, 0 - heights = [max(1, quad_bbox(q)[3] - quad_bbox(q)[1]) for q, _, _ in filtered_ocr] + heights = [max(1, quad_bbox(q)[3] - quad_bbox(q)[1]) for q, _, _ in filtered_ocr] median_height = float(np.median(heights)) if heights else 14.0 result, splits_made = [], 0 for quad, text, conf in filtered_ocr: @@ -426,15 +408,10 @@ def validate_and_split_oversized_quads(image_bgr, filtered_ocr): # HORIZONTAL GAP DETECTION AT QUAD LEVEL # ============================================================ def detect_horizontal_gap_in_group(indices, ocr, med_h, gap_factor=2.5): - """ - Detects a large horizontal gap between quads within a group and splits them. - Fixes cases like BOX#8 in debug_clusters_016 where two column groups - are incorrectly merged into one box. 
- """ if len(indices) < 2: return None - items = sorted(indices, key=lambda i: quad_center(ocr[i][0])[0]) - boxes = [quad_bbox(ocr[i][0]) for i in items] + items = sorted(indices, key=lambda i: quad_center(ocr[i][0])[0]) + boxes = [quad_bbox(ocr[i][0]) for i in items] gap_threshold = med_h * gap_factor best_gap, best_split = 0.0, None for k in range(len(items) - 1): @@ -443,23 +420,18 @@ def detect_horizontal_gap_in_group(indices, ocr, med_h, gap_factor=2.5): best_gap, best_split = gap, k if best_split is None: return None - left_group = [items[i] for i in range(best_split + 1)] + left_group = [items[i] for i in range(best_split + 1)] right_group = [items[i] for i in range(best_split + 1, len(items))] if not left_group or not right_group: return None return (left_group, right_group) - def orientation_compatible(idx_a, idx_b, ocr): - """ - Prevents merging a tall/narrow isolated glyph with wide horizontal text lines. - Fixes BOX#1 type problems in debug_clusters_015. - """ ba = quad_bbox(ocr[idx_a][0]) bb = quad_bbox(ocr[idx_b][0]) wa, ha = max(1, ba[2]-ba[0]), max(1, ba[3]-ba[1]) wb, hb = max(1, bb[2]-bb[0]), max(1, bb[3]-bb[1]) - ra, rb = wa/ha, wb/hb + ra, rb = wa / ha, wb / hb if (ra < 0.6 and rb > 2.0) or (rb < 0.6 and ra > 2.0): return False return True @@ -470,33 +442,20 @@ def orientation_compatible(idx_a, idx_b, ocr): # ============================================================ def split_wide_quad_by_column_gap(image_bgr, quad, text, conf, med_h, min_gap_factor=1.8): - """ - FIX for BOX#6 type problem: - Splits a single OCR quad that spans two distinct text columns by finding - the largest vertical gap in its pixel projection. More aggressive than - split_oversized_quad_by_content — targets column-level gaps specifically. 
- """ x1, y1, x2, y2 = quad_bbox(quad) w, h = x2 - x1, max(1, y2 - y1) - - # Only attempt if the quad is wide enough to plausibly span two columns if w < med_h * 3.0: return [(quad, text, conf)] - pad = 2 - roi = image_bgr[max(0, y1-pad):min(image_bgr.shape[0], y2+pad), - max(0, x1):min(image_bgr.shape[1], x2)] + roi = image_bgr[max(0,y1-pad):min(image_bgr.shape[0],y2+pad), + max(0,x1):min(image_bgr.shape[1],x2)] if roi.size == 0: return [(quad, text, conf)] - gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY) _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU) - v_proj = np.sum(binary, axis=0) - - # Threshold: column gap must be nearly empty + v_proj = np.sum(binary, axis=0) gap_threshold = h * 255 * 0.12 - min_gap_px = max(int(med_h * min_gap_factor), 10) - + min_gap_px = max(int(med_h * min_gap_factor), 10) gaps, in_gap, gap_start = [], False, 0 for x in range(len(v_proj)): if v_proj[x] < gap_threshold: @@ -507,41 +466,29 @@ def split_wide_quad_by_column_gap(image_bgr, quad, text, conf, med_h, if gw >= min_gap_px: gaps.append((gap_start + gw // 2, gw)) in_gap = False - if not gaps: return [(quad, text, conf)] - - # Use the widest gap as the split point gaps.sort(key=lambda g: g[1], reverse=True) split_x_rel = gaps[0][0] split_x_abs = x1 + split_x_rel - - # Ensure the split produces two non-trivial halves if split_x_abs - x1 < med_h or x2 - split_x_abs < med_h: return [(quad, text, conf)] - if ' ' in text: - char_w = w / max(1, len(text)) + char_w = w / max(1, len(text)) split_idx = int(split_x_rel / max(1e-6, char_w)) - spaces = [i for i, c in enumerate(text) if c == ' '] + spaces = [i for i, c in enumerate(text) if c == ' '] if spaces: split_idx = min(spaces, key=lambda i: abs(i - split_idx)) tl, tr = text[:split_idx].strip(), text[split_idx:].strip() else: split_idx = int(len(text) * split_x_rel / w) - tl, tr = text[:split_idx].strip(), text[split_idx:].strip() - + tl, tr = text[:split_idx].strip(), text[split_idx:].strip() if tl and tr: 
return [([[x1,y1],[split_x_abs,y1],[split_x_abs,y2],[x1,y2]], tl, conf), ([[split_x_abs,y1],[x2,y1],[x2,y2],[split_x_abs,y2]], tr, conf)] return [(quad, text, conf)] - def apply_column_gap_splits(image_bgr, ocr_list, med_h): - """ - Applies split_wide_quad_by_column_gap to every quad in the list. - Run this BEFORE grouping so column-spanning quads never seed bad groups. - """ result, splits_made = [], 0 for quad, text, conf in ocr_list: parts = split_wide_quad_by_column_gap(image_bgr, quad, text, conf, med_h) @@ -558,19 +505,18 @@ def apply_column_gap_splits(image_bgr, ocr_list, med_h): # ============================================================ def detect_and_split_multi_bubble_boxes(bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr, image_bgr): - all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1]) for i in range(len(ocr))] - med_h = float(np.median(all_h)) if all_h else 14.0 + all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1]) + for i in range(len(ocr))] + med_h = float(np.median(all_h)) if all_h else 14.0 bubble_contours = detect_speech_bubbles(image_bgr) - new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {} - next_bid = 1 - splits_made = [] + next_bid, splits_made = 1, [] for bid, indices in bubble_indices.items(): if len(indices) < 2: new_bubbles[next_bid] = bubbles[bid] - new_boxes[next_bid] = bubble_boxes[bid] - new_quads[next_bid] = bubble_quads[bid] + new_boxes[next_bid] = bubble_boxes[bid] + new_quads[next_bid] = bubble_quads[bid] new_indices[next_bid] = indices next_bid += 1 continue @@ -580,20 +526,21 @@ def detect_and_split_multi_bubble_boxes(bubble_boxes, bubble_indices, bubble_qua for group in split_groups: if group: new_bubbles[next_bid] = build_lines_from_indices(group, ocr) - new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in group]) - new_quads[next_bid] = [ocr[i][0] for i in group] + new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in group]) + 
new_quads[next_bid] = [ocr[i][0] for i in group] new_indices[next_bid] = group next_bid += 1 splits_made.append(f"BOX#{bid} → {len(split_groups)} bubbles") continue - vertical_splits = check_vertical_alignment_split(indices, ocr, threshold=int(med_h * 2.0)) + vertical_splits = check_vertical_alignment_split(indices, ocr, + threshold=int(med_h * 2.0)) if len(vertical_splits) > 1: for group in vertical_splits: if group: new_bubbles[next_bid] = build_lines_from_indices(group, ocr) - new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in group]) - new_quads[next_bid] = [ocr[i][0] for i in group] + new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in group]) + new_quads[next_bid] = [ocr[i][0] for i in group] new_indices[next_bid] = group next_bid += 1 splits_made.append(f"BOX#{bid} → {len(vertical_splits)} vertical groups") @@ -603,42 +550,42 @@ def detect_and_split_multi_bubble_boxes(bubble_boxes, bubble_indices, bubble_qua x1, y1, x2, y2 = box if (x2 - x1) > med_h * 10: x_centers = [quad_center(ocr[i][0])[0] for i in indices] - x_median = np.median(x_centers) - left_group = [i for i in indices if quad_center(ocr[i][0])[0] < x_median] + x_median = np.median(x_centers) + left_group = [i for i in indices if quad_center(ocr[i][0])[0] < x_median] right_group = [i for i in indices if quad_center(ocr[i][0])[0] >= x_median] if left_group and right_group: - left_box = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in left_group]) + left_box = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in left_group]) right_box = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in right_group]) if right_box[0] - left_box[2] > med_h * 1.5: for grp in [left_group, right_group]: new_bubbles[next_bid] = build_lines_from_indices(grp, ocr) - new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in grp]) - new_quads[next_bid] = [ocr[i][0] for i in grp] + new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in grp]) + new_quads[next_bid] = 
[ocr[i][0] for i in grp] new_indices[next_bid] = grp next_bid += 1 splits_made.append(f"BOX#{bid} → 2 horizontal panels") continue new_bubbles[next_bid] = bubbles[bid] - new_boxes[next_bid] = bubble_boxes[bid] - new_quads[next_bid] = bubble_quads[bid] + new_boxes[next_bid] = bubble_boxes[bid] + new_quads[next_bid] = bubble_quads[bid] new_indices[next_bid] = indices next_bid += 1 if splits_made: print(f"\n🔧 Split {len(splits_made)} multi-bubble box(es):") for s in splits_made: print(f" ✓ {s}") - return new_bubbles, new_boxes, new_quads, new_indices def detect_and_merge_fragmented_bubbles(bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr, image_bgr): - all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1]) for i in range(len(ocr))] - med_h = float(np.median(all_h)) if all_h else 14.0 + all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1]) + for i in range(len(ocr))] + med_h = float(np.median(all_h)) if all_h else 14.0 bubble_contours = detect_speech_bubbles(image_bgr) - bids = list(bubble_boxes.keys()) - to_merge = [] + bids = list(bubble_boxes.keys()) + to_merge = [] for i in range(len(bids)): for j in range(i + 1, len(bids)): @@ -648,13 +595,11 @@ def detect_and_merge_fragmented_bubbles(bubble_boxes, bubble_indices, bubble_qua cy_i = (box_i[1] + box_i[3]) / 2.0 cx_j = (box_j[0] + box_j[2]) / 2.0 cy_j = (box_j[1] + box_j[3]) / 2.0 - in_same_bubble = any( cv2.pointPolygonTest(c, (cx_i, cy_i), False) >= 0 and cv2.pointPolygonTest(c, (cx_j, cy_j), False) >= 0 for c in bubble_contours ) - if in_same_bubble: if abs(cx_i - cx_j) < med_h * 3.0 and abs(cy_i - cy_j) < med_h * 6.0: to_merge.append((bid_i, bid_j) if cy_i < cy_j else (bid_j, bid_i)) @@ -669,76 +614,69 @@ def detect_and_merge_fragmented_bubbles(bubble_boxes, bubble_indices, bubble_qua for key in merge_groups: if top in merge_groups[key] or bottom in merge_groups[key]: merge_groups[key].update({top, bottom}) - found = True - break + found = True; break if not found: 
merge_groups[len(merge_groups)] = {top, bottom} new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {} merged_bids, next_bid = set(), 1 - for merge_set in merge_groups.values(): - merge_list = sorted(merge_set) + merge_list = sorted(merge_set) print(f" ✓ Merging: {', '.join(f'#{b}' for b in merge_list)}") all_indices = sorted(set(idx for b in merge_list for idx in bubble_indices[b])) for b in merge_list: merged_bids.add(b) new_bubbles[next_bid] = build_lines_from_indices(all_indices, ocr) - new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in all_indices]) - new_quads[next_bid] = [ocr[i][0] for i in all_indices] + new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in all_indices]) + new_quads[next_bid] = [ocr[i][0] for i in all_indices] new_indices[next_bid] = all_indices next_bid += 1 - for bid in bids: if bid not in merged_bids: new_bubbles[next_bid] = bubbles[bid] - new_boxes[next_bid] = bubble_boxes[bid] - new_quads[next_bid] = bubble_quads[bid] + new_boxes[next_bid] = bubble_boxes[bid] + new_quads[next_bid] = bubble_quads[bid] new_indices[next_bid] = bubble_indices[bid] next_bid += 1 - return new_bubbles, new_boxes, new_quads, new_indices def merge_boxes_by_proximity_and_overlap(bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr, med_h): """ - FIX for BOX#2+BOX#14 and BOX#7+BOX#18 type problems: - Merges boxes whose bounding rectangles are very close vertically AND - share significant horizontal overlap — indicating they belong to the - same speech bubble that the contour detector missed (e.g. dashed outlines). + Merges boxes that are vertically close AND share significant horizontal overlap. - Unlike merge_close_bubbles_by_line_height, this checks BOTH axes strictly - to avoid merging boxes from adjacent but distinct bubbles. + Single-quad boxes participate fully — no isolation treatment. 
+ This fixes BOX#2+#16, BOX#8+#21, BOX#9+#22 type problems where a + single-line detection sits directly above/below a multi-line box in the + same speech bubble. + + Merge criteria (both must be true): + 1. Vertical gap ≤ 1.5 × med_h + 2. Horizontal overlap ratio ≥ 0.35 """ bids = sorted(bubble_boxes.keys()) - merge_map: Dict[int, List[int]] = {} - merged_into: Dict[int, int] = {} + merge_map: Dict[int, List[int]] = {} + merged_into: Dict[int, int] = {} for i, bid_i in enumerate(bids): if bid_i in merged_into: continue box_i = bubble_boxes[bid_i] - wi = box_i[2] - box_i[0] + wi = max(1, box_i[2] - box_i[0]) for j in range(i + 1, len(bids)): bid_j = bids[j] if bid_j in merged_into: continue box_j = bubble_boxes[bid_j] - wj = box_j[2] - box_j[0] + wj = max(1, box_j[2] - box_j[0]) - # Vertical gap between the two boxes vert_gap = max(0, max(box_i[1], box_j[1]) - min(box_i[3], box_j[3])) - - # Horizontal overlap ratio (intersection / min width) - h_ix1 = max(box_i[0], box_j[0]) - h_ix2 = min(box_i[2], box_j[2]) - h_overlap = max(0, h_ix2 - h_ix1) + h_ix1 = max(box_i[0], box_j[0]) + h_ix2 = min(box_i[2], box_j[2]) + h_overlap = max(0, h_ix2 - h_ix1) h_overlap_ratio = h_overlap / max(1, min(wi, wj)) - # Merge only when: - # 1. Vertical gap is small (boxes are stacked closely) - # 2. 
Horizontal overlap is significant (same column) if vert_gap <= med_h * 1.5 and h_overlap_ratio >= 0.35: root = merged_into.get(bid_i, bid_i) merge_map.setdefault(root, [root]) @@ -758,8 +696,8 @@ def merge_boxes_by_proximity_and_overlap(bubble_boxes, bubble_indices, bubble_qu print(f" ✓ Merging: {', '.join(f'#{b}' for b in group_unique)}") all_indices = sorted(set(idx for b in group_unique for idx in bubble_indices[b])) new_bubbles[next_bid] = build_lines_from_indices(all_indices, ocr) - new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in all_indices]) - new_quads[next_bid] = [ocr[i][0] for i in all_indices] + new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in all_indices]) + new_quads[next_bid] = [ocr[i][0] for i in all_indices] new_indices[next_bid] = all_indices next_bid += 1 processed.update(group_unique) @@ -767,8 +705,8 @@ def merge_boxes_by_proximity_and_overlap(bubble_boxes, bubble_indices, bubble_qu for bid in bids: if bid not in processed: new_bubbles[next_bid] = bubbles[bid] - new_boxes[next_bid] = bubble_boxes[bid] - new_quads[next_bid] = bubble_quads[bid] + new_boxes[next_bid] = bubble_boxes[bid] + new_quads[next_bid] = bubble_quads[bid] new_indices[next_bid] = bubble_indices[bid] next_bid += 1 @@ -777,40 +715,55 @@ def merge_boxes_by_proximity_and_overlap(bubble_boxes, bubble_indices, bubble_qu def auto_fix_bubble_detection(bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr, image_bgr): + """ + Full fix pipeline: + 1. Split boxes that span multiple speech bubbles. + 2. Merge fragments detected inside the same contour. + 3. Merge fragments missed by contour detection (proximity+overlap) — pass 1. + 4. Second proximity pass — catches chains resolved after pass 1. 
+ """ print("\n🔍 Running automatic bubble detection fixes...") - all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1]) for i in range(len(ocr))] + all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1]) + for i in range(len(ocr))] med_h = float(np.median(all_h)) if all_h else 14.0 - bubbles, bubble_boxes, bubble_quads, bubble_indices = detect_and_split_multi_bubble_boxes( - bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr, image_bgr) - bubbles, bubble_boxes, bubble_quads, bubble_indices = detect_and_merge_fragmented_bubbles( - bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr, image_bgr) - # Second pass: catch fragments missed by contour detection (dashed bubbles, etc.) - bubbles, bubble_boxes, bubble_quads, bubble_indices = merge_boxes_by_proximity_and_overlap( - bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr, med_h) + bubbles, bubble_boxes, bubble_quads, bubble_indices = \ + detect_and_split_multi_bubble_boxes( + bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr, image_bgr) + + bubbles, bubble_boxes, bubble_quads, bubble_indices = \ + detect_and_merge_fragmented_bubbles( + bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr, image_bgr) + + # Pass 1 + bubbles, bubble_boxes, bubble_quads, bubble_indices = \ + merge_boxes_by_proximity_and_overlap( + bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr, med_h) + + # Pass 2 — catches chains only visible after pass 1 + bubbles, bubble_boxes, bubble_quads, bubble_indices = \ + merge_boxes_by_proximity_and_overlap( + bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr, med_h) + return bubbles, bubble_boxes, bubble_quads, bubble_indices def remove_nested_boxes(bubble_boxes, bubble_indices, bubble_quads, bubbles, overlap_threshold=0.50): - bids = list(bubble_boxes.keys()) + bids = list(bubble_boxes.keys()) to_remove = set() - for i in range(len(bids)): bid_i = bids[i] if bid_i in to_remove: continue - box_i = bubble_boxes[bid_i] + box_i = 
bubble_boxes[bid_i] area_i = max(0, box_i[2]-box_i[0]) * max(0, box_i[3]-box_i[1]) - for j in range(i + 1, len(bids)): bid_j = bids[j] if bid_j in to_remove: continue - box_j = bubble_boxes[bid_j] + box_j = bubble_boxes[bid_j] area_j = max(0, box_j[2]-box_j[0]) * max(0, box_j[3]-box_j[1]) - - shared = set(bubble_indices[bid_i]).intersection(bubble_indices[bid_j]) + shared = set(bubble_indices[bid_i]).intersection(bubble_indices[bid_j]) overlap = boxes_overlap_ratio(box_i, box_j) - if overlap > overlap_threshold or len(shared) > 0: if area_i >= area_j: to_remove.add(bid_j) @@ -819,7 +772,6 @@ def remove_nested_boxes(bubble_boxes, bubble_indices, bubble_quads, bubbles, to_remove.add(bid_i) print(f" 🗑️ Removing BOX#{bid_i} (overlaps BOX#{bid_j})") break - if to_remove: print(f"\n🧹 Removed {len(to_remove)} overlapping/nested box(es)") for bid in to_remove: @@ -827,7 +779,6 @@ def remove_nested_boxes(bubble_boxes, bubble_indices, bubble_quads, bubbles, bubble_indices.pop(bid, None) bubble_quads.pop(bid, None) bubbles.pop(bid, None) - return bubbles, bubble_boxes, bubble_quads, bubble_indices @@ -844,14 +795,14 @@ def enforce_max_box_size(bubble_boxes, bubble_indices, bubble_quads, bubbles, oc x1, y1, x2, y2 = box w, h = x2 - x1, y2 - y1 if w > max_width or h > max_height: - indices = bubble_indices[bid] + indices = bubble_indices[bid] col_split = split_bubble_if_multiple_columns(indices, ocr, bid=bid, - use_aggressive_thresholds=True) + use_aggressive_thresholds=True) if col_split: for grp in col_split: new_bubbles[next_bid] = build_lines_from_indices(grp, ocr) - new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in grp]) - new_quads[next_bid] = [ocr[i][0] for i in grp] + new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in grp]) + new_quads[next_bid] = [ocr[i][0] for i in grp] new_indices[next_bid] = grp next_bid += 1 splits_made.append(f"BOX#{bid} (oversized: {w}x{h}px)") @@ -860,15 +811,15 @@ def enforce_max_box_size(bubble_boxes, 
bubble_indices, bubble_quads, bubbles, oc if row_split: for grp in row_split: new_bubbles[next_bid] = build_lines_from_indices(grp, ocr) - new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in grp]) - new_quads[next_bid] = [ocr[i][0] for i in grp] + new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in grp]) + new_quads[next_bid] = [ocr[i][0] for i in grp] new_indices[next_bid] = grp next_bid += 1 splits_made.append(f"BOX#{bid} (oversized: {w}x{h}px)") continue new_bubbles[next_bid] = bubbles[bid] - new_boxes[next_bid] = box - new_quads[next_bid] = bubble_quads[bid] + new_boxes[next_bid] = box + new_quads[next_bid] = bubble_quads[bid] new_indices[next_bid] = bubble_indices[bid] next_bid += 1 @@ -918,17 +869,19 @@ class ImprovedMacVisionDetector: variants = [("enhanced", enhance_image_for_ocr(image_bgr, upscale_factor=2.5))] gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY) _, hc = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) - variants.append(("high_contrast", cv2.cvtColor( - cv2.resize(hc, None, fx=2.5, fy=2.5, interpolation=cv2.INTER_CUBIC), - cv2.COLOR_GRAY2BGR))) - variants.append(("bilateral", cv2.resize( - cv2.bilateralFilter(image_bgr, 9, 75, 75), - None, fx=2.5, fy=2.5, interpolation=cv2.INTER_CUBIC))) - variants.append(("inverted", cv2.resize( - cv2.bitwise_not(image_bgr), - None, fx=2.5, fy=2.5, interpolation=cv2.INTER_CUBIC))) - variants.append(("original", cv2.resize( - image_bgr, None, fx=2.5, fy=2.5, interpolation=cv2.INTER_CUBIC))) + variants.append(("high_contrast", + cv2.cvtColor(cv2.resize(hc, None, fx=2.5, fy=2.5, + interpolation=cv2.INTER_CUBIC), + cv2.COLOR_GRAY2BGR))) + variants.append(("bilateral", + cv2.resize(cv2.bilateralFilter(image_bgr, 9, 75, 75), + None, fx=2.5, fy=2.5, interpolation=cv2.INTER_CUBIC))) + variants.append(("inverted", + cv2.resize(cv2.bitwise_not(image_bgr), + None, fx=2.5, fy=2.5, interpolation=cv2.INTER_CUBIC))) + variants.append(("original", + cv2.resize(image_bgr, 
None, fx=2.5, fy=2.5, + interpolation=cv2.INTER_CUBIC))) return variants def run_vision_ocr(self, image_bgr): @@ -945,17 +898,17 @@ class ImprovedMacVisionDetector: def completion_handler(request, error): if error: return for obs in request.results(): - candidate = obs.topCandidates_(1)[0] - text, confidence = candidate.string(), candidate.confidence() - bbox = obs.boundingBox() - x = bbox.origin.x * iw - y_bl = bbox.origin.y * ih - w = bbox.size.width * iw - h = bbox.size.height * ih - y = ih - y_bl - h + candidate = obs.topCandidates_(1)[0] + text, conf = candidate.string(), candidate.confidence() + bbox = obs.boundingBox() + x = bbox.origin.x * iw + y_bl = bbox.origin.y * ih + w = bbox.size.width * iw + h = bbox.size.height * ih + y = ih - y_bl - h quad = [[int(x),int(y)],[int(x+w),int(y)], [int(x+w),int(y+h)],[int(x),int(y+h)]] - results.append((quad, text, confidence)) + results.append((quad, text, conf)) req = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(completion_handler) req.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate) @@ -969,7 +922,7 @@ class ImprovedMacVisionDetector: if not all_results: return [] scale_factor = 2.5 - normalized = [] + normalized = [] for variant_name, results in all_results: for quad, text, conf in results: sq = [[int(p[0]/scale_factor), int(p[1]/scale_factor)] for p in quad] @@ -981,7 +934,8 @@ class ImprovedMacVisionDetector: x2, y2 = min(b1[2],b2[2]), min(b1[3],b2[3]) if x2 < x1 or y2 < y1: return False inter = (x2-x1)*(y2-y1) - union = (b1[2]-b1[0])*(b1[3]-b1[1]) + (b2[2]-b2[0])*(b2[3]-b2[1]) - inter + union = ((b1[2]-b1[0])*(b1[3]-b1[1]) + + (b2[2]-b2[0])*(b2[3]-b2[1]) - inter) return inter / max(union, 1) > threshold clusters, used = [], set() @@ -1016,7 +970,7 @@ class ImprovedMacVisionDetector: else image_path_or_array if img is None or img.size == 0: return [] - variants = self.preprocess_variants(img) + variants = self.preprocess_variants(img) all_results = [] for vname, vimg in 
variants: r = self.run_vision_ocr(vimg) @@ -1056,17 +1010,17 @@ class MacVisionDetector: def completion_handler(request, error): if error: return for obs in request.results(): - candidate = obs.topCandidates_(1)[0] - text, confidence = candidate.string(), candidate.confidence() - bbox = obs.boundingBox() - x = bbox.origin.x * iw - y_bl = bbox.origin.y * ih - w = bbox.size.width * iw - h = bbox.size.height * ih - y = ih - y_bl - h + candidate = obs.topCandidates_(1)[0] + text, conf = candidate.string(), candidate.confidence() + bbox = obs.boundingBox() + x = bbox.origin.x * iw + y_bl = bbox.origin.y * ih + w = bbox.size.width * iw + h = bbox.size.height * ih + y = ih - y_bl - h quad = [[int(x),int(y)],[int(x+w),int(y)], [int(x+w),int(y+h)],[int(x),int(y+h)]] - results.append((quad, text, confidence)) + results.append((quad, text, conf)) req = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(completion_handler) req.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate) @@ -1076,79 +1030,74 @@ class MacVisionDetector: handler.performRequests_error_([req], None) return results -def split_bubble_if_multiple_columns(indices, ocr, bid=None, use_aggressive_thresholds=False): + +# ============================================================ +# COLUMN / ROW SPLITTING +# ============================================================ +def split_bubble_if_multiple_columns(indices, ocr, bid=None, + use_aggressive_thresholds=False): if len(indices) < 2: return None - boxes = [quad_bbox(ocr[i][0]) for i in indices] - hs = [max(1, b[3] - b[1]) for b in boxes] - med_h = float(np.median(hs)) if hs else 12.0 - xs = [(b[0] + b[2]) / 2.0 for b in boxes] + boxes = [quad_bbox(ocr[i][0]) for i in indices] + hs = [max(1, b[3]-b[1]) for b in boxes] + med_h = float(np.median(hs)) if hs else 12.0 + xs = [(b[0]+b[2])/2.0 for b in boxes] xs_sorted = sorted(xs) - - gap_thresh = max(med_h * 1.2, 18) if use_aggressive_thresholds else max(med_h * 1.5, 22) + gap_thresh = 
max(med_h*1.2, 18) if use_aggressive_thresholds else max(med_h*1.5, 22) best_gap_idx, best_gap_size = None, 0.0 - for i in range(len(xs_sorted) - 1): - gap = xs_sorted[i + 1] - xs_sorted[i] + gap = xs_sorted[i+1] - xs_sorted[i] if gap > gap_thresh and gap > best_gap_size: best_gap_size, best_gap_idx = gap, i - if best_gap_idx is None: return None - split_x = (xs_sorted[best_gap_idx] + xs_sorted[best_gap_idx + 1]) / 2.0 - - left_idxs = [i for i in indices if (quad_bbox(ocr[i][0])[0] + quad_bbox(ocr[i][0])[2]) / 2.0 < split_x] - right_idxs = [i for i in indices if (quad_bbox(ocr[i][0])[0] + quad_bbox(ocr[i][0])[2]) / 2.0 >= split_x] - + split_x = (xs_sorted[best_gap_idx] + xs_sorted[best_gap_idx+1]) / 2.0 + left_idxs = [i for i in indices + if (quad_bbox(ocr[i][0])[0]+quad_bbox(ocr[i][0])[2])/2.0 < split_x] + right_idxs = [i for i in indices + if (quad_bbox(ocr[i][0])[0]+quad_bbox(ocr[i][0])[2])/2.0 >= split_x] if not left_idxs or not right_idxs: return None return (left_idxs, right_idxs) - def split_bubble_if_multiple_rows(indices, ocr, bid=None): if len(indices) < 2: return None - boxes = [quad_bbox(ocr[i][0]) for i in indices] - hs = [max(1, b[3] - b[1]) for b in boxes] - med_h = float(np.median(hs)) if hs else 12.0 - ys = [(b[1] + b[3]) / 2.0 for b in boxes] + boxes = [quad_bbox(ocr[i][0]) for i in indices] + hs = [max(1, b[3]-b[1]) for b in boxes] + med_h = float(np.median(hs)) if hs else 12.0 + ys = [(b[1]+b[3])/2.0 for b in boxes] ys_sorted = sorted(ys) - gap_thresh = max(med_h * 2.0, 30) best_gap_idx, best_gap_size = None, 0.0 - for i in range(len(ys_sorted) - 1): - gap = ys_sorted[i + 1] - ys_sorted[i] + gap = ys_sorted[i+1] - ys_sorted[i] if gap > gap_thresh and gap > best_gap_size: best_gap_size, best_gap_idx = gap, i - if best_gap_idx is None: return None - split_y = (ys_sorted[best_gap_idx] + ys_sorted[best_gap_idx + 1]) / 2.0 - - top_idxs = [i for i in indices if (quad_bbox(ocr[i][0])[1] + quad_bbox(ocr[i][0])[3]) / 2.0 < split_y] - bottom_idxs = [i for 
i in indices if (quad_bbox(ocr[i][0])[1] + quad_bbox(ocr[i][0])[3]) / 2.0 >= split_y] - - if not top_idxs or not bottom_idxs: return None - return (top_idxs, bottom_idxs) + split_y = (ys_sorted[best_gap_idx] + ys_sorted[best_gap_idx+1]) / 2.0 + top_idxs = [i for i in indices + if (quad_bbox(ocr[i][0])[1]+quad_bbox(ocr[i][0])[3])/2.0 < split_y] + bot_idxs = [i for i in indices + if (quad_bbox(ocr[i][0])[1]+quad_bbox(ocr[i][0])[3])/2.0 >= split_y] + if not top_idxs or not bot_idxs: return None + return (top_idxs, bot_idxs) def split_cluster_by_big_vertical_gap(indices, ocr, factor=1.9, min_gap=22): if len(indices) < 2: return None - boxes = [quad_bbox(ocr[i][0]) for i in indices] - hs = [max(1, b[3] - b[1]) for b in boxes] - med_h = float(np.median(hs)) if hs else 12.0 - - items = sorted([(i, quad_bbox(ocr[i][0])) for i in indices], - key=lambda x: (x[1][1] + x[1][3]) / 2.0) + boxes = [quad_bbox(ocr[i][0]) for i in indices] + hs = [max(1, b[3]-b[1]) for b in boxes] + med_h = float(np.median(hs)) if hs else 12.0 + items = sorted([(i, quad_bbox(ocr[i][0])) for i in indices], + key=lambda x: (x[1][1]+x[1][3])/2.0) gap_thresh = max(med_h * factor, min_gap) best_gap, best_split_idx = 0.0, None - for k in range(len(items) - 1): - gap = items[k + 1][1][1] - items[k][1][3] + gap = items[k+1][1][1] - items[k][1][3] if gap > gap_thresh and gap > best_gap: best_gap, best_split_idx = gap, k - if best_split_idx is None: return None - top_idxs = [it[0] for it in items[:best_split_idx + 1]] - bottom_idxs = [it[0] for it in items[best_split_idx + 1:]] - if not top_idxs or not bottom_idxs: return None - return (top_idxs, bottom_idxs) + top_idxs = [it[0] for it in items[:best_split_idx+1]] + bot_idxs = [it[0] for it in items[best_split_idx+1:]] + if not top_idxs or not bot_idxs: return None + return (top_idxs, bot_idxs) def is_vertical_text_like(indices, ocr): @@ -1165,13 +1114,14 @@ def is_vertical_text_like(indices, ocr): def split_nested_or_side_by_side(indices, ocr): if 
len(indices) < 2: return None - xs = sorted([(quad_bbox(ocr[i][0])[0] + quad_bbox(ocr[i][0])[2]) / 2.0 for i in indices]) + xs = sorted([(quad_bbox(ocr[i][0])[0]+quad_bbox(ocr[i][0])[2])/2.0 + for i in indices]) mid_idx = len(xs) // 2 - split_x = (xs[mid_idx - 1] + xs[mid_idx]) / 2.0 - - left_idxs = [i for i in indices if (quad_bbox(ocr[i][0])[0]+quad_bbox(ocr[i][0])[2])/2.0 < split_x] - right_idxs = [i for i in indices if (quad_bbox(ocr[i][0])[0]+quad_bbox(ocr[i][0])[2])/2.0 >= split_x] - + split_x = (xs[mid_idx-1] + xs[mid_idx]) / 2.0 + left_idxs = [i for i in indices + if (quad_bbox(ocr[i][0])[0]+quad_bbox(ocr[i][0])[2])/2.0 < split_x] + right_idxs = [i for i in indices + if (quad_bbox(ocr[i][0])[0]+quad_bbox(ocr[i][0])[2])/2.0 >= split_x] if not left_idxs or not right_idxs: return None return (left_idxs, right_idxs) @@ -1184,29 +1134,25 @@ def split_panel_box(image_bgr, box_xyxy, bubble_quads=None): if x2 <= x1 or y2 <= y1: return None crop = image_bgr[y1:y2, x1:x2] if crop.size == 0: return None - - gray = cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY) - edges = cv2.Canny(gray, 50, 150) + gray = cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY) + edges = cv2.Canny(gray, 50, 150) h_proj = np.sum(edges, axis=0) - w = x2 - x1 + w = x2 - x1 if w < 100: return None - search_start = int(w * 0.35) search_end = int(w * 0.65) if search_end <= search_start: return None region = h_proj[search_start:search_end] if len(region) == 0: return None - threshold = np.percentile(region, 85) - candidates = [x1 + search_start + rx for rx in range(len(region)) if region[rx] >= threshold] + candidates = [x1 + search_start + rx + for rx in range(len(region)) if region[rx] >= threshold] if not candidates: return None split_x = int(np.median(candidates)) - if bubble_quads: - left_count = sum(1 for q in bubble_quads if quad_center(q)[0] < split_x) - right_count = len(bubble_quads) - left_count - if left_count == 0 or right_count == 0: return None - + lc = sum(1 for q in bubble_quads if quad_center(q)[0] < 
split_x) + rc = len(bubble_quads) - lc + if lc == 0 or rc == 0: return None return (x1, x2, split_x) @@ -1216,15 +1162,19 @@ def split_panel_box(image_bgr, box_xyxy, bubble_quads=None): def merge_close_bubbles_by_line_height(bubbles, bubble_boxes, bubble_quads, bubble_indices, ocr): """ - Merges boxes that are spatially very close (within ~1.4× line height on - BOTH axes simultaneously). Strict dual-axis check prevents merging boxes - from adjacent but distinct bubbles — fixing the BOX#5+BOX#16 overlap problem. + Merges boxes that are spatially very close on BOTH axes AND share + meaningful horizontal overlap (same column). + + Single-quad boxes participate fully — no special isolation treatment. + The h_overlap_ratio >= 0.25 guard prevents merging horizontally + adjacent distinct bubbles. """ if not bubbles: return bubbles, bubble_boxes, bubble_quads, bubble_indices - all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1]) for i in range(len(ocr))] - med_h = float(np.median(all_h)) if all_h else 14.0 + all_h = [max(1, quad_bbox(ocr[i][0])[3]-quad_bbox(ocr[i][0])[1]) + for i in range(len(ocr))] + med_h = float(np.median(all_h)) if all_h else 14.0 merge_tol = max(8, med_h * 1.4) bids = sorted(bubble_boxes.keys()) @@ -1233,26 +1183,22 @@ def merge_close_bubbles_by_line_height(bubbles, bubble_boxes, bubble_quads, for i, bid_i in enumerate(bids): if bid_i in merged_set: continue x1_i, y1_i, x2_i, y2_i = bubble_boxes[bid_i] - wi = x2_i - x1_i + wi = max(1, x2_i - x1_i) for j in range(i + 1, len(bids)): bid_j = bids[j] if bid_j in merged_set: continue x1_j, y1_j, x2_j, y2_j = bubble_boxes[bid_j] - wj = x2_j - x1_j + wj = max(1, x2_j - x1_j) gap_x = max(0, max(x1_i, x1_j) - min(x2_i, x2_j)) gap_y = max(0, max(y1_i, y1_j) - min(y2_i, y2_j)) - # Horizontal overlap ratio — must be significant to merge - h_ix1 = max(x1_i, x1_j) - h_ix2 = min(x2_i, x2_j) - h_overlap = max(0, h_ix2 - h_ix1) + h_ix1 = max(x1_i, x1_j) + h_ix2 = min(x2_i, x2_j) + h_overlap = max(0, 
h_ix2 - h_ix1) h_overlap_ratio = h_overlap / max(1, min(wi, wj)) - # STRICT: both gap_x AND gap_y must be small, AND boxes must - # share meaningful horizontal overlap (same column). - # This prevents merging horizontally adjacent distinct bubbles. if gap_x <= merge_tol and gap_y <= merge_tol and h_overlap_ratio >= 0.25: if bid_i not in merge_map: merge_map[bid_i] = [bid_i] @@ -1264,11 +1210,10 @@ def merge_close_bubbles_by_line_height(bubbles, bubble_boxes, bubble_quads, new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {} next_bid = 1 - for bid in bids: if bid in merged_set: continue if bid in merge_map: - group = merge_map[bid] + group = merge_map[bid] all_indices = sorted(set(idx for b in group for idx in bubble_indices[b])) new_bubbles[next_bid] = build_lines_from_indices(all_indices, ocr) new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in all_indices]) @@ -1289,7 +1234,7 @@ def merge_close_bubbles_by_line_height(bubbles, bubble_boxes, bubble_quads, # ============================================================ def split_wide_ocr_items(image_bgr, ocr_list, width_factor=8.0): if not ocr_list: return ocr_list, 0 - hs = [max(1, quad_bbox(q)[3]-quad_bbox(q)[1]) for q, _, _ in ocr_list] + hs = [max(1, quad_bbox(q)[3]-quad_bbox(q)[1]) for q, _, _ in ocr_list] med_h = float(np.median(hs)) if hs else 14.0 result, splits_made = [], 0 @@ -1302,8 +1247,9 @@ def split_wide_ocr_items(image_bgr, ocr_list, width_factor=8.0): max(0,x1):min(image_bgr.shape[1],x2)] if roi.size > 0: gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY) - _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU) - v_proj = np.sum(binary, axis=0) + _, binary = cv2.threshold(gray, 0, 255, + cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU) + v_proj = np.sum(binary, axis=0) gap_threshold = roi.shape[0] * 255 * 0.15 gaps, in_gap, gap_start = [], False, 0 for x in range(len(v_proj)): @@ -1319,17 +1265,19 @@ def split_wide_ocr_items(image_bgr, ocr_list, 
width_factor=8.0): gaps.sort(key=lambda g: g[1], reverse=True) split_x_abs = max(0, x1) + gaps[0][0] if ' ' in text: - char_w = w / max(1, len(text)) + char_w = w / max(1, len(text)) split_idx = int((split_x_abs - x1) / max(1e-6, char_w)) - spaces = [i for i, c in enumerate(text) if c == ' '] - if spaces: split_idx = min(spaces, key=lambda i: abs(i - split_idx)) + spaces = [i for i, c in enumerate(text) if c == ' '] + if spaces: + split_idx = min(spaces, key=lambda i: abs(i - split_idx)) tl, tr = text[:split_idx].strip(), text[split_idx:].strip() else: split_idx = int(len(text) * (split_x_abs - x1) / w) - tl, tr = text[:split_idx].strip(), text[split_idx:].strip() + tl, tr = text[:split_idx].strip(), text[split_idx:].strip() if tl and tr: - result.extend([([[x1,y1],[split_x_abs,y1],[split_x_abs,y2],[x1,y2]], tl, conf), - ([[split_x_abs,y1],[x2,y1],[x2,y2],[split_x_abs,y2]], tr, conf)]) + result.extend([ + ([[x1,y1],[split_x_abs,y1],[split_x_abs,y2],[x1,y2]], tl, conf), + ([[split_x_abs,y1],[x2,y1],[x2,y2],[split_x_abs,y2]], tr, conf)]) splits_made += 1 continue result.append((quad, text, conf)) @@ -1338,7 +1286,7 @@ def split_wide_ocr_items(image_bgr, ocr_list, width_factor=8.0): def split_abnormal_bridge_quads(image_bgr, ocr_list, aspect_ratio_threshold=6.0): if not ocr_list: return ocr_list, 0 - hs = [max(1, quad_bbox(q)[3]-quad_bbox(q)[1]) for q, _, _ in ocr_list] + hs = [max(1, quad_bbox(q)[3]-quad_bbox(q)[1]) for q, _, _ in ocr_list] med_h = float(np.median(hs)) if hs else 14.0 result, splits_made = [], 0 @@ -1351,8 +1299,9 @@ def split_abnormal_bridge_quads(image_bgr, ocr_list, aspect_ratio_threshold=6.0) max(0,x1):min(image_bgr.shape[1],x2)] if roi.size > 0: gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY) - _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU) - v_proj = np.sum(binary, axis=0) + _, binary = cv2.threshold(gray, 0, 255, + cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU) + v_proj = np.sum(binary, axis=0) gap_threshold = h * 255 
* 0.20 gaps, in_gap, gap_start = [], False, 0 for x in range(len(v_proj)): @@ -1368,17 +1317,19 @@ def split_abnormal_bridge_quads(image_bgr, ocr_list, aspect_ratio_threshold=6.0) gaps.sort(key=lambda g: g[1], reverse=True) split_x_abs = max(0, x1) + gaps[0][0] if ' ' in text: - char_w = w / max(1, len(text)) + char_w = w / max(1, len(text)) split_idx = int((split_x_abs - x1) / max(1e-6, char_w)) - spaces = [i for i, c in enumerate(text) if c == ' '] - if spaces: split_idx = min(spaces, key=lambda i: abs(i - split_idx)) + spaces = [i for i, c in enumerate(text) if c == ' '] + if spaces: + split_idx = min(spaces, key=lambda i: abs(i - split_idx)) tl, tr = text[:split_idx].strip(), text[split_idx:].strip() else: split_idx = int(len(text) * (split_x_abs - x1) / w) - tl, tr = text[:split_idx].strip(), text[split_idx:].strip() + tl, tr = text[:split_idx].strip(), text[split_idx:].strip() if tl and tr: - result.extend([([[x1,y1],[split_x_abs,y1],[split_x_abs,y2],[x1,y2]], tl, conf), - ([[split_x_abs,y1],[x2,y1],[x2,y2],[split_x_abs,y2]], tr, conf)]) + result.extend([ + ([[x1,y1],[split_x_abs,y1],[split_x_abs,y2],[x1,y2]], tl, conf), + ([[split_x_abs,y1],[x2,y1],[x2,y2],[split_x_abs,y2]], tr, conf)]) splits_made += 1 continue result.append((quad, text, conf)) @@ -1389,8 +1340,8 @@ def normalize_ocr_quads(ocr_list): result = [] for quad, text, conf in ocr_list: x1, y1, x2, y2 = quad_bbox(quad) - pad = 3 - new_quad = [[x1-pad, y1-pad], [x2+pad, y1-pad], [x2+pad, y2+pad], [x1-pad, y2+pad]] + pad = 3 + new_quad = [[x1-pad,y1-pad],[x2+pad,y1-pad],[x2+pad,y2+pad],[x1-pad,y2+pad]] result.append((new_quad, text, conf)) return result @@ -1401,10 +1352,12 @@ def normalize_ocr_quads(ocr_list): def preprocess_variant(crop_bgr, mode): gray = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2GRAY) if mode == "raw": return gray - if mode == "clahe": return cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8)).apply(gray) + if mode == "clahe": return cv2.createCLAHE(clipLimit=2.0, + 
tileGridSize=(8,8)).apply(gray) if mode == "adaptive": den = cv2.GaussianBlur(gray, (3,3), 0) - return cv2.adaptiveThreshold(den, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 35, 11) + return cv2.adaptiveThreshold(den, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, + cv2.THRESH_BINARY, 35, 11) if mode == "otsu": den = cv2.GaussianBlur(gray, (3,3), 0) _, th = cv2.threshold(den, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) @@ -1422,14 +1375,15 @@ def preprocess_variant(crop_bgr, mode): def rotate_image_keep_bounds(img, angle_deg): h, w = img.shape[:2] - c = (w/2, h/2) - M = cv2.getRotationMatrix2D(c, angle_deg, 1.0) + c = (w/2, h/2) + M = cv2.getRotationMatrix2D(c, angle_deg, 1.0) cos, sin = abs(M[0,0]), abs(M[0,1]) new_w = int((h*sin) + (w*cos)) new_h = int((h*cos) + (w*sin)) M[0,2] += (new_w/2) - c[0] M[1,2] += (new_h/2) - c[1] - return cv2.warpAffine(img, M, (new_w, new_h), flags=cv2.INTER_CUBIC, borderValue=255) + return cv2.warpAffine(img, M, (new_w, new_h), + flags=cv2.INTER_CUBIC, borderValue=255) def rebuild_text_from_vision_result(res): @@ -1438,7 +1392,8 @@ def rebuild_text_from_vision_result(res): for bbox, txt, conf in res: if not txt or not txt.strip(): continue b = quad_bbox(bbox) - norm.append((b, txt, conf, (b[0]+b[2])/2.0, (b[1]+b[3])/2.0, max(1.0, b[3]-b[1]))) + norm.append((b, txt, conf, + (b[0]+b[2])/2.0, (b[1]+b[3])/2.0, max(1.0, b[3]-b[1]))) if not norm: return "" med_h = float(np.median([x[5] for x in norm])) row_tol = max(6.0, med_h * 0.75) @@ -1453,22 +1408,25 @@ def rebuild_text_from_vision_result(res): placed = True; break if not placed: rows.append({"yc": it[4], "m": [it]}) rows.sort(key=lambda r: r["yc"]) - lines = [normalize_text(" ".join(x[1] for x in sorted(r["m"], key=lambda z: z[3]))) for r in rows] + lines = [normalize_text(" ".join(x[1] for x in sorted(r["m"], key=lambda z: z[3]))) + for r in rows] return normalize_text(" ".join(filter(None, lines))) -def reread_bubble_with_vision(image_bgr, bbox_xyxy, vision_detector, 
upscale=3.0, pad=24): +def reread_bubble_with_vision(image_bgr, bbox_xyxy, vision_detector, + upscale=3.0, pad=24): ih, iw = image_bgr.shape[:2] x1, y1, x2, y2 = bbox_xyxy x1, y1 = max(0, int(x1-pad)), max(0, int(y1-pad)) x2, y2 = min(iw, int(x2+pad)), min(ih, int(y2+pad)) - crop = image_bgr[y1:y2, x1:x2] + crop = image_bgr[y1:y2, x1:x2] if crop.size == 0: return None, 0.0, "none" modes = ["raw", "clahe", "adaptive", "otsu", "invert", "bilateral", "morph_open"] angles = [0.0, 1.5, -1.5] best_v_txt, best_v_sc = "", 0.0 - up0 = cv2.resize(crop, (int(crop.shape[1]*upscale), int(crop.shape[0]*upscale)), + up0 = cv2.resize(crop, + (int(crop.shape[1]*upscale), int(crop.shape[0]*upscale)), interpolation=cv2.INTER_CUBIC) for mode in modes: @@ -1510,7 +1468,9 @@ def build_lines_from_indices(indices, ocr): placed = True; break if not placed: rows.append({"yc": it[3], "m": [it]}) rows.sort(key=lambda r: r["yc"]) - return [normalize_text(" ".join(ocr[i][1] for i,_,_,_,_ in sorted(r["m"], key=lambda z: z[2]))) + return [normalize_text( + " ".join(ocr[i][1] + for i, _, _, _, _ in sorted(r["m"], key=lambda z: z[2]))) for r in rows if r["m"]] @@ -1519,16 +1479,8 @@ def auto_gap(image_path, base=18, ref_w=750): return base * (img.shape[1] / ref_w) if img is not None else base -def group_tokens_vertical(ocr, image_shape, gap_px=18, bbox_padding=1, strict_mode=False): - """ - Groups OCR quads into bubble candidates. - - Generic protections applied: - - orientation_compatible(): prevents tall/narrow glyphs merging with wide text lines. - - Horizontal gap guard: prevents side-by-side column quads from merging. - - detect_horizontal_gap_in_group(): post-merge split for groups with large internal gaps. - - Orientation check in secondary merge pass. 
- """ +def group_tokens_vertical(ocr, image_shape, gap_px=18, bbox_padding=1, + strict_mode=False): n = len(ocr) if n == 0: return {}, {}, {}, {} @@ -1547,12 +1499,12 @@ def group_tokens_vertical(ocr, image_shape, gap_px=18, bbox_padding=1, strict_mo if i in used: continue current_group = [i] used.add(i) - cx_i, cy_i = centers[i] + cx_i = centers[i][0] for j in sorted_indices: if j in used or j == i: continue cx_j, cy_j = centers[j] - if cy_j <= cy_i: continue + if cy_j <= centers[i][1]: continue if abs(cx_i - cx_j) > max_horizontal_offset: continue # Horizontal gap guard @@ -1598,7 +1550,8 @@ def group_tokens_vertical(ocr, image_shape, gap_px=18, bbox_padding=1, strict_mo else: final_groups.append(group) - final_groups.sort(key=lambda g: (min(centers[i][1] for i in g), min(centers[i][0] for i in g))) + final_groups.sort(key=lambda g: (min(centers[i][1] for i in g), + min(centers[i][0] for i in g))) bubbles, bubble_boxes, bubble_quads, bubble_indices = {}, {}, {}, {} ih, iw = image_shape[:2] @@ -1610,35 +1563,99 @@ def group_tokens_vertical(ocr, image_shape, gap_px=18, bbox_padding=1, strict_mo if ub is None: continue x1, y1, x2, y2 = ub ap = max(1, int(round(med_h * 0.16))) - bubbles[bid] = lines - bubble_boxes[bid] = (max(0,x1-ap), max(0,y1-ap), min(iw-1,x2+ap), min(ih-1,y2+ap)) - bubble_quads[bid] = quads - bubble_indices[bid]= idxs + bubbles[bid] = lines + bubble_boxes[bid] = (max(0,x1-ap), max(0,y1-ap), + min(iw-1,x2+ap), min(ih-1,y2+ap)) + bubble_quads[bid] = quads + bubble_indices[bid] = idxs return bubbles, bubble_boxes, bubble_quads, bubble_indices +# ============================================================ +# SPLIT HELPER — centralises all split strategies +# ============================================================ +def _split_bubble_if_needed(bid, bubble_indices, bubble_quads, bubble_boxes, + filtered, image, iw, ih): + """ + Attempts all split strategies in priority order. + Returns ((part1_indices, part2_indices), reason_str) or (None, None). 
+ + BOX#18 fix: split_cluster_by_big_vertical_gap factor lowered to 1.4 + so the gap between the top speech bubble and the bottom cluster triggers. + """ + indices = bubble_indices[bid] + box = bubble_boxes[bid] + + # 1. Vertical-stack gap (sensitive — catches top-vs-bottom cluster) + if is_vertical_text_like(indices, filtered): + vgap = split_cluster_by_big_vertical_gap(indices, filtered, + factor=1.4, min_gap=18) + if vgap: + return vgap, "vertical-stack y-gap" + + # 2. Panel border + sr = split_panel_box(image, box, bubble_quads=bubble_quads[bid]) + if sr: + _, _, split_x = sr + li = [idx for idx in indices if quad_center(filtered[idx][0])[0] < split_x] + ri = [idx for idx in indices if quad_center(filtered[idx][0])[0] >= split_x] + if li and ri: + return (li, ri), "panel border" + elif len(bubble_quads[bid]) >= 4: + cs = split_bubble_if_multiple_columns(indices, filtered, bid=bid, + use_aggressive_thresholds=True) + if cs: + return cs, "aggressive column" + + # 3. Column gap + cs = split_bubble_if_multiple_columns(indices, filtered, bid=bid) + if cs: + return cs, "vertical column" + + # 4. Nested / side-by-side + ns = split_nested_or_side_by_side(indices, filtered) + if ns: + return ns, "nested/side-by-side" + + # 5. Row split + rs = split_bubble_if_multiple_rows(indices, filtered, bid=bid) + if rs: + return rs, "horizontal row" + + # 6. Large vertical gap (general, less sensitive) + gy = split_cluster_by_big_vertical_gap(indices, filtered, factor=1.9, min_gap=22) + if gy: + return gy, "large vertical-gap" + + return None, None + + # ============================================================ # DEBUG / EXPORT # ============================================================ def save_debug_clusters(image_path, ocr, bubble_boxes, bubble_indices, clean_lines=None, out_path="debug_clusters.png"): + """ + Draws all detected boxes. + Single-quad boxes are drawn in orange for visibility but are NOT + labelled as (ISOLATED) — they participate fully in merge passes. 
+ """ img = cv2.imread(image_path) if img is None: return for bbox, txt, conf in ocr: pts = np.array(bbox, dtype=np.int32) - cv2.fillPoly(img, [pts], (255,255,255)) - cv2.polylines(img, [pts], True, (180,180,180), 1) + cv2.fillPoly(img, [pts], (255, 255, 255)) + cv2.polylines(img, [pts], True, (180, 180, 180), 1) for bid, bb in bubble_boxes.items(): x1, y1, x2, y2 = bb - is_isolated = len(bubble_indices.get(bid, [])) == 1 - color = (255,165,0) if is_isolated else (0,220,0) - thickness = 3 if is_isolated else 2 - cv2.rectangle(img, (x1,y1), (x2,y2), color, thickness) - label = f"BOX#{bid}" + (" (ISOLATED)" if is_isolated else "") - cv2.putText(img, label, (x1+2, max(15, y1+16)), + n_quads = len(bubble_indices.get(bid, [])) + color = (255, 165, 0) if n_quads == 1 else (0, 220, 0) + thickness = 3 if n_quads == 1 else 2 + cv2.rectangle(img, (x1, y1), (x2, y2), color, thickness) + cv2.putText(img, f"BOX#{bid}", (x1+2, max(15, y1+16)), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2) if clean_lines and bid in clean_lines: @@ -1651,15 +1668,18 @@ def save_debug_clusters(image_path, ocr, bubble_boxes, bubble_indices, if cur: lines.append(cur.strip()) y_text = y2 + 18 for line in lines: - cv2.putText(img, line, (x1, y_text), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0,0,0), 3) - cv2.putText(img, line, (x1, y_text), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255,0,0), 1) + cv2.putText(img, line, (x1, y_text), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0,0,0), 3) + cv2.putText(img, line, (x1, y_text), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255,0,0), 1) y_text += 18 cv2.imwrite(out_path, img) def estimate_reading_order(bbox_dict, mode="ltr"): - items = [(bid, (bb[0]+bb[2])/2.0, (bb[1]+bb[3])/2.0) for bid, bb in bbox_dict.items()] + items = [(bid, (bb[0]+bb[2])/2.0, (bb[1]+bb[3])/2.0) + for bid, bb in bbox_dict.items()] items.sort(key=lambda t: t[2]) rows, tol = [], 90 for it in items: @@ -1704,6 +1724,7 @@ def translate_manga_text( print(f"❌ Cannot load image: {image_path}"); return resolved_gap = auto_gap(image_path) 
if gap_px == "auto" else float(gap_px) + ih, iw = image.shape[:2] print("Loading OCR engines...") if use_enhanced_ocr: @@ -1722,29 +1743,28 @@ def translate_manga_text( if missed_regions: print(f"🔍 Found {len(missed_regions)} potentially missed text regions") for region in missed_regions: - x1, y1, x2, y2 = region + rx1, ry1, rx2, ry2 = region pad = 10 - x1, y1 = max(0, x1-pad), max(0, y1-pad) - x2, y2 = min(image.shape[1], x2+pad), min(image.shape[0], y2+pad) - crop = image[y1:y2, x1:x2] + rx1, ry1 = max(0, rx1-pad), max(0, ry1-pad) + rx2, ry2 = min(iw, rx2+pad), min(ih, ry2+pad) + crop = image[ry1:ry2, rx1:rx2] if crop.size > 0: upscaled = cv2.resize(crop, None, fx=4.0, fy=4.0, interpolation=cv2.INTER_CUBIC) for quad, text, conf in detector.run_vision_ocr(upscaled): - raw.append(([[int(p[0]/4.0+x1), int(p[1]/4.0+y1)] for p in quad], - text, conf)) + raw.append(([[int(p[0]/4.0+rx1), int(p[1]/4.0+ry1)] + for p in quad], text, conf)) print(f"📝 Total detections after missed region scan: {len(raw)}") + # ── Filtering ───────────────────────────────────────────────────────── filtered, skipped = [], 0 - ih, iw = image.shape[:2] - for bbox, text, conf in raw: t = normalize_text(text) qb = quad_bbox(bbox) - if conf < confidence_threshold: skipped += 1; continue - if len(t) < min_text_length: skipped += 1; continue - if not is_valid_language(t, source_lang): skipped += 1; continue - if not is_meaningful_text(t, source_lang):skipped += 1; continue + if conf < confidence_threshold: skipped += 1; continue + if len(t) < min_text_length: skipped += 1; continue + if not is_valid_language(t, source_lang): skipped += 1; continue + if not is_meaningful_text(t, source_lang): skipped += 1; continue if qb[1] < int(ih * TOP_BAND_RATIO) and conf < 0.70 and len(t) >= 5: skipped += 1; continue filtered.append((bbox, t, conf)) @@ -1758,105 +1778,56 @@ def translate_manga_text( if oversized_splits > 0: print(f"📐 Split {oversized_splits} oversized quad(s) before grouping") - filtered, 
splits_made = split_wide_ocr_items(image, filtered) - if splits_made > 0: - print(f"✂️ Split {splits_made} wide OCR lines across column gaps.") + filtered, wide_splits = split_wide_ocr_items(image, filtered) + if wide_splits > 0: + print(f"✂️ Split {wide_splits} wide OCR lines across column gaps.") filtered, bridge_splits = split_abnormal_bridge_quads(image, filtered) if bridge_splits > 0: print(f"🧩 Split {bridge_splits} abnormal bridge OCR quad(s).") - # ── Column-gap split: catches BOX#6 type wide quads spanning two columns ── - hs_pre = [max(1, quad_bbox(q)[3]-quad_bbox(q)[1]) for q, _, _ in filtered] + # Column-gap split: catches wide quads spanning two columns (BOX#6 type) + hs_pre = [max(1, quad_bbox(q)[3]-quad_bbox(q)[1]) for q, _, _ in filtered] med_h_pre = float(np.median(hs_pre)) if hs_pre else 14.0 filtered, col_splits = apply_column_gap_splits(image, filtered, med_h_pre) - if col_splits > 0: - print(f"📐 Column-gap split: {col_splits} quad(s) split before grouping") filtered = normalize_ocr_quads(filtered) + # ── Grouping ────────────────────────────────────────────────────────── print("📊 Grouping quads vertically...") bubbles, bubble_boxes, bubble_quads, bubble_indices = group_tokens_vertical( - filtered, image.shape, gap_px=resolved_gap, bbox_padding=1, strict_mode=strict_grouping - ) + filtered, image.shape, gap_px=resolved_gap, + bbox_padding=1, strict_mode=strict_grouping) print(f" Created {len(bubbles)} initial box(es)") + # ── Auto-fix (split + merge) ────────────────────────────────────────── if auto_fix_bubbles: bubbles, bubble_boxes, bubble_quads, bubble_indices = auto_fix_bubble_detection( - bubble_boxes, bubble_indices, bubble_quads, bubbles, filtered, image - ) + bubble_boxes, bubble_indices, bubble_quads, bubbles, filtered, image) + # ── Enforce max box size ────────────────────────────────────────────── bubbles, bubble_boxes, bubble_quads, bubble_indices = enforce_max_box_size( bubble_boxes, bubble_indices, bubble_quads, bubbles, filtered, 
max_width_ratio=max_box_width_ratio, max_height_ratio=max_box_height_ratio, - image_shape=image.shape - ) + image_shape=image.shape) + # ── Close-proximity merge ───────────────────────────────────────────── bubbles, bubble_boxes, bubble_quads, bubble_indices = merge_close_bubbles_by_line_height( - bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered - ) + bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered) + # ── Per-bubble split pass ───────────────────────────────────────────── new_bubbles, new_bubble_boxes, new_bubble_quads, new_bubble_indices = {}, {}, {}, {} next_bid = max(bubbles.keys()) + 1 if bubbles else 1 splits_performed = [] for bid in list(bubbles.keys()): - box = bubble_boxes[bid] - bubble_split = None + split_result, split_reason = _split_bubble_if_needed( + bid, bubble_indices, bubble_quads, bubble_boxes, filtered, image, iw, ih) - if is_vertical_text_like(bubble_indices[bid], filtered): - vgap = split_cluster_by_big_vertical_gap(bubble_indices[bid], filtered, - factor=1.7, min_gap=18) - if vgap: - bubble_split = vgap - splits_performed.append(f"BOX#{bid} (vertical-stack y-gap)") - - if bubble_split is None: - sr = split_panel_box(image, box, bubble_quads=bubble_quads[bid]) - if sr: - _, _, split_x = sr - li = [idx for idx in bubble_indices[bid] - if quad_center(filtered[idx][0])[0] < split_x] - ri = [idx for idx in bubble_indices[bid] - if quad_center(filtered[idx][0])[0] >= split_x] - if li and ri: - bubble_split = (li, ri) - splits_performed.append(f"BOX#{bid} (panel border)") - elif len(bubble_quads[bid]) >= 4: - cs = split_bubble_if_multiple_columns(bubble_indices[bid], filtered, - bid=bid, use_aggressive_thresholds=True) - if cs: - bubble_split = cs - splits_performed.append(f"BOX#{bid} (aggressive column)") - - if bubble_split is None: - cs = split_bubble_if_multiple_columns(bubble_indices[bid], filtered, bid=bid) - if cs: - bubble_split = cs - splits_performed.append(f"BOX#{bid} (vertical column)") - - if bubble_split is 
None: - ns = split_nested_or_side_by_side(bubble_indices[bid], filtered) - if ns: - bubble_split = ns - splits_performed.append(f"BOX#{bid} (nested/side-by-side)") - - if bubble_split is None: - rs = split_bubble_if_multiple_rows(bubble_indices[bid], filtered, bid=bid) - if rs: - bubble_split = rs - splits_performed.append(f"BOX#{bid} (horizontal row)") - - if bubble_split is None: - gy = split_cluster_by_big_vertical_gap(bubble_indices[bid], filtered, - factor=1.9, min_gap=22) - if gy: - bubble_split = gy - splits_performed.append(f"BOX#{bid} (large vertical-gap)") - - if bubble_split: - p1, p2 = bubble_split + if split_result: + p1, p2 = split_result + splits_performed.append(f"BOX#{bid} ({split_reason})") for part_idxs, part_bid in [(p1, bid), (p2, next_bid)]: ub = boxes_union_xyxy([quad_bbox(filtered[i][0]) for i in part_idxs]) new_bubbles[part_bid] = build_lines_from_indices(part_idxs, filtered) @@ -1873,11 +1844,12 @@ def translate_manga_text( if splits_performed: print(f"\n🔀 Splits detected: {len(splits_performed)}") + for s in splits_performed: print(f" ✓ {s}") + # ── Remove nested / duplicate boxes ────────────────────────────────── bubbles, bubble_boxes, bubble_quads, bubble_indices = remove_nested_boxes( new_bubble_boxes, new_bubble_indices, new_bubble_quads, new_bubbles, - overlap_threshold=0.50 - ) + overlap_threshold=0.50) print(f"✅ Final box count: {len(bubbles)}") # ── OCR quality pass ────────────────────────────────────────────────── @@ -1900,7 +1872,7 @@ def translate_manga_text( reading_map = estimate_reading_order(bubble_boxes, mode=reading_mode) - # ── Single-pass translation cache ──────────────────────────────────── + # ── Translation ─────────────────────────────────────────────────────── for bid in sorted(clean_lines.keys(), key=lambda x: reading_map.get(x, x)): src_txt = clean_lines[bid].strip() if not src_txt: continue @@ -1936,8 +1908,9 @@ def translate_manga_text( src_u = src_txt.upper() src_engine = sources_used.get(bid, "unknown") - 
out_lines.append(f"#{bid}|{reading_map.get(bid,bid)}|{src_engine}|{src_u}|{tgt}|" - f"{','.join(flags) if flags else '-'}") + out_lines.append( + f"#{bid}|{reading_map.get(bid,bid)}|{src_engine}|{src_u}|{tgt}|" + f"{','.join(flags) if flags else '-'}") print(f"#{bid:<7} {reading_map.get(bid,bid):<6} {src_engine:<12} " f"{src_u[:40]:<40} {tgt[:40]:<40} {','.join(flags) if flags else '-'}") translated_count += 1 @@ -1980,7 +1953,7 @@ def translate_manga_text( # ============================================================ if __name__ == "__main__": translate_manga_text( - image_path="17.jpg", + image_path="19.png", source_lang="english", target_lang="ca", confidence_threshold=0.03, @@ -1997,3 +1970,917 @@ if __name__ == "__main__": max_box_height_ratio=0.5, auto_fix_bubbles=True ) + +def split_bubble_if_multiple_rows(indices, ocr, bid=None): + if len(indices) < 2: return None + boxes = [quad_bbox(ocr[i][0]) for i in indices] + hs = [max(1, b[3]-b[1]) for b in boxes] + med_h = float(np.median(hs)) if hs else 12.0 + ys = [(b[1]+b[3])/2.0 for b in boxes] + ys_sorted = sorted(ys) + gap_thresh = max(med_h * 2.0, 30) + best_gap_idx, best_gap_size = None, 0.0 + for i in range(len(ys_sorted) - 1): + gap = ys_sorted[i+1] - ys_sorted[i] + if gap > gap_thresh and gap > best_gap_size: + best_gap_size, best_gap_idx = gap, i + if best_gap_idx is None: return None + split_y = (ys_sorted[best_gap_idx] + ys_sorted[best_gap_idx+1]) / 2.0 + top_idxs = [i for i in indices + if (quad_bbox(ocr[i][0])[1]+quad_bbox(ocr[i][0])[3])/2.0 < split_y] + bot_idxs = [i for i in indices + if (quad_bbox(ocr[i][0])[1]+quad_bbox(ocr[i][0])[3])/2.0 >= split_y] + if not top_idxs or not bot_idxs: return None + return (top_idxs, bot_idxs) + + +def split_cluster_by_big_vertical_gap(indices, ocr, factor=1.9, min_gap=22): + if len(indices) < 2: return None + boxes = [quad_bbox(ocr[i][0]) for i in indices] + hs = [max(1, b[3]-b[1]) for b in boxes] + med_h = float(np.median(hs)) if hs else 12.0 + items = 
sorted([(i, quad_bbox(ocr[i][0])) for i in indices], + key=lambda x: (x[1][1]+x[1][3])/2.0) + gap_thresh = max(med_h * factor, min_gap) + best_gap, best_split_idx = 0.0, None + for k in range(len(items) - 1): + gap = items[k+1][1][1] - items[k][1][3] + if gap > gap_thresh and gap > best_gap: + best_gap, best_split_idx = gap, k + if best_split_idx is None: return None + top_idxs = [it[0] for it in items[:best_split_idx+1]] + bot_idxs = [it[0] for it in items[best_split_idx+1:]] + if not top_idxs or not bot_idxs: return None + return (top_idxs, bot_idxs) + + +def is_vertical_text_like(indices, ocr): + if len(indices) < 2: return False + boxes = [quad_bbox(ocr[i][0]) for i in indices] + med_h = float(np.median([max(1, b[3]-b[1]) for b in boxes])) + med_w = float(np.median([max(1, b[2]-b[0]) for b in boxes])) + if med_h < med_w * 1.2: return False + xs = [(b[0]+b[2])/2.0 for b in boxes] + ys = [(b[1]+b[3])/2.0 for b in boxes] + if (max(ys)-min(ys)) < (max(xs)-min(xs)) * 1.5: return False + return True + + +def split_nested_or_side_by_side(indices, ocr): + if len(indices) < 2: return None + xs = sorted([(quad_bbox(ocr[i][0])[0]+quad_bbox(ocr[i][0])[2])/2.0 + for i in indices]) + mid_idx = len(xs) // 2 + split_x = (xs[mid_idx-1] + xs[mid_idx]) / 2.0 + left_idxs = [i for i in indices + if (quad_bbox(ocr[i][0])[0]+quad_bbox(ocr[i][0])[2])/2.0 < split_x] + right_idxs = [i for i in indices + if (quad_bbox(ocr[i][0])[0]+quad_bbox(ocr[i][0])[2])/2.0 >= split_x] + if not left_idxs or not right_idxs: return None + return (left_idxs, right_idxs) + + +def split_panel_box(image_bgr, box_xyxy, bubble_quads=None): + x1, y1, x2, y2 = box_xyxy + ih, iw = image_bgr.shape[:2] + x1, y1 = max(0, x1), max(0, y1) + x2, y2 = min(iw-1, x2), min(ih-1, y2) + if x2 <= x1 or y2 <= y1: return None + crop = image_bgr[y1:y2, x1:x2] + if crop.size == 0: return None + gray = cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY) + edges = cv2.Canny(gray, 50, 150) + h_proj = np.sum(edges, axis=0) + w = x2 - x1 + if w < 
100: return None + search_start = int(w * 0.35) + search_end = int(w * 0.65) + if search_end <= search_start: return None + region = h_proj[search_start:search_end] + if len(region) == 0: return None + threshold = np.percentile(region, 85) + candidates = [x1 + search_start + rx + for rx in range(len(region)) if region[rx] >= threshold] + if not candidates: return None + split_x = int(np.median(candidates)) + if bubble_quads: + lc = sum(1 for q in bubble_quads if quad_center(q)[0] < split_x) + rc = len(bubble_quads) - lc + if lc == 0 or rc == 0: return None + return (x1, x2, split_x) + + +# ============================================================ +# MERGE CLOSE BUBBLES +# ============================================================ +def merge_close_bubbles_by_line_height(bubbles, bubble_boxes, bubble_quads, + bubble_indices, ocr): + """ + Merges boxes that are spatially very close on BOTH axes AND share + meaningful horizontal overlap (same column). + + Single-quad boxes participate fully — no special isolation treatment. + The h_overlap_ratio >= 0.25 guard prevents merging horizontally + adjacent distinct bubbles. 
+ """ + if not bubbles: + return bubbles, bubble_boxes, bubble_quads, bubble_indices + + all_h = [max(1, quad_bbox(ocr[i][0])[3]-quad_bbox(ocr[i][0])[1]) + for i in range(len(ocr))] + med_h = float(np.median(all_h)) if all_h else 14.0 + merge_tol = max(8, med_h * 1.4) + + bids = sorted(bubble_boxes.keys()) + merged_set, merge_map = set(), {} + + for i, bid_i in enumerate(bids): + if bid_i in merged_set: continue + x1_i, y1_i, x2_i, y2_i = bubble_boxes[bid_i] + wi = max(1, x2_i - x1_i) + + for j in range(i + 1, len(bids)): + bid_j = bids[j] + if bid_j in merged_set: continue + x1_j, y1_j, x2_j, y2_j = bubble_boxes[bid_j] + wj = max(1, x2_j - x1_j) + + gap_x = max(0, max(x1_i, x1_j) - min(x2_i, x2_j)) + gap_y = max(0, max(y1_i, y1_j) - min(y2_i, y2_j)) + + h_ix1 = max(x1_i, x1_j) + h_ix2 = min(x2_i, x2_j) + h_overlap = max(0, h_ix2 - h_ix1) + h_overlap_ratio = h_overlap / max(1, min(wi, wj)) + + if gap_x <= merge_tol and gap_y <= merge_tol and h_overlap_ratio >= 0.25: + if bid_i not in merge_map: + merge_map[bid_i] = [bid_i] + merge_map[bid_i].append(bid_j) + merged_set.add(bid_j) + + if not merge_map: + return bubbles, bubble_boxes, bubble_quads, bubble_indices + + new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {} + next_bid = 1 + for bid in bids: + if bid in merged_set: continue + if bid in merge_map: + group = merge_map[bid] + all_indices = sorted(set(idx for b in group for idx in bubble_indices[b])) + new_bubbles[next_bid] = build_lines_from_indices(all_indices, ocr) + new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in all_indices]) + new_quads[next_bid] = [ocr[i][0] for i in all_indices] + new_indices[next_bid] = all_indices + else: + new_bubbles[next_bid] = bubbles[bid] + new_boxes[next_bid] = bubble_boxes[bid] + new_quads[next_bid] = bubble_quads[bid] + new_indices[next_bid] = bubble_indices[bid] + next_bid += 1 + + return new_bubbles, new_boxes, new_quads, new_indices + + +# 
============================================================ +# WIDE / BRIDGE QUAD SPLITTING +# ============================================================ +def split_wide_ocr_items(image_bgr, ocr_list, width_factor=8.0): + if not ocr_list: return ocr_list, 0 + hs = [max(1, quad_bbox(q)[3]-quad_bbox(q)[1]) for q, _, _ in ocr_list] + med_h = float(np.median(hs)) if hs else 14.0 + result, splits_made = [], 0 + + for quad, text, conf in ocr_list: + x1, y1, x2, y2 = quad_bbox(quad) + w = x2 - x1 + if w > med_h * width_factor: + pad = 2 + roi = image_bgr[max(0,y1-pad):min(image_bgr.shape[0],y2+pad), + max(0,x1):min(image_bgr.shape[1],x2)] + if roi.size > 0: + gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY) + _, binary = cv2.threshold(gray, 0, 255, + cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU) + v_proj = np.sum(binary, axis=0) + gap_threshold = roi.shape[0] * 255 * 0.15 + gaps, in_gap, gap_start = [], False, 0 + for x in range(len(v_proj)): + if v_proj[x] < gap_threshold: + if not in_gap: gap_start, in_gap = x, True + else: + if in_gap: + gw = x - gap_start + if gw >= max(int(med_h * 0.6), 12): + gaps.append((gap_start + gw // 2, gw)) + in_gap = False + if gaps: + gaps.sort(key=lambda g: g[1], reverse=True) + split_x_abs = max(0, x1) + gaps[0][0] + if ' ' in text: + char_w = w / max(1, len(text)) + split_idx = int((split_x_abs - x1) / max(1e-6, char_w)) + spaces = [i for i, c in enumerate(text) if c == ' '] + if spaces: + split_idx = min(spaces, key=lambda i: abs(i - split_idx)) + tl, tr = text[:split_idx].strip(), text[split_idx:].strip() + else: + split_idx = int(len(text) * (split_x_abs - x1) / w) + tl, tr = text[:split_idx].strip(), text[split_idx:].strip() + if tl and tr: + result.extend([ + ([[x1,y1],[split_x_abs,y1],[split_x_abs,y2],[x1,y2]], tl, conf), + ([[split_x_abs,y1],[x2,y1],[x2,y2],[split_x_abs,y2]], tr, conf)]) + splits_made += 1 + continue + result.append((quad, text, conf)) + return result, splits_made + + +def split_abnormal_bridge_quads(image_bgr, 
ocr_list, aspect_ratio_threshold=6.0): + if not ocr_list: return ocr_list, 0 + hs = [max(1, quad_bbox(q)[3]-quad_bbox(q)[1]) for q, _, _ in ocr_list] + med_h = float(np.median(hs)) if hs else 14.0 + result, splits_made = [], 0 + + for quad, text, conf in ocr_list: + x1, y1, x2, y2 = quad_bbox(quad) + w, h = x2 - x1, max(1, y2 - y1) + if w / h > aspect_ratio_threshold: + pad = 2 + roi = image_bgr[max(0,y1-pad):min(image_bgr.shape[0],y2+pad), + max(0,x1):min(image_bgr.shape[1],x2)] + if roi.size > 0: + gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY) + _, binary = cv2.threshold(gray, 0, 255, + cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU) + v_proj = np.sum(binary, axis=0) + gap_threshold = h * 255 * 0.20 + gaps, in_gap, gap_start = [], False, 0 + for x in range(len(v_proj)): + if v_proj[x] < gap_threshold: + if not in_gap: gap_start, in_gap = x, True + else: + if in_gap: + gw = x - gap_start + if gw >= max(int(med_h * 0.8), 15): + gaps.append((gap_start + gw // 2, gw)) + in_gap = False + if gaps: + gaps.sort(key=lambda g: g[1], reverse=True) + split_x_abs = max(0, x1) + gaps[0][0] + if ' ' in text: + char_w = w / max(1, len(text)) + split_idx = int((split_x_abs - x1) / max(1e-6, char_w)) + spaces = [i for i, c in enumerate(text) if c == ' '] + if spaces: + split_idx = min(spaces, key=lambda i: abs(i - split_idx)) + tl, tr = text[:split_idx].strip(), text[split_idx:].strip() + else: + split_idx = int(len(text) * (split_x_abs - x1) / w) + tl, tr = text[:split_idx].strip(), text[split_idx:].strip() + if tl and tr: + result.extend([ + ([[x1,y1],[split_x_abs,y1],[split_x_abs,y2],[x1,y2]], tl, conf), + ([[split_x_abs,y1],[x2,y1],[x2,y2],[split_x_abs,y2]], tr, conf)]) + splits_made += 1 + continue + result.append((quad, text, conf)) + return result, splits_made + + +def normalize_ocr_quads(ocr_list): + result = [] + for quad, text, conf in ocr_list: + x1, y1, x2, y2 = quad_bbox(quad) + pad = 3 + new_quad = [[x1-pad,y1-pad],[x2+pad,y1-pad],[x2+pad,y2+pad],[x1-pad,y2+pad]] + 
result.append((new_quad, text, conf))
+    return result
+
+
+# ============================================================
+# VISION RE-READ
+# ============================================================
+def preprocess_variant(crop_bgr, mode):
+    gray = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2GRAY)
+    if mode == "raw": return gray
+    if mode == "clahe": return cv2.createCLAHE(clipLimit=2.0,
+                                               tileGridSize=(8,8)).apply(gray)
+    if mode == "adaptive":
+        den = cv2.GaussianBlur(gray, (3,3), 0)
+        return cv2.adaptiveThreshold(den, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+                                     cv2.THRESH_BINARY, 35, 11)
+    if mode == "otsu":
+        den = cv2.GaussianBlur(gray, (3,3), 0)
+        _, th = cv2.threshold(den, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+        return th
+    if mode == "invert": return 255 - gray
+    if mode == "bilateral":
+        den = cv2.bilateralFilter(gray, 7, 60, 60)
+        _, th = cv2.threshold(den, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+        return th
+    if mode == "morph_open":
+        _, th = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+        return cv2.morphologyEx(th, cv2.MORPH_OPEN, np.ones((2,2), np.uint8))
+    return gray
+
+
+def rotate_image_keep_bounds(img, angle_deg):
+    h, w = img.shape[:2]
+    c = (w/2, h/2)
+    M = cv2.getRotationMatrix2D(c, angle_deg, 1.0)
+    cos, sin = abs(M[0,0]), abs(M[0,1])
+    new_w = int((h*sin) + (w*cos))
+    new_h = int((h*cos) + (w*sin))
+    M[0,2] += (new_w/2) - c[0]
+    M[1,2] += (new_h/2) - c[1]
+    return cv2.warpAffine(img, M, (new_w, new_h),
+                          flags=cv2.INTER_CUBIC, borderValue=(255, 255, 255))
+
+
+def rebuild_text_from_vision_result(res):
+    if not res: return ""
+    norm = []
+    for bbox, txt, conf in res:
+        if not txt or not txt.strip(): continue
+        b = quad_bbox(bbox)
+        norm.append((b, txt, conf,
+                     (b[0]+b[2])/2.0, (b[1]+b[3])/2.0, max(1.0, b[3]-b[1])))
+    if not norm: return ""
+    med_h = float(np.median([x[5] for x in norm]))
+    row_tol = max(6.0, med_h * 0.75)
+    norm.sort(key=lambda z: z[4])
+    rows = []
+    for it in norm:
+        placed = False
+        for r in rows:
+            if 
abs(it[4] - r["yc"]) <= row_tol: + r["m"].append(it) + r["yc"] = float(np.mean([k[4] for k in r["m"]])) + placed = True; break + if not placed: rows.append({"yc": it[4], "m": [it]}) + rows.sort(key=lambda r: r["yc"]) + lines = [normalize_text(" ".join(x[1] for x in sorted(r["m"], key=lambda z: z[3]))) + for r in rows] + return normalize_text(" ".join(filter(None, lines))) + + +def reread_bubble_with_vision(image_bgr, bbox_xyxy, vision_detector, + upscale=3.0, pad=24): + ih, iw = image_bgr.shape[:2] + x1, y1, x2, y2 = bbox_xyxy + x1, y1 = max(0, int(x1-pad)), max(0, int(y1-pad)) + x2, y2 = min(iw, int(x2+pad)), min(ih, int(y2+pad)) + crop = image_bgr[y1:y2, x1:x2] + if crop.size == 0: return None, 0.0, "none" + + modes = ["raw", "clahe", "adaptive", "otsu", "invert", "bilateral", "morph_open"] + angles = [0.0, 1.5, -1.5] + best_v_txt, best_v_sc = "", 0.0 + up0 = cv2.resize(crop, + (int(crop.shape[1]*upscale), int(crop.shape[0]*upscale)), + interpolation=cv2.INTER_CUBIC) + + for mode in modes: + proc = preprocess_variant(up0, mode) + proc3 = cv2.cvtColor(proc, cv2.COLOR_GRAY2BGR) if len(proc.shape) == 2 else proc + for a in angles: + rot = rotate_image_keep_bounds(proc3, a) + res = (vision_detector.run_vision_ocr(rot) + if hasattr(vision_detector, 'run_vision_ocr') + else vision_detector.read(rot)) + txt = rebuild_text_from_vision_result(res) + sc = ocr_candidate_score(txt) + if sc > best_v_sc: + best_v_txt, best_v_sc = txt, sc + + if best_v_txt: return best_v_txt, best_v_sc, "vision-reread" + return None, 0.0, "none" + + +# ============================================================ +# LINES + BUBBLES +# ============================================================ +def build_lines_from_indices(indices, ocr): + if not indices: return [] + items = [] + for i in indices: + b = quad_bbox(ocr[i][0]) + items.append((i, b, (b[0]+b[2])/2.0, (b[1]+b[3])/2.0, max(1.0, b[3]-b[1]))) + med_h = float(np.median([it[4] for it in items])) if items else 10.0 + row_tol = max(6.0, 
med_h * 0.75) + items.sort(key=lambda x: x[3]) + rows = [] + for it in items: + placed = False + for r in rows: + if abs(it[3] - r["yc"]) <= row_tol: + r["m"].append(it) + r["yc"] = float(np.mean([k[3] for k in r["m"]])) + placed = True; break + if not placed: rows.append({"yc": it[3], "m": [it]}) + rows.sort(key=lambda r: r["yc"]) + return [normalize_text( + " ".join(ocr[i][1] + for i, _, _, _, _ in sorted(r["m"], key=lambda z: z[2]))) + for r in rows if r["m"]] + + +def auto_gap(image_path, base=18, ref_w=750): + img = cv2.imread(image_path) + return base * (img.shape[1] / ref_w) if img is not None else base + + +def group_tokens_vertical(ocr, image_shape, gap_px=18, bbox_padding=1, + strict_mode=False): + n = len(ocr) + if n == 0: return {}, {}, {}, {} + + boxes = [quad_bbox(r[0]) for r in ocr] + centers = [quad_center(r[0]) for r in ocr] + hs = [max(1.0, b[3]-b[1]) for b in boxes] + med_h = float(np.median(hs)) if hs else 12.0 + + max_vertical_gap = med_h * 2.5 if not strict_mode else med_h * 2.0 + max_horizontal_offset = med_h * 1.8 + + sorted_indices = sorted(range(n), key=lambda i: (centers[i][1], centers[i][0])) + groups, used = [], set() + + for i in sorted_indices: + if i in used: continue + current_group = [i] + used.add(i) + cx_i = centers[i][0] + + for j in sorted_indices: + if j in used or j == i: continue + cx_j, cy_j = centers[j] + if cy_j <= centers[i][1]: continue + if abs(cx_i - cx_j) > max_horizontal_offset: continue + + # Horizontal gap guard + gap_x = max(0, max(boxes[i][0], boxes[j][0]) - min(boxes[i][2], boxes[j][2])) + if gap_x > med_h * 1.5: continue + + # Orientation compatibility guard + if not orientation_compatible(i, j, ocr): continue + + vertical_gap = boxes[j][1] - boxes[current_group[-1]][3] + if vertical_gap <= max_vertical_gap: + current_group.append(j) + used.add(j) + cx_i = (cx_i + cx_j) / 2.0 + + if current_group: + groups.append(current_group) + + # Secondary merge pass + merged_groups, used_groups = [], set() + for i, group1 
in enumerate(groups): + if i in used_groups: continue + merged = list(group1) + used_groups.add(i) + for j, group2 in enumerate(groups): + if i == j or j in used_groups: continue + if should_merge_groups(merged, group2, ocr, med_h, max_vertical_gap): + compat = all(orientation_compatible(a, b, ocr) + for a in merged for b in group2) + if compat: + merged.extend(group2) + used_groups.add(j) + merged_groups.append(sorted(merged, key=lambda idx: centers[idx][1])) + + # Horizontal gap split pass + final_groups = [] + for group in merged_groups: + h_split = detect_horizontal_gap_in_group(group, ocr, med_h, gap_factor=2.5) + if h_split: + lg, rg = h_split + final_groups.append(sorted(lg, key=lambda idx: centers[idx][1])) + final_groups.append(sorted(rg, key=lambda idx: centers[idx][1])) + else: + final_groups.append(group) + + final_groups.sort(key=lambda g: (min(centers[i][1] for i in g), + min(centers[i][0] for i in g))) + + bubbles, bubble_boxes, bubble_quads, bubble_indices = {}, {}, {}, {} + ih, iw = image_shape[:2] + + for bid, idxs in enumerate(final_groups, start=1): + lines = build_lines_from_indices(idxs, ocr) + quads = [ocr[k][0] for k in idxs] + ub = boxes_union_xyxy([quad_bbox(q) for q in quads]) + if ub is None: continue + x1, y1, x2, y2 = ub + ap = max(1, int(round(med_h * 0.16))) + bubbles[bid] = lines + bubble_boxes[bid] = (max(0,x1-ap), max(0,y1-ap), + min(iw-1,x2+ap), min(ih-1,y2+ap)) + bubble_quads[bid] = quads + bubble_indices[bid] = idxs + + return bubbles, bubble_boxes, bubble_quads, bubble_indices + + +# ============================================================ +# SPLIT HELPER — centralises all split strategies +# ============================================================ +def _split_bubble_if_needed(bid, bubble_indices, bubble_quads, bubble_boxes, + filtered, image, iw, ih): + """ + Attempts all split strategies in priority order. + Returns ((part1_indices, part2_indices), reason_str) or (None, None). 
+ + BOX#18 fix: split_cluster_by_big_vertical_gap factor lowered to 1.4 + so the gap between the top speech bubble and the bottom cluster triggers. + """ + indices = bubble_indices[bid] + box = bubble_boxes[bid] + + # 1. Vertical-stack gap (sensitive — catches top-vs-bottom cluster) + if is_vertical_text_like(indices, filtered): + vgap = split_cluster_by_big_vertical_gap(indices, filtered, + factor=1.4, min_gap=18) + if vgap: + return vgap, "vertical-stack y-gap" + + # 2. Panel border + sr = split_panel_box(image, box, bubble_quads=bubble_quads[bid]) + if sr: + _, _, split_x = sr + li = [idx for idx in indices if quad_center(filtered[idx][0])[0] < split_x] + ri = [idx for idx in indices if quad_center(filtered[idx][0])[0] >= split_x] + if li and ri: + return (li, ri), "panel border" + elif len(bubble_quads[bid]) >= 4: + cs = split_bubble_if_multiple_columns(indices, filtered, bid=bid, + use_aggressive_thresholds=True) + if cs: + return cs, "aggressive column" + + # 3. Column gap + cs = split_bubble_if_multiple_columns(indices, filtered, bid=bid) + if cs: + return cs, "vertical column" + + # 4. Nested / side-by-side + ns = split_nested_or_side_by_side(indices, filtered) + if ns: + return ns, "nested/side-by-side" + + # 5. Row split + rs = split_bubble_if_multiple_rows(indices, filtered, bid=bid) + if rs: + return rs, "horizontal row" + + # 6. Large vertical gap (general, less sensitive) + gy = split_cluster_by_big_vertical_gap(indices, filtered, factor=1.9, min_gap=22) + if gy: + return gy, "large vertical-gap" + + return None, None + + +# ============================================================ +# DEBUG / EXPORT +# ============================================================ +def save_debug_clusters(image_path, ocr, bubble_boxes, bubble_indices, + clean_lines=None, out_path="debug_clusters.png"): + """ + Draws all detected boxes. + Single-quad boxes are drawn in orange for visibility but are NOT + labelled as (ISOLATED) — they participate fully in merge passes. 
+ """ + img = cv2.imread(image_path) + if img is None: return + + for bbox, txt, conf in ocr: + pts = np.array(bbox, dtype=np.int32) + cv2.fillPoly(img, [pts], (255, 255, 255)) + cv2.polylines(img, [pts], True, (180, 180, 180), 1) + + for bid, bb in bubble_boxes.items(): + x1, y1, x2, y2 = bb + n_quads = len(bubble_indices.get(bid, [])) + color = (255, 165, 0) if n_quads == 1 else (0, 220, 0) + thickness = 3 if n_quads == 1 else 2 + cv2.rectangle(img, (x1, y1), (x2, y2), color, thickness) + cv2.putText(img, f"BOX#{bid}", (x1+2, max(15, y1+16)), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2) + + if clean_lines and bid in clean_lines: + text = clean_lines[bid] + words = text.split() + lines, cur = [], "" + for w in words: + if len(cur) + len(w) < 25: cur += w + " " + else: lines.append(cur.strip()); cur = w + " " + if cur: lines.append(cur.strip()) + y_text = y2 + 18 + for line in lines: + cv2.putText(img, line, (x1, y_text), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0,0,0), 3) + cv2.putText(img, line, (x1, y_text), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255,0,0), 1) + y_text += 18 + + cv2.imwrite(out_path, img) + + +def estimate_reading_order(bbox_dict, mode="ltr"): + items = [(bid, (bb[0]+bb[2])/2.0, (bb[1]+bb[3])/2.0) + for bid, bb in bbox_dict.items()] + items.sort(key=lambda t: t[2]) + rows, tol = [], 90 + for it in items: + placed = False + for r in rows: + if abs(it[2] - r["cy"]) <= tol: + r["items"].append(it) + r["cy"] = float(np.mean([x[2] for x in r["items"]])) + placed = True; break + if not placed: rows.append({"cy": it[2], "items": [it]}) + rows.sort(key=lambda r: r["cy"]) + order = [] + for r in rows: + r["items"].sort(key=lambda x: x[1], reverse=(mode == "rtl")) + order.extend([z[0] for z in r["items"]]) + return {bid: i+1 for i, bid in enumerate(order)} + + +# ============================================================ +# MAIN PIPELINE +# ============================================================ +def translate_manga_text( + image_path="001-page.png", + 
source_lang="en", + target_lang="ca", + confidence_threshold=0.03, + min_text_length=1, + gap_px="auto", + quality_threshold=0.62, + export_to_file="output.txt", + export_bubbles_to="bubbles.json", + reading_mode="ltr", + debug=True, + use_enhanced_ocr=True, + strict_grouping=True, + max_box_width_ratio=0.6, + max_box_height_ratio=0.5, + auto_fix_bubbles=True +): + image = cv2.imread(image_path) + if image is None: + print(f"❌ Cannot load image: {image_path}"); return + + resolved_gap = auto_gap(image_path) if gap_px == "auto" else float(gap_px) + ih, iw = image.shape[:2] + print("Loading OCR engines...") + + if use_enhanced_ocr: + detector = ImprovedMacVisionDetector(source_lang=source_lang) + print("🚀 Using Enhanced Multi-Pass OCR") + else: + detector = MacVisionDetector(source_lang=source_lang) + + print("Running detection OCR (Apple Vision)...") + raw = detector.read(image_path) + print(f"Raw detections: {len(raw)}") + + if use_enhanced_ocr: + existing_quads = [r[0] for r in raw] + missed_regions = detect_small_text_regions(image, existing_quads) + if missed_regions: + print(f"🔍 Found {len(missed_regions)} potentially missed text regions") + for region in missed_regions: + rx1, ry1, rx2, ry2 = region + pad = 10 + rx1, ry1 = max(0, rx1-pad), max(0, ry1-pad) + rx2, ry2 = min(iw, rx2+pad), min(ih, ry2+pad) + crop = image[ry1:ry2, rx1:rx2] + if crop.size > 0: + upscaled = cv2.resize(crop, None, fx=4.0, fy=4.0, + interpolation=cv2.INTER_CUBIC) + for quad, text, conf in detector.run_vision_ocr(upscaled): + raw.append(([[int(p[0]/4.0+rx1), int(p[1]/4.0+ry1)] + for p in quad], text, conf)) + print(f"📝 Total detections after missed region scan: {len(raw)}") + + # ── Filtering ───────────────────────────────────────────────────────── + filtered, skipped = [], 0 + for bbox, text, conf in raw: + t = normalize_text(text) + qb = quad_bbox(bbox) + if conf < confidence_threshold: skipped += 1; continue + if len(t) < min_text_length: skipped += 1; continue + if not 
is_valid_language(t, source_lang): skipped += 1; continue + if not is_meaningful_text(t, source_lang): skipped += 1; continue + if qb[1] < int(ih * TOP_BAND_RATIO) and conf < 0.70 and len(t) >= 5: + skipped += 1; continue + filtered.append((bbox, t, conf)) + + print(f"Kept: {len(filtered)} | Skipped: {skipped}") + if not filtered: + print("⚠️ No text after filtering."); return + + # ── Pre-grouping quad splits ────────────────────────────────────────── + filtered, oversized_splits = validate_and_split_oversized_quads(image, filtered) + if oversized_splits > 0: + print(f"📐 Split {oversized_splits} oversized quad(s) before grouping") + + filtered, wide_splits = split_wide_ocr_items(image, filtered) + if wide_splits > 0: + print(f"✂️ Split {wide_splits} wide OCR lines across column gaps.") + + filtered, bridge_splits = split_abnormal_bridge_quads(image, filtered) + if bridge_splits > 0: + print(f"🧩 Split {bridge_splits} abnormal bridge OCR quad(s).") + + # Column-gap split: catches wide quads spanning two columns (BOX#6 type) + hs_pre = [max(1, quad_bbox(q)[3]-quad_bbox(q)[1]) for q, _, _ in filtered] + med_h_pre = float(np.median(hs_pre)) if hs_pre else 14.0 + filtered, col_splits = apply_column_gap_splits(image, filtered, med_h_pre) + + filtered = normalize_ocr_quads(filtered) + + # ── Grouping ────────────────────────────────────────────────────────── + print("📊 Grouping quads vertically...") + bubbles, bubble_boxes, bubble_quads, bubble_indices = group_tokens_vertical( + filtered, image.shape, gap_px=resolved_gap, + bbox_padding=1, strict_mode=strict_grouping) + print(f" Created {len(bubbles)} initial box(es)") + + # ── Auto-fix (split + merge) ────────────────────────────────────────── + if auto_fix_bubbles: + bubbles, bubble_boxes, bubble_quads, bubble_indices = auto_fix_bubble_detection( + bubble_boxes, bubble_indices, bubble_quads, bubbles, filtered, image) + + # ── Enforce max box size ────────────────────────────────────────────── + bubbles, bubble_boxes, 
bubble_quads, bubble_indices = enforce_max_box_size( + bubble_boxes, bubble_indices, bubble_quads, bubbles, filtered, + max_width_ratio=max_box_width_ratio, + max_height_ratio=max_box_height_ratio, + image_shape=image.shape) + + # ── Close-proximity merge ───────────────────────────────────────────── + bubbles, bubble_boxes, bubble_quads, bubble_indices = merge_close_bubbles_by_line_height( + bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered) + + # ── Per-bubble split pass ───────────────────────────────────────────── + new_bubbles, new_bubble_boxes, new_bubble_quads, new_bubble_indices = {}, {}, {}, {} + next_bid = max(bubbles.keys()) + 1 if bubbles else 1 + splits_performed = [] + + for bid in list(bubbles.keys()): + split_result, split_reason = _split_bubble_if_needed( + bid, bubble_indices, bubble_quads, bubble_boxes, filtered, image, iw, ih) + + if split_result: + p1, p2 = split_result + splits_performed.append(f"BOX#{bid} ({split_reason})") + for part_idxs, part_bid in [(p1, bid), (p2, next_bid)]: + ub = boxes_union_xyxy([quad_bbox(filtered[i][0]) for i in part_idxs]) + new_bubbles[part_bid] = build_lines_from_indices(part_idxs, filtered) + new_bubble_boxes[part_bid] = (max(0,ub[0]-2), max(0,ub[1]-2), + min(iw-1,ub[2]+2), min(ih-1,ub[3]+2)) + new_bubble_quads[part_bid] = [filtered[i][0] for i in part_idxs] + new_bubble_indices[part_bid] = part_idxs + next_bid += 1 + else: + new_bubbles[bid] = bubbles[bid] + new_bubble_boxes[bid] = bubble_boxes[bid] + new_bubble_quads[bid] = bubble_quads[bid] + new_bubble_indices[bid] = bubble_indices[bid] + + if splits_performed: + print(f"\n🔀 Splits detected: {len(splits_performed)}") + for s in splits_performed: print(f" ✓ {s}") + + # ── Remove nested / duplicate boxes ────────────────────────────────── + bubbles, bubble_boxes, bubble_quads, bubble_indices = remove_nested_boxes( + new_bubble_boxes, new_bubble_indices, new_bubble_quads, new_bubbles, + overlap_threshold=0.50) + print(f"✅ Final box count: 
{len(bubbles)}") + + # ── OCR quality pass ────────────────────────────────────────────────── + translator = GoogleTranslator(source=source_lang, target=target_lang) + clean_lines: Dict[int, str] = {} + sources_used: Dict[int, str] = {} + translations: Dict[int, str] = {} + + for bid, lines in bubbles.items(): + base_txt = normalize_text(" ".join(lines)) + base_sc = ocr_candidate_score(base_txt) + txt, src_used = base_txt, "vision-base" + if base_sc < quality_threshold: + rr_txt, rr_sc, rr_src = reread_bubble_with_vision( + image, bubble_boxes[bid], detector, upscale=3.0, pad=24) + if rr_txt and rr_sc > base_sc + 0.04 and is_valid_language(rr_txt, source_lang): + txt, src_used = rr_txt, rr_src + clean_lines[bid] = normalize_text(txt) + sources_used[bid] = src_used + + reading_map = estimate_reading_order(bubble_boxes, mode=reading_mode) + + # ── Translation ─────────────────────────────────────────────────────── + for bid in sorted(clean_lines.keys(), key=lambda x: reading_map.get(x, x)): + src_txt = clean_lines[bid].strip() + if not src_txt: continue + if not is_valid_language(src_txt, source_lang): continue + if not is_meaningful_text(src_txt, source_lang): continue + try: + tgt = translator.translate(src_txt) or "" + tgt = postprocess_translation_general(tgt).upper() + except Exception as e: + tgt = f"[Error: {e}]" + translations[bid] = tgt + + if debug: + save_debug_clusters(image_path, filtered, bubble_boxes, bubble_indices, + clean_lines, "debug_clusters.png") + + # ── Text output ─────────────────────────────────────────────────────── + divider = "─" * 120 + out_lines = ["BUBBLE|ORDER|OCR_SOURCE|ORIGINAL|TRANSLATED|FLAGS", divider] + print(divider + f"\n{'BUBBLE':<8} {'ORDER':<6} {'SOURCE':<12} " + f"{'ORIGINAL':<40} {'TRANSLATED':<40} FLAGS\n" + divider) + + translated_count = 0 + for bid in sorted(clean_lines.keys(), key=lambda x: reading_map.get(x, x)): + src_txt = clean_lines[bid].strip() + if not src_txt: continue + if not is_valid_language(src_txt, 
source_lang): continue + if not is_meaningful_text(src_txt, source_lang): continue + + flags = [] + tgt = translations.get(bid, "") + if not tgt: flags.append("NO_TRANSLATION") + src_u = src_txt.upper() + src_engine = sources_used.get(bid, "unknown") + + out_lines.append( + f"#{bid}|{reading_map.get(bid,bid)}|{src_engine}|{src_u}|{tgt}|" + f"{','.join(flags) if flags else '-'}") + print(f"#{bid:<7} {reading_map.get(bid,bid):<6} {src_engine:<12} " + f"{src_u[:40]:<40} {tgt[:40]:<40} {','.join(flags) if flags else '-'}") + translated_count += 1 + + out_lines.append(divider + f"\n✅ Done! {translated_count} bubble(s) translated.") + with open(export_to_file, "w", encoding="utf-8") as f: + f.write("\n".join(out_lines)) + + # ── bubbles.json ────────────────────────────────────────────────────── + bubbles_payload = {} + for bid in sorted(clean_lines.keys(), key=lambda x: reading_map.get(x, x)): + src_txt = clean_lines[bid].strip() + if not src_txt: continue + if not is_valid_language(src_txt, source_lang): continue + if not is_meaningful_text(src_txt, source_lang): continue + box = bubble_boxes.get(bid) + tgt = translations.get(bid, "") + bubbles_payload[str(bid)] = { + "order": reading_map.get(bid, bid), + "ocr_source": sources_used.get(bid, "unknown"), + "original": src_txt.upper(), + "translated": tgt, + "box": { + "x": box[0] if box else 0, + "y": box[1] if box else 0, + "w": (box[2]-box[0]) if box else 0, + "h": (box[3]-box[1]) if box else 0, + }, + "lines": [line.upper() for line in bubbles.get(bid, [])], + } + + with open(export_bubbles_to, "w", encoding="utf-8") as f: + json.dump(bubbles_payload, f, ensure_ascii=False, indent=2) + + print(divider + f"\nSaved: {export_to_file}\nSaved: {export_bubbles_to}") + + +# ============================================================ +# ENTRY POINT +# ============================================================ +if __name__ == "__main__": + translate_manga_text( + image_path="19.png", + source_lang="english", + 
target_lang="ca", + confidence_threshold=0.03, + min_text_length=1, + gap_px="auto", + quality_threshold=0.62, + export_to_file="output.txt", + export_bubbles_to="bubbles.json", + reading_mode="rtl", + debug=True, + use_enhanced_ocr=True, + strict_grouping=True, + max_box_width_ratio=0.6, + max_box_height_ratio=0.5, + auto_fix_bubbles=True + ) \ No newline at end of file