#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""OCR post-processing pipeline for comic/manga pages.

This module groups raw OCR quads into text regions (dialogue balloons,
narration boxes, SFX, reactions), cleans up common OCR errors, and scores
the result for downstream translation (macOS Vision based OCR).
"""
import os
import re
import json
import cv2
import numpy as np
import warnings
from typing import List, Tuple, Dict, Any, Optional
from deep_translator import GoogleTranslator

# macOS Native Vision imports
import Vision
import Quartz
from Foundation import NSData

warnings.filterwarnings("ignore", category=UserWarning)

# ============================================================
# CONFIG
# ============================================================
TOP_BAND_RATIO = 0.08  # fraction of page height treated as the top band

# ============================================================
# REGION-FIRST LAYOUT HELPERS
# ============================================================
import math
from difflib import SequenceMatcher

# Uppercase tokens typical of dialogue; used to bias classification
# away from SFX for text that reads like speech.
DIALOGUE_STOPWORDS = {
    "I", "YOU", "HE", "SHE", "WE", "THEY", "IT", "ME", "MY", "YOUR", "OUR",
    "IS", "ARE", "WAS", "WERE", "AM", "DO", "DID", "DON'T", "DIDN'T", "NOT",
    "WHAT", "WHY", "HOW", "WHO", "IN", "ON", "AT", "TO", "OF", "FOR", "WITH",
    "AND", "BUT", "SO", "THAT", "THIS", "THERE", "HERE", "THAN", "ALL", "RIGHT"
}

# Onomatopoeia commonly drawn as sound effects.
SFX_HINTS = {
    "RRRING", "RING", "RINGG", "BAM", "BOOM", "FWUP", "FWOOP", "FSHOO",
    "GRRP", "GASP", "THUD", "SMACK", "WHAM", "SLAM", "SNIF", "SNIFF"
}

# Short exclamations typical of reaction bubbles.
REACTION_HINTS = {
    "HUH", "HUH?!", "HUH?", "OH", "AH", "EH", "TCH", "HEY",
    "WHAT?!", "NO!", "YES!"
}

# Leading phrases typical of narration captions.
NARRATION_HINTS = {
    "AND SO", "MEANWHILE", "LATER", "THEN", "TO BE CONTINUED"
}


def xyxy_width(b):
    """Width of an (x1, y1, x2, y2) box, clamped to >= 1."""
    return max(1, b[2] - b[0])


def xyxy_height(b):
    """Height of an (x1, y1, x2, y2) box, clamped to >= 1."""
    return max(1, b[3] - b[1])


def xyxy_center(b):
    """Center (cx, cy) of an (x1, y1, x2, y2) box."""
    return ((b[0] + b[2]) / 2.0, (b[1] + b[3]) / 2.0)


def box_distance(a, b):
    """Euclidean distance between the centers of two boxes."""
    ax, ay = xyxy_center(a)
    bx, by = xyxy_center(b)
    return math.hypot(ax - bx, ay - by)


def horizontal_overlap_ratio(a, b):
    """Horizontal overlap divided by the narrower box's width (0..1)."""
    ix1, ix2 = max(a[0], b[0]), min(a[2], b[2])
    ov = max(0, ix2 - ix1)
    return ov / max(1, min(xyxy_width(a), xyxy_width(b)))


def vertical_overlap_ratio(a, b):
    """Vertical overlap divided by the shorter box's height (0..1)."""
    iy1, iy2 = max(a[1], b[1]), min(a[3], b[3])
    ov = max(0, iy2 - iy1)
    return ov / max(1, min(xyxy_height(a), xyxy_height(b)))


def box_expand(b, pad, iw, ih):
    """Grow box `b` by `pad` on all sides, clamped to image bounds (iw, ih)."""
    return (
        max(0, int(b[0] - pad)),
        max(0, int(b[1] - pad)),
        min(iw - 1, int(b[2] + pad)),
        min(ih - 1, int(b[3] + pad)),
    )


def count_alpha(text):
    """Number of (accented) Latin letters in `text`."""
    return len(re.findall(r"[A-ZÀ-Ýa-zà-ÿ]", text or ""))


def uppercase_ratio(text):
    """Fraction of letters that are uppercase; 0.0 if no letters."""
    alpha = re.findall(r"[A-Za-zÀ-ÿ]", text or "")
    if not alpha:
        return 0.0
    ups = sum(1 for c in alpha if c.isupper())
    return ups / len(alpha)


def punctuation_ratio(text):
    """Fraction of characters that are common punctuation."""
    if not text:
        return 0.0
    return len(re.findall(r"[!?.,'\"-]", text)) / max(1, len(text))


def stopword_ratio(text):
    """Fraction of tokens that are dialogue stopwords; 0.0 if no tokens."""
    toks = re.findall(r"[A-Z']+", normalize_text(text or ""))
    if not toks:
        return 0.0
    hits = sum(1 for t in toks if t in DIALOGUE_STOPWORDS)
    return hits / len(toks)


def looks_like_sfx_text(text):
    """Heuristic: does `text` read like a drawn sound effect?"""
    t = normalize_text(text or "")
    if not t:
        return False
    alpha = re.sub(r"[^A-Z]", "", t)
    if t in SFX_HINTS or alpha in SFX_HINTS:
        return True
    # All-caps, stopword-free short text is likely onomatopoeia.
    if len(alpha) >= 3 and uppercase_ratio(t) > 0.90 and stopword_ratio(t) < 0.15:
        if alpha not in DIALOGUE_STOPWORDS:
            return True
    return False


def looks_like_reaction_text(text):
    """Heuristic: does `text` read like a short reaction ("HUH?!")?"""
    t = normalize_text(text or "")
    alpha = re.sub(r"[^A-Z?!]", "", t)
    if t in REACTION_HINTS or alpha in REACTION_HINTS:
        return True
    # Very short, heavily punctuated utterances count as reactions.
    if len(re.sub(r"[^A-Z]", "", t)) <= 5 and punctuation_ratio(t) > 0.10:
        return True
    return False


def looks_like_narration_text(text):
    """Heuristic: does `text` read like a narration caption?"""
    t = normalize_text(text or "")
    if any(t.startswith(h) for h in NARRATION_HINTS):
        return True
    # Longer sentence-like text ending in a period reads as narration.
    if len(t.split()) >= 5 and t.endswith(".") and uppercase_ratio(t) > 0.75:
        return True
    return False


def contour_features_for_box(image_bgr, box_xyxy):
    """Cheap appearance features for a box crop: brightness, edges, whiteness.

    Returns a dict; an empty crop yields the "worst case" values so callers
    never classify a degenerate box as a clean balloon.
    """
    x1, y1, x2, y2 = box_xyxy
    crop = image_bgr[y1:y2, x1:x2]
    if crop.size == 0:
        return {
            "mean_brightness": 0.0,
            "edge_density": 1.0,
            "whiteness_ratio": 0.0,
        }
    gray = cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY)
    mean_brightness = float(np.mean(gray)) / 255.0
    edges = cv2.Canny(gray, 50, 150)
    edge_density = float(np.mean(edges > 0))
    whiteness_ratio = float(np.mean(gray > 220))
    return {
        "mean_brightness": mean_brightness,
        "edge_density": edge_density,
        "whiteness_ratio": whiteness_ratio,
    }


def classify_region_type(image_bgr, box_xyxy, lines):
    """Classify a text region as sfx / reaction / narration / dialogue / unknown.

    Combines text heuristics with appearance features of the region crop.
    """
    text = normalize_text(" ".join(lines))
    feats = contour_features_for_box(image_bgr, box_xyxy)
    w, h = xyxy_width(box_xyxy), xyxy_height(box_xyxy)
    ar = w / max(1, h)
    if looks_like_sfx_text(text):
        return "sfx"
    if looks_like_reaction_text(text):
        if len(text.split()) <= 3:
            return "reaction"
    if looks_like_narration_text(text):
        return "narration"
    # balloon/dialogue heuristic:
    # bright interior + low-ish edge density + moderate width
    if feats["whiteness_ratio"] > 0.45 and feats["edge_density"] < 0.18:
        return "dialogue"
    # narrow tall shout / reaction / sfx ambiguity
    if ar < 0.9 and uppercase_ratio(text) > 0.85 and stopword_ratio(text) < 0.20:
        return "sfx"
    if stopword_ratio(text) >= 0.20:
        return "dialogue"
    return "unknown"


def text_similarity(a, b):
    """Normalized-text similarity in [0, 1] via difflib.SequenceMatcher."""
    return SequenceMatcher(None, normalize_text(a or ""),
                           normalize_text(b or "")).ratio()


def dedupe_repeated_phrase(text):
    """Collapse OCR duplication: doubled halves and repeated adjacent tokens."""
    t = normalize_text(text or "")
    words = t.split()
    if len(words) < 4:
        return t
    # remove immediate duplicated halves: "CRY! CRY!" / "I DIDN'T I DIDN'T"
    half = len(words) // 2
    if len(words) % 2 == 0 and words[:half] == words[half:]:
        return " ".join(words[:half])
    # collapse trailing duplicate tokens (short tokens like "NO NO" are kept)
    cleaned = []
    for w in words:
        if cleaned and cleaned[-1] == w and len(w) > 2:
            continue
        cleaned.append(w)
    return " ".join(cleaned)


def dehyphenate_linebreak_artifacts(text):
    """Join words split by a line-wrap hyphen: "LEARN- ING" -> "LEARNING"."""
    t = normalize_text(text or "")
    t = re.sub(r"\b([A-Z]+)- ([A-Z]+)\b", r"\1\2", t)
    return t


def fix_common_dialogue_ocr(text):
    """
    Conservative OCR cleanup for dialogue-like text.

    Goals:
    - fix common OCR punctuation/spacing/apostrophe errors
    - preserve meaning and tone
    - avoid semantic reconstruction guesses

    Bug fix: the replacement table is applied with word-boundary-anchored
    regexes instead of plain substring ``str.replace``.  Substring matching
    corrupted longer words that merely contain a key, e.g.
    "VACANT" -> "VACAN'T" (via "CANT") and "STILL " -> "STI'LL " (via "ILL ").
    """
    t = normalize_text(text or "")
    if not t:
        return t
    # Note: the former duplicate "1'M " entry was unreachable (the "1'M"
    # rule always fires first) and has been dropped.
    replacements = {
        "1'M": "I'M",
        "1 DIDN'T": "I DIDN'T",
        "1 HATE": "I HATE",
        "1 WAS": "I WAS",
        "YO U": "YOU",
        "YOU RE": "YOU'RE",
        "YOURE": "YOU'RE",
        "I LL": "I'LL",
        "ILL ": "I'LL ",
        "DONT": "DON'T",
        "DIDNT": "DIDN'T",
        "CANT": "CAN'T",
        "WONT": "WON'T",
        "THATS": "THAT'S",
        "MOMS": "MOM'S",
        "DADS": "DAD'S",
        "LEARN- ING": "LEARNING",
        "COV- ERED": "COVERED",
        "SY ON": "SY-ON",
        "P PROPERLY": "P-PROPERLY",
        "SH SHUT": "SH- SHUT",
    }
    for a, b in replacements.items():
        # Anchor with \b only where the key starts/ends on a word character;
        # keys with a literal trailing space keep that space requirement.
        prefix = r"\b" if a[0].isalnum() else ""
        suffix = r"\b" if a[-1].isalnum() else ""
        t = re.sub(prefix + re.escape(a) + suffix, b, t)
    # Fix split contractions / apostrophe omissions
    t = re.sub(r"\b([A-Z]+) NT\b", r"\1N'T", t)
    t = re.sub(r"\b([A-Z]+) RE\b", r"\1'RE", t)
    t = re.sub(r"\b([A-Z]+) VE\b", r"\1'VE", t)
    t = re.sub(r"\b([A-Z]+) LL\b", r"\1'LL", t)
    t = re.sub(r"\b([A-Z]+) S\b", r"\1'S", t)
    # Remove accidental duplicated punctuation spacing
    t = re.sub(r"\s+([,.;:!?])", r"\1", t)
    # Dehyphenate OCR line-wrap artifacts
    t = dehyphenate_linebreak_artifacts(t)
    # Collapse repeated full phrases/tokens caused by OCR duplication
    t = dedupe_repeated_phrase(t)
    # Remove duplicated adjacent words like "CRY CRY" if clearly accidental
    words = t.split()
    cleaned = []
    for w in words:
        if cleaned and cleaned[-1] == w and len(re.sub(r"[^A-Z]", "", w)) > 2:
            continue
        cleaned.append(w)
    t = " ".join(cleaned)
    # Normalize spaces
    t = re.sub(r"\s{2,}", " ", t).strip()
    return t


def region_text_role_hint(text):
    """Best-guess role of a region from its text alone (defaults to dialogue)."""
    if looks_like_sfx_text(text):
        return "sfx"
    if looks_like_reaction_text(text):
        return "reaction"
    if looks_like_narration_text(text):
        return "narration"
    return "dialogue"


def correct_region_text(text, region_type="dialogue"):
    """Apply role-appropriate OCR cleanup.

    Returns (corrected_text, correction_gain) where the gain is the
    non-negative improvement in ocr_candidate_score.
    """
    t = normalize_text(text or "")
    if not t:
        return t, 0.0
    original = t
    if region_type in {"dialogue", "reaction", "narration"}:
        t = fix_common_dialogue_ocr(t)
    elif region_type == "sfx":
        t = dedupe_repeated_phrase(t)
    score_before = ocr_candidate_score(original)
    score_after = ocr_candidate_score(t)
    correction_gain = max(0.0, score_after - score_before)
    return t, correction_gain


def compute_region_confidence(raw_text, corrected_text, box_xyxy, region_type, image_bgr):
    """Blend text quality, appearance features and correction gain into [0, 1]."""
    feats = contour_features_for_box(image_bgr, box_xyxy)
    text_score = ocr_candidate_score(corrected_text)
    gain = max(0.0, text_score - ocr_candidate_score(raw_text))
    role_bonus = 0.08 if region_type in {"dialogue", "reaction", "narration", "sfx"} else 0.0
    score = (
        0.55 * text_score
        + 0.15 * feats["whiteness_ratio"]
        + 0.10 * (1.0 - min(1.0, feats["edge_density"] * 2.0))
        + 0.10 * gain
        + role_bonus
    )
    return max(0.0, min(1.0, score))


def build_region_flags(raw_text, corrected_text, region_type, conf):
    """Diagnostic flags for a region (unknown role, SFX, low confidence, ...)."""
    flags = []
    if region_type == "unknown":
        flags.append("REGION_UNKNOWN")
    if region_type == "sfx":
        flags.append("SFX")
    if conf < 0.45:
        flags.append("LOW_CONF")
    if text_similarity(raw_text, corrected_text) < 0.75:
        flags.append("HEAVY_CORRECTION")
    if len(corrected_text.split()) > 22:
        flags.append("LONG_TEXT")
    return flags


# ============================================================
# HELPERS
# ============================================================
def normalize_text(text: str) -> str:
    """Uppercase, straighten smart quotes/ellipses, and tidy whitespace."""
    t = (text or "").strip().upper()
    t = t.replace("\u201c", "\"").replace("\u201d", "\"")
    t = t.replace("\u2018", "'").replace("\u2019", "'")
    t = t.replace("\u2026", "...")
    t = re.sub(r"\s+", " ", t)
    t = re.sub(r"\s+([,.;:!?])", r"\1", t)
    t = re.sub(r"([¡¿])\s+", r"\1", t)
    t = re.sub(r"\(\s+", "(", t)
    t = re.sub(r"\s+\)", ")", t)
    t = re.sub(r"\.{4,}", "...", t)
    return t.strip()


def postprocess_translation_general(text: str) -> str:
    """Normalize a translated string and clamp runaway punctuation."""
    t = normalize_text(text)
    t = re.sub(r"\s{2,}", " ", t).strip()
    t = re.sub(r"([!?]){3,}", r"\1\1", t)
    t = re.sub(r"\.{4,}", "...", t)
    return t


def fix_common_ocr_errors(text: str) -> str:
    """Fix classic OCR confusions: O-for-0 between digits, pipes, backticks."""
    result = text
    result = re.sub(r'(\d)O(\d)', r'\g<1>0\g<2>', result)
    result = re.sub(r'(\d)O([^a-zA-Z])', r'\g<1>0\g<2>', result)
    result = result.replace('|', 'I')
    result = result.replace('`', "'")
    return result


def is_valid_language(text: str, source_lang: str) -> bool:
    """Check that `text`'s script plausibly matches `source_lang`.

    Latin-family languages reject any CJK/Korean/Arabic characters and
    require a minimum ratio of Latin letters; CJK/Korean languages require
    a minimum ratio of their own script. Unknown languages pass.
    """
    if not text:
        return False
    clean_text = re.sub(r'[^\w]', '', text)
    if not clean_text:
        return False
    lang = source_lang.lower()
    if lang in ['en', 'english', 'es', 'spanish', 'fr', 'french', 'it', 'italian',
                'ca', 'catalan', 'de', 'german']:
        foreign_chars = len(re.findall(
            r'[\u0600-\u06FF\u0750-\u077F\u3040-\u30FF'
            r'\u3400-\u4DBF\u4E00-\u9FFF\uAC00-\uD7AF\u1100-\u11FF]',
            clean_text))
        if foreign_chars > 0:
            return False
        latin_chars = len(re.findall(r'[a-zA-ZÀ-ÿ]', clean_text))
        total = len(clean_text)
        if total <= 3:
            return latin_chars >= 1
        if total <= 6:
            return (latin_chars / total) >= 0.55
        return (latin_chars / total) >= 0.45
    elif lang in ['ja', 'japanese']:
        ja_chars = len(re.findall(r'[\u3040-\u30FF\u3400-\u4DBF\u4E00-\u9FFF]', clean_text))
        if len(clean_text) <= 3:
            return ja_chars >= 1
        return (ja_chars / len(clean_text)) >= 0.4
    elif lang in ['ko', 'korean']:
        ko_chars = len(re.findall(r'[\uAC00-\uD7AF\u1100-\u11FF]', clean_text))
        if len(clean_text) <= 3:
            return ko_chars >= 1
        return (ko_chars / len(clean_text)) >= 0.4
    elif lang in ['zh', 'chinese']:
        zh_chars = len(re.findall(r'[\u4E00-\u9FFF\u3400-\u4DBF]', clean_text))
        if len(clean_text) <= 3:
            return zh_chars >= 1
        return (zh_chars / len(clean_text)) >= 0.4
    return True


# Isolated OCR fragments that are almost certainly noise, not speech.
_NOISE_TOKENS = {
    'P', 'F', 'N', 'M', 'X', 'Z', 'Q',
    'FN', 'PF', 'NM', 'XZ',
    'FSHOO', 'GRRP',
}
# Short interjections common in manga dialogue; always treated as meaningful.
_MANGA_INTERJECTIONS = {
    'HUH', 'HUH?', 'HUH??', 'HUH?!', 'OH', 'OH!', 'OOH', 'OOH!', 'AH', 'AH!',
    'UH', 'UH...', 'HEY', 'HEY!', 'EH', 'EH?', 'WOW', 'WOW!', 'YES', 'NO',
    'NO!', 'RUN', 'GO', 'GO!', 'STOP', 'WAIT', 'WHAT', 'WHAT?', 'WHAT?!',
    'WHY', 'WHY?', 'HOW', 'HOW?', 'OK', 'OK!', 'OKAY', 'EEEEP', 'EEEP',
    'OMIGOSH', 'BECKY', 'BECKY!', 'HMM', 'HMM...', 'TSK', 'TCH', 'GRRR',
    'I', 'A', 'FWUP', 'FWAP', 'SHIVER', 'RRRING', 'MORNING', 'MORNING.',
}


def group_indices_into_vertical_columns(indices, ocr, x_tolerance_factor=1.4,
                                        min_vertical_span_factor=1.8):
    """
    Group OCR indices into vertical columns inside a box.

    A column is defined as:
    - similar x centers
    - internally ordered top-to-bottom

    Fix note: the original ended with an if/else whose two branches appended
    the identical list, so the vertical-span filter never did anything.
    The dead conditional is removed; every column is kept, exactly as the
    original effectively behaved.  ``min_vertical_span_factor`` is retained
    for interface compatibility (it was and remains inert).
    """
    if not indices:
        return []
    items = []
    for i in indices:
        b = quad_bbox(ocr[i][0])
        cx = (b[0] + b[2]) / 2.0
        cy = (b[1] + b[3]) / 2.0
        w = max(1, b[2] - b[0])
        h = max(1, b[3] - b[1])
        items.append((i, b, cx, cy, w, h))
    med_w = float(np.median([it[4] for it in items])) if items else 12.0
    x_tol = max(10.0, med_w * x_tolerance_factor)
    # cluster by x-center
    items_sorted = sorted(items, key=lambda x: x[2])
    columns = []
    for it in items_sorted:
        placed = False
        for col in columns:
            if abs(it[2] - col["xc"]) <= x_tol:
                col["members"].append(it)
                col["xc"] = float(np.mean([m[2] for m in col["members"]]))
                placed = True
                break
        if not placed:
            columns.append({"xc": it[2], "members": [it]})
    # sort each column top -> bottom; keep all columns (see fix note)
    clean_columns = []
    for col in columns:
        members = sorted(col["members"], key=lambda x: x[3])
        clean_columns.append([m[0] for m in members])
    # sort columns left -> right
    clean_columns.sort(key=lambda grp: np.mean([(quad_bbox(ocr[i][0])[0]
                                                 + quad_bbox(ocr[i][0])[2]) / 2.0
                                                for i in grp]))
    return clean_columns


def group_indices_into_horizontal_rows(indices, ocr, row_tol_factor=0.75):
    """
    Group OCR indices into horizontal rows inside a box.
    """
    if not indices:
        return []
    items = []
    for i in indices:
        b = quad_bbox(ocr[i][0])
        cx = (b[0] + b[2]) / 2.0
        cy = (b[1] + b[3]) / 2.0
        h = max(1, b[3] - b[1])
        items.append((i, b, cx, cy, h))
    med_h = float(np.median([it[4] for it in items])) if items else 10.0
    row_tol = max(6.0, med_h * row_tol_factor)
    items.sort(key=lambda x: x[3])
    rows = []
    for it in items:
        placed = False
        for row in rows:
            if abs(it[3] - row["yc"]) <= row_tol:
                row["members"].append(it)
                row["yc"] = float(np.mean([m[3] for m in row["members"]]))
                placed = True
                break
        if not placed:
            rows.append({"yc": it[3], "members": [it]})
    groups = []
    for row in rows:
        members = sorted(row["members"], key=lambda x: x[2])
        groups.append([m[0] for m in members])
    return groups


def score_text_groups(groups, ocr):
    """
    Score grouping quality based on:
    - average group size
    - text plausibility
    - reduced fragmentation
    """
    if not groups:
        return 0.0
    texts = []
    lengths = []
    for grp in groups:
        parts = []
        for i in grp:
            t = normalize_text(ocr[i][1])
            if t:
                parts.append(t)
        txt = normalize_text(" ".join(parts))
        if txt:
            texts.append(txt)
            lengths.append(len(txt.split()))
    if not texts:
        return 0.0
    text_scores = [ocr_candidate_score(t) for t in texts]
    avg_text_score = float(np.mean(text_scores)) if text_scores else 0.0
    avg_len = float(np.mean(lengths)) if lengths else 0.0
    fragmentation_penalty = max(0.0, len(groups) - 4) * 0.08
    return avg_text_score + min(0.5, avg_len * 0.05) - fragmentation_penalty


def detect_internal_text_layout(indices, ocr, reading_mode="ltr"):
    """
    Detect internal structure of text inside one final box.
    Step 1: split into vertical macro blocks
    Step 2: for each block, compare horizontal vs vertical grouping
    """
    if not indices:
        return {"mode": "horizontal", "blocks": []}
    blocks = split_indices_into_vertical_blocks(indices, ocr)
    resolved_blocks = []
    for block in blocks:
        horizontal_groups = group_indices_into_horizontal_rows(block, ocr)
        vertical_groups = group_indices_into_vertical_columns(block, ocr)
        h_score = score_text_groups(horizontal_groups, ocr)
        v_score = score_text_groups(vertical_groups, ocr)
        # prefer vertical layout only when it is clearly competitive
        if len(vertical_groups) >= 2 and v_score >= h_score - 0.03:
            resolved_blocks.append({"mode": "vertical", "groups": vertical_groups})
        else:
            resolved_blocks.append({"mode": "horizontal", "groups": horizontal_groups})
    return {"mode": "block-mixed", "blocks": resolved_blocks}


def build_text_from_layout(indices, ocr, reading_mode="ltr"):
    """Render detected layout into reading-order text lines.

    Horizontal blocks emit one line per row; vertical blocks emit one line
    per column, ordered right-to-left when ``reading_mode == "rtl"``.
    """
    layout = detect_internal_text_layout(indices, ocr, reading_mode=reading_mode)
    output_lines = []
    for block in layout["blocks"]:
        groups = block["groups"]
        mode = block["mode"]
        if mode == "horizontal":
            for grp in groups:
                line = normalize_text(" ".join(
                    ocr[i][1] for i in grp if normalize_text(ocr[i][1])
                ))
                if line:
                    output_lines.append(line)
        elif mode == "vertical":
            if reading_mode == "rtl":
                groups = sorted(
                    groups,
                    key=lambda grp: np.mean([(quad_bbox(ocr[i][0])[0]
                                              + quad_bbox(ocr[i][0])[2]) / 2.0
                                             for i in grp]),
                    reverse=True
                )
            else:
                groups = sorted(
                    groups,
                    key=lambda grp: np.mean([(quad_bbox(ocr[i][0])[0]
                                              + quad_bbox(ocr[i][0])[2]) / 2.0
                                             for i in grp])
                )
            for grp in groups:
                grp_sorted = sorted(grp, key=lambda i: (quad_bbox(ocr[i][0])[1]
                                                        + quad_bbox(ocr[i][0])[3]) / 2.0)
                line = normalize_text(" ".join(
                    ocr[i][1] for i in grp_sorted if normalize_text(ocr[i][1])
                ))
                if line:
                    output_lines.append(line)
    return output_lines


# ============================================================
# REGION PROPOSAL FROM OCR GEOMETRY
# ============================================================
def propose_text_regions_from_ocr(ocr, image_shape):
    """
    Build larger text containers from OCR boxes before final classification.
    This is intentionally conservative: it clusters nearby OCR groups that
    likely belong to one dialogue/narration region.

    Returns (region_lines, region_boxes, region_quads, region_indices),
    all keyed by sequential region id starting at 1.
    """
    ih, iw = image_shape[:2]
    if not ocr:
        return {}, {}, {}, {}
    boxes = [quad_bbox(x[0]) for x in ocr]
    hs = [max(1, b[3] - b[1]) for b in boxes]
    med_h = float(np.median(hs)) if hs else 14.0

    # union-find with path halving
    parent = list(range(len(ocr)))

    def find(x):
        while parent[x] != x:
            parent[x] = parent[parent[x]]
            x = parent[x]
        return x

    def union(a, b):
        ra, rb = find(a), find(b)
        if ra != rb:
            parent[rb] = ra

    for i in range(len(ocr)):
        bi = boxes[i]
        for j in range(i + 1, len(ocr)):
            bj = boxes[j]
            dx = abs(xyxy_center(bi)[0] - xyxy_center(bj)[0])
            dy = abs(xyxy_center(bi)[1] - xyxy_center(bj)[1])
            hov = horizontal_overlap_ratio(bi, bj)
            vov = vertical_overlap_ratio(bi, bj)
            dist = box_distance(bi, bj)
            same_band = dy <= med_h * 2.2
            stacked = hov >= 0.35 and dy <= med_h * 3.2
            same_line = vov >= 0.45 and dx <= med_h * 5.0
            near = dist <= med_h * 4.5
            if same_line or stacked or (near and (same_band or hov > 0.25)):
                # never merge a clearly horizontal quad with a vertical one
                if orientation_compatible(i, j, ocr):
                    union(i, j)
    groups = {}
    for i in range(len(ocr)):
        groups.setdefault(find(i), []).append(i)
    region_lines = {}
    region_boxes = {}
    region_quads = {}
    region_indices = {}
    next_id = 1
    # emit regions top-to-bottom for stable ids
    for _, idxs in sorted(groups.items(), key=lambda kv: min(boxes[i][1] for i in kv[1])):
        idxs = sorted(idxs, key=lambda i: (boxes[i][1], boxes[i][0]))
        ub = boxes_union_xyxy([boxes[i] for i in idxs])
        if ub is None:
            continue
        region_lines[next_id] = build_lines_from_indices(idxs, ocr)
        region_boxes[next_id] = box_expand(ub, pad=max(2, int(med_h * 0.25)), iw=iw, ih=ih)
        region_quads[next_id] = [ocr[i][0] for i in idxs]
        region_indices[next_id] = idxs
        next_id += 1
    return region_lines, region_boxes, region_quads, region_indices


# ============================================================
# RECONCILE REGION-FIRST AND BUBBLE-FIRST GROUPS
# ============================================================
def reconcile_region_and_bubble_groups(region_lines, region_boxes, region_quads,
                                       region_indices, bubbles, bubble_boxes,
                                       bubble_quads, bubble_indices, ocr):
    """
    Reconcile region-first and bubble-first groupings.

    Strategy:
    - Build one combined candidate list from both grouping methods.
    - Cluster candidates that heavily overlap or share OCR indices.
    - Keep only the best-scoring candidate from each cluster.
    - Rebuild stable output dictionaries.

    This avoids duplicate retention and inconsistent greedy selection.
    """
    combined = []
    for rid in region_boxes:
        combined.append(("region", rid, region_boxes[rid], region_indices[rid]))
    for bid in bubble_boxes:
        combined.append(("bubble", bid, bubble_boxes[bid], bubble_indices[bid]))
    if not combined:
        return {}, {}, {}, {}
    visited = set()
    kept = []

    def group_score(box, idxs):
        # favor dialogue-like, index-rich, textually plausible candidates
        text = normalize_text(" ".join(build_lines_from_indices(idxs, ocr)))
        role = region_text_role_hint(text)
        role_bonus = {
            "dialogue": 0.8,
            "narration": 0.75,
            "reaction": 0.7,
            "sfx": 0.2,
            "unknown": 0.1
        }.get(role, 0.1)
        box_area = bbox_area_xyxy(box)
        area_bonus = min(1.0, box_area / 50000.0)
        return (
            len(idxs) * 2.0
            + min(20, len(text.split())) * 0.5
            + min(1.0, ocr_candidate_score(text))
            + role_bonus
            + area_bonus * 0.25
        )

    for i in range(len(combined)):
        if i in visited:
            continue
        cluster = [i]
        visited.add(i)
        _, _, box_i, idx_i = combined[i]
        for j in range(i + 1, len(combined)):
            if j in visited:
                continue
            _, _, box_j, idx_j = combined[j]
            ovs = boxes_overlap_ratio(box_i, box_j)
            iou = boxes_iou(box_i, box_j)
            shared = len(set(idx_i).intersection(idx_j))
            if ovs >= 0.55 or iou >= 0.35 or shared > 0:
                cluster.append(j)
                visited.add(j)
        best_idx = max(
            cluster,
            key=lambda k: group_score(combined[k][2], combined[k][3])
        )
        kept.append(combined[best_idx])
    # Stable order: top-to-bottom, then left-to-right
    kept.sort(key=lambda item: (
        (item[2][1] + item[2][3]) / 2.0,
        (item[2][0] + item[2][2]) / 2.0
    ))
    out_lines, out_boxes, out_quads, out_indices = {}, {}, {}, {}
    next_id = 1
    for _typ, _oid, box, idxs in kept:
        idxs = sorted(
            set(idxs),
            key=lambda k: (quad_bbox(ocr[k][0])[1], quad_bbox(ocr[k][0])[0])
        )
        out_lines[next_id] = build_lines_from_indices(idxs, ocr)
        out_boxes[next_id] = box
        out_quads[next_id] = [ocr[k][0] for k in idxs]
        out_indices[next_id] = idxs
        next_id += 1
    return out_lines, out_boxes, out_quads, out_indices


# ============================================================
# PROTECTED TOKENS / SHORT DIALOGUE SAFETY NET
# ============================================================
# Tokens that must never be filtered out, however short or low-confidence.
PROTECTED_SHORT_TOKENS = {
    "HUH", "HUH?", "HUH??", "HUH?!", "OH", "OH!", "OOH", "OOH!", "AH", "AH!",
    "UH", "UH...", "HEY", "HEY!", "EH", "EH?", "WOW", "WOW!", "MORNING",
    "MORNING.", "BECKY", "BECKY!", "DAMIAN", "CECILE", "WALD", "OMIGOSH",
    "EEEP", "EEEEP"
}

# Proper names that appear in this series' dialogue.
KNOWN_NAMES = {
    "BECKY", "DAMIAN", "CECILE", "WALD"
}


def is_protected_token(text: str) -> bool:
    """True if `text` is a protected short token (punctuation-insensitive)."""
    t = normalize_text(text or "")
    if not t:
        return False
    if t in PROTECTED_SHORT_TOKENS:
        return True
    # punctuation-insensitive fallback
    t_alpha = re.sub(r'[^A-ZÀ-Ý]', '', t)
    return t_alpha in PROTECTED_SHORT_TOKENS


def maybe_conf_floor_for_protected(text: str, conf: float, floor: float = 0.40) -> float:
    """Raise confidence to `floor` for protected tokens; otherwise unchanged."""
    if is_protected_token(text):
        return max(conf, floor)
    return conf


def is_meaningful_text(text: str, source_lang: str, min_alpha_chars: int = 2) -> bool:
    """Decide whether an OCR fragment is worth keeping.

    Protected tokens and known interjections always pass; obvious noise
    (too few letters, single repeated character, vowel-less long strings)
    is rejected for Latin-family languages.

    Fix note: the original evaluated the identical Latin-language list in
    two consecutive ``if`` blocks; they are merged with behavior unchanged.
    """
    if not text:
        return False
    t = text.strip()
    t_upper = normalize_text(t)
    # 1) Hard keep for protected tokens
    if is_protected_token(t_upper):
        return True
    t_alpha_only = re.sub(r'[^A-Za-zÀ-ÿ]', '', t_upper)
    if t_upper in _MANGA_INTERJECTIONS or t_alpha_only in _MANGA_INTERJECTIONS:
        return True
    alpha_count = sum(c.isalpha() for c in t)
    if alpha_count < min_alpha_chars:
        # allow short punctuated utterances like "Huh?"
        if re.fullmatch(r"[A-Za-zÀ-ÿ]{2,6}[!?\.]{0,3}", t.strip()):
            return True
        return False
    if t_upper in _NOISE_TOKENS:
        return False
    lang = source_lang.lower()
    if lang in ['en', 'english', 'es', 'spanish', 'fr', 'french', 'it', 'italian',
                'ca', 'catalan', 'de', 'german']:
        non_alpha = sum(not c.isalpha() for c in t)
        # slightly less aggressive than before
        if len(t) > 0 and (non_alpha / len(t)) > 0.72:
            return False
        if len(t) >= 3 and len(set(t_upper)) == 1:
            return False
        if len(t) > 5:
            vowels = len(re.findall(r'[AEIOUaeiouÀ-ÿ]', t))
            if vowels == 0:
                return False
    return True


def quad_bbox(quad):
    """Axis-aligned integer bbox (x1, y1, x2, y2) of a 4-point quad."""
    xs = [p[0] for p in quad]
    ys = [p[1] for p in quad]
    return (int(min(xs)), int(min(ys)), int(max(xs)), int(max(ys)))


def quad_center(quad):
    """Center (cx, cy) of a quad's bounding box."""
    x1, y1, x2, y2 = quad_bbox(quad)
    return ((x1 + x2) / 2.0, (y1 + y2) / 2.0)


def boxes_union_xyxy(boxes):
    """Union bbox of all non-None boxes, or None if the list is empty."""
    boxes = [b for b in boxes if b is not None]
    if not boxes:
        return None
    return (
        int(min(b[0] for b in boxes)),
        int(min(b[1] for b in boxes)),
        int(max(b[2] for b in boxes)),
        int(max(b[3] for b in boxes)),
    )


def bbox_area_xyxy(b):
    """Non-negative area of an (x1, y1, x2, y2) box; 0 for None."""
    if b is None:
        return 0
    return int(max(0, b[2] - b[0]) * max(0, b[3] - b[1]))


def xyxy_to_xywh(b):
    """Convert (x1, y1, x2, y2) to an {x, y, w, h} dict; None passes through."""
    if b is None:
        return None
    x1, y1, x2, y2 = b
    return {"x": int(x1), "y": int(y1), "w": int(max(0, x2 - x1)), "h": int(max(0, y2 - y1))}


def overlap_or_near(a, b, gap=0):
    """True if boxes overlap or lie within `gap` pixels on both axes."""
    ax1, ay1, ax2, ay2 = a
    bx1, by1, bx2, by2 = b
    gap_x = max(0, max(ax1, bx1) - min(ax2, bx2))
    gap_y = max(0, max(ay1, by1) - min(ay2, by2))
    return gap_x <= gap and gap_y <= gap


def boxes_iou(a, b):
    """Intersection-over-union of two (x1, y1, x2, y2) boxes."""
    ax1, ay1, ax2, ay2 = a
    bx1, by1, bx2, by2 = b
    ix1, iy1 = max(ax1, bx1), max(ay1, by1)
    ix2, iy2 = min(ax2, bx2), min(ay2, by2)
    inter = max(0, ix2 - ix1) * max(0, iy2 - iy1)
    if inter == 0:
        return 0.0
    area_a = max(0, ax2 - ax1) * max(0, ay2 - ay1)
    area_b = max(0, bx2 - bx1) * max(0, by2 - by1)
    return inter / max(1, area_a + area_b - inter)


def boxes_overlap_ratio(a, b):
    """Ratio of intersection to the SMALLER box area."""
    ax1, ay1, ax2, ay2 = a
    bx1, by1, bx2, by2 = b
    ix1, iy1 = max(ax1, bx1), max(ay1, by1)
    ix2, iy2 = min(ax2, bx2), min(ay2, by2)
    inter = max(0, ix2 - ix1) * max(0, iy2 - iy1)
    if inter == 0:
        return 0.0
    area_a = max(0, ax2 - ax1) * max(0, ay2 - ay1)
    area_b = max(0, bx2 - bx1) * max(0, by2 - by1)
    return inter / max(1, min(area_a, area_b))


def ocr_candidate_score(text: str) -> float:
    """Plausibility score in [0, 1] for an OCR string.

    Rewards letters/spaces/normal punctuation; penalizes exotic characters,
    isolated capitals and multi-digit runs.
    """
    if not text:
        return 0.0
    t = text.strip()
    n = len(t)
    if n == 0:
        return 0.0
    alpha = sum(c.isalpha() for c in t) / n
    spaces = sum(c.isspace() for c in t) / n
    punct_ok = sum(c in ".,!?'-:;()[]\"¡¿" for c in t) / n
    bad = len(re.findall(r"[^\w\s\.\,\!\?\-\'\:\;\(\)\[\]\"¡¿]", t)) / n
    penalty = 0.0
    if re.search(r"\b[A-Z]\b", t):
        penalty += 0.05
    if re.search(r"[0-9]{2,}", t):
        penalty += 0.08
    score = (0.62 * alpha) + (0.10 * spaces) + (0.20 * punct_ok) - (0.45 * bad) - penalty
    return max(0.0, min(1.0, score))


def quad_is_horizontal(quad, ratio_threshold=1.5) -> bool:
    """True when the quad's bbox is at least `ratio_threshold` wider than tall."""
    x1, y1, x2, y2 = quad_bbox(quad)
    return (max(1, x2 - x1) / max(1, y2 - y1)) >= ratio_threshold


def quad_is_vertical(quad, ratio_threshold=1.5) -> bool:
    """True when the quad's bbox is at least `ratio_threshold` taller than wide."""
    x1, y1, x2, y2 = quad_bbox(quad)
    return (max(1, y2 - y1) / max(1, x2 - x1)) >= ratio_threshold


# ============================================================
# ENHANCED IMAGE PREPROCESSING
# ============================================================
def enhance_image_for_ocr(image_bgr, upscale_factor=2.5):
    """Upscale + denoise + CLAHE + sharpen + binarize a page for OCR.

    Returns a 3-channel BGR image of the cleaned binary result.
    """
    h, w = image_bgr.shape[:2]
    upscaled = cv2.resize(image_bgr, (int(w * upscale_factor), int(h * upscale_factor)),
                          interpolation=cv2.INTER_CUBIC)
    gray = cv2.cvtColor(upscaled, cv2.COLOR_BGR2GRAY)
    denoised = cv2.fastNlMeansDenoising(gray, None, h=10, templateWindowSize=7,
                                        searchWindowSize=21)
    clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
    enhanced = clahe.apply(denoised)
    sharpened = cv2.filter2D(enhanced, -1,
                             np.array([[-1, -1, -1], [-1, 9, -1], [-1, -1, -1]]))
    binary = cv2.adaptiveThreshold(sharpened, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                   cv2.THRESH_BINARY, 11, 2)
    cleaned = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, np.ones((2, 2), np.uint8))
    return cv2.cvtColor(cleaned, cv2.COLOR_GRAY2BGR)


def detect_small_text_regions(image_bgr, existing_quads):
    """Find small candidate text boxes outside the already-detected quads."""
    gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
    mask = np.zeros(gray.shape, dtype=np.uint8)
    for quad in existing_quads:
        cv2.fillPoly(mask, [np.array(quad, dtype=np.int32)], 255)
    mask_inv = cv2.bitwise_not(mask)
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    binary_masked = cv2.bitwise_and(binary, binary, mask=mask_inv)
    contours, _ = cv2.findContours(binary_masked, cv2.RETR_EXTERNAL,
                                   cv2.CHAIN_APPROX_SIMPLE)
    text_regions = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        area = w * h
        # keep only plausibly glyph-sized, glyph-shaped components
        if 50 < area < 5000 and 0.1 < h / max(w, 1) < 10:
            text_regions.append((x, y, x + w, y + h))
    return text_regions


# ============================================================
# SPEECH BUBBLE DETECTION
# ============================================================
def detect_speech_bubbles(image_bgr: np.ndarray) -> List[np.ndarray]:
    """Detect candidate speech-bubble contours (area > 500 px)."""
    gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
    thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                   cv2.THRESH_BINARY_INV, 11, 2)
    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL,
                                   cv2.CHAIN_APPROX_SIMPLE)
    return [c for c in contours if cv2.contourArea(c) > 500]


def is_quad_in_bubble(quad_bbox_xyxy, bubble_contour, tolerance=5):
    """True when the quad's center lies inside (or within `tolerance` of) the contour."""
    x1, y1, x2, y2 = quad_bbox_xyxy
    cx, cy = (x1 + x2) // 2, (y1 + y2) // 2
    return cv2.pointPolygonTest(bubble_contour, (float(cx), float(cy)), False) >= -tolerance


def split_indices_by_bubble(indices, ocr, bubble_contours):
    """Partition OCR indices by containing bubble; leftovers form one group."""
    if not indices:
        return []
    bubble_groups, outside_group = {}, []
    for idx in indices:
        bbox = quad_bbox(ocr[idx][0])
        found = False
        for bidx, bubble in enumerate(bubble_contours):
            if is_quad_in_bubble(bbox, bubble):
                bubble_groups.setdefault(bidx, []).append(idx)
                found = True
                break
        if not found:
            outside_group.append(idx)
    result = list(bubble_groups.values())
    if outside_group:
        result.append(outside_group)
    return result


def check_vertical_alignment_split(indices, ocr, threshold=20):
    """Split indices into groups wherever the vertical gap exceeds `threshold`."""
    if len(indices) <= 1:
        return [indices]
    items = sorted([(idx, quad_bbox(ocr[idx][0])) for idx in indices],
                   key=lambda x: x[1][1])
    groups, current_group = [], [items[0][0]]
    for i in range(1, len(items)):
        if items[i][1][1] - items[i - 1][1][3] > threshold:
            groups.append(current_group)
            current_group = [items[i][0]]
        else:
            current_group.append(items[i][0])
    if current_group:
        groups.append(current_group)
    return groups


# ============================================================
# QUAD SIZE VALIDATION AND SPLITTING
# ============================================================
def is_quad_oversized(quad, median_height, width_threshold=8.0):
    """True when a quad is suspiciously wide relative to text height."""
    x1, y1, x2, y2 = quad_bbox(quad)
    w, h = x2 - x1, max(1, y2 - y1)
    return w > median_height * width_threshold or w / h > 12.0


def split_oversized_quad_by_content(image_bgr, quad, text, conf, median_height):
    """Split an oversized quad at the widest whitespace column in the image.

    The text is split at the space character closest to the pixel split,
    falling back to a proportional character index. Returns a list of
    (quad, text, conf) tuples (length 1 when no split is possible).
    """
    x1, y1, x2, y2 = quad_bbox(quad)
    w, h = x2 - x1, max(1, y2 - y1)
    pad = 2
    roi = image_bgr[max(0, y1 - pad):min(image_bgr.shape[0], y2 + pad),
                    max(0, x1):min(image_bgr.shape[1], x2)]
    if roi.size == 0:
        return [(quad, text, conf)]
    gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    v_proj = np.sum(binary, axis=0)
    gap_threshold = h * 255 * 0.20
    gaps, in_gap, gap_start = [], False, 0
    for x in range(len(v_proj)):
        if v_proj[x] < gap_threshold:
            if not in_gap:
                gap_start, in_gap = x, True
        else:
            if in_gap:
                gw = x - gap_start
                if gw >= max(int(median_height * 0.8), 15):
                    gaps.append((gap_start + gw // 2, gw))
                in_gap = False
    if not gaps:
        return [(quad, text, conf)]
    gaps.sort(key=lambda g: g[1], reverse=True)
    split_x_abs = max(0, x1) + gaps[0][0]
    if ' ' in text:
        char_w = w / max(1, len(text))
        split_idx = int((split_x_abs - x1) / max(1e-6, char_w))
        spaces = [i for i, c in enumerate(text) if c == ' ']
        if spaces:
            split_idx = min(spaces, key=lambda i: abs(i - split_idx))
        tl, tr = text[:split_idx].strip(), text[split_idx:].strip()
    else:
        split_idx = int(len(text) * (split_x_abs - x1) / w)
        tl, tr = text[:split_idx].strip(), text[split_idx:].strip()
    if tl and tr:
        return [([[x1, y1], [split_x_abs, y1], [split_x_abs, y2], [x1, y2]], tl, conf),
                ([[split_x_abs, y1], [x2, y1], [x2, y2], [split_x_abs, y2]], tr, conf)]
    return [(quad, text, conf)]


def validate_and_split_oversized_quads(image_bgr, filtered_ocr):
    """Split oversized quads in place; returns (new_list, number_of_splits)."""
    if not filtered_ocr:
        return filtered_ocr, 0
    heights = [max(1, quad_bbox(q)[3] - quad_bbox(q)[1]) for q, _, _ in filtered_ocr]
    median_height = float(np.median(heights)) if heights else 14.0
    result, splits_made = [], 0
    for quad, text, conf in filtered_ocr:
        if is_quad_oversized(quad, median_height, 8.0):
            sr = split_oversized_quad_by_content(image_bgr, quad, text, conf, median_height)
            if len(sr) > 1:
                result.extend(sr)
                splits_made += 1
            else:
                result.append((quad, text, conf))
        else:
            result.append((quad, text, conf))
    return result, splits_made


# ============================================================
# HORIZONTAL GAP DETECTION AT QUAD LEVEL
# ============================================================
def detect_horizontal_gap_in_group(indices, ocr, med_h, gap_factor=2.5):
    """Find the widest horizontal gap between quads; return (left, right) or None."""
    if len(indices) < 2:
        return None
    items = sorted(indices, key=lambda i: quad_center(ocr[i][0])[0])
    boxes = [quad_bbox(ocr[i][0]) for i in items]
    gap_threshold = med_h * gap_factor
    best_gap, best_split = 0.0, None
    for k in range(len(items) - 1):
        gap = boxes[k + 1][0] - boxes[k][2]
        if gap > gap_threshold and gap > best_gap:
            best_gap, best_split = gap, k
    if best_split is None:
        return None
    left_group = [items[i] for i in range(best_split + 1)]
    right_group = [items[i] for i in range(best_split + 1, len(items))]
    if not left_group or not right_group:
        return None
    return (left_group, right_group)


def orientation_compatible(idx_a, idx_b, ocr):
    """False when one quad is clearly vertical and the other clearly horizontal."""
    ba = quad_bbox(ocr[idx_a][0])
    bb = quad_bbox(ocr[idx_b][0])
    wa, ha = max(1, ba[2] - ba[0]), max(1, ba[3] - ba[1])
    wb, hb = max(1, bb[2] - bb[0]), max(1, bb[3] - bb[1])
    ra, rb = wa / ha, wb / hb
    if (ra < 0.6 and rb > 2.0) or (rb < 0.6 and ra > 2.0):
        return False
    return True


# ============================================================
# WIDE QUAD COLUMN SPLIT — pre-grouping
# ============================================================
def split_wide_quad_by_column_gap(image_bgr, quad, text, conf, med_h, min_gap_factor=1.8):
    """Split a wide quad at a significant whitespace column before grouping.

    More conservative than split_oversized_quad_by_content: the split point
    must leave at least `med_h` of quad on each side.
    """
    x1, y1, x2, y2 = quad_bbox(quad)
    w, h = x2 - x1, max(1, y2 - y1)
    if w < med_h * 3.0:
        return [(quad, text, conf)]
    pad = 2
    roi = image_bgr[max(0, y1 - pad):min(image_bgr.shape[0], y2 + pad),
                    max(0, x1):min(image_bgr.shape[1], x2)]
    if roi.size == 0:
        return [(quad, text, conf)]
    gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    v_proj = np.sum(binary, axis=0)
    gap_threshold = h * 255 * 0.12
    min_gap_px = max(int(med_h * min_gap_factor), 10)
    gaps, in_gap, gap_start = [], False, 0
    for x in range(len(v_proj)):
        if v_proj[x] < gap_threshold:
            if not in_gap:
                gap_start, in_gap = x, True
        else:
            if in_gap:
                gw = x - gap_start
                if gw >= min_gap_px:
                    gaps.append((gap_start + gw // 2, gw))
                in_gap = False
    if not gaps:
        return [(quad, text, conf)]
    gaps.sort(key=lambda g: g[1], reverse=True)
    split_x_rel = gaps[0][0]
    split_x_abs = x1 + split_x_rel
    if split_x_abs - x1 < med_h or x2 - split_x_abs < med_h:
        return [(quad, text, conf)]
    if ' ' in text:
        char_w = w / max(1, len(text))
        split_idx = int(split_x_rel / max(1e-6, char_w))
        spaces = [i for i, c in enumerate(text) if c == ' ']
        if spaces:
            split_idx = min(spaces, key=lambda i: abs(i - split_idx))
        tl, tr = text[:split_idx].strip(), text[split_idx:].strip()
    else:
        split_idx = int(len(text) * split_x_rel / w)
        tl, tr = text[:split_idx].strip(), text[split_idx:].strip()
    if tl and tr:
        return [([[x1, y1], [split_x_abs, y1], [split_x_abs, y2], [x1, y2]], tl, conf),
                ([[split_x_abs, y1], [x2, y1], [x2, y2], [split_x_abs, y2]], tr, conf)]
    return [(quad, text, conf)]
def apply_column_gap_splits(image_bgr, ocr_list, med_h):
    """Run split_wide_quad_by_column_gap over every OCR item.

    Returns (new_ocr_list, number_of_quads_that_were_split).
    """
    result, splits_made = [], 0
    for quad, text, conf in ocr_list:
        parts = split_wide_quad_by_column_gap(image_bgr, quad, text, conf, med_h)
        if len(parts) > 1:
            splits_made += 1
        result.extend(parts)
    if splits_made:
        print(f"📐 Column-gap split: {splits_made} wide quad(s) split before grouping")
    return result, splits_made

# ============================================================
# GENERALIZED BOX FIXING FUNCTIONS
# ============================================================
def detect_and_split_multi_bubble_boxes(bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr, image_bgr):
    """Split grouped boxes that actually span more than one speech bubble.

    Tries three strategies per box, in order, keeping the first that applies:
      1. contour-based split (split_indices_by_bubble against detected bubbles),
      2. vertical-alignment split (check_vertical_alignment_split),
      3. a median-x split for extremely wide boxes (> 10 line-heights) when a
         clear horizontal gap separates a left and a right token group.
    Boxes are re-numbered from 1; returns the four rebuilt parallel dicts
    (bubbles, boxes, quads, indices).
    """
    all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1]) for i in range(len(ocr))]
    med_h = float(np.median(all_h)) if all_h else 14.0
    bubble_contours = detect_speech_bubbles(image_bgr)
    new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {}
    next_bid, splits_made = 1, []
    for bid, indices in bubble_indices.items():
        # Single-token boxes cannot span multiple bubbles — keep as-is.
        if len(indices) < 2:
            new_bubbles[next_bid] = bubbles[bid]
            new_boxes[next_bid] = bubble_boxes[bid]
            new_quads[next_bid] = bubble_quads[bid]
            new_indices[next_bid] = indices
            next_bid += 1
            continue
        # Strategy 1: split by detected bubble contours.
        split_groups = split_indices_by_bubble(indices, ocr, bubble_contours)
        if len(split_groups) > 1:
            for group in split_groups:
                if group:
                    new_bubbles[next_bid] = build_lines_from_indices(group, ocr)
                    new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in group])
                    new_quads[next_bid] = [ocr[i][0] for i in group]
                    new_indices[next_bid] = group
                    next_bid += 1
            splits_made.append(f"BOX#{bid} → {len(split_groups)} bubbles")
            continue
        # Strategy 2: split on large vertical misalignment (threshold = 2 line-heights).
        vertical_splits = check_vertical_alignment_split(indices, ocr, threshold=int(med_h * 2.0))
        if len(vertical_splits) > 1:
            for group in vertical_splits:
                if group:
                    new_bubbles[next_bid] = build_lines_from_indices(group, ocr)
                    new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in group])
                    new_quads[next_bid] = [ocr[i][0] for i in group]
                    new_indices[next_bid] = group
                    next_bid += 1
            splits_made.append(f"BOX#{bid} → {len(vertical_splits)} vertical groups")
            continue
        # Strategy 3: very wide boxes may be two side-by-side panels.
        box = bubble_boxes[bid]
        x1, y1, x2, y2 = box
        if (x2 - x1) > med_h * 10:
            x_centers = [quad_center(ocr[i][0])[0] for i in indices]
            x_median = np.median(x_centers)
            left_group = [i for i in indices if quad_center(ocr[i][0])[0] < x_median]
            right_group = [i for i in indices if quad_center(ocr[i][0])[0] >= x_median]
            if left_group and right_group:
                left_box = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in left_group])
                right_box = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in right_group])
                # Only split when the two halves are separated by a real gap.
                if right_box[0] - left_box[2] > med_h * 1.5:
                    for grp in [left_group, right_group]:
                        new_bubbles[next_bid] = build_lines_from_indices(grp, ocr)
                        new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in grp])
                        new_quads[next_bid] = [ocr[i][0] for i in grp]
                        new_indices[next_bid] = grp
                        next_bid += 1
                    splits_made.append(f"BOX#{bid} → 2 horizontal panels")
                    continue
        # No split applied — keep the original box under a new id.
        new_bubbles[next_bid] = bubbles[bid]
        new_boxes[next_bid] = bubble_boxes[bid]
        new_quads[next_bid] = bubble_quads[bid]
        new_indices[next_bid] = indices
        next_bid += 1
    if splits_made:
        print(f"\n🔧 Split {len(splits_made)} multi-bubble box(es):")
        for s in splits_made:
            print(f" ✓ {s}")
    return new_bubbles, new_boxes, new_quads, new_indices

def detect_and_merge_fragmented_bubbles(bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr, image_bgr):
    """Merge boxes whose centers both fall inside the same detected bubble contour.

    Pairs must also be close (x within 3 line-heights, y within 6). Pairs are
    clustered into merge groups (NOTE(review): the grouping is a single greedy
    pass, not a full union-find — chains discovered out of order may land in
    separate groups), then the four parallel dicts are rebuilt with new ids.
    """
    all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1]) for i in range(len(ocr))]
    med_h = float(np.median(all_h)) if all_h else 14.0
    bubble_contours = detect_speech_bubbles(image_bgr)
    bids = list(bubble_boxes.keys())
    to_merge = []
    for i in range(len(bids)):
        for j in range(i + 1, len(bids)):
            bid_i, bid_j = bids[i], bids[j]
            box_i, box_j = bubble_boxes[bid_i], bubble_boxes[bid_j]
            cx_i = (box_i[0] + box_i[2]) / 2.0
            cy_i = (box_i[1] + box_i[3]) / 2.0
            cx_j = (box_j[0] + box_j[2]) / 2.0
            cy_j = (box_j[1] + box_j[3]) / 2.0
            # Both centers inside (or on) the same bubble contour?
            in_same_bubble = any(
                cv2.pointPolygonTest(c, (cx_i, cy_i), False) >= 0 and
                cv2.pointPolygonTest(c, (cx_j, cy_j), False) >= 0
                for c in bubble_contours
            )
            if in_same_bubble:
                if abs(cx_i - cx_j) < med_h * 3.0 and abs(cy_i - cy_j) < med_h * 6.0:
                    # Order the pair top-first so merged text reads downward.
                    to_merge.append((bid_i, bid_j) if cy_i < cy_j else (bid_j, bid_i))
    if not to_merge:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices
    print(f"\n🔗 Merging {len(to_merge)} fragmented bubble(s):")
    # Greedy clustering of overlapping pairs into merge groups.
    merge_groups = {}
    for top, bottom in to_merge:
        found = False
        for key in merge_groups:
            if top in merge_groups[key] or bottom in merge_groups[key]:
                merge_groups[key].update({top, bottom})
                found = True; break
        if not found:
            merge_groups[len(merge_groups)] = {top, bottom}
    new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {}
    merged_bids, next_bid = set(), 1
    for merge_set in merge_groups.values():
        merge_list = sorted(merge_set)
        print(f" ✓ Merging: {', '.join(f'#{b}' for b in merge_list)}")
        all_indices = sorted(set(idx for b in merge_list for idx in bubble_indices[b]))
        for b in merge_list:
            merged_bids.add(b)
        new_bubbles[next_bid] = build_lines_from_indices(all_indices, ocr)
        new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in all_indices])
        new_quads[next_bid] = [ocr[i][0] for i in all_indices]
        new_indices[next_bid] = all_indices
        next_bid += 1
    # Carry over the untouched boxes.
    for bid in bids:
        if bid not in merged_bids:
            new_bubbles[next_bid] = bubbles[bid]
            new_boxes[next_bid] = bubble_boxes[bid]
            new_quads[next_bid] = bubble_quads[bid]
            new_indices[next_bid] = bubble_indices[bid]
            next_bid += 1
    return new_bubbles, new_boxes, new_quads, new_indices

def merge_boxes_by_proximity_and_overlap(bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr, med_h):
    """
    Merges boxes that are vertically close AND share significant horizontal overlap.
    Single-quad boxes participate fully — no isolation treatment.
    This fixes BOX#2+#16, BOX#8+#21, BOX#9+#22 type problems where a single-line
    detection sits directly above/below a multi-line box in the same speech bubble.
    Merge criteria (both must be true):
      1. Vertical gap ≤ 1.5 × med_h
      2.
    Horizontal overlap ratio ≥ 0.35
    """
    bids = sorted(bubble_boxes.keys())
    # merge_map: root bid -> list of bids merged into it.
    merge_map: Dict[int, List[int]] = {}
    # merged_into: bid -> root bid it was absorbed by.
    merged_into: Dict[int, int] = {}
    for i, bid_i in enumerate(bids):
        if bid_i in merged_into:
            continue
        box_i = bubble_boxes[bid_i]
        wi = max(1, box_i[2] - box_i[0])
        for j in range(i + 1, len(bids)):
            bid_j = bids[j]
            if bid_j in merged_into:
                continue
            box_j = bubble_boxes[bid_j]
            wj = max(1, box_j[2] - box_j[0])
            # Vertical gap between the boxes (0 if they overlap vertically).
            vert_gap = max(0, max(box_i[1], box_j[1]) - min(box_i[3], box_j[3]))
            h_ix1 = max(box_i[0], box_j[0])
            h_ix2 = min(box_i[2], box_j[2])
            h_overlap = max(0, h_ix2 - h_ix1)
            # Overlap relative to the narrower box.
            h_overlap_ratio = h_overlap / max(1, min(wi, wj))
            if vert_gap <= med_h * 1.5 and h_overlap_ratio >= 0.35:
                root = merged_into.get(bid_i, bid_i)
                merge_map.setdefault(root, [root])
                if bid_j not in merge_map[root]:
                    merge_map[root].append(bid_j)
                merged_into[bid_j] = root
    if not merge_map:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices
    print(f"\n🔀 Proximity+overlap merge: {len(merge_map)} group(s):")
    new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {}
    processed, next_bid = set(), 1
    for root, group in merge_map.items():
        group_unique = sorted(set(group))
        print(f" ✓ Merging: {', '.join(f'#{b}' for b in group_unique)}")
        all_indices = sorted(set(idx for b in group_unique for idx in bubble_indices[b]))
        new_bubbles[next_bid] = build_lines_from_indices(all_indices, ocr)
        new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in all_indices])
        new_quads[next_bid] = [ocr[i][0] for i in all_indices]
        new_indices[next_bid] = all_indices
        next_bid += 1
        processed.update(group_unique)
    # Carry over boxes that were not part of any merge group.
    for bid in bids:
        if bid not in processed:
            new_bubbles[next_bid] = bubbles[bid]
            new_boxes[next_bid] = bubble_boxes[bid]
            new_quads[next_bid] = bubble_quads[bid]
            new_indices[next_bid] = bubble_indices[bid]
            next_bid += 1
    return new_bubbles, new_boxes, new_quads, new_indices

def auto_fix_bubble_detection(bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr, image_bgr):
    """
    Full fix pipeline:
      1. Split boxes that span multiple speech bubbles.
      2. Merge fragments detected inside the same contour.
      3. Merge fragments missed by contour detection (proximity+overlap) — pass 1.
      4. Second proximity pass — catches chains resolved after pass 1.
    """
    print("\n🔍 Running automatic bubble detection fixes...")
    all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1]) for i in range(len(ocr))]
    med_h = float(np.median(all_h)) if all_h else 14.0
    bubbles, bubble_boxes, bubble_quads, bubble_indices = \
        detect_and_split_multi_bubble_boxes(
            bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr, image_bgr)
    bubbles, bubble_boxes, bubble_quads, bubble_indices = \
        detect_and_merge_fragmented_bubbles(
            bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr, image_bgr)
    # Pass 1
    bubbles, bubble_boxes, bubble_quads, bubble_indices = \
        merge_boxes_by_proximity_and_overlap(
            bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr, med_h)
    # Pass 2 — catches chains only visible after pass 1
    bubbles, bubble_boxes, bubble_quads, bubble_indices = \
        merge_boxes_by_proximity_and_overlap(
            bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr, med_h)
    return bubbles, bubble_boxes, bubble_quads, bubble_indices

def remove_nested_boxes(bubble_boxes, bubble_indices, bubble_quads, bubbles, overlap_threshold=0.50):
    """Drop the smaller of any two boxes that overlap heavily or share OCR indices.

    Mutates the four dicts in place (pops removed ids) and returns them.
    """
    bids = list(bubble_boxes.keys())
    to_remove = set()
    for i in range(len(bids)):
        bid_i = bids[i]
        if bid_i in to_remove:
            continue
        box_i = bubble_boxes[bid_i]
        area_i = max(0, box_i[2]-box_i[0]) * max(0, box_i[3]-box_i[1])
        for j in range(i + 1, len(bids)):
            bid_j = bids[j]
            if bid_j in to_remove:
                continue
            box_j = bubble_boxes[bid_j]
            area_j = max(0, box_j[2]-box_j[0]) * max(0, box_j[3]-box_j[1])
            # Two boxes claiming the same OCR token are also duplicates.
            shared = set(bubble_indices[bid_i]).intersection(bubble_indices[bid_j])
            overlap = boxes_overlap_ratio(box_i, box_j)
            if overlap > overlap_threshold or len(shared) > 0:
                # Keep the larger box.
                if area_i >= area_j:
                    to_remove.add(bid_j)
                    print(f" 🗑️ Removing BOX#{bid_j} (overlaps BOX#{bid_i})")
                else:
                    to_remove.add(bid_i)
                    print(f" 🗑️ Removing BOX#{bid_i} (overlaps BOX#{bid_j})")
                    # bid_i is gone — stop comparing it against the rest.
                    break
    if to_remove:
        print(f"\n🧹 Removed {len(to_remove)} overlapping/nested box(es)")
    for bid in to_remove:
        bubble_boxes.pop(bid, None)
        bubble_indices.pop(bid, None)
        bubble_quads.pop(bid, None)
        bubbles.pop(bid, None)
    return bubbles, bubble_boxes, bubble_quads, bubble_indices

def enforce_max_box_size(bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr, max_width_ratio=0.6, max_height_ratio=0.5, image_shape=None):
    """Split boxes larger than a fraction of the image (width > 60% / height > 50%).

    Tries a column split first (aggressive thresholds), then a row split; boxes
    that cannot be split are kept. Returns rebuilt parallel dicts with new ids.
    """
    if image_shape is None:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices
    ih, iw = image_shape[:2]
    max_width, max_height = iw * max_width_ratio, ih * max_height_ratio
    new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {}
    next_bid, splits_made = 1, []
    for bid, box in bubble_boxes.items():
        x1, y1, x2, y2 = box
        w, h = x2 - x1, y2 - y1
        if w > max_width or h > max_height:
            indices = bubble_indices[bid]
            col_split = split_bubble_if_multiple_columns(indices, ocr, bid=bid, use_aggressive_thresholds=True)
            if col_split:
                for grp in col_split:
                    new_bubbles[next_bid] = build_lines_from_indices(grp, ocr)
                    new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in grp])
                    new_quads[next_bid] = [ocr[i][0] for i in grp]
                    new_indices[next_bid] = grp
                    next_bid += 1
                splits_made.append(f"BOX#{bid} (oversized: {w}x{h}px)")
                continue
            row_split = split_bubble_if_multiple_rows(indices, ocr, bid=bid)
            if row_split:
                for grp in row_split:
                    new_bubbles[next_bid] = build_lines_from_indices(grp, ocr)
                    new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in grp])
                    new_quads[next_bid] = [ocr[i][0] for i in grp]
                    new_indices[next_bid] = grp
                    next_bid += 1
                splits_made.append(f"BOX#{bid} (oversized: {w}x{h}px)")
                continue
        # Within limits, or unsplittable — keep as-is under a new id.
        new_bubbles[next_bid] = bubbles[bid]
        new_boxes[next_bid] = box
        new_quads[next_bid] = bubble_quads[bid]
        new_indices[next_bid] = bubble_indices[bid]
        next_bid += 1
    if splits_made:
        print(f"\n📏 Split {len(splits_made)} oversized box(es):")
        for s in splits_made:
            print(f" ✓ {s}")
    return new_bubbles, new_boxes, new_quads, new_indices

def should_merge_groups(group1_indices, group2_indices, ocr, median_height, max_vertical_gap=None):
    """True when two token groups are roughly column-aligned and vertically close.

    Alignment: union-box centers within 1.8 × median_height horizontally.
    Closeness: vertical gap between the union boxes ≤ max_vertical_gap
    (default 2.5 × median_height).
    """
    if max_vertical_gap is None:
        max_vertical_gap = median_height * 2.5
    box1 = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in group1_indices])
    box2 = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in group2_indices])
    if box1 is None or box2 is None:
        return False
    cx1 = (box1[0] + box1[2]) / 2.0
    cx2 = (box2[0] + box2[2]) / 2.0
    if abs(cx1 - cx2) > median_height * 1.8:
        return False
    vertical_gap = max(0, max(box1[1], box2[1]) - min(box1[3], box2[3]))
    return vertical_gap <= max_vertical_gap

# ============================================================
# ENHANCED OCR ENGINE
# ============================================================
class ImprovedMacVisionDetector:
    """Multi-pass Apple Vision OCR: runs several preprocessing variants and
    merges their detections by overlap clustering with confidence voting."""

    def __init__(self, source_lang="en"):
        lang_key = source_lang.lower().strip()
        # Map short / long language names to BCP-47 codes Vision expects.
        lang_map = {
            "en": "en-US", "english": "en-US",
            "es": "es-ES", "spanish": "es-ES",
            "ca": "ca-ES", "catalan": "ca-ES",
            "fr": "fr-FR", "french": "fr-FR",
            "ja": "ja-JP", "japanese": "ja-JP",
            "it": "it-IT", "italian": "it-IT",
            "de": "de-DE", "german": "de-DE",
            "ko": "ko-KR", "korean": "ko-KR",
            "zh": "zh-Hans", "chinese": "zh-Hans"
        }
        self.langs = [lang_map.get(lang_key, "en-US")]
        print(f"⚡ Using Enhanced Apple Vision OCR (Language: {self.langs[0]})")

    def preprocess_variants(self, image_bgr):
        """Build the list of (name, image) preprocessing variants, all at 2.5x."""
        variants = [("enhanced", enhance_image_for_ocr(image_bgr, upscale_factor=2.5))]
        gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
        _, hc = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        variants.append(("high_contrast", cv2.cvtColor(cv2.resize(hc, None, fx=2.5, fy=2.5, interpolation=cv2.INTER_CUBIC), cv2.COLOR_GRAY2BGR)))
        variants.append(("bilateral", cv2.resize(cv2.bilateralFilter(image_bgr, 9, 75, 75), None, fx=2.5, fy=2.5, interpolation=cv2.INTER_CUBIC)))
        # Inverted variant helps with light-on-dark text.
        variants.append(("inverted", cv2.resize(cv2.bitwise_not(image_bgr), None, fx=2.5, fy=2.5, interpolation=cv2.INTER_CUBIC)))
        variants.append(("original", cv2.resize(image_bgr, None, fx=2.5, fy=2.5, interpolation=cv2.INTER_CUBIC)))
        return variants

    def run_vision_ocr(self, image_bgr):
        """Run one Vision text-recognition pass; returns [(quad, text, conf), ...].

        Quads are axis-aligned pixel rectangles in top-left-origin coordinates
        (Vision reports normalized bottom-left-origin boxes; converted here).
        """
        if image_bgr is None or image_bgr.size == 0:
            return []
        ih, iw = image_bgr.shape[:2]
        success, buffer = cv2.imencode('.png', image_bgr)
        if not success:
            return []
        ns_data = NSData.dataWithBytes_length_(buffer.tobytes(), len(buffer.tobytes()))
        handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(ns_data, None)
        results = []
        def completion_handler(request, error):
            if error:
                return
            for obs in request.results():
                candidate = obs.topCandidates_(1)[0]
                text, conf = candidate.string(), candidate.confidence()
                bbox = obs.boundingBox()
                # Normalized bottom-left-origin → pixel top-left-origin.
                x = bbox.origin.x * iw
                y_bl = bbox.origin.y * ih
                w = bbox.size.width * iw
                h = bbox.size.height * ih
                y = ih - y_bl - h
                quad = [[int(x),int(y)],[int(x+w),int(y)], [int(x+w),int(y+h)],[int(x),int(y+h)]]
                results.append((quad, text, conf))
        req = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(completion_handler)
        req.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate)
        # Language correction off here — multi-pass voting handles noise instead.
        req.setUsesLanguageCorrection_(False)
        req.setRecognitionLanguages_(self.langs)
        req.setAutomaticallyDetectsLanguage_(True)
        handler.performRequests_error_([req], None)
        return results

    def merge_multi_pass_results(self, all_results, original_shape):
        """Fuse detections from all preprocessing variants.

        Scales quads back to original coordinates (variants were 2.5x), clusters
        overlapping quads (IoU > 0.5), and inside each cluster picks the
        highest-confidence quad while letting confidence-weighted text voting
        override the best candidate's text.
        """
        if not all_results:
            return []
        scale_factor = 2.5
        normalized = []
        for variant_name, results in all_results:
            for quad, text, conf in results:
                sq = [[int(p[0]/scale_factor), int(p[1]/scale_factor)] for p in quad]
                normalized.append((sq, text, conf, variant_name))
        def quads_overlap(q1, q2, threshold=0.5):
            # IoU test on the quads' bounding boxes.
            b1, b2 = quad_bbox(q1), quad_bbox(q2)
            x1, y1 = max(b1[0],b2[0]), max(b1[1],b2[1])
            x2, y2 = min(b1[2],b2[2]), min(b1[3],b2[3])
            if x2 < x1 or y2 < y1:
                return False
            inter = (x2-x1)*(y2-y1)
            union = ((b1[2]-b1[0])*(b1[3]-b1[1]) + (b2[2]-b2[0])*(b2[3]-b2[1]) - inter)
            return inter / max(union, 1) > threshold
        clusters, used = [], set()
        for i, (q1, t1, c1, v1) in enumerate(normalized):
            if i in used:
                continue
            cluster = [(q1, t1, c1, v1)]
            used.add(i)
            for j, (q2, t2, c2, v2) in enumerate(normalized):
                if j in used or i == j:
                    continue
                if quads_overlap(q1, q2):
                    cluster.append((q2, t2, c2, v2))
                    used.add(j)
            clusters.append(cluster)
        final_results = []
        for cluster in clusters:
            # Highest-confidence member supplies quad/conf.
            cluster.sort(key=lambda x: x[2], reverse=True)
            best_quad, best_text, best_conf, _ = cluster[0]
            # Confidence-weighted vote over normalized texts.
            text_votes = {}
            for _, text, conf, _ in cluster:
                n = normalize_text(text)
                if n:
                    text_votes[n] = text_votes.get(n, 0) + conf
            if text_votes:
                voted = max(text_votes.items(), key=lambda x: x[1])[0]
                if voted != normalize_text(best_text):
                    best_text = voted
            final_results.append((best_quad, fix_common_ocr_errors(best_text), best_conf))
        return final_results

    def read(self, image_path_or_array):
        """OCR a path or BGR array through all variants and merge the passes."""
        img = cv2.imread(image_path_or_array) if isinstance(image_path_or_array, str) \
            else image_path_or_array
        if img is None or img.size == 0:
            return []
        variants = self.preprocess_variants(img)
        all_results = []
        for vname, vimg in variants:
            r = self.run_vision_ocr(vimg)
            if r:
                all_results.append((vname, r))
        return self.merge_multi_pass_results(all_results, img.shape)

class MacVisionDetector:
    """Single-pass Apple Vision OCR (language correction enabled)."""

    def __init__(self, source_lang="en"):
        lang_key = source_lang.lower().strip()
        # Same language mapping as ImprovedMacVisionDetector.
        lang_map = {
            "en": "en-US", "english": "en-US",
            "es": "es-ES", "spanish": "es-ES",
            "ca": "ca-ES", "catalan": "ca-ES",
            "fr": "fr-FR", "french": "fr-FR",
            "ja": "ja-JP", "japanese": "ja-JP",
            "it": "it-IT", "italian": "it-IT",
            "de": "de-DE", "german": "de-DE",
            "ko": "ko-KR", "korean": "ko-KR",
            "zh": "zh-Hans", "chinese": "zh-Hans"
        }
        self.langs = [lang_map.get(lang_key, "en-US")]
        print(f"⚡ Using Apple Vision OCR (Language: {self.langs[0]})")

    def read(self, image_path_or_array):
        """OCR a path or BGR array once; returns [(quad, text, conf), ...]."""
        img = cv2.imread(image_path_or_array) if isinstance(image_path_or_array, str) \
            else image_path_or_array
        if img is None or img.size == 0:
            return []
        ih, iw = img.shape[:2]
        success, buffer = cv2.imencode('.png', img)
        if not success:
            return []
        ns_data = NSData.dataWithBytes_length_(buffer.tobytes(), len(buffer.tobytes()))
        handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(ns_data, None)
        results = []
        def completion_handler(request, error):
            if error:
                return
            for obs in request.results():
                candidate = obs.topCandidates_(1)[0]
                text, conf = candidate.string(), candidate.confidence()
                bbox = obs.boundingBox()
                # Normalized bottom-left-origin → pixel top-left-origin.
                x = bbox.origin.x * iw
                y_bl = bbox.origin.y * ih
                w = bbox.size.width * iw
                h = bbox.size.height * ih
                y = ih - y_bl - h
                quad = [[int(x),int(y)],[int(x+w),int(y)], [int(x+w),int(y+h)],[int(x),int(y+h)]]
                results.append((quad, text, conf))
        req = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(completion_handler)
        req.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate)
        req.setUsesLanguageCorrection_(True)
        req.setRecognitionLanguages_(self.langs)
        req.setAutomaticallyDetectsLanguage_(True)
        handler.performRequests_error_([req], None)
        return results

# ============================================================
# COLUMN / ROW SPLITTING
# ============================================================
def split_bubble_if_multiple_columns(indices, ocr, bid=None, use_aggressive_thresholds=False):
    """Split a token group at the largest x-center gap, if wide enough.

    Returns (left_indices, right_indices) or None when no qualifying gap.
    """
    if len(indices) < 2:
        return None
    boxes = [quad_bbox(ocr[i][0]) for i in indices]
    hs = [max(1, b[3]-b[1]) for b in boxes]
    med_h = float(np.median(hs)) if hs else 12.0
    xs = [(b[0]+b[2])/2.0 for b in boxes]
    xs_sorted = sorted(xs)
    # Aggressive mode lowers the required gap (used for oversized boxes).
    gap_thresh = max(med_h*1.2, 18) if use_aggressive_thresholds else max(med_h*1.5, 22)
    best_gap_idx, best_gap_size = None, 0.0
    for i in range(len(xs_sorted) - 1):
        gap = xs_sorted[i+1] - xs_sorted[i]
        if gap > gap_thresh and gap > best_gap_size:
            best_gap_size, best_gap_idx = gap, i
    if best_gap_idx is None:
        return None
    split_x = (xs_sorted[best_gap_idx] + xs_sorted[best_gap_idx+1]) / 2.0
    left_idxs = [i for i in indices if (quad_bbox(ocr[i][0])[0]+quad_bbox(ocr[i][0])[2])/2.0 < split_x]
    right_idxs = [i for i in indices if (quad_bbox(ocr[i][0])[0]+quad_bbox(ocr[i][0])[2])/2.0
                  >= split_x]
    if not left_idxs or not right_idxs:
        return None
    return (left_idxs, right_idxs)

def split_bubble_if_multiple_rows(indices, ocr, bid=None):
    """Split a token group at the largest y-center gap (> max(2×med_h, 30) px).

    Returns (top_indices, bottom_indices) or None when no qualifying gap.
    """
    if len(indices) < 2:
        return None
    boxes = [quad_bbox(ocr[i][0]) for i in indices]
    hs = [max(1, b[3]-b[1]) for b in boxes]
    med_h = float(np.median(hs)) if hs else 12.0
    ys = [(b[1]+b[3])/2.0 for b in boxes]
    ys_sorted = sorted(ys)
    gap_thresh = max(med_h * 2.0, 30)
    best_gap_idx, best_gap_size = None, 0.0
    for i in range(len(ys_sorted) - 1):
        gap = ys_sorted[i+1] - ys_sorted[i]
        if gap > gap_thresh and gap > best_gap_size:
            best_gap_size, best_gap_idx = gap, i
    if best_gap_idx is None:
        return None
    split_y = (ys_sorted[best_gap_idx] + ys_sorted[best_gap_idx+1]) / 2.0
    top_idxs = [i for i in indices if (quad_bbox(ocr[i][0])[1]+quad_bbox(ocr[i][0])[3])/2.0 < split_y]
    bot_idxs = [i for i in indices if (quad_bbox(ocr[i][0])[1]+quad_bbox(ocr[i][0])[3])/2.0 >= split_y]
    if not top_idxs or not bot_idxs:
        return None
    return (top_idxs, bot_idxs)

def split_cluster_by_big_vertical_gap(indices, ocr, factor=1.9, min_gap=22):
    """Split a cluster at the biggest edge-to-edge vertical gap between boxes.

    Unlike split_bubble_if_multiple_rows this measures the gap between the
    bottom of one box and the top of the next (not center distance).
    Returns (top_indices, bottom_indices) or None.
    """
    if len(indices) < 2:
        return None
    boxes = [quad_bbox(ocr[i][0]) for i in indices]
    hs = [max(1, b[3]-b[1]) for b in boxes]
    med_h = float(np.median(hs)) if hs else 12.0
    # Sort by vertical center, then scan consecutive edge gaps.
    items = sorted([(i, quad_bbox(ocr[i][0])) for i in indices], key=lambda x: (x[1][1]+x[1][3])/2.0)
    gap_thresh = max(med_h * factor, min_gap)
    best_gap, best_split_idx = 0.0, None
    for k in range(len(items) - 1):
        gap = items[k+1][1][1] - items[k][1][3]
        if gap > gap_thresh and gap > best_gap:
            best_gap, best_split_idx = gap, k
    if best_split_idx is None:
        return None
    top_idxs = [it[0] for it in items[:best_split_idx+1]]
    bot_idxs = [it[0] for it in items[best_split_idx+1:]]
    if not top_idxs or not bot_idxs:
        return None
    return (top_idxs, bot_idxs)

def is_vertical_text_like(indices, ocr):
    """Heuristic: does this token group look like a vertical text run?

    Requires boxes that are taller than wide (median) and a vertical spread
    at least 1.5× the horizontal spread.
    """
    if len(indices) < 2:
        return False
    boxes = [quad_bbox(ocr[i][0]) for i in indices]
    med_h = float(np.median([max(1, b[3]-b[1]) for b in boxes]))
    med_w = float(np.median([max(1, b[2]-b[0])
                                  for b in boxes]))
    if med_h < med_w * 1.2:
        return False
    xs = [(b[0]+b[2])/2.0 for b in boxes]
    ys = [(b[1]+b[3])/2.0 for b in boxes]
    if (max(ys)-min(ys)) < (max(xs)-min(xs)) * 1.5:
        return False
    return True

def split_nested_or_side_by_side(indices, ocr):
    """Unconditionally split a group into left/right halves at the median x-center.

    Returns (left_indices, right_indices) or None when one side is empty.
    """
    if len(indices) < 2:
        return None
    xs = sorted([(quad_bbox(ocr[i][0])[0]+quad_bbox(ocr[i][0])[2])/2.0 for i in indices])
    mid_idx = len(xs) // 2
    split_x = (xs[mid_idx-1] + xs[mid_idx]) / 2.0
    left_idxs = [i for i in indices if (quad_bbox(ocr[i][0])[0]+quad_bbox(ocr[i][0])[2])/2.0 < split_x]
    right_idxs = [i for i in indices if (quad_bbox(ocr[i][0])[0]+quad_bbox(ocr[i][0])[2])/2.0 >= split_x]
    if not left_idxs or not right_idxs:
        return None
    return (left_idxs, right_idxs)

def split_panel_box(image_bgr, box_xyxy, bubble_quads=None):
    """Find a vertical panel border inside a box using edge density.

    Searches the central 35–65% band for strong Canny-edge columns and takes
    their median x as the border. If bubble_quads is given, the split is
    rejected unless both sides contain at least one quad center.
    Returns (x1, x2, split_x) or None.
    """
    x1, y1, x2, y2 = box_xyxy
    ih, iw = image_bgr.shape[:2]
    x1, y1 = max(0, x1), max(0, y1)
    x2, y2 = min(iw-1, x2), min(ih-1, y2)
    if x2 <= x1 or y2 <= y1:
        return None
    crop = image_bgr[y1:y2, x1:x2]
    if crop.size == 0:
        return None
    gray = cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 50, 150)
    h_proj = np.sum(edges, axis=0)
    w = x2 - x1
    # Too narrow to be a two-panel box.
    if w < 100:
        return None
    search_start = int(w * 0.35)
    search_end = int(w * 0.65)
    if search_end <= search_start:
        return None
    region = h_proj[search_start:search_end]
    if len(region) == 0:
        return None
    # Candidate border columns: top 15% edge density within the band.
    threshold = np.percentile(region, 85)
    candidates = [x1 + search_start + rx for rx in range(len(region)) if region[rx] >= threshold]
    if not candidates:
        return None
    split_x = int(np.median(candidates))
    if bubble_quads:
        lc = sum(1 for q in bubble_quads if quad_center(q)[0] < split_x)
        rc = len(bubble_quads) - lc
        if lc == 0 or rc == 0:
            return None
    return (x1, x2, split_x)

# ============================================================
# MERGE CLOSE BUBBLES
# ============================================================
def merge_close_bubbles_by_line_height(bubbles, bubble_boxes, bubble_quads, bubble_indices, ocr):
    """
    Merges boxes that are spatially very close on BOTH axes AND share meaningful
    horizontal overlap (same column). Single-quad boxes participate fully — no
    special isolation treatment. The h_overlap_ratio >= 0.25 guard prevents
    merging horizontally adjacent distinct bubbles.
    """
    if not bubbles:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices
    all_h = [max(1, quad_bbox(ocr[i][0])[3]-quad_bbox(ocr[i][0])[1]) for i in range(len(ocr))]
    med_h = float(np.median(all_h)) if all_h else 14.0
    merge_tol = max(8, med_h * 1.4)
    bids = sorted(bubble_boxes.keys())
    merged_set, merge_map = set(), {}
    for i, bid_i in enumerate(bids):
        if bid_i in merged_set:
            continue
        x1_i, y1_i, x2_i, y2_i = bubble_boxes[bid_i]
        wi = max(1, x2_i - x1_i)
        for j in range(i + 1, len(bids)):
            bid_j = bids[j]
            if bid_j in merged_set:
                continue
            x1_j, y1_j, x2_j, y2_j = bubble_boxes[bid_j]
            wj = max(1, x2_j - x1_j)
            # Edge-to-edge gaps on each axis (0 when overlapping).
            gap_x = max(0, max(x1_i, x1_j) - min(x2_i, x2_j))
            gap_y = max(0, max(y1_i, y1_j) - min(y2_i, y2_j))
            h_ix1 = max(x1_i, x1_j)
            h_ix2 = min(x2_i, x2_j)
            h_overlap = max(0, h_ix2 - h_ix1)
            h_overlap_ratio = h_overlap / max(1, min(wi, wj))
            if gap_x <= merge_tol and gap_y <= merge_tol and h_overlap_ratio >= 0.25:
                if bid_i not in merge_map:
                    merge_map[bid_i] = [bid_i]
                merge_map[bid_i].append(bid_j)
                merged_set.add(bid_j)
    if not merge_map:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices
    new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {}
    next_bid = 1
    for bid in bids:
        # Absorbed boxes are emitted as part of their root group.
        if bid in merged_set:
            continue
        if bid in merge_map:
            group = merge_map[bid]
            all_indices = sorted(set(idx for b in group for idx in bubble_indices[b]))
            new_bubbles[next_bid] = build_lines_from_indices(all_indices, ocr)
            new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in all_indices])
            new_quads[next_bid] = [ocr[i][0] for i in all_indices]
            new_indices[next_bid] = all_indices
        else:
            new_bubbles[next_bid] = bubbles[bid]
            new_boxes[next_bid] = bubble_boxes[bid]
            new_quads[next_bid] = bubble_quads[bid]
            new_indices[next_bid] = bubble_indices[bid]
        next_bid += 1
    return \
        new_bubbles, new_boxes, new_quads, new_indices

# ============================================================
# WIDE / BRIDGE QUAD SPLITTING
# ============================================================
def split_wide_ocr_items(image_bgr, ocr_list, width_factor=8.0):
    """Split OCR quads wider than width_factor × median height at a column gap.

    Same projection-gap technique as split_wide_quad_by_column_gap but with a
    smaller minimum gap (max(0.6×med_h, 12) px). Returns (new_list, n_splits).
    """
    if not ocr_list:
        return ocr_list, 0
    hs = [max(1, quad_bbox(q)[3]-quad_bbox(q)[1]) for q, _, _ in ocr_list]
    med_h = float(np.median(hs)) if hs else 14.0
    result, splits_made = [], 0
    for quad, text, conf in ocr_list:
        x1, y1, x2, y2 = quad_bbox(quad)
        w = x2 - x1
        if w > med_h * width_factor:
            pad = 2
            roi = image_bgr[max(0,y1-pad):min(image_bgr.shape[0],y2+pad), max(0,x1):min(image_bgr.shape[1],x2)]
            if roi.size > 0:
                gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
                _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
                v_proj = np.sum(binary, axis=0)
                # Columns under 15% ink count as gaps.
                gap_threshold = roi.shape[0] * 255 * 0.15
                gaps, in_gap, gap_start = [], False, 0
                for x in range(len(v_proj)):
                    if v_proj[x] < gap_threshold:
                        if not in_gap:
                            gap_start, in_gap = x, True
                    else:
                        if in_gap:
                            gw = x - gap_start
                            if gw >= max(int(med_h * 0.6), 12):
                                gaps.append((gap_start + gw // 2, gw))
                            in_gap = False
                if gaps:
                    # Split at the widest gap; snap the text split to a space.
                    gaps.sort(key=lambda g: g[1], reverse=True)
                    split_x_abs = max(0, x1) + gaps[0][0]
                    if ' ' in text:
                        char_w = w / max(1, len(text))
                        split_idx = int((split_x_abs - x1) / max(1e-6, char_w))
                        spaces = [i for i, c in enumerate(text) if c == ' ']
                        if spaces:
                            split_idx = min(spaces, key=lambda i: abs(i - split_idx))
                        tl, tr = text[:split_idx].strip(), text[split_idx:].strip()
                    else:
                        split_idx = int(len(text) * (split_x_abs - x1) / w)
                        tl, tr = text[:split_idx].strip(), text[split_idx:].strip()
                    if tl and tr:
                        result.extend([
                            ([[x1,y1],[split_x_abs,y1],[split_x_abs,y2],[x1,y2]], tl, conf),
                            ([[split_x_abs,y1],[x2,y1],[x2,y2],[split_x_abs,y2]], tr, conf)])
                        splits_made += 1
                        continue
        result.append((quad, text, conf))
    return result, splits_made

def split_abnormal_bridge_quads(image_bgr, ocr_list, aspect_ratio_threshold=6.0):
    """Split quads with extreme aspect ratio (w/h > threshold) at a column gap.

    Same approach as split_wide_ocr_items but keyed off the quad's own aspect
    ratio, with a stricter gap requirement (20% ink threshold, min gap
    max(0.8×med_h, 15) px). Returns (new_list, n_splits).
    """
    if not ocr_list:
        return ocr_list, 0
    hs = [max(1, quad_bbox(q)[3]-quad_bbox(q)[1]) for q, _, _ in ocr_list]
    med_h = float(np.median(hs)) if hs else 14.0
    result, splits_made = [], 0
    for quad, text, conf in ocr_list:
        x1, y1, x2, y2 = quad_bbox(quad)
        w, h = x2 - x1, max(1, y2 - y1)
        if w / h > aspect_ratio_threshold:
            pad = 2
            roi = image_bgr[max(0,y1-pad):min(image_bgr.shape[0],y2+pad), max(0,x1):min(image_bgr.shape[1],x2)]
            if roi.size > 0:
                gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
                _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
                v_proj = np.sum(binary, axis=0)
                gap_threshold = h * 255 * 0.20
                gaps, in_gap, gap_start = [], False, 0
                for x in range(len(v_proj)):
                    if v_proj[x] < gap_threshold:
                        if not in_gap:
                            gap_start, in_gap = x, True
                    else:
                        if in_gap:
                            gw = x - gap_start
                            if gw >= max(int(med_h * 0.8), 15):
                                gaps.append((gap_start + gw // 2, gw))
                            in_gap = False
                if gaps:
                    gaps.sort(key=lambda g: g[1], reverse=True)
                    split_x_abs = max(0, x1) + gaps[0][0]
                    if ' ' in text:
                        char_w = w / max(1, len(text))
                        split_idx = int((split_x_abs - x1) / max(1e-6, char_w))
                        spaces = [i for i, c in enumerate(text) if c == ' ']
                        if spaces:
                            split_idx = min(spaces, key=lambda i: abs(i - split_idx))
                        tl, tr = text[:split_idx].strip(), text[split_idx:].strip()
                    else:
                        split_idx = int(len(text) * (split_x_abs - x1) / w)
                        tl, tr = text[:split_idx].strip(), text[split_idx:].strip()
                    if tl and tr:
                        result.extend([
                            ([[x1,y1],[split_x_abs,y1],[split_x_abs,y2],[x1,y2]], tl, conf),
                            ([[split_x_abs,y1],[x2,y1],[x2,y2],[split_x_abs,y2]], tr, conf)])
                        splits_made += 1
                        continue
        result.append((quad, text, conf))
    return result, splits_made

def normalize_ocr_quads(ocr_list):
    """Replace every quad with its axis-aligned bounding box grown by 3 px."""
    result = []
    for quad, text, conf in ocr_list:
        x1, y1, x2, y2 = quad_bbox(quad)
        pad = 3
        new_quad = [[x1-pad,y1-pad],[x2+pad,y1-pad],[x2+pad,y2+pad],[x1-pad,y2+pad]]
        result.append((new_quad, text, conf))
    return result

# ============================================================
# VISION RE-READ
# ============================================================
def preprocess_variant(crop_bgr, mode):
    """Return a grayscale preprocessing of crop_bgr selected by `mode`.

    Modes: raw, clahe, adaptive, otsu, invert, bilateral, morph_open.
    Unknown modes fall back to the plain grayscale.
    """
    gray = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2GRAY)
    if mode == "raw":
        return gray
    if mode == "clahe":
        return cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8)).apply(gray)
    if mode == "adaptive":
        den = cv2.GaussianBlur(gray, (3,3), 0)
        return cv2.adaptiveThreshold(den, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 35, 11)
    if mode == "otsu":
        den = cv2.GaussianBlur(gray, (3,3), 0)
        _, th = cv2.threshold(den, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        return th
    if mode == "invert":
        return 255 - gray
    if mode == "bilateral":
        den = cv2.bilateralFilter(gray, 7, 60, 60)
        _, th = cv2.threshold(den, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        return th
    if mode == "morph_open":
        _, th = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        return cv2.morphologyEx(th, cv2.MORPH_OPEN, np.ones((2,2), np.uint8))
    return gray

def rotate_image_keep_bounds(img, angle_deg):
    """Rotate img by angle_deg, enlarging the canvas so nothing is clipped.

    Border is filled with 255 (white, matching page background).
    """
    h, w = img.shape[:2]
    c = (w/2, h/2)
    M = cv2.getRotationMatrix2D(c, angle_deg, 1.0)
    cos, sin = abs(M[0,0]), abs(M[0,1])
    new_w = int((h*sin) + (w*cos))
    new_h = int((h*cos) + (w*sin))
    # Shift so the rotated content is centered in the enlarged canvas.
    M[0,2] += (new_w/2) - c[0]
    M[1,2] += (new_h/2) - c[1]
    return cv2.warpAffine(img, M, (new_w, new_h), flags=cv2.INTER_CUBIC, borderValue=255)

def rebuild_text_from_vision_result(res):
    """Reassemble Vision detections into a single normalized text string.

    Groups detections into rows by vertical-center proximity (tolerance
    0.75 × median height), orders rows top-to-bottom and items left-to-right,
    then joins everything with spaces.
    """
    if not res:
        return ""
    norm = []
    for bbox, txt, conf in res:
        if not txt or not txt.strip():
            continue
        b = quad_bbox(bbox)
        # (bbox, text, conf, cx, cy, height)
        norm.append((b, txt, conf, (b[0]+b[2])/2.0, (b[1]+b[3])/2.0, max(1.0, b[3]-b[1])))
    if not norm:
        return ""
    med_h = float(np.median([x[5] for x in norm]))
    row_tol = max(6.0, med_h * 0.75)
    norm.sort(key=lambda z: z[4])
    rows = []
    for it in norm:
        placed = False
        for r in rows:
            if abs(it[4] - r["yc"]) <= row_tol:
                r["m"].append(it)
                # Row center drifts toward the mean of its members.
                r["yc"] = float(np.mean([k[4] for k in r["m"]]))
                placed = True; break
        if not placed:
            rows.append({"yc": it[4], "m": [it]})
    rows.sort(key=lambda r: r["yc"])
    lines = [normalize_text(" ".join(x[1] for x in
                                     sorted(r["m"], key=lambda z: z[3])))
             for r in rows]
    return normalize_text(" ".join(filter(None, lines)))

def reread_bubble_with_vision(image_bgr, bbox_xyxy, vision_detector, upscale=3.0, pad=24):
    """Re-OCR one bubble crop, trying 7 preprocess modes × 3 small rotations.

    The crop is padded by `pad` px and upscaled by `upscale` first. The
    candidate with the best ocr_candidate_score wins.
    Returns (text, score, "vision-reread") or (None, 0.0, "none").
    """
    ih, iw = image_bgr.shape[:2]
    x1, y1, x2, y2 = bbox_xyxy
    x1, y1 = max(0, int(x1-pad)), max(0, int(y1-pad))
    x2, y2 = min(iw, int(x2+pad)), min(ih, int(y2+pad))
    crop = image_bgr[y1:y2, x1:x2]
    if crop.size == 0:
        return None, 0.0, "none"
    modes = ["raw", "clahe", "adaptive", "otsu", "invert", "bilateral", "morph_open"]
    angles = [0.0, 1.5, -1.5]
    best_v_txt, best_v_sc = "", 0.0
    up0 = cv2.resize(crop, (int(crop.shape[1]*upscale), int(crop.shape[0]*upscale)), interpolation=cv2.INTER_CUBIC)
    for mode in modes:
        proc = preprocess_variant(up0, mode)
        # Vision paths expect 3-channel input.
        proc3 = cv2.cvtColor(proc, cv2.COLOR_GRAY2BGR) if len(proc.shape) == 2 else proc
        for a in angles:
            rot = rotate_image_keep_bounds(proc3, a)
            # Prefer the enhanced detector's raw pass when available.
            res = (vision_detector.run_vision_ocr(rot) if hasattr(vision_detector, 'run_vision_ocr') else vision_detector.read(rot))
            txt = rebuild_text_from_vision_result(res)
            sc = ocr_candidate_score(txt)
            if sc > best_v_sc:
                best_v_txt, best_v_sc = txt, sc
    if best_v_txt:
        return best_v_txt, best_v_sc, "vision-reread"
    return None, 0.0, "none"

# ============================================================
# LINES + BUBBLES
# ============================================================
def build_lines_from_indices(indices, ocr):
    """Build the list of text lines for a bubble from its OCR token indices.

    Same row-clustering scheme as rebuild_text_from_vision_result (vertical
    tolerance 0.75 × median height), one normalized string per row.
    """
    if not indices:
        return []
    items = []
    for i in indices:
        b = quad_bbox(ocr[i][0])
        # (index, bbox, cx, cy, height)
        items.append((i, b, (b[0]+b[2])/2.0, (b[1]+b[3])/2.0, max(1.0, b[3]-b[1])))
    med_h = float(np.median([it[4] for it in items])) if items else 10.0
    row_tol = max(6.0, med_h * 0.75)
    items.sort(key=lambda x: x[3])
    rows = []
    for it in items:
        placed = False
        for r in rows:
            if abs(it[3] - r["yc"]) <= row_tol:
                r["m"].append(it)
                r["yc"] = float(np.mean([k[3] for k in r["m"]]))
                placed = True; break
        if not placed:
            rows.append({"yc": it[3], "m": [it]})
    rows.sort(key=lambda r: r["yc"])
    return [normalize_text(" ".join(ocr[i][1] for
i, _, _, _, _ in sorted(r["m"], key=lambda z: z[2]))) for r in rows if r["m"]] def split_indices_into_vertical_blocks(indices, ocr, gap_factor=1.6, min_gap=18): """ Split a box into top-to-bottom macro blocks using strong vertical gaps. """ if len(indices) < 2: return [indices] items = [] for i in indices: b = quad_bbox(ocr[i][0]) cy = (b[1] + b[3]) / 2.0 h = max(1, b[3] - b[1]) items.append((i, b, cy, h)) items.sort(key=lambda x: x[2]) med_h = float(np.median([it[3] for it in items])) if items else 12.0 threshold = max(min_gap, med_h * gap_factor) blocks = [] current = [items[0][0]] prev_b = items[0][1] for k in range(1, len(items)): cur_i, cur_b, _, _ = items[k] gap = cur_b[1] - prev_b[3] if gap > threshold: blocks.append(current) current = [cur_i] else: current.append(cur_i) prev_b = cur_b if current: blocks.append(current) return blocks def build_final_box_text(indices, ocr, reading_mode="ltr"): """ Final text reconstruction used for OCR/translation export. This uses internal layout detection, unlike generic grouping helpers. 
""" return build_text_from_layout(indices, ocr, reading_mode=reading_mode) def auto_gap(image_path, base=18, ref_w=750): img = cv2.imread(image_path) return base * (img.shape[1] / ref_w) if img is not None else base def group_tokens_vertical(ocr, image_shape, gap_px=18, bbox_padding=1, strict_mode=False): n = len(ocr) if n == 0: return {}, {}, {}, {} boxes = [quad_bbox(r[0]) for r in ocr] centers = [quad_center(r[0]) for r in ocr] hs = [max(1.0, b[3]-b[1]) for b in boxes] med_h = float(np.median(hs)) if hs else 12.0 max_vertical_gap = med_h * 2.5 if not strict_mode else med_h * 2.0 max_horizontal_offset = med_h * 1.8 sorted_indices = sorted(range(n), key=lambda i: (centers[i][1], centers[i][0])) groups, used = [], set() for i in sorted_indices: if i in used: continue current_group = [i] used.add(i) cx_i = centers[i][0] for j in sorted_indices: if j in used or j == i: continue cx_j, cy_j = centers[j] if cy_j <= centers[i][1]: continue if abs(cx_i - cx_j) > max_horizontal_offset: continue # Horizontal gap guard gap_x = max(0, max(boxes[i][0], boxes[j][0]) - min(boxes[i][2], boxes[j][2])) if gap_x > med_h * 1.5: continue # Orientation compatibility guard if not orientation_compatible(i, j, ocr): continue vertical_gap = boxes[j][1] - boxes[current_group[-1]][3] if vertical_gap <= max_vertical_gap: current_group.append(j) used.add(j) cx_i = (cx_i + cx_j) / 2.0 if current_group: groups.append(current_group) # Secondary merge pass merged_groups, used_groups = [], set() for i, group1 in enumerate(groups): if i in used_groups: continue merged = list(group1) used_groups.add(i) for j, group2 in enumerate(groups): if i == j or j in used_groups: continue if should_merge_groups(merged, group2, ocr, med_h, max_vertical_gap): compat = all(orientation_compatible(a, b, ocr) for a in merged for b in group2) if compat: merged.extend(group2) used_groups.add(j) merged_groups.append(sorted(merged, key=lambda idx: centers[idx][1])) # Horizontal gap split pass final_groups = [] for group 
in merged_groups: h_split = detect_horizontal_gap_in_group(group, ocr, med_h, gap_factor=2.5) if h_split: lg, rg = h_split final_groups.append(sorted(lg, key=lambda idx: centers[idx][1])) final_groups.append(sorted(rg, key=lambda idx: centers[idx][1])) else: final_groups.append(group) final_groups.sort(key=lambda g: (min(centers[i][1] for i in g), min(centers[i][0] for i in g))) bubbles, bubble_boxes, bubble_quads, bubble_indices = {}, {}, {}, {} ih, iw = image_shape[:2] for bid, idxs in enumerate(final_groups, start=1): lines = build_lines_from_indices(idxs, ocr) quads = [ocr[k][0] for k in idxs] ub = boxes_union_xyxy([quad_bbox(q) for q in quads]) if ub is None: continue x1, y1, x2, y2 = ub ap = max(1, int(round(med_h * 0.16))) bubbles[bid] = lines bubble_boxes[bid] = (max(0,x1-ap), max(0,y1-ap), min(iw-1,x2+ap), min(ih-1,y2+ap)) bubble_quads[bid] = quads bubble_indices[bid] = idxs return bubbles, bubble_boxes, bubble_quads, bubble_indices # ============================================================ # SPLIT HELPER — centralises all split strategies # ============================================================ def _split_bubble_if_needed(bid, bubble_indices, bubble_quads, bubble_boxes, filtered, image, iw, ih): """ Attempts all split strategies in priority order. Returns ((part1_indices, part2_indices), reason_str) or (None, None). BOX#18 fix: split_cluster_by_big_vertical_gap factor lowered to 1.4 so the gap between the top speech bubble and the bottom cluster triggers. """ indices = bubble_indices[bid] box = bubble_boxes[bid] # 1. Vertical-stack gap (sensitive — catches top-vs-bottom cluster) if is_vertical_text_like(indices, filtered): vgap = split_cluster_by_big_vertical_gap(indices, filtered, factor=1.4, min_gap=18) if vgap: return vgap, "vertical-stack y-gap" # 2. 
Panel border sr = split_panel_box(image, box, bubble_quads=bubble_quads[bid]) if sr: _, _, split_x = sr li = [idx for idx in indices if quad_center(filtered[idx][0])[0] < split_x] ri = [idx for idx in indices if quad_center(filtered[idx][0])[0] >= split_x] if li and ri: return (li, ri), "panel border" elif len(bubble_quads[bid]) >= 4: cs = split_bubble_if_multiple_columns(indices, filtered, bid=bid, use_aggressive_thresholds=True) if cs: return cs, "aggressive column" # 3. Column gap cs = split_bubble_if_multiple_columns(indices, filtered, bid=bid) if cs: return cs, "vertical column" # 4. Nested / side-by-side ns = split_nested_or_side_by_side(indices, filtered) if ns: return ns, "nested/side-by-side" # 5. Row split rs = split_bubble_if_multiple_rows(indices, filtered, bid=bid) if rs: return rs, "horizontal row" # 6. Large vertical gap (general, less sensitive) gy = split_cluster_by_big_vertical_gap(indices, filtered, factor=1.9, min_gap=22) if gy: return gy, "large vertical-gap" return None, None # ============================================================ # DEBUG / EXPORT # ============================================================ def save_debug_clusters(image_path, ocr, bubble_boxes, bubble_indices, clean_lines=None, out_path="debug_clusters.png", region_types=None): """ Draw debug overlays for final grouped boxes. Color scheme by region type: - dialogue : green - narration : orange - sfx : magenta - reaction : cyan - unknown : yellow-ish OCR quads are outlined lightly in gray for context. 
""" img = cv2.imread(image_path) if img is None: return # Draw OCR quads lightly without filling the page white for bbox, txt, conf in ocr: pts = np.array(bbox, dtype=np.int32) cv2.polylines(img, [pts], True, (180, 180, 180), 1) for bid, bb in bubble_boxes.items(): x1, y1, x2, y2 = bb rtype = region_types.get(bid, "unknown") if region_types else "unknown" if rtype == "dialogue": color = (0, 220, 0) elif rtype == "narration": color = (0, 180, 255) elif rtype == "sfx": color = (255, 0, 255) elif rtype == "reaction": color = (0, 200, 255) else: color = (0, 220, 220) thickness = 2 cv2.rectangle(img, (x1, y1), (x2, y2), color, thickness) cv2.putText( img, f"BOX#{bid} [{rtype}]", (x1 + 2, max(15, y1 + 16)), cv2.FONT_HERSHEY_SIMPLEX, 0.45, color, 2 ) if clean_lines and bid in clean_lines: text = clean_lines[bid] words = text.split() wrapped_lines = [] cur = "" for w in words: if len(cur) + len(w) + 1 < 26: cur += w + " " else: wrapped_lines.append(cur.strip()) cur = w + " " if cur: wrapped_lines.append(cur.strip()) y_text = y2 + 18 for line in wrapped_lines: # black outline cv2.putText( img, line, (x1, y_text), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 3 ) # blue text cv2.putText( img, line, (x1, y_text), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1 ) y_text += 18 cv2.imwrite(out_path, img) def estimate_reading_order(bbox_dict, mode="ltr"): items = [(bid, (bb[0]+bb[2])/2.0, (bb[1]+bb[3])/2.0) for bid, bb in bbox_dict.items()] items.sort(key=lambda t: t[2]) rows, tol = [], 90 for it in items: placed = False for r in rows: if abs(it[2] - r["cy"]) <= tol: r["items"].append(it) r["cy"] = float(np.mean([x[2] for x in r["items"]])) placed = True; break if not placed: rows.append({"cy": it[2], "items": [it]}) rows.sort(key=lambda r: r["cy"]) order = [] for r in rows: r["items"].sort(key=lambda x: x[1], reverse=(mode == "rtl")) order.extend([z[0] for z in r["items"]]) return {bid: i+1 for i, bid in enumerate(order)} # ============================================================ 
# NAME / SHORT TOKEN RESCUE # ============================================================ def _text_key_for_dedup(text: str) -> str: return re.sub(r'[^A-ZÀ-Ý0-9]', '', normalize_text(text or "")) def rescue_name_and_short_tokens(ocr_list, min_conf=0.20): """ Keep plausible short/name tokens that OCR found but strict filtering may drop. Returns rescued items as (quad, text, conf). """ rescued = [] for quad, text, conf in ocr_list: t = normalize_text(text or "") if not t: continue t_alpha = re.sub(r'[^A-ZÀ-Ý]', '', t) if t_alpha in KNOWN_NAMES and conf >= min_conf: rescued.append((quad, t, max(conf, 0.45))) continue if is_protected_token(t) and conf >= min_conf: rescued.append((quad, t, max(conf, 0.40))) continue if 2 <= len(t_alpha) <= 8 and conf >= 0.25: if re.fullmatch(r'[A-ZÀ-Ý]{2,8}', t_alpha): rescued.append((quad, t, max(conf, 0.35))) return rescued def merge_rescued_items(base_ocr, rescued_ocr, iou_threshold=0.55): """ Merge rescued tokens into OCR list if not duplicate by text+overlap. 
""" if not rescued_ocr: return base_ocr def iou_xyxy(a, b): ax1, ay1, ax2, ay2 = a bx1, by1, bx2, by2 = b ix1, iy1 = max(ax1, bx1), max(ay1, by1) ix2, iy2 = min(ax2, bx2), min(ay2, by2) inter = max(0, ix2 - ix1) * max(0, iy2 - iy1) if inter == 0: return 0.0 area_a = max(0, ax2 - ax1) * max(0, ay2 - ay1) area_b = max(0, bx2 - bx1) * max(0, by2 - by1) return inter / max(1, area_a + area_b - inter) out = list(base_ocr) for rq, rt, rc in rescued_ocr: rb = quad_bbox(rq) rk = _text_key_for_dedup(rt) duplicate = False for bq, bt, _ in out: bb = quad_bbox(bq) bk = _text_key_for_dedup(bt) if rk == bk and iou_xyxy(rb, bb) >= iou_threshold: duplicate = True break if not duplicate: out.append((rq, rt, rc)) return out def _joined_text_for_indices(indices, ocr): parts = [] for i in indices: if i < 0 or i >= len(ocr): continue t = normalize_text(ocr[i][1]) if t: parts.append(t) s = " ".join(parts).strip() return s, len(s) def _in_same_bubble_contour(box_i, box_j, bubble_contours): cx_i = (box_i[0] + box_i[2]) / 2.0 cy_i = (box_i[1] + box_i[3]) / 2.0 cx_j = (box_j[0] + box_j[2]) / 2.0 cy_j = (box_j[1] + box_j[3]) / 2.0 for c in bubble_contours: if (cv2.pointPolygonTest(c, (cx_i, cy_i), False) >= 0 and cv2.pointPolygonTest(c, (cx_j, cy_j), False) >= 0): return True return False def merge_micro_boxes_relaxed(bubbles, bubble_boxes, bubble_quads, bubble_indices, ocr, image_bgr): """ Relaxed merge for tiny interjection/name boxes (e.g. HUH? + MORNING). 
""" bids = sorted(bubble_boxes.keys()) if len(bids) < 2: return bubbles, bubble_boxes, bubble_quads, bubble_indices all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1]) for i in range(len(ocr))] med_h = float(np.median(all_h)) if all_h else 14.0 bubble_contours = detect_speech_bubbles(image_bgr) parent = {b: b for b in bids} def find(x): while parent[x] != x: parent[x] = parent[parent[x]] x = parent[x] return x def union(a, b): ra, rb = find(a), find(b) if ra != rb: parent[rb] = ra SHORT_TEXT_MAX_CHARS = 12 for i in range(len(bids)): for j in range(i + 1, len(bids)): bi, bj = bids[i], bids[j] box_i, box_j = bubble_boxes[bi], bubble_boxes[bj] wi = max(1, box_i[2] - box_i[0]) wj = max(1, box_j[2] - box_j[0]) gap_x = max(0, max(box_i[0], box_j[0]) - min(box_i[2], box_j[2])) vert_gap = max(0, max(box_i[1], box_j[1]) - min(box_i[3], box_j[3])) h_ix1 = max(box_i[0], box_j[0]) h_ix2 = min(box_i[2], box_j[2]) h_overlap = max(0, h_ix2 - h_ix1) h_overlap_ratio = h_overlap / max(1, min(wi, wj)) txt_i, len_i = _joined_text_for_indices(bubble_indices[bi], ocr) txt_j, len_j = _joined_text_for_indices(bubble_indices[bj], ocr) micro_pair = (len_i <= SHORT_TEXT_MAX_CHARS and len_j <= SHORT_TEXT_MAX_CHARS) protected_hint = is_protected_token(txt_i) or is_protected_token(txt_j) same_contour = _in_same_bubble_contour(box_i, box_j, bubble_contours) if micro_pair and vert_gap <= med_h * 2.2 and gap_x <= med_h * 2.0: if h_overlap_ratio >= 0.10 or same_contour or protected_hint: union(bi, bj) groups = {} for b in bids: r = find(b) groups.setdefault(r, []).append(b) if all(len(v) == 1 for v in groups.values()): return bubbles, bubble_boxes, bubble_quads, bubble_indices new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {} next_bid = 1 for _, group in groups.items(): if len(group) == 1: b = group[0] new_bubbles[next_bid] = bubbles[b] new_boxes[next_bid] = bubble_boxes[b] new_quads[next_bid] = bubble_quads[b] new_indices[next_bid] = bubble_indices[b] else: all_idx = 
sorted(set(idx for b in group for idx in bubble_indices[b])) new_bubbles[next_bid] = build_lines_from_indices(all_idx, ocr) new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in all_idx]) new_quads[next_bid] = [ocr[i][0] for i in all_idx] new_indices[next_bid] = all_idx next_bid += 1 return new_bubbles, new_boxes, new_quads, new_indices def reattach_orphan_short_tokens(bubbles, bubble_boxes, bubble_quads, bubble_indices, ocr): """ Reattach tiny orphan token boxes (e.g., single 'HUH?') to nearest plausible bubble. """ bids = sorted(bubble_boxes.keys()) if len(bids) < 2: return bubbles, bubble_boxes, bubble_quads, bubble_indices all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1]) for i in range(len(ocr))] med_h = float(np.median(all_h)) if all_h else 14.0 orphan_bids = [] for b in bids: idxs = bubble_indices.get(b, []) if len(idxs) != 1: continue t = normalize_text(ocr[idxs[0]][1]) if is_protected_token(t) or len(re.sub(r'[^A-ZÀ-Ý]', '', t)) <= 5: orphan_bids.append(b) if not orphan_bids: return bubbles, bubble_boxes, bubble_quads, bubble_indices consumed = set() for ob in orphan_bids: if ob in consumed: continue obox = bubble_boxes[ob] ocx = (obox[0] + obox[2]) / 2.0 ocy = (obox[1] + obox[3]) / 2.0 best_b = None best_d = 1e9 for tb in bids: if tb == ob or tb in consumed: continue tbox = bubble_boxes[tb] tcx = (tbox[0] + tbox[2]) / 2.0 tcy = (tbox[1] + tbox[3]) / 2.0 dx = abs(ocx - tcx) dy = abs(ocy - tcy) if dx <= med_h * 2.2 and dy <= med_h * 3.0: d = dx + dy if d < best_d: best_d = d best_b = tb if best_b is not None: merged = sorted(set(bubble_indices[best_b] + bubble_indices[ob])) bubble_indices[best_b] = merged bubble_quads[best_b] = [ocr[i][0] for i in merged] bubble_boxes[best_b] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in merged]) bubbles[best_b] = build_lines_from_indices(merged, ocr) consumed.add(ob) if consumed: for b in consumed: bubble_indices.pop(b, None) bubble_quads.pop(b, None) bubble_boxes.pop(b, None) 
bubbles.pop(b, None) # reindex for stable downstream order new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {} for new_id, old_id in enumerate(sorted(bubble_boxes.keys()), start=1): new_bubbles[new_id] = bubbles[old_id] new_boxes[new_id] = bubble_boxes[old_id] new_quads[new_id] = bubble_quads[old_id] new_indices[new_id] = bubble_indices[old_id] return new_bubbles, new_boxes, new_quads, new_indices return bubbles, bubble_boxes, bubble_quads, bubble_indices def reconstruct_group_text(group_indices, ocr): """ Reconstruct text inside one already-detected group. This handles cases where a vertical group itself contains multiple local rows or wrapped OCR fragments. """ if not group_indices: return "" items = [] for i in group_indices: b = quad_bbox(ocr[i][0]) cx = (b[0] + b[2]) / 2.0 cy = (b[1] + b[3]) / 2.0 w = max(1, b[2] - b[0]) h = max(1, b[3] - b[1]) items.append((i, b, cx, cy, w, h)) if not items: return "" med_h = float(np.median([it[5] for it in items])) med_w = float(np.median([it[4] for it in items])) # If the group is strongly vertical, simple top->bottom is fine xs = [it[2] for it in items] ys = [it[3] for it in items] vertical_span = max(ys) - min(ys) if len(ys) > 1 else 0 horizontal_span = max(xs) - min(xs) if len(xs) > 1 else 0 # strong single vertical phrase if vertical_span > horizontal_span * 1.5: items.sort(key=lambda x: x[3]) # top->bottom txt = normalize_text(" ".join( normalize_text(ocr[it[0]][1]) for it in items if normalize_text(ocr[it[0]][1]) )) return txt # otherwise, split into local rows first row_tol = max(6.0, med_h * 0.65) items.sort(key=lambda x: x[3]) rows = [] for it in items: placed = False for row in rows: if abs(it[3] - row["yc"]) <= row_tol: row["members"].append(it) row["yc"] = float(np.mean([m[3] for m in row["members"]])) placed = True break if not placed: rows.append({"yc": it[3], "members": [it]}) rows.sort(key=lambda r: r["yc"]) parts = [] for row in rows: members = sorted(row["members"], key=lambda x: x[2]) # 
left->right row_txt = normalize_text(" ".join( normalize_text(ocr[m[0]][1]) for m in members if normalize_text(ocr[m[0]][1]) )) if row_txt: parts.append(row_txt) txt = normalize_text(" ".join(parts)) return txt def reconstruct_group_text_best(group_indices, ocr): if not group_indices: return "" items = [] for i in group_indices: b = quad_bbox(ocr[i][0]) cx = (b[0] + b[2]) / 2.0 cy = (b[1] + b[3]) / 2.0 h = max(1, b[3] - b[1]) items.append((i, b, cx, cy, h)) if not items: return "" # Candidate 1: simple top->bottom cand1_items = sorted(items, key=lambda x: x[3]) cand1 = normalize_text(" ".join( normalize_text(ocr[it[0]][1]) for it in cand1_items if normalize_text(ocr[it[0]][1]) )) cand1 = fix_group_level_ocr(cand1) # Candidate 2: local rows med_h = float(np.median([it[4] for it in items])) row_tol = max(6.0, med_h * 0.65) rows = [] for it in sorted(items, key=lambda x: x[3]): placed = False for row in rows: if abs(it[3] - row["yc"]) <= row_tol: row["members"].append(it) row["yc"] = float(np.mean([m[3] for m in row["members"]])) placed = True break if not placed: rows.append({"yc": it[3], "members": [it]}) rows.sort(key=lambda r: r["yc"]) cand2_parts = [] for row in rows: members = sorted(row["members"], key=lambda x: x[2]) row_txt = normalize_text(" ".join( normalize_text(ocr[m[0]][1]) for m in members if normalize_text(ocr[m[0]][1]) )) if row_txt: cand2_parts.append(row_txt) cand2 = normalize_text(" ".join(cand2_parts)) cand2 = fix_group_level_ocr(cand2) # choose best s1 = ocr_candidate_score(cand1) s2 = ocr_candidate_score(cand2) return cand2 if s2 > s1 else cand1 def fix_group_level_ocr(text): t = normalize_text(text or "") if not t: return t replacements = { "ANY- THING": "ANYTHING", "BREAK- FAST": "BREAK-FAST", "COMMON BREAK- PEOPLE FAST": "COMMON PEOPLE EAT FOR BREAKFAST", "WHAT DO LIKE FOR COMMON BREAK- PEOPLE FAST EAT": "WHAT DO COMMON PEOPLE EAT LIKE FOR BREAKFAST", # New targeted fixes for reported cases "ILLU- SIONS": "ILLU-SIONS", "ATTEN- TION": 
"ATTEN-TION", "WHAT DO COMMON PEOPLE HE EAT?": "WHAT DO COMMON PEOPLE EAT?", "LIKE FOR BREAK- FAST": "LIKE FOR BREAK-FAST?", "YOUR STUCK": "YOU'RE STUCK", "YOUR HAND!": "YOUR HAND!", } for a, b in replacements.items(): t = t.replace(a, b) t = dehyphenate_linebreak_artifacts(t) t = re.sub(r"\s{2,}", " ", t).strip() return t def _is_sentence_like_fragment(t: str) -> bool: t = normalize_text(t or "") if not t: return False alnum = re.sub(r"[^A-ZÀ-Ý0-9]", "", t) if len(alnum) < 2: return False return True def _line_has_terminal_punct(t: str) -> bool: t = normalize_text(t or "") return bool(re.search(r"[.!?…]$", t)) def _smart_split_by_connectors(text: str) -> List[str]: """ Conservative split for OCR text that glues multiple clauses. """ t = normalize_text(text or "") if not t: return [] # Keep hyphenated style if meaningful, but remove OCR line-wrap artifacts t = dehyphenate_linebreak_artifacts(t) # 1) Primary punctuation split parts = re.split(r"(?<=[.!?…])\s+", t) parts = [p.strip() for p in parts if p.strip()] if len(parts) >= 2: return parts # 2) Secondary lexical split if punctuation failed patterns = [ r"\b(THEY'RE|THEY ARE)\b", r"\b(DON'T|DO NOT)\b", r"\b(LIKE FOR)\b", r"\b(IF WE DON'T|IF WE DO NOT)\b", r"\b(WHAT DO)\b", ] for pat in patterns: m = re.search(pat, t) if m and m.start() > 8: left = t[:m.start()].strip() right = t[m.start():].strip() if _is_sentence_like_fragment(left) and _is_sentence_like_fragment(right): return [left, right] return [t] def split_box_by_sentence_rows(indices, ocr, min_groups=2): """ Force split one box into sentence-like row groups. Works for stacked dialogue blocks like: YOUR HAND! I'M STUCK AND HELPLESS LIKE THIS! IF WE DON'T HURRY UP, WE'LL BE CRUSHED TO DEATH! 
""" if not indices or len(indices) < 3: return None # Build row groups first rows = group_indices_into_horizontal_rows(indices, ocr, row_tol_factor=0.70) if not rows or len(rows) < min_groups: return None # Turn each row-group into text row_payload = [] for grp in rows: txt = normalize_text(" ".join(ocr[i][1] for i in grp if normalize_text(ocr[i][1]))) txt = fix_group_level_ocr(txt) if not txt: continue box = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in grp]) row_payload.append({"indices": grp, "text": txt, "box": box}) if len(row_payload) < min_groups: return None # Merge tiny row fragments upward if they are clearly continuation merged = [] for rp in row_payload: if not merged: merged.append(rp) continue prev = merged[-1] short_prev = len(re.sub(r"[^A-ZÀ-Ý0-9]", "", prev["text"])) <= 5 no_term_prev = not re.search(r"[.!?…]$", prev["text"]) if short_prev and no_term_prev: new_idx = sorted(set(prev["indices"] + rp["indices"])) new_txt = normalize_text(prev["text"] + " " + rp["text"]) new_box = boxes_union_xyxy([prev["box"], rp["box"]]) merged[-1] = {"indices": new_idx, "text": new_txt, "box": new_box} else: merged.append(rp) # Keep sentence-like groups out = [] for m in merged: txt = normalize_text(m["text"]) if len(re.sub(r"[^A-ZÀ-Ý0-9]", "", txt)) < 4: continue out.append(sorted(m["indices"], key=lambda i: ( quad_bbox(ocr[i][0])[1], quad_bbox(ocr[i][0])[0] ))) if len(out) < min_groups: return None return out def segment_box_into_phrases(indices, ocr, reading_mode="ltr") -> List[str]: """ Layout-aware phrase segmentation for one final box. Uses your internal grouping + punctuation/connector splitting. 
""" groups = build_box_group_texts(indices, ocr, reading_mode=reading_mode) groups = [fix_group_level_ocr(g) for g in groups if _is_sentence_like_fragment(g)] if not groups: merged = normalize_text(" ".join(build_final_box_text(indices, ocr, reading_mode=reading_mode))) merged = fix_group_level_ocr(merged) return [x for x in _smart_split_by_connectors(merged) if _is_sentence_like_fragment(x)] out = [] for g in groups: out.extend(_smart_split_by_connectors(g)) # Dedupe OCR echoes cleaned = [] for p in out: p = normalize_text(p) if not _is_sentence_like_fragment(p): continue if cleaned and text_similarity(cleaned[-1], p) >= 0.92: continue cleaned.append(p) return cleaned def build_box_group_texts(indices, ocr, reading_mode="ltr"): """ Return independent text groups for one final box, preserving internal layout. Each group is reconstructed with local reading-order logic. """ layout = detect_internal_text_layout(indices, ocr, reading_mode=reading_mode) out = [] if not layout: return out blocks = layout.get("blocks", []) for block in blocks: mode = block.get("mode", "horizontal") groups = block.get("groups", []) if mode == "vertical": groups = sorted( groups, key=lambda grp: np.mean([ (quad_bbox(ocr[i][0])[0] + quad_bbox(ocr[i][0])[2]) / 2.0 for i in grp ]), reverse=(reading_mode == "rtl") ) else: groups = sorted( groups, key=lambda grp: np.mean([ (quad_bbox(ocr[i][0])[1] + quad_bbox(ocr[i][0])[3]) / 2.0 for i in grp ]) ) for grp in groups: txt = reconstruct_group_text(grp, ocr) if txt: out.append(txt) return out def _is_sentence_like_fragment(t: str) -> bool: t = normalize_text(t or "") if not t: return False alnum = re.sub(r"[^A-ZÀ-Ý0-9]", "", t) if len(alnum) < 2: return False return True def _line_has_terminal_punct(t: str) -> bool: t = normalize_text(t or "") return bool(re.search(r"[.!?…]$", t)) def _smart_split_by_connectors(text: str) -> List[str]: """ Conservative split for OCR text that glues 2 clauses: - DON'T PAY ANY ATTEN-TION TO THEM! THEY'RE ILLU-SIONS! 
- WHAT DO COMMON PEOPLE EAT? LIKE FOR BREAK-FAST? """ t = normalize_text(text or "") if not t: return [] # Normalize some OCR hyphen artifacts first t = dehyphenate_linebreak_artifacts(t) # Primary punctuation split parts = re.split(r"(?<=[.!?…])\s+", t) parts = [p.strip() for p in parts if p.strip()] if len(parts) >= 2: return parts # Secondary connector split patterns (conservative) patterns = [ r"\b(THEY'RE|THEY ARE)\b", r"\b(DON'T|DO NOT)\b", r"\b(LIKE FOR)\b", r"\b(IF WE DON'T|IF WE DO NOT)\b", ] for pat in patterns: m = re.search(pat, t) if m and m.start() > 8: left = t[:m.start()].strip() right = t[m.start():].strip() if _is_sentence_like_fragment(left) and _is_sentence_like_fragment(right): return [left, right] return [t] def segment_box_into_phrases(indices, ocr, reading_mode="ltr") -> List[str]: """ Layout-aware phrase segmentation for one final box. """ # Step 1: use your existing internal grouping groups = build_box_group_texts(indices, ocr, reading_mode=reading_mode) groups = [fix_group_level_ocr(g) for g in groups if _is_sentence_like_fragment(g)] if not groups: merged = normalize_text(" ".join(build_final_box_text(indices, ocr, reading_mode=reading_mode))) return _smart_split_by_connectors(merged) # Step 2: split each group by punctuation/connectors out = [] for g in groups: out.extend(_smart_split_by_connectors(g)) # Step 3: dedupe near-identical neighbors (OCR echo) cleaned = [] for p in out: if not cleaned: cleaned.append(p) continue if text_similarity(cleaned[-1], p) >= 0.92: continue cleaned.append(p) return [normalize_text(x) for x in cleaned if _is_sentence_like_fragment(x)] def is_multi_group_bubble(indices, ocr, reading_mode="ltr", min_groups=2): groups = build_box_group_texts(indices, ocr, reading_mode=reading_mode) meaningful = [g for g in groups if len(re.sub(r"[^A-ZÀ-Ý0-9]", "", g)) >= 2] return len(meaningful) >= min_groups def _bubble_text(indices, ocr, reading_mode="ltr"): return normalize_text(" ".join(build_text_from_layout(indices, 
ocr, reading_mode=reading_mode))) def _box_dims(b): return max(1, b[2]-b[0]), max(1, b[3]-b[1]) def _intersection(a, b): ix1, iy1 = max(a[0], b[0]), max(a[1], b[1]) ix2, iy2 = min(a[2], b[2]), min(a[3], b[3]) w, h = max(0, ix2-ix1), max(0, iy2-iy1) return w*h def _containment_ratio(child, parent): inter = _intersection(child, parent) c_area = max(1, (child[2]-child[0])*(child[3]-child[1])) return inter / c_area def _center_distance(a, b): acx, acy = (a[0]+a[2])/2.0, (a[1]+a[3])/2.0 bcx, bcy = (b[0]+b[2])/2.0, (b[1]+b[3])/2.0 return ((acx-bcx)**2 + (acy-bcy)**2) ** 0.5 def _reindex_boxes(bubbles, bubble_boxes, bubble_quads, bubble_indices): new_b, new_bb, new_bq, new_bi = {}, {}, {}, {} for nid, old in enumerate(sorted(bubble_boxes.keys()), start=1): new_b[nid] = bubbles[old] new_bb[nid] = bubble_boxes[old] new_bq[nid] = bubble_quads[old] new_bi[nid] = bubble_indices[old] return new_b, new_bb, new_bq, new_bi def reconcile_final_boxes(bubbles, bubble_boxes, bubble_quads, bubble_indices, ocr, image_bgr=None, reading_mode="ltr"): """ Final reconciliation pass for: - overlap merges - child absorption - complementary fragment merge This version is safe for optional image input and propagates reading_mode into layout-aware text reconstruction. 
""" if not bubble_boxes: return bubbles, bubble_boxes, bubble_quads, bubble_indices all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1]) for i in range(len(ocr))] med_h = float(np.median(all_h)) if all_h else 14.0 bubble_contours = detect_speech_bubbles(image_bgr) if image_bgr is not None else [] changed = True while changed: changed = False bids = sorted(bubble_boxes.keys()) # ---- (A) Merge highly-overlapping pairs merged_any = False for i in range(len(bids)): if merged_any: break for j in range(i + 1, len(bids)): bi, bj = bids[i], bids[j] if bi not in bubble_boxes or bj not in bubble_boxes: continue a, b = bubble_boxes[bi], bubble_boxes[bj] iou = boxes_iou(a, b) ovs = boxes_overlap_ratio(a, b) # inter / smaller same_contour = _in_same_bubble_contour(a, b, bubble_contours) if bubble_contours else False if ovs >= 0.55 or (iou >= 0.35 and same_contour): idx = sorted(set(bubble_indices[bi] + bubble_indices[bj])) bubble_indices[bi] = idx bubble_quads[bi] = [ocr[k][0] for k in idx] bubble_boxes[bi] = boxes_union_xyxy([quad_bbox(ocr[k][0]) for k in idx]) bubbles[bi] = build_lines_from_indices(idx, ocr) bubble_indices.pop(bj, None) bubble_quads.pop(bj, None) bubble_boxes.pop(bj, None) bubbles.pop(bj, None) changed = True merged_any = True break if changed: continue # ---- (B) Absorb tiny child boxes inside larger parent absorbed_any = False bids = sorted(bubble_boxes.keys()) for i in range(len(bids)): if absorbed_any: break for j in range(len(bids)): if i == j: continue child, parent = bids[i], bids[j] if child not in bubble_boxes or parent not in bubble_boxes: continue cb, pb = bubble_boxes[child], bubble_boxes[parent] cw, ch = _box_dims(cb) pw, ph = _box_dims(pb) contain = _containment_ratio(cb, pb) child_txt = _bubble_text(bubble_indices[child], ocr, reading_mode=reading_mode) parent_txt = _bubble_text(bubble_indices[parent], ocr, reading_mode=reading_mode) # tiny or fragment child is_tiny = (cw <= med_h * 3.2 and ch <= med_h * 2.2) or len(child_txt) <= 
14 # don't absorb if it's clearly separate and far close = _center_distance(cb, pb) <= med_h * 4.0 if contain >= 0.70 and (is_tiny or close): idx = sorted(set(bubble_indices[parent] + bubble_indices[child])) bubble_indices[parent] = idx bubble_quads[parent] = [ocr[k][0] for k in idx] bubble_boxes[parent] = boxes_union_xyxy([quad_bbox(ocr[k][0]) for k in idx]) bubbles[parent] = build_lines_from_indices(idx, ocr) bubble_indices.pop(child, None) bubble_quads.pop(child, None) bubble_boxes.pop(child, None) bubbles.pop(child, None) changed = True absorbed_any = True break if changed: continue # ---- (C) Merge complementary fragments comp_any = False bids = sorted(bubble_boxes.keys()) for i in range(len(bids)): if comp_any: break for j in range(i + 1, len(bids)): bi, bj = bids[i], bids[j] if bi not in bubble_boxes or bj not in bubble_boxes: continue a, b = bubble_boxes[bi], bubble_boxes[bj] wi, hi = _box_dims(a) wj, hj = _box_dims(b) vert_gap = max(0, max(a[1], b[1]) - min(a[3], b[3])) h_ix = max(0, min(a[2], b[2]) - max(a[0], b[0])) h_overlap_ratio = h_ix / max(1, min(wi, wj)) same_contour = _in_same_bubble_contour(a, b, bubble_contours) if bubble_contours else False txt_i = _bubble_text(bubble_indices[bi], ocr, reading_mode=reading_mode) txt_j = _bubble_text(bubble_indices[bj], ocr, reading_mode=reading_mode) if same_contour and vert_gap <= med_h * 2.8 and h_overlap_ratio >= 0.45: # prefer merge when one is upper fragment + other lower fragment # and text is not identical duplicate if txt_i != txt_j: idx = sorted(set(bubble_indices[bi] + bubble_indices[bj])) bubble_indices[bi] = idx bubble_quads[bi] = [ocr[k][0] for k in idx] bubble_boxes[bi] = boxes_union_xyxy([quad_bbox(ocr[k][0]) for k in idx]) bubbles[bi] = build_lines_from_indices(idx, ocr) bubble_indices.pop(bj, None) bubble_quads.pop(bj, None) bubble_boxes.pop(bj, None) bubbles.pop(bj, None) changed = True comp_any = True break return _reindex_boxes(bubbles, bubble_boxes, bubble_quads, bubble_indices) def 
split_boxes_by_internal_vertical_groups(bubbles, bubble_boxes, bubble_quads, bubble_indices, ocr, image_shape, reading_mode="ltr"):
    """
    Conservative splitter:
    - Split only when evidence is strong.
    - Prevent over-splitting of short/noisy vertical tokens.

    Walks every bubble box; when its internal layout exposes at least two
    strong vertical text columns (or, as a fallback, clearly punctuated
    sentence rows), the box is replaced by one box per group.  Boxes without
    strong evidence pass through unchanged.  Returns the four parallel dicts
    (lines, boxes, quads, indices) re-keyed with fresh sequential ids.
    """
    ih, iw = image_shape[:2]
    out_bubbles = {}
    out_boxes = {}
    out_quads = {}
    out_indices = {}
    next_id = 1

    # Conservative thresholds gating any split decision.
    MIN_ALNUM_PER_GROUP = 8           # minimum alphanumeric chars per child group
    MIN_GROUP_HEIGHT_RATIO = 0.30     # child must cover >= 30% of parent height
    MIN_VERTICAL_GROUPS_TO_SPLIT = 2  # need at least two qualifying columns
    MAX_SPLIT_PARTS = 3               # safety cap on resulting parts

    for bid in sorted(bubble_boxes.keys()):
        idxs = bubble_indices[bid]
        parent = bubble_boxes[bid]
        parent_h = max(1, parent[3] - parent[1])
        parent_w = max(1, parent[2] - parent[0])

        # Too few quads to justify any split: keep the box as-is.
        if len(idxs) < 4:
            out_bubbles[next_id] = bubbles[bid]
            out_boxes[next_id] = bubble_boxes[bid]
            out_quads[next_id] = bubble_quads[bid]
            out_indices[next_id] = idxs
            next_id += 1
            continue

        layout = detect_internal_text_layout(idxs, ocr, reading_mode=reading_mode)
        did_split = False

        # --------------------------------------------------------------
        # Primary: vertical-mode internal groups (STRICT)
        # --------------------------------------------------------------
        if layout and layout.get("blocks"):
            candidate_groups = []
            for block in layout.get("blocks", []):
                if block.get("mode", "horizontal") != "vertical":
                    continue
                for grp in block.get("groups", []):
                    # Stable top-to-bottom, then left-to-right order inside the group.
                    grp = sorted(set(grp), key=lambda i: (
                        quad_bbox(ocr[i][0])[1],
                        quad_bbox(ocr[i][0])[0]
                    ))
                    if not grp:
                        continue
                    txt = reconstruct_group_text_best(grp, ocr)
                    txt = normalize_text(fix_group_level_ocr(txt))
                    if not txt:
                        continue
                    alnum_len = len(re.sub(r"[^A-ZÀ-Ý0-9]", "", txt))
                    if alnum_len < MIN_ALNUM_PER_GROUP:
                        continue
                    gb = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in grp])
                    gw = max(1, gb[2] - gb[0])
                    gh = max(1, gb[3] - gb[1])
                    # require meaningful physical size
                    if gh < parent_h * MIN_GROUP_HEIGHT_RATIO:
                        continue
                    # avoid splitting tiny narrow SFX-like strips
                    if gw < parent_w * 0.12 and alnum_len < 12:
                        continue
                    # sentence-ish check: multi-word, or ends in terminal punctuation
                    words = txt.split()
                    has_terminal = bool(re.search(r"[.!?…]$", txt))
                    if len(words) < 2 and not has_terminal:
                        continue
                    candidate_groups.append({
                        "indices": grp,
                        "text": txt,
                        "box": gb
                    })

            if len(candidate_groups) >= MIN_VERTICAL_GROUPS_TO_SPLIT:
                # Sort columns by reading order (right-to-left for RTL pages).
                candidate_groups = sorted(
                    candidate_groups,
                    key=lambda g: (g["box"][0] + g["box"][2]) / 2.0,
                    reverse=(reading_mode == "rtl")
                )
                # cap extreme over-splits
                if len(candidate_groups) > MAX_SPLIT_PARTS:
                    candidate_groups = candidate_groups[:MAX_SPLIT_PARTS]

                # Final sanity: the children together must retain at least 65%
                # of the parent's alphanumeric text evidence, otherwise the
                # split is rejected and the parent is kept whole.
                parent_txt = normalize_text(" ".join(build_final_box_text(idxs, ocr, reading_mode=reading_mode)))
                parent_alnum = max(1, len(re.sub(r"[^A-ZÀ-Ý0-9]", "", parent_txt)))
                sum_child_alnum = sum(len(re.sub(r"[^A-ZÀ-Ý0-9]", "", g["text"])) for g in candidate_groups)
                if (sum_child_alnum / parent_alnum) >= 0.65:
                    for g in candidate_groups:
                        grp = sorted(set(g["indices"]), key=lambda i: (
                            quad_bbox(ocr[i][0])[1],
                            quad_bbox(ocr[i][0])[0]
                        ))
                        ub = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in grp])
                        out_indices[next_id] = grp
                        out_quads[next_id] = [ocr[i][0] for i in grp]
                        # Pad the union box by 2px, clamped to image bounds.
                        out_boxes[next_id] = (
                            max(0, ub[0] - 2),
                            max(0, ub[1] - 2),
                            min(iw - 1, ub[2] + 2),
                            min(ih - 1, ub[3] + 2)
                        )
                        out_bubbles[next_id] = build_final_box_text(grp, ocr, reading_mode=reading_mode)
                        next_id += 1
                    did_split = True

        if did_split:
            continue

        # --------------------------------------------------------------
        # Fallback: row sentence split (ONLY for strong punctuation cases)
        # --------------------------------------------------------------
        row_sentence_parts = split_box_by_sentence_rows(idxs, ocr, min_groups=2)
        if row_sentence_parts and 2 <= len(row_sentence_parts) <= 3:
            # Require punctuation evidence in resulting parts: at least two
            # parts must end with terminal punctuation.
            part_texts = []
            for grp in row_sentence_parts:
                txt = normalize_text(" ".join(build_lines_from_indices(grp, ocr)))
                txt = fix_group_level_ocr(txt)
                part_texts.append(txt)
            punct_parts = sum(1 for t in part_texts if re.search(r"[.!?…]$", t))
            if punct_parts >= 2:
                for grp in row_sentence_parts:
                    grp = sorted(set(grp), key=lambda i: (
                        quad_bbox(ocr[i][0])[1],
                        quad_bbox(ocr[i][0])[0]
                    ))
                    ub = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in grp])
                    out_indices[next_id] = grp
                    out_quads[next_id] = [ocr[i][0] for i in grp]
                    out_boxes[next_id] = (
                        max(0, ub[0] - 2),
                        max(0, ub[1] - 2),
                        min(iw - 1, ub[2] + 2),
                        min(ih - 1, ub[3] + 2)
                    )
                    out_bubbles[next_id] = build_final_box_text(grp, ocr, reading_mode=reading_mode)
                    next_id += 1
                continue

        # --------------------------------------------------------------
        # Keep original if no strong split evidence
        # --------------------------------------------------------------
        out_bubbles[next_id] = bubbles[bid]
        out_boxes[next_id] = bubble_boxes[bid]
        out_quads[next_id] = bubble_quads[bid]
        out_indices[next_id] = idxs
        next_id += 1

    return out_bubbles, out_boxes, out_quads, out_indices


def split_box_by_internal_vertical_gaps(bid, bubble_indices, ocr, factor=1.45, min_gap=16):
    """
    Multi-cut vertical splitter.
    Splits one bubble into N vertical groups when there are multiple strong y-gaps.
    Good for 4+4 quad accidental merges.
    """
    idxs = bubble_indices.get(bid, [])
    # Need a handful of quads before a multi-cut is plausible.
    if len(idxs) < 4:
        return None

    # Build (index, bbox, center-y, height) per quad, sorted top->bottom.
    items = []
    for i in idxs:
        b = quad_bbox(ocr[i][0])
        cy = (b[1] + b[3]) / 2.0
        h = max(1, b[3] - b[1])
        items.append((i, b, cy, h))
    items.sort(key=lambda x: x[2])  # top->bottom

    # Gap threshold scales with the median quad height, floored at min_gap px.
    med_h = float(np.median([x[3] for x in items])) if items else 12.0
    th = max(min_gap, med_h * factor)

    # Collect cut points: every position where the vertical gap between two
    # consecutive quads exceeds the threshold starts a new group.
    cut_positions = []
    prev_b = items[0][1]
    for k in range(1, len(items)):
        cur_b = items[k][1]
        gap = cur_b[1] - prev_b[3]
        if gap > th:
            cut_positions.append(k)
        prev_b = cur_b
    if not cut_positions:
        return None

    # Build groups using all cut positions
    groups = []
    start = 0
    for cp in cut_positions:
        groups.append([it[0] for it in items[start:cp]])
        start = cp
    groups.append([it[0] for it in items[start:]])

    # Remove empty groups
    groups = [g for g in groups if g]
    if len(groups) <= 1:
        return None

    # Sanity: each group should be meaningful (>= 2 quads or >= 12 chars).
    # NOTE(review): groups failing this check are dropped entirely, so their
    # OCR indices disappear from the returned split — confirm that silently
    # losing these stray quads is intended.
    clean_groups = []
    for g in groups:
        txt = normalize_text(" ".join(build_lines_from_indices(g, ocr)))
        if len(g) >= 2 or len(txt) >= 12:
            clean_groups.append(g)
    if len(clean_groups) <= 1:
        return None
    return clean_groups


def force_split_bridged_boxes(bubbles, bubble_boxes, bubble_quads, bubble_indices, ocr, image_bgr):
    """
    Force-split boxes that accidentally contain multiple vertically separated speech chunks.
    Typical fixes:
    - one detected box actually contains 2 stacked bubbles
    - "4 quads + 4 quads" merged into one cluster
    - mixed contour membership inside one grouped box
    """
    if not bubble_boxes:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices

    bubble_contours = detect_speech_bubbles(image_bgr)

    def contour_id_for_idx(i):
        # Index of the detected speech-bubble contour containing the quad's
        # center point, or -1 when the center is outside every contour.
        b = quad_bbox(ocr[i][0])
        cx = (b[0] + b[2]) / 2.0
        cy = (b[1] + b[3]) / 2.0
        for ci, c in enumerate(bubble_contours):
            # pointPolygonTest >= 0 means inside or on the contour edge.
            if cv2.pointPolygonTest(c, (cx, cy), False) >= 0:
                return ci
        return -1

    def build_group_payload(g):
        # Assemble the (lines, box, quads, indices) tuple for one split part,
        # with quads ordered top->bottom by center y.
        g_sorted = sorted(g, key=lambda i: quad_center(ocr[i][0])[1])
        ub = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in g_sorted])
        return (
            build_lines_from_indices(g_sorted, ocr),  # lines
            ub,                                       # box
            [ocr[i][0] for i in g_sorted],            # quads
            g_sorted                                  # indices
        )

    new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {}
    next_bid = 1

    for bid in sorted(bubble_boxes.keys()):
        idxs = bubble_indices.get(bid, [])
        if len(idxs) < 2:
            # keep as-is
            new_bubbles[next_bid] = bubbles[bid]
            new_boxes[next_bid] = bubble_boxes[bid]
            new_quads[next_bid] = bubble_quads[bid]
            new_indices[next_bid] = bubble_indices[bid]
            next_bid += 1
            continue

        parts = None

        # ------------------------------------------------------------------
        # (A) Primary: internal vertical-gap multi-split
        # ------------------------------------------------------------------
        parts = split_box_by_internal_vertical_gaps(
            bid, bubble_indices, ocr, factor=1.45, min_gap=16
        )

        # ------------------------------------------------------------------
        # (B) Secondary: split by contour membership if clearly mixed
        # ------------------------------------------------------------------
        if parts is None and len(idxs) >= 3:
            by_contour = {}
            for i in idxs:
                cid = contour_id_for_idx(i)
                by_contour.setdefault(cid, []).append(i)
            # Quads outside every contour all share the key -1 and thus form
            # a single group of their own.
            contour_groups = [g for g in by_contour.values() if len(g) >= 1]
            if len(contour_groups) >= 2:
                # sort groups top->bottom for stable order
                contour_groups.sort(key=lambda g: min(quad_bbox(ocr[i][0])[1] for i in g))
                # sanity: avoid splitting tiny noise-only tails
                valid = []
                for g in contour_groups:
                    txt = normalize_text(" ".join(build_lines_from_indices(g, ocr)))
                    if len(g) >= 2 or len(txt) >= 10:
                        valid.append(g)
                if len(valid) >= 2:
                    parts = valid

        # ------------------------------------------------------------------
        # (C) Tertiary: balanced 2-block pattern (e.g., 4 quads + 4 quads)
        # ------------------------------------------------------------------
        if parts is None and len(idxs) >= 8:
            sorted_idxs = sorted(
                idxs,
                key=lambda i: (quad_bbox(ocr[i][0])[1] + quad_bbox(ocr[i][0])[3]) / 2.0
            )
            mid = len(sorted_idxs) // 2
            g1, g2 = sorted_idxs[:mid], sorted_idxs[mid:]
            b1 = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in g1])
            b2 = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in g2])
            if b1 and b2:
                vgap = max(0, b2[1] - b1[3])  # vertical gap between the halves
                h1 = max(1, b1[3] - b1[1])
                h2 = max(1, b2[3] - b2[1])
                med_local_h = (h1 + h2) / 2.0
                h_ix = max(0, min(b1[2], b2[2]) - max(b1[0], b2[0]))
                min_w = max(1, min(b1[2] - b1[0], b2[2] - b2[0]))
                h_overlap_ratio = h_ix / min_w
                # Split only when the halves are clearly separated vertically
                # yet still horizontally aligned (same column of speech).
                if vgap >= max(14, 0.22 * med_local_h) and h_overlap_ratio >= 0.30:
                    parts = [g1, g2]

        # ------------------------------------------------------------------
        # Commit split or keep original
        # ------------------------------------------------------------------
        if parts is None or len(parts) <= 1:
            new_bubbles[next_bid] = bubbles[bid]
            new_boxes[next_bid] = bubble_boxes[bid]
            new_quads[next_bid] = bubble_quads[bid]
            new_indices[next_bid] = bubble_indices[bid]
            next_bid += 1
            continue

        for g in parts:
            lines, box, quads, gidx = build_group_payload(g)
            new_bubbles[next_bid] = lines
            new_boxes[next_bid] = box
            new_quads[next_bid] = quads
            new_indices[next_bid] = gidx
            next_bid += 1

    return new_bubbles, new_boxes, new_quads, new_indices


# ============================================================
# translate_manga_text START
# ============================================================
def translate_manga_text(
    image_path="001-page.png",
    source_lang="en",
    target_lang="ca",
    confidence_threshold=0.03,
    min_text_length=1,
    gap_px="auto",
    quality_threshold=0.62,
    export_to_file="output.txt",
    export_bubbles_to="bubbles.json",
    reading_mode="ltr",
    debug=True,
    use_enhanced_ocr=True,
    strict_grouping=True,
    max_box_width_ratio=0.6,
    max_box_height_ratio=0.5,
    auto_fix_bubbles=True
):
    """
    End-to-end pipeline for one manga page: load the image, OCR it with
    Apple Vision, filter/group detections into bubble boxes, run a series
    of split/merge fix-up passes, translate each bubble with GoogleTranslator,
    and write a pipe-delimited text report plus a JSON payload.

    Side effects: prints progress to stdout, writes `export_to_file` and
    `export_bubbles_to`, and (when debug=True) a debug cluster image.
    Returns None; bails out early if the image cannot be loaded or no text
    survives filtering.
    """
    image = cv2.imread(image_path)
    if image is None:
        print(f"❌ Cannot load image: {image_path}")
        return
    # gap_px="auto" defers the vertical grouping gap to a heuristic.
    resolved_gap = auto_gap(image_path) if gap_px == "auto" else float(gap_px)
    ih, iw = image.shape[:2]

    print("Loading OCR engines...")
    if use_enhanced_ocr:
        detector = ImprovedMacVisionDetector(source_lang=source_lang)
        print("🚀 Using Enhanced Multi-Pass OCR")
    else:
        detector = MacVisionDetector(source_lang=source_lang)

    print("Running detection OCR (Apple Vision)...")
    raw = detector.read(image_path)
    print(f"Raw detections: {len(raw)}")

    # Second OCR pass: re-scan small regions the first pass likely missed,
    # 4x upscaled; quad coordinates are mapped back to page space.
    if use_enhanced_ocr:
        existing_quads = [r[0] for r in raw]
        missed_regions = detect_small_text_regions(image, existing_quads)
        if missed_regions:
            print(f"🔍 Found {len(missed_regions)} potentially missed text regions")
            for region in missed_regions:
                rx1, ry1, rx2, ry2 = region
                pad = 10
                rx1, ry1 = max(0, rx1 - pad), max(0, ry1 - pad)
                rx2, ry2 = min(iw, rx2 + pad), min(ih, ry2 + pad)
                crop = image[ry1:ry2, rx1:rx2]
                if crop.size > 0:
                    upscaled = cv2.resize(
                        crop, None, fx=4.0, fy=4.0,
                        interpolation=cv2.INTER_CUBIC
                    )
                    for quad, text, conf in detector.run_vision_ocr(upscaled):
                        # Scale quad points back down and offset into page coords.
                        raw.append((
                            [[int(p[0] / 4.0 + rx1), int(p[1] / 4.0 + ry1)] for p in quad],
                            text,
                            conf
                        ))
            print(f"📝 Total detections after missed region scan: {len(raw)}")

    # ── Filtering ─────────────────────────────────────────────────────────
    filtered, skipped = [], 0
    for bbox, text, conf in raw:
        t = normalize_text(text)
        qb = quad_bbox(bbox)
        if conf < confidence_threshold:
            skipped += 1
            continue
        if len(t) < min_text_length:
            skipped += 1
            continue
        if not is_valid_language(t, source_lang):
            skipped += 1
            continue
        if not is_meaningful_text(t, source_lang):
            skipped += 1
            continue
        # Drop low-confidence longer strings in the page's top band
        # (TOP_BAND_RATIO of the height) — typically running titles/headers.
        if qb[1] < int(ih * TOP_BAND_RATIO) and conf < 0.70 and len(t) >= 5:
            skipped += 1
            continue
        filtered.append((bbox, t, conf))
    print(f"Kept: {len(filtered)} | Skipped: {skipped}")

    # Protect short dialogue token confidence
    tmp = []
    for bbox, t, conf in filtered:
        tmp.append((bbox, t, maybe_conf_floor_for_protected(t, conf, floor=0.40)))
    filtered = tmp

    # Rescue names/short tokens dropped by strict filters
    rescued = rescue_name_and_short_tokens(raw, min_conf=0.20)
    filtered = merge_rescued_items(filtered, rescued, iou_threshold=0.55)

    if not filtered:
        print("⚠️ No text after filtering.")
        return

    # ── Pre-grouping quad splits ──────────────────────────────────────────
    filtered, oversized_splits = validate_and_split_oversized_quads(image, filtered)
    if oversized_splits > 0:
        print(f"📐 Split {oversized_splits} oversized quad(s) before grouping")
    filtered, wide_splits = split_wide_ocr_items(image, filtered)
    if wide_splits > 0:
        print(f"✂️ Split {wide_splits} wide OCR lines across column gaps.")
    filtered, bridge_splits = split_abnormal_bridge_quads(image, filtered)
    if bridge_splits > 0:
        print(f"🧩 Split {bridge_splits} abnormal bridge OCR quad(s).")
    # Median quad height drives the column-gap splitting heuristic.
    hs_pre = [max(1, quad_bbox(q)[3] - quad_bbox(q)[1]) for q, _, _ in filtered]
    med_h_pre = float(np.median(hs_pre)) if hs_pre else 14.0
    filtered, _ = apply_column_gap_splits(image, filtered, med_h_pre)
    filtered = normalize_ocr_quads(filtered)

    # ── Grouping ──────────────────────────────────────────────────────────
    print("📊 Grouping quads vertically...")
    bubbles, bubble_boxes, bubble_quads, bubble_indices = group_tokens_vertical(
        filtered, image.shape, gap_px=resolved_gap, bbox_padding=1,
        strict_mode=strict_grouping
    )
    print(f" Created {len(bubbles)} initial bubble-group box(es)")

    print("🧱 Proposing region-first text containers...")
    region_lines, region_boxes, region_quads, region_indices = propose_text_regions_from_ocr(
        filtered, image.shape
    )
    print(f" Proposed {len(region_lines)} region container(s)")

    # ── Auto-fix (split + merge) ──────────────────────────────────────────
    if auto_fix_bubbles:
        bubbles, bubble_boxes, bubble_quads, bubble_indices = auto_fix_bubble_detection(
            bubble_boxes, bubble_indices, bubble_quads, bubbles, filtered, image
        )
        bubbles, bubble_boxes, bubble_quads, bubble_indices = merge_micro_boxes_relaxed(
            bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered, image
        )

    # ── Enforce max box size ──────────────────────────────────────────────
    bubbles, bubble_boxes, bubble_quads, bubble_indices = enforce_max_box_size(
        bubble_boxes, bubble_indices, bubble_quads, bubbles, filtered,
        max_width_ratio=max_box_width_ratio,
        max_height_ratio=max_box_height_ratio,
        image_shape=image.shape
    )

    # ── Close-proximity merge ─────────────────────────────────────────────
    bubbles, bubble_boxes, bubble_quads, bubble_indices = merge_close_bubbles_by_line_height(
        bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered
    )

    # ── Per-bubble split pass ─────────────────────────────────────────────
    # Each bubble may be split into exactly two parts; the second part gets a
    # fresh id starting above the current maximum.
    new_bubbles, new_bubble_boxes, new_bubble_quads, new_bubble_indices = {}, {}, {}, {}
    next_bid = max(bubbles.keys()) + 1 if bubbles else 1
    splits_performed = []
    for bid in list(bubbles.keys()):
        split_result, split_reason = _split_bubble_if_needed(
            bid, bubble_indices, bubble_quads, bubble_boxes, filtered, image, iw, ih
        )
        if split_result:
            p1, p2 = split_result
            splits_performed.append(f"BOX#{bid} ({split_reason})")
            for part_idxs, part_bid in [(p1, bid), (p2, next_bid)]:
                ub = boxes_union_xyxy([quad_bbox(filtered[i][0]) for i in part_idxs])
                new_bubbles[part_bid] = build_final_box_text(
                    part_idxs, filtered, reading_mode=reading_mode
                )
                new_bubble_boxes[part_bid] = (
                    max(0, ub[0] - 2),
                    max(0, ub[1] - 2),
                    min(iw - 1, ub[2] + 2),
                    min(ih - 1, ub[3] + 2)
                )
                new_bubble_quads[part_bid] = [filtered[i][0] for i in part_idxs]
                new_bubble_indices[part_bid] = part_idxs
            next_bid += 1
        else:
            new_bubbles[bid] = build_final_box_text(
                bubble_indices[bid], filtered, reading_mode=reading_mode
            )
            new_bubble_boxes[bid] = bubble_boxes[bid]
            new_bubble_quads[bid] = bubble_quads[bid]
            new_bubble_indices[bid] = bubble_indices[bid]
    if splits_performed:
        print(f"\n🔀 Splits detected: {len(splits_performed)}")
        for s in splits_performed:
            print(f" ✓ {s}")
    bubbles = new_bubbles
    bubble_boxes = new_bubble_boxes
    bubble_quads = new_bubble_quads
    bubble_indices = new_bubble_indices

    # ── Reattach orphan short tokens ──────────────────────────────────────
    bubbles, bubble_boxes, bubble_quads, bubble_indices = reattach_orphan_short_tokens(
        bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered
    )
    for bid in list(bubble_indices.keys()):
        bubbles[bid] = build_final_box_text(
            bubble_indices[bid], filtered, reading_mode=reading_mode
        )

    # ── Final reconciliation pass ─────────────────────────────────────────
    # reconcile -> force-split -> reconcile again, rebuilding the per-box
    # text after every structural pass so line order stays consistent.
    bubbles, bubble_boxes, bubble_quads, bubble_indices = reconcile_final_boxes(
        bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered,
        image_bgr=image, reading_mode=reading_mode
    )
    for bid in list(bubble_indices.keys()):
        bubbles[bid] = build_final_box_text(
            bubble_indices[bid], filtered, reading_mode=reading_mode
        )
    bubbles, bubble_boxes, bubble_quads, bubble_indices = force_split_bridged_boxes(
        bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered, image
    )
    for bid in list(bubble_indices.keys()):
        bubbles[bid] = build_final_box_text(
            bubble_indices[bid], filtered, reading_mode=reading_mode
        )
    bubbles, bubble_boxes, bubble_quads, bubble_indices = reconcile_final_boxes(
        bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered,
        image_bgr=image, reading_mode=reading_mode
    )
    for bid in list(bubble_indices.keys()):
        bubbles[bid] = build_final_box_text(
            bubble_indices[bid], filtered, reading_mode=reading_mode
        )

    # ── Reconcile bubble-first and region-first views ─────────────────────
    bubbles, bubble_boxes, bubble_quads, bubble_indices = reconcile_region_and_bubble_groups(
        region_lines, region_boxes, region_quads, region_indices,
        bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered
    )
    for bid in list(bubble_indices.keys()):
        bubbles[bid] = build_final_box_text(
            bubble_indices[bid], filtered, reading_mode=reading_mode
        )

    # ── Split boxes by internal vertical groups ───────────────────────────
    bubbles, bubble_boxes, bubble_quads, bubble_indices = split_boxes_by_internal_vertical_groups(
        bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered,
        image.shape, reading_mode=reading_mode
    )
    for bid in list(bubble_indices.keys()):
        bubbles[bid] = build_final_box_text(
            bubble_indices[bid], filtered, reading_mode=reading_mode
        )
    print(f"✅ Final box count: {len(bubbles)}")

    # ── OCR quality pass ──────────────────────────────────────────────────
    translator = GoogleTranslator(source=source_lang, target=target_lang)
    clean_lines: Dict[int, str] = {}          # text sent to translation
    raw_lines: Dict[int, str] = {}            # OCR text before correction
    corrected_lines: Dict[int, str] = {}      # OCR text after correction
    sources_used: Dict[int, str] = {}         # which OCR source won per box
    translations: Dict[int, str] = {}
    region_types: Dict[int, str] = {}
    region_confidences: Dict[int, float] = {}
    region_flags: Dict[int, List[str]] = {}
    bubble_group_texts: Dict[int, List[str]] = {}

    for bid in sorted(bubble_boxes.keys()):
        final_lines = build_final_box_text(
            bubble_indices[bid], filtered, reading_mode=reading_mode
        )
        bubbles[bid] = final_lines
        # NEW: segmented phrase groups for translation
        group_texts = segment_box_into_phrases(
            bubble_indices[bid], filtered, reading_mode=reading_mode
        )
        bubble_group_texts[bid] = group_texts
        base_txt = normalize_text(" ".join(final_lines))
        raw_lines[bid] = base_txt
        base_sc = ocr_candidate_score(base_txt)
        txt, src_used = base_txt, "vision-base"
        # Low-quality OCR: re-read the bubble crop at 3x and keep the re-read
        # only if it scores clearly better and still looks like source_lang.
        if base_sc < quality_threshold:
            rr_txt, rr_sc, rr_src = reread_bubble_with_vision(
                image, bubble_boxes[bid], detector, upscale=3.0, pad=24
            )
            if rr_txt and rr_sc > base_sc + 0.04 and is_valid_language(rr_txt, source_lang):
                txt, src_used = rr_txt, rr_src
        tmp_lines = [txt] if txt else final_lines
        region_type = classify_region_type(image, bubble_boxes[bid], tmp_lines)
        corrected_txt, correction_gain = correct_region_text(txt, region_type=region_type)
        conf = compute_region_confidence(txt, corrected_txt, bubble_boxes[bid], region_type, image)
        flags = build_region_flags(txt, corrected_txt, region_type, conf)
        if len([g for g in group_texts if g.strip()]) >= 2:
            flags.append("BUBBLE")
            flags.append("SEGMENTED")
        clean_lines[bid] = normalize_text(corrected_txt)
        corrected_lines[bid] = normalize_text(corrected_txt)
        sources_used[bid] = src_used
        region_types[bid] = region_type
        region_confidences[bid] = conf
        region_flags[bid] = sorted(set(flags))

    reading_map = estimate_reading_order(bubble_boxes, mode=reading_mode)

    # ── Translation ───────────────────────────────────────────────────────
    for bid in sorted(clean_lines.keys(), key=lambda x: reading_map.get(x, x)):
        group_texts = [g for g in bubble_group_texts.get(bid, []) if g.strip()]
        # Multi-phrase bubbles are translated phrase-by-phrase and joined
        # with " || "; single-phrase bubbles use the corrected full text.
        if len(group_texts) >= 2:
            src_txt = " ".join(group_texts).strip()
        else:
            src_txt = clean_lines[bid].strip()
        if not src_txt:
            continue
        if not is_valid_language(src_txt, source_lang):
            continue
        if not is_meaningful_text(src_txt, source_lang):
            continue
        try:
            if len(group_texts) >= 2:
                translated_groups = []
                for g in group_texts:
                    if not is_valid_language(g, source_lang):
                        continue
                    if not is_meaningful_text(g, source_lang):
                        continue
                    tg = translator.translate(g) or ""
                    tg = postprocess_translation_general(tg).upper()
                    if tg:
                        translated_groups.append(tg)
                tgt = " || ".join(translated_groups)
            else:
                tgt = translator.translate(src_txt) or ""
                tgt = postprocess_translation_general(tgt).upper()
        except Exception as e:
            # Broad catch is deliberate: a failed translation becomes an
            # inline error marker instead of aborting the whole page.
            tgt = f"[Error: {e}]"
        translations[bid] = tgt

    if debug:
        save_debug_clusters(
            image_path, filtered, bubble_boxes, bubble_indices, clean_lines,
            "debug_clusters.png", region_types=region_types
        )

    # ── Text output ───────────────────────────────────────────────────────
    divider = "─" * 140
    out_lines = [
        "BUBBLE|ORDER|TYPE|CONF|OCR_SOURCE|ORIGINAL|CORRECTED|BUBBLE_GROUPS|TRANSLATED|FLAGS",
        divider
    ]
    print(
        divider + f"\n{'BUBBLE':<8} {'ORDER':<6} {'TYPE':<10} {'CONF':<6} {'SOURCE':<12} "
        f"{'CORRECTED':<30} {'BUBBLE_GROUPS':<40} {'TRANSLATED':<30} FLAGS\n" + divider
    )
    translated_count = 0
    for bid in sorted(clean_lines.keys(), key=lambda x: reading_map.get(x, x)):
        src_txt = clean_lines[bid].strip()
        if not src_txt:
            continue
        if not is_valid_language(src_txt, source_lang):
            continue
        if not is_meaningful_text(src_txt, source_lang):
            continue
        flags = list(region_flags.get(bid, []))
        tgt = translations.get(bid, "")
        if not tgt:
            flags.append("NO_TRANSLATION")
        src_engine = sources_used.get(bid, "unknown")
        rtype = region_types.get(bid, "unknown")
        rconf = region_confidences.get(bid, 0.0)
        raw_u = raw_lines.get(bid, "").upper()
        corr_u = corrected_lines.get(bid, "").upper()
        group_blob = " || ".join(bubble_group_texts.get(bid, [])).upper()
        out_lines.append(
            f"#{bid}|{reading_map.get(bid, bid)}|{rtype}|{rconf:.2f}|{src_engine}|"
            f"{raw_u}|{corr_u}|{group_blob}|{tgt}|{','.join(flags) if flags else '-'}"
        )
        print(
            f"#{bid:<7} {reading_map.get(bid,bid):<6} {rtype:<10} {rconf:<6.2f} {src_engine:<12} "
            f"{corr_u[:30]:<30} {group_blob[:40]:<40} {tgt[:30]:<30} "
            f"{','.join(flags) if flags else '-'}"
        )
        translated_count += 1
    # NOTE(review): translated_count counts emitted rows, including boxes
    # flagged NO_TRANSLATION — confirm the summary wording is intended.
    out_lines.append(divider + f"\n✅ Done! {translated_count} bubble(s) translated.")

    with open(export_to_file, "w", encoding="utf-8") as f:
        f.write("\n".join(out_lines))

    # ── bubbles.json ──────────────────────────────────────────────────────
    bubbles_payload = {}
    for bid in sorted(clean_lines.keys(), key=lambda x: reading_map.get(x, x)):
        src_txt = clean_lines[bid].strip()
        if not src_txt:
            continue
        if not is_valid_language(src_txt, source_lang):
            continue
        if not is_meaningful_text(src_txt, source_lang):
            continue
        box = bubble_boxes.get(bid)
        tgt = translations.get(bid, "")
        bubbles_payload[str(bid)] = {
            "order": reading_map.get(bid, bid),
            "region_type": region_types.get(bid, "unknown"),
            "confidence": round(region_confidences.get(bid, 0.0), 4),
            "ocr_source": sources_used.get(bid, "unknown"),
            "raw_ocr": raw_lines.get(bid, "").upper(),
            "corrected_ocr": corrected_lines.get(bid, "").upper(),
            "translation_input": src_txt.upper(),
            "translated": tgt,
            "flags": region_flags.get(bid, []),
            "bubble_groups": [g.upper() for g in bubble_group_texts.get(bid, [])],
            # Box serialized as x/y/width/height (zeros if box is missing).
            "box": {
                "x": box[0] if box else 0,
                "y": box[1] if box else 0,
                "w": (box[2] - box[0]) if box else 0,
                "h": (box[3] - box[1]) if box else 0,
            },
            "lines": [line.upper() for line in bubbles.get(bid, [])],
        }
    with open(export_bubbles_to, "w", encoding="utf-8") as f:
        json.dump(bubbles_payload, f, ensure_ascii=False, indent=2)

    print(divider + f"\nSaved: {export_to_file}\nSaved: {export_bubbles_to}")


# ============================================================
# translate_manga_text END
# ============================================================

# ============================================================
# ENTRY POINT
# ============================================================
if __name__ == "__main__":
    # NOTE(review): source_lang is the language NAME "english" here while the
    # function default uses the CODE "en"; deep_translator accepts both, but
    # confirm the local language-validation helpers handle both forms.
    translate_manga_text(
        image_path="19.png",
        source_lang="english",
        target_lang="ca",
        confidence_threshold=0.03,
        min_text_length=1,
        gap_px="auto",
        quality_threshold=0.62,
        export_to_file="output.txt",
        export_bubbles_to="bubbles.json",
        reading_mode="ltr",
        debug=True,
        use_enhanced_ocr=True,
        strict_grouping=True,
        max_box_width_ratio=0.6,
        max_box_height_ratio=0.5,
        auto_fix_bubbles=True
    )