diff --git a/README.md b/README.md index f7faae3..86e3167 100644 --- a/README.md +++ b/README.md @@ -182,4 +182,4 @@ Enable verbose output by modifying the logging level in `manga-translator.py` - Processing time: ~10-30 seconds per page (varies by image size and hardware) - ML models are downloaded automatically on first run - GPU acceleration available with compatible CUDA setup (optional) -- Tested on macOS 13+ with Python 3.11 +- Tested on macOS 13+ with Python 3.11 \ No newline at end of file diff --git a/manga-translator.py b/manga-translator.py index 14384bb..11f0615 100644 --- a/manga-translator.py +++ b/manga-translator.py @@ -31,8 +31,6 @@ from difflib import SequenceMatcher # ============================================================ # FIX: COMMON SHORT ENGLISH WORDS (1–2 chars) -# Prevents OCR from discarding or misclassifying valid short tokens. -# Source: most frequent 1-char and 2-char English words. # ============================================================ SHORT_ENGLISH_WORDS_1 = { "A", "I", @@ -41,7 +39,43 @@ SHORT_ENGLISH_WORDS_1 = { SHORT_ENGLISH_WORDS_2 = { "AM", "AN", "AS", "AT", "BE", "BY", "DO", "GO", "HE", "IF", "IN", "IS", "IT", "ME", "MY", "NO", "OF", "OH", "OK", "ON", - "OR", "SO", "TO", "UP", "US", "WE","BUT","I" + "OR", "SO", "TO", "UP", "US", "WE", "BUT", "I" +} + +# ── Manga bold font substitution table ─────────────────────── +BOLD_FONT_WORD_FIXES = { + # D → P misread + r'\bANP\b': 'AND', + r'\bANP,\b': 'AND,', + r'\bBEHINP\b': 'BEHIND', + r'\bHIPING\b': 'HIDING', + r'\bHIPINO\b': 'HIDING', + r'\bPON\'T\b': "DON'T", + r'\bPEATH\b': 'DEATH', + r'\bPEATHI\b': 'DEATH!', + r'\bCRUSHEP\b': 'CRUSHED', + r'\bSTUCK\b': 'STUCK', + r'\bHANP\b': 'HAND', + r'\bHANPI\b': 'HAND!', + # E → F misread + r'\bLIKF\b': 'LIKE', + r'\bTHF\b': 'THE', + r'\bWF\b': 'WE', + # S → 5/59 misread + r'\bHELPLE59\b': 'HELPLESS', + r'\bHELPLESS\b': 'HELPLESS', + # V → W/U misread + r'\bWVLL\b': "WE'LL", + r'\bWVL\b': "WE'LL", + # G → O misread (NG endings) + 
r'\bTOUCHINO\b': 'TOUCHING', + r'\bHIDINO\b': 'HIDING', + r'\bGOINO\b': 'GOING', + # Missing space between words + r'\bIFWE\b': 'IF WE', + r'\bWELL\b': 'WELL', + # I → ! misread at end of exclamation + r'([A-Z])I\b': r'\1!', } # Combined protected set used by is_meaningful_text() @@ -55,7 +89,6 @@ DIALOGUE_STOPWORDS = { } PROTECTED_SHORT_TOKENS = { - # ... existing entries ... "HUH", "HUH?", "HUH??", "HUH?!", "OH", "OH!", "OOH", "OOH!", "AH", "AH!", "UH", "UH...", @@ -65,7 +98,6 @@ PROTECTED_SHORT_TOKENS = { "BECKY", "BECKY!", "DAMIAN", "CECILE", "WALD", "OMIGOSH", "EEEP", "EEEEP", - # FIX: common short words that appear alone on a manga line "GOOD", "WELL", "YEAH", "OKAY", "SURE", "WAIT", "STOP", "LOOK", "COME", "BACK", "HERE", "OVER", "JUST", "EVEN", "ONLY", @@ -73,72 +105,10 @@ PROTECTED_SHORT_TOKENS = { "THIS", "WITH", "FROM", "HAVE", "WILL", } +# ── Single definition of _MANGA_INTERJECTIONS ───────────────── +# FIX Issue 3: only ONE definition — expanded version kept, +# duplicate removed. _MANGA_INTERJECTIONS = { - # ... existing entries ... - # FIX: short words that appear isolated on their own OCR line - 'GOOD', 'WELL', 'YEAH', 'OKAY', 'SURE', - 'WAIT', 'STOP', 'LOOK', 'COME', 'BACK', - 'HERE', 'OVER', 'JUST', 'EVEN', 'ONLY', - 'ALSO', 'THEN', 'WHEN', 'WHAT', 'THAT', - 'THIS', 'WITH', 'FROM', 'HAVE', 'WILL', - 'TRUE', 'REAL', 'FINE', 'DONE', 'GONE', - 'HELP', 'MOVE', 'STAY', 'CALM', 'COOL', -} - - -# FIX: SFX_HINTS contains ONLY pure onomatopoeia — no words -# that could appear in dialogue (MORNING, GOOD, etc. 
removed) -SFX_HINTS = { - # impact / hits - "BAM", "BOOM", "WHAM", "SLAM", "SMACK", "THUD", "CRACK", - "CRASH", "BANG", "POW", "BIFF", "BONK", "CLUNK", "CLANG", - "THWACK", "WHAP", "WHUMP", "FWAP", "FWUP", "FWOOP", - # motion / air - "FSHOO", "WHOOSH", "SWISH", "SWOOSH", "WOOSH", "ZOOM", - "VROOM", "WHIRR", "WHIZZ", - # bells / rings - "RRRING", "RING", "RINGG", "DING", "DONG", "CLANG", - "JINGLE", "CHIME", - # body / breath - "SNIF", "SNIFF", "GULP", "GASP", "WHEEZE", "PANT", - "GRUNT", "GROAN", "SNORE", - # misc short - "GRRP", "GRRR", "TICK", "TOCK", "DRIP", "PLOP", - "SQUEAK", "CREAK", "RUSTLE", "THUMP", - # typing / tech - "BEEP", "BOOP", "BUZZ", "CLICK", "CLACK", - # specific manga sfx - "FWMP", "FTMP", "FWIP", "FWSH", "SHFF", "SHFFT", - "TMP", "TMP TMP", "STEP", "STOMP", -} - -# FIX: REACTION_HINTS — short emotional utterances only -# Proper nouns and greetings removed (they are dialogue) -REACTION_HINTS = { - "HUH", "HUH?!", "HUH?", "HUH??", - "OH", "OH!", "OOH", "OOH!", - "AH", "AH!", "UH", "EH", "EH?", - "TCH", "TSK", - "WHAT?!", "WHAT?", - "NO!", "YES!", - "EEK", "EEEEP", "EEEP", -} - -# ============================================================ -# FIX: narration and dialogue are treated as the same output type. -# Narration boxes are kept structurally but labelled as dialogue -# so they are translated and rendered identically. -# ============================================================ -DIALOGUE_EQUIVALENT_TYPES = {"dialogue", "narration", "reaction"} - -NARRATION_HINTS = { - "AND SO", "MEANWHILE", "LATER", "THEN", "TO BE CONTINUED" -} - -# FIX: Added common sentence-leading words that are 2–3 chars -# and would otherwise be dropped by the alpha-count gate. 
-_MANGA_INTERJECTIONS = { - # --- existing entries --- 'HUH', 'HUH?', 'HUH??', 'HUH?!', 'OH', 'OH!', 'OOH', 'OOH!', 'AH', 'AH!', 'UH', 'UH...', @@ -162,29 +132,81 @@ _MANGA_INTERJECTIONS = { 'SHIVER', 'RRRING', 'MORNING', 'MORNING.', - # --- FIX: sentence starters and conjunctions --- + # Sentence starters and conjunctions 'BUT', 'AND', 'SO', 'OR', 'IF', 'AS', 'YET', 'NOR', 'FOR', - # --- FIX: common short dialogue words --- + # Common short dialogue words 'GET', 'GOT', 'NOT', 'NOW', 'TOO', 'YOU', 'HIM', 'HER', 'ITS', 'OUR', 'CAN', 'DID', 'HAS', 'HAD', 'LET', 'SAY', 'SEE', 'TRY', 'USE', 'ALL', 'ANY', 'ONE', 'OWN', 'NEW', 'OLD', 'BIG', 'BAD', 'ODD', + # Short words that appear isolated on their own OCR line + 'GOOD', 'WELL', 'YEAH', 'OKAY', 'SURE', + 'WAIT', 'STOP', 'LOOK', 'COME', 'BACK', + 'HERE', 'OVER', 'JUST', 'EVEN', 'ONLY', + 'ALSO', 'THEN', 'WHEN', 'THAT', + 'WITH', 'FROM', 'HAVE', 'WILL', + 'TRUE', 'REAL', 'FINE', 'DONE', 'GONE', + 'HELP', 'MOVE', 'STAY', 'CALM', 'COOL', +} + +SFX_HINTS = { + "BAM", "BOOM", "WHAM", "SLAM", "SMACK", "THUD", "CRACK", + "CRASH", "BANG", "POW", "BIFF", "BONK", "CLUNK", "CLANG", + "THWACK", "WHAP", "WHUMP", "FWAP", "FWUP", "FWOOP", + "FSHOO", "WHOOSH", "SWISH", "SWOOSH", "WOOSH", "ZOOM", + "VROOM", "WHIRR", "WHIZZ", + "RRRING", "RING", "RINGG", "DING", "DONG", + "JINGLE", "CHIME", + "SNIF", "SNIFF", "GULP", "GASP", "WHEEZE", "PANT", + "GRUNT", "GROAN", "SNORE", + "GRRP", "GRRR", "TICK", "TOCK", "DRIP", "PLOP", + "SQUEAK", "CREAK", "RUSTLE", "THUMP", + "BEEP", "BOOP", "BUZZ", "CLICK", "CLACK", + "FWMP", "FTMP", "FWIP", "FWSH", "SHFF", "SHFFT", + "TMP", "TMP TMP", "STEP", "STOMP", +} + +REACTION_HINTS = { + "HUH", "HUH?!", "HUH?", "HUH??", + "OH", "OH!", "OOH", "OOH!", + "AH", "AH!", "UH", "EH", "EH?", + "TCH", "TSK", + "WHAT?!", "WHAT?", + "NO!", "YES!", + "EEK", "EEEEP", "EEEP", +} + +DIALOGUE_EQUIVALENT_TYPES = {"dialogue", "narration", "reaction"} + +NARRATION_HINTS = { + "AND SO", "MEANWHILE", "LATER", "THEN", "TO BE 
CONTINUED" +} + +KNOWN_NAMES = { + "BECKY", "DAMIAN", "CECILE", "WALD" +} + +_NOISE_TOKENS = { + 'P', 'F', 'N', 'M', 'X', 'Z', 'Q', + 'FN', 'PF', 'NM', 'XZ', 'FSHOO', 'GRRP', } +# ============================================================ +# NORMALISE REGION TYPE +# ============================================================ def normalise_region_type(region_type: str) -> str: - """ - FIX: Collapse narration → dialogue so both are treated - identically in translation, output, and rendering. - """ if region_type == "narration": return "dialogue" return region_type +# ============================================================ +# GEOMETRY HELPERS +# ============================================================ def xyxy_width(b): return max(1, b[2] - b[0]) @@ -239,58 +261,41 @@ def stopword_ratio(text): hits = sum(1 for t in toks if t in DIALOGUE_STOPWORDS) return hits / len(toks) + +# ============================================================ +# TEXT CLASSIFICATION HELPERS +# ============================================================ def looks_like_sfx_text(text: str) -> bool: - """ - FIX: Rewritten with much stricter guards. - - True SFX characteristics: - - Single token OR very short (≤ 2 words) - - No sentence-ending punctuation (. ! ?) that implies speech - - No stopwords at all - - No known proper nouns (names are dialogue, not sfx) - - Matches known sfx vocabulary OR is a pure onomatopoeia pattern - - Multi-word sentences with stopwords, names, or punctuation - are NEVER sfx regardless of uppercase ratio. 
- """ t = normalize_text(text or "") if not t: return False - alpha = re.sub(r"[^A-Z]", "", t) - words = t.split() + alpha = re.sub(r"[^A-Z]", "", t) + words = t.split() - # Hard block: proper nouns are always dialogue for name in KNOWN_NAMES: if name in words: return False - # Hard block: any stopword present → dialogue toks = re.findall(r"[A-Z']+", t) if any(tok in DIALOGUE_STOPWORDS for tok in toks): return False - # Hard block: sentence punctuation implies speech if re.search(r"[.?!,]", t) and len(words) > 2: return False - # Hard block: more than 3 words is almost certainly dialogue if len(words) > 3: return False - # Exact sfx vocabulary match if t in SFX_HINTS or alpha in SFX_HINTS: return True - # Pure onomatopoeia: repeated consonant clusters, no vowel variety - # e.g. GRRP, THUD, WHAM, FWUP — short, no spaces, high consonant ratio if (len(alpha) >= 2 and len(alpha) <= 8 and uppercase_ratio(t) > 0.90 and stopword_ratio(t) < 0.05 and len(words) == 1): vowels = len(re.findall(r"[AEIOU]", alpha)) consonants = len(alpha) - vowels - # Pure sfx tends to be consonant-heavy or vowel-repetition if consonants >= len(alpha) * 0.55: return True @@ -317,18 +322,11 @@ def contour_features_for_box(image_bgr, box_xyxy): x1, y1, x2, y2 = box_xyxy crop = image_bgr[y1:y2, x1:x2] if crop.size == 0: - return { - "mean_brightness": 0.0, - "edge_density": 1.0, - "whiteness_ratio": 0.0, - } - + return {"mean_brightness": 0.0, "edge_density": 1.0, "whiteness_ratio": 0.0} gray = cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY) mean_brightness = float(np.mean(gray)) / 255.0 - edges = cv2.Canny(gray, 50, 150) edge_density = float(np.mean(edges > 0)) - whiteness_ratio = float(np.mean(gray > 220)) return { "mean_brightness": mean_brightness, @@ -337,17 +335,6 @@ def contour_features_for_box(image_bgr, box_xyxy): } def classify_region_type(image_bgr, box_xyxy, lines): - """ - FIX: Dialogue is now the safe default. - - Decision tree (in priority order): - 1. 
sfx — only when looks_like_sfx_text() AND ≤ 3 words - 2. reaction — very short (≤ 3 words), no stopwords, known reaction vocab - 3. narration — rectangular banner shape + no speech punctuation - 4. dialogue — everything else (DEFAULT) - - Narration is immediately normalised to dialogue per project rules. - """ text = normalize_text(" ".join(lines)) words = text.split() word_count = len(words) @@ -355,28 +342,20 @@ def classify_region_type(image_bgr, box_xyxy, lines): w, h = xyxy_width(box_xyxy), xyxy_height(box_xyxy) ar = w / max(1, h) - # ── 1. SFX ─────────────────────────────────────────────── - # Requires BOTH text hint AND short word count if looks_like_sfx_text(text) and word_count <= 3: return "sfx" - # ── 2. Reaction ────────────────────────────────────────── - # Very short utterances with no stopwords and reaction vocab if (word_count <= 2 and looks_like_reaction_text(text) and stopword_ratio(text) < 0.10): return "reaction" - # ── 3. Narration → immediately collapsed to dialogue ───── - # Narration boxes are wide banners, no tail, rectangular - # Only fire when shape strongly suggests a caption box is_wide_banner = ar > 3.5 and h < 60 if (is_wide_banner and looks_like_narration_text(text) and word_count >= 4): - return "dialogue" # normalise narration → dialogue directly + return "dialogue" - # ── 4. 
Dialogue (default) ──────────────────────────────── return "dialogue" def text_similarity(a, b): @@ -387,11 +366,9 @@ def dedupe_repeated_phrase(text): words = t.split() if len(words) < 4: return t - half = len(words) // 2 if len(words) % 2 == 0 and words[:half] == words[half:]: return " ".join(words[:half]) - cleaned = [] for w in words: if cleaned and cleaned[-1] == w and len(w) > 2: @@ -441,8 +418,7 @@ def fix_common_dialogue_ocr(text): t = re.sub(r"\b([A-Z]+) RE\b", r"\1'RE", t) t = re.sub(r"\b([A-Z]+) VE\b", r"\1'VE", t) t = re.sub(r"\b([A-Z]+) LL\b", r"\1'LL", t) - t = re.sub(r"\b([A-Z]+) S\b", r"\1'S", t) - + t = re.sub(r"\b([A-Z]+) S\b", r"\1'S", t) t = re.sub(r"\s+([,.;:!?])", r"\1", t) t = dehyphenate_linebreak_artifacts(t) t = dedupe_repeated_phrase(t) @@ -454,55 +430,38 @@ def fix_common_dialogue_ocr(text): continue cleaned.append(w) t = " ".join(cleaned) - t = re.sub(r"\s{2,}", " ", t).strip() - return t def region_text_role_hint(text: str) -> str: - """ - FIX: Mirrors the stricter classify_region_type() logic for - use in grouping/scoring where image features are unavailable. - Narration collapses to dialogue. 
- """ words = normalize_text(text or "").split() - if looks_like_sfx_text(text) and len(words) <= 3: return "sfx" - if (len(words) <= 2 and looks_like_reaction_text(text) and stopword_ratio(text) < 0.10): return "reaction" - - # narration → dialogue return "dialogue" - def correct_region_text(text, region_type="dialogue"): t = normalize_text(text or "") if not t: return t, 0.0 - original = t - if region_type in {"dialogue", "reaction", "narration"}: t = fix_common_dialogue_ocr(t) elif region_type == "sfx": t = dedupe_repeated_phrase(t) - score_before = ocr_candidate_score(original) - score_after = ocr_candidate_score(t) - + score_after = ocr_candidate_score(t) correction_gain = max(0.0, score_after - score_before) return t, correction_gain def compute_region_confidence(raw_text, corrected_text, box_xyxy, region_type, image_bgr): - feats = contour_features_for_box(image_bgr, box_xyxy) + feats = contour_features_for_box(image_bgr, box_xyxy) text_score = ocr_candidate_score(corrected_text) - gain = max(0.0, text_score - ocr_candidate_score(raw_text)) + gain = max(0.0, text_score - ocr_candidate_score(raw_text)) role_bonus = 0.08 if region_type in {"dialogue", "reaction", "narration", "sfx"} else 0.0 - score = ( 0.55 * text_score + 0.15 * feats["whiteness_ratio"] + @@ -526,6 +485,7 @@ def build_region_flags(raw_text, corrected_text, region_type, conf): flags.append("LONG_TEXT") return flags + # ============================================================ # HELPERS # ============================================================ @@ -550,40 +510,37 @@ def postprocess_translation_general(text: str) -> str: return t def fix_common_ocr_errors(text: str) -> str: + """ + FIX Issue 1: fix_digit_letters is now defined BEFORE the return + statement so it is actually executed. 
+ """ result = text - # existing fixes - result = re.sub(r'(\d)O(\d)', r'\g<1>0\g<2>', result) - result = re.sub(r'(\d)O([^a-zA-Z])', r'\g<1>0\g<2>', result) - result = result.replace('|', 'I') - result = result.replace('`', "'") + # Word-level bold font fixes + for pattern, replacement in BOLD_FONT_WORD_FIXES.items(): + result = re.sub(pattern, replacement, result) - # FIX: Replace digit-zero used as letter-O in common English words. - # Vision OCR sometimes reads O → 0 in bold/stylised manga fonts. - # Pattern: word containing digits that look like letters. + # Digit-as-letter substitution (e.g. G00D → GOOD, M0RNING → MORNING) DIGIT_AS_LETTER = { - '0': 'O', - '1': 'I', - '3': 'E', - '4': 'A', - '5': 'S', - '8': 'B', + '0': 'O', '1': 'I', '3': 'E', + '4': 'A', '5': 'S', '8': 'B', } - # Only apply inside tokens that are otherwise all-alpha - # e.g. "G00D" → "GOOD", "M0RNING" → "MORNING" def fix_digit_letters(m): - word = m.group(0) + word = m.group(0) fixed = word for digit, letter in DIGIT_AS_LETTER.items(): fixed = fixed.replace(digit, letter) - # Only accept the fix if the result is all-alpha (real word) - if fixed.isalpha(): - return fixed - return word + # Only accept if result is purely alphabetic (avoids mangling numbers) + return fixed if fixed.isalpha() else word result = re.sub(r'\b[A-Za-z0-9]{2,12}\b', fix_digit_letters, result) + # Standard symbol fixes + result = re.sub(r'(\d)O(\d)', r'\g<1>0\g<2>', result) + result = result.replace('|', 'I') + result = result.replace('`', "'") + return result def is_valid_language(text: str, source_lang: str) -> bool: @@ -626,469 +583,9 @@ def is_valid_language(text: str, source_lang: str) -> bool: return True -_NOISE_TOKENS = { - 'P', 'F', 'N', 'M', 'X', 'Z', 'Q', - 'FN', 'PF', 'NM', 'XZ', 'FSHOO', 'GRRP', -} - -_MANGA_INTERJECTIONS = { - 'HUH', 'HUH?', 'HUH??', 'HUH?!', - 'OH', 'OH!', 'OOH', 'OOH!', - 'AH', 'AH!', 'UH', 'UH...', - 'HEY', 'HEY!', - 'EH', 'EH?', - 'WOW', 'WOW!', - 'YES', 'NO', 'NO!', - 'RUN', 'GO', 
'GO!', - 'STOP', 'WAIT', - 'WHAT', 'WHAT?', 'WHAT?!', - 'WHY', 'WHY?', - 'HOW', 'HOW?', - 'OK', 'OK!', 'OKAY', - 'EEEEP', 'EEEP', - 'OMIGOSH', - 'BECKY', 'BECKY!', - 'HMM', 'HMM...', - 'TSK', 'TCH', - 'GRRR','I','A', - 'FWUP', 'FWAP', - 'SHIVER', - 'RRRING', - 'MORNING', 'MORNING.', -} - -def group_indices_into_vertical_columns(indices, ocr, - x_tolerance_factor=1.4, - min_vertical_span_factor=1.8): - if not indices: - return [] - - items = [] - for i in indices: - b = quad_bbox(ocr[i][0]) - cx = (b[0] + b[2]) / 2.0 - cy = (b[1] + b[3]) / 2.0 - w = max(1, b[2] - b[0]) - h = max(1, b[3] - b[1]) - items.append((i, b, cx, cy, w, h)) - - med_w = float(np.median([it[4] for it in items])) if items else 12.0 - med_h = float(np.median([it[5] for it in items])) if items else 12.0 - x_tol = max(10.0, med_w * x_tolerance_factor) - - items_sorted = sorted(items, key=lambda x: x[2]) - columns = [] - - for it in items_sorted: - placed = False - for col in columns: - if abs(it[2] - col["xc"]) <= x_tol: - col["members"].append(it) - col["xc"] = float(np.mean([m[2] for m in col["members"]])) - placed = True - break - if not placed: - columns.append({"xc": it[2], "members": [it]}) - - clean_columns = [] - for col in columns: - members = sorted(col["members"], key=lambda x: x[3]) - ys = [m[3] for m in members] - vertical_span = max(ys) - min(ys) if len(ys) > 1 else 0.0 - - if len(members) >= 2 or vertical_span >= med_h * min_vertical_span_factor: - clean_columns.append([m[0] for m in members]) - else: - clean_columns.append([m[0] for m in members]) - - clean_columns.sort(key=lambda grp: np.mean([(quad_bbox(ocr[i][0])[0] + quad_bbox(ocr[i][0])[2]) / 2.0 for i in grp])) - return clean_columns - -def group_indices_into_horizontal_rows(indices, ocr, row_tol_factor=0.75): - if not indices: - return [] - - items = [] - for i in indices: - b = quad_bbox(ocr[i][0]) - cx = (b[0] + b[2]) / 2.0 - cy = (b[1] + b[3]) / 2.0 - h = max(1, b[3] - b[1]) - items.append((i, b, cx, cy, h)) - - med_h = 
float(np.median([it[4] for it in items])) if items else 10.0 - row_tol = max(6.0, med_h * row_tol_factor) - - items.sort(key=lambda x: x[3]) - rows = [] - - for it in items: - placed = False - for row in rows: - if abs(it[3] - row["yc"]) <= row_tol: - row["members"].append(it) - row["yc"] = float(np.mean([m[3] for m in row["members"]])) - placed = True - break - if not placed: - rows.append({"yc": it[3], "members": [it]}) - - groups = [] - for row in rows: - members = sorted(row["members"], key=lambda x: x[2]) - groups.append([m[0] for m in members]) - - return groups - -def score_text_groups(groups, ocr): - if not groups: - return 0.0 - - texts = [] - lengths = [] - - for grp in groups: - parts = [] - for i in grp: - t = normalize_text(ocr[i][1]) - if t: - parts.append(t) - txt = normalize_text(" ".join(parts)) - if txt: - texts.append(txt) - lengths.append(len(txt.split())) - - if not texts: - return 0.0 - - text_scores = [ocr_candidate_score(t) for t in texts] - avg_text_score = float(np.mean(text_scores)) if text_scores else 0.0 - avg_len = float(np.mean(lengths)) if lengths else 0.0 - fragmentation_penalty = max(0.0, len(groups) - 4) * 0.08 - - return avg_text_score + min(0.5, avg_len * 0.05) - fragmentation_penalty - -def detect_internal_text_layout(indices, ocr, reading_mode="ltr"): - if not indices: - return {"mode": "horizontal", "blocks": []} - - blocks = split_indices_into_vertical_blocks(indices, ocr) - - resolved_blocks = [] - - for block in blocks: - horizontal_groups = group_indices_into_horizontal_rows(block, ocr) - vertical_groups = group_indices_into_vertical_columns(block, ocr) - - h_score = score_text_groups(horizontal_groups, ocr) - v_score = score_text_groups(vertical_groups, ocr) - - if len(vertical_groups) >= 2 and v_score >= h_score - 0.03: - resolved_blocks.append({ - "mode": "vertical", - "groups": vertical_groups - }) - else: - resolved_blocks.append({ - "mode": "horizontal", - "groups": horizontal_groups - }) - - return {"mode": 
"block-mixed", "blocks": resolved_blocks} - - -def build_text_from_layout(indices, ocr, reading_mode="ltr"): - layout = detect_internal_text_layout(indices, ocr, reading_mode=reading_mode) - output_lines = [] - - for block in layout["blocks"]: - groups = block["groups"] - mode = block["mode"] - - if mode == "horizontal": - for grp in groups: - line = normalize_text(" ".join( - ocr[i][1] for i in grp if normalize_text(ocr[i][1]) - )) - if line: - output_lines.append(line) - - elif mode == "vertical": - if reading_mode == "rtl": - groups = sorted( - groups, - key=lambda grp: np.mean([(quad_bbox(ocr[i][0])[0] + quad_bbox(ocr[i][0])[2]) / 2.0 for i in grp]), - reverse=True - ) - else: - groups = sorted( - groups, - key=lambda grp: np.mean([(quad_bbox(ocr[i][0])[0] + quad_bbox(ocr[i][0])[2]) / 2.0 for i in grp]) - ) - - for grp in groups: - grp_sorted = sorted(grp, key=lambda i: (quad_bbox(ocr[i][0])[1] + quad_bbox(ocr[i][0])[3]) / 2.0) - line = normalize_text(" ".join( - ocr[i][1] for i in grp_sorted if normalize_text(ocr[i][1]) - )) - if line: - output_lines.append(line) - - return output_lines - # ============================================================ -# FIX: BUBBLE CONTOUR MEMBERSHIP CACHE -# Pre-compute which speech-bubble contour each OCR quad belongs to -# so that two quads in *different* contours are NEVER merged. +# PROTECTED TOKEN HELPERS # ============================================================ - -def build_quad_to_bubble_map(ocr: list, bubble_contours: list) -> Dict[int, int]: - """ - Returns a dict {ocr_index -> bubble_contour_index} - OCR quads that fall outside every contour get value -1. 
- """ - mapping: Dict[int, int] = {} - for idx in range(len(ocr)): - bbox = quad_bbox(ocr[idx][0]) - cx = (bbox[0] + bbox[2]) / 2.0 - cy = (bbox[1] + bbox[3]) / 2.0 - assigned = -1 - for cidx, contour in enumerate(bubble_contours): - if cv2.pointPolygonTest(contour, (float(cx), float(cy)), False) >= 0: - assigned = cidx - break - mapping[idx] = assigned - return mapping - - -def same_bubble_contour(idx_a: int, idx_b: int, - quad_to_bubble: Dict[int, int]) -> bool: - """ - Returns True only when both quads are inside the SAME detected contour. - Two quads that are both 'outside' (-1) are treated as potentially - different regions (conservative). - """ - ca = quad_to_bubble.get(idx_a, -1) - cb = quad_to_bubble.get(idx_b, -1) - if ca == -1 or cb == -1: - return False # unknown → don't force-merge - return ca == cb - - -# ============================================================ -# REGION PROPOSAL FROM OCR GEOMETRY (FIXED) -# ============================================================ -def propose_text_regions_from_ocr(ocr, image_shape, image_bgr=None): - """ - Build larger text containers from OCR boxes before final classification. - - FIX 1: Tightened proximity thresholds so quads from adjacent speech - bubbles are not merged. - FIX 2: When image_bgr is supplied, pre-compute bubble contours and - refuse to merge two quads that belong to *different* contours. 
- """ - ih, iw = image_shape[:2] - if not ocr: - return {}, {}, {}, {} - - boxes = [quad_bbox(x[0]) for x in ocr] - hs = [max(1, b[3] - b[1]) for b in boxes] - med_h = float(np.median(hs)) if hs else 14.0 - - # FIX: build contour membership map when image is available - quad_to_bubble: Dict[int, int] = {} - if image_bgr is not None: - bubble_contours = detect_speech_bubbles(image_bgr) - quad_to_bubble = build_quad_to_bubble_map(ocr, bubble_contours) - - parent = list(range(len(ocr))) - - def find(x): - while parent[x] != x: - parent[x] = parent[parent[x]] - x = parent[x] - return x - - def union(a, b): - ra, rb = find(a), find(b) - if ra != rb: - parent[rb] = ra - - for i in range(len(ocr)): - bi = boxes[i] - for j in range(i + 1, len(ocr)): - bj = boxes[j] - - # FIX: hard-block merging quads from different contours - if quad_to_bubble and not same_bubble_contour(i, j, quad_to_bubble): - continue - - dx = abs(xyxy_center(bi)[0] - xyxy_center(bj)[0]) - dy = abs(xyxy_center(bi)[1] - xyxy_center(bj)[1]) - - hov = horizontal_overlap_ratio(bi, bj) - vov = vertical_overlap_ratio(bi, bj) - dist = box_distance(bi, bj) - - # FIX: tightened from med_h*2.2 → med_h*1.4 - same_band = dy <= med_h * 1.4 - # FIX: tightened from med_h*3.2 → med_h*2.0 - stacked = hov >= 0.35 and dy <= med_h * 2.0 - # FIX: tightened from med_h*5.0 → med_h*3.5 - same_line = vov >= 0.45 and dx <= med_h * 3.5 - # FIX: tightened from med_h*4.5 → med_h*2.8 - near = dist <= med_h * 2.8 - - if same_line or stacked or (near and (same_band or hov > 0.25)): - if orientation_compatible(i, j, ocr): - union(i, j) - - groups = {} - for i in range(len(ocr)): - groups.setdefault(find(i), []).append(i) - - region_lines = {} - region_boxes = {} - region_quads = {} - region_indices = {} - next_id = 1 - - for _, idxs in sorted(groups.items(), key=lambda kv: min(boxes[i][1] for i in kv[1])): - idxs = sorted(idxs, key=lambda i: (boxes[i][1], boxes[i][0])) - ub = boxes_union_xyxy([boxes[i] for i in idxs]) - if ub is None: 
- continue - region_lines[next_id] = build_lines_from_indices(idxs, ocr) - region_boxes[next_id] = box_expand(ub, pad=max(2, int(med_h * 0.25)), iw=iw, ih=ih) - region_quads[next_id] = [ocr[i][0] for i in idxs] - region_indices[next_id] = idxs - next_id += 1 - - return region_lines, region_boxes, region_quads, region_indices - -# ============================================================ -# RECONCILE REGION-FIRST AND BUBBLE-FIRST GROUPS (FIXED) -# ============================================================ -def reconcile_region_and_bubble_groups(region_lines, region_boxes, region_quads, region_indices, - bubbles, bubble_boxes, bubble_quads, bubble_indices, - ocr): - """ - Reconcile region-first and bubble-first groupings. - - FIX: Tightened overlap/IoU thresholds so that spatially adjacent but - semantically distinct boxes are no longer collapsed. - overlap_ratio: 0.55 → 0.70 - iou: 0.35 → 0.45 - shared indices: still triggers merge (correct behaviour) - """ - combined = [] - - for rid in region_boxes: - combined.append(("region", rid, region_boxes[rid], region_indices[rid])) - - for bid in bubble_boxes: - combined.append(("bubble", bid, bubble_boxes[bid], bubble_indices[bid])) - - if not combined: - return {}, {}, {}, {} - - visited = set() - kept = [] - - def group_score(box, idxs): - text = normalize_text(" ".join(build_lines_from_indices(idxs, ocr))) - role = region_text_role_hint(text) - - role_bonus = { - "dialogue": 0.8, - "narration": 0.75, - "reaction": 0.7, - "sfx": 0.2, - "unknown": 0.1 - }.get(role, 0.1) - - box_area = bbox_area_xyxy(box) - area_bonus = min(1.0, box_area / 50000.0) - - return ( - len(idxs) * 2.0 + - min(20, len(text.split())) * 0.5 + - min(1.0, ocr_candidate_score(text)) + - role_bonus + - area_bonus * 0.25 - ) - - for i in range(len(combined)): - if i in visited: - continue - - cluster = [i] - visited.add(i) - - _, _, box_i, idx_i = combined[i] - - for j in range(i + 1, len(combined)): - if j in visited: - continue - - _, _, box_j, 
idx_j = combined[j] - - ovs = boxes_overlap_ratio(box_i, box_j) - iou = boxes_iou(box_i, box_j) - shared = len(set(idx_i).intersection(idx_j)) - - # FIX: raised thresholds — only collapse truly overlapping boxes - if ovs >= 0.70 or iou >= 0.45 or shared > 0: - cluster.append(j) - visited.add(j) - - best_idx = max( - cluster, - key=lambda k: group_score(combined[k][2], combined[k][3]) - ) - kept.append(combined[best_idx]) - - kept.sort(key=lambda item: ( - (item[2][1] + item[2][3]) / 2.0, - (item[2][0] + item[2][2]) / 2.0 - )) - - out_lines, out_boxes, out_quads, out_indices = {}, {}, {}, {} - next_id = 1 - - for typ, oid, box, idxs in kept: - idxs = sorted( - set(idxs), - key=lambda k: (quad_bbox(ocr[k][0])[1], quad_bbox(ocr[k][0])[0]) - ) - - out_lines[next_id] = build_lines_from_indices(idxs, ocr) - out_boxes[next_id] = box - out_quads[next_id] = [ocr[k][0] for k in idxs] - out_indices[next_id] = idxs - next_id += 1 - - return out_lines, out_boxes, out_quads, out_indices - -# ============================================================ -# PROTECTED TOKENS / SHORT DIALOGUE SAFETY NET -# ============================================================ -PROTECTED_SHORT_TOKENS = { - "HUH", "HUH?", "HUH??", "HUH?!", - "OH", "OH!", "OOH", "OOH!", - "AH", "AH!", "UH", "UH...", - "HEY", "HEY!", "EH", "EH?", - "WOW", "WOW!", - "MORNING", "MORNING.", - "BECKY", "BECKY!", - "DAMIAN", "CECILE", "WALD", - "OMIGOSH", "EEEP", "EEEEP" -} - -KNOWN_NAMES = { - "BECKY", "DAMIAN", "CECILE", "WALD" -} - def is_protected_token(text: str) -> bool: t = normalize_text(text or "") if not t: @@ -1110,49 +607,37 @@ def is_meaningful_text(text: str, source_lang: str, min_alpha_chars: int = 2) -> t = text.strip() t_upper = normalize_text(t) - # ── FIX: ALL protection checks run BEFORE any length gate ── - # Order matters: shortest/most fragile tokens must be - # protected first so they never reach the discard logic. - - # 1. 
Common 1–2 char English words + # Protection checks run BEFORE any length gate lang = source_lang.lower() if lang in {"en", "english"} and t_upper in SHORT_ENGLISH_PROTECTED: return True - # 2. Explicitly protected tokens (names, interjections) if is_protected_token(t_upper): return True - # 3. Manga interjections and sentence starters t_alpha_only = re.sub(r'[^A-Za-zÀ-ÿ]', '', t_upper) if t_upper in _MANGA_INTERJECTIONS or t_alpha_only in _MANGA_INTERJECTIONS: return True - # 4. Short punctuated utterances like "Huh?" / "Oh!" - if re.fullmatch(r"[A-Za-zÀ-ÿ]{1,6}[!?\\.]{1,3}", t.strip()): + if re.fullmatch(r"[A-Za-zÀ-ÿ]{1,6}[!?\\.]{ 1,3}", t.strip()): return True - # ── Now apply the alpha character count gate ─────────────── alpha_count = sum(c.isalpha() for c in t) if alpha_count < min_alpha_chars: return False - # ── Noise token blocklist ────────────────────────────────── if t_upper in _NOISE_TOKENS: return False - # ── Non-Latin character ratio check ─────────────────────── if lang in ['en', 'english', 'es', 'spanish', 'fr', 'french', 'it', 'italian', 'ca', 'catalan', 'de', 'german']: non_alpha = sum(not c.isalpha() for c in t) if len(t) > 0 and (non_alpha / len(t)) > 0.72: return False - # ── Repeated single character (e.g. 
"AAAA") ─────────────── if len(t) >= 3 and len(set(t_upper)) == 1: return False - # ── No vowels in a long word → likely noise ──────────────── if lang in ['en', 'english', 'es', 'spanish', 'fr', 'french', 'it', 'italian', 'ca', 'catalan', 'de', 'german']: if len(t) > 5: @@ -1162,6 +647,10 @@ def is_meaningful_text(text: str, source_lang: str, min_alpha_chars: int = 2) -> return True + +# ============================================================ +# QUAD / BOX UTILITIES +# ============================================================ def quad_bbox(quad): xs = [p[0] for p in quad] ys = [p[1] for p in quad] @@ -1214,7 +703,6 @@ def boxes_iou(a, b): return inter / max(1, area_a + area_b - inter) def boxes_overlap_ratio(a, b): - """Ratio of intersection to the SMALLER box area.""" ax1, ay1, ax2, ay2 = a bx1, by1, bx2, by2 = b ix1, iy1 = max(ax1, bx1), max(ay1, by1) @@ -1240,14 +728,9 @@ def ocr_candidate_score(text: str) -> float: bad = len(re.findall(r"[^\w\s\.\,\!\?\-\'\:\;\(\)\[\]\"¡¿]", t)) / n penalty = 0.0 - - # FIX: Only penalise isolated single letters when the WHOLE token - # is a single letter — not when a word like "I" or "A" appears - # inside a longer sentence. Old pattern \b[A-Z]\b fired on "I" - # inside "I CAN'T" which incorrectly penalised valid dialogue. 
+ # FIX: only penalise when the WHOLE token is a single letter if re.fullmatch(r"[A-Z]", t.strip()): penalty += 0.05 - if re.search(r"[0-9]{2,}", t): penalty += 0.08 @@ -1264,10 +747,10 @@ def quad_is_vertical(quad, ratio_threshold=1.5) -> bool: # ============================================================ -# ENHANCED IMAGE PREPROCESSING +# IMAGE PREPROCESSING # ============================================================ def enhance_image_for_ocr(image_bgr, upscale_factor=2.5): - h, w = image_bgr.shape[:2] + h, w = image_bgr.shape[:2] upscaled = cv2.resize(image_bgr, (int(w * upscale_factor), int(h * upscale_factor)), interpolation=cv2.INTER_CUBIC) gray = cv2.cvtColor(upscaled, cv2.COLOR_BGR2GRAY) @@ -1284,8 +767,8 @@ def enhance_image_for_ocr(image_bgr, upscale_factor=2.5): return cv2.cvtColor(cleaned, cv2.COLOR_GRAY2BGR) def detect_small_text_regions(image_bgr, existing_quads): - gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY) - mask = np.zeros(gray.shape, dtype=np.uint8) + gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY) + mask = np.zeros(gray.shape, dtype=np.uint8) for quad in existing_quads: cv2.fillPoly(mask, [np.array(quad, dtype=np.int32)], 255) mask_inv = cv2.bitwise_not(mask) @@ -1421,13 +904,13 @@ def validate_and_split_oversized_quads(image_bgr, filtered_ocr): # ============================================================ -# HORIZONTAL GAP DETECTION AT QUAD LEVEL +# HORIZONTAL GAP DETECTION # ============================================================ def detect_horizontal_gap_in_group(indices, ocr, med_h, gap_factor=2.5): if len(indices) < 2: return None - items = sorted(indices, key=lambda i: quad_center(ocr[i][0])[0]) - boxes = [quad_bbox(ocr[i][0]) for i in items] + items = sorted(indices, key=lambda i: quad_center(ocr[i][0])[0]) + boxes = [quad_bbox(ocr[i][0]) for i in items] gap_threshold = med_h * gap_factor best_gap, best_split = 0.0, None for k in range(len(items) - 1): @@ -1454,7 +937,7 @@ def orientation_compatible(idx_a, idx_b, ocr): 
def group_indices_into_vertical_columns(indices, ocr,
                                        x_tolerance_factor=1.4,
                                        min_vertical_span_factor=1.8):
    """
    Cluster OCR quads into columns by the x-centre of their bounding boxes.

    Quads are visited in order of increasing x-centre. Each one joins the
    first existing column whose running mean x-centre lies within the
    tolerance; otherwise it opens a new column.

    Args:
        indices: Positions into ``ocr`` of the quads to cluster.
        ocr: OCR results; ``ocr[i][0]`` is the quad handed to ``quad_bbox``.
        x_tolerance_factor: Multiplier applied to the median quad width to
            obtain the x tolerance (the tolerance is never below 10.0).
        min_vertical_span_factor: Accepted but not read by this function.

    Returns:
        A list of columns ordered by ascending mean x-centre. Each column is
        a list of OCR indices ordered by ascending y-centre. An empty list is
        returned when ``indices`` is empty.
    """
    if not indices:
        return []

    # One record per quad: (ocr index, centre x, centre y, width).
    records = []
    for idx in indices:
        box = quad_bbox(ocr[idx][0])
        records.append((
            idx,
            (box[0] + box[2]) / 2.0,
            (box[1] + box[3]) / 2.0,
            max(1, box[2] - box[0]),
        ))

    median_width = float(np.median([rec[3] for rec in records])) if records else 12.0
    tolerance = max(10.0, median_width * x_tolerance_factor)

    # Each column is a two-item list: [running mean centre x, member records].
    columns = []
    for rec in sorted(records, key=lambda r: r[1]):
        for column in columns:
            if abs(rec[1] - column[0]) <= tolerance:
                column[1].append(rec)
                column[0] = float(np.mean([m[1] for m in column[1]]))
                break
        else:
            # No existing column was close enough.
            columns.append([rec[1], [rec]])

    # Order members top-to-bottom, then order the columns left-to-right by
    # the mean centre x of their (y-ordered) members.
    keyed_columns = []
    for _, members in columns:
        members_by_y = sorted(members, key=lambda r: r[2])
        mean_x = np.mean([m[1] for m in members_by_y])
        keyed_columns.append((mean_x, [m[0] for m in members_by_y]))

    keyed_columns.sort(key=lambda pair: pair[0])
    return [column for _, column in keyed_columns]
def score_text_groups(groups, ocr):
    """
    Score a candidate grouping of OCR quads by the text it produces.

    For every group the normalized, non-empty quad texts are joined with
    spaces and normalized again. Groups that yield no text are ignored for
    the averages below.

    The score is the sum of:
      * the mean ``ocr_candidate_score`` of the group texts,
      * a length bonus of 0.05 per mean word count, capped at 0.5,
    minus a fragmentation penalty of 0.08 for every group beyond the fourth
    (counted over ``groups``, including groups that produced no text).

    Args:
        groups: List of groups, each a list of indices into ``ocr``.
        ocr: OCR results; ``ocr[i][1]`` is the text handed to
            ``normalize_text``.

    Returns:
        The score as a float; 0.0 when ``groups`` is empty or no group
        produced any text.
    """
    if not groups:
        return 0.0

    group_texts = []
    for members in groups:
        fragments = (normalize_text(ocr[idx][1]) for idx in members)
        joined = normalize_text(" ".join(frag for frag in fragments if frag))
        if joined:
            group_texts.append(joined)

    if not group_texts:
        return 0.0

    mean_quality = float(np.mean([ocr_candidate_score(text) for text in group_texts]))
    mean_words = float(np.mean([len(text.split()) for text in group_texts]))
    length_bonus = min(0.5, mean_words * 0.05)
    fragmentation_penalty = max(0.0, len(groups) - 4) * 0.08
    return mean_quality + length_bonus - fragmentation_penalty
def same_bubble_contour(idx_a: int, idx_b: int,
                        quad_to_bubble: Dict[int, int]) -> bool:
    """
    Tell whether two OCR quads are assigned to the same bubble contour.

    Args:
        idx_a: OCR index of the first quad.
        idx_b: OCR index of the second quad.
        quad_to_bubble: Mapping from OCR index to contour id, where -1 means
            "not inside any contour". Missing indices are treated as -1.

    Returns:
        True only when both quads have a contour id other than -1 and the
        two ids are equal; False otherwise.
    """
    contour_a = quad_to_bubble.get(idx_a, -1)
    contour_b = quad_to_bubble.get(idx_b, -1)
    # A quad without a contour never matches anything, including another
    # quad without a contour.
    unassigned = contour_a == -1 or contour_b == -1
    return False if unassigned else contour_a == contour_b
def reconcile_region_and_bubble_groups(region_lines, region_boxes, region_quads,
                                       region_indices, bubbles, bubble_boxes,
                                       bubble_quads, bubble_indices, ocr):
    """
    Merge region-based and bubble-based groupings into one set of groups.

    All region groups, followed by all bubble groups, are pooled as
    candidates. Walking the pool in order, each unclaimed candidate becomes
    an anchor and claims every later unclaimed candidate that, relative to
    the anchor, has ``boxes_overlap_ratio`` >= 0.70, ``boxes_iou`` >= 0.45,
    or at least one shared OCR index. From each such cluster only the
    highest-scoring candidate is kept (the earliest one on ties).

    Kept groups are ordered by box centre y, then centre x, and renumbered
    from 1.

    Args:
        region_boxes / region_indices: Box and OCR-index list per region id.
        bubble_boxes / bubble_indices: Box and OCR-index list per bubble id.
        ocr: OCR results; ``ocr[i][0]`` is the quad of entry ``i``.
        region_lines, region_quads, bubbles, bubble_quads: Accepted but not
            read by this function; lines and quads are rebuilt from ``ocr``.

    Returns:
        ``(lines, boxes, quads, indices)`` dicts keyed by the new ids. Four
        empty dicts when there are no candidates at all.
    """
    candidates = [
        ("region", rid, region_boxes[rid], region_indices[rid])
        for rid in region_boxes
    ]
    candidates.extend(
        ("bubble", bid, bubble_boxes[bid], bubble_indices[bid])
        for bid in bubble_boxes
    )
    if not candidates:
        return {}, {}, {}, {}

    role_weights = {
        "dialogue": 0.8,
        "narration": 0.75,
        "reaction": 0.7,
        "sfx": 0.2,
        "unknown": 0.1,
    }

    def candidate_score(position):
        # Rank a candidate by quad count, word count (capped at 20), OCR
        # text score (capped at 1.0), text role and box area.
        _, _, box, members = candidates[position]
        text = normalize_text(" ".join(build_lines_from_indices(members, ocr)))
        role_bonus = role_weights.get(region_text_role_hint(text), 0.1)
        area_bonus = min(1.0, bbox_area_xyxy(box) / 50000.0)
        return (
            len(members) * 2.0
            + min(20, len(text.split())) * 0.5
            + min(1.0, ocr_candidate_score(text))
            + role_bonus
            + area_bonus * 0.25
        )

    def reading_key(ocr_idx):
        # Sort quads by bounding-box top edge, then left edge.
        bbox = quad_bbox(ocr[ocr_idx][0])
        return (bbox[1], bbox[0])

    claimed = set()
    winners = []
    total = len(candidates)
    for anchor, (_, _, anchor_box, anchor_members) in enumerate(candidates):
        if anchor in claimed:
            continue
        claimed.add(anchor)
        cluster = [anchor]

        # Candidates are compared against the anchor only, not against
        # other members already pulled into the cluster.
        for other in range(anchor + 1, total):
            if other in claimed:
                continue
            other_box = candidates[other][2]
            other_members = candidates[other][3]
            containment = boxes_overlap_ratio(anchor_box, other_box)
            iou = boxes_iou(anchor_box, other_box)
            shares_quads = bool(set(anchor_members).intersection(other_members))
            if containment >= 0.70 or iou >= 0.45 or shares_quads:
                cluster.append(other)
                claimed.add(other)

        winners.append(candidates[max(cluster, key=candidate_score)])

    winners.sort(key=lambda item: (
        (item[2][1] + item[2][3]) / 2.0,
        (item[2][0] + item[2][2]) / 2.0,
    ))

    out_lines, out_boxes, out_quads, out_indices = {}, {}, {}, {}
    for new_id, (_, _, box, members) in enumerate(winners, start=1):
        ordered = sorted(set(members), key=reading_key)
        out_lines[new_id] = build_lines_from_indices(ordered, ocr)
        out_boxes[new_id] = box
        out_quads[new_id] = [ocr[k][0] for k in ordered]
        out_indices[new_id] = ordered

    return out_lines, out_boxes, out_quads, out_indices
new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in group]) + new_boxes[next_bid] = boxes_union_xyxy( + [quad_bbox(ocr[i][0]) for i in group]) new_quads[next_bid] = [ocr[i][0] for i in group] new_indices[next_bid] = group next_bid += 1 @@ -1566,8 +1395,8 @@ def detect_and_split_multi_bubble_boxes(bubble_boxes, bubble_indices, bubble_qua box = bubble_boxes[bid] x1, y1, x2, y2 = box if (x2 - x1) > med_h * 10: - x_centers = [quad_center(ocr[i][0])[0] for i in indices] - x_median = np.median(x_centers) + x_centers = [quad_center(ocr[i][0])[0] for i in indices] + x_median = np.median(x_centers) left_group = [i for i in indices if quad_center(ocr[i][0])[0] < x_median] right_group = [i for i in indices if quad_center(ocr[i][0])[0] >= x_median] if left_group and right_group: @@ -1576,7 +1405,8 @@ def detect_and_split_multi_bubble_boxes(bubble_boxes, bubble_indices, bubble_qua if right_box[0] - left_box[2] > med_h * 1.5: for grp in [left_group, right_group]: new_bubbles[next_bid] = build_lines_from_indices(grp, ocr) - new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in grp]) + new_boxes[next_bid] = boxes_union_xyxy( + [quad_bbox(ocr[i][0]) for i in grp]) new_quads[next_bid] = [ocr[i][0] for i in grp] new_indices[next_bid] = grp next_bid += 1 @@ -1591,12 +1421,13 @@ def detect_and_split_multi_bubble_boxes(bubble_boxes, bubble_indices, bubble_qua if splits_made: print(f"\n🔧 Split {len(splits_made)} multi-bubble box(es):") - for s in splits_made: print(f" ✓ {s}") + for s in splits_made: + print(f" ✓ {s}") return new_bubbles, new_boxes, new_quads, new_indices def detect_and_merge_fragmented_bubbles(bubble_boxes, bubble_indices, bubble_quads, - bubbles, ocr, image_bgr): + bubbles, ocr, image_bgr): all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1]) for i in range(len(ocr))] med_h = float(np.median(all_h)) if all_h else 14.0 @@ -1619,7 +1450,8 @@ def detect_and_merge_fragmented_bubbles(bubble_boxes, bubble_indices, bubble_qua ) 
if in_same_bubble: if abs(cx_i - cx_j) < med_h * 3.0 and abs(cy_i - cy_j) < med_h * 6.0: - to_merge.append((bid_i, bid_j) if cy_i < cy_j else (bid_j, bid_i)) + to_merge.append( + (bid_i, bid_j) if cy_i < cy_j else (bid_j, bid_i)) if not to_merge: return bubbles, bubble_boxes, bubble_quads, bubble_indices @@ -1631,22 +1463,26 @@ def detect_and_merge_fragmented_bubbles(bubble_boxes, bubble_indices, bubble_qua for key in merge_groups: if top in merge_groups[key] or bottom in merge_groups[key]: merge_groups[key].update({top, bottom}) - found = True; break + found = True + break if not found: merge_groups[len(merge_groups)] = {top, bottom} new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {} merged_bids, next_bid = set(), 1 + for merge_set in merge_groups.values(): merge_list = sorted(merge_set) print(f" ✓ Merging: {', '.join(f'#{b}' for b in merge_list)}") all_indices = sorted(set(idx for b in merge_list for idx in bubble_indices[b])) - for b in merge_list: merged_bids.add(b) + for b in merge_list: + merged_bids.add(b) new_bubbles[next_bid] = build_lines_from_indices(all_indices, ocr) new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in all_indices]) new_quads[next_bid] = [ocr[i][0] for i in all_indices] new_indices[next_bid] = all_indices next_bid += 1 + for bid in bids: if bid not in merged_bids: new_bubbles[next_bid] = bubbles[bid] @@ -1654,23 +1490,13 @@ def detect_and_merge_fragmented_bubbles(bubble_boxes, bubble_indices, bubble_qua new_quads[next_bid] = bubble_quads[bid] new_indices[next_bid] = bubble_indices[bid] next_bid += 1 + return new_bubbles, new_boxes, new_quads, new_indices def merge_boxes_by_proximity_and_overlap(bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr, med_h): - """ - Merges boxes that are vertically close AND share significant horizontal overlap. 
- - FIX: Tightened thresholds to prevent cross-bubble merging: - vert_gap: med_h * 1.5 → med_h * 0.8 - h_overlap_ratio: 0.35 → 0.55 - - This keeps legitimate fragment merges (same bubble, split by OCR) - while blocking merges across adjacent bubbles that happen to be - vertically stacked (the Box-8 / Box-6 failure cases). - """ - bids = sorted(bubble_boxes.keys()) + bids = sorted(bubble_boxes.keys()) merge_map: Dict[int, List[int]] = {} merged_into: Dict[int, int] = {} @@ -1687,14 +1513,12 @@ def merge_boxes_by_proximity_and_overlap(bubble_boxes, bubble_indices, bubble_qu box_j = bubble_boxes[bid_j] wj = max(1, box_j[2] - box_j[0]) - vert_gap = max(0, max(box_i[1], box_j[1]) - min(box_i[3], box_j[3])) - h_ix1 = max(box_i[0], box_j[0]) - h_ix2 = min(box_i[2], box_j[2]) + vert_gap = max(0, max(box_i[1], box_j[1]) - min(box_i[3], box_j[3])) + h_ix1 = max(box_i[0], box_j[0]) + h_ix2 = min(box_i[2], box_j[2]) h_overlap = max(0, h_ix2 - h_ix1) h_overlap_ratio = h_overlap / max(1, min(wi, wj)) - # FIX: tightened from med_h*1.5 → med_h*0.8 - # FIX: tightened from 0.35 → 0.55 if vert_gap <= med_h * 0.8 and h_overlap_ratio >= 0.55: root = merged_into.get(bid_i, bid_i) merge_map.setdefault(root, [root]) @@ -1730,13 +1554,10 @@ def merge_boxes_by_proximity_and_overlap(bubble_boxes, bubble_indices, bubble_qu return new_bubbles, new_boxes, new_quads, new_indices + def _majority_contour_id(indices: list, quad_to_bubble: Dict[int, int]) -> int: - """ - FIX B helper: Returns the most common contour ID among all quads - in a box. Falls back to -1 only if truly no quad is inside any contour. 
- """ from collections import Counter - ids = [quad_to_bubble.get(i, -1) for i in indices] + ids = [quad_to_bubble.get(i, -1) for i in indices] valid = [cid for cid in ids if cid != -1] if not valid: return -1 @@ -1744,12 +1565,7 @@ def _majority_contour_id(indices: list, quad_to_bubble: Dict[int, int]) -> int: def merge_continuation_boxes(bubble_boxes, bubble_indices, bubble_quads, - bubbles, ocr, image_bgr): - """ - FIX B: Uses majority contour vote instead of idx[0] only. - Also relaxed vert_gap threshold from med_h*2.5 → med_h*3.5 - to catch boxes like 002/box9+10 that have a slightly larger gap. - """ + bubbles, ocr, image_bgr): all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1]) for i in range(len(ocr))] med_h = float(np.median(all_h)) if all_h else 14.0 @@ -1766,24 +1582,18 @@ def merge_continuation_boxes(bubble_boxes, bubble_indices, bubble_quads, bid_i = bids[i] if bid_i in visited: continue - - box_i = bubble_boxes[bid_i] - text_i = normalize_text(" ".join(bubbles.get(bid_i, []))) - role_i = region_text_role_hint(text_i) - - if role_i == "sfx": + box_i = bubble_boxes[bid_i] + text_i = normalize_text(" ".join(bubbles.get(bid_i, []))) + if region_text_role_hint(text_i) == "sfx": continue for j in range(i + 1, len(bids)): bid_j = bids[j] if bid_j in visited: continue - box_j = bubble_boxes[bid_j] text_j = normalize_text(" ".join(bubbles.get(bid_j, []))) - role_j = region_text_role_hint(text_j) - - if role_j == "sfx": + if region_text_role_hint(text_j) == "sfx": continue idx_i = bubble_indices[bid_i] @@ -1791,20 +1601,18 @@ def merge_continuation_boxes(bubble_boxes, bubble_indices, bubble_quads, if not idx_i or not idx_j: continue - # FIX B: majority vote instead of idx[0] cid_i = _majority_contour_id(idx_i, quad_to_bubble) cid_j = _majority_contour_id(idx_j, quad_to_bubble) if cid_i == -1 or cid_j == -1 or cid_i != cid_j: continue - # FIX B: relaxed from med_h*2.5 → med_h*3.5 vert_gap = max(0, max(box_i[1], box_j[1]) - min(box_i[3], box_j[3])) 
if vert_gap > med_h * 3.5: continue h_overlap = max(0, min(box_i[2], box_j[2]) - max(box_i[0], box_j[0])) min_w = min(xyxy_width(box_i), xyxy_width(box_j)) - if h_overlap / max(1, min_w) < 0.20: # FIX B: relaxed from 0.25 → 0.20 + if h_overlap / max(1, min_w) < 0.20: continue merge_pairs.append((bid_i, bid_j)) @@ -1816,7 +1624,6 @@ def merge_continuation_boxes(bubble_boxes, bubble_indices, bubble_quads, return bubbles, bubble_boxes, bubble_quads, bubble_indices print(f"\n🔗 Continuation merge: {len(merge_pairs)} pair(s):") - processed = set() new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {} next_bid = 1 @@ -1844,30 +1651,16 @@ def merge_continuation_boxes(bubble_boxes, bubble_indices, bubble_quads, return new_bubbles, new_boxes, new_quads, new_indices + def merge_same_column_dialogue_boxes(bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr, image_bgr): - """ - FIX D: Merges dialogue boxes that share the same horizontal column - (strong x-overlap) and are vertically close, even when they have - different contour IDs. - - This catches 004/box2+6 where the speech bubble body and its - continuation are detected as separate contours. 
- - Criteria: - - Both boxes are dialogue (not sfx) - - Horizontal overlap ratio ≥ 0.50 (same column) - - Vertical gap ≤ med_h * 4.0 - - Combined height ≤ image_height * 0.35 (not a full-page merge) - """ ih, iw = image_bgr.shape[:2] - all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1]) - for i in range(len(ocr))] - med_h = float(np.median(all_h)) if all_h else 14.0 - - bids = sorted(bubble_boxes.keys(), - key=lambda b: (bubble_boxes[b][1] + bubble_boxes[b][3]) / 2.0) + all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1]) + for i in range(len(ocr))] + med_h = float(np.median(all_h)) if all_h else 14.0 + bids = sorted(bubble_boxes.keys(), + key=lambda b: (bubble_boxes[b][1] + bubble_boxes[b][3]) / 2.0) merge_pairs = [] visited = set() @@ -1875,7 +1668,6 @@ def merge_same_column_dialogue_boxes(bubble_boxes, bubble_indices, bubble_quads, bid_i = bids[i] if bid_i in visited: continue - box_i = bubble_boxes[bid_i] text_i = normalize_text(" ".join(bubbles.get(bid_i, []))) if region_text_role_hint(text_i) == "sfx": @@ -1885,25 +1677,21 @@ def merge_same_column_dialogue_boxes(bubble_boxes, bubble_indices, bubble_quads, bid_j = bids[j] if bid_j in visited: continue - box_j = bubble_boxes[bid_j] text_j = normalize_text(" ".join(bubbles.get(bid_j, []))) if region_text_role_hint(text_j) == "sfx": continue - # Vertical gap check vert_gap = max(0, max(box_i[1], box_j[1]) - min(box_i[3], box_j[3])) if vert_gap > med_h * 4.0: continue - # Horizontal overlap check - h_ov = max(0, min(box_i[2], box_j[2]) - max(box_i[0], box_j[0])) + h_ov = max(0, min(box_i[2], box_j[2]) - max(box_i[0], box_j[0])) min_w = min(xyxy_width(box_i), xyxy_width(box_j)) if h_ov / max(1, min_w) < 0.50: continue - # Combined height sanity check - merged_h = (max(box_i[3], box_j[3]) - min(box_i[1], box_j[1])) + merged_h = max(box_i[3], box_j[3]) - min(box_i[1], box_j[1]) if merged_h > ih * 0.35: continue @@ -1916,7 +1704,6 @@ def merge_same_column_dialogue_boxes(bubble_boxes, 
bubble_indices, bubble_quads, return bubbles, bubble_boxes, bubble_quads, bubble_indices print(f"\n📐 Same-column dialogue merge: {len(merge_pairs)} pair(s):") - processed = set() new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {} next_bid = 1 @@ -1944,17 +1731,9 @@ def merge_same_column_dialogue_boxes(bubble_boxes, bubble_indices, bubble_quads, return new_bubbles, new_boxes, new_quads, new_indices + def auto_fix_bubble_detection(bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr, image_bgr): - """ - Full fix pipeline: - 1. Split boxes spanning multiple bubbles. - 2. Merge fragments inside the same contour. - 3. Merge continuation boxes (same bubble, split detection). - 4. FIX D: Merge same-column dialogue boxes. - 5. Proximity+overlap merge — pass 1. - 6. Proximity+overlap merge — pass 2. - """ print("\n🔍 Running automatic bubble detection fixes...") all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1]) for i in range(len(ocr))] @@ -1972,7 +1751,6 @@ def auto_fix_bubble_detection(bubble_boxes, bubble_indices, bubble_quads, merge_continuation_boxes( bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr, image_bgr) - # FIX D: same-column dialogue merge bubbles, bubble_boxes, bubble_quads, bubble_indices = \ merge_same_column_dialogue_boxes( bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr, image_bgr) @@ -1991,21 +1769,26 @@ def auto_fix_bubble_detection(bubble_boxes, bubble_indices, bubble_quads, def remove_nested_boxes(bubble_boxes, bubble_indices, bubble_quads, bubbles, - overlap_threshold=0.50): + overlap_threshold=0.50): bids = list(bubble_boxes.keys()) to_remove = set() + for i in range(len(bids)): bid_i = bids[i] - if bid_i in to_remove: continue + if bid_i in to_remove: + continue box_i = bubble_boxes[bid_i] area_i = max(0, box_i[2]-box_i[0]) * max(0, box_i[3]-box_i[1]) + for j in range(i + 1, len(bids)): bid_j = bids[j] - if bid_j in to_remove: continue + if bid_j in to_remove: + continue box_j = bubble_boxes[bid_j] 
area_j = max(0, box_j[2]-box_j[0]) * max(0, box_j[3]-box_j[1]) shared = set(bubble_indices[bid_i]).intersection(bubble_indices[bid_j]) overlap = boxes_overlap_ratio(box_i, box_j) + if overlap > overlap_threshold or len(shared) > 0: if area_i >= area_j: to_remove.add(bid_j) @@ -2014,6 +1797,7 @@ def remove_nested_boxes(bubble_boxes, bubble_indices, bubble_quads, bubbles, to_remove.add(bid_i) print(f" 🗑️ Removing BOX#{bid_i} (overlaps BOX#{bid_j})") break + if to_remove: print(f"\n🧹 Removed {len(to_remove)} overlapping/nested box(es)") for bid in to_remove: @@ -2021,44 +1805,52 @@ def remove_nested_boxes(bubble_boxes, bubble_indices, bubble_quads, bubbles, bubble_indices.pop(bid, None) bubble_quads.pop(bid, None) bubbles.pop(bid, None) + return bubbles, bubble_boxes, bubble_quads, bubble_indices def enforce_max_box_size(bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr, - max_width_ratio=0.6, max_height_ratio=0.5, image_shape=None): + max_width_ratio=0.6, max_height_ratio=0.5, image_shape=None): if image_shape is None: return bubbles, bubble_boxes, bubble_quads, bubble_indices - ih, iw = image_shape[:2] - max_width, max_height = iw * max_width_ratio, ih * max_height_ratio + + ih, iw = image_shape[:2] + max_width = iw * max_width_ratio + max_height = ih * max_height_ratio new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {} next_bid, splits_made = 1, [] for bid, box in bubble_boxes.items(): x1, y1, x2, y2 = box w, h = x2 - x1, y2 - y1 + if w > max_width or h > max_height: - indices = bubble_indices[bid] - col_split = split_bubble_if_multiple_columns(indices, ocr, bid=bid, - use_aggressive_thresholds=True) + indices = bubble_indices[bid] + col_split = split_bubble_if_multiple_columns( + indices, ocr, bid=bid, use_aggressive_thresholds=True) if col_split: for grp in col_split: new_bubbles[next_bid] = build_lines_from_indices(grp, ocr) - new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in grp]) + new_boxes[next_bid] = boxes_union_xyxy( 
+ [quad_bbox(ocr[i][0]) for i in grp]) new_quads[next_bid] = [ocr[i][0] for i in grp] new_indices[next_bid] = grp next_bid += 1 splits_made.append(f"BOX#{bid} (oversized: {w}x{h}px)") continue + row_split = split_bubble_if_multiple_rows(indices, ocr, bid=bid) if row_split: for grp in row_split: new_bubbles[next_bid] = build_lines_from_indices(grp, ocr) - new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in grp]) + new_boxes[next_bid] = boxes_union_xyxy( + [quad_bbox(ocr[i][0]) for i in grp]) new_quads[next_bid] = [ocr[i][0] for i in grp] new_indices[next_bid] = grp next_bid += 1 splits_made.append(f"BOX#{bid} (oversized: {w}x{h}px)") continue + new_bubbles[next_bid] = bubbles[bid] new_boxes[next_bid] = box new_quads[next_bid] = bubble_quads[bid] @@ -2067,12 +1859,14 @@ def enforce_max_box_size(bubble_boxes, bubble_indices, bubble_quads, bubbles, oc if splits_made: print(f"\n📏 Split {len(splits_made)} oversized box(es):") - for s in splits_made: print(f" ✓ {s}") + for s in splits_made: + print(f" ✓ {s}") + return new_bubbles, new_boxes, new_quads, new_indices def should_merge_groups(group1_indices, group2_indices, ocr, median_height, - max_vertical_gap=None): + max_vertical_gap=None): if max_vertical_gap is None: max_vertical_gap = median_height * 2.5 box1 = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in group1_indices]) @@ -2088,20 +1882,10 @@ def should_merge_groups(group1_indices, group2_indices, ocr, median_height, # ============================================================ -# FIX: CONTOUR-AWARE BUBBLE SPLITTING -# Splits a merged group using actual contour membership BEFORE -# any proximity/overlap merging pass runs. +# CONTOUR-AWARE BUBBLE SPLITTING # ============================================================ - def split_group_by_contour_membership(indices: list, ocr: list, quad_to_bubble: Dict[int, int]) -> List[List[int]]: - """ - Partition OCR indices by their assigned bubble contour. 
- Indices with no contour (-1) form their own singleton groups. - - Returns a list of groups; if all indices share the same contour - the original list is returned as-is (no split needed). - """ buckets: Dict[int, List[int]] = {} for idx in indices: cid = quad_to_bubble.get(idx, -1) @@ -2110,7 +1894,6 @@ def split_group_by_contour_membership(indices: list, ocr: list, if len(buckets) <= 1: return [indices] - # Sort each bucket top-to-bottom result = [] for cid, group in sorted(buckets.items()): group_sorted = sorted(group, @@ -2119,25 +1902,8 @@ def split_group_by_contour_membership(indices: list, ocr: list, result.append(group_sorted) return result -# ============================================================ -# FIX: MIXED-TYPE GROUP SPLITTER -# Splits a group whose quads contain BOTH sfx-like and -# dialogue-like text into separate sub-groups. -# This fixes Box-12/007, Box-22/007, Box-13/008 where an SFX -# quad (RRRING, A MEAL-GRUBBING SHE-BEAST) was merged with a -# dialogue quad because they shared the same contour region. -# ============================================================ def split_group_by_region_type(indices: list, ocr: list) -> List[List[int]]: - """ - Partition OCR indices by their inferred region type. - - Groups with only one type are returned as-is. - Groups mixing sfx + dialogue/narration are split so each - type forms its own sub-group, ordered top-to-bottom. - - Returns a list of index groups. 
- """ if len(indices) <= 1: return [indices] @@ -2147,15 +1913,12 @@ def split_group_by_region_type(indices: list, ocr: list) -> List[List[int]]: role = region_text_role_hint(text) typed.setdefault(role, []).append(idx) - # Only split when we have genuinely different types present - # and at least one group is sfx (the most common contaminator) has_sfx = "sfx" in typed has_dialogue = "dialogue" in typed or "narration" in typed or "reaction" in typed if not (has_sfx and has_dialogue): return [indices] - # Build clean groups sorted top-to-bottom within each type result = [] for role in ("dialogue", "narration", "reaction", "sfx", "unknown"): group = typed.get(role, []) @@ -2170,17 +1933,7 @@ def split_group_by_region_type(indices: list, ocr: list) -> List[List[int]]: def split_group_by_spatial_gap(indices: list, ocr: list, - gap_factor: float = 1.2) -> List[List[int]]: - """ - FIX C: Reduced gap_factor from 1.8 → 1.2 and added adaptive - minimum gap based on the actual inter-quad spacing distribution. - - This catches tight splits like: - 007/box12: "YOU'RE A BIG MEAN JERK." vs "I HATE YOU, SY-ON BOY." - 007/box15: three separate italic caption lines - 007/box21: two side-by-side dialogue bubbles - 008/box13: "AND I'M TOO CUTE..." vs "I WAS NOT!" 
- """ + gap_factor: float = 1.2) -> List[List[int]]: if len(indices) <= 1: return [indices] @@ -2188,29 +1941,24 @@ def split_group_by_spatial_gap(indices: list, ocr: list, for i in indices] med_h = float(np.median(all_h)) if all_h else 14.0 - # ── Adaptive gap: use median inter-quad gap as baseline ─── - sorted_by_y = sorted(indices, key=lambda i: quad_bbox(ocr[i][0])[1]) + sorted_by_y = sorted(indices, key=lambda i: quad_bbox(ocr[i][0])[1]) inter_gaps_y = [] for k in range(len(sorted_by_y) - 1): b_curr = quad_bbox(ocr[sorted_by_y[k]][0]) b_next = quad_bbox(ocr[sorted_by_y[k+1]][0]) - gap = b_next[1] - b_curr[3] + gap = b_next[1] - b_curr[3] if gap > 0: inter_gaps_y.append(gap) - # Adaptive threshold: max of (med_h * gap_factor) and - # (median_inter_gap * 2.5) — whichever is smaller wins if inter_gaps_y: - median_inter = float(np.median(inter_gaps_y)) + median_inter = float(np.median(inter_gaps_y)) gap_threshold_y = min(med_h * gap_factor, - max(med_h * 0.8, median_inter * 2.5)) + max(med_h * 0.8, median_inter * 2.5)) else: gap_threshold_y = med_h * gap_factor - # ── Try horizontal split first ──────────────────────────── - sorted_by_x = sorted(indices, key=lambda i: quad_bbox(ocr[i][0])[0]) - boxes_x = [quad_bbox(ocr[i][0]) for i in sorted_by_x] - + sorted_by_x = sorted(indices, key=lambda i: quad_bbox(ocr[i][0])[0]) + boxes_x = [quad_bbox(ocr[i][0]) for i in sorted_by_x] inter_gaps_x = [] for k in range(len(sorted_by_x) - 1): gap = boxes_x[k+1][0] - boxes_x[k][2] @@ -2218,7 +1966,7 @@ def split_group_by_spatial_gap(indices: list, ocr: list, inter_gaps_x.append(gap) if inter_gaps_x: - median_inter_x = float(np.median(inter_gaps_x)) + median_inter_x = float(np.median(inter_gaps_x)) gap_threshold_x = min(med_h * gap_factor, max(med_h * 0.8, median_inter_x * 2.5)) else: @@ -2235,13 +1983,10 @@ def split_group_by_spatial_gap(indices: list, ocr: list, left = [sorted_by_x[i] for i in range(best_h_split + 1)] right = [sorted_by_x[i] for i in range(best_h_split + 1, 
len(sorted_by_x))] if left and right: - # Recurse to catch further splits in each half - return (split_group_by_spatial_gap(left, ocr, gap_factor) + + return (split_group_by_spatial_gap(left, ocr, gap_factor) + split_group_by_spatial_gap(right, ocr, gap_factor)) - # ── Try vertical split ──────────────────────────────────── boxes_y = [quad_bbox(ocr[i][0]) for i in sorted_by_y] - best_v_gap, best_v_split = 0.0, None for k in range(len(sorted_by_y) - 1): gap = boxes_y[k + 1][1] - boxes_y[k][3] @@ -2253,27 +1998,49 @@ def split_group_by_spatial_gap(indices: list, ocr: list, top = [sorted_by_y[i] for i in range(best_v_split + 1)] bottom = [sorted_by_y[i] for i in range(best_v_split + 1, len(sorted_by_y))] if top and bottom: - # Recurse to catch further splits in each half - return (split_group_by_spatial_gap(top, ocr, gap_factor) + + return (split_group_by_spatial_gap(top, ocr, gap_factor) + split_group_by_spatial_gap(bottom, ocr, gap_factor)) return [indices] + +def split_at_sentence_boundaries(quads: list, lines: list) -> List[list]: + """ + FIX Issue 2: now wired into apply_contour_split_to_all_boxes as + Strategy 4. Splits a group when a line ends with sentence-ending + punctuation AND the next line starts a new sentence. + """ + if len(lines) <= 1: + return [quads] + + SENTENCE_END = re.compile(r'[!?\\.]\s*$') + SENTENCE_START = re.compile(r'^(I|IF|WE|IT|HE|SHE|THEY|YOU|BUT|AND|SO|NOW)[^a-z]') + + groups = [] + current = [] + + for i, (quad, line) in enumerate(zip(quads, lines)): + current.append(quad) + if i < len(lines) - 1: + if SENTENCE_END.search(line) and SENTENCE_START.match(lines[i + 1]): + groups.append(current) + current = [] + + if current: + groups.append(current) + + return groups if len(groups) > 1 else [quads] + + def apply_contour_split_to_all_boxes(bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr, image_bgr): """ - FIX: Pre-pass that runs BEFORE proximity merging. 
- Chains three split strategies in order: + Pre-pass that runs BEFORE proximity merging. + Chains four split strategies in order: 1. Contour membership — different speech-bubble contours - 2. Mixed region type — sfx quads merged with dialogue quads (NEW) - 3. Spatial gap — two dialogue bubbles side-by-side (NEW) - - Primary fix for: - Box-8/008 (4 bubbles merged) - Box-6/008 (2 adjacent bubbles merged) - Box-12/007 (RRRING + dialogue merged) - Box-22/007 (two dialogue bubbles merged) - Box-13/008 (RRRING + dialogue merged) + 2. Mixed region type — sfx quads merged with dialogue quads + 3. Spatial gap — two dialogue bubbles side-by-side + 4. Sentence boundary — FIX Issue 2: now actually called here """ bubble_contours = detect_speech_bubbles(image_bgr) quad_to_bubble = (build_quad_to_bubble_map(ocr, bubble_contours) @@ -2286,26 +2053,32 @@ def apply_contour_split_to_all_boxes(bubble_boxes, bubble_indices, bubble_quads, for bid in sorted(bubble_boxes.keys()): indices = bubble_indices[bid] - # ── Strategy 1: contour membership ─────────────────── + # Strategy 1: contour membership groups = split_group_by_contour_membership(indices, ocr, quad_to_bubble) - # ── Strategy 2: mixed region type ──────────────────── - # Apply to every group produced by strategy 1 + # Strategy 2: mixed region type refined = [] for grp in groups: sub = split_group_by_region_type(grp, ocr) refined.extend(sub) groups = refined - # ── Strategy 3: spatial gap ─────────────────────────── - # Apply to every group produced by strategies 1+2 + # Strategy 3: spatial gap final = [] for grp in groups: sub = split_group_by_spatial_gap(grp, ocr, gap_factor=1.8) final.extend(sub) groups = final - # ── Commit results ──────────────────────────────────── + # Strategy 4: sentence boundary split ← FIX Issue 2 + sentence_final = [] + for grp in groups: + grp_lines = [normalize_text(ocr[i][1]) for i in grp] + sub = split_at_sentence_boundaries(grp, grp_lines) + sentence_final.extend(sub) + groups = sentence_final 
+ + # Commit results if len(groups) <= 1: new_bubbles[next_bid] = bubbles[bid] new_boxes[next_bid] = bubble_boxes[bid] @@ -2335,174 +2108,46 @@ def apply_contour_split_to_all_boxes(bubble_boxes, bubble_indices, bubble_quads, # ============================================================ -# ENHANCED OCR ENGINE +# SPLIT HELPERS FOR enforce_max_box_size # ============================================================ -class ImprovedMacVisionDetector: - def __init__(self, source_lang="en"): - lang_key = source_lang.lower().strip() - lang_map = { - "en": "en-US", "english": "en-US", - "es": "es-ES", "spanish": "es-ES", - "ca": "ca-ES", "catalan": "ca-ES", - "fr": "fr-FR", "french": "fr-FR", - "ja": "ja-JP", "japanese": "ja-JP", - "it": "it-IT", "italian": "it-IT", - "de": "de-DE", "german": "de-DE", - "ko": "ko-KR", "korean": "ko-KR", - "zh": "zh-Hans", "chinese": "zh-Hans" - } - self.langs = [lang_map.get(lang_key, "en-US")] - print(f"⚡ Using Enhanced Apple Vision OCR (Language: {self.langs[0]})") +def split_bubble_if_multiple_columns(indices, ocr, bid=None, + use_aggressive_thresholds=False): + if len(indices) < 2: + return None + all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1]) for i in indices] + med_h = float(np.median(all_h)) if all_h else 14.0 + factor = 1.5 if use_aggressive_thresholds else 2.5 + result = detect_horizontal_gap_in_group(indices, ocr, med_h, gap_factor=factor) + if result is None: + return None + left_group, right_group = result + if not left_group or not right_group: + return None + return [left_group, right_group] - def preprocess_variants(self, image_bgr): - variants = [("enhanced", enhance_image_for_ocr(image_bgr, upscale_factor=2.5))] - gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY) - _, hc = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) - variants.append(("high_contrast", - cv2.cvtColor(cv2.resize(hc, None, fx=2.5, fy=2.5, - interpolation=cv2.INTER_CUBIC), - cv2.COLOR_GRAY2BGR))) - 
variants.append(("bilateral", - cv2.resize(cv2.bilateralFilter(image_bgr, 9, 75, 75), - None, fx=2.5, fy=2.5, interpolation=cv2.INTER_CUBIC))) - variants.append(("inverted", - cv2.resize(cv2.bitwise_not(image_bgr), - None, fx=2.5, fy=2.5, interpolation=cv2.INTER_CUBIC))) - variants.append(("original", - cv2.resize(image_bgr, None, fx=2.5, fy=2.5, - interpolation=cv2.INTER_CUBIC))) - return variants +def split_bubble_if_multiple_rows(indices, ocr, bid=None): + if len(indices) < 2: + return None + all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1]) for i in indices] + med_h = float(np.median(all_h)) if all_h else 14.0 + groups = check_vertical_alignment_split(indices, ocr, threshold=int(med_h * 2.5)) + if len(groups) > 1: + return groups + return None - def run_vision_ocr(self, image_bgr): - if image_bgr is None or image_bgr.size == 0: - return [] - ih, iw = image_bgr.shape[:2] - success, buffer = cv2.imencode('.png', image_bgr) - if not success: - return [] - ns_data = NSData.dataWithBytes_length_(buffer.tobytes(), len(buffer)) - cg_image = Quartz.CGImageSourceCreateWithData(ns_data, None) - cg_image = Quartz.CGImageSourceCreateImageAtIndex(cg_image, 0, None) - request = Vision.VNRecognizeTextRequest.alloc().init() - request.setRecognitionLevel_(1) - request.setUsesLanguageCorrection_(True) - request.setRecognitionLanguages_(self.langs) - handler = Vision.VNImageRequestHandler.alloc().initWithCGImage_options_( - cg_image, {}) - handler.performRequests_error_([request], None) - results = [] - scale_x, scale_y = iw, ih - for obs in (request.results() or []): - bbox = obs.boundingBox() - x1 = int(bbox.origin.x * scale_x) - y1 = int((1 - bbox.origin.y - bbox.size.height) * scale_y) - x2 = int((bbox.origin.x + bbox.size.width) * scale_x) - y2 = int((1 - bbox.origin.y) * scale_y) - x1, y1 = max(0, x1), max(0, y1) - x2, y2 = min(iw, x2), min(ih, y2) - if x2 <= x1 or y2 <= y1: - continue - text = obs.topCandidates_(1)[0].string() if obs.topCandidates_(1) else 
"" - conf = float(obs.topCandidates_(1)[0].confidence()) if obs.topCandidates_(1) else 0.0 - quad = [[x1,y1],[x2,y1],[x2,y2],[x1,y2]] - results.append((quad, text, conf)) - return results - - def detect(self, image_bgr): - """ - Multi-variant OCR with consensus merging. - Returns list of (quad, text, conf) tuples. - """ - if image_bgr is None or image_bgr.size == 0: - return [] - - variants = self.preprocess_variants(image_bgr) - all_results = [] - variant_names = [] - - for name, variant_img in variants: - try: - res = self.run_vision_ocr(variant_img) - # scale coordinates back to original image space - vh, vw = variant_img.shape[:2] - oh, ow = image_bgr.shape[:2] - sx, sy = ow / max(1, vw), oh / max(1, vh) - scaled = [] - for quad, text, conf in res: - sq = [[int(p[0]*sx), int(p[1]*sy)] for p in quad] - scaled.append((sq, text, conf)) - all_results.append(scaled) - variant_names.append(name) - except Exception as e: - print(f" ⚠️ Variant '{name}' failed: {e}") - - if not all_results: - return [] - - return self._merge_variant_results(all_results, variant_names) - - def _merge_variant_results(self, all_results, variant_names): - """ - Merge OCR results from multiple preprocessing variants. - Strategy: use the variant with the most detections as base, - then fill gaps from other variants using IoU matching. - """ - """ - FIX E: Use self.langs[0] locale for is_meaningful_text() - instead of hardcoded "en", so short words like "BUT" and "I" - are protected when source_lang != "en". - """ - if not all_results: - return [] - - # Derive source_lang string from self.langs[0] (e.g. 
"en-US" → "en") - lang_code = self.langs[0].split("-")[0].lower() - - base_idx = max(range(len(all_results)), key=lambda i: len(all_results[i])) - base = list(all_results[base_idx]) - others = [r for i, r in enumerate(all_results) if i != base_idx] - - for other in others: - for quad_o, text_o, conf_o in other: - box_o = quad_bbox(quad_o) - matched = False - for k, (quad_b, text_b, conf_b) in enumerate(base): - box_b = quad_bbox(quad_b) - if boxes_iou(box_o, box_b) > 0.40: - if conf_o > conf_b: - base[k] = (quad_b, text_o, conf_o) - matched = True - break - # FIX E: use lang_code not hardcoded "en" - if not matched and is_meaningful_text(text_o, lang_code): - base.append((quad_o, text_o, conf_o)) - - return base # ============================================================ # BUILD LINES FROM INDICES # ============================================================ def build_lines_from_indices(indices, ocr, reading_mode="ltr"): - """ - Build ordered text lines from a set of OCR quad indices. - Uses layout detection to handle both horizontal and vertical text. - """ if not indices: return [] return build_text_from_layout(indices, ocr, reading_mode=reading_mode) def split_indices_into_vertical_blocks(indices, ocr, gap_factor=4.0): - """ - FIX A: Raised gap_factor from 2.5 → 4.0 - - The old value cut off trailing punctuation tokens ("...!!", "DY", - "ENEMIES.") that sit a few pixels below the main text block. - A larger gap is needed before we consider two groups to be in - separate bubbles — contour splitting handles the real separations. 
- """ + """FIX A: gap_factor raised from 2.5 → 4.0""" if not indices: return [] - all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1]) for i in indices] med_h = float(np.median(all_h)) if all_h else 14.0 gap_th = med_h * gap_factor @@ -2520,47 +2165,134 @@ def split_indices_into_vertical_blocks(indices, ocr, gap_factor=4.0): return blocks + # ============================================================ -# SPLIT HELPERS FOR enforce_max_box_size +# ENHANCED OCR ENGINE # ============================================================ -def split_bubble_if_multiple_columns(indices, ocr, bid=None, - use_aggressive_thresholds=False): - """ - Attempt to split indices into left/right column groups. - Returns list of groups if a clear column gap is found, else None. - """ - if len(indices) < 2: - return None +class ImprovedMacVisionDetector: + def __init__(self, source_lang="en"): + lang_key = source_lang.lower().strip() + lang_map = { + "en": "en-US", "english": "en-US", + "es": "es-ES", "spanish": "es-ES", + "ca": "ca-ES", "catalan": "ca-ES", + "fr": "fr-FR", "french": "fr-FR", + "ja": "ja-JP", "japanese": "ja-JP", + "it": "it-IT", "italian": "it-IT", + "de": "de-DE", "german": "de-DE", + "ko": "ko-KR", "korean": "ko-KR", + "zh": "zh-Hans", "chinese": "zh-Hans", + } + self.langs = [lang_map.get(lang_key, "en-US")] + print(f"⚡ Using Enhanced Apple Vision OCR (Language: {self.langs[0]})") - all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1]) for i in indices] - med_h = float(np.median(all_h)) if all_h else 14.0 - factor = 1.5 if use_aggressive_thresholds else 2.5 + def preprocess_variants(self, image_bgr): + variants = [("enhanced", enhance_image_for_ocr(image_bgr, upscale_factor=2.5))] + gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY) + _, hc = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) + variants.append(("high_contrast", + cv2.cvtColor( + cv2.resize(hc, None, fx=2.5, fy=2.5, + interpolation=cv2.INTER_CUBIC), + 
cv2.COLOR_GRAY2BGR))) + variants.append(("bilateral", + cv2.resize( + cv2.bilateralFilter(image_bgr, 9, 75, 75), + None, fx=2.5, fy=2.5, interpolation=cv2.INTER_CUBIC))) + variants.append(("inverted", + cv2.resize( + cv2.bitwise_not(image_bgr), + None, fx=2.5, fy=2.5, interpolation=cv2.INTER_CUBIC))) + variants.append(("original", + cv2.resize(image_bgr, None, fx=2.5, fy=2.5, + interpolation=cv2.INTER_CUBIC))) + return variants - result = detect_horizontal_gap_in_group(indices, ocr, med_h, gap_factor=factor) - if result is None: - return None - left_group, right_group = result - if not left_group or not right_group: - return None - return [left_group, right_group] + def run_vision_ocr(self, image_bgr): + if image_bgr is None or image_bgr.size == 0: + return [] + ih, iw = image_bgr.shape[:2] + success, buffer = cv2.imencode('.png', image_bgr) + if not success: + return [] + ns_data = NSData.dataWithBytes_length_(buffer.tobytes(), len(buffer)) + cg_image = Quartz.CGImageSourceCreateWithData(ns_data, None) + cg_image = Quartz.CGImageSourceCreateImageAtIndex(cg_image, 0, None) + request = Vision.VNRecognizeTextRequest.alloc().init() + request.setRecognitionLevel_(1) + request.setUsesLanguageCorrection_(True) + request.setRecognitionLanguages_(self.langs) + handler = Vision.VNImageRequestHandler.alloc().initWithCGImage_options_( + cg_image, {}) + handler.performRequests_error_([request], None) + results = [] + scale_x, scale_y = iw, ih + for obs in (request.results() or []): + bbox = obs.boundingBox() + x1 = int(bbox.origin.x * scale_x) + y1 = int((1 - bbox.origin.y - bbox.size.height) * scale_y) + x2 = int((bbox.origin.x + bbox.size.width) * scale_x) + y2 = int((1 - bbox.origin.y) * scale_y) + x1, y1 = max(0, x1), max(0, y1) + x2, y2 = min(iw, x2), min(ih, y2) + if x2 <= x1 or y2 <= y1: + continue + text = obs.topCandidates_(1)[0].string() if obs.topCandidates_(1) else "" + conf = float(obs.topCandidates_(1)[0].confidence()) \ + if obs.topCandidates_(1) else 0.0 + quad 
= [[x1,y1],[x2,y1],[x2,y2],[x1,y2]] + results.append((quad, text, conf)) + return results + def detect(self, image_bgr): + if image_bgr is None or image_bgr.size == 0: + return [] + variants = self.preprocess_variants(image_bgr) + all_results = [] + variant_names = [] + for name, variant_img in variants: + try: + res = self.run_vision_ocr(variant_img) + vh, vw = variant_img.shape[:2] + oh, ow = image_bgr.shape[:2] + sx, sy = ow / max(1, vw), oh / max(1, vh) + scaled = [] + for quad, text, conf in res: + sq = [[int(p[0]*sx), int(p[1]*sy)] for p in quad] + scaled.append((sq, text, conf)) + all_results.append(scaled) + variant_names.append(name) + except Exception as e: + print(f" ⚠️ Variant '{name}' failed: {e}") + if not all_results: + return [] + return self._merge_variant_results(all_results, variant_names) -def split_bubble_if_multiple_rows(indices, ocr, bid=None): - """ - Attempt to split indices into top/bottom row groups. - Returns list of groups if a clear row gap is found, else None. 
- """ - if len(indices) < 2: - return None + def _merge_variant_results(self, all_results, variant_names): + """FIX E: use self.langs[0] locale for is_meaningful_text()""" + if not all_results: + return [] - all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1]) for i in indices] - med_h = float(np.median(all_h)) if all_h else 14.0 + lang_code = self.langs[0].split("-")[0].lower() + base_idx = max(range(len(all_results)), key=lambda i: len(all_results[i])) + base = list(all_results[base_idx]) + others = [r for i, r in enumerate(all_results) if i != base_idx] - groups = check_vertical_alignment_split(indices, ocr, - threshold=int(med_h * 2.5)) - if len(groups) > 1: - return groups - return None + for other in others: + for quad_o, text_o, conf_o in other: + box_o = quad_bbox(quad_o) + matched = False + for k, (quad_b, text_b, conf_b) in enumerate(base): + box_b = quad_bbox(quad_b) + if boxes_iou(box_o, box_b) > 0.40: + if conf_o > conf_b: + base[k] = (quad_b, text_o, conf_o) + matched = True + break + if not matched and is_meaningful_text(text_o, lang_code): + base.append((quad_o, text_o, conf_o)) + + return base # ============================================================ @@ -2570,26 +2302,7 @@ def process_manga_page(image_path: str, source_lang: str = "en", target_lang: str = "ca", output_json: str = None, - output_txt: str = None) -> Dict[str, Any]: - """ - Full manga page OCR + translation pipeline. - - Pipeline order: - 1. Load image - 2. Run multi-variant OCR - 3. Filter noise / invalid quads - 4. Pre-split wide quads by column gap - 5. Propose text regions (contour-aware, tightened thresholds) <- FIX - 6. Contour-aware pre-split of merged groups <- FIX - 7. Auto-fix bubble detection (split multi-bubble, merge frags) - 8. Reconcile region + bubble groups (tightened IoU/overlap) <- FIX - 9. Remove nested/duplicate boxes - 10. Enforce max box size - 11. Classify region types - 12. Correct OCR text - 13. Translate - 14. 
Build output - """ + output_txt: str = None) -> Dict[str, Any]: print(f"\n{'='*60}") print(f"📖 Processing: {os.path.basename(image_path)}") print(f"{'='*60}") @@ -2624,7 +2337,6 @@ def process_manga_page(image_path: str, print(f" Filtered OCR detections: {len(filtered_ocr)}") - # Build indexed OCR list for downstream functions ocr = [(item[0], item[1], item[2]) for item in filtered_ocr] all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1]) @@ -2641,15 +2353,12 @@ def process_manga_page(image_path: str, med_h = float(np.median(all_h)) if all_h else 14.0 # ── Step 5: Propose regions (contour-aware) ────────────── - # FIX: pass image_bgr so contour membership gates merging region_lines, region_boxes, region_quads, region_indices = \ propose_text_regions_from_ocr(ocr, image_bgr.shape, image_bgr=image_bgr) print(f" Proposed regions: {len(region_boxes)}") # ── Step 6: Contour-aware pre-split ────────────────────── - # FIX: split any region that spans multiple speech-bubble contours - # BEFORE any merging pass runs — primary fix for Box-8 / Box-6 region_lines, region_boxes, region_quads, region_indices = \ apply_contour_split_to_all_boxes( region_boxes, region_indices, region_quads, @@ -2666,9 +2375,6 @@ def process_manga_page(image_path: str, print(f" Regions after auto-fix: {len(region_boxes)}") # ── Step 8: Reconcile region + bubble groups ───────────── - # For this pipeline we use region groups as both inputs since - # we have already applied contour splitting above. - # bubble_* mirrors region_* here; reconcile deduplicates overlaps. 
out_lines, out_boxes, out_quads, out_indices = \ reconcile_region_and_bubble_groups( region_lines, region_boxes, region_quads, region_indices, @@ -2718,6 +2424,9 @@ def process_manga_page(image_path: str, # Correct OCR corrected_text, correction_gain = correct_region_text(raw_text, region_type) + # Apply bold-font fixes on top of dialogue correction + corrected_text = fix_common_ocr_errors(corrected_text) + # Confidence conf = compute_region_confidence( raw_text, corrected_text, box, region_type, image_bgr) @@ -2730,13 +2439,13 @@ def process_manga_page(image_path: str, bubble_groups = build_text_from_layout(indices, ocr) # ── Step 13: Translate ──────────────────────────────── - translated = "" + translated = "" translation_input = corrected_text if region_type not in {"sfx"} and is_meaningful_text(corrected_text, source_lang): try: raw_translation = translator.translate(translation_input) - translated = postprocess_translation_general(raw_translation or "") + translated = postprocess_translation_general(raw_translation or "") except Exception as e: print(f" ⚠️ Translation failed for BOX#{bid}: {e}") translated = corrected_text @@ -2757,18 +2466,18 @@ def process_manga_page(image_path: str, flags.append("SEGMENTED") results[str(bid)] = { - "order": order_idx, - "region_type": region_type, - "confidence": round(conf, 4), - "ocr_source": ocr_source, - "raw_ocr": raw_text, - "corrected_ocr": corrected_text, + "order": order_idx, + "region_type": region_type, + "confidence": round(conf, 4), + "ocr_source": ocr_source, + "raw_ocr": raw_text, + "corrected_ocr": corrected_text, "translation_input": translation_input, - "translated": translated, - "flags": flags, - "bubble_groups": bubble_groups, - "box": xyxy_to_xywh(box), - "lines": bubble_groups, + "translated": translated, + "flags": flags, + "bubble_groups": bubble_groups, + "box": xyxy_to_xywh(box), + "lines": bubble_groups, } print(f"\n ✅ Processed {len(results)} text region(s).") @@ -2787,7 +2496,6 @@ def 
process_manga_page(image_path: str, # OUTPUT WRITERS # ============================================================ def _write_json_output(results: Dict[str, Any], path: str) -> None: - """Write full results dict to a JSON file.""" try: with open(path, "w", encoding="utf-8") as f: json.dump(results, f, ensure_ascii=False, indent=2) @@ -2797,12 +2505,6 @@ def _write_json_output(results: Dict[str, Any], path: str) -> None: def _write_txt_output(results: Dict[str, Any], path: str) -> None: - """ - Write a human-readable columnar summary to a .txt file. - - Format: - BUBBLE|ORDER|TYPE|CONF|OCR_SOURCE|ORIGINAL|CORRECTED|BUBBLE_GROUPS|TRANSLATED|FLAGS - """ sep = "─" * 120 lines = [ "BUBBLE|ORDER|TYPE|CONF|OCR_SOURCE|ORIGINAL|CORRECTED|BUBBLE_GROUPS|TRANSLATED|FLAGS", @@ -2838,29 +2540,19 @@ def _write_txt_output(results: Dict[str, Any], path: str) -> None: # DEBUG VISUALISER # ============================================================ def draw_debug_clusters(image_bgr: np.ndarray, - out_boxes: Dict[int, tuple], - out_lines: Dict[int, list], + out_boxes: Dict[int, tuple], + out_lines: Dict[int, list], out_indices: Dict[int, list], - ocr: list, - save_path: str = None) -> np.ndarray: - """ - Draw all detected boxes with their IDs and first line of text - onto a copy of the image for visual debugging. 
- - Color coding: - Green = dialogue - Orange = narration - Cyan = reaction - Red = sfx / unknown - """ - vis = image_bgr.copy() + ocr: list, + save_path: str = None) -> np.ndarray: + vis = image_bgr.copy() ih, iw = vis.shape[:2] COLOR_MAP = { - "dialogue": (0, 200, 0), + "dialogue": (0, 200, 0), "narration": (0, 165, 255), - "reaction": (255, 200, 0), - "sfx": (0, 0, 220), + "reaction": (255, 200, 0), + "sfx": (0, 0, 220), "unknown": (120, 120, 120), } @@ -2883,13 +2575,12 @@ def draw_debug_clusters(image_bgr: np.ndarray, x1, y1, x2, y2 = box cv2.rectangle(vis, (x1, y1), (x2, y2), color, 2) - label = f"BOX#{bid} [{rtype}]" - preview = (text[:40] + "...") if len(text) > 40 else text - font = cv2.FONT_HERSHEY_SIMPLEX + label = f"BOX#{bid} [{rtype}]" + preview = (text[:40] + "...") if len(text) > 40 else text + font = cv2.FONT_HERSHEY_SIMPLEX font_scale = 0.38 thickness = 1 - # label background (lw, lh), _ = cv2.getTextSize(label, font, font_scale, thickness) cv2.rectangle(vis, (x1, max(0, y1 - lh - 6)), @@ -2900,18 +2591,18 @@ def draw_debug_clusters(image_bgr: np.ndarray, font, font_scale, (255, 255, 255), thickness, cv2.LINE_AA) - # preview text below label cv2.putText(vis, preview, (x1 + 2, min(ih - 5, y1 + lh + 6)), font, font_scale * 0.85, color, thickness, cv2.LINE_AA) - # draw individual OCR quad outlines in lighter shade + # Draw individual OCR quad outlines for idx in out_indices.get(bid, []): - q = ocr[idx][0] - pts = np.array(q, dtype=np.int32).reshape((-1, 1, 2)) - cv2.polylines(vis, [pts], True, - tuple(min(255, c + 80) for c in color), 1) + if idx < len(ocr): + q = ocr[idx][0] + pts = np.array(q, dtype=np.int32).reshape((-1, 1, 2)) + cv2.polylines(vis, [pts], True, + tuple(min(255, c + 80) for c in color), 1) if save_path: cv2.imwrite(save_path, vis) @@ -2928,21 +2619,20 @@ def main(): parser = argparse.ArgumentParser( description="Manga page OCR + translation pipeline (macOS Vision)") - parser.add_argument("image", help="Path to manga page image") - 
parser.add_argument("--source", "-s", default="en", + parser.add_argument("image", help="Path to manga page image") + parser.add_argument("--source", "-s", default="en", help="Source language code (default: en)") - parser.add_argument("--target", "-t", default="ca", + parser.add_argument("--target", "-t", default="ca", help="Target language code (default: ca)") - parser.add_argument("--json", "-j", default=None, + parser.add_argument("--json", "-j", default=None, help="Output JSON file path") - parser.add_argument("--txt", "-o", default=None, + parser.add_argument("--txt", "-o", default=None, help="Output TXT file path") - parser.add_argument("--debug", "-d", default=None, + parser.add_argument("--debug", "-d", default=None, help="Save debug visualisation to this path") args = parser.parse_args() - # derive default output paths from image name if not specified - base = os.path.splitext(args.image)[0] + base = os.path.splitext(args.image)[0] json_out = args.json or f"{base}_bubbles.json" txt_out = args.txt or f"{base}_output.txt" debug_out = args.debug or f"{base}_debug_clusters.png" @@ -2962,8 +2652,6 @@ def main(): # ── Debug visualisation ─────────────────────────────────── image_bgr = cv2.imread(args.image) if image_bgr is not None: - # Rebuild out_boxes / out_lines / out_indices from results - # for the visualiser (they were local to process_manga_page) vis_boxes: Dict[int, tuple] = {} vis_lines: Dict[int, list] = {} vis_indices: Dict[int, list] = {} @@ -2971,22 +2659,22 @@ def main(): for bid_str, data in results.items(): bid = int(bid_str) xywh = data["box"] - vis_boxes[bid] = ( + vis_boxes[bid] = ( xywh["x"], xywh["y"], xywh["x"] + xywh["w"], xywh["y"] + xywh["h"], ) vis_lines[bid] = data.get("lines", []) - vis_indices[bid] = [] # indices not stored in output; quads drawn from box only + vis_indices[bid] = [] draw_debug_clusters( image_bgr, vis_boxes, vis_lines, vis_indices, - ocr=[], # no raw quads available at this stage - save_path=debug_out, + ocr = [], 
+ save_path = debug_out, ) # ── Console summary ───────────────────────────────────────