diff --git a/.gitignore b/.gitignore index b40dbe3..c0c2805 100644 --- a/.gitignore +++ b/.gitignore @@ -9,9 +9,10 @@ .venv311/ +#Folders to test Spy_x_Family_076/ - Dandadan_059/ +Lv999/ # Icon must end with two \r Icon diff --git a/manga-translator.py b/manga-translator.py index 6c1cd6a..e69de29 100644 --- a/manga-translator.py +++ b/manga-translator.py @@ -1,4258 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -import os -import re -import json -import cv2 -import numpy as np -import warnings -from typing import List, Tuple, Dict, Any, Optional - -from deep_translator import GoogleTranslator - -# macOS Native Vision imports -import Vision -import Quartz -from Foundation import NSData - -warnings.filterwarnings("ignore", category=UserWarning) - -# ============================================================ -# CONFIG -# ============================================================ -TOP_BAND_RATIO = 0.08 - -# ============================================================ -# REGION-FIRST LAYOUT HELPERS -# ============================================================ -import math -from difflib import SequenceMatcher - -DIALOGUE_STOPWORDS = { - "I", "YOU", "HE", "SHE", "WE", "THEY", "IT", "ME", "MY", "YOUR", "OUR", - "IS", "ARE", "WAS", "WERE", "AM", "DO", "DID", "DON'T", "DIDN'T", "NOT", - "WHAT", "WHY", "HOW", "WHO", "IN", "ON", "AT", "TO", "OF", "FOR", "WITH", - "AND", "BUT", "SO", "THAT", "THIS", "THERE", "HERE", "THAN", "ALL", "RIGHT" -} - -SFX_HINTS = { - "RRRING", "RING", "RINGG", "BAM", "BOOM", "FWUP", "FWOOP", "FSHOO", - "GRRP", "GASP", "THUD", "SMACK", "WHAM", "SLAM", "SNIF", "SNIFF" -} - -REACTION_HINTS = { - "HUH", "HUH?!", "HUH?", "OH", "AH", "EH", "TCH", "HEY", "WHAT?!", "NO!", "YES!" 
-} - -NARRATION_HINTS = { - "AND SO", "MEANWHILE", "LATER", "THEN", "TO BE CONTINUED" -} - -def xyxy_width(b): - return max(1, b[2] - b[0]) - -def xyxy_height(b): - return max(1, b[3] - b[1]) - -def xyxy_center(b): - return ((b[0] + b[2]) / 2.0, (b[1] + b[3]) / 2.0) - -def box_distance(a, b): - ax, ay = xyxy_center(a) - bx, by = xyxy_center(b) - return math.hypot(ax - bx, ay - by) - -def horizontal_overlap_ratio(a, b): - ix1, ix2 = max(a[0], b[0]), min(a[2], b[2]) - ov = max(0, ix2 - ix1) - return ov / max(1, min(xyxy_width(a), xyxy_width(b))) - -def vertical_overlap_ratio(a, b): - iy1, iy2 = max(a[1], b[1]), min(a[3], b[3]) - ov = max(0, iy2 - iy1) - return ov / max(1, min(xyxy_height(a), xyxy_height(b))) - -def box_expand(b, pad, iw, ih): - return ( - max(0, int(b[0] - pad)), - max(0, int(b[1] - pad)), - min(iw - 1, int(b[2] + pad)), - min(ih - 1, int(b[3] + pad)), - ) - -def count_alpha(text): - return len(re.findall(r"[A-ZÀ-Ýa-zà-ÿ]", text or "")) - -def uppercase_ratio(text): - alpha = re.findall(r"[A-Za-zÀ-ÿ]", text or "") - if not alpha: - return 0.0 - ups = sum(1 for c in alpha if c.isupper()) - return ups / len(alpha) - -def punctuation_ratio(text): - if not text: - return 0.0 - return len(re.findall(r"[!?.,'\"-]", text)) / max(1, len(text)) - -def stopword_ratio(text): - toks = re.findall(r"[A-Z']+", normalize_text(text or "")) - if not toks: - return 0.0 - hits = sum(1 for t in toks if t in DIALOGUE_STOPWORDS) - return hits / len(toks) - -def looks_like_sfx_text(text): - t = normalize_text(text or "") - if not t: - return False - alpha = re.sub(r"[^A-Z]", "", t) - if t in SFX_HINTS or alpha in SFX_HINTS: - return True - if len(alpha) >= 3 and uppercase_ratio(t) > 0.90 and stopword_ratio(t) < 0.15: - if alpha not in DIALOGUE_STOPWORDS: - return True - return False - -def looks_like_reaction_text(text): - t = normalize_text(text or "") - alpha = re.sub(r"[^A-Z?!]", "", t) - if t in REACTION_HINTS or alpha in REACTION_HINTS: - return True - if 
len(re.sub(r"[^A-Z]", "", t)) <= 5 and punctuation_ratio(t) > 0.10: - return True - return False - -def looks_like_narration_text(text): - t = normalize_text(text or "") - if any(t.startswith(h) for h in NARRATION_HINTS): - return True - if len(t.split()) >= 5 and t.endswith(".") and uppercase_ratio(t) > 0.75: - return True - return False - -def contour_features_for_box(image_bgr, box_xyxy): - x1, y1, x2, y2 = box_xyxy - crop = image_bgr[y1:y2, x1:x2] - if crop.size == 0: - return { - "mean_brightness": 0.0, - "edge_density": 1.0, - "whiteness_ratio": 0.0, - } - - gray = cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY) - mean_brightness = float(np.mean(gray)) / 255.0 - - edges = cv2.Canny(gray, 50, 150) - edge_density = float(np.mean(edges > 0)) - - whiteness_ratio = float(np.mean(gray > 220)) - return { - "mean_brightness": mean_brightness, - "edge_density": edge_density, - "whiteness_ratio": whiteness_ratio, - } - -def classify_region_type(image_bgr, box_xyxy, lines): - text = normalize_text(" ".join(lines)) - feats = contour_features_for_box(image_bgr, box_xyxy) - w, h = xyxy_width(box_xyxy), xyxy_height(box_xyxy) - ar = w / max(1, h) - - if looks_like_sfx_text(text): - return "sfx" - - if looks_like_reaction_text(text): - if len(text.split()) <= 3: - return "reaction" - - if looks_like_narration_text(text): - return "narration" - - # balloon/dialogue heuristic: - # bright interior + low-ish edge density + moderate width - if feats["whiteness_ratio"] > 0.45 and feats["edge_density"] < 0.18: - return "dialogue" - - # narrow tall shout / reaction / sfx ambiguity - if ar < 0.9 and uppercase_ratio(text) > 0.85 and stopword_ratio(text) < 0.20: - return "sfx" - - if stopword_ratio(text) >= 0.20: - return "dialogue" - - return "unknown" - -def text_similarity(a, b): - return SequenceMatcher(None, normalize_text(a or ""), normalize_text(b or "")).ratio() - -def dedupe_repeated_phrase(text): - t = normalize_text(text or "") - words = t.split() - if len(words) < 4: - return t - - # 
remove immediate duplicated halves: "CRY! CRY!" / "I DIDN'T I DIDN'T" - half = len(words) // 2 - if len(words) % 2 == 0 and words[:half] == words[half:]: - return " ".join(words[:half]) - - # collapse trailing duplicate tokens - cleaned = [] - for w in words: - if cleaned and cleaned[-1] == w and len(w) > 2: - continue - cleaned.append(w) - return " ".join(cleaned) - -def dehyphenate_linebreak_artifacts(text): - t = normalize_text(text or "") - t = re.sub(r"\b([A-Z]+)- ([A-Z]+)\b", r"\1\2", t) - return t - -def fix_common_dialogue_ocr(text): - """ - Conservative OCR cleanup for dialogue-like text. - - Goals: - - fix common OCR punctuation/spacing/apostrophe errors - - preserve meaning and tone - - avoid semantic reconstruction guesses - """ - t = normalize_text(text or "") - if not t: - return t - - replacements = { - "1'M": "I'M", - "1 DIDN'T": "I DIDN'T", - "1 HATE": "I HATE", - "1 WAS": "I WAS", - "1'M ": "I'M ", - "YO U": "YOU", - "YOU RE": "YOU'RE", - "YOURE": "YOU'RE", - "I LL": "I'LL", - "ILL ": "I'LL ", - "DONT": "DON'T", - "DIDNT": "DIDN'T", - "CANT": "CAN'T", - "WONT": "WON'T", - "THATS": "THAT'S", - "MOMS": "MOM'S", - "DADS": "DAD'S", - "LEARN- ING": "LEARNING", - "COV- ERED": "COVERED", - "SY ON": "SY-ON", - "P PROPERLY": "P-PROPERLY", - "SH SHUT": "SH- SHUT", - } - - for a, b in replacements.items(): - t = t.replace(a, b) - - # Fix split contractions / apostrophe omissions - t = re.sub(r"\b([A-Z]+) NT\b", r"\1N'T", t) - t = re.sub(r"\b([A-Z]+) RE\b", r"\1'RE", t) - t = re.sub(r"\b([A-Z]+) VE\b", r"\1'VE", t) - t = re.sub(r"\b([A-Z]+) LL\b", r"\1'LL", t) - t = re.sub(r"\b([A-Z]+) S\b", r"\1'S", t) - - # Remove accidental duplicated punctuation spacing - t = re.sub(r"\s+([,.;:!?])", r"\1", t) - - # Dehyphenate OCR line-wrap artifacts - t = dehyphenate_linebreak_artifacts(t) - - # Collapse repeated full phrases/tokens caused by OCR duplication - t = dedupe_repeated_phrase(t) - - # Remove duplicated adjacent words like "CRY CRY" if clearly accidental - 
words = t.split() - cleaned = [] - for w in words: - if cleaned and cleaned[-1] == w and len(re.sub(r"[^A-Z]", "", w)) > 2: - continue - cleaned.append(w) - t = " ".join(cleaned) - - # Normalize spaces - t = re.sub(r"\s{2,}", " ", t).strip() - - return t - -def region_text_role_hint(text): - if looks_like_sfx_text(text): - return "sfx" - if looks_like_reaction_text(text): - return "reaction" - if looks_like_narration_text(text): - return "narration" - return "dialogue" - - -def correct_region_text(text, region_type="dialogue"): - t = normalize_text(text or "") - if not t: - return t, 0.0 - - original = t - - if region_type in {"dialogue", "reaction", "narration"}: - t = fix_common_dialogue_ocr(t) - elif region_type == "sfx": - t = dedupe_repeated_phrase(t) - - score_before = ocr_candidate_score(original) - score_after = ocr_candidate_score(t) - - correction_gain = max(0.0, score_after - score_before) - return t, correction_gain - -def compute_region_confidence(raw_text, corrected_text, box_xyxy, region_type, image_bgr): - feats = contour_features_for_box(image_bgr, box_xyxy) - text_score = ocr_candidate_score(corrected_text) - gain = max(0.0, text_score - ocr_candidate_score(raw_text)) - role_bonus = 0.08 if region_type in {"dialogue", "reaction", "narration", "sfx"} else 0.0 - - score = ( - 0.55 * text_score + - 0.15 * feats["whiteness_ratio"] + - 0.10 * (1.0 - min(1.0, feats["edge_density"] * 2.0)) + - 0.10 * gain + - role_bonus - ) - return max(0.0, min(1.0, score)) - -def build_region_flags(raw_text, corrected_text, region_type, conf): - flags = [] - if region_type == "unknown": - flags.append("REGION_UNKNOWN") - if region_type == "sfx": - flags.append("SFX") - if conf < 0.45: - flags.append("LOW_CONF") - if text_similarity(raw_text, corrected_text) < 0.75: - flags.append("HEAVY_CORRECTION") - if len(corrected_text.split()) > 22: - flags.append("LONG_TEXT") - return flags - -# ============================================================ -# HELPERS -# 
============================================================ -def normalize_text(text: str) -> str: - t = (text or "").strip().upper() - t = t.replace("\u201c", "\"").replace("\u201d", "\"") - t = t.replace("\u2018", "'").replace("\u2019", "'") - t = t.replace("\u2026", "...") - t = re.sub(r"\s+", " ", t) - t = re.sub(r"\s+([,.;:!?])", r"\1", t) - t = re.sub(r"([¡¿])\s+", r"\1", t) - t = re.sub(r"\(\s+", "(", t) - t = re.sub(r"\s+\)", ")", t) - t = re.sub(r"\.{4,}", "...", t) - return t.strip() - -def postprocess_translation_general(text: str) -> str: - t = normalize_text(text) - t = re.sub(r"\s{2,}", " ", t).strip() - t = re.sub(r"([!?]){3,}", r"\1\1", t) - t = re.sub(r"\.{4,}", "...", t) - return t - -def fix_common_ocr_errors(text: str) -> str: - result = text - result = re.sub(r'(\d)O(\d)', r'\g<1>0\g<2>', result) - result = re.sub(r'(\d)O([^a-zA-Z])', r'\g<1>0\g<2>', result) - result = result.replace('|', 'I') - result = result.replace('`', "'") - return result - -def is_valid_language(text: str, source_lang: str) -> bool: - if not text: - return False - clean_text = re.sub(r'[^\w]', '', text) - if not clean_text: - return False - lang = source_lang.lower() - if lang in ['en', 'english', 'es', 'spanish', 'fr', 'french', - 'it', 'italian', 'ca', 'catalan', 'de', 'german']: - foreign_chars = len(re.findall( - r'[\u0600-\u06FF\u0750-\u077F\u3040-\u30FF' - r'\u3400-\u4DBF\u4E00-\u9FFF\uAC00-\uD7AF\u1100-\u11FF]', - clean_text)) - if foreign_chars > 0: - return False - latin_chars = len(re.findall(r'[a-zA-ZÀ-ÿ]', clean_text)) - total = len(clean_text) - if total <= 3: - return latin_chars >= 1 - if total <= 6: - return (latin_chars / total) >= 0.55 - return (latin_chars / total) >= 0.45 - elif lang in ['ja', 'japanese']: - ja_chars = len(re.findall(r'[\u3040-\u30FF\u3400-\u4DBF\u4E00-\u9FFF]', clean_text)) - if len(clean_text) <= 3: - return ja_chars >= 1 - return (ja_chars / len(clean_text)) >= 0.4 - elif lang in ['ko', 'korean']: - ko_chars = 
len(re.findall(r'[\uAC00-\uD7AF\u1100-\u11FF]', clean_text)) - if len(clean_text) <= 3: - return ko_chars >= 1 - return (ko_chars / len(clean_text)) >= 0.4 - elif lang in ['zh', 'chinese']: - zh_chars = len(re.findall(r'[\u4E00-\u9FFF\u3400-\u4DBF]', clean_text)) - if len(clean_text) <= 3: - return zh_chars >= 1 - return (zh_chars / len(clean_text)) >= 0.4 - return True - - -_NOISE_TOKENS = { - 'P', 'F', 'N', 'M', 'X', 'Z', 'Q', - 'FN', 'PF', 'NM', 'XZ', 'FSHOO', 'GRRP', -} - -_MANGA_INTERJECTIONS = { - 'HUH', 'HUH?', 'HUH??', 'HUH?!', - 'OH', 'OH!', 'OOH', 'OOH!', - 'AH', 'AH!', 'UH', 'UH...', - 'HEY', 'HEY!', - 'EH', 'EH?', - 'WOW', 'WOW!', - 'YES', 'NO', 'NO!', - 'RUN', 'GO', 'GO!', - 'STOP', 'WAIT', - 'WHAT', 'WHAT?', 'WHAT?!', - 'WHY', 'WHY?', - 'HOW', 'HOW?', - 'OK', 'OK!', 'OKAY', - 'EEEEP', 'EEEP', - 'OMIGOSH', - 'BECKY', 'BECKY!', - 'HMM', 'HMM...', - 'TSK', 'TCH', - 'GRRR','I','A', - 'FWUP', 'FWAP', - 'SHIVER', - 'RRRING', - 'MORNING', 'MORNING.', -} - -def group_indices_into_vertical_columns(indices, ocr, - x_tolerance_factor=1.4, - min_vertical_span_factor=1.8): - """ - Group OCR indices into vertical columns inside a box. 
- - A column is defined as: - - similar x centers - - meaningful vertical spread - - internally ordered top-to-bottom - """ - if not indices: - return [] - - items = [] - for i in indices: - b = quad_bbox(ocr[i][0]) - cx = (b[0] + b[2]) / 2.0 - cy = (b[1] + b[3]) / 2.0 - w = max(1, b[2] - b[0]) - h = max(1, b[3] - b[1]) - items.append((i, b, cx, cy, w, h)) - - med_w = float(np.median([it[4] for it in items])) if items else 12.0 - med_h = float(np.median([it[5] for it in items])) if items else 12.0 - x_tol = max(10.0, med_w * x_tolerance_factor) - - # cluster by x-center - items_sorted = sorted(items, key=lambda x: x[2]) - columns = [] - - for it in items_sorted: - placed = False - for col in columns: - if abs(it[2] - col["xc"]) <= x_tol: - col["members"].append(it) - col["xc"] = float(np.mean([m[2] for m in col["members"]])) - placed = True - break - if not placed: - columns.append({"xc": it[2], "members": [it]}) - - # sort each column top -> bottom - clean_columns = [] - for col in columns: - members = sorted(col["members"], key=lambda x: x[3]) - ys = [m[3] for m in members] - vertical_span = max(ys) - min(ys) if len(ys) > 1 else 0.0 - - # keep meaningful columns OR single strong items - if len(members) >= 2 or vertical_span >= med_h * min_vertical_span_factor: - clean_columns.append([m[0] for m in members]) - else: - clean_columns.append([m[0] for m in members]) - - # sort columns left -> right - clean_columns.sort(key=lambda grp: np.mean([(quad_bbox(ocr[i][0])[0] + quad_bbox(ocr[i][0])[2]) / 2.0 for i in grp])) - return clean_columns - -def group_indices_into_horizontal_rows(indices, ocr, row_tol_factor=0.75): - """ - Group OCR indices into horizontal rows inside a box. 
- """ - if not indices: - return [] - - items = [] - for i in indices: - b = quad_bbox(ocr[i][0]) - cx = (b[0] + b[2]) / 2.0 - cy = (b[1] + b[3]) / 2.0 - h = max(1, b[3] - b[1]) - items.append((i, b, cx, cy, h)) - - med_h = float(np.median([it[4] for it in items])) if items else 10.0 - row_tol = max(6.0, med_h * row_tol_factor) - - items.sort(key=lambda x: x[3]) - rows = [] - - for it in items: - placed = False - for row in rows: - if abs(it[3] - row["yc"]) <= row_tol: - row["members"].append(it) - row["yc"] = float(np.mean([m[3] for m in row["members"]])) - placed = True - break - if not placed: - rows.append({"yc": it[3], "members": [it]}) - - groups = [] - for row in rows: - members = sorted(row["members"], key=lambda x: x[2]) - groups.append([m[0] for m in members]) - - return groups - -def score_text_groups(groups, ocr): - """ - Score grouping quality based on: - - average group size - - text plausibility - - reduced fragmentation - """ - if not groups: - return 0.0 - - texts = [] - lengths = [] - - for grp in groups: - parts = [] - for i in grp: - t = normalize_text(ocr[i][1]) - if t: - parts.append(t) - txt = normalize_text(" ".join(parts)) - if txt: - texts.append(txt) - lengths.append(len(txt.split())) - - if not texts: - return 0.0 - - text_scores = [ocr_candidate_score(t) for t in texts] - avg_text_score = float(np.mean(text_scores)) if text_scores else 0.0 - avg_len = float(np.mean(lengths)) if lengths else 0.0 - fragmentation_penalty = max(0.0, len(groups) - 4) * 0.08 - - return avg_text_score + min(0.5, avg_len * 0.05) - fragmentation_penalty - -def detect_internal_text_layout(indices, ocr, reading_mode="ltr"): - """ - Detect internal structure of text inside one final box. 
- - Step 1: split into vertical macro blocks - Step 2: for each block, compare horizontal vs vertical grouping - """ - if not indices: - return {"mode": "horizontal", "blocks": []} - - blocks = split_indices_into_vertical_blocks(indices, ocr) - - resolved_blocks = [] - - for block in blocks: - horizontal_groups = group_indices_into_horizontal_rows(block, ocr) - vertical_groups = group_indices_into_vertical_columns(block, ocr) - - h_score = score_text_groups(horizontal_groups, ocr) - v_score = score_text_groups(vertical_groups, ocr) - - if len(vertical_groups) >= 2 and v_score >= h_score - 0.03: - resolved_blocks.append({ - "mode": "vertical", - "groups": vertical_groups - }) - else: - resolved_blocks.append({ - "mode": "horizontal", - "groups": horizontal_groups - }) - - return {"mode": "block-mixed", "blocks": resolved_blocks} - - -def build_text_from_layout(indices, ocr, reading_mode="ltr"): - layout = detect_internal_text_layout(indices, ocr, reading_mode=reading_mode) - output_lines = [] - - for block in layout["blocks"]: - groups = block["groups"] - mode = block["mode"] - - if mode == "horizontal": - for grp in groups: - line = normalize_text(" ".join( - ocr[i][1] for i in grp if normalize_text(ocr[i][1]) - )) - if line: - output_lines.append(line) - - elif mode == "vertical": - if reading_mode == "rtl": - groups = sorted( - groups, - key=lambda grp: np.mean([(quad_bbox(ocr[i][0])[0] + quad_bbox(ocr[i][0])[2]) / 2.0 for i in grp]), - reverse=True - ) - else: - groups = sorted( - groups, - key=lambda grp: np.mean([(quad_bbox(ocr[i][0])[0] + quad_bbox(ocr[i][0])[2]) / 2.0 for i in grp]) - ) - - for grp in groups: - grp_sorted = sorted(grp, key=lambda i: (quad_bbox(ocr[i][0])[1] + quad_bbox(ocr[i][0])[3]) / 2.0) - line = normalize_text(" ".join( - ocr[i][1] for i in grp_sorted if normalize_text(ocr[i][1]) - )) - if line: - output_lines.append(line) - - return output_lines - -# ============================================================ -# REGION PROPOSAL FROM 
OCR GEOMETRY -# ============================================================ -def propose_text_regions_from_ocr(ocr, image_shape): - """ - Build larger text containers from OCR boxes before final classification. - This is intentionally conservative: it clusters nearby OCR groups that - likely belong to one dialogue/narration region. - """ - ih, iw = image_shape[:2] - if not ocr: - return {}, {}, {}, {} - - boxes = [quad_bbox(x[0]) for x in ocr] - hs = [max(1, b[3] - b[1]) for b in boxes] - med_h = float(np.median(hs)) if hs else 14.0 - - parent = list(range(len(ocr))) - - def find(x): - while parent[x] != x: - parent[x] = parent[parent[x]] - x = parent[x] - return x - - def union(a, b): - ra, rb = find(a), find(b) - if ra != rb: - parent[rb] = ra - - for i in range(len(ocr)): - bi = boxes[i] - for j in range(i + 1, len(ocr)): - bj = boxes[j] - - dx = abs(xyxy_center(bi)[0] - xyxy_center(bj)[0]) - dy = abs(xyxy_center(bi)[1] - xyxy_center(bj)[1]) - - hov = horizontal_overlap_ratio(bi, bj) - vov = vertical_overlap_ratio(bi, bj) - dist = box_distance(bi, bj) - - same_band = dy <= med_h * 2.2 - stacked = hov >= 0.35 and dy <= med_h * 3.2 - same_line = vov >= 0.45 and dx <= med_h * 5.0 - near = dist <= med_h * 4.5 - - if same_line or stacked or (near and (same_band or hov > 0.25)): - if orientation_compatible(i, j, ocr): - union(i, j) - - groups = {} - for i in range(len(ocr)): - groups.setdefault(find(i), []).append(i) - - region_lines = {} - region_boxes = {} - region_quads = {} - region_indices = {} - next_id = 1 - - for _, idxs in sorted(groups.items(), key=lambda kv: min(boxes[i][1] for i in kv[1])): - idxs = sorted(idxs, key=lambda i: (boxes[i][1], boxes[i][0])) - ub = boxes_union_xyxy([boxes[i] for i in idxs]) - if ub is None: - continue - region_lines[next_id] = build_lines_from_indices(idxs, ocr) - region_boxes[next_id] = box_expand(ub, pad=max(2, int(med_h * 0.25)), iw=iw, ih=ih) - region_quads[next_id] = [ocr[i][0] for i in idxs] - region_indices[next_id] = 
idxs - next_id += 1 - - return region_lines, region_boxes, region_quads, region_indices - -# ============================================================ -# RECONCILE REGION-FIRST AND BUBBLE-FIRST GROUPS -# ============================================================ -def reconcile_region_and_bubble_groups(region_lines, region_boxes, region_quads, region_indices, - bubbles, bubble_boxes, bubble_quads, bubble_indices, - ocr): - """ - Reconcile region-first and bubble-first groupings. - - Strategy: - - Build one combined candidate list from both grouping methods. - - Cluster candidates that heavily overlap or share OCR indices. - - Keep only the best-scoring candidate from each cluster. - - Rebuild stable output dictionaries. - - This avoids duplicate retention and inconsistent greedy selection. - """ - combined = [] - - for rid in region_boxes: - combined.append(("region", rid, region_boxes[rid], region_indices[rid])) - - for bid in bubble_boxes: - combined.append(("bubble", bid, bubble_boxes[bid], bubble_indices[bid])) - - if not combined: - return {}, {}, {}, {} - - visited = set() - kept = [] - - def group_score(box, idxs): - text = normalize_text(" ".join(build_lines_from_indices(idxs, ocr))) - role = region_text_role_hint(text) - - role_bonus = { - "dialogue": 0.8, - "narration": 0.75, - "reaction": 0.7, - "sfx": 0.2, - "unknown": 0.1 - }.get(role, 0.1) - - box_area = bbox_area_xyxy(box) - area_bonus = min(1.0, box_area / 50000.0) - - return ( - len(idxs) * 2.0 + - min(20, len(text.split())) * 0.5 + - min(1.0, ocr_candidate_score(text)) + - role_bonus + - area_bonus * 0.25 - ) - - for i in range(len(combined)): - if i in visited: - continue - - cluster = [i] - visited.add(i) - - _, _, box_i, idx_i = combined[i] - - for j in range(i + 1, len(combined)): - if j in visited: - continue - - _, _, box_j, idx_j = combined[j] - - ovs = boxes_overlap_ratio(box_i, box_j) - iou = boxes_iou(box_i, box_j) - shared = len(set(idx_i).intersection(idx_j)) - - if ovs >= 0.55 or 
iou >= 0.35 or shared > 0: - cluster.append(j) - visited.add(j) - - best_idx = max( - cluster, - key=lambda k: group_score(combined[k][2], combined[k][3]) - ) - kept.append(combined[best_idx]) - - # Stable order: top-to-bottom, then left-to-right - kept.sort(key=lambda item: ( - (item[2][1] + item[2][3]) / 2.0, - (item[2][0] + item[2][2]) / 2.0 - )) - - out_lines, out_boxes, out_quads, out_indices = {}, {}, {}, {} - next_id = 1 - - for typ, oid, box, idxs in kept: - idxs = sorted( - set(idxs), - key=lambda k: (quad_bbox(ocr[k][0])[1], quad_bbox(ocr[k][0])[0]) - ) - - out_lines[next_id] = build_lines_from_indices(idxs, ocr) - out_boxes[next_id] = box - out_quads[next_id] = [ocr[k][0] for k in idxs] - out_indices[next_id] = idxs - next_id += 1 - - return out_lines, out_boxes, out_quads, out_indices - -# ============================================================ -# PROTECTED TOKENS / SHORT DIALOGUE SAFETY NET -# ============================================================ -PROTECTED_SHORT_TOKENS = { - "HUH", "HUH?", "HUH??", "HUH?!", - "OH", "OH!", "OOH", "OOH!", - "AH", "AH!", "UH", "UH...", - "HEY", "HEY!", "EH", "EH?", - "WOW", "WOW!", - "MORNING", "MORNING.", - "BECKY", "BECKY!", - "DAMIAN", "CECILE", "WALD", - "OMIGOSH", "EEEP", "EEEEP" -} - -KNOWN_NAMES = { - "BECKY", "DAMIAN", "CECILE", "WALD" -} - -def is_protected_token(text: str) -> bool: - t = normalize_text(text or "") - if not t: - return False - if t in PROTECTED_SHORT_TOKENS: - return True - # punctuation-insensitive fallback - t_alpha = re.sub(r'[^A-ZÀ-Ý]', '', t) - return t_alpha in PROTECTED_SHORT_TOKENS - -def maybe_conf_floor_for_protected(text: str, conf: float, floor: float = 0.40) -> float: - if is_protected_token(text): - return max(conf, floor) - return conf - -def is_meaningful_text(text: str, source_lang: str, min_alpha_chars: int = 2) -> bool: - if not text: - return False - - t = text.strip() - t_upper = normalize_text(t) - - # 1) Hard keep for protected tokens - if 
is_protected_token(t_upper): - return True - - t_alpha_only = re.sub(r'[^A-Za-zÀ-ÿ]', '', t_upper) - if t_upper in _MANGA_INTERJECTIONS or t_alpha_only in _MANGA_INTERJECTIONS: - return True - - alpha_count = sum(c.isalpha() for c in t) - if alpha_count < min_alpha_chars: - # allow short punctuated utterances like "Huh?" - if re.fullmatch(r"[A-Za-zÀ-ÿ]{2,6}[!?\.]{0,3}", t.strip()): - return True - return False - - if t_upper in _NOISE_TOKENS: - return False - - lang = source_lang.lower() - - if lang in ['en', 'english', 'es', 'spanish', 'fr', 'french', - 'it', 'italian', 'ca', 'catalan', 'de', 'german']: - non_alpha = sum(not c.isalpha() for c in t) - # slightly less aggressive than before - if len(t) > 0 and (non_alpha / len(t)) > 0.72: - return False - - if len(t) >= 3 and len(set(t_upper)) == 1: - return False - - if lang in ['en', 'english', 'es', 'spanish', 'fr', 'french', - 'it', 'italian', 'ca', 'catalan', 'de', 'german']: - if len(t) > 5: - vowels = len(re.findall(r'[AEIOUaeiouÀ-ÿ]', t)) - if vowels == 0: - return False - - return True - -def quad_bbox(quad): - xs = [p[0] for p in quad] - ys = [p[1] for p in quad] - return (int(min(xs)), int(min(ys)), int(max(xs)), int(max(ys))) - -def quad_center(quad): - x1, y1, x2, y2 = quad_bbox(quad) - return ((x1 + x2) / 2.0, (y1 + y2) / 2.0) - -def boxes_union_xyxy(boxes): - boxes = [b for b in boxes if b is not None] - if not boxes: - return None - return ( - int(min(b[0] for b in boxes)), - int(min(b[1] for b in boxes)), - int(max(b[2] for b in boxes)), - int(max(b[3] for b in boxes)), - ) - -def bbox_area_xyxy(b): - if b is None: - return 0 - return int(max(0, b[2] - b[0]) * max(0, b[3] - b[1])) - -def xyxy_to_xywh(b): - if b is None: - return None - x1, y1, x2, y2 = b - return {"x": int(x1), "y": int(y1), - "w": int(max(0, x2 - x1)), "h": int(max(0, y2 - y1))} - -def overlap_or_near(a, b, gap=0): - ax1, ay1, ax2, ay2 = a - bx1, by1, bx2, by2 = b - gap_x = max(0, max(ax1, bx1) - min(ax2, bx2)) - gap_y = max(0, 
max(ay1, by1) - min(ay2, by2)) - return gap_x <= gap and gap_y <= gap - -def boxes_iou(a, b): - ax1, ay1, ax2, ay2 = a - bx1, by1, bx2, by2 = b - ix1, iy1 = max(ax1, bx1), max(ay1, by1) - ix2, iy2 = min(ax2, bx2), min(ay2, by2) - inter = max(0, ix2 - ix1) * max(0, iy2 - iy1) - if inter == 0: - return 0.0 - area_a = max(0, ax2 - ax1) * max(0, ay2 - ay1) - area_b = max(0, bx2 - bx1) * max(0, by2 - by1) - return inter / max(1, area_a + area_b - inter) - -def boxes_overlap_ratio(a, b): - """Ratio of intersection to the SMALLER box area.""" - ax1, ay1, ax2, ay2 = a - bx1, by1, bx2, by2 = b - ix1, iy1 = max(ax1, bx1), max(ay1, by1) - ix2, iy2 = min(ax2, bx2), min(ay2, by2) - inter = max(0, ix2 - ix1) * max(0, iy2 - iy1) - if inter == 0: - return 0.0 - area_a = max(0, ax2 - ax1) * max(0, ay2 - ay1) - area_b = max(0, bx2 - bx1) * max(0, by2 - by1) - return inter / max(1, min(area_a, area_b)) - -def ocr_candidate_score(text: str) -> float: - if not text: - return 0.0 - t = text.strip() - n = len(t) - if n == 0: - return 0.0 - alpha = sum(c.isalpha() for c in t) / n - spaces = sum(c.isspace() for c in t) / n - punct_ok = sum(c in ".,!?'-:;()[]\"¡¿" for c in t) / n - bad = len(re.findall(r"[^\w\s\.\,\!\?\-\'\:\;\(\)\[\]\"¡¿]", t)) / n - penalty = 0.0 - if re.search(r"\b[A-Z]\b", t): - penalty += 0.05 - if re.search(r"[0-9]{2,}", t): - penalty += 0.08 - score = (0.62 * alpha) + (0.10 * spaces) + (0.20 * punct_ok) - (0.45 * bad) - penalty - return max(0.0, min(1.0, score)) - -def quad_is_horizontal(quad, ratio_threshold=1.5) -> bool: - x1, y1, x2, y2 = quad_bbox(quad) - return (max(1, x2 - x1) / max(1, y2 - y1)) >= ratio_threshold - -def quad_is_vertical(quad, ratio_threshold=1.5) -> bool: - x1, y1, x2, y2 = quad_bbox(quad) - return (max(1, y2 - y1) / max(1, x2 - x1)) >= ratio_threshold - - -# ============================================================ -# ENHANCED IMAGE PREPROCESSING -# ============================================================ -def 
enhance_image_for_ocr(image_bgr, upscale_factor=2.5): - h, w = image_bgr.shape[:2] - upscaled = cv2.resize(image_bgr, (int(w * upscale_factor), int(h * upscale_factor)), - interpolation=cv2.INTER_CUBIC) - gray = cv2.cvtColor(upscaled, cv2.COLOR_BGR2GRAY) - denoised = cv2.fastNlMeansDenoising(gray, None, h=10, - templateWindowSize=7, searchWindowSize=21) - clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8)) - enhanced = clahe.apply(denoised) - sharpened = cv2.filter2D(enhanced, -1, - np.array([[-1,-1,-1],[-1,9,-1],[-1,-1,-1]])) - binary = cv2.adaptiveThreshold(sharpened, 255, - cv2.ADAPTIVE_THRESH_GAUSSIAN_C, - cv2.THRESH_BINARY, 11, 2) - cleaned = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, np.ones((2, 2), np.uint8)) - return cv2.cvtColor(cleaned, cv2.COLOR_GRAY2BGR) - -def detect_small_text_regions(image_bgr, existing_quads): - gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY) - mask = np.zeros(gray.shape, dtype=np.uint8) - for quad in existing_quads: - cv2.fillPoly(mask, [np.array(quad, dtype=np.int32)], 255) - mask_inv = cv2.bitwise_not(mask) - _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU) - binary_masked = cv2.bitwise_and(binary, binary, mask=mask_inv) - contours, _ = cv2.findContours(binary_masked, cv2.RETR_EXTERNAL, - cv2.CHAIN_APPROX_SIMPLE) - text_regions = [] - for contour in contours: - x, y, w, h = cv2.boundingRect(contour) - area = w * h - if 50 < area < 5000 and 0.1 < h / max(w, 1) < 10: - text_regions.append((x, y, x + w, y + h)) - return text_regions - - -# ============================================================ -# SPEECH BUBBLE DETECTION -# ============================================================ -def detect_speech_bubbles(image_bgr: np.ndarray) -> List[np.ndarray]: - gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY) - thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, - cv2.THRESH_BINARY_INV, 11, 2) - contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, 
cv2.CHAIN_APPROX_SIMPLE) - return [c for c in contours if cv2.contourArea(c) > 500] - -def is_quad_in_bubble(quad_bbox_xyxy, bubble_contour, tolerance=5): - x1, y1, x2, y2 = quad_bbox_xyxy - cx, cy = (x1 + x2) // 2, (y1 + y2) // 2 - return cv2.pointPolygonTest(bubble_contour, (float(cx), float(cy)), False) >= -tolerance - -def split_indices_by_bubble(indices, ocr, bubble_contours): - if not indices: - return [] - bubble_groups, outside_group = {}, [] - for idx in indices: - bbox = quad_bbox(ocr[idx][0]) - found = False - for bidx, bubble in enumerate(bubble_contours): - if is_quad_in_bubble(bbox, bubble): - bubble_groups.setdefault(bidx, []).append(idx) - found = True - break - if not found: - outside_group.append(idx) - result = list(bubble_groups.values()) - if outside_group: - result.append(outside_group) - return result - -def check_vertical_alignment_split(indices, ocr, threshold=20): - if len(indices) <= 1: - return [indices] - items = sorted([(idx, quad_bbox(ocr[idx][0])) for idx in indices], - key=lambda x: x[1][1]) - groups, current_group = [], [items[0][0]] - for i in range(1, len(items)): - if items[i][1][1] - items[i-1][1][3] > threshold: - groups.append(current_group) - current_group = [items[i][0]] - else: - current_group.append(items[i][0]) - if current_group: - groups.append(current_group) - return groups - - -# ============================================================ -# QUAD SIZE VALIDATION AND SPLITTING -# ============================================================ -def is_quad_oversized(quad, median_height, width_threshold=8.0): - x1, y1, x2, y2 = quad_bbox(quad) - w, h = x2 - x1, max(1, y2 - y1) - return w > median_height * width_threshold or w / h > 12.0 - -def split_oversized_quad_by_content(image_bgr, quad, text, conf, median_height): - x1, y1, x2, y2 = quad_bbox(quad) - w, h = x2 - x1, max(1, y2 - y1) - pad = 2 - roi = image_bgr[max(0,y1-pad):min(image_bgr.shape[0],y2+pad), - max(0,x1):min(image_bgr.shape[1],x2)] - if roi.size == 0: - 
return [(quad, text, conf)] - gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY) - _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU) - v_proj = np.sum(binary, axis=0) - gap_threshold = h * 255 * 0.20 - gaps, in_gap, gap_start = [], False, 0 - for x in range(len(v_proj)): - if v_proj[x] < gap_threshold: - if not in_gap: gap_start, in_gap = x, True - else: - if in_gap: - gw = x - gap_start - if gw >= max(int(median_height * 0.8), 15): - gaps.append((gap_start + gw // 2, gw)) - in_gap = False - if not gaps: - return [(quad, text, conf)] - gaps.sort(key=lambda g: g[1], reverse=True) - split_x_abs = max(0, x1) + gaps[0][0] - if ' ' in text: - char_w = w / max(1, len(text)) - split_idx = int((split_x_abs - x1) / max(1e-6, char_w)) - spaces = [i for i, c in enumerate(text) if c == ' '] - if spaces: - split_idx = min(spaces, key=lambda i: abs(i - split_idx)) - tl, tr = text[:split_idx].strip(), text[split_idx:].strip() - else: - split_idx = int(len(text) * (split_x_abs - x1) / w) - tl, tr = text[:split_idx].strip(), text[split_idx:].strip() - if tl and tr: - return [([[x1,y1],[split_x_abs,y1],[split_x_abs,y2],[x1,y2]], tl, conf), - ([[split_x_abs,y1],[x2,y1],[x2,y2],[split_x_abs,y2]], tr, conf)] - return [(quad, text, conf)] - -def validate_and_split_oversized_quads(image_bgr, filtered_ocr): - if not filtered_ocr: - return filtered_ocr, 0 - heights = [max(1, quad_bbox(q)[3] - quad_bbox(q)[1]) for q, _, _ in filtered_ocr] - median_height = float(np.median(heights)) if heights else 14.0 - result, splits_made = [], 0 - for quad, text, conf in filtered_ocr: - if is_quad_oversized(quad, median_height, 8.0): - sr = split_oversized_quad_by_content(image_bgr, quad, text, conf, median_height) - if len(sr) > 1: - result.extend(sr); splits_made += 1 - else: - result.append((quad, text, conf)) - else: - result.append((quad, text, conf)) - return result, splits_made - - -# ============================================================ -# HORIZONTAL GAP DETECTION 
AT QUAD LEVEL -# ============================================================ -def detect_horizontal_gap_in_group(indices, ocr, med_h, gap_factor=2.5): - if len(indices) < 2: - return None - items = sorted(indices, key=lambda i: quad_center(ocr[i][0])[0]) - boxes = [quad_bbox(ocr[i][0]) for i in items] - gap_threshold = med_h * gap_factor - best_gap, best_split = 0.0, None - for k in range(len(items) - 1): - gap = boxes[k + 1][0] - boxes[k][2] - if gap > gap_threshold and gap > best_gap: - best_gap, best_split = gap, k - if best_split is None: - return None - left_group = [items[i] for i in range(best_split + 1)] - right_group = [items[i] for i in range(best_split + 1, len(items))] - if not left_group or not right_group: - return None - return (left_group, right_group) - -def orientation_compatible(idx_a, idx_b, ocr): - ba = quad_bbox(ocr[idx_a][0]) - bb = quad_bbox(ocr[idx_b][0]) - wa, ha = max(1, ba[2]-ba[0]), max(1, ba[3]-ba[1]) - wb, hb = max(1, bb[2]-bb[0]), max(1, bb[3]-bb[1]) - ra, rb = wa / ha, wb / hb - if (ra < 0.6 and rb > 2.0) or (rb < 0.6 and ra > 2.0): - return False - return True - - -# ============================================================ -# WIDE QUAD COLUMN SPLIT — pre-grouping -# ============================================================ -def split_wide_quad_by_column_gap(image_bgr, quad, text, conf, med_h, - min_gap_factor=1.8): - x1, y1, x2, y2 = quad_bbox(quad) - w, h = x2 - x1, max(1, y2 - y1) - if w < med_h * 3.0: - return [(quad, text, conf)] - pad = 2 - roi = image_bgr[max(0,y1-pad):min(image_bgr.shape[0],y2+pad), - max(0,x1):min(image_bgr.shape[1],x2)] - if roi.size == 0: - return [(quad, text, conf)] - gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY) - _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU) - v_proj = np.sum(binary, axis=0) - gap_threshold = h * 255 * 0.12 - min_gap_px = max(int(med_h * min_gap_factor), 10) - gaps, in_gap, gap_start = [], False, 0 - for x in range(len(v_proj)): - if v_proj[x] 
< gap_threshold: - if not in_gap: gap_start, in_gap = x, True - else: - if in_gap: - gw = x - gap_start - if gw >= min_gap_px: - gaps.append((gap_start + gw // 2, gw)) - in_gap = False - if not gaps: - return [(quad, text, conf)] - gaps.sort(key=lambda g: g[1], reverse=True) - split_x_rel = gaps[0][0] - split_x_abs = x1 + split_x_rel - if split_x_abs - x1 < med_h or x2 - split_x_abs < med_h: - return [(quad, text, conf)] - if ' ' in text: - char_w = w / max(1, len(text)) - split_idx = int(split_x_rel / max(1e-6, char_w)) - spaces = [i for i, c in enumerate(text) if c == ' '] - if spaces: - split_idx = min(spaces, key=lambda i: abs(i - split_idx)) - tl, tr = text[:split_idx].strip(), text[split_idx:].strip() - else: - split_idx = int(len(text) * split_x_rel / w) - tl, tr = text[:split_idx].strip(), text[split_idx:].strip() - if tl and tr: - return [([[x1,y1],[split_x_abs,y1],[split_x_abs,y2],[x1,y2]], tl, conf), - ([[split_x_abs,y1],[x2,y1],[x2,y2],[split_x_abs,y2]], tr, conf)] - return [(quad, text, conf)] - -def apply_column_gap_splits(image_bgr, ocr_list, med_h): - result, splits_made = [], 0 - for quad, text, conf in ocr_list: - parts = split_wide_quad_by_column_gap(image_bgr, quad, text, conf, med_h) - if len(parts) > 1: - splits_made += 1 - result.extend(parts) - if splits_made: - print(f"📐 Column-gap split: {splits_made} wide quad(s) split before grouping") - return result, splits_made - - -# ============================================================ -# GENERALIZED BOX FIXING FUNCTIONS -# ============================================================ -def detect_and_split_multi_bubble_boxes(bubble_boxes, bubble_indices, bubble_quads, - bubbles, ocr, image_bgr): - all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1]) - for i in range(len(ocr))] - med_h = float(np.median(all_h)) if all_h else 14.0 - bubble_contours = detect_speech_bubbles(image_bgr) - new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {} - next_bid, splits_made = 1, [] - 
- for bid, indices in bubble_indices.items(): - if len(indices) < 2: - new_bubbles[next_bid] = bubbles[bid] - new_boxes[next_bid] = bubble_boxes[bid] - new_quads[next_bid] = bubble_quads[bid] - new_indices[next_bid] = indices - next_bid += 1 - continue - - split_groups = split_indices_by_bubble(indices, ocr, bubble_contours) - if len(split_groups) > 1: - for group in split_groups: - if group: - new_bubbles[next_bid] = build_lines_from_indices(group, ocr) - new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in group]) - new_quads[next_bid] = [ocr[i][0] for i in group] - new_indices[next_bid] = group - next_bid += 1 - splits_made.append(f"BOX#{bid} → {len(split_groups)} bubbles") - continue - - vertical_splits = check_vertical_alignment_split(indices, ocr, - threshold=int(med_h * 2.0)) - if len(vertical_splits) > 1: - for group in vertical_splits: - if group: - new_bubbles[next_bid] = build_lines_from_indices(group, ocr) - new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in group]) - new_quads[next_bid] = [ocr[i][0] for i in group] - new_indices[next_bid] = group - next_bid += 1 - splits_made.append(f"BOX#{bid} → {len(vertical_splits)} vertical groups") - continue - - box = bubble_boxes[bid] - x1, y1, x2, y2 = box - if (x2 - x1) > med_h * 10: - x_centers = [quad_center(ocr[i][0])[0] for i in indices] - x_median = np.median(x_centers) - left_group = [i for i in indices if quad_center(ocr[i][0])[0] < x_median] - right_group = [i for i in indices if quad_center(ocr[i][0])[0] >= x_median] - if left_group and right_group: - left_box = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in left_group]) - right_box = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in right_group]) - if right_box[0] - left_box[2] > med_h * 1.5: - for grp in [left_group, right_group]: - new_bubbles[next_bid] = build_lines_from_indices(grp, ocr) - new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in grp]) - new_quads[next_bid] = [ocr[i][0] for i in grp] 
- new_indices[next_bid] = grp - next_bid += 1 - splits_made.append(f"BOX#{bid} → 2 horizontal panels") - continue - - new_bubbles[next_bid] = bubbles[bid] - new_boxes[next_bid] = bubble_boxes[bid] - new_quads[next_bid] = bubble_quads[bid] - new_indices[next_bid] = indices - next_bid += 1 - - if splits_made: - print(f"\n🔧 Split {len(splits_made)} multi-bubble box(es):") - for s in splits_made: print(f" ✓ {s}") - return new_bubbles, new_boxes, new_quads, new_indices - - -def detect_and_merge_fragmented_bubbles(bubble_boxes, bubble_indices, bubble_quads, - bubbles, ocr, image_bgr): - all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1]) - for i in range(len(ocr))] - med_h = float(np.median(all_h)) if all_h else 14.0 - bubble_contours = detect_speech_bubbles(image_bgr) - bids = list(bubble_boxes.keys()) - to_merge = [] - - for i in range(len(bids)): - for j in range(i + 1, len(bids)): - bid_i, bid_j = bids[i], bids[j] - box_i, box_j = bubble_boxes[bid_i], bubble_boxes[bid_j] - cx_i = (box_i[0] + box_i[2]) / 2.0 - cy_i = (box_i[1] + box_i[3]) / 2.0 - cx_j = (box_j[0] + box_j[2]) / 2.0 - cy_j = (box_j[1] + box_j[3]) / 2.0 - in_same_bubble = any( - cv2.pointPolygonTest(c, (cx_i, cy_i), False) >= 0 and - cv2.pointPolygonTest(c, (cx_j, cy_j), False) >= 0 - for c in bubble_contours - ) - if in_same_bubble: - if abs(cx_i - cx_j) < med_h * 3.0 and abs(cy_i - cy_j) < med_h * 6.0: - to_merge.append((bid_i, bid_j) if cy_i < cy_j else (bid_j, bid_i)) - - if not to_merge: - return bubbles, bubble_boxes, bubble_quads, bubble_indices - - print(f"\n🔗 Merging {len(to_merge)} fragmented bubble(s):") - merge_groups = {} - for top, bottom in to_merge: - found = False - for key in merge_groups: - if top in merge_groups[key] or bottom in merge_groups[key]: - merge_groups[key].update({top, bottom}) - found = True; break - if not found: - merge_groups[len(merge_groups)] = {top, bottom} - - new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {} - merged_bids, next_bid = 
set(), 1 - for merge_set in merge_groups.values(): - merge_list = sorted(merge_set) - print(f" ✓ Merging: {', '.join(f'#{b}' for b in merge_list)}") - all_indices = sorted(set(idx for b in merge_list for idx in bubble_indices[b])) - for b in merge_list: merged_bids.add(b) - new_bubbles[next_bid] = build_lines_from_indices(all_indices, ocr) - new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in all_indices]) - new_quads[next_bid] = [ocr[i][0] for i in all_indices] - new_indices[next_bid] = all_indices - next_bid += 1 - for bid in bids: - if bid not in merged_bids: - new_bubbles[next_bid] = bubbles[bid] - new_boxes[next_bid] = bubble_boxes[bid] - new_quads[next_bid] = bubble_quads[bid] - new_indices[next_bid] = bubble_indices[bid] - next_bid += 1 - return new_bubbles, new_boxes, new_quads, new_indices - - -def merge_boxes_by_proximity_and_overlap(bubble_boxes, bubble_indices, bubble_quads, - bubbles, ocr, med_h): - """ - Merges boxes that are vertically close AND share significant horizontal overlap. - - Single-quad boxes participate fully — no isolation treatment. - This fixes BOX#2+#16, BOX#8+#21, BOX#9+#22 type problems where a - single-line detection sits directly above/below a multi-line box in the - same speech bubble. - - Merge criteria (both must be true): - 1. Vertical gap ≤ 1.5 × med_h - 2. 
Horizontal overlap ratio ≥ 0.35 - """ - bids = sorted(bubble_boxes.keys()) - merge_map: Dict[int, List[int]] = {} - merged_into: Dict[int, int] = {} - - for i, bid_i in enumerate(bids): - if bid_i in merged_into: - continue - box_i = bubble_boxes[bid_i] - wi = max(1, box_i[2] - box_i[0]) - - for j in range(i + 1, len(bids)): - bid_j = bids[j] - if bid_j in merged_into: - continue - box_j = bubble_boxes[bid_j] - wj = max(1, box_j[2] - box_j[0]) - - vert_gap = max(0, max(box_i[1], box_j[1]) - min(box_i[3], box_j[3])) - h_ix1 = max(box_i[0], box_j[0]) - h_ix2 = min(box_i[2], box_j[2]) - h_overlap = max(0, h_ix2 - h_ix1) - h_overlap_ratio = h_overlap / max(1, min(wi, wj)) - - if vert_gap <= med_h * 1.5 and h_overlap_ratio >= 0.35: - root = merged_into.get(bid_i, bid_i) - merge_map.setdefault(root, [root]) - if bid_j not in merge_map[root]: - merge_map[root].append(bid_j) - merged_into[bid_j] = root - - if not merge_map: - return bubbles, bubble_boxes, bubble_quads, bubble_indices - - print(f"\n🔀 Proximity+overlap merge: {len(merge_map)} group(s):") - new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {} - processed, next_bid = set(), 1 - - for root, group in merge_map.items(): - group_unique = sorted(set(group)) - print(f" ✓ Merging: {', '.join(f'#{b}' for b in group_unique)}") - all_indices = sorted(set(idx for b in group_unique for idx in bubble_indices[b])) - new_bubbles[next_bid] = build_lines_from_indices(all_indices, ocr) - new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in all_indices]) - new_quads[next_bid] = [ocr[i][0] for i in all_indices] - new_indices[next_bid] = all_indices - next_bid += 1 - processed.update(group_unique) - - for bid in bids: - if bid not in processed: - new_bubbles[next_bid] = bubbles[bid] - new_boxes[next_bid] = bubble_boxes[bid] - new_quads[next_bid] = bubble_quads[bid] - new_indices[next_bid] = bubble_indices[bid] - next_bid += 1 - - return new_bubbles, new_boxes, new_quads, new_indices - - -def 
auto_fix_bubble_detection(bubble_boxes, bubble_indices, bubble_quads, - bubbles, ocr, image_bgr): - """ - Full fix pipeline: - 1. Split boxes that span multiple speech bubbles. - 2. Merge fragments detected inside the same contour. - 3. Merge fragments missed by contour detection (proximity+overlap) — pass 1. - 4. Second proximity pass — catches chains resolved after pass 1. - """ - print("\n🔍 Running automatic bubble detection fixes...") - all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1]) - for i in range(len(ocr))] - med_h = float(np.median(all_h)) if all_h else 14.0 - - bubbles, bubble_boxes, bubble_quads, bubble_indices = \ - detect_and_split_multi_bubble_boxes( - bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr, image_bgr) - - bubbles, bubble_boxes, bubble_quads, bubble_indices = \ - detect_and_merge_fragmented_bubbles( - bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr, image_bgr) - - # Pass 1 - bubbles, bubble_boxes, bubble_quads, bubble_indices = \ - merge_boxes_by_proximity_and_overlap( - bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr, med_h) - - # Pass 2 — catches chains only visible after pass 1 - bubbles, bubble_boxes, bubble_quads, bubble_indices = \ - merge_boxes_by_proximity_and_overlap( - bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr, med_h) - - return bubbles, bubble_boxes, bubble_quads, bubble_indices - - -def remove_nested_boxes(bubble_boxes, bubble_indices, bubble_quads, bubbles, - overlap_threshold=0.50): - bids = list(bubble_boxes.keys()) - to_remove = set() - for i in range(len(bids)): - bid_i = bids[i] - if bid_i in to_remove: continue - box_i = bubble_boxes[bid_i] - area_i = max(0, box_i[2]-box_i[0]) * max(0, box_i[3]-box_i[1]) - for j in range(i + 1, len(bids)): - bid_j = bids[j] - if bid_j in to_remove: continue - box_j = bubble_boxes[bid_j] - area_j = max(0, box_j[2]-box_j[0]) * max(0, box_j[3]-box_j[1]) - shared = set(bubble_indices[bid_i]).intersection(bubble_indices[bid_j]) - 
overlap = boxes_overlap_ratio(box_i, box_j) - if overlap > overlap_threshold or len(shared) > 0: - if area_i >= area_j: - to_remove.add(bid_j) - print(f" 🗑️ Removing BOX#{bid_j} (overlaps BOX#{bid_i})") - else: - to_remove.add(bid_i) - print(f" 🗑️ Removing BOX#{bid_i} (overlaps BOX#{bid_j})") - break - if to_remove: - print(f"\n🧹 Removed {len(to_remove)} overlapping/nested box(es)") - for bid in to_remove: - bubble_boxes.pop(bid, None) - bubble_indices.pop(bid, None) - bubble_quads.pop(bid, None) - bubbles.pop(bid, None) - return bubbles, bubble_boxes, bubble_quads, bubble_indices - - -def enforce_max_box_size(bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr, - max_width_ratio=0.6, max_height_ratio=0.5, image_shape=None): - if image_shape is None: - return bubbles, bubble_boxes, bubble_quads, bubble_indices - ih, iw = image_shape[:2] - max_width, max_height = iw * max_width_ratio, ih * max_height_ratio - new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {} - next_bid, splits_made = 1, [] - - for bid, box in bubble_boxes.items(): - x1, y1, x2, y2 = box - w, h = x2 - x1, y2 - y1 - if w > max_width or h > max_height: - indices = bubble_indices[bid] - col_split = split_bubble_if_multiple_columns(indices, ocr, bid=bid, - use_aggressive_thresholds=True) - if col_split: - for grp in col_split: - new_bubbles[next_bid] = build_lines_from_indices(grp, ocr) - new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in grp]) - new_quads[next_bid] = [ocr[i][0] for i in grp] - new_indices[next_bid] = grp - next_bid += 1 - splits_made.append(f"BOX#{bid} (oversized: {w}x{h}px)") - continue - row_split = split_bubble_if_multiple_rows(indices, ocr, bid=bid) - if row_split: - for grp in row_split: - new_bubbles[next_bid] = build_lines_from_indices(grp, ocr) - new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in grp]) - new_quads[next_bid] = [ocr[i][0] for i in grp] - new_indices[next_bid] = grp - next_bid += 1 - 
splits_made.append(f"BOX#{bid} (oversized: {w}x{h}px)") - continue - new_bubbles[next_bid] = bubbles[bid] - new_boxes[next_bid] = box - new_quads[next_bid] = bubble_quads[bid] - new_indices[next_bid] = bubble_indices[bid] - next_bid += 1 - - if splits_made: - print(f"\n📏 Split {len(splits_made)} oversized box(es):") - for s in splits_made: print(f" ✓ {s}") - return new_bubbles, new_boxes, new_quads, new_indices - - -def should_merge_groups(group1_indices, group2_indices, ocr, median_height, - max_vertical_gap=None): - if max_vertical_gap is None: - max_vertical_gap = median_height * 2.5 - box1 = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in group1_indices]) - box2 = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in group2_indices]) - if box1 is None or box2 is None: - return False - cx1 = (box1[0] + box1[2]) / 2.0 - cx2 = (box2[0] + box2[2]) / 2.0 - if abs(cx1 - cx2) > median_height * 1.8: - return False - vertical_gap = max(0, max(box1[1], box2[1]) - min(box1[3], box2[3])) - return vertical_gap <= max_vertical_gap - - -# ============================================================ -# ENHANCED OCR ENGINE -# ============================================================ -class ImprovedMacVisionDetector: - def __init__(self, source_lang="en"): - lang_key = source_lang.lower().strip() - lang_map = { - "en": "en-US", "english": "en-US", - "es": "es-ES", "spanish": "es-ES", - "ca": "ca-ES", "catalan": "ca-ES", - "fr": "fr-FR", "french": "fr-FR", - "ja": "ja-JP", "japanese": "ja-JP", - "it": "it-IT", "italian": "it-IT", - "de": "de-DE", "german": "de-DE", - "ko": "ko-KR", "korean": "ko-KR", - "zh": "zh-Hans", "chinese": "zh-Hans" - } - self.langs = [lang_map.get(lang_key, "en-US")] - print(f"⚡ Using Enhanced Apple Vision OCR (Language: {self.langs[0]})") - - def preprocess_variants(self, image_bgr): - variants = [("enhanced", enhance_image_for_ocr(image_bgr, upscale_factor=2.5))] - gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY) - _, hc = cv2.threshold(gray, 0, 255, 
cv2.THRESH_BINARY + cv2.THRESH_OTSU) - variants.append(("high_contrast", - cv2.cvtColor(cv2.resize(hc, None, fx=2.5, fy=2.5, - interpolation=cv2.INTER_CUBIC), - cv2.COLOR_GRAY2BGR))) - variants.append(("bilateral", - cv2.resize(cv2.bilateralFilter(image_bgr, 9, 75, 75), - None, fx=2.5, fy=2.5, interpolation=cv2.INTER_CUBIC))) - variants.append(("inverted", - cv2.resize(cv2.bitwise_not(image_bgr), - None, fx=2.5, fy=2.5, interpolation=cv2.INTER_CUBIC))) - variants.append(("original", - cv2.resize(image_bgr, None, fx=2.5, fy=2.5, - interpolation=cv2.INTER_CUBIC))) - return variants - - def run_vision_ocr(self, image_bgr): - if image_bgr is None or image_bgr.size == 0: - return [] - ih, iw = image_bgr.shape[:2] - success, buffer = cv2.imencode('.png', image_bgr) - if not success: - return [] - ns_data = NSData.dataWithBytes_length_(buffer.tobytes(), len(buffer.tobytes())) - handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(ns_data, None) - results = [] - - def completion_handler(request, error): - if error: return - for obs in request.results(): - candidate = obs.topCandidates_(1)[0] - text, conf = candidate.string(), candidate.confidence() - bbox = obs.boundingBox() - x = bbox.origin.x * iw - y_bl = bbox.origin.y * ih - w = bbox.size.width * iw - h = bbox.size.height * ih - y = ih - y_bl - h - quad = [[int(x),int(y)],[int(x+w),int(y)], - [int(x+w),int(y+h)],[int(x),int(y+h)]] - results.append((quad, text, conf)) - - req = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(completion_handler) - req.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate) - req.setUsesLanguageCorrection_(False) - req.setRecognitionLanguages_(self.langs) - req.setAutomaticallyDetectsLanguage_(True) - handler.performRequests_error_([req], None) - return results - - def merge_multi_pass_results(self, all_results, original_shape): - if not all_results: - return [] - scale_factor = 2.5 - normalized = [] - for variant_name, results in all_results: - 
for quad, text, conf in results: - sq = [[int(p[0]/scale_factor), int(p[1]/scale_factor)] for p in quad] - normalized.append((sq, text, conf, variant_name)) - - def quads_overlap(q1, q2, threshold=0.5): - b1, b2 = quad_bbox(q1), quad_bbox(q2) - x1, y1 = max(b1[0],b2[0]), max(b1[1],b2[1]) - x2, y2 = min(b1[2],b2[2]), min(b1[3],b2[3]) - if x2 < x1 or y2 < y1: return False - inter = (x2-x1)*(y2-y1) - union = ((b1[2]-b1[0])*(b1[3]-b1[1]) + - (b2[2]-b2[0])*(b2[3]-b2[1]) - inter) - return inter / max(union, 1) > threshold - - clusters, used = [], set() - for i, (q1, t1, c1, v1) in enumerate(normalized): - if i in used: continue - cluster = [(q1, t1, c1, v1)] - used.add(i) - for j, (q2, t2, c2, v2) in enumerate(normalized): - if j in used or i == j: continue - if quads_overlap(q1, q2): - cluster.append((q2, t2, c2, v2)) - used.add(j) - clusters.append(cluster) - - final_results = [] - for cluster in clusters: - cluster.sort(key=lambda x: x[2], reverse=True) - best_quad, best_text, best_conf, _ = cluster[0] - text_votes = {} - for _, text, conf, _ in cluster: - n = normalize_text(text) - if n: text_votes[n] = text_votes.get(n, 0) + conf - if text_votes: - voted = max(text_votes.items(), key=lambda x: x[1])[0] - if voted != normalize_text(best_text): - best_text = voted - final_results.append((best_quad, fix_common_ocr_errors(best_text), best_conf)) - return final_results - - def read(self, image_path_or_array): - img = cv2.imread(image_path_or_array) if isinstance(image_path_or_array, str) \ - else image_path_or_array - if img is None or img.size == 0: - return [] - variants = self.preprocess_variants(img) - all_results = [] - for vname, vimg in variants: - r = self.run_vision_ocr(vimg) - if r: all_results.append((vname, r)) - return self.merge_multi_pass_results(all_results, img.shape) - - -class MacVisionDetector: - def __init__(self, source_lang="en"): - lang_key = source_lang.lower().strip() - lang_map = { - "en": "en-US", "english": "en-US", - "es": "es-ES", 
"spanish": "es-ES", - "ca": "ca-ES", "catalan": "ca-ES", - "fr": "fr-FR", "french": "fr-FR", - "ja": "ja-JP", "japanese": "ja-JP", - "it": "it-IT", "italian": "it-IT", - "de": "de-DE", "german": "de-DE", - "ko": "ko-KR", "korean": "ko-KR", - "zh": "zh-Hans", "chinese": "zh-Hans" - } - self.langs = [lang_map.get(lang_key, "en-US")] - print(f"⚡ Using Apple Vision OCR (Language: {self.langs[0]})") - - def read(self, image_path_or_array): - img = cv2.imread(image_path_or_array) if isinstance(image_path_or_array, str) \ - else image_path_or_array - if img is None or img.size == 0: - return [] - ih, iw = img.shape[:2] - success, buffer = cv2.imencode('.png', img) - if not success: return [] - ns_data = NSData.dataWithBytes_length_(buffer.tobytes(), len(buffer.tobytes())) - handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(ns_data, None) - results = [] - - def completion_handler(request, error): - if error: return - for obs in request.results(): - candidate = obs.topCandidates_(1)[0] - text, conf = candidate.string(), candidate.confidence() - bbox = obs.boundingBox() - x = bbox.origin.x * iw - y_bl = bbox.origin.y * ih - w = bbox.size.width * iw - h = bbox.size.height * ih - y = ih - y_bl - h - quad = [[int(x),int(y)],[int(x+w),int(y)], - [int(x+w),int(y+h)],[int(x),int(y+h)]] - results.append((quad, text, conf)) - - req = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(completion_handler) - req.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate) - req.setUsesLanguageCorrection_(True) - req.setRecognitionLanguages_(self.langs) - req.setAutomaticallyDetectsLanguage_(True) - handler.performRequests_error_([req], None) - return results - - -# ============================================================ -# COLUMN / ROW SPLITTING -# ============================================================ -def split_bubble_if_multiple_columns(indices, ocr, bid=None, - use_aggressive_thresholds=False): - if len(indices) < 2: return None - 
boxes = [quad_bbox(ocr[i][0]) for i in indices] - hs = [max(1, b[3]-b[1]) for b in boxes] - med_h = float(np.median(hs)) if hs else 12.0 - xs = [(b[0]+b[2])/2.0 for b in boxes] - xs_sorted = sorted(xs) - gap_thresh = max(med_h*1.2, 18) if use_aggressive_thresholds else max(med_h*1.5, 22) - best_gap_idx, best_gap_size = None, 0.0 - for i in range(len(xs_sorted) - 1): - gap = xs_sorted[i+1] - xs_sorted[i] - if gap > gap_thresh and gap > best_gap_size: - best_gap_size, best_gap_idx = gap, i - if best_gap_idx is None: return None - split_x = (xs_sorted[best_gap_idx] + xs_sorted[best_gap_idx+1]) / 2.0 - left_idxs = [i for i in indices - if (quad_bbox(ocr[i][0])[0]+quad_bbox(ocr[i][0])[2])/2.0 < split_x] - right_idxs = [i for i in indices - if (quad_bbox(ocr[i][0])[0]+quad_bbox(ocr[i][0])[2])/2.0 >= split_x] - if not left_idxs or not right_idxs: return None - return (left_idxs, right_idxs) - -def split_bubble_if_multiple_rows(indices, ocr, bid=None): - if len(indices) < 2: return None - boxes = [quad_bbox(ocr[i][0]) for i in indices] - hs = [max(1, b[3]-b[1]) for b in boxes] - med_h = float(np.median(hs)) if hs else 12.0 - ys = [(b[1]+b[3])/2.0 for b in boxes] - ys_sorted = sorted(ys) - gap_thresh = max(med_h * 2.0, 30) - best_gap_idx, best_gap_size = None, 0.0 - for i in range(len(ys_sorted) - 1): - gap = ys_sorted[i+1] - ys_sorted[i] - if gap > gap_thresh and gap > best_gap_size: - best_gap_size, best_gap_idx = gap, i - if best_gap_idx is None: return None - split_y = (ys_sorted[best_gap_idx] + ys_sorted[best_gap_idx+1]) / 2.0 - top_idxs = [i for i in indices - if (quad_bbox(ocr[i][0])[1]+quad_bbox(ocr[i][0])[3])/2.0 < split_y] - bot_idxs = [i for i in indices - if (quad_bbox(ocr[i][0])[1]+quad_bbox(ocr[i][0])[3])/2.0 >= split_y] - if not top_idxs or not bot_idxs: return None - return (top_idxs, bot_idxs) - - -def split_cluster_by_big_vertical_gap(indices, ocr, factor=1.9, min_gap=22): - if len(indices) < 2: return None - boxes = [quad_bbox(ocr[i][0]) for i in indices] 
- hs = [max(1, b[3]-b[1]) for b in boxes] - med_h = float(np.median(hs)) if hs else 12.0 - items = sorted([(i, quad_bbox(ocr[i][0])) for i in indices], - key=lambda x: (x[1][1]+x[1][3])/2.0) - gap_thresh = max(med_h * factor, min_gap) - best_gap, best_split_idx = 0.0, None - for k in range(len(items) - 1): - gap = items[k+1][1][1] - items[k][1][3] - if gap > gap_thresh and gap > best_gap: - best_gap, best_split_idx = gap, k - if best_split_idx is None: return None - top_idxs = [it[0] for it in items[:best_split_idx+1]] - bot_idxs = [it[0] for it in items[best_split_idx+1:]] - if not top_idxs or not bot_idxs: return None - return (top_idxs, bot_idxs) - - -def is_vertical_text_like(indices, ocr): - if len(indices) < 2: return False - boxes = [quad_bbox(ocr[i][0]) for i in indices] - med_h = float(np.median([max(1, b[3]-b[1]) for b in boxes])) - med_w = float(np.median([max(1, b[2]-b[0]) for b in boxes])) - if med_h < med_w * 1.2: return False - xs = [(b[0]+b[2])/2.0 for b in boxes] - ys = [(b[1]+b[3])/2.0 for b in boxes] - if (max(ys)-min(ys)) < (max(xs)-min(xs)) * 1.5: return False - return True - - -def split_nested_or_side_by_side(indices, ocr): - if len(indices) < 2: return None - xs = sorted([(quad_bbox(ocr[i][0])[0]+quad_bbox(ocr[i][0])[2])/2.0 - for i in indices]) - mid_idx = len(xs) // 2 - split_x = (xs[mid_idx-1] + xs[mid_idx]) / 2.0 - left_idxs = [i for i in indices - if (quad_bbox(ocr[i][0])[0]+quad_bbox(ocr[i][0])[2])/2.0 < split_x] - right_idxs = [i for i in indices - if (quad_bbox(ocr[i][0])[0]+quad_bbox(ocr[i][0])[2])/2.0 >= split_x] - if not left_idxs or not right_idxs: return None - return (left_idxs, right_idxs) - - -def split_panel_box(image_bgr, box_xyxy, bubble_quads=None): - x1, y1, x2, y2 = box_xyxy - ih, iw = image_bgr.shape[:2] - x1, y1 = max(0, x1), max(0, y1) - x2, y2 = min(iw-1, x2), min(ih-1, y2) - if x2 <= x1 or y2 <= y1: return None - crop = image_bgr[y1:y2, x1:x2] - if crop.size == 0: return None - gray = cv2.cvtColor(crop, 
cv2.COLOR_BGR2GRAY) - edges = cv2.Canny(gray, 50, 150) - h_proj = np.sum(edges, axis=0) - w = x2 - x1 - if w < 100: return None - search_start = int(w * 0.35) - search_end = int(w * 0.65) - if search_end <= search_start: return None - region = h_proj[search_start:search_end] - if len(region) == 0: return None - threshold = np.percentile(region, 85) - candidates = [x1 + search_start + rx - for rx in range(len(region)) if region[rx] >= threshold] - if not candidates: return None - split_x = int(np.median(candidates)) - if bubble_quads: - lc = sum(1 for q in bubble_quads if quad_center(q)[0] < split_x) - rc = len(bubble_quads) - lc - if lc == 0 or rc == 0: return None - return (x1, x2, split_x) - - -# ============================================================ -# MERGE CLOSE BUBBLES -# ============================================================ -def merge_close_bubbles_by_line_height(bubbles, bubble_boxes, bubble_quads, - bubble_indices, ocr): - """ - Merges boxes that are spatially very close on BOTH axes AND share - meaningful horizontal overlap (same column). - - Single-quad boxes participate fully — no special isolation treatment. - The h_overlap_ratio >= 0.25 guard prevents merging horizontally - adjacent distinct bubbles. 
- """ - if not bubbles: - return bubbles, bubble_boxes, bubble_quads, bubble_indices - - all_h = [max(1, quad_bbox(ocr[i][0])[3]-quad_bbox(ocr[i][0])[1]) - for i in range(len(ocr))] - med_h = float(np.median(all_h)) if all_h else 14.0 - merge_tol = max(8, med_h * 1.4) - - bids = sorted(bubble_boxes.keys()) - merged_set, merge_map = set(), {} - - for i, bid_i in enumerate(bids): - if bid_i in merged_set: continue - x1_i, y1_i, x2_i, y2_i = bubble_boxes[bid_i] - wi = max(1, x2_i - x1_i) - - for j in range(i + 1, len(bids)): - bid_j = bids[j] - if bid_j in merged_set: continue - x1_j, y1_j, x2_j, y2_j = bubble_boxes[bid_j] - wj = max(1, x2_j - x1_j) - - gap_x = max(0, max(x1_i, x1_j) - min(x2_i, x2_j)) - gap_y = max(0, max(y1_i, y1_j) - min(y2_i, y2_j)) - - h_ix1 = max(x1_i, x1_j) - h_ix2 = min(x2_i, x2_j) - h_overlap = max(0, h_ix2 - h_ix1) - h_overlap_ratio = h_overlap / max(1, min(wi, wj)) - - if gap_x <= merge_tol and gap_y <= merge_tol and h_overlap_ratio >= 0.25: - if bid_i not in merge_map: - merge_map[bid_i] = [bid_i] - merge_map[bid_i].append(bid_j) - merged_set.add(bid_j) - - if not merge_map: - return bubbles, bubble_boxes, bubble_quads, bubble_indices - - new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {} - next_bid = 1 - for bid in bids: - if bid in merged_set: continue - if bid in merge_map: - group = merge_map[bid] - all_indices = sorted(set(idx for b in group for idx in bubble_indices[b])) - new_bubbles[next_bid] = build_lines_from_indices(all_indices, ocr) - new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in all_indices]) - new_quads[next_bid] = [ocr[i][0] for i in all_indices] - new_indices[next_bid] = all_indices - else: - new_bubbles[next_bid] = bubbles[bid] - new_boxes[next_bid] = bubble_boxes[bid] - new_quads[next_bid] = bubble_quads[bid] - new_indices[next_bid] = bubble_indices[bid] - next_bid += 1 - - return new_bubbles, new_boxes, new_quads, new_indices - - -# 
# ============================================================
# WIDE / BRIDGE QUAD SPLITTING
# ============================================================
def split_wide_ocr_items(image_bgr, ocr_list, width_factor=8.0):
    """
    Split OCR quads that are abnormally WIDE (width > median height *
    width_factor) at the widest whitespace gap found in the binarized
    vertical projection of the quad's pixels.

    Returns (new_ocr_list, number_of_splits_made).
    NOTE(review): near-duplicate of split_abnormal_bridge_quads below, but
    thresholds differ subtly (0.15 vs 0.20 projection threshold, 0.6/12 vs
    0.8/15 min gap, and this one measures against roi height incl. padding).
    """
    if not ocr_list: return ocr_list, 0
    hs = [max(1, quad_bbox(q)[3]-quad_bbox(q)[1]) for q, _, _ in ocr_list]
    med_h = float(np.median(hs)) if hs else 14.0
    result, splits_made = [], 0

    for quad, text, conf in ocr_list:
        x1, y1, x2, y2 = quad_bbox(quad)
        w = x2 - x1
        if w > med_h * width_factor:
            pad = 2
            roi = image_bgr[max(0,y1-pad):min(image_bgr.shape[0],y2+pad),
                            max(0,x1):min(image_bgr.shape[1],x2)]
            if roi.size > 0:
                gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
                _, binary = cv2.threshold(gray, 0, 255,
                                          cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
                # Column-wise ink mass; low columns are candidate gaps.
                v_proj = np.sum(binary, axis=0)
                gap_threshold = roi.shape[0] * 255 * 0.15
                gaps, in_gap, gap_start = [], False, 0
                for x in range(len(v_proj)):
                    if v_proj[x] < gap_threshold:
                        if not in_gap: gap_start, in_gap = x, True
                    else:
                        if in_gap:
                            gw = x - gap_start
                            if gw >= max(int(med_h * 0.6), 12):
                                gaps.append((gap_start + gw // 2, gw))
                            in_gap = False
                if gaps:
                    # Split at the center of the WIDEST gap.
                    gaps.sort(key=lambda g: g[1], reverse=True)
                    split_x_abs = max(0, x1) + gaps[0][0]
                    if ' ' in text:
                        # Snap the text split to the nearest space.
                        char_w = w / max(1, len(text))
                        split_idx = int((split_x_abs - x1) / max(1e-6, char_w))
                        spaces = [i for i, c in enumerate(text) if c == ' ']
                        if spaces:
                            split_idx = min(spaces, key=lambda i: abs(i - split_idx))
                        tl, tr = text[:split_idx].strip(), text[split_idx:].strip()
                    else:
                        split_idx = int(len(text) * (split_x_abs - x1) / w)
                        tl, tr = text[:split_idx].strip(), text[split_idx:].strip()
                    if tl and tr:
                        result.extend([
                            ([[x1,y1],[split_x_abs,y1],[split_x_abs,y2],[x1,y2]], tl, conf),
                            ([[split_x_abs,y1],[x2,y1],[x2,y2],[split_x_abs,y2]], tr, conf)])
                        splits_made += 1
                        continue
        result.append((quad, text, conf))
    return result, splits_made


def split_abnormal_bridge_quads(image_bgr, ocr_list, aspect_ratio_threshold=6.0):
    """
    Split OCR quads with extreme aspect ratio (w/h > aspect_ratio_threshold)
    — quads that likely "bridge" two separate bubbles — at the widest
    whitespace gap in the binarized vertical projection.

    Returns (new_ocr_list, number_of_splits_made).
    """
    if not ocr_list: return ocr_list, 0
    hs = [max(1, quad_bbox(q)[3]-quad_bbox(q)[1]) for q, _, _ in ocr_list]
    med_h = float(np.median(hs)) if hs else 14.0
    result, splits_made = [], 0

    for quad, text, conf in ocr_list:
        x1, y1, x2, y2 = quad_bbox(quad)
        w, h = x2 - x1, max(1, y2 - y1)
        if w / h > aspect_ratio_threshold:
            pad = 2
            roi = image_bgr[max(0,y1-pad):min(image_bgr.shape[0],y2+pad),
                            max(0,x1):min(image_bgr.shape[1],x2)]
            if roi.size > 0:
                gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
                _, binary = cv2.threshold(gray, 0, 255,
                                          cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
                v_proj = np.sum(binary, axis=0)
                # Threshold against the UNpadded quad height here.
                gap_threshold = h * 255 * 0.20
                gaps, in_gap, gap_start = [], False, 0
                for x in range(len(v_proj)):
                    if v_proj[x] < gap_threshold:
                        if not in_gap: gap_start, in_gap = x, True
                    else:
                        if in_gap:
                            gw = x - gap_start
                            if gw >= max(int(med_h * 0.8), 15):
                                gaps.append((gap_start + gw // 2, gw))
                            in_gap = False
                if gaps:
                    gaps.sort(key=lambda g: g[1], reverse=True)
                    split_x_abs = max(0, x1) + gaps[0][0]
                    if ' ' in text:
                        char_w = w / max(1, len(text))
                        split_idx = int((split_x_abs - x1) / max(1e-6, char_w))
                        spaces = [i for i, c in enumerate(text) if c == ' ']
                        if spaces:
                            split_idx = min(spaces, key=lambda i: abs(i - split_idx))
                        tl, tr = text[:split_idx].strip(), text[split_idx:].strip()
                    else:
                        split_idx = int(len(text) * (split_x_abs - x1) / w)
                        tl, tr = text[:split_idx].strip(), text[split_idx:].strip()
                    if tl and tr:
                        result.extend([
                            ([[x1,y1],[split_x_abs,y1],[split_x_abs,y2],[x1,y2]], tl, conf),
                            ([[split_x_abs,y1],[x2,y1],[x2,y2],[split_x_abs,y2]], tr, conf)])
                        splits_made += 1
                        continue
        result.append((quad, text, conf))
    return result, splits_made


def normalize_ocr_quads(ocr_list):
    """Replace each quad with its axis-aligned bbox expanded by 3 px."""
    result = []
    for quad, text, conf in ocr_list:
        x1, y1, x2, y2 = quad_bbox(quad)
        pad = 3
        new_quad = [[x1-pad,y1-pad],[x2+pad,y1-pad],[x2+pad,y2+pad],[x1-pad,y2+pad]]
        result.append((new_quad, text, conf))
    return result


# ============================================================
# VISION RE-READ
# ============================================================
def preprocess_variant(crop_bgr, mode):
    """
    Return a grayscale preprocessing variant of `crop_bgr` selected by
    `mode` (raw / clahe / adaptive / otsu / invert / bilateral /
    morph_open). Unknown modes fall back to plain grayscale.
    """
    gray = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2GRAY)
    if mode == "raw": return gray
    if mode == "clahe": return cv2.createCLAHE(clipLimit=2.0,
                                               tileGridSize=(8,8)).apply(gray)
    if mode == "adaptive":
        den = cv2.GaussianBlur(gray, (3,3), 0)
        return cv2.adaptiveThreshold(den, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                     cv2.THRESH_BINARY, 35, 11)
    if mode == "otsu":
        den = cv2.GaussianBlur(gray, (3,3), 0)
        _, th = cv2.threshold(den, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        return th
    if mode == "invert": return 255 - gray
    if mode == "bilateral":
        den = cv2.bilateralFilter(gray, 7, 60, 60)
        _, th = cv2.threshold(den, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        return th
    if mode == "morph_open":
        _, th = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        return cv2.morphologyEx(th, cv2.MORPH_OPEN, np.ones((2,2), np.uint8))
    return gray


def rotate_image_keep_bounds(img, angle_deg):
    """Rotate `img` by `angle_deg` around its center, enlarging the canvas
    so nothing is clipped; new area is filled with white (borderValue=255)."""
    h, w = img.shape[:2]
    c = (w/2, h/2)
    M = cv2.getRotationMatrix2D(c, angle_deg, 1.0)
    cos, sin = abs(M[0,0]), abs(M[0,1])
    new_w = int((h*sin) + (w*cos))
    new_h = int((h*cos) + (w*sin))
    # Shift so the rotated content stays centered in the enlarged canvas.
    M[0,2] += (new_w/2) - c[0]
    M[1,2] += (new_h/2) - c[1]
    return cv2.warpAffine(img, M, (new_w, new_h),
                          flags=cv2.INTER_CUBIC, borderValue=255)


def rebuild_text_from_vision_result(res):
    """
    Reassemble a single text string from a Vision OCR result list of
    (bbox, text, conf): cluster items into rows by y-center (tolerance
    0.75 * median height), sort rows top-to-bottom, items left-to-right.
    """
    if not res: return ""
    norm = []
    for bbox, txt, conf in res:
        if not txt or not txt.strip(): continue
        b = quad_bbox(bbox)
        # (bbox, text, conf, cx, cy, height)
        norm.append((b, txt, conf,
                     (b[0]+b[2])/2.0, (b[1]+b[3])/2.0, max(1.0, b[3]-b[1])))
    if not norm: return ""
    med_h = float(np.median([x[5] for x in norm]))
    row_tol = max(6.0, med_h * 0.75)
    norm.sort(key=lambda z: z[4])
    rows = []
    for it in norm:
        placed = False
        for r in rows:
            if abs(it[4] - r["yc"]) <= row_tol:
                r["m"].append(it)
                # Running row centroid — later items compare to the mean.
                r["yc"] = float(np.mean([k[4] for k in r["m"]]))
                placed = True; break
        if not placed: rows.append({"yc": it[4], "m": [it]})
    rows.sort(key=lambda r: r["yc"])
    lines = [normalize_text(" ".join(x[1] for x in sorted(r["m"], key=lambda z: z[3])))
             for r in rows]
    return normalize_text(" ".join(filter(None, lines)))


def reread_bubble_with_vision(image_bgr, bbox_xyxy, vision_detector,
                              upscale=3.0, pad=24):
    """
    Re-OCR one bubble crop with every preprocessing mode x small rotation
    combination and keep the highest-scoring transcript
    (per ocr_candidate_score).

    Returns (text, score, "vision-reread") or (None, 0.0, "none").
    """
    ih, iw = image_bgr.shape[:2]
    x1, y1, x2, y2 = bbox_xyxy
    x1, y1 = max(0, int(x1-pad)), max(0, int(y1-pad))
    x2, y2 = min(iw, int(x2+pad)), min(ih, int(y2+pad))
    crop = image_bgr[y1:y2, x1:x2]
    if crop.size == 0: return None, 0.0, "none"

    modes = ["raw", "clahe", "adaptive", "otsu", "invert", "bilateral", "morph_open"]
    angles = [0.0, 1.5, -1.5]
    best_v_txt, best_v_sc = "", 0.0
    up0 = cv2.resize(crop,
                     (int(crop.shape[1]*upscale), int(crop.shape[0]*upscale)),
                     interpolation=cv2.INTER_CUBIC)

    for mode in modes:
        proc = preprocess_variant(up0, mode)
        proc3 = cv2.cvtColor(proc, cv2.COLOR_GRAY2BGR) if len(proc.shape) == 2 else proc
        for a in angles:
            rot = rotate_image_keep_bounds(proc3, a)
            # Detector API differs by backend; prefer run_vision_ocr.
            res = (vision_detector.run_vision_ocr(rot)
                   if hasattr(vision_detector, 'run_vision_ocr')
                   else vision_detector.read(rot))
            txt = rebuild_text_from_vision_result(res)
            sc = ocr_candidate_score(txt)
            if sc > best_v_sc:
                best_v_txt, best_v_sc = txt, sc

    if best_v_txt: return best_v_txt, best_v_sc, "vision-reread"
    return None, 0.0, "none"


# ============================================================
# LINES + BUBBLES
# ============================================================
def build_lines_from_indices(indices, ocr):
    """
    Build display lines for one bubble: cluster the given OCR indices into
    rows by y-center (tolerance 0.75 * median height), then join each row's
    texts left-to-right. Returns a list of normalized line strings.
    """
    if not indices: return []
    items = []
    for i in indices:
        b = quad_bbox(ocr[i][0])
        # (index, bbox, cx, cy, height)
        items.append((i, b, (b[0]+b[2])/2.0, (b[1]+b[3])/2.0, max(1.0, b[3]-b[1])))
    med_h = float(np.median([it[4] for it in items])) if items else 10.0
    row_tol = max(6.0, med_h * 0.75)
    items.sort(key=lambda x: x[3])
    rows = []
    for it in items:
        placed = False
        for r in rows:
            if abs(it[3] - r["yc"]) <= row_tol:
                r["m"].append(it)
                r["yc"] = float(np.mean([k[3] for k in r["m"]]))
                placed = True; break
        if not placed: rows.append({"yc": it[3], "m": [it]})
    rows.sort(key=lambda r: r["yc"])
    return [normalize_text(
        " ".join(ocr[i][1]
                 for i, _, _, _, _ in sorted(r["m"], key=lambda z: z[2])))
        for r in rows if r["m"]]
def split_indices_into_vertical_blocks(indices, ocr, gap_factor=1.6, min_gap=18):
    """
    Split a box into top-to-bottom macro blocks using strong vertical gaps.

    A new block starts whenever the gap between consecutive (y-sorted)
    quads exceeds max(min_gap, gap_factor * median quad height).
    Returns a list of index lists.
    """
    if len(indices) < 2:
        return [indices]

    items = []
    for i in indices:
        b = quad_bbox(ocr[i][0])
        cy = (b[1] + b[3]) / 2.0
        h = max(1, b[3] - b[1])
        items.append((i, b, cy, h))

    items.sort(key=lambda x: x[2])
    med_h = float(np.median([it[3] for it in items])) if items else 12.0
    threshold = max(min_gap, med_h * gap_factor)

    blocks = []
    current = [items[0][0]]
    prev_b = items[0][1]

    for k in range(1, len(items)):
        cur_i, cur_b, _, _ = items[k]
        # Gap measured from previous quad's bottom to current quad's top.
        gap = cur_b[1] - prev_b[3]

        if gap > threshold:
            blocks.append(current)
            current = [cur_i]
        else:
            current.append(cur_i)

        prev_b = cur_b

    if current:
        blocks.append(current)

    return blocks

def build_final_box_text(indices, ocr, reading_mode="ltr"):
    """
    Final text reconstruction used for OCR/translation export.
    This uses internal layout detection, unlike generic grouping helpers.
    """
    return build_text_from_layout(indices, ocr, reading_mode=reading_mode)


def auto_gap(image_path, base=18, ref_w=750):
    """Scale the base pixel gap by page width relative to a 750 px reference;
    falls back to `base` when the image cannot be read."""
    img = cv2.imread(image_path)
    return base * (img.shape[1] / ref_w) if img is not None else base


def group_tokens_vertical(ocr, image_shape, gap_px=18, bbox_padding=1,
                          strict_mode=False):
    """
    Group OCR tokens into bubbles by chaining vertically adjacent tokens,
    then merging compatible groups and splitting on big horizontal gaps.

    Returns the four parallel dicts (bubbles, bubble_boxes, bubble_quads,
    bubble_indices) keyed 1..N in reading order.
    NOTE(review): gap_px and bbox_padding are currently unused in the body
    — confirm whether they are kept for API compatibility.
    """
    n = len(ocr)
    if n == 0: return {}, {}, {}, {}

    boxes = [quad_bbox(r[0]) for r in ocr]
    centers = [quad_center(r[0]) for r in ocr]
    hs = [max(1.0, b[3]-b[1]) for b in boxes]
    med_h = float(np.median(hs)) if hs else 12.0

    max_vertical_gap = med_h * 2.5 if not strict_mode else med_h * 2.0
    max_horizontal_offset = med_h * 1.8

    sorted_indices = sorted(range(n), key=lambda i: (centers[i][1], centers[i][0]))
    groups, used = [], set()

    # Greedy downward chaining: each seed token absorbs lower tokens that
    # stay roughly in its column (cx_i drifts as the running average).
    for i in sorted_indices:
        if i in used: continue
        current_group = [i]
        used.add(i)
        cx_i = centers[i][0]

        for j in sorted_indices:
            if j in used or j == i: continue
            cx_j, cy_j = centers[j]
            if cy_j <= centers[i][1]: continue
            if abs(cx_i - cx_j) > max_horizontal_offset: continue

            # Horizontal gap guard
            gap_x = max(0, max(boxes[i][0], boxes[j][0]) - min(boxes[i][2], boxes[j][2]))
            if gap_x > med_h * 1.5: continue

            # Orientation compatibility guard
            if not orientation_compatible(i, j, ocr): continue

            # Vertical gap measured against the LAST token added, not the seed.
            vertical_gap = boxes[j][1] - boxes[current_group[-1]][3]
            if vertical_gap <= max_vertical_gap:
                current_group.append(j)
                used.add(j)
                cx_i = (cx_i + cx_j) / 2.0

        if current_group:
            groups.append(current_group)

    # Secondary merge pass
    merged_groups, used_groups = [], set()
    for i, group1 in enumerate(groups):
        if i in used_groups: continue
        merged = list(group1)
        used_groups.add(i)
        for j, group2 in enumerate(groups):
            if i == j or j in used_groups: continue
            if should_merge_groups(merged, group2, ocr, med_h, max_vertical_gap):
                compat = all(orientation_compatible(a, b, ocr)
                             for a in merged for b in group2)
                if compat:
                    merged.extend(group2)
                    used_groups.add(j)
        merged_groups.append(sorted(merged, key=lambda idx: centers[idx][1]))

    # Horizontal gap split pass
    final_groups = []
    for group in merged_groups:
        h_split = detect_horizontal_gap_in_group(group, ocr, med_h, gap_factor=2.5)
        if h_split:
            lg, rg = h_split
            final_groups.append(sorted(lg, key=lambda idx: centers[idx][1]))
            final_groups.append(sorted(rg, key=lambda idx: centers[idx][1]))
        else:
            final_groups.append(group)

    final_groups.sort(key=lambda g: (min(centers[i][1] for i in g),
                                     min(centers[i][0] for i in g)))

    bubbles, bubble_boxes, bubble_quads, bubble_indices = {}, {}, {}, {}
    ih, iw = image_shape[:2]

    for bid, idxs in enumerate(final_groups, start=1):
        lines = build_lines_from_indices(idxs, ocr)
        quads = [ocr[k][0] for k in idxs]
        ub = boxes_union_xyxy([quad_bbox(q) for q in quads])
        if ub is None: continue
        x1, y1, x2, y2 = ub
        # Small box padding proportional to median line height.
        ap = max(1, int(round(med_h * 0.16)))
        bubbles[bid] = lines
        bubble_boxes[bid] = (max(0,x1-ap), max(0,y1-ap),
                             min(iw-1,x2+ap), min(ih-1,y2+ap))
        bubble_quads[bid] = quads
        bubble_indices[bid] = idxs

    return bubbles, bubble_boxes, bubble_quads, bubble_indices
# ============================================================
# SPLIT HELPER — centralises all split strategies
# ============================================================
def _split_bubble_if_needed(bid, bubble_indices, bubble_quads, bubble_boxes,
                            filtered, image, iw, ih):
    """
    Attempts all split strategies in priority order.
    Returns ((part1_indices, part2_indices), reason_str) or (None, None).

    BOX#18 fix: split_cluster_by_big_vertical_gap factor lowered to 1.4
    so the gap between the top speech bubble and the bottom cluster triggers.
    """
    indices = bubble_indices[bid]
    box = bubble_boxes[bid]

    # 1. Vertical-stack gap (sensitive — catches top-vs-bottom cluster)
    if is_vertical_text_like(indices, filtered):
        vgap = split_cluster_by_big_vertical_gap(indices, filtered,
                                                 factor=1.4, min_gap=18)
        if vgap:
            return vgap, "vertical-stack y-gap"

    # 2. Panel border
    sr = split_panel_box(image, box, bubble_quads=bubble_quads[bid])
    if sr:
        _, _, split_x = sr
        li = [idx for idx in indices if quad_center(filtered[idx][0])[0] < split_x]
        ri = [idx for idx in indices if quad_center(filtered[idx][0])[0] >= split_x]
        if li and ri:
            return (li, ri), "panel border"
    elif len(bubble_quads[bid]) >= 4:
        # No panel border found but box is populated: try aggressive columns.
        cs = split_bubble_if_multiple_columns(indices, filtered, bid=bid,
                                              use_aggressive_thresholds=True)
        if cs:
            return cs, "aggressive column"

    # 3. Column gap
    cs = split_bubble_if_multiple_columns(indices, filtered, bid=bid)
    if cs:
        return cs, "vertical column"

    # 4. Nested / side-by-side
    ns = split_nested_or_side_by_side(indices, filtered)
    if ns:
        return ns, "nested/side-by-side"

    # 5. Row split
    rs = split_bubble_if_multiple_rows(indices, filtered, bid=bid)
    if rs:
        return rs, "horizontal row"

    # 6. Large vertical gap (general, less sensitive)
    gy = split_cluster_by_big_vertical_gap(indices, filtered, factor=1.9, min_gap=22)
    if gy:
        return gy, "large vertical-gap"

    return None, None


# ============================================================
# DEBUG / EXPORT
# ============================================================
def save_debug_clusters(image_path, ocr, bubble_boxes, bubble_indices,
                        clean_lines=None, out_path="debug_clusters.png",
                        region_types=None):
    """
    Draw debug overlays for final grouped boxes.

    Color scheme by region type:
    - dialogue  : green
    - narration : orange
    - sfx       : magenta
    - reaction  : cyan
    - unknown   : yellow-ish

    OCR quads are outlined lightly in gray for context.
    """
    img = cv2.imread(image_path)
    if img is None:
        return

    # Draw OCR quads lightly without filling the page white
    for bbox, txt, conf in ocr:
        pts = np.array(bbox, dtype=np.int32)
        cv2.polylines(img, [pts], True, (180, 180, 180), 1)

    for bid, bb in bubble_boxes.items():
        x1, y1, x2, y2 = bb
        rtype = region_types.get(bid, "unknown") if region_types else "unknown"

        # Colors are BGR (OpenCV convention).
        if rtype == "dialogue":
            color = (0, 220, 0)
        elif rtype == "narration":
            color = (0, 180, 255)
        elif rtype == "sfx":
            color = (255, 0, 255)
        elif rtype == "reaction":
            color = (0, 200, 255)
        else:
            color = (0, 220, 220)

        thickness = 2
        cv2.rectangle(img, (x1, y1), (x2, y2), color, thickness)
        cv2.putText(
            img,
            f"BOX#{bid} [{rtype}]",
            (x1 + 2, max(15, y1 + 16)),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.45,
            color,
            2
        )

        if clean_lines and bid in clean_lines:
            text = clean_lines[bid]
            words = text.split()

            # Naive word wrap at ~26 characters per line.
            wrapped_lines = []
            cur = ""
            for w in words:
                if len(cur) + len(w) + 1 < 26:
                    cur += w + " "
                else:
                    wrapped_lines.append(cur.strip())
                    cur = w + " "
            if cur:
                wrapped_lines.append(cur.strip())

            y_text = y2 + 18
            for line in wrapped_lines:
                # black outline
                cv2.putText(
                    img, line, (x1, y_text),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 3
                )
                # blue text
                cv2.putText(
                    img, line, (x1, y_text),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1
                )
                y_text += 18

    cv2.imwrite(out_path, img)
- """ - img = cv2.imread(image_path) - if img is None: - return - - # Draw OCR quads lightly without filling the page white - for bbox, txt, conf in ocr: - pts = np.array(bbox, dtype=np.int32) - cv2.polylines(img, [pts], True, (180, 180, 180), 1) - - for bid, bb in bubble_boxes.items(): - x1, y1, x2, y2 = bb - rtype = region_types.get(bid, "unknown") if region_types else "unknown" - - if rtype == "dialogue": - color = (0, 220, 0) - elif rtype == "narration": - color = (0, 180, 255) - elif rtype == "sfx": - color = (255, 0, 255) - elif rtype == "reaction": - color = (0, 200, 255) - else: - color = (0, 220, 220) - - thickness = 2 - cv2.rectangle(img, (x1, y1), (x2, y2), color, thickness) - cv2.putText( - img, - f"BOX#{bid} [{rtype}]", - (x1 + 2, max(15, y1 + 16)), - cv2.FONT_HERSHEY_SIMPLEX, - 0.45, - color, - 2 - ) - - if clean_lines and bid in clean_lines: - text = clean_lines[bid] - words = text.split() - - wrapped_lines = [] - cur = "" - for w in words: - if len(cur) + len(w) + 1 < 26: - cur += w + " " - else: - wrapped_lines.append(cur.strip()) - cur = w + " " - if cur: - wrapped_lines.append(cur.strip()) - - y_text = y2 + 18 - for line in wrapped_lines: - # black outline - cv2.putText( - img, line, (x1, y_text), - cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 3 - ) - # blue text - cv2.putText( - img, line, (x1, y_text), - cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1 - ) - y_text += 18 - - cv2.imwrite(out_path, img) - -def estimate_reading_order(bbox_dict, mode="ltr"): - items = [(bid, (bb[0]+bb[2])/2.0, (bb[1]+bb[3])/2.0) - for bid, bb in bbox_dict.items()] - items.sort(key=lambda t: t[2]) - rows, tol = [], 90 - for it in items: - placed = False - for r in rows: - if abs(it[2] - r["cy"]) <= tol: - r["items"].append(it) - r["cy"] = float(np.mean([x[2] for x in r["items"]])) - placed = True; break - if not placed: rows.append({"cy": it[2], "items": [it]}) - rows.sort(key=lambda r: r["cy"]) - order = [] - for r in rows: - r["items"].sort(key=lambda x: x[1], 
reverse=(mode == "rtl")) - order.extend([z[0] for z in r["items"]]) - return {bid: i+1 for i, bid in enumerate(order)} - -# ============================================================ -# NAME / SHORT TOKEN RESCUE -# ============================================================ -def _text_key_for_dedup(text: str) -> str: - return re.sub(r'[^A-ZÀ-Ý0-9]', '', normalize_text(text or "")) - -def rescue_name_and_short_tokens(ocr_list, min_conf=0.20): - """ - Keep plausible short/name tokens that OCR found but strict filtering may drop. - Returns rescued items as (quad, text, conf). - """ - rescued = [] - - for quad, text, conf in ocr_list: - t = normalize_text(text or "") - if not t: - continue - - t_alpha = re.sub(r'[^A-ZÀ-Ý]', '', t) - - if t_alpha in KNOWN_NAMES and conf >= min_conf: - rescued.append((quad, t, max(conf, 0.45))) - continue - - if is_protected_token(t) and conf >= min_conf: - rescued.append((quad, t, max(conf, 0.40))) - continue - - if 2 <= len(t_alpha) <= 8 and conf >= 0.25: - if re.fullmatch(r'[A-ZÀ-Ý]{2,8}', t_alpha): - rescued.append((quad, t, max(conf, 0.35))) - - return rescued - -def merge_rescued_items(base_ocr, rescued_ocr, iou_threshold=0.55): - """ - Merge rescued tokens into OCR list if not duplicate by text+overlap. 
def _joined_text_for_indices(indices, ocr):
    """Join the normalized texts at the given OCR indices; returns
    (joined_string, its_length). Out-of-range indices are skipped."""
    parts = []
    for i in indices:
        if i < 0 or i >= len(ocr):
            continue
        t = normalize_text(ocr[i][1])
        if t:
            parts.append(t)
    s = " ".join(parts).strip()
    return s, len(s)

def _in_same_bubble_contour(box_i, box_j, bubble_contours):
    """True if both box centers fall inside (or on) the same detected
    bubble contour."""
    cx_i = (box_i[0] + box_i[2]) / 2.0
    cy_i = (box_i[1] + box_i[3]) / 2.0
    cx_j = (box_j[0] + box_j[2]) / 2.0
    cy_j = (box_j[1] + box_j[3]) / 2.0
    for c in bubble_contours:
        if (cv2.pointPolygonTest(c, (cx_i, cy_i), False) >= 0 and
            cv2.pointPolygonTest(c, (cx_j, cy_j), False) >= 0):
            return True
    return False

def merge_micro_boxes_relaxed(bubbles, bubble_boxes, bubble_quads, bubble_indices, ocr, image_bgr):
    """
    Relaxed merge for tiny interjection/name boxes (e.g. HUH? + MORNING).

    Pairs of SHORT boxes (joined text <= 12 chars each) that are close on
    both axes are unioned (union-find) when they also share some horizontal
    overlap, sit in the same bubble contour, or contain a protected token.
    Returns the four parallel dicts re-keyed 1..N.
    """
    bids = sorted(bubble_boxes.keys())
    if len(bids) < 2:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices

    all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1]) for i in range(len(ocr))]
    med_h = float(np.median(all_h)) if all_h else 14.0
    bubble_contours = detect_speech_bubbles(image_bgr)

    # Union-find over bubble ids with path halving.
    parent = {b: b for b in bids}

    def find(x):
        while parent[x] != x:
            parent[x] = parent[parent[x]]
            x = parent[x]
        return x

    def union(a, b):
        ra, rb = find(a), find(b)
        if ra != rb:
            parent[rb] = ra

    SHORT_TEXT_MAX_CHARS = 12

    for i in range(len(bids)):
        for j in range(i + 1, len(bids)):
            bi, bj = bids[i], bids[j]
            box_i, box_j = bubble_boxes[bi], bubble_boxes[bj]

            wi = max(1, box_i[2] - box_i[0])
            wj = max(1, box_j[2] - box_j[0])

            gap_x = max(0, max(box_i[0], box_j[0]) - min(box_i[2], box_j[2]))
            vert_gap = max(0, max(box_i[1], box_j[1]) - min(box_i[3], box_j[3]))

            h_ix1 = max(box_i[0], box_j[0])
            h_ix2 = min(box_i[2], box_j[2])
            h_overlap = max(0, h_ix2 - h_ix1)
            h_overlap_ratio = h_overlap / max(1, min(wi, wj))

            txt_i, len_i = _joined_text_for_indices(bubble_indices[bi], ocr)
            txt_j, len_j = _joined_text_for_indices(bubble_indices[bj], ocr)

            micro_pair = (len_i <= SHORT_TEXT_MAX_CHARS and len_j <= SHORT_TEXT_MAX_CHARS)
            protected_hint = is_protected_token(txt_i) or is_protected_token(txt_j)
            same_contour = _in_same_bubble_contour(box_i, box_j, bubble_contours)

            if micro_pair and vert_gap <= med_h * 2.2 and gap_x <= med_h * 2.0:
                if h_overlap_ratio >= 0.10 or same_contour or protected_hint:
                    union(bi, bj)

    groups = {}
    for b in bids:
        r = find(b)
        groups.setdefault(r, []).append(b)

    # Nothing was unioned — return inputs untouched.
    if all(len(v) == 1 for v in groups.values()):
        return bubbles, bubble_boxes, bubble_quads, bubble_indices

    new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {}
    next_bid = 1

    for _, group in groups.items():
        if len(group) == 1:
            b = group[0]
            new_bubbles[next_bid] = bubbles[b]
            new_boxes[next_bid] = bubble_boxes[b]
            new_quads[next_bid] = bubble_quads[b]
            new_indices[next_bid] = bubble_indices[b]
        else:
            all_idx = sorted(set(idx for b in group for idx in bubble_indices[b]))
            new_bubbles[next_bid] = build_lines_from_indices(all_idx, ocr)
            new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in all_idx])
            new_quads[next_bid] = [ocr[i][0] for i in all_idx]
            new_indices[next_bid] = all_idx
        next_bid += 1

    return new_bubbles, new_boxes, new_quads, new_indices

def reattach_orphan_short_tokens(bubbles, bubble_boxes, bubble_quads, bubble_indices, ocr):
    """
    Reattach tiny orphan token boxes (e.g., single 'HUH?') to nearest plausible bubble.

    An orphan is a single-index box whose token is protected or has <= 5
    letters. Each orphan is absorbed by the nearest box within
    2.2 * med_h horizontally and 3.0 * med_h vertically (L1 distance tie-break).
    Mutates the dicts in place, then returns them re-keyed 1..N when any
    orphan was consumed.
    """
    bids = sorted(bubble_boxes.keys())
    if len(bids) < 2:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices

    all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1]) for i in range(len(ocr))]
    med_h = float(np.median(all_h)) if all_h else 14.0

    orphan_bids = []
    for b in bids:
        idxs = bubble_indices.get(b, [])
        if len(idxs) != 1:
            continue
        t = normalize_text(ocr[idxs[0]][1])
        if is_protected_token(t) or len(re.sub(r'[^A-ZÀ-Ý]', '', t)) <= 5:
            orphan_bids.append(b)

    if not orphan_bids:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices

    consumed = set()

    for ob in orphan_bids:
        if ob in consumed:
            continue

        obox = bubble_boxes[ob]
        ocx = (obox[0] + obox[2]) / 2.0
        ocy = (obox[1] + obox[3]) / 2.0

        best_b = None
        best_d = 1e9

        for tb in bids:
            if tb == ob or tb in consumed:
                continue
            tbox = bubble_boxes[tb]
            tcx = (tbox[0] + tbox[2]) / 2.0
            tcy = (tbox[1] + tbox[3]) / 2.0

            dx = abs(ocx - tcx)
            dy = abs(ocy - tcy)

            if dx <= med_h * 2.2 and dy <= med_h * 3.0:
                d = dx + dy
                if d < best_d:
                    best_d = d
                    best_b = tb

        if best_b is not None:
            # Fold the orphan's indices into the target and rebuild its data.
            merged = sorted(set(bubble_indices[best_b] + bubble_indices[ob]))
            bubble_indices[best_b] = merged
            bubble_quads[best_b] = [ocr[i][0] for i in merged]
            bubble_boxes[best_b] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in merged])
            bubbles[best_b] = build_lines_from_indices(merged, ocr)
            consumed.add(ob)

    if consumed:
        for b in consumed:
            bubble_indices.pop(b, None)
            bubble_quads.pop(b, None)
            bubble_boxes.pop(b, None)
            bubbles.pop(b, None)

        # reindex for stable downstream order
        new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {}
        for new_id, old_id in enumerate(sorted(bubble_boxes.keys()), start=1):
            new_bubbles[new_id] = bubbles[old_id]
            new_boxes[new_id] = bubble_boxes[old_id]
            new_quads[new_id] = bubble_quads[old_id]
            new_indices[new_id] = bubble_indices[old_id]
        return new_bubbles, new_boxes, new_quads, new_indices

    return bubbles, bubble_boxes, bubble_quads, bubble_indices
def reconstruct_group_text(group_indices, ocr):
    """
    Reconstruct text inside one already-detected group.

    This handles cases where a vertical group itself contains
    multiple local rows or wrapped OCR fragments.

    Strongly vertical groups (y-span > 1.5 * x-span) are read straight
    top-to-bottom; otherwise tokens are clustered into local rows first
    (tolerance 0.65 * median height) and each row is read left-to-right.
    Returns a normalized string ("" for empty input).
    """
    if not group_indices:
        return ""

    items = []
    for i in group_indices:
        b = quad_bbox(ocr[i][0])
        cx = (b[0] + b[2]) / 2.0
        cy = (b[1] + b[3]) / 2.0
        w = max(1, b[2] - b[0])
        h = max(1, b[3] - b[1])
        items.append((i, b, cx, cy, w, h))

    if not items:
        return ""

    med_h = float(np.median([it[5] for it in items]))
    # (fix: an unused median-width local was removed here)

    # If the group is strongly vertical, simple top->bottom is fine
    xs = [it[2] for it in items]
    ys = [it[3] for it in items]
    vertical_span = max(ys) - min(ys) if len(ys) > 1 else 0
    horizontal_span = max(xs) - min(xs) if len(xs) > 1 else 0

    # strong single vertical phrase
    if vertical_span > horizontal_span * 1.5:
        items.sort(key=lambda x: x[3])  # top->bottom
        txt = normalize_text(" ".join(
            normalize_text(ocr[it[0]][1]) for it in items if normalize_text(ocr[it[0]][1])
        ))
        return txt

    # otherwise, split into local rows first
    row_tol = max(6.0, med_h * 0.65)
    items.sort(key=lambda x: x[3])

    rows = []
    for it in items:
        placed = False
        for row in rows:
            if abs(it[3] - row["yc"]) <= row_tol:
                row["members"].append(it)
                row["yc"] = float(np.mean([m[3] for m in row["members"]]))
                placed = True
                break
        if not placed:
            rows.append({"yc": it[3], "members": [it]})

    rows.sort(key=lambda r: r["yc"])

    parts = []
    for row in rows:
        members = sorted(row["members"], key=lambda x: x[2])  # left->right
        row_txt = normalize_text(" ".join(
            normalize_text(ocr[m[0]][1]) for m in members if normalize_text(ocr[m[0]][1])
        ))
        if row_txt:
            parts.append(row_txt)

    txt = normalize_text(" ".join(parts))
    return txt

def reconstruct_group_text_best(group_indices, ocr):
    """
    Reconstruct the group text two ways — plain top-to-bottom and
    row-clustered — and return whichever scores higher under
    ocr_candidate_score (ties go to the top-to-bottom candidate).
    """
    if not group_indices:
        return ""

    items = []
    for i in group_indices:
        b = quad_bbox(ocr[i][0])
        cx = (b[0] + b[2]) / 2.0
        cy = (b[1] + b[3]) / 2.0
        h = max(1, b[3] - b[1])
        items.append((i, b, cx, cy, h))

    if not items:
        return ""

    # Candidate 1: simple top->bottom
    cand1_items = sorted(items, key=lambda x: x[3])
    cand1 = normalize_text(" ".join(
        normalize_text(ocr[it[0]][1]) for it in cand1_items if normalize_text(ocr[it[0]][1])
    ))
    cand1 = fix_group_level_ocr(cand1)

    # Candidate 2: local rows
    med_h = float(np.median([it[4] for it in items]))
    row_tol = max(6.0, med_h * 0.65)

    rows = []
    for it in sorted(items, key=lambda x: x[3]):
        placed = False
        for row in rows:
            if abs(it[3] - row["yc"]) <= row_tol:
                row["members"].append(it)
                row["yc"] = float(np.mean([m[3] for m in row["members"]]))
                placed = True
                break
        if not placed:
            rows.append({"yc": it[3], "members": [it]})

    rows.sort(key=lambda r: r["yc"])
    cand2_parts = []
    for row in rows:
        members = sorted(row["members"], key=lambda x: x[2])
        row_txt = normalize_text(" ".join(
            normalize_text(ocr[m[0]][1]) for m in members if normalize_text(ocr[m[0]][1])
        ))
        if row_txt:
            cand2_parts.append(row_txt)
    cand2 = normalize_text(" ".join(cand2_parts))
    cand2 = fix_group_level_ocr(cand2)

    # choose best
    s1 = ocr_candidate_score(cand1)
    s2 = ocr_candidate_score(cand2)

    return cand2 if s2 > s1 else cand1

def fix_group_level_ocr(text):
    """
    Apply hard-coded phrase-level corrections for recurring OCR mistakes,
    then strip hyphen line-wrap artifacts and collapse runs of whitespace.
    Replacements are applied in dict order, so longer keys should precede
    any of their substrings.
    """
    t = normalize_text(text or "")
    if not t:
        return t

    replacements = {
        "ANY- THING": "ANYTHING",
        "BREAK- FAST": "BREAK-FAST",
        "COMMON BREAK- PEOPLE FAST": "COMMON PEOPLE EAT FOR BREAKFAST",
        "WHAT DO LIKE FOR COMMON BREAK- PEOPLE FAST EAT": "WHAT DO COMMON PEOPLE EAT LIKE FOR BREAKFAST",

        # New targeted fixes for reported cases
        "ILLU- SIONS": "ILLU-SIONS",
        "ATTEN- TION": "ATTEN-TION",
        "WHAT DO COMMON PEOPLE HE EAT?": "WHAT DO COMMON PEOPLE EAT?",
        "LIKE FOR BREAK- FAST": "LIKE FOR BREAK-FAST?",
        "YOUR STUCK": "YOU'RE STUCK",
        # (fix: removed dead identity mapping "YOUR HAND!" -> "YOUR HAND!")
    }

    for a, b in replacements.items():
        t = t.replace(a, b)

    t = dehyphenate_linebreak_artifacts(t)
    t = re.sub(r"\s{2,}", " ", t).strip()
    return t

def _is_sentence_like_fragment(t: str) -> bool:
    """True when the normalized text has at least 2 alphanumeric chars."""
    t = normalize_text(t or "")
    if not t:
        return False
    alnum = re.sub(r"[^A-ZÀ-Ý0-9]", "", t)
    if len(alnum) < 2:
        return False
    return True


def _line_has_terminal_punct(t: str) -> bool:
    """True when the normalized text ends with sentence punctuation."""
    t = normalize_text(t or "")
    return bool(re.search(r"[.!?…]$", t))


def _smart_split_by_connectors(text: str) -> List[str]:
    """
    Conservative split for OCR text that glues multiple clauses.

    First tries splitting after terminal punctuation; failing that, splits
    once before a known connector phrase when both halves look sentence-like.
    Returns a list of 1+ fragments ([] for empty input).
    """
    t = normalize_text(text or "")
    if not t:
        return []

    # Keep hyphenated style if meaningful, but remove OCR line-wrap artifacts
    t = dehyphenate_linebreak_artifacts(t)

    # 1) Primary punctuation split
    parts = re.split(r"(?<=[.!?…])\s+", t)
    parts = [p.strip() for p in parts if p.strip()]
    if len(parts) >= 2:
        return parts

    # 2) Secondary lexical split if punctuation failed
    patterns = [
        r"\b(THEY'RE|THEY ARE)\b",
        r"\b(DON'T|DO NOT)\b",
        r"\b(LIKE FOR)\b",
        r"\b(IF WE DON'T|IF WE DO NOT)\b",
        r"\b(WHAT DO)\b",
    ]

    for pat in patterns:
        m = re.search(pat, t)
        if m and m.start() > 8:
            left = t[:m.start()].strip()
            right = t[m.start():].strip()
            if _is_sentence_like_fragment(left) and _is_sentence_like_fragment(right):
                return [left, right]

    return [t]
- """ - t = normalize_text(text or "") - if not t: - return [] - - # Keep hyphenated style if meaningful, but remove OCR line-wrap artifacts - t = dehyphenate_linebreak_artifacts(t) - - # 1) Primary punctuation split - parts = re.split(r"(?<=[.!?…])\s+", t) - parts = [p.strip() for p in parts if p.strip()] - if len(parts) >= 2: - return parts - - # 2) Secondary lexical split if punctuation failed - patterns = [ - r"\b(THEY'RE|THEY ARE)\b", - r"\b(DON'T|DO NOT)\b", - r"\b(LIKE FOR)\b", - r"\b(IF WE DON'T|IF WE DO NOT)\b", - r"\b(WHAT DO)\b", - ] - - for pat in patterns: - m = re.search(pat, t) - if m and m.start() > 8: - left = t[:m.start()].strip() - right = t[m.start():].strip() - if _is_sentence_like_fragment(left) and _is_sentence_like_fragment(right): - return [left, right] - - return [t] - -def split_box_by_sentence_rows(indices, ocr, min_groups=2): - """ - Force split one box into sentence-like row groups. - Works for stacked dialogue blocks like: - YOUR HAND! - I'M STUCK AND HELPLESS LIKE THIS! - IF WE DON'T HURRY UP, WE'LL BE CRUSHED TO DEATH! 
- """ - if not indices or len(indices) < 3: - return None - - # Build row groups first - rows = group_indices_into_horizontal_rows(indices, ocr, row_tol_factor=0.70) - if not rows or len(rows) < min_groups: - return None - - # Turn each row-group into text - row_payload = [] - for grp in rows: - txt = normalize_text(" ".join(ocr[i][1] for i in grp if normalize_text(ocr[i][1]))) - txt = fix_group_level_ocr(txt) - if not txt: - continue - box = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in grp]) - row_payload.append({"indices": grp, "text": txt, "box": box}) - - if len(row_payload) < min_groups: - return None - - # Merge tiny row fragments upward if they are clearly continuation - merged = [] - for rp in row_payload: - if not merged: - merged.append(rp) - continue - - prev = merged[-1] - short_prev = len(re.sub(r"[^A-ZÀ-Ý0-9]", "", prev["text"])) <= 5 - no_term_prev = not re.search(r"[.!?…]$", prev["text"]) - - if short_prev and no_term_prev: - new_idx = sorted(set(prev["indices"] + rp["indices"])) - new_txt = normalize_text(prev["text"] + " " + rp["text"]) - new_box = boxes_union_xyxy([prev["box"], rp["box"]]) - merged[-1] = {"indices": new_idx, "text": new_txt, "box": new_box} - else: - merged.append(rp) - - # Keep sentence-like groups - out = [] - for m in merged: - txt = normalize_text(m["text"]) - if len(re.sub(r"[^A-ZÀ-Ý0-9]", "", txt)) < 4: - continue - out.append(sorted(m["indices"], key=lambda i: ( - quad_bbox(ocr[i][0])[1], - quad_bbox(ocr[i][0])[0] - ))) - - if len(out) < min_groups: - return None - - return out - -def segment_box_into_phrases(indices, ocr, reading_mode="ltr") -> List[str]: - """ - Layout-aware phrase segmentation for one final box. - Uses your internal grouping + punctuation/connector splitting. 
- """ - groups = build_box_group_texts(indices, ocr, reading_mode=reading_mode) - groups = [fix_group_level_ocr(g) for g in groups if _is_sentence_like_fragment(g)] - - if not groups: - merged = normalize_text(" ".join(build_final_box_text(indices, ocr, reading_mode=reading_mode))) - merged = fix_group_level_ocr(merged) - return [x for x in _smart_split_by_connectors(merged) if _is_sentence_like_fragment(x)] - - out = [] - for g in groups: - out.extend(_smart_split_by_connectors(g)) - - # Dedupe OCR echoes - cleaned = [] - for p in out: - p = normalize_text(p) - if not _is_sentence_like_fragment(p): - continue - if cleaned and text_similarity(cleaned[-1], p) >= 0.92: - continue - cleaned.append(p) - - return cleaned - -def build_box_group_texts(indices, ocr, reading_mode="ltr"): - """ - Return independent text groups for one final box, preserving internal layout. - Each group is reconstructed with local reading-order logic. - """ - layout = detect_internal_text_layout(indices, ocr, reading_mode=reading_mode) - out = [] - - if not layout: - return out - - blocks = layout.get("blocks", []) - for block in blocks: - mode = block.get("mode", "horizontal") - groups = block.get("groups", []) - - if mode == "vertical": - groups = sorted( - groups, - key=lambda grp: np.mean([ - (quad_bbox(ocr[i][0])[0] + quad_bbox(ocr[i][0])[2]) / 2.0 - for i in grp - ]), - reverse=(reading_mode == "rtl") - ) - else: - groups = sorted( - groups, - key=lambda grp: np.mean([ - (quad_bbox(ocr[i][0])[1] + quad_bbox(ocr[i][0])[3]) / 2.0 - for i in grp - ]) - ) - - for grp in groups: - txt = reconstruct_group_text(grp, ocr) - if txt: - out.append(txt) - - return out - -def _is_sentence_like_fragment(t: str) -> bool: - t = normalize_text(t or "") - if not t: - return False - alnum = re.sub(r"[^A-ZÀ-Ý0-9]", "", t) - if len(alnum) < 2: - return False - return True - - -def _line_has_terminal_punct(t: str) -> bool: - t = normalize_text(t or "") - return bool(re.search(r"[.!?…]$", t)) - - -def 
_smart_split_by_connectors(text: str) -> List[str]:
    """
    Conservative split for OCR text that glues 2 clauses:
    - DON'T PAY ANY ATTEN-TION TO THEM! THEY'RE ILLU-SIONS!
    - WHAT DO COMMON PEOPLE EAT? LIKE FOR BREAK-FAST?
    """
    t = normalize_text(text or "")
    if not t:
        return []

    # Normalize some OCR hyphen artifacts first
    t = dehyphenate_linebreak_artifacts(t)

    # Primary punctuation split
    parts = re.split(r"(?<=[.!?…])\s+", t)
    parts = [p.strip() for p in parts if p.strip()]
    if len(parts) >= 2:
        return parts

    # Secondary connector split patterns (conservative)
    patterns = [
        r"\b(THEY'RE|THEY ARE)\b",
        r"\b(DON'T|DO NOT)\b",
        r"\b(LIKE FOR)\b",
        r"\b(IF WE DON'T|IF WE DO NOT)\b",
    ]

    for pat in patterns:
        m = re.search(pat, t)
        # m.start() > 8 keeps the left part from being a trivial stub
        if m and m.start() > 8:
            left = t[:m.start()].strip()
            right = t[m.start():].strip()
            if _is_sentence_like_fragment(left) and _is_sentence_like_fragment(right):
                return [left, right]

    return [t]


def segment_box_into_phrases(indices, ocr, reading_mode="ltr") -> List[str]:
    """
    Layout-aware phrase segmentation for one final box.

    NOTE(review): this redefines segment_box_into_phrases from earlier in the
    file; this later definition is the one in effect at runtime.
    """
    # Step 1: use your existing internal grouping
    groups = build_box_group_texts(indices, ocr, reading_mode=reading_mode)
    groups = [fix_group_level_ocr(g) for g in groups if _is_sentence_like_fragment(g)]

    if not groups:
        merged = normalize_text(" ".join(build_final_box_text(indices, ocr, reading_mode=reading_mode)))
        return _smart_split_by_connectors(merged)

    # Step 2: split each group by punctuation/connectors
    out = []
    for g in groups:
        out.extend(_smart_split_by_connectors(g))

    # Step 3: dedupe near-identical neighbors (OCR echo)
    cleaned = []
    for p in out:
        if not cleaned:
            cleaned.append(p)
            continue
        if text_similarity(cleaned[-1], p) >= 0.92:
            continue
        cleaned.append(p)

    return [normalize_text(x) for x in cleaned if _is_sentence_like_fragment(x)]

def is_multi_group_bubble(indices, ocr, reading_mode="ltr", min_groups=2):
    """True when the box contains at least `min_groups` meaningful text groups."""
    groups = build_box_group_texts(indices, ocr, reading_mode=reading_mode)
    meaningful = [g for g in groups if len(re.sub(r"[^A-ZÀ-Ý0-9]", "", g)) >= 2]
    return len(meaningful) >= min_groups

def _bubble_text(indices, ocr, reading_mode="ltr"):
    """Flatten a bubble's layout-ordered lines into one normalized string."""
    return normalize_text(" ".join(build_text_from_layout(indices, ocr, reading_mode=reading_mode)))

def _box_dims(b):
    """Return (width, height) of an xyxy box, clamped to >= 1."""
    return max(1, b[2]-b[0]), max(1, b[3]-b[1])

def _intersection(a, b):
    """Intersection area of two xyxy boxes (0 when disjoint)."""
    ix1, iy1 = max(a[0], b[0]), max(a[1], b[1])
    ix2, iy2 = min(a[2], b[2]), min(a[3], b[3])
    w, h = max(0, ix2-ix1), max(0, iy2-iy1)
    return w*h

def _containment_ratio(child, parent):
    """Fraction of `child`'s area that lies inside `parent` (0..1)."""
    inter = _intersection(child, parent)
    c_area = max(1, (child[2]-child[0])*(child[3]-child[1]))
    return inter / c_area

def _center_distance(a, b):
    """Euclidean distance between the centers of two xyxy boxes."""
    acx, acy = (a[0]+a[2])/2.0, (a[1]+a[3])/2.0
    bcx, bcy = (b[0]+b[2])/2.0, (b[1]+b[3])/2.0
    return ((acx-bcx)**2 + (acy-bcy)**2) ** 0.5

def _reindex_boxes(bubbles, bubble_boxes, bubble_quads, bubble_indices):
    # Renumber bubble ids densely from 1, preserving sorted-id order.
    new_b, new_bb, new_bq, new_bi = {}, {}, {}, {}
    for nid, old in enumerate(sorted(bubble_boxes.keys()), start=1):
        new_b[nid] = bubbles[old]
        new_bb[nid] = bubble_boxes[old]
        new_bq[nid] = bubble_quads[old]
        new_bi[nid] = bubble_indices[old]
    return new_b, new_bb, new_bq, new_bi

def reconcile_final_boxes(bubbles, bubble_boxes, bubble_quads, bubble_indices, ocr,
                          image_bgr=None, reading_mode="ltr"):
    """
    Final reconciliation pass for:
    - overlap merges
    - child absorption
    - complementary fragment merge

    This version is safe for optional image input and propagates reading_mode
    into layout-aware text reconstruction.

    NOTE(review): mutates the four dicts in place, then returns reindexed
    copies via _reindex_boxes. The while-loop restarts from phase (A) after
    every single merge, so phases never act on stale ids.
    """
    if not bubble_boxes:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices

    # Median OCR line height drives all distance/size thresholds below.
    all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1]) for i in range(len(ocr))]
    med_h = float(np.median(all_h)) if all_h else 14.0

    bubble_contours = detect_speech_bubbles(image_bgr) if image_bgr is not None else []

    changed = True
    while changed:
        changed = False
        bids = sorted(bubble_boxes.keys())

        # ---- (A) Merge highly-overlapping pairs
        merged_any = False
        for i in range(len(bids)):
            if merged_any:
                break

            for j in range(i + 1, len(bids)):
                bi, bj = bids[i], bids[j]

                if bi not in bubble_boxes or bj not in bubble_boxes:
                    continue

                a, b = bubble_boxes[bi], bubble_boxes[bj]
                iou = boxes_iou(a, b)
                ovs = boxes_overlap_ratio(a, b)  # inter / smaller

                same_contour = _in_same_bubble_contour(a, b, bubble_contours) if bubble_contours else False

                if ovs >= 0.55 or (iou >= 0.35 and same_contour):
                    idx = sorted(set(bubble_indices[bi] + bubble_indices[bj]))
                    bubble_indices[bi] = idx
                    bubble_quads[bi] = [ocr[k][0] for k in idx]
                    bubble_boxes[bi] = boxes_union_xyxy([quad_bbox(ocr[k][0]) for k in idx])
                    bubbles[bi] = build_lines_from_indices(idx, ocr)

                    bubble_indices.pop(bj, None)
                    bubble_quads.pop(bj, None)
                    bubble_boxes.pop(bj, None)
                    bubbles.pop(bj, None)

                    changed = True
                    merged_any = True
                    break

        if changed:
            continue

        # ---- (B) Absorb tiny child boxes inside larger parent
        absorbed_any = False
        bids = sorted(bubble_boxes.keys())

        for i in range(len(bids)):
            if absorbed_any:
                break

            for j in range(len(bids)):
                if i == j:
                    continue

                child, parent = bids[i], bids[j]

                if child not in bubble_boxes or parent not in bubble_boxes:
                    continue

                cb, pb = bubble_boxes[child], bubble_boxes[parent]
                cw, ch = _box_dims(cb)
                pw, ph = _box_dims(pb)

                contain = _containment_ratio(cb, pb)
                child_txt = _bubble_text(bubble_indices[child], ocr, reading_mode=reading_mode)
                parent_txt = _bubble_text(bubble_indices[parent], ocr, reading_mode=reading_mode)

                # tiny or fragment child
                is_tiny = (cw <= med_h * 3.2 and ch <= med_h * 2.2) or len(child_txt) <= 14

                # don't absorb if it's clearly separate and far
                close = _center_distance(cb, pb) <= med_h * 4.0

                if contain >= 0.70 and (is_tiny or close):
                    idx = sorted(set(bubble_indices[parent] + bubble_indices[child]))
                    bubble_indices[parent] = idx
                    bubble_quads[parent] = [ocr[k][0] for k in idx]
                    bubble_boxes[parent] = boxes_union_xyxy([quad_bbox(ocr[k][0]) for k in idx])
                    bubbles[parent] = build_lines_from_indices(idx, ocr)

                    bubble_indices.pop(child, None)
                    bubble_quads.pop(child, None)
                    bubble_boxes.pop(child, None)
                    bubbles.pop(child, None)

                    changed = True
                    absorbed_any = True
                    break

        if changed:
            continue

        # ---- (C) Merge complementary fragments
        comp_any = False
        bids = sorted(bubble_boxes.keys())

        for i in range(len(bids)):
            if comp_any:
                break

            for j in range(i + 1, len(bids)):
                bi, bj = bids[i], bids[j]

                if bi not in bubble_boxes or bj not in bubble_boxes:
                    continue

                a, b = bubble_boxes[bi], bubble_boxes[bj]
                wi, hi = _box_dims(a)
                wj, hj = _box_dims(b)

                vert_gap = max(0, max(a[1], b[1]) - min(a[3], b[3]))
                h_ix = max(0, min(a[2], b[2]) - max(a[0], b[0]))
                h_overlap_ratio = h_ix / max(1, min(wi, wj))
                same_contour = _in_same_bubble_contour(a, b, bubble_contours) if bubble_contours else False

                txt_i = _bubble_text(bubble_indices[bi], ocr, reading_mode=reading_mode)
                txt_j = _bubble_text(bubble_indices[bj], ocr, reading_mode=reading_mode)

                if same_contour and vert_gap <= med_h * 2.8 and h_overlap_ratio >= 0.45:
                    # prefer merge when one is upper fragment + other lower fragment
                    # and text is not identical duplicate
                    if txt_i != txt_j:
                        idx = sorted(set(bubble_indices[bi] + bubble_indices[bj]))
                        bubble_indices[bi] = idx
                        bubble_quads[bi] = [ocr[k][0] for k in idx]
                        bubble_boxes[bi] = boxes_union_xyxy([quad_bbox(ocr[k][0]) for k in idx])
                        bubbles[bi] = build_lines_from_indices(idx, ocr)

                        bubble_indices.pop(bj, None)
                        bubble_quads.pop(bj, None)
                        bubble_boxes.pop(bj, None)
                        bubbles.pop(bj, None)

                        changed = True
                        comp_any = True
                        break

    return _reindex_boxes(bubbles, bubble_boxes, bubble_quads, bubble_indices)

def split_boxes_by_internal_vertical_groups(bubbles, bubble_boxes, bubble_quads, bubble_indices,
                                            ocr, image_shape, reading_mode="ltr"):
    """
    Conservative splitter:
    - Split only when evidence is strong.
    - Prevent over-splitting of short/noisy vertical tokens.
    """
    ih, iw = image_shape[:2]
    out_bubbles = {}
    out_boxes = {}
    out_quads = {}
    out_indices = {}
    next_id = 1

    # conservative thresholds
    MIN_ALNUM_PER_GROUP = 8
    MIN_GROUP_HEIGHT_RATIO = 0.30  # was too low before
    MIN_VERTICAL_GROUPS_TO_SPLIT = 2
    MAX_SPLIT_PARTS = 3  # safety cap

    for bid in sorted(bubble_boxes.keys()):
        idxs = bubble_indices[bid]
        parent = bubble_boxes[bid]
        parent_h = max(1, parent[3] - parent[1])
        parent_w = max(1, parent[2] - parent[0])

        # Too few quads: keep box unchanged.
        if len(idxs) < 4:
            out_bubbles[next_id] = bubbles[bid]
            out_boxes[next_id] = bubble_boxes[bid]
            out_quads[next_id] = bubble_quads[bid]
            out_indices[next_id] = idxs
            next_id += 1
            continue

        layout = detect_internal_text_layout(idxs, ocr, reading_mode=reading_mode)
        did_split = False

        # --------------------------------------------------------------
        # Primary: vertical-mode internal groups (STRICT)
        # --------------------------------------------------------------
        if layout and layout.get("blocks"):
            candidate_groups = []

            for block in layout.get("blocks", []):
                if block.get("mode", "horizontal") != "vertical":
                    continue

                for grp in block.get("groups", []):
                    grp = sorted(set(grp), key=lambda i: (
                        quad_bbox(ocr[i][0])[1],
                        quad_bbox(ocr[i][0])[0]
                    ))
                    if not grp:
                        continue

                    txt = reconstruct_group_text_best(grp, ocr)
                    txt = normalize_text(fix_group_level_ocr(txt))
                    if not txt:
                        continue

                    alnum_len = len(re.sub(r"[^A-ZÀ-Ý0-9]", "", txt))
                    if alnum_len < MIN_ALNUM_PER_GROUP:
                        continue

                    gb = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in grp])
                    gw = max(1, gb[2] - gb[0])
                    gh = max(1, gb[3] - gb[1])

                    # require meaningful physical size
                    if gh < parent_h * MIN_GROUP_HEIGHT_RATIO:
                        continue

                    # avoid splitting tiny narrow SFX-like strips
                    if gw < parent_w * 0.12 and alnum_len < 12:
                        continue

                    # sentence-ish check
                    words = txt.split()
                    has_terminal = bool(re.search(r"[.!?…]$", txt))
                    if len(words) < 2 and not has_terminal:
                        continue

                    candidate_groups.append({
                        "indices": grp,
                        "text": txt,
                        "box": gb
                    })

            if len(candidate_groups) >= MIN_VERTICAL_GROUPS_TO_SPLIT:
                # Sort columns by reading order
                candidate_groups = sorted(
                    candidate_groups,
                    key=lambda g: (g["box"][0] + g["box"][2]) / 2.0,
                    reverse=(reading_mode == "rtl")
                )

                # cap extreme over-splits
                if len(candidate_groups) > MAX_SPLIT_PARTS:
                    candidate_groups = candidate_groups[:MAX_SPLIT_PARTS]

                # final sanity: total text coverage vs parent text
                parent_txt = normalize_text(" ".join(build_final_box_text(idxs, ocr, reading_mode=reading_mode)))
                parent_alnum = max(1, len(re.sub(r"[^A-ZÀ-Ý0-9]", "", parent_txt)))
                sum_child_alnum = sum(len(re.sub(r"[^A-ZÀ-Ý0-9]", "", g["text"])) for g in candidate_groups)

                # if split loses too much text evidence, reject
                if (sum_child_alnum / parent_alnum) >= 0.65:
                    for g in candidate_groups:
                        grp = sorted(set(g["indices"]), key=lambda i: (
                            quad_bbox(ocr[i][0])[1],
                            quad_bbox(ocr[i][0])[0]
                        ))
                        ub = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in grp])

                        out_indices[next_id] = grp
                        out_quads[next_id] = [ocr[i][0] for i in grp]
                        out_boxes[next_id] = (
                            max(0, ub[0] - 2), max(0, ub[1] - 2),
                            min(iw - 1, ub[2] + 2), min(ih - 1, ub[3] + 2)
                        )
                        out_bubbles[next_id] = build_final_box_text(grp, ocr, reading_mode=reading_mode)
                        next_id += 1

                    did_split = True

        if did_split:
            continue

        # --------------------------------------------------------------
        # Fallback: row sentence split (ONLY for strong punctuation cases)
        # --------------------------------------------------------------
        row_sentence_parts = split_box_by_sentence_rows(idxs, ocr, min_groups=2)

        if row_sentence_parts and 2 <= len(row_sentence_parts) <= 3:
            # Require punctuation evidence in resulting parts
            part_texts = []
            for grp in row_sentence_parts:
                txt = normalize_text(" ".join(build_lines_from_indices(grp, ocr)))
                txt = fix_group_level_ocr(txt)
                part_texts.append(txt)

            punct_parts = sum(1 for t in part_texts if re.search(r"[.!?…]$", t))
            if punct_parts >= 2:
                for grp in row_sentence_parts:
                    grp = sorted(set(grp), key=lambda i: (
                        quad_bbox(ocr[i][0])[1],
                        quad_bbox(ocr[i][0])[0]
                    ))
                    ub = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in grp])

                    out_indices[next_id] = grp
                    out_quads[next_id] = [ocr[i][0] for i in grp]
                    out_boxes[next_id] = (
                        max(0, ub[0] - 2), max(0, ub[1] - 2),
                        min(iw - 1, ub[2] + 2), min(ih - 1, ub[3] + 2)
                    )
                    out_bubbles[next_id] = build_final_box_text(grp, ocr, reading_mode=reading_mode)
                    next_id += 1
                continue

        # --------------------------------------------------------------
        # Keep original if no strong split evidence
        # --------------------------------------------------------------
        out_bubbles[next_id] = bubbles[bid]
        out_boxes[next_id] = bubble_boxes[bid]
        out_quads[next_id] = bubble_quads[bid]
        out_indices[next_id] = idxs
        next_id += 1

    return out_bubbles, out_boxes, out_quads, out_indices

def split_box_by_internal_vertical_gaps(bid, bubble_indices, ocr, factor=1.45, min_gap=16):
    """
    Multi-cut vertical splitter.
    Splits one bubble into N vertical groups when there are multiple strong y-gaps.
    Good for 4+4 quad accidental merges.
    """
    idxs = bubble_indices.get(bid, [])
    if len(idxs) < 4:
        return None

    # Collect (index, bbox, center-y, height) and sort top->bottom.
    items = []
    for i in idxs:
        b = quad_bbox(ocr[i][0])
        cy = (b[1] + b[3]) / 2.0
        h = max(1, b[3] - b[1])
        items.append((i, b, cy, h))

    items.sort(key=lambda x: x[2])  # top->bottom
    med_h = float(np.median([x[3] for x in items])) if items else 12.0
    th = max(min_gap, med_h * factor)  # gap threshold in pixels

    # Collect cut points (positions where the y-gap between consecutive
    # quads exceeds the threshold)
    cut_positions = []
    prev_b = items[0][1]
    for k in range(1, len(items)):
        cur_b = items[k][1]
        gap = cur_b[1] - prev_b[3]
        if gap > th:
            cut_positions.append(k)
        prev_b = cur_b

    if not cut_positions:
        return None

    # Build groups using all cut positions
    groups = []
    start = 0
    for cp in cut_positions:
        groups.append([it[0] for it in items[start:cp]])
        start = cp
    groups.append([it[0] for it in items[start:]])

    # Remove empty groups
    groups = [g for g in groups if g]
    if len(groups) <= 1:
        return None

    # Sanity: each group should be meaningful (>=2 quads or >=12 chars)
    clean_groups = []
    for g in groups:
        txt = normalize_text(" ".join(build_lines_from_indices(g, ocr)))
        if len(g) >= 2 or len(txt) >= 12:
            clean_groups.append(g)

    if len(clean_groups) <= 1:
        return None

    return clean_groups

def force_split_bridged_boxes(bubbles, bubble_boxes, bubble_quads, bubble_indices, ocr, image_bgr):
    """
    Force-split boxes that accidentally contain multiple vertically separated speech chunks.

    Typical fixes:
    - one detected box actually contains 2 stacked bubbles
    - "4 quads + 4 quads" merged into one cluster
    - mixed contour membership inside one grouped box
    """
    if not bubble_boxes:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices

    bubble_contours = detect_speech_bubbles(image_bgr)

    def contour_id_for_idx(i):
        # Index of the first detected bubble contour containing this quad's
        # center, or -1 when none does.
        b = quad_bbox(ocr[i][0])
        cx = (b[0] + b[2]) / 2.0
        cy = (b[1] + b[3]) / 2.0
        for ci, c in enumerate(bubble_contours):
            if cv2.pointPolygonTest(c, (cx, cy), False) >= 0:
                return ci
        return -1

    def build_group_payload(g):
        # Materialize (lines, box, quads, indices) for one index group.
        g_sorted = sorted(g, key=lambda i: quad_center(ocr[i][0])[1])
        ub = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in g_sorted])
        return (
            build_lines_from_indices(g_sorted, ocr),  # lines
            ub,                                       # box
            [ocr[i][0] for i in g_sorted],            # quads
            g_sorted                                  # indices
        )

    new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {}
    next_bid = 1

    for bid in sorted(bubble_boxes.keys()):
        idxs = bubble_indices.get(bid, [])
        if len(idxs) < 2:
            # keep as-is
            new_bubbles[next_bid] = bubbles[bid]
            new_boxes[next_bid] = bubble_boxes[bid]
            new_quads[next_bid] = bubble_quads[bid]
            new_indices[next_bid] = bubble_indices[bid]
            next_bid += 1
            continue

        parts = None

        # ------------------------------------------------------------------
        # (A) Primary: internal vertical-gap multi-split
        # ------------------------------------------------------------------
        parts = split_box_by_internal_vertical_gaps(
            bid, bubble_indices, ocr, factor=1.45, min_gap=16
        )

        # ------------------------------------------------------------------
        # (B) Secondary: split by contour membership if clearly mixed
        # ------------------------------------------------------------------
        if parts is None and len(idxs) >= 3:
            by_contour = {}
            for i in idxs:
                cid = contour_id_for_idx(i)
                by_contour.setdefault(cid, []).append(i)

            contour_groups = [g for g in by_contour.values() if len(g) >= 1]
            if len(contour_groups) >= 2:
                # sort groups top->bottom for stable order
                contour_groups.sort(key=lambda g: min(quad_bbox(ocr[i][0])[1] for i in g))

                # sanity: avoid splitting tiny noise-only tails
                valid = []
                for g in contour_groups:
                    txt = normalize_text(" ".join(build_lines_from_indices(g, ocr)))
                    if len(g) >= 2 or len(txt) >= 10:
                        valid.append(g)

                if len(valid) >= 2:
                    parts = valid

        # ------------------------------------------------------------------
        # (C) Tertiary: balanced 2-block pattern (e.g., 4 quads + 4 quads)
        # ------------------------------------------------------------------
        if parts is None and len(idxs) >= 8:
            sorted_idxs = sorted(
                idxs,
                key=lambda i: (quad_bbox(ocr[i][0])[1] + quad_bbox(ocr[i][0])[3]) / 2.0
            )
            mid = len(sorted_idxs) // 2
            g1, g2 = sorted_idxs[:mid], sorted_idxs[mid:]

            b1 = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in g1])
            b2 = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in g2])

            if b1 and b2:
                vgap = max(0, b2[1] - b1[3])
                h1 = max(1, b1[3] - b1[1])
                h2 = max(1, b2[3] - b2[1])
                med_local_h = (h1 + h2) / 2.0

                h_ix = max(0, min(b1[2], b2[2]) - max(b1[0], b2[0]))
                min_w = max(1, min(b1[2] - b1[0], b2[2] - b2[0]))
                h_overlap_ratio = h_ix / min_w

                if vgap >= max(14, 0.22 * med_local_h) and h_overlap_ratio >= 0.30:
                    parts = [g1, g2]

        # ------------------------------------------------------------------
        # Commit split or keep original
        # ------------------------------------------------------------------
        if parts is None or len(parts) <= 1:
            new_bubbles[next_bid] = bubbles[bid]
            new_boxes[next_bid] = bubble_boxes[bid]
            new_quads[next_bid] = bubble_quads[bid]
            new_indices[next_bid] = bubble_indices[bid]
            next_bid += 1
            continue

        for g in parts:
            lines, box, quads, gidx = build_group_payload(g)
            new_bubbles[next_bid] = lines
            new_boxes[next_bid] = box
            new_quads[next_bid] = quads
            new_indices[next_bid] = gidx
            next_bid += 1

    return new_bubbles, new_boxes, 
new_quads, new_indices
# ============================================================
# translate_manga_text START
# ============================================================

def translate_manga_text(
    image_path="001-page.png",
    source_lang="en",
    target_lang="ca",
    confidence_threshold=0.03,
    min_text_length=1,
    gap_px="auto",
    quality_threshold=0.62,
    export_to_file="output.txt",
    export_bubbles_to="bubbles.json",
    reading_mode="ltr",
    debug=True,
    use_enhanced_ocr=True,
    strict_grouping=True,
    max_box_width_ratio=0.6,
    max_box_height_ratio=0.5,
    auto_fix_bubbles=True
):
    """
    End-to-end page pipeline: Apple Vision OCR -> filtering -> bubble
    grouping/splitting/merging -> per-bubble correction -> Google
    translation -> text report + bubbles.json export.

    Returns None; results are written to `export_to_file` and
    `export_bubbles_to` and progress is printed to stdout.
    """
    image = cv2.imread(image_path)
    if image is None:
        print(f"❌ Cannot load image: {image_path}")
        return

    resolved_gap = auto_gap(image_path) if gap_px == "auto" else float(gap_px)
    ih, iw = image.shape[:2]
    print("Loading OCR engines...")

    if use_enhanced_ocr:
        detector = ImprovedMacVisionDetector(source_lang=source_lang)
        print("🚀 Using Enhanced Multi-Pass OCR")
    else:
        detector = MacVisionDetector(source_lang=source_lang)

    print("Running detection OCR (Apple Vision)...")
    raw = detector.read(image_path)
    print(f"Raw detections: {len(raw)}")

    if use_enhanced_ocr:
        # Second pass: re-OCR small regions Vision may have missed, at 4x
        # upscale, then map quads back into page coordinates.
        existing_quads = [r[0] for r in raw]
        missed_regions = detect_small_text_regions(image, existing_quads)
        if missed_regions:
            print(f"🔍 Found {len(missed_regions)} potentially missed text regions")
            for region in missed_regions:
                rx1, ry1, rx2, ry2 = region
                pad = 10
                rx1, ry1 = max(0, rx1 - pad), max(0, ry1 - pad)
                rx2, ry2 = min(iw, rx2 + pad), min(ih, ry2 + pad)
                crop = image[ry1:ry2, rx1:rx2]
                if crop.size > 0:
                    upscaled = cv2.resize(
                        crop, None, fx=4.0, fy=4.0, interpolation=cv2.INTER_CUBIC
                    )
                    for quad, text, conf in detector.run_vision_ocr(upscaled):
                        # divide by the 4x upscale, then offset by crop origin
                        raw.append((
                            [[int(p[0] / 4.0 + rx1), int(p[1] / 4.0 + ry1)] for p in quad],
                            text,
                            conf
                        ))
            print(f"📝 Total detections after missed region scan: {len(raw)}")

    # ── Filtering ────────────────────────────────────────────────────────
    filtered, skipped = [], 0
    for bbox, text, conf in raw:
        t = normalize_text(text)
        qb = quad_bbox(bbox)

        if conf < confidence_threshold:
            skipped += 1
            continue
        if len(t) < min_text_length:
            skipped += 1
            continue
        if not is_valid_language(t, source_lang):
            skipped += 1
            continue
        if not is_meaningful_text(t, source_lang):
            skipped += 1
            continue
        # drop low-confidence long text in the top page band (likely header)
        if qb[1] < int(ih * TOP_BAND_RATIO) and conf < 0.70 and len(t) >= 5:
            skipped += 1
            continue

        filtered.append((bbox, t, conf))

    print(f"Kept: {len(filtered)} | Skipped: {skipped}")

    # Protect short dialogue token confidence
    tmp = []
    for bbox, t, conf in filtered:
        tmp.append((bbox, t, maybe_conf_floor_for_protected(t, conf, floor=0.40)))
    filtered = tmp

    # Rescue names/short tokens dropped by strict filters
    rescued = rescue_name_and_short_tokens(raw, min_conf=0.20)
    filtered = merge_rescued_items(filtered, rescued, iou_threshold=0.55)

    if not filtered:
        print("⚠️ No text after filtering.")
        return

    # ── Pre-grouping quad splits ─────────────────────────────────────────
    filtered, oversized_splits = validate_and_split_oversized_quads(image, filtered)
    if oversized_splits > 0:
        print(f"📐 Split {oversized_splits} oversized quad(s) before grouping")

    filtered, wide_splits = split_wide_ocr_items(image, filtered)
    if wide_splits > 0:
        print(f"✂️ Split {wide_splits} wide OCR lines across column gaps.")

    filtered, bridge_splits = split_abnormal_bridge_quads(image, filtered)
    if bridge_splits > 0:
        print(f"🧩 Split {bridge_splits} abnormal bridge OCR quad(s).")

    hs_pre = [max(1, quad_bbox(q)[3] - quad_bbox(q)[1]) for q, _, _ in filtered]
    med_h_pre = float(np.median(hs_pre)) if hs_pre else 14.0
    filtered, _ = apply_column_gap_splits(image, filtered, med_h_pre)

    filtered = normalize_ocr_quads(filtered)

    # ── Grouping ─────────────────────────────────────────────────────────
    print("📊 Grouping quads vertically...")
    bubbles, bubble_boxes, bubble_quads, bubble_indices = group_tokens_vertical(
        filtered, image.shape, gap_px=resolved_gap,
        bbox_padding=1, strict_mode=strict_grouping
    )
    print(f"   Created {len(bubbles)} initial bubble-group box(es)")

    print("🧱 Proposing region-first text containers...")
    region_lines, region_boxes, region_quads, region_indices = propose_text_regions_from_ocr(
        filtered, image.shape
    )
    print(f"   Proposed {len(region_lines)} region container(s)")

    # ── Auto-fix (split + merge) ─────────────────────────────────────────
    if auto_fix_bubbles:
        bubbles, bubble_boxes, bubble_quads, bubble_indices = auto_fix_bubble_detection(
            bubble_boxes, bubble_indices, bubble_quads, bubbles, filtered, image
        )

        bubbles, bubble_boxes, bubble_quads, bubble_indices = merge_micro_boxes_relaxed(
            bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered, image
        )

    # ── Enforce max box size ─────────────────────────────────────────────
    bubbles, bubble_boxes, bubble_quads, bubble_indices = enforce_max_box_size(
        bubble_boxes, bubble_indices, bubble_quads, bubbles, filtered,
        max_width_ratio=max_box_width_ratio,
        max_height_ratio=max_box_height_ratio,
        image_shape=image.shape
    )

    # ── Close-proximity merge ────────────────────────────────────────────
    bubbles, bubble_boxes, bubble_quads, bubble_indices = merge_close_bubbles_by_line_height(
        bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered
    )

    # ── Per-bubble split pass ────────────────────────────────────────────
    new_bubbles, new_bubble_boxes, new_bubble_quads, new_bubble_indices = {}, {}, {}, {}
    next_bid = max(bubbles.keys()) + 1 if bubbles else 1
    splits_performed = []

    for bid in list(bubbles.keys()):
        split_result, split_reason = _split_bubble_if_needed(
            bid, bubble_indices, bubble_quads, bubble_boxes, filtered, image, iw, ih
        )

        if split_result:
            p1, p2 = split_result
            splits_performed.append(f"BOX#{bid} ({split_reason})")
            # first part keeps the old id, second part gets a fresh id
            for part_idxs, part_bid in [(p1, bid), (p2, next_bid)]:
                ub = boxes_union_xyxy([quad_bbox(filtered[i][0]) for i in part_idxs])
                new_bubbles[part_bid] = build_final_box_text(
                    part_idxs, filtered, reading_mode=reading_mode
                )
                new_bubble_boxes[part_bid] = (
                    max(0, ub[0] - 2), max(0, ub[1] - 2),
                    min(iw - 1, ub[2] + 2), min(ih - 1, ub[3] + 2)
                )
                new_bubble_quads[part_bid] = [filtered[i][0] for i in part_idxs]
                new_bubble_indices[part_bid] = part_idxs
            next_bid += 1
        else:
            new_bubbles[bid] = build_final_box_text(
                bubble_indices[bid], filtered, reading_mode=reading_mode
            )
            new_bubble_boxes[bid] = bubble_boxes[bid]
            new_bubble_quads[bid] = bubble_quads[bid]
            new_bubble_indices[bid] = bubble_indices[bid]

    if splits_performed:
        print(f"\n🔀 Splits detected: {len(splits_performed)}")
        for s in splits_performed:
            print(f"   ✓ {s}")

    bubbles = new_bubbles
    bubble_boxes = new_bubble_boxes
    bubble_quads = new_bubble_quads
    bubble_indices = new_bubble_indices

    # ── Reattach orphan short tokens ─────────────────────────────────────
    bubbles, bubble_boxes, bubble_quads, bubble_indices = reattach_orphan_short_tokens(
        bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered
    )

    for bid in list(bubble_indices.keys()):
        bubbles[bid] = build_final_box_text(
            bubble_indices[bid], filtered, reading_mode=reading_mode
        )

    # ── Final reconciliation pass ────────────────────────────────────────
    bubbles, bubble_boxes, bubble_quads, bubble_indices = reconcile_final_boxes(
        bubbles,
        bubble_boxes,
        bubble_quads,
        bubble_indices,
        filtered,
        image_bgr=image,
        reading_mode=reading_mode
    )

    for bid in list(bubble_indices.keys()):
        bubbles[bid] = build_final_box_text(
            bubble_indices[bid], filtered, reading_mode=reading_mode
        )

    bubbles, bubble_boxes, bubble_quads, bubble_indices = force_split_bridged_boxes(
        bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered, image
    )

    for bid in list(bubble_indices.keys()):
        bubbles[bid] = build_final_box_text(
            bubble_indices[bid], filtered, reading_mode=reading_mode
        )

    # NOTE(review): reconcile_final_boxes runs a second time after the
    # forced split pass so newly split boxes can re-merge if over-split.
    bubbles, bubble_boxes, bubble_quads, bubble_indices = reconcile_final_boxes(
        bubbles,
        bubble_boxes,
        bubble_quads,
        bubble_indices,
        filtered,
        image_bgr=image,
        reading_mode=reading_mode
    )

    for bid in list(bubble_indices.keys()):
        bubbles[bid] = build_final_box_text(
            bubble_indices[bid], filtered, reading_mode=reading_mode
        )

    # ── Reconcile bubble-first and region-first views ────────────────────
    bubbles, bubble_boxes, bubble_quads, bubble_indices = reconcile_region_and_bubble_groups(
        region_lines, region_boxes, region_quads, region_indices,
        bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered
    )

    for bid in list(bubble_indices.keys()):
        bubbles[bid] = build_final_box_text(
            bubble_indices[bid], filtered, reading_mode=reading_mode
        )

    # ── Split boxes by internal vertical groups ──────────────────────────
    bubbles, bubble_boxes, bubble_quads, bubble_indices = split_boxes_by_internal_vertical_groups(
        bubbles,
        bubble_boxes,
        bubble_quads,
        bubble_indices,
        filtered,
        image.shape,
        reading_mode=reading_mode
    )

    for bid in list(bubble_indices.keys()):
        bubbles[bid] = build_final_box_text(
            bubble_indices[bid], filtered, reading_mode=reading_mode
        )

    print(f"✅ Final box count: {len(bubbles)}")

    # ── OCR quality pass ─────────────────────────────────────────────────
    translator = GoogleTranslator(source=source_lang, target=target_lang)
    clean_lines: Dict[int, str] = {}
    raw_lines: Dict[int, str] = {}
    corrected_lines: Dict[int, str] = {}
    sources_used: Dict[int, str] = {}
    translations: Dict[int, str] = {}
    region_types: Dict[int, str] = {}
    region_confidences: Dict[int, float] = {}
    region_flags: Dict[int, List[str]] = {}
    bubble_group_texts: Dict[int, List[str]] = {}

    for bid in sorted(bubble_boxes.keys()):
        final_lines = build_final_box_text(
            bubble_indices[bid], filtered, reading_mode=reading_mode
        )
        bubbles[bid] = final_lines

        # NEW: segmented phrase groups for translation
        group_texts = segment_box_into_phrases(
            bubble_indices[bid], filtered, reading_mode=reading_mode
        )
        bubble_group_texts[bid] = group_texts

        base_txt = normalize_text(" ".join(final_lines))
        raw_lines[bid] = base_txt
        base_sc = ocr_candidate_score(base_txt)
        txt, src_used = base_txt, "vision-base"

        # Low-quality text: re-read the bubble crop with Vision at 3x and
        # keep the re-read only if it scores meaningfully better.
        if base_sc < quality_threshold:
            rr_txt, rr_sc, rr_src = reread_bubble_with_vision(
                image, bubble_boxes[bid], detector, upscale=3.0, pad=24
            )
            if rr_txt and rr_sc > base_sc + 0.04 and is_valid_language(rr_txt, source_lang):
                txt, src_used = rr_txt, rr_src

        tmp_lines = [txt] if txt else final_lines
        region_type = classify_region_type(image, bubble_boxes[bid], tmp_lines)
        corrected_txt, correction_gain = correct_region_text(txt, region_type=region_type)
        conf = compute_region_confidence(txt, corrected_txt, bubble_boxes[bid], region_type, image)
        flags = build_region_flags(txt, corrected_txt, region_type, conf)

        if len([g for g in group_texts if g.strip()]) >= 2:
            flags.append("BUBBLE")
            flags.append("SEGMENTED")

        clean_lines[bid] = normalize_text(corrected_txt)
        corrected_lines[bid] = normalize_text(corrected_txt)
        sources_used[bid] = src_used
        region_types[bid] = region_type
        region_confidences[bid] = conf
        region_flags[bid] = sorted(set(flags))

    reading_map = estimate_reading_order(bubble_boxes, mode=reading_mode)

    # ── Translation ──────────────────────────────────────────────────────
    for bid in sorted(clean_lines.keys(), key=lambda x: reading_map.get(x, x)):
        group_texts = [g for g in bubble_group_texts.get(bid, []) if g.strip()]

        if len(group_texts) >= 2:
            src_txt = " ".join(group_texts).strip()
        else:
            src_txt = clean_lines[bid].strip()

        if not src_txt:
            continue
        if not is_valid_language(src_txt, source_lang):
            continue
        if not is_meaningful_text(src_txt, source_lang):
            continue

        try:
            if len(group_texts) >= 2:
                # translate each phrase group independently, join with " || "
                translated_groups = []
                for g in group_texts:
                    if not is_valid_language(g, source_lang):
                        continue
                    if not is_meaningful_text(g, source_lang):
                        continue
                    tg = translator.translate(g) or ""
                    tg = postprocess_translation_general(tg).upper()
                    if tg:
                        translated_groups.append(tg)
                tgt = " || ".join(translated_groups)
            else:
                tgt = translator.translate(src_txt) or ""
                tgt = postprocess_translation_general(tgt).upper()
        except Exception as e:
            # network/translator failure: embed the error in place of text
            tgt = f"[Error: {e}]"

        translations[bid] = tgt

    if debug:
        save_debug_clusters(
            image_path, filtered, bubble_boxes, bubble_indices,
            clean_lines, "debug_clusters.png", region_types=region_types
        )

    # ── Text output ──────────────────────────────────────────────────────
    divider = "─" * 140
    out_lines = [
        "BUBBLE|ORDER|TYPE|CONF|OCR_SOURCE|ORIGINAL|CORRECTED|BUBBLE_GROUPS|TRANSLATED|FLAGS",
        divider
    ]

    print(
        divider +
        f"\n{'BUBBLE':<8} {'ORDER':<6} {'TYPE':<10} {'CONF':<6} {'SOURCE':<12} "
        f"{'CORRECTED':<30} {'BUBBLE_GROUPS':<40} {'TRANSLATED':<30} FLAGS\n" +
        divider
    )

    translated_count = 0
    for bid in sorted(clean_lines.keys(), key=lambda x: reading_map.get(x, x)):
        src_txt = clean_lines[bid].strip()
        if not src_txt:
            continue
        if not is_valid_language(src_txt, source_lang):
            continue
        if not is_meaningful_text(src_txt, source_lang):
            continue

        flags = list(region_flags.get(bid, []))
        tgt = translations.get(bid, "")
        if not tgt:
            flags.append("NO_TRANSLATION")

        src_engine = sources_used.get(bid, "unknown")
        rtype = region_types.get(bid, "unknown")
        rconf = region_confidences.get(bid, 0.0)
        raw_u = raw_lines.get(bid, "").upper()
        corr_u = corrected_lines.get(bid, "").upper()
        group_blob = " || ".join(bubble_group_texts.get(bid, [])).upper()

        out_lines.append(
            f"#{bid}|{reading_map.get(bid, bid)}|{rtype}|{rconf:.2f}|{src_engine}|"
            f"{raw_u}|{corr_u}|{group_blob}|{tgt}|{','.join(flags) if flags else '-'}"
        )

        print(
            f"#{bid:<7} {reading_map.get(bid,bid):<6} {rtype:<10} {rconf:<6.2f} {src_engine:<12} "
            f"{corr_u[:30]:<30} {group_blob[:40]:<40} {tgt[:30]:<30} "
            f"{','.join(flags) if flags else '-'}"
        )

        translated_count += 1

    out_lines.append(divider + f"\n✅ Done! {translated_count} bubble(s) translated.")
    with open(export_to_file, "w", encoding="utf-8") as f:
        f.write("\n".join(out_lines))

    # ── bubbles.json ─────────────────────────────────────────────────────
    bubbles_payload = {}
    for bid in sorted(clean_lines.keys(), key=lambda x: reading_map.get(x, x)):
        src_txt = clean_lines[bid].strip()
        if not src_txt:
            continue
        if not is_valid_language(src_txt, source_lang):
            continue
        if not is_meaningful_text(src_txt, source_lang):
            continue

        box = bubble_boxes.get(bid)
        tgt = translations.get(bid, "")
        bubbles_payload[str(bid)] = {
            "order": reading_map.get(bid, bid),
            "region_type": region_types.get(bid, "unknown"),
            "confidence": round(region_confidences.get(bid, 0.0), 4),
            "ocr_source": sources_used.get(bid, "unknown"),
            "raw_ocr": raw_lines.get(bid, "").upper(),
            "corrected_ocr": corrected_lines.get(bid, "").upper(),
            "translation_input": src_txt.upper(),
            "translated": tgt,
            "flags": region_flags.get(bid, []),
            "bubble_groups": [g.upper() for g in bubble_group_texts.get(bid, [])],
            "box": {
                "x": box[0] if box else 0,
                "y": box[1] if box else 0,
                "w": (box[2] - box[0]) if box else 0,
                "h": (box[3] - box[1]) if box else 0,
            },
            "lines": [line.upper() for line in bubbles.get(bid, [])],
        }

    with open(export_bubbles_to, "w", encoding="utf-8") as f:
        json.dump(bubbles_payload, f, ensure_ascii=False, indent=2)

    print(divider + f"\nSaved: {export_to_file}\nSaved: {export_bubbles_to}")


# ============================================================
# translate_manga_text END
# ============================================================

# ============================================================
# ENTRY POINT
#
============================================================ -if __name__ == "__main__": - translate_manga_text( - image_path="19.png", - source_lang="english", - target_lang="ca", - confidence_threshold=0.03, - min_text_length=1, - gap_px="auto", - quality_threshold=0.62, - export_to_file="output.txt", - export_bubbles_to="bubbles.json", - reading_mode="ltr", - debug=True, - use_enhanced_ocr=True, - strict_grouping=True, - max_box_width_ratio=0.6, - max_box_height_ratio=0.5, - auto_fix_bubbles=True - ) \ No newline at end of file diff --git a/analyze_box5.py b/older-code/analyze_box5.py similarity index 100% rename from analyze_box5.py rename to older-code/analyze_box5.py diff --git a/analyze_box7_split.py b/older-code/analyze_box7_split.py similarity index 100% rename from analyze_box7_split.py rename to older-code/analyze_box7_split.py diff --git a/analyze_grouping.py b/older-code/analyze_grouping.py similarity index 100% rename from analyze_grouping.py rename to older-code/analyze_grouping.py diff --git a/check_box7.py b/older-code/check_box7.py similarity index 100% rename from check_box7.py rename to older-code/check_box7.py diff --git a/check_grouping_logic.py b/older-code/check_grouping_logic.py similarity index 100% rename from check_grouping_logic.py rename to older-code/check_grouping_logic.py diff --git a/debug_split_phase.py b/older-code/debug_split_phase.py similarity index 100% rename from debug_split_phase.py rename to older-code/debug_split_phase.py diff --git a/patch_manga_translator.py b/older-code/patch_manga_translator.py similarity index 100% rename from patch_manga_translator.py rename to older-code/patch_manga_translator.py diff --git a/regenerate_debug.py b/older-code/regenerate_debug.py similarity index 100% rename from regenerate_debug.py rename to older-code/regenerate_debug.py diff --git a/split_bubbles.py b/older-code/split_bubbles.py similarity index 100% rename from split_bubbles.py rename to older-code/split_bubbles.py diff --git 
a/split_final.py b/older-code/split_final.py similarity index 100% rename from split_final.py rename to older-code/split_final.py diff --git a/test_panel_split.py b/older-code/test_panel_split.py similarity index 100% rename from test_panel_split.py rename to older-code/test_panel_split.py