TOP_BAND_RATIO = 0.08

# ============================================================
# REGION-FIRST LAYOUT HELPERS
# ============================================================
import math
from difflib import SequenceMatcher

# Function words that strongly suggest normal dialogue; SFX lettering
# almost never contains these, so they separate speech from onomatopoeia.
DIALOGUE_STOPWORDS = {
    "I", "YOU", "HE", "SHE", "WE", "THEY", "IT", "ME", "MY", "YOUR", "OUR",
    "IS", "ARE", "WAS", "WERE", "AM", "DO", "DID", "DON'T", "DIDN'T", "NOT",
    "WHAT", "WHY", "HOW", "WHO", "IN", "ON", "AT", "TO", "OF", "FOR", "WITH",
    "AND", "BUT", "SO", "THAT", "THIS", "THERE", "HERE", "THAN", "ALL", "RIGHT"
}

# Onomatopoeia commonly drawn as sound-effect lettering.
SFX_HINTS = {
    "RRRING", "RING", "RINGG", "BAM", "BOOM", "FWUP", "FWOOP", "FSHOO",
    "GRRP", "GASP", "THUD", "SMACK", "WHAM", "SLAM", "SNIF", "SNIFF"
}

# Short exclamations typically set outside balloons.
REACTION_HINTS = {
    "HUH", "HUH?!", "HUH?", "OH", "AH", "EH", "TCH", "HEY", "WHAT?!", "NO!", "YES!"
}

# Phrases that usually open narration captions.
NARRATION_HINTS = {
    "AND SO", "MEANWHILE", "LATER", "THEN", "TO BE CONTINUED"
}

def xyxy_width(b):
    """Width of an (x1, y1, x2, y2) box, clamped to >= 1 to avoid div-by-zero."""
    return max(1, b[2] - b[0])

def xyxy_height(b):
    """Height of an (x1, y1, x2, y2) box, clamped to >= 1 to avoid div-by-zero."""
    return max(1, b[3] - b[1])

def xyxy_center(b):
    """Center (cx, cy) of an (x1, y1, x2, y2) box as floats."""
    return ((b[0] + b[2]) / 2.0, (b[1] + b[3]) / 2.0)

def box_distance(a, b):
    """Euclidean distance between the centers of two xyxy boxes."""
    ax, ay = xyxy_center(a)
    bx, by = xyxy_center(b)
    return math.hypot(ax - bx, ay - by)

def horizontal_overlap_ratio(a, b):
    """X-axis overlap of two xyxy boxes, normalized by the narrower box's width.

    Returns a value in [0, 1]; 0 means no horizontal intersection.
    """
    ix1, ix2 = max(a[0], b[0]), min(a[2], b[2])
    ov = max(0, ix2 - ix1)
    return ov / max(1, min(xyxy_width(a), xyxy_width(b)))

def vertical_overlap_ratio(a, b):
    """Y-axis overlap of two xyxy boxes, normalized by the shorter box's height."""
    iy1, iy2 = max(a[1], b[1]), min(a[3], b[3])
    ov = max(0, iy2 - iy1)
    return ov / max(1, min(xyxy_height(a), xyxy_height(b)))

def box_expand(b, pad, iw, ih):
    """Grow box ``b`` by ``pad`` pixels on every side, clamped to the image.

    ``iw``/``ih`` are the image width/height; the result stays inside
    [0, iw-1] x [0, ih-1] and is returned as an int tuple.
    """
    return (
        max(0, int(b[0] - pad)),
        max(0, int(b[1] - pad)),
        min(iw - 1, int(b[2] + pad)),
        min(ih - 1, int(b[3] + pad)),
    )

def count_alpha(text):
    """Count Latin letters (incl. common accented ranges) in ``text``.

    NOTE(review): the character classes À-Ý / à-ÿ also match × and ÷ —
    harmless for typical OCR text, but not strictly letters.
    """
    return len(re.findall(r"[A-ZÀ-Ýa-zà-ÿ]", text or ""))

def uppercase_ratio(text):
    """Fraction of alphabetic characters in ``text`` that are uppercase.

    Returns 0.0 when the text has no alphabetic characters.
    """
    alpha = re.findall(r"[A-Za-zÀ-ÿ]", text or "")
    if not alpha:
        return 0.0
    ups = sum(1 for c in alpha if c.isupper())
    return ups / len(alpha)

def punctuation_ratio(text):
    """Fraction of characters in ``text`` that are common punctuation marks."""
    if not text:
        return 0.0
    return len(re.findall(r"[!?.,'\"-]", text)) / max(1, len(text))

def stopword_ratio(text):
    """Fraction of uppercase tokens that are dialogue stopwords (0.0 if none)."""
    toks = re.findall(r"[A-Z']+", normalize_text(text or ""))
    if not toks:
        return 0.0
    hits = sum(1 for t in toks if t in DIALOGUE_STOPWORDS)
    return hits / len(toks)

def looks_like_sfx_text(text):
    """Heuristic: does ``text`` look like sound-effect lettering?

    Matches known SFX hints, or all-caps runs (>= 3 letters, > 90% upper)
    that contain almost no dialogue stopwords.
    """
    t = normalize_text(text or "")
    if not t:
        return False
    alpha = re.sub(r"[^A-Z]", "", t)
    if t in SFX_HINTS or alpha in SFX_HINTS:
        return True
    if len(alpha) >= 3 and uppercase_ratio(t) > 0.90 and stopword_ratio(t) < 0.15:
        if alpha not in DIALOGUE_STOPWORDS:
            return True
    return False

def looks_like_reaction_text(text):
    """Heuristic: does ``text`` look like a short exclamation ("HUH?!", "HEY")?

    Known reaction hints match directly; otherwise very short (<= 5 letters)
    punctuation-heavy text is treated as a reaction.
    """
    t = normalize_text(text or "")
    alpha = re.sub(r"[^A-Z?!]", "", t)
    if t in REACTION_HINTS or alpha in REACTION_HINTS:
        return True
    if len(re.sub(r"[^A-Z]", "", t)) <= 5 and punctuation_ratio(t) > 0.10:
        return True
    return False

def looks_like_narration_text(text):
    """Heuristic: does ``text`` look like a narration caption?

    Triggers on known narration openers, or on longer (>= 5 word)
    period-terminated, mostly-uppercase sentences.
    """
    t = normalize_text(text or "")
    if any(t.startswith(h) for h in NARRATION_HINTS):
        return True
    if len(t.split()) >= 5 and t.endswith(".") and uppercase_ratio(t) > 0.75:
        return True
    return False

def contour_features_for_box(image_bgr, box_xyxy):
    """Compute cheap visual features for a box crop of a BGR image.

    Returns a dict with:
      - mean_brightness: mean gray level scaled to [0, 1]
      - edge_density:    fraction of Canny edge pixels
      - whiteness_ratio: fraction of near-white pixels (gray > 220)

    An empty crop yields pessimistic defaults (dark, fully edgy, no white)
    so downstream balloon heuristics reject it.
    """
    x1, y1, x2, y2 = box_xyxy
    crop = image_bgr[y1:y2, x1:x2]
    if crop.size == 0:
        return {
            "mean_brightness": 0.0,
            "edge_density": 1.0,
            "whiteness_ratio": 0.0,
        }

    gray = cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY)
    mean_brightness = float(np.mean(gray)) / 255.0

    edges = cv2.Canny(gray, 50, 150)
    edge_density = float(np.mean(edges > 0))

    whiteness_ratio = float(np.mean(gray > 220))
    return {
        "mean_brightness": mean_brightness,
        "edge_density": edge_density,
        "whiteness_ratio": whiteness_ratio,
    }

def classify_region_type(image_bgr, box_xyxy, lines):
    """Classify one text region as sfx / reaction / narration / dialogue / unknown.

    Combines text heuristics (hint sets, case/stopword ratios) with visual
    balloon cues (bright, low-edge interior) from the image crop.
    """
    text = normalize_text(" ".join(lines))
    feats = contour_features_for_box(image_bgr, box_xyxy)
    w, h = xyxy_width(box_xyxy), xyxy_height(box_xyxy)
    ar = w / max(1, h)

    if looks_like_sfx_text(text):
        return "sfx"

    # Reaction only when it is also very short; longer reaction-like text
    # falls through to the remaining checks.
    if looks_like_reaction_text(text):
        if len(text.split()) <= 3:
            return "reaction"

    if looks_like_narration_text(text):
        return "narration"

    # Balloon/dialogue heuristic: bright interior + low-ish edge density.
    if feats["whiteness_ratio"] > 0.45 and feats["edge_density"] < 0.18:
        return "dialogue"

    # Narrow, tall, shouty all-caps text with few stopwords: treat as SFX.
    if ar < 0.9 and uppercase_ratio(text) > 0.85 and stopword_ratio(text) < 0.20:
        return "sfx"

    if stopword_ratio(text) >= 0.20:
        return "dialogue"

    return "unknown"

def text_similarity(a, b):
    """Normalized similarity in [0, 1] between two texts (difflib ratio)."""
    return SequenceMatcher(None, normalize_text(a or ""), normalize_text(b or "")).ratio()

def dedupe_repeated_phrase(text):
    """Collapse OCR phrase duplication: "I DIDN'T I DIDN'T" -> "I DIDN'T".

    Texts shorter than 4 words are returned unchanged (short repeats like
    "NO NO" may be intentional). Also drops immediately repeated tokens
    longer than 2 letters.
    """
    t = normalize_text(text or "")
    words = t.split()
    if len(words) < 4:
        return t

    # Remove immediate duplicated halves: "CRY! CRY!" / "I DIDN'T I DIDN'T"
    half = len(words) // 2
    if len(words) % 2 == 0 and words[:half] == words[half:]:
        return " ".join(words[:half])

    # Collapse trailing duplicate tokens
    cleaned = []
    for w in words:
        if cleaned and cleaned[-1] == w and len(w) > 2:
            continue
        cleaned.append(w)
    return " ".join(cleaned)

def dehyphenate_linebreak_artifacts(text):
    """Rejoin uppercase words split by OCR line-wrap hyphens: "AB- CD" -> "ABCD"."""
    t = normalize_text(text or "")
    t = re.sub(r"\b([A-Z]+)- ([A-Z]+)\b", r"\1\2", t)
    return t

def fix_common_dialogue_ocr(text):
    """
    Conservative OCR cleanup for dialogue-like text.

    Goals:
    - fix common OCR punctuation/spacing/apostrophe errors
    - preserve meaning and tone
    - avoid semantic reconstruction guesses

    Fixes relative to the earlier revision:
    - dropped "ILL " -> "I'LL ": it corrupted the legitimate word "ILL"
      (e.g. "HE IS ILL TODAY"); the OCR-split case is already covered by
      the "I LL" -> "I'LL" entry.
    - dropped the dead "1'M " entry: "1'M" is replaced first (dict
      insertion order), so the spaced variant could never match.
    """
    t = normalize_text(text or "")
    if not t:
        return t

    replacements = {
        "1'M": "I'M",
        "1 DIDN'T": "I DIDN'T",
        "1 HATE": "I HATE",
        "1 WAS": "I WAS",
        "YO U": "YOU",
        "YOU RE": "YOU'RE",
        "YOURE": "YOU'RE",
        "I LL": "I'LL",
        "DONT": "DON'T",
        "DIDNT": "DIDN'T",
        "CANT": "CAN'T",
        "WONT": "WON'T",
        "THATS": "THAT'S",
        "MOMS": "MOM'S",
        "DADS": "DAD'S",
        "LEARN- ING": "LEARNING",
        "COV- ERED": "COVERED",
        "SY ON": "SY-ON",
        "P PROPERLY": "P-PROPERLY",
        "SH SHUT": "SH- SHUT",
    }

    for a, b in replacements.items():
        t = t.replace(a, b)

    # Fix split contractions / apostrophe omissions
    t = re.sub(r"\b([A-Z]+) NT\b", r"\1N'T", t)
    t = re.sub(r"\b([A-Z]+) RE\b", r"\1'RE", t)
    t = re.sub(r"\b([A-Z]+) VE\b", r"\1'VE", t)
    t = re.sub(r"\b([A-Z]+) LL\b", r"\1'LL", t)
    t = re.sub(r"\b([A-Z]+) S\b", r"\1'S", t)

    # Remove accidental duplicated punctuation spacing
    t = re.sub(r"\s+([,.;:!?])", r"\1", t)

    # Dehyphenate OCR line-wrap artifacts
    t = dehyphenate_linebreak_artifacts(t)

    # Collapse repeated full phrases/tokens caused by OCR duplication
    t = dedupe_repeated_phrase(t)

    # Remove duplicated adjacent words like "CRY CRY" if clearly accidental
    words = t.split()
    cleaned = []
    for w in words:
        if cleaned and cleaned[-1] == w and len(re.sub(r"[^A-Z]", "", w)) > 2:
            continue
        cleaned.append(w)
    t = " ".join(cleaned)

    # Normalize spaces
    t = re.sub(r"\s{2,}", " ", t).strip()

    return t

def region_text_role_hint(text):
    """Text-only role guess: sfx / reaction / narration, defaulting to dialogue."""
    if looks_like_sfx_text(text):
        return "sfx"
    if looks_like_reaction_text(text):
        return "reaction"
    if looks_like_narration_text(text):
        return "narration"
    return "dialogue"


def correct_region_text(text, region_type="dialogue"):
    """Apply role-appropriate OCR correction to ``text``.

    Dialogue-like roles get the full conservative cleanup; SFX only gets
    duplicate-phrase collapsing. Returns (corrected_text, correction_gain)
    where the gain is the non-negative improvement in ocr_candidate_score.
    """
    t = normalize_text(text or "")
    if not t:
        return t, 0.0

    original = t

    if region_type in {"dialogue", "reaction", "narration"}:
        t = fix_common_dialogue_ocr(t)
    elif region_type == "sfx":
        t = dedupe_repeated_phrase(t)

    score_before = ocr_candidate_score(original)
    score_after = ocr_candidate_score(t)

    correction_gain = max(0.0, score_after - score_before)
    return t, correction_gain

def compute_region_confidence(raw_text, corrected_text, box_xyxy, region_type, image_bgr):
    """Blend text plausibility, visual balloon cues, correction gain and a
    role bonus into one confidence score clamped to [0, 1]."""
    feats = contour_features_for_box(image_bgr, box_xyxy)
    text_score = ocr_candidate_score(corrected_text)
    gain = max(0.0, text_score - ocr_candidate_score(raw_text))
    role_bonus = 0.08 if region_type in {"dialogue", "reaction", "narration", "sfx"} else 0.0

    score = (
        0.55 * text_score
        + 0.15 * feats["whiteness_ratio"]
        + 0.10 * (1.0 - min(1.0, feats["edge_density"] * 2.0))
        + 0.10 * gain
        + role_bonus
    )
    return max(0.0, min(1.0, score))

def build_region_flags(raw_text, corrected_text, region_type, conf):
    """Produce reviewer-facing warning flags for one corrected region."""
    flags = []
    if region_type == "unknown":
        flags.append("REGION_UNKNOWN")
    if region_type == "sfx":
        flags.append("SFX")
    if conf < 0.45:
        flags.append("LOW_CONF")
    if text_similarity(raw_text, corrected_text) < 0.75:
        flags.append("HEAVY_CORRECTION")
    if len(corrected_text.split()) > 22:
        flags.append("LONG_TEXT")
    return flags

# ============================================================
# HELPERS
# ============================================================
def group_indices_into_vertical_columns(indices, ocr,
                                        x_tolerance_factor=1.4,
                                        min_vertical_span_factor=1.8):
    """
    Group OCR indices into vertical columns inside a box.

    Columns are formed by clustering quads with similar x-centers
    (tolerance scales with the median quad width), each column is
    ordered top-to-bottom, and columns are returned left-to-right.

    ``min_vertical_span_factor`` is kept for backward compatibility but is
    currently unused: the previous revision computed a vertical-span filter
    and then appended every column in both branches of the check (dead
    code). All columns — including single-member ones — are returned.
    Returns [] for empty ``indices``.
    """
    if not indices:
        return []

    items = []
    for i in indices:
        b = quad_bbox(ocr[i][0])
        cx = (b[0] + b[2]) / 2.0
        cy = (b[1] + b[3]) / 2.0
        w = max(1, b[2] - b[0])
        h = max(1, b[3] - b[1])
        items.append((i, b, cx, cy, w, h))

    med_w = float(np.median([it[4] for it in items])) if items else 12.0
    x_tol = max(10.0, med_w * x_tolerance_factor)

    # Greedy clustering by x-center; the cluster center is re-averaged
    # after every insertion.
    items_sorted = sorted(items, key=lambda x: x[2])
    columns = []

    for it in items_sorted:
        placed = False
        for col in columns:
            if abs(it[2] - col["xc"]) <= x_tol:
                col["members"].append(it)
                col["xc"] = float(np.mean([m[2] for m in col["members"]]))
                placed = True
                break
        if not placed:
            columns.append({"xc": it[2], "members": [it]})

    # Order members top -> bottom within each column.
    clean_columns = []
    for col in columns:
        members = sorted(col["members"], key=lambda x: x[3])
        clean_columns.append([m[0] for m in members])

    # Order columns left -> right by mean x-center.
    clean_columns.sort(key=lambda grp: np.mean([(quad_bbox(ocr[i][0])[0] + quad_bbox(ocr[i][0])[2]) / 2.0 for i in grp]))
    return clean_columns

def group_indices_into_horizontal_rows(indices, ocr, row_tol_factor=0.75):
    """
    Group OCR indices into horizontal rows inside a box.

    Quads are clustered by y-center (tolerance scales with the median quad
    height); rows keep top-to-bottom order and members are sorted
    left-to-right. Returns [] for empty ``indices``.
    """
    if not indices:
        return []

    items = []
    for i in indices:
        b = quad_bbox(ocr[i][0])
        cx = (b[0] + b[2]) / 2.0
        cy = (b[1] + b[3]) / 2.0
        h = max(1, b[3] - b[1])
        items.append((i, b, cx, cy, h))

    med_h = float(np.median([it[4] for it in items])) if items else 10.0
    row_tol = max(6.0, med_h * row_tol_factor)

    items.sort(key=lambda x: x[3])
    rows = []

    for it in items:
        placed = False
        for row in rows:
            if abs(it[3] - row["yc"]) <= row_tol:
                row["members"].append(it)
                row["yc"] = float(np.mean([m[3] for m in row["members"]]))
                placed = True
                break
        if not placed:
            rows.append({"yc": it[3], "members": [it]})

    groups = []
    for row in rows:
        members = sorted(row["members"], key=lambda x: x[2])
        groups.append([m[0] for m in members])

    return groups

def score_text_groups(groups, ocr):
    """
    Score grouping quality based on:
    - average per-group text plausibility (ocr_candidate_score)
    - average group word length (capped bonus)
    - a fragmentation penalty for more than 4 groups

    Returns 0.0 for empty/all-blank groupings.
    """
    if not groups:
        return 0.0

    texts = []
    lengths = []

    for grp in groups:
        parts = []
        for i in grp:
            t = normalize_text(ocr[i][1])
            if t:
                parts.append(t)
        txt = normalize_text(" ".join(parts))
        if txt:
            texts.append(txt)
            lengths.append(len(txt.split()))

    if not texts:
        return 0.0

    text_scores = [ocr_candidate_score(t) for t in texts]
    avg_text_score = float(np.mean(text_scores)) if text_scores else 0.0
    avg_len = float(np.mean(lengths)) if lengths else 0.0
    fragmentation_penalty = max(0.0, len(groups) - 4) * 0.08

    return avg_text_score + min(0.5, avg_len * 0.05) - fragmentation_penalty

def detect_internal_text_layout(indices, ocr, reading_mode="ltr"):
    """
    Detect internal structure of text inside one final box.

    Step 1: split into vertical macro blocks (strong vertical gaps).
    Step 2: per block, score horizontal-row vs vertical-column grouping and
            keep the better one; vertical wins ties within 0.03 when it
            produced at least two columns.

    Returns {"mode": "horizontal", "blocks": []} for empty input, otherwise
    {"mode": "block-mixed", "blocks": [{"mode": ..., "groups": ...}, ...]}.
    """
    if not indices:
        return {"mode": "horizontal", "blocks": []}

    blocks = split_indices_into_vertical_blocks(indices, ocr)

    resolved_blocks = []

    for block in blocks:
        horizontal_groups = group_indices_into_horizontal_rows(block, ocr)
        vertical_groups = group_indices_into_vertical_columns(block, ocr)

        h_score = score_text_groups(horizontal_groups, ocr)
        v_score = score_text_groups(vertical_groups, ocr)

        if len(vertical_groups) >= 2 and v_score >= h_score - 0.03:
            resolved_blocks.append({
                "mode": "vertical",
                "groups": vertical_groups
            })
        else:
            resolved_blocks.append({
                "mode": "horizontal",
                "groups": horizontal_groups
            })

    return {"mode": "block-mixed", "blocks": resolved_blocks}


def build_text_from_layout(indices, ocr, reading_mode="ltr"):
    """Flatten the detected internal layout into ordered text lines.

    Horizontal blocks emit one line per row (already left-to-right);
    vertical blocks order columns by mean x-center (reversed for "rtl")
    and emit one line per column, reading each column top-to-bottom.
    Blank fragments are dropped.
    """
    layout = detect_internal_text_layout(indices, ocr, reading_mode=reading_mode)
    output_lines = []

    for block in layout["blocks"]:
        groups = block["groups"]
        mode = block["mode"]

        if mode == "horizontal":
            for grp in groups:
                line = normalize_text(" ".join(
                    ocr[i][1] for i in grp if normalize_text(ocr[i][1])
                ))
                if line:
                    output_lines.append(line)

        elif mode == "vertical":
            if reading_mode == "rtl":
                groups = sorted(
                    groups,
                    key=lambda grp: np.mean([(quad_bbox(ocr[i][0])[0] + quad_bbox(ocr[i][0])[2]) / 2.0 for i in grp]),
                    reverse=True
                )
            else:
                groups = sorted(
                    groups,
                    key=lambda grp: np.mean([(quad_bbox(ocr[i][0])[0] + quad_bbox(ocr[i][0])[2]) / 2.0 for i in grp])
                )

            for grp in groups:
                grp_sorted = sorted(grp, key=lambda i: (quad_bbox(ocr[i][0])[1] + quad_bbox(ocr[i][0])[3]) / 2.0)
                line = normalize_text(" ".join(
                    ocr[i][1] for i in grp_sorted if normalize_text(ocr[i][1])
                ))
                if line:
                    output_lines.append(line)

    return output_lines
OCR GEOMETRY +# ============================================================ +def propose_text_regions_from_ocr(ocr, image_shape): + """ + Build larger text containers from OCR boxes before final classification. + This is intentionally conservative: it clusters nearby OCR groups that + likely belong to one dialogue/narration region. + """ + ih, iw = image_shape[:2] + if not ocr: + return {}, {}, {}, {} + + boxes = [quad_bbox(x[0]) for x in ocr] + hs = [max(1, b[3] - b[1]) for b in boxes] + med_h = float(np.median(hs)) if hs else 14.0 + + parent = list(range(len(ocr))) + + def find(x): + while parent[x] != x: + parent[x] = parent[parent[x]] + x = parent[x] + return x + + def union(a, b): + ra, rb = find(a), find(b) + if ra != rb: + parent[rb] = ra + + for i in range(len(ocr)): + bi = boxes[i] + for j in range(i + 1, len(ocr)): + bj = boxes[j] + + dx = abs(xyxy_center(bi)[0] - xyxy_center(bj)[0]) + dy = abs(xyxy_center(bi)[1] - xyxy_center(bj)[1]) + + hov = horizontal_overlap_ratio(bi, bj) + vov = vertical_overlap_ratio(bi, bj) + dist = box_distance(bi, bj) + + same_band = dy <= med_h * 2.2 + stacked = hov >= 0.35 and dy <= med_h * 3.2 + same_line = vov >= 0.45 and dx <= med_h * 5.0 + near = dist <= med_h * 4.5 + + if same_line or stacked or (near and (same_band or hov > 0.25)): + if orientation_compatible(i, j, ocr): + union(i, j) + + groups = {} + for i in range(len(ocr)): + groups.setdefault(find(i), []).append(i) + + region_lines = {} + region_boxes = {} + region_quads = {} + region_indices = {} + next_id = 1 + + for _, idxs in sorted(groups.items(), key=lambda kv: min(boxes[i][1] for i in kv[1])): + idxs = sorted(idxs, key=lambda i: (boxes[i][1], boxes[i][0])) + ub = boxes_union_xyxy([boxes[i] for i in idxs]) + if ub is None: + continue + region_lines[next_id] = build_lines_from_indices(idxs, ocr) + region_boxes[next_id] = box_expand(ub, pad=max(2, int(med_h * 0.25)), iw=iw, ih=ih) + region_quads[next_id] = [ocr[i][0] for i in idxs] + region_indices[next_id] = 
idxs + next_id += 1 + + return region_lines, region_boxes, region_quads, region_indices + +# ============================================================ +# RECONCILE REGION-FIRST AND BUBBLE-FIRST GROUPS +# ============================================================ +def reconcile_region_and_bubble_groups(region_lines, region_boxes, region_quads, region_indices, + bubbles, bubble_boxes, bubble_quads, bubble_indices, + ocr): + """ + Reconcile region-first and bubble-first groupings. + + Strategy: + - Build one combined candidate list from both grouping methods. + - Cluster candidates that heavily overlap or share OCR indices. + - Keep only the best-scoring candidate from each cluster. + - Rebuild stable output dictionaries. + + This avoids duplicate retention and inconsistent greedy selection. + """ + combined = [] + + for rid in region_boxes: + combined.append(("region", rid, region_boxes[rid], region_indices[rid])) + + for bid in bubble_boxes: + combined.append(("bubble", bid, bubble_boxes[bid], bubble_indices[bid])) + + if not combined: + return {}, {}, {}, {} + + visited = set() + kept = [] + + def group_score(box, idxs): + text = normalize_text(" ".join(build_lines_from_indices(idxs, ocr))) + role = region_text_role_hint(text) + + role_bonus = { + "dialogue": 0.8, + "narration": 0.75, + "reaction": 0.7, + "sfx": 0.2, + "unknown": 0.1 + }.get(role, 0.1) + + box_area = bbox_area_xyxy(box) + area_bonus = min(1.0, box_area / 50000.0) + + return ( + len(idxs) * 2.0 + + min(20, len(text.split())) * 0.5 + + min(1.0, ocr_candidate_score(text)) + + role_bonus + + area_bonus * 0.25 + ) + + for i in range(len(combined)): + if i in visited: + continue + + cluster = [i] + visited.add(i) + + _, _, box_i, idx_i = combined[i] + + for j in range(i + 1, len(combined)): + if j in visited: + continue + + _, _, box_j, idx_j = combined[j] + + ovs = boxes_overlap_ratio(box_i, box_j) + iou = boxes_iou(box_i, box_j) + shared = len(set(idx_i).intersection(idx_j)) + + if ovs >= 0.55 or 
iou >= 0.35 or shared > 0: + cluster.append(j) + visited.add(j) + + best_idx = max( + cluster, + key=lambda k: group_score(combined[k][2], combined[k][3]) + ) + kept.append(combined[best_idx]) + + # Stable order: top-to-bottom, then left-to-right + kept.sort(key=lambda item: ( + (item[2][1] + item[2][3]) / 2.0, + (item[2][0] + item[2][2]) / 2.0 + )) + + out_lines, out_boxes, out_quads, out_indices = {}, {}, {}, {} + next_id = 1 + + for typ, oid, box, idxs in kept: + idxs = sorted( + set(idxs), + key=lambda k: (quad_bbox(ocr[k][0])[1], quad_bbox(ocr[k][0])[0]) + ) + + out_lines[next_id] = build_lines_from_indices(idxs, ocr) + out_boxes[next_id] = box + out_quads[next_id] = [ocr[k][0] for k in idxs] + out_indices[next_id] = idxs + next_id += 1 + + return out_lines, out_boxes, out_quads, out_indices + # ============================================================ # PROTECTED TOKENS / SHORT DIALOGUE SAFETY NET # ============================================================ @@ -1523,6 +2226,52 @@ def build_lines_from_indices(indices, ocr): for i, _, _, _, _ in sorted(r["m"], key=lambda z: z[2]))) for r in rows if r["m"]] +def split_indices_into_vertical_blocks(indices, ocr, gap_factor=1.6, min_gap=18): + """ + Split a box into top-to-bottom macro blocks using strong vertical gaps. 
+ """ + if len(indices) < 2: + return [indices] + + items = [] + for i in indices: + b = quad_bbox(ocr[i][0]) + cy = (b[1] + b[3]) / 2.0 + h = max(1, b[3] - b[1]) + items.append((i, b, cy, h)) + + items.sort(key=lambda x: x[2]) + med_h = float(np.median([it[3] for it in items])) if items else 12.0 + threshold = max(min_gap, med_h * gap_factor) + + blocks = [] + current = [items[0][0]] + prev_b = items[0][1] + + for k in range(1, len(items)): + cur_i, cur_b, _, _ = items[k] + gap = cur_b[1] - prev_b[3] + + if gap > threshold: + blocks.append(current) + current = [cur_i] + else: + current.append(cur_i) + + prev_b = cur_b + + if current: + blocks.append(current) + + return blocks + +def build_final_box_text(indices, ocr, reading_mode="ltr"): + """ + Final text reconstruction used for OCR/translation export. + This uses internal layout detection, unlike generic grouping helpers. + """ + return build_text_from_layout(indices, ocr, reading_mode=reading_mode) + def auto_gap(image_path, base=18, ref_w=750): img = cv2.imread(image_path) @@ -1685,48 +2434,87 @@ def _split_bubble_if_needed(bid, bubble_indices, bubble_quads, bubble_boxes, # DEBUG / EXPORT # ============================================================ def save_debug_clusters(image_path, ocr, bubble_boxes, bubble_indices, - clean_lines=None, out_path="debug_clusters.png"): + clean_lines=None, out_path="debug_clusters.png", + region_types=None): """ - Draws all detected boxes. - Single-quad boxes are drawn in orange for visibility but are NOT - labelled as (ISOLATED) — they participate fully in merge passes. + Draw debug overlays for final grouped boxes. + + Color scheme by region type: + - dialogue : green + - narration : orange + - sfx : magenta + - reaction : cyan + - unknown : yellow-ish + + OCR quads are outlined lightly in gray for context. 
""" img = cv2.imread(image_path) - if img is None: return + if img is None: + return + # Draw OCR quads lightly without filling the page white for bbox, txt, conf in ocr: pts = np.array(bbox, dtype=np.int32) - cv2.fillPoly(img, [pts], (255, 255, 255)) cv2.polylines(img, [pts], True, (180, 180, 180), 1) for bid, bb in bubble_boxes.items(): x1, y1, x2, y2 = bb - n_quads = len(bubble_indices.get(bid, [])) - color = (255, 165, 0) if n_quads == 1 else (0, 220, 0) - thickness = 3 if n_quads == 1 else 2 + rtype = region_types.get(bid, "unknown") if region_types else "unknown" + + if rtype == "dialogue": + color = (0, 220, 0) + elif rtype == "narration": + color = (0, 180, 255) + elif rtype == "sfx": + color = (255, 0, 255) + elif rtype == "reaction": + color = (0, 200, 255) + else: + color = (0, 220, 220) + + thickness = 2 cv2.rectangle(img, (x1, y1), (x2, y2), color, thickness) - cv2.putText(img, f"BOX#{bid}", (x1+2, max(15, y1+16)), - cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2) + cv2.putText( + img, + f"BOX#{bid} [{rtype}]", + (x1 + 2, max(15, y1 + 16)), + cv2.FONT_HERSHEY_SIMPLEX, + 0.45, + color, + 2 + ) if clean_lines and bid in clean_lines: - text = clean_lines[bid] + text = clean_lines[bid] words = text.split() - lines, cur = [], "" + + wrapped_lines = [] + cur = "" for w in words: - if len(cur) + len(w) < 25: cur += w + " " - else: lines.append(cur.strip()); cur = w + " " - if cur: lines.append(cur.strip()) + if len(cur) + len(w) + 1 < 26: + cur += w + " " + else: + wrapped_lines.append(cur.strip()) + cur = w + " " + if cur: + wrapped_lines.append(cur.strip()) + y_text = y2 + 18 - for line in lines: - cv2.putText(img, line, (x1, y_text), - cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0,0,0), 3) - cv2.putText(img, line, (x1, y_text), - cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255,0,0), 1) + for line in wrapped_lines: + # black outline + cv2.putText( + img, line, (x1, y_text), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 3 + ) + # blue text + cv2.putText( + img, line, (x1, y_text), + 
cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1 + ) y_text += 18 cv2.imwrite(out_path, img) - def estimate_reading_order(bbox_dict, mode="ltr"): items = [(bid, (bb[0]+bb[2])/2.0, (bb[1]+bb[3])/2.0) for bid, bb in bbox_dict.items()] @@ -2000,8 +2788,432 @@ def reattach_orphan_short_tokens(bubbles, bubble_boxes, bubble_quads, bubble_ind return bubbles, bubble_boxes, bubble_quads, bubble_indices -def _bubble_text(indices, ocr): - return normalize_text(" ".join(build_lines_from_indices(indices, ocr))) +def reconstruct_group_text(group_indices, ocr): + """ + Reconstruct text inside one already-detected group. + + This handles cases where a vertical group itself contains + multiple local rows or wrapped OCR fragments. + """ + if not group_indices: + return "" + + items = [] + for i in group_indices: + b = quad_bbox(ocr[i][0]) + cx = (b[0] + b[2]) / 2.0 + cy = (b[1] + b[3]) / 2.0 + w = max(1, b[2] - b[0]) + h = max(1, b[3] - b[1]) + items.append((i, b, cx, cy, w, h)) + + if not items: + return "" + + med_h = float(np.median([it[5] for it in items])) + med_w = float(np.median([it[4] for it in items])) + + # If the group is strongly vertical, simple top->bottom is fine + xs = [it[2] for it in items] + ys = [it[3] for it in items] + vertical_span = max(ys) - min(ys) if len(ys) > 1 else 0 + horizontal_span = max(xs) - min(xs) if len(xs) > 1 else 0 + + # strong single vertical phrase + if vertical_span > horizontal_span * 1.5: + items.sort(key=lambda x: x[3]) # top->bottom + txt = normalize_text(" ".join( + normalize_text(ocr[it[0]][1]) for it in items if normalize_text(ocr[it[0]][1]) + )) + return txt + + # otherwise, split into local rows first + row_tol = max(6.0, med_h * 0.65) + items.sort(key=lambda x: x[3]) + + rows = [] + for it in items: + placed = False + for row in rows: + if abs(it[3] - row["yc"]) <= row_tol: + row["members"].append(it) + row["yc"] = float(np.mean([m[3] for m in row["members"]])) + placed = True + break + if not placed: + rows.append({"yc": it[3], 
"members": [it]}) + + rows.sort(key=lambda r: r["yc"]) + + parts = [] + for row in rows: + members = sorted(row["members"], key=lambda x: x[2]) # left->right + row_txt = normalize_text(" ".join( + normalize_text(ocr[m[0]][1]) for m in members if normalize_text(ocr[m[0]][1]) + )) + if row_txt: + parts.append(row_txt) + + txt = normalize_text(" ".join(parts)) + return txt + +def reconstruct_group_text_best(group_indices, ocr): + if not group_indices: + return "" + + items = [] + for i in group_indices: + b = quad_bbox(ocr[i][0]) + cx = (b[0] + b[2]) / 2.0 + cy = (b[1] + b[3]) / 2.0 + h = max(1, b[3] - b[1]) + items.append((i, b, cx, cy, h)) + + if not items: + return "" + + # Candidate 1: simple top->bottom + cand1_items = sorted(items, key=lambda x: x[3]) + cand1 = normalize_text(" ".join( + normalize_text(ocr[it[0]][1]) for it in cand1_items if normalize_text(ocr[it[0]][1]) + )) + cand1 = fix_group_level_ocr(cand1) + + # Candidate 2: local rows + med_h = float(np.median([it[4] for it in items])) + row_tol = max(6.0, med_h * 0.65) + + rows = [] + for it in sorted(items, key=lambda x: x[3]): + placed = False + for row in rows: + if abs(it[3] - row["yc"]) <= row_tol: + row["members"].append(it) + row["yc"] = float(np.mean([m[3] for m in row["members"]])) + placed = True + break + if not placed: + rows.append({"yc": it[3], "members": [it]}) + + rows.sort(key=lambda r: r["yc"]) + cand2_parts = [] + for row in rows: + members = sorted(row["members"], key=lambda x: x[2]) + row_txt = normalize_text(" ".join( + normalize_text(ocr[m[0]][1]) for m in members if normalize_text(ocr[m[0]][1]) + )) + if row_txt: + cand2_parts.append(row_txt) + cand2 = normalize_text(" ".join(cand2_parts)) + cand2 = fix_group_level_ocr(cand2) + + # choose best + s1 = ocr_candidate_score(cand1) + s2 = ocr_candidate_score(cand2) + + return cand2 if s2 > s1 else cand1 + +def fix_group_level_ocr(text): + t = normalize_text(text or "") + if not t: + return t + + replacements = { + "ANY- THING": 
"ANYTHING", + "BREAK- FAST": "BREAK-FAST", + "COMMON BREAK- PEOPLE FAST": "COMMON PEOPLE EAT FOR BREAKFAST", + "WHAT DO LIKE FOR COMMON BREAK- PEOPLE FAST EAT": "WHAT DO COMMON PEOPLE EAT LIKE FOR BREAKFAST", + + # New targeted fixes for reported cases + "ILLU- SIONS": "ILLU-SIONS", + "ATTEN- TION": "ATTEN-TION", + "WHAT DO COMMON PEOPLE HE EAT?": "WHAT DO COMMON PEOPLE EAT?", + "LIKE FOR BREAK- FAST": "LIKE FOR BREAK-FAST?", + "YOUR STUCK": "YOU'RE STUCK", + "YOUR HAND!": "YOUR HAND!", + } + + for a, b in replacements.items(): + t = t.replace(a, b) + + t = dehyphenate_linebreak_artifacts(t) + t = re.sub(r"\s{2,}", " ", t).strip() + return t + +def _is_sentence_like_fragment(t: str) -> bool: + t = normalize_text(t or "") + if not t: + return False + alnum = re.sub(r"[^A-ZÀ-Ý0-9]", "", t) + if len(alnum) < 2: + return False + return True + + +def _line_has_terminal_punct(t: str) -> bool: + t = normalize_text(t or "") + return bool(re.search(r"[.!?…]$", t)) + + +def _smart_split_by_connectors(text: str) -> List[str]: + """ + Conservative split for OCR text that glues multiple clauses. 
def split_box_by_sentence_rows(indices, ocr, min_groups=2):
    """
    Force-split one box into sentence-like row groups.

    Works for stacked dialogue blocks like:
        YOUR HAND!
        I'M STUCK AND HELPLESS LIKE THIS!
        IF WE DON'T HURRY UP, WE'LL BE CRUSHED TO DEATH!

    Args:
        indices: OCR detection indices belonging to this box.
        ocr: full OCR list; ocr[i] is (quad, text, conf).
        min_groups: minimum number of resulting groups required to split.

    Returns:
        A list of index groups (each sorted top-to-bottom, then left-to-right),
        or None when there is not enough evidence to split.
    """
    if not indices or len(indices) < 3:
        return None

    # Build row groups first; a row is a horizontal band of detections.
    rows = group_indices_into_horizontal_rows(indices, ocr, row_tol_factor=0.70)
    if not rows or len(rows) < min_groups:
        return None

    # Turn each row-group into cleaned text plus its bounding box.
    row_payload = []
    for grp in rows:
        txt = normalize_text(" ".join(ocr[i][1] for i in grp if normalize_text(ocr[i][1])))
        txt = fix_group_level_ocr(txt)
        if not txt:
            continue
        box = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in grp])
        row_payload.append({"indices": grp, "text": txt, "box": box})

    if len(row_payload) < min_groups:
        return None

    # Merge tiny row fragments upward when they are clearly a continuation
    # (previous row is short and lacks terminal punctuation).
    merged = []
    for rp in row_payload:
        if not merged:
            merged.append(rp)
            continue

        prev = merged[-1]
        short_prev = len(re.sub(r"[^A-ZÀ-Ý0-9]", "", prev["text"])) <= 5
        no_term_prev = not re.search(r"[.!?…]$", prev["text"])

        if short_prev and no_term_prev:
            new_idx = sorted(set(prev["indices"] + rp["indices"]))
            new_txt = normalize_text(prev["text"] + " " + rp["text"])
            new_box = boxes_union_xyxy([prev["box"], rp["box"]])
            merged[-1] = {"indices": new_idx, "text": new_txt, "box": new_box}
        else:
            merged.append(rp)

    # Keep only groups with enough alphanumeric content to be sentence-like.
    out = []
    for m in merged:
        txt = normalize_text(m["text"])
        if len(re.sub(r"[^A-ZÀ-Ý0-9]", "", txt)) < 4:
            continue
        out.append(sorted(m["indices"], key=lambda i: (
            quad_bbox(ocr[i][0])[1],
            quad_bbox(ocr[i][0])[0]
        )))

    if len(out) < min_groups:
        return None

    return out


# NOTE(review): a first copy of segment_box_into_phrases was defined here but
# was immediately shadowed at import time by an identical-purpose redefinition
# further down in the file (the later `def` wins). The dead duplicate has been
# removed; the surviving definition lives next to _smart_split_by_connectors.


def build_box_group_texts(indices, ocr, reading_mode="ltr"):
    """
    Return independent text groups for one final box, preserving internal
    layout. Each group is reconstructed with local reading-order logic.

    Args:
        indices: OCR detection indices belonging to this box.
        ocr: full OCR list; ocr[i] is (quad, text, conf).
        reading_mode: "ltr" or "rtl"; controls vertical-column ordering.

    Returns:
        List of group text strings in reading order (possibly empty).
    """
    layout = detect_internal_text_layout(indices, ocr, reading_mode=reading_mode)
    out = []

    if not layout:
        return out

    for block in layout.get("blocks", []):
        mode = block.get("mode", "horizontal")
        groups = block.get("groups", [])

        if mode == "vertical":
            # Vertical columns read right-to-left for RTL pages.
            groups = sorted(
                groups,
                key=lambda grp: np.mean([
                    (quad_bbox(ocr[i][0])[0] + quad_bbox(ocr[i][0])[2]) / 2.0
                    for i in grp
                ]),
                reverse=(reading_mode == "rtl")
            )
        else:
            # Horizontal rows always read top-to-bottom.
            groups = sorted(
                groups,
                key=lambda grp: np.mean([
                    (quad_bbox(ocr[i][0])[1] + quad_bbox(ocr[i][0])[3]) / 2.0
                    for i in grp
                ])
            )

        for grp in groups:
            txt = reconstruct_group_text(grp, ocr)
            if txt:
                out.append(txt)

    return out


def _is_sentence_like_fragment(t: str) -> bool:
    """Return True when *t* has at least 2 alphanumeric chars after cleanup."""
    t = normalize_text(t or "")
    if not t:
        return False
    alnum = re.sub(r"[^A-ZÀ-Ý0-9]", "", t)
    if len(alnum) < 2:
        return False
    return True


def _line_has_terminal_punct(t: str) -> bool:
    """Return True when *t* ends with terminal punctuation (. ! ? …)."""
    t = normalize_text(t or "")
    return bool(re.search(r"[.!?…]$", t))
def _smart_split_by_connectors(text: str) -> List[str]:
    """
    Conservative split for OCR text that glues two clauses together, e.g.:
      - DON'T PAY ANY ATTEN-TION TO THEM! THEY'RE ILLU-SIONS!
      - WHAT DO COMMON PEOPLE EAT? LIKE FOR BREAK-FAST?

    Returns:
        A list of clause strings; [] for empty input, [t] when no safe
        split point is found.
    """
    t = normalize_text(text or "")
    if not t:
        return []

    # Normalize OCR line-wrap hyphen artifacts before looking for boundaries.
    t = dehyphenate_linebreak_artifacts(t)

    # Primary: split on terminal punctuation followed by whitespace.
    parts = [p.strip() for p in re.split(r"(?<=[.!?…])\s+", t) if p.strip()]
    if len(parts) >= 2:
        return parts

    # Secondary: conservative connector patterns that often start clause 2.
    patterns = (
        r"\b(THEY'RE|THEY ARE)\b",
        r"\b(DON'T|DO NOT)\b",
        r"\b(LIKE FOR)\b",
        r"\b(IF WE DON'T|IF WE DO NOT)\b",
    )

    for pat in patterns:
        m = re.search(pat, t)
        # Require some left context so a *leading* connector never splits.
        if m and m.start() > 8:
            left = t[:m.start()].strip()
            right = t[m.start():].strip()
            if _is_sentence_like_fragment(left) and _is_sentence_like_fragment(right):
                return [left, right]

    return [t]


def segment_box_into_phrases(indices, ocr, reading_mode="ltr") -> List[str]:
    """
    Layout-aware phrase segmentation for one final box.

    Uses the internal grouping plus punctuation/connector splitting, then
    de-duplicates near-identical neighbors (OCR echoes).

    Returns:
        List of normalized phrase strings in reading order.
    """
    # Step 1: use the internal layout grouping.
    groups = build_box_group_texts(indices, ocr, reading_mode=reading_mode)
    groups = [fix_group_level_ocr(g) for g in groups if _is_sentence_like_fragment(g)]

    if not groups:
        merged = normalize_text(" ".join(build_final_box_text(indices, ocr, reading_mode=reading_mode)))
        # Consistency fix: run the same OCR cleanup and fragment filter the
        # grouped path applies (previously skipped on this fallback path).
        merged = fix_group_level_ocr(merged)
        return [x for x in _smart_split_by_connectors(merged) if _is_sentence_like_fragment(x)]

    # Step 2: split each group by punctuation/connectors.
    out = []
    for g in groups:
        out.extend(_smart_split_by_connectors(g))

    # Step 3: drop near-identical neighbors (similarity >= 0.92 == OCR echo).
    cleaned = []
    for p in out:
        if cleaned and text_similarity(cleaned[-1], p) >= 0.92:
            continue
        cleaned.append(p)

    return [normalize_text(x) for x in cleaned if _is_sentence_like_fragment(x)]


def is_multi_group_bubble(indices, ocr, reading_mode="ltr", min_groups=2):
    """Return True when the box holds >= min_groups meaningful text groups."""
    groups = build_box_group_texts(indices, ocr, reading_mode=reading_mode)
    meaningful = [g for g in groups if len(re.sub(r"[^A-ZÀ-Ý0-9]", "", g)) >= 2]
    return len(meaningful) >= min_groups


def _bubble_text(indices, ocr, reading_mode="ltr"):
    """Flattened, normalized text for one box using layout-aware ordering."""
    return normalize_text(" ".join(build_text_from_layout(indices, ocr, reading_mode=reading_mode)))


def _box_dims(b):
    """Return (width, height) of an xyxy box, each clamped to >= 1."""
    return max(1, b[2] - b[0]), max(1, b[3] - b[1])
reconstruction. """ if not bubble_boxes: return bubbles, bubble_boxes, bubble_quads, bubble_indices all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1]) for i in range(len(ocr))] med_h = float(np.median(all_h)) if all_h else 14.0 - bubble_contours = detect_speech_bubbles(image_bgr) + + bubble_contours = detect_speech_bubbles(image_bgr) if image_bgr is not None else [] changed = True while changed: @@ -2053,16 +3270,21 @@ def reconcile_final_boxes(bubbles, bubble_boxes, bubble_quads, bubble_indices, o # ---- (A) Merge highly-overlapping pairs merged_any = False for i in range(len(bids)): - if merged_any: break - for j in range(i+1, len(bids)): + if merged_any: + break + + for j in range(i + 1, len(bids)): bi, bj = bids[i], bids[j] + if bi not in bubble_boxes or bj not in bubble_boxes: continue + a, b = bubble_boxes[bi], bubble_boxes[bj] iou = boxes_iou(a, b) ovs = boxes_overlap_ratio(a, b) # inter / smaller - same_contour = _in_same_bubble_contour(a, b, bubble_contours) + same_contour = _in_same_bubble_contour(a, b, bubble_contours) if bubble_contours else False + if ovs >= 0.55 or (iou >= 0.35 and same_contour): idx = sorted(set(bubble_indices[bi] + bubble_indices[bj])) bubble_indices[bi] = idx @@ -2085,12 +3307,17 @@ def reconcile_final_boxes(bubbles, bubble_boxes, bubble_quads, bubble_indices, o # ---- (B) Absorb tiny child boxes inside larger parent absorbed_any = False bids = sorted(bubble_boxes.keys()) + for i in range(len(bids)): - if absorbed_any: break + if absorbed_any: + break + for j in range(len(bids)): if i == j: continue + child, parent = bids[i], bids[j] + if child not in bubble_boxes or parent not in bubble_boxes: continue @@ -2099,11 +3326,11 @@ def reconcile_final_boxes(bubbles, bubble_boxes, bubble_quads, bubble_indices, o pw, ph = _box_dims(pb) contain = _containment_ratio(cb, pb) - child_txt = _bubble_text(bubble_indices[child], ocr) - parent_txt = _bubble_text(bubble_indices[parent], ocr) + child_txt = 
_bubble_text(bubble_indices[child], ocr, reading_mode=reading_mode) + parent_txt = _bubble_text(bubble_indices[parent], ocr, reading_mode=reading_mode) # tiny or fragment child - is_tiny = (cw <= med_h*3.2 and ch <= med_h*2.2) or len(child_txt) <= 14 + is_tiny = (cw <= med_h * 3.2 and ch <= med_h * 2.2) or len(child_txt) <= 14 # don't absorb if it's clearly separate and far close = _center_distance(cb, pb) <= med_h * 4.0 @@ -2127,13 +3354,17 @@ def reconcile_final_boxes(bubbles, bubble_boxes, bubble_quads, bubble_indices, o if changed: continue - # ---- (C) Merge complementary fragments (partial overlap, same contour, similar x-span) + # ---- (C) Merge complementary fragments comp_any = False bids = sorted(bubble_boxes.keys()) + for i in range(len(bids)): - if comp_any: break - for j in range(i+1, len(bids)): + if comp_any: + break + + for j in range(i + 1, len(bids)): bi, bj = bids[i], bids[j] + if bi not in bubble_boxes or bj not in bubble_boxes: continue @@ -2144,14 +3375,14 @@ def reconcile_final_boxes(bubbles, bubble_boxes, bubble_quads, bubble_indices, o vert_gap = max(0, max(a[1], b[1]) - min(a[3], b[3])) h_ix = max(0, min(a[2], b[2]) - max(a[0], b[0])) h_overlap_ratio = h_ix / max(1, min(wi, wj)) - same_contour = _in_same_bubble_contour(a, b, bubble_contours) + same_contour = _in_same_bubble_contour(a, b, bubble_contours) if bubble_contours else False - txt_i = _bubble_text(bubble_indices[bi], ocr) - txt_j = _bubble_text(bubble_indices[bj], ocr) + txt_i = _bubble_text(bubble_indices[bi], ocr, reading_mode=reading_mode) + txt_j = _bubble_text(bubble_indices[bj], ocr, reading_mode=reading_mode) - if same_contour and vert_gap <= med_h*2.8 and h_overlap_ratio >= 0.45: - # prefer merge when one is “upper fragment” + the other “lower fragment” - # and text isn't identical duplicate + if same_contour and vert_gap <= med_h * 2.8 and h_overlap_ratio >= 0.45: + # prefer merge when one is upper fragment + other lower fragment + # and text is not identical duplicate if 
def split_boxes_by_internal_vertical_groups(bubbles, bubble_boxes, bubble_quads, bubble_indices,
                                            ocr, image_shape, reading_mode="ltr"):
    """
    Conservative splitter: split a final box into several boxes only when the
    internal layout gives strong evidence — clear vertical columns, or rows
    that each end in terminal punctuation. Prevents over-splitting of short
    or noisy vertical tokens.

    Args:
        bubbles/bubble_boxes/bubble_quads/bubble_indices: current box dicts.
        ocr: full OCR list; ocr[i] is (quad, text, conf).
        image_shape: (h, w, ...) of the page image, used to clamp boxes.
        reading_mode: "ltr" or "rtl"; orders vertical columns.

    Returns:
        New (bubbles, boxes, quads, indices) dicts re-keyed from 1.
    """
    ih, iw = image_shape[:2]
    out_bubbles = {}
    out_boxes = {}
    out_quads = {}
    out_indices = {}
    next_id = 1

    # conservative thresholds
    MIN_ALNUM_PER_GROUP = 8
    MIN_GROUP_HEIGHT_RATIO = 0.30     # group must span 30% of parent height
    MIN_VERTICAL_GROUPS_TO_SPLIT = 2
    MAX_SPLIT_PARTS = 3               # safety cap against extreme over-splits
    MIN_TEXT_COVERAGE = 0.65          # children must retain this share of parent text

    def _emit(indices_grp):
        # Register one index group as a new output box (padded, clamped).
        nonlocal next_id
        grp = sorted(set(indices_grp), key=lambda i: (
            quad_bbox(ocr[i][0])[1],
            quad_bbox(ocr[i][0])[0]
        ))
        ub = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in grp])
        out_indices[next_id] = grp
        out_quads[next_id] = [ocr[i][0] for i in grp]
        out_boxes[next_id] = (
            max(0, ub[0] - 2), max(0, ub[1] - 2),
            min(iw - 1, ub[2] + 2), min(ih - 1, ub[3] + 2)
        )
        out_bubbles[next_id] = build_final_box_text(grp, ocr, reading_mode=reading_mode)
        next_id += 1

    def _keep(bid, idxs):
        # Carry the original box through unchanged.
        nonlocal next_id
        out_bubbles[next_id] = bubbles[bid]
        out_boxes[next_id] = bubble_boxes[bid]
        out_quads[next_id] = bubble_quads[bid]
        out_indices[next_id] = idxs
        next_id += 1

    for bid in sorted(bubble_boxes.keys()):
        idxs = bubble_indices[bid]
        parent = bubble_boxes[bid]
        parent_h = max(1, parent[3] - parent[1])
        parent_w = max(1, parent[2] - parent[0])

        # Too few detections to split with confidence.
        if len(idxs) < 4:
            _keep(bid, idxs)
            continue

        layout = detect_internal_text_layout(idxs, ocr, reading_mode=reading_mode)
        did_split = False

        # --------------------------------------------------------------
        # Primary: vertical-mode internal groups (STRICT)
        # --------------------------------------------------------------
        if layout and layout.get("blocks"):
            candidate_groups = []

            for block in layout.get("blocks", []):
                if block.get("mode", "horizontal") != "vertical":
                    continue

                for grp in block.get("groups", []):
                    grp = sorted(set(grp), key=lambda i: (
                        quad_bbox(ocr[i][0])[1],
                        quad_bbox(ocr[i][0])[0]
                    ))
                    if not grp:
                        continue

                    txt = reconstruct_group_text_best(grp, ocr)
                    txt = normalize_text(fix_group_level_ocr(txt))
                    if not txt:
                        continue

                    alnum_len = len(re.sub(r"[^A-ZÀ-Ý0-9]", "", txt))
                    if alnum_len < MIN_ALNUM_PER_GROUP:
                        continue

                    gb = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in grp])
                    gw = max(1, gb[2] - gb[0])
                    gh = max(1, gb[3] - gb[1])

                    # require meaningful physical size
                    if gh < parent_h * MIN_GROUP_HEIGHT_RATIO:
                        continue

                    # avoid splitting off tiny narrow SFX-like strips
                    if gw < parent_w * 0.12 and alnum_len < 12:
                        continue

                    # sentence-ish check: multi-word or terminal punctuation
                    words = txt.split()
                    has_terminal = bool(re.search(r"[.!?…]$", txt))
                    if len(words) < 2 and not has_terminal:
                        continue

                    candidate_groups.append({"indices": grp, "text": txt, "box": gb})

            if len(candidate_groups) >= MIN_VERTICAL_GROUPS_TO_SPLIT:
                # Sort columns by reading order (right-to-left for RTL pages).
                candidate_groups = sorted(
                    candidate_groups,
                    key=lambda g: (g["box"][0] + g["box"][2]) / 2.0,
                    reverse=(reading_mode == "rtl")
                )

                # cap extreme over-splits
                if len(candidate_groups) > MAX_SPLIT_PARTS:
                    candidate_groups = candidate_groups[:MAX_SPLIT_PARTS]

                # Final sanity: ACCEPT the split only when the children keep
                # most of the parent's text evidence; otherwise reject it.
                # (Previous comment read as the inverse of this condition.)
                parent_txt = normalize_text(" ".join(build_final_box_text(idxs, ocr, reading_mode=reading_mode)))
                parent_alnum = max(1, len(re.sub(r"[^A-ZÀ-Ý0-9]", "", parent_txt)))
                sum_child_alnum = sum(len(re.sub(r"[^A-ZÀ-Ý0-9]", "", g["text"])) for g in candidate_groups)

                if (sum_child_alnum / parent_alnum) >= MIN_TEXT_COVERAGE:
                    for g in candidate_groups:
                        _emit(g["indices"])
                    did_split = True

        if did_split:
            continue

        # --------------------------------------------------------------
        # Fallback: row sentence split (ONLY for strong punctuation cases)
        # --------------------------------------------------------------
        row_sentence_parts = split_box_by_sentence_rows(idxs, ocr, min_groups=2)

        if row_sentence_parts and 2 <= len(row_sentence_parts) <= 3:
            # Require punctuation evidence in the resulting parts.
            part_texts = []
            for grp in row_sentence_parts:
                txt = normalize_text(" ".join(build_lines_from_indices(grp, ocr)))
                txt = fix_group_level_ocr(txt)
                part_texts.append(txt)

            punct_parts = sum(1 for t in part_texts if re.search(r"[.!?…]$", t))
            if punct_parts >= 2:
                for grp in row_sentence_parts:
                    _emit(grp)
                continue

        # --------------------------------------------------------------
        # Keep original if no strong split evidence
        # --------------------------------------------------------------
        _keep(bid, idxs)

    return out_bubbles, out_boxes, out_quads, out_indices
@@ -2356,6 +3758,9 @@ def force_split_bridged_boxes(bubbles, bubble_boxes, bubble_quads, bubble_indice next_bid += 1 return new_bubbles, new_boxes, new_quads, new_indices +# ============================================================ +# translate_manga_text START +# ============================================================ def translate_manga_text( image_path="001-page.png", @@ -2406,9 +3811,15 @@ def translate_manga_text( rx2, ry2 = min(iw, rx2 + pad), min(ih, ry2 + pad) crop = image[ry1:ry2, rx1:rx2] if crop.size > 0: - upscaled = cv2.resize(crop, None, fx=4.0, fy=4.0, interpolation=cv2.INTER_CUBIC) + upscaled = cv2.resize( + crop, None, fx=4.0, fy=4.0, interpolation=cv2.INTER_CUBIC + ) for quad, text, conf in detector.run_vision_ocr(upscaled): - raw.append(([[int(p[0] / 4.0 + rx1), int(p[1] / 4.0 + ry1)] for p in quad], text, conf)) + raw.append(( + [[int(p[0] / 4.0 + rx1), int(p[1] / 4.0 + ry1)] for p in quad], + text, + conf + )) print(f"📝 Total detections after missed region scan: {len(raw)}") # ── Filtering ───────────────────────────────────────────────────────── @@ -2476,7 +3887,13 @@ def translate_manga_text( filtered, image.shape, gap_px=resolved_gap, bbox_padding=1, strict_mode=strict_grouping ) - print(f" Created {len(bubbles)} initial box(es)") + print(f" Created {len(bubbles)} initial bubble-group box(es)") + + print("🧱 Proposing region-first text containers...") + region_lines, region_boxes, region_quads, region_indices = propose_text_regions_from_ocr( + filtered, image.shape + ) + print(f" Proposed {len(region_lines)} region container(s)") # ── Auto-fix (split + merge) ────────────────────────────────────────── if auto_fix_bubbles: @@ -2516,7 +3933,9 @@ def translate_manga_text( splits_performed.append(f"BOX#{bid} ({split_reason})") for part_idxs, part_bid in [(p1, bid), (p2, next_bid)]: ub = boxes_union_xyxy([quad_bbox(filtered[i][0]) for i in part_idxs]) - new_bubbles[part_bid] = build_lines_from_indices(part_idxs, filtered) + 
new_bubbles[part_bid] = build_final_box_text( + part_idxs, filtered, reading_mode=reading_mode + ) new_bubble_boxes[part_bid] = ( max(0, ub[0] - 2), max(0, ub[1] - 2), min(iw - 1, ub[2] + 2), min(ih - 1, ub[3] + 2) @@ -2525,7 +3944,9 @@ def translate_manga_text( new_bubble_indices[part_bid] = part_idxs next_bid += 1 else: - new_bubbles[bid] = bubbles[bid] + new_bubbles[bid] = build_final_box_text( + bubble_indices[bid], filtered, reading_mode=reading_mode + ) new_bubble_boxes[bid] = bubble_boxes[bid] new_bubble_quads[bid] = bubble_quads[bid] new_bubble_indices[bid] = bubble_indices[bid] @@ -2535,7 +3956,6 @@ def translate_manga_text( for s in splits_performed: print(f" ✓ {s}") - # IMPORTANT: commit split-pass results bubbles = new_bubbles bubble_boxes = new_bubble_boxes bubble_quads = new_bubble_quads @@ -2546,29 +3966,106 @@ def translate_manga_text( bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered ) - # ── Final reconciliation pass (overlaps, child absorb, complementary merge) ── + for bid in list(bubble_indices.keys()): + bubbles[bid] = build_final_box_text( + bubble_indices[bid], filtered, reading_mode=reading_mode + ) + + # ── Final reconciliation pass ───────────────────────────────────────── bubbles, bubble_boxes, bubble_quads, bubble_indices = reconcile_final_boxes( + bubbles, + bubble_boxes, + bubble_quads, + bubble_indices, + filtered, + image_bgr=image, + reading_mode=reading_mode + ) + + for bid in list(bubble_indices.keys()): + bubbles[bid] = build_final_box_text( + bubble_indices[bid], filtered, reading_mode=reading_mode + ) + + bubbles, bubble_boxes, bubble_quads, bubble_indices = force_split_bridged_boxes( bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered, image ) - - bubbles, bubble_boxes, bubble_quads, bubble_indices = force_split_bridged_boxes( - bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered, image - ) - + + for bid in list(bubble_indices.keys()): + bubbles[bid] = build_final_box_text( + 
bubble_indices[bid], filtered, reading_mode=reading_mode + ) + bubbles, bubble_boxes, bubble_quads, bubble_indices = reconcile_final_boxes( - bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered, image + bubbles, + bubble_boxes, + bubble_quads, + bubble_indices, + filtered, + image_bgr=image, + reading_mode=reading_mode ) + for bid in list(bubble_indices.keys()): + bubbles[bid] = build_final_box_text( + bubble_indices[bid], filtered, reading_mode=reading_mode + ) + + # ── Reconcile bubble-first and region-first views ───────────────────── + bubbles, bubble_boxes, bubble_quads, bubble_indices = reconcile_region_and_bubble_groups( + region_lines, region_boxes, region_quads, region_indices, + bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered + ) + + for bid in list(bubble_indices.keys()): + bubbles[bid] = build_final_box_text( + bubble_indices[bid], filtered, reading_mode=reading_mode + ) + + # ── Split boxes by internal vertical groups ─────────────────────────── + bubbles, bubble_boxes, bubble_quads, bubble_indices = split_boxes_by_internal_vertical_groups( + bubbles, + bubble_boxes, + bubble_quads, + bubble_indices, + filtered, + image.shape, + reading_mode=reading_mode + ) + + for bid in list(bubble_indices.keys()): + bubbles[bid] = build_final_box_text( + bubble_indices[bid], filtered, reading_mode=reading_mode + ) + print(f"✅ Final box count: {len(bubbles)}") # ── OCR quality pass ────────────────────────────────────────────────── translator = GoogleTranslator(source=source_lang, target=target_lang) clean_lines: Dict[int, str] = {} + raw_lines: Dict[int, str] = {} + corrected_lines: Dict[int, str] = {} sources_used: Dict[int, str] = {} translations: Dict[int, str] = {} + region_types: Dict[int, str] = {} + region_confidences: Dict[int, float] = {} + region_flags: Dict[int, List[str]] = {} + bubble_group_texts: Dict[int, List[str]] = {} - for bid, lines in bubbles.items(): - base_txt = normalize_text(" ".join(lines)) + for bid in 
sorted(bubble_boxes.keys()): + final_lines = build_final_box_text( + bubble_indices[bid], filtered, reading_mode=reading_mode + ) + bubbles[bid] = final_lines + + # NEW: segmented phrase groups for translation + group_texts = segment_box_into_phrases( + bubble_indices[bid], filtered, reading_mode=reading_mode + ) + bubble_group_texts[bid] = group_texts + + base_txt = normalize_text(" ".join(final_lines)) + raw_lines[bid] = base_txt base_sc = ocr_candidate_score(base_txt) txt, src_used = base_txt, "vision-base" @@ -2579,14 +4076,34 @@ def translate_manga_text( if rr_txt and rr_sc > base_sc + 0.04 and is_valid_language(rr_txt, source_lang): txt, src_used = rr_txt, rr_src - clean_lines[bid] = normalize_text(txt) + tmp_lines = [txt] if txt else final_lines + region_type = classify_region_type(image, bubble_boxes[bid], tmp_lines) + corrected_txt, correction_gain = correct_region_text(txt, region_type=region_type) + conf = compute_region_confidence(txt, corrected_txt, bubble_boxes[bid], region_type, image) + flags = build_region_flags(txt, corrected_txt, region_type, conf) + + if len([g for g in group_texts if g.strip()]) >= 2: + flags.append("BUBBLE") + flags.append("SEGMENTED") + + clean_lines[bid] = normalize_text(corrected_txt) + corrected_lines[bid] = normalize_text(corrected_txt) sources_used[bid] = src_used + region_types[bid] = region_type + region_confidences[bid] = conf + region_flags[bid] = sorted(set(flags)) reading_map = estimate_reading_order(bubble_boxes, mode=reading_mode) # ── Translation ─────────────────────────────────────────────────────── for bid in sorted(clean_lines.keys(), key=lambda x: reading_map.get(x, x)): - src_txt = clean_lines[bid].strip() + group_texts = [g for g in bubble_group_texts.get(bid, []) if g.strip()] + + if len(group_texts) >= 2: + src_txt = " ".join(group_texts).strip() + else: + src_txt = clean_lines[bid].strip() + if not src_txt: continue if not is_valid_language(src_txt, source_lang): @@ -2595,21 +4112,45 @@ def 
translate_manga_text( continue try: - tgt = translator.translate(src_txt) or "" - tgt = postprocess_translation_general(tgt).upper() + if len(group_texts) >= 2: + translated_groups = [] + for g in group_texts: + if not is_valid_language(g, source_lang): + continue + if not is_meaningful_text(g, source_lang): + continue + tg = translator.translate(g) or "" + tg = postprocess_translation_general(tg).upper() + if tg: + translated_groups.append(tg) + tgt = " || ".join(translated_groups) + else: + tgt = translator.translate(src_txt) or "" + tgt = postprocess_translation_general(tgt).upper() except Exception as e: tgt = f"[Error: {e}]" translations[bid] = tgt if debug: - save_debug_clusters(image_path, filtered, bubble_boxes, bubble_indices, clean_lines, "debug_clusters.png") + save_debug_clusters( + image_path, filtered, bubble_boxes, bubble_indices, + clean_lines, "debug_clusters.png", region_types=region_types + ) # ── Text output ─────────────────────────────────────────────────────── - divider = "─" * 120 - out_lines = ["BUBBLE|ORDER|OCR_SOURCE|ORIGINAL|TRANSLATED|FLAGS", divider] - print(divider + f"\n{'BUBBLE':<8} {'ORDER':<6} {'SOURCE':<12} " - f"{'ORIGINAL':<40} {'TRANSLATED':<40} FLAGS\n" + divider) + divider = "─" * 140 + out_lines = [ + "BUBBLE|ORDER|TYPE|CONF|OCR_SOURCE|ORIGINAL|CORRECTED|BUBBLE_GROUPS|TRANSLATED|FLAGS", + divider + ] + + print( + divider + + f"\n{'BUBBLE':<8} {'ORDER':<6} {'TYPE':<10} {'CONF':<6} {'SOURCE':<12} " + f"{'CORRECTED':<30} {'BUBBLE_GROUPS':<40} {'TRANSLATED':<30} FLAGS\n" + + divider + ) translated_count = 0 for bid in sorted(clean_lines.keys(), key=lambda x: reading_map.get(x, x)): @@ -2621,19 +4162,29 @@ def translate_manga_text( if not is_meaningful_text(src_txt, source_lang): continue - flags = [] + flags = list(region_flags.get(bid, [])) tgt = translations.get(bid, "") if not tgt: flags.append("NO_TRANSLATION") - src_u = src_txt.upper() + src_engine = sources_used.get(bid, "unknown") + rtype = region_types.get(bid, 
"unknown") + rconf = region_confidences.get(bid, 0.0) + raw_u = raw_lines.get(bid, "").upper() + corr_u = corrected_lines.get(bid, "").upper() + group_blob = " || ".join(bubble_group_texts.get(bid, [])).upper() out_lines.append( - f"#{bid}|{reading_map.get(bid, bid)}|{src_engine}|{src_u}|{tgt}|" + f"#{bid}|{reading_map.get(bid, bid)}|{rtype}|{rconf:.2f}|{src_engine}|" + f"{raw_u}|{corr_u}|{group_blob}|{tgt}|{','.join(flags) if flags else '-'}" + ) + + print( + f"#{bid:<7} {reading_map.get(bid,bid):<6} {rtype:<10} {rconf:<6.2f} {src_engine:<12} " + f"{corr_u[:30]:<30} {group_blob[:40]:<40} {tgt[:30]:<30} " f"{','.join(flags) if flags else '-'}" ) - print(f"#{bid:<7} {reading_map.get(bid,bid):<6} {src_engine:<12} " - f"{src_u[:40]:<40} {tgt[:40]:<40} {','.join(flags) if flags else '-'}") + translated_count += 1 out_lines.append(divider + f"\n✅ Done! {translated_count} bubble(s) translated.") @@ -2655,9 +4206,15 @@ def translate_manga_text( tgt = translations.get(bid, "") bubbles_payload[str(bid)] = { "order": reading_map.get(bid, bid), + "region_type": region_types.get(bid, "unknown"), + "confidence": round(region_confidences.get(bid, 0.0), 4), "ocr_source": sources_used.get(bid, "unknown"), - "original": src_txt.upper(), + "raw_ocr": raw_lines.get(bid, "").upper(), + "corrected_ocr": corrected_lines.get(bid, "").upper(), + "translation_input": src_txt.upper(), "translated": tgt, + "flags": region_flags.get(bid, []), + "bubble_groups": [g.upper() for g in bubble_group_texts.get(bid, [])], "box": { "x": box[0] if box else 0, "y": box[1] if box else 0, @@ -2672,6 +4229,11 @@ def translate_manga_text( print(divider + f"\nSaved: {export_to_file}\nSaved: {export_bubbles_to}") + +# ============================================================ +# translate_manga_text END +# ============================================================ + # ============================================================ # ENTRY POINT # ============================================================ 
@@ -2686,7 +4248,7 @@ if __name__ == "__main__": quality_threshold=0.62, export_to_file="output.txt", export_bubbles_to="bubbles.json", - reading_mode="rtl", + reading_mode="ltr", debug=True, use_enhanced_ocr=True, strict_grouping=True,