Files
manga-translator/manga-translator.py
Guillem Hernandez Sola b730037a06 Added big stuff
2026-04-22 16:18:59 +02:00

4258 lines
156 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import re
import json
import cv2
import numpy as np
import warnings
from typing import List, Tuple, Dict, Any, Optional
from deep_translator import GoogleTranslator
# macOS Native Vision imports
import Vision
import Quartz
from Foundation import NSData
warnings.filterwarnings("ignore", category=UserWarning)
# ============================================================
# CONFIG
# ============================================================
TOP_BAND_RATIO = 0.08
# ============================================================
# REGION-FIRST LAYOUT HELPERS
# ============================================================
import math
from difflib import SequenceMatcher
# Token sets used by the text-role heuristics below.
# All lookups happen against normalize_text() output, i.e. uppercase text.

# Common English function words; a high ratio of these suggests real dialogue.
DIALOGUE_STOPWORDS = {
    "I", "YOU", "HE", "SHE", "WE", "THEY", "IT", "ME", "MY", "YOUR", "OUR",
    "IS", "ARE", "WAS", "WERE", "AM", "DO", "DID", "DON'T", "DIDN'T", "NOT",
    "WHAT", "WHY", "HOW", "WHO", "IN", "ON", "AT", "TO", "OF", "FOR", "WITH",
    "AND", "BUT", "SO", "THAT", "THIS", "THERE", "HERE", "THAN", "ALL", "RIGHT"
}
# Onomatopoeia commonly drawn as sound-effect (SFX) lettering in manga.
SFX_HINTS = {
    "RRRING", "RING", "RINGG", "BAM", "BOOM", "FWUP", "FWOOP", "FSHOO",
    "GRRP", "GASP", "THUD", "SMACK", "WHAM", "SLAM", "SNIF", "SNIFF"
}
# Short exclamations typical of reaction bubbles.
REACTION_HINTS = {
    "HUH", "HUH?!", "HUH?", "OH", "AH", "EH", "TCH", "HEY", "WHAT?!", "NO!", "YES!"
}
# Phrases that usually open narration boxes.
NARRATION_HINTS = {
    "AND SO", "MEANWHILE", "LATER", "THEN", "TO BE CONTINUED"
}
def xyxy_width(b):
    """Width of an (x1, y1, x2, y2) box, clamped to at least 1 px."""
    w = b[2] - b[0]
    return w if w > 1 else 1
def xyxy_height(b):
    """Height of an (x1, y1, x2, y2) box, clamped to at least 1 px."""
    h = b[3] - b[1]
    return h if h > 1 else 1
def xyxy_center(b):
    """Center point (cx, cy) of an (x1, y1, x2, y2) box."""
    cx = (b[0] + b[2]) * 0.5
    cy = (b[1] + b[3]) * 0.5
    return (cx, cy)
def box_distance(a, b):
    """Euclidean distance between the centers of two xyxy boxes."""
    dx = (a[0] + a[2]) / 2.0 - (b[0] + b[2]) / 2.0
    dy = (a[1] + a[3]) / 2.0 - (b[1] + b[3]) / 2.0
    return math.hypot(dx, dy)
def horizontal_overlap_ratio(a, b):
    """Horizontal intersection length divided by the smaller box width."""
    overlap = min(a[2], b[2]) - max(a[0], b[0])
    if overlap < 0:
        overlap = 0
    # min(max(1, wa), max(1, wb)) == max(1, min(wa, wb)) since max(1, .) is monotone
    smaller_w = max(1, min(a[2] - a[0], b[2] - b[0]))
    return overlap / smaller_w
def vertical_overlap_ratio(a, b):
    """Vertical intersection length divided by the smaller box height."""
    overlap = min(a[3], b[3]) - max(a[1], b[1])
    if overlap < 0:
        overlap = 0
    # min(max(1, ha), max(1, hb)) == max(1, min(ha, hb)) since max(1, .) is monotone
    smaller_h = max(1, min(a[3] - a[1], b[3] - b[1]))
    return overlap / smaller_h
def box_expand(b, pad, iw, ih):
    """Grow box b by `pad` on all sides, clipped to [0, iw-1] x [0, ih-1]."""
    left = int(b[0] - pad)
    top = int(b[1] - pad)
    right = int(b[2] + pad)
    bottom = int(b[3] + pad)
    return (
        max(0, left),
        max(0, top),
        min(iw - 1, right),
        min(ih - 1, bottom),
    )
def count_alpha(text):
    """Count Latin letters (incl. common accented ranges) in text."""
    return sum(1 for _ in re.finditer(r"[A-ZÀ-Ýa-zà-ÿ]", text or ""))
def uppercase_ratio(text):
    """Fraction of alphabetic characters that are uppercase (0.0 if no letters)."""
    letters = re.findall(r"[A-Za-zÀ-ÿ]", text or "")
    if not letters:
        return 0.0
    upper_count = len([c for c in letters if c.isupper()])
    return upper_count / len(letters)
def punctuation_ratio(text):
    """Share of characters in text that are common punctuation marks."""
    if not text:
        return 0.0
    marks = re.findall(r"[!?.,'\"-]", text)
    return len(marks) / max(1, len(text))
def stopword_ratio(text):
    """Fraction of uppercase tokens in text that are common dialogue stopwords."""
    tokens = re.findall(r"[A-Z']+", normalize_text(text or ""))
    if not tokens:
        return 0.0
    matched = sum(tok in DIALOGUE_STOPWORDS for tok in tokens)
    return matched / len(tokens)
def looks_like_sfx_text(text):
    """Heuristic: does `text` look like onomatopoeia / sound-effect lettering?"""
    t = normalize_text(text or "")
    if not t:
        return False
    alpha = re.sub(r"[^A-Z]", "", t)  # letters only, for hint-set lookups
    if t in SFX_HINTS or alpha in SFX_HINTS:
        return True
    # All-caps text with almost no dialogue stopwords reads like SFX,
    # unless the letters themselves form a stopword.
    if len(alpha) >= 3 and uppercase_ratio(t) > 0.90 and stopword_ratio(t) < 0.15:
        if alpha not in DIALOGUE_STOPWORDS:
            return True
    return False
def looks_like_reaction_text(text):
    """Heuristic: short exclamation like "HUH?!" typical of reaction bubbles."""
    t = normalize_text(text or "")
    alpha = re.sub(r"[^A-Z?!]", "", t)  # keep ?/! so punctuated hints still match
    if t in REACTION_HINTS or alpha in REACTION_HINTS:
        return True
    # Very short text with noticeable punctuation also counts as a reaction.
    if len(re.sub(r"[^A-Z]", "", t)) <= 5 and punctuation_ratio(t) > 0.10:
        return True
    return False
def looks_like_narration_text(text):
    """Heuristic: does `text` read like a narration box (caption) rather than speech?"""
    t = normalize_text(text or "")
    if any(t.startswith(h) for h in NARRATION_HINTS):
        return True
    # Longer, period-terminated, mostly-uppercase sentences look like captions.
    if len(t.split()) >= 5 and t.endswith(".") and uppercase_ratio(t) > 0.75:
        return True
    return False
def contour_features_for_box(image_bgr, box_xyxy):
    """
    Cheap visual features of a box crop, used for region classification.

    Returns a dict with:
      mean_brightness: mean gray level scaled to [0, 1]
      edge_density:    fraction of Canny edge pixels (thresholds 50/150)
      whiteness_ratio: fraction of near-white (>220) pixels
    An empty crop yields worst-case defaults (dark, all-edges, no white).
    """
    x1, y1, x2, y2 = box_xyxy
    crop = image_bgr[y1:y2, x1:x2]
    if crop.size == 0:
        return {
            "mean_brightness": 0.0,
            "edge_density": 1.0,
            "whiteness_ratio": 0.0,
        }
    gray = cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY)
    mean_brightness = float(np.mean(gray)) / 255.0
    edges = cv2.Canny(gray, 50, 150)
    edge_density = float(np.mean(edges > 0))
    whiteness_ratio = float(np.mean(gray > 220))
    return {
        "mean_brightness": mean_brightness,
        "edge_density": edge_density,
        "whiteness_ratio": whiteness_ratio,
    }
def classify_region_type(image_bgr, box_xyxy, lines):
    """
    Classify a text region as "sfx", "reaction", "narration", "dialogue",
    or "unknown", combining text heuristics with crop appearance features.
    """
    text = normalize_text(" ".join(lines))
    feats = contour_features_for_box(image_bgr, box_xyxy)
    w, h = xyxy_width(box_xyxy), xyxy_height(box_xyxy)
    ar = w / max(1, h)  # aspect ratio, width over height
    if looks_like_sfx_text(text):
        return "sfx"
    if looks_like_reaction_text(text):
        # Only very short exclamations count; longer text falls through.
        if len(text.split()) <= 3:
            return "reaction"
    if looks_like_narration_text(text):
        return "narration"
    # balloon/dialogue heuristic:
    # bright interior + low-ish edge density + moderate width
    if feats["whiteness_ratio"] > 0.45 and feats["edge_density"] < 0.18:
        return "dialogue"
    # narrow tall shout / reaction / sfx ambiguity
    if ar < 0.9 and uppercase_ratio(text) > 0.85 and stopword_ratio(text) < 0.20:
        return "sfx"
    if stopword_ratio(text) >= 0.20:
        return "dialogue"
    return "unknown"
def text_similarity(a, b):
    """Similarity ratio in [0, 1] between two texts after normalization."""
    left = normalize_text(a or "")
    right = normalize_text(b or "")
    return SequenceMatcher(None, left, right).ratio()
def dedupe_repeated_phrase(text):
    """Collapse OCR duplication: repeated halves ("CRY! CRY!") and repeated adjacent tokens."""
    t = normalize_text(text or "")
    words = t.split()
    if len(words) < 4:
        return t
    # Exact duplicated halves -> keep one half:
    # "CRY! CRY!" / "I DIDN'T I DIDN'T"
    mid = len(words) // 2
    if len(words) % 2 == 0 and words[:mid] == words[mid:]:
        return " ".join(words[:mid])
    # Otherwise drop immediately repeated tokens longer than 2 chars.
    deduped = []
    for word in words:
        if deduped and deduped[-1] == word and len(word) > 2:
            continue
        deduped.append(word)
    return " ".join(deduped)
def dehyphenate_linebreak_artifacts(text):
    """Join words split by a line-wrap hyphen, e.g. "LEARN- ING" -> "LEARNING"."""
    normalized = normalize_text(text or "")
    return re.sub(r"\b([A-Z]+)- ([A-Z]+)\b", r"\1\2", normalized)
def fix_common_dialogue_ocr(text):
    """
    Conservative OCR cleanup for dialogue-like text.
    Goals:
    - fix common OCR punctuation/spacing/apostrophe errors
    - preserve meaning and tone
    - avoid semantic reconstruction guesses

    Fix vs. previous version: replacement keys are now matched with
    word-boundary anchors instead of plain substring replace, so a rule
    like "ILL " -> "I'LL " can no longer corrupt longer words
    (e.g. "KILL " -> "KI'LL "), and "P PROPERLY" no longer fires inside
    "UP PROPERLY".
    """
    t = normalize_text(text or "")
    if not t:
        return t
    replacements = {
        "1'M": "I'M",
        "1 DIDN'T": "I DIDN'T",
        "1 HATE": "I HATE",
        "1 WAS": "I WAS",
        "1'M ": "I'M ",
        "YO U": "YOU",
        "YOU RE": "YOU'RE",
        "YOURE": "YOU'RE",
        "I LL": "I'LL",
        "ILL ": "I'LL ",
        "DONT": "DON'T",
        "DIDNT": "DIDN'T",
        "CANT": "CAN'T",
        "WONT": "WON'T",
        "THATS": "THAT'S",
        "MOMS": "MOM'S",
        "DADS": "DAD'S",
        "LEARN- ING": "LEARNING",
        "COV- ERED": "COVERED",
        "SY ON": "SY-ON",
        "P PROPERLY": "P-PROPERLY",
        "SH SHUT": "SH- SHUT",
    }
    for a, b in replacements.items():
        # Anchor only the alphanumeric edges of the key; keys that start or
        # end with a space/punctuation already carry their own boundary.
        pattern = re.escape(a)
        if a[0].isalnum():
            pattern = r"(?<![A-Z0-9'])" + pattern
        if a[-1].isalnum():
            pattern = pattern + r"(?![A-Z0-9'])"
        t = re.sub(pattern, b, t)
    # Fix split contractions / apostrophe omissions
    t = re.sub(r"\b([A-Z]+) NT\b", r"\1N'T", t)
    t = re.sub(r"\b([A-Z]+) RE\b", r"\1'RE", t)
    t = re.sub(r"\b([A-Z]+) VE\b", r"\1'VE", t)
    t = re.sub(r"\b([A-Z]+) LL\b", r"\1'LL", t)
    t = re.sub(r"\b([A-Z]+) S\b", r"\1'S", t)
    # Remove accidental duplicated punctuation spacing
    t = re.sub(r"\s+([,.;:!?])", r"\1", t)
    # Dehyphenate OCR line-wrap artifacts
    t = dehyphenate_linebreak_artifacts(t)
    # Collapse repeated full phrases/tokens caused by OCR duplication
    t = dedupe_repeated_phrase(t)
    # Remove duplicated adjacent words like "CRY CRY" if clearly accidental
    words = t.split()
    cleaned = []
    for w in words:
        if cleaned and cleaned[-1] == w and len(re.sub(r"[^A-Z]", "", w)) > 2:
            continue
        cleaned.append(w)
    t = " ".join(cleaned)
    # Normalize spaces
    t = re.sub(r"\s{2,}", " ", t).strip()
    return t
def region_text_role_hint(text):
    """Best-guess role for a text region; defaults to dialogue."""
    role_checks = (
        (looks_like_sfx_text, "sfx"),
        (looks_like_reaction_text, "reaction"),
        (looks_like_narration_text, "narration"),
    )
    for predicate, role in role_checks:
        if predicate(text):
            return role
    return "dialogue"
def correct_region_text(text, region_type="dialogue"):
    """
    Apply role-appropriate OCR cleanup to region text.

    Returns (corrected_text, correction_gain) where correction_gain is the
    non-negative improvement in ocr_candidate_score achieved by the cleanup.
    """
    t = normalize_text(text or "")
    if not t:
        return t, 0.0
    original = t
    if region_type in {"dialogue", "reaction", "narration"}:
        t = fix_common_dialogue_ocr(t)
    elif region_type == "sfx":
        # SFX is intentionally left raw except for duplicate collapsing.
        t = dedupe_repeated_phrase(t)
    score_before = ocr_candidate_score(original)
    score_after = ocr_candidate_score(t)
    correction_gain = max(0.0, score_after - score_before)
    return t, correction_gain
def compute_region_confidence(raw_text, corrected_text, box_xyxy, region_type, image_bgr):
    """
    Blend text plausibility, visual features, and correction gain into a
    single confidence score in [0, 1] for a classified region.
    """
    feats = contour_features_for_box(image_bgr, box_xyxy)
    text_score = ocr_candidate_score(corrected_text)
    gain = max(0.0, text_score - ocr_candidate_score(raw_text))
    # Any recognized role earns a small flat bonus over "unknown".
    role_bonus = 0.08 if region_type in {"dialogue", "reaction", "narration", "sfx"} else 0.0
    score = (
        0.55 * text_score +
        0.15 * feats["whiteness_ratio"] +
        0.10 * (1.0 - min(1.0, feats["edge_density"] * 2.0)) +
        0.10 * gain +
        role_bonus
    )
    return max(0.0, min(1.0, score))
def build_region_flags(raw_text, corrected_text, region_type, conf):
    """Collect QA flags describing how trustworthy a region's text is."""
    checks = (
        ("REGION_UNKNOWN", region_type == "unknown"),
        ("SFX", region_type == "sfx"),
        ("LOW_CONF", conf < 0.45),
        ("HEAVY_CORRECTION", text_similarity(raw_text, corrected_text) < 0.75),
        ("LONG_TEXT", len(corrected_text.split()) > 22),
    )
    return [flag for flag, hit in checks if hit]
# ============================================================
# HELPERS
# ============================================================
def normalize_text(text: str) -> str:
    """Uppercase, map curly quotes/ellipsis to ASCII, and tighten whitespace."""
    result = (text or "").strip().upper()
    # Unicode punctuation -> ASCII equivalents.
    for fancy, plain in (("\u201c", "\""), ("\u201d", "\""),
                         ("\u2018", "'"), ("\u2019", "'"),
                         ("\u2026", "...")):
        result = result.replace(fancy, plain)
    # Whitespace / punctuation-spacing cleanup, applied in order.
    for pattern, repl in ((r"\s+", " "),
                          (r"\s+([,.;:!?])", r"\1"),
                          (r"([¡¿])\s+", r"\1"),
                          (r"\(\s+", "("),
                          (r"\s+\)", ")"),
                          (r"\.{4,}", "...")):
        result = re.sub(pattern, repl, result)
    return result.strip()
def postprocess_translation_general(text: str) -> str:
    """Final cleanup after translation: squeeze spaces, cap repeated punctuation."""
    cleaned = normalize_text(text)
    cleaned = re.sub(r"\s{2,}", " ", cleaned).strip()
    # Cap "!!!" / "???" runs at two marks, and long dot runs at an ellipsis.
    cleaned = re.sub(r"([!?]){3,}", r"\1\1", cleaned)
    return re.sub(r"\.{4,}", "...", cleaned)
def fix_common_ocr_errors(text: str) -> str:
    """Fix character-level OCR confusions: O-for-0 near digits, pipes, backticks."""
    fixed = re.sub(r'(\d)O(\d)', r'\g<1>0\g<2>', text)
    fixed = re.sub(r'(\d)O([^a-zA-Z])', r'\g<1>0\g<2>', fixed)
    return fixed.replace('|', 'I').replace('`', "'")
def is_valid_language(text: str, source_lang: str) -> bool:
    """
    Heuristically check that `text` is plausible for `source_lang`,
    based on the dominant script among its word characters.
    """
    if not text:
        return False
    clean = re.sub(r'[^\w]', '', text)
    if not clean:
        return False
    lang = source_lang.lower()
    latin_langs = {'en', 'english', 'es', 'spanish', 'fr', 'french',
                   'it', 'italian', 'ca', 'catalan', 'de', 'german'}
    if lang in latin_langs:
        # Any Arabic/CJK/Hangul character disqualifies Latin-script languages.
        foreign = re.findall(
            r'[\u0600-\u06FF\u0750-\u077F\u3040-\u30FF'
            r'\u3400-\u4DBF\u4E00-\u9FFF\uAC00-\uD7AF\u1100-\u11FF]',
            clean)
        if foreign:
            return False
        latin = len(re.findall(r'[a-zA-ZÀ-ÿ]', clean))
        total = len(clean)
        if total <= 3:
            return latin >= 1
        # Short strings need a higher Latin share than longer ones.
        threshold = 0.55 if total <= 6 else 0.45
        return (latin / total) >= threshold
    script_ranges = {
        'ja': r'[\u3040-\u30FF\u3400-\u4DBF\u4E00-\u9FFF]',
        'japanese': r'[\u3040-\u30FF\u3400-\u4DBF\u4E00-\u9FFF]',
        'ko': r'[\uAC00-\uD7AF\u1100-\u11FF]',
        'korean': r'[\uAC00-\uD7AF\u1100-\u11FF]',
        'zh': r'[\u4E00-\u9FFF\u3400-\u4DBF]',
        'chinese': r'[\u4E00-\u9FFF\u3400-\u4DBF]',
    }
    if lang in script_ranges:
        hits = len(re.findall(script_ranges[lang], clean))
        if len(clean) <= 3:
            return hits >= 1
        return (hits / len(clean)) >= 0.4
    # Unknown languages are accepted as-is.
    return True
# Tokens that are almost certainly OCR noise when they appear alone
# (stray single letters, consonant fragments, detached SFX shards).
_NOISE_TOKENS = {
    'P', 'F', 'N', 'M', 'X', 'Z', 'Q',
    'FN', 'PF', 'NM', 'XZ', 'FSHOO', 'GRRP',
}
# Short utterances that ARE meaningful despite their length; matched against
# normalized (uppercase) text, with and without trailing punctuation.
_MANGA_INTERJECTIONS = {
    'HUH', 'HUH?', 'HUH??', 'HUH?!',
    'OH', 'OH!', 'OOH', 'OOH!',
    'AH', 'AH!', 'UH', 'UH...',
    'HEY', 'HEY!',
    'EH', 'EH?',
    'WOW', 'WOW!',
    'YES', 'NO', 'NO!',
    'RUN', 'GO', 'GO!',
    'STOP', 'WAIT',
    'WHAT', 'WHAT?', 'WHAT?!',
    'WHY', 'WHY?',
    'HOW', 'HOW?',
    'OK', 'OK!', 'OKAY',
    'EEEEP', 'EEEP',
    'OMIGOSH',
    'BECKY', 'BECKY!',  # NOTE(review): series-specific names mixed in here
    'HMM', 'HMM...',
    'TSK', 'TCH',
    'GRRR', 'I', 'A',
    'FWUP', 'FWAP',
    'SHIVER',
    'RRRING',
    'MORNING', 'MORNING.',
}
def group_indices_into_vertical_columns(indices, ocr,
                                        x_tolerance_factor=1.4,
                                        min_vertical_span_factor=1.8):
    """
    Group OCR indices into vertical columns inside a box.

    A column is a set of quads with similar x-centers, ordered
    top-to-bottom; the columns themselves are returned left-to-right.

    NOTE: a previous version computed a vertical-span filter driven by
    `min_vertical_span_factor`, but both branches of the filter appended
    the column unconditionally (dead code). The parameter is kept for
    interface compatibility; every column is kept, exactly as before.
    """
    if not indices:
        return []
    items = []
    for i in indices:
        b = quad_bbox(ocr[i][0])
        cx = (b[0] + b[2]) / 2.0
        cy = (b[1] + b[3]) / 2.0
        w = max(1, b[2] - b[0])
        h = max(1, b[3] - b[1])
        items.append((i, b, cx, cy, w, h))
    med_w = float(np.median([it[4] for it in items]))
    x_tol = max(10.0, med_w * x_tolerance_factor)
    # Greedy x-center clustering: sweep left-to-right, attach each quad to
    # the first column whose running mean x-center is within tolerance.
    items_sorted = sorted(items, key=lambda x: x[2])
    columns = []
    for it in items_sorted:
        placed = False
        for col in columns:
            if abs(it[2] - col["xc"]) <= x_tol:
                col["members"].append(it)
                col["xc"] = float(np.mean([m[2] for m in col["members"]]))
                placed = True
                break
        if not placed:
            columns.append({"xc": it[2], "members": [it]})
    # Order members top -> bottom within each column; all columns are kept.
    clean_columns = []
    for col in columns:
        members = sorted(col["members"], key=lambda x: x[3])
        clean_columns.append([m[0] for m in members])
    # Order columns left -> right by mean x-center of their quads.
    clean_columns.sort(key=lambda grp: np.mean(
        [(quad_bbox(ocr[i][0])[0] + quad_bbox(ocr[i][0])[2]) / 2.0 for i in grp]))
    return clean_columns
def group_indices_into_horizontal_rows(indices, ocr, row_tol_factor=0.75):
    """
    Group OCR indices into horizontal rows inside a box.

    Quads are clustered by y-center (tolerance scales with median quad
    height) and each resulting row is ordered left-to-right.
    """
    if not indices:
        return []
    items = []
    for i in indices:
        b = quad_bbox(ocr[i][0])
        cx = (b[0] + b[2]) / 2.0
        cy = (b[1] + b[3]) / 2.0
        h = max(1, b[3] - b[1])
        items.append((i, b, cx, cy, h))
    med_h = float(np.median([it[4] for it in items])) if items else 10.0
    row_tol = max(6.0, med_h * row_tol_factor)
    # Sweep top-to-bottom, attaching each quad to the first row whose
    # running mean y-center is within tolerance.
    items.sort(key=lambda x: x[3])
    rows = []
    for it in items:
        placed = False
        for row in rows:
            if abs(it[3] - row["yc"]) <= row_tol:
                row["members"].append(it)
                row["yc"] = float(np.mean([m[3] for m in row["members"]]))
                placed = True
                break
        if not placed:
            rows.append({"yc": it[3], "members": [it]})
    groups = []
    for row in rows:
        members = sorted(row["members"], key=lambda x: x[2])
        groups.append([m[0] for m in members])
    return groups
def score_text_groups(groups, ocr):
    """
    Score grouping quality based on:
    - average group size
    - text plausibility
    - reduced fragmentation

    Higher is better; used to choose between horizontal and vertical
    grouping of the same indices.
    """
    if not groups:
        return 0.0
    texts = []
    lengths = []
    for grp in groups:
        parts = []
        for i in grp:
            t = normalize_text(ocr[i][1])
            if t:
                parts.append(t)
        txt = normalize_text(" ".join(parts))
        if txt:
            texts.append(txt)
            lengths.append(len(txt.split()))
    if not texts:
        return 0.0
    text_scores = [ocr_candidate_score(t) for t in texts]
    avg_text_score = float(np.mean(text_scores)) if text_scores else 0.0
    avg_len = float(np.mean(lengths)) if lengths else 0.0
    # More than 4 groups suggests over-fragmentation; penalize linearly.
    fragmentation_penalty = max(0.0, len(groups) - 4) * 0.08
    return avg_text_score + min(0.5, avg_len * 0.05) - fragmentation_penalty
def detect_internal_text_layout(indices, ocr, reading_mode="ltr"):
    """
    Detect internal structure of text inside one final box.
    Step 1: split into vertical macro blocks
            (split_indices_into_vertical_blocks is defined elsewhere in this file)
    Step 2: for each block, compare horizontal vs vertical grouping
            and keep whichever scores better (vertical wins ties within 0.03).

    Note: `reading_mode` is accepted but not used here; ordering by reading
    direction is applied later in build_text_from_layout.
    """
    if not indices:
        return {"mode": "horizontal", "blocks": []}
    blocks = split_indices_into_vertical_blocks(indices, ocr)
    resolved_blocks = []
    for block in blocks:
        horizontal_groups = group_indices_into_horizontal_rows(block, ocr)
        vertical_groups = group_indices_into_vertical_columns(block, ocr)
        h_score = score_text_groups(horizontal_groups, ocr)
        v_score = score_text_groups(vertical_groups, ocr)
        # Prefer vertical only when there are at least two columns and the
        # score is no worse than horizontal minus a small margin.
        if len(vertical_groups) >= 2 and v_score >= h_score - 0.03:
            resolved_blocks.append({
                "mode": "vertical",
                "groups": vertical_groups
            })
        else:
            resolved_blocks.append({
                "mode": "horizontal",
                "groups": horizontal_groups
            })
    return {"mode": "block-mixed", "blocks": resolved_blocks}
def build_text_from_layout(indices, ocr, reading_mode="ltr"):
    """
    Flatten a detected layout into ordered text lines.

    Horizontal blocks emit one line per row; vertical blocks emit one line
    per column, columns ordered by x-center (right-to-left when
    reading_mode == "rtl", as in Japanese manga), members top-to-bottom.
    """
    layout = detect_internal_text_layout(indices, ocr, reading_mode=reading_mode)
    output_lines = []
    for block in layout["blocks"]:
        groups = block["groups"]
        mode = block["mode"]
        if mode == "horizontal":
            for grp in groups:
                line = normalize_text(" ".join(
                    ocr[i][1] for i in grp if normalize_text(ocr[i][1])
                ))
                if line:
                    output_lines.append(line)
        elif mode == "vertical":
            # Column order follows the reading direction.
            if reading_mode == "rtl":
                groups = sorted(
                    groups,
                    key=lambda grp: np.mean([(quad_bbox(ocr[i][0])[0] + quad_bbox(ocr[i][0])[2]) / 2.0 for i in grp]),
                    reverse=True
                )
            else:
                groups = sorted(
                    groups,
                    key=lambda grp: np.mean([(quad_bbox(ocr[i][0])[0] + quad_bbox(ocr[i][0])[2]) / 2.0 for i in grp])
                )
            for grp in groups:
                # Within a column, read top-to-bottom by y-center.
                grp_sorted = sorted(grp, key=lambda i: (quad_bbox(ocr[i][0])[1] + quad_bbox(ocr[i][0])[3]) / 2.0)
                line = normalize_text(" ".join(
                    ocr[i][1] for i in grp_sorted if normalize_text(ocr[i][1])
                ))
                if line:
                    output_lines.append(line)
    return output_lines
# ============================================================
# REGION PROPOSAL FROM OCR GEOMETRY
# ============================================================
def propose_text_regions_from_ocr(ocr, image_shape):
    """
    Build larger text containers from OCR boxes before final classification.
    This is intentionally conservative: it clusters nearby OCR groups that
    likely belong to one dialogue/narration region.

    Uses union-find over all OCR box pairs; two boxes are merged when they
    form the same text line, are vertically stacked, or are simply near each
    other — and their orientations are compatible. Distance thresholds scale
    with the median OCR box height.

    Returns (region_lines, region_boxes, region_quads, region_indices),
    all keyed by a 1-based region id assigned in top-to-bottom order.
    """
    ih, iw = image_shape[:2]
    if not ocr:
        return {}, {}, {}, {}
    boxes = [quad_bbox(x[0]) for x in ocr]
    hs = [max(1, b[3] - b[1]) for b in boxes]
    med_h = float(np.median(hs)) if hs else 14.0
    # Union-find with path halving.
    parent = list(range(len(ocr)))
    def find(x):
        while parent[x] != x:
            parent[x] = parent[parent[x]]
            x = parent[x]
        return x
    def union(a, b):
        ra, rb = find(a), find(b)
        if ra != rb:
            parent[rb] = ra
    for i in range(len(ocr)):
        bi = boxes[i]
        for j in range(i + 1, len(ocr)):
            bj = boxes[j]
            dx = abs(xyxy_center(bi)[0] - xyxy_center(bj)[0])
            dy = abs(xyxy_center(bi)[1] - xyxy_center(bj)[1])
            hov = horizontal_overlap_ratio(bi, bj)
            vov = vertical_overlap_ratio(bi, bj)
            dist = box_distance(bi, bj)
            # Merge criteria, all scaled by median text height:
            same_band = dy <= med_h * 2.2
            stacked = hov >= 0.35 and dy <= med_h * 3.2
            same_line = vov >= 0.45 and dx <= med_h * 5.0
            near = dist <= med_h * 4.5
            if same_line or stacked or (near and (same_band or hov > 0.25)):
                if orientation_compatible(i, j, ocr):
                    union(i, j)
    groups = {}
    for i in range(len(ocr)):
        groups.setdefault(find(i), []).append(i)
    region_lines = {}
    region_boxes = {}
    region_quads = {}
    region_indices = {}
    next_id = 1
    # Assign region ids top-to-bottom for stable downstream ordering.
    for _, idxs in sorted(groups.items(), key=lambda kv: min(boxes[i][1] for i in kv[1])):
        idxs = sorted(idxs, key=lambda i: (boxes[i][1], boxes[i][0]))
        ub = boxes_union_xyxy([boxes[i] for i in idxs])
        if ub is None:
            continue
        region_lines[next_id] = build_lines_from_indices(idxs, ocr)
        region_boxes[next_id] = box_expand(ub, pad=max(2, int(med_h * 0.25)), iw=iw, ih=ih)
        region_quads[next_id] = [ocr[i][0] for i in idxs]
        region_indices[next_id] = idxs
        next_id += 1
    return region_lines, region_boxes, region_quads, region_indices
# ============================================================
# RECONCILE REGION-FIRST AND BUBBLE-FIRST GROUPS
# ============================================================
def reconcile_region_and_bubble_groups(region_lines, region_boxes, region_quads, region_indices,
                                       bubbles, bubble_boxes, bubble_quads, bubble_indices,
                                       ocr):
    """
    Reconcile region-first and bubble-first groupings.
    Strategy:
    - Build one combined candidate list from both grouping methods.
    - Cluster candidates that heavily overlap or share OCR indices.
    - Keep only the best-scoring candidate from each cluster.
    - Rebuild stable output dictionaries.
    This avoids duplicate retention and inconsistent greedy selection.

    Returns (lines, boxes, quads, indices) dicts keyed by fresh 1-based ids,
    ordered top-to-bottom then left-to-right.
    """
    combined = []
    for rid in region_boxes:
        combined.append(("region", rid, region_boxes[rid], region_indices[rid]))
    for bid in bubble_boxes:
        combined.append(("bubble", bid, bubble_boxes[bid], bubble_indices[bid]))
    if not combined:
        return {}, {}, {}, {}
    visited = set()
    kept = []
    def group_score(box, idxs):
        # Favor candidates with more OCR members, more words, plausible
        # text, a dialogue-like role, and (slightly) a larger area.
        text = normalize_text(" ".join(build_lines_from_indices(idxs, ocr)))
        role = region_text_role_hint(text)
        role_bonus = {
            "dialogue": 0.8,
            "narration": 0.75,
            "reaction": 0.7,
            "sfx": 0.2,
            "unknown": 0.1
        }.get(role, 0.1)
        box_area = bbox_area_xyxy(box)
        area_bonus = min(1.0, box_area / 50000.0)
        return (
            len(idxs) * 2.0 +
            min(20, len(text.split())) * 0.5 +
            min(1.0, ocr_candidate_score(text)) +
            role_bonus +
            area_bonus * 0.25
        )
    # Single-link clustering: any overlap >= 55% of the smaller box,
    # IoU >= 0.35, or a single shared OCR index joins two candidates.
    for i in range(len(combined)):
        if i in visited:
            continue
        cluster = [i]
        visited.add(i)
        _, _, box_i, idx_i = combined[i]
        for j in range(i + 1, len(combined)):
            if j in visited:
                continue
            _, _, box_j, idx_j = combined[j]
            ovs = boxes_overlap_ratio(box_i, box_j)
            iou = boxes_iou(box_i, box_j)
            shared = len(set(idx_i).intersection(idx_j))
            if ovs >= 0.55 or iou >= 0.35 or shared > 0:
                cluster.append(j)
                visited.add(j)
        best_idx = max(
            cluster,
            key=lambda k: group_score(combined[k][2], combined[k][3])
        )
        kept.append(combined[best_idx])
    # Stable order: top-to-bottom, then left-to-right
    kept.sort(key=lambda item: (
        (item[2][1] + item[2][3]) / 2.0,
        (item[2][0] + item[2][2]) / 2.0
    ))
    out_lines, out_boxes, out_quads, out_indices = {}, {}, {}, {}
    next_id = 1
    for typ, oid, box, idxs in kept:
        idxs = sorted(
            set(idxs),
            key=lambda k: (quad_bbox(ocr[k][0])[1], quad_bbox(ocr[k][0])[0])
        )
        out_lines[next_id] = build_lines_from_indices(idxs, ocr)
        out_boxes[next_id] = box
        out_quads[next_id] = [ocr[k][0] for k in idxs]
        out_indices[next_id] = idxs
        next_id += 1
    return out_lines, out_boxes, out_quads, out_indices
# ============================================================
# PROTECTED TOKENS / SHORT DIALOGUE SAFETY NET
# ============================================================
# Short utterances and character names that must never be dropped by the
# meaningfulness filters, even though they are only a few characters long.
PROTECTED_SHORT_TOKENS = {
    "HUH", "HUH?", "HUH??", "HUH?!",
    "OH", "OH!", "OOH", "OOH!",
    "AH", "AH!", "UH", "UH...",
    "HEY", "HEY!", "EH", "EH?",
    "WOW", "WOW!",
    "MORNING", "MORNING.",
    "BECKY", "BECKY!",
    "DAMIAN", "CECILE", "WALD",
    "OMIGOSH", "EEEP", "EEEEP"
}
# Character names recognized by the pipeline (series-specific).
KNOWN_NAMES = {
    "BECKY", "DAMIAN", "CECILE", "WALD"
}
def is_protected_token(text: str) -> bool:
    """Whether text is a short utterance/name that must never be filtered out."""
    token = normalize_text(text or "")
    if not token:
        return False
    if token in PROTECTED_SHORT_TOKENS:
        return True
    # Retry with punctuation stripped ("HUH?!" -> "HUH").
    stripped = re.sub(r'[^A-ZÀ-Ý]', '', token)
    return stripped in PROTECTED_SHORT_TOKENS
def maybe_conf_floor_for_protected(text: str, conf: float, floor: float = 0.40) -> float:
    """Raise confidence to at least `floor` for protected tokens; pass through otherwise."""
    return max(conf, floor) if is_protected_token(text) else conf
def is_meaningful_text(text: str, source_lang: str, min_alpha_chars: int = 2) -> bool:
    """
    Decide whether an OCR string is worth keeping/translating.

    Protected tokens and known interjections are always kept; known noise
    tokens are dropped; Latin-script languages additionally reject text
    that is mostly non-alphabetic, a single repeated character, or (when
    longer than 5 chars) entirely vowel-free.
    """
    if not text:
        return False
    t = text.strip()
    t_upper = normalize_text(t)
    # 1) Hard keep for protected tokens
    if is_protected_token(t_upper):
        return True
    t_alpha_only = re.sub(r'[^A-Za-zÀ-ÿ]', '', t_upper)
    if t_upper in _MANGA_INTERJECTIONS or t_alpha_only in _MANGA_INTERJECTIONS:
        return True
    alpha_count = sum(c.isalpha() for c in t)
    if alpha_count < min_alpha_chars:
        # allow short punctuated utterances like "Huh?"
        if re.fullmatch(r"[A-Za-zÀ-ÿ]{2,6}[!?\.]{0,3}", t.strip()):
            return True
        return False
    if t_upper in _NOISE_TOKENS:
        return False
    lang = source_lang.lower()
    if lang in ['en', 'english', 'es', 'spanish', 'fr', 'french',
                'it', 'italian', 'ca', 'catalan', 'de', 'german']:
        non_alpha = sum(not c.isalpha() for c in t)
        # slightly less aggressive than before
        if len(t) > 0 and (non_alpha / len(t)) > 0.72:
            return False
        # Repeated single character ("AAAA") is noise.
        if len(t) >= 3 and len(set(t_upper)) == 1:
            return False
    # NOTE(review): this repeats the Latin-language test above — redundant
    # but harmless; longer vowel-free strings are rejected as noise.
    if lang in ['en', 'english', 'es', 'spanish', 'fr', 'french',
                'it', 'italian', 'ca', 'catalan', 'de', 'german']:
        if len(t) > 5:
            vowels = len(re.findall(r'[AEIOUaeiouÀ-ÿ]', t))
            if vowels == 0:
                return False
    return True
def quad_bbox(quad):
    """Axis-aligned integer bounding box (x1, y1, x2, y2) of a 4-point quad."""
    xs = tuple(p[0] for p in quad)
    ys = tuple(p[1] for p in quad)
    return (int(min(xs)), int(min(ys)), int(max(xs)), int(max(ys)))
def quad_center(quad):
    """Center of a quad's axis-aligned (integer) bounding box."""
    xs = [p[0] for p in quad]
    ys = [p[1] for p in quad]
    x1, x2 = int(min(xs)), int(max(xs))
    y1, y2 = int(min(ys)), int(max(ys))
    return ((x1 + x2) / 2.0, (y1 + y2) / 2.0)
def boxes_union_xyxy(boxes):
    """Smallest box covering all given boxes; None entries ignored; None if empty."""
    valid = [b for b in boxes if b is not None]
    if not valid:
        return None
    xs1, ys1, xs2, ys2 = zip(*valid)
    return (int(min(xs1)), int(min(ys1)), int(max(xs2)), int(max(ys2)))
def bbox_area_xyxy(b):
    """Pixel area of an xyxy box (0 for None or degenerate boxes)."""
    if b is None:
        return 0
    w = b[2] - b[0]
    h = b[3] - b[1]
    if w < 0:
        w = 0
    if h < 0:
        h = 0
    return int(w * h)
def xyxy_to_xywh(b):
    """Convert (x1, y1, x2, y2) to an {"x","y","w","h"} dict; passes None through."""
    if b is None:
        return None
    x1, y1, x2, y2 = b
    return {
        "x": int(x1),
        "y": int(y1),
        "w": max(0, int(x2 - x1)),
        "h": max(0, int(y2 - y1)),
    }
def overlap_or_near(a, b, gap=0):
    """True if two xyxy boxes intersect or lie within `gap` px on both axes."""
    ax1, ay1, ax2, ay2 = a
    bx1, by1, bx2, by2 = b
    x_gap = max(ax1, bx1) - min(ax2, bx2)
    y_gap = max(ay1, by1) - min(ay2, by2)
    return max(0, x_gap) <= gap and max(0, y_gap) <= gap
def boxes_iou(a, b):
    """Intersection-over-union of two xyxy boxes (0.0 when disjoint)."""
    ix1, iy1 = max(a[0], b[0]), max(a[1], b[1])
    ix2, iy2 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0, ix2 - ix1) * max(0, iy2 - iy1)
    if inter == 0:
        return 0.0
    area_a = max(0, a[2] - a[0]) * max(0, a[3] - a[1])
    area_b = max(0, b[2] - b[0]) * max(0, b[3] - b[1])
    union = area_a + area_b - inter
    return inter / max(1, union)
def boxes_overlap_ratio(a, b):
    """Ratio of intersection to the SMALLER box area."""
    ix1, iy1 = max(a[0], b[0]), max(a[1], b[1])
    ix2, iy2 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0, ix2 - ix1) * max(0, iy2 - iy1)
    if inter == 0:
        return 0.0
    area_a = max(0, a[2] - a[0]) * max(0, a[3] - a[1])
    area_b = max(0, b[2] - b[0]) * max(0, b[3] - b[1])
    return inter / max(1, min(area_a, area_b))
def ocr_candidate_score(text: str) -> float:
    """
    Plausibility score in [0, 1] for an OCR string: rewards letters, spaces,
    and sentence punctuation; penalizes junk symbols, stray single capitals,
    and multi-digit runs.
    """
    if not text:
        return 0.0
    t = text.strip()
    n = len(t)
    if n == 0:
        return 0.0
    alpha_frac = sum(1 for c in t if c.isalpha()) / n
    space_frac = sum(1 for c in t if c.isspace()) / n
    good_punct = ".,!?'-:;()[]\"¡¿"
    punct_frac = sum(1 for c in t if c in good_punct) / n
    junk_frac = len(re.findall(r"[^\w\s\.\,\!\?\-\'\:\;\(\)\[\]\"¡¿]", t)) / n
    penalty = 0.0
    if re.search(r"\b[A-Z]\b", t):  # isolated single capitals smell like noise
        penalty += 0.05
    if re.search(r"[0-9]{2,}", t):  # digit runs are rare in dialogue
        penalty += 0.08
    raw = (0.62 * alpha_frac) + (0.10 * space_frac) + (0.20 * punct_frac) \
        - (0.45 * junk_frac) - penalty
    return min(1.0, max(0.0, raw))
def quad_is_horizontal(quad, ratio_threshold=1.5) -> bool:
    """True when the quad's bounding box is at least ratio_threshold x wider than tall."""
    xs = [p[0] for p in quad]
    ys = [p[1] for p in quad]
    w = max(1, int(max(xs)) - int(min(xs)))
    h = max(1, int(max(ys)) - int(min(ys)))
    return (w / h) >= ratio_threshold
def quad_is_vertical(quad, ratio_threshold=1.5) -> bool:
    """True when the quad's bounding box is at least ratio_threshold x taller than wide."""
    xs = [p[0] for p in quad]
    ys = [p[1] for p in quad]
    w = max(1, int(max(xs)) - int(min(xs)))
    h = max(1, int(max(ys)) - int(min(ys)))
    return (h / w) >= ratio_threshold
# ============================================================
# ENHANCED IMAGE PREPROCESSING
# ============================================================
def enhance_image_for_ocr(image_bgr, upscale_factor=2.5):
    """
    Preprocess a BGR image for OCR: upscale, denoise, boost local contrast
    (CLAHE), sharpen, adaptive-binarize, and close small gaps.

    Returns a 3-channel BGR image (binarized content replicated to all
    channels) at `upscale_factor` times the input resolution.
    """
    h, w = image_bgr.shape[:2]
    upscaled = cv2.resize(image_bgr, (int(w * upscale_factor), int(h * upscale_factor)),
                          interpolation=cv2.INTER_CUBIC)
    gray = cv2.cvtColor(upscaled, cv2.COLOR_BGR2GRAY)
    denoised = cv2.fastNlMeansDenoising(gray, None, h=10,
                                        templateWindowSize=7, searchWindowSize=21)
    clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
    enhanced = clahe.apply(denoised)
    # Standard 3x3 sharpening kernel.
    sharpened = cv2.filter2D(enhanced, -1,
                             np.array([[-1, -1, -1], [-1, 9, -1], [-1, -1, -1]]))
    binary = cv2.adaptiveThreshold(sharpened, 255,
                                   cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                   cv2.THRESH_BINARY, 11, 2)
    cleaned = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, np.ones((2, 2), np.uint8))
    return cv2.cvtColor(cleaned, cv2.COLOR_GRAY2BGR)
def detect_small_text_regions(image_bgr, existing_quads):
    """
    Find small candidate text regions OUTSIDE the already-detected quads.

    Masks out existing quads, Otsu-binarizes the remainder, and returns
    xyxy boxes of contours whose area (50..5000 px) and aspect ratio
    suggest isolated text fragments missed by the main OCR pass.
    """
    gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
    mask = np.zeros(gray.shape, dtype=np.uint8)
    for quad in existing_quads:
        cv2.fillPoly(mask, [np.array(quad, dtype=np.int32)], 255)
    mask_inv = cv2.bitwise_not(mask)
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    binary_masked = cv2.bitwise_and(binary, binary, mask=mask_inv)
    contours, _ = cv2.findContours(binary_masked, cv2.RETR_EXTERNAL,
                                   cv2.CHAIN_APPROX_SIMPLE)
    text_regions = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        area = w * h
        if 50 < area < 5000 and 0.1 < h / max(w, 1) < 10:
            text_regions.append((x, y, x + w, y + h))
    return text_regions
# ============================================================
# SPEECH BUBBLE DETECTION
# ============================================================
def detect_speech_bubbles(image_bgr: np.ndarray) -> List[np.ndarray]:
    """
    Detect candidate speech-bubble outlines as external contours of an
    adaptively-thresholded image; contours smaller than 500 px^2 are dropped.
    """
    gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
    thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                   cv2.THRESH_BINARY_INV, 11, 2)
    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    return [c for c in contours if cv2.contourArea(c) > 500]
def is_quad_in_bubble(quad_bbox_xyxy, bubble_contour, tolerance=5):
    """True when the box center lies inside (or within `tolerance` px of) the contour."""
    x1, y1, x2, y2 = quad_bbox_xyxy
    cx, cy = (x1 + x2) // 2, (y1 + y2) // 2
    # pointPolygonTest returns signed distance; >= -tolerance allows points
    # slightly outside the contour edge.
    return cv2.pointPolygonTest(bubble_contour, (float(cx), float(cy)), False) >= -tolerance
def split_indices_by_bubble(indices, ocr, bubble_contours):
    """
    Partition OCR indices by the speech bubble containing each quad's center.

    Indices whose quad falls in no bubble are collected into one trailing
    "outside" group. Returns a list of index lists.
    """
    if not indices:
        return []
    bubble_groups, outside_group = {}, []
    for idx in indices:
        bbox = quad_bbox(ocr[idx][0])
        found = False
        # First matching bubble wins.
        for bidx, bubble in enumerate(bubble_contours):
            if is_quad_in_bubble(bbox, bubble):
                bubble_groups.setdefault(bidx, []).append(idx)
                found = True
                break
        if not found:
            outside_group.append(idx)
    result = list(bubble_groups.values())
    if outside_group:
        result.append(outside_group)
    return result
def check_vertical_alignment_split(indices, ocr, threshold=20):
    """
    Split a group of OCR indices wherever the vertical gap between
    consecutive quads (sorted by top edge) exceeds `threshold` px.
    Returns a list of index groups, top-to-bottom.
    """
    if len(indices) <= 1:
        return [indices]
    items = sorted([(idx, quad_bbox(ocr[idx][0])) for idx in indices],
                   key=lambda x: x[1][1])
    groups, current_group = [], [items[0][0]]
    for i in range(1, len(items)):
        # Gap = current top edge minus previous bottom edge.
        if items[i][1][1] - items[i-1][1][3] > threshold:
            groups.append(current_group)
            current_group = [items[i][0]]
        else:
            current_group.append(items[i][0])
    if current_group:
        groups.append(current_group)
    return groups
# ============================================================
# QUAD SIZE VALIDATION AND SPLITTING
# ============================================================
def is_quad_oversized(quad, median_height, width_threshold=8.0):
    """True when a quad is suspiciously wide vs. typical text height (likely merged text)."""
    xs = [p[0] for p in quad]
    ys = [p[1] for p in quad]
    w = int(max(xs)) - int(min(xs))
    h = max(1, int(max(ys)) - int(min(ys)))
    return w > median_height * width_threshold or (w / h) > 12.0
def split_oversized_quad_by_content(image_bgr, quad, text, conf, median_height):
    """
    Try to split one oversized quad into two at the widest whitespace gap.

    Binarizes the quad's crop, projects ink vertically, and looks for a
    low-ink column run at least max(0.8 * median_height, 15) px wide. The
    text is split at the nearest space (or proportionally if none), and two
    new quads sharing the original conf are returned. Falls back to
    [(quad, text, conf)] unchanged when no usable gap or split exists.
    """
    x1, y1, x2, y2 = quad_bbox(quad)
    w, h = x2 - x1, max(1, y2 - y1)
    pad = 2
    roi = image_bgr[max(0,y1-pad):min(image_bgr.shape[0],y2+pad),
                    max(0,x1):min(image_bgr.shape[1],x2)]
    if roi.size == 0:
        return [(quad, text, conf)]
    gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    v_proj = np.sum(binary, axis=0)  # per-column ink mass
    gap_threshold = h * 255 * 0.20   # column counts as "gap" below 20% ink
    gaps, in_gap, gap_start = [], False, 0
    for x in range(len(v_proj)):
        if v_proj[x] < gap_threshold:
            if not in_gap: gap_start, in_gap = x, True
        else:
            if in_gap:
                gw = x - gap_start
                if gw >= max(int(median_height * 0.8), 15):
                    gaps.append((gap_start + gw // 2, gw))
                in_gap = False
    if not gaps:
        return [(quad, text, conf)]
    # Split at the widest gap.
    gaps.sort(key=lambda g: g[1], reverse=True)
    split_x_abs = max(0, x1) + gaps[0][0]
    if ' ' in text:
        # Map pixel split position to the nearest space in the text.
        char_w = w / max(1, len(text))
        split_idx = int((split_x_abs - x1) / max(1e-6, char_w))
        spaces = [i for i, c in enumerate(text) if c == ' ']
        if spaces:
            split_idx = min(spaces, key=lambda i: abs(i - split_idx))
        tl, tr = text[:split_idx].strip(), text[split_idx:].strip()
    else:
        split_idx = int(len(text) * (split_x_abs - x1) / w)
        tl, tr = text[:split_idx].strip(), text[split_idx:].strip()
    if tl and tr:
        return [([[x1,y1],[split_x_abs,y1],[split_x_abs,y2],[x1,y2]], tl, conf),
                ([[split_x_abs,y1],[x2,y1],[x2,y2],[split_x_abs,y2]], tr, conf)]
    return [(quad, text, conf)]
def validate_and_split_oversized_quads(image_bgr, filtered_ocr):
    """
    Scan OCR results for quads much wider than the median text height and
    split them on whitespace gaps when possible.

    Returns (new_ocr_list, splits_made).
    """
    if not filtered_ocr:
        return filtered_ocr, 0
    heights = [max(1, quad_bbox(q)[3] - quad_bbox(q)[1]) for q, _, _ in filtered_ocr]
    median_height = float(np.median(heights)) if heights else 14.0
    result, splits_made = [], 0
    for quad, text, conf in filtered_ocr:
        if is_quad_oversized(quad, median_height, 8.0):
            sr = split_oversized_quad_by_content(image_bgr, quad, text, conf, median_height)
            if len(sr) > 1:
                result.extend(sr); splits_made += 1
            else:
                result.append((quad, text, conf))
        else:
            result.append((quad, text, conf))
    return result, splits_made
# ============================================================
# HORIZONTAL GAP DETECTION AT QUAD LEVEL
# ============================================================
def detect_horizontal_gap_in_group(indices, ocr, med_h, gap_factor=2.5):
    """Split an index group at the widest horizontal gap between x-sorted quads.

    Returns (left_indices, right_indices) when a gap wider than
    med_h * gap_factor exists between consecutive quads, else None.
    """
    if len(indices) < 2:
        return None
    ordered = sorted(indices, key=lambda i: quad_center(ocr[i][0])[0])
    bxs = [quad_bbox(ocr[i][0]) for i in ordered]
    min_gap = med_h * gap_factor
    widest, cut = 0.0, None
    # Gap between right edge of one quad and left edge of the next.
    for k in range(len(bxs) - 1):
        gap = bxs[k + 1][0] - bxs[k][2]
        if gap > min_gap and gap > widest:
            widest, cut = gap, k
    if cut is None:
        return None
    left, right = ordered[:cut + 1], ordered[cut + 1:]
    if not left or not right:
        return None
    return (left, right)
def orientation_compatible(idx_a, idx_b, ocr):
    """Return False when one quad is strongly vertical and the other strongly horizontal."""
    ratios = []
    for idx in (idx_a, idx_b):
        bx1, by1, bx2, by2 = quad_bbox(ocr[idx][0])
        ratios.append(max(1, bx2 - bx1) / max(1, by2 - by1))
    ra, rb = ratios
    # Incompatible: one aspect ratio clearly tall (< 0.6), the other clearly wide (> 2.0).
    clash = (ra < 0.6 and rb > 2.0) or (rb < 0.6 and ra > 2.0)
    return not clash
# ============================================================
# WIDE QUAD COLUMN SPLIT — pre-grouping
# ============================================================
def split_wide_quad_by_column_gap(image_bgr, quad, text, conf, med_h,
                                  min_gap_factor=1.8):
    """Split one wide OCR quad in two at its widest whitespace column.

    Binarizes the quad's pixels, scans the vertical ink projection for the
    widest low-ink run of at least med_h * min_gap_factor pixels, then cuts
    both the quad geometry and the text string at that column.

    Returns a list with the original item (no split) or two (left, right) items.
    """
    x1, y1, x2, y2 = quad_bbox(quad)
    w, h = x2 - x1, max(1, y2 - y1)
    # Quads narrower than three line heights cannot hold two columns.
    if w < med_h * 3.0:
        return [(quad, text, conf)]
    pad = 2
    roi = image_bgr[max(0,y1-pad):min(image_bgr.shape[0],y2+pad),
                    max(0,x1):min(image_bgr.shape[1],x2)]
    if roi.size == 0:
        return [(quad, text, conf)]
    gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    # Column-wise ink mass; columns below gap_threshold count as whitespace.
    v_proj = np.sum(binary, axis=0)
    gap_threshold = h * 255 * 0.12
    min_gap_px = max(int(med_h * min_gap_factor), 10)
    gaps, in_gap, gap_start = [], False, 0
    for x in range(len(v_proj)):
        if v_proj[x] < gap_threshold:
            if not in_gap: gap_start, in_gap = x, True
        else:
            if in_gap:
                gw = x - gap_start
                if gw >= min_gap_px:
                    gaps.append((gap_start + gw // 2, gw))  # (center, width)
                in_gap = False
    if not gaps:
        return [(quad, text, conf)]
    # Widest gap wins.
    gaps.sort(key=lambda g: g[1], reverse=True)
    split_x_rel = gaps[0][0]
    split_x_abs = x1 + split_x_rel
    # Reject cuts leaving a sliver narrower than one line height.
    if split_x_abs - x1 < med_h or x2 - split_x_abs < med_h:
        return [(quad, text, conf)]
    if ' ' in text:
        # Map the pixel cut to the nearest space in the text.
        char_w = w / max(1, len(text))
        split_idx = int(split_x_rel / max(1e-6, char_w))
        spaces = [i for i, c in enumerate(text) if c == ' ']
        if spaces:
            split_idx = min(spaces, key=lambda i: abs(i - split_idx))
        tl, tr = text[:split_idx].strip(), text[split_idx:].strip()
    else:
        # No spaces: split the string proportionally to the pixel position.
        split_idx = int(len(text) * split_x_rel / w)
        tl, tr = text[:split_idx].strip(), text[split_idx:].strip()
    if tl and tr:
        return [([[x1,y1],[split_x_abs,y1],[split_x_abs,y2],[x1,y2]], tl, conf),
                ([[split_x_abs,y1],[x2,y1],[x2,y2],[split_x_abs,y2]], tr, conf)]
    return [(quad, text, conf)]
def apply_column_gap_splits(image_bgr, ocr_list, med_h):
    """Run the column-gap splitter over every OCR item before grouping.

    Returns (new_ocr_list, number_of_items_that_were_split).
    """
    out = []
    n_split = 0
    for quad, text, conf in ocr_list:
        pieces = split_wide_quad_by_column_gap(image_bgr, quad, text, conf, med_h)
        if len(pieces) > 1:
            n_split += 1
        out.extend(pieces)
    if n_split:
        print(f"📐 Column-gap split: {n_split} wide quad(s) split before grouping")
    return out, n_split
# ============================================================
# GENERALIZED BOX FIXING FUNCTIONS
# ============================================================
def detect_and_split_multi_bubble_boxes(bubble_boxes, bubble_indices, bubble_quads,
                                        bubbles, ocr, image_bgr):
    """Split grouped boxes that actually span more than one speech bubble.

    For each box holding 2+ OCR quads, three strategies are tried in order:
      1. contour split  — quads fall inside different detected bubble contours;
      2. vertical split — quad rows separated by a gap > 2 line heights;
      3. horizontal split — very wide boxes (> 10 line heights) whose quads
         form two x clusters separated by > 1.5 line heights.
    Output dicts are re-numbered from 1.

    Returns (bubbles, bubble_boxes, bubble_quads, bubble_indices).
    """
    all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1])
             for i in range(len(ocr))]
    med_h = float(np.median(all_h)) if all_h else 14.0  # fallback scale for empty OCR
    bubble_contours = detect_speech_bubbles(image_bgr)
    new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {}
    next_bid, splits_made = 1, []
    for bid, indices in bubble_indices.items():
        # Single-quad boxes cannot span multiple bubbles — keep unchanged.
        if len(indices) < 2:
            new_bubbles[next_bid] = bubbles[bid]
            new_boxes[next_bid] = bubble_boxes[bid]
            new_quads[next_bid] = bubble_quads[bid]
            new_indices[next_bid] = indices
            next_bid += 1
            continue
        # Strategy 1: assign quads by speech-bubble contour membership.
        split_groups = split_indices_by_bubble(indices, ocr, bubble_contours)
        if len(split_groups) > 1:
            for group in split_groups:
                if group:
                    new_bubbles[next_bid] = build_lines_from_indices(group, ocr)
                    new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in group])
                    new_quads[next_bid] = [ocr[i][0] for i in group]
                    new_indices[next_bid] = group
                    next_bid += 1
            splits_made.append(f"BOX#{bid}{len(split_groups)} bubbles")
            continue
        # Strategy 2: split at large vertical alignment gaps.
        vertical_splits = check_vertical_alignment_split(indices, ocr,
                                                         threshold=int(med_h * 2.0))
        if len(vertical_splits) > 1:
            for group in vertical_splits:
                if group:
                    new_bubbles[next_bid] = build_lines_from_indices(group, ocr)
                    new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in group])
                    new_quads[next_bid] = [ocr[i][0] for i in group]
                    new_indices[next_bid] = group
                    next_bid += 1
            splits_made.append(f"BOX#{bid}{len(vertical_splits)} vertical groups")
            continue
        # Strategy 3: very wide box — try a median-x two-cluster split.
        box = bubble_boxes[bid]
        x1, y1, x2, y2 = box
        if (x2 - x1) > med_h * 10:
            x_centers = [quad_center(ocr[i][0])[0] for i in indices]
            x_median = np.median(x_centers)
            left_group = [i for i in indices if quad_center(ocr[i][0])[0] < x_median]
            right_group = [i for i in indices if quad_center(ocr[i][0])[0] >= x_median]
            if left_group and right_group:
                left_box = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in left_group])
                right_box = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in right_group])
                # Only split when the two clusters are clearly separated.
                if right_box[0] - left_box[2] > med_h * 1.5:
                    for grp in [left_group, right_group]:
                        new_bubbles[next_bid] = build_lines_from_indices(grp, ocr)
                        new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in grp])
                        new_quads[next_bid] = [ocr[i][0] for i in grp]
                        new_indices[next_bid] = grp
                        next_bid += 1
                    splits_made.append(f"BOX#{bid} → 2 horizontal panels")
                    continue
        # No strategy applied — keep the box as-is.
        new_bubbles[next_bid] = bubbles[bid]
        new_boxes[next_bid] = bubble_boxes[bid]
        new_quads[next_bid] = bubble_quads[bid]
        new_indices[next_bid] = indices
        next_bid += 1
    if splits_made:
        print(f"\n🔧 Split {len(splits_made)} multi-bubble box(es):")
        for s in splits_made: print(f"{s}")
    return new_bubbles, new_boxes, new_quads, new_indices
def detect_and_merge_fragmented_bubbles(bubble_boxes, bubble_indices, bubble_quads,
                                        bubbles, ocr, image_bgr):
    """Merge boxes whose centers sit inside the same detected speech-bubble contour.

    Two boxes are merge candidates when both centers test inside one contour
    and their centers lie within 3 line-heights horizontally and 6 vertically.
    Candidate pairs are clustered into connected components with a union-find.

    BUG FIX: the previous first-match grouping attached a pair that bridged
    two existing groups to only one of them, leaving the other group alive
    and duplicating that box's text in two output bubbles.  Union-find
    guarantees each box id ends up in exactly one merged group.

    Returns re-numbered (bubbles, bubble_boxes, bubble_quads, bubble_indices).
    """
    all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1])
             for i in range(len(ocr))]
    med_h = float(np.median(all_h)) if all_h else 14.0
    bubble_contours = detect_speech_bubbles(image_bgr)
    bids = list(bubble_boxes.keys())
    to_merge = []
    for i in range(len(bids)):
        for j in range(i + 1, len(bids)):
            bid_i, bid_j = bids[i], bids[j]
            box_i, box_j = bubble_boxes[bid_i], bubble_boxes[bid_j]
            cx_i = (box_i[0] + box_i[2]) / 2.0
            cy_i = (box_i[1] + box_i[3]) / 2.0
            cx_j = (box_j[0] + box_j[2]) / 2.0
            cy_j = (box_j[1] + box_j[3]) / 2.0
            in_same_bubble = any(
                cv2.pointPolygonTest(c, (cx_i, cy_i), False) >= 0 and
                cv2.pointPolygonTest(c, (cx_j, cy_j), False) >= 0
                for c in bubble_contours
            )
            if in_same_bubble:
                # Proximity guard: reject far-apart detections that merely
                # happen to share one large contour.
                if abs(cx_i - cx_j) < med_h * 3.0 and abs(cy_i - cy_j) < med_h * 6.0:
                    to_merge.append((bid_i, bid_j) if cy_i < cy_j else (bid_j, bid_i))
    if not to_merge:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices
    print(f"\n🔗 Merging {len(to_merge)} fragmented bubble(s):")
    # Union-find over candidate pairs -> true connected components.
    parent = {}
    def _find(x):
        parent.setdefault(x, x)
        root = x
        while parent[root] != root:
            root = parent[root]
        while parent[x] != root:  # path compression
            parent[x], x = root, parent[x]
        return root
    for top, bottom in to_merge:
        ra, rb = _find(top), _find(bottom)
        if ra != rb:
            parent[rb] = ra
    components = {}
    for bid in parent:
        components.setdefault(_find(bid), set()).add(bid)
    new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {}
    merged_bids, next_bid = set(), 1
    for merge_set in components.values():
        merge_list = sorted(merge_set)
        print(f" ✓ Merging: {', '.join(f'#{b}' for b in merge_list)}")
        all_indices = sorted(set(idx for b in merge_list for idx in bubble_indices[b]))
        merged_bids.update(merge_list)
        new_bubbles[next_bid] = build_lines_from_indices(all_indices, ocr)
        new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in all_indices])
        new_quads[next_bid] = [ocr[i][0] for i in all_indices]
        new_indices[next_bid] = all_indices
        next_bid += 1
    # Untouched boxes keep their content, re-numbered after the merges.
    for bid in bids:
        if bid not in merged_bids:
            new_bubbles[next_bid] = bubbles[bid]
            new_boxes[next_bid] = bubble_boxes[bid]
            new_quads[next_bid] = bubble_quads[bid]
            new_indices[next_bid] = bubble_indices[bid]
            next_bid += 1
    return new_bubbles, new_boxes, new_quads, new_indices
def merge_boxes_by_proximity_and_overlap(bubble_boxes, bubble_indices, bubble_quads,
                                         bubbles, ocr, med_h):
    """
    Merges boxes that are vertically close AND share significant horizontal overlap.
    Single-quad boxes participate fully — no isolation treatment.
    This fixes BOX#2+#16, BOX#8+#21, BOX#9+#22 type problems where a
    single-line detection sits directly above/below a multi-line box in the
    same speech bubble.
    Merge criteria (both must be true):
    1. Vertical gap ≤ 1.5 × med_h
    2. Horizontal overlap ratio ≥ 0.35

    Returns re-numbered (bubbles, bubble_boxes, bubble_quads, bubble_indices).
    """
    bids = sorted(bubble_boxes.keys())
    merge_map: Dict[int, List[int]] = {}   # root bid -> all bids merged into it
    merged_into: Dict[int, int] = {}       # bid -> root bid it was absorbed by
    for i, bid_i in enumerate(bids):
        # A box already absorbed by an earlier root cannot start a new group.
        if bid_i in merged_into:
            continue
        box_i = bubble_boxes[bid_i]
        wi = max(1, box_i[2] - box_i[0])
        for j in range(i + 1, len(bids)):
            bid_j = bids[j]
            if bid_j in merged_into:
                continue
            box_j = bubble_boxes[bid_j]
            wj = max(1, box_j[2] - box_j[0])
            # Gap between the boxes' vertical extents (0 when they overlap).
            vert_gap = max(0, max(box_i[1], box_j[1]) - min(box_i[3], box_j[3]))
            h_ix1 = max(box_i[0], box_j[0])
            h_ix2 = min(box_i[2], box_j[2])
            h_overlap = max(0, h_ix2 - h_ix1)
            # Overlap relative to the narrower box.
            h_overlap_ratio = h_overlap / max(1, min(wi, wj))
            if vert_gap <= med_h * 1.5 and h_overlap_ratio >= 0.35:
                root = merged_into.get(bid_i, bid_i)
                merge_map.setdefault(root, [root])
                if bid_j not in merge_map[root]:
                    merge_map[root].append(bid_j)
                merged_into[bid_j] = root
    if not merge_map:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices
    print(f"\n🔀 Proximity+overlap merge: {len(merge_map)} group(s):")
    new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {}
    processed, next_bid = set(), 1
    for root, group in merge_map.items():
        group_unique = sorted(set(group))
        print(f" ✓ Merging: {', '.join(f'#{b}' for b in group_unique)}")
        all_indices = sorted(set(idx for b in group_unique for idx in bubble_indices[b]))
        new_bubbles[next_bid] = build_lines_from_indices(all_indices, ocr)
        new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in all_indices])
        new_quads[next_bid] = [ocr[i][0] for i in all_indices]
        new_indices[next_bid] = all_indices
        next_bid += 1
        processed.update(group_unique)
    # Untouched boxes keep their content, re-numbered after the merges.
    for bid in bids:
        if bid not in processed:
            new_bubbles[next_bid] = bubbles[bid]
            new_boxes[next_bid] = bubble_boxes[bid]
            new_quads[next_bid] = bubble_quads[bid]
            new_indices[next_bid] = bubble_indices[bid]
            next_bid += 1
    return new_bubbles, new_boxes, new_quads, new_indices
def auto_fix_bubble_detection(bubble_boxes, bubble_indices, bubble_quads,
                              bubbles, ocr, image_bgr):
    """
    Full fix pipeline:
    1. Split boxes that span multiple speech bubbles.
    2. Merge fragments detected inside the same contour.
    3. Merge fragments missed by contour detection (proximity+overlap) — pass 1.
    4. Second proximity pass — catches chains resolved after pass 1.
    """
    print("\n🔍 Running automatic bubble detection fixes...")
    heights = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1])
               for i in range(len(ocr))]
    med_h = float(np.median(heights)) if heights else 14.0
    # Step 1: splits must run before any merging.
    bubbles, bubble_boxes, bubble_quads, bubble_indices = \
        detect_and_split_multi_bubble_boxes(
            bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr, image_bgr)
    # Step 2: contour-based fragment merging.
    bubbles, bubble_boxes, bubble_quads, bubble_indices = \
        detect_and_merge_fragmented_bubbles(
            bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr, image_bgr)
    # Steps 3+4: two proximity passes — the second catches chains that only
    # become adjacent once the first pass has collapsed their neighbours.
    for _pass in range(2):
        bubbles, bubble_boxes, bubble_quads, bubble_indices = \
            merge_boxes_by_proximity_and_overlap(
                bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr, med_h)
    return bubbles, bubble_boxes, bubble_quads, bubble_indices
def remove_nested_boxes(bubble_boxes, bubble_indices, bubble_quads, bubbles,
                        overlap_threshold=0.50):
    """Drop boxes that heavily overlap — or share OCR indices with — a larger box.

    The smaller box of each conflicting pair is removed in place from all
    four dicts.  Returns (bubbles, bubble_boxes, bubble_quads, bubble_indices).
    """
    def _area(b):
        return max(0, b[2] - b[0]) * max(0, b[3] - b[1])
    bids = list(bubble_boxes.keys())
    doomed = set()
    for i, bid_i in enumerate(bids):
        if bid_i in doomed:
            continue
        box_i = bubble_boxes[bid_i]
        for bid_j in bids[i + 1:]:
            if bid_j in doomed:
                continue
            box_j = bubble_boxes[bid_j]
            shares = set(bubble_indices[bid_i]).intersection(bubble_indices[bid_j])
            if boxes_overlap_ratio(box_i, box_j) > overlap_threshold or len(shares) > 0:
                if _area(box_i) >= _area(box_j):
                    doomed.add(bid_j)
                    print(f" 🗑️ Removing BOX#{bid_j} (overlaps BOX#{bid_i})")
                else:
                    doomed.add(bid_i)
                    print(f" 🗑️ Removing BOX#{bid_i} (overlaps BOX#{bid_j})")
                    # bid_i is gone; stop comparing it against further boxes.
                    break
    if doomed:
        print(f"\n🧹 Removed {len(doomed)} overlapping/nested box(es)")
    for bid in doomed:
        bubble_boxes.pop(bid, None)
        bubble_indices.pop(bid, None)
        bubble_quads.pop(bid, None)
        bubbles.pop(bid, None)
    return bubbles, bubble_boxes, bubble_quads, bubble_indices
def enforce_max_box_size(bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr,
                         max_width_ratio=0.6, max_height_ratio=0.5, image_shape=None):
    """Split boxes larger than a fixed fraction of the page.

    A box wider than max_width_ratio * page width or taller than
    max_height_ratio * page height is split by columns first (with
    aggressive thresholds), then by rows; boxes that resist both are kept.
    Output dicts are re-numbered from 1.  No-op when image_shape is None.

    Returns (bubbles, bubble_boxes, bubble_quads, bubble_indices).
    """
    if image_shape is None:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices
    ih, iw = image_shape[:2]
    max_width, max_height = iw * max_width_ratio, ih * max_height_ratio
    new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {}
    next_bid, splits_made = 1, []
    for bid, box in bubble_boxes.items():
        x1, y1, x2, y2 = box
        w, h = x2 - x1, y2 - y1
        if w > max_width or h > max_height:
            indices = bubble_indices[bid]
            # Try a left/right column split first.
            col_split = split_bubble_if_multiple_columns(indices, ocr, bid=bid,
                                                         use_aggressive_thresholds=True)
            if col_split:
                for grp in col_split:
                    new_bubbles[next_bid] = build_lines_from_indices(grp, ocr)
                    new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in grp])
                    new_quads[next_bid] = [ocr[i][0] for i in grp]
                    new_indices[next_bid] = grp
                    next_bid += 1
                splits_made.append(f"BOX#{bid} (oversized: {w}x{h}px)")
                continue
            # Fall back to a top/bottom row split.
            row_split = split_bubble_if_multiple_rows(indices, ocr, bid=bid)
            if row_split:
                for grp in row_split:
                    new_bubbles[next_bid] = build_lines_from_indices(grp, ocr)
                    new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in grp])
                    new_quads[next_bid] = [ocr[i][0] for i in grp]
                    new_indices[next_bid] = grp
                    next_bid += 1
                splits_made.append(f"BOX#{bid} (oversized: {w}x{h}px)")
                continue
        # Within limits (or unsplittable): keep the box as-is.
        new_bubbles[next_bid] = bubbles[bid]
        new_boxes[next_bid] = box
        new_quads[next_bid] = bubble_quads[bid]
        new_indices[next_bid] = bubble_indices[bid]
        next_bid += 1
    if splits_made:
        print(f"\n📏 Split {len(splits_made)} oversized box(es):")
        for s in splits_made: print(f"{s}")
    return new_bubbles, new_boxes, new_quads, new_indices
def should_merge_groups(group1_indices, group2_indices, ocr, median_height,
                        max_vertical_gap=None):
    """Return True when two index groups are column-aligned and vertically close.

    max_vertical_gap defaults to 2.5 × median_height.
    """
    if max_vertical_gap is None:
        max_vertical_gap = median_height * 2.5
    b1 = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in group1_indices])
    b2 = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in group2_indices])
    if b1 is None or b2 is None:
        return False
    # Horizontal centre distance must stay within 1.8 line heights.
    center_dx = abs((b1[0] + b1[2]) - (b2[0] + b2[2])) / 2.0
    if center_dx > median_height * 1.8:
        return False
    # Vertical gap between extents (0 when the boxes overlap vertically).
    gap = max(0, max(b1[1], b2[1]) - min(b1[3], b2[3]))
    return gap <= max_vertical_gap
# ============================================================
# ENHANCED OCR ENGINE
# ============================================================
class ImprovedMacVisionDetector:
    """Multi-pass OCR engine built on Apple's Vision framework.

    Runs VNRecognizeTextRequest over several preprocessed variants of the
    page (each upscaled 2.5x), then merges overlapping detections with
    confidence-weighted voting on the normalized text.
    """
    def __init__(self, source_lang="en"):
        """source_lang: short code or English language name; unknown values fall back to en-US."""
        lang_key = source_lang.lower().strip()
        # Map short/long language names to Vision's BCP-47 identifiers.
        lang_map = {
            "en": "en-US", "english": "en-US",
            "es": "es-ES", "spanish": "es-ES",
            "ca": "ca-ES", "catalan": "ca-ES",
            "fr": "fr-FR", "french": "fr-FR",
            "ja": "ja-JP", "japanese": "ja-JP",
            "it": "it-IT", "italian": "it-IT",
            "de": "de-DE", "german": "de-DE",
            "ko": "ko-KR", "korean": "ko-KR",
            "zh": "zh-Hans", "chinese": "zh-Hans"
        }
        self.langs = [lang_map.get(lang_key, "en-US")]
        print(f"⚡ Using Enhanced Apple Vision OCR (Language: {self.langs[0]})")
    def preprocess_variants(self, image_bgr):
        """Return (name, image) preprocessing variants; each one upscaled 2.5x."""
        variants = [("enhanced", enhance_image_for_ocr(image_bgr, upscale_factor=2.5))]
        gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
        # Otsu-binarized variant for low-contrast pages.
        _, hc = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        variants.append(("high_contrast",
                         cv2.cvtColor(cv2.resize(hc, None, fx=2.5, fy=2.5,
                                                 interpolation=cv2.INTER_CUBIC),
                                      cv2.COLOR_GRAY2BGR)))
        # Edge-preserving denoised variant.
        variants.append(("bilateral",
                         cv2.resize(cv2.bilateralFilter(image_bgr, 9, 75, 75),
                                    None, fx=2.5, fy=2.5, interpolation=cv2.INTER_CUBIC)))
        # Inverted variant catches light-on-dark text.
        variants.append(("inverted",
                         cv2.resize(cv2.bitwise_not(image_bgr),
                                    None, fx=2.5, fy=2.5, interpolation=cv2.INTER_CUBIC)))
        variants.append(("original",
                         cv2.resize(image_bgr, None, fx=2.5, fy=2.5,
                                    interpolation=cv2.INTER_CUBIC)))
        return variants
    def run_vision_ocr(self, image_bgr):
        """Run one Vision pass; returns [(quad, text, confidence), ...] in pixel coords."""
        if image_bgr is None or image_bgr.size == 0:
            return []
        ih, iw = image_bgr.shape[:2]
        success, buffer = cv2.imencode('.png', image_bgr)
        if not success:
            return []
        ns_data = NSData.dataWithBytes_length_(buffer.tobytes(), len(buffer.tobytes()))
        handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(ns_data, None)
        results = []
        def completion_handler(request, error):
            # Invoked by performRequests_error_; collects into `results`.
            if error: return
            for obs in request.results():
                candidate = obs.topCandidates_(1)[0]
                text, conf = candidate.string(), candidate.confidence()
                bbox = obs.boundingBox()
                # Vision boxes are normalized with a bottom-left origin;
                # convert to top-left pixel coordinates.
                x = bbox.origin.x * iw
                y_bl = bbox.origin.y * ih
                w = bbox.size.width * iw
                h = bbox.size.height * ih
                y = ih - y_bl - h
                quad = [[int(x),int(y)],[int(x+w),int(y)],
                        [int(x+w),int(y+h)],[int(x),int(y+h)]]
                results.append((quad, text, conf))
        req = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(completion_handler)
        req.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate)
        # Language correction disabled: voting over variants happens downstream.
        req.setUsesLanguageCorrection_(False)
        req.setRecognitionLanguages_(self.langs)
        req.setAutomaticallyDetectsLanguage_(True)
        handler.performRequests_error_([req], None)
        return results
    def merge_multi_pass_results(self, all_results, original_shape):
        """Cluster overlapping detections from all variants and vote on the text.

        original_shape is currently unused; kept for API compatibility.
        """
        if not all_results:
            return []
        scale_factor = 2.5  # every variant was upscaled by this factor
        normalized = []
        for variant_name, results in all_results:
            for quad, text, conf in results:
                # Scale the quad back down to original-image coordinates.
                sq = [[int(p[0]/scale_factor), int(p[1]/scale_factor)] for p in quad]
                normalized.append((sq, text, conf, variant_name))
        def quads_overlap(q1, q2, threshold=0.5):
            # IoU test on the quads' axis-aligned bounding boxes.
            b1, b2 = quad_bbox(q1), quad_bbox(q2)
            x1, y1 = max(b1[0],b2[0]), max(b1[1],b2[1])
            x2, y2 = min(b1[2],b2[2]), min(b1[3],b2[3])
            if x2 < x1 or y2 < y1: return False
            inter = (x2-x1)*(y2-y1)
            union = ((b1[2]-b1[0])*(b1[3]-b1[1]) +
                     (b2[2]-b2[0])*(b2[3]-b2[1]) - inter)
            return inter / max(union, 1) > threshold
        # Greedy clustering: each unused detection seeds a cluster of overlaps.
        clusters, used = [], set()
        for i, (q1, t1, c1, v1) in enumerate(normalized):
            if i in used: continue
            cluster = [(q1, t1, c1, v1)]
            used.add(i)
            for j, (q2, t2, c2, v2) in enumerate(normalized):
                if j in used or i == j: continue
                if quads_overlap(q1, q2):
                    cluster.append((q2, t2, c2, v2))
                    used.add(j)
            clusters.append(cluster)
        final_results = []
        for cluster in clusters:
            # Highest-confidence member supplies quad/conf; the text may be
            # overruled by confidence-weighted voting on normalized strings.
            cluster.sort(key=lambda x: x[2], reverse=True)
            best_quad, best_text, best_conf, _ = cluster[0]
            text_votes = {}
            for _, text, conf, _ in cluster:
                n = normalize_text(text)
                if n: text_votes[n] = text_votes.get(n, 0) + conf
            if text_votes:
                voted = max(text_votes.items(), key=lambda x: x[1])[0]
                if voted != normalize_text(best_text):
                    best_text = voted
            final_results.append((best_quad, fix_common_ocr_errors(best_text), best_conf))
        return final_results
    def read(self, image_path_or_array):
        """OCR an image path or BGR array; returns merged [(quad, text, conf), ...]."""
        img = cv2.imread(image_path_or_array) if isinstance(image_path_or_array, str) \
            else image_path_or_array
        if img is None or img.size == 0:
            return []
        variants = self.preprocess_variants(img)
        all_results = []
        for vname, vimg in variants:
            r = self.run_vision_ocr(vimg)
            if r: all_results.append((vname, r))
        return self.merge_multi_pass_results(all_results, img.shape)
class MacVisionDetector:
    """Single-pass OCR wrapper around Apple's Vision framework.

    Unlike ImprovedMacVisionDetector this runs one request on the raw image
    with Vision's built-in language correction enabled.
    """
    def __init__(self, source_lang="en"):
        """source_lang: short code or English language name; unknown values fall back to en-US."""
        lang_key = source_lang.lower().strip()
        # Map short/long language names to Vision's BCP-47 identifiers.
        lang_map = {
            "en": "en-US", "english": "en-US",
            "es": "es-ES", "spanish": "es-ES",
            "ca": "ca-ES", "catalan": "ca-ES",
            "fr": "fr-FR", "french": "fr-FR",
            "ja": "ja-JP", "japanese": "ja-JP",
            "it": "it-IT", "italian": "it-IT",
            "de": "de-DE", "german": "de-DE",
            "ko": "ko-KR", "korean": "ko-KR",
            "zh": "zh-Hans", "chinese": "zh-Hans"
        }
        self.langs = [lang_map.get(lang_key, "en-US")]
        print(f"⚡ Using Apple Vision OCR (Language: {self.langs[0]})")
    def read(self, image_path_or_array):
        """OCR an image path or BGR array; returns [(quad, text, confidence), ...]."""
        img = cv2.imread(image_path_or_array) if isinstance(image_path_or_array, str) \
            else image_path_or_array
        if img is None or img.size == 0:
            return []
        ih, iw = img.shape[:2]
        success, buffer = cv2.imencode('.png', img)
        if not success: return []
        ns_data = NSData.dataWithBytes_length_(buffer.tobytes(), len(buffer.tobytes()))
        handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(ns_data, None)
        results = []
        def completion_handler(request, error):
            # Invoked by performRequests_error_; collects into `results`.
            if error: return
            for obs in request.results():
                candidate = obs.topCandidates_(1)[0]
                text, conf = candidate.string(), candidate.confidence()
                bbox = obs.boundingBox()
                # Vision boxes are normalized with a bottom-left origin;
                # convert to top-left pixel coordinates.
                x = bbox.origin.x * iw
                y_bl = bbox.origin.y * ih
                w = bbox.size.width * iw
                h = bbox.size.height * ih
                y = ih - y_bl - h
                quad = [[int(x),int(y)],[int(x+w),int(y)],
                        [int(x+w),int(y+h)],[int(x),int(y+h)]]
                results.append((quad, text, conf))
        req = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(completion_handler)
        req.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate)
        req.setUsesLanguageCorrection_(True)
        req.setRecognitionLanguages_(self.langs)
        req.setAutomaticallyDetectsLanguage_(True)
        handler.performRequests_error_([req], None)
        return results
# ============================================================
# COLUMN / ROW SPLITTING
# ============================================================
def split_bubble_if_multiple_columns(indices, ocr, bid=None,
                                     use_aggressive_thresholds=False):
    """Split a group into (left, right) halves at the widest x-center gap.

    Returns (left_indices, right_indices) or None when no qualifying gap exists.
    """
    if len(indices) < 2:
        return None
    bxs = [quad_bbox(ocr[i][0]) for i in indices]
    hs = [max(1, b[3] - b[1]) for b in bxs]
    med_h = float(np.median(hs)) if hs else 12.0
    def _cx(box):
        return (box[0] + box[2]) / 2.0
    centers = sorted(_cx(b) for b in bxs)
    if use_aggressive_thresholds:
        gap_thresh = max(med_h * 1.2, 18)
    else:
        gap_thresh = max(med_h * 1.5, 22)
    widest, cut = 0.0, None
    for k in range(len(centers) - 1):
        gap = centers[k + 1] - centers[k]
        if gap > gap_thresh and gap > widest:
            widest, cut = gap, k
    if cut is None:
        return None
    split_x = (centers[cut] + centers[cut + 1]) / 2.0
    left = [i for i in indices if _cx(quad_bbox(ocr[i][0])) < split_x]
    right = [i for i in indices if _cx(quad_bbox(ocr[i][0])) >= split_x]
    if not left or not right:
        return None
    return (left, right)
def split_bubble_if_multiple_rows(indices, ocr, bid=None):
    """Split a group into (top, bottom) halves at the widest y-center gap.

    Returns (top_indices, bottom_indices) or None when no qualifying gap exists.
    """
    if len(indices) < 2:
        return None
    bxs = [quad_bbox(ocr[i][0]) for i in indices]
    hs = [max(1, b[3] - b[1]) for b in bxs]
    med_h = float(np.median(hs)) if hs else 12.0
    def _cy(box):
        return (box[1] + box[3]) / 2.0
    centers = sorted(_cy(b) for b in bxs)
    gap_thresh = max(med_h * 2.0, 30)
    widest, cut = 0.0, None
    for k in range(len(centers) - 1):
        gap = centers[k + 1] - centers[k]
        if gap > gap_thresh and gap > widest:
            widest, cut = gap, k
    if cut is None:
        return None
    split_y = (centers[cut] + centers[cut + 1]) / 2.0
    top = [i for i in indices if _cy(quad_bbox(ocr[i][0])) < split_y]
    bottom = [i for i in indices if _cy(quad_bbox(ocr[i][0])) >= split_y]
    if not top or not bottom:
        return None
    return (top, bottom)
def split_cluster_by_big_vertical_gap(indices, ocr, factor=1.9, min_gap=22):
    """Split indices into (top, bottom) at the largest inter-line vertical gap.

    A gap qualifies when the next quad's top edge is more than
    max(med_h * factor, min_gap) below the previous quad's bottom edge.
    Returns (top_indices, bottom_indices) or None.
    """
    if len(indices) < 2:
        return None
    pairs = [(i, quad_bbox(ocr[i][0])) for i in indices]
    med_h = float(np.median([max(1, b[3] - b[1]) for _, b in pairs]))
    # Order top-to-bottom by y-center.
    pairs.sort(key=lambda p: (p[1][1] + p[1][3]) / 2.0)
    thresh = max(med_h * factor, min_gap)
    widest, cut = 0.0, None
    for k in range(len(pairs) - 1):
        gap = pairs[k + 1][1][1] - pairs[k][1][3]
        if gap > thresh and gap > widest:
            widest, cut = gap, k
    if cut is None:
        return None
    top = [i for i, _ in pairs[:cut + 1]]
    bottom = [i for i, _ in pairs[cut + 1:]]
    if not top or not bottom:
        return None
    return (top, bottom)
def is_vertical_text_like(indices, ocr):
    """Heuristic: True when a group of quads looks like a vertical text run."""
    if len(indices) < 2:
        return False
    bxs = [quad_bbox(ocr[i][0]) for i in indices]
    med_h = float(np.median([max(1, b[3] - b[1]) for b in bxs]))
    med_w = float(np.median([max(1, b[2] - b[0]) for b in bxs]))
    # Individual quads must be noticeably taller than wide.
    if med_h < med_w * 1.2:
        return False
    xs = [(b[0] + b[2]) / 2.0 for b in bxs]
    ys = [(b[1] + b[3]) / 2.0 for b in bxs]
    # The whole group must extend clearly further vertically than horizontally.
    return (max(ys) - min(ys)) >= (max(xs) - min(xs)) * 1.5
def split_nested_or_side_by_side(indices, ocr):
    """Split a group into left/right halves around the median x-center.

    Returns (left_indices, right_indices) or None when one side would be empty.
    """
    if len(indices) < 2:
        return None
    def _cx(i):
        b = quad_bbox(ocr[i][0])
        return (b[0] + b[2]) / 2.0
    centers = sorted(_cx(i) for i in indices)
    mid = len(centers) // 2
    # Cut halfway between the two middle centers.
    split_x = (centers[mid - 1] + centers[mid]) / 2.0
    left = [i for i in indices if _cx(i) < split_x]
    right = [i for i in indices if _cx(i) >= split_x]
    if not left or not right:
        return None
    return (left, right)
def split_panel_box(image_bgr, box_xyxy, bubble_quads=None):
    """Find a vertical panel divider near the middle of a wide box.

    Looks for the strongest edge columns in the central 35–65% band of the
    box.  Returns (x1, x2, split_x) or None when no divider is found (or
    when the split would leave every bubble quad on one side).
    """
    ih, iw = image_bgr.shape[:2]
    x1, y1, x2, y2 = box_xyxy
    # Clamp the box to the image.
    x1, y1 = max(0, x1), max(0, y1)
    x2, y2 = min(iw - 1, x2), min(ih - 1, y2)
    if x2 <= x1 or y2 <= y1:
        return None
    crop = image_bgr[y1:y2, x1:x2]
    if crop.size == 0:
        return None
    edges = cv2.Canny(cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY), 50, 150)
    col_energy = np.sum(edges, axis=0)
    w = x2 - x1
    if w < 100:
        return None
    lo, hi = int(w * 0.35), int(w * 0.65)
    if hi <= lo:
        return None
    window = col_energy[lo:hi]
    if len(window) == 0:
        return None
    # Columns in the top 15% of edge energy are divider candidates.
    cutoff = np.percentile(window, 85)
    strong_cols = [x1 + lo + k for k in range(len(window)) if window[k] >= cutoff]
    if not strong_cols:
        return None
    split_x = int(np.median(strong_cols))
    if bubble_quads:
        n_left = sum(1 for q in bubble_quads if quad_center(q)[0] < split_x)
        if n_left == 0 or n_left == len(bubble_quads):
            return None
    return (x1, x2, split_x)
# ============================================================
# MERGE CLOSE BUBBLES
# ============================================================
def merge_close_bubbles_by_line_height(bubbles, bubble_boxes, bubble_quads,
                                       bubble_indices, ocr):
    """
    Merges boxes that are spatially very close on BOTH axes AND share
    meaningful horizontal overlap (same column).
    Single-quad boxes participate fully — no special isolation treatment.
    The h_overlap_ratio >= 0.25 guard prevents merging horizontally
    adjacent distinct bubbles.

    Returns re-numbered (bubbles, bubble_boxes, bubble_quads, bubble_indices).
    """
    if not bubbles:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices
    all_h = [max(1, quad_bbox(ocr[i][0])[3]-quad_bbox(ocr[i][0])[1])
             for i in range(len(ocr))]
    med_h = float(np.median(all_h)) if all_h else 14.0
    # Proximity tolerance scales with the line height (floor of 8px).
    merge_tol = max(8, med_h * 1.4)
    bids = sorted(bubble_boxes.keys())
    merged_set, merge_map = set(), {}
    for i, bid_i in enumerate(bids):
        # A box already absorbed into an earlier group cannot seed a new one.
        if bid_i in merged_set: continue
        x1_i, y1_i, x2_i, y2_i = bubble_boxes[bid_i]
        wi = max(1, x2_i - x1_i)
        for j in range(i + 1, len(bids)):
            bid_j = bids[j]
            if bid_j in merged_set: continue
            x1_j, y1_j, x2_j, y2_j = bubble_boxes[bid_j]
            wj = max(1, x2_j - x1_j)
            # Edge-to-edge gaps on each axis (0 when the boxes overlap).
            gap_x = max(0, max(x1_i, x1_j) - min(x2_i, x2_j))
            gap_y = max(0, max(y1_i, y1_j) - min(y2_i, y2_j))
            h_ix1 = max(x1_i, x1_j)
            h_ix2 = min(x2_i, x2_j)
            h_overlap = max(0, h_ix2 - h_ix1)
            # Overlap relative to the narrower box.
            h_overlap_ratio = h_overlap / max(1, min(wi, wj))
            if gap_x <= merge_tol and gap_y <= merge_tol and h_overlap_ratio >= 0.25:
                if bid_i not in merge_map:
                    merge_map[bid_i] = [bid_i]
                merge_map[bid_i].append(bid_j)
                merged_set.add(bid_j)
    if not merge_map:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices
    # Rebuild all four dicts with fresh sequential ids.
    new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {}
    next_bid = 1
    for bid in bids:
        if bid in merged_set: continue
        if bid in merge_map:
            group = merge_map[bid]
            all_indices = sorted(set(idx for b in group for idx in bubble_indices[b]))
            new_bubbles[next_bid] = build_lines_from_indices(all_indices, ocr)
            new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in all_indices])
            new_quads[next_bid] = [ocr[i][0] for i in all_indices]
            new_indices[next_bid] = all_indices
        else:
            new_bubbles[next_bid] = bubbles[bid]
            new_boxes[next_bid] = bubble_boxes[bid]
            new_quads[next_bid] = bubble_quads[bid]
            new_indices[next_bid] = bubble_indices[bid]
        next_bid += 1
    return new_bubbles, new_boxes, new_quads, new_indices
# ============================================================
# WIDE / BRIDGE QUAD SPLITTING
# ============================================================
def split_wide_ocr_items(image_bgr, ocr_list, width_factor=8.0):
    """Split OCR quads wider than width_factor × the median line height.

    For each over-wide quad, scans the binarized vertical ink projection
    for the widest whitespace run and cuts both geometry and text there.
    Returns (new_ocr_list, number_of_items_that_were_split).
    """
    if not ocr_list: return ocr_list, 0
    hs = [max(1, quad_bbox(q)[3]-quad_bbox(q)[1]) for q, _, _ in ocr_list]
    med_h = float(np.median(hs)) if hs else 14.0
    result, splits_made = [], 0
    for quad, text, conf in ocr_list:
        x1, y1, x2, y2 = quad_bbox(quad)
        w = x2 - x1
        if w > med_h * width_factor:
            pad = 2
            roi = image_bgr[max(0,y1-pad):min(image_bgr.shape[0],y2+pad),
                            max(0,x1):min(image_bgr.shape[1],x2)]
            if roi.size > 0:
                gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
                _, binary = cv2.threshold(gray, 0, 255,
                                          cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
                # Column-wise ink mass; low columns mark whitespace.
                v_proj = np.sum(binary, axis=0)
                gap_threshold = roi.shape[0] * 255 * 0.15
                gaps, in_gap, gap_start = [], False, 0
                for x in range(len(v_proj)):
                    if v_proj[x] < gap_threshold:
                        if not in_gap: gap_start, in_gap = x, True
                    else:
                        if in_gap:
                            gw = x - gap_start
                            if gw >= max(int(med_h * 0.6), 12):
                                gaps.append((gap_start + gw // 2, gw))  # (center, width)
                            in_gap = False
                if gaps:
                    # Cut at the widest gap's center.
                    gaps.sort(key=lambda g: g[1], reverse=True)
                    split_x_abs = max(0, x1) + gaps[0][0]
                    if ' ' in text:
                        # Map the pixel cut to the nearest space in the text.
                        char_w = w / max(1, len(text))
                        split_idx = int((split_x_abs - x1) / max(1e-6, char_w))
                        spaces = [i for i, c in enumerate(text) if c == ' ']
                        if spaces:
                            split_idx = min(spaces, key=lambda i: abs(i - split_idx))
                        tl, tr = text[:split_idx].strip(), text[split_idx:].strip()
                    else:
                        # No spaces: split proportionally to the pixel position.
                        split_idx = int(len(text) * (split_x_abs - x1) / w)
                        tl, tr = text[:split_idx].strip(), text[split_idx:].strip()
                    if tl and tr:
                        result.extend([
                            ([[x1,y1],[split_x_abs,y1],[split_x_abs,y2],[x1,y2]], tl, conf),
                            ([[split_x_abs,y1],[x2,y1],[x2,y2],[split_x_abs,y2]], tr, conf)])
                        splits_made += 1
                        continue
        result.append((quad, text, conf))
    return result, splits_made
def split_abnormal_bridge_quads(image_bgr, ocr_list, aspect_ratio_threshold=6.0):
    """Split quads with an extreme width/height aspect ratio ("bridges").

    Same projection-scan technique as split_wide_ocr_items, but triggered by
    aspect ratio (> aspect_ratio_threshold) rather than absolute width, with
    a slightly stricter gap requirement.
    Returns (new_ocr_list, number_of_items_that_were_split).
    """
    if not ocr_list: return ocr_list, 0
    hs = [max(1, quad_bbox(q)[3]-quad_bbox(q)[1]) for q, _, _ in ocr_list]
    med_h = float(np.median(hs)) if hs else 14.0
    result, splits_made = [], 0
    for quad, text, conf in ocr_list:
        x1, y1, x2, y2 = quad_bbox(quad)
        w, h = x2 - x1, max(1, y2 - y1)
        if w / h > aspect_ratio_threshold:
            pad = 2
            roi = image_bgr[max(0,y1-pad):min(image_bgr.shape[0],y2+pad),
                            max(0,x1):min(image_bgr.shape[1],x2)]
            if roi.size > 0:
                gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
                _, binary = cv2.threshold(gray, 0, 255,
                                          cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
                # Column-wise ink mass; low columns mark whitespace.
                v_proj = np.sum(binary, axis=0)
                gap_threshold = h * 255 * 0.20
                gaps, in_gap, gap_start = [], False, 0
                for x in range(len(v_proj)):
                    if v_proj[x] < gap_threshold:
                        if not in_gap: gap_start, in_gap = x, True
                    else:
                        if in_gap:
                            gw = x - gap_start
                            if gw >= max(int(med_h * 0.8), 15):
                                gaps.append((gap_start + gw // 2, gw))  # (center, width)
                            in_gap = False
                if gaps:
                    # Cut at the widest gap's center.
                    gaps.sort(key=lambda g: g[1], reverse=True)
                    split_x_abs = max(0, x1) + gaps[0][0]
                    if ' ' in text:
                        # Map the pixel cut to the nearest space in the text.
                        char_w = w / max(1, len(text))
                        split_idx = int((split_x_abs - x1) / max(1e-6, char_w))
                        spaces = [i for i, c in enumerate(text) if c == ' ']
                        if spaces:
                            split_idx = min(spaces, key=lambda i: abs(i - split_idx))
                        tl, tr = text[:split_idx].strip(), text[split_idx:].strip()
                    else:
                        # No spaces: split proportionally to the pixel position.
                        split_idx = int(len(text) * (split_x_abs - x1) / w)
                        tl, tr = text[:split_idx].strip(), text[split_idx:].strip()
                    if tl and tr:
                        result.extend([
                            ([[x1,y1],[split_x_abs,y1],[split_x_abs,y2],[x1,y2]], tl, conf),
                            ([[split_x_abs,y1],[x2,y1],[x2,y2],[split_x_abs,y2]], tr, conf)])
                        splits_made += 1
                        continue
        result.append((quad, text, conf))
    return result, splits_made
def normalize_ocr_quads(ocr_list):
    """Replace every OCR quad with an axis-aligned rectangle padded by 3 px."""
    PAD = 3
    normalized = []
    for quad, text, conf in ocr_list:
        left, top, right, bottom = quad_bbox(quad)
        rect = [
            [left - PAD, top - PAD],
            [right + PAD, top - PAD],
            [right + PAD, bottom + PAD],
            [left - PAD, bottom + PAD],
        ]
        normalized.append((rect, text, conf))
    return normalized
# ============================================================
# VISION RE-READ
# ============================================================
def preprocess_variant(crop_bgr, mode):
    """
    Produce one grayscale preprocessing variant of *crop_bgr* for OCR retries.

    Known modes: raw, clahe, adaptive, otsu, invert, bilateral, morph_open.
    Any unknown mode falls back to the plain grayscale image.
    """
    gray = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2GRAY)
    if mode == "clahe":
        return cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)).apply(gray)
    if mode == "adaptive":
        smoothed = cv2.GaussianBlur(gray, (3, 3), 0)
        return cv2.adaptiveThreshold(smoothed, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                     cv2.THRESH_BINARY, 35, 11)
    if mode == "otsu":
        smoothed = cv2.GaussianBlur(gray, (3, 3), 0)
        return cv2.threshold(smoothed, 0, 255,
                             cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
    if mode == "invert":
        return 255 - gray
    if mode == "bilateral":
        smoothed = cv2.bilateralFilter(gray, 7, 60, 60)
        return cv2.threshold(smoothed, 0, 255,
                             cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
    if mode == "morph_open":
        binarized = cv2.threshold(gray, 0, 255,
                                  cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
        return cv2.morphologyEx(binarized, cv2.MORPH_OPEN,
                                np.ones((2, 2), np.uint8))
    # "raw" and unrecognized modes: plain grayscale.
    return gray
def rotate_image_keep_bounds(img, angle_deg):
    """Rotate *img* by *angle_deg*, expanding the canvas so nothing is clipped."""
    height, width = img.shape[:2]
    center = (width / 2, height / 2)
    matrix = cv2.getRotationMatrix2D(center, angle_deg, 1.0)
    cos_a = abs(matrix[0, 0])
    sin_a = abs(matrix[0, 1])
    out_w = int(height * sin_a + width * cos_a)
    out_h = int(height * cos_a + width * sin_a)
    # Shift the transform so the rotated content stays centred in the new canvas.
    matrix[0, 2] += (out_w / 2) - center[0]
    matrix[1, 2] += (out_h / 2) - center[1]
    return cv2.warpAffine(img, matrix, (out_w, out_h),
                          flags=cv2.INTER_CUBIC, borderValue=255)
def rebuild_text_from_vision_result(res):
    """
    Re-assemble a single string from raw Vision OCR tuples (bbox, text, conf):
    cluster items into rows by y-centre, read each row left-to-right, then
    join rows top-to-bottom. Returns "" when nothing usable remains.
    """
    if not res: return ""
    norm = []
    for bbox, txt, conf in res:
        if not txt or not txt.strip(): continue
        b = quad_bbox(bbox)
        # (bbox, text, conf, x-centre, y-centre, height)
        norm.append((b, txt, conf,
                     (b[0]+b[2])/2.0, (b[1]+b[3])/2.0, max(1.0, b[3]-b[1])))
    if not norm: return ""
    # Row tolerance scales with the median glyph height.
    med_h = float(np.median([x[5] for x in norm]))
    row_tol = max(6.0, med_h * 0.75)
    norm.sort(key=lambda z: z[4])
    rows = []
    for it in norm:
        placed = False
        for r in rows:
            if abs(it[4] - r["yc"]) <= row_tol:
                r["m"].append(it)
                # Row centre tracks the running mean of member y-centres.
                r["yc"] = float(np.mean([k[4] for k in r["m"]]))
                placed = True; break
        if not placed: rows.append({"yc": it[4], "m": [it]})
    rows.sort(key=lambda r: r["yc"])
    lines = [normalize_text(" ".join(x[1] for x in sorted(r["m"], key=lambda z: z[3])))
             for r in rows]
    return normalize_text(" ".join(filter(None, lines)))
def reread_bubble_with_vision(image_bgr, bbox_xyxy, vision_detector,
                              upscale=3.0, pad=24):
    """
    Re-OCR one bubble crop with the Vision detector across several
    preprocessing variants and small rotations, keeping the best-scoring text.

    Returns (text, score, "vision-reread") or (None, 0.0, "none").
    """
    ih, iw = image_bgr.shape[:2]
    x1, y1, x2, y2 = bbox_xyxy
    # Expand the crop by *pad* pixels, clamped to the image bounds.
    x1, y1 = max(0, int(x1-pad)), max(0, int(y1-pad))
    x2, y2 = min(iw, int(x2+pad)), min(ih, int(y2+pad))
    crop = image_bgr[y1:y2, x1:x2]
    if crop.size == 0: return None, 0.0, "none"
    modes = ["raw", "clahe", "adaptive", "otsu", "invert", "bilateral", "morph_open"]
    angles = [0.0, 1.5, -1.5]
    best_v_txt, best_v_sc = "", 0.0
    # Upscale once; all preprocessing variants run on the upscaled crop.
    up0 = cv2.resize(crop,
                     (int(crop.shape[1]*upscale), int(crop.shape[0]*upscale)),
                     interpolation=cv2.INTER_CUBIC)
    for mode in modes:
        proc = preprocess_variant(up0, mode)
        # The detector is fed a 3-channel image.
        proc3 = cv2.cvtColor(proc, cv2.COLOR_GRAY2BGR) if len(proc.shape) == 2 else proc
        for a in angles:
            rot = rotate_image_keep_bounds(proc3, a)
            # Prefer the native run_vision_ocr API, fall back to .read().
            res = (vision_detector.run_vision_ocr(rot)
                   if hasattr(vision_detector, 'run_vision_ocr')
                   else vision_detector.read(rot))
            txt = rebuild_text_from_vision_result(res)
            sc = ocr_candidate_score(txt)
            if sc > best_v_sc:
                best_v_txt, best_v_sc = txt, sc
    if best_v_txt: return best_v_txt, best_v_sc, "vision-reread"
    return None, 0.0, "none"
# ============================================================
# LINES + BUBBLES
# ============================================================
def build_lines_from_indices(indices, ocr):
    """
    Build normalized text lines for a set of OCR indices: cluster tokens into
    rows by y-centre (tolerance ~0.75 of the median glyph height), then read
    each row left-to-right. Returns a list of line strings, top to bottom.
    """
    if not indices: return []
    items = []
    for i in indices:
        b = quad_bbox(ocr[i][0])
        # (index, bbox, x-centre, y-centre, height)
        items.append((i, b, (b[0]+b[2])/2.0, (b[1]+b[3])/2.0, max(1.0, b[3]-b[1])))
    med_h = float(np.median([it[4] for it in items])) if items else 10.0
    row_tol = max(6.0, med_h * 0.75)
    items.sort(key=lambda x: x[3])
    rows = []
    for it in items:
        placed = False
        for r in rows:
            if abs(it[3] - r["yc"]) <= row_tol:
                r["m"].append(it)
                # Row centre tracks the running mean of member y-centres.
                r["yc"] = float(np.mean([k[3] for k in r["m"]]))
                placed = True; break
        if not placed: rows.append({"yc": it[3], "m": [it]})
    rows.sort(key=lambda r: r["yc"])
    return [normalize_text(
                " ".join(ocr[i][1]
                         for i, _, _, _, _ in sorted(r["m"], key=lambda z: z[2])))
            for r in rows if r["m"]]
def split_indices_into_vertical_blocks(indices, ocr, gap_factor=1.6, min_gap=18):
    """
    Partition *indices* into top-to-bottom macro blocks, starting a new block
    wherever the vertical gap between consecutive boxes exceeds
    max(min_gap, median_height * gap_factor).
    """
    if len(indices) < 2:
        return [indices]
    entries = []
    for idx in indices:
        box = quad_bbox(ocr[idx][0])
        y_center = (box[1] + box[3]) / 2.0
        height = max(1, box[3] - box[1])
        entries.append((idx, box, y_center, height))
    entries.sort(key=lambda e: e[2])
    median_h = float(np.median([e[3] for e in entries])) if entries else 12.0
    gap_limit = max(min_gap, median_h * gap_factor)
    blocks = [[entries[0][0]]]
    prev_box = entries[0][1]
    for idx, box, _, _ in entries[1:]:
        # Gap between this box's top and the previous box's bottom.
        if box[1] - prev_box[3] > gap_limit:
            blocks.append([idx])
        else:
            blocks[-1].append(idx)
        prev_box = box
    return blocks
def build_final_box_text(indices, ocr, reading_mode="ltr"):
    """
    Final text reconstruction for OCR/translation export.

    Thin wrapper around the layout-aware builder so the internal text layout
    of the box is respected, unlike the generic grouping helpers.
    """
    layout_text = build_text_from_layout(indices, ocr, reading_mode=reading_mode)
    return layout_text
def auto_gap(image_path, base=18, ref_w=750):
    """Scale the *base* gap (px) by image width relative to *ref_w*; fall back to base."""
    img = cv2.imread(image_path)
    if img is None:
        return base
    return base * (img.shape[1] / ref_w)
def group_tokens_vertical(ocr, image_shape, gap_px=18, bbox_padding=1,
                          strict_mode=False):
    """
    Cluster OCR tokens into bubble-level groups by vertical proximity.

    Pipeline: greedy downward chaining -> secondary group merge ->
    horizontal-gap split -> union-box padding/clamping.

    Returns four dicts keyed by 1-based bubble id:
    (text lines, padded bbox xyxy, member quads, member OCR indices).

    NOTE(review): *gap_px* and *bbox_padding* are accepted but unused here;
    all thresholds derive from the median token height instead.
    """
    n = len(ocr)
    if n == 0: return {}, {}, {}, {}
    boxes = [quad_bbox(r[0]) for r in ocr]
    centers = [quad_center(r[0]) for r in ocr]
    hs = [max(1.0, b[3]-b[1]) for b in boxes]
    med_h = float(np.median(hs)) if hs else 12.0
    # Distance gates scale with the median glyph height.
    max_vertical_gap = med_h * 2.5 if not strict_mode else med_h * 2.0
    max_horizontal_offset = med_h * 1.8
    sorted_indices = sorted(range(n), key=lambda i: (centers[i][1], centers[i][0]))
    groups, used = [], set()
    for i in sorted_indices:
        if i in used: continue
        current_group = [i]
        used.add(i)
        cx_i = centers[i][0]
        for j in sorted_indices:
            if j in used or j == i: continue
            cx_j, cy_j = centers[j]
            # Only chain strictly downward tokens onto the seed.
            if cy_j <= centers[i][1]: continue
            if abs(cx_i - cx_j) > max_horizontal_offset: continue
            # Horizontal gap guard
            gap_x = max(0, max(boxes[i][0], boxes[j][0]) - min(boxes[i][2], boxes[j][2]))
            if gap_x > med_h * 1.5: continue
            # Orientation compatibility guard
            if not orientation_compatible(i, j, ocr): continue
            # Gap measured from the last token already chained in.
            vertical_gap = boxes[j][1] - boxes[current_group[-1]][3]
            if vertical_gap <= max_vertical_gap:
                current_group.append(j)
                used.add(j)
                # Drift the reference x-centre toward the new member.
                cx_i = (cx_i + cx_j) / 2.0
        if current_group:
            groups.append(current_group)
    # Secondary merge pass
    merged_groups, used_groups = [], set()
    for i, group1 in enumerate(groups):
        if i in used_groups: continue
        merged = list(group1)
        used_groups.add(i)
        for j, group2 in enumerate(groups):
            if i == j or j in used_groups: continue
            if should_merge_groups(merged, group2, ocr, med_h, max_vertical_gap):
                # Merge only if every cross pair is orientation compatible.
                compat = all(orientation_compatible(a, b, ocr)
                             for a in merged for b in group2)
                if compat:
                    merged.extend(group2)
                    used_groups.add(j)
        merged_groups.append(sorted(merged, key=lambda idx: centers[idx][1]))
    # Horizontal gap split pass
    final_groups = []
    for group in merged_groups:
        h_split = detect_horizontal_gap_in_group(group, ocr, med_h, gap_factor=2.5)
        if h_split:
            lg, rg = h_split
            final_groups.append(sorted(lg, key=lambda idx: centers[idx][1]))
            final_groups.append(sorted(rg, key=lambda idx: centers[idx][1]))
        else:
            final_groups.append(group)
    # Stable top-to-bottom, left-to-right ordering of the final groups.
    final_groups.sort(key=lambda g: (min(centers[i][1] for i in g),
                                     min(centers[i][0] for i in g)))
    bubbles, bubble_boxes, bubble_quads, bubble_indices = {}, {}, {}, {}
    ih, iw = image_shape[:2]
    for bid, idxs in enumerate(final_groups, start=1):
        lines = build_lines_from_indices(idxs, ocr)
        quads = [ocr[k][0] for k in idxs]
        ub = boxes_union_xyxy([quad_bbox(q) for q in quads])
        if ub is None: continue
        x1, y1, x2, y2 = ub
        # Pad the union box by ~16% of a glyph height, clamped to the image.
        ap = max(1, int(round(med_h * 0.16)))
        bubbles[bid] = lines
        bubble_boxes[bid] = (max(0,x1-ap), max(0,y1-ap),
                             min(iw-1,x2+ap), min(ih-1,y2+ap))
        bubble_quads[bid] = quads
        bubble_indices[bid] = idxs
    return bubbles, bubble_boxes, bubble_quads, bubble_indices
# ============================================================
# SPLIT HELPER — centralises all split strategies
# ============================================================
def _split_bubble_if_needed(bid, bubble_indices, bubble_quads, bubble_boxes,
                            filtered, image, iw, ih):
    """
    Attempts all split strategies in priority order.
    Returns ((part1_indices, part2_indices), reason_str) or (None, None).
    BOX#18 fix: split_cluster_by_big_vertical_gap factor lowered to 1.4
    so the gap between the top speech bubble and the bottom cluster triggers.

    NOTE(review): *iw* and *ih* are accepted but unused in this body.
    """
    indices = bubble_indices[bid]
    box = bubble_boxes[bid]
    # 1. Vertical-stack gap (sensitive — catches top-vs-bottom cluster)
    if is_vertical_text_like(indices, filtered):
        vgap = split_cluster_by_big_vertical_gap(indices, filtered,
                                                 factor=1.4, min_gap=18)
        if vgap:
            return vgap, "vertical-stack y-gap"
    # 2. Panel border
    sr = split_panel_box(image, box, bubble_quads=bubble_quads[bid])
    if sr:
        _, _, split_x = sr
        # Partition members by which side of the detected border they sit on.
        li = [idx for idx in indices if quad_center(filtered[idx][0])[0] < split_x]
        ri = [idx for idx in indices if quad_center(filtered[idx][0])[0] >= split_x]
        if li and ri:
            return (li, ri), "panel border"
    elif len(bubble_quads[bid]) >= 4:
        # No border found, but enough members to try the aggressive column split.
        cs = split_bubble_if_multiple_columns(indices, filtered, bid=bid,
                                              use_aggressive_thresholds=True)
        if cs:
            return cs, "aggressive column"
    # 3. Column gap
    cs = split_bubble_if_multiple_columns(indices, filtered, bid=bid)
    if cs:
        return cs, "vertical column"
    # 4. Nested / side-by-side
    ns = split_nested_or_side_by_side(indices, filtered)
    if ns:
        return ns, "nested/side-by-side"
    # 5. Row split
    rs = split_bubble_if_multiple_rows(indices, filtered, bid=bid)
    if rs:
        return rs, "horizontal row"
    # 6. Large vertical gap (general, less sensitive)
    gy = split_cluster_by_big_vertical_gap(indices, filtered, factor=1.9, min_gap=22)
    if gy:
        return gy, "large vertical-gap"
    return None, None
# ============================================================
# DEBUG / EXPORT
# ============================================================
def save_debug_clusters(image_path, ocr, bubble_boxes, bubble_indices,
                        clean_lines=None, out_path="debug_clusters.png",
                        region_types=None):
    """
    Draw debug overlays for final grouped boxes.
    Color scheme by region type:
      - dialogue  : green
      - narration : orange
      - sfx       : magenta
      - reaction  : cyan
      - unknown   : yellow-ish
    OCR quads are outlined lightly in gray for context.

    Writes the annotated image to *out_path*; returns None (also when the
    source image cannot be read).
    NOTE(review): *bubble_indices* is accepted but unused in this body.
    """
    img = cv2.imread(image_path)
    if img is None:
        return
    # Draw OCR quads lightly without filling the page white
    for bbox, txt, conf in ocr:
        pts = np.array(bbox, dtype=np.int32)
        cv2.polylines(img, [pts], True, (180, 180, 180), 1)
    for bid, bb in bubble_boxes.items():
        x1, y1, x2, y2 = bb
        rtype = region_types.get(bid, "unknown") if region_types else "unknown"
        # Colors are BGR (OpenCV convention).
        if rtype == "dialogue":
            color = (0, 220, 0)
        elif rtype == "narration":
            color = (0, 180, 255)
        elif rtype == "sfx":
            color = (255, 0, 255)
        elif rtype == "reaction":
            color = (0, 200, 255)
        else:
            color = (0, 220, 220)
        thickness = 2
        cv2.rectangle(img, (x1, y1), (x2, y2), color, thickness)
        cv2.putText(
            img,
            f"BOX#{bid} [{rtype}]",
            (x1 + 2, max(15, y1 + 16)),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.45,
            color,
            2
        )
        if clean_lines and bid in clean_lines:
            text = clean_lines[bid]
            words = text.split()
            # Greedy word wrap at ~26 characters per caption line.
            wrapped_lines = []
            cur = ""
            for w in words:
                if len(cur) + len(w) + 1 < 26:
                    cur += w + " "
                else:
                    wrapped_lines.append(cur.strip())
                    cur = w + " "
            if cur:
                wrapped_lines.append(cur.strip())
            # Caption is rendered below the box, 18 px per line.
            y_text = y2 + 18
            for line in wrapped_lines:
                # black outline
                cv2.putText(
                    img, line, (x1, y_text),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 3
                )
                # blue text
                cv2.putText(
                    img, line, (x1, y_text),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1
                )
                y_text += 18
    cv2.imwrite(out_path, img)
def estimate_reading_order(bbox_dict, mode="ltr"):
    """
    Assign 1-based reading ranks to boxes: cluster into horizontal rows by
    y-centre (fixed 90 px tolerance), read rows top-to-bottom and each row
    left-to-right ("ltr") or right-to-left ("rtl").
    """
    ROW_TOL = 90
    entries = [(bid, (bb[0] + bb[2]) / 2.0, (bb[1] + bb[3]) / 2.0)
               for bid, bb in bbox_dict.items()]
    entries.sort(key=lambda e: e[2])
    rows = []
    for entry in entries:
        target = None
        for row in rows:
            if abs(entry[2] - row["cy"]) <= ROW_TOL:
                target = row
                break
        if target is None:
            rows.append({"cy": entry[2], "items": [entry]})
        else:
            target["items"].append(entry)
            # Keep the row centre as the running mean of its members.
            target["cy"] = float(np.mean([e[2] for e in target["items"]]))
    rows.sort(key=lambda row: row["cy"])
    ordered = []
    for row in rows:
        ordered.extend(sorted(row["items"], key=lambda e: e[1],
                              reverse=(mode == "rtl")))
    return {entry[0]: rank for rank, entry in enumerate(ordered, start=1)}
# ============================================================
# NAME / SHORT TOKEN RESCUE
# ============================================================
def _text_key_for_dedup(text: str) -> str:
    # Canonical dedup key: normalized text with everything but letters/digits removed.
    return re.sub(r'[^A-ZÀ-Ý0-9]', '', normalize_text(text or ""))
def rescue_name_and_short_tokens(ocr_list, min_conf=0.20):
    """
    Keep plausible short/name tokens that OCR found but strict filtering may drop.
    Returns rescued items as (quad, text, conf).

    Three rescue tiers, each boosting confidence to a floor:
      1. tokens whose letters match KNOWN_NAMES       -> conf >= 0.45
      2. protected interjections (is_protected_token) -> conf >= 0.40
      3. short all-caps words of 2-8 letters          -> conf >= 0.35
    """
    rescued = []
    for quad, text, conf in ocr_list:
        t = normalize_text(text or "")
        if not t:
            continue
        # Letters-only view of the token for name/shape matching.
        t_alpha = re.sub(r'[^A-ZÀ-Ý]', '', t)
        if t_alpha in KNOWN_NAMES and conf >= min_conf:
            rescued.append((quad, t, max(conf, 0.45)))
            continue
        if is_protected_token(t) and conf >= min_conf:
            rescued.append((quad, t, max(conf, 0.40)))
            continue
        # Generic short-word rescue needs a slightly higher confidence.
        if 2 <= len(t_alpha) <= 8 and conf >= 0.25:
            if re.fullmatch(r'[A-ZÀ-Ý]{2,8}', t_alpha):
                rescued.append((quad, t, max(conf, 0.35)))
    return rescued
def merge_rescued_items(base_ocr, rescued_ocr, iou_threshold=0.55):
    """
    Append rescued tokens to the OCR list unless an existing item carries the
    same dedup key and overlaps the rescue by at least *iou_threshold* IoU.
    """
    if not rescued_ocr:
        return base_ocr

    def _iou(a, b):
        # Intersection-over-union of two xyxy boxes.
        left, top = max(a[0], b[0]), max(a[1], b[1])
        right, bottom = min(a[2], b[2]), min(a[3], b[3])
        inter = max(0, right - left) * max(0, bottom - top)
        if inter == 0:
            return 0.0
        area_a = max(0, a[2] - a[0]) * max(0, a[3] - a[1])
        area_b = max(0, b[2] - b[0]) * max(0, b[3] - b[1])
        return inter / max(1, area_a + area_b - inter)

    merged = list(base_ocr)
    for quad, text, conf in rescued_ocr:
        box = quad_bbox(quad)
        key = _text_key_for_dedup(text)
        is_duplicate = any(
            key == _text_key_for_dedup(existing_text)
            and _iou(box, quad_bbox(existing_quad)) >= iou_threshold
            for existing_quad, existing_text, _ in merged
        )
        if not is_duplicate:
            merged.append((quad, text, conf))
    return merged
def _joined_text_for_indices(indices, ocr):
    """Join the normalized texts of in-range OCR indices; return (text, len(text))."""
    fragments = [
        normalize_text(ocr[idx][1])
        for idx in indices
        if 0 <= idx < len(ocr)
    ]
    joined = " ".join(frag for frag in fragments if frag).strip()
    return joined, len(joined)
def _in_same_bubble_contour(box_i, box_j, bubble_contours):
    """True when the centres of both boxes fall inside one detected bubble contour."""
    center_i = ((box_i[0] + box_i[2]) / 2.0, (box_i[1] + box_i[3]) / 2.0)
    center_j = ((box_j[0] + box_j[2]) / 2.0, (box_j[1] + box_j[3]) / 2.0)
    return any(
        cv2.pointPolygonTest(contour, center_i, False) >= 0
        and cv2.pointPolygonTest(contour, center_j, False) >= 0
        for contour in bubble_contours
    )
def merge_micro_boxes_relaxed(bubbles, bubble_boxes, bubble_quads, bubble_indices, ocr, image_bgr):
    """
    Relaxed merge for tiny interjection/name boxes (e.g. HUH? + MORNING).

    Pairs of boxes whose joined text is short (<=12 chars each) are unioned
    (union-find) when they are close and either overlap horizontally, share a
    detected bubble contour, or contain a protected token. Merged groups are
    renumbered 1..N; returns the four rebuilt dicts (originals untouched when
    nothing merges).
    """
    bids = sorted(bubble_boxes.keys())
    if len(bids) < 2:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices
    all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1]) for i in range(len(ocr))]
    med_h = float(np.median(all_h)) if all_h else 14.0
    bubble_contours = detect_speech_bubbles(image_bgr)
    # Union-find over bubble ids with path halving.
    parent = {b: b for b in bids}
    def find(x):
        while parent[x] != x:
            parent[x] = parent[parent[x]]
            x = parent[x]
        return x
    def union(a, b):
        ra, rb = find(a), find(b)
        if ra != rb:
            parent[rb] = ra
    SHORT_TEXT_MAX_CHARS = 12
    for i in range(len(bids)):
        for j in range(i + 1, len(bids)):
            bi, bj = bids[i], bids[j]
            box_i, box_j = bubble_boxes[bi], bubble_boxes[bj]
            wi = max(1, box_i[2] - box_i[0])
            wj = max(1, box_j[2] - box_j[0])
            # Edge-to-edge horizontal/vertical gaps (0 when overlapping).
            gap_x = max(0, max(box_i[0], box_j[0]) - min(box_i[2], box_j[2]))
            vert_gap = max(0, max(box_i[1], box_j[1]) - min(box_i[3], box_j[3]))
            h_ix1 = max(box_i[0], box_j[0])
            h_ix2 = min(box_i[2], box_j[2])
            h_overlap = max(0, h_ix2 - h_ix1)
            # Overlap relative to the narrower of the two boxes.
            h_overlap_ratio = h_overlap / max(1, min(wi, wj))
            txt_i, len_i = _joined_text_for_indices(bubble_indices[bi], ocr)
            txt_j, len_j = _joined_text_for_indices(bubble_indices[bj], ocr)
            micro_pair = (len_i <= SHORT_TEXT_MAX_CHARS and len_j <= SHORT_TEXT_MAX_CHARS)
            protected_hint = is_protected_token(txt_i) or is_protected_token(txt_j)
            same_contour = _in_same_bubble_contour(box_i, box_j, bubble_contours)
            # Merge only short-text pairs that are close in both axes AND show
            # at least one affinity signal (overlap / contour / protected token).
            if micro_pair and vert_gap <= med_h * 2.2 and gap_x <= med_h * 2.0:
                if h_overlap_ratio >= 0.10 or same_contour or protected_hint:
                    union(bi, bj)
    groups = {}
    for b in bids:
        r = find(b)
        groups.setdefault(r, []).append(b)
    # Nothing merged: return inputs unchanged.
    if all(len(v) == 1 for v in groups.values()):
        return bubbles, bubble_boxes, bubble_quads, bubble_indices
    new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {}
    next_bid = 1
    for _, group in groups.items():
        if len(group) == 1:
            b = group[0]
            new_bubbles[next_bid] = bubbles[b]
            new_boxes[next_bid] = bubble_boxes[b]
            new_quads[next_bid] = bubble_quads[b]
            new_indices[next_bid] = bubble_indices[b]
        else:
            # Rebuild lines/box/quads from the union of member indices.
            all_idx = sorted(set(idx for b in group for idx in bubble_indices[b]))
            new_bubbles[next_bid] = build_lines_from_indices(all_idx, ocr)
            new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in all_idx])
            new_quads[next_bid] = [ocr[i][0] for i in all_idx]
            new_indices[next_bid] = all_idx
        next_bid += 1
    return new_bubbles, new_boxes, new_quads, new_indices
def reattach_orphan_short_tokens(bubbles, bubble_boxes, bubble_quads, bubble_indices, ocr):
    """
    Reattach tiny orphan token boxes (e.g., single 'HUH?') to nearest plausible bubble.

    An orphan is a single-token box whose text is protected or has <=5 letters.
    Each orphan is merged into the closest other box within a distance gate
    scaled by the median glyph height.

    NOTE(review): when a reattachment happens this mutates the passed-in dicts
    in place before returning freshly renumbered copies; when no orphan is
    absorbed the original dicts are returned unchanged.
    """
    bids = sorted(bubble_boxes.keys())
    if len(bids) < 2:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices
    all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1]) for i in range(len(ocr))]
    med_h = float(np.median(all_h)) if all_h else 14.0
    orphan_bids = []
    for b in bids:
        idxs = bubble_indices.get(b, [])
        # Only single-token boxes can be orphans.
        if len(idxs) != 1:
            continue
        t = normalize_text(ocr[idxs[0]][1])
        if is_protected_token(t) or len(re.sub(r'[^A-ZÀ-Ý]', '', t)) <= 5:
            orphan_bids.append(b)
    if not orphan_bids:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices
    consumed = set()
    for ob in orphan_bids:
        if ob in consumed:
            continue
        obox = bubble_boxes[ob]
        ocx = (obox[0] + obox[2]) / 2.0
        ocy = (obox[1] + obox[3]) / 2.0
        best_b = None
        best_d = 1e9
        # Find the closest non-consumed target within the distance gate.
        for tb in bids:
            if tb == ob or tb in consumed:
                continue
            tbox = bubble_boxes[tb]
            tcx = (tbox[0] + tbox[2]) / 2.0
            tcy = (tbox[1] + tbox[3]) / 2.0
            dx = abs(ocx - tcx)
            dy = abs(ocy - tcy)
            if dx <= med_h * 2.2 and dy <= med_h * 3.0:
                # Manhattan distance as the tie-break metric.
                d = dx + dy
                if d < best_d:
                    best_d = d
                    best_b = tb
        if best_b is not None:
            # Absorb the orphan into the chosen target and rebuild its data.
            merged = sorted(set(bubble_indices[best_b] + bubble_indices[ob]))
            bubble_indices[best_b] = merged
            bubble_quads[best_b] = [ocr[i][0] for i in merged]
            bubble_boxes[best_b] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in merged])
            bubbles[best_b] = build_lines_from_indices(merged, ocr)
            consumed.add(ob)
    if consumed:
        for b in consumed:
            bubble_indices.pop(b, None)
            bubble_quads.pop(b, None)
            bubble_boxes.pop(b, None)
            bubbles.pop(b, None)
        # reindex for stable downstream order
        new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {}
        for new_id, old_id in enumerate(sorted(bubble_boxes.keys()), start=1):
            new_bubbles[new_id] = bubbles[old_id]
            new_boxes[new_id] = bubble_boxes[old_id]
            new_quads[new_id] = bubble_quads[old_id]
            new_indices[new_id] = bubble_indices[old_id]
        return new_bubbles, new_boxes, new_quads, new_indices
    return bubbles, bubble_boxes, bubble_quads, bubble_indices
def reconstruct_group_text(group_indices, ocr):
    """
    Reconstruct text inside one already-detected group.
    This handles cases where a vertical group itself contains
    multiple local rows or wrapped OCR fragments.

    Strategy: if the group's vertical span clearly dominates its horizontal
    span, read tokens straight top-to-bottom; otherwise cluster into local
    rows first and read each row left-to-right.
    """
    if not group_indices:
        return ""
    items = []
    for i in group_indices:
        b = quad_bbox(ocr[i][0])
        cx = (b[0] + b[2]) / 2.0
        cy = (b[1] + b[3]) / 2.0
        w = max(1, b[2] - b[0])
        h = max(1, b[3] - b[1])
        # (index, bbox, x-centre, y-centre, width, height)
        items.append((i, b, cx, cy, w, h))
    if not items:
        return ""
    med_h = float(np.median([it[5] for it in items]))
    med_w = float(np.median([it[4] for it in items]))
    # If the group is strongly vertical, simple top->bottom is fine
    xs = [it[2] for it in items]
    ys = [it[3] for it in items]
    vertical_span = max(ys) - min(ys) if len(ys) > 1 else 0
    horizontal_span = max(xs) - min(xs) if len(xs) > 1 else 0
    # strong single vertical phrase
    if vertical_span > horizontal_span * 1.5:
        items.sort(key=lambda x: x[3])  # top->bottom
        txt = normalize_text(" ".join(
            normalize_text(ocr[it[0]][1]) for it in items if normalize_text(ocr[it[0]][1])
        ))
        return txt
    # otherwise, split into local rows first
    row_tol = max(6.0, med_h * 0.65)
    items.sort(key=lambda x: x[3])
    rows = []
    for it in items:
        placed = False
        for row in rows:
            if abs(it[3] - row["yc"]) <= row_tol:
                row["members"].append(it)
                # Row centre tracks the running mean of member y-centres.
                row["yc"] = float(np.mean([m[3] for m in row["members"]]))
                placed = True
                break
        if not placed:
            rows.append({"yc": it[3], "members": [it]})
    rows.sort(key=lambda r: r["yc"])
    parts = []
    for row in rows:
        members = sorted(row["members"], key=lambda x: x[2])  # left->right
        row_txt = normalize_text(" ".join(
            normalize_text(ocr[m[0]][1]) for m in members if normalize_text(ocr[m[0]][1])
        ))
        if row_txt:
            parts.append(row_txt)
    txt = normalize_text(" ".join(parts))
    return txt
def reconstruct_group_text_best(group_indices, ocr):
    """
    Build two candidate readings of one group — plain top-to-bottom, and
    row-clustered left-to-right — then keep whichever scores higher under
    ocr_candidate_score. Both candidates run through fix_group_level_ocr.
    """
    if not group_indices:
        return ""
    items = []
    for i in group_indices:
        b = quad_bbox(ocr[i][0])
        cx = (b[0] + b[2]) / 2.0
        cy = (b[1] + b[3]) / 2.0
        h = max(1, b[3] - b[1])
        # (index, bbox, x-centre, y-centre, height)
        items.append((i, b, cx, cy, h))
    if not items:
        return ""
    # Candidate 1: simple top->bottom
    cand1_items = sorted(items, key=lambda x: x[3])
    cand1 = normalize_text(" ".join(
        normalize_text(ocr[it[0]][1]) for it in cand1_items if normalize_text(ocr[it[0]][1])
    ))
    cand1 = fix_group_level_ocr(cand1)
    # Candidate 2: local rows
    med_h = float(np.median([it[4] for it in items]))
    row_tol = max(6.0, med_h * 0.65)
    rows = []
    for it in sorted(items, key=lambda x: x[3]):
        placed = False
        for row in rows:
            if abs(it[3] - row["yc"]) <= row_tol:
                row["members"].append(it)
                # Row centre tracks the running mean of member y-centres.
                row["yc"] = float(np.mean([m[3] for m in row["members"]]))
                placed = True
                break
        if not placed:
            rows.append({"yc": it[3], "members": [it]})
    rows.sort(key=lambda r: r["yc"])
    cand2_parts = []
    for row in rows:
        members = sorted(row["members"], key=lambda x: x[2])
        row_txt = normalize_text(" ".join(
            normalize_text(ocr[m[0]][1]) for m in members if normalize_text(ocr[m[0]][1])
        ))
        if row_txt:
            cand2_parts.append(row_txt)
    cand2 = normalize_text(" ".join(cand2_parts))
    cand2 = fix_group_level_ocr(cand2)
    # choose best
    s1 = ocr_candidate_score(cand1)
    s2 = ocr_candidate_score(cand2)
    return cand2 if s2 > s1 else cand1
def fix_group_level_ocr(text):
    """
    Apply targeted OCR phrase corrections to *text*, then strip line-wrap
    hyphen artifacts and collapse repeated whitespace.

    Fixes:
    - Replacements are now applied longest-key-first. Previously they ran in
      dict insertion order, so short keys ("BREAK- FAST",
      "COMMON BREAK- PEOPLE FAST") rewrote the text before longer keys that
      contain them ("WHAT DO LIKE FOR COMMON BREAK- PEOPLE FAST EAT",
      "LIKE FOR BREAK- FAST") could ever match, leaving those entries dead.
    - Removed the no-op identity entry "YOUR HAND!" -> "YOUR HAND!".
    """
    t = normalize_text(text or "")
    if not t:
        return t
    # Hard-coded phrase-level fixes for recurring OCR confusions.
    replacements = {
        "ANY- THING": "ANYTHING",
        "BREAK- FAST": "BREAK-FAST",
        "COMMON BREAK- PEOPLE FAST": "COMMON PEOPLE EAT FOR BREAKFAST",
        "WHAT DO LIKE FOR COMMON BREAK- PEOPLE FAST EAT": "WHAT DO COMMON PEOPLE EAT LIKE FOR BREAKFAST",
        # Targeted fixes for reported cases
        "ILLU- SIONS": "ILLU-SIONS",
        "ATTEN- TION": "ATTEN-TION",
        "WHAT DO COMMON PEOPLE HE EAT?": "WHAT DO COMMON PEOPLE EAT?",
        "LIKE FOR BREAK- FAST": "LIKE FOR BREAK-FAST?",
        "YOUR STUCK": "YOU'RE STUCK",
    }
    # Longest key first so specific multi-word patterns win over their
    # shorter substrings.
    for wrong in sorted(replacements, key=len, reverse=True):
        t = t.replace(wrong, replacements[wrong])
    t = dehyphenate_linebreak_artifacts(t)
    t = re.sub(r"\s{2,}", " ", t).strip()
    return t
def _is_sentence_like_fragment(t: str) -> bool:
    # A fragment counts as sentence-like once it has >= 2 alphanumerics.
    # NOTE(review): this function is redefined later in this module; that
    # later (identical) definition is the one in effect at runtime — this
    # copy is dead code and a candidate for deletion.
    t = normalize_text(t or "")
    if not t:
        return False
    alnum = re.sub(r"[^A-ZÀ-Ý0-9]", "", t)
    if len(alnum) < 2:
        return False
    return True
def _line_has_terminal_punct(t: str) -> bool:
    # True when the normalized line ends with sentence-final punctuation.
    # NOTE(review): redefined later in this module with an identical body;
    # this copy is shadowed at runtime and is a candidate for deletion.
    t = normalize_text(t or "")
    return bool(re.search(r"[.!?…]$", t))
def _smart_split_by_connectors(text: str) -> List[str]:
    """
    Conservative split for OCR text that glues multiple clauses.

    NOTE(review): this function is redefined later in this module; the later
    definition (which omits the WHAT DO pattern) is the one in effect at
    runtime — this copy is shadowed dead code.
    """
    t = normalize_text(text or "")
    if not t:
        return []
    # Keep hyphenated style if meaningful, but remove OCR line-wrap artifacts
    t = dehyphenate_linebreak_artifacts(t)
    # 1) Primary punctuation split
    parts = re.split(r"(?<=[.!?…])\s+", t)
    parts = [p.strip() for p in parts if p.strip()]
    if len(parts) >= 2:
        return parts
    # 2) Secondary lexical split if punctuation failed
    patterns = [
        r"\b(THEY'RE|THEY ARE)\b",
        r"\b(DON'T|DO NOT)\b",
        r"\b(LIKE FOR)\b",
        r"\b(IF WE DON'T|IF WE DO NOT)\b",
        r"\b(WHAT DO)\b",
    ]
    for pat in patterns:
        m = re.search(pat, t)
        # Require the connector to sit past the first few chars so the left
        # side is a real fragment and not an empty/tiny prefix.
        if m and m.start() > 8:
            left = t[:m.start()].strip()
            right = t[m.start():].strip()
            if _is_sentence_like_fragment(left) and _is_sentence_like_fragment(right):
                return [left, right]
    return [t]
def split_box_by_sentence_rows(indices, ocr, min_groups=2):
    """
    Force split one box into sentence-like row groups.
    Works for stacked dialogue blocks like:
        YOUR HAND!
        I'M STUCK AND HELPLESS LIKE THIS!
        IF WE DON'T HURRY UP, WE'LL BE CRUSHED TO DEATH!

    Returns a list of index groups (each sorted top-left first), or None when
    the box cannot be split into at least *min_groups* sentence-like groups.
    """
    if not indices or len(indices) < 3:
        return None
    # Build row groups first
    rows = group_indices_into_horizontal_rows(indices, ocr, row_tol_factor=0.70)
    if not rows or len(rows) < min_groups:
        return None
    # Turn each row-group into text
    row_payload = []
    for grp in rows:
        txt = normalize_text(" ".join(ocr[i][1] for i in grp if normalize_text(ocr[i][1])))
        txt = fix_group_level_ocr(txt)
        if not txt:
            continue
        box = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in grp])
        row_payload.append({"indices": grp, "text": txt, "box": box})
    if len(row_payload) < min_groups:
        return None
    # Merge tiny row fragments upward if they are clearly continuation
    merged = []
    for rp in row_payload:
        if not merged:
            merged.append(rp)
            continue
        prev = merged[-1]
        # A previous row of <=5 alphanumerics with no terminal punctuation is
        # treated as the start of the current sentence, not its own group.
        short_prev = len(re.sub(r"[^A-ZÀ-Ý0-9]", "", prev["text"])) <= 5
        no_term_prev = not re.search(r"[.!?…]$", prev["text"])
        if short_prev and no_term_prev:
            new_idx = sorted(set(prev["indices"] + rp["indices"]))
            new_txt = normalize_text(prev["text"] + " " + rp["text"])
            new_box = boxes_union_xyxy([prev["box"], rp["box"]])
            merged[-1] = {"indices": new_idx, "text": new_txt, "box": new_box}
        else:
            merged.append(rp)
    # Keep sentence-like groups
    out = []
    for m in merged:
        txt = normalize_text(m["text"])
        # Drop groups with fewer than 4 alphanumerics.
        if len(re.sub(r"[^A-ZÀ-Ý0-9]", "", txt)) < 4:
            continue
        out.append(sorted(m["indices"], key=lambda i: (
            quad_bbox(ocr[i][0])[1],
            quad_bbox(ocr[i][0])[0]
        )))
    if len(out) < min_groups:
        return None
    return out
def segment_box_into_phrases(indices, ocr, reading_mode="ltr") -> List[str]:
    """
    Layout-aware phrase segmentation for one final box.
    Uses your internal grouping + punctuation/connector splitting.

    NOTE(review): this function is redefined later in this module; the later
    definition is the one in effect at runtime — this copy is shadowed dead code.
    """
    groups = build_box_group_texts(indices, ocr, reading_mode=reading_mode)
    groups = [fix_group_level_ocr(g) for g in groups if _is_sentence_like_fragment(g)]
    if not groups:
        # Fall back to the flat layout text when no group survived filtering.
        merged = normalize_text(" ".join(build_final_box_text(indices, ocr, reading_mode=reading_mode)))
        merged = fix_group_level_ocr(merged)
        return [x for x in _smart_split_by_connectors(merged) if _is_sentence_like_fragment(x)]
    out = []
    for g in groups:
        out.extend(_smart_split_by_connectors(g))
    # Dedupe OCR echoes
    cleaned = []
    for p in out:
        p = normalize_text(p)
        if not _is_sentence_like_fragment(p):
            continue
        # Drop a phrase nearly identical (>=0.92 similarity) to its predecessor.
        if cleaned and text_similarity(cleaned[-1], p) >= 0.92:
            continue
        cleaned.append(p)
    return cleaned
def build_box_group_texts(indices, ocr, reading_mode="ltr"):
    """
    Return independent text groups for one final box, preserving internal layout.
    Each group is reconstructed with local reading-order logic.

    Vertical blocks are ordered by x-centre (reversed for "rtl" reading);
    horizontal blocks are ordered by y-centre, top to bottom.
    """
    layout = detect_internal_text_layout(indices, ocr, reading_mode=reading_mode)
    out = []
    if not layout:
        return out
    blocks = layout.get("blocks", [])
    for block in blocks:
        mode = block.get("mode", "horizontal")
        groups = block.get("groups", [])
        if mode == "vertical":
            # Columns: order by mean x-centre; right-to-left for rtl pages.
            groups = sorted(
                groups,
                key=lambda grp: np.mean([
                    (quad_bbox(ocr[i][0])[0] + quad_bbox(ocr[i][0])[2]) / 2.0
                    for i in grp
                ]),
                reverse=(reading_mode == "rtl")
            )
        else:
            # Rows: order by mean y-centre, top to bottom.
            groups = sorted(
                groups,
                key=lambda grp: np.mean([
                    (quad_bbox(ocr[i][0])[1] + quad_bbox(ocr[i][0])[3]) / 2.0
                    for i in grp
                ])
            )
        for grp in groups:
            txt = reconstruct_group_text(grp, ocr)
            if txt:
                out.append(txt)
    return out
def _is_sentence_like_fragment(t: str) -> bool:
    """A fragment counts as sentence-like once it has at least 2 alphanumerics."""
    cleaned = normalize_text(t or "")
    if not cleaned:
        return False
    alphanumerics = re.sub(r"[^A-ZÀ-Ý0-9]", "", cleaned)
    return len(alphanumerics) >= 2
def _line_has_terminal_punct(t: str) -> bool:
    """True when the normalized line ends with sentence-final punctuation."""
    normalized = normalize_text(t or "")
    return re.search(r"[.!?…]$", normalized) is not None
def _smart_split_by_connectors(text: str) -> List[str]:
    """
    Conservative split for OCR text that glues 2 clauses:
    - DON'T PAY ANY ATTEN-TION TO THEM! THEY'RE ILLU-SIONS!
    - WHAT DO COMMON PEOPLE EAT? LIKE FOR BREAK-FAST?

    This redefinition shadows the earlier copy of the same name in this
    module. NOTE(review): unlike that earlier copy, this version omits the
    WHAT DO connector pattern — confirm this is intentional.
    """
    t = normalize_text(text or "")
    if not t:
        return []
    # Normalize some OCR hyphen artifacts first
    t = dehyphenate_linebreak_artifacts(t)
    # Primary punctuation split
    parts = re.split(r"(?<=[.!?…])\s+", t)
    parts = [p.strip() for p in parts if p.strip()]
    if len(parts) >= 2:
        return parts
    # Secondary connector split patterns (conservative)
    patterns = [
        r"\b(THEY'RE|THEY ARE)\b",
        r"\b(DON'T|DO NOT)\b",
        r"\b(LIKE FOR)\b",
        r"\b(IF WE DON'T|IF WE DO NOT)\b",
    ]
    for pat in patterns:
        m = re.search(pat, t)
        # The connector must sit past the first few chars so the left side
        # is a real fragment, not an empty/tiny prefix.
        if m and m.start() > 8:
            left = t[:m.start()].strip()
            right = t[m.start():].strip()
            if _is_sentence_like_fragment(left) and _is_sentence_like_fragment(right):
                return [left, right]
    return [t]
def segment_box_into_phrases(indices, ocr, reading_mode="ltr") -> List[str]:
    """
    Layout-aware phrase segmentation for one final box.

    This redefinition shadows the earlier copy of the same name in this
    module and is the one in effect at runtime. Differences from that copy:
    the fallback path does not run fix_group_level_ocr on the merged text,
    and the fallback result is not filtered for sentence-likeness.
    """
    # Step 1: use your existing internal grouping
    groups = build_box_group_texts(indices, ocr, reading_mode=reading_mode)
    groups = [fix_group_level_ocr(g) for g in groups if _is_sentence_like_fragment(g)]
    if not groups:
        merged = normalize_text(" ".join(build_final_box_text(indices, ocr, reading_mode=reading_mode)))
        return _smart_split_by_connectors(merged)
    # Step 2: split each group by punctuation/connectors
    out = []
    for g in groups:
        out.extend(_smart_split_by_connectors(g))
    # Step 3: dedupe near-identical neighbors (OCR echo)
    cleaned = []
    for p in out:
        if not cleaned:
            cleaned.append(p)
            continue
        # Drop a phrase nearly identical (>=0.92 similarity) to its predecessor.
        if text_similarity(cleaned[-1], p) >= 0.92:
            continue
        cleaned.append(p)
    return [normalize_text(x) for x in cleaned if _is_sentence_like_fragment(x)]
def is_multi_group_bubble(indices, ocr, reading_mode="ltr", min_groups=2):
    """True when the box decomposes into at least *min_groups* meaningful text groups."""
    meaningful_count = 0
    for group_text in build_box_group_texts(indices, ocr, reading_mode=reading_mode):
        if len(re.sub(r"[^A-ZÀ-Ý0-9]", "", group_text)) >= 2:
            meaningful_count += 1
    return meaningful_count >= min_groups
def _bubble_text(indices, ocr, reading_mode="ltr"):
    """Flatten one bubble's layout-aware lines into a single normalized string."""
    lines = build_text_from_layout(indices, ocr, reading_mode=reading_mode)
    return normalize_text(" ".join(lines))
def _box_dims(b):
return max(1, b[2]-b[0]), max(1, b[3]-b[1])
def _intersection(a, b):
ix1, iy1 = max(a[0], b[0]), max(a[1], b[1])
ix2, iy2 = min(a[2], b[2]), min(a[3], b[3])
w, h = max(0, ix2-ix1), max(0, iy2-iy1)
return w*h
def _containment_ratio(child, parent):
    """Fraction of *child*'s area that lies inside *parent*."""
    overlap = _intersection(child, parent)
    child_area = (child[2] - child[0]) * (child[3] - child[1])
    return overlap / max(1, child_area)
def _center_distance(a, b):
acx, acy = (a[0]+a[2])/2.0, (a[1]+a[3])/2.0
bcx, bcy = (b[0]+b[2])/2.0, (b[1]+b[3])/2.0
return ((acx-bcx)**2 + (acy-bcy)**2) ** 0.5
def _reindex_boxes(bubbles, bubble_boxes, bubble_quads, bubble_indices):
new_b, new_bb, new_bq, new_bi = {}, {}, {}, {}
for nid, old in enumerate(sorted(bubble_boxes.keys()), start=1):
new_b[nid] = bubbles[old]
new_bb[nid] = bubble_boxes[old]
new_bq[nid] = bubble_quads[old]
new_bi[nid] = bubble_indices[old]
return new_b, new_bb, new_bq, new_bi
def reconcile_final_boxes(bubbles, bubble_boxes, bubble_quads, bubble_indices, ocr,
                          image_bgr=None, reading_mode="ltr"):
    """
    Final reconciliation pass over detected text boxes:
    - (A) merge highly-overlapping pairs
    - (B) absorb tiny "child" boxes into a containing parent
    - (C) merge complementary stacked fragments inside one bubble contour
    This version is safe for optional image input and propagates reading_mode
    into layout-aware text reconstruction.

    Each sweep commits at most one change per phase and then restarts; the
    loop ends when a full sweep changes nothing. Ids are renumbered 1..N
    at the end via _reindex_boxes.
    """
    if not bubble_boxes:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices
    # Median OCR token height is the yardstick for every distance/size threshold below.
    all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1]) for i in range(len(ocr))]
    med_h = float(np.median(all_h)) if all_h else 14.0
    # Bubble contours are optional; without an image, contour-based checks are skipped.
    bubble_contours = detect_speech_bubbles(image_bgr) if image_bgr is not None else []
    changed = True
    while changed:
        changed = False
        bids = sorted(bubble_boxes.keys())
        # ---- (A) Merge highly-overlapping pairs
        merged_any = False
        for i in range(len(bids)):
            if merged_any:
                break
            for j in range(i + 1, len(bids)):
                bi, bj = bids[i], bids[j]
                if bi not in bubble_boxes or bj not in bubble_boxes:
                    continue
                a, b = bubble_boxes[bi], bubble_boxes[bj]
                iou = boxes_iou(a, b)
                ovs = boxes_overlap_ratio(a, b) # inter / smaller
                same_contour = _in_same_bubble_contour(a, b, bubble_contours) if bubble_contours else False
                # Strong overlap merges unconditionally; moderate IoU needs contour agreement.
                if ovs >= 0.55 or (iou >= 0.35 and same_contour):
                    idx = sorted(set(bubble_indices[bi] + bubble_indices[bj]))
                    bubble_indices[bi] = idx
                    bubble_quads[bi] = [ocr[k][0] for k in idx]
                    bubble_boxes[bi] = boxes_union_xyxy([quad_bbox(ocr[k][0]) for k in idx])
                    bubbles[bi] = build_lines_from_indices(idx, ocr)
                    bubble_indices.pop(bj, None)
                    bubble_quads.pop(bj, None)
                    bubble_boxes.pop(bj, None)
                    bubbles.pop(bj, None)
                    changed = True
                    merged_any = True
                    break
        if changed:
            continue
        # ---- (B) Absorb tiny child boxes inside larger parent
        absorbed_any = False
        bids = sorted(bubble_boxes.keys())
        for i in range(len(bids)):
            if absorbed_any:
                break
            for j in range(len(bids)):
                if i == j:
                    continue
                child, parent = bids[i], bids[j]
                if child not in bubble_boxes or parent not in bubble_boxes:
                    continue
                cb, pb = bubble_boxes[child], bubble_boxes[parent]
                cw, ch = _box_dims(cb)
                pw, ph = _box_dims(pb)
                contain = _containment_ratio(cb, pb)
                child_txt = _bubble_text(bubble_indices[child], ocr, reading_mode=reading_mode)
                parent_txt = _bubble_text(bubble_indices[parent], ocr, reading_mode=reading_mode)
                # tiny or fragment child
                is_tiny = (cw <= med_h * 3.2 and ch <= med_h * 2.2) or len(child_txt) <= 14
                # don't absorb if it's clearly separate and far
                close = _center_distance(cb, pb) <= med_h * 4.0
                # Absorb only when the child is mostly contained AND is tiny or nearby.
                if contain >= 0.70 and (is_tiny or close):
                    idx = sorted(set(bubble_indices[parent] + bubble_indices[child]))
                    bubble_indices[parent] = idx
                    bubble_quads[parent] = [ocr[k][0] for k in idx]
                    bubble_boxes[parent] = boxes_union_xyxy([quad_bbox(ocr[k][0]) for k in idx])
                    bubbles[parent] = build_lines_from_indices(idx, ocr)
                    bubble_indices.pop(child, None)
                    bubble_quads.pop(child, None)
                    bubble_boxes.pop(child, None)
                    bubbles.pop(child, None)
                    changed = True
                    absorbed_any = True
                    break
        if changed:
            continue
        # ---- (C) Merge complementary fragments
        comp_any = False
        bids = sorted(bubble_boxes.keys())
        for i in range(len(bids)):
            if comp_any:
                break
            for j in range(i + 1, len(bids)):
                bi, bj = bids[i], bids[j]
                if bi not in bubble_boxes or bj not in bubble_boxes:
                    continue
                a, b = bubble_boxes[bi], bubble_boxes[bj]
                wi, hi = _box_dims(a)
                wj, hj = _box_dims(b)
                # Vertical gap between the stacked fragments (0 when they overlap vertically).
                vert_gap = max(0, max(a[1], b[1]) - min(a[3], b[3]))
                h_ix = max(0, min(a[2], b[2]) - max(a[0], b[0]))
                h_overlap_ratio = h_ix / max(1, min(wi, wj))
                same_contour = _in_same_bubble_contour(a, b, bubble_contours) if bubble_contours else False
                txt_i = _bubble_text(bubble_indices[bi], ocr, reading_mode=reading_mode)
                txt_j = _bubble_text(bubble_indices[bj], ocr, reading_mode=reading_mode)
                # Same contour + small vertical gap + horizontally aligned => likely one bubble.
                if same_contour and vert_gap <= med_h * 2.8 and h_overlap_ratio >= 0.45:
                    # prefer merge when one is upper fragment + other lower fragment
                    # and text is not identical duplicate
                    if txt_i != txt_j:
                        idx = sorted(set(bubble_indices[bi] + bubble_indices[bj]))
                        bubble_indices[bi] = idx
                        bubble_quads[bi] = [ocr[k][0] for k in idx]
                        bubble_boxes[bi] = boxes_union_xyxy([quad_bbox(ocr[k][0]) for k in idx])
                        bubbles[bi] = build_lines_from_indices(idx, ocr)
                        bubble_indices.pop(bj, None)
                        bubble_quads.pop(bj, None)
                        bubble_boxes.pop(bj, None)
                        bubbles.pop(bj, None)
                        changed = True
                        comp_any = True
                        break
    return _reindex_boxes(bubbles, bubble_boxes, bubble_quads, bubble_indices)
def split_boxes_by_internal_vertical_groups(bubbles, bubble_boxes, bubble_quads, bubble_indices,
                                            ocr, image_shape, reading_mode="ltr"):
    """
    Conservative splitter:
    - Split only when evidence is strong.
    - Prevent over-splitting of short/noisy vertical tokens.

    Per box, two strategies are tried in order:
      1. vertical-mode internal layout groups (strict size/text thresholds);
      2. row-based sentence split, accepted only when >= 2 parts end in
         terminal punctuation.
    Boxes without strong evidence are copied through unchanged.  Output
    maps are rebuilt with fresh sequential ids starting at 1.
    """
    ih, iw = image_shape[:2]
    out_bubbles = {}
    out_boxes = {}
    out_quads = {}
    out_indices = {}
    next_id = 1
    # conservative thresholds
    MIN_ALNUM_PER_GROUP = 8
    MIN_GROUP_HEIGHT_RATIO = 0.30 # was too low before
    MIN_VERTICAL_GROUPS_TO_SPLIT = 2
    MAX_SPLIT_PARTS = 3 # safety cap
    for bid in sorted(bubble_boxes.keys()):
        idxs = bubble_indices[bid]
        parent = bubble_boxes[bid]
        parent_h = max(1, parent[3] - parent[1])
        parent_w = max(1, parent[2] - parent[0])
        # Too few tokens to justify a split: pass through unchanged.
        if len(idxs) < 4:
            out_bubbles[next_id] = bubbles[bid]
            out_boxes[next_id] = bubble_boxes[bid]
            out_quads[next_id] = bubble_quads[bid]
            out_indices[next_id] = idxs
            next_id += 1
            continue
        layout = detect_internal_text_layout(idxs, ocr, reading_mode=reading_mode)
        did_split = False
        # --------------------------------------------------------------
        # Primary: vertical-mode internal groups (STRICT)
        # --------------------------------------------------------------
        if layout and layout.get("blocks"):
            candidate_groups = []
            for block in layout.get("blocks", []):
                if block.get("mode", "horizontal") != "vertical":
                    continue
                for grp in block.get("groups", []):
                    # Sort tokens top-to-bottom, then left-to-right, for stable text.
                    grp = sorted(set(grp), key=lambda i: (
                        quad_bbox(ocr[i][0])[1],
                        quad_bbox(ocr[i][0])[0]
                    ))
                    if not grp:
                        continue
                    txt = reconstruct_group_text_best(grp, ocr)
                    txt = normalize_text(fix_group_level_ocr(txt))
                    if not txt:
                        continue
                    alnum_len = len(re.sub(r"[^A-ZÀ-Ý0-9]", "", txt))
                    if alnum_len < MIN_ALNUM_PER_GROUP:
                        continue
                    gb = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in grp])
                    gw = max(1, gb[2] - gb[0])
                    gh = max(1, gb[3] - gb[1])
                    # require meaningful physical size
                    if gh < parent_h * MIN_GROUP_HEIGHT_RATIO:
                        continue
                    # avoid splitting tiny narrow SFX-like strips
                    if gw < parent_w * 0.12 and alnum_len < 12:
                        continue
                    # sentence-ish check
                    words = txt.split()
                    has_terminal = bool(re.search(r"[.!?…]$", txt))
                    if len(words) < 2 and not has_terminal:
                        continue
                    candidate_groups.append({
                        "indices": grp,
                        "text": txt,
                        "box": gb
                    })
            if len(candidate_groups) >= MIN_VERTICAL_GROUPS_TO_SPLIT:
                # Sort columns by reading order
                candidate_groups = sorted(
                    candidate_groups,
                    key=lambda g: (g["box"][0] + g["box"][2]) / 2.0,
                    reverse=(reading_mode == "rtl")
                )
                # cap extreme over-splits
                if len(candidate_groups) > MAX_SPLIT_PARTS:
                    candidate_groups = candidate_groups[:MAX_SPLIT_PARTS]
                # final sanity: total text coverage vs parent text
                parent_txt = normalize_text(" ".join(build_final_box_text(idxs, ocr, reading_mode=reading_mode)))
                parent_alnum = max(1, len(re.sub(r"[^A-ZÀ-Ý0-9]", "", parent_txt)))
                sum_child_alnum = sum(len(re.sub(r"[^A-ZÀ-Ý0-9]", "", g["text"])) for g in candidate_groups)
                # if split loses too much text evidence, reject
                if (sum_child_alnum / parent_alnum) >= 0.65:
                    for g in candidate_groups:
                        grp = sorted(set(g["indices"]), key=lambda i: (
                            quad_bbox(ocr[i][0])[1],
                            quad_bbox(ocr[i][0])[0]
                        ))
                        ub = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in grp])
                        out_indices[next_id] = grp
                        out_quads[next_id] = [ocr[i][0] for i in grp]
                        # Pad by 2 px, clamped to image bounds.
                        out_boxes[next_id] = (
                            max(0, ub[0] - 2), max(0, ub[1] - 2),
                            min(iw - 1, ub[2] + 2), min(ih - 1, ub[3] + 2)
                        )
                        out_bubbles[next_id] = build_final_box_text(grp, ocr, reading_mode=reading_mode)
                        next_id += 1
                    did_split = True
        if did_split:
            continue
        # --------------------------------------------------------------
        # Fallback: row sentence split (ONLY for strong punctuation cases)
        # --------------------------------------------------------------
        row_sentence_parts = split_box_by_sentence_rows(idxs, ocr, min_groups=2)
        if row_sentence_parts and 2 <= len(row_sentence_parts) <= 3:
            # Require punctuation evidence in resulting parts
            part_texts = []
            for grp in row_sentence_parts:
                txt = normalize_text(" ".join(build_lines_from_indices(grp, ocr)))
                txt = fix_group_level_ocr(txt)
                part_texts.append(txt)
            punct_parts = sum(1 for t in part_texts if re.search(r"[.!?…]$", t))
            if punct_parts >= 2:
                for grp in row_sentence_parts:
                    grp = sorted(set(grp), key=lambda i: (
                        quad_bbox(ocr[i][0])[1],
                        quad_bbox(ocr[i][0])[0]
                    ))
                    ub = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in grp])
                    out_indices[next_id] = grp
                    out_quads[next_id] = [ocr[i][0] for i in grp]
                    out_boxes[next_id] = (
                        max(0, ub[0] - 2), max(0, ub[1] - 2),
                        min(iw - 1, ub[2] + 2), min(ih - 1, ub[3] + 2)
                    )
                    out_bubbles[next_id] = build_final_box_text(grp, ocr, reading_mode=reading_mode)
                    next_id += 1
                continue
        # --------------------------------------------------------------
        # Keep original if no strong split evidence
        # --------------------------------------------------------------
        out_bubbles[next_id] = bubbles[bid]
        out_boxes[next_id] = bubble_boxes[bid]
        out_quads[next_id] = bubble_quads[bid]
        out_indices[next_id] = idxs
        next_id += 1
    return out_bubbles, out_boxes, out_quads, out_indices
def split_box_by_internal_vertical_gaps(bid, bubble_indices, ocr, factor=1.45, min_gap=16):
    """
    Multi-cut vertical splitter.
    Splits one bubble into N vertical groups when there are multiple strong y-gaps.
    Good for 4+4 quad accidental merges.

    A "strong" gap is one wider than max(min_gap, median_token_height * factor),
    measured between consecutive token boxes after sorting top-to-bottom.
    Returns a list of index groups, or None when no valid split is found.
    """
    idxs = bubble_indices.get(bid, [])
    if len(idxs) < 4:
        return None
    # Collect (index, bbox, center_y, height) per token.
    items = []
    for i in idxs:
        b = quad_bbox(ocr[i][0])
        cy = (b[1] + b[3]) / 2.0
        h = max(1, b[3] - b[1])
        items.append((i, b, cy, h))
    items.sort(key=lambda x: x[2]) # top->bottom
    med_h = float(np.median([x[3] for x in items])) if items else 12.0
    th = max(min_gap, med_h * factor)
    # Collect cut points
    cut_positions = []
    prev_b = items[0][1]
    for k in range(1, len(items)):
        cur_b = items[k][1]
        # Gap = top of current box minus bottom of the previous box (may be negative).
        gap = cur_b[1] - prev_b[3]
        if gap > th:
            cut_positions.append(k)
        prev_b = cur_b
    if not cut_positions:
        return None
    # Build groups using all cut positions
    groups = []
    start = 0
    for cp in cut_positions:
        groups.append([it[0] for it in items[start:cp]])
        start = cp
    groups.append([it[0] for it in items[start:]])
    # Remove empty groups
    groups = [g for g in groups if g]
    if len(groups) <= 1:
        return None
    # Sanity: each group should be meaningful
    clean_groups = []
    for g in groups:
        txt = normalize_text(" ".join(build_lines_from_indices(g, ocr)))
        # Keep a group only if it has multiple tokens or enough text to matter.
        if len(g) >= 2 or len(txt) >= 12:
            clean_groups.append(g)
    if len(clean_groups) <= 1:
        return None
    return clean_groups
def force_split_bridged_boxes(bubbles, bubble_boxes, bubble_quads, bubble_indices, ocr, image_bgr):
    """
    Force-split boxes that accidentally contain multiple vertically separated speech chunks.
    Typical fixes:
    - one detected box actually contains 2 stacked bubbles
    - "4 quads + 4 quads" merged into one cluster
    - mixed contour membership inside one grouped box

    Strategies, tried in order per box:
      (A) internal vertical-gap multi-split,
      (B) grouping by detected bubble-contour membership,
      (C) balanced halving for large (>= 8 token) clusters with a clear gap.
    Output maps are renumbered with fresh sequential ids.
    """
    if not bubble_boxes:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices
    bubble_contours = detect_speech_bubbles(image_bgr)
    def contour_id_for_idx(i):
        # Return the index of the contour containing this token's center, or -1.
        b = quad_bbox(ocr[i][0])
        cx = (b[0] + b[2]) / 2.0
        cy = (b[1] + b[3]) / 2.0
        for ci, c in enumerate(bubble_contours):
            if cv2.pointPolygonTest(c, (cx, cy), False) >= 0:
                return ci
        return -1
    def build_group_payload(g):
        # Assemble (lines, union box, quads, sorted indices) for one index group.
        g_sorted = sorted(g, key=lambda i: quad_center(ocr[i][0])[1])
        ub = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in g_sorted])
        return (
            build_lines_from_indices(g_sorted, ocr), # lines
            ub, # box
            [ocr[i][0] for i in g_sorted], # quads
            g_sorted # indices
        )
    new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {}
    next_bid = 1
    for bid in sorted(bubble_boxes.keys()):
        idxs = bubble_indices.get(bid, [])
        if len(idxs) < 2:
            # keep as-is
            new_bubbles[next_bid] = bubbles[bid]
            new_boxes[next_bid] = bubble_boxes[bid]
            new_quads[next_bid] = bubble_quads[bid]
            new_indices[next_bid] = bubble_indices[bid]
            next_bid += 1
            continue
        parts = None
        # ------------------------------------------------------------------
        # (A) Primary: internal vertical-gap multi-split
        # ------------------------------------------------------------------
        parts = split_box_by_internal_vertical_gaps(
            bid, bubble_indices, ocr, factor=1.45, min_gap=16
        )
        # ------------------------------------------------------------------
        # (B) Secondary: split by contour membership if clearly mixed
        # ------------------------------------------------------------------
        if parts is None and len(idxs) >= 3:
            by_contour = {}
            for i in idxs:
                cid = contour_id_for_idx(i)
                by_contour.setdefault(cid, []).append(i)
            # NOTE(review): len(g) >= 1 is always true here; the real size
            # filtering happens in the `valid` loop below.
            contour_groups = [g for g in by_contour.values() if len(g) >= 1]
            if len(contour_groups) >= 2:
                # sort groups top->bottom for stable order
                contour_groups.sort(key=lambda g: min(quad_bbox(ocr[i][0])[1] for i in g))
                # sanity: avoid splitting tiny noise-only tails
                valid = []
                for g in contour_groups:
                    txt = normalize_text(" ".join(build_lines_from_indices(g, ocr)))
                    if len(g) >= 2 or len(txt) >= 10:
                        valid.append(g)
                if len(valid) >= 2:
                    parts = valid
        # ------------------------------------------------------------------
        # (C) Tertiary: balanced 2-block pattern (e.g., 4 quads + 4 quads)
        # ------------------------------------------------------------------
        if parts is None and len(idxs) >= 8:
            sorted_idxs = sorted(
                idxs,
                key=lambda i: (quad_bbox(ocr[i][0])[1] + quad_bbox(ocr[i][0])[3]) / 2.0
            )
            mid = len(sorted_idxs) // 2
            g1, g2 = sorted_idxs[:mid], sorted_idxs[mid:]
            b1 = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in g1])
            b2 = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in g2])
            if b1 and b2:
                vgap = max(0, b2[1] - b1[3])
                h1 = max(1, b1[3] - b1[1])
                h2 = max(1, b2[3] - b2[1])
                med_local_h = (h1 + h2) / 2.0
                h_ix = max(0, min(b1[2], b2[2]) - max(b1[0], b2[0]))
                min_w = max(1, min(b1[2] - b1[0], b2[2] - b2[0]))
                h_overlap_ratio = h_ix / min_w
                # Require both a real vertical gap and horizontal alignment before halving.
                if vgap >= max(14, 0.22 * med_local_h) and h_overlap_ratio >= 0.30:
                    parts = [g1, g2]
        # ------------------------------------------------------------------
        # Commit split or keep original
        # ------------------------------------------------------------------
        if parts is None or len(parts) <= 1:
            new_bubbles[next_bid] = bubbles[bid]
            new_boxes[next_bid] = bubble_boxes[bid]
            new_quads[next_bid] = bubble_quads[bid]
            new_indices[next_bid] = bubble_indices[bid]
            next_bid += 1
            continue
        for g in parts:
            lines, box, quads, gidx = build_group_payload(g)
            new_bubbles[next_bid] = lines
            new_boxes[next_bid] = box
            new_quads[next_bid] = quads
            new_indices[next_bid] = gidx
            next_bid += 1
    return new_bubbles, new_boxes, new_quads, new_indices
# ============================================================
# translate_manga_text START
# ============================================================
def translate_manga_text(
    image_path="001-page.png",
    source_lang="en",
    target_lang="ca",
    confidence_threshold=0.03,
    min_text_length=1,
    gap_px="auto",
    quality_threshold=0.62,
    export_to_file="output.txt",
    export_bubbles_to="bubbles.json",
    reading_mode="ltr",
    debug=True,
    use_enhanced_ocr=True,
    strict_grouping=True,
    max_box_width_ratio=0.6,
    max_box_height_ratio=0.5,
    auto_fix_bubbles=True
):
    """
    Run the full page pipeline: OCR -> filter -> group into text boxes ->
    split/merge repair passes -> OCR text correction -> translation ->
    export of a report table and a bubbles JSON.

    Args:
        image_path: page image to process.
        source_lang / target_lang: languages for OCR filtering and
            GoogleTranslator (deep_translator accepts codes or names).
        confidence_threshold: minimum per-detection OCR confidence kept.
        min_text_length: minimum normalized text length kept.
        gap_px: vertical grouping gap in pixels, or "auto" to estimate.
        quality_threshold: OCR score below which a bubble is re-read upscaled.
        export_to_file / export_bubbles_to: report table / JSON output paths.
        reading_mode: "ltr" or "rtl" panel reading order.
        debug: also save a cluster-visualization PNG.
        use_enhanced_ocr: multi-pass Vision OCR plus missed-region rescan.
        strict_grouping: stricter vertical token grouping.
        max_box_width_ratio / max_box_height_ratio: box size caps vs page.
        auto_fix_bubbles: enable split/merge auto-repair passes.

    Side effects: writes the report file, the bubbles JSON and (optionally)
    the debug PNG; prints progress to stdout.  Returns None.
    """
    image = cv2.imread(image_path)
    if image is None:
        print(f"❌ Cannot load image: {image_path}")
        return
    resolved_gap = auto_gap(image_path) if gap_px == "auto" else float(gap_px)
    ih, iw = image.shape[:2]
    print("Loading OCR engines...")
    if use_enhanced_ocr:
        detector = ImprovedMacVisionDetector(source_lang=source_lang)
        print("🚀 Using Enhanced Multi-Pass OCR")
    else:
        detector = MacVisionDetector(source_lang=source_lang)
    print("Running detection OCR (Apple Vision)...")
    raw = detector.read(image_path)
    print(f"Raw detections: {len(raw)}")
    if use_enhanced_ocr:
        # Rescan regions that look like text but were missed, at 4x upscale;
        # recovered quads are mapped back into page coordinates.
        existing_quads = [r[0] for r in raw]
        missed_regions = detect_small_text_regions(image, existing_quads)
        if missed_regions:
            print(f"🔍 Found {len(missed_regions)} potentially missed text regions")
            for region in missed_regions:
                rx1, ry1, rx2, ry2 = region
                pad = 10
                rx1, ry1 = max(0, rx1 - pad), max(0, ry1 - pad)
                rx2, ry2 = min(iw, rx2 + pad), min(ih, ry2 + pad)
                crop = image[ry1:ry2, rx1:rx2]
                if crop.size > 0:
                    upscaled = cv2.resize(
                        crop, None, fx=4.0, fy=4.0, interpolation=cv2.INTER_CUBIC
                    )
                    for quad, text, conf in detector.run_vision_ocr(upscaled):
                        raw.append((
                            [[int(p[0] / 4.0 + rx1), int(p[1] / 4.0 + ry1)] for p in quad],
                            text,
                            conf
                        ))
            print(f"📝 Total detections after missed region scan: {len(raw)}")
    # ── Filtering ─────────────────────────────────────────────────────────
    filtered, skipped = [], 0
    for bbox, text, conf in raw:
        t = normalize_text(text)
        qb = quad_bbox(bbox)
        if conf < confidence_threshold:
            skipped += 1
            continue
        if len(t) < min_text_length:
            skipped += 1
            continue
        if not is_valid_language(t, source_lang):
            skipped += 1
            continue
        if not is_meaningful_text(t, source_lang):
            skipped += 1
            continue
        # Drop low-confidence long text in the top band (usually page headers).
        if qb[1] < int(ih * TOP_BAND_RATIO) and conf < 0.70 and len(t) >= 5:
            skipped += 1
            continue
        filtered.append((bbox, t, conf))
    print(f"Kept: {len(filtered)} | Skipped: {skipped}")
    # Protect short dialogue token confidence
    filtered = [
        (bbox, t, maybe_conf_floor_for_protected(t, conf, floor=0.40))
        for bbox, t, conf in filtered
    ]
    # Rescue names/short tokens dropped by strict filters
    rescued = rescue_name_and_short_tokens(raw, min_conf=0.20)
    filtered = merge_rescued_items(filtered, rescued, iou_threshold=0.55)
    if not filtered:
        print("⚠️ No text after filtering.")
        return
    # ── Pre-grouping quad splits ──────────────────────────────────────────
    filtered, oversized_splits = validate_and_split_oversized_quads(image, filtered)
    if oversized_splits > 0:
        print(f"📐 Split {oversized_splits} oversized quad(s) before grouping")
    filtered, wide_splits = split_wide_ocr_items(image, filtered)
    if wide_splits > 0:
        print(f"✂️ Split {wide_splits} wide OCR lines across column gaps.")
    filtered, bridge_splits = split_abnormal_bridge_quads(image, filtered)
    if bridge_splits > 0:
        print(f"🧩 Split {bridge_splits} abnormal bridge OCR quad(s).")
    hs_pre = [max(1, quad_bbox(q)[3] - quad_bbox(q)[1]) for q, _, _ in filtered]
    med_h_pre = float(np.median(hs_pre)) if hs_pre else 14.0
    filtered, _ = apply_column_gap_splits(image, filtered, med_h_pre)
    filtered = normalize_ocr_quads(filtered)
    # ── Grouping ──────────────────────────────────────────────────────────
    print("📊 Grouping quads vertically...")
    bubbles, bubble_boxes, bubble_quads, bubble_indices = group_tokens_vertical(
        filtered, image.shape, gap_px=resolved_gap,
        bbox_padding=1, strict_mode=strict_grouping
    )
    print(f"   Created {len(bubbles)} initial bubble-group box(es)")
    print("🧱 Proposing region-first text containers...")
    region_lines, region_boxes, region_quads, region_indices = propose_text_regions_from_ocr(
        filtered, image.shape
    )
    print(f"   Proposed {len(region_lines)} region container(s)")
    # ── Auto-fix (split + merge) ──────────────────────────────────────────
    if auto_fix_bubbles:
        bubbles, bubble_boxes, bubble_quads, bubble_indices = auto_fix_bubble_detection(
            bubble_boxes, bubble_indices, bubble_quads, bubbles, filtered, image
        )
        bubbles, bubble_boxes, bubble_quads, bubble_indices = merge_micro_boxes_relaxed(
            bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered, image
        )
    # ── Enforce max box size ──────────────────────────────────────────────
    bubbles, bubble_boxes, bubble_quads, bubble_indices = enforce_max_box_size(
        bubble_boxes, bubble_indices, bubble_quads, bubbles, filtered,
        max_width_ratio=max_box_width_ratio,
        max_height_ratio=max_box_height_ratio,
        image_shape=image.shape
    )
    # ── Close-proximity merge ─────────────────────────────────────────────
    bubbles, bubble_boxes, bubble_quads, bubble_indices = merge_close_bubbles_by_line_height(
        bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered
    )
    # ── Per-bubble split pass ─────────────────────────────────────────────
    new_bubbles, new_bubble_boxes, new_bubble_quads, new_bubble_indices = {}, {}, {}, {}
    next_bid = max(bubbles.keys()) + 1 if bubbles else 1
    splits_performed = []
    for bid in list(bubbles.keys()):
        split_result, split_reason = _split_bubble_if_needed(
            bid, bubble_indices, bubble_quads, bubble_boxes, filtered, image, iw, ih
        )
        if split_result:
            p1, p2 = split_result
            splits_performed.append(f"BOX#{bid} ({split_reason})")
            # First half keeps the original id; second half gets a fresh id.
            for part_idxs, part_bid in [(p1, bid), (p2, next_bid)]:
                ub = boxes_union_xyxy([quad_bbox(filtered[i][0]) for i in part_idxs])
                new_bubbles[part_bid] = build_final_box_text(
                    part_idxs, filtered, reading_mode=reading_mode
                )
                new_bubble_boxes[part_bid] = (
                    max(0, ub[0] - 2), max(0, ub[1] - 2),
                    min(iw - 1, ub[2] + 2), min(ih - 1, ub[3] + 2)
                )
                new_bubble_quads[part_bid] = [filtered[i][0] for i in part_idxs]
                new_bubble_indices[part_bid] = part_idxs
            next_bid += 1
        else:
            new_bubbles[bid] = build_final_box_text(
                bubble_indices[bid], filtered, reading_mode=reading_mode
            )
            new_bubble_boxes[bid] = bubble_boxes[bid]
            new_bubble_quads[bid] = bubble_quads[bid]
            new_bubble_indices[bid] = bubble_indices[bid]
    if splits_performed:
        print(f"\n🔀 Splits detected: {len(splits_performed)}")
        for s in splits_performed:
            print(f"{s}")
    bubbles = new_bubbles
    bubble_boxes = new_bubble_boxes
    bubble_quads = new_bubble_quads
    bubble_indices = new_bubble_indices
    # ── Reattach orphan short tokens ──────────────────────────────────────
    bubbles, bubble_boxes, bubble_quads, bubble_indices = reattach_orphan_short_tokens(
        bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered
    )
    for bid in list(bubble_indices.keys()):
        bubbles[bid] = build_final_box_text(
            bubble_indices[bid], filtered, reading_mode=reading_mode
        )
    # ── Final reconciliation pass ─────────────────────────────────────────
    bubbles, bubble_boxes, bubble_quads, bubble_indices = reconcile_final_boxes(
        bubbles,
        bubble_boxes,
        bubble_quads,
        bubble_indices,
        filtered,
        image_bgr=image,
        reading_mode=reading_mode
    )
    for bid in list(bubble_indices.keys()):
        bubbles[bid] = build_final_box_text(
            bubble_indices[bid], filtered, reading_mode=reading_mode
        )
    bubbles, bubble_boxes, bubble_quads, bubble_indices = force_split_bridged_boxes(
        bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered, image
    )
    for bid in list(bubble_indices.keys()):
        bubbles[bid] = build_final_box_text(
            bubble_indices[bid], filtered, reading_mode=reading_mode
        )
    # Second reconciliation catches overlaps created by the force-split above.
    bubbles, bubble_boxes, bubble_quads, bubble_indices = reconcile_final_boxes(
        bubbles,
        bubble_boxes,
        bubble_quads,
        bubble_indices,
        filtered,
        image_bgr=image,
        reading_mode=reading_mode
    )
    for bid in list(bubble_indices.keys()):
        bubbles[bid] = build_final_box_text(
            bubble_indices[bid], filtered, reading_mode=reading_mode
        )
    # ── Reconcile bubble-first and region-first views ─────────────────────
    bubbles, bubble_boxes, bubble_quads, bubble_indices = reconcile_region_and_bubble_groups(
        region_lines, region_boxes, region_quads, region_indices,
        bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered
    )
    for bid in list(bubble_indices.keys()):
        bubbles[bid] = build_final_box_text(
            bubble_indices[bid], filtered, reading_mode=reading_mode
        )
    # ── Split boxes by internal vertical groups ───────────────────────────
    bubbles, bubble_boxes, bubble_quads, bubble_indices = split_boxes_by_internal_vertical_groups(
        bubbles,
        bubble_boxes,
        bubble_quads,
        bubble_indices,
        filtered,
        image.shape,
        reading_mode=reading_mode
    )
    for bid in list(bubble_indices.keys()):
        bubbles[bid] = build_final_box_text(
            bubble_indices[bid], filtered, reading_mode=reading_mode
        )
    print(f"✅ Final box count: {len(bubbles)}")
    # ── OCR quality pass ──────────────────────────────────────────────────
    translator = GoogleTranslator(source=source_lang, target=target_lang)
    clean_lines: Dict[int, str] = {}
    raw_lines: Dict[int, str] = {}
    corrected_lines: Dict[int, str] = {}
    sources_used: Dict[int, str] = {}
    translations: Dict[int, str] = {}
    region_types: Dict[int, str] = {}
    region_confidences: Dict[int, float] = {}
    region_flags: Dict[int, List[str]] = {}
    bubble_group_texts: Dict[int, List[str]] = {}
    for bid in sorted(bubble_boxes.keys()):
        final_lines = build_final_box_text(
            bubble_indices[bid], filtered, reading_mode=reading_mode
        )
        bubbles[bid] = final_lines
        # NEW: segmented phrase groups for translation
        group_texts = segment_box_into_phrases(
            bubble_indices[bid], filtered, reading_mode=reading_mode
        )
        bubble_group_texts[bid] = group_texts
        base_txt = normalize_text(" ".join(final_lines))
        raw_lines[bid] = base_txt
        base_sc = ocr_candidate_score(base_txt)
        txt, src_used = base_txt, "vision-base"
        # Re-read low-quality bubbles upscaled; accept only a clear improvement.
        if base_sc < quality_threshold:
            rr_txt, rr_sc, rr_src = reread_bubble_with_vision(
                image, bubble_boxes[bid], detector, upscale=3.0, pad=24
            )
            if rr_txt and rr_sc > base_sc + 0.04 and is_valid_language(rr_txt, source_lang):
                txt, src_used = rr_txt, rr_src
        tmp_lines = [txt] if txt else final_lines
        region_type = classify_region_type(image, bubble_boxes[bid], tmp_lines)
        # correction gain metric is not used; only the corrected text matters here
        corrected_txt, _ = correct_region_text(txt, region_type=region_type)
        conf = compute_region_confidence(txt, corrected_txt, bubble_boxes[bid], region_type, image)
        flags = build_region_flags(txt, corrected_txt, region_type, conf)
        if len([g for g in group_texts if g.strip()]) >= 2:
            flags.append("BUBBLE")
            flags.append("SEGMENTED")
        norm_corrected = normalize_text(corrected_txt)
        clean_lines[bid] = norm_corrected
        corrected_lines[bid] = norm_corrected
        sources_used[bid] = src_used
        region_types[bid] = region_type
        region_confidences[bid] = conf
        region_flags[bid] = sorted(set(flags))
    reading_map = estimate_reading_order(bubble_boxes, mode=reading_mode)
    # ── Translation ───────────────────────────────────────────────────────
    for bid in sorted(clean_lines.keys(), key=lambda x: reading_map.get(x, x)):
        group_texts = [g for g in bubble_group_texts.get(bid, []) if g.strip()]
        # Translate phrase-groups individually when the bubble was segmented.
        if len(group_texts) >= 2:
            src_txt = " ".join(group_texts).strip()
        else:
            src_txt = clean_lines[bid].strip()
        if not src_txt:
            continue
        if not is_valid_language(src_txt, source_lang):
            continue
        if not is_meaningful_text(src_txt, source_lang):
            continue
        try:
            if len(group_texts) >= 2:
                translated_groups = []
                for g in group_texts:
                    if not is_valid_language(g, source_lang):
                        continue
                    if not is_meaningful_text(g, source_lang):
                        continue
                    tg = translator.translate(g) or ""
                    tg = postprocess_translation_general(tg).upper()
                    if tg:
                        translated_groups.append(tg)
                tgt = " || ".join(translated_groups)
            else:
                tgt = translator.translate(src_txt) or ""
                tgt = postprocess_translation_general(tgt).upper()
        except Exception as e:
            # Keep the pipeline alive on translator/network failures.
            tgt = f"[Error: {e}]"
        translations[bid] = tgt
    if debug:
        save_debug_clusters(
            image_path, filtered, bubble_boxes, bubble_indices,
            clean_lines, "debug_clusters.png", region_types=region_types
        )
    # ── Text output ───────────────────────────────────────────────────────
    divider = "" * 140
    out_lines = [
        "BUBBLE|ORDER|TYPE|CONF|OCR_SOURCE|ORIGINAL|CORRECTED|BUBBLE_GROUPS|TRANSLATED|FLAGS",
        divider
    ]
    print(
        divider +
        f"\n{'BUBBLE':<8} {'ORDER':<6} {'TYPE':<10} {'CONF':<6} {'SOURCE':<12} "
        f"{'CORRECTED':<30} {'BUBBLE_GROUPS':<40} {'TRANSLATED':<30} FLAGS\n" +
        divider
    )
    translated_count = 0
    for bid in sorted(clean_lines.keys(), key=lambda x: reading_map.get(x, x)):
        src_txt = clean_lines[bid].strip()
        if not src_txt:
            continue
        if not is_valid_language(src_txt, source_lang):
            continue
        if not is_meaningful_text(src_txt, source_lang):
            continue
        flags = list(region_flags.get(bid, []))
        tgt = translations.get(bid, "")
        if not tgt:
            flags.append("NO_TRANSLATION")
        src_engine = sources_used.get(bid, "unknown")
        rtype = region_types.get(bid, "unknown")
        rconf = region_confidences.get(bid, 0.0)
        raw_u = raw_lines.get(bid, "").upper()
        corr_u = corrected_lines.get(bid, "").upper()
        group_blob = " || ".join(bubble_group_texts.get(bid, [])).upper()
        out_lines.append(
            f"#{bid}|{reading_map.get(bid, bid)}|{rtype}|{rconf:.2f}|{src_engine}|"
            f"{raw_u}|{corr_u}|{group_blob}|{tgt}|{','.join(flags) if flags else '-'}"
        )
        print(
            f"#{bid:<7} {reading_map.get(bid,bid):<6} {rtype:<10} {rconf:<6.2f} {src_engine:<12} "
            f"{corr_u[:30]:<30} {group_blob[:40]:<40} {tgt[:30]:<30} "
            f"{','.join(flags) if flags else '-'}"
        )
        translated_count += 1
    out_lines.append(divider + f"\n✅ Done! {translated_count} bubble(s) translated.")
    with open(export_to_file, "w", encoding="utf-8") as f:
        f.write("\n".join(out_lines))
    # ── bubbles.json ──────────────────────────────────────────────────────
    bubbles_payload = {}
    for bid in sorted(clean_lines.keys(), key=lambda x: reading_map.get(x, x)):
        src_txt = clean_lines[bid].strip()
        if not src_txt:
            continue
        if not is_valid_language(src_txt, source_lang):
            continue
        if not is_meaningful_text(src_txt, source_lang):
            continue
        box = bubble_boxes.get(bid)
        tgt = translations.get(bid, "")
        bubbles_payload[str(bid)] = {
            "order": reading_map.get(bid, bid),
            "region_type": region_types.get(bid, "unknown"),
            "confidence": round(region_confidences.get(bid, 0.0), 4),
            "ocr_source": sources_used.get(bid, "unknown"),
            "raw_ocr": raw_lines.get(bid, "").upper(),
            "corrected_ocr": corrected_lines.get(bid, "").upper(),
            "translation_input": src_txt.upper(),
            "translated": tgt,
            "flags": region_flags.get(bid, []),
            "bubble_groups": [g.upper() for g in bubble_group_texts.get(bid, [])],
            "box": {
                "x": box[0] if box else 0,
                "y": box[1] if box else 0,
                "w": (box[2] - box[0]) if box else 0,
                "h": (box[3] - box[1]) if box else 0,
            },
            "lines": [line.upper() for line in bubbles.get(bid, [])],
        }
    with open(export_bubbles_to, "w", encoding="utf-8") as f:
        json.dump(bubbles_payload, f, ensure_ascii=False, indent=2)
    print(divider + f"\nSaved: {export_to_file}\nSaved: {export_bubbles_to}")
# ============================================================
# translate_manga_text END
# ============================================================
# ============================================================
# ENTRY POINT
# ============================================================
if __name__ == "__main__":
    # Demo invocation with the tuning used during development.
    # NOTE(review): source_lang is passed as the full name "english" here,
    # while the function's default is the ISO code "en" — deep_translator
    # accepts both forms, but confirm the OCR language filters do as well.
    translate_manga_text(
        image_path="19.png",
        source_lang="english",
        target_lang="ca",
        confidence_threshold=0.03,
        min_text_length=1,
        gap_px="auto",
        quality_threshold=0.62,
        export_to_file="output.txt",
        export_bubbles_to="bubbles.json",
        reading_mode="ltr",
        debug=True,
        use_enhanced_ocr=True,
        strict_grouping=True,
        max_box_width_ratio=0.6,
        max_box_height_ratio=0.5,
        auto_fix_bubbles=True
    )