Improving white coloring

This commit is contained in:
Guillem Hernandez Sola
2026-04-14 20:38:05 +02:00
parent f95b7d32d4
commit eadc28154a

View File

@@ -1,8 +1,8 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import re
import os import os
import re
import json import json
import cv2 import cv2
import numpy as np import numpy as np
@@ -10,9 +10,9 @@ import easyocr
from deep_translator import GoogleTranslator from deep_translator import GoogleTranslator
# ───────────────────────────────────────────── # ============================================================
# CONFIG # CONFIG
# ───────────────────────────────────────────── # ============================================================
GLOSSARY = { GLOSSARY = {
"ANYA": "ANYA", "ANYA": "ANYA",
"STARLIGHT ANYA": "STARLIGHT ANYA", "STARLIGHT ANYA": "STARLIGHT ANYA",
@@ -34,16 +34,17 @@ TITLE_PATTERNS = [
] ]
# Regexes (used with re.fullmatch by is_noise_text) that mark an OCR token as
# noise rather than dialogue.
NOISE_PATTERNS = [
    # Only symbols: nothing alphanumeric and none of ? ! . ¡ ¿
    r"^[^a-zA-Z0-9\?!.¡¿]+$",
    r"^BOX[#\s0-9A-Z\-]*$",  # debug labels
    r"^[0-9]{1,3}\s*[Xx]\s*[0-9]{1,3}$",  # e.g. 98x12
]

# Fraction of the image height treated as the "top strip", where low-confidence
# detections are filtered more aggressively.
TOP_BAND_RATIO = 0.08
# ───────────────────────────────────────────── # ============================================================
# TEXT HELPERS # TEXT HELPERS
# ───────────────────────────────────────────── # ============================================================
def normalize_text(text: str) -> str: def normalize_text(text: str) -> str:
t = (text or "").strip().upper() t = (text or "").strip().upper()
t = t.replace("", "\"").replace("", "\"") t = t.replace("", "\"").replace("", "\"")
@@ -51,18 +52,22 @@ def normalize_text(text: str) -> str:
t = t.replace("", "...") t = t.replace("", "...")
t = re.sub(r"\s+", " ", t) t = re.sub(r"\s+", " ", t)
t = re.sub(r"\s+([,.;:!?])", r"\1", t) t = re.sub(r"\s+([,.;:!?])", r"\1", t)
t = re.sub(r"([¡¿])\s+", r"\1", t)
t = re.sub(r"\(\s+", "(", t) t = re.sub(r"\(\s+", "(", t)
t = re.sub(r"\s+\)", ")", t) t = re.sub(r"\s+\)", ")", t)
t = re.sub(r"\.{4,}", "...", t) t = re.sub(r"\.{4,}", "...", t)
t = re.sub(r",\?", "?", t) t = t.replace("IQUE", "¡QUE")
t = t.replace("IQUIEN", "¿QUIEN")
return t.strip() return t.strip()
def apply_glossary(text: str) -> str:
    """Replace every whole-word glossary term in *text* with its canonical form.

    Matching is case-insensitive and anchored on word boundaries. ``None`` or
    empty input yields an empty string.
    """
    out = text or ""
    # Longest keys first so multi-word entries are replaced before any shorter
    # entry that is a substring of them.
    for key in sorted(GLOSSARY, key=len, reverse=True):
        replacement = GLOSSARY[key]
        # A callable replacement inserts the glossary value literally; a plain
        # string would have backslashes / group references interpreted by re.
        out = re.sub(
            rf"\b{re.escape(key)}\b",
            lambda _match, _rep=replacement: _rep,
            out,
            flags=re.IGNORECASE,
        )
    return out
def postprocess_translation_general(text: str) -> str: def postprocess_translation_general(text: str) -> str:
t = normalize_text(text) t = normalize_text(text)
t = re.sub(r"\s{2,}", " ", t).strip() t = re.sub(r"\s{2,}", " ", t).strip()
@@ -71,34 +76,47 @@ def postprocess_translation_general(text: str) -> str:
return t return t
# ─────────────────────────────────────────────
# FILTERS
# ─────────────────────────────────────────────
def is_sound_effect(text: str) -> bool:
    """Return True when *text* fully matches one of SOUND_EFFECT_PATTERNS.

    The text is lower-cased and stripped of everything except ``a-z`` before
    matching, so punctuation, digits and spacing never affect the result.
    """
    cleaned = re.sub(r"[^a-z]", "", (text or "").strip().lower())
    return any(
        re.fullmatch(pattern, cleaned, re.IGNORECASE)
        for pattern in SOUND_EFFECT_PATTERNS
    )
def is_title_text(text: str) -> bool:
    """Return True when the stripped, lower-cased *text* fully matches one of
    TITLE_PATTERNS (matched case-insensitively)."""
    candidate = (text or "").strip().lower()
    return any(
        re.fullmatch(pattern, candidate, re.IGNORECASE)
        for pattern in TITLE_PATTERNS
    )
def is_noise_text(text: str) -> bool:
    """Return True when *text* looks like OCR noise rather than dialogue.

    A token is noise when any of the following holds:
    - it fully matches one of NOISE_PATTERNS;
    - it is at most 2 characters long and contains no letter or digit;
    - it is at most 6 characters long and more than 60% of its characters are
      neither alphanumeric nor whitespace.
    """
    t = (text or "").strip()

    if any(re.fullmatch(pattern, t) for pattern in NOISE_PATTERNS):
        return True

    # Very short isolated junk. str.isalnum is Unicode-aware, so short accented
    # words (e.g. "È") are kept; an ASCII-only [A-Z0-9] test would drop them.
    if len(t) <= 2 and not any(c.isalnum() for c in t):
        return True

    # Mostly-symbol garbage.
    symbol_count = sum(1 for c in t if not c.isalnum() and not c.isspace())
    symbol_ratio = symbol_count / max(1, len(t))
    if len(t) <= 6 and symbol_ratio > 0.60:
        return True

    return False
# ───────────────────────────────────────────── # ============================================================
# GEOMETRY # GEOMETRY
# ───────────────────────────────────────────── # ============================================================
def quad_bbox(quad):
    """Return the axis-aligned bounding box ``(x1, y1, x2, y2)`` of *quad*.

    *quad* is an iterable of points indexable as ``p[0]`` (x) and ``p[1]`` (y).
    Each coordinate of the result is truncated to ``int``.
    """
    xs = [p[0] for p in quad]
    ys = [p[1] for p in quad]
    return (int(min(xs)), int(min(ys)), int(max(xs)), int(max(ys)))
def quad_center(quad):
    """Return the ``(x, y)`` center of *quad*'s integer bounding box as floats."""
    x1, y1, x2, y2 = quad_bbox(quad)
    return ((x1 + x2) / 2.0, (y1 + y2) / 2.0)
def boxes_union_xyxy(boxes): def boxes_union_xyxy(boxes):
boxes = [b for b in boxes if b is not None] boxes = [b for b in boxes if b is not None]
if not boxes: if not boxes:
@@ -110,17 +128,20 @@ def boxes_union_xyxy(boxes):
int(max(b[3] for b in boxes)), int(max(b[3] for b in boxes)),
) )
def bbox_area_xyxy(b):
    """Return the area of box *b* given as ``(x1, y1, x2, y2)``.

    ``None`` and degenerate/inverted boxes have area 0.
    """
    if b is None:
        return 0
    width = max(0, b[2] - b[0])
    height = max(0, b[3] - b[1])
    return int(width * height)
def xyxy_to_xywh(b):
    """Convert an ``(x1, y1, x2, y2)`` box to a ``{"x", "y", "w", "h"}`` dict.

    Returns ``None`` for ``None`` input. Width and height are clamped at 0 for
    inverted boxes; all values are cast to ``int``.
    """
    if b is None:
        return None
    x1, y1, x2, y2 = b
    return {
        "x": int(x1),
        "y": int(y1),
        "w": int(max(0, x2 - x1)),
        "h": int(max(0, y2 - y1)),
    }
def overlap_or_near(a, b, gap=0): def overlap_or_near(a, b, gap=0):
ax1, ay1, ax2, ay2 = a ax1, ay1, ax2, ay2 = a
bx1, by1, bx2, by2 = b bx1, by1, bx2, by2 = b
@@ -129,21 +150,9 @@ def overlap_or_near(a, b, gap=0):
return gap_x <= gap and gap_y <= gap return gap_x <= gap and gap_y <= gap
# ───────────────────────────────────────────── # ============================================================
# QUALITY / SCORING # OCR QUALITY SCORING
# ───────────────────────────────────────────── # ============================================================
def ocr_quality_score(text: str) -> float:
    """Return a heuristic quality score in ``[0.0, 1.0]`` for an OCR string.

    The score is the fraction of alphabetic characters, minus 0.2 for a run of
    two or more unexpected symbols, minus 0.2 for a doubled comma, plus 0.05
    when the text ends in ``.``, ``!`` or ``?``. Empty or single-character
    input scores 0.0.
    """
    if not text or len(text) < 2:
        return 0.0
    alpha_ratio = sum(1 for c in text if c.isalpha()) / max(1, len(text))
    penalty = 0.0
    if re.search(r"[^\w\s\'\!\?\.,\-]{2,}", text):
        penalty += 0.2
    if re.search(r",,", text):
        penalty += 0.2
    bonus = 0.05 if re.search(r"[.!?]$", text) else 0.0
    return max(0.0, min(1.0, alpha_ratio - penalty + bonus))
def ocr_candidate_score(text: str) -> float: def ocr_candidate_score(text: str) -> float:
if not text: if not text:
return 0.0 return 0.0
@@ -154,8 +163,8 @@ def ocr_candidate_score(text: str) -> float:
alpha = sum(c.isalpha() for c in t) / n alpha = sum(c.isalpha() for c in t) / n
spaces = sum(c.isspace() for c in t) / n spaces = sum(c.isspace() for c in t) / n
punct_ok = sum(c in ".,!?'-:;()[]\"" for c in t) / n punct_ok = sum(c in ".,!?'-:;()[]\"¡¿" for c in t) / n
bad = len(re.findall(r"[^\w\s\.\,\!\?\-\'\:\;\(\)\[\]\"]", t)) / n bad = len(re.findall(r"[^\w\s\.\,\!\?\-\'\:\;\(\)\[\]\"¡¿]", t)) / n
penalty = 0.0 penalty = 0.0
if re.search(r"\b[A-Z]\b", t): if re.search(r"\b[A-Z]\b", t):
@@ -169,9 +178,9 @@ def ocr_candidate_score(text: str) -> float:
return max(0.0, min(1.0, score)) return max(0.0, min(1.0, score))
# ───────────────────────────────────────────── # ============================================================
# OCR MULTI-PASS # OCR MULTI-PASS REREAD
# ───────────────────────────────────────────── # ============================================================
def preprocess_variant(crop_bgr, mode): def preprocess_variant(crop_bgr, mode):
gray = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2GRAY) gray = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2GRAY)
@@ -198,6 +207,7 @@ def preprocess_variant(crop_bgr, mode):
return gray return gray
def rotate_image_keep_bounds(img, angle_deg): def rotate_image_keep_bounds(img, angle_deg):
h, w = img.shape[:2] h, w = img.shape[:2]
c = (w / 2, h / 2) c = (w / 2, h / 2)
@@ -213,6 +223,7 @@ def rotate_image_keep_bounds(img, angle_deg):
return cv2.warpAffine(img, M, (new_w, new_h), flags=cv2.INTER_CUBIC, borderValue=255) return cv2.warpAffine(img, M, (new_w, new_h), flags=cv2.INTER_CUBIC, borderValue=255)
def run_ocr_on_array(reader, arr): def run_ocr_on_array(reader, arr):
tmp = "_tmp_ocr.png" tmp = "_tmp_ocr.png"
cv2.imwrite(tmp, arr) cv2.imwrite(tmp, arr)
@@ -222,6 +233,7 @@ def run_ocr_on_array(reader, arr):
if os.path.exists(tmp): if os.path.exists(tmp):
os.remove(tmp) os.remove(tmp)
def rebuild_text_from_ocr_result(res): def rebuild_text_from_ocr_result(res):
if not res: if not res:
return "" return ""
@@ -245,7 +257,7 @@ def rebuild_text_from_ocr_result(res):
med_h = float(np.median([x[5] for x in norm])) med_h = float(np.median([x[5] for x in norm]))
row_tol = max(6.0, med_h * 0.75) row_tol = max(6.0, med_h * 0.75)
norm.sort(key=lambda z: z[4]) # y norm.sort(key=lambda z: z[4]) # y-center
rows = [] rows = []
for it in norm: for it in norm:
placed = False placed = False
@@ -259,17 +271,17 @@ def rebuild_text_from_ocr_result(res):
rows.append({"yc": it[4], "m": [it]}) rows.append({"yc": it[4], "m": [it]})
rows.sort(key=lambda r: r["yc"]) rows.sort(key=lambda r: r["yc"])
lines = [] lines = []
for r in rows: for r in rows:
mem = sorted(r["m"], key=lambda z: z[3]) # x mem = sorted(r["m"], key=lambda z: z[3]) # x-center
line = normalize_text(" ".join(x[1] for x in mem)) line = normalize_text(" ".join(x[1] for x in mem))
if line: if line:
lines.append(line) lines.append(line)
return normalize_text(" ".join(lines)) return normalize_text(" ".join(lines))
def reread_crop_robust(image, bbox, reader, upscale=3.0, pad=22):
def reread_crop_robust(image, bbox, reader, upscale=3.0, pad=24):
ih, iw = image.shape[:2] ih, iw = image.shape[:2]
x1, y1, x2, y2 = bbox x1, y1, x2, y2 = bbox
x1 = max(0, int(x1 - pad)) x1 = max(0, int(x1 - pad))
@@ -318,9 +330,9 @@ def reread_crop_robust(image, bbox, reader, upscale=3.0, pad=22):
return best_text, best_score return best_text, best_score
# ───────────────────────────────────────────── # ============================================================
# LINES + YELLOW BOXES # LINE REBUILD + LINE BOXES (YELLOW)
# ───────────────────────────────────────────── # ============================================================
def build_lines_from_indices(indices, ocr): def build_lines_from_indices(indices, ocr):
if not indices: if not indices:
return [] return []
@@ -336,7 +348,7 @@ def build_lines_from_indices(indices, ocr):
med_h = float(np.median([it[4] for it in items])) if items else 10.0 med_h = float(np.median([it[4] for it in items])) if items else 10.0
row_tol = max(6.0, med_h * 0.75) row_tol = max(6.0, med_h * 0.75)
items.sort(key=lambda x: x[3]) items.sort(key=lambda x: x[3]) # y
rows = [] rows = []
for it in items: for it in items:
i, b, xc, yc, h = it i, b, xc, yc, h = it
@@ -353,19 +365,23 @@ def build_lines_from_indices(indices, ocr):
rows.sort(key=lambda r: r["yc"]) rows.sort(key=lambda r: r["yc"])
lines = [] lines = []
for r in rows: for r in rows:
mem = sorted(r["m"], key=lambda z: z[2]) mem = sorted(r["m"], key=lambda z: z[2]) # x
txt = normalize_text(" ".join(ocr[i][1] for i, _, _, _ in mem)) txt = normalize_text(" ".join(ocr[i][1] for i, _, _, _ in mem))
lines.append(txt) if txt and not is_noise_text(txt):
lines.append(txt)
return lines return lines
def build_line_boxes_from_indices(indices, ocr):
def build_line_boxes_from_indices(indices, ocr, image_shape=None):
""" """
Robust yellow-box generation with punctuation attachment: Improved yellow box builder:
- row grouping - row grouping
- chunking by x gap - x-gap chunking
- attach tiny punctuation/special tokens to nearest chunk - punctuation attachment
- token coverage guarantee - token coverage guarantee
- larger/asymmetric padding (fix clipped chars)
- min-size safety expansion
""" """
if not indices: if not indices:
return [] return []
@@ -374,19 +390,25 @@ def build_line_boxes_from_indices(indices, ocr):
for i in indices: for i in indices:
b = quad_bbox(ocr[i][0]) b = quad_bbox(ocr[i][0])
txt = normalize_text(ocr[i][1]) txt = normalize_text(ocr[i][1])
if is_noise_text(txt):
continue
xc = (b[0] + b[2]) / 2.0 xc = (b[0] + b[2]) / 2.0
yc = (b[1] + b[3]) / 2.0 yc = (b[1] + b[3]) / 2.0
w = max(1.0, b[2] - b[0]) w = max(1.0, b[2] - b[0])
h = max(1.0, b[3] - b[1]) h = max(1.0, b[3] - b[1])
items.append({ items.append({
"i": i, "b": b, "txt": txt, "i": i, "b": b, "txt": txt,
"xc": xc, "yc": yc, "w": w, "h": h "xc": xc, "yc": yc, "w": w, "h": h
}) })
med_h = float(np.median([it["h"] for it in items])) if items else 10.0 if not items:
return []
med_h = float(np.median([it["h"] for it in items]))
row_tol = max(6.0, med_h * 0.90) row_tol = max(6.0, med_h * 0.90)
gap_x_tol = max(8.0, med_h * 1.25) gap_x_tol = max(8.0, med_h * 1.25)
pad = max(1, int(round(med_h * 0.12))) pad = max(3, int(round(med_h * 0.22))) # was 0.12
def is_punct_like(t): def is_punct_like(t):
raw = (t or "").strip() raw = (t or "").strip()
@@ -395,7 +417,7 @@ def build_line_boxes_from_indices(indices, ocr):
punct_ratio = sum(1 for c in raw if not c.isalnum()) / max(1, len(raw)) punct_ratio = sum(1 for c in raw if not c.isalnum()) / max(1, len(raw))
return punct_ratio >= 0.5 or len(raw) <= 2 return punct_ratio >= 0.5 or len(raw) <= 2
# 1) row grouping # 1) group into rows
items_sorted = sorted(items, key=lambda x: x["yc"]) items_sorted = sorted(items, key=lambda x: x["yc"])
rows = [] rows = []
for it in items_sorted: for it in items_sorted:
@@ -414,6 +436,7 @@ def build_line_boxes_from_indices(indices, ocr):
for r in rows: for r in rows:
mem = sorted(r["m"], key=lambda z: z["xc"]) mem = sorted(r["m"], key=lambda z: z["xc"])
normal = [t for t in mem if not is_punct_like(t["txt"])] normal = [t for t in mem if not is_punct_like(t["txt"])]
punct = [t for t in mem if is_punct_like(t["txt"])] punct = [t for t in mem if is_punct_like(t["txt"])]
@@ -421,7 +444,7 @@ def build_line_boxes_from_indices(indices, ocr):
normal = mem normal = mem
punct = [] punct = []
# 2) chunk normal by x-gap # 2) chunk by x-gap
chunks = [] chunks = []
cur = [normal[0]] cur = [normal[0]]
for t in normal[1:]: for t in normal[1:]:
@@ -435,7 +458,7 @@ def build_line_boxes_from_indices(indices, ocr):
cur = [t] cur = [t]
chunks.append(cur) chunks.append(cur)
# 3) attach punct tokens to nearest chunk # 3) attach punctuation/special tokens with larger near-gap
for p in punct: for p in punct:
pb = p["b"] pb = p["b"]
pxc, pyc = p["xc"], p["yc"] pxc, pyc = p["xc"], p["yc"]
@@ -450,7 +473,7 @@ def build_line_boxes_from_indices(indices, ocr):
dy = abs(pyc - cy) dy = abs(pyc - cy)
score = dx + 1.8 * dy score = dx + 1.8 * dy
near = overlap_or_near(pb, ub, gap=int(med_h * 0.9)) near = overlap_or_near(pb, ub, gap=int(med_h * 1.25))
if near: if near:
score -= med_h * 2.0 score -= med_h * 2.0
@@ -463,14 +486,17 @@ def build_line_boxes_from_indices(indices, ocr):
else: else:
chunks.append([p]) chunks.append([p])
# 4) emit chunk boxes # 4) emit chunk boxes with asymmetric padding
for ch in chunks: for ch in chunks:
ub = boxes_union_xyxy([x["b"] for x in ch]) ub = boxes_union_xyxy([x["b"] for x in ch])
if ub: if ub:
x1, y1, x2, y2 = ub x1, y1, x2, y2 = ub
out_boxes.append((x1 - pad, y1 - pad, x2 + pad, y2 + pad)) pad_x = pad
pad_top = int(round(pad * 1.35))
pad_bot = int(round(pad * 0.95))
out_boxes.append((x1 - pad_x, y1 - pad_top, x2 + pad_x, y2 + pad_bot))
# 5) guarantee every token is inside some yellow box # 5) guarantee every token is covered
token_boxes = [it["b"] for it in items] token_boxes = [it["b"] for it in items]
def inside(tb, lb): def inside(tb, lb):
@@ -479,7 +505,10 @@ def build_line_boxes_from_indices(indices, ocr):
for tb in token_boxes: for tb in token_boxes:
if not any(inside(tb, lb) for lb in out_boxes): if not any(inside(tb, lb) for lb in out_boxes):
x1, y1, x2, y2 = tb x1, y1, x2, y2 = tb
out_boxes.append((x1 - pad, y1 - pad, x2 + pad, y2 + pad)) pad_x = pad
pad_top = int(round(pad * 1.35))
pad_bot = int(round(pad * 0.95))
out_boxes.append((x1 - pad_x, y1 - pad_top, x2 + pad_x, y2 + pad_bot))
# 6) merge heavy overlaps # 6) merge heavy overlaps
merged = [] merged = []
@@ -499,19 +528,51 @@ def build_line_boxes_from_indices(indices, ocr):
if not merged_into: if not merged_into:
merged.append(b) merged.append(b)
# 7) min-size safety expansion (for tiny lines like "NO.")
safe = []
for (x1, y1, x2, y2) in merged:
w = x2 - x1
h = y2 - y1
if w < 28:
d = (28 - w) // 2 + 2
x1 -= d
x2 += d
if h < 18:
d = (18 - h) // 2 + 2
y1 -= d
y2 += d
safe.append((x1, y1, x2, y2))
merged = safe
# clamp bounds
if image_shape is not None:
ih, iw = image_shape[:2]
clamped = []
for b in merged:
x1 = max(0, int(b[0]))
y1 = max(0, int(b[1]))
x2 = min(iw - 1, int(b[2]))
y2 = min(ih - 1, int(b[3]))
if x2 > x1 and y2 > y1:
clamped.append((x1, y1, x2, y2))
merged = clamped
else:
merged = [(int(b[0]), int(b[1]), int(b[2]), int(b[3])) for b in merged]
merged.sort(key=lambda z: (z[1], z[0])) merged.sort(key=lambda z: (z[1], z[0]))
return merged return merged
# ───────────────────────────────────────────── # ============================================================
# GROUPING # GROUP TOKENS TO BUBBLES
# ───────────────────────────────────────────── # ============================================================
def auto_gap(image_path, base=18, ref_w=750):
    """Scale the grouping gap *base* by the image width relative to *ref_w*.

    Returns *base* unchanged when the image at *image_path* cannot be read.
    """
    img = cv2.imread(image_path)
    if img is None:
        return base
    # img.shape[1] is the image width in pixels.
    return base * (img.shape[1] / ref_w)
def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=3): def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=3):
n = len(ocr) n = len(ocr)
if n == 0: if n == 0:
@@ -575,8 +636,8 @@ def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=3):
x1, y1, x2, y2 = ub x1, y1, x2, y2 = ub
x1 = max(0, x1 - bbox_padding) x1 = max(0, x1 - bbox_padding)
y1 = max(0, y1 - bbox_padding) y1 = max(0, y1 - bbox_padding)
x2 = min(iw, x2 + bbox_padding) x2 = min(iw - 1, x2 + bbox_padding)
y2 = min(ih, y2 + bbox_padding) y2 = min(ih - 1, y2 + bbox_padding)
bubbles[bid] = lines bubbles[bid] = lines
bubble_boxes[bid] = (x1, y1, x2, y2) bubble_boxes[bid] = (x1, y1, x2, y2)
@@ -586,40 +647,40 @@ def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=3):
return bubbles, bubble_boxes, bubble_quads, bubble_indices return bubbles, bubble_boxes, bubble_quads, bubble_indices
# ───────────────────────────────────────────── # ============================================================
# DEBUG # DEBUG IMAGE
# ───────────────────────────────────────────── # ============================================================
def save_debug_clusters(image_path, ocr, bubble_boxes, bubble_indices, out_path="debug_clusters.png"):
    """Write a debug overlay of OCR tokens, bubble boxes and line boxes.

    Args:
        image_path: Path of the page image to annotate.
        ocr: Iterable of ``(quad, text, confidence)`` OCR results.
        bubble_boxes: Mapping of bubble id to ``(x1, y1, x2, y2)``.
        bubble_indices: Mapping of bubble id to the list of indices into *ocr*
            that belong to that bubble.
        out_path: Destination file for the annotated image.

    Returns silently without writing anything when the image cannot be read.
    """
    img = cv2.imread(image_path)
    if img is None:
        return

    # OCR token quads (gray)
    for bbox, txt, conf in ocr:
        pts = np.array(bbox, dtype=np.int32)
        cv2.polylines(img, [pts], True, (180, 180, 180), 1)

    # bubble boxes (green) + line boxes (yellow)
    for bid, bb in bubble_boxes.items():
        x1, y1, x2, y2 = bb
        cv2.rectangle(img, (x1, y1), (x2, y2), (0, 220, 0), 2)
        # max(15, ...) keeps the label baseline at least 15 px below the top edge.
        cv2.putText(
            img, f"BOX#{bid}", (x1 + 2, max(15, y1 + 16)),
            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 220, 0), 2
        )

        idxs = bubble_indices.get(bid, [])
        # Passing image_shape makes the builder clamp boxes to the image, so no
        # extra clamping is done here.
        line_boxes = build_line_boxes_from_indices(idxs, ocr, image_shape=img.shape)
        for lb in line_boxes:
            lx1, ly1, lx2, ly2 = lb
            cv2.rectangle(img, (lx1, ly1), (lx2, ly2), (0, 255, 255), 3)

    cv2.imwrite(out_path, img)
# ───────────────────────────────────────────── # ============================================================
# EXPORT # EXPORT
# ───────────────────────────────────────────── # ============================================================
def estimate_reading_order(bbox_dict, mode="ltr"): def estimate_reading_order(bbox_dict, mode="ltr"):
items = [] items = []
for bid, (x1, y1, x2, y2) in bbox_dict.items(): for bid, (x1, y1, x2, y2) in bbox_dict.items():
@@ -627,7 +688,7 @@ def estimate_reading_order(bbox_dict, mode="ltr"):
cy = (y1 + y2) / 2.0 cy = (y1 + y2) / 2.0
items.append((bid, cx, cy)) items.append((bid, cx, cy))
items.sort(key=lambda t: t[2]) # top to bottom items.sort(key=lambda t: t[2]) # top -> bottom
rows = [] rows = []
tol = 90 tol = 90
@@ -650,9 +711,9 @@ def estimate_reading_order(bbox_dict, mode="ltr"):
return {bid: i + 1 for i, bid in enumerate(order)} return {bid: i + 1 for i, bid in enumerate(order)}
def export_bubbles(filepath, bbox_dict, quads_dict, indices_dict, ocr, reading_map, image_shape): def export_bubbles(filepath, bbox_dict, quads_dict, indices_dict, ocr, reading_map, image_shape):
out = {} out = {}
for bid, bb in bbox_dict.items(): for bid, bb in bbox_dict.items():
x1, y1, x2, y2 = bb x1, y1, x2, y2 = bb
quads = quads_dict.get(bid, []) quads = quads_dict.get(bid, [])
@@ -661,7 +722,7 @@ def export_bubbles(filepath, bbox_dict, quads_dict, indices_dict, ocr, reading_m
qboxes = [quad_bbox(q) for q in quads] qboxes = [quad_bbox(q) for q in quads]
text_union = boxes_union_xyxy(qboxes) text_union = boxes_union_xyxy(qboxes)
line_boxes_xyxy = build_line_boxes_from_indices(idxs, ocr) line_boxes_xyxy = build_line_boxes_from_indices(idxs, ocr, image_shape=image_shape)
line_union_xyxy = boxes_union_xyxy(line_boxes_xyxy) line_union_xyxy = boxes_union_xyxy(line_boxes_xyxy)
line_union_area = bbox_area_xyxy(line_union_xyxy) line_union_area = bbox_area_xyxy(line_union_xyxy)
@@ -676,7 +737,6 @@ def export_bubbles(filepath, bbox_dict, quads_dict, indices_dict, ocr, reading_m
[[int(p[0]), int(p[1])] for p in q] for q in quads [[int(p[0]), int(p[1])] for p in q] for q in quads
], ],
"text_bbox": xyxy_to_xywh(text_union), "text_bbox": xyxy_to_xywh(text_union),
"line_bboxes": [xyxy_to_xywh(lb) for lb in line_boxes_xyxy], "line_bboxes": [xyxy_to_xywh(lb) for lb in line_boxes_xyxy],
"line_union_bbox": xyxy_to_xywh(line_union_xyxy) if line_union_xyxy else None, "line_union_bbox": xyxy_to_xywh(line_union_xyxy) if line_union_xyxy else None,
"line_union_area": int(line_union_area), "line_union_area": int(line_union_area),
@@ -686,9 +746,9 @@ def export_bubbles(filepath, bbox_dict, quads_dict, indices_dict, ocr, reading_m
json.dump(out, f, indent=2, ensure_ascii=False) json.dump(out, f, indent=2, ensure_ascii=False)
# ───────────────────────────────────────────── # ============================================================
# MAIN # MAIN PIPELINE
# ───────────────────────────────────────────── # ============================================================
def translate_manga_text( def translate_manga_text(
image_path, image_path,
source_lang="en", source_lang="en",
@@ -711,7 +771,14 @@ def translate_manga_text(
resolved_gap = auto_gap(image_path) if gap_px == "auto" else float(gap_px) resolved_gap = auto_gap(image_path) if gap_px == "auto" else float(gap_px)
print("Loading OCR...") print("Loading OCR...")
ocr_lang_list = ["en", "es"] if source_lang == "ca" else [source_lang] # Catalan often OCRs better with es+en in manga pages
if source_lang == "ca":
ocr_lang_list = ["es", "en"]
elif source_lang == "en":
ocr_lang_list = ["en", "es"]
else:
ocr_lang_list = [source_lang]
reader = easyocr.Reader(ocr_lang_list) reader = easyocr.Reader(ocr_lang_list)
print("Running OCR...") print("Running OCR...")
@@ -742,7 +809,7 @@ def translate_manga_text(
skipped += 1 skipped += 1
continue continue
# reduce false positives in very top strip # reduce top-strip false positives
if qb[1] < int(ih * TOP_BAND_RATIO): if qb[1] < int(ih * TOP_BAND_RATIO):
if conf < 0.70 and len(t) >= 5: if conf < 0.70 and len(t) >= 5:
skipped += 1 skipped += 1
@@ -770,20 +837,18 @@ def translate_manga_text(
translator = GoogleTranslator(source=source_lang, target=target_lang) translator = GoogleTranslator(source=source_lang, target=target_lang)
# robust bubble text cleanup
clean_lines = {} clean_lines = {}
for bid, lines in bubbles.items(): for bid, lines in bubbles.items():
base_txt = normalize_text(" ".join(lines)) base_txt = normalize_text(" ".join(lines))
base_sc = ocr_candidate_score(base_txt) base_sc = ocr_candidate_score(base_txt)
# only robust reread on low quality
if base_sc < quality_threshold: if base_sc < quality_threshold:
rr_txt, rr_sc = reread_crop_robust( rr_txt, rr_sc = reread_crop_robust(
image, image,
bubble_boxes[bid], bubble_boxes[bid],
reader, reader,
upscale=3.0, upscale=3.0,
pad=22 pad=24
) )
if rr_txt and rr_sc > base_sc + 0.06: if rr_txt and rr_sc > base_sc + 0.06:
txt = rr_txt txt = rr_txt
@@ -792,7 +857,12 @@ def translate_manga_text(
else: else:
txt = base_txt txt = base_txt
clean_lines[bid] = apply_glossary(txt) # tiny targeted corrections for common OCR confusions
txt = txt.replace(" BOMPORTA", " IMPORTA")
txt = txt.replace(" TESTO ", " ESTO ")
txt = txt.replace(" MIVERDAD", " MI VERDAD")
clean_lines[bid] = apply_glossary(normalize_text(txt))
reading_map = estimate_reading_order(bubble_boxes, mode=reading_mode) reading_map = estimate_reading_order(bubble_boxes, mode=reading_mode)
@@ -822,6 +892,7 @@ def translate_manga_text(
out_lines.append( out_lines.append(
f"#{bid}|{reading_map.get(bid,bid)}|{src_u}|{tgt}|{','.join(flags) if flags else '-'}" f"#{bid}|{reading_map.get(bid,bid)}|{src_u}|{tgt}|{','.join(flags) if flags else '-'}"
) )
print( print(
f"#{bid:<7} {reading_map.get(bid,bid):<6} " f"#{bid:<7} {reading_map.get(bid,bid):<6} "
f"{src_u[:50]:<50} {tgt[:50]:<50} {','.join(flags) if flags else '-'}" f"{src_u[:50]:<50} {tgt[:50]:<50} {','.join(flags) if flags else '-'}"
@@ -851,10 +922,13 @@ def translate_manga_text(
print("Saved: debug_clusters.png") print("Saved: debug_clusters.png")
# ============================================================
# ENTRYPOINT
# ============================================================
if __name__ == "__main__": if __name__ == "__main__":
translate_manga_text( translate_manga_text(
image_path="001-page.png", image_path="004-page.png",
source_lang="it", source_lang="es",
target_lang="ca", target_lang="ca",
confidence_threshold=0.12, confidence_threshold=0.12,
min_text_length=1, min_text_length=1,