From eadc28154ac69ca0291bcee6194e4e8080041948 Mon Sep 17 00:00:00 2001 From: Guillem Hernandez Sola Date: Tue, 14 Apr 2026 20:38:05 +0200 Subject: [PATCH] Improving white coloring --- manga-translator.py | 254 ++++++++++++++++++++++++++++---------------- 1 file changed, 164 insertions(+), 90 deletions(-) diff --git a/manga-translator.py b/manga-translator.py index 0aff648..916c162 100644 --- a/manga-translator.py +++ b/manga-translator.py @@ -1,8 +1,8 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -import re import os +import re import json import cv2 import numpy as np @@ -10,9 +10,9 @@ import easyocr from deep_translator import GoogleTranslator -# ───────────────────────────────────────────── +# ============================================================ # CONFIG -# ───────────────────────────────────────────── +# ============================================================ GLOSSARY = { "ANYA": "ANYA", "STARLIGHT ANYA": "STARLIGHT ANYA", @@ -34,16 +34,17 @@ TITLE_PATTERNS = [ ] NOISE_PATTERNS = [ - r"^[^a-zA-Z0-9\?!.]+$", - r"^BOX[0-9A-Z#\s]*$", + r"^[^a-zA-Z0-9\?!.¡¿]+$", + r"^BOX[#\s0-9A-Z\-]*$", # debug labels + r"^[0-9]{1,3}\s*[Xx]\s*[0-9]{1,3}$", # e.g. 98x12 ] TOP_BAND_RATIO = 0.08 -# ───────────────────────────────────────────── +# ============================================================ # TEXT HELPERS -# ───────────────────────────────────────────── +# ============================================================ def normalize_text(text: str) -> str: t = (text or "").strip().upper() t = t.replace("“", "\"").replace("”", "\"") @@ -51,18 +52,22 @@ def normalize_text(text: str) -> str: t = t.replace("…", "...") t = re.sub(r"\s+", " ", t) t = re.sub(r"\s+([,.;:!?])", r"\1", t) + t = re.sub(r"([¡¿])\s+", r"\1", t) t = re.sub(r"\(\s+", "(", t) t = re.sub(r"\s+\)", ")", t) t = re.sub(r"\.{4,}", "...", t) - t = re.sub(r",\?", "?", t) + t = t.replace("IQUE", "¡QUE") + t = t.replace("IQUIEN", "¿QUIEN") return t.strip() + def apply_glossary(text: str) -> str: out = text or "" for k in sorted(GLOSSARY.keys(), key=len, reverse=True): out = re.sub(rf"\b{re.escape(k)}\b", GLOSSARY[k], out, flags=re.IGNORECASE) return out + def postprocess_translation_general(text: str) -> str: t = normalize_text(text) t = re.sub(r"\s{2,}", " ", t).strip() @@ -71,34 +76,47 @@ def postprocess_translation_general(text: str) -> str: return t -# ───────────────────────────────────────────── -# FILTERS -# ───────────────────────────────────────────── def is_sound_effect(text: str) -> bool: cleaned = re.sub(r"[^a-z]", "", (text or "").strip().lower()) return any(re.fullmatch(p, cleaned, re.IGNORECASE) for p in SOUND_EFFECT_PATTERNS) + def is_title_text(text: str) -> bool: t = (text or "").strip().lower() return any(re.fullmatch(p, t, re.IGNORECASE) for p in TITLE_PATTERNS) + def is_noise_text(text: str) -> bool: t = (text or "").strip() - return any(re.fullmatch(p, t) for p in NOISE_PATTERNS) + if any(re.fullmatch(p, t) for p in NOISE_PATTERNS): + return True + + # very short isolated junk + if len(t) <= 2 and not re.search(r"[A-Z0-9]", t): + return True + + # mostly-symbol garbage + symbol_ratio = sum(1 for c in t if not c.isalnum() and not c.isspace()) / max(1, len(t)) + if len(t) <= 6 and symbol_ratio > 0.60: + return True + + return False -# ───────────────────────────────────────────── +# ============================================================ # GEOMETRY -# ───────────────────────────────────────────── +# ============================================================ def quad_bbox(quad): xs = [p[0] for p in quad] ys = [p[1] for p in quad] return (int(min(xs)), int(min(ys)), int(max(xs)), int(max(ys))) + def quad_center(quad): x1, y1, x2, y2 = quad_bbox(quad) return ((x1 + x2) / 2.0, (y1 + y2) / 2.0) + def boxes_union_xyxy(boxes): boxes = [b for b in boxes if b is not None] if not boxes: @@ -110,17 +128,20 @@ def boxes_union_xyxy(boxes): int(max(b[3] for b in boxes)), ) + def bbox_area_xyxy(b): if b is None: return 0 return int(max(0, b[2] - b[0]) * max(0, b[3] - b[1])) + def xyxy_to_xywh(b): if b is None: return None x1, y1, x2, y2 = b return {"x": int(x1), "y": int(y1), "w": int(max(0, x2 - x1)), "h": int(max(0, y2 - y1))} + def overlap_or_near(a, b, gap=0): ax1, ay1, ax2, ay2 = a bx1, by1, bx2, by2 = b @@ -129,21 +150,9 @@ def overlap_or_near(a, b, gap=0): return gap_x <= gap and gap_y <= gap -# ───────────────────────────────────────────── -# QUALITY / SCORING -# ───────────────────────────────────────────── -def ocr_quality_score(text: str) -> float: - if not text or len(text) < 2: - return 0.0 - alpha_ratio = sum(1 for c in text if c.isalpha()) / max(1, len(text)) - penalty = 0.0 - if re.search(r"[^\w\s\'\!\?\.,\-]{2,}", text): - penalty += 0.2 - if re.search(r",,", text): - penalty += 0.2 - bonus = 0.05 if re.search(r"[.!?]$", text) else 0.0 - return max(0.0, min(1.0, alpha_ratio - penalty + bonus)) - +# ============================================================ +# OCR QUALITY SCORING +# ============================================================ def ocr_candidate_score(text: str) -> float: if not text: return 0.0 @@ -154,8 +163,8 @@ def ocr_candidate_score(text: str) -> float: alpha = sum(c.isalpha() for c in t) / n spaces = sum(c.isspace() for c in t) / n - punct_ok = sum(c in ".,!?'-:;()[]\"" for c in t) / n - bad = len(re.findall(r"[^\w\s\.\,\!\?\-\'\:\;\(\)\[\]\"]", t)) / n + punct_ok = sum(c in ".,!?'-:;()[]\"¡¿" for c in t) / n + bad = len(re.findall(r"[^\w\s\.\,\!\?\-\'\:\;\(\)\[\]\"¡¿]", t)) / n penalty = 0.0 if re.search(r"\b[A-Z]\b", t): @@ -169,9 +178,9 @@ def ocr_candidate_score(text: str) -> float: return max(0.0, min(1.0, score)) -# ───────────────────────────────────────────── -# OCR MULTI-PASS -# ───────────────────────────────────────────── +# ============================================================ +# OCR MULTI-PASS REREAD +# ============================================================ def preprocess_variant(crop_bgr, mode): gray = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2GRAY) @@ -198,6 +207,7 @@ def preprocess_variant(crop_bgr, mode): return gray + def rotate_image_keep_bounds(img, angle_deg): h, w = img.shape[:2] c = (w / 2, h / 2) @@ -213,6 +223,7 @@ def rotate_image_keep_bounds(img, angle_deg): return cv2.warpAffine(img, M, (new_w, new_h), flags=cv2.INTER_CUBIC, borderValue=255) + def run_ocr_on_array(reader, arr): tmp = "_tmp_ocr.png" cv2.imwrite(tmp, arr) @@ -222,6 +233,7 @@ def run_ocr_on_array(reader, arr): if os.path.exists(tmp): os.remove(tmp) + def rebuild_text_from_ocr_result(res): if not res: return "" @@ -245,7 +257,7 @@ def rebuild_text_from_ocr_result(res): med_h = float(np.median([x[5] for x in norm])) row_tol = max(6.0, med_h * 0.75) - norm.sort(key=lambda z: z[4]) # y + norm.sort(key=lambda z: z[4]) # y-center rows = [] for it in norm: placed = False @@ -259,17 +271,17 @@ def rebuild_text_from_ocr_result(res): rows.append({"yc": it[4], "m": [it]}) rows.sort(key=lambda r: r["yc"]) - lines = [] for r in rows: - mem = sorted(r["m"], key=lambda z: z[3]) # x + mem = sorted(r["m"], key=lambda z: z[3]) # x-center line = normalize_text(" ".join(x[1] for x in mem)) if line: lines.append(line) return normalize_text(" ".join(lines)) -def reread_crop_robust(image, bbox, reader, upscale=3.0, pad=22): + +def reread_crop_robust(image, bbox, reader, upscale=3.0, pad=24): ih, iw = image.shape[:2] x1, y1, x2, y2 = bbox x1 = max(0, int(x1 - pad)) @@ -318,9 +330,9 @@ def reread_crop_robust(image, bbox, reader, upscale=3.0, pad=22): return best_text, best_score -# ───────────────────────────────────────────── -# LINES + YELLOW BOXES -# ───────────────────────────────────────────── +# ============================================================ +# LINE REBUILD + LINE BOXES (YELLOW) +# ============================================================ def build_lines_from_indices(indices, ocr): if not indices: return [] @@ -336,7 +348,7 @@ def build_lines_from_indices(indices, ocr): med_h = float(np.median([it[4] for it in items])) if items else 10.0 row_tol = max(6.0, med_h * 0.75) - items.sort(key=lambda x: x[3]) + items.sort(key=lambda x: x[3]) # y rows = [] for it in items: i, b, xc, yc, h = it @@ -353,19 +365,23 @@ def build_lines_from_indices(indices, ocr): rows.sort(key=lambda r: r["yc"]) lines = [] for r in rows: - mem = sorted(r["m"], key=lambda z: z[2]) + mem = sorted(r["m"], key=lambda z: z[2]) # x txt = normalize_text(" ".join(ocr[i][1] for i, _, _, _ in mem)) - lines.append(txt) + if txt and not is_noise_text(txt): + lines.append(txt) return lines -def build_line_boxes_from_indices(indices, ocr): + +def build_line_boxes_from_indices(indices, ocr, image_shape=None): """ - Robust yellow-box generation with punctuation attachment: + Improved yellow box builder: - row grouping - - chunking by x gap - - attach tiny punctuation/special tokens to nearest chunk + - x-gap chunking + - punctuation attachment - token coverage guarantee + - larger/asymmetric padding (fix clipped chars) + - min-size safety expansion """ if not indices: return [] @@ -374,19 +390,25 @@ def build_line_boxes_from_indices(indices, ocr): for i in indices: b = quad_bbox(ocr[i][0]) txt = normalize_text(ocr[i][1]) + if is_noise_text(txt): + continue xc = (b[0] + b[2]) / 2.0 yc = (b[1] + b[3]) / 2.0 w = max(1.0, b[2] - b[0]) h = max(1.0, b[3] - b[1]) + items.append({ "i": i, "b": b, "txt": txt, "xc": xc, "yc": yc, "w": w, "h": h }) - med_h = float(np.median([it["h"] for it in items])) if items else 10.0 + if not items: + return [] + + med_h = float(np.median([it["h"] for it in items])) row_tol = max(6.0, med_h * 0.90) gap_x_tol = max(8.0, med_h * 1.25) - pad = max(1, int(round(med_h * 0.12))) + pad = max(3, int(round(med_h * 0.22))) # was 0.12 def is_punct_like(t): raw = (t or "").strip() @@ -395,7 +417,7 @@ def build_line_boxes_from_indices(indices, ocr): punct_ratio = sum(1 for c in raw if not c.isalnum()) / max(1, len(raw)) return punct_ratio >= 0.5 or len(raw) <= 2 - # 1) row grouping + # 1) group into rows items_sorted = sorted(items, key=lambda x: x["yc"]) rows = [] for it in items_sorted: @@ -414,6 +436,7 @@ def build_line_boxes_from_indices(indices, ocr): for r in rows: mem = sorted(r["m"], key=lambda z: z["xc"]) + normal = [t for t in mem if not is_punct_like(t["txt"])] punct = [t for t in mem if is_punct_like(t["txt"])] @@ -421,7 +444,7 @@ def build_line_boxes_from_indices(indices, ocr): normal = mem punct = [] - # 2) chunk normal by x-gap + # 2) chunk by x-gap chunks = [] cur = [normal[0]] for t in normal[1:]: @@ -435,7 +458,7 @@ def build_line_boxes_from_indices(indices, ocr): cur = [t] chunks.append(cur) - # 3) attach punct tokens to nearest chunk + # 3) attach punctuation/special tokens with larger near-gap for p in punct: pb = p["b"] pxc, pyc = p["xc"], p["yc"] @@ -450,7 +473,7 @@ def build_line_boxes_from_indices(indices, ocr): dy = abs(pyc - cy) score = dx + 1.8 * dy - near = overlap_or_near(pb, ub, gap=int(med_h * 0.9)) + near = overlap_or_near(pb, ub, gap=int(med_h * 1.25)) if near: score -= med_h * 2.0 @@ -463,14 +486,17 @@ def build_line_boxes_from_indices(indices, ocr): else: chunks.append([p]) - # 4) emit chunk boxes + # 4) emit chunk boxes with asymmetric padding for ch in chunks: ub = boxes_union_xyxy([x["b"] for x in ch]) if ub: x1, y1, x2, y2 = ub - out_boxes.append((x1 - pad, y1 - pad, x2 + pad, y2 + pad)) + pad_x = pad + pad_top = int(round(pad * 1.35)) + pad_bot = int(round(pad * 0.95)) + out_boxes.append((x1 - pad_x, y1 - pad_top, x2 + pad_x, y2 + pad_bot)) - # 5) guarantee every token is inside some yellow box + # 5) guarantee every token is covered token_boxes = [it["b"] for it in items] def inside(tb, lb): @@ -479,7 +505,10 @@ def build_line_boxes_from_indices(indices, ocr): for tb in token_boxes: if not any(inside(tb, lb) for lb in out_boxes): x1, y1, x2, y2 = tb - out_boxes.append((x1 - pad, y1 - pad, x2 + pad, y2 + pad)) + pad_x = pad + pad_top = int(round(pad * 1.35)) + pad_bot = int(round(pad * 0.95)) + out_boxes.append((x1 - pad_x, y1 - pad_top, x2 + pad_x, y2 + pad_bot)) # 6) merge heavy overlaps merged = [] @@ -499,19 +528,51 @@ def build_line_boxes_from_indices(indices, ocr): if not merged_into: merged.append(b) + # 7) min-size safety expansion (for tiny lines like "NO.") + safe = [] + for (x1, y1, x2, y2) in merged: + w = x2 - x1 + h = y2 - y1 + if w < 28: + d = (28 - w) // 2 + 2 + x1 -= d + x2 += d + if h < 18: + d = (18 - h) // 2 + 2 + y1 -= d + y2 += d + safe.append((x1, y1, x2, y2)) + merged = safe + + # clamp bounds + if image_shape is not None: + ih, iw = image_shape[:2] + clamped = [] + for b in merged: + x1 = max(0, int(b[0])) + y1 = max(0, int(b[1])) + x2 = min(iw - 1, int(b[2])) + y2 = min(ih - 1, int(b[3])) + if x2 > x1 and y2 > y1: + clamped.append((x1, y1, x2, y2)) + merged = clamped + else: + merged = [(int(b[0]), int(b[1]), int(b[2]), int(b[3])) for b in merged] + merged.sort(key=lambda z: (z[1], z[0])) return merged -# ───────────────────────────────────────────── -# GROUPING -# ───────────────────────────────────────────── +# ============================================================ +# GROUP TOKENS TO BUBBLES +# ============================================================ def auto_gap(image_path, base=18, ref_w=750): img = cv2.imread(image_path) if img is None: return base return base * (img.shape[1] / ref_w) + def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=3): n = len(ocr) if n == 0: @@ -575,8 +636,8 @@ def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=3): x1, y1, x2, y2 = ub x1 = max(0, x1 - bbox_padding) y1 = max(0, y1 - bbox_padding) - x2 = min(iw, x2 + bbox_padding) - y2 = min(ih, y2 + bbox_padding) + x2 = min(iw - 1, x2 + bbox_padding) + y2 = min(ih - 1, y2 + bbox_padding) bubbles[bid] = lines bubble_boxes[bid] = (x1, y1, x2, y2) @@ -586,40 +647,40 @@ def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=3): return bubbles, bubble_boxes, bubble_quads, bubble_indices -# ───────────────────────────────────────────── -# DEBUG -# ───────────────────────────────────────────── +# ============================================================ +# DEBUG IMAGE +# ============================================================ def save_debug_clusters(image_path, ocr, bubble_boxes, bubble_indices, out_path="debug_clusters.png"): img = cv2.imread(image_path) if img is None: return - # OCR token quads + # OCR token quads (gray) for bbox, txt, conf in ocr: pts = np.array(bbox, dtype=np.int32) cv2.polylines(img, [pts], True, (180, 180, 180), 1) - # Bubble + line boxes + # bubble boxes (green) + line boxes (yellow) for bid, bb in bubble_boxes.items(): x1, y1, x2, y2 = bb cv2.rectangle(img, (x1, y1), (x2, y2), (0, 220, 0), 2) - cv2.putText(img, f"BOX#{bid}", (x1 + 2, y1 + 16), - cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 220, 0), 2) + cv2.putText( + img, f"BOX#{bid}", (x1 + 2, max(15, y1 + 16)), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 220, 0), 2 + ) idxs = bubble_indices.get(bid, []) - line_boxes = build_line_boxes_from_indices(idxs, ocr) + line_boxes = build_line_boxes_from_indices(idxs, ocr, image_shape=img.shape) for lb in line_boxes: lx1, ly1, lx2, ly2 = lb - lx1 = max(0, int(lx1)); ly1 = max(0, int(ly1)) - lx2 = min(img.shape[1] - 1, int(lx2)); ly2 = min(img.shape[0] - 1, int(ly2)) cv2.rectangle(img, (lx1, ly1), (lx2, ly2), (0, 255, 255), 3) cv2.imwrite(out_path, img) -# ───────────────────────────────────────────── +# ============================================================ # EXPORT -# ───────────────────────────────────────────── +# ============================================================ def estimate_reading_order(bbox_dict, mode="ltr"): items = [] for bid, (x1, y1, x2, y2) in bbox_dict.items(): @@ -627,7 +688,7 @@ def estimate_reading_order(bbox_dict, mode="ltr"): cy = (y1 + y2) / 2.0 items.append((bid, cx, cy)) - items.sort(key=lambda t: t[2]) # top to bottom + items.sort(key=lambda t: t[2]) # top -> bottom rows = [] tol = 90 @@ -650,9 +711,9 @@ def estimate_reading_order(bbox_dict, mode="ltr"): return {bid: i + 1 for i, bid in enumerate(order)} + def export_bubbles(filepath, bbox_dict, quads_dict, indices_dict, ocr, reading_map, image_shape): out = {} - for bid, bb in bbox_dict.items(): x1, y1, x2, y2 = bb quads = quads_dict.get(bid, []) @@ -661,7 +722,7 @@ def export_bubbles(filepath, bbox_dict, quads_dict, indices_dict, ocr, reading_m qboxes = [quad_bbox(q) for q in quads] text_union = boxes_union_xyxy(qboxes) - line_boxes_xyxy = build_line_boxes_from_indices(idxs, ocr) + line_boxes_xyxy = build_line_boxes_from_indices(idxs, ocr, image_shape=image_shape) line_union_xyxy = boxes_union_xyxy(line_boxes_xyxy) line_union_area = bbox_area_xyxy(line_union_xyxy) @@ -676,7 +737,6 @@ def export_bubbles(filepath, bbox_dict, quads_dict, indices_dict, ocr, reading_m [[int(p[0]), int(p[1])] for p in q] for q in quads ], "text_bbox": xyxy_to_xywh(text_union), - "line_bboxes": [xyxy_to_xywh(lb) for lb in line_boxes_xyxy], "line_union_bbox": xyxy_to_xywh(line_union_xyxy) if line_union_xyxy else None, "line_union_area": int(line_union_area), @@ -686,9 +746,9 @@ def export_bubbles(filepath, bbox_dict, quads_dict, indices_dict, ocr, reading_m json.dump(out, f, indent=2, ensure_ascii=False) -# ───────────────────────────────────────────── -# MAIN -# ───────────────────────────────────────────── +# ============================================================ +# MAIN PIPELINE +# ============================================================ def translate_manga_text( image_path, source_lang="en", @@ -711,7 +771,14 @@ def translate_manga_text( resolved_gap = auto_gap(image_path) if gap_px == "auto" else float(gap_px) print("Loading OCR...") - ocr_lang_list = ["en", "es"] if source_lang == "ca" else [source_lang] + # Catalan often OCRs better with es+en in manga pages + if source_lang == "ca": + ocr_lang_list = ["es", "en"] + elif source_lang == "en": + ocr_lang_list = ["en", "es"] + else: + ocr_lang_list = [source_lang] + reader = easyocr.Reader(ocr_lang_list) print("Running OCR...") @@ -742,7 +809,7 @@ def translate_manga_text( skipped += 1 continue - # reduce false positives in very top strip + # reduce top-strip false positives if qb[1] < int(ih * TOP_BAND_RATIO): if conf < 0.70 and len(t) >= 5: skipped += 1 @@ -770,20 +837,18 @@ def translate_manga_text( translator = GoogleTranslator(source=source_lang, target=target_lang) - # robust bubble text cleanup clean_lines = {} for bid, lines in bubbles.items(): base_txt = normalize_text(" ".join(lines)) base_sc = ocr_candidate_score(base_txt) - # only robust reread on low quality if base_sc < quality_threshold: rr_txt, rr_sc = reread_crop_robust( image, bubble_boxes[bid], reader, upscale=3.0, - pad=22 + pad=24 ) if rr_txt and rr_sc > base_sc + 0.06: txt = rr_txt @@ -792,7 +857,12 @@ def translate_manga_text( else: txt = base_txt - clean_lines[bid] = apply_glossary(txt) + # tiny targeted corrections for common OCR confusions + txt = txt.replace(" BOMPORTA", " IMPORTA") + txt = txt.replace(" TESTO ", " ESTO ") + txt = txt.replace(" MIVERDAD", " MI VERDAD") + + clean_lines[bid] = apply_glossary(normalize_text(txt)) reading_map = estimate_reading_order(bubble_boxes, mode=reading_mode) @@ -822,6 +892,7 @@ def translate_manga_text( out_lines.append( f"#{bid}|{reading_map.get(bid,bid)}|{src_u}|{tgt}|{','.join(flags) if flags else '-'}" ) + print( f"#{bid:<7} {reading_map.get(bid,bid):<6} " f"{src_u[:50]:<50} {tgt[:50]:<50} {','.join(flags) if flags else '-'}" @@ -851,10 +922,13 @@ def translate_manga_text( print("Saved: debug_clusters.png") +# ============================================================ +# ENTRYPOINT +# ============================================================ if __name__ == "__main__": translate_manga_text( - image_path="001-page.png", - source_lang="it", + image_path="004-page.png", + source_lang="es", target_lang="ca", confidence_threshold=0.12, min_text_length=1,