Added hybrid OCR (PaddleOCR + EasyOCR)

2026-04-15 16:22:35 +02:00
parent eadc28154a
commit 5ef8c39f69
2 changed files with 212 additions and 81 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,8 @@
 .AppleDouble
 .LSOverride
 .venv311/
 # Icon must end with two \r
 Icon
--- a/manga-translator.py
+++ b/manga-translator.py
@@ -6,9 +6,13 @@ import re
 import json
 import cv2
 import numpy as np
-import easyocr
+
 from deep_translator import GoogleTranslator
 # OCR engines
 import easyocr
 from paddleocr import PaddleOCR
 # ============================================================
 # CONFIG
@@ -35,8 +39,8 @@ TITLE_PATTERNS = [
 NOISE_PATTERNS = [
    r"^[^a-zA-Z0-9\?!.¡¿]+$",
-    r"^BOX[#\s0-9A-Z\-]*$",      # debug labels
+    r"^BOX[#\s0-9A-Z\-]*$",
-    r"^[0-9]{1,3}\s*[Xx]\s*[0-9]{1,3}$",  # e.g. 98x12
+    r"^[0-9]{1,3}\s*[Xx]\s*[0-9]{1,3}$",
 ]
 TOP_BAND_RATIO = 0.08
@@ -56,8 +60,6 @@ def normalize_text(text: str) -> str:
    t = re.sub(r"\(\s+", "(", t)
    t = re.sub(r"\s+\)", ")", t)
    t = re.sub(r"\.{4,}", "...", t)
    t = t.replace("IQUE", "¡QUE")
    t = t.replace("IQUIEN", "¿QUIEN")
    return t.strip()
@@ -91,11 +93,9 @@ def is_noise_text(text: str) -> bool:
    if any(re.fullmatch(p, t) for p in NOISE_PATTERNS):
        return True
    # very short isolated junk
    if len(t) <= 2 and not re.search(r"[A-Z0-9]", t):
        return True
    # mostly-symbol garbage
    symbol_ratio = sum(1 for c in t if not c.isalnum() and not c.isspace()) / max(1, len(t))
    if len(t) <= 6 and symbol_ratio > 0.60:
        return True
@@ -104,7 +104,7 @@ def is_noise_text(text: str) -> bool:
 # ============================================================
-# GEOMETRY
+# GEOMETRY HELPERS
 # ============================================================
 def quad_bbox(quad):
    xs = [p[0] for p in quad]
@@ -151,7 +151,7 @@ def overlap_or_near(a, b, gap=0):
 # ============================================================
-# OCR QUALITY SCORING
+# QUALITY
 # ============================================================
 def ocr_candidate_score(text: str) -> float:
    if not text:
@@ -179,7 +179,180 @@ def ocr_candidate_score(text: str) -> float:
 # ============================================================
-# OCR MULTI-PASS REREAD
+# OCR ENGINE WRAPPER (PADDLE + EASYOCR HYBRID)
 # ============================================================
 class HybridOCR:
     """Run PaddleOCR and EasyOCR together and merge their detections.
     Both public read methods return a list of ``(quad, text, conf)`` tuples,
     i.e. the same shape as EasyOCR's ``readtext(..., paragraph=False)``.
     """
     # source_lang -> Paddle model name; anything not listed uses "latin".
     _PADDLE_LANGS = {
         "ja": "japan",
         "ko": "korean",
         "ch": "ch",
         "zh": "ch",
         "zh-cn": "ch",
         "zh-tw": "ch",
     }
     # source_lang -> EasyOCR language list; anything not listed uses [source_lang].
     # The Chinese entries translate to EasyOCR's own codes (ch_sim / ch_tra),
     # because EasyOCR rejects "zh"/"ch" style codes.
     _EASY_LANGS = {
         "ca": ("es", "en"),
         "en": ("en", "es"),
         "es": ("es", "en"),
         "ch": ("ch_sim",),
         "zh": ("ch_sim",),
         "zh-cn": ("ch_sim",),
         "zh-tw": ("ch_tra",),
     }
     # An Easy box overlapping a Paddle box above this IoU is a duplicate.
     _IOU_DUPLICATE = 0.55
     # Easy replaces the duplicate Paddle box only if it is this much more confident.
     _EASY_CONF_MARGIN = 0.20
     def __init__(self, source_lang="en", use_gpu=False):
         """Create both OCR engines for ``source_lang``.
         Args:
             source_lang: language code of the page text.
             use_gpu: forwarded to both PaddleOCR and EasyOCR.
         """
         self.source_lang = source_lang
         # Paddle takes a single language; the latin model is the default.
         paddle_lang = self._PADDLE_LANGS.get(source_lang, "latin")
         easy_langs = list(self._EASY_LANGS.get(source_lang, (source_lang,)))
         self.paddle = PaddleOCR(
             use_angle_cls=True,
             lang=paddle_lang,
             use_gpu=use_gpu,
             show_log=False
         )
         self.easy = easyocr.Reader(easy_langs, gpu=use_gpu)
     @staticmethod
     def _paddle_to_std(result):
         """Convert a Paddle result into Easy-like ``[(quad, text, conf), ...]``.
         Expected input is a list with one entry per image, each entry a list of
         lines shaped ``[[[x, y], ...4 pts], (text, conf)]``. An entry may be
         ``None``/empty, or wrapped in one extra list; lines that do not fit the
         expected shape are skipped.
         """
         out = []
         if not result:
             return out
         blocks = result if isinstance(result, list) else [result]
         for blk in blocks:
             if blk is None or len(blk) == 0:
                 continue
             first = blk[0]
             if isinstance(first, (list, tuple)) and len(first) >= 2:
                 lines = blk
             elif len(blk) == 1 and isinstance(first, list):
                 # some versions wrap the line list once more
                 lines = first
             else:
                 lines = []
             for ln in lines:
                 try:
                     pts, rec = ln
                     txt, conf = rec[0], float(rec[1])
                     quad = [[float(p[0]), float(p[1])] for p in pts]
                 except (TypeError, ValueError, IndexError, KeyError):
                     # best effort: ignore entries that are not (points, (text, conf))
                     continue
                 out.append((quad, txt, conf))
         return out
     @staticmethod
     def _merge_detections(paddle_det, easy_det):
         """Merge EasyOCR detections into the Paddle detections.
         Paddle is primary. For each Easy detection, the first Paddle box with
         IoU above ``_IOU_DUPLICATE`` marks it as a duplicate: the Paddle entry
         is kept unless Easy's confidence exceeds it by more than
         ``_EASY_CONF_MARGIN``, in which case Easy replaces it. Easy detections
         with no such overlap are appended.
         """
         merged = list(paddle_det)
         # Paddle boxes do not change while merging, so compute them once.
         paddle_boxes = [quad_bbox(pb[0]) for pb in paddle_det]
         for eb in easy_det:
             eq, et, ec = eb
             ec = float(ec)
             ebox = quad_bbox(eq)
             e_area = max(1, (ebox[2] - ebox[0]) * (ebox[3] - ebox[1]))
             duplicate = False
             for pb, pbox in zip(paddle_det, paddle_boxes):
                 iw = max(0, min(ebox[2], pbox[2]) - max(ebox[0], pbox[0]))
                 ih = max(0, min(ebox[3], pbox[3]) - max(ebox[1], pbox[1]))
                 inter = iw * ih
                 p_area = max(1, (pbox[2] - pbox[0]) * (pbox[3] - pbox[1]))
                 union = e_area + p_area - inter
                 iou = inter / float(union) if union > 0 else 0.0
                 if iou <= HybridOCR._IOU_DUPLICATE:
                     continue
                 if ec > float(pb[2]) + HybridOCR._EASY_CONF_MARGIN:
                     try:
                         merged.remove(pb)
                     except ValueError:
                         # already replaced by an earlier Easy detection
                         pass
                     merged.append((eq, et, ec))
                 duplicate = True
                 break
             if not duplicate:
                 merged.append((eq, et, ec))
         return merged
     def read_full_image(self, image_path):
         """OCR the image at ``image_path`` with both engines.
         Returns:
             Merged standardized detections (see ``_merge_detections``).
         """
         paddle_det = self._paddle_to_std(self.paddle.ocr(image_path, cls=True))
         easy_det = self.easy.readtext(image_path, paragraph=False)
         return self._merge_detections(paddle_det, easy_det)
     def read_array_with_both(self, arr_gray_or_bgr):
         """OCR an in-memory image (used in the robust reread pass).
         The array is written to a uniquely named temporary PNG so that
         concurrent runs cannot overwrite each other's file; the file is
         removed afterwards.
         Returns:
             Merged standardized detections (see ``_merge_detections``).
         Raises:
             OSError: if the temporary image could not be written.
         """
         import tempfile
         fd, tmp = tempfile.mkstemp(prefix="_tmp_ocr_hybrid_", suffix=".png")
         os.close(fd)
         try:
             if not cv2.imwrite(tmp, arr_gray_or_bgr):
                 raise OSError(f"could not write temporary OCR image: {tmp}")
             return self.read_full_image(tmp)
         finally:
             if os.path.exists(tmp):
                 os.remove(tmp)
 # ============================================================
 # PREPROCESS + ROBUST REREAD
 # ============================================================
 def preprocess_variant(crop_bgr, mode):
    gray = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2GRAY)
@@ -212,8 +385,7 @@ def rotate_image_keep_bounds(img, angle_deg):
    h, w = img.shape[:2]
    c = (w / 2, h / 2)
    M = cv2.getRotationMatrix2D(c, angle_deg, 1.0)
-    cos = abs(M[0, 0])
+    cos = abs(M[0, 0]); sin = abs(M[0, 1])
    sin = abs(M[0, 1])
    new_w = int((h * sin) + (w * cos))
    new_h = int((h * cos) + (w * sin))
@@ -224,16 +396,6 @@ def rotate_image_keep_bounds(img, angle_deg):
    return cv2.warpAffine(img, M, (new_w, new_h), flags=cv2.INTER_CUBIC, borderValue=255)
 def run_ocr_on_array(reader, arr):
     """Run ``reader.readtext`` on an in-memory image via a temporary PNG.
     The array is written to a uniquely named temporary file (instead of a
     fixed name in the working directory) so concurrent runs cannot clobber
     each other's image; the file is removed afterwards.
     Returns:
         Whatever ``reader.readtext(path, paragraph=False)`` returns.
     """
     import tempfile
     fd, tmp = tempfile.mkstemp(prefix="_tmp_ocr_", suffix=".png")
     os.close(fd)
     try:
         cv2.imwrite(tmp, arr)
         return reader.readtext(tmp, paragraph=False)
     finally:
         if os.path.exists(tmp):
             os.remove(tmp)
 def rebuild_text_from_ocr_result(res):
    if not res:
        return ""
@@ -257,7 +419,7 @@ def rebuild_text_from_ocr_result(res):
    med_h = float(np.median([x[5] for x in norm]))
    row_tol = max(6.0, med_h * 0.75)
-    norm.sort(key=lambda z: z[4])  # y-center
+    norm.sort(key=lambda z: z[4])  # y
    rows = []
    for it in norm:
        placed = False
@@ -273,7 +435,7 @@ def rebuild_text_from_ocr_result(res):
    rows.sort(key=lambda r: r["yc"])
    lines = []
    for r in rows:
-        mem = sorted(r["m"], key=lambda z: z[3])  # x-center
+        mem = sorted(r["m"], key=lambda z: z[3])  # x
        line = normalize_text(" ".join(x[1] for x in mem))
        if line:
            lines.append(line)
@@ -281,7 +443,7 @@ def rebuild_text_from_ocr_result(res):
    return normalize_text(" ".join(lines))
-def reread_crop_robust(image, bbox, reader, upscale=3.0, pad=24):
+def reread_crop_robust(image, bbox, hybrid_ocr: HybridOCR, upscale=3.0, pad=24):
    ih, iw = image.shape[:2]
    x1, y1, x2, y2 = bbox
    x1 = max(0, int(x1 - pad))
@@ -313,12 +475,7 @@ def reread_crop_robust(image, bbox, reader, upscale=3.0, pad=24):
        for a in angles:
            rot = rotate_image_keep_bounds(proc3, a)
-            if len(rot.shape) == 3:
+            res = hybrid_ocr.read_array_with_both(rot)
                rot_in = cv2.cvtColor(rot, cv2.COLOR_BGR2GRAY)
            else:
                rot_in = rot
            res = run_ocr_on_array(reader, rot_in)
            txt = rebuild_text_from_ocr_result(res)
            sc = ocr_candidate_score(txt)
@@ -331,7 +488,7 @@ def reread_crop_robust(image, bbox, reader, upscale=3.0, pad=24):
 # ============================================================
-# LINE REBUILD + LINE BOXES (YELLOW)
+# LINE REBUILD + YELLOW BOXES
 # ============================================================
 def build_lines_from_indices(indices, ocr):
    if not indices:
@@ -348,7 +505,7 @@ def build_lines_from_indices(indices, ocr):
    med_h = float(np.median([it[4] for it in items])) if items else 10.0
    row_tol = max(6.0, med_h * 0.75)
-    items.sort(key=lambda x: x[3])  # y
+    items.sort(key=lambda x: x[3])
    rows = []
    for it in items:
        i, b, xc, yc, h = it
@@ -365,7 +522,7 @@ def build_lines_from_indices(indices, ocr):
    rows.sort(key=lambda r: r["yc"])
    lines = []
    for r in rows:
-        mem = sorted(r["m"], key=lambda z: z[2])  # x
+        mem = sorted(r["m"], key=lambda z: z[2])
        txt = normalize_text(" ".join(ocr[i][1] for i, _, _, _ in mem))
        if txt and not is_noise_text(txt):
            lines.append(txt)
@@ -374,15 +531,6 @@ def build_lines_from_indices(indices, ocr):
 def build_line_boxes_from_indices(indices, ocr, image_shape=None):
    """
    Improved yellow box builder:
    - row grouping
    - x-gap chunking
    - punctuation attachment
    - token coverage guarantee
    - larger/asymmetric padding (fix clipped chars)
    - min-size safety expansion
    """
    if not indices:
        return []
@@ -392,6 +540,7 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None):
        txt = normalize_text(ocr[i][1])
        if is_noise_text(txt):
            continue
        xc = (b[0] + b[2]) / 2.0
        yc = (b[1] + b[3]) / 2.0
        w = max(1.0, b[2] - b[0])
@@ -408,7 +557,7 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None):
    med_h = float(np.median([it["h"] for it in items]))
    row_tol = max(6.0, med_h * 0.90)
    gap_x_tol = max(8.0, med_h * 1.25)
-    pad = max(3, int(round(med_h * 0.22)))  # was 0.12
+    pad = max(3, int(round(med_h * 0.22)))
    def is_punct_like(t):
        raw = (t or "").strip()
@@ -417,7 +566,6 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None):
        punct_ratio = sum(1 for c in raw if not c.isalnum()) / max(1, len(raw))
        return punct_ratio >= 0.5 or len(raw) <= 2
    # 1) group into rows
    items_sorted = sorted(items, key=lambda x: x["yc"])
    rows = []
    for it in items_sorted:
@@ -436,7 +584,6 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None):
    for r in rows:
        mem = sorted(r["m"], key=lambda z: z["xc"])
        normal = [t for t in mem if not is_punct_like(t["txt"])]
        punct = [t for t in mem if is_punct_like(t["txt"])]
@@ -444,7 +591,6 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None):
            normal = mem
            punct = []
        # 2) chunk by x-gap
        chunks = []
        cur = [normal[0]]
        for t in normal[1:]:
@@ -458,7 +604,6 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None):
                cur = [t]
        chunks.append(cur)
        # 3) attach punctuation/special tokens with larger near-gap
        for p in punct:
            pb = p["b"]
            pxc, pyc = p["xc"], p["yc"]
@@ -486,7 +631,6 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None):
            else:
                chunks.append([p])
        # 4) emit chunk boxes with asymmetric padding
        for ch in chunks:
            ub = boxes_union_xyxy([x["b"] for x in ch])
            if ub:
@@ -496,7 +640,6 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None):
                pad_bot = int(round(pad * 0.95))
                out_boxes.append((x1 - pad_x, y1 - pad_top, x2 + pad_x, y2 + pad_bot))
    # 5) guarantee every token is covered
    token_boxes = [it["b"] for it in items]
    def inside(tb, lb):
@@ -510,7 +653,6 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None):
            pad_bot = int(round(pad * 0.95))
            out_boxes.append((x1 - pad_x, y1 - pad_top, x2 + pad_x, y2 + pad_bot))
    # 6) merge heavy overlaps
    merged = []
    for b in out_boxes:
        merged_into = False
@@ -528,23 +670,19 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None):
        if not merged_into:
            merged.append(b)
    # 7) min-size safety expansion (for tiny lines like "NO.")
    safe = []
    for (x1, y1, x2, y2) in merged:
        w = x2 - x1
        h = y2 - y1
        if w < 28:
            d = (28 - w) // 2 + 2
-            x1 -= d
+            x1 -= d; x2 += d
            x2 += d
        if h < 18:
            d = (18 - h) // 2 + 2
-            y1 -= d
+            y1 -= d; y2 += d
            y2 += d
        safe.append((x1, y1, x2, y2))
    merged = safe
    # clamp bounds
    if image_shape is not None:
        ih, iw = image_shape[:2]
        clamped = []
@@ -564,7 +702,7 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None):
 # ============================================================
-# GROUP TOKENS TO BUBBLES
+# GROUPING
 # ============================================================
 def auto_gap(image_path, base=18, ref_w=750):
    img = cv2.imread(image_path)
@@ -648,19 +786,17 @@ def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=3):
 # ============================================================
-# DEBUG IMAGE
+# DEBUG
 # ============================================================
 def save_debug_clusters(image_path, ocr, bubble_boxes, bubble_indices, out_path="debug_clusters.png"):
    img = cv2.imread(image_path)
    if img is None:
        return
    # OCR token quads (gray)
    for bbox, txt, conf in ocr:
        pts = np.array(bbox, dtype=np.int32)
        cv2.polylines(img, [pts], True, (180, 180, 180), 1)
    # bubble boxes (green) + line boxes (yellow)
    for bid, bb in bubble_boxes.items():
        x1, y1, x2, y2 = bb
        cv2.rectangle(img, (x1, y1), (x2, y2), (0, 220, 0), 2)
@@ -688,7 +824,7 @@ def estimate_reading_order(bbox_dict, mode="ltr"):
        cy = (y1 + y2) / 2.0
        items.append((bid, cx, cy))
-    items.sort(key=lambda t: t[2])  # top -> bottom
+    items.sort(key=lambda t: t[2])
    rows = []
    tol = 90
@@ -714,6 +850,7 @@ def estimate_reading_order(bbox_dict, mode="ltr"):
 def export_bubbles(filepath, bbox_dict, quads_dict, indices_dict, ocr, reading_map, image_shape):
    out = {}
    for bid, bb in bbox_dict.items():
        x1, y1, x2, y2 = bb
        quads = quads_dict.get(bid, [])
@@ -761,7 +898,8 @@ def translate_manga_text(
    export_to_file="output.txt",
    export_bubbles_to="bubbles.json",
    reading_mode="ltr",
-    debug=True
+    debug=True,
    use_gpu=False
 ):
    image = cv2.imread(image_path)
    if image is None:
@@ -770,20 +908,12 @@ def translate_manga_text(
    resolved_gap = auto_gap(image_path) if gap_px == "auto" else float(gap_px)
-    print("Loading OCR...")
+    print("Loading Hybrid OCR (Paddle + EasyOCR)...")
-    # Catalan often OCRs better with es+en in manga pages
+    hybrid = HybridOCR(source_lang=source_lang, use_gpu=use_gpu)
    if source_lang == "ca":
        ocr_lang_list = ["es", "en"]
    elif source_lang == "en":
        ocr_lang_list = ["en", "es"]
    else:
        ocr_lang_list = [source_lang]
    reader = easyocr.Reader(ocr_lang_list)
    print("Running OCR...")
-    raw = reader.readtext(image_path, paragraph=False)
+    raw = hybrid.read_full_image(image_path)
-    print(f"Raw detections: {len(raw)}")
+    print(f"Raw detections (merged): {len(raw)}")
    filtered = []
    skipped = 0
@@ -809,7 +939,6 @@ def translate_manga_text(
            skipped += 1
            continue
        # reduce top-strip false positives
        if qb[1] < int(ih * TOP_BAND_RATIO):
            if conf < 0.70 and len(t) >= 5:
                skipped += 1
@@ -846,7 +975,7 @@ def translate_manga_text(
            rr_txt, rr_sc = reread_crop_robust(
                image,
                bubble_boxes[bid],
-                reader,
+                hybrid,
                upscale=3.0,
                pad=24
            )
@@ -857,7 +986,6 @@ def translate_manga_text(
        else:
            txt = base_txt
        # tiny targeted corrections for common OCR confusions
        txt = txt.replace(" BOMPORTA", " IMPORTA")
        txt = txt.replace(" TESTO ", " ESTO ")
        txt = txt.replace(" MIVERDAD", " MI VERDAD")
@@ -927,8 +1055,8 @@ def translate_manga_text(
 # ============================================================
 if __name__ == "__main__":
    translate_manga_text(
-        image_path="004-page.png",
+        image_path="001-page.png",
-        source_lang="es",
+        source_lang="it",
        target_lang="ca",
        confidence_threshold=0.12,
        min_text_length=1,
@@ -938,5 +1066,6 @@ if __name__ == "__main__":
        export_to_file="output.txt",
        export_bubbles_to="bubbles.json",
        reading_mode="ltr",
-        debug=True
+        debug=True,
        use_gpu=False
    )