diff --git a/.gitignore b/.gitignore index a9879e8..de18566 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,8 @@ .AppleDouble .LSOverride +.venv311/ + # Icon must end with two \r Icon diff --git a/manga-translator.py b/manga-translator.py index 916c162..7a95073 100644 --- a/manga-translator.py +++ b/manga-translator.py @@ -6,9 +6,13 @@ import re import json import cv2 import numpy as np -import easyocr + from deep_translator import GoogleTranslator +# OCR engines +import easyocr +from paddleocr import PaddleOCR + # ============================================================ # CONFIG @@ -35,8 +39,8 @@ TITLE_PATTERNS = [ NOISE_PATTERNS = [ r"^[^a-zA-Z0-9\?!.¡¿]+$", - r"^BOX[#\s0-9A-Z\-]*$", # debug labels - r"^[0-9]{1,3}\s*[Xx]\s*[0-9]{1,3}$", # e.g. 98x12 + r"^BOX[#\s0-9A-Z\-]*$", + r"^[0-9]{1,3}\s*[Xx]\s*[0-9]{1,3}$", ] TOP_BAND_RATIO = 0.08 @@ -56,8 +60,6 @@ def normalize_text(text: str) -> str: t = re.sub(r"\(\s+", "(", t) t = re.sub(r"\s+\)", ")", t) t = re.sub(r"\.{4,}", "...", t) - t = t.replace("IQUE", "¡QUE") - t = t.replace("IQUIEN", "¿QUIEN") return t.strip() @@ -91,11 +93,9 @@ def is_noise_text(text: str) -> bool: if any(re.fullmatch(p, t) for p in NOISE_PATTERNS): return True - # very short isolated junk if len(t) <= 2 and not re.search(r"[A-Z0-9]", t): return True - # mostly-symbol garbage symbol_ratio = sum(1 for c in t if not c.isalnum() and not c.isspace()) / max(1, len(t)) if len(t) <= 6 and symbol_ratio > 0.60: return True @@ -104,7 +104,7 @@ def is_noise_text(text: str) -> bool: # ============================================================ -# GEOMETRY +# GEOMETRY HELPERS # ============================================================ def quad_bbox(quad): xs = [p[0] for p in quad] @@ -151,7 +151,7 @@ def overlap_or_near(a, b, gap=0): # ============================================================ -# OCR QUALITY SCORING +# QUALITY # ============================================================ def ocr_candidate_score(text: str) -> float: if not 
class HybridOCR:
    """Run PaddleOCR and EasyOCR on the same input and merge their detections.

    Both public readers return a list of ``(quad, text, conf)`` tuples, the
    layout EasyOCR produces for ``readtext(..., paragraph=False)``. Paddle
    detections form the base list. An EasyOCR detection is appended when it
    does not overlap any Paddle box, and replaces the overlapping Paddle
    detection only when its confidence is clearly higher.
    """

    # Paddle takes a single model name; any code not listed uses "latin",
    # which is also the model used for en/es/ca/fr/de/it/pt.
    _PADDLE_LANGS = {
        "ja": "japan",
        "ko": "korean",
        "ch": "ch",
        "zh": "ch",
        "zh-cn": "ch",
        "zh-tw": "ch",
    }

    # EasyOCR language lists; any code not listed is passed through as-is.
    # The Chinese entries translate the codes accepted above into EasyOCR's
    # own names ("ch_sim"/"ch_tra"), which it documents as pairing with "en".
    _EASY_LANGS = {
        "ca": ["es", "en"],
        "en": ["en", "es"],
        "es": ["es", "en"],
        "ch": ["ch_sim", "en"],
        "zh": ["ch_sim", "en"],
        "zh-cn": ["ch_sim", "en"],
        "zh-tw": ["ch_tra", "en"],
    }

    # Boxes overlapping more than this IoU are treated as the same text.
    _DUP_IOU = 0.55
    # EasyOCR must beat Paddle's confidence by this margin to replace it.
    _EASY_CONF_MARGIN = 0.20

    def __init__(self, source_lang="en", use_gpu=False):
        """Create both OCR engines for ``source_lang``.

        Args:
            source_lang: Language code of the text on the page.
            use_gpu: Forwarded to both engines.
        """
        self.source_lang = source_lang

        paddle_lang = self._PADDLE_LANGS.get(source_lang, "latin")
        easy_langs = list(self._EASY_LANGS.get(source_lang, [source_lang]))

        self.paddle = PaddleOCR(
            use_angle_cls=True,
            lang=paddle_lang,
            use_gpu=use_gpu,
            show_log=False
        )
        self.easy = easyocr.Reader(easy_langs, gpu=use_gpu)

    @staticmethod
    def _paddle_to_std(result):
        """Convert a Paddle result into ``[(quad, text, conf), ...]``.

        A Paddle text line is ``[points, (text, conf)]``. Depending on the
        version, lines arrive either directly in the result list or wrapped
        in one list per image, and an image without text may appear as
        ``None``. The result is therefore walked at any nesting depth and
        every item shaped like a line is collected in traversal order.
        Lines whose points or confidence cannot be converted are skipped.
        """
        out = []

        def is_line(node):
            # A line is a pair whose second member is (text, conf, ...).
            if len(node) != 2:
                return False
            rec = node[1]
            return (
                isinstance(rec, (list, tuple))
                and len(rec) >= 2
                and isinstance(rec[0], str)
            )

        def walk(node):
            if not isinstance(node, (list, tuple)):
                return  # None, numbers and strings carry no lines
            if is_line(node):
                pts, rec = node
                try:
                    quad = [[float(p[0]), float(p[1])] for p in pts]
                    conf = float(rec[1])
                except (TypeError, ValueError, IndexError):
                    return  # malformed line: skip it, keep the rest
                out.append((quad, rec[0], conf))
                return
            for child in node:
                walk(child)

        walk(result)
        return out

    @staticmethod
    def _box_iou(a, b):
        """Return the intersection-over-union of two ``(x1, y1, x2, y2)`` boxes.

        Each box area is floored at 1 so degenerate boxes cannot cause a
        division by zero.
        """
        ix1 = max(a[0], b[0])
        iy1 = max(a[1], b[1])
        ix2 = min(a[2], b[2])
        iy2 = min(a[3], b[3])
        inter = max(0, ix2 - ix1) * max(0, iy2 - iy1)
        area_a = max(1, (a[2] - a[0]) * (a[3] - a[1]))
        area_b = max(1, (b[2] - b[0]) * (b[3] - b[1]))
        union = area_a + area_b - inter
        return inter / float(union) if union > 0 else 0.0

    @classmethod
    def _merge_detections(cls, paddle_det, easy_det):
        """Merge EasyOCR detections into the Paddle detections.

        For each EasyOCR detection, the first Paddle detection whose box
        overlaps it with IoU above ``_DUP_IOU`` is treated as the same text.
        Without such a match the EasyOCR detection is appended. With a match
        it replaces the Paddle one only if its confidence exceeds Paddle's by
        more than ``_EASY_CONF_MARGIN``; otherwise it is dropped.
        """
        merged = list(paddle_det)
        # Paddle boxes do not change during the merge; compute them once.
        paddle_boxes = [quad_bbox(pq) for pq, _, _ in paddle_det]

        for eq, et, ec in easy_det:
            ec = float(ec)
            ebox = quad_bbox(eq)

            match = None
            for pb, pbox in zip(paddle_det, paddle_boxes):
                if cls._box_iou(ebox, pbox) > cls._DUP_IOU:
                    match = pb
                    break

            if match is None:
                merged.append((eq, et, ec))
            elif ec > float(match[2]) + cls._EASY_CONF_MARGIN:
                try:
                    merged.remove(match)
                except ValueError:
                    # An earlier EasyOCR detection already replaced it.
                    pass
                merged.append((eq, et, ec))

        return merged

    def read_full_image(self, image_path):
        """OCR the image at ``image_path`` with both engines.

        Returns:
            Merged detections as ``[(quad, text, conf), ...]``.
        """
        paddle_det = self._paddle_to_std(self.paddle.ocr(image_path, cls=True))
        easy_det = self.easy.readtext(image_path, paragraph=False)
        return self._merge_detections(paddle_det, easy_det)

    def read_array_with_both(self, arr_gray_or_bgr):
        """OCR an in-memory image (used in the robust reread pass).

        The array is written to a uniquely named temporary PNG so that
        concurrent runs cannot overwrite each other's file, read back through
        ``read_full_image`` and removed afterwards.

        Returns:
            Merged detections as ``[(quad, text, conf), ...]``, or an empty
            list if the temporary image could not be written.
        """
        import tempfile

        fd, tmp = tempfile.mkstemp(prefix="_tmp_ocr_hybrid_", suffix=".png")
        os.close(fd)  # cv2 reopens the path itself
        try:
            if not cv2.imwrite(tmp, arr_gray_or_bgr):
                return []
            return self.read_full_image(tmp)
        finally:
            if os.path.exists(tmp):
                os.remove(tmp)
run_ocr_on_array(reader, arr): - tmp = "_tmp_ocr.png" - cv2.imwrite(tmp, arr) - try: - return reader.readtext(tmp, paragraph=False) - finally: - if os.path.exists(tmp): - os.remove(tmp) - - def rebuild_text_from_ocr_result(res): if not res: return "" @@ -257,7 +419,7 @@ def rebuild_text_from_ocr_result(res): med_h = float(np.median([x[5] for x in norm])) row_tol = max(6.0, med_h * 0.75) - norm.sort(key=lambda z: z[4]) # y-center + norm.sort(key=lambda z: z[4]) # y rows = [] for it in norm: placed = False @@ -273,7 +435,7 @@ def rebuild_text_from_ocr_result(res): rows.sort(key=lambda r: r["yc"]) lines = [] for r in rows: - mem = sorted(r["m"], key=lambda z: z[3]) # x-center + mem = sorted(r["m"], key=lambda z: z[3]) # x line = normalize_text(" ".join(x[1] for x in mem)) if line: lines.append(line) @@ -281,7 +443,7 @@ def rebuild_text_from_ocr_result(res): return normalize_text(" ".join(lines)) -def reread_crop_robust(image, bbox, reader, upscale=3.0, pad=24): +def reread_crop_robust(image, bbox, hybrid_ocr: HybridOCR, upscale=3.0, pad=24): ih, iw = image.shape[:2] x1, y1, x2, y2 = bbox x1 = max(0, int(x1 - pad)) @@ -313,12 +475,7 @@ def reread_crop_robust(image, bbox, reader, upscale=3.0, pad=24): for a in angles: rot = rotate_image_keep_bounds(proc3, a) - if len(rot.shape) == 3: - rot_in = cv2.cvtColor(rot, cv2.COLOR_BGR2GRAY) - else: - rot_in = rot - - res = run_ocr_on_array(reader, rot_in) + res = hybrid_ocr.read_array_with_both(rot) txt = rebuild_text_from_ocr_result(res) sc = ocr_candidate_score(txt) @@ -331,7 +488,7 @@ def reread_crop_robust(image, bbox, reader, upscale=3.0, pad=24): # ============================================================ -# LINE REBUILD + LINE BOXES (YELLOW) +# LINE REBUILD + YELLOW BOXES # ============================================================ def build_lines_from_indices(indices, ocr): if not indices: @@ -348,7 +505,7 @@ def build_lines_from_indices(indices, ocr): med_h = float(np.median([it[4] for it in items])) if items else 
10.0 row_tol = max(6.0, med_h * 0.75) - items.sort(key=lambda x: x[3]) # y + items.sort(key=lambda x: x[3]) rows = [] for it in items: i, b, xc, yc, h = it @@ -365,7 +522,7 @@ def build_lines_from_indices(indices, ocr): rows.sort(key=lambda r: r["yc"]) lines = [] for r in rows: - mem = sorted(r["m"], key=lambda z: z[2]) # x + mem = sorted(r["m"], key=lambda z: z[2]) txt = normalize_text(" ".join(ocr[i][1] for i, _, _, _ in mem)) if txt and not is_noise_text(txt): lines.append(txt) @@ -374,15 +531,6 @@ def build_lines_from_indices(indices, ocr): def build_line_boxes_from_indices(indices, ocr, image_shape=None): - """ - Improved yellow box builder: - - row grouping - - x-gap chunking - - punctuation attachment - - token coverage guarantee - - larger/asymmetric padding (fix clipped chars) - - min-size safety expansion - """ if not indices: return [] @@ -392,6 +540,7 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None): txt = normalize_text(ocr[i][1]) if is_noise_text(txt): continue + xc = (b[0] + b[2]) / 2.0 yc = (b[1] + b[3]) / 2.0 w = max(1.0, b[2] - b[0]) @@ -408,7 +557,7 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None): med_h = float(np.median([it["h"] for it in items])) row_tol = max(6.0, med_h * 0.90) gap_x_tol = max(8.0, med_h * 1.25) - pad = max(3, int(round(med_h * 0.22))) # was 0.12 + pad = max(3, int(round(med_h * 0.22))) def is_punct_like(t): raw = (t or "").strip() @@ -417,7 +566,6 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None): punct_ratio = sum(1 for c in raw if not c.isalnum()) / max(1, len(raw)) return punct_ratio >= 0.5 or len(raw) <= 2 - # 1) group into rows items_sorted = sorted(items, key=lambda x: x["yc"]) rows = [] for it in items_sorted: @@ -436,7 +584,6 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None): for r in rows: mem = sorted(r["m"], key=lambda z: z["xc"]) - normal = [t for t in mem if not is_punct_like(t["txt"])] punct = [t for t in mem if 
is_punct_like(t["txt"])] @@ -444,7 +591,6 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None): normal = mem punct = [] - # 2) chunk by x-gap chunks = [] cur = [normal[0]] for t in normal[1:]: @@ -458,7 +604,6 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None): cur = [t] chunks.append(cur) - # 3) attach punctuation/special tokens with larger near-gap for p in punct: pb = p["b"] pxc, pyc = p["xc"], p["yc"] @@ -486,7 +631,6 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None): else: chunks.append([p]) - # 4) emit chunk boxes with asymmetric padding for ch in chunks: ub = boxes_union_xyxy([x["b"] for x in ch]) if ub: @@ -496,7 +640,6 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None): pad_bot = int(round(pad * 0.95)) out_boxes.append((x1 - pad_x, y1 - pad_top, x2 + pad_x, y2 + pad_bot)) - # 5) guarantee every token is covered token_boxes = [it["b"] for it in items] def inside(tb, lb): @@ -510,7 +653,6 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None): pad_bot = int(round(pad * 0.95)) out_boxes.append((x1 - pad_x, y1 - pad_top, x2 + pad_x, y2 + pad_bot)) - # 6) merge heavy overlaps merged = [] for b in out_boxes: merged_into = False @@ -528,23 +670,19 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None): if not merged_into: merged.append(b) - # 7) min-size safety expansion (for tiny lines like "NO.") safe = [] for (x1, y1, x2, y2) in merged: w = x2 - x1 h = y2 - y1 if w < 28: d = (28 - w) // 2 + 2 - x1 -= d - x2 += d + x1 -= d; x2 += d if h < 18: d = (18 - h) // 2 + 2 - y1 -= d - y2 += d + y1 -= d; y2 += d safe.append((x1, y1, x2, y2)) merged = safe - # clamp bounds if image_shape is not None: ih, iw = image_shape[:2] clamped = [] @@ -564,7 +702,7 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None): # ============================================================ -# GROUP TOKENS TO BUBBLES +# GROUPING # 
============================================================ def auto_gap(image_path, base=18, ref_w=750): img = cv2.imread(image_path) @@ -648,19 +786,17 @@ def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=3): # ============================================================ -# DEBUG IMAGE +# DEBUG # ============================================================ def save_debug_clusters(image_path, ocr, bubble_boxes, bubble_indices, out_path="debug_clusters.png"): img = cv2.imread(image_path) if img is None: return - # OCR token quads (gray) for bbox, txt, conf in ocr: pts = np.array(bbox, dtype=np.int32) cv2.polylines(img, [pts], True, (180, 180, 180), 1) - # bubble boxes (green) + line boxes (yellow) for bid, bb in bubble_boxes.items(): x1, y1, x2, y2 = bb cv2.rectangle(img, (x1, y1), (x2, y2), (0, 220, 0), 2) @@ -688,7 +824,7 @@ def estimate_reading_order(bbox_dict, mode="ltr"): cy = (y1 + y2) / 2.0 items.append((bid, cx, cy)) - items.sort(key=lambda t: t[2]) # top -> bottom + items.sort(key=lambda t: t[2]) rows = [] tol = 90 @@ -714,6 +850,7 @@ def estimate_reading_order(bbox_dict, mode="ltr"): def export_bubbles(filepath, bbox_dict, quads_dict, indices_dict, ocr, reading_map, image_shape): out = {} + for bid, bb in bbox_dict.items(): x1, y1, x2, y2 = bb quads = quads_dict.get(bid, []) @@ -761,7 +898,8 @@ def translate_manga_text( export_to_file="output.txt", export_bubbles_to="bubbles.json", reading_mode="ltr", - debug=True + debug=True, + use_gpu=False ): image = cv2.imread(image_path) if image is None: @@ -770,20 +908,12 @@ def translate_manga_text( resolved_gap = auto_gap(image_path) if gap_px == "auto" else float(gap_px) - print("Loading OCR...") - # Catalan often OCRs better with es+en in manga pages - if source_lang == "ca": - ocr_lang_list = ["es", "en"] - elif source_lang == "en": - ocr_lang_list = ["en", "es"] - else: - ocr_lang_list = [source_lang] - - reader = easyocr.Reader(ocr_lang_list) + print("Loading Hybrid OCR (Paddle + EasyOCR)...") + 
hybrid = HybridOCR(source_lang=source_lang, use_gpu=use_gpu) print("Running OCR...") - raw = reader.readtext(image_path, paragraph=False) - print(f"Raw detections: {len(raw)}") + raw = hybrid.read_full_image(image_path) + print(f"Raw detections (merged): {len(raw)}") filtered = [] skipped = 0 @@ -809,7 +939,6 @@ def translate_manga_text( skipped += 1 continue - # reduce top-strip false positives if qb[1] < int(ih * TOP_BAND_RATIO): if conf < 0.70 and len(t) >= 5: skipped += 1 @@ -846,7 +975,7 @@ def translate_manga_text( rr_txt, rr_sc = reread_crop_robust( image, bubble_boxes[bid], - reader, + hybrid, upscale=3.0, pad=24 ) @@ -857,7 +986,6 @@ def translate_manga_text( else: txt = base_txt - # tiny targeted corrections for common OCR confusions txt = txt.replace(" BOMPORTA", " IMPORTA") txt = txt.replace(" TESTO ", " ESTO ") txt = txt.replace(" MIVERDAD", " MI VERDAD") @@ -927,8 +1055,8 @@ def translate_manga_text( # ============================================================ if __name__ == "__main__": translate_manga_text( - image_path="004-page.png", - source_lang="es", + image_path="001-page.png", + source_lang="it", target_lang="ca", confidence_threshold=0.12, min_text_length=1, @@ -938,5 +1066,6 @@ if __name__ == "__main__": export_to_file="output.txt", export_bubbles_to="bubbles.json", reading_mode="ltr", - debug=True + debug=True, + use_gpu=False )