import re
import os
import json

import cv2
import numpy as np
import easyocr
from deep_translator import GoogleTranslator

# ─────────────────────────────────────────────
# CONFIG
# ─────────────────────────────────────────────

# Proper nouns that must survive translation unchanged (matched case-insensitively,
# longest key first so "STARLIGHT ANYA" wins over "ANYA").
GLOSSARY = {
    "ANYA": "ANYA",
    "STARLIGHT ANYA": "STARLIGHT ANYA",
    "MR. HENDERSON": "MR. HENDERSON",
    "HENDERSON": "HENDERSON",
    "STELLA STAR": "STELLA STAR",
}

# Onomatopoeia / SFX tokens to drop (matched against letters-only lowercase text).
SOUND_EFFECT_PATTERNS = [
    r"^b+i+p+$", r"^sha+$", r"^ha+$", r"^ah+$", r"^oh+$", r"^ugh+$",
    r"^bam+$", r"^pow+$", r"^boom+$", r"^bang+$", r"^crash+$", r"^thud+$",
    r"^zip+$", r"^swoosh+$", r"^chirp+$",
]

# Chapter headers / credits that should not be translated.
TITLE_PATTERNS = [
    r"^(mission|chapter|episode|vol\.?|volume)\s*\d+$",
    r"^(spy|family|spy.family)$",
    r"^by\s+.+$",
]

# Pure-symbol junk and internal debug labels.
NOISE_PATTERNS = [
    r"^[^a-zA-Z0-9\?!.]+$",
    r"^BOX[0-9A-Z]*$",
]

# Fraction of the page height treated as the "title band" at the top.
TOP_BAND_RATIO = 0.08


# ─────────────────────────────────────────────
# TEXT HELPERS
# ─────────────────────────────────────────────
def normalize_text(text):
    """Uppercase *text* and normalize quotes, ellipses, whitespace and
    spacing around punctuation. Returns the cleaned string."""
    t = text.strip().upper()
    t = t.replace("“", "\"").replace("”", "\"")
    t = t.replace("’", "'").replace("‘", "'")
    t = t.replace("…", "...")
    t = re.sub(r"\s+", " ", t)
    t = re.sub(r"\s+([,.;:!?])", r"\1", t)      # no space before punctuation
    t = re.sub(r"\(\s+", "(", t)
    t = re.sub(r"\s+\)", ")", t)
    t = re.sub(r"\.{4,}", "...", t)             # cap ellipsis length
    t = re.sub(r",\?", "?", t)                  # common OCR artifact ",?"
    return t.strip()


def apply_glossary(text):
    """Force glossary spellings in *text*, longest keys first so multi-word
    entries are not shadowed by their substrings."""
    out = text
    for k in sorted(GLOSSARY.keys(), key=len, reverse=True):
        out = re.sub(rf"\b{re.escape(k)}\b", GLOSSARY[k], out,
                     flags=re.IGNORECASE)
    return out


def postprocess_translation_general(text):
    """Tidy machine-translated text: normalize, collapse spaces, and cap
    repeated terminal punctuation ("!!!!" -> "!!", "....." -> "...")."""
    t = normalize_text(text)
    t = re.sub(r"\s{2,}", " ", t).strip()
    t = re.sub(r"([!?]){3,}", r"\1\1", t)
    t = re.sub(r"\.{4,}", "...", t)
    return t


# ─────────────────────────────────────────────
# FILTERS
# ─────────────────────────────────────────────
def is_sound_effect(text):
    """True when *text*, reduced to lowercase letters only, matches a known
    sound-effect pattern (e.g. "BOOM", "haaa")."""
    cleaned = re.sub(r"[^a-z]", "", text.strip().lower())
    return any(re.fullmatch(p, cleaned, re.IGNORECASE)
               for p in SOUND_EFFECT_PATTERNS)


def is_title_text(text):
    """True when *text* looks like a chapter title / credit line."""
    t = text.strip().lower()
    return any(re.fullmatch(p, t, re.IGNORECASE) for p in TITLE_PATTERNS)


def is_noise_text(text):
    """True when *text* is symbol-only junk or an internal debug label."""
    t = text.strip()
    return any(re.fullmatch(p, t) for p in NOISE_PATTERNS)


# ─────────────────────────────────────────────
# GEOMETRY
# ─────────────────────────────────────────────
def quad_bbox(quad):
    """Axis-aligned (x1, y1, x2, y2) bounding box of a 4-point OCR quad."""
    xs = [p[0] for p in quad]
    ys = [p[1] for p in quad]
    return (int(min(xs)), int(min(ys)), int(max(xs)), int(max(ys)))


def quad_center(quad):
    """Center (cx, cy) of a quad's bounding box, as floats."""
    x1, y1, x2, y2 = quad_bbox(quad)
    return ((x1 + x2) / 2.0, (y1 + y2) / 2.0)


def boxes_union_xyxy(boxes):
    """Union of xyxy boxes (None entries ignored); None when nothing remains."""
    boxes = [b for b in boxes if b is not None]
    if not boxes:
        return None
    return (
        int(min(b[0] for b in boxes)),
        int(min(b[1] for b in boxes)),
        int(max(b[2] for b in boxes)),
        int(max(b[3] for b in boxes)),
    )


def bbox_area_xyxy(b):
    """Area of an xyxy box; 0 for None or degenerate boxes."""
    if b is None:
        return 0
    return int(max(0, b[2] - b[0]) * max(0, b[3] - b[1]))


def xyxy_to_xywh(b):
    """Convert xyxy to an {x, y, w, h} dict for JSON export; None passes through."""
    if b is None:
        return None
    x1, y1, x2, y2 = b
    return {"x": int(x1), "y": int(y1),
            "w": int(max(0, x2 - x1)), "h": int(max(0, y2 - y1))}


def overlap_or_near(a, b, gap=0):
    """True when xyxy boxes *a* and *b* overlap or lie within *gap* pixels
    of each other on both axes."""
    ax1, ay1, ax2, ay2 = a
    bx1, by1, bx2, by2 = b
    gap_x = max(0, max(ax1, bx1) - min(ax2, bx2))
    gap_y = max(0, max(ay1, by1) - min(ay2, by2))
    return gap_x <= gap and gap_y <= gap


# ─────────────────────────────────────────────
# QUALITY
# ─────────────────────────────────────────────
def ocr_quality_score(text):
    """Heuristic 0..1 score of OCR output quality: alpha-character ratio,
    penalized for symbol runs and double commas, small bonus for a
    sentence-ending character."""
    if not text or len(text) < 2:
        return 0.0
    alpha_ratio = sum(1 for c in text if c.isalpha()) / max(1, len(text))
    penalty = 0.0
    if re.search(r"[^\w\s\'\!\?\.,\-]{2,}", text):
        penalty += 0.2
    if re.search(r",,", text):
        penalty += 0.2
    bonus = 0.05 if re.search(r"[.!?]$", text) else 0.0
    return max(0.0, min(1.0, alpha_ratio - penalty + bonus))


# ─────────────────────────────────────────────
# OCR RE-READ
# ─────────────────────────────────────────────
def preprocess_variant(crop_bgr, mode):
    """Return a grayscale preprocessing variant of *crop_bgr*.

    mode: "raw" (plain gray), "clahe" (contrast-limited equalization),
    "adaptive" (blur + adaptive threshold). Unknown modes fall back to gray.
    """
    gray = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2GRAY)
    if mode == "raw":
        return gray
    if mode == "clahe":
        return cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)).apply(gray)
    if mode == "adaptive":
        den = cv2.GaussianBlur(gray, (3, 3), 0)
        return cv2.adaptiveThreshold(den, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                     cv2.THRESH_BINARY, 35, 11)
    return gray


def run_ocr_on_array(reader, arr):
    """Run EasyOCR on an in-memory image array.

    FIX: the original wrote *arr* to a fixed temp file ("_tmp_ocr.png") and
    re-read it, which added a disk round-trip per variant and raced/clobbered
    when two runs shared a working directory. EasyOCR's readtext accepts
    numpy arrays directly, so pass the array in memory.
    """
    return reader.readtext(arr, paragraph=False)


def reread_crop(image, bbox, reader, upscale=2.5, pad=18):
    """Re-OCR a padded, upscaled crop of *image* around *bbox* with several
    preprocessing variants; return the best-scoring merged text, or None."""
    ih, iw = image.shape[:2]
    x1, y1, x2, y2 = bbox
    x1 = max(0, int(x1 - pad)); y1 = max(0, int(y1 - pad))
    x2 = min(iw, int(x2 + pad)); y2 = min(ih, int(y2 + pad))
    crop = image[y1:y2, x1:x2]
    if crop.size == 0:
        return None
    up = cv2.resize(crop,
                    (int(crop.shape[1] * upscale), int(crop.shape[0] * upscale)),
                    interpolation=cv2.INTER_CUBIC)
    best = None
    for mode in ("raw", "clahe", "adaptive"):
        proc = preprocess_variant(up, mode)
        res = run_ocr_on_array(reader, proc)
        if not res:
            continue
        # Read order: top-to-bottom, then left-to-right by quad origin.
        res.sort(key=lambda r: (r[0][0][1], r[0][0][0]))
        lines = [normalize_text(t) for _, t, _ in res if t.strip()]
        merged = re.sub(r"\s{2,}", " ", " ".join(lines)).strip()
        s = ocr_quality_score(merged)
        if best is None or s > best[0]:
            best = (s, merged)
    return best[1] if best else None


# ─────────────────────────────────────────────
# LINES + YELLOW BOXES
# ─────────────────────────────────────────────
def build_lines_from_indices(indices, ocr):
    """Group the OCR tokens at *indices* into text lines (rows by y-center,
    tokens ordered by x within each row); return the list of line strings."""
    if not indices:
        return []
    items = []
    for i in indices:
        b = quad_bbox(ocr[i][0])
        xc = (b[0] + b[2]) / 2.0
        yc = (b[1] + b[3]) / 2.0
        h = max(1.0, b[3] - b[1])
        items.append((i, b, xc, yc, h))
    med_h = float(np.median([it[4] for it in items])) if items else 10.0
    row_tol = max(6.0, med_h * 0.75)  # same-row tolerance scaled by text size
    items.sort(key=lambda x: x[3])
    rows = []
    for it in items:
        i, b, xc, yc, h = it
        placed = False
        for r in rows:
            if abs(yc - r["yc"]) <= row_tol:
                r["m"].append((i, b, xc, yc))
                r["yc"] = float(np.mean([k[3] for k in r["m"]]))
                placed = True
                break
        if not placed:
            rows.append({"yc": yc, "m": [(i, b, xc, yc)]})
    rows.sort(key=lambda r: r["yc"])
    lines = []
    for r in rows:
        mem = sorted(r["m"], key=lambda z: z[2])
        txt = normalize_text(" ".join(ocr[i][1] for i, _, _, _ in mem))
        lines.append(txt)
    return lines


def build_line_boxes_from_indices(indices, ocr):
    """
    Robust yellow-box generation with punctuation attachment:
    - row grouping
    - chunking by x gap
    - attach tiny punctuation/special tokens to nearest chunk
    - coverage guarantee
    """
    if not indices:
        return []
    items = []
    for i in indices:
        b = quad_bbox(ocr[i][0])
        txt = normalize_text(ocr[i][1])
        xc = (b[0] + b[2]) / 2.0
        yc = (b[1] + b[3]) / 2.0
        w = max(1.0, b[2] - b[0])
        h = max(1.0, b[3] - b[1])
        items.append({"i": i, "b": b, "txt": txt,
                      "xc": xc, "yc": yc, "w": w, "h": h})
    med_h = float(np.median([it["h"] for it in items])) if items else 10.0
    row_tol = max(6.0, med_h * 0.90)
    gap_x_tol = max(8.0, med_h * 1.25)
    pad = max(1, int(round(med_h * 0.12)))

    def is_punct_like(t):
        # Tokens that are mostly punctuation (or very short) get attached to
        # a neighboring chunk instead of forming their own box.
        raw = t.strip()
        if raw == "":
            return True
        punct_ratio = sum(1 for c in raw if not c.isalnum()) / max(1, len(raw))
        return punct_ratio >= 0.5 or len(raw) <= 2

    # 1) rows
    items_sorted = sorted(items, key=lambda x: x["yc"])
    rows = []
    for it in items_sorted:
        placed = False
        for r in rows:
            if abs(it["yc"] - r["yc"]) <= row_tol:
                r["m"].append(it)
                r["yc"] = float(np.mean([k["yc"] for k in r["m"]]))
                placed = True
                break
        if not placed:
            rows.append({"yc": it["yc"], "m": [it]})
    rows.sort(key=lambda r: r["yc"])

    out_boxes = []
    for r in rows:
        mem = sorted(r["m"], key=lambda z: z["xc"])
        normal = [t for t in mem if not is_punct_like(t["txt"])]
        punct = [t for t in mem if is_punct_like(t["txt"])]
        if not normal:
            # Row is all punctuation-like: treat everything as normal tokens.
            normal = mem
            punct = []

        # 2) chunk normal tokens on horizontal gaps
        chunks = []
        cur = [normal[0]]
        for t in normal[1:]:
            prev = cur[-1]["b"]
            b = t["b"]
            gap = b[0] - prev[2]
            if gap <= gap_x_tol:
                cur.append(t)
            else:
                chunks.append(cur)
                cur = [t]
        chunks.append(cur)

        # 3) attach punctuation tokens to the closest chunk
        for p in punct:
            pb = p["b"]
            pxc, pyc = p["xc"], p["yc"]
            best_k = -1
            best_score = 1e18
            for k, ch in enumerate(chunks):
                ub = boxes_union_xyxy([x["b"] for x in ch])
                cx = (ub[0] + ub[2]) / 2.0
                cy = (ub[1] + ub[3]) / 2.0
                dx = abs(pxc - cx)
                dy = abs(pyc - cy)
                score = dx + 1.8 * dy  # vertical distance weighted heavier
                near = overlap_or_near(pb, ub, gap=int(med_h * 0.9))
                if near:
                    score -= med_h * 2.0  # strong preference for touching chunks
                if score < best_score:
                    best_score = score
                    best_k = k
            if best_k >= 0:
                chunks[best_k].append(p)
            else:
                chunks.append([p])

        # 4) chunk boxes (padded)
        for ch in chunks:
            ub = boxes_union_xyxy([x["b"] for x in ch])
            if ub:
                x1, y1, x2, y2 = ub
                out_boxes.append((x1 - pad, y1 - pad, x2 + pad, y2 + pad))

    # 5) guarantee all tokens included
    token_boxes = [it["b"] for it in items]

    def inside(tb, lb):
        return (tb[0] >= lb[0] and tb[1] >= lb[1]
                and tb[2] <= lb[2] and tb[3] <= lb[3])

    for tb in token_boxes:
        ok = any(inside(tb, lb) for lb in out_boxes)
        if not ok:
            x1, y1, x2, y2 = tb
            out_boxes.append((x1 - pad, y1 - pad, x2 + pad, y2 + pad))

    # 6) merge heavy overlaps (IoU > 0.72)
    merged = []
    for b in out_boxes:
        merged_into = False
        for i, m in enumerate(merged):
            ix1 = max(b[0], m[0]); iy1 = max(b[1], m[1])
            ix2 = min(b[2], m[2]); iy2 = min(b[3], m[3])
            inter = max(0, ix2 - ix1) * max(0, iy2 - iy1)
            a1 = max(1, (b[2] - b[0]) * (b[3] - b[1]))
            a2 = max(1, (m[2] - m[0]) * (m[3] - m[1]))
            iou = inter / float(a1 + a2 - inter) if (a1 + a2 - inter) > 0 else 0.0
            if iou > 0.72:
                merged[i] = boxes_union_xyxy([b, m])
                merged_into = True
                break
        if not merged_into:
            merged.append(b)
    merged.sort(key=lambda z: (z[1], z[0]))
    return merged


# ─────────────────────────────────────────────
# GROUPING
# ─────────────────────────────────────────────
def auto_gap(image_path, base=18, ref_w=750):
    """Scale the token-grouping gap by page width relative to *ref_w* pixels;
    fall back to *base* when the image cannot be read."""
    img = cv2.imread(image_path)
    if img is None:
        return base
    return base * (img.shape[1] / ref_w)


def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=3):
    """Cluster OCR tokens into speech bubbles with union-find.

    Tokens merge when their boxes overlap/are within *gap_px*, or when their
    centers are close both euclidean- and vertically. Returns four dicts keyed
    by bubble id: text lines, padded xyxy box, token quads, token indices.
    """
    n = len(ocr)
    if n == 0:
        return {}, {}, {}, {}
    boxes = [quad_bbox(r[0]) for r in ocr]
    centers = [quad_center(r[0]) for r in ocr]
    hs = [max(1.0, b[3] - b[1]) for b in boxes]
    med_h = float(np.median(hs)) if hs else 12.0
    dist_thresh = max(20.0, med_h * 2.2)

    # Union-find with path halving.
    p = list(range(n))

    def find(x):
        while p[x] != x:
            p[x] = p[p[x]]
            x = p[x]
        return x

    def unite(a, b):
        p[find(a)] = find(b)

    for i in range(n):
        for j in range(i + 1, n):
            if overlap_or_near(boxes[i], boxes[j], gap=gap_px):
                unite(i, j)
                continue
            cx1, cy1 = centers[i]
            cx2, cy2 = centers[j]
            d = ((cx1 - cx2) ** 2 + (cy1 - cy2) ** 2) ** 0.5
            if d <= dist_thresh and abs(cy1 - cy2) <= med_h * 3.0:
                unite(i, j)

    groups = {}
    for i in range(n):
        groups.setdefault(find(i), []).append(i)
    # Stable bubble ids: sort clusters by top-left corner.
    sorted_groups = sorted(groups.values(),
                           key=lambda idxs: (min(boxes[i][1] for i in idxs),
                                             min(boxes[i][0] for i in idxs)))

    bubbles = {}
    bubble_boxes = {}
    bubble_quads = {}
    bubble_indices = {}
    ih, iw = image_shape[:2]
    for bid, idxs in enumerate(sorted_groups, start=1):
        idxs = sorted(idxs, key=lambda k: boxes[k][1])
        lines = build_lines_from_indices(idxs, ocr)
        quads = [ocr[k][0] for k in idxs]
        ub = boxes_union_xyxy([quad_bbox(q) for q in quads])
        if ub is None:
            continue
        x1, y1, x2, y2 = ub
        x1 = max(0, x1 - bbox_padding); y1 = max(0, y1 - bbox_padding)
        x2 = min(iw, x2 + bbox_padding); y2 = min(ih, y2 + bbox_padding)
        bubbles[bid] = lines
        bubble_boxes[bid] = (x1, y1, x2, y2)
        bubble_quads[bid] = quads
        bubble_indices[bid] = idxs
    return bubbles, bubble_boxes, bubble_quads, bubble_indices


# ─────────────────────────────────────────────
# DEBUG
# ─────────────────────────────────────────────
def save_debug_clusters(image_path, ocr, bubble_boxes, bubble_indices,
                        out_path="debug_clusters.png"):
    """Draw token quads (gray), bubble boxes (green, labeled) and line boxes
    (yellow) onto a copy of the page image; write it to *out_path*."""
    img = cv2.imread(image_path)
    if img is None:
        return
    # token quads
    for bbox, txt, conf in ocr:
        pts = np.array(bbox, dtype=np.int32)
        cv2.polylines(img, [pts], True, (180, 180, 180), 1)
    # bubble boxes + yellow line boxes
    for bid, bb in bubble_boxes.items():
        x1, y1, x2, y2 = bb
        cv2.rectangle(img, (x1, y1), (x2, y2), (0, 220, 0), 2)
        cv2.putText(img, f"BOX#{bid}", (x1 + 2, y1 + 16),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 220, 0), 2)
        idxs = bubble_indices.get(bid, [])
        line_boxes = build_line_boxes_from_indices(idxs, ocr)
        for lb in line_boxes:
            lx1, ly1, lx2, ly2 = lb
            lx1 = max(0, int(lx1)); ly1 = max(0, int(ly1))
            lx2 = min(img.shape[1] - 1, int(lx2))
            ly2 = min(img.shape[0] - 1, int(ly2))
            cv2.rectangle(img, (lx1, ly1), (lx2, ly2), (0, 255, 255), 3)
    cv2.imwrite(out_path, img)


# ─────────────────────────────────────────────
# EXPORT
# ─────────────────────────────────────────────
def estimate_reading_order(bbox_dict, mode="ltr"):
    """Assign 1-based reading order to bubbles: rows by y-center (90 px
    tolerance), then left-to-right or right-to-left ("rtl") within each row.
    Returns {bubble_id: order}."""
    items = []
    for bid, (x1, y1, x2, y2) in bbox_dict.items():
        cx = (x1 + x2) / 2.0
        cy = (y1 + y2) / 2.0
        items.append((bid, cx, cy))
    items.sort(key=lambda t: t[2])
    rows = []
    tol = 90
    for it in items:
        placed = False
        for r in rows:
            if abs(it[2] - r["cy"]) <= tol:
                r["items"].append(it)
                r["cy"] = float(np.mean([x[2] for x in r["items"]]))
                placed = True
                break
        if not placed:
            rows.append({"cy": it[2], "items": [it]})
    rows.sort(key=lambda r: r["cy"])
    order = []
    for r in rows:
        r["items"].sort(key=lambda x: x[1], reverse=(mode == "rtl"))
        order.extend([z[0] for z in r["items"]])
    return {bid: i + 1 for i, bid in enumerate(order)}


def export_bubbles(filepath, bbox_dict, quads_dict, indices_dict, ocr,
                   reading_map, image_shape):
    """Write per-bubble geometry (bubble box, token quads, line boxes, reading
    order) as JSON to *filepath*."""
    out = {}
    for bid, bb in bbox_dict.items():
        x1, y1, x2, y2 = bb
        quads = quads_dict.get(bid, [])
        idxs = indices_dict.get(bid, [])
        qboxes = [quad_bbox(q) for q in quads]
        text_union = boxes_union_xyxy(qboxes)
        line_boxes_xyxy = build_line_boxes_from_indices(idxs, ocr)
        line_union_xyxy = boxes_union_xyxy(line_boxes_xyxy)
        line_union_area = bbox_area_xyxy(line_union_xyxy)
        out[str(bid)] = {
            "x": int(x1), "y": int(y1),
            "w": int(x2 - x1), "h": int(y2 - y1),
            "reading_order": int(reading_map.get(bid, bid)),
            "quad_bboxes": [{"x": int(b[0]), "y": int(b[1]),
                             "w": int(b[2] - b[0]), "h": int(b[3] - b[1])}
                            for b in qboxes],
            "quads": [[[int(p[0]), int(p[1])] for p in q] for q in quads],
            "text_bbox": xyxy_to_xywh(text_union),
            # yellow geometry
            "line_bboxes": [xyxy_to_xywh(lb) for lb in line_boxes_xyxy],
            "line_union_bbox": xyxy_to_xywh(line_union_xyxy) if line_union_xyxy else None,
            "line_union_area": int(line_union_area),
        }
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(out, f, indent=2, ensure_ascii=False)


# ─────────────────────────────────────────────
# MAIN
# ─────────────────────────────────────────────
def translate_manga_text(
    image_path,
    source_lang="en",
    target_lang="ca",
    confidence_threshold=0.12,
    min_text_length=1,
    gap_px="auto",
    filter_sound_effects=True,
    quality_threshold=0.62,
    export_to_file="output.txt",
    export_bubbles_to="bubbles.json",
    reading_mode="ltr",
    debug=True
):
    """End-to-end pipeline: OCR a manga page, filter noise/SFX/titles, cluster
    tokens into bubbles, re-OCR low-quality bubbles, translate, and export a
    text table plus bubble-geometry JSON (and a debug overlay when *debug*)."""
    image = cv2.imread(image_path)
    if image is None:
        print(f"❌ Cannot load image: {image_path}")
        return

    resolved_gap = auto_gap(image_path) if gap_px == "auto" else float(gap_px)

    print("Loading OCR...")
    # EasyOCR has no Catalan model; approximate with English + Spanish.
    ocr_lang_list = ["en", "es"] if source_lang == "ca" else [source_lang]
    reader = easyocr.Reader(ocr_lang_list)

    print("Running OCR...")
    raw = reader.readtext(image_path, paragraph=False)
    print(f"Raw detections: {len(raw)}")

    filtered = []
    skipped = 0
    ih, iw = image.shape[:2]
    for bbox, text, conf in raw:
        t = normalize_text(text)
        qb = quad_bbox(bbox)
        if conf < confidence_threshold:
            skipped += 1
            continue
        if len(t) < min_text_length:
            skipped += 1
            continue
        if is_noise_text(t):
            skipped += 1
            continue
        if filter_sound_effects and is_sound_effect(t):
            skipped += 1
            continue
        if is_title_text(t):
            skipped += 1
            continue
        # Low-confidence long text inside the top title band is likely a header.
        if qb[1] < int(ih * TOP_BAND_RATIO):
            if conf < 0.70 and len(t) >= 5:
                skipped += 1
                continue
        filtered.append((bbox, t, conf))

    print(f"Kept: {len(filtered)} | Skipped: {skipped}")
    if not filtered:
        print("⚠️ No text after filtering.")
        return

    bubbles, bubble_boxes, bubble_quads, bubble_indices = group_tokens(
        filtered, image.shape, gap_px=resolved_gap, bbox_padding=3
    )

    if debug:
        save_debug_clusters(
            image_path=image_path,
            ocr=filtered,
            bubble_boxes=bubble_boxes,
            bubble_indices=bubble_indices,
            out_path="debug_clusters.png"
        )

    translator = GoogleTranslator(source=source_lang, target=target_lang)

    clean_lines = {}
    for bid, lines in bubbles.items():
        txt = normalize_text(" ".join(lines))
        q = ocr_quality_score(txt)
        if q < quality_threshold:
            # Low-quality read: retry on an upscaled, preprocessed crop.
            reread = reread_crop(image, bubble_boxes[bid], reader,
                                 upscale=2.5, pad=18)
            if reread:
                txt = normalize_text(reread)
        clean_lines[bid] = apply_glossary(txt)

    reading_map = estimate_reading_order(bubble_boxes, mode=reading_mode)

    divider = "─" * 120
    out_lines = ["BUBBLE|ORDER|ORIGINAL|TRANSLATED|FLAGS", divider]
    print(divider)
    print(f"{'BUBBLE':<8} {'ORDER':<6} {'ORIGINAL':<50} {'TRANSLATED':<50} FLAGS")
    print(divider)

    translated_count = 0
    for bid in sorted(clean_lines.keys(), key=lambda x: reading_map.get(x, x)):
        src = clean_lines[bid].strip()
        if not src:
            continue
        flags = []
        try:
            tgt = translator.translate(src) or ""
        except Exception as e:
            tgt = f"[Translation error: {e}]"
        tgt = apply_glossary(postprocess_translation_general(tgt)).upper()
        src_u = src.upper()
        out_lines.append(
            f"#{bid}|{reading_map.get(bid, bid)}|{src_u}|{tgt}|"
            f"{','.join(flags) if flags else '-'}"
        )
        print(f"#{bid:<7} {reading_map.get(bid, bid):<6} {src_u[:50]:<50} "
              f"{tgt[:50]:<50} {','.join(flags) if flags else '-'}")
        translated_count += 1

    out_lines.append(divider)
    out_lines.append(
        f"✅ Done! {translated_count} bubble(s) translated, "
        f"{skipped} detection(s) skipped."
    )
    with open(export_to_file, "w", encoding="utf-8") as f:
        f.write("\n".join(out_lines))

    export_bubbles(
        export_bubbles_to,
        bbox_dict=bubble_boxes,
        quads_dict=bubble_quads,
        indices_dict=bubble_indices,
        ocr=filtered,
        reading_map=reading_map,
        image_shape=image.shape
    )

    print(divider)
    print(f"Saved: {export_to_file}")
    print(f"Saved: {export_bubbles_to}")
    if debug:
        print("Saved: debug_clusters.png (special chars included in yellow boxes)")


if __name__ == "__main__":
    translate_manga_text(
        image_path="002-page.png",
        source_lang="en",
        target_lang="ca",
        confidence_threshold=0.12,
        min_text_length=1,
        gap_px="auto",
        filter_sound_effects=True,
        quality_threshold=0.62,
        export_to_file="output.txt",
        export_bubbles_to="bubbles.json",
        reading_mode="ltr",
        debug=True
    )