import re
import os
import json

import cv2
import numpy as np
import easyocr
from deep_translator import GoogleTranslator

# ─────────────────────────────────────────────
# LANGUAGE CODE REFERENCE
# ─────────────────────────────────────────────
SUPPORTED_LANGUAGES = {
    "Vietnamese": "vi",
    "Japanese": "ja",
    "English": "en",
    "Spanish": "es",
    "Korean": "ko",
    "Chinese (Simplified)": "ch_sim",
    "Chinese (Traditional)": "ch_tra",
    "French": "fr",
    "German": "de",
    "Italian": "it",
    "Portuguese": "pt",
    "Arabic": "ar",
    "Russian": "ru",
    "Thai": "th",
    "Catalan": "ca",
}

# ─────────────────────────────────────────────
# SOUND EFFECT FILTER
# ─────────────────────────────────────────────
SOUND_EFFECT_PATTERNS = [
    r"^b+i+p+$", r"^sha+$", r"^ha+$", r"^ah+$", r"^oh+$",
    r"^ugh+$", r"^gr+$", r"^bam+$", r"^pow+$", r"^crash+$",
    r"^boom+$", r"^bang+$", r"^crack+$", r"^whoosh+$", r"^thud+$",
    r"^snap+$", r"^zip+$", r"^swoosh+$", r"^chirp+$", r"^tweet+$",
]


def is_sound_effect(text):
    """True when *text* looks like an onomatopoeic sound effect."""
    # Strip everything but lowercase letters so "B-I-P!" matches "^b+i+p+$".
    letters_only = re.sub(r"[^a-z]", "", text.strip().lower())
    for pattern in SOUND_EFFECT_PATTERNS:
        if re.fullmatch(pattern, letters_only, re.IGNORECASE):
            return True
    return False


# ─────────────────────────────────────────────
# TITLE / LOGO / AUTHOR FILTER
# ─────────────────────────────────────────────
TITLE_PATTERNS = [
    r"^(mission|chapter|episode|vol\.?|volume)\s*\d+$",
    r"^(spy|family|spy.family)$",
    r"^by\s+.+$",                        # "BY TATSUYA ENDO"
    r"^[a-z]{1,4}\s+[a-z]+\s+[a-z]+$",   # short author-style lines
]


def is_title_text(text):
    """True when *text* looks like a chapter heading, logo, or author credit."""
    candidate = text.strip().lower()
    for pattern in TITLE_PATTERNS:
        if re.fullmatch(pattern, candidate, re.IGNORECASE):
            return True
    return False


# ─────────────────────────────────────────────
# GARBAGE TOKEN FILTER
# Catches OCR misreads that are mostly
# non-alpha or suspiciously short/mangled
# ─────────────────────────────────────────────
GARBAGE_PATTERNS = [
    r"^[^a-zA-Z]*$",    # no letters at all
    r"^.{1,2}$",        # 1-2 char tokens
    r".*\d+.*",         # contains digits (YO4, HLNGRY etc.)
    r"^[A-Z]{1,4}$",    # isolated caps abbreviations (IILK)
]


def is_garbage(text):
    """True when *text* matches any OCR-garbage heuristic.

    NOTE: callers feed uppercased tokens, so the last pattern also
    drops any all-caps word of 4 letters or fewer.
    """
    token = text.strip()
    for pattern in GARBAGE_PATTERNS:
        if re.fullmatch(pattern, token):
            return True
    return False


# ─────────────────────────────────────────────
# TOKEN CLASSIFIER
# ─────────────────────────────────────────────
def classify_token(text, confidence, confidence_threshold,
                   min_text_length, filter_sound_effects):
    """
    Returns one of: "alpha" | "punct" | "noise"
    """
    token = text.strip()

    # Noise guards, cheapest first.
    if confidence < confidence_threshold:
        return "noise"
    if len(token) < min_text_length:
        return "noise"
    if re.fullmatch(r"\d+", token):
        return "noise"
    if len(token) == 1 and not token.isalpha():
        return "noise"
    if filter_sound_effects and is_sound_effect(token):
        return "noise"
    if is_title_text(token) or is_garbage(token):
        return "noise"

    # A survivor with no letters at all is punctuation-only.
    has_letter = any(ch.isalpha() for ch in token)
    return "alpha" if has_letter else "punct"


def should_keep_token(text, confidence, confidence_threshold,
                      min_text_length, filter_sound_effects):
    """Return ``(keep, category)`` for one OCR token."""
    category = classify_token(text, confidence, confidence_threshold,
                              min_text_length, filter_sound_effects)
    return category != "noise", category


# ─────────────────────────────────────────────
# QUAD HELPERS
# ─────────────────────────────────────────────
def quad_bbox(quad):
    """Axis-aligned (x1, y1, x2, y2) bounds of one 4-point quad."""
    xs = [point[0] for point in quad]
    ys = [point[1] for point in quad]
    return min(xs), min(ys), max(xs), max(ys)


def quads_bbox(quads, image_shape, padding_px=10):
    """Padded bbox around every point of every quad, clamped to the image."""
    img_h, img_w = image_shape[:2]
    points = [point for quad in quads for point in quad]
    left = min(p[0] for p in points)
    top = min(p[1] for p in points)
    right = max(p[0] for p in points)
    bottom = max(p[1] for p in points)
    return (max(0, left - padding_px),
            max(0, top - padding_px),
            min(img_w, right + padding_px),
            min(img_h, bottom + padding_px))


def bboxes_overlap_or_touch(a, b, gap_px=0):
    """True when bboxes *a* and *b* overlap or lie within *gap_px* pixels."""
    ax1, ay1, ax2, ay2 = a
    bx1, by1, bx2, by2 = b
    # Per-axis separation; 0 when the projections already overlap.
    horizontal_gap = max(0, max(ax1, bx1) - min(ax2, bx2))
    vertical_gap = max(0, max(ay1, by1) - min(ay2, by2))
    return horizontal_gap <= gap_px and vertical_gap <= gap_px


# ─────────────────────────────────────────────
# OVERLAP-BASED GROUPING (Union-Find)
# ─────────────────────────────────────────────
def group_quads_by_overlap(ocr_results, image_shape,
                           gap_px=18, bbox_padding=10,
                           row_band_px=150):
    """Cluster OCR tokens into speech-bubble groups via union-find.

    Two tokens join the same group when their bounding boxes overlap or
    sit within ``gap_px`` pixels of each other.

    Parameters
    ----------
    ocr_results : list of ``(quad, text, confidence)`` triples, as
        produced by easyocr ``readtext(..., paragraph=False)``.
    image_shape : numpy-style ``(h, w[, c])`` shape used to clamp the
        padded group bboxes.
    gap_px : maximum pixel gap that still counts as "touching".
    bbox_padding : padding around each group's combined bbox.
    row_band_px : height of the coarse horizontal bands used to order
        groups top-to-bottom before left-to-right (generalizes the
        previously hard-coded 150).

    Returns
    -------
    ``(bubble_dict, bbox_dict, ocr_quads)`` keyed by 1-based group id:
        bubble_dict -- text lines per group (punctuation-only tokens
                       appended to the vertically closest text line),
        bbox_dict   -- padded ``(x1, y1, x2, y2)`` per group,
        ocr_quads   -- the member token quads per group.
    """
    n = len(ocr_results)
    if n == 0:
        return {}, {}, {}

    token_bboxes = [quad_bbox(r[0]) for r in ocr_results]

    # Union-find with path halving.
    parent = list(range(n))

    def find(x):
        while parent[x] != x:
            parent[x] = parent[parent[x]]
            x = parent[x]
        return x

    def union(x, y):
        parent[find(x)] = find(y)

    # O(n^2) pairwise merge — fine for the few dozen tokens on a page.
    for i in range(n):
        for j in range(i + 1, n):
            if bboxes_overlap_or_touch(token_bboxes[i], token_bboxes[j],
                                       gap_px=gap_px):
                union(i, j)

    groups = {}
    for i in range(n):
        groups.setdefault(find(i), []).append(i)

    # Reading order: coarse top-to-bottom bands, then left-to-right.
    def group_sort_key(indices):
        ys = [token_bboxes[i][1] for i in indices]
        xs = [token_bboxes[i][0] for i in indices]
        return (min(ys) // row_band_px, min(xs))

    sorted_groups = sorted(groups.values(), key=group_sort_key)

    bubble_dict = {}
    bbox_dict = {}
    ocr_quads = {}
    for gid, indices in enumerate(sorted_groups, start=1):
        indices_sorted = sorted(indices, key=lambda i: token_bboxes[i][1])
        quads = [ocr_results[i][0] for i in indices_sorted]
        raw_texts = [ocr_results[i][1] for i in indices_sorted]

        # Split tokens with letters from punctuation-only tokens,
        # remembering each token's vertical center.
        alpha_lines = []
        punct_tokens = []
        for i in indices_sorted:
            _, text, _ = ocr_results[i]
            yc = (token_bboxes[i][1] + token_bboxes[i][3]) / 2.0
            if any(ch.isalpha() for ch in text):
                alpha_lines.append((yc, text))
            else:
                punct_tokens.append((yc, text))

        # Glue each punctuation token onto the vertically closest line.
        for pcy, ptext in punct_tokens:
            if alpha_lines:
                closest = min(
                    range(len(alpha_lines)),
                    key=lambda k: abs(alpha_lines[k][0] - pcy),
                )
                yc_a, text_a = alpha_lines[closest]
                alpha_lines[closest] = (yc_a, text_a + ptext)

        # Fall back to raw token texts when a group has no alpha lines.
        text_lines = [t for _, t in alpha_lines] or raw_texts
        bubble_dict[gid] = text_lines
        ocr_quads[gid] = quads
        bbox_dict[gid] = quads_bbox(quads, image_shape,
                                    padding_px=bbox_padding)

        b = bbox_dict[gid]
        print(f" Group #{gid}: {len(quads)} quad(s) "
              f"bbox=({int(b[0])},{int(b[1])})→"
              f"({int(b[2])},{int(b[3])}) "
              f"w={int(b[2]-b[0])} h={int(b[3]-b[1])} "
              f"text={text_lines}")

    return bubble_dict, bbox_dict, ocr_quads
# ─────────────────────────────────────────────
# HYPHEN REMOVAL
# ─────────────────────────────────────────────
def fix_hyphens(lines):
    """Merge OCR text lines into one uppercase string.

    A line ending in "-" is treated as a line-break hyphen: the hyphen
    is dropped and the next line glued on directly; otherwise lines are
    joined with a single space.  Returns "" for an empty list.
    """
    if not lines:
        return ""
    merged = lines[0]
    for line in lines[1:]:
        line = line.strip()
        if merged.endswith("-"):
            merged = merged[:-1] + line   # un-hyphenate across the break
        else:
            merged = merged + " " + line
    return re.sub(r" {2,}", " ", merged).strip().upper()


# ─────────────────────────────────────────────
# CROP-BASED OCR RE-READ
# ─────────────────────────────────────────────
def reread_cluster_crop(image, bbox, reader,
                        padding_px=20, upscale_factor=2.5):
    """Re-run OCR on an upscaled, sharpened crop of *image*.

    Used when a bubble's full-page OCR quality score is poor.  Returns
    the merged text (via fix_hyphens) or None when the crop is empty or
    the re-read finds nothing.
    """
    import tempfile  # stdlib; local import keeps the file header untouched

    img_h, img_w = image.shape[:2]
    x1, y1, x2, y2 = bbox
    x1 = max(0, int(x1) - padding_px)
    y1 = max(0, int(y1) - padding_px)
    x2 = min(img_w, int(x2) + padding_px)
    y2 = min(img_h, int(y2) + padding_px)
    crop = image[y1:y2, x1:x2]
    if crop.size == 0:
        return None

    # Upscale + 3x3 sharpening kernel to help OCR on small lettering.
    new_w = int(crop.shape[1] * upscale_factor)
    new_h = int(crop.shape[0] * upscale_factor)
    upscaled = cv2.resize(crop, (new_w, new_h),
                          interpolation=cv2.INTER_CUBIC)
    kernel = np.array([[0, -1, 0],
                       [-1, 5, -1],
                       [0, -1, 0]])
    sharpened = cv2.filter2D(upscaled, -1, kernel)

    # FIX: unique temp file instead of a fixed "_temp_crop_ocr.png" in
    # the CWD, so concurrent runs cannot clobber each other's crops.
    fd, temp_path = tempfile.mkstemp(suffix=".png")
    os.close(fd)
    try:
        cv2.imwrite(temp_path, sharpened)
        crop_results = reader.readtext(temp_path, paragraph=False)
    finally:
        if os.path.exists(temp_path):
            os.remove(temp_path)

    if not crop_results:
        return None
    crop_results.sort(key=lambda r: r[0][0][1])  # top-to-bottom by quad y
    lines = [t.strip().upper() for _, t, _ in crop_results if t.strip()]
    return fix_hyphens(lines) if lines else None


# ─────────────────────────────────────────────
# AUTO GAP
# ─────────────────────────────────────────────
def compute_auto_gap(image_path, base_gap=18, reference_width=750):
    """Scale the grouping gap linearly with image width.

    Falls back to *base_gap* when the image cannot be loaded.
    """
    image = cv2.imread(image_path)
    if image is None:
        return base_gap
    img_w = image.shape[1]
    scaled = base_gap * (img_w / reference_width)
    print(f" ℹ️ Image width: {img_w}px → auto gap: {scaled:.1f}px")
    return scaled


# ─────────────────────────────────────────────
# OCR QUALITY SCORE
# ─────────────────────────────────────────────
def ocr_quality_score(text):
    """Score OCR text in [0, 1]: alpha ratio minus garbage penalties."""
    if not text or len(text) < 2:
        return 0.0
    alpha_ratio = sum(1 for c in text if c.isalpha()) / len(text)
    # Each matching garbage pattern costs 0.2.
    garbage = [r",,", r"\.\.-", r"[^\w\s\'\!\?\.,-]{2,}"]
    penalty = sum(0.2 for p in garbage if re.search(p, text))
    return max(0.0, min(1.0, alpha_ratio - penalty))


# ─────────────────────────────────────────────
# BUBBLE JSON EXPORT
# bbox_expand_ratio: grow bbox by this fraction
# of its own size in each direction to better
# approximate the full speech bubble boundary.
# ─────────────────────────────────────────────
def export_bubble_boxes(bbox_dict, ocr_quads_dict, filepath="bubbles.json",
                        bbox_expand_ratio=0.35, image_shape=None):
    """Write per-bubble boxes (expanded + tight + quads) to JSON."""
    export = {}
    for bubble_id, (x1, y1, x2, y2) in bbox_dict.items():
        quads = ocr_quads_dict.get(bubble_id, [])

        # ── Expand bbox to approximate full bubble ────────────────
        w_orig = x2 - x1
        h_orig = y2 - y1
        pad_x = int(w_orig * bbox_expand_ratio)
        pad_y = int(h_orig * bbox_expand_ratio)

        # Clamp to image bounds if image_shape provided
        if image_shape is not None:
            img_h, img_w = image_shape[:2]
            ex1 = max(0, x1 - pad_x)
            ey1 = max(0, y1 - pad_y)
            ex2 = min(img_w, x2 + pad_x)
            ey2 = min(img_h, y2 + pad_y)
        else:
            ex1 = x1 - pad_x
            ey1 = y1 - pad_y
            ex2 = x2 + pad_x
            ey2 = y2 + pad_y

        # FIX: compute each quad's bbox once instead of four times.
        quad_boxes = [quad_bbox(q) for q in quads]
        export[str(bubble_id)] = {
            "x": int(ex1),
            "y": int(ey1),
            "w": int(ex2 - ex1),
            "h": int(ey2 - ey1),
            # Original tight bbox kept for reference
            "x_tight": int(x1),
            "y_tight": int(y1),
            "w_tight": int(w_orig),
            "h_tight": int(h_orig),
            "quad_bboxes": [
                {
                    "x": int(qx1),
                    "y": int(qy1),
                    "w": int(qx2 - qx1),
                    "h": int(qy2 - qy1),
                }
                for qx1, qy1, qx2, qy2 in quad_boxes
            ],
            "quads": [[[int(pt[0]), int(pt[1])] for pt in quad]
                      for quad in quads],
        }

    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(export, f, indent=2, ensure_ascii=False)

    print(f"\n📦 Bubble boxes saved → {filepath}")
    for bid, v in export.items():
        print(f" #{bid}: expanded=({v['x']},{v['y']}) "
              f"{v['w']}×{v['h']}px "
              f"tight={v['w_tight']}×{v['h_tight']}px "
              f"[{len(v['quads'])} quad(s)]")


# ─────────────────────────────────────────────
# OUTPUT.TXT WRITER
# Uses a pipe | as unambiguous delimiter
# Format: #ID|ORIGINAL|TRANSLATED
# ─────────────────────────────────────────────
def write_output(output_lines, filepath):
    """Write the pipe-delimited translation lines to *filepath*."""
    with open(filepath, "w", encoding="utf-8") as f:
        f.write("\n".join(output_lines))
    print(f"📄 Translations saved → {filepath}")


# ─────────────────────────────────────────────
# DEBUG IMAGE
# ─────────────────────────────────────────────
def save_debug_clusters(image_path, ocr_results, bubble_dict, bbox_dict):
    """Render token quads and group boxes onto debug_clusters.png.

    NOTE(review): tokens are matched back to their bubble by text value,
    so duplicate texts across bubbles all take the first bubble's color
    — cosmetic only.
    """
    image = cv2.imread(image_path)
    if image is None:
        return
    np.random.seed(42)  # deterministic colors run-to-run
    num_bubbles = max(bubble_dict.keys(), default=1)
    colors = [
        tuple(int(c) for c in col)
        for col in np.random.randint(50, 230, size=(num_bubbles + 2, 3))
    ]

    text_to_bubble = {}
    for bubble_id, lines in bubble_dict.items():
        for line in lines:
            text_to_bubble[line] = bubble_id

    for bbox, text, _ in ocr_results:
        bubble_id = text_to_bubble.get(text, 0)
        color = colors[(bubble_id - 1) % len(colors)]
        pts = np.array(bbox, dtype=np.int32)
        cv2.polylines(image, [pts], isClosed=True, color=color, thickness=1)

    for bubble_id, (x1, y1, x2, y2) in bbox_dict.items():
        color = colors[(bubble_id - 1) % len(colors)]
        cv2.rectangle(image, (int(x1), int(y1)),
                      (int(x2), int(y2)), color, 2)
        cv2.putText(image, f"BOX#{bubble_id}",
                    (int(x1) + 2, int(y1) + 16),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

    cv2.imwrite("debug_clusters.png", image)
    print(" 🐛 debug_clusters.png saved")


# ─────────────────────────────────────────────
# CORE FUNCTION
# ─────────────────────────────────────────────
def translate_manga_text(
    image_path,
    source_lang="en",
    target_lang="ca",
    confidence_threshold=0.10,
    export_to_file=None,
    export_bubbles_to="bubbles.json",
    min_text_length=2,
    gap_px="auto",
    filter_sound_effects=True,
    quality_threshold=0.5,
    upscale_factor=2.5,
    bbox_padding=10,
    debug=False,
):
    """OCR a manga page, group text into bubbles, and translate them.

    Side effects: prints progress, optionally writes the translation
    file, the bubbles JSON, and a debug overlay image.
    """
    # ── 1. Resolve gap ────────────────────────────────────────────
    if gap_px == "auto":
        resolved_gap = compute_auto_gap(image_path)
    else:
        resolved_gap = float(gap_px)

    # ── 2. Load full image ────────────────────────────────────────
    full_image = cv2.imread(image_path)
    if full_image is None:
        print(f"❌ Could not load image: {image_path}")
        return

    # ── 3. Initialize OCR ─────────────────────────────────────────
    print("\nLoading OCR model...")
    # NOTE(review): "ca" is mapped to en+es OCR models — presumably
    # because easyocr lacks a Catalan model; confirm before changing.
    ocr_lang_list = ["en", "es"] if source_lang == "ca" \
        else [source_lang]
    reader = easyocr.Reader(ocr_lang_list)

    # ── 4. Initialize translator ──────────────────────────────────
    translator = GoogleTranslator(source=source_lang, target=target_lang)

    # ── 5. Run OCR ────────────────────────────────────────────────
    print(f"\nRunning OCR on: {image_path}")
    results = reader.readtext(image_path, paragraph=False)
    print(f" Raw detections: {len(results)}")

    # ── 6. Filter tokens ──────────────────────────────────────────
    filtered = []
    skipped = 0
    for bbox, text, confidence in results:
        cleaned = text.strip().upper()
        keep, category = should_keep_token(
            cleaned, confidence, confidence_threshold,
            min_text_length, filter_sound_effects
        )
        if keep:
            filtered.append((bbox, cleaned, confidence))
            if category == "punct":
                print(f" ✔ Punct kept: '{cleaned}'")
        else:
            # Diagnostic tag only — the token is dropped either way.
            tag = ("🔇 SFX" if is_sound_effect(cleaned)
                   else "🏷 Title" if is_title_text(cleaned)
                   else "🗑 Garbage" if is_garbage(cleaned)
                   else "✂️ Low-conf")
            print(f" {tag} skipped: '{cleaned}'")
            skipped += 1

    print(f" ✅ {len(filtered)} kept, {skipped} skipped.\n")
    if not filtered:
        print("⚠️ No text detected after filtering.")
        return

    # ── 7. Group by overlap ───────────────────────────────────────
    print(f"Grouping by overlap "
          f"(gap_px={resolved_gap:.1f}, "
          f"bbox_padding={bbox_padding}px)...")
    bubble_dict, bbox_dict, ocr_quads = group_quads_by_overlap(
        filtered,
        image_shape=full_image.shape,
        gap_px=resolved_gap,
        bbox_padding=bbox_padding,
    )
    print(f" ✅ {len(bubble_dict)} bubble(s) detected.\n")

    # ── 8. Debug ──────────────────────────────────────────────────
    if debug:
        save_debug_clusters(image_path, filtered, bubble_dict, bbox_dict)

    # ── 9. Fix hyphens ────────────────────────────────────────────
    clean_bubbles = {
        i: fix_hyphens(lines)
        for i, lines in bubble_dict.items()
        if lines
    }

    # ── 10. Quality check + crop re-read ──────────────────────────
    print("Checking OCR quality per bubble...")
    for i, text in clean_bubbles.items():
        score = ocr_quality_score(text)
        status = "✅" if score >= quality_threshold else "🔁"
        print(f" #{i}: score={score:.2f} {status} "
              f"'{text[:55]}'")
        if score < quality_threshold:
            print(f" → Re-reading #{i} from crop...")
            reread = reread_cluster_crop(
                full_image, bbox_dict[i], reader,
                upscale_factor=upscale_factor,
            )
            if reread:
                print(f" → '{reread}'")
                clean_bubbles[i] = reread
            else:
                print(" → Nothing found, keeping original.")

    # ── 11. Translate ─────────────────────────────────────────────
    # Output format (pipe-delimited, unambiguous):
    #   #ID|ORIGINAL TEXT|TRANSLATED TEXT
    print()
    header = "BUBBLE|ORIGINAL|TRANSLATED"
    divider = "─" * 80
    output_lines = [header, divider]
    translations = {}
    translated_count = 0
    print(f"{'BUBBLE':<8} {'ORIGINAL':<45} {'TRANSLATED'}")
    print(divider)
    for i in sorted(clean_bubbles.keys()):
        bubble_text = clean_bubbles[i].strip()
        if not bubble_text:
            continue
        # Best-effort: a failed translation becomes an inline marker
        # instead of aborting the whole page.
        try:
            result = translator.translate(bubble_text)
        except Exception as e:
            result = f"[Translation error: {e}]"
        if result is None:
            result = "[No translation returned]"
        result = result.upper()
        translations[i] = result
        translated_count += 1
        # Pipe-delimited line — safe regardless of text content
        output_lines.append(f"#{i}|{bubble_text}|{result}")
        print(f"#{i:<7} {bubble_text:<45} {result}")

    output_lines.append(divider)
    summary = (f"✅ Done! {translated_count} bubble(s) "
               f"translated, {skipped} detection(s) skipped.")
    output_lines.append(summary)
    print(divider)
    print(summary)

    # ── 12. Export translations ───────────────────────────────────
    if export_to_file:
        write_output(output_lines, export_to_file)

    # ── 13. Export bubble boxes ───────────────────────────────────
    if export_bubbles_to:
        export_bubble_boxes(
            bbox_dict,
            ocr_quads,
            filepath=export_bubbles_to,
            bbox_expand_ratio=0.1,  # ← tune this
            image_shape=full_image.shape,
        )


# ─────────────────────────────────────────────
# ENTRY POINT
# ─────────────────────────────────────────────
if __name__ == "__main__":
    translate_manga_text(
        image_path="002-page.jpg",
        source_lang="en",
        target_lang="ca",
        confidence_threshold=0.10,
        min_text_length=2,
        export_to_file="output.txt",
        export_bubbles_to="bubbles.json",
        gap_px="auto",
        filter_sound_effects=True,
        quality_threshold=0.5,
        upscale_factor=2.5,
        bbox_padding=1,
        debug=True,
    )