From 555892348f8a480fd582fed39316b30a0f9ef086 Mon Sep 17 00:00:00 2001 From: Guillem Hernandez Sola Date: Sat, 11 Apr 2026 14:00:07 +0200 Subject: [PATCH] Added new --- bubbles.json | 420 +++++++++++++++++++++++++++++++++++++--- manga-renderer.py | 359 ++++++++++++++++------------------ manga-translator.py | 456 ++++++++++++++++++++++++++------------------ 3 files changed, 836 insertions(+), 399 deletions(-) diff --git a/bubbles.json b/bubbles.json index 41eafae..184c79f 100644 --- a/bubbles.json +++ b/bubbles.json @@ -1,38 +1,410 @@ { "1": { - "x": 251, - "y": 149, - "w": 60, - "h": 60 + "x": 204, + "y": 137, + "w": 153, + "h": 82, + "quads": [ + [ + [ + 204, + 172 + ], + [ + 348, + 137 + ], + [ + 358, + 185 + ], + [ + 215, + 220 + ] + ] + ] }, "2": { - "x": 1202, - "y": 226, - "w": 61, - "h": 159 + "x": 1167, + "y": 240, + "w": 132, + "h": 134, + "quads": [ + [ + [ + 1214, + 240 + ], + [ + 1252, + 240 + ], + [ + 1252, + 272 + ], + [ + 1214, + 272 + ] + ], + [ + [ + 1167, + 271 + ], + [ + 1299, + 271 + ], + [ + 1299, + 307 + ], + [ + 1167, + 307 + ] + ], + [ + [ + 1175, + 303 + ], + [ + 1289, + 303 + ], + [ + 1289, + 339 + ], + [ + 1175, + 339 + ] + ], + [ + [ + 1206, + 340 + ], + [ + 1260, + 340 + ], + [ + 1260, + 370 + ], + [ + 1206, + 370 + ] + ] + ] }, "3": { - "x": 966, - "y": 364, - "w": 62, - "h": 156 + "x": 930, + "y": 378, + "w": 136, + "h": 132, + "quads": [ + [ + [ + 930, + 378 + ], + [ + 1062, + 378 + ], + [ + 1062, + 410 + ], + [ + 930, + 410 + ] + ], + [ + [ + 930, + 410 + ], + [ + 1066, + 410 + ], + [ + 1066, + 442 + ], + [ + 930, + 442 + ] + ], + [ + [ + 954, + 439 + ], + [ + 1041, + 439 + ], + [ + 1041, + 475 + ], + [ + 954, + 475 + ] + ], + [ + [ + 946, + 474 + ], + [ + 1050, + 474 + ], + [ + 1050, + 506 + ], + [ + 946, + 506 + ] + ] + ] }, "4": { - "x": 265, - "y": 471, - "w": 62, - "h": 230 + "x": 220, + "y": 486, + "w": 150, + "h": 210, + "quads": [ + [ + [ + 278, + 486 + ], + [ + 312, + 486 + ], + [ + 312, + 516 + ], + [ + 278, + 516 + ] + ], + [ + [ + 236, + 514 + ], + [ + 356, + 514 + ], + [ + 356, + 544 + ], + [ + 236, + 544 + ] + ], + [ + [ + 236, + 542 + ], + [ + 358, + 542 + ], + [ + 358, + 572 + ], + [ + 236, + 572 + ] + ], + [ + [ + 220, + 572 + ], + [ + 370, + 572 + ], + [ + 370, + 600 + ], + [ + 220, + 600 + ] + ], + [ + [ + 240, + 598 + ], + [ + 350, + 598 + ], + [ + 350, + 630 + ], + [ + 240, + 630 + ] + ], + [ + [ + 246, + 628 + ], + [ + 346, + 628 + ], + [ + 346, + 658 + ], + [ + 246, + 658 + ] + ], + [ + [ + 250, + 656 + ], + [ + 340, + 656 + ], + [ + 340, + 686 + ], + [ + 250, + 686 + ] + ] + ] }, "5": { - "x": 359, - "y": 1114, - "w": 72, - "h": 134 + "x": 354, + "y": 1132, + "w": 92, + "h": 102, + "quads": [ + [ + [ + 384, + 1132 + ], + [ + 418, + 1132 + ], + [ + 418, + 1156 + ], + [ + 384, + 1156 + ] + ], + [ + [ + 354, + 1154 + ], + [ + 446, + 1154 + ], + [ + 446, + 1208 + ], + [ + 354, + 1208 + ] + ], + [ + [ + 366, + 1206 + ], + [ + 412, + 1206 + ], + [ + 412, + 1230 + ], + [ + 366, + 1230 + ] + ] + ] }, "6": { - "x": 729, - "y": 1306, - "w": 60, - "h": 60 + "x": 740, + "y": 1324, + "w": 38, + "h": 24, + "quads": [ + [ + [ + 740, + 1324 + ], + [ + 778, + 1324 + ], + [ + 778, + 1348 + ], + [ + 740, + 1348 + ] + ] + ] } } \ No newline at end of file diff --git a/manga-renderer.py b/manga-renderer.py index ad1e46c..99e50a4 100644 --- a/manga-renderer.py +++ b/manga-renderer.py @@ -13,218 +13,172 @@ INPUT_IMAGE = "page.png" OUTPUT_IMAGE = "page_translated.png" TRANSLATIONS_FILE = "output.txt" BUBBLES_FILE = "bubbles.json" - FONT_PATH = "font.ttf" FONT_FALLBACK = "/System/Library/Fonts/Helvetica.ttc" FONT_COLOR = (0, 0, 0) -BUBBLE_FILL = (255, 255, 255) # ───────────────────────────────────────────── -# STEP 1: PARSE output.txt -# Robust parser: always takes the LAST -# whitespace-separated column as translation. +# PARSE output.txt # ───────────────────────────────────────────── def parse_translations(filepath): """ - Parses output.txt and returns {bubble_id: translated_text}. - - Strategy: split each #N line on 2+ consecutive spaces, - then always take the LAST token as the translation. - This is robust even when original or translated text - contains internal spaces. - - Args: - filepath : Path to output.txt - - Returns: - Dict {1: "LA NOIA ESTÀ IL·LESA!", ...} + Parses output.txt → {bubble_id: translated_text}. + Only bubbles present in the file are returned. + Absent IDs are left completely untouched on the page. """ translations = {} - with open(filepath, "r", encoding="utf-8") as f: for line in f: line = line.rstrip("\n") - - # Must start with #N - if not re.match(r"^#\d+", line.strip()): + if not re.match(r"^\s*#\d+", line): continue - - # Split on 2+ spaces → [bubble_id_col, original_col, translated_col] parts = re.split(r" {2,}", line.strip()) - if len(parts) < 3: continue - bubble_id = int(re.sub(r"[^0-9]", "", parts[0])) - translated = parts[-1].strip() # always last column - + translated = parts[-1].strip() + if translated.startswith("["): + continue translations[bubble_id] = translated - print(f" ✅ Parsed {len(translations)} translation(s) from {filepath}") + print(f" ✅ {len(translations)} bubble(s) to translate: " + f"{sorted(translations.keys())}") for bid, text in sorted(translations.items()): print(f" #{bid}: {text}") - return translations # ───────────────────────────────────────────── -# STEP 2: LOAD BUBBLE BOXES from bubbles.json -# These were saved by manga-translator.py -# and are guaranteed to match the clusters. +# LOAD bubbles.json # ───────────────────────────────────────────── def load_bubble_boxes(filepath): - """ - Loads bubble bounding boxes from bubbles.json. - - Expected format: - { - "1": {"x": 120, "y": 45, "w": 180, "h": 210}, - "2": { ... }, - ... - } - - Args: - filepath : Path to bubbles.json - - Returns: - Dict {bubble_id (int): (x, y, w, h)} - """ with open(filepath, "r", encoding="utf-8") as f: raw = json.load(f) - - boxes = {} - for key, val in raw.items(): - bubble_id = int(key) - boxes[bubble_id] = (val["x"], val["y"], val["w"], val["h"]) - - print(f" ✅ Loaded {len(boxes)} bubble box(es) from {filepath}") - for bid, (x, y, w, h) in sorted(boxes.items()): - print(f" #{bid}: ({x},{y}) {w}×{h}px") - + boxes = {int(k): v for k, v in raw.items()} + print(f" ✅ Loaded {len(boxes)} bubble(s)") + for bid, val in sorted(boxes.items()): + print(f" #{bid}: ({val['x']},{val['y']}) " + f"{val['w']}×{val['h']}px") return boxes # ───────────────────────────────────────────── -# STEP 3: ERASE BUBBLE CONTENT -# Fills a rectangular region with white. -# Uses a slightly inset rect to preserve -# the bubble border. +# SAMPLE BACKGROUND COLOR # ───────────────────────────────────────────── -def erase_bubble_rect(image, x, y, w, h, padding=6): +def sample_bubble_background(cv_image, bubble_data): """ - Fills the interior of a bounding box with white, - leaving a border of `padding` pixels intact. + Samples the dominant background color inside the bbox + by averaging the brightest 10% of pixels. + Returns (B, G, R). + """ + x = max(0, bubble_data["x"]) + y = max(0, bubble_data["y"]) + x2 = min(cv_image.shape[1], x + bubble_data["w"]) + y2 = min(cv_image.shape[0], y + bubble_data["h"]) + + region = cv_image[y:y2, x:x2] + if region.size == 0: + return (255, 255, 255) + + gray = cv2.cvtColor(region, cv2.COLOR_BGR2GRAY) + threshold = np.percentile(gray, 90) + bg_mask = gray >= threshold + if not np.any(bg_mask): + return (255, 255, 255) + + return tuple(int(c) for c in region[bg_mask].mean(axis=0)) + + +# ───────────────────────────────────────────── +# ERASE ORIGINAL TEXT +# Fills the tight OCR bbox with the sampled +# background color. No extra expansion — +# the bbox from bubbles.json is already the +# exact size of the red squares. +# ───────────────────────────────────────────── +def erase_bubble_text(cv_image, bubble_data, + bg_color=(255, 255, 255)): + """ + Fills the bubble bounding box with bg_color. Args: - image : BGR numpy array (modified in place) - x,y,w,h : Bounding box - padding : Pixels to leave as border (default: 6) + cv_image : BGR numpy array (modified in place) + bubble_data : Dict with 'x','y','w','h' + bg_color : (B,G,R) fill color """ - x1 = max(0, x + padding) - y1 = max(0, y + padding) - x2 = min(image.shape[1], x + w - padding) - y2 = min(image.shape[0], y + h - padding) - - if x2 > x1 and y2 > y1: - image[y1:y2, x1:x2] = 255 + img_h, img_w = cv_image.shape[:2] + x = max(0, bubble_data["x"]) + y = max(0, bubble_data["y"]) + x2 = min(img_w, bubble_data["x"] + bubble_data["w"]) + y2 = min(img_h, bubble_data["y"] + bubble_data["h"]) + cv_image[y:y2, x:x2] = list(bg_color) # ───────────────────────────────────────────── -# STEP 4: FIT FONT SIZE -# Finds the largest font size where the text -# fits inside (max_w × max_h) with word wrap. +# FIT FONT SIZE # ───────────────────────────────────────────── def fit_font_size(draw, text, max_w, max_h, font_path, - min_size=8, max_size=48): + min_size=7, max_size=48): """ - Binary-searches for the largest font size where - word-wrapped text fits within the given box. - - Args: - draw : PIL ImageDraw instance - text : Text string to fit - max_w : Available width in pixels - max_h : Available height in pixels - font_path : Path to .ttf font (or None for default) - min_size : Smallest font size to try (default: 8) - max_size : Largest font size to try (default: 48) - - Returns: - (font, list_of_wrapped_lines) + Finds the largest font size where word-wrapped text + fits inside (max_w × max_h). """ best_font = None best_lines = [text] for size in range(max_size, min_size - 1, -1): try: - font = ImageFont.truetype(font_path, size) if font_path else ImageFont.load_default() + font = (ImageFont.truetype(font_path, size) + if font_path else ImageFont.load_default()) except Exception: font = ImageFont.load_default() - # Word-wrap - words = text.split() - lines = [] - current = "" - + words, lines, current = text.split(), [], "" for word in words: test = (current + " " + word).strip() - bbox = draw.textbbox((0, 0), test, font=font) - if (bbox[2] - bbox[0]) <= max_w: + bb = draw.textbbox((0, 0), test, font=font) + if (bb[2] - bb[0]) <= max_w: current = test else: if current: lines.append(current) current = word - if current: lines.append(current) - # Measure total block height - lh_bbox = draw.textbbox((0, 0), "Ay", font=font) - line_h = (lh_bbox[3] - lh_bbox[1]) + 3 - total_h = line_h * len(lines) - - if total_h <= max_h: + lh = draw.textbbox((0, 0), "Ay", font=font) + line_h = (lh[3] - lh[1]) + 2 + if line_h * len(lines) <= max_h: best_font = font best_lines = lines break - if best_font is None: - best_font = ImageFont.load_default() - - return best_font, best_lines + return best_font or ImageFont.load_default(), best_lines # ───────────────────────────────────────────── -# STEP 5: RENDER TEXT INTO BUBBLE -# Draws translated text centered inside -# the bubble bounding box. +# RENDER TEXT INTO BUBBLE # ───────────────────────────────────────────── -def render_text_in_bubble(pil_image, x, y, w, h, text, - font_path, padding=12, +def render_text_in_bubble(pil_image, bubble_data, text, + font_path, padding=8, font_color=(0, 0, 0)): """ - Renders text centered (horizontally + vertically) - inside a bubble bounding box. - - Args: - pil_image : PIL Image (modified in place) - x,y,w,h : Bubble bounding box - text : Translated text to render - font_path : Path to .ttf font (or None) - padding : Inner padding in pixels (default: 12) - font_color : RGB color tuple (default: black) + Renders translated text centered inside the tight bbox. + Font auto-sizes to fill the same w×h the original occupied. """ + x, y = bubble_data["x"], bubble_data["y"] + w, h = bubble_data["w"], bubble_data["h"] + draw = ImageDraw.Draw(pil_image) inner_w = max(1, w - padding * 2) inner_h = max(1, h - padding * 2) - font, lines = fit_font_size(draw, text, inner_w, inner_h, font_path) - - lh_bbox = draw.textbbox((0, 0), "Ay", font=font) - line_h = (lh_bbox[3] - lh_bbox[1]) + 3 + font, lines = fit_font_size(draw, text, inner_w, inner_h, + font_path) + lh_bb = draw.textbbox((0, 0), "Ay", font=font) + line_h = (lh_bb[3] - lh_bb[1]) + 2 total_h = line_h * len(lines) start_y = y + padding + max(0, (inner_h - total_h) // 2) @@ -232,7 +186,8 @@ def render_text_in_bubble(pil_image, x, y, w, h, text, lb = draw.textbbox((0, 0), line, font=font) line_w = lb[2] - lb[0] start_x = x + padding + max(0, (inner_w - line_w) // 2) - draw.text((start_x, start_y), line, font=font, fill=font_color) + draw.text((start_x, start_y), line, + font=font, fill=font_color) start_y += line_h @@ -244,7 +199,7 @@ def resolve_font(font_path, fallback): print(f" ✅ Using font: {font_path}") return font_path if fallback and os.path.exists(fallback): - print(f" ⚠️ '{font_path}' not found → fallback: {fallback}") + print(f" ⚠️ Fallback: {fallback}") return fallback print(" ⚠️ No font found. Using PIL default.") return None @@ -261,104 +216,122 @@ def render_translated_page( font_path = FONT_PATH, font_fallback = FONT_FALLBACK, font_color = FONT_COLOR, - erase_padding = 6, - text_padding = 12, + text_padding = 8, debug = False, ): """ - Full rendering pipeline: - 1. Parse translations from output.txt + Pipeline: + 1. Parse translations (only present IDs processed) 2. Load bubble boxes from bubbles.json - 3. Load original manga page - 4. Erase original text from each bubble - 5. Render translated text into each bubble - 6. Save output image - - Args: - input_image : Source manga page (default: 'page.png') - output_image : Output path (default: 'page_translated.png') - translations_file : Path to output.txt (default: 'output.txt') - bubbles_file : Path to bubbles.json (default: 'bubbles.json') - font_path : Primary .ttf font path - font_fallback : Fallback font path - font_color : RGB text color (default: black) - erase_padding : Border px when erasing (default: 6) - text_padding : Inner padding for text (default: 12) - debug : Save debug_render.png (default: False) + 3. Cross-check IDs — absent ones left untouched + 4. Sample background color per bubble + 5. Erase original text (fill tight bbox) + 6. Render translated text sized to fit the bbox + 7. Save output """ print("=" * 55) print(" MANGA TRANSLATOR — RENDERER") print("=" * 55) - # ── 1. Parse translations ───────────────────────────────────────────────── print("\n📄 Parsing translations...") translations = parse_translations(translations_file) - if not translations: print("❌ No translations found. Aborting.") return - # ── 2. Load bubble boxes ────────────────────────────────────────────────── - print(f"\n📦 Loading bubble boxes from {bubbles_file}...") + print(f"\n📦 Loading bubble data...") bubble_boxes = load_bubble_boxes(bubbles_file) - if not bubble_boxes: - print("❌ No bubble boxes found. Re-run manga-translator.py first.") + print("❌ No bubble data. Re-run manga-translator.py.") return - # ── 3. Load image ───────────────────────────────────────────────────────── - print(f"\n🖼️ Loading image: {input_image}") + translate_ids = set(translations.keys()) + box_ids = set(bubble_boxes.keys()) + to_process = sorted(translate_ids & box_ids) + untouched = sorted(box_ids - translate_ids) + missing = sorted(translate_ids - box_ids) + + print(f"\n🔗 To process : {to_process}") + print(f" Untouched : {untouched}") + if missing: + print(f" ⚠️ In output.txt but no box: {missing}") + + if not to_process: + print("❌ No matching IDs. Aborting.") + return + + print(f"\n🖼️ Loading: {input_image}") cv_image = cv2.imread(input_image) if cv_image is None: print(f"❌ Could not load: {input_image}") return - print(f" Image size: {cv_image.shape[1]}×{cv_image.shape[0]}px") + print(f" {cv_image.shape[1]}×{cv_image.shape[0]}px") - # ── 4. Erase original text ──────────────────────────────────────────────── - print("\n🧹 Erasing original bubble text...") - for bubble_id in sorted(translations.keys()): - if bubble_id not in bubble_boxes: - print(f" ⚠️ #{bubble_id}: no box in bubbles.json, skipping") - continue - x, y, w, h = bubble_boxes[bubble_id] - erase_bubble_rect(cv_image, x, y, w, h, padding=erase_padding) - print(f" Erased #{bubble_id} at ({x},{y}) {w}×{h}px") + # Sample backgrounds BEFORE erasing + print("\n🎨 Sampling backgrounds...") + bg_colors = {} + for bid in to_process: + bg_bgr = sample_bubble_background( + cv_image, bubble_boxes[bid]) + bg_colors[bid] = bg_bgr + bg_rgb = (bg_bgr[2], bg_bgr[1], bg_bgr[0]) + brightness = sum(bg_rgb) / 3 + ink = "black" if brightness > 128 else "white" + print(f" #{bid}: RGB{bg_rgb} ink→{ink}") - # ── 5. Convert to PIL ───────────────────────────────────────────────────── - pil_image = Image.fromarray(cv2.cvtColor(cv_image, cv2.COLOR_BGR2RGB)) + # Erase + print("\n🧹 Erasing original text...") + for bid in to_process: + bd = bubble_boxes[bid] + erase_bubble_text(cv_image, bd, bg_color=bg_colors[bid]) + print(f" ✅ #{bid} ({bd['w']}×{bd['h']}px)") + + pil_image = Image.fromarray( + cv2.cvtColor(cv_image, cv2.COLOR_BGR2RGB)) - # ── 6. Resolve font ─────────────────────────────────────────────────────── print("\n🔤 Resolving font...") resolved_font = resolve_font(font_path, font_fallback) - # ── 7. Render translated text ───────────────────────────────────────────── - print("\n✍️ Rendering translated text...") - for bubble_id, text in sorted(translations.items()): - if bubble_id not in bubble_boxes: - continue - x, y, w, h = bubble_boxes[bubble_id] + # Render + print("\n✍️ Rendering...") + for bid in to_process: + text = translations[bid] + bd = bubble_boxes[bid] + bg_rgb = (bg_colors[bid][2], + bg_colors[bid][1], + bg_colors[bid][0]) + brightness = sum(bg_rgb) / 3 + txt_color = (0, 0, 0) if brightness > 128 \ + else (255, 255, 255) + render_text_in_bubble( - pil_image, x, y, w, h, text, + pil_image, bd, text, font_path = resolved_font, padding = text_padding, - font_color = font_color, + font_color = txt_color, ) - print(f" #{bubble_id}: '{text}' → ({x},{y}) {w}×{h}px") + print(f" ✅ #{bid}: '{text}' " + f"({bd['x']},{bd['y']}) {bd['w']}×{bd['h']}px") - # ── 8. Debug overlay ────────────────────────────────────────────────────── if debug: - dbg = pil_image.copy() + dbg = pil_image.copy() dbg_draw = ImageDraw.Draw(dbg) - for bubble_id, (x, y, w, h) in sorted(bubble_boxes.items()): - dbg_draw.rectangle([x, y, x + w, y + h], outline=(255, 0, 0), width=2) - dbg_draw.text((x + 4, y + 4), f"#{bubble_id}", fill=(255, 0, 0)) + for bid, bd in sorted(bubble_boxes.items()): + color = (0, 200, 0) if bid in translate_ids \ + else (160, 160, 160) + dbg_draw.rectangle( + [bd["x"], bd["y"], + bd["x"] + bd["w"], bd["y"] + bd["h"]], + outline=color, width=2) + dbg_draw.text((bd["x"] + 3, bd["y"] + 3), + f"#{bid}", fill=color) dbg.save("debug_render.png") - print("\n 🐛 Debug render saved → debug_render.png") + print("\n 🐛 debug_render.png saved " + "(green=translated, grey=untouched)") - # ── 9. Save output ──────────────────────────────────────────────────────── print(f"\n💾 Saving → {output_image}") pil_image.save(output_image, "PNG") - print(f" ✅ Done! Open: {output_image}") + print(" ✅ Done!") print("=" * 55) @@ -366,7 +339,6 @@ def render_translated_page( # ENTRY POINT # ───────────────────────────────────────────── if __name__ == "__main__": - render_translated_page( input_image = "page.png", output_image = "page_translated.png", @@ -375,7 +347,6 @@ if __name__ == "__main__": font_path = "font.ttf", font_fallback = "/System/Library/Fonts/Helvetica.ttc", font_color = (0, 0, 0), - erase_padding = 6, - text_padding = 12, + text_padding = 8, debug = True, ) \ No newline at end of file diff --git a/manga-translator.py b/manga-translator.py index bb35cb9..f997515 100644 --- a/manga-translator.py +++ b/manga-translator.py @@ -29,44 +29,132 @@ SUPPORTED_LANGUAGES = { "Catalan" : "ca", } + # ───────────────────────────────────────────── # SOUND EFFECT FILTER # ───────────────────────────────────────────── SOUND_EFFECT_PATTERNS = [ - r"^b+i+p+$", - r"^sha+$", - r"^ha+$", - r"^ah+$", - r"^oh+$", - r"^ugh+$", - r"^gr+$", - r"^bam+$", - r"^pow+$", - r"^crash+$", - r"^boom+$", - r"^bang+$", - r"^crack+$", - r"^whoosh+$", - r"^thud+$", - r"^snap+$", - r"^zip+$", - r"^swoosh+$", + r"^b+i+p+$", r"^sha+$", r"^ha+$", r"^ah+$", + r"^oh+$", r"^ugh+$", r"^gr+$", r"^bam+$", + r"^pow+$", r"^crash+$", r"^boom+$", r"^bang+$", + r"^crack+$", r"^whoosh+$", r"^thud+$", r"^snap+$", + r"^zip+$", r"^swoosh+$", ] def is_sound_effect(text): cleaned = re.sub(r"[^a-z]", "", text.strip().lower()) - return any(re.fullmatch(p, cleaned, re.IGNORECASE) for p in SOUND_EFFECT_PATTERNS) + return any(re.fullmatch(p, cleaned, re.IGNORECASE) + for p in SOUND_EFFECT_PATTERNS) # ───────────────────────────────────────────── -# BOUNDING BOX HELPERS +# TOKEN FILTER # ───────────────────────────────────────────── +def should_keep_token(text, confidence, confidence_threshold, + min_text_length, filter_sound_effects): + """ + Returns (keep: bool, reason: str). + + Rules: + 1. Drop if confidence below threshold + 2. Drop if shorter than min_text_length + 3. Drop pure digit strings + 4. Drop single non-alpha characters + 5. Drop sound effects if filter enabled + 6. Keep everything else + """ + cleaned = text.strip() + + if confidence < confidence_threshold: + return False, f"low confidence ({confidence:.2f})" + if len(cleaned) < min_text_length: + return False, "too short" + if re.fullmatch(r"\d+", cleaned): + return False, "pure digits" + if len(cleaned) == 1 and not cleaned.isalpha(): + return False, "single symbol" + if filter_sound_effects and is_sound_effect(cleaned): + return False, "sound effect" + + return True, "ok" + + +# ───────────────────────────────────────────── +# BOUNDING BOX +# +# Rules (match the red square exactly): +# Width = widest single quad's width +# Height = sum of ALL quad heights stacked +# X = centered on the widest quad's CX +# Y = topmost Y1 of all quads +# ───────────────────────────────────────────── +def get_cluster_bbox_from_ocr(ocr_bboxes, image_shape, + padding_px=10): + """ + Computes the bubble erase bbox: + + 1. Per-quad: measure w, h, cx for every OCR detection + 2. Width = width of the widest single quad + 3. Height = sum of every quad's height + 4. X = widest quad's center ± max_w/2 + (all lines sit symmetrically inside) + 5. Y = top of topmost quad, bottom = Y + total_h + + Args: + ocr_bboxes : List of EasyOCR quad bboxes + image_shape : (height, width) for clamping + padding_px : Expansion on each side (default: 10) + + Returns: + (x1, y1, x2, y2) clamped to image bounds + """ + img_h, img_w = image_shape[:2] + + if not ocr_bboxes: + return 0, 0, 0, 0 + + # ── Per-quad metrics ────────────────────────────────────────── + quad_metrics = [] + for quad in ocr_bboxes: + xs = [pt[0] for pt in quad] + ys = [pt[1] for pt in quad] + qx1, qx2 = min(xs), max(xs) + qy1, qy2 = min(ys), max(ys) + quad_metrics.append({ + "x1" : qx1, + "x2" : qx2, + "y1" : qy1, + "y2" : qy2, + "w" : qx2 - qx1, + "h" : qy2 - qy1, + "cx" : (qx1 + qx2) / 2.0, + }) + + # ── Width: widest single quad ───────────────────────────────── + widest = max(quad_metrics, key=lambda q: q["w"]) + max_w = widest["w"] + center_x = widest["cx"] + + # ── Height: sum of all quad heights ────────────────────────── + total_h = sum(q["h"] for q in quad_metrics) + + # ── Box edges ───────────────────────────────────────────────── + box_x1 = center_x - max_w / 2.0 + box_x2 = center_x + max_w / 2.0 + box_y1 = min(q["y1"] for q in quad_metrics) + box_y2 = box_y1 + total_h + + # ── Padding + clamp ─────────────────────────────────────────── + x1 = max(0, box_x1 - padding_px) + y1 = max(0, box_y1 - padding_px) + x2 = min(img_w, box_x2 + padding_px) + y2 = min(img_h, box_y2 + padding_px) + + return x1, y1, x2, y2 + + def get_cluster_bbox(items): - """ - Returns (x1, y1, x2, y2) tight bounding box around - all (cy, cx, text) center points in a cluster. - Uses a fixed half-size approximation per text block. - """ + """Fallback center-point bbox — used only during merge step.""" half = 30 x1 = min(cx for _, cx, _ in items) - half y1 = min(cy for cy, _, _ in items) - half @@ -76,10 +164,6 @@ def get_cluster_bbox(items): def boxes_are_close(bbox_a, bbox_b, proximity_px=80): - """ - Returns True if two (x1,y1,x2,y2) boxes are within - proximity_px pixels of each other (or overlapping). - """ ax1, ay1, ax2, ay2 = bbox_a bx1, by1, bx2, by2 = bbox_b ax1 -= proximity_px; ay1 -= proximity_px @@ -87,18 +171,25 @@ def boxes_are_close(bbox_a, bbox_b, proximity_px=80): return not (ax2 < bx1 or bx2 < ax1 or ay2 < by1 or by2 < ay1) +# ───────────────────────────────────────────── +# TEXT LINE FILTER +# ───────────────────────────────────────────── +def has_translatable_content(text): + """ + True if text contains at least one letter. + ch.isalpha() handles È, é, ñ, ü etc. + """ + return any(ch.isalpha() for ch in text) + + # ───────────────────────────────────────────── # POST-CLUSTER MERGE (Union-Find) # ───────────────────────────────────────────── -def merge_nearby_clusters(raw_clusters, proximity_px=80): - """ - Merges clusters whose bounding boxes are within - proximity_px pixels of each other. - Fixes split bubbles without changing eps globally. - """ +def merge_nearby_clusters(raw_clusters, raw_quads, + proximity_px=80): labels = list(raw_clusters.keys()) - bboxes = {lbl: get_cluster_bbox(raw_clusters[lbl]) for lbl in labels} - + bboxes = {lbl: get_cluster_bbox(raw_clusters[lbl]) + for lbl in labels} parent = {lbl: lbl for lbl in labels} def find(x): @@ -116,35 +207,28 @@ def merge_nearby_clusters(raw_clusters, proximity_px=80): if boxes_are_close(bboxes[a], bboxes[b], proximity_px): union(a, b) - merged = {} + merged_clusters = {} + merged_quads = {} for lbl in labels: root = find(lbl) - merged.setdefault(root, []) - merged[root].extend(raw_clusters[lbl]) + merged_clusters.setdefault(root, []) + merged_quads.setdefault(root, []) + merged_clusters[root].extend(raw_clusters[lbl]) + merged_quads[root].extend(raw_quads[lbl]) - return merged + return merged_clusters, merged_quads # ───────────────────────────────────────────── # CROP-BASED OCR RE-READ # ───────────────────────────────────────────── -def reread_cluster_crop( - image, - bbox, - reader, - source_lang, - padding_px=20, - upscale_factor=2.5, -): - """ - Crops a cluster region from the full image, upscales it, - and re-runs OCR for higher accuracy on small text. - """ +def reread_cluster_crop(image, bbox, reader, source_lang, + padding_px=20, upscale_factor=2.5): img_h, img_w = image.shape[:2] x1, y1, x2, y2 = bbox - x1 = max(0, int(x1) - padding_px) - y1 = max(0, int(y1) - padding_px) + x1 = max(0, int(x1) - padding_px) + y1 = max(0, int(y1) - padding_px) x2 = min(img_w, int(x2) + padding_px) y2 = min(img_h, int(y2) + padding_px) @@ -154,13 +238,13 @@ def reread_cluster_crop( new_w = int(crop.shape[1] * upscale_factor) new_h = int(crop.shape[0] * upscale_factor) - upscaled = cv2.resize(crop, (new_w, new_h), interpolation=cv2.INTER_CUBIC) + upscaled = cv2.resize(crop, (new_w, new_h), + interpolation=cv2.INTER_CUBIC) kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]]) sharpened = cv2.filter2D(upscaled, -1, kernel) temp_path = "_temp_crop_ocr.png" cv2.imwrite(temp_path, sharpened) - try: crop_results = reader.readtext(temp_path, paragraph=False) finally: @@ -171,26 +255,31 @@ def reread_cluster_crop( return None crop_results.sort(key=lambda r: r[0][0][1]) - lines = [text.strip() for _, text, conf in crop_results if text.strip()] - + lines = [t.strip() for _, t, _ in crop_results if t.strip()] return fix_hyphens(lines) if lines else None # ───────────────────────────────────────────── # DBSCAN BUBBLE CLUSTERING # ───────────────────────────────────────────── -def cluster_into_bubbles(ocr_results, eps=80, min_samples=1, proximity_px=80): +def cluster_into_bubbles(ocr_results, image_shape, + eps=80, min_samples=1, + proximity_px=80, bbox_padding=10): """ Two-pass clustering: Pass 1 — DBSCAN on center points Pass 2 — Bounding-box proximity merge + Bbox: widest-line width (centered) × stacked height. + All quads contribute to bbox regardless of content. + Returns: - bubble_dict : cluster_id → list of (cy, cx, text) + bubble_dict : cluster_id → list of translatable text lines bbox_dict : cluster_id → (x1, y1, x2, y2) + ocr_quads : cluster_id → list of ALL raw EasyOCR quads """ if not ocr_results: - return {}, {} + return {}, {}, {} centers = [] for bbox, text, confidence in ocr_results: @@ -199,11 +288,12 @@ def cluster_into_bubbles(ocr_results, eps=80, min_samples=1, proximity_px=80): centers.append([sum(xs) / 4, sum(ys) / 4]) centers_array = np.array(centers, dtype=np.float32) - - db = DBSCAN(eps=eps, min_samples=min_samples, metric="euclidean") + db = DBSCAN(eps=eps, min_samples=min_samples, + metric="euclidean") labels = db.fit_predict(centers_array) raw_clusters = {} + raw_quads = {} noise_counter = int(max(labels, default=0)) + 1 for idx, label in enumerate(labels): @@ -211,12 +301,17 @@ def cluster_into_bubbles(ocr_results, eps=80, min_samples=1, proximity_px=80): label = noise_counter noise_counter += 1 raw_clusters.setdefault(label, []) + raw_quads.setdefault(label, []) bbox, text, _ = ocr_results[idx] - raw_clusters[label].append((centers[idx][1], centers[idx][0], text)) + raw_clusters[label].append( + (centers[idx][1], centers[idx][0], text)) + raw_quads[label].append(bbox) print(f" DBSCAN pass: {len(raw_clusters)} cluster(s)") - merged_clusters = merge_nearby_clusters(raw_clusters, proximity_px=proximity_px) + merged_clusters, merged_quads = merge_nearby_clusters( + raw_clusters, raw_quads, proximity_px=proximity_px + ) print(f" After merge: {len(merged_clusters)} cluster(s)") row_band_px = 150 @@ -225,17 +320,42 @@ def cluster_into_bubbles(ocr_results, eps=80, min_samples=1, proximity_px=80): return (min(cy for cy, cx, _ in items) // row_band_px, min(cx for cy, cx, _ in items)) - sorted_clusters = sorted(merged_clusters.values(), key=cluster_sort_key) + sorted_labels = sorted( + merged_clusters.keys(), + key=lambda lbl: cluster_sort_key(merged_clusters[lbl]) + ) bubble_dict = {} bbox_dict = {} + ocr_quads = {} - for i, items in enumerate(sorted_clusters, start=1): - items_sorted = sorted(items, key=lambda t: t[0]) - bubble_dict[i] = [text for _, _, text in items_sorted] - bbox_dict[i] = get_cluster_bbox(items) + for i, lbl in enumerate(sorted_labels, start=1): + items = merged_clusters[lbl] + quads = merged_quads[lbl] - return bubble_dict, bbox_dict + items_sorted = sorted(items, key=lambda t: t[0]) + + text_lines = [ + text for _, _, text in items_sorted + if has_translatable_content(text) + ] + if not text_lines: + text_lines = [text for _, _, text in items_sorted] + + bubble_dict[i] = text_lines + ocr_quads[i] = quads + + bbox_dict[i] = get_cluster_bbox_from_ocr( + quads, image_shape, padding_px=bbox_padding + ) + + b = bbox_dict[i] + print(f" Cluster #{i}: {len(quads)} quad(s) " + f"bbox=({int(b[0])},{int(b[1])})→" + f"({int(b[2])},{int(b[3])}) " + f"w={int(b[2]-b[0])} h={int(b[3]-b[1])}") + + return bubble_dict, bbox_dict, ocr_quads # ───────────────────────────────────────────── @@ -246,8 +366,9 @@ def fix_hyphens(lines): return "" merged = lines[0] for line in lines[1:]: - line = line.strip() - merged = merged[:-1] + line if merged.endswith("-") else merged + " " + line + line = line.strip() + merged = (merged[:-1] + line if merged.endswith("-") + else merged + " " + line) return re.sub(r" {2,}", " ", merged).strip() @@ -268,63 +389,45 @@ def compute_auto_eps(image_path, base_eps=80, reference_width=750): # OCR QUALITY SCORE # ───────────────────────────────────────────── def ocr_quality_score(text): - """ - Returns a quality score 0.0–1.0 for an OCR result. - Low score triggers a crop re-read. - """ if not text or len(text) < 2: return 0.0 - - alpha_chars = sum(1 for c in text if c.isalpha()) - total_chars = len(text) - alpha_ratio = alpha_chars / total_chars - - garbage_patterns = [r",,", r"\.\.-", r"[^\w\s\'\!\?\.,-]{2,}"] - penalty = sum(0.2 for p in garbage_patterns if re.search(p, text)) - + alpha_ratio = sum(1 for c in text if c.isalpha()) / len(text) + garbage = [r",,", r"\.\.-", r"[^\w\s\'\!\?\.,-]{2,}"] + penalty = sum(0.2 for p in garbage if re.search(p, text)) return max(0.0, min(1.0, alpha_ratio - penalty)) # ───────────────────────────────────────────── # BUBBLE JSON EXPORT -# Saves bbox_dict to bubbles.json so the -# renderer can load exact cluster positions. # ───────────────────────────────────────────── -def export_bubble_boxes(bbox_dict, filepath="bubbles.json"): - """ - Serialises bbox_dict to a JSON file. - - Format written: - { - "1": {"x": 120, "y": 45, "w": 180, "h": 210}, - ... - } - - Args: - bbox_dict : Dict {bubble_id (int): (x1, y1, x2, y2)} - filepath : Output path (default: 'bubbles.json') - """ +def export_bubble_boxes(bbox_dict, ocr_quads_dict, + filepath="bubbles.json"): export = {} for bubble_id, (x1, y1, x2, y2) in bbox_dict.items(): + quads = ocr_quads_dict.get(bubble_id, []) export[str(bubble_id)] = { - "x": int(x1), - "y": int(y1), - "w": int(x2 - x1), - "h": int(y2 - y1), + "x" : int(x1), + "y" : int(y1), + "w" : int(x2 - x1), + "h" : int(y2 - y1), + "quads": [[[int(pt[0]), int(pt[1])] for pt in quad] + for quad in quads], } with open(filepath, "w", encoding="utf-8") as f: json.dump(export, f, indent=2, ensure_ascii=False) - print(f"📦 Bubble boxes saved → {filepath}") - for bubble_id, vals in export.items(): - print(f" #{bubble_id}: ({vals['x']},{vals['y']}) {vals['w']}×{vals['h']}px") + print(f"\n📦 Bubble boxes saved → {filepath}") + for bid, v in export.items(): + print(f" #{bid}: ({v['x']},{v['y']}) " + f"{v['w']}×{v['h']}px [{len(v['quads'])} quad(s)]") # ───────────────────────────────────────────── # DEBUG CLUSTER IMAGE # ───────────────────────────────────────────── -def save_debug_clusters(image_path, ocr_results, bubble_dict): +def save_debug_clusters(image_path, ocr_results, + bubble_dict, bbox_dict): image = cv2.imread(image_path) if image is None: return @@ -333,7 +436,8 @@ def save_debug_clusters(image_path, ocr_results, bubble_dict): num_bubbles = max(bubble_dict.keys(), default=1) colors = [ tuple(int(c) for c in col) - for col in np.random.randint(50, 230, size=(num_bubbles + 2, 3)) + for col in np.random.randint(50, 230, + size=(num_bubbles + 2, 3)) ] text_to_bubble = {} @@ -345,14 +449,21 @@ def save_debug_clusters(image_path, ocr_results, bubble_dict): bubble_id = text_to_bubble.get(text, 0) color = colors[(bubble_id - 1) % len(colors)] pts = np.array(bbox, dtype=np.int32) - cv2.polylines(image, [pts], isClosed=True, color=color, thickness=2) - x = int(pts[0][0]) - y = max(int(pts[0][1]) - 5, 12) - cv2.putText(image, f"#{bubble_id}", (x, y), + cv2.polylines(image, [pts], isClosed=True, + color=color, thickness=1) + + for bubble_id, (x1, y1, x2, y2) in bbox_dict.items(): + color = colors[(bubble_id - 1) % len(colors)] + cv2.rectangle(image, + (int(x1), int(y1)), + (int(x2), int(y2)), + color, 2) + cv2.putText(image, f"BOX#{bubble_id}", + (int(x1) + 2, int(y1) + 16), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2) cv2.imwrite("debug_clusters.png", image) - print(" 🐛 Cluster debug saved → debug_clusters.png") + print(" 🐛 debug_clusters.png saved") # ───────────────────────────────────────────── @@ -362,39 +473,18 @@ def translate_manga_text( image_path, source_lang="it", target_lang="ca", - confidence_threshold=0.15, + confidence_threshold=0.10, export_to_file=None, - export_bubbles_to="bubbles.json", # ← NEW: path for bubble boxes JSON + export_bubbles_to="bubbles.json", min_text_length=2, cluster_eps="auto", proximity_px=80, filter_sound_effects=True, quality_threshold=0.5, upscale_factor=2.5, + bbox_padding=10, debug=False, ): - """ - Full pipeline: - OCR → filter → DBSCAN cluster → proximity merge - → quality check → crop re-read if needed - → fix hyphens → translate → export txt + json - - Args: - image_path : Path to your image file - source_lang : Source language code (default: 'it') - target_lang : Target language code (default: 'ca') - confidence_threshold : Min OCR confidence (default: 0.15) - export_to_file : Save translations to .txt (default: None) - export_bubbles_to : Save bubble boxes to .json (default: 'bubbles.json') - min_text_length : Min characters per detection(default: 2) - cluster_eps : DBSCAN eps or 'auto' (default: 'auto') - proximity_px : Post-merge proximity px (default: 80) - filter_sound_effects : Skip onomatopoeia/SFX (default: True) - quality_threshold : Min quality score 0–1 (default: 0.5) - upscale_factor : Crop upscale for re-read (default: 2.5) - debug : Save debug_clusters.png (default: False) - """ - # ── 1. Resolve eps ──────────────────────────────────────────────────────── if cluster_eps == "auto": print("Computing auto eps...") @@ -410,54 +500,61 @@ def translate_manga_text( # ── 3. Initialize OCR ───────────────────────────────────────────────────── print("\nLoading OCR model...") - ocr_lang_list = ["en", "es"] if source_lang == "ca" else [source_lang] + ocr_lang_list = ["en", "es"] if source_lang == "ca" \ + else [source_lang] reader = easyocr.Reader(ocr_lang_list) # ── 4. Initialize translator ────────────────────────────────────────────── - translator = GoogleTranslator(source=source_lang, target=target_lang) + translator = GoogleTranslator(source=source_lang, + target=target_lang) # ── 5. Run OCR ──────────────────────────────────────────────────────────── print(f"\nRunning OCR on: {image_path}") results = reader.readtext(image_path, paragraph=False) print(f" Raw detections: {len(results)}") - # ── 6. Filter detections ────────────────────────────────────────────────── + # ── 6. Filter ───────────────────────────────────────────────────────────── filtered = [] skipped = 0 for bbox, text, confidence in results: cleaned = text.strip() - if confidence < confidence_threshold: + keep, reason = should_keep_token( + cleaned, confidence, + confidence_threshold, min_text_length, + filter_sound_effects + ) + if keep: + filtered.append((bbox, cleaned, confidence)) + else: + if reason == "sound effect": + print(f" 🔇 SFX skipped: '{cleaned}'") skipped += 1 - continue - if len(cleaned) < min_text_length: - skipped += 1 - continue - if re.fullmatch(r"[\d\W]+", cleaned): - skipped += 1 - continue - if filter_sound_effects and is_sound_effect(cleaned): - print(f" 🔇 SFX skipped: '{cleaned}'") - skipped += 1 - continue - filtered.append((bbox, cleaned, confidence)) - print(f" ✅ {len(filtered)} detection(s) kept, {skipped} skipped.\n") + print(f" ✅ {len(filtered)} kept, {skipped} skipped.\n") if not filtered: print("⚠️ No text detected after filtering.") return # ── 7. Cluster + merge ──────────────────────────────────────────────────── - print(f"Clustering detections (eps={eps:.1f}px, proximity={proximity_px}px)...") - bubble_dict, bbox_dict = cluster_into_bubbles( - filtered, eps=eps, proximity_px=proximity_px + print(f"Clustering (eps={eps:.1f}px, " + f"proximity={proximity_px}px, " + f"bbox_padding={bbox_padding}px)...") + + bubble_dict, bbox_dict, ocr_quads = cluster_into_bubbles( + filtered, + image_shape = full_image.shape, + eps = eps, + proximity_px = proximity_px, + bbox_padding = bbox_padding, ) print(f" ✅ {len(bubble_dict)} bubble(s) after merge.\n") - # ── 8. Debug image ──────────────────────────────────────────────────────── + # ── 8. Debug ────────────────────────────────────────────────────────────── if debug: - save_debug_clusters(image_path, filtered, bubble_dict) + save_debug_clusters(image_path, filtered, + bubble_dict, bbox_dict) # ── 9. Fix hyphens ──────────────────────────────────────────────────────── clean_bubbles = { @@ -471,41 +568,39 @@ def translate_manga_text( for i, text in clean_bubbles.items(): score = ocr_quality_score(text) status = "✅" if score >= quality_threshold else "🔁" - print(f" Bubble #{i}: score={score:.2f} {status} '{text[:60]}'") + print(f" #{i}: score={score:.2f} {status} '{text[:55]}'") if score < quality_threshold: - print(f" → Re-reading bubble #{i} from crop...") + print(f" → Re-reading #{i} from crop...") reread = reread_cluster_crop( full_image, bbox_dict[i], reader, source_lang, upscale_factor=upscale_factor, ) if reread: - print(f" → Re-read result: '{reread}'") + print(f" → '{reread}'") clean_bubbles[i] = reread else: - print(f" → Re-read returned nothing, keeping original.") + print(f" → Nothing found, keeping original.") # ── 11. Translate & print ───────────────────────────────────────────────── print() - header = f"{'BUBBLE':<8} {'ORIGINAL (Italian)':<50} {'TRANSLATED (Catalan)'}" + header = (f"{'BUBBLE':<8} " + f"{'ORIGINAL (Italian)':<50} " + f"{'TRANSLATED (Catalan)'}") divider = "─" * 105 - output_lines = [header, divider] print(header) print(divider) translated_count = 0 - for i in sorted(clean_bubbles.keys()): bubble_text = clean_bubbles[i].strip() if not bubble_text: continue - try: translated = translator.translate(bubble_text) except Exception as e: translated = f"[Translation error: {e}]" - if translated is None: translated = "[No translation returned]" @@ -515,23 +610,22 @@ def translate_manga_text( output_lines.append(line) output_lines.append(divider) - summary = ( - f"✅ Done! {translated_count} bubble(s) translated, " - f"{skipped} detection(s) skipped." - ) + summary = (f"✅ Done! {translated_count} bubble(s) translated, " + f"{skipped} detection(s) skipped.") output_lines.append(summary) print(divider) print(summary) - # ── 12. Export translations .txt ────────────────────────────────────────── + # ── 12. Export translations ─────────────────────────────────────────────── if export_to_file: with open(export_to_file, "w", encoding="utf-8") as f: f.write("\n".join(output_lines)) print(f"📄 Translations saved → {export_to_file}") - # ── 13. Export bubble boxes .json ───────────────────────────────────────── + # ── 13. Export bubble boxes ─────────────────────────────────────────────── if export_bubbles_to: - export_bubble_boxes(bbox_dict, filepath=export_bubbles_to) + export_bubble_boxes(bbox_dict, ocr_quads, + filepath=export_bubbles_to) # ───────────────────────────────────────────── @@ -549,19 +643,19 @@ def list_languages(): # ENTRY POINT # ───────────────────────────────────────────── if __name__ == "__main__": - translate_manga_text( image_path = "page.png", source_lang = "it", target_lang = "ca", - confidence_threshold = 0.15, + confidence_threshold = 0.10, min_text_length = 2, export_to_file = "output.txt", - export_bubbles_to = "bubbles.json", # ← NEW + export_bubbles_to = "bubbles.json", cluster_eps = "auto", proximity_px = 80, filter_sound_effects = True, quality_threshold = 0.5, upscale_factor = 2.5, + bbox_padding = 0, debug = True, ) \ No newline at end of file