# ─────────────────────────────────────────────
# CONFIG
# Default paths and colours used by
# render_translated_page().
# ─────────────────────────────────────────────
INPUT_IMAGE = "page.png"
OUTPUT_IMAGE = "page_translated.png"
TRANSLATIONS_FILE = "output.txt"
BUBBLES_FILE = "bubbles.json"

FONT_PATH = "font.ttf"
FONT_FALLBACK = "/System/Library/Fonts/Helvetica.ttc"
FONT_COLOR = (0, 0, 0)
BUBBLE_FILL = (255, 255, 255)

# A translation row starts with "#<digits>"; only that leading number is the
# bubble id.
_BUBBLE_ID_RE = re.compile(r"^#(\d+)")
# Columns are separated by runs of two or more spaces, so single spaces
# inside the original or translated text do not split a column.
_COLUMN_GAP_RE = re.compile(r" {2,}")


# ─────────────────────────────────────────────
# STEP 1: PARSE output.txt
# Every "#N" row contributes its LAST column
# as the translation for bubble N.
# ─────────────────────────────────────────────
def parse_translations(filepath):
    """
    Parse the translations file into {bubble_id: translated_text}.

    A row is used only when it starts with "#<digits>" and splits into at
    least three columns on runs of 2+ spaces; the last column is taken as
    the translation. All other lines are ignored. If the same id appears
    more than once, the last row wins.

    The bubble id is read from the leading "#<digits>" only, so digits that
    appear later in the first column (e.g. "#2 7") cannot leak into the id.

    Args:
        filepath : Path to output.txt (read as UTF-8)

    Returns:
        Dict {1: "LA NOIA ESTÀ IL·LESA!", ...}

    Raises:
        OSError : If the file cannot be opened.
    """
    translations = {}

    with open(filepath, "r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()

            id_match = _BUBBLE_ID_RE.match(line)
            if id_match is None:
                continue

            # [bubble_id_col, original_col, ..., translated_col]
            columns = _COLUMN_GAP_RE.split(line)
            if len(columns) < 3:
                continue

            translations[int(id_match.group(1))] = columns[-1].strip()

    print(f" ✅ Parsed {len(translations)} translation(s) from {filepath}")
    for bid, text in sorted(translations.items()):
        print(f" #{bid}: {text}")

    return translations


# ─────────────────────────────────────────────
# STEP 2: LOAD BUBBLE BOXES from bubbles.json
# The file maps bubble ids to x/y/w/h boxes.
# ─────────────────────────────────────────────
def load_bubble_boxes(filepath):
    """
    Load bubble bounding boxes from a JSON file.

    Expected format:
        {
            "1": {"x": 120, "y": 45, "w": 180, "h": 210},
            "2": { ... },
            ...
        }

    Keys are converted to int ids. Box values are converted to int as well,
    so a file containing floats still yields values usable as pixel indices.

    Args:
        filepath : Path to bubbles.json (read as UTF-8)

    Returns:
        Dict {bubble_id (int): (x, y, w, h)} with integer components

    Raises:
        OSError              : If the file cannot be opened.
        json.JSONDecodeError : If the file is not valid JSON.
        KeyError             : If an entry lacks "x", "y", "w" or "h".
        ValueError           : If a key or value is not convertible to int.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        raw = json.load(f)

    boxes = {
        int(key): (int(val["x"]), int(val["y"]), int(val["w"]), int(val["h"]))
        for key, val in raw.items()
    }

    print(f" ✅ Loaded {len(boxes)} bubble box(es) from {filepath}")
    for bid, (x, y, w, h) in sorted(boxes.items()):
        print(f" #{bid}: ({x},{y}) {w}×{h}px")

    return boxes
# ─────────────────────────────────────────────
# STEP 3: ERASE BUBBLE CONTENT
# Fills a rectangular region with a flat
# colour, inset from the box edges.
# ─────────────────────────────────────────────
def erase_bubble_rect(image, x, y, w, h, padding=6, fill=255):
    """
    Fill the interior of a bounding box, leaving `padding` pixels untouched
    on every side of the box.

    The filled rectangle is clipped to the image. If the padding consumes
    the whole box (or the box lies outside the image) nothing is written.

    Args:
        image   : numpy image array indexed [row, col(, channel)];
                  modified in place
        x,y,w,h : Bounding box; non-integer values are truncated to int
        padding : Pixels to leave as border (default: 6)
        fill    : Value assigned to the erased pixels (default: 255, which
                  is white on an 8-bit image)
    """
    x, y, w, h, padding = (int(v) for v in (x, y, w, h, padding))
    img_h, img_w = image.shape[:2]

    x1 = max(0, x + padding)
    y1 = max(0, y + padding)
    x2 = min(img_w, x + w - padding)
    y2 = min(img_h, y + h - padding)

    if x2 > x1 and y2 > y1:
        image[y1:y2, x1:x2] = fill


# Extra pixels added below every text line.
_LINE_SPACING_PX = 3


def _text_width(draw, text, font):
    """Return the rendered width of `text` in pixels."""
    left, _, right, _ = draw.textbbox((0, 0), text, font=font)
    return right - left


def _line_height(draw, font):
    """Return the height of one text line ("Ay" reference) plus spacing."""
    _, top, _, bottom = draw.textbbox((0, 0), "Ay", font=font)
    return (bottom - top) + _LINE_SPACING_PX


def _wrap_words(draw, text, font, max_w):
    """Greedily pack whitespace-separated words into lines <= max_w wide.

    A single word wider than max_w still gets a line of its own; the caller
    decides whether that overflow is acceptable.
    """
    lines = []
    current = ""
    for word in text.split():
        candidate = f"{current} {word}".strip()
        if _text_width(draw, candidate, font) <= max_w:
            current = candidate
        else:
            if current:
                lines.append(current)
            current = word
    if current:
        lines.append(current)
    return lines


# ─────────────────────────────────────────────
# STEP 4: FIT FONT SIZE
# Finds the largest font size where the text
# fits inside (max_w × max_h) with word wrap.
# ─────────────────────────────────────────────
def fit_font_size(draw, text, max_w, max_h, font_path,
                  min_size=8, max_size=48):
    """
    Find the largest font size at which word-wrapped text fits the box.

    Sizes are tried from max_size down to min_size (a linear scan). A size
    fits when the wrapped block is no taller than max_h AND no line is wider
    than max_w, so an over-long single word forces a smaller size instead of
    overflowing the box.

    If font_path is None or the font cannot be loaded, PIL's default font is
    used; it is loaded here without a size, so only one attempt is made.

    If no size fits, the smallest attempt is returned with its wrapped lines
    (best effort) rather than an unwrapped single line.

    Args:
        draw      : PIL ImageDraw instance (used for measuring only)
        text      : Text string to fit
        max_w     : Available width in pixels
        max_h     : Available height in pixels
        font_path : Path to .ttf font (or None for default)
        min_size  : Smallest font size to try (default: 8)
        max_size  : Largest font size to try (default: 48)

    Returns:
        (font, list_of_wrapped_lines)
    """
    last_attempt = None

    for size in range(max_size, min_size - 1, -1):
        font = None
        if font_path:
            try:
                font = ImageFont.truetype(font_path, size)
            except (OSError, ValueError, ImportError):
                # Unreadable font file / bad size / FreeType unavailable:
                # fall back to the default font below.
                font = None
        scalable = font is not None
        if not scalable:
            font = ImageFont.load_default()

        lines = _wrap_words(draw, text, font, max_w)
        widest = max((_text_width(draw, ln, font) for ln in lines), default=0)
        block_h = _line_height(draw, font) * len(lines)

        if widest <= max_w and block_h <= max_h:
            return font, lines

        last_attempt = (font, lines)
        if not scalable:
            # The default font is the same at every size; retrying is useless.
            break

    if last_attempt is None:
        # Empty size range (max_size < min_size): nothing was measured.
        return ImageFont.load_default(), [text]
    return last_attempt


# ─────────────────────────────────────────────
# STEP 5: RENDER TEXT INTO BUBBLE
# Draws translated text centered inside
# the bubble bounding box.
# ─────────────────────────────────────────────
def render_text_in_bubble(pil_image, x, y, w, h, text,
                          font_path, padding=12,
                          font_color=(0, 0, 0)):
    """
    Render text centered (horizontally + vertically) inside a bounding box.

    The font size and line breaks come from fit_font_size(), applied to the
    box shrunk by `padding` on every side (never smaller than 1×1).

    textbbox() reports where the ink starts relative to the drawing origin,
    so that offset is subtracted when drawing; otherwise the text would sit
    low/right of the computed centre.

    Args:
        pil_image  : PIL Image (modified in place)
        x,y,w,h    : Bubble bounding box
        text       : Translated text to render
        font_path  : Path to .ttf font (or None)
        padding    : Inner padding in pixels (default: 12)
        font_color : RGB color tuple (default: black)
    """
    draw = ImageDraw.Draw(pil_image)
    inner_w = max(1, w - padding * 2)
    inner_h = max(1, h - padding * 2)

    font, lines = fit_font_size(draw, text, inner_w, inner_h, font_path)
    if not lines:
        return

    _, ref_top, _, ref_bottom = draw.textbbox((0, 0), "Ay", font=font)
    line_h = (ref_bottom - ref_top) + _LINE_SPACING_PX

    # The spacing after the last line is not visible ink; leave it out so
    # the block is centred on what is actually drawn.
    block_h = line_h * len(lines) - _LINE_SPACING_PX
    cursor_y = y + padding + max(0, (inner_h - block_h) // 2)

    for line in lines:
        left, _, right, _ = draw.textbbox((0, 0), line, font=font)
        start_x = x + padding + max(0, (inner_w - (right - left)) // 2)
        draw.text((start_x - left, cursor_y - ref_top), line,
                  font=font, fill=font_color)
        cursor_y += line_h


# ─────────────────────────────────────────────
# RESOLVE FONT
# ─────────────────────────────────────────────
def resolve_font(font_path, fallback):
    """
    Pick the font file to use.

    Returns font_path if it is an existing file, otherwise fallback if that
    is an existing file, otherwise None (callers then use PIL's default
    font). Directories are not accepted as fonts.

    Args:
        font_path : Preferred font path (may be None)
        fallback  : Fallback font path (may be None)

    Returns:
        The chosen path, or None when neither file exists.
    """
    if font_path and os.path.isfile(font_path):
        print(f" ✅ Using font: {font_path}")
        return font_path
    if fallback and os.path.isfile(fallback):
        print(f" ⚠️ '{font_path}' not found → fallback: {fallback}")
        return fallback
    print(" ⚠️ No font found. Using PIL default.")
    return None
# ─────────────────────────────────────────────
# MAIN RENDERER
# ─────────────────────────────────────────────
def render_translated_page(
    input_image=INPUT_IMAGE,
    output_image=OUTPUT_IMAGE,
    translations_file=TRANSLATIONS_FILE,
    bubbles_file=BUBBLES_FILE,
    font_path=FONT_PATH,
    font_fallback=FONT_FALLBACK,
    font_color=FONT_COLOR,
    erase_padding=6,
    text_padding=12,
    debug=False,
):
    """
    Run the whole rendering pipeline for one page.

    Steps, in order:
        1. Parse translations (parse_translations)
        2. Load bubble boxes (load_bubble_boxes)
        3. Read the source page with OpenCV
        4. Erase the box of every translated bubble
        5. Convert the page to a PIL image and resolve the font
        6. Draw each translation into its box
        7. Optionally save a debug overlay, then save the result as PNG

    The function prints progress as it goes. It returns early (after
    printing a message) when there are no translations, no boxes, or the
    image cannot be read. Bubbles that have a translation but no box are
    reported once during erasing and skipped.

    Args:
        input_image       : Source manga page (default: 'page.png')
        output_image      : Output path (default: 'page_translated.png')
        translations_file : Path to output.txt (default: 'output.txt')
        bubbles_file      : Path to bubbles.json (default: 'bubbles.json')
        font_path         : Primary .ttf font path
        font_fallback     : Fallback font path
        font_color        : RGB text color (default: black)
        erase_padding     : Border px when erasing (default: 6)
        text_padding      : Inner padding for text (default: 12)
        debug             : Save debug_render.png (default: False)
    """
    rule = "=" * 55
    print(rule)
    print(" MANGA TRANSLATOR — RENDERER")
    print(rule)

    # ── 1. Parse translations ────────────────────────────────────────────────
    print("\n📄 Parsing translations...")
    translations = parse_translations(translations_file)
    if not translations:
        print("❌ No translations found. Aborting.")
        return

    # ── 2. Load bubble boxes ─────────────────────────────────────────────────
    print(f"\n📦 Loading bubble boxes from {bubbles_file}...")
    bubble_boxes = load_bubble_boxes(bubbles_file)
    if not bubble_boxes:
        print("❌ No bubble boxes found. Re-run manga-translator.py first.")
        return

    # ── 3. Load image ────────────────────────────────────────────────────────
    print(f"\n🖼️ Loading image: {input_image}")
    cv_image = cv2.imread(input_image)
    if cv_image is None:
        print(f"❌ Could not load: {input_image}")
        return
    page_h, page_w = cv_image.shape[0], cv_image.shape[1]
    print(f" Image size: {page_w}×{page_h}px")

    # ── 4. Erase original text ───────────────────────────────────────────────
    # Erasing happens on the OpenCV array, before the PIL conversion below.
    print("\n🧹 Erasing original bubble text...")
    ordered_ids = sorted(translations)
    for bubble_id in ordered_ids:
        if bubble_id in bubble_boxes:
            x, y, w, h = bubble_boxes[bubble_id]
            erase_bubble_rect(cv_image, x, y, w, h, padding=erase_padding)
            print(f" Erased #{bubble_id} at ({x},{y}) {w}×{h}px")
        else:
            print(f" ⚠️ #{bubble_id}: no box in bubbles.json, skipping")

    # ── 5. Convert to PIL ────────────────────────────────────────────────────
    rgb_pixels = cv2.cvtColor(cv_image, cv2.COLOR_BGR2RGB)
    pil_image = Image.fromarray(rgb_pixels)

    # ── 6. Resolve font ──────────────────────────────────────────────────────
    print("\n🔤 Resolving font...")
    resolved_font = resolve_font(font_path, font_fallback)

    # ── 7. Render translated text ────────────────────────────────────────────
    print("\n✍️ Rendering translated text...")
    for bubble_id in ordered_ids:
        if bubble_id not in bubble_boxes:
            continue
        text = translations[bubble_id]
        x, y, w, h = bubble_boxes[bubble_id]
        render_text_in_bubble(
            pil_image, x, y, w, h, text,
            font_path=resolved_font,
            padding=text_padding,
            font_color=font_color,
        )
        print(f" #{bubble_id}: '{text}' → ({x},{y}) {w}×{h}px")

    # ── 8. Debug overlay ─────────────────────────────────────────────────────
    # Drawn on a copy so the saved output stays free of the red outlines.
    if debug:
        overlay = pil_image.copy()
        pen = ImageDraw.Draw(overlay)
        red = (255, 0, 0)
        for bubble_id, (x, y, w, h) in sorted(bubble_boxes.items()):
            pen.rectangle([x, y, x + w, y + h], outline=red, width=2)
            pen.text((x + 4, y + 4), f"#{bubble_id}", fill=red)
        overlay.save("debug_render.png")
        print("\n 🐛 Debug render saved → debug_render.png")

    # ── 9. Save output ───────────────────────────────────────────────────────
    print(f"\n💾 Saving → {output_image}")
    pil_image.save(output_image, "PNG")
    print(f" ✅ Done! Open: {output_image}")
    print(rule)


# ─────────────────────────────────────────────
# ENTRY POINT
# ─────────────────────────────────────────────
if __name__ == "__main__":

    render_translated_page(
        input_image="page.png",
        output_image="page_translated.png",
        translations_file="output.txt",
        bubbles_file="bubbles.json",
        font_path="font.ttf",
        font_fallback="/System/Library/Fonts/Helvetica.ttc",
        font_color=(0, 0, 0),
        erase_padding=6,
        text_padding=12,
        debug=True,
    )
""" img_h, img_w = image.shape[:2] x1, y1, x2, y2 = bbox - # Add padding, clamp to image bounds x1 = max(0, int(x1) - padding_px) y1 = max(0, int(y1) - padding_px) x2 = min(img_w, int(x2) + padding_px) @@ -168,16 +152,12 @@ def reread_cluster_crop( if crop.size == 0: return None - # Upscale for better OCR on small text - new_w = int(crop.shape[1] * upscale_factor) - new_h = int(crop.shape[0] * upscale_factor) - upscaled = cv2.resize(crop, (new_w, new_h), interpolation=cv2.INTER_CUBIC) - - # Light sharpening to improve OCR on manga fonts + new_w = int(crop.shape[1] * upscale_factor) + new_h = int(crop.shape[0] * upscale_factor) + upscaled = cv2.resize(crop, (new_w, new_h), interpolation=cv2.INTER_CUBIC) kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]]) sharpened = cv2.filter2D(upscaled, -1, kernel) - # Save temp crop and OCR it temp_path = "_temp_crop_ocr.png" cv2.imwrite(temp_path, sharpened) @@ -190,8 +170,7 @@ def reread_cluster_crop( if not crop_results: return None - # Sort detections top-to-bottom and join lines - crop_results.sort(key=lambda r: r[0][0][1]) # sort by top-left Y + crop_results.sort(key=lambda r: r[0][0][1]) lines = [text.strip() for _, text, conf in crop_results if text.strip()] return fix_hyphens(lines) if lines else None @@ -240,7 +219,6 @@ def cluster_into_bubbles(ocr_results, eps=80, min_samples=1, proximity_px=80): merged_clusters = merge_nearby_clusters(raw_clusters, proximity_px=proximity_px) print(f" After merge: {len(merged_clusters)} cluster(s)") - # Sort in reading order row_band_px = 150 def cluster_sort_key(items): @@ -288,19 +266,11 @@ def compute_auto_eps(image_path, base_eps=80, reference_width=750): # ───────────────────────────────────────────── # OCR QUALITY SCORE -# Heuristic to detect garbled OCR output. -# Low score = likely garbage, trigger re-read. # ───────────────────────────────────────────── def ocr_quality_score(text): """ Returns a quality score 0.0–1.0 for an OCR result. 
# ─────────────────────────────────────────────
# BUBBLE JSON EXPORT
# Saves bbox_dict to bubbles.json so the
# renderer can load exact cluster positions.
# ─────────────────────────────────────────────
def export_bubble_boxes(bbox_dict, filepath="bubbles.json"):
    """
    Serialise bbox_dict to a JSON file as x/y/w/h boxes.

    Format written:
        {
            "1": {"x": 120, "y": 45, "w": 180, "h": 210},
            ...
        }

    Corner coordinates are truncated to int first and the width/height are
    derived from the truncated corners, so x + w and y + h always land on
    int(x2) and int(y2). (Truncating the difference instead can leave the
    far edge one pixel short when the corners are fractional.) Converting
    to plain int also makes the values JSON-serialisable.

    Args:
        bbox_dict : Dict {bubble_id (int): (x1, y1, x2, y2)}
        filepath  : Output path (default: 'bubbles.json')

    Raises:
        OSError : If the output file cannot be written.
    """
    export = {}
    for bubble_id, (x1, y1, x2, y2) in bbox_dict.items():
        left, top, right, bottom = int(x1), int(y1), int(x2), int(y2)
        export[str(bubble_id)] = {
            "x": left,
            "y": top,
            "w": right - left,
            "h": bottom - top,
        }

    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(export, f, indent=2, ensure_ascii=False)

    print(f"📦 Bubble boxes saved → {filepath}")
    for bubble_id, vals in export.items():
        print(f" #{bubble_id}: ({vals['x']},{vals['y']}) {vals['w']}×{vals['h']}px")
(default: 0.5) upscale_factor : Crop upscale for re-read (default: 2.5) debug : Save debug_clusters.png (default: False) """ @@ -396,7 +402,7 @@ def translate_manga_text( else: eps = float(cluster_eps) - # ── 2. Load full image (needed for crop re-reads) ───────────────────────── + # ── 2. Load full image ──────────────────────────────────────────────────── full_image = cv2.imread(image_path) if full_image is None: print(f"❌ Could not load image: {image_path}") @@ -410,7 +416,7 @@ def translate_manga_text( # ── 4. Initialize translator ────────────────────────────────────────────── translator = GoogleTranslator(source=source_lang, target=target_lang) - # ── 5. Run OCR on full image ────────────────────────────────────────────── + # ── 5. Run OCR ──────────────────────────────────────────────────────────── print(f"\nRunning OCR on: {image_path}") results = reader.readtext(image_path, paragraph=False) print(f" Raw detections: {len(results)}") @@ -453,27 +459,24 @@ def translate_manga_text( if debug: save_debug_clusters(image_path, filtered, bubble_dict) - # ── 9. Fix hyphens → first-pass text ───────────────────────────────────── + # ── 9. Fix hyphens ──────────────────────────────────────────────────────── clean_bubbles = { i: fix_hyphens(lines) for i, lines in bubble_dict.items() if lines } - # ── 10. Quality check → crop re-read for low-quality bubbles ───────────── + # ── 10. 
Quality check + crop re-read ────────────────────────────────────── print("Checking OCR quality per bubble...") for i, text in clean_bubbles.items(): - score = ocr_quality_score(text) + score = ocr_quality_score(text) status = "✅" if score >= quality_threshold else "🔁" print(f" Bubble #{i}: score={score:.2f} {status} '{text[:60]}'") if score < quality_threshold: print(f" → Re-reading bubble #{i} from crop...") reread = reread_cluster_crop( - full_image, - bbox_dict[i], - reader, - source_lang, + full_image, bbox_dict[i], reader, source_lang, upscale_factor=upscale_factor, ) if reread: @@ -520,11 +523,15 @@ def translate_manga_text( print(divider) print(summary) - # ── 12. Export ──────────────────────────────────────────────────────────── + # ── 12. Export translations .txt ────────────────────────────────────────── if export_to_file: with open(export_to_file, "w", encoding="utf-8") as f: f.write("\n".join(output_lines)) - print(f"📄 Output saved to: {export_to_file}") + print(f"📄 Translations saved → {export_to_file}") + + # ── 13. Export bubble boxes .json ───────────────────────────────────────── + if export_bubbles_to: + export_bubble_boxes(bbox_dict, filepath=export_bubbles_to) # ───────────────────────────────────────────── @@ -550,10 +557,11 @@ if __name__ == "__main__": confidence_threshold = 0.15, min_text_length = 2, export_to_file = "output.txt", + export_bubbles_to = "bubbles.json", # ← NEW cluster_eps = "auto", proximity_px = 80, filter_sound_effects = True, - quality_threshold = 0.5, # bubbles scoring below this get re-read - upscale_factor = 2.5, # how much to enlarge the crop for re-read + quality_threshold = 0.5, + upscale_factor = 2.5, debug = True, ) \ No newline at end of file