diff --git a/bubbles.json b/bubbles.json index d96941c..c1008d1 100644 --- a/bubbles.json +++ b/bubbles.json @@ -1,9 +1,9 @@ { "1": { - "x": 201, - "y": 134, - "w": 159, - "h": 88, + "x": 199, + "y": 132, + "w": 163, + "h": 92, "quads": [ [ [ @@ -26,9 +26,9 @@ ] }, "2": { - "x": 1164, - "y": 237, - "w": 138, + "x": 1162, + "y": 235, + "w": 142, "h": 140, "quads": [ [ @@ -106,9 +106,9 @@ ] }, "3": { - "x": 927, - "y": 375, - "w": 142, + "x": 925, + "y": 373, + "w": 146, "h": 138, "quads": [ [ @@ -186,10 +186,10 @@ ] }, "4": { - "x": 217, - "y": 483, - "w": 156, - "h": 216, + "x": 215, + "y": 481, + "w": 160, + "h": 210, "quads": [ [ [ @@ -320,9 +320,9 @@ ] }, "5": { - "x": 351, - "y": 1129, - "w": 98, + "x": 349, + "y": 1127, + "w": 102, "h": 108, "quads": [ [ @@ -382,10 +382,10 @@ ] }, "6": { - "x": 737, - "y": 1321, - "w": 44, - "h": 30, + "x": 735, + "y": 1319, + "w": 48, + "h": 34, "quads": [ [ [ diff --git a/manga-translator.py b/manga-translator.py index 775a154..33b3a9c 100644 --- a/manga-translator.py +++ b/manga-translator.py @@ -49,36 +49,20 @@ def is_sound_effect(text): # ───────────────────────────────────────────── # TOKEN CLASSIFIER -# -# Three categories: -# "alpha" — contains at least one letter (È, é, A-Z etc.) -# "punct" — 2+ chars, all punctuation (... ?? !! ?! …) -# "noise" — everything else (single symbols, pure digits, -# low-confidence, sound effects) -# -# Both "alpha" and "punct" tokens are KEPT: -# - "alpha" → contributes to translation text AND bbox -# - "punct" → contributes to bbox only (not translation text) -# unless it immediately follows alpha text -# in the same cluster (handled in clustering) # ───────────────────────────────────────────── def classify_token(text, confidence, confidence_threshold, min_text_length, filter_sound_effects): """ Returns one of: "alpha" | "punct" | "noise" - "alpha" : has at least one letter → keep for text + bbox - "punct" : 2+ chars, no letters → keep for bbox only - "noise" : drop entirely - - Rules: - 1. Drop if confidence below threshold → noise - 2. Drop if shorter than min_text_length → noise - 3. Drop pure digit strings → noise - 4. Drop single non-alpha characters → noise - 5. Drop sound effects if filter enabled → noise - 6. 2+ char string with no letters → punct - 7. Has at least one letter → alpha + Rules (in order): + 1. confidence below threshold → noise + 2. shorter than min_text_length → noise + 3. pure digit string → noise + 4. single non-alpha character → noise + 5. sound effect (if filter enabled) → noise + 6. 2+ chars with no letters → punct + 7. has at least one letter → alpha """ cleaned = text.strip() @@ -92,9 +76,6 @@ def classify_token(text, confidence, confidence_threshold, return "noise" if filter_sound_effects and is_sound_effect(cleaned): return "noise" - - # 2+ chars with no letters at all → punctuation token - # Examples: "..." "??" "!!" "?!" "…" ".." if not any(ch.isalpha() for ch in cleaned): return "punct" @@ -115,24 +96,21 @@ def should_keep_token(text, confidence, confidence_threshold, # ───────────────────────────────────────────── # BOUNDING BOX # -# Width = widest single quad's width -# Height = sum of ALL quad heights stacked -# X = centered on the widest quad's CX -# Y = topmost Y1 of all quads +# Flat union of ALL quad corners. +# Handles every layout correctly: +# • "HN" + "..." same line → horizontal union +# • Multi-line bubbles → vertical union +# • Rotated/skewed quads → all 4 corners included # ───────────────────────────────────────────── def get_cluster_bbox_from_ocr(ocr_bboxes, image_shape, padding_px=10): """ - Computes the bubble erase bbox: - - 1. Per-quad: measure w, h, cx - 2. Width = width of the widest single quad - 3. Height = sum of every quad's height - 4. X = widest quad's center ± max_w/2 - 5. Y = top of topmost quad → Y + total_h + Computes the bubble erase bbox by taking the flat union + of ALL quad corners. Args: ocr_bboxes : List of EasyOCR quad bboxes + Each = [[x0,y0],[x1,y1],[x2,y2],[x3,y3]] image_shape : (height, width) for clamping padding_px : Expansion on each side (default: 10) @@ -144,34 +122,13 @@ def get_cluster_bbox_from_ocr(ocr_bboxes, image_shape, if not ocr_bboxes: return 0, 0, 0, 0 - quad_metrics = [] - for quad in ocr_bboxes: - xs = [pt[0] for pt in quad] - ys = [pt[1] for pt in quad] - qx1, qx2 = min(xs), max(xs) - qy1, qy2 = min(ys), max(ys) - quad_metrics.append({ - "x1" : qx1, "x2" : qx2, - "y1" : qy1, "y2" : qy2, - "w" : qx2 - qx1, - "h" : qy2 - qy1, - "cx" : (qx1 + qx2) / 2.0, - }) + all_x = [pt[0] for quad in ocr_bboxes for pt in quad] + all_y = [pt[1] for quad in ocr_bboxes for pt in quad] - widest = max(quad_metrics, key=lambda q: q["w"]) - max_w = widest["w"] - center_x = widest["cx"] - total_h = sum(q["h"] for q in quad_metrics) - - box_x1 = center_x - max_w / 2.0 - box_x2 = center_x + max_w / 2.0 - box_y1 = min(q["y1"] for q in quad_metrics) - box_y2 = box_y1 + total_h - - x1 = max(0, box_x1 - padding_px) - y1 = max(0, box_y1 - padding_px) - x2 = min(img_w, box_x2 + padding_px) - y2 = min(img_h, box_y2 + padding_px) + x1 = max(0, min(all_x) - padding_px) + y1 = max(0, min(all_y) - padding_px) + x2 = min(img_w, max(all_x) + padding_px) + y2 = min(img_h, max(all_y) + padding_px) return x1, y1, x2, y2 @@ -282,19 +239,19 @@ def cluster_into_bubbles(ocr_results, image_shape, Pass 1 — DBSCAN on center points Pass 2 — Bounding-box proximity merge - Token categories per cluster: + Token handling per cluster: "alpha" tokens → translation text + bbox - "punct" tokens → bbox only (e.g. "..." after "HN") - "noise" tokens → already filtered before this function + "punct" tokens → bbox included, appended to nearest + alpha line by Y distance + (e.g. "..." joins "HN" → "HN...") - Bbox: widest-line width (centered) × stacked height. + Bbox uses flat union of ALL quad corners: + min/max of all x,y across every quad in the cluster. Returns: bubble_dict : cluster_id → list of text lines - (alpha tokens only, punct appended - to last alpha line if spatially adjacent) bbox_dict : cluster_id → (x1, y1, x2, y2) - ocr_quads : cluster_id → list of ALL raw EasyOCR quads + ocr_quads : cluster_id → list of ALL raw quads """ if not ocr_results: return {}, {}, {} @@ -321,8 +278,6 @@ def cluster_into_bubbles(ocr_results, image_shape, raw_clusters.setdefault(label, []) raw_quads.setdefault(label, []) bbox, text, _ = ocr_results[idx] - # Store (cy, cx, text, category) - cat = ocr_results[idx][2] # confidence stored as category below raw_clusters[label].append( (centers[idx][1], centers[idx][0], text)) raw_quads[label].append(bbox) @@ -355,12 +310,9 @@ def cluster_into_bubbles(ocr_results, image_shape, items_sorted = sorted(items, key=lambda t: t[0]) - # ── Build text lines ────────────────────────────────────── - # Alpha tokens become text lines. - # Punct tokens (... ?? etc.) are appended to the - # nearest preceding alpha token on the same Y level. - alpha_lines = [] # (cy, text) for alpha tokens - punct_tokens = [] # (cy, text) for punct tokens + # ── Separate alpha and punct tokens ─────────────────────── + alpha_lines = [] # (cy, text) + punct_tokens = [] # (cy, text) for cy, cx, text in items_sorted: if any(ch.isalpha() for ch in text): @@ -368,27 +320,24 @@ def cluster_into_bubbles(ocr_results, image_shape, else: punct_tokens.append((cy, text)) - # Append each punct token to the closest alpha line by Y + # ── Append punct to closest alpha line by Y ─────────────── for pcy, ptext in punct_tokens: if alpha_lines: - # Find alpha line with closest cy closest_idx = min( range(len(alpha_lines)), key=lambda k: abs(alpha_lines[k][0] - pcy) ) cy_a, text_a = alpha_lines[closest_idx] alpha_lines[closest_idx] = (cy_a, text_a + ptext) - # If no alpha lines at all, punct still contributes - # to bbox but not to translation text text_lines = [t for _, t in alpha_lines] - # Fallback: if no alpha at all, keep everything + # Fallback: no alpha at all → keep everything as-is if not text_lines: text_lines = [text for _, _, text in items_sorted] bubble_dict[i] = text_lines - ocr_quads[i] = quads # ALL quads → full bbox + ocr_quads[i] = quads # ALL quads → full bbox coverage bbox_dict[i] = get_cluster_bbox_from_ocr( quads, image_shape, padding_px=bbox_padding @@ -408,6 +357,10 @@ def cluster_into_bubbles(ocr_results, image_shape, # HYPHEN REMOVAL # ───────────────────────────────────────────── def fix_hyphens(lines): + """ + Joins lines, merging mid-word hyphens. + e.g. ["GRAVEMEN-", "TE"] → "GRAVEMENTE" + """ if not lines: return "" merged = lines[0] @@ -421,7 +374,8 @@ def fix_hyphens(lines): # ───────────────────────────────────────────── # AUTO EPS # ───────────────────────────────────────────── -def compute_auto_eps(image_path, base_eps=80, reference_width=750): +def compute_auto_eps(image_path, base_eps=80, + reference_width=750): image = cv2.imread(image_path) if image is None: return base_eps @@ -439,7 +393,8 @@ def ocr_quality_score(text): return 0.0 alpha_ratio = sum(1 for c in text if c.isalpha()) / len(text) garbage = [r",,", r"\.\.-", r"[^\w\s\'\!\?\.,-]{2,}"] - penalty = sum(0.2 for p in garbage if re.search(p, text)) + penalty = sum(0.2 for p in garbage + if re.search(p, text)) return max(0.0, min(1.0, alpha_ratio - penalty)) @@ -466,7 +421,8 @@ def export_bubble_boxes(bbox_dict, ocr_quads_dict, print(f"\n📦 Bubble boxes saved → {filepath}") for bid, v in export.items(): print(f" #{bid}: ({v['x']},{v['y']}) " - f"{v['w']}×{v['h']}px [{len(v['quads'])} quad(s)]") + f"{v['w']}×{v['h']}px " + f"[{len(v['quads'])} quad(s)]") # ───────────────────────────────────────────── @@ -482,8 +438,8 @@ def save_debug_clusters(image_path, ocr_results, num_bubbles = max(bubble_dict.keys(), default=1) colors = [ tuple(int(c) for c in col) - for col in np.random.randint(50, 230, - size=(num_bubbles + 2, 3)) + for col in np.random.randint( + 50, 230, size=(num_bubbles + 2, 3)) ] text_to_bubble = {} @@ -506,7 +462,8 @@ def save_debug_clusters(image_path, ocr_results, color, 2) cv2.putText(image, f"BOX#{bubble_id}", (int(x1) + 2, int(y1) + 16), - cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2) + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, color, 2) cv2.imwrite("debug_clusters.png", image) print(" 🐛 debug_clusters.png saved") @@ -531,35 +488,35 @@ def translate_manga_text( bbox_padding=10, debug=False, ): - # ── 1. Resolve eps ──────────────────────────────────────────────────────── + # ── 1. Resolve eps ──────────────────────────────────────────── if cluster_eps == "auto": print("Computing auto eps...") eps = compute_auto_eps(image_path) else: eps = float(cluster_eps) - # ── 2. Load full image ──────────────────────────────────────────────────── + # ── 2. Load full image ──────────────────────────────────────── full_image = cv2.imread(image_path) if full_image is None: print(f"❌ Could not load image: {image_path}") return - # ── 3. Initialize OCR ───────────────────────────────────────────────────── + # ── 3. Initialize OCR ───────────────────────────────────────── print("\nLoading OCR model...") ocr_lang_list = ["en", "es"] if source_lang == "ca" \ else [source_lang] reader = easyocr.Reader(ocr_lang_list) - # ── 4. Initialize translator ────────────────────────────────────────────── + # ── 4. Initialize translator ────────────────────────────────── translator = GoogleTranslator(source=source_lang, target=target_lang) - # ── 5. Run OCR ──────────────────────────────────────────────────────────── + # ── 5. Run OCR ──────────────────────────────────────────────── print(f"\nRunning OCR on: {image_path}") results = reader.readtext(image_path, paragraph=False) print(f" Raw detections: {len(results)}") - # ── 6. Filter ───────────────────────────────────────────────────────────── + # ── 6. Filter tokens ────────────────────────────────────────── filtered = [] skipped = 0 @@ -575,7 +532,7 @@ def translate_manga_text( if category == "punct": print(f" ✔ Punct kept: '{cleaned}'") else: - if category == "sound effect": + if is_sound_effect(cleaned): print(f" 🔇 SFX skipped: '{cleaned}'") skipped += 1 @@ -585,7 +542,7 @@ def translate_manga_text( print("⚠️ No text detected after filtering.") return - # ── 7. Cluster + merge ──────────────────────────────────────────────────── + # ── 7. Cluster + merge ──────────────────────────────────────── print(f"Clustering (eps={eps:.1f}px, " f"proximity={proximity_px}px, " f"bbox_padding={bbox_padding}px)...") @@ -599,24 +556,25 @@ def translate_manga_text( ) print(f" ✅ {len(bubble_dict)} bubble(s) after merge.\n") - # ── 8. Debug ────────────────────────────────────────────────────────────── + # ── 8. Debug clusters ───────────────────────────────────────── if debug: save_debug_clusters(image_path, filtered, bubble_dict, bbox_dict) - # ── 9. Fix hyphens ──────────────────────────────────────────────────────── + # ── 9. Fix hyphens ──────────────────────────────────────────── clean_bubbles = { i: fix_hyphens(lines) for i, lines in bubble_dict.items() if lines } - # ── 10. Quality check + crop re-read ────────────────────────────────────── + # ── 10. Quality check + crop re-read ────────────────────────── print("Checking OCR quality per bubble...") for i, text in clean_bubbles.items(): score = ocr_quality_score(text) status = "✅" if score >= quality_threshold else "🔁" - print(f" #{i}: score={score:.2f} {status} '{text[:55]}'") + print(f" #{i}: score={score:.2f} {status} " + f"'{text[:55]}'") if score < quality_threshold: print(f" → Re-reading #{i} from crop...") @@ -630,7 +588,7 @@ def translate_manga_text( else: print(f" → Nothing found, keeping original.") - # ── 11. Translate & print ───────────────────────────────────────────────── + # ── 11. Translate & print ───────────────────────────────────── print() header = (f"{'BUBBLE':<8} " f"{'ORIGINAL (Italian)':<50} " @@ -658,19 +616,19 @@ def translate_manga_text( output_lines.append(line) output_lines.append(divider) - summary = (f"✅ Done! {translated_count} bubble(s) translated, " - f"{skipped} detection(s) skipped.") + summary = (f"✅ Done! {translated_count} bubble(s) " + f"translated, {skipped} detection(s) skipped.") output_lines.append(summary) print(divider) print(summary) - # ── 12. Export translations ─────────────────────────────────────────────── + # ── 12. Export translations ─────────────────────────────────── if export_to_file: with open(export_to_file, "w", encoding="utf-8") as f: f.write("\n".join(output_lines)) print(f"📄 Translations saved → {export_to_file}") - # ── 13. Export bubble boxes ─────────────────────────────────────────────── + # ── 13. Export bubble boxes ─────────────────────────────────── if export_bubbles_to: export_bubble_boxes(bbox_dict, ocr_quads, filepath=export_bubbles_to) @@ -704,6 +662,6 @@ if __name__ == "__main__": filter_sound_effects = True, quality_threshold = 0.5, upscale_factor = 2.5, - bbox_padding = 3, + bbox_padding = 5, debug = True, - ) + ) \ No newline at end of file diff --git a/pipeline.py b/pipeline.py new file mode 100644 index 0000000..c53b03b --- /dev/null +++ b/pipeline.py @@ -0,0 +1,250 @@ +#!/usr/bin/env python3 +""" +pipeline.py +─────────────────────────────────────────────────────────────── +Full chapter translation pipeline for Dandadan_059_2022_Digital + +Flow per page: + 1. Run manga-translator.py → output.txt + bubbles.json + 2. Run manga-renderer.py → translated image + 3. Collect all translated images → .cbz + +Folder structure produced: + Dandadan_059_2022_Digital_1r0n/ + ├── 00.jpg ← original (untouched) + ├── ... + └── translated/ + ├── 00/ + │ ├── output.txt ← raw translations + │ ├── bubbles.json ← bubble boxes + │ ├── debug_clusters.png ← cluster debug + │ └── 00_translated.jpg ← rendered output + ├── 01/ + │ └── ... + └── ... + Dandadan_059_translated.cbz ← final output +""" + +import os +import sys +import shutil +import zipfile +import importlib.util +from pathlib import Path + + +# ───────────────────────────────────────────── +# CONFIG — edit these as needed +# ───────────────────────────────────────────── +CHAPTER_DIR = "/Users/guillemhernandezsola/Downloads/Dandadan_059_2022_Digital_1r0n" +OUTPUT_CBZ = "/Users/guillemhernandezsola/Downloads/Dandadan_059_translated.cbz" +SOURCE_LANG = "en" +TARGET_LANG = "ca" + +# manga-translator.py settings +CONFIDENCE_THRESHOLD = 0.10 +MIN_TEXT_LENGTH = 2 +CLUSTER_EPS = "auto" +PROXIMITY_PX = 80 +FILTER_SFX = True +QUALITY_THRESHOLD = 0.5 +UPSCALE_FACTOR = 2.5 +BBOX_PADDING = 5 +DEBUG = True + +# manga-renderer.py settings +FONT_PATH = "fonts/ComicRelief-Regular.ttf" +FONT_COLOR = (0, 0, 0) + + +# ───────────────────────────────────────────── +# DYNAMIC MODULE LOADER +# Loads manga-translator.py and manga-renderer.py +# by file path (handles hyphens in filenames) +# ───────────────────────────────────────────── +def load_module(name, filepath): + spec = importlib.util.spec_from_file_location(name, filepath) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +# ───────────────────────────────────────────── +# HELPERS +# ───────────────────────────────────────────── +def sorted_pages(chapter_dir): + """ + Returns sorted list of image paths in the chapter folder. + Supports .jpg, .jpeg, .png, .webp + """ + exts = {".jpg", ".jpeg", ".png", ".webp"} + pages = [ + p for p in Path(chapter_dir).iterdir() + if p.suffix.lower() in exts + ] + return sorted(pages, key=lambda p: p.stem) + + +def make_page_workdir(chapter_dir, page_stem): + """ + Creates and returns: + /translated// + """ + workdir = Path(chapter_dir) / "translated" / page_stem + workdir.mkdir(parents=True, exist_ok=True) + return workdir + + +def pack_cbz(translated_dir, output_cbz): + """ + Collects all *_translated.* images from translated/*/ + sorted by page stem, packs into a .cbz file. + """ + images = sorted( + translated_dir.rglob("*_translated.*"), + key=lambda p: p.parent.name + ) + + if not images: + print("⚠️ No translated images found — CBZ not created.") + return + + with zipfile.ZipFile(output_cbz, "w", + compression=zipfile.ZIP_STORED) as zf: + for img in images: + # Archive name keeps the page stem for ordering + arcname = img.name + zf.write(img, arcname) + print(f" 📄 Added: {arcname}") + + print(f"\n✅ CBZ saved → {output_cbz} " + f"({len(images)} page(s))") + + +# ───────────────────────────────────────────── +# PER-PAGE PIPELINE +# ───────────────────────────────────────────── +def process_page(page_path, workdir, + translator_module, renderer_module): + """ + Runs translator + renderer for a single page. + All intermediate files land in workdir. + Returns path to the translated image, or None on failure. + """ + page_stem = page_path.stem + suffix = page_path.suffix + + # Paths inside workdir + output_txt = str(workdir / "output.txt") + bubbles_json= str(workdir / "bubbles.json") + debug_png = str(workdir / "debug_clusters.png") + translated = str(workdir / f"{page_stem}_translated{suffix}") + + print(f"\n{'─'*60}") + print(f" PAGE: {page_path.name}") + print(f"{'─'*60}") + + # ── Step 1: Translate ───────────────────────────────────────── + print(f"\n[1/2] Translating...") + try: + # Temporarily redirect file outputs to workdir + orig_dir = os.getcwd() + os.chdir(workdir) + + translator_module.translate_manga_text( + image_path = str(page_path.resolve()), + source_lang = SOURCE_LANG, + target_lang = TARGET_LANG, + confidence_threshold = CONFIDENCE_THRESHOLD, + export_to_file = "output.txt", + export_bubbles_to = "bubbles.json", + min_text_length = MIN_TEXT_LENGTH, + cluster_eps = CLUSTER_EPS, + proximity_px = PROXIMITY_PX, + filter_sound_effects = FILTER_SFX, + quality_threshold = QUALITY_THRESHOLD, + upscale_factor = UPSCALE_FACTOR, + bbox_padding = BBOX_PADDING, + debug = DEBUG, + ) + except Exception as e: + print(f" ❌ Translation failed: {e}") + os.chdir(orig_dir) + return None + finally: + os.chdir(orig_dir) + + # ── Step 2: Render ──────────────────────────────────────────── + print(f"\n[2/2] Rendering...") + try: + renderer_module.render_translations( + input_image = str(page_path.resolve()), + output_image = translated, + translations_file = output_txt, + bubbles_file = bubbles_json, + font_path = str(Path(orig_dir) / FONT_PATH), + font_color = FONT_COLOR, + ) + except Exception as e: + print(f" ❌ Rendering failed: {e}") + return None + + print(f" ✅ Done → {translated}") + return translated + + +# ───────────────────────────────────────────── +# MAIN +# ───────────────────────────────────────────── +def main(): + # ── Load modules ────────────────────────────────────────────── + print("Loading modules...") + try: + translator = load_module( + "manga_translator", "manga-translator.py") + renderer = load_module( + "manga_renderer", "manga-renderer.py") + except FileNotFoundError as e: + print(f"❌ Could not load module: {e}") + sys.exit(1) + + # ── Discover pages ──────────────────────────────────────────── + pages = sorted_pages(CHAPTER_DIR) + if not pages: + print(f"❌ No images found in: {CHAPTER_DIR}") + sys.exit(1) + + print(f"\n📖 Chapter : {CHAPTER_DIR}") + print(f" Pages : {len(pages)}") + print(f" Source : {SOURCE_LANG} → Target: {TARGET_LANG}") + print(f" Output : {OUTPUT_CBZ}\n") + + # ── Process each page ───────────────────────────────────────── + translated_dir = Path(CHAPTER_DIR) / "translated" + succeeded = [] + failed = [] + + for page_path in pages: + workdir = make_page_workdir(CHAPTER_DIR, page_path.stem) + result = process_page(page_path, workdir, + translator, renderer) + if result: + succeeded.append(result) + else: + failed.append(page_path.name) + + # ── Summary ─────────────────────────────────────────────────── + print(f"\n{'═'*60}") + print(f" PIPELINE COMPLETE") + print(f" ✅ {len(succeeded)} page(s) succeeded") + if failed: + print(f" ❌ {len(failed)} page(s) failed: {failed}") + print(f"{'═'*60}\n") + + # ── Pack CBZ ────────────────────────────────────────────────── + print("Packing CBZ...") + pack_cbz(translated_dir, OUTPUT_CBZ) + + +if __name__ == "__main__": + main()