#!/usr/bin/env python3
"""
pipeline-translator.py
───────────────────────────────────────────────────────────────
Translation OCR pipeline (Batch Processing Only)

Usage:
    python pipeline-translator.py /path/to/chapter/folder
    python pipeline-translator.py /path/to/chapter/folder --start 2 --end 5
    python pipeline-translator.py /path/to/chapter/folder --source en --target es
"""

import os
import sys
import argparse
import importlib.util
from pathlib import Path

# ─────────────────────────────────────────────────────────────
# PIPELINE CONFIGURATION
# Maps to the process_manga_page() signature in manga-translator.py
# ─────────────────────────────────────────────────────────────
PIPELINE_CONFIG = dict(
    source_lang = "en",
    target_lang = "ca",
)


# ─────────────────────────────────────────────────────────────
# DYNAMIC MODULE LOADER
# FIX: Always evicts stale sys.modules entry and deletes
#      __pycache__ for manga-translator.py before loading,
#      so edits are ALWAYS picked up on every run.
# ─────────────────────────────────────────────────────────────
def purge_bytecode_cache(filepath: str) -> None:
    """
    Delete the compiled .pyc file for the given .py path so Python
    cannot silently use a stale cached version of the module.

    Failure to delete is non-fatal: a warning is printed and the
    caller proceeds (worst case, a stale .pyc is reused).
    """
    # FIX: dropped an unused `import py_compile` — only
    # cache_from_source() is needed to locate the .pyc file.
    from importlib.util import cache_from_source
    try:
        pyc_path = cache_from_source(filepath)
        if os.path.exists(pyc_path):
            os.remove(pyc_path)
            print(f"🗑️ Purged bytecode cache: {pyc_path}")
    except Exception as e:
        # Non-fatal — just warn and continue
        print(f"⚠️ Could not purge bytecode cache: {e}")


def load_module(name: str, filepath: str):
    """
    Dynamically load a .py file as a module.

    FIX 1: Purge the .pyc cache so edits are always reflected.
    FIX 2: Evict any previously loaded version from sys.modules
           to prevent Python reusing a stale module object across
           multiple calls (e.g. when running in a REPL or test loop).

    Raises FileNotFoundError if an import spec cannot be built
    for *filepath*.
    """
    # FIX 1: delete stale bytecode
    purge_bytecode_cache(filepath)

    # FIX 2: evict from module registry
    if name in sys.modules:
        del sys.modules[name]

    spec = importlib.util.spec_from_file_location(name, filepath)
    if spec is None or spec.loader is None:
        raise FileNotFoundError(f"Cannot load module spec for: {filepath}")

    module = importlib.util.module_from_spec(spec)
    sys.modules[name] = module  # register before exec (handles self-refs)
    spec.loader.exec_module(module)
    return module


# ─────────────────────────────────────────────────────────────
# HELPERS
# ─────────────────────────────────────────────────────────────
def sorted_pages(chapter_dir: Path):
    """Return all image files in chapter_dir sorted by filename stem."""
    exts = {".jpg", ".jpeg", ".png", ".webp"}
    pages = [
        p for p in chapter_dir.iterdir()
        if p.is_file() and p.suffix.lower() in exts
    ]
    # NOTE: sort key is the string stem, so "10" sorts before "2";
    # zero-padded page names sort numerically as intended.
    return sorted(pages, key=lambda p: p.stem)


def make_page_workdir(chapter_dir: Path, page_stem: str) -> Path:
    """Create and return translated/<page_stem>/ inside chapter_dir."""
    workdir = chapter_dir / "translated" / page_stem
    workdir.mkdir(parents=True, exist_ok=True)
    return workdir


def verify_translator_api(module) -> bool:
    """
    Checks that the loaded module exposes process_manga_page()
    and that it accepts all keys defined in PIPELINE_CONFIG.
    Prints a clear warning for any missing parameter.

    Returns True when the API is compatible, False otherwise.
    """
    import inspect

    fn = getattr(module, "process_manga_page", None)
    if fn is None:
        print("❌ manga-translator.py does not expose process_manga_page()")
        return False

    sig = inspect.signature(fn)
    params = sig.parameters
    # FIX: a **kwargs catch-all accepts any keyword, so it satisfies
    # every PIPELINE_CONFIG key — previously this produced false
    # "parameter not found" warnings for kwargs-based translators.
    accepts_kwargs = any(
        p.kind is inspect.Parameter.VAR_KEYWORD for p in params.values()
    )
    ok = True
    for key in PIPELINE_CONFIG:
        if key not in params and not accepts_kwargs:
            print(
                f"⚠️ PIPELINE_CONFIG key '{key}' not found in "
                f"process_manga_page() — update pipeline or translator."
            )
            ok = False
    return ok


def sanity_check_fixes(module_path: Path) -> None:
    """
    Grep the translator source for key fix signatures and warn
    if any are missing. Helps catch cases where an edit was not
    saved. Exits the process (status 1) when a fix is absent.
    """
    checks = {
        "Fix A (gap_factor=4.0)": "gap_factor=4.0",
        "Fix B (_majority_contour_id)": "_majority_contour_id",
        "Fix C (median_inter adaptive gap)": "median_inter",
        "Fix D (merge_same_column_dialogue)": "merge_same_column_dialogue_boxes",
        "Fix E (lang_code from self.langs)": "lang_code = self.langs",
    }
    print("\n🔎 Sanity-checking fixes in manga-translator.py:")
    source = module_path.read_text(encoding="utf-8")
    all_ok = True
    for label, token in checks.items():
        found = token in source
        status = "✅" if found else "❌ MISSING"
        print(f" {status} {label}")
        if not found:
            all_ok = False
    if not all_ok:
        print(
            "\n⚠️ One or more fixes are missing from manga-translator.py.\n"
            " Save the file and re-run. Aborting.\n"
        )
        sys.exit(1)
    else:
        print(" All fixes present.\n")


# ─────────────────────────────────────────────────────────────
# PER-PAGE PIPELINE
# ─────────────────────────────────────────────────────────────
def process_page(page_path: Path, workdir: Path, translator_module) -> bool:
    """
    Run the translator on a single page image, writing all outputs
    into *workdir*. Returns True only when the translator produced
    results AND both output files exist and are non-empty.
    """
    print(f"\n{'─' * 70}")
    print(f" PAGE : {page_path.name}")
    print(f" OUT : {workdir}")
    print(f"{'─' * 70}")

    orig_dir = os.getcwd()
    try:
        os.chdir(workdir)

        # Use absolute paths so output always lands in workdir
        # regardless of any internal os.getcwd() calls.
        output_json = str(workdir / "bubbles.json")
        output_txt  = str(workdir / "output.txt")
        debug_path  = str(workdir / "debug_clusters.png")

        print(" ⏳ Extracting text and translating...")
        results = translator_module.process_manga_page(
            image_path  = str(page_path.resolve()),
            output_json = output_json,
            output_txt  = output_txt,
            **PIPELINE_CONFIG,
        )

        # ── Debug visualisation ───────────────────────────────
        # FIX: process_manga_page() already writes debug_clusters.png
        # internally with full OCR quad data.
        # We do NOT call draw_debug_clusters() here with ocr=[]
        # because that would OVERWRITE the correct debug image with
        # a degraded version that has no quad outlines.
        #
        # If process_manga_page() did not write a debug image
        # (e.g. older version), we do a minimal fallback draw.
        if results and not os.path.exists(debug_path):
            try:
                import cv2
                image_bgr = cv2.imread(str(page_path.resolve()))
                if image_bgr is not None:
                    vis_boxes: dict = {}
                    vis_lines: dict = {}
                    vis_indices: dict = {}
                    for bid_str, data in results.items():
                        bid = int(bid_str)
                        xywh = data["box"]
                        vis_boxes[bid] = (
                            xywh["x"],
                            xywh["y"],
                            xywh["x"] + xywh["w"],
                            xywh["y"] + xywh["h"],
                        )
                        vis_lines[bid] = data.get("lines", [])
                        vis_indices[bid] = []
                    # Fallback only — ocr=[] means no quad outlines
                    translator_module.draw_debug_clusters(
                        image_bgr   = image_bgr,
                        out_boxes   = vis_boxes,
                        out_lines   = vis_lines,
                        out_indices = vis_indices,
                        ocr         = [],
                        save_path   = debug_path,
                    )
                    print(f" 🖼️ Fallback debug image written → {debug_path}")
            except Exception as e:
                print(f" ⚠️ Debug visualisation failed (non-fatal): {e}")

        # ── Sanity-check output files ─────────────────────────
        all_good = True
        for fname in ("output.txt", "bubbles.json"):
            fpath = workdir / fname
            if not fpath.exists():
                print(f" ⚠️ {fname} was NOT created.")
                all_good = False
            elif fpath.stat().st_size == 0:
                print(f" ⚠️ {fname} exists but is EMPTY.")
                all_good = False
            else:
                print(f" 📄 {fname} → {fpath.stat().st_size} bytes")

        if not results:
            print(" ⚠️ process_manga_page() returned no results.")
            return False

        print(f" ✅ Done — {len(results)} box(es) processed.")
        # FIX: previously `all_good` was computed but ignored and the
        # function returned True unconditionally — missing/empty output
        # files were reported as ✅ in the batch summary.
        return all_good

    except Exception as e:
        import traceback
        print(f" ❌ Failed: {e}")
        traceback.print_exc()
        return False
    finally:
        os.chdir(orig_dir)


# ─────────────────────────────────────────────────────────────
# MAIN
# ─────────────────────────────────────────────────────────────
def main():
    """CLI entry point: parse args, load the translator, batch-process pages."""
    parser = argparse.ArgumentParser(
        description="Manga Translation OCR Batch Pipeline",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python pipeline-translator.py pages-for-tests
  python pipeline-translator.py pages-for-tests --start 2 --end 4
  python pipeline-translator.py pages-for-tests --source en --target es
"""
    )
    parser.add_argument(
        "chapter_dir",
        help="Path to the folder containing manga page images"
    )
    parser.add_argument(
        "--start", type=int, default=1,
        help="Start from this page number (1-based, default: 1)"
    )
    parser.add_argument(
        "--end", type=int, default=None,
        help="Stop after this page number inclusive (default: all)"
    )
    parser.add_argument(
        "--source", "-s", default=None,
        help=f"Override source language (default: {PIPELINE_CONFIG['source_lang']})"
    )
    parser.add_argument(
        "--target", "-t", default=None,
        help=f"Override target language (default: {PIPELINE_CONFIG['target_lang']})"
    )
    parser.add_argument(
        "--skip-sanity", action="store_true",
        help="Skip the fix sanity check (not recommended)"
    )
    args = parser.parse_args()

    # ── Apply CLI language overrides ──────────────────────────
    # Mutates the module-level dict so process_page() (which expands
    # **PIPELINE_CONFIG) sees the overrides.
    config = dict(PIPELINE_CONFIG)
    if args.source:
        config["source_lang"] = args.source
    if args.target:
        config["target_lang"] = args.target
    PIPELINE_CONFIG.update(config)

    # ── Resolve chapter directory ─────────────────────────────
    chapter_dir = Path(args.chapter_dir).resolve()
    if not chapter_dir.is_dir():
        print(f"❌ Not a directory: {chapter_dir}")
        sys.exit(1)

    # ── Locate manga-translator.py ────────────────────────────
    script_dir = Path(__file__).parent
    module_path = script_dir / "manga-translator.py"
    if not module_path.exists():
        print(f"❌ manga-translator.py not found in {script_dir}")
        sys.exit(1)

    # ── Sanity-check that all fixes are present ───────────────
    if not args.skip_sanity:
        sanity_check_fixes(module_path)

    # ── Load translator module ────────────────────────────────
    print(f"📦 Loading translator from: {module_path}")
    try:
        translator = load_module("manga_translator", str(module_path))
    except Exception as e:
        print(f"❌ Could not load manga-translator.py: {e}")
        sys.exit(1)

    # ── API compatibility check ───────────────────────────────
    if not verify_translator_api(translator):
        print("❌ Aborting — fix the parameter mismatch above first.")
        sys.exit(1)

    # ── Discover and slice pages ──────────────────────────────
    all_pages = sorted_pages(chapter_dir)
    if not all_pages:
        print(f"❌ No image files found in: {chapter_dir}")
        sys.exit(1)

    # --start/--end are 1-based inclusive; slice end is exclusive,
    # so args.end maps directly to the slice bound.
    start_idx = max(0, args.start - 1)
    end_idx = args.end if args.end is not None else len(all_pages)
    pages = all_pages[start_idx:end_idx]
    if not pages:
        print(f"❌ No pages in range [{args.start}, {args.end}]")
        sys.exit(1)

    print(f"\n📚 Chapter : {chapter_dir.name}")
    print(f" Pages : {len(pages)} of {len(all_pages)} total")
    print(f" Source : {PIPELINE_CONFIG['source_lang']}")
    print(f" Target : {PIPELINE_CONFIG['target_lang']}")
    print(f" Output : {chapter_dir / 'translated'}\n")

    # ── Process each page ─────────────────────────────────────
    results_summary = []
    for page_num, page_path in enumerate(pages, start=start_idx + 1):
        workdir = make_page_workdir(chapter_dir, page_path.stem)
        success = process_page(page_path, workdir, translator)
        results_summary.append((page_num, page_path.name, success))

    # ── Final summary ─────────────────────────────────────────
    print(f"\n{'═' * 70}")
    print(f" BATCH COMPLETE")
    print(f"{'═' * 70}")
    passed = sum(1 for _, _, ok in results_summary if ok)
    failed = len(results_summary) - passed
    for page_num, name, ok in results_summary:
        status = "✅" if ok else "❌"
        print(f" {status} [{page_num:>3}] {name}")
    print(f"\n Total: {passed} succeeded, {failed} failed")
    print(f"{'═' * 70}\n")

    if failed:
        sys.exit(1)


if __name__ == "__main__":
    main()