#!/usr/bin/env python3
"""
pipeline-translator.py
───────────────────────────────────────────────────────────────
Translation OCR pipeline (Batch Processing Only)

Usage:
    python pipeline-translator.py /path/to/chapter/folder
"""

import argparse
import importlib.util
import inspect
import os
import re
import sys
import traceback
from pathlib import Path

# ─────────────────────────────────────────────────────────────
# PIPELINE CONFIGURATION
# Single source of truth — mirrors the __main__ block in
# manga-translator.py so both entry points stay in sync.
# ─────────────────────────────────────────────────────────────
# NOTE(review): source_lang is a full name ("english") while target_lang is
# a code ("ca") — confirm this is what translate_manga_text() expects.
PIPELINE_CONFIG = dict(
    source_lang="english",
    target_lang="ca",
    confidence_threshold=0.03,
    min_text_length=1,
    gap_px="auto",
    quality_threshold=0.62,
    reading_mode="rtl",
    debug=True,
    use_enhanced_ocr=True,
    strict_grouping=True,
    max_box_width_ratio=0.6,
    max_box_height_ratio=0.5,
    auto_fix_bubbles=True,
)

# Keyword arguments process_page() passes in addition to PIPELINE_CONFIG.
_CALL_KEYWORDS = ("image_path", "export_to_file", "export_bubbles_to")

# Image suffixes (lower-case) that count as manga pages.
_PAGE_SUFFIXES = frozenset({".jpg", ".jpeg", ".png", ".webp"})

# Output files every processed page is expected to produce in its workdir.
_EXPECTED_OUTPUTS = ("output.txt", "bubbles.json")

# ASCII digit runs only, so every captured token is safe to pass to int().
_DIGIT_RUN = re.compile(r"([0-9]+)")


# ─────────────────────────────────────────────────────────────
# DYNAMIC MODULE LOADER
# ─────────────────────────────────────────────────────────────
def load_module(name: str, filepath: str):
    """Import the Python source file at *filepath* as a module called *name*.

    Needed because "manga-translator.py" contains a hyphen and therefore
    cannot be imported with a normal ``import`` statement.

    The module is registered in ``sys.modules`` before it is executed, as the
    importlib documentation recommends; code in the loaded file that looks
    itself up through ``sys.modules`` would otherwise fail. If execution
    raises, the previous ``sys.modules`` entry (or its absence) is restored.

    Raises:
        FileNotFoundError: if no import spec/loader can be built for the path.
        Exception: whatever the module's top-level code raises.
    """
    spec = importlib.util.spec_from_file_location(name, filepath)
    if spec is None or spec.loader is None:
        raise FileNotFoundError(f"Cannot load spec for {filepath}")

    module = importlib.util.module_from_spec(spec)
    previous = sys.modules.get(name)
    sys.modules[name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module behind.
        if previous is None:
            sys.modules.pop(name, None)
        else:
            sys.modules[name] = previous
        raise
    return module


# ─────────────────────────────────────────────────────────────
# HELPERS
# ─────────────────────────────────────────────────────────────
def _natural_key(path: Path):
    """Return a sort key ordering digit runs in the stem by numeric value.

    Tokens are encoded as uniform ``(kind, number, text)`` tuples so that
    numeric and textual tokens never get compared with each other directly.
    The full file name is the final tie-breaker (e.g. "1.jpg" vs "01.jpg").
    """
    tokens = []
    # re.split with a capture group alternates text, digits, text, ...
    for index, token in enumerate(_DIGIT_RUN.split(path.stem)):
        if not token:
            continue
        if index % 2:
            tokens.append((0, int(token), ""))
        else:
            tokens.append((1, 0, token.casefold()))
    return tuple(tokens), path.name


def sorted_pages(chapter_dir: Path):
    """Return the image files directly inside *chapter_dir* in page order.

    Only regular files whose suffix (case-insensitive) is .jpg, .jpeg, .png
    or .webp are returned. Ordering is natural: "2.jpg" comes before
    "10.jpg", which a plain string sort would get wrong.
    """
    pages = [
        entry
        for entry in chapter_dir.iterdir()
        if entry.is_file() and entry.suffix.lower() in _PAGE_SUFFIXES
    ]
    return sorted(pages, key=_natural_key)


def make_page_workdir(chapter_dir: Path, page_stem: str) -> Path:
    """Create (if needed) and return ``<chapter_dir>/translated/<page_stem>``."""
    workdir = chapter_dir / "translated" / page_stem
    workdir.mkdir(parents=True, exist_ok=True)
    return workdir


def verify_translator_api(module) -> bool:
    """
    Checks that the loaded module exposes translate_manga_text() and that it
    accepts, as keywords, every key in PIPELINE_CONFIG plus the explicit
    arguments process_page() passes (image_path, export_to_file,
    export_bubbles_to).

    Prints a warning for each missing parameter so mismatches are caught
    immediately instead of surfacing later as a per-page failure. A function
    that declares **kwargs is treated as accepting every key.

    Returns True when the API is compatible, False otherwise.
    """
    fn = getattr(module, "translate_manga_text", None)
    if fn is None or not callable(fn):
        print("❌ manga-translator.py does not expose translate_manga_text()")
        return False

    try:
        parameters = inspect.signature(fn).parameters
    except (TypeError, ValueError) as exc:
        print(f"❌ Cannot inspect translate_manga_text(): {exc}")
        return False

    if any(p.kind is inspect.Parameter.VAR_KEYWORD for p in parameters.values()):
        return True

    # Positional-only parameters cannot receive keyword arguments.
    keyword_kinds = (
        inspect.Parameter.POSITIONAL_OR_KEYWORD,
        inspect.Parameter.KEYWORD_ONLY,
    )
    accepted = {
        pname for pname, p in parameters.items() if p.kind in keyword_kinds
    }

    ok = True
    for key in _CALL_KEYWORDS:
        if key not in accepted:
            print(f"⚠️ Required argument '{key}' not found in "
                  f"translate_manga_text() — update pipeline or translator.")
            ok = False
    for key in PIPELINE_CONFIG:
        if key not in accepted:
            print(f"⚠️ PIPELINE_CONFIG key '{key}' not found in "
                  f"translate_manga_text() — update pipeline or translator.")
            ok = False
    return ok


# ─────────────────────────────────────────────────────────────
# PER-PAGE PIPELINE
# ─────────────────────────────────────────────────────────────
def process_page(page_path: Path, workdir: Path, translator_module) -> bool:
    """Run the translator on one page, with *workdir* as working directory.

    The process cwd is switched to *workdir* for the duration of the call and
    always restored afterwards. Returns True when translate_manga_text()
    returned without raising, False when it raised (the traceback is
    printed). Missing or empty output files produce a warning but do not
    change the return value.
    """
    print(f"\n{'─' * 70}")
    print(f" PAGE : {page_path.name}")
    print(f"{'─' * 70}")

    orig_dir = os.getcwd()
    try:
        # Run inside the page's own workdir so debug images and
        # output files land there automatically.
        os.chdir(workdir)

        print(" ⏳ Extracting text and translating...")
        translator_module.translate_manga_text(
            # Absolute path: the cwd is no longer the caller's directory.
            image_path=str(page_path.resolve()),
            export_to_file="output.txt",
            export_bubbles_to="bubbles.json",
            **PIPELINE_CONFIG,  # ← all settings from the single config dict
        )

        # Sanity-check that the expected outputs were actually written.
        all_written = True
        for fname in _EXPECTED_OUTPUTS:
            fpath = workdir / fname
            if not fpath.exists() or fpath.stat().st_size == 0:
                print(f" ⚠️ {fname} is missing or empty after processing.")
                all_written = False

        # Only claim success when there is something on disk to back it up.
        if all_written:
            print(" ✅ Translation and OCR data saved successfully")
        return True

    except Exception as e:
        # Top-level boundary for one page: report and let the batch continue.
        print(f" ❌ Failed: {e}")
        traceback.print_exc()
        return False

    finally:
        os.chdir(orig_dir)


# ─────────────────────────────────────────────────────────────
# MAIN
# ─────────────────────────────────────────────────────────────
def _select_pages(all_pages, start, end):
    """Apply the 1-based inclusive ``--start``/``--end`` window.

    Returns ``(pages, first, last)`` where *first* and *last* are the bounds
    actually used: *start* is raised to at least 1 and *end* is capped at the
    number of pages. An empty list is returned when the window is empty,
    including when *end* is zero or negative.
    """
    total = len(all_pages)
    first = max(1, start)
    last = total if end is None else min(end, total)
    if last < first:
        return [], first, last
    return all_pages[first - 1:last], first, last


def main():
    """CLI entry point: translate every page image in a chapter folder."""
    parser = argparse.ArgumentParser(
        description="Manga Translation OCR Batch Pipeline"
    )
    parser.add_argument(
        "chapter_dir",
        help="Path to the folder containing manga page images"
    )
    parser.add_argument(
        "--start", type=int, default=1,
        help="Start from this page number (1-based, default: 1)"
    )
    parser.add_argument(
        "--end", type=int, default=None,
        help="Stop after this page number inclusive (default: all)"
    )
    args = parser.parse_args()

    chapter_dir = Path(args.chapter_dir).resolve()
    if not chapter_dir.is_dir():
        print(f"❌ Not a directory: {chapter_dir}")
        sys.exit(1)

    # ── Load translator module ────────────────────────────────
    # Resolve first so the path stays valid after process_page() chdirs.
    script_dir = Path(__file__).resolve().parent
    module_path = script_dir / "manga-translator.py"
    if not module_path.exists():
        print(f"❌ manga-translator.py not found in {script_dir}")
        sys.exit(1)

    print(f"📦 Loading translator from: {module_path}")
    try:
        translator = load_module("manga_translator", str(module_path))
    except Exception as e:
        print(f"❌ Could not load manga-translator.py: {e}")
        sys.exit(1)

    # ── API compatibility check ───────────────────────────────
    if not verify_translator_api(translator):
        print("❌ Aborting — fix the parameter mismatch above first.")
        sys.exit(1)

    # ── Discover pages ────────────────────────────────────────
    all_pages = sorted_pages(chapter_dir)
    if not all_pages:
        print(f"❌ No images found in: {chapter_dir}")
        sys.exit(1)

    # Apply --start / --end slice (1-based, inclusive)
    pages, first, last = _select_pages(all_pages, args.start, args.end)
    if not pages:
        requested_end = args.end if args.end is not None else len(all_pages)
        print(f"❌ No pages in range [{args.start}, {requested_end}]")
        sys.exit(1)

    # ── Summary header ────────────────────────────────────────
    print(f"\n{'═' * 70}")
    print(f" 📖 Chapter : {chapter_dir.name}")
    print(f" 📄 Pages : {len(pages)} "
          f"(of {len(all_pages)} total, "
          f"range {first}–{last})")
    print(f" 🌐 Lang : {PIPELINE_CONFIG['source_lang']} → "
          f"{PIPELINE_CONFIG['target_lang']}")
    print(f" 📖 Read order : {PIPELINE_CONFIG['reading_mode'].upper()}")
    print(f" 🔍 Enhanced : {PIPELINE_CONFIG['use_enhanced_ocr']}")
    print(f"{'═' * 70}\n")

    succeeded, failed = [], []

    for i, page_path in enumerate(pages, start=1):
        print(f"[{i}/{len(pages)}] {page_path.name}")
        workdir = make_page_workdir(chapter_dir, page_path.stem)

        if process_page(page_path, workdir, translator):
            succeeded.append(page_path.name)
        else:
            failed.append(page_path.name)

    # ── Final report ──────────────────────────────────────────
    print(f"\n{'═' * 70}")
    print(" PIPELINE COMPLETE")
    print(f" ✅ {len(succeeded)} page(s) succeeded")
    if failed:
        print(f" ❌ {len(failed)} page(s) failed:")
        for name in failed:
            print(f" • {name}")
    print(f"{'═' * 70}\n")


if __name__ == "__main__":
    main()