Files
manga-translator/pipeline-translator.py
Guillem Hernandez Sola 3ca01dae8c Check new
2026-04-23 15:46:14 +02:00

390 lines
15 KiB
Python

#!/usr/bin/env python3
"""
pipeline-translator.py
───────────────────────────────────────────────────────────────
Translation OCR pipeline (Batch Processing Only)
Usage:
python pipeline-translator.py /path/to/chapter/folder
python pipeline-translator.py /path/to/chapter/folder --start 2 --end 5
python pipeline-translator.py /path/to/chapter/folder --source en --target es
"""
import os
import sys
import argparse
import importlib.util
from pathlib import Path
# ─────────────────────────────────────────────────────────────
# PIPELINE CONFIGURATION
# Maps to the process_manga_page() signature in manga-translator.py
# ─────────────────────────────────────────────────────────────
# Default languages forwarded as keyword arguments to
# process_manga_page() in manga-translator.py.
PIPELINE_CONFIG = {
    "source_lang": "en",
    "target_lang": "ca",
}
# ─────────────────────────────────────────────────────────────
# DYNAMIC MODULE LOADER
# FIX: Always evicts stale sys.modules entry and deletes
# __pycache__ for manga-translator.py before loading,
# so edits are ALWAYS picked up on every run.
# ─────────────────────────────────────────────────────────────
def purge_bytecode_cache(filepath: str) -> None:
    """
    Delete the compiled .pyc file for the given .py path so Python
    cannot silently use a stale cached version of the module.

    Any failure (e.g. permission error) is non-fatal: a warning is
    printed and execution continues.
    """
    # FIX: dropped the unused `import py_compile` — only
    # cache_from_source() is needed to locate the .pyc path.
    from importlib.util import cache_from_source
    try:
        pyc_path = cache_from_source(filepath)
        if os.path.exists(pyc_path):
            os.remove(pyc_path)
            print(f"🗑️ Purged bytecode cache: {pyc_path}")
    except Exception as e:
        # Non-fatal — just warn and continue
        print(f"⚠️ Could not purge bytecode cache: {e}")
def load_module(name: str, filepath: str):
    """
    Load *filepath* from disk as a fresh module registered as *name*.

    The on-disk bytecode cache is purged and any previously imported
    version is evicted from sys.modules first, so the source currently
    saved on disk is always the code that runs — even when this is
    called repeatedly from a REPL or a test loop.

    Raises FileNotFoundError when no import spec can be built for the path.
    """
    # Ensure no stale .pyc can shadow the current source file.
    purge_bytecode_cache(filepath)
    # Discard any module object left over from an earlier load.
    sys.modules.pop(name, None)
    module_spec = importlib.util.spec_from_file_location(name, filepath)
    if module_spec is None or module_spec.loader is None:
        raise FileNotFoundError(f"Cannot load module spec for: {filepath}")
    fresh = importlib.util.module_from_spec(module_spec)
    # Register before executing so self-referential imports resolve.
    sys.modules[name] = fresh
    module_spec.loader.exec_module(fresh)
    return fresh
# ─────────────────────────────────────────────────────────────
# HELPERS
# ─────────────────────────────────────────────────────────────
def sorted_pages(chapter_dir: Path):
    """Return every image file directly inside *chapter_dir*, ordered by stem."""
    allowed = {".jpg", ".jpeg", ".png", ".webp"}
    images = (
        entry
        for entry in chapter_dir.iterdir()
        if entry.is_file() and entry.suffix.lower() in allowed
    )
    return sorted(images, key=lambda entry: entry.stem)
def make_page_workdir(chapter_dir: Path, page_stem: str) -> Path:
    """Ensure translated/<page_stem>/ exists under *chapter_dir*; return it."""
    target = chapter_dir / "translated" / page_stem
    # exist_ok makes repeated runs over the same chapter idempotent.
    target.mkdir(parents=True, exist_ok=True)
    return target
def verify_translator_api(module) -> bool:
    """
    Check that *module* exposes process_manga_page() and that the function
    can accept every key defined in PIPELINE_CONFIG as a keyword argument.

    Prints a clear warning for each incompatible key.

    Returns:
        True when the API is compatible, False otherwise.
    """
    import inspect
    fn = getattr(module, "process_manga_page", None)
    if fn is None:
        print("❌ manga-translator.py does not expose process_manga_page()")
        return False
    sig = inspect.signature(fn)
    params = sig.parameters
    # FIX: a **kwargs parameter accepts any keyword argument, so config
    # keys missing from the explicit parameter list are still valid when
    # the function declares VAR_KEYWORD. The old check rejected that.
    accepts_var_kw = any(
        p.kind is inspect.Parameter.VAR_KEYWORD for p in params.values()
    )
    ok = True
    for key in PIPELINE_CONFIG:
        if key not in params and not accepts_var_kw:
            print(
                f"⚠️ PIPELINE_CONFIG key '{key}' not found in "
                f"process_manga_page() — update pipeline or translator."
            )
            ok = False
    return ok
def sanity_check_fixes(module_path: Path) -> None:
    """
    Scan the translator source for the signature token of each known fix
    and abort (exit code 1) if any token is absent — this catches the case
    where an edit to manga-translator.py was never saved.
    """
    # Label → literal token that must appear in the source text.
    checks = {
        "Fix A (gap_factor=4.0)": "gap_factor=4.0",
        "Fix B (_majority_contour_id)": "_majority_contour_id",
        "Fix C (median_inter adaptive gap)": "median_inter",
        "Fix D (merge_same_column_dialogue)": "merge_same_column_dialogue_boxes",
        "Fix E (lang_code from self.langs)": "lang_code = self.langs",
    }
    print("\n🔎 Sanity-checking fixes in manga-translator.py:")
    text = module_path.read_text(encoding="utf-8")
    missing_any = False
    for label, token in checks.items():
        present = token in text
        status = "" if present else "❌ MISSING"
        print(f" {status} {label}")
        missing_any = missing_any or not present
    if missing_any:
        print(
            "\n⚠️ One or more fixes are missing from manga-translator.py.\n"
            " Save the file and re-run. Aborting.\n"
        )
        sys.exit(1)
    print(" All fixes present.\n")
# ─────────────────────────────────────────────────────────────
# PER-PAGE PIPELINE
# ─────────────────────────────────────────────────────────────
def process_page(page_path: Path, workdir: Path, translator_module) -> bool:
    """
    Run the OCR/translation pipeline on a single page image.

    Temporarily chdirs into *workdir* (restored in the finally clause),
    calls translator_module.process_manga_page() with absolute paths and
    the module-level PIPELINE_CONFIG, optionally draws a fallback debug
    image, and checks that output.txt / bubbles.json were written.

    Returns True when process_manga_page() produced results, False on any
    failure (the exception is printed, never propagated).
    """
    print(f"\n{'' * 70}")
    print(f" PAGE : {page_path.name}")
    print(f" OUT : {workdir}")
    print(f"{'' * 70}")
    # Remember the caller's cwd so the chdir below can be undone.
    orig_dir = os.getcwd()
    try:
        os.chdir(workdir)
        # Use absolute paths so output always lands in workdir
        # regardless of any internal os.getcwd() calls.
        output_json = str(workdir / "bubbles.json")
        output_txt = str(workdir / "output.txt")
        debug_path = str(workdir / "debug_clusters.png")
        print(" ⏳ Extracting text and translating...")
        results = translator_module.process_manga_page(
            image_path = str(page_path.resolve()),
            output_json = output_json,
            output_txt = output_txt,
            **PIPELINE_CONFIG,
        )
        # ── Debug visualisation ───────────────────────────────
        # FIX: process_manga_page() already writes debug_clusters.png
        # internally with full OCR quad data.
        # We do NOT call draw_debug_clusters() here with ocr=[]
        # because that would OVERWRITE the correct debug image with
        # a degraded version that has no quad outlines.
        #
        # If process_manga_page() did not write a debug image
        # (e.g. older version), we do a minimal fallback draw.
        if results and not os.path.exists(debug_path):
            try:
                # Imported lazily so the pipeline still runs when the
                # debug draw is not reached or cv2 is unavailable.
                import cv2
                image_bgr = cv2.imread(str(page_path.resolve()))
                if image_bgr is not None:
                    # Rebuild the minimal box/line structures that
                    # draw_debug_clusters() expects from the results dict.
                    vis_boxes: dict = {}
                    vis_lines: dict = {}
                    vis_indices: dict = {}
                    for bid_str, data in results.items():
                        bid = int(bid_str)
                        # NOTE(review): assumes data["box"] holds x/y/w/h
                        # keys — confirm against process_manga_page().
                        xywh = data["box"]
                        # Convert x/y/w/h to the (x1, y1, x2, y2) corners.
                        vis_boxes[bid] = (
                            xywh["x"],
                            xywh["y"],
                            xywh["x"] + xywh["w"],
                            xywh["y"] + xywh["h"],
                        )
                        vis_lines[bid] = data.get("lines", [])
                        vis_indices[bid] = []
                    # Fallback only — ocr=[] means no quad outlines
                    translator_module.draw_debug_clusters(
                        image_bgr = image_bgr,
                        out_boxes = vis_boxes,
                        out_lines = vis_lines,
                        out_indices = vis_indices,
                        ocr = [],
                        save_path = debug_path,
                    )
                    print(f" 🖼️ Fallback debug image written → {debug_path}")
            except Exception as e:
                # Debug image is best-effort; never fail the page for it.
                print(f" ⚠️ Debug visualisation failed (non-fatal): {e}")
        # ── Sanity-check output files ─────────────────────────
        # NOTE(review): all_good is computed for the warnings below but
        # never affects the return value — only `results` does.
        all_good = True
        for fname in ("output.txt", "bubbles.json"):
            fpath = workdir / fname
            if not fpath.exists():
                print(f" ⚠️ {fname} was NOT created.")
                all_good = False
            elif fpath.stat().st_size == 0:
                print(f" ⚠️ {fname} exists but is EMPTY.")
                all_good = False
            else:
                print(f" 📄 {fname}{fpath.stat().st_size} bytes")
        if not results:
            print(" ⚠️ process_manga_page() returned no results.")
            return False
        print(f" ✅ Done — {len(results)} box(es) processed.")
        return True
    except Exception as e:
        import traceback
        print(f" ❌ Failed: {e}")
        traceback.print_exc()
        return False
    finally:
        # Always restore the caller's working directory.
        os.chdir(orig_dir)
# ─────────────────────────────────────────────────────────────
# MAIN
# ─────────────────────────────────────────────────────────────
def main():
    """
    CLI entry point for the batch pipeline.

    Parses arguments, applies language overrides to PIPELINE_CONFIG,
    sanity-checks and dynamically loads manga-translator.py, then runs
    process_page() over the selected page range and prints a summary.
    Exits with status 1 on any fatal setup error or if any page failed.
    """
    parser = argparse.ArgumentParser(
        description="Manga Translation OCR Batch Pipeline",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python pipeline-translator.py pages-for-tests
python pipeline-translator.py pages-for-tests --start 2 --end 4
python pipeline-translator.py pages-for-tests --source en --target es
"""
    )
    parser.add_argument(
        "chapter_dir",
        help="Path to the folder containing manga page images"
    )
    parser.add_argument(
        "--start", type=int, default=1,
        help="Start from this page number (1-based, default: 1)"
    )
    parser.add_argument(
        "--end", type=int, default=None,
        help="Stop after this page number inclusive (default: all)"
    )
    parser.add_argument(
        "--source", "-s", default=None,
        help=f"Override source language (default: {PIPELINE_CONFIG['source_lang']})"
    )
    parser.add_argument(
        "--target", "-t", default=None,
        help=f"Override target language (default: {PIPELINE_CONFIG['target_lang']})"
    )
    parser.add_argument(
        "--skip-sanity", action="store_true",
        help="Skip the fix sanity check (not recommended)"
    )
    args = parser.parse_args()
    # ── Apply CLI language overrides ─────────────────────────
    # NOTE: mutates the module-level PIPELINE_CONFIG so that
    # process_page(), which unpacks it, sees the overrides.
    config = dict(PIPELINE_CONFIG)
    if args.source:
        config["source_lang"] = args.source
    if args.target:
        config["target_lang"] = args.target
    PIPELINE_CONFIG.update(config)
    # ── Resolve chapter directory ─────────────────────────────
    chapter_dir = Path(args.chapter_dir).resolve()
    if not chapter_dir.is_dir():
        print(f"❌ Not a directory: {chapter_dir}")
        sys.exit(1)
    # ── Locate manga-translator.py ────────────────────────────
    # The translator module is expected next to this script.
    script_dir = Path(__file__).parent
    module_path = script_dir / "manga-translator.py"
    if not module_path.exists():
        print(f"❌ manga-translator.py not found in {script_dir}")
        sys.exit(1)
    # ── Sanity-check that all fixes are present ───────────────
    if not args.skip_sanity:
        sanity_check_fixes(module_path)
    # ── Load translator module ────────────────────────────────
    print(f"📦 Loading translator from: {module_path}")
    try:
        translator = load_module("manga_translator", str(module_path))
    except Exception as e:
        print(f"❌ Could not load manga-translator.py: {e}")
        sys.exit(1)
    # ── API compatibility check ───────────────────────────────
    if not verify_translator_api(translator):
        print("❌ Aborting — fix the parameter mismatch above first.")
        sys.exit(1)
    # ── Discover and slice pages ──────────────────────────────
    all_pages = sorted_pages(chapter_dir)
    if not all_pages:
        print(f"❌ No image files found in: {chapter_dir}")
        sys.exit(1)
    # Convert the 1-based inclusive CLI range to a 0-based slice:
    # --start N → index N-1; --end N is inclusive, so it is the slice stop.
    start_idx = max(0, args.start - 1)
    end_idx = args.end if args.end is not None else len(all_pages)
    pages = all_pages[start_idx:end_idx]
    if not pages:
        print(f"❌ No pages in range [{args.start}, {args.end}]")
        sys.exit(1)
    print(f"\n📚 Chapter : {chapter_dir.name}")
    print(f" Pages : {len(pages)} of {len(all_pages)} total")
    print(f" Source : {PIPELINE_CONFIG['source_lang']}")
    print(f" Target : {PIPELINE_CONFIG['target_lang']}")
    print(f" Output : {chapter_dir / 'translated'}\n")
    # ── Process each page ─────────────────────────────────────
    results_summary = []
    for page_num, page_path in enumerate(pages, start=start_idx + 1):
        workdir = make_page_workdir(chapter_dir, page_path.stem)
        success = process_page(page_path, workdir, translator)
        results_summary.append((page_num, page_path.name, success))
    # ── Final summary ─────────────────────────────────────────
    print(f"\n{'' * 70}")
    print(f" BATCH COMPLETE")
    print(f"{'' * 70}")
    passed = sum(1 for _, _, ok in results_summary if ok)
    failed = len(results_summary) - passed
    for page_num, name, ok in results_summary:
        status = "" if ok else ""
        print(f" {status} [{page_num:>3}] {name}")
    print(f"\n Total: {passed} succeeded, {failed} failed")
    print(f"{'' * 70}\n")
    # Non-zero exit lets callers (CI, shell scripts) detect failures.
    if failed:
        sys.exit(1)
if __name__ == "__main__":
    main()