Files
manga-translator/pipeline-translator.py
Guillem Hernandez Sola 3ca01dae8c Check new
2026-04-23 15:46:14 +02:00

390 lines
15 KiB
Python

#!/usr/bin/env python3
"""
pipeline-translator.py
───────────────────────────────────────────────────────────────
Translation OCR pipeline (Batch Processing Only)
Usage:
python pipeline-translator.py /path/to/chapter/folder
python pipeline-translator.py /path/to/chapter/folder --start 2 --end 5
python pipeline-translator.py /path/to/chapter/folder --source en --target es
"""
import os
import sys
import argparse
import importlib.util
from pathlib import Path
# ─────────────────────────────────────────────────────────────
# PIPELINE CONFIGURATION
# Maps to the process_manga_page() signature in manga-translator.py
# ─────────────────────────────────────────────────────────────
# Default languages forwarded as keyword arguments to
# process_manga_page() in manga-translator.py.
PIPELINE_CONFIG = {
    "source_lang": "en",
    "target_lang": "ca",
}
# ─────────────────────────────────────────────────────────────
# DYNAMIC MODULE LOADER
# FIX: Always evicts stale sys.modules entry and deletes
# __pycache__ for manga-translator.py before loading,
# so edits are ALWAYS picked up on every run.
# ─────────────────────────────────────────────────────────────
def purge_bytecode_cache(filepath: str) -> None:
    """
    Delete the compiled .pyc file for the given .py path so Python
    cannot silently use a stale cached version of the module.

    Any failure (e.g. permission error) is non-fatal: a warning is
    printed and execution continues.
    """
    # FIX: dropped the unused `import py_compile` — only
    # cache_from_source() is needed to locate the .pyc path.
    from importlib.util import cache_from_source
    try:
        pyc_path = cache_from_source(filepath)
        if os.path.exists(pyc_path):
            os.remove(pyc_path)
            print(f"🗑️ Purged bytecode cache: {pyc_path}")
    except Exception as e:
        # Non-fatal — just warn and continue
        print(f"⚠️ Could not purge bytecode cache: {e}")
def load_module(name: str, filepath: str):
    """
    Load *filepath* from disk as a fresh module registered as *name*.

    The on-disk bytecode cache is purged and any previously imported
    version is evicted from sys.modules first, so the source currently
    saved on disk is always the code that runs — even when this is
    called repeatedly from a REPL or a test loop.

    Raises FileNotFoundError when no import spec can be built for the path.
    """
    # Ensure no stale .pyc can shadow the current source file.
    purge_bytecode_cache(filepath)
    # Discard any module object left over from an earlier load.
    sys.modules.pop(name, None)
    module_spec = importlib.util.spec_from_file_location(name, filepath)
    if module_spec is None or module_spec.loader is None:
        raise FileNotFoundError(f"Cannot load module spec for: {filepath}")
    fresh = importlib.util.module_from_spec(module_spec)
    # Register before executing so self-referential imports resolve.
    sys.modules[name] = fresh
    module_spec.loader.exec_module(fresh)
    return fresh
# ─────────────────────────────────────────────────────────────
# HELPERS
# ─────────────────────────────────────────────────────────────
def sorted_pages(chapter_dir: Path):
    """Return every image file directly inside *chapter_dir*, ordered by stem."""
    allowed = {".jpg", ".jpeg", ".png", ".webp"}
    images = (
        entry
        for entry in chapter_dir.iterdir()
        if entry.is_file() and entry.suffix.lower() in allowed
    )
    return sorted(images, key=lambda entry: entry.stem)
def make_page_workdir(chapter_dir: Path, page_stem: str) -> Path:
    """Ensure translated/<page_stem>/ exists under *chapter_dir*; return it."""
    target = chapter_dir / "translated" / page_stem
    # exist_ok makes repeated runs over the same chapter idempotent.
    target.mkdir(parents=True, exist_ok=True)
    return target
def verify_translator_api(module) -> bool:
    """
    Check that *module* exposes process_manga_page() and that the function
    can accept every key defined in PIPELINE_CONFIG as a keyword argument.

    Prints a clear warning for each incompatible key.

    Returns:
        True when the API is compatible, False otherwise.
    """
    import inspect
    fn = getattr(module, "process_manga_page", None)
    if fn is None:
        print("❌ manga-translator.py does not expose process_manga_page()")
        return False
    sig = inspect.signature(fn)
    params = sig.parameters
    # FIX: a **kwargs parameter accepts any keyword argument, so config
    # keys missing from the explicit parameter list are still valid when
    # the function declares VAR_KEYWORD. The old check rejected that.
    accepts_var_kw = any(
        p.kind is inspect.Parameter.VAR_KEYWORD for p in params.values()
    )
    ok = True
    for key in PIPELINE_CONFIG:
        if key not in params and not accepts_var_kw:
            print(
                f"⚠️ PIPELINE_CONFIG key '{key}' not found in "
                f"process_manga_page() — update pipeline or translator."
            )
            ok = False
    return ok
def sanity_check_fixes(module_path: Path) -> None:
    """
    Scan the translator source for the signature token of each known fix
    and abort (exit code 1) if any token is absent — this catches the case
    where an edit to manga-translator.py was never saved.
    """
    # Label → literal token that must appear in the source text.
    checks = {
        "Fix A (gap_factor=4.0)": "gap_factor=4.0",
        "Fix B (_majority_contour_id)": "_majority_contour_id",
        "Fix C (median_inter adaptive gap)": "median_inter",
        "Fix D (merge_same_column_dialogue)": "merge_same_column_dialogue_boxes",
        "Fix E (lang_code from self.langs)": "lang_code = self.langs",
    }
    print("\n🔎 Sanity-checking fixes in manga-translator.py:")
    text = module_path.read_text(encoding="utf-8")
    missing_any = False
    for label, token in checks.items():
        present = token in text
        status = "" if present else "❌ MISSING"
        print(f" {status} {label}")
        missing_any = missing_any or not present
    if missing_any:
        print(
            "\n⚠️ One or more fixes are missing from manga-translator.py.\n"
            " Save the file and re-run. Aborting.\n"
        )
        sys.exit(1)
    print(" All fixes present.\n")
# ─────────────────────────────────────────────────────────────
# PER-PAGE PIPELINE
# ─────────────────────────────────────────────────────────────
def process_page(page_path: Path, workdir: Path, translator_module) -> bool:
    """
    Run the OCR/translation pipeline on a single page image.

    Temporarily chdirs into *workdir* (restored in the finally clause),
    calls translator_module.process_manga_page() with absolute paths and
    the module-level PIPELINE_CONFIG, optionally draws a fallback debug
    image, and checks that output.txt / bubbles.json were written.

    Returns True when process_manga_page() produced results, False on any
    failure (the exception is printed, never propagated).
    """
    print(f"\n{'' * 70}")
    print(f" PAGE : {page_path.name}")
    print(f" OUT : {workdir}")
    print(f"{'' * 70}")
    # Remember the caller's cwd so the chdir below can be undone.
    orig_dir = os.getcwd()
    try:
        os.chdir(workdir)
        # Use absolute paths so output always lands in workdir
        # regardless of any internal os.getcwd() calls.
        output_json = str(workdir / "bubbles.json")
        output_txt = str(workdir / "output.txt")
        debug_path = str(workdir / "debug_clusters.png")
        print(" ⏳ Extracting text and translating...")
        results = translator_module.process_manga_page(
            image_path = str(page_path.resolve()),
            output_json = output_json,
            output_txt = output_txt,
            **PIPELINE_CONFIG,
        )
        # ── Debug visualisation ───────────────────────────────
        # FIX: process_manga_page() already writes debug_clusters.png
        # internally with full OCR quad data.
        # We do NOT call draw_debug_clusters() here with ocr=[]
        # because that would OVERWRITE the correct debug image with
        # a degraded version that has no quad outlines.
        #
        # If process_manga_page() did not write a debug image
        # (e.g. older version), we do a minimal fallback draw.
        if results and not os.path.exists(debug_path):
            try:
                # Imported lazily so the pipeline still runs when the
                # debug draw is not reached or cv2 is unavailable.
                import cv2
                image_bgr = cv2.imread(str(page_path.resolve()))
                if image_bgr is not None:
                    # Rebuild the minimal box/line structures that
                    # draw_debug_clusters() expects from the results dict.
                    vis_boxes: dict = {}
                    vis_lines: dict = {}
                    vis_indices: dict = {}
                    for bid_str, data in results.items():
                        bid = int(bid_str)
                        # NOTE(review): assumes data["box"] holds x/y/w/h
                        # keys — confirm against process_manga_page().
                        xywh = data["box"]
                        # Convert x/y/w/h to the (x1, y1, x2, y2) corners.
                        vis_boxes[bid] = (
                            xywh["x"],
                            xywh["y"],
                            xywh["x"] + xywh["w"],
                            xywh["y"] + xywh["h"],
                        )
                        vis_lines[bid] = data.get("lines", [])
                        vis_indices[bid] = []
                    # Fallback only — ocr=[] means no quad outlines
                    translator_module.draw_debug_clusters(
                        image_bgr = image_bgr,
                        out_boxes = vis_boxes,
                        out_lines = vis_lines,
                        out_indices = vis_indices,
                        ocr = [],
                        save_path = debug_path,
                    )
                    print(f" 🖼️ Fallback debug image written → {debug_path}")
            except Exception as e:
                # Debug image is best-effort; never fail the page for it.
                print(f" ⚠️ Debug visualisation failed (non-fatal): {e}")
        # ── Sanity-check output files ─────────────────────────
        # NOTE(review): all_good is computed for the warnings below but
        # never affects the return value — only `results` does.
        all_good = True
        for fname in ("output.txt", "bubbles.json"):
            fpath = workdir / fname
            if not fpath.exists():
                print(f" ⚠️ {fname} was NOT created.")
                all_good = False
            elif fpath.stat().st_size == 0:
                print(f" ⚠️ {fname} exists but is EMPTY.")
                all_good = False
            else:
                print(f" 📄 {fname}{fpath.stat().st_size} bytes")
        if not results:
            print(" ⚠️ process_manga_page() returned no results.")
            return False
        print(f" ✅ Done — {len(results)} box(es) processed.")
        return True
    except Exception as e:
        import traceback
        print(f" ❌ Failed: {e}")
        traceback.print_exc()
        return False
    finally:
        # Always restore the caller's working directory.
        os.chdir(orig_dir)
# ─────────────────────────────────────────────────────────────
# MAIN
# ─────────────────────────────────────────────────────────────
def main():
    """
    CLI entry point for the batch pipeline.

    Parses arguments, applies language overrides to PIPELINE_CONFIG,
    sanity-checks and dynamically loads manga-translator.py, then runs
    process_page() over the selected page range and prints a summary.
    Exits with status 1 on any fatal setup error or if any page failed.
    """
    parser = argparse.ArgumentParser(
        description="Manga Translation OCR Batch Pipeline",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python pipeline-translator.py pages-for-tests
python pipeline-translator.py pages-for-tests --start 2 --end 4
python pipeline-translator.py pages-for-tests --source en --target es
"""
    )
    parser.add_argument(
        "chapter_dir",
        help="Path to the folder containing manga page images"
    )
    parser.add_argument(
        "--start", type=int, default=1,
        help="Start from this page number (1-based, default: 1)"
    )
    parser.add_argument(
        "--end", type=int, default=None,
        help="Stop after this page number inclusive (default: all)"
    )
    parser.add_argument(
        "--source", "-s", default=None,
        help=f"Override source language (default: {PIPELINE_CONFIG['source_lang']})"
    )
    parser.add_argument(
        "--target", "-t", default=None,
        help=f"Override target language (default: {PIPELINE_CONFIG['target_lang']})"
    )
    parser.add_argument(
        "--skip-sanity", action="store_true",
        help="Skip the fix sanity check (not recommended)"
    )
    args = parser.parse_args()
    # ── Apply CLI language overrides ─────────────────────────
    # NOTE: mutates the module-level PIPELINE_CONFIG so that
    # process_page(), which unpacks it, sees the overrides.
    config = dict(PIPELINE_CONFIG)
    if args.source:
        config["source_lang"] = args.source
    if args.target:
        config["target_lang"] = args.target
    PIPELINE_CONFIG.update(config)
    # ── Resolve chapter directory ─────────────────────────────
    chapter_dir = Path(args.chapter_dir).resolve()
    if not chapter_dir.is_dir():
        print(f"❌ Not a directory: {chapter_dir}")
        sys.exit(1)
    # ── Locate manga-translator.py ────────────────────────────
    # The translator module is expected next to this script.
    script_dir = Path(__file__).parent
    module_path = script_dir / "manga-translator.py"
    if not module_path.exists():
        print(f"❌ manga-translator.py not found in {script_dir}")
        sys.exit(1)
    # ── Sanity-check that all fixes are present ───────────────
    if not args.skip_sanity:
        sanity_check_fixes(module_path)
    # ── Load translator module ────────────────────────────────
    print(f"📦 Loading translator from: {module_path}")
    try:
        translator = load_module("manga_translator", str(module_path))
    except Exception as e:
        print(f"❌ Could not load manga-translator.py: {e}")
        sys.exit(1)
    # ── API compatibility check ───────────────────────────────
    if not verify_translator_api(translator):
        print("❌ Aborting — fix the parameter mismatch above first.")
        sys.exit(1)
    # ── Discover and slice pages ──────────────────────────────
    all_pages = sorted_pages(chapter_dir)
    if not all_pages:
        print(f"❌ No image files found in: {chapter_dir}")
        sys.exit(1)
    # Convert the 1-based inclusive CLI range to a 0-based slice:
    # --start N → index N-1; --end N is inclusive, so it is the slice stop.
    start_idx = max(0, args.start - 1)
    end_idx = args.end if args.end is not None else len(all_pages)
    pages = all_pages[start_idx:end_idx]
    if not pages:
        print(f"❌ No pages in range [{args.start}, {args.end}]")
        sys.exit(1)
    print(f"\n📚 Chapter : {chapter_dir.name}")
    print(f" Pages : {len(pages)} of {len(all_pages)} total")
    print(f" Source : {PIPELINE_CONFIG['source_lang']}")
    print(f" Target : {PIPELINE_CONFIG['target_lang']}")
    print(f" Output : {chapter_dir / 'translated'}\n")
    # ── Process each page ─────────────────────────────────────
    results_summary = []
    for page_num, page_path in enumerate(pages, start=start_idx + 1):
        workdir = make_page_workdir(chapter_dir, page_path.stem)
        success = process_page(page_path, workdir, translator)
        results_summary.append((page_num, page_path.name, success))
    # ── Final summary ─────────────────────────────────────────
    print(f"\n{'' * 70}")
    print(f" BATCH COMPLETE")
    print(f"{'' * 70}")
    passed = sum(1 for _, _, ok in results_summary if ok)
    failed = len(results_summary) - passed
    for page_num, name, ok in results_summary:
        status = "" if ok else ""
        print(f" {status} [{page_num:>3}] {name}")
    print(f"\n Total: {passed} succeeded, {failed} failed")
    print(f"{'' * 70}\n")
    # Non-zero exit lets callers (CI, shell scripts) detect failures.
    if failed:
        sys.exit(1)
if __name__ == "__main__":
    main()