#!/usr/bin/env python3
"""
pipeline-translator.py
───────────────────────────────────────────────────────────────
Translation OCR pipeline (Batch Processing Only)

Usage:
    python pipeline-translator.py /path/to/chapter/folder
"""
||
import argparse
import importlib.util
import os
import re
import sys
from pathlib import Path
||
|
||
# ─────────────────────────────────────────────────────────────
# PIPELINE CONFIGURATION
# Keys map 1:1 onto keyword parameters of process_manga_page()
# in manga-translator.py; verify_translator_api() enforces this.
# ─────────────────────────────────────────────────────────────
PIPELINE_CONFIG = {
    "source_lang": "en",
    "target_lang": "ca",
}
|
||
|
||
# ─────────────────────────────────────────────────────────────
# DYNAMIC MODULE LOADER
# ─────────────────────────────────────────────────────────────
def load_module(name: str, filepath: str):
    """Load the Python source file *filepath* as a module named *name*.

    The module is registered in ``sys.modules`` *before* execution, as
    recommended by the importlib documentation, so code inside the
    module that looks itself up by name (dataclasses, pickling,
    ``sys.modules[__name__]`` access) works during import. On failure
    the half-initialised entry is removed again.

    Args:
        name: Module name to register the loaded module under.
        filepath: Path to the ``.py`` source file.

    Returns:
        The fully executed module object.

    Raises:
        FileNotFoundError: If no import spec can be built for *filepath*.
    """
    spec = importlib.util.spec_from_file_location(name, filepath)
    if spec is None or spec.loader is None:
        raise FileNotFoundError(f"Cannot load spec for {filepath}")
    module = importlib.util.module_from_spec(spec)
    # Register before exec_module() so the module can find itself by name.
    sys.modules[name] = module
    try:
        spec.loader.exec_module(module)
    except Exception:
        # Don't leave a broken, partially-executed module in the cache.
        sys.modules.pop(name, None)
        raise
    return module
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────
# HELPERS
# ─────────────────────────────────────────────────────────────
def _natural_sort_key(stem: str):
    """Split *stem* into text/number runs so 'page2' sorts before 'page10'."""
    return [int(part) if part.isdigit() else part.lower()
            for part in re.split(r"(\d+)", stem)]


def sorted_pages(chapter_dir: Path):
    """Return the page-image files in *chapter_dir*, naturally sorted.

    Only regular files with common raster-image extensions (.jpg,
    .jpeg, .png, .webp — case-insensitive) are considered. Sorting is
    numeric-aware on the filename stem, fixing the lexicographic bug
    where 'page10' would come before 'page2'.

    Args:
        chapter_dir: Directory containing the chapter's page images.

    Returns:
        List of Paths in reading order.
    """
    exts = {".jpg", ".jpeg", ".png", ".webp"}
    pages = [
        p for p in chapter_dir.iterdir()
        if p.is_file() and p.suffix.lower() in exts
    ]
    return sorted(pages, key=lambda p: _natural_sort_key(p.stem))
|
||
|
||
|
||
def make_page_workdir(chapter_dir: Path, page_stem: str) -> Path:
    """Create (if needed) and return the per-page output directory.

    The directory is ``<chapter_dir>/translated/<page_stem>``; parent
    directories are created as required, and an existing directory is
    reused as-is.
    """
    page_dir = chapter_dir.joinpath("translated", page_stem)
    page_dir.mkdir(parents=True, exist_ok=True)
    return page_dir
|
||
|
||
|
||
def verify_translator_api(module, config=None) -> bool:
    """
    Check that *module* exposes process_manga_page() and that the
    callable accepts every key in *config* (defaults to PIPELINE_CONFIG,
    preserving the original call signature).

    A translator whose signature takes ``**kwargs`` is accepted for any
    key: VAR_KEYWORD parameters never appear by name in the signature,
    so the original name-only membership test wrongly rejected such
    implementations.

    Prints a warning for each missing parameter so mismatches are
    caught immediately rather than silently falling back to defaults.

    Returns:
        True if the API is compatible, False otherwise.
    """
    import inspect

    if config is None:
        config = PIPELINE_CONFIG

    fn = getattr(module, "process_manga_page", None)
    if fn is None:
        print("❌ manga-translator.py does not expose process_manga_page()")
        return False

    sig = inspect.signature(fn)
    params = sig.parameters
    # A **kwargs parameter swallows any keyword argument, so every
    # config key is implicitly accepted.
    accepts_var_kw = any(
        p.kind is inspect.Parameter.VAR_KEYWORD for p in params.values()
    )

    ok = True
    for key in config:
        if key not in params and not accepts_var_kw:
            print(
                f"⚠️ PIPELINE_CONFIG key '{key}' not found in "
                f"process_manga_page() — update pipeline or translator."
            )
            ok = False

    return ok
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────
# PER-PAGE PIPELINE
# ─────────────────────────────────────────────────────────────
def _write_debug_overlay(page_path: Path, results, translator_module,
                         debug_path: str) -> None:
    """Best-effort debug visualisation of detected boxes; never raises.

    Reconstructs the box/line structures draw_debug_clusters() expects
    from the results dict and saves the annotated image to *debug_path*.
    Any failure (missing cv2, unreadable image, drawing error) is
    reported as a non-fatal warning.
    """
    try:
        import cv2

        image_bgr = cv2.imread(str(page_path.resolve()))
        if image_bgr is not None:
            # Reconstruct vis_boxes / vis_lines from the results dict.
            vis_boxes = {}
            vis_lines = {}
            vis_indices = {}

            for bid_str, data in results.items():
                bid = int(bid_str)
                xywh = data["box"]
                # Convert x/y/w/h to the (x1, y1, x2, y2) corner form.
                vis_boxes[bid] = (
                    xywh["x"],
                    xywh["y"],
                    xywh["x"] + xywh["w"],
                    xywh["y"] + xywh["h"],
                )
                vis_lines[bid] = data.get("lines", [])
                vis_indices[bid] = []

            translator_module.draw_debug_clusters(
                image_bgr=image_bgr,
                out_boxes=vis_boxes,
                out_lines=vis_lines,
                out_indices=vis_indices,
                ocr=[],
                save_path=debug_path,
            )
    except Exception as e:
        print(f" ⚠️ Debug visualisation failed (non-fatal): {e}")


def process_page(page_path: Path, workdir: Path, translator_module) -> bool:
    """Run the translator pipeline on a single page image.

    Temporarily chdirs into *workdir* so any auxiliary files the
    translator writes land there; the original cwd is always restored
    via the finally block, even on failure.

    Args:
        page_path: Image file for this page.
        workdir: Per-page output directory (bubbles.json, output.txt,
            debug_clusters.png are written here).
        translator_module: Loaded manga-translator module exposing
            process_manga_page() (and optionally draw_debug_clusters()).

    Returns:
        True if process_manga_page() returned a non-empty result,
        False on empty results or any exception.
    """
    print(f"\n{'─' * 70}")
    print(f" PAGE : {page_path.name}")
    print(f"{'─' * 70}")

    orig_dir = os.getcwd()
    try:
        # Run inside the page's own workdir so debug images and
        # output files land there automatically.
        os.chdir(workdir)

        output_json = str(workdir / "bubbles.json")
        output_txt = str(workdir / "output.txt")
        debug_path = str(workdir / "debug_clusters.png")

        print(" ⏳ Extracting text and translating...")

        results = translator_module.process_manga_page(
            image_path=str(page_path.resolve()),
            output_json=output_json,
            output_txt=output_txt,
            **PIPELINE_CONFIG,
        )

        # ── Optional debug visualisation ─────────────────────
        if results:
            _write_debug_overlay(page_path, results, translator_module,
                                 debug_path)

        # ── Sanity-check outputs ──────────────────────────────
        for fname in ("output.txt", "bubbles.json"):
            fpath = workdir / fname
            if not fpath.exists() or fpath.stat().st_size == 0:
                print(f" ⚠️ {fname} is missing or empty after processing.")

        if not results:
            print(" ⚠️ process_manga_page() returned no results.")
            return False

        print(f" ✅ Done — {len(results)} box(es) processed.")
        return True

    except Exception as e:
        import traceback
        print(f" ❌ Failed: {e}")
        traceback.print_exc()
        return False

    finally:
        # Always restore the caller's working directory.
        os.chdir(orig_dir)
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────
# MAIN
# ─────────────────────────────────────────────────────────────
def main() -> None:
    """CLI entry point: parse arguments, load the translator module,
    and batch-process every page image in the chapter directory.

    Exits with status 1 on any setup failure (bad directory, missing
    translator module, API mismatch, no pages in range).
    """
    parser = argparse.ArgumentParser(
        description="Manga Translation OCR Batch Pipeline"
    )
    parser.add_argument(
        "chapter_dir",
        help="Path to the folder containing manga page images"
    )
    parser.add_argument(
        "--start", type=int, default=1,
        help="Start from this page number (1-based, default: 1)"
    )
    parser.add_argument(
        "--end", type=int, default=None,
        help="Stop after this page number inclusive (default: all)"
    )
    parser.add_argument(
        "--source", "-s", default=None,
        help=f"Override source language (default: {PIPELINE_CONFIG['source_lang']})"
    )
    parser.add_argument(
        "--target", "-t", default=None,
        help=f"Override target language (default: {PIPELINE_CONFIG['target_lang']})"
    )
    args = parser.parse_args()

    # Allow CLI overrides of source/target without touching PIPELINE_CONFIG
    config = dict(PIPELINE_CONFIG)
    if args.source:
        config["source_lang"] = args.source
    if args.target:
        config["target_lang"] = args.target

    # Patch PIPELINE_CONFIG in-place so process_page() picks up overrides
    # (process_page reads the module-level dict, not a parameter — this
    # update must happen before the page loop below).
    PIPELINE_CONFIG.update(config)

    chapter_dir = Path(args.chapter_dir).resolve()
    if not chapter_dir.is_dir():
        print(f"❌ Not a directory: {chapter_dir}")
        sys.exit(1)

    # ── Load translator module ────────────────────────────────
    # The translator lives next to this script; its filename contains a
    # hyphen, hence the dynamic load instead of a normal import.
    script_dir = Path(__file__).parent
    module_path = script_dir / "manga-translator.py"

    if not module_path.exists():
        print(f"❌ manga-translator.py not found in {script_dir}")
        sys.exit(1)

    print(f"📦 Loading translator from: {module_path}")
    try:
        translator = load_module("manga_translator", str(module_path))
    except Exception as e:
        print(f"❌ Could not load manga-translator.py: {e}")
        sys.exit(1)

    # ── API compatibility check ───────────────────────────────
    # Fail fast if process_manga_page() doesn't accept our config keys.
    if not verify_translator_api(translator):
        print("❌ Aborting — fix the parameter mismatch above first.")
        sys.exit(1)

    # ── Discover pages ────────────────────────────────────────
    all_pages = sorted_pages(chapter_dir)
    if not all_pages:
        print(f"❌ No images found in: {chapter_dir}")
        sys.exit(1)

    # Apply --start / --end slice (1-based, inclusive)
    start_idx = max(0, args.start - 1)
    # Slice end is exclusive, so the 1-based inclusive --end maps directly.
    end_idx = args.end if args.end is not None else len(all_pages)
    pages = all_pages[start_idx:end_idx]

    if not pages:
        print(f"❌ No pages in range [{args.start}, {args.end}]")
        sys.exit(1)

    # ── Summary header ────────────────────────────────────────
    print(f"\n{'═' * 70}")
    print(f" 📖 Chapter : {chapter_dir.name}")
    print(f" 📄 Pages : {len(pages)} "
          f"(of {len(all_pages)} total, "
          f"range {args.start}–{end_idx})")
    print(f" 🌐 Lang : {PIPELINE_CONFIG['source_lang']} → "
          f"{PIPELINE_CONFIG['target_lang']}")
    print(f"{'═' * 70}\n")

    succeeded, failed = [], []

    # Process each page in its own workdir; failures are collected
    # rather than aborting the whole batch.
    for i, page_path in enumerate(pages, start=1):
        print(f"[{i}/{len(pages)}] {page_path.name}")
        workdir = make_page_workdir(chapter_dir, page_path.stem)

        if process_page(page_path, workdir, translator):
            succeeded.append(page_path.name)
        else:
            failed.append(page_path.name)

    # ── Final report ──────────────────────────────────────────
    print(f"\n{'═' * 70}")
    print(" PIPELINE COMPLETE")
    print(f" ✅ {len(succeeded)} page(s) succeeded")
    if failed:
        print(f" ❌ {len(failed)} page(s) failed:")
        for name in failed:
            print(f" • {name}")
    print(f"{'═' * 70}\n")
|
||
|
||
|
||
# Script entry point: only run the batch pipeline when executed directly,
# not when this file is imported as a module.
if __name__ == "__main__":
    main()