Check new
This commit is contained in:
@@ -6,6 +6,8 @@ Translation OCR pipeline (Batch Processing Only)
|
||||
|
||||
Usage:
|
||||
python pipeline-translator.py /path/to/chapter/folder
|
||||
python pipeline-translator.py /path/to/chapter/folder --start 2 --end 5
|
||||
python pipeline-translator.py /path/to/chapter/folder --source en --target es
|
||||
"""
|
||||
|
||||
import os
|
||||
@@ -14,6 +16,7 @@ import argparse
|
||||
import importlib.util
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────
|
||||
# PIPELINE CONFIGURATION
|
||||
# Maps to the process_manga_page() signature in manga-translator.py
|
||||
@@ -23,14 +26,53 @@ PIPELINE_CONFIG = dict(
|
||||
target_lang = "ca",
|
||||
)
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────
|
||||
# DYNAMIC MODULE LOADER
|
||||
# FIX: Always evicts stale sys.modules entry and deletes
|
||||
# __pycache__ for manga-translator.py before loading,
|
||||
# so edits are ALWAYS picked up on every run.
|
||||
# ─────────────────────────────────────────────────────────────
|
||||
def purge_bytecode_cache(filepath: str) -> None:
    """
    Delete the compiled .pyc file for the given .py path so Python
    cannot silently use a stale cached version of the module.

    Parameters
    ----------
    filepath : str
        Path to the .py source file whose __pycache__ entry should be
        removed.

    Failures are non-fatal: a warning is printed and execution continues.
    """
    # FIX: removed unused `import py_compile` — only cache_from_source is
    # needed to map the source path to its __pycache__ location.
    from importlib.util import cache_from_source

    try:
        pyc_path = cache_from_source(filepath)
        if os.path.exists(pyc_path):
            os.remove(pyc_path)
            print(f"🗑️ Purged bytecode cache: {pyc_path}")
    except Exception as e:
        # Non-fatal — just warn and continue (deliberate best-effort).
        print(f"⚠️ Could not purge bytecode cache: {e}")
|
||||
|
||||
|
||||
def load_module(name: str, filepath: str):
    """
    Dynamically load a .py file as a module.

    FIX 1: Purge the .pyc cache so edits are always reflected.
    FIX 2: Evict any previously loaded version from sys.modules
           to prevent Python reusing a stale module object across
           multiple calls (e.g. when running in a REPL or test loop).
    FIX 3: Removed a duplicate, unreachable ``raise`` statement left
           behind by a merge — only one error message is raised now.

    Parameters
    ----------
    name : str
        Name to register the module under in ``sys.modules``.
    filepath : str
        Path to the .py file to load.

    Returns
    -------
    module
        The freshly executed module object.

    Raises
    ------
    FileNotFoundError
        If an import spec cannot be created for ``filepath``.
    """
    # FIX 1: delete stale bytecode
    purge_bytecode_cache(filepath)

    # FIX 2: evict from module registry
    if name in sys.modules:
        del sys.modules[name]

    spec = importlib.util.spec_from_file_location(name, filepath)
    if spec is None or spec.loader is None:
        raise FileNotFoundError(f"Cannot load module spec for: {filepath}")

    module = importlib.util.module_from_spec(spec)
    sys.modules[name] = module  # register before exec (handles self-refs)
    spec.loader.exec_module(module)
    return module
|
||||
|
||||
@@ -39,6 +81,7 @@ def load_module(name: str, filepath: str):
|
||||
# HELPERS
|
||||
# ─────────────────────────────────────────────────────────────
|
||||
def sorted_pages(chapter_dir: Path):
|
||||
"""Return all image files in chapter_dir sorted by filename stem."""
|
||||
exts = {".jpg", ".jpeg", ".png", ".webp"}
|
||||
pages = [
|
||||
p for p in chapter_dir.iterdir()
|
||||
@@ -48,6 +91,7 @@ def sorted_pages(chapter_dir: Path):
|
||||
|
||||
|
||||
def make_page_workdir(chapter_dir: Path, page_stem: str) -> Path:
    """Create and return translated/<page_stem>/ inside chapter_dir."""
    # mkdir with parents+exist_ok makes this safe to call repeatedly
    # for the same page.
    page_dir = chapter_dir.joinpath("translated", page_stem)
    page_dir.mkdir(parents=True, exist_ok=True)
    return page_dir
|
||||
@@ -55,10 +99,9 @@ def make_page_workdir(chapter_dir: Path, page_stem: str) -> Path:
|
||||
|
||||
def verify_translator_api(module) -> bool:
|
||||
"""
|
||||
Checks that the loaded module exposes process_manga_page()
|
||||
and that it accepts all keys defined in PIPELINE_CONFIG.
|
||||
Prints a warning for any missing parameter so mismatches are
|
||||
caught immediately rather than silently falling back to defaults.
|
||||
Checks that the loaded module exposes process_manga_page() and
|
||||
that it accepts all keys defined in PIPELINE_CONFIG.
|
||||
Prints a clear warning for any missing parameter.
|
||||
"""
|
||||
import inspect
|
||||
|
||||
@@ -82,20 +125,55 @@ def verify_translator_api(module) -> bool:
|
||||
return ok
|
||||
|
||||
|
||||
def sanity_check_fixes(module_path: Path) -> None:
    """
    Scan the translator source for the signature token of each known fix
    and report whether it is present. Aborts the run (exit code 1) if any
    token is absent — this catches edits that were never saved to disk.
    """
    fix_tokens = {
        "Fix A (gap_factor=4.0)": "gap_factor=4.0",
        "Fix B (_majority_contour_id)": "_majority_contour_id",
        "Fix C (median_inter adaptive gap)": "median_inter",
        "Fix D (merge_same_column_dialogue)": "merge_same_column_dialogue_boxes",
        "Fix E (lang_code from self.langs)": "lang_code = self.langs",
    }

    print("\n🔎 Sanity-checking fixes in manga-translator.py:")
    text = module_path.read_text(encoding="utf-8")

    missing = []
    for label, token in fix_tokens.items():
        present = token in text
        print(f"   {'✅' if present else '❌ MISSING'} {label}")
        if not present:
            missing.append(label)

    if missing:
        print(
            "\n⚠️ One or more fixes are missing from manga-translator.py.\n"
            "   Save the file and re-run. Aborting.\n"
        )
        sys.exit(1)
    print("   All fixes present.\n")
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────
|
||||
# PER-PAGE PIPELINE
|
||||
# ─────────────────────────────────────────────────────────────
|
||||
def process_page(page_path: Path, workdir: Path, translator_module) -> bool:
|
||||
print(f"\n{'─' * 70}")
|
||||
print(f" PAGE : {page_path.name}")
|
||||
print(f" OUT : {workdir}")
|
||||
print(f"{'─' * 70}")
|
||||
|
||||
orig_dir = os.getcwd()
|
||||
try:
|
||||
# Run inside the page's own workdir so debug images and
|
||||
# output files land there automatically.
|
||||
os.chdir(workdir)
|
||||
|
||||
# Use absolute paths so output always lands in workdir
|
||||
# regardless of any internal os.getcwd() calls.
|
||||
output_json = str(workdir / "bubbles.json")
|
||||
output_txt = str(workdir / "output.txt")
|
||||
debug_path = str(workdir / "debug_clusters.png")
|
||||
@@ -109,17 +187,23 @@ def process_page(page_path: Path, workdir: Path, translator_module) -> bool:
|
||||
**PIPELINE_CONFIG,
|
||||
)
|
||||
|
||||
# ── Optional debug visualisation ─────────────────────
|
||||
if results:
|
||||
# ── Debug visualisation ───────────────────────────────
|
||||
# FIX: process_manga_page() already writes debug_clusters.png
|
||||
# internally with full OCR quad data.
|
||||
# We do NOT call draw_debug_clusters() here with ocr=[]
|
||||
# because that would OVERWRITE the correct debug image with
|
||||
# a degraded version that has no quad outlines.
|
||||
#
|
||||
# If process_manga_page() did not write a debug image
|
||||
# (e.g. older version), we do a minimal fallback draw.
|
||||
if results and not os.path.exists(debug_path):
|
||||
try:
|
||||
import cv2
|
||||
|
||||
image_bgr = cv2.imread(str(page_path.resolve()))
|
||||
if image_bgr is not None:
|
||||
# Reconstruct vis_boxes / vis_lines from results dict
|
||||
vis_boxes = {}
|
||||
vis_lines = {}
|
||||
vis_indices = {}
|
||||
vis_boxes: dict = {}
|
||||
vis_lines: dict = {}
|
||||
vis_indices: dict = {}
|
||||
|
||||
for bid_str, data in results.items():
|
||||
bid = int(bid_str)
|
||||
@@ -133,6 +217,7 @@ def process_page(page_path: Path, workdir: Path, translator_module) -> bool:
|
||||
vis_lines[bid] = data.get("lines", [])
|
||||
vis_indices[bid] = []
|
||||
|
||||
# Fallback only — ocr=[] means no quad outlines
|
||||
translator_module.draw_debug_clusters(
|
||||
image_bgr = image_bgr,
|
||||
out_boxes = vis_boxes,
|
||||
@@ -141,14 +226,22 @@ def process_page(page_path: Path, workdir: Path, translator_module) -> bool:
|
||||
ocr = [],
|
||||
save_path = debug_path,
|
||||
)
|
||||
print(f" 🖼️ Fallback debug image written → {debug_path}")
|
||||
except Exception as e:
|
||||
print(f" ⚠️ Debug visualisation failed (non-fatal): {e}")
|
||||
|
||||
# ── Sanity-check outputs ──────────────────────────────
|
||||
# ── Sanity-check output files ─────────────────────────
|
||||
all_good = True
|
||||
for fname in ("output.txt", "bubbles.json"):
|
||||
fpath = workdir / fname
|
||||
if not fpath.exists() or fpath.stat().st_size == 0:
|
||||
print(f" ⚠️ {fname} is missing or empty after processing.")
|
||||
if not fpath.exists():
|
||||
print(f" ⚠️ {fname} was NOT created.")
|
||||
all_good = False
|
||||
elif fpath.stat().st_size == 0:
|
||||
print(f" ⚠️ {fname} exists but is EMPTY.")
|
||||
all_good = False
|
||||
else:
|
||||
print(f" 📄 {fname} → {fpath.stat().st_size} bytes")
|
||||
|
||||
if not results:
|
||||
print(" ⚠️ process_manga_page() returned no results.")
|
||||
@@ -172,7 +265,14 @@ def process_page(page_path: Path, workdir: Path, translator_module) -> bool:
|
||||
# ─────────────────────────────────────────────────────────────
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Manga Translation OCR Batch Pipeline"
|
||||
description="Manga Translation OCR Batch Pipeline",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog="""
|
||||
Examples:
|
||||
python pipeline-translator.py pages-for-tests
|
||||
python pipeline-translator.py pages-for-tests --start 2 --end 4
|
||||
python pipeline-translator.py pages-for-tests --source en --target es
|
||||
"""
|
||||
)
|
||||
parser.add_argument(
|
||||
"chapter_dir",
|
||||
@@ -194,24 +294,27 @@ def main():
|
||||
"--target", "-t", default=None,
|
||||
help=f"Override target language (default: {PIPELINE_CONFIG['target_lang']})"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--skip-sanity", action="store_true",
|
||||
help="Skip the fix sanity check (not recommended)"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
# Allow CLI overrides of source/target without touching PIPELINE_CONFIG
|
||||
# ── Apply CLI language overrides ─────────────────────────
|
||||
config = dict(PIPELINE_CONFIG)
|
||||
if args.source:
|
||||
config["source_lang"] = args.source
|
||||
if args.target:
|
||||
config["target_lang"] = args.target
|
||||
|
||||
# Patch PIPELINE_CONFIG in-place so process_page() picks up overrides
|
||||
PIPELINE_CONFIG.update(config)
|
||||
|
||||
# ── Resolve chapter directory ─────────────────────────────
|
||||
chapter_dir = Path(args.chapter_dir).resolve()
|
||||
if not chapter_dir.is_dir():
|
||||
print(f"❌ Not a directory: {chapter_dir}")
|
||||
sys.exit(1)
|
||||
|
||||
# ── Load translator module ────────────────────────────────
|
||||
# ── Locate manga-translator.py ────────────────────────────
|
||||
script_dir = Path(__file__).parent
|
||||
module_path = script_dir / "manga-translator.py"
|
||||
|
||||
@@ -219,6 +322,11 @@ def main():
|
||||
print(f"❌ manga-translator.py not found in {script_dir}")
|
||||
sys.exit(1)
|
||||
|
||||
# ── Sanity-check that all fixes are present ───────────────
|
||||
if not args.skip_sanity:
|
||||
sanity_check_fixes(module_path)
|
||||
|
||||
# ── Load translator module ────────────────────────────────
|
||||
print(f"📦 Loading translator from: {module_path}")
|
||||
try:
|
||||
translator = load_module("manga_translator", str(module_path))
|
||||
@@ -231,13 +339,12 @@ def main():
|
||||
print("❌ Aborting — fix the parameter mismatch above first.")
|
||||
sys.exit(1)
|
||||
|
||||
# ── Discover pages ────────────────────────────────────────
|
||||
# ── Discover and slice pages ──────────────────────────────
|
||||
all_pages = sorted_pages(chapter_dir)
|
||||
if not all_pages:
|
||||
print(f"❌ No images found in: {chapter_dir}")
|
||||
print(f"❌ No image files found in: {chapter_dir}")
|
||||
sys.exit(1)
|
||||
|
||||
# Apply --start / --end slice (1-based, inclusive)
|
||||
start_idx = max(0, args.start - 1)
|
||||
end_idx = args.end if args.end is not None else len(all_pages)
|
||||
pages = all_pages[start_idx:end_idx]
|
||||
@@ -246,37 +353,38 @@ def main():
|
||||
print(f"❌ No pages in range [{args.start}, {args.end}]")
|
||||
sys.exit(1)
|
||||
|
||||
# ── Summary header ────────────────────────────────────────
|
||||
print(f"\n{'═' * 70}")
|
||||
print(f" 📖 Chapter : {chapter_dir.name}")
|
||||
print(f" 📄 Pages : {len(pages)} "
|
||||
f"(of {len(all_pages)} total, "
|
||||
f"range {args.start}–{end_idx})")
|
||||
print(f" 🌐 Lang : {PIPELINE_CONFIG['source_lang']} → "
|
||||
f"{PIPELINE_CONFIG['target_lang']}")
|
||||
print(f"{'═' * 70}\n")
|
||||
print(f"\n📚 Chapter : {chapter_dir.name}")
|
||||
print(f" Pages : {len(pages)} of {len(all_pages)} total")
|
||||
print(f" Source : {PIPELINE_CONFIG['source_lang']}")
|
||||
print(f" Target : {PIPELINE_CONFIG['target_lang']}")
|
||||
print(f" Output : {chapter_dir / 'translated'}\n")
|
||||
|
||||
succeeded, failed = [], []
|
||||
# ── Process each page ─────────────────────────────────────
|
||||
results_summary = []
|
||||
|
||||
for i, page_path in enumerate(pages, start=1):
|
||||
print(f"[{i}/{len(pages)}] {page_path.name}")
|
||||
for page_num, page_path in enumerate(pages, start=start_idx + 1):
|
||||
workdir = make_page_workdir(chapter_dir, page_path.stem)
|
||||
success = process_page(page_path, workdir, translator)
|
||||
results_summary.append((page_num, page_path.name, success))
|
||||
|
||||
if process_page(page_path, workdir, translator):
|
||||
succeeded.append(page_path.name)
|
||||
else:
|
||||
failed.append(page_path.name)
|
||||
|
||||
# ── Final report ──────────────────────────────────────────
|
||||
# ── Final summary ─────────────────────────────────────────
|
||||
print(f"\n{'═' * 70}")
|
||||
print(" PIPELINE COMPLETE")
|
||||
print(f" ✅ {len(succeeded)} page(s) succeeded")
|
||||
if failed:
|
||||
print(f" ❌ {len(failed)} page(s) failed:")
|
||||
for name in failed:
|
||||
print(f" • {name}")
|
||||
print(f" BATCH COMPLETE")
|
||||
print(f"{'═' * 70}")
|
||||
|
||||
passed = sum(1 for _, _, ok in results_summary if ok)
|
||||
failed = len(results_summary) - passed
|
||||
|
||||
for page_num, name, ok in results_summary:
|
||||
status = "✅" if ok else "❌"
|
||||
print(f" {status} [{page_num:>3}] {name}")
|
||||
|
||||
print(f"\n Total: {passed} succeeded, {failed} failed")
|
||||
print(f"{'═' * 70}\n")
|
||||
|
||||
if failed:
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
main()
|
||||
Reference in New Issue
Block a user