Added pipeline-translator.py

This commit is contained in:
Guillem Hernandez Sola
2026-04-21 20:09:11 +02:00
parent 2fb5e9eb7b
commit ba5f001e75

View File

@@ -1,35 +1,19 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
""" """
pipeline.py pipeline-translator.py
─────────────────────────────────────────────────────────────── ───────────────────────────────────────────────────────────────
Translation OCR pipeline (No Rendering) Translation OCR pipeline (Batch Processing Only)
Usage: Usage:
python pipeline.py /path/to/chapter/folder python pipeline-translator.py /path/to/chapter/folder
""" """
import os import os
import sys import sys
import argparse import argparse
import zipfile
import importlib.util import importlib.util
from pathlib import Path from pathlib import Path
# ─────────────────────────────────────────────
# CONFIG
# ─────────────────────────────────────────────
SOURCE_LANG = "en"
TARGET_LANG = "ca"
# Translator Settings
CONFIDENCE_THRESHOLD = 0.10
MIN_TEXT_LENGTH = 1
GAP_PX = "auto"
FILTER_SFX = True
QUALITY_THRESHOLD = 0.50
READING_MODE = "ltr"
DEBUG = True
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
# DYNAMIC MODULE LOADER # DYNAMIC MODULE LOADER
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
@@ -57,40 +41,6 @@ def make_page_workdir(chapter_dir, page_stem):
workdir.mkdir(parents=True, exist_ok=True) workdir.mkdir(parents=True, exist_ok=True)
return workdir return workdir
def pack_cbz(chapter_dir, translated_dir, output_cbz):
exts = {".jpg", ".jpeg", ".png", ".webp"}
pages = sorted(
[p for p in Path(chapter_dir).iterdir() if p.is_file() and p.suffix.lower() in exts],
key=lambda p: p.stem
)
txts = sorted(translated_dir.rglob("output.txt"), key=lambda p: p.parent.name)
jsons = sorted(translated_dir.rglob("bubbles.json"), key=lambda p: p.parent.name)
if not pages:
print("⚠️ No original pages found — CBZ not created.")
return
with zipfile.ZipFile(output_cbz, "w", compression=zipfile.ZIP_STORED) as zf:
# Original pages
for img in pages:
arcname = f"pages/{img.name}"
zf.write(img, arcname)
# Text outputs
for txt in txts:
arcname = f"translations/{txt.parent.name}_output.txt"
zf.write(txt, arcname)
# JSON outputs
for j in jsons:
arcname = f"data/{j.parent.name}_bubbles.json"
zf.write(j, arcname)
print(f"\n✅ CBZ saved → {output_cbz}")
print(f"📦 Contains: {len(pages)} original pages, {len(txts)} text files, {len(jsons)} JSON files.")
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
# PER-PAGE PIPELINE # PER-PAGE PIPELINE
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
@@ -104,22 +54,17 @@ def process_page(page_path, workdir, translator_module):
# Isolate execution to the specific page's folder # Isolate execution to the specific page's folder
os.chdir(workdir) os.chdir(workdir)
# 1) Translate print(" ⏳ Extracting text and translating...")
# 1) Translate using ONLY the required path arguments.
# This forces the function to use its own internal default variables
# (like source_lang, target_lang, confidence_threshold) directly from manga-translator.py
translator_module.translate_manga_text( translator_module.translate_manga_text(
image_path=str(page_path.resolve()), image_path=str(page_path.resolve()),
source_lang=SOURCE_LANG,
target_lang=TARGET_LANG,
confidence_threshold=CONFIDENCE_THRESHOLD,
min_text_length=MIN_TEXT_LENGTH,
gap_px=GAP_PX,
filter_sound_effects=FILTER_SFX,
quality_threshold=QUALITY_THRESHOLD,
export_to_file="output.txt", export_to_file="output.txt",
export_bubbles_to="bubbles.json", export_bubbles_to="bubbles.json"
reading_mode=READING_MODE,
debug=DEBUG
) )
print(" ✅ Translator done") print(" ✅ Translation and OCR data saved successfully")
return True return True
@@ -134,16 +79,13 @@ def process_page(page_path, workdir, translator_module):
# MAIN # MAIN
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
def main(): def main():
parser = argparse.ArgumentParser(description="Manga Translation OCR Pipeline") parser = argparse.ArgumentParser(description="Manga Translation OCR Batch Pipeline")
parser.add_argument("chapter_dir", help="Path to the folder containing manga pages") parser.add_argument("chapter_dir", help="Path to the folder containing manga pages")
args = parser.parse_args() args = parser.parse_args()
chapter_dir = Path(args.chapter_dir).resolve() chapter_dir = Path(args.chapter_dir).resolve()
output_cbz = chapter_dir.parent / f"{chapter_dir.name}_translated.cbz"
print("Loading modules...") print("Loading translator module...")
# Ensure we are loading from the directory where pipeline.py is located
script_dir = Path(__file__).parent script_dir = Path(__file__).parent
try: try:
@@ -157,11 +99,10 @@ def main():
print(f"❌ No images found in: {chapter_dir}") print(f"❌ No images found in: {chapter_dir}")
sys.exit(1) sys.exit(1)
print(f"\n📖 Chapter : {chapter_dir}") print(f"\n📖 Chapter : {chapter_dir.name}")
print(f" Pages : {len(pages)}") print(f" Pages : {len(pages)}")
print(f" Source : {SOURCE_LANG} → Target: {TARGET_LANG}\n") print(" Note : Using translation settings directly from manga-translator.py\n")
translated_dir = chapter_dir / "translated"
succeeded, failed = [], [] succeeded, failed = [], []
for i, page_path in enumerate(pages, start=1): for i, page_path in enumerate(pages, start=1):
@@ -182,8 +123,5 @@ def main():
print(f"{f}") print(f"{f}")
print(f"{'' * 70}\n") print(f"{'' * 70}\n")
print("Packing CBZ...")
pack_cbz(chapter_dir, translated_dir, output_cbz)
if __name__ == "__main__": if __name__ == "__main__":
main() main()