Compare commits

...

12 Commits

Author SHA1 Message Date
Guillem Hernandez Sola
037dadd920 Added fixes 2026-04-22 18:01:29 +02:00
Guillem Hernandez Sola
285e9ca393 Cleaning 2026-04-22 16:28:10 +02:00
Guillem Hernandez Sola
d77db83cfe Everything 2026-04-22 16:27:56 +02:00
Guillem Hernandez Sola
b730037a06 Added big stuff 2026-04-22 16:18:59 +02:00
Guillem Hernandez Sola
7837aeaa9b Added fixes 2026-04-22 14:05:25 +02:00
Guillem Hernandez Sola
455b4ad82c starting point 2026-04-22 11:49:25 +02:00
Guillem Hernandez Sola
b6b0df4774 Added stuff 2026-04-22 10:51:57 +02:00
Guillem Hernandez Sola
512bb32f66 Added all 2026-04-21 23:27:56 +02:00
Guillem Hernandez Sola
494631c967 Some fixes running 2026-04-21 23:03:17 +02:00
Guillem Hernandez Sola
27a3e6f98a Added some changes2 2026-04-21 22:43:17 +02:00
Guillem Hernandez Sola
f00647e668 Added new styles 2026-04-21 21:45:46 +02:00
Guillem Hernandez Sola
a5c81f4ff0 Added new styles 2026-04-21 21:27:22 +02:00
16 changed files with 2905 additions and 1062 deletions

5
.gitignore vendored
View File

@@ -9,6 +9,11 @@
.venv311/
#Folders to test
Spy_x_Family_076/
Dandadan_059/
Lv999/
# Icon must end with two \r
Icon

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,119 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
from pathlib import Path
TARGET = Path("manga-translator.py")
def cut_after_first_entrypoint(text: str) -> str:
    """
    Keep only first full __main__ block and remove duplicated tail if present.
    """
    match = re.search(r'(?m)^if __name__ == "__main__":\s*$', text)
    if match is None:
        # No entrypoint guard at all — nothing to trim.
        return text
    head_end = match.start()
    body = text[head_end:].splitlines(True)
    # The guard line itself is always retained.
    kept = [body[0]]
    for line in body[1:]:
        if line.strip() == "":
            # Blank lines inside the block are kept as-is.
            kept.append(line)
            continue
        if not line.startswith((" ", "\t")):
            # First dedented, non-blank line marks the end of the block.
            break
        kept.append(line)
    return text[:head_end] + "".join(kept)
def replace_bad_vars(text: str) -> str:
    """Swap two known-bad call sites for their corrected argument lists."""
    # (bad call, corrected call) pairs — applied verbatim, in order.
    fixes = (
        (
            "merge_micro_boxes_relaxed(bubbles, bubble_boxes, bubble_quads, bubble_indices, ocr, image_bgr)",
            "merge_micro_boxes_relaxed(bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered, image)",
        ),
        (
            "reattach_orphan_short_tokens(bubbles, bubble_boxes, bubble_quads, bubble_indices, ocr)",
            "reattach_orphan_short_tokens(bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered)",
        ),
    )
    for bad, good in fixes:
        text = text.replace(bad, good)
    return text
def ensure_autofix_chain(text: str) -> str:
    """Rewrite the bare merge call into the full auto-fix + merge chain."""
    needle = (
        " # ── Auto-fix (split + merge) ──────────────────────────────────────────\n"
        " if auto_fix_bubbles:\n"
        " bubbles, bubble_boxes, bubble_quads, bubble_indices = merge_micro_boxes_relaxed(bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered, image)\n"
    )
    replacement = (
        " # ── Auto-fix (split + merge) ──────────────────────────────────────────\n"
        " if auto_fix_bubbles:\n"
        " bubbles, bubble_boxes, bubble_quads, bubble_indices = auto_fix_bubble_detection(\n"
        " bubble_boxes, bubble_indices, bubble_quads, bubbles, filtered, image)\n"
        " bubbles, bubble_boxes, bubble_quads, bubble_indices = merge_micro_boxes_relaxed(\n"
        " bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered, image)\n"
    )
    # str.replace is a no-op when the needle is absent, so no guard is needed.
    return text.replace(needle, replacement)
def ensure_split_commit(text: str) -> str:
    """Commit the split results back into the working variables before dedup."""
    sentinel = " # ── Remove nested / duplicate boxes ──────────────────────────────────\n"
    # Skip when there is no injection point, or the commit already exists
    # (keeps the pass idempotent across repeated runs).
    if sentinel not in text or "bubbles = new_bubbles" in text:
        return text
    assignments = (
        " bubbles = new_bubbles\n"
        " bubble_boxes = new_bubble_boxes\n"
        " bubble_quads = new_bubble_quads\n"
        " bubble_indices = new_bubble_indices\n\n"
    )
    return text.replace(sentinel, assignments + sentinel)
def ensure_rescue_pipeline(text: str) -> str:
    """Append confidence-floor + rescue/merge steps after the Kept/Skipped log."""
    anchor = ' print(f"Kept: {len(filtered)} | Skipped: {skipped}")\n'
    # Bail out when the anchor is missing, or the rescue call is already
    # present (idempotent across repeated runs).
    if anchor not in text or "rescue_name_and_short_tokens(raw" in text:
        return text
    pipeline = "".join((
        ' print(f"Kept: {len(filtered)} | Skipped: {skipped}")\n',
        ' # Protect short dialogue tokens confidence\n',
        ' tmp = []\n',
        ' for bbox, t, conf in filtered:\n',
        ' tmp.append((bbox, t, maybe_conf_floor_for_protected(t, conf, floor=0.40)))\n',
        ' filtered = tmp\n',
        ' # Rescue names/short tokens dropped by strict filters\n',
        ' rescued = rescue_name_and_short_tokens(raw, min_conf=0.20)\n',
        ' filtered = merge_rescued_items(filtered, rescued, iou_threshold=0.55)\n',
    ))
    return text.replace(anchor, pipeline)
def main():
    """Apply every patch pass to TARGET in order and write the result back."""
    if not TARGET.exists():
        raise FileNotFoundError(f"Not found: {TARGET}")
    patched = TARGET.read_text(encoding="utf-8")
    # Order matters: trim duplicates first, then fix calls, then inject code.
    for transform in (
        cut_after_first_entrypoint,
        replace_bad_vars,
        ensure_autofix_chain,
        ensure_split_commit,
        ensure_rescue_pipeline,
    ):
        patched = transform(patched)
    TARGET.write_text(patched, encoding="utf-8")
    print("✅ Patched manga-translator.py")


if __name__ == "__main__":
    main()

View File

@@ -14,10 +14,19 @@ import argparse
import importlib.util
from pathlib import Path
# ─────────────────────────────────────────────
# ─────────────────────────────────────────────────────────────
# PIPELINE CONFIGURATION
# Maps to the process_manga_page() signature in manga-translator.py
# ─────────────────────────────────────────────────────────────
PIPELINE_CONFIG = dict(
source_lang = "en",
target_lang = "ca",
)
# ─────────────────────────────────────────────────────────────
# DYNAMIC MODULE LOADER
# ─────────────────────────────────────────────
def load_module(name, filepath):
# ─────────────────────────────────────────────────────────────
def load_module(name: str, filepath: str):
spec = importlib.util.spec_from_file_location(name, filepath)
if spec is None or spec.loader is None:
raise FileNotFoundError(f"Cannot load spec for {filepath}")
@@ -25,103 +34,249 @@ def load_module(name, filepath):
spec.loader.exec_module(module)
return module
# ─────────────────────────────────────────────
# ─────────────────────────────────────────────────────────────
# HELPERS
# ─────────────────────────────────────────────
def sorted_pages(chapter_dir):
# ─────────────────────────────────────────────────────────────
def sorted_pages(chapter_dir: Path):
exts = {".jpg", ".jpeg", ".png", ".webp"}
pages = [
p for p in Path(chapter_dir).iterdir()
p for p in chapter_dir.iterdir()
if p.is_file() and p.suffix.lower() in exts
]
return sorted(pages, key=lambda p: p.stem)
def make_page_workdir(chapter_dir, page_stem):
workdir = Path(chapter_dir) / "translated" / page_stem
def make_page_workdir(chapter_dir: Path, page_stem: str) -> Path:
workdir = chapter_dir / "translated" / page_stem
workdir.mkdir(parents=True, exist_ok=True)
return workdir
# ─────────────────────────────────────────────
def verify_translator_api(module) -> bool:
"""
Checks that the loaded module exposes process_manga_page()
and that it accepts all keys defined in PIPELINE_CONFIG.
Prints a warning for any missing parameter so mismatches are
caught immediately rather than silently falling back to defaults.
"""
import inspect
fn = getattr(module, "process_manga_page", None)
if fn is None:
print("❌ manga-translator.py does not expose process_manga_page()")
return False
sig = inspect.signature(fn)
params = set(sig.parameters.keys())
ok = True
for key in PIPELINE_CONFIG:
if key not in params:
print(
f"⚠️ PIPELINE_CONFIG key '{key}' not found in "
f"process_manga_page() — update pipeline or translator."
)
ok = False
return ok
# ─────────────────────────────────────────────────────────────
# PER-PAGE PIPELINE
# ─────────────────────────────────────────────
def process_page(page_path, workdir, translator_module):
# ─────────────────────────────────────────────────────────────
def process_page(page_path: Path, workdir: Path, translator_module) -> bool:
print(f"\n{'' * 70}")
print(f"PAGE: {page_path.name}")
print(f" PAGE : {page_path.name}")
print(f"{'' * 70}")
orig_dir = os.getcwd()
try:
# Isolate execution to the specific page's folder
# Run inside the page's own workdir so debug images and
# output files land there automatically.
os.chdir(workdir)
print(" ⏳ Extracting text and translating...")
# 1) Translate using ONLY the required path arguments.
# This forces the function to use its own internal default variables
# (like source_lang, target_lang, confidence_threshold) directly from manga-translator.py
translator_module.translate_manga_text(
image_path=str(page_path.resolve()),
export_to_file="output.txt",
export_bubbles_to="bubbles.json"
)
print(" ✅ Translation and OCR data saved successfully")
output_json = str(workdir / "bubbles.json")
output_txt = str(workdir / "output.txt")
debug_path = str(workdir / "debug_clusters.png")
print(" ⏳ Extracting text and translating...")
results = translator_module.process_manga_page(
image_path = str(page_path.resolve()),
output_json = output_json,
output_txt = output_txt,
**PIPELINE_CONFIG,
)
# ── Optional debug visualisation ─────────────────────
if results:
try:
import cv2
image_bgr = cv2.imread(str(page_path.resolve()))
if image_bgr is not None:
# Reconstruct vis_boxes / vis_lines from results dict
vis_boxes = {}
vis_lines = {}
vis_indices = {}
for bid_str, data in results.items():
bid = int(bid_str)
xywh = data["box"]
vis_boxes[bid] = (
xywh["x"],
xywh["y"],
xywh["x"] + xywh["w"],
xywh["y"] + xywh["h"],
)
vis_lines[bid] = data.get("lines", [])
vis_indices[bid] = []
translator_module.draw_debug_clusters(
image_bgr = image_bgr,
out_boxes = vis_boxes,
out_lines = vis_lines,
out_indices = vis_indices,
ocr = [],
save_path = debug_path,
)
except Exception as e:
print(f" ⚠️ Debug visualisation failed (non-fatal): {e}")
# ── Sanity-check outputs ──────────────────────────────
for fname in ("output.txt", "bubbles.json"):
fpath = workdir / fname
if not fpath.exists() or fpath.stat().st_size == 0:
print(f" ⚠️ {fname} is missing or empty after processing.")
if not results:
print(" ⚠️ process_manga_page() returned no results.")
return False
print(f" ✅ Done — {len(results)} box(es) processed.")
return True
except Exception as e:
import traceback
print(f" ❌ Failed: {e}")
traceback.print_exc()
return False
finally:
os.chdir(orig_dir)
# ─────────────────────────────────────────────
# ─────────────────────────────────────────────────────────────
# MAIN
# ─────────────────────────────────────────────
# ─────────────────────────────────────────────────────────────
def main():
parser = argparse.ArgumentParser(description="Manga Translation OCR Batch Pipeline")
parser.add_argument("chapter_dir", help="Path to the folder containing manga pages")
parser = argparse.ArgumentParser(
description="Manga Translation OCR Batch Pipeline"
)
parser.add_argument(
"chapter_dir",
help="Path to the folder containing manga page images"
)
parser.add_argument(
"--start", type=int, default=1,
help="Start from this page number (1-based, default: 1)"
)
parser.add_argument(
"--end", type=int, default=None,
help="Stop after this page number inclusive (default: all)"
)
parser.add_argument(
"--source", "-s", default=None,
help=f"Override source language (default: {PIPELINE_CONFIG['source_lang']})"
)
parser.add_argument(
"--target", "-t", default=None,
help=f"Override target language (default: {PIPELINE_CONFIG['target_lang']})"
)
args = parser.parse_args()
chapter_dir = Path(args.chapter_dir).resolve()
# Allow CLI overrides of source/target without touching PIPELINE_CONFIG
config = dict(PIPELINE_CONFIG)
if args.source:
config["source_lang"] = args.source
if args.target:
config["target_lang"] = args.target
print("Loading translator module...")
script_dir = Path(__file__).parent
# Patch PIPELINE_CONFIG in-place so process_page() picks up overrides
PIPELINE_CONFIG.update(config)
chapter_dir = Path(args.chapter_dir).resolve()
if not chapter_dir.is_dir():
print(f"❌ Not a directory: {chapter_dir}")
sys.exit(1)
# ── Load translator module ────────────────────────────────
script_dir = Path(__file__).parent
module_path = script_dir / "manga-translator.py"
if not module_path.exists():
print(f"❌ manga-translator.py not found in {script_dir}")
sys.exit(1)
print(f"📦 Loading translator from: {module_path}")
try:
translator = load_module("manga_translator", str(script_dir / "manga-translator.py"))
translator = load_module("manga_translator", str(module_path))
except Exception as e:
print(f"❌ Could not load manga-translator.py: {e}")
sys.exit(1)
pages = sorted_pages(chapter_dir)
if not pages:
# ── API compatibility check ───────────────────────────────
if not verify_translator_api(translator):
print("❌ Aborting — fix the parameter mismatch above first.")
sys.exit(1)
# ── Discover pages ────────────────────────────────────────
all_pages = sorted_pages(chapter_dir)
if not all_pages:
print(f"❌ No images found in: {chapter_dir}")
sys.exit(1)
print(f"\n📖 Chapter : {chapter_dir.name}")
print(f" Pages : {len(pages)}")
print(" Note : Using translation settings directly from manga-translator.py\n")
# Apply --start / --end slice (1-based, inclusive)
start_idx = max(0, args.start - 1)
end_idx = args.end if args.end is not None else len(all_pages)
pages = all_pages[start_idx:end_idx]
if not pages:
print(f"❌ No pages in range [{args.start}, {args.end}]")
sys.exit(1)
# ── Summary header ────────────────────────────────────────
print(f"\n{'' * 70}")
print(f" 📖 Chapter : {chapter_dir.name}")
print(f" 📄 Pages : {len(pages)} "
f"(of {len(all_pages)} total, "
f"range {args.start}{end_idx})")
print(f" 🌐 Lang : {PIPELINE_CONFIG['source_lang']}"
f"{PIPELINE_CONFIG['target_lang']}")
print(f"{'' * 70}\n")
succeeded, failed = [], []
for i, page_path in enumerate(pages, start=1):
print(f"[{i}/{len(pages)}] Processing...")
print(f"[{i}/{len(pages)}] {page_path.name}")
workdir = make_page_workdir(chapter_dir, page_path.stem)
if process_page(page_path, workdir, translator):
succeeded.append(page_path.name)
else:
failed.append(page_path.name)
# ── Final report ──────────────────────────────────────────
print(f"\n{'' * 70}")
print("PIPELINE COMPLETE")
print(f"{len(succeeded)} page(s) succeeded")
print(" PIPELINE COMPLETE")
print(f" {len(succeeded)} page(s) succeeded")
if failed:
print(f"{len(failed)} page(s) failed:")
for f in failed:
print(f"{f}")
print(f" {len(failed)} page(s) failed:")
for name in failed:
print(f" {name}")
print(f"{'' * 70}\n")
if __name__ == "__main__":
main()

View File

@@ -1,79 +0,0 @@
aistudio-sdk==0.3.8
annotated-doc==0.0.4
annotated-types==0.7.0
anyio==4.13.0
bce-python-sdk==0.9.70
beautifulsoup4==4.14.3
certifi==2026.2.25
chardet==7.4.3
charset-normalizer==3.4.7
click==8.3.2
colorlog==6.10.1
crc32c==2.8
deep-translator==1.11.4
easyocr==1.7.2
filelock==3.28.0
fsspec==2026.3.0
future==1.0.0
h11==0.16.0
hf-xet==1.4.3
httpcore==1.0.9
httpx==0.28.1
huggingface_hub==1.10.2
idna==3.11
ImageIO==2.37.3
imagesize==2.0.0
Jinja2==3.1.6
lazy-loader==0.5
markdown-it-py==4.0.0
MarkupSafe==3.0.3
mdurl==0.1.2
modelscope==1.35.4
mpmath==1.3.0
networkx==3.6.1
ninja==1.13.0
numpy==1.26.4
opencv-contrib-python==4.10.0.84
opencv-python==4.11.0.86
opencv-python-headless==4.11.0.86
opt-einsum==3.3.0
packaging==26.1
paddleocr==3.4.1
paddlepaddle==3.3.1
paddlex==3.4.3
pandas==3.0.2
pillow==12.2.0
prettytable==3.17.0
protobuf==7.34.1
psutil==7.2.2
py-cpuinfo==9.0.0
pyclipper==1.4.0
pycryptodome==3.23.0
pydantic==2.13.1
pydantic_core==2.46.1
Pygments==2.20.0
pypdfium2==5.7.0
python-bidi==0.6.7
python-dateutil==2.9.0.post0
PyYAML==6.0.2
requests==2.33.1
rich==15.0.0
ruamel.yaml==0.19.1
safetensors==0.7.0
scikit-image==0.26.0
scipy==1.17.1
shapely==2.1.2
shellingham==1.5.4
six==1.17.0
soupsieve==2.8.3
sympy==1.14.0
tifffile==2026.3.3
torch==2.11.0
torchvision==0.26.0
tqdm==4.67.3
typer==0.24.1
typing-inspection==0.4.2
typing_extensions==4.15.0
ujson==5.12.0
urllib3==2.6.3
wcwidth==0.6.0

View File

@@ -1,12 +0,0 @@
numpy<2.0
opencv-python>=4.8
easyocr>=1.7.1
deep-translator>=1.11.4
manga-ocr>=0.1.14
torch
torchvision
Pillow
transformers
fugashi
unidic-lite