Added good stuff

2026-04-11 14:34:18 +02:00
parent 555892348f
commit 727b052e93
5 changed files with 310 additions and 157 deletions
--- a/bubbles.json
+++ b/bubbles.json
@@ -1,9 +1,9 @@
 {
  "1": {
-    "x": 204,
+    "x": 201,
-    "y": 137,
+    "y": 134,
-    "w": 153,
+    "w": 159,
-    "h": 82,
+    "h": 88,
    "quads": [
      [
        [
@@ -26,10 +26,10 @@
    ]
  },
  "2": {
-    "x": 1167,
+    "x": 1164,
-    "y": 240,
+    "y": 237,
-    "w": 132,
+    "w": 138,
-    "h": 134,
+    "h": 140,
    "quads": [
      [
        [
@@ -106,10 +106,10 @@
    ]
  },
  "3": {
-    "x": 930,
+    "x": 927,
-    "y": 378,
+    "y": 375,
-    "w": 136,
+    "w": 142,
-    "h": 132,
+    "h": 138,
    "quads": [
      [
        [
@@ -186,10 +186,10 @@
    ]
  },
  "4": {
-    "x": 220,
+    "x": 217,
-    "y": 486,
+    "y": 483,
-    "w": 150,
+    "w": 156,
-    "h": 210,
+    "h": 216,
    "quads": [
      [
        [
@@ -320,10 +320,10 @@
    ]
  },
  "5": {
-    "x": 354,
+    "x": 351,
-    "y": 1132,
+    "y": 1129,
-    "w": 92,
+    "w": 98,
-    "h": 102,
+    "h": 108,
    "quads": [
      [
        [
@@ -382,10 +382,10 @@
    ]
  },
  "6": {
-    "x": 740,
+    "x": 737,
-    "y": 1324,
+    "y": 1321,
-    "w": 38,
+    "w": 44,
-    "h": 24,
+    "h": 30,
    "quads": [
      [
        [
--- a/fonts/ComicRelief-Bold.ttf
+++ b/fonts/ComicRelief-Bold.ttf
--- a/fonts/ComicRelief-Regular.ttf
+++ b/fonts/ComicRelief-Regular.ttf
--- a/manga-renderer.py
+++ b/manga-renderer.py
@@ -18,28 +18,100 @@ FONT_FALLBACK     = "/System/Library/Fonts/Helvetica.ttc"
 FONT_COLOR        = (0, 0, 0)
 # ─────────────────────────────────────────────
 #  WORD-ONLY WRAP
 #
 #  Breaks ONLY at space boundaries.
 #  Returns (lines, overflow) where overflow=True
 #  means a single word is wider than max_w at
 #  this font size → caller must try smaller.
 # ─────────────────────────────────────────────
 def wrap_text_words(draw, text, max_w, font):
    """
    Word-wraps text to fit within max_w pixels.
    Breaks only at whitespace; never inserts hyphens or splits a word.
    A word that is wider than max_w on its own is placed on a line by
    itself and reported through the overflow flag. Wrapping continues
    after such a word, so the returned lines always contain every word
    of the input (callers that ignore the flag do not lose text).
    Args:
        draw  : object exposing textbbox((x, y), s, font=...) that
                returns (left, top, right, bottom)
        text  : string to wrap; split on any whitespace
        max_w : maximum line width in pixels
        font  : font object forwarded to draw.textbbox
    Returns:
        (lines, overflow)
        lines    : list of strings in reading order; each is ≤ max_w px
                   wide unless it is a single over-wide word
        overflow : True if any single word exceeds max_w
    """
    def measure(s):
        bb = draw.textbbox((0, 0), s, font=font)
        return bb[2] - bb[0]
    lines    = []
    current  = ""
    overflow = False
    for word in text.split():
        if measure(word) > max_w:
            # The word cannot fit on any line at this font size.
            # Flag it, flush what we have, and give the word its own
            # line instead of discarding it and everything after it.
            overflow = True
            if current:
                lines.append(current)
            lines.append(word)
            current = ""
            continue
        candidate = f"{current} {word}" if current else word
        if measure(candidate) <= max_w:
            current = candidate
        else:
            # candidate only differs from word when current is
            # non-empty, so there is always a line to flush here.
            lines.append(current)
            current = word
    if current:
        lines.append(current)
    return lines, overflow
 # ─────────────────────────────────────────────
 #  PARSE output.txt
 # ─────────────────────────────────────────────
 def parse_translations(filepath):
    """
    Parses output.txt → {bubble_id: translated_text}.
-    Only bubbles present in the file are returned.
+    Uses header line as column ruler to find the exact
-    Absent IDs are left completely untouched on the page.
+    char position of the TRANSLATED column.
    Immune to commas, ellipses, spaces in translated text.
    """
    translations = {}
    header_pos   = None
    with open(filepath, "r", encoding="utf-8") as f:
-        for line in f:
+        lines = f.readlines()
-            line = line.rstrip("\n")
+
    for raw_line in lines:
        line = raw_line.rstrip("\n")
        if re.match(r"^BUBBLE\s+ORIGINAL", line):
            m = re.search(r"TRANSLATED", line)
            if m:
                header_pos = m.start()
                print(f"  ℹ️  TRANSLATED column at char {header_pos}")
            continue
        stripped = line.strip()
        if re.match(r"^[─\-=]{3,}$", stripped):
            continue
        if stripped.startswith("✅") or stripped.startswith("Done"):
            continue
        if not re.match(r"^\s*#\d+", line):
            continue
-            parts = re.split(r" {2,}", line.strip())
+
-            if len(parts) < 3:
+        m_id = re.match(r"^\s*#(\d+)", line)
        if not m_id:
            continue
-            bubble_id  = int(re.sub(r"[^0-9]", "", parts[0]))
+        bubble_id = int(m_id.group(1))
-            translated = parts[-1].strip()
+
-            if translated.startswith("["):
+        if header_pos is not None and len(line) > header_pos:
            translated = line[header_pos:].strip()
        else:
            parts = re.split(r" {2,}", stripped)
            translated = parts[-1].strip() if len(parts) >= 3 else ""
        if not translated or translated.startswith("["):
            print(f"  ⚠️  #{bubble_id}: no translation found")
            continue
        translations[bubble_id] = translated
    print(f"  ✅ {len(translations)} bubble(s) to translate: "
@@ -67,11 +139,6 @@ def load_bubble_boxes(filepath):
 #  SAMPLE BACKGROUND COLOR
 # ─────────────────────────────────────────────
 def sample_bubble_background(cv_image, bubble_data):
    """
    Samples the dominant background color inside the bbox
    by averaging the brightest 10% of pixels.
    Returns (B, G, R).
    """
    x  = max(0, bubble_data["x"])
    y  = max(0, bubble_data["y"])
    x2 = min(cv_image.shape[1], x + bubble_data["w"])
@@ -92,21 +159,9 @@ def sample_bubble_background(cv_image, bubble_data):
 # ─────────────────────────────────────────────
 #  ERASE ORIGINAL TEXT
 #  Fills the tight OCR bbox with the sampled
 #  background color. No extra expansion —
 #  the bbox from bubbles.json is already the
 #  exact size of the red squares.
 # ─────────────────────────────────────────────
 def erase_bubble_text(cv_image, bubble_data,
                      bg_color=(255, 255, 255)):
    """
    Fills the bubble bounding box with bg_color.
    Args:
        cv_image    : BGR numpy array (modified in place)
        bubble_data : Dict with 'x','y','w','h'
        bg_color    : (B,G,R) fill color
    """
    img_h, img_w = cv_image.shape[:2]
    x  = max(0,     bubble_data["x"])
    y  = max(0,     bubble_data["y"])
@@ -116,14 +171,61 @@ def erase_bubble_text(cv_image, bubble_data,
 # ─────────────────────────────────────────────
-#  FIT FONT SIZE
+#  LINE HEIGHT  (tight)
 #
 #  Uses actual ascender+descender of the font
 #  at the given size, with a minimal 1px gap.
 #  Much tighter than the old flat "+2" approach.
 # ─────────────────────────────────────────────
 def get_line_height(draw, font):
    """
    Computes the per-line height, in pixels, for the given font.
    The height is the vertical extent (bottom - top) of the bounding
    box that draw.textbbox reports for the sample string "Ay", plus
    1px of spacing. "Ay" is used as the sample because it is meant to
    include both a tall glyph and one that drops below the baseline.
    """
    sample_box = draw.textbbox((0, 0), "Ay", font=font)
    top    = sample_box[1]
    bottom = sample_box[3]
    glyph_span = bottom - top
    return glyph_span + 1
 # ─────────────────────────────────────────────
 #  FIT FONT SIZE  (dynamic ceiling, word-wrap)
 #
 #  max_size is derived from the box itself:
 #    min(MAX_FONT_CAP, inner_h)
 #  so a tall box can use a large font and a
 #  small box won't waste iterations on huge sizes.
 #
 #  Rejects a size if:
 #    • any single word is wider than inner_w, OR
 #    • total wrapped height exceeds inner_h
 # ─────────────────────────────────────────────
 MAX_FONT_CAP = 120   # absolute ceiling across all boxes
 def fit_font_size(draw, text, max_w, max_h, font_path,
-                  min_size=7, max_size=48):
+                  min_size=7):
    """
    Finds the largest font size where word-wrapped text
-    fits inside (max_w × max_h).
+    fits inside max_w × max_h with NO mid-word breaking.
    max_size is computed dynamically as min(MAX_FONT_CAP, max_h)
    so the search always starts from a sensible upper bound
    relative to the actual box height.
    Args:
        draw      : ImageDraw instance
        text      : Full text string
        max_w     : Available width in pixels
        max_h     : Available height in pixels
        font_path : Path to .ttf (or None for PIL default)
        min_size  : Minimum font pt (default: 7)
    Returns:
        (font, lines)
    """
    # Dynamic ceiling: no point trying a font taller than the box
    max_size = min(MAX_FONT_CAP, max_h)
    max_size = max(max_size, min_size)   # safety: never below min
    best_font  = None
    best_lines = [text]
@@ -134,38 +236,48 @@ def fit_font_size(draw, text, max_w, max_h, font_path,
        except Exception:
            font = ImageFont.load_default()
-        words, lines, current = text.split(), [], ""
+        lines, overflow = wrap_text_words(draw, text, max_w, font)
        for word in words:
            test = (current + " " + word).strip()
            bb   = draw.textbbox((0, 0), test, font=font)
            if (bb[2] - bb[0]) <= max_w:
                current = test
            else:
                if current:
                    lines.append(current)
                current = word
        if current:
            lines.append(current)
-        lh     = draw.textbbox((0, 0), "Ay", font=font)
+        if overflow:
-        line_h = (lh[3] - lh[1]) + 2
+            continue   # a word is wider than the box → too big
-        if line_h * len(lines) <= max_h:
+
        line_h     = get_line_height(draw, font)
        total_h    = line_h * len(lines)
        if total_h <= max_h:
            best_font  = font
            best_lines = lines
-            break
+            break      # largest size that fits — done
-    return best_font or ImageFont.load_default(), best_lines
+    # Guaranteed fallback at min_size
    if best_font is None:
        try:
            best_font = (ImageFont.truetype(font_path, min_size)
                         if font_path else ImageFont.load_default())
        except Exception:
            best_font = ImageFont.load_default()
        best_lines, _ = wrap_text_words(
            draw, text, max_w, best_font)
        if not best_lines:
            best_lines = [text]
    return best_font, best_lines
 # ─────────────────────────────────────────────
 #  RENDER TEXT INTO BUBBLE
 #
 #  Text is centered both horizontally and
 #  vertically inside the padded bbox.
 #  Line height uses get_line_height() (tight).
 # ─────────────────────────────────────────────
 def render_text_in_bubble(pil_image, bubble_data, text,
-                           font_path, padding=8,
+                           font_path, padding=6,
                           font_color=(0, 0, 0)):
    """
-    Renders translated text centered inside the tight bbox.
+    Renders translated text centered inside the bbox.
-    Font auto-sizes to fill the same w×h the original occupied.
+    Font auto-sizes to fill the box as much as possible.
    Word-wrap only — no mid-word hyphens.
    """
    x, y = bubble_data["x"], bubble_data["y"]
    w, h = bubble_data["w"], bubble_data["h"]
@@ -174,17 +286,20 @@ def render_text_in_bubble(pil_image, bubble_data, text,
    inner_w = max(1, w - padding * 2)
    inner_h = max(1, h - padding * 2)
-    font, lines = fit_font_size(draw, text, inner_w, inner_h,
+    font, lines = fit_font_size(
-                                font_path)
+        draw, text, inner_w, inner_h, font_path
    )
-    lh_bb   = draw.textbbox((0, 0), "Ay", font=font)
+    line_h  = get_line_height(draw, font)
    line_h  = (lh_bb[3] - lh_bb[1]) + 2
    total_h = line_h * len(lines)
    # Center block vertically
    start_y = y + padding + max(0, (inner_h - total_h) // 2)
    for line in lines:
-        lb      = draw.textbbox((0, 0), line, font=font)
+        bb      = draw.textbbox((0, 0), line, font=font)
-        line_w  = lb[2] - lb[0]
+        line_w  = bb[2] - bb[0]
        # Center each line horizontally
        start_x = x + padding + max(0, (inner_w - line_w) // 2)
        draw.text((start_x, start_y), line,
                  font=font, fill=font_color)
@@ -216,19 +331,9 @@ def render_translated_page(
    font_path         = FONT_PATH,
    font_fallback     = FONT_FALLBACK,
    font_color        = FONT_COLOR,
-    text_padding      = 8,
+    text_padding      = 6,
    debug             = False,
 ):
    """
    Pipeline:
      1. Parse translations (only present IDs processed)
      2. Load bubble boxes from bubbles.json
      3. Cross-check IDs — absent ones left untouched
      4. Sample background color per bubble
      5. Erase original text (fill tight bbox)
      6. Render translated text sized to fit the bbox
      7. Save output
    """
    print("=" * 55)
    print("  MANGA TRANSLATOR — RENDERER")
    print("=" * 55)
@@ -344,9 +449,9 @@ if __name__ == "__main__":
        output_image      = "page_translated.png",
        translations_file = "output.txt",
        bubbles_file      = "bubbles.json",
-        font_path         = "font.ttf",
+        font_path         = "fonts/ComicRelief-Regular.ttf",
        font_fallback     = "/System/Library/Fonts/Helvetica.ttc",
        font_color        = (0, 0, 0),
-        text_padding      = 8,
+        text_padding      = 6,
        debug             = True,
    )
--- a/manga-translator.py
+++ b/manga-translator.py
@@ -48,41 +48,73 @@ def is_sound_effect(text):
 # ─────────────────────────────────────────────
-#  TOKEN FILTER
+#  TOKEN CLASSIFIER
 #
 #  Three categories:
 #    "alpha"  — contains at least one letter (È, é, A-Z etc.)
 #    "punct"  — 2+ chars, all punctuation  (... ?? !! ?! …)
 #    "noise"  — everything else (single symbols, pure digits,
 #               low-confidence, sound effects)
 #
 #  Both "alpha" and "punct" tokens are KEPT:
 #    - "alpha"  → contributes to translation text AND bbox
 #    - "punct"  → contributes to bbox only (not translation text)
 #                 unless it immediately follows alpha text
 #                 in the same cluster (handled in clustering)
 # ─────────────────────────────────────────────
-def should_keep_token(text, confidence, confidence_threshold,
+def classify_token(text, confidence, confidence_threshold,
                    min_text_length, filter_sound_effects):
    """
-    Returns (keep: bool, reason: str).
+    Returns one of: "alpha" | "punct" | "noise"
    "alpha" : has at least one letter → keep for text + bbox
    "punct" : 2+ chars, no letters   → keep for bbox only
    "noise" : drop entirely
    Rules:
-      1. Drop if confidence below threshold
+      1. Drop if confidence below threshold          → noise
-      2. Drop if shorter than min_text_length
+      2. Drop if shorter than min_text_length        → noise
-      3. Drop pure digit strings
+      3. Drop pure digit strings                     → noise
-      4. Drop single non-alpha characters
+      4. Drop single non-alpha characters            → noise
-      5. Drop sound effects if filter enabled
+      5. Drop sound effects if filter enabled        → noise
-      6. Keep everything else
+      6. 2+ char string with no letters              → punct
      7. Has at least one letter                     → alpha
    """
    cleaned = text.strip()
    if confidence < confidence_threshold:
-        return False, f"low confidence ({confidence:.2f})"
+        return "noise"
    if len(cleaned) < min_text_length:
-        return False, "too short"
+        return "noise"
    if re.fullmatch(r"\d+", cleaned):
-        return False, "pure digits"
+        return "noise"
    if len(cleaned) == 1 and not cleaned.isalpha():
-        return False, "single symbol"
+        return "noise"
    if filter_sound_effects and is_sound_effect(cleaned):
-        return False, "sound effect"
+        return "noise"
-    return True, "ok"
+    # 2+ chars with no letters at all → punctuation token
    # Examples: "..." "??" "!!" "?!" "…" ".."
    if not any(ch.isalpha() for ch in cleaned):
        return "punct"
    return "alpha"
 def should_keep_token(text, confidence, confidence_threshold,
                       min_text_length, filter_sound_effects):
    """
    Backward-compatible wrapper around classify_token.
    Forwards all arguments unchanged and converts the returned
    category into the older two-value result.
    Returns (keep: bool, category: str), where keep is False only
    when the category is "noise".
    """
    category = classify_token(
        text, confidence, confidence_threshold,
        min_text_length, filter_sound_effects,
    )
    keep = category != "noise"
    return keep, category
 # ─────────────────────────────────────────────
 #  BOUNDING BOX
 #
 #  Rules (match the red square exactly):
 #  Width  = widest single quad's width
 #  Height = sum of ALL quad heights stacked
 #  X      = centered on the widest quad's CX
@@ -93,12 +125,11 @@ def get_cluster_bbox_from_ocr(ocr_bboxes, image_shape,
    """
    Computes the bubble erase bbox:
-      1. Per-quad: measure w, h, cx for every OCR detection
+      1. Per-quad: measure w, h, cx
      2. Width  = width of the widest single quad
      3. Height = sum of every quad's height
      4. X      = widest quad's center ± max_w/2
-                  (all lines sit symmetrically inside)
+      5. Y      = top of topmost quad → Y + total_h
      5. Y      = top of topmost quad, bottom = Y + total_h
    Args:
        ocr_bboxes  : List of EasyOCR quad bboxes
@@ -113,7 +144,6 @@ def get_cluster_bbox_from_ocr(ocr_bboxes, image_shape,
    if not ocr_bboxes:
        return 0, 0, 0, 0
    # ── Per-quad metrics ──────────────────────────────────────────
    quad_metrics = []
    for quad in ocr_bboxes:
        xs = [pt[0] for pt in quad]
@@ -121,30 +151,23 @@ def get_cluster_bbox_from_ocr(ocr_bboxes, image_shape,
        qx1, qx2 = min(xs), max(xs)
        qy1, qy2 = min(ys), max(ys)
        quad_metrics.append({
-            "x1" : qx1,
+            "x1" : qx1, "x2" : qx2,
-            "x2" : qx2,
+            "y1" : qy1, "y2" : qy2,
            "y1" : qy1,
            "y2" : qy2,
            "w"  : qx2 - qx1,
            "h"  : qy2 - qy1,
            "cx" : (qx1 + qx2) / 2.0,
        })
    # ── Width: widest single quad ─────────────────────────────────
    widest   = max(quad_metrics, key=lambda q: q["w"])
    max_w    = widest["w"]
    center_x = widest["cx"]
    # ── Height: sum of all quad heights ──────────────────────────
    total_h  = sum(q["h"] for q in quad_metrics)
    # ── Box edges ─────────────────────────────────────────────────
    box_x1 = center_x - max_w / 2.0
    box_x2 = center_x + max_w / 2.0
    box_y1 = min(q["y1"] for q in quad_metrics)
    box_y2 = box_y1 + total_h
    # ── Padding + clamp ───────────────────────────────────────────
    x1 = max(0,     box_x1 - padding_px)
    y1 = max(0,     box_y1 - padding_px)
    x2 = min(img_w, box_x2 + padding_px)
@@ -171,17 +194,6 @@ def boxes_are_close(bbox_a, bbox_b, proximity_px=80):
    return not (ax2 < bx1 or bx2 < ax1 or ay2 < by1 or by2 < ay1)
 # ─────────────────────────────────────────────
 #  TEXT LINE FILTER
 # ─────────────────────────────────────────────
 def has_translatable_content(text):
    """
    Reports whether text holds at least one alphabetic character.
    Relies on str.isalpha(), so accented letters such as È, é, ñ, ü
    count as letters. Returns False for an empty string.
    """
    for ch in text:
        if ch.isalpha():
            return True
    return False
 # ─────────────────────────────────────────────
 #  POST-CLUSTER MERGE  (Union-Find)
 # ─────────────────────────────────────────────
@@ -270,11 +282,17 @@ def cluster_into_bubbles(ocr_results, image_shape,
      Pass 1 — DBSCAN on center points
      Pass 2 — Bounding-box proximity merge
    Token categories per cluster:
      "alpha" tokens → translation text + bbox
      "punct" tokens → bbox only (e.g. "..." after "HN")
      "noise" tokens → already filtered before this function
    Bbox: widest-line width (centered) × stacked height.
    All quads contribute to bbox regardless of content.
    Returns:
-        bubble_dict : cluster_id → list of translatable text lines
+        bubble_dict : cluster_id → list of text lines
                      (alpha tokens only, punct appended
                       to last alpha line if spatially adjacent)
        bbox_dict   : cluster_id → (x1, y1, x2, y2)
        ocr_quads   : cluster_id → list of ALL raw EasyOCR quads
    """
@@ -303,6 +321,8 @@ def cluster_into_bubbles(ocr_results, image_shape,
        raw_clusters.setdefault(label, [])
        raw_quads.setdefault(label, [])
        bbox, text, _ = ocr_results[idx]
        # Store (cy, cx, text, category)
        cat = ocr_results[idx][2]   # confidence stored as category below
        raw_clusters[label].append(
            (centers[idx][1], centers[idx][0], text))
        raw_quads[label].append(bbox)
@@ -335,15 +355,40 @@ def cluster_into_bubbles(ocr_results, image_shape,
        items_sorted = sorted(items, key=lambda t: t[0])
-        text_lines = [
+        # ── Build text lines ──────────────────────────────────────
-            text for _, _, text in items_sorted
+        # Alpha tokens become text lines.
-            if has_translatable_content(text)
+        # Punct tokens (... ?? etc.) are appended to the
-        ]
+        # nearest preceding alpha token on the same Y level.
        alpha_lines  = []   # (cy, text) for alpha tokens
        punct_tokens = []   # (cy, text) for punct tokens
        for cy, cx, text in items_sorted:
            if any(ch.isalpha() for ch in text):
                alpha_lines.append((cy, text))
            else:
                punct_tokens.append((cy, text))
        # Append each punct token to the closest alpha line by Y
        for pcy, ptext in punct_tokens:
            if alpha_lines:
                # Find alpha line with closest cy
                closest_idx = min(
                    range(len(alpha_lines)),
                    key=lambda k: abs(alpha_lines[k][0] - pcy)
                )
                cy_a, text_a = alpha_lines[closest_idx]
                alpha_lines[closest_idx] = (cy_a, text_a + ptext)
            # If no alpha lines at all, punct still contributes
            # to bbox but not to translation text
        text_lines = [t for _, t in alpha_lines]
        # Fallback: if no alpha at all, keep everything
        if not text_lines:
            text_lines = [text for _, _, text in items_sorted]
        bubble_dict[i] = text_lines
-        ocr_quads[i]   = quads
+        ocr_quads[i]   = quads   # ALL quads → full bbox
        bbox_dict[i] = get_cluster_bbox_from_ocr(
            quads, image_shape, padding_px=bbox_padding
@@ -353,7 +398,8 @@ def cluster_into_bubbles(ocr_results, image_shape,
        print(f"  Cluster #{i}: {len(quads)} quad(s)  "
              f"bbox=({int(b[0])},{int(b[1])})→"
              f"({int(b[2])},{int(b[3])})  "
-              f"w={int(b[2]-b[0])} h={int(b[3]-b[1])}")
+              f"w={int(b[2]-b[0])} h={int(b[3]-b[1])}  "
              f"text={text_lines}")
    return bubble_dict, bbox_dict, ocr_quads
@@ -519,15 +565,17 @@ def translate_manga_text(
    for bbox, text, confidence in results:
        cleaned = text.strip()
-        keep, reason = should_keep_token(
+        keep, category = should_keep_token(
            cleaned, confidence,
            confidence_threshold, min_text_length,
            filter_sound_effects
        )
        if keep:
            filtered.append((bbox, cleaned, confidence))
            if category == "punct":
                print(f"  ✔ Punct kept:  '{cleaned}'")
        else:
-            if reason == "sound effect":
+            if category == "sound effect":
                print(f"  🔇 SFX skipped: '{cleaned}'")
            skipped += 1
@@ -656,6 +704,6 @@ if __name__ == "__main__":
        filter_sound_effects = True,
        quality_threshold    = 0.5,
        upscale_factor       = 2.5,
-        bbox_padding         = 0,
+        bbox_padding         = 3,
        debug                = True,
    )