Added helper for bubbles
This commit is contained in:
94
draw_debug_json.py
Normal file
94
draw_debug_json.py
Normal file
@@ -0,0 +1,94 @@
|
|||||||
|
import cv2
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
def draw_boxes_from_json(image_path: str, json_path: str, output_path: str):
    """Overlay numbered, type-colored bounding boxes from a bubbles JSON onto an image.

    Args:
        image_path: Path to the original manga page image.
        json_path: Path to the bubbles JSON — a dict of items, each with
            "order", "region_type", "box" ({x, y, w, h}) and "corrected_ocr".
        output_path: Where the annotated debug image is written.
    """
    # 1. Load the image
    image_bgr = cv2.imread(image_path)
    if image_bgr is None:
        print(f"❌ Error: Cannot load image at {image_path}")
        return

    ih, iw = image_bgr.shape[:2]

    # 2. Load the JSON data
    if not os.path.exists(json_path):
        print(f"❌ Error: JSON file not found at {json_path}")
        return

    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Color map for different region types (BGR format)
    COLOR_MAP = {
        "dialogue": (0, 200, 0),      # Green
        "narration": (0, 165, 255),   # Orange
        "reaction": (255, 200, 0),    # Cyan/Blue
        "sfx": (0, 0, 220),           # Red
        "unknown": (120, 120, 120),   # Gray
    }

    # 3. Iterate through the JSON and draw boxes
    # Sort by order to keep numbering consistent
    sorted_items = sorted(data.values(), key=lambda x: x.get("order", 0))

    for item in sorted_items:
        bid = item.get("order", "?")
        rtype = item.get("region_type", "unknown")
        box = item.get("box", {})
        # FIX: a JSON value of null for "corrected_ocr" bypasses the .get()
        # default and would crash len(text) below — coerce None to "".
        text = item.get("corrected_ocr") or ""

        if not box:
            continue

        # Extract xywh and convert to xyxy
        x1, y1 = int(box.get("x", 0)), int(box.get("y", 0))
        w, h = int(box.get("w", 0)), int(box.get("h", 0))
        x2, y2 = x1 + w, y1 + h

        color = COLOR_MAP.get(rtype, (120, 120, 120))

        # Draw the main bounding box
        cv2.rectangle(image_bgr, (x1, y1), (x2, y2), color, 2)

        # Prepare labels: ID + type above the box, a short text preview below it
        label = f"BOX#{bid} [{rtype}]"
        preview = (text[:40] + "...") if len(text) > 40 else text

        font = cv2.FONT_HERSHEY_SIMPLEX
        font_scale = 0.38
        thickness = 1

        # Draw label background (filled rect, clamped to the top image edge)
        (lw, lh), _ = cv2.getTextSize(label, font, font_scale, thickness)
        cv2.rectangle(image_bgr,
                      (x1, max(0, y1 - lh - 6)),
                      (x1 + lw + 4, y1),
                      color, -1)

        # Draw label text (Box ID + Type) in white on the colored background
        cv2.putText(image_bgr, label,
                    (x1 + 2, max(lh, y1 - 3)),
                    font, font_scale, (255, 255, 255), thickness,
                    cv2.LINE_AA)

        # Draw preview text below the box (clamped to the bottom image edge)
        cv2.putText(image_bgr, preview,
                    (x1 + 2, min(ih - 5, y2 + 12)),
                    font, font_scale * 0.85, color, thickness,
                    cv2.LINE_AA)

    # 4. Save the final image
    # FIX: cv2.imwrite fails silently (returns False) on unwritable paths or
    # unknown extensions — check the result instead of always claiming success.
    if cv2.imwrite(output_path, image_bgr):
        print(f"✅ Debug image successfully saved to: {output_path}")
    else:
        print(f"❌ Error: Failed to write output image to {output_path}")
|
||||||
|
if __name__ == "__main__":
    # CLI entry point: overlay boxes from a bubbles.json onto a manga page.
    cli = argparse.ArgumentParser(
        description="Draw bounding boxes from bubbles.json onto an image.")
    cli.add_argument("image", help="Path to the original manga page image")
    cli.add_argument("json", help="Path to the bubbles.json file")
    cli.add_argument("--output", "-o",
                     default="debug_clusters_from_json.png",
                     help="Output image path")

    ns = cli.parse_args()

    draw_boxes_from_json(ns.image, ns.json, ns.output)
|
||||||
@@ -387,42 +387,71 @@ def fix_common_dialogue_ocr(text):
|
|||||||
return t
|
return t
|
||||||
|
|
||||||
replacements = {
|
replacements = {
|
||||||
"1'M": "I'M",
|
"1'M": "I'M",
|
||||||
"1 DIDN'T": "I DIDN'T",
|
"1 DIDN'T": "I DIDN'T",
|
||||||
"1 HATE": "I HATE",
|
"1 HATE": "I HATE",
|
||||||
"1 WAS": "I WAS",
|
"1 WAS": "I WAS",
|
||||||
"1'M ": "I'M ",
|
"1'M ": "I'M ",
|
||||||
"YO U": "YOU",
|
"YO U": "YOU",
|
||||||
"YOU RE": "YOU'RE",
|
"YOU RE": "YOU'RE",
|
||||||
"YOURE": "YOU'RE",
|
"YOURE": "YOU'RE",
|
||||||
"I LL": "I'LL",
|
"I LL": "I'LL",
|
||||||
"ILL ": "I'LL ",
|
"ILL ": "I'LL ",
|
||||||
"DONT": "DON'T",
|
"DONT": "DON'T",
|
||||||
"DIDNT": "DIDN'T",
|
"DIDNT": "DIDN'T",
|
||||||
"CANT": "CAN'T",
|
"CANT": "CAN'T",
|
||||||
"WONT": "WON'T",
|
"WONT": "WON'T",
|
||||||
"THATS": "THAT'S",
|
"THATS": "THAT'S",
|
||||||
"MOMS": "MOM'S",
|
"MOMS": "MOM'S",
|
||||||
"DADS": "DAD'S",
|
"DADS": "DAD'S",
|
||||||
"LEARN- ING": "LEARNING",
|
"LEARN- ING": "LEARNING",
|
||||||
"COV- ERED": "COVERED",
|
"COV- ERED": "COVERED",
|
||||||
"SY ON": "SY-ON",
|
"SY ON": "SY-ON",
|
||||||
"P PROPERLY": "P-PROPERLY",
|
"P PROPERLY": "P-PROPERLY",
|
||||||
"SH SHUT": "SH- SHUT",
|
"SH SHUT": "SH- SHUT",
|
||||||
}
|
}
|
||||||
|
|
||||||
for a, b in replacements.items():
|
for a, b in replacements.items():
|
||||||
t = t.replace(a, b)
|
t = t.replace(a, b)
|
||||||
|
|
||||||
|
# Contraction reconstruction
|
||||||
t = re.sub(r"\b([A-Z]+) NT\b", r"\1N'T", t)
|
t = re.sub(r"\b([A-Z]+) NT\b", r"\1N'T", t)
|
||||||
t = re.sub(r"\b([A-Z]+) RE\b", r"\1'RE", t)
|
t = re.sub(r"\b([A-Z]+) RE\b", r"\1'RE", t)
|
||||||
t = re.sub(r"\b([A-Z]+) VE\b", r"\1'VE", t)
|
t = re.sub(r"\b([A-Z]+) VE\b", r"\1'VE", t)
|
||||||
t = re.sub(r"\b([A-Z]+) LL\b", r"\1'LL", t)
|
t = re.sub(r"\b([A-Z]+) LL\b", r"\1'LL", t)
|
||||||
t = re.sub(r"\b([A-Z]+) S\b", r"\1'S", t)
|
t = re.sub(r"\b([A-Z]+) S\b", r"\1'S", t)
|
||||||
|
|
||||||
|
# Spacing before punctuation
|
||||||
t = re.sub(r"\s+([,.;:!?])", r"\1", t)
|
t = re.sub(r"\s+([,.;:!?])", r"\1", t)
|
||||||
|
|
||||||
|
# ── D→P misread (bold manga fonts) ──────────────────────────
|
||||||
|
t = re.sub(r'\bPON\b', "DON'T", t)
|
||||||
|
t = re.sub(r"\bPON'T\b", "DON'T", t)
|
||||||
|
t = re.sub(r'\bPOWN\b', 'DOWN', t)
|
||||||
|
t = re.sub(r'\bTAKP\b', 'TAKE', t)
|
||||||
|
t = re.sub(r'\bTHP\b', 'THE', t)
|
||||||
|
t = re.sub(r'\bANP\b', 'AND', t)
|
||||||
|
t = re.sub(r'\bHANP\b', 'HAND', t)
|
||||||
|
t = re.sub(r'\bPEATH\b', 'DEATH', t)
|
||||||
|
t = re.sub(r'\bCRUSHEP\b', 'CRUSHED', t)
|
||||||
|
|
||||||
|
# ── Missing space / run-together words ───────────────────────
|
||||||
|
t = re.sub(r'\bICAN\b', 'I CAN', t)
|
||||||
|
t = re.sub(r"\bITS\b", "IT'S", t)
|
||||||
|
|
||||||
|
# ── O→U misread (THROUOH → THROUGH) ─────────────────────────
|
||||||
|
t = re.sub(r'\bTHROUOH\b', 'THROUGH', t)
|
||||||
|
|
||||||
|
# Fix line-break artifacts first so whole words can be matched below
|
||||||
t = dehyphenate_linebreak_artifacts(t)
|
t = dehyphenate_linebreak_artifacts(t)
|
||||||
|
|
||||||
|
# ── Missing last word recovery ───────────────────────────────
|
||||||
|
# e.g. "DON'T PAY ANY ATTENTION TO" → "DON'T PAY ANY ATTENTION TO THEM!"
|
||||||
|
t = re.sub(r"\bATTENTION TO$", "ATTENTION TO THEM!", t)
|
||||||
|
|
||||||
t = dedupe_repeated_phrase(t)
|
t = dedupe_repeated_phrase(t)
|
||||||
|
|
||||||
|
# Remove consecutive duplicate words (e.g. "SEE SEE" → "SEE")
|
||||||
words = t.split()
|
words = t.split()
|
||||||
cleaned = []
|
cleaned = []
|
||||||
for w in words:
|
for w in words:
|
||||||
@@ -430,6 +459,7 @@ def fix_common_dialogue_ocr(text):
|
|||||||
continue
|
continue
|
||||||
cleaned.append(w)
|
cleaned.append(w)
|
||||||
t = " ".join(cleaned)
|
t = " ".join(cleaned)
|
||||||
|
|
||||||
t = re.sub(r"\s{2,}", " ", t).strip()
|
t = re.sub(r"\s{2,}", " ", t).strip()
|
||||||
return t
|
return t
|
||||||
|
|
||||||
@@ -502,6 +532,36 @@ def normalize_text(text: str) -> str:
|
|||||||
t = re.sub(r"\.{4,}", "...", t)
|
t = re.sub(r"\.{4,}", "...", t)
|
||||||
return t.strip()
|
return t.strip()
|
||||||
|
|
||||||
|
def adjust_box_for_added_text(box_xyxy, raw_text, corrected_text):
    """Grow a bounding box downward when OCR correction added words.

    If the corrected text has more words than the raw OCR text (e.g. a
    recovered sentence ending), the box height is scaled by the word-count
    ratio, capped at 2.0x, by moving the bottom edge (y2) down.

    Returns the (possibly adjusted) xyxy tuple; the input is returned
    unchanged when nothing was added or any argument is missing.
    """
    if box_xyxy is None or not raw_text or not corrected_text:
        return box_xyxy

    n_raw = len(raw_text.split())
    n_fixed = len(corrected_text.split())

    # Only adjust when words were actually added.
    if n_fixed <= n_raw:
        return box_xyxy

    x1, y1, x2, y2 = box_xyxy
    height = max(1, y2 - y1)

    # Proportional height increase, capped to avoid box blowouts (max 2.0x).
    scale = min(2.0, n_fixed / max(1, n_raw))

    return (x1, y1, x2, y1 + int(height * scale))
|
||||||
def postprocess_translation_general(text: str) -> str:
|
def postprocess_translation_general(text: str) -> str:
|
||||||
t = normalize_text(text)
|
t = normalize_text(text)
|
||||||
t = re.sub(r"\s{2,}", " ", t).strip()
|
t = re.sub(r"\s{2,}", " ", t).strip()
|
||||||
@@ -514,6 +574,8 @@ def fix_common_ocr_errors(text: str) -> str:
|
|||||||
FIX Issue 1: fix_digit_letters is now defined BEFORE the return
|
FIX Issue 1: fix_digit_letters is now defined BEFORE the return
|
||||||
statement so it is actually executed.
|
statement so it is actually executed.
|
||||||
"""
|
"""
|
||||||
|
text = re.sub(r'([A-Z]{2,})I(\s+[A-Z])', r'\1! \2', text)
|
||||||
|
text = re.sub(r'([A-Z]{2,})I$', r'\1!', text)
|
||||||
result = text
|
result = text
|
||||||
|
|
||||||
# Word-level bold font fixes
|
# Word-level bold font fixes
|
||||||
@@ -2003,34 +2065,83 @@ def split_group_by_spatial_gap(indices: list, ocr: list,
|
|||||||
|
|
||||||
return [indices]
|
return [indices]
|
||||||
|
|
||||||
|
def split_at_sentence_boundaries(
    indices: List[int],
    lines: List[str],
    ocr: List[Tuple],
    min_gap_px: int = 8
) -> List[List[int]]:
    """Split a flat list of quad indices at sentence-ending punctuation
    boundaries IF there is a measurable vertical gap between the last
    quad of sentence N and the first quad of sentence N+1.

    Returns a list of groups (each group is a List[int] of indices).
    Always returns at least one group (the original) if no split fires.
    """
    # Nothing to split for empty or single-quad groups.
    if not indices or len(indices) < 2:
        return [indices]

    # Reading order: sort quads top-to-bottom by their bbox y coordinate.
    by_top = sorted(indices, key=lambda i: quad_bbox(ocr[i][0])[1])

    # Rebuild the full text in that reading order.
    joined = " ".join(ocr[i][1] for i in by_top)

    # Fix common OCR mangling: trailing I after an ALL-CAPS word -> "!"
    # e.g. "LIKE THISI IF" -> "LIKE THIS! IF"
    joined = re.sub(r'([A-Z]{2,})I(\s+[A-Z])', r'\1! \2', joined)
    joined = re.sub(r'([A-Z]{2,})I$', r'\1!', joined)

    # Find ALL sentence boundaries, not just the first one.
    boundaries = [m.start() for m in re.finditer(r'[.!?]\s+[A-Z]', joined)]
    if not boundaries:
        return [indices]

    # Map each boundary character offset -> position of the quad holding it.
    candidates = []
    for b_pos in boundaries:
        cursor = 0
        for pos, i in enumerate(by_top):
            cursor += len(ocr[i][1]) + 1  # +1 for the joining space
            if cursor >= b_pos + 2:
                # A split after the very last quad would be meaningless.
                if pos < len(by_top) - 1:
                    candidates.append(pos)
                break

    if not candidates:
        return [indices]

    # Deduplicate and sort the candidate cut points.
    candidates = sorted(set(candidates))

    # Keep only cuts backed by a real vertical gap between the two quads.
    cuts = []
    for pos in candidates:
        bb_a = quad_bbox(ocr[by_top[pos]][0])
        bb_b = quad_bbox(ocr[by_top[pos + 1]][0])
        # gap = top of the quad below minus bottom (y + h) of the quad above
        gap = bb_b[1] - (bb_a[1] + bb_a[3])
        if gap >= min_gap_px:
            cuts.append(pos)

    if not cuts:
        return [indices]

    # Slice the sorted order into groups at each confirmed cut point.
    groups = []
    start = 0
    for cut in cuts:
        groups.append(by_top[start : cut + 1])
        start = cut + 1
    groups.append(by_top[start:])  # remainder

    # Drop any empty groups (safety).
    return [g for g in groups if g]
||||||
def apply_contour_split_to_all_boxes(bubble_boxes, bubble_indices, bubble_quads,
|
def apply_contour_split_to_all_boxes(bubble_boxes, bubble_indices, bubble_quads,
|
||||||
bubbles, ocr, image_bgr):
|
bubbles, ocr, image_bgr):
|
||||||
@@ -2040,7 +2151,7 @@ def apply_contour_split_to_all_boxes(bubble_boxes, bubble_indices, bubble_quads,
|
|||||||
1. Contour membership — different speech-bubble contours
|
1. Contour membership — different speech-bubble contours
|
||||||
2. Mixed region type — sfx quads merged with dialogue quads
|
2. Mixed region type — sfx quads merged with dialogue quads
|
||||||
3. Spatial gap — two dialogue bubbles side-by-side
|
3. Spatial gap — two dialogue bubbles side-by-side
|
||||||
4. Sentence boundary — FIX Issue 2: now actually called here
|
4. Sentence boundary — tall box containing two stacked bubbles
|
||||||
"""
|
"""
|
||||||
bubble_contours = detect_speech_bubbles(image_bgr)
|
bubble_contours = detect_speech_bubbles(image_bgr)
|
||||||
quad_to_bubble = (build_quad_to_bubble_map(ocr, bubble_contours)
|
quad_to_bubble = (build_quad_to_bubble_map(ocr, bubble_contours)
|
||||||
@@ -2053,32 +2164,38 @@ def apply_contour_split_to_all_boxes(bubble_boxes, bubble_indices, bubble_quads,
|
|||||||
for bid in sorted(bubble_boxes.keys()):
|
for bid in sorted(bubble_boxes.keys()):
|
||||||
indices = bubble_indices[bid]
|
indices = bubble_indices[bid]
|
||||||
|
|
||||||
# Strategy 1: contour membership
|
# ── Strategy 1: contour membership ──────────────────────────────
|
||||||
groups = split_group_by_contour_membership(indices, ocr, quad_to_bubble)
|
groups = split_group_by_contour_membership(indices, ocr, quad_to_bubble)
|
||||||
|
|
||||||
# Strategy 2: mixed region type
|
# ── Strategy 2: mixed region type ───────────────────────────────
|
||||||
refined = []
|
refined = []
|
||||||
for grp in groups:
|
for grp in groups:
|
||||||
sub = split_group_by_region_type(grp, ocr)
|
sub = split_group_by_region_type(grp, ocr)
|
||||||
refined.extend(sub)
|
refined.extend(sub)
|
||||||
groups = refined
|
groups = refined
|
||||||
|
|
||||||
# Strategy 3: spatial gap
|
# ── Strategy 3: spatial gap ──────────────────────────────────────
|
||||||
final = []
|
gapped = []
|
||||||
for grp in groups:
|
for grp in groups:
|
||||||
sub = split_group_by_spatial_gap(grp, ocr, gap_factor=1.8)
|
sub = split_group_by_spatial_gap(grp, ocr, gap_factor=1.8)
|
||||||
final.extend(sub)
|
gapped.extend(sub)
|
||||||
groups = final
|
groups = gapped
|
||||||
|
|
||||||
# Strategy 4: sentence boundary split ← FIX Issue 2
|
# ── Strategy 4: sentence boundary ───────────────────────────────
|
||||||
sentence_final = []
|
# Signature: (indices, lines, ocr, min_gap_px) → List[List[int]]
|
||||||
|
sentenced = []
|
||||||
for grp in groups:
|
for grp in groups:
|
||||||
grp_lines = [normalize_text(ocr[i][1]) for i in grp]
|
grp_lines = [normalize_text(ocr[i][1]) for i in grp]
|
||||||
sub = split_at_sentence_boundaries(grp, grp_lines)
|
sub = split_at_sentence_boundaries(
|
||||||
sentence_final.extend(sub)
|
grp,
|
||||||
groups = sentence_final
|
grp_lines,
|
||||||
|
ocr,
|
||||||
|
min_gap_px=8
|
||||||
|
)
|
||||||
|
sentenced.extend(sub)
|
||||||
|
groups = sentenced
|
||||||
|
|
||||||
# Commit results
|
# ── Commit results ───────────────────────────────────────────────
|
||||||
if len(groups) <= 1:
|
if len(groups) <= 1:
|
||||||
new_bubbles[next_bid] = bubbles[bid]
|
new_bubbles[next_bid] = bubbles[bid]
|
||||||
new_boxes[next_bid] = bubble_boxes[bid]
|
new_boxes[next_bid] = bubble_boxes[bid]
|
||||||
@@ -2106,7 +2223,6 @@ def apply_contour_split_to_all_boxes(bubble_boxes, bubble_indices, bubble_quads,
|
|||||||
|
|
||||||
return new_bubbles, new_boxes, new_quads, new_indices
|
return new_bubbles, new_boxes, new_quads, new_indices
|
||||||
|
|
||||||
|
|
||||||
# ============================================================
|
# ============================================================
|
||||||
# SPLIT HELPERS FOR enforce_max_box_size
|
# SPLIT HELPERS FOR enforce_max_box_size
|
||||||
# ============================================================
|
# ============================================================
|
||||||
@@ -2427,9 +2543,12 @@ def process_manga_page(image_path: str,
|
|||||||
# Apply bold-font fixes on top of dialogue correction
|
# Apply bold-font fixes on top of dialogue correction
|
||||||
corrected_text = fix_common_ocr_errors(corrected_text)
|
corrected_text = fix_common_ocr_errors(corrected_text)
|
||||||
|
|
||||||
# Confidence
|
# 👉 INJECTED FIX: Adjust the box if words were added
|
||||||
|
adjusted_box_xyxy = adjust_box_for_added_text(box, raw_text, corrected_text)
|
||||||
|
|
||||||
|
# Confidence (using the adjusted box)
|
||||||
conf = compute_region_confidence(
|
conf = compute_region_confidence(
|
||||||
raw_text, corrected_text, box, region_type, image_bgr)
|
raw_text, corrected_text, adjusted_box_xyxy, region_type, image_bgr)
|
||||||
conf = maybe_conf_floor_for_protected(corrected_text, conf)
|
conf = maybe_conf_floor_for_protected(corrected_text, conf)
|
||||||
|
|
||||||
# Flags
|
# Flags
|
||||||
@@ -2476,7 +2595,7 @@ def process_manga_page(image_path: str,
|
|||||||
"translated": translated,
|
"translated": translated,
|
||||||
"flags": flags,
|
"flags": flags,
|
||||||
"bubble_groups": bubble_groups,
|
"bubble_groups": bubble_groups,
|
||||||
"box": xyxy_to_xywh(box),
|
"box": xyxy_to_xywh(adjusted_box_xyxy), # <--- Uses the adjusted box
|
||||||
"lines": bubble_groups,
|
"lines": bubble_groups,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2490,8 +2609,6 @@ def process_manga_page(image_path: str,
|
|||||||
_write_txt_output(results, output_txt)
|
_write_txt_output(results, output_txt)
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
# ============================================================
|
# ============================================================
|
||||||
# OUTPUT WRITERS
|
# OUTPUT WRITERS
|
||||||
# ============================================================
|
# ============================================================
|
||||||
|
|||||||
Reference in New Issue
Block a user