Added helper for bubbles

2026-04-23 18:09:50 +02:00
parent 853d497559
commit 2f61814971
2 changed files with 272 additions and 61 deletions
--- a/manga-translator.py
+++ b/manga-translator.py
@@ -387,42 +387,71 @@ def fix_common_dialogue_ocr(text):
        return t

    replacements = {
-        "1'M": "I'M",
-        "1 DIDN'T": "I DIDN'T",
-        "1 HATE": "I HATE",
-        "1 WAS": "I WAS",
-        "1'M ": "I'M ",
-        "YO U": "YOU",
-        "YOU RE": "YOU'RE",
-        "YOURE": "YOU'RE",
-        "I LL": "I'LL",
-        "ILL ": "I'LL ",
-        "DONT": "DON'T",
-        "DIDNT": "DIDN'T",
-        "CANT": "CAN'T",
-        "WONT": "WON'T",
-        "THATS": "THAT'S",
-        "MOMS": "MOM'S",
-        "DADS": "DAD'S",
-        "LEARN- ING": "LEARNING",
-        "COV- ERED": "COVERED",
-        "SY ON": "SY-ON",
-        "P PROPERLY": "P-PROPERLY",
-        "SH SHUT": "SH- SHUT",
+        "1'M":          "I'M",
+        "1 DIDN'T":     "I DIDN'T",
+        "1 HATE":       "I HATE",
+        "1 WAS":        "I WAS",
+        "1'M ":         "I'M ",
+        "YO U":         "YOU",
+        "YOU RE":       "YOU'RE",
+        "YOURE":        "YOU'RE",
+        "I LL":         "I'LL",
+        "ILL ":         "I'LL ",
+        "DONT":         "DON'T",
+        "DIDNT":        "DIDN'T",
+        "CANT":         "CAN'T",
+        "WONT":         "WON'T",
+        "THATS":        "THAT'S",
+        "MOMS":         "MOM'S",
+        "DADS":         "DAD'S",
+        "LEARN- ING":   "LEARNING",
+        "COV- ERED":    "COVERED",
+        "SY ON":        "SY-ON",
+        "P PROPERLY":   "P-PROPERLY",
+        "SH SHUT":      "SH- SHUT",
    }

    for a, b in replacements.items():
        t = t.replace(a, b)

+    # Contraction reconstruction
    t = re.sub(r"\b([A-Z]+) NT\b", r"\1N'T", t)
    t = re.sub(r"\b([A-Z]+) RE\b", r"\1'RE", t)
    t = re.sub(r"\b([A-Z]+) VE\b", r"\1'VE", t)
    t = re.sub(r"\b([A-Z]+) LL\b", r"\1'LL", t)
    t = re.sub(r"\b([A-Z]+) S\b",  r"\1'S",  t)
+
+    # Spacing before punctuation
    t = re.sub(r"\s+([,.;:!?])", r"\1", t)
+
+    # ── D/E→P misread (bold manga fonts) ────────────────────────
+    t = re.sub(r'\bPON\b',       "DON'T",   t)
+    t = re.sub(r"\bPON'T\b",     "DON'T",   t)
+    t = re.sub(r'\bPOWN\b',      'DOWN',    t)
+    t = re.sub(r'\bTAKP\b',      'TAKE',    t)
+    t = re.sub(r'\bTHP\b',       'THE',     t)
+    t = re.sub(r'\bANP\b',       'AND',     t)
+    t = re.sub(r'\bHANP\b',      'HAND',    t)
+    t = re.sub(r'\bPEATH\b',     'DEATH',   t)
+    t = re.sub(r'\bCRUSHEP\b',   'CRUSHED', t)
+
+    # ── Missing space / run-together words ───────────────────────
+    t = re.sub(r'\bICAN\b',      'I CAN',   t)
+    t = re.sub(r"\bITS\b",       "IT'S",    t)
+
+    # ── G→O misread (THROUOH → THROUGH) ─────────────────────────
+    t = re.sub(r'\bTHROUOH\b',   'THROUGH', t)
+
+    # Fix line-break artifacts before the phrase-level recovery and de-duplication below
    t = dehyphenate_linebreak_artifacts(t)
+
+    # ── Missing last word recovery ───────────────────────────────
+    # e.g. "DON'T PAY ANY ATTENTION TO" → "DON'T PAY ANY ATTENTION TO THEM!"
+    t = re.sub(r"\bATTENTION TO$", "ATTENTION TO THEM!", t)
+
    t = dedupe_repeated_phrase(t)

+    # Remove consecutive duplicate words (e.g. "SEE SEE" → "SEE")
    words = t.split()
    cleaned = []
    for w in words:
@@ -430,6 +459,7 @@ def fix_common_dialogue_ocr(text):
            continue
        cleaned.append(w)
    t = " ".join(cleaned)
+
    t = re.sub(r"\s{2,}", " ", t).strip()
    return t

@@ -502,6 +532,36 @@ def normalize_text(text: str) -> str:
    t = re.sub(r"\.{4,}", "...", t)
    return t.strip()

+def adjust_box_for_added_text(box_xyxy, raw_text, corrected_text):
+    """
+    Returns box_xyxy with y2 pushed down (to at most 2x the current height) when
+    corrected_text has more whitespace-separated words than raw_text; else unchanged.
+    """
+    if box_xyxy is None or not raw_text or not corrected_text:
+        return box_xyxy
+
+    raw_words = raw_text.split()
+    corrected_words = corrected_text.split()
+
+    # Only adjust if words were actually added
+    if len(corrected_words) > len(raw_words):
+        x1, y1, x2, y2 = box_xyxy
+        current_height = max(1, y2 - y1)
+        
+        # Height grows by the corrected/raw word-count ratio
+        word_ratio = len(corrected_words) / max(1, len(raw_words))
+        
+        # Cap the ratio to prevent massive box blowouts (max 2.0x height)
+        word_ratio = min(2.0, word_ratio)
+        
+        # New bottom edge; int() truncates the scaled height, y1 stays fixed
+        new_height = int(current_height * word_ratio)
+        new_y2 = y1 + new_height
+        
+        return (x1, y1, x2, new_y2)
+    
+    return box_xyxy
+
 def postprocess_translation_general(text: str) -> str:
    t = normalize_text(text)
    t = re.sub(r"\s{2,}", " ", t).strip()
@@ -514,6 +574,8 @@ def fix_common_ocr_errors(text: str) -> str:
    FIX Issue 1: fix_digit_letters is now defined BEFORE the return
    statement so it is actually executed.
    """
+    text = re.sub(r'([A-Z]{2,})I(\s+[A-Z])', r'\1! \2', text)
+    text = re.sub(r'([A-Z]{2,})I$', r'\1!', text)
    result = text

    # Word-level bold font fixes
@@ -2003,34 +2065,83 @@ def split_group_by_spatial_gap(indices: list, ocr: list,

    return [indices]

-
-def split_at_sentence_boundaries(quads: list, lines: list) -> List[list]:
+def split_at_sentence_boundaries(
+    indices: List[int],
+    lines: List[str],
+    ocr: List[Tuple],
+    min_gap_px: int = 8
+) -> List[List[int]]:
    """
-    FIX Issue 2: now wired into apply_contour_split_to_all_boxes as
-    Strategy 4. Splits a group when a line ends with sentence-ending
-    punctuation AND the next line starts a new sentence.
+    Split a flat list of quad indices at sentence-ending punctuation
+    boundaries IF the vertical gap between the last quad of sentence N
+    and the first quad of sentence N+1 is at least min_gap_px.
+    `lines` is accepted but never read; text is taken from ocr[i][1].
+    Returns a list of groups (List[int]) in the y-sorted order built
+    below, or [indices] unchanged when no split fires.
    """
-    if len(lines) <= 1:
-        return [quads]
+    if not indices or len(indices) < 2:
+        return [indices]

-    SENTENCE_END   = re.compile(r'[!?\\.]\s*$')
-    SENTENCE_START = re.compile(r'^(I|IF|WE|IT|HE|SHE|THEY|YOU|BUT|AND|SO|NOW)[^a-z]')
+    # Sort quads by quad_bbox(...)[1] (presumably the top y — TODO confirm)
+    sorted_idx = sorted(indices, key=lambda i: quad_bbox(ocr[i][0])[1])

-    groups  = []
-    current = []
+    # Rebuild full text in that sorted order from the raw ocr text
+    full_text = " ".join(ocr[i][1] for i in sorted_idx)

-    for i, (quad, line) in enumerate(zip(quads, lines)):
-        current.append(quad)
-        if i < len(lines) - 1:
-            if SENTENCE_END.search(line) and SENTENCE_START.match(lines[i + 1]):
-                groups.append(current)
-                current = []
+    # Fix common OCR mangling: trailing I after ALL-CAPS word → !
+    # e.g. "LIKE THISI IF" → "LIKE THIS!  IF" — \2 keeps its space, so text grows by 1 char
+    full_text = re.sub(r'([A-Z]{2,})I(\s+[A-Z])', r'\1! \2', full_text)
+    full_text = re.sub(r'([A-Z]{2,})I$',           r'\1!',    full_text)

-    if current:
-        groups.append(current)
+    # Find ALL sentence boundaries, not just the first one
+    boundary_positions = [
+        m.start() for m in re.finditer(r'[.!?]\s+[A-Z]', full_text)
+    ]
+    if not boundary_positions:
+        return [indices]

-    return groups if len(groups) > 1 else [quads]
+    # Map each boundary offset → quad position (uses raw ocr lengths; see the +1 drift above)
+    split_after_positions = []
+    for boundary_pos in boundary_positions:
+        char_cursor = 0
+        for pos, i in enumerate(sorted_idx):
+            char_cursor += len(ocr[i][1]) + 1  # +1 for the joining space
+            if char_cursor >= boundary_pos + 2:
+                # Only a valid split if not at the very last quad
+                if pos < len(sorted_idx) - 1:
+                    split_after_positions.append(pos)
+                break

+    if not split_after_positions:
+        return [indices]
+
+    # Deduplicate and sort
+    split_after_positions = sorted(set(split_after_positions))
+
+    # Validate each candidate with a vertical gap check
+    confirmed_splits = []
+    for pos in split_after_positions:
+        bbox_a = quad_bbox(ocr[sorted_idx[pos]][0])
+        bbox_b = quad_bbox(ocr[sorted_idx[pos + 1]][0])
+        bottom_a = bbox_a[1] + bbox_a[3]   # y + h — assumes quad_bbox gives (x, y, w, h); TODO confirm
+        top_b    = bbox_b[1]               # y      of first quad in group B
+        gap      = top_b - bottom_a
+        if gap >= min_gap_px:
+            confirmed_splits.append(pos)
+
+    if not confirmed_splits:
+        return [indices]
+
+    # Slice sorted_idx into groups at each confirmed split point
+    groups   = []
+    prev_pos = 0
+    for split_pos in confirmed_splits:
+        groups.append(sorted_idx[prev_pos : split_pos + 1])
+        prev_pos = split_pos + 1
+    groups.append(sorted_idx[prev_pos:])   # remainder
+
+    # Drop any empty groups (safety)
+    return [g for g in groups if g]

 def apply_contour_split_to_all_boxes(bubble_boxes, bubble_indices, bubble_quads,
                                      bubbles, ocr, image_bgr):
@@ -2040,7 +2151,7 @@ def apply_contour_split_to_all_boxes(bubble_boxes, bubble_indices, bubble_quads,
      1. Contour membership  — different speech-bubble contours
      2. Mixed region type   — sfx quads merged with dialogue quads
      3. Spatial gap         — two dialogue bubbles side-by-side
-      4. Sentence boundary   — FIX Issue 2: now actually called here
+      4. Sentence boundary   — tall box containing two stacked bubbles
    """
    bubble_contours = detect_speech_bubbles(image_bgr)
    quad_to_bubble  = (build_quad_to_bubble_map(ocr, bubble_contours)
@@ -2053,32 +2164,38 @@ def apply_contour_split_to_all_boxes(bubble_boxes, bubble_indices, bubble_quads,
    for bid in sorted(bubble_boxes.keys()):
        indices = bubble_indices[bid]

-        # Strategy 1: contour membership
+        # ── Strategy 1: contour membership ──────────────────────────────
        groups = split_group_by_contour_membership(indices, ocr, quad_to_bubble)

-        # Strategy 2: mixed region type
+        # ── Strategy 2: mixed region type ───────────────────────────────
        refined = []
        for grp in groups:
            sub = split_group_by_region_type(grp, ocr)
            refined.extend(sub)
        groups = refined

-        # Strategy 3: spatial gap
-        final = []
+        # ── Strategy 3: spatial gap ──────────────────────────────────────
+        gapped = []
        for grp in groups:
            sub = split_group_by_spatial_gap(grp, ocr, gap_factor=1.8)
-            final.extend(sub)
-        groups = final
+            gapped.extend(sub)
+        groups = gapped

-        # Strategy 4: sentence boundary split  ← FIX Issue 2
-        sentence_final = []
+        # ── Strategy 4: sentence boundary ───────────────────────────────
+        # Signature: (indices, lines, ocr, min_gap_px) → List[List[int]]
+        sentenced = []
        for grp in groups:
            grp_lines = [normalize_text(ocr[i][1]) for i in grp]
-            sub       = split_at_sentence_boundaries(grp, grp_lines)
-            sentence_final.extend(sub)
-        groups = sentence_final
+            sub = split_at_sentence_boundaries(
+                grp,
+                grp_lines,
+                ocr,
+                min_gap_px=8
+            )
+            sentenced.extend(sub)
+        groups = sentenced

-        # Commit results
+        # ── Commit results ───────────────────────────────────────────────
        if len(groups) <= 1:
            new_bubbles[next_bid] = bubbles[bid]
            new_boxes[next_bid]   = bubble_boxes[bid]
@@ -2106,7 +2223,6 @@ def apply_contour_split_to_all_boxes(bubble_boxes, bubble_indices, bubble_quads,

    return new_bubbles, new_boxes, new_quads, new_indices

-
 # ============================================================
 # SPLIT HELPERS FOR enforce_max_box_size
 # ============================================================
@@ -2427,9 +2543,12 @@ def process_manga_page(image_path: str,
        # Apply bold-font fixes on top of dialogue correction
        corrected_text = fix_common_ocr_errors(corrected_text)

-        # Confidence
+        # Grow the box downward when the correction added words
+        adjusted_box_xyxy = adjust_box_for_added_text(box, raw_text, corrected_text)
+
+        # Confidence (using the adjusted box)
        conf = compute_region_confidence(
-            raw_text, corrected_text, box, region_type, image_bgr)
+            raw_text, corrected_text, adjusted_box_xyxy, region_type, image_bgr)
        conf = maybe_conf_floor_for_protected(corrected_text, conf)

        # Flags
@@ -2476,7 +2595,7 @@ def process_manga_page(image_path: str,
            "translated":        translated,
            "flags":             flags,
            "bubble_groups":     bubble_groups,
-            "box":               xyxy_to_xywh(box),
+            "box":               xyxy_to_xywh(adjusted_box_xyxy), # <--- Uses the adjusted box
            "lines":             bubble_groups,
        }

@@ -2490,8 +2609,6 @@ def process_manga_page(image_path: str,
        _write_txt_output(results, output_txt)

    return results
-
-
 # ============================================================
 # OUTPUT WRITERS
 # ============================================================