From 2f61814971b1c55305f59d55af64e9522db1f906 Mon Sep 17 00:00:00 2001
From: Guillem Hernandez Sola <guillem.hernandez.sola@gmail.com>
Date: Thu, 23 Apr 2026 18:09:50 +0200
Subject: [PATCH] Added helper for bubbles

---
 draw_debug_json.py  |  94 +++++++++++++++++
 manga-translator.py | 239 +++++++++++++++++++++++++++++++++-----------
 2 files changed, 272 insertions(+), 61 deletions(-)
 create mode 100644 draw_debug_json.py

diff --git a/draw_debug_json.py b/draw_debug_json.py
new file mode 100644
index 0000000..6c159d9
--- /dev/null
+++ b/draw_debug_json.py
@@ -0,0 +1,94 @@
+import cv2
+import json
+import os
+import argparse
+
+def draw_boxes_from_json(image_path: str, json_path: str, output_path: str):
+    """Draw every region box described in a bubbles JSON file onto an image.
+
+    The JSON is read as a mapping whose values are dicts carrying "order",
+    "region_type", "box" ({"x", "y", "w", "h"}) and "corrected_ocr".  An
+    unreadable image, missing JSON file or failed write prints an error.
+    """
+    # 1. Load the image (cv2.imread returns None instead of raising)
+    image_bgr = cv2.imread(image_path)
+    if image_bgr is None:
+        print(f"❌ Error: Cannot load image at {image_path}")
+        return
+
+    ih = image_bgr.shape[0]
+
+    # 2. Load the JSON data
+    if not os.path.exists(json_path):
+        print(f"❌ Error: JSON file not found at {json_path}")
+        return
+    with open(json_path, 'r', encoding='utf-8') as f:
+        data = json.load(f)
+
+    # Color map for different region types (BGR format)
+    COLOR_MAP = {
+        "dialogue":  (0,   200,   0),   # Green
+        "narration": (0,   165, 255),   # Orange
+        "reaction":  (255, 200,   0),   # Cyan/Blue
+        "sfx":       (0,     0, 220),   # Red
+        "unknown":   (120, 120, 120),   # Gray
+    }
+    font = cv2.FONT_HERSHEY_SIMPLEX
+    font_scale = 0.38
+    thickness = 1
+
+    # 3. Draw the boxes in "order" sequence to keep numbering consistent
+    for item in sorted(data.values(), key=lambda x: x.get("order", 0)):
+        box = item.get("box", {})
+        if not box:
+            continue
+        bid = item.get("order", "?")
+        rtype = item.get("region_type", "unknown")
+        # A JSON null would come back as None and break len()/slicing below
+        text = item.get("corrected_ocr") or ""
+
+        # Extract xywh and convert to xyxy
+        x1, y1 = int(box.get("x", 0)), int(box.get("y", 0))
+        x2, y2 = x1 + int(box.get("w", 0)), y1 + int(box.get("h", 0))
+        color = COLOR_MAP.get(rtype, COLOR_MAP["unknown"])
+
+        # Main bounding box
+        cv2.rectangle(image_bgr, (x1, y1), (x2, y2), color, 2)
+
+        label = f"BOX#{bid} [{rtype}]"
+        preview = (text[:40] + "...") if len(text) > 40 else text
+
+        # Filled label background sitting just above the box (clamped at 0)
+        (lw, lh), _ = cv2.getTextSize(label, font, font_scale, thickness)
+        cv2.rectangle(image_bgr,
+                      (x1, max(0, y1 - lh - 6)),
+                      (x1 + lw + 4, y1),
+                      color, -1)
+
+        # Label text (box id + type)
+        cv2.putText(image_bgr, label,
+                    (x1 + 2, max(lh, y1 - 3)),
+                    font, font_scale, (255, 255, 255), thickness,
+                    cv2.LINE_AA)
+
+        # OCR preview below the box, kept inside the image height
+        cv2.putText(image_bgr, preview,
+                    (x1 + 2, min(ih - 5, y2 + 12)),
+                    font, font_scale * 0.85, color, thickness,
+                    cv2.LINE_AA)
+
+    # 4. Save; cv2.imwrite signals failure through its boolean return value
+    if not cv2.imwrite(output_path, image_bgr):
+        print(f"❌ Error: Could not write debug image to {output_path}")
+        return
+    print(f"✅ Debug image successfully saved to: {output_path}")
+
+if __name__ == "__main__":
+    # CLI entry point: two positional paths plus an optional output path.
+    cli = argparse.ArgumentParser(
+        description="Draw bounding boxes from bubbles.json onto an image.")
+    cli.add_argument("image", help="Path to the original manga page image")
+    cli.add_argument("json", help="Path to the bubbles.json file")
+    cli.add_argument("--output", "-o", default="debug_clusters_from_json.png", help="Output image path")
+    opts = cli.parse_args()
+    draw_boxes_from_json(opts.image, opts.json, opts.output)
\ No newline at end of file
diff --git a/manga-translator.py b/manga-translator.py
index 11f0615..c209665 100644
--- a/manga-translator.py
+++ b/manga-translator.py
@@ -387,42 +387,71 @@ def fix_common_dialogue_ocr(text):
         return t
 
     replacements = {
-        "1'M": "I'M",
-        "1 DIDN'T": "I DIDN'T",
-        "1 HATE": "I HATE",
-        "1 WAS": "I WAS",
-        "1'M ": "I'M ",
-        "YO U": "YOU",
-        "YOU RE": "YOU'RE",
-        "YOURE": "YOU'RE",
-        "I LL": "I'LL",
-        "ILL ": "I'LL ",
-        "DONT": "DON'T",
-        "DIDNT": "DIDN'T",
-        "CANT": "CAN'T",
-        "WONT": "WON'T",
-        "THATS": "THAT'S",
-        "MOMS": "MOM'S",
-        "DADS": "DAD'S",
-        "LEARN- ING": "LEARNING",
-        "COV- ERED": "COVERED",
-        "SY ON": "SY-ON",
-        "P PROPERLY": "P-PROPERLY",
-        "SH SHUT": "SH- SHUT",
+        "1'M":          "I'M",
+        "1 DIDN'T":     "I DIDN'T",
+        "1 HATE":       "I HATE",
+        "1 WAS":        "I WAS",
+        "1'M ":         "I'M ",
+        "YO U":         "YOU",
+        "YOU RE":       "YOU'RE",
+        "YOURE":        "YOU'RE",
+        "I LL":         "I'LL",
+        "ILL ":         "I'LL ",
+        "DONT":         "DON'T",
+        "DIDNT":        "DIDN'T",
+        "CANT":         "CAN'T",
+        "WONT":         "WON'T",
+        "THATS":        "THAT'S",
+        "MOMS":         "MOM'S",
+        "DADS":         "DAD'S",
+        "LEARN- ING":   "LEARNING",
+        "COV- ERED":    "COVERED",
+        "SY ON":        "SY-ON",
+        "P PROPERLY":   "P-PROPERLY",
+        "SH SHUT":      "SH- SHUT",
     }
 
     for a, b in replacements.items():
         t = t.replace(a, b)
 
+    # Contraction reconstruction
     t = re.sub(r"\b([A-Z]+) NT\b", r"\1N'T", t)
     t = re.sub(r"\b([A-Z]+) RE\b", r"\1'RE", t)
     t = re.sub(r"\b([A-Z]+) VE\b", r"\1'VE", t)
     t = re.sub(r"\b([A-Z]+) LL\b", r"\1'LL", t)
     t = re.sub(r"\b([A-Z]+) S\b",  r"\1'S",  t)
+
+    # Spacing before punctuation
     t = re.sub(r"\s+([,.;:!?])", r"\1", t)
+
+    # ── D/E→P misreads; PON'T runs first: \bPON\b also matches in "PON'T" ──
+    t = re.sub(r"\bPON'T\b",     "DON'T",   t)
+    t = re.sub(r'\bPON\b',       "DON'T",   t)
+    t = re.sub(r'\bPOWN\b',      'DOWN',    t)
+    t = re.sub(r'\bTAKP\b',      'TAKE',    t)
+    t = re.sub(r'\bTHP\b',       'THE',     t)
+    t = re.sub(r'\bANP\b',       'AND',     t)
+    t = re.sub(r'\bHANP\b',      'HAND',    t)
+    t = re.sub(r'\bPEATH\b',     'DEATH',   t)
+    t = re.sub(r'\bCRUSHEP\b',   'CRUSHED', t)
+
+    # ── Missing space / run-together words ───────────────────────
+    t = re.sub(r'\bICAN\b',      'I CAN',   t)
+    t = re.sub(r"\bITS\b",       "IT'S",    t)
+
+    # ── G→O misread (THROUOH → THROUGH) ─────────────────────────
+    t = re.sub(r'\bTHROUOH\b',   'THROUGH', t)
+
+    # Fix line-break artifacts first so whole words can be matched below
     t = dehyphenate_linebreak_artifacts(t)
+
+    # ── Missing last word recovery ───────────────────────────────
+    # e.g. "DON'T PAY ANY ATTENTION TO" → "DON'T PAY ANY ATTENTION TO THEM!"
+    t = re.sub(r"\bATTENTION TO$", "ATTENTION TO THEM!", t)
+
     t = dedupe_repeated_phrase(t)
 
+    # Remove consecutive duplicate words (e.g. "SEE SEE" → "SEE")
     words = t.split()
     cleaned = []
     for w in words:
@@ -430,6 +459,7 @@ def fix_common_dialogue_ocr(text):
             continue
         cleaned.append(w)
     t = " ".join(cleaned)
+
     t = re.sub(r"\s{2,}", " ", t).strip()
     return t
 
@@ -502,6 +532,36 @@ def normalize_text(text: str) -> str:
     t = re.sub(r"\.{4,}", "...", t)
     return t.strip()
 
+def adjust_box_for_added_text(box_xyxy, raw_text, corrected_text):
+    """
+    Grow a box downwards when correction produced more words than raw OCR.
+
+    Word counts come from whitespace splitting.  The top edge and both side
+    edges are kept; only the bottom edge moves.  The height is scaled by
+    corrected/raw word count, capped at 2.0x, and truncated to an int.
+    The input box is returned unchanged when it is None, when either text
+    is empty, or when no words were added.
+    """
+    # Nothing to measure against: hand the box back as-is
+    if box_xyxy is None or not raw_text or not corrected_text:
+        return box_xyxy
+
+    n_raw = len(raw_text.split())
+    n_fixed = len(corrected_text.split())
+    # Only adjust if words were actually added
+    if n_fixed <= n_raw:
+        return box_xyxy
+
+    left, top, right, bottom = box_xyxy
+    height = max(1, bottom - top)
+
+    # Proportional growth, capped so one box cannot more than double
+    growth = min(2.0, n_fixed / max(1, n_raw))
+
+    # Only the bottom edge moves; int() truncates the scaled height
+    new_bottom = top + int(height * growth)
+    return (left, top, right, new_bottom)
+
 def postprocess_translation_general(text: str) -> str:
     t = normalize_text(text)
     t = re.sub(r"\s{2,}", " ", t).strip()
@@ -514,6 +574,8 @@ def fix_common_ocr_errors(text: str) -> str:
     FIX Issue 1: fix_digit_letters is now defined BEFORE the return
     statement so it is actually executed.
     """
+    text = re.sub(r'([A-Z]{2,})I(\s+[A-Z])', r'\1! \2', text)
+    text = re.sub(r'([A-Z]{2,})I$', r'\1!', text)
     result = text
 
     # Word-level bold font fixes
@@ -2003,34 +2065,83 @@ def split_group_by_spatial_gap(indices: list, ocr: list,
 
     return [indices]
 
-
-def split_at_sentence_boundaries(quads: list, lines: list) -> List[list]:
+def split_at_sentence_boundaries(
+    indices: List[int],
+    lines: List[str],
+    ocr: List[Tuple],
+    min_gap_px: int = 8
+) -> List[List[int]]:
     """
-    FIX Issue 2: now wired into apply_contour_split_to_all_boxes as
-    Strategy 4. Splits a group when a line ends with sentence-ending
-    punctuation AND the next line starts a new sentence.
+    Split quad indices (sorted top-to-bottom) after each sentence end
+    ([.!?] + capital) whose next quad starts >= min_gap_px lower.
+    ocr[i][0] is the quad fed to quad_bbox, ocr[i][1] its text; `lines`
+    is accepted for call compatibility but not read.
+
+    Returns List[List[int]]; [indices] unchanged when no split fires.
     """
-    if len(lines) <= 1:
-        return [quads]
+    if not indices or len(indices) < 2:
+        return [indices]
 
-    SENTENCE_END   = re.compile(r'[!?\\.]\s*$')
-    SENTENCE_START = re.compile(r'^(I|IF|WE|IT|HE|SHE|THEY|YOU|BUT|AND|SO|NOW)[^a-z]')
+    # Sort quads top-to-bottom by their y coordinate
+    sorted_idx = sorted(indices, key=lambda i: quad_bbox(ocr[i][0])[1])
 
-    groups  = []
-    current = []
+    # Rebuild full text in reading order
+    full_text = " ".join(ocr[i][1] for i in sorted_idx)
 
-    for i, (quad, line) in enumerate(zip(quads, lines)):
-        current.append(quad)
-        if i < len(lines) - 1:
-            if SENTENCE_END.search(line) and SENTENCE_START.match(lines[i + 1]):
-                groups.append(current)
-                current = []
+    # OCR often reads a trailing "!" as "I" ("LIKE THISI IF" → "LIKE THIS! IF").
+    # Replacement must keep the length so offsets still map onto the quads.
+    full_text = re.sub(r'([A-Z]{2,})I(\s+[A-Z])', r'\1!\2', full_text)
+    full_text = re.sub(r'([A-Z]{2,})I$',           r'\1!',    full_text)
 
-    if current:
-        groups.append(current)
+    # Find ALL sentence boundaries, not just the first one
+    boundary_positions = [
+        m.start() for m in re.finditer(r'[.!?]\s+[A-Z]', full_text)
+    ]
+    if not boundary_positions:
+        return [indices]
 
-    return groups if len(groups) > 1 else [quads]
+    # Map each boundary character position → quad position in sorted_idx
+    split_after_positions = []
+    for boundary_pos in boundary_positions:
+        char_cursor = 0
+        for pos, i in enumerate(sorted_idx):
+            char_cursor += len(ocr[i][1]) + 1  # +1 for the joining space
+            if char_cursor >= boundary_pos + 2:
+                # Only a valid split if not at the very last quad
+                if pos < len(sorted_idx) - 1:
+                    split_after_positions.append(pos)
+                break
 
+    if not split_after_positions:
+        return [indices]
+
+    # Deduplicate and sort
+    split_after_positions = sorted(set(split_after_positions))
+
+    # Gap check. NOTE(review): assumes quad_bbox -> (x, y, w, h); confirm.
+    confirmed_splits = []
+    for pos in split_after_positions:
+        bbox_a = quad_bbox(ocr[sorted_idx[pos]][0])
+        bbox_b = quad_bbox(ocr[sorted_idx[pos + 1]][0])
+        bottom_a = bbox_a[1] + bbox_a[3]   # y + h  of last quad in group A
+        top_b    = bbox_b[1]               # y      of first quad in group B
+        gap      = top_b - bottom_a
+        if gap >= min_gap_px:
+            confirmed_splits.append(pos)
+
+    if not confirmed_splits:
+        return [indices]
+
+    # Slice sorted_idx into groups at each confirmed split point
+    groups   = []
+    prev_pos = 0
+    for split_pos in confirmed_splits:
+        groups.append(sorted_idx[prev_pos : split_pos + 1])
+        prev_pos = split_pos + 1
+    groups.append(sorted_idx[prev_pos:])   # remainder
+
+    # Drop any empty groups (safety)
+    return [g for g in groups if g]
 
 def apply_contour_split_to_all_boxes(bubble_boxes, bubble_indices, bubble_quads,
                                       bubbles, ocr, image_bgr):
@@ -2040,7 +2151,7 @@ def apply_contour_split_to_all_boxes(bubble_boxes, bubble_indices, bubble_quads,
       1. Contour membership  — different speech-bubble contours
       2. Mixed region type   — sfx quads merged with dialogue quads
       3. Spatial gap         — two dialogue bubbles side-by-side
-      4. Sentence boundary   — FIX Issue 2: now actually called here
+      4. Sentence boundary   — tall box containing two stacked bubbles
     """
     bubble_contours = detect_speech_bubbles(image_bgr)
     quad_to_bubble  = (build_quad_to_bubble_map(ocr, bubble_contours)
@@ -2053,32 +2164,38 @@ def apply_contour_split_to_all_boxes(bubble_boxes, bubble_indices, bubble_quads,
     for bid in sorted(bubble_boxes.keys()):
         indices = bubble_indices[bid]
 
-        # Strategy 1: contour membership
+        # ── Strategy 1: contour membership ──────────────────────────────
         groups = split_group_by_contour_membership(indices, ocr, quad_to_bubble)
 
-        # Strategy 2: mixed region type
+        # ── Strategy 2: mixed region type ───────────────────────────────
         refined = []
         for grp in groups:
             sub = split_group_by_region_type(grp, ocr)
             refined.extend(sub)
         groups = refined
 
-        # Strategy 3: spatial gap
-        final = []
+        # ── Strategy 3: spatial gap ──────────────────────────────────────
+        gapped = []
         for grp in groups:
             sub = split_group_by_spatial_gap(grp, ocr, gap_factor=1.8)
-            final.extend(sub)
-        groups = final
+            gapped.extend(sub)
+        groups = gapped
 
-        # Strategy 4: sentence boundary split  ← FIX Issue 2
-        sentence_final = []
+        # ── Strategy 4: sentence boundary ───────────────────────────────
+        # Signature: (indices, lines, ocr, min_gap_px) → List[List[int]]
+        sentenced = []
         for grp in groups:
             grp_lines = [normalize_text(ocr[i][1]) for i in grp]
-            sub       = split_at_sentence_boundaries(grp, grp_lines)
-            sentence_final.extend(sub)
-        groups = sentence_final
+            sub = split_at_sentence_boundaries(
+                grp,
+                grp_lines,
+                ocr,
+                min_gap_px=8
+            )
+            sentenced.extend(sub)
+        groups = sentenced
 
-        # Commit results
+        # ── Commit results ───────────────────────────────────────────────
         if len(groups) <= 1:
             new_bubbles[next_bid] = bubbles[bid]
             new_boxes[next_bid]   = bubble_boxes[bid]
@@ -2106,7 +2223,6 @@ def apply_contour_split_to_all_boxes(bubble_boxes, bubble_indices, bubble_quads,
 
     return new_bubbles, new_boxes, new_quads, new_indices
 
-
 # ============================================================
 # SPLIT HELPERS FOR enforce_max_box_size
 # ============================================================
@@ -2427,9 +2543,12 @@ def process_manga_page(image_path: str,
         # Apply bold-font fixes on top of dialogue correction
         corrected_text = fix_common_ocr_errors(corrected_text)
 
-        # Confidence
+        # 👉 INJECTED FIX: Adjust the box if words were added
+        adjusted_box_xyxy = adjust_box_for_added_text(box, raw_text, corrected_text)
+
+        # Confidence (using the adjusted box)
         conf = compute_region_confidence(
-            raw_text, corrected_text, box, region_type, image_bgr)
+            raw_text, corrected_text, adjusted_box_xyxy, region_type, image_bgr)
         conf = maybe_conf_floor_for_protected(corrected_text, conf)
 
         # Flags
@@ -2476,7 +2595,7 @@ def process_manga_page(image_path: str,
             "translated":        translated,
             "flags":             flags,
             "bubble_groups":     bubble_groups,
-            "box":               xyxy_to_xywh(box),
+            "box":               xyxy_to_xywh(adjusted_box_xyxy), # <--- Uses the adjusted box
             "lines":             bubble_groups,
         }
 
@@ -2490,8 +2609,6 @@ def process_manga_page(image_path: str,
         _write_txt_output(results, output_txt)
 
     return results
-
-
 # ============================================================
 # OUTPUT WRITERS
 # ============================================================