import argparse
import json
import os
import sys

# Box colours per region type, in BGR order as OpenCV expects.
_REGION_COLORS = {
    "dialogue": (0, 200, 0),      # Green
    "narration": (0, 165, 255),   # Orange
    "reaction": (255, 200, 0),    # Cyan/Blue
    "sfx": (0, 0, 220),           # Red
    "unknown": (120, 120, 120),   # Gray
}

# Used for region types that are missing from _REGION_COLORS.
_FALLBACK_COLOR = (120, 120, 120)


def _load_regions(json_path):
    """Return the list of region dicts stored in *json_path*, or None on error.

    Accepts either a JSON object (its values are the regions) or a JSON
    array of regions. Entries that are not objects are dropped. Every
    failure is reported on stdout and signalled by returning ``None``.
    """
    if not os.path.exists(json_path):
        print(f"❌ Error: JSON file not found at {json_path}")
        return None

    try:
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError) as exc:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
        print(f"❌ Error: Cannot parse JSON at {json_path}: {exc}")
        return None

    if isinstance(data, dict):
        entries = data.values()
    elif isinstance(data, list):
        entries = data
    else:
        print(f"❌ Error: Unexpected JSON structure in {json_path}")
        return None

    return [entry for entry in entries if isinstance(entry, dict)]


def _order_key(item):
    """Sort key: the numeric ``order`` field, or 0 when absent/non-numeric."""
    order = item.get("order", 0)
    return order if isinstance(order, (int, float)) else 0


def _box_to_xyxy(box):
    """Convert an ``{"x", "y", "w", "h"}`` dict to ``(x1, y1, x2, y2)`` ints.

    Returns ``None`` for a missing/empty box or one whose fields cannot be
    converted to integers, so the caller can skip it instead of crashing.
    """
    if not isinstance(box, dict) or not box:
        return None
    try:
        x, y, w, h = (int(box.get(key, 0)) for key in ("x", "y", "w", "h"))
    except (TypeError, ValueError, OverflowError):
        return None
    return x, y, x + w, y + h


def draw_boxes_from_json(image_path: str, json_path: str, output_path: str) -> bool:
    """Draw the regions described in a bubbles JSON file onto a page image.

    For every region the function draws its bounding box, a filled label
    (``BOX#<order> [<region_type>]``) above the box, and a preview of the
    ``corrected_ocr`` text (truncated to 40 characters) below it, then
    writes the annotated image to *output_path*.

    Args:
        image_path: Path of the image to annotate.
        json_path: Path of the JSON file. Each region is read through the
            keys ``order``, ``region_type``, ``corrected_ocr`` and ``box``
            (a dict with ``x``, ``y``, ``w``, ``h``).
        output_path: Where the annotated image is written.

    Returns:
        ``True`` when the image was written, ``False`` when any input could
        not be read or the output could not be saved. Problems are also
        reported on stdout.
    """
    # The JSON is validated first: it is the cheap check, and it lets the
    # error be reported before the image is decoded.
    regions = _load_regions(json_path)
    if regions is None:
        return False

    # Imported lazily so that input validation above (and importing this
    # module) does not require OpenCV to be installed.
    import cv2

    image_bgr = cv2.imread(image_path)
    if image_bgr is None:
        print(f"❌ Error: Cannot load image at {image_path}")
        return False

    image_height = image_bgr.shape[0]
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 0.38
    thickness = 1

    # Sort by order to keep numbering consistent between runs.
    for item in sorted(regions, key=_order_key):
        rect = _box_to_xyxy(item.get("box"))
        if rect is None:
            continue
        x1, y1, x2, y2 = rect

        order = item.get("order", "?")
        rtype = item.get("region_type", "unknown")
        # "corrected_ocr" may be null in the JSON; treat that as empty text.
        text = str(item.get("corrected_ocr") or "")
        color = _REGION_COLORS.get(rtype, _FALLBACK_COLOR)

        # Main bounding box.
        cv2.rectangle(image_bgr, (x1, y1), (x2, y2), color, 2)

        label = f"BOX#{order} [{rtype}]"
        preview = (text[:40] + "...") if len(text) > 40 else text

        # Filled label background, clamped so it never starts above the image.
        (label_w, label_h), _ = cv2.getTextSize(label, font, font_scale, thickness)
        cv2.rectangle(
            image_bgr,
            (x1, max(0, y1 - label_h - 6)),
            (x1 + label_w + 4, y1),
            color,
            -1,
        )

        # Label text (box id + type) in white on the filled background.
        cv2.putText(
            image_bgr, label,
            (x1 + 2, max(label_h, y1 - 3)),
            font, font_scale, (255, 255, 255), thickness,
            cv2.LINE_AA,
        )

        # Preview text below the box, clamped to stay inside the image.
        cv2.putText(
            image_bgr, preview,
            (x1 + 2, min(image_height - 5, y2 + 12)),
            font, font_scale * 0.85, color, thickness,
            cv2.LINE_AA,
        )

    # cv2.imwrite reports failure through its return value (or cv2.error for
    # an unsupported extension); never claim success without checking it.
    try:
        out_dir = os.path.dirname(output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        saved = cv2.imwrite(output_path, image_bgr)
    except (OSError, cv2.error):
        saved = False

    if not saved:
        print(f"❌ Error: Cannot write image to {output_path}")
        return False

    print(f"✅ Debug image successfully saved to: {output_path}")
    return True


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Draw bounding boxes from bubbles.json onto an image.")
    parser.add_argument("image", help="Path to the original manga page image")
    parser.add_argument("json", help="Path to the bubbles.json file")
    parser.add_argument("--output", "-o", default="debug_clusters_from_json.png", help="Output image path")

    args = parser.parse_args()

    # Propagate failure to the shell so scripts can detect it.
    sys.exit(0 if draw_boxes_from_json(args.image, args.json, args.output) else 1)
def adjust_box_for_added_text(box_xyxy, raw_text, corrected_text):
    """Grow a box downwards when correction added words to the OCR text.

    The height is scaled by ``len(corrected words) / len(raw words)``,
    capped at 2.0x so a large correction cannot blow the box up. The top
    edge and both side edges are left untouched.

    Args:
        box_xyxy: ``(x1, y1, x2, y2)`` box, or ``None``.
        raw_text: Text as produced by OCR.
        corrected_text: Text after correction.

    Returns:
        A new ``(x1, y1, x2, y2)`` tuple with a lower bottom edge when
        words were added; otherwise *box_xyxy* itself, unchanged (also
        when the box is ``None`` or either text is empty).
    """
    if box_xyxy is None or not raw_text or not corrected_text:
        return box_xyxy

    raw_count = len(raw_text.split())
    corrected_count = len(corrected_text.split())

    # Only adjust if words were actually added.
    if corrected_count <= raw_count:
        return box_xyxy

    x1, y1, x2, y2 = box_xyxy
    current_height = max(1, y2 - y1)

    # Proportional growth, capped to prevent massive box blowouts.
    word_ratio = min(2.0, corrected_count / max(1, raw_count))
    new_height = int(current_height * word_ratio)

    return (x1, y1, x2, y1 + new_height)


# A quad's text closes a sentence when it ends in ., ! or ?
_SENTENCE_END_RE = re.compile(r"[.!?]\s*$")
# Common OCR mangling in ALL-CAPS lettering: a trailing "!" read as "I"
# (e.g. "LIKE THISI" for "LIKE THIS!").
_BANG_READ_AS_I_RE = re.compile(r"[A-Z]{2,}I\s*$")
# The following text opens a sentence when it starts with a capital letter.
_SENTENCE_START_RE = re.compile(r"\s*[A-Z]")


def _quad_vertical_extent(quad):
    """Return ``(top, bottom)`` y-coordinates of an OCR quad.

    NOTE(review): assumes a quad is a sequence of ``(x, y)`` corner points
    -- confirm against the OCR backend. The extent is taken from the
    corners directly so the gap test below does not depend on whether the
    file's bbox helper returns xyxy or xywh.
    """
    ys = [point[1] for point in quad]
    return min(ys), max(ys)


def _next_nonblank(texts, start):
    """Return the first non-blank string in ``texts[start:]``, or ``""``."""
    for text in texts[start:]:
        if text.strip():
            return text
    return ""


def split_at_sentence_boundaries(
    indices: List[int],
    lines: List[str],
    ocr: List[Tuple],
    min_gap_px: int = 8
) -> List[List[int]]:
    """Split a group of OCR quads where one sentence ends and a gap follows.

    Quads are ordered top-to-bottom. A split is made after a quad when all
    of the following hold:

    * its text ends a sentence (``.``, ``!``, ``?``, or an ALL-CAPS word
      ending in ``I`` that is probably a misread ``!``);
    * the next non-blank text starts with a capital letter;
    * the vertical gap to the next quad is at least *min_gap_px*.

    Boundaries are evaluated per quad rather than by character offset in
    the joined text, so repairing a misread ``!`` cannot shift later
    boundaries onto the wrong quad.

    Args:
        indices: Indices into *ocr* that make up the group.
        lines: Text for each entry of *indices*, in the same order. When
            its length does not match *indices* (or it is ``None``), the
            text stored in *ocr* is used instead.
        ocr: OCR results; ``ocr[i][0]`` is the quad and ``ocr[i][1]`` the
            text of entry ``i``.
        min_gap_px: Minimum vertical distance, in pixels, between the
            bottom of one quad and the top of the next for a split.

    Returns:
        A list of index groups in top-to-bottom order. When nothing is
        split the result is ``[indices]`` with the original list.
    """
    if not indices or len(indices) < 2:
        return [indices]

    # Prefer the caller's text, which is aligned with `indices`.
    if lines is not None and len(lines) == len(indices):
        text_of = dict(zip(indices, lines))
    else:
        text_of = {i: ocr[i][1] for i in indices}

    extent_of = {i: _quad_vertical_extent(ocr[i][0]) for i in indices}

    # Reading order for stacked bubbles: top edge, ascending (stable sort).
    ordered = sorted(indices, key=lambda i: extent_of[i][0])
    texts = [text_of[i] or "" for i in ordered]

    split_after = []
    for pos in range(len(ordered) - 1):
        current = texts[pos]
        ends_sentence = (
            _SENTENCE_END_RE.search(current)
            or _BANG_READ_AS_I_RE.search(current)
        )
        if not ends_sentence:
            continue
        if not _SENTENCE_START_RE.match(_next_nonblank(texts, pos + 1)):
            continue

        # Confirm the candidate with a vertical gap between the two quads.
        bottom_a = extent_of[ordered[pos]][1]
        top_b = extent_of[ordered[pos + 1]][0]
        if top_b - bottom_a >= min_gap_px:
            split_after.append(pos)

    if not split_after:
        return [indices]

    # Slice the ordered indices at every confirmed split point.
    groups = []
    start = 0
    for pos in split_after:
        groups.append(ordered[start:pos + 1])
        start = pos + 1
    groups.append(ordered[start:])  # remainder
    return groups
Sentence boundary — tall box containing two stacked bubbles """ bubble_contours = detect_speech_bubbles(image_bgr) quad_to_bubble = (build_quad_to_bubble_map(ocr, bubble_contours) @@ -2053,32 +2164,38 @@ def apply_contour_split_to_all_boxes(bubble_boxes, bubble_indices, bubble_quads, for bid in sorted(bubble_boxes.keys()): indices = bubble_indices[bid] - # Strategy 1: contour membership + # ── Strategy 1: contour membership ────────────────────────────── groups = split_group_by_contour_membership(indices, ocr, quad_to_bubble) - # Strategy 2: mixed region type + # ── Strategy 2: mixed region type ─────────────────────────────── refined = [] for grp in groups: sub = split_group_by_region_type(grp, ocr) refined.extend(sub) groups = refined - # Strategy 3: spatial gap - final = [] + # ── Strategy 3: spatial gap ────────────────────────────────────── + gapped = [] for grp in groups: sub = split_group_by_spatial_gap(grp, ocr, gap_factor=1.8) - final.extend(sub) - groups = final + gapped.extend(sub) + groups = gapped - # Strategy 4: sentence boundary split ← FIX Issue 2 - sentence_final = [] + # ── Strategy 4: sentence boundary ─────────────────────────────── + # Signature: (indices, lines, ocr, min_gap_px) → List[List[int]] + sentenced = [] for grp in groups: grp_lines = [normalize_text(ocr[i][1]) for i in grp] - sub = split_at_sentence_boundaries(grp, grp_lines) - sentence_final.extend(sub) - groups = sentence_final + sub = split_at_sentence_boundaries( + grp, + grp_lines, + ocr, + min_gap_px=8 + ) + sentenced.extend(sub) + groups = sentenced - # Commit results + # ── Commit results ─────────────────────────────────────────────── if len(groups) <= 1: new_bubbles[next_bid] = bubbles[bid] new_boxes[next_bid] = bubble_boxes[bid] @@ -2106,7 +2223,6 @@ def apply_contour_split_to_all_boxes(bubble_boxes, bubble_indices, bubble_quads, return new_bubbles, new_boxes, new_quads, new_indices - # ============================================================ # SPLIT HELPERS FOR 
enforce_max_box_size # ============================================================ @@ -2427,9 +2543,12 @@ def process_manga_page(image_path: str, # Apply bold-font fixes on top of dialogue correction corrected_text = fix_common_ocr_errors(corrected_text) - # Confidence + # 👉 INJECTED FIX: Adjust the box if words were added + adjusted_box_xyxy = adjust_box_for_added_text(box, raw_text, corrected_text) + + # Confidence (using the adjusted box) conf = compute_region_confidence( - raw_text, corrected_text, box, region_type, image_bgr) + raw_text, corrected_text, adjusted_box_xyxy, region_type, image_bgr) conf = maybe_conf_floor_for_protected(corrected_text, conf) # Flags @@ -2476,7 +2595,7 @@ def process_manga_page(image_path: str, "translated": translated, "flags": flags, "bubble_groups": bubble_groups, - "box": xyxy_to_xywh(box), + "box": xyxy_to_xywh(adjusted_box_xyxy), # <--- Uses the adjusted box "lines": bubble_groups, } @@ -2490,8 +2609,6 @@ def process_manga_page(image_path: str, _write_txt_output(results, output_txt) return results - - # ============================================================ # OUTPUT WRITERS # ============================================================