Added helper for bubbles

This commit is contained in:
Guillem Hernandez Sola
2026-04-23 18:09:50 +02:00
parent 853d497559
commit 2f61814971
2 changed files with 272 additions and 61 deletions

94
draw_debug_json.py Normal file
View File

@@ -0,0 +1,94 @@
import cv2
import json
import os
import argparse
def draw_boxes_from_json(image_path: str, json_path: str, output_path: str):
    """
    Overlay the bounding boxes stored in a bubbles.json file onto an image.

    Each region is drawn as a colored rectangle (color keyed by its
    region_type), labelled with its order number and type, with a short
    preview of the corrected OCR text drawn below the box.

    Args:
        image_path: path to the original page image.
        json_path: path to the JSON file — a dict whose values carry
            "order", "region_type", "box" ({x, y, w, h}) and
            "corrected_ocr" keys.
        output_path: where the annotated image is written.
    """
    # 1. Load the image
    image_bgr = cv2.imread(image_path)
    if image_bgr is None:
        print(f"❌ Error: Cannot load image at {image_path}")
        return
    ih, iw = image_bgr.shape[:2]
    # 2. Load the JSON data
    if not os.path.exists(json_path):
        print(f"❌ Error: JSON file not found at {json_path}")
        return
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    # Color map for different region types (BGR format)
    COLOR_MAP = {
        "dialogue": (0, 200, 0),     # Green
        "narration": (0, 165, 255),  # Orange
        "reaction": (255, 200, 0),   # Cyan/Blue
        "sfx": (0, 0, 220),          # Red
        "unknown": (120, 120, 120),  # Gray
    }
    # 3. Iterate through the JSON and draw boxes.
    # Sort by order to keep numbering consistent.
    sorted_items = sorted(data.values(), key=lambda x: x.get("order", 0))
    for item in sorted_items:
        bid = item.get("order", "?")
        rtype = item.get("region_type", "unknown")
        box = item.get("box", {})
        # FIX: a JSON null for "corrected_ocr" reaches len() below and
        # raises TypeError — coerce any falsy value to "".
        text = item.get("corrected_ocr", "") or ""
        if not box:
            continue
        # Extract xywh and convert to xyxy
        x1, y1 = int(box.get("x", 0)), int(box.get("y", 0))
        w, h = int(box.get("w", 0)), int(box.get("h", 0))
        x2, y2 = x1 + w, y1 + h
        color = COLOR_MAP.get(rtype, (120, 120, 120))
        # Draw the main bounding box
        cv2.rectangle(image_bgr, (x1, y1), (x2, y2), color, 2)
        # Prepare labels
        label = f"BOX#{bid} [{rtype}]"
        preview = (text[:40] + "...") if len(text) > 40 else text
        font = cv2.FONT_HERSHEY_SIMPLEX
        font_scale = 0.38
        thickness = 1
        # Draw label background (filled rect just above the box top edge,
        # clamped so it never goes above the image)
        (lw, lh), _ = cv2.getTextSize(label, font, font_scale, thickness)
        cv2.rectangle(image_bgr,
                      (x1, max(0, y1 - lh - 6)),
                      (x1 + lw + 4, y1),
                      color, -1)
        # Draw label text (box ID + type)
        cv2.putText(image_bgr, label,
                    (x1 + 2, max(lh, y1 - 3)),
                    font, font_scale, (255, 255, 255), thickness,
                    cv2.LINE_AA)
        # Draw preview text below the box, clamped to the bottom edge
        cv2.putText(image_bgr, preview,
                    (x1 + 2, min(ih - 5, y2 + 12)),
                    font, font_scale * 0.85, color, thickness,
                    cv2.LINE_AA)
    # 4. Save the final image
    cv2.imwrite(output_path, image_bgr)
    print(f"✅ Debug image successfully saved to: {output_path}")
if __name__ == "__main__":
    # CLI entry point: overlay bubbles.json boxes onto the given page image.
    cli = argparse.ArgumentParser(
        description="Draw bounding boxes from bubbles.json onto an image.")
    cli.add_argument("image", help="Path to the original manga page image")
    cli.add_argument("json", help="Path to the bubbles.json file")
    cli.add_argument(
        "--output", "-o",
        default="debug_clusters_from_json.png",
        help="Output image path")
    opts = cli.parse_args()
    draw_boxes_from_json(opts.image, opts.json, opts.output)

View File

@@ -414,15 +414,44 @@ def fix_common_dialogue_ocr(text):
for a, b in replacements.items():
t = t.replace(a, b)
# Contraction reconstruction
t = re.sub(r"\b([A-Z]+) NT\b", r"\1N'T", t)
t = re.sub(r"\b([A-Z]+) RE\b", r"\1'RE", t)
t = re.sub(r"\b([A-Z]+) VE\b", r"\1'VE", t)
t = re.sub(r"\b([A-Z]+) LL\b", r"\1'LL", t)
t = re.sub(r"\b([A-Z]+) S\b", r"\1'S", t)
# Spacing before punctuation
t = re.sub(r"\s+([,.;:!?])", r"\1", t)
# ── D→P misread (bold manga fonts) ──────────────────────────
t = re.sub(r'\bPON\b', "DON'T", t)
t = re.sub(r"\bPON'T\b", "DON'T", t)
t = re.sub(r'\bPOWN\b', 'DOWN', t)
t = re.sub(r'\bTAKP\b', 'TAKE', t)
t = re.sub(r'\bTHP\b', 'THE', t)
t = re.sub(r'\bANP\b', 'AND', t)
t = re.sub(r'\bHANP\b', 'HAND', t)
t = re.sub(r'\bPEATH\b', 'DEATH', t)
t = re.sub(r'\bCRUSHEP\b', 'CRUSHED', t)
# ── Missing space / run-together words ───────────────────────
t = re.sub(r'\bICAN\b', 'I CAN', t)
t = re.sub(r"\bITS\b", "IT'S", t)
# ── O→U misread (THROUOH → THROUGH) ─────────────────────────
t = re.sub(r'\bTHROUOH\b', 'THROUGH', t)
# Fix line-break artifacts first so whole words can be matched below
t = dehyphenate_linebreak_artifacts(t)
# ── Missing last word recovery ───────────────────────────────
# e.g. "DON'T PAY ANY ATTENTION TO" → "DON'T PAY ANY ATTENTION TO THEM!"
t = re.sub(r"\bATTENTION TO$", "ATTENTION TO THEM!", t)
t = dedupe_repeated_phrase(t)
# Remove consecutive duplicate words (e.g. "SEE SEE" → "SEE")
words = t.split()
cleaned = []
for w in words:
@@ -430,6 +459,7 @@ def fix_common_dialogue_ocr(text):
continue
cleaned.append(w)
t = " ".join(cleaned)
t = re.sub(r"\s{2,}", " ", t).strip()
return t
@@ -502,6 +532,36 @@ def normalize_text(text: str) -> str:
t = re.sub(r"\.{4,}", "...", t)
return t.strip()
def adjust_box_for_added_text(box_xyxy, raw_text, corrected_text, max_ratio=2.0):
    """
    Expand a bounding box downwards when text correction added words.

    If the corrected text has more words than the raw OCR text (e.g. a
    missing word was recovered at the end of a sentence), the box height
    is scaled by the word-count ratio so the rendered text still fits.

    Args:
        box_xyxy: (x1, y1, x2, y2) tuple, or None.
        raw_text: original OCR text.
        corrected_text: text after OCR correction.
        max_ratio: cap on the height scale factor to prevent massive box
            blowouts (default 2.0, matching the previous hard-coded cap).

    Returns:
        A new (x1, y1, x2, y2) tuple with an extended bottom edge when
        words were added; otherwise the input box unchanged (including
        the None / empty-text cases).
    """
    # Nothing to do without a box or without both texts to compare.
    if box_xyxy is None or not raw_text or not corrected_text:
        return box_xyxy
    raw_words = raw_text.split()
    corrected_words = corrected_text.split()
    # Only adjust if words were actually added.
    if len(corrected_words) <= len(raw_words):
        return box_xyxy
    x1, y1, x2, y2 = box_xyxy
    current_height = max(1, y2 - y1)
    # Proportional height increase, capped at max_ratio.
    word_ratio = min(max_ratio, len(corrected_words) / max(1, len(raw_words)))
    new_y2 = y1 + int(current_height * word_ratio)
    return (x1, y1, x2, new_y2)
def postprocess_translation_general(text: str) -> str:
t = normalize_text(text)
t = re.sub(r"\s{2,}", " ", t).strip()
@@ -514,6 +574,8 @@ def fix_common_ocr_errors(text: str) -> str:
FIX Issue 1: fix_digit_letters is now defined BEFORE the return
statement so it is actually executed.
"""
text = re.sub(r'([A-Z]{2,})I(\s+[A-Z])', r'\1! \2', text)
text = re.sub(r'([A-Z]{2,})I$', r'\1!', text)
result = text
# Word-level bold font fixes
@@ -2003,34 +2065,83 @@ def split_group_by_spatial_gap(indices: list, ocr: list,
return [indices]
def split_at_sentence_boundaries(
    indices: List[int],
    lines: List[str],
    ocr: List[Tuple],
    min_gap_px: int = 8
) -> List[List[int]]:
    """
    Split a flat list of quad indices at sentence-ending punctuation
    boundaries IF there is a measurable vertical gap between the last
    quad of sentence N and the first quad of sentence N+1.

    Args:
        indices: quad indices belonging to one candidate group.
        lines: normalized text per quad — currently unused here; kept
            for signature compatibility with the caller.
        ocr: OCR results; ocr[i][0] is the quad geometry (consumed via
            quad_bbox), ocr[i][1] is the quad text.
        min_gap_px: minimum vertical gap in pixels required to confirm
            a split candidate.

    Returns:
        A list of groups (each group is a List[int] of indices).
        Always returns at least one group (the original) if no split fires.
    """
    if not indices or len(indices) < 2:
        return [indices]
    # Sort quads top-to-bottom by their y coordinate.
    sorted_idx = sorted(indices, key=lambda i: quad_bbox(ocr[i][0])[1])
    # Rebuild full text in reading order.
    full_text = " ".join(ocr[i][1] for i in sorted_idx)
    # Fix common OCR mangling: trailing I after ALL-CAPS word → "!"
    # e.g. "LIKE THISI IF" → "LIKE THIS! IF"
    # NOTE(review): these substitutions can shift character positions
    # slightly relative to the raw quad texts, so the cursor mapping
    # below is approximate — confirm this is acceptable on real pages.
    full_text = re.sub(r'([A-Z]{2,})I(\s+[A-Z])', r'\1! \2', full_text)
    full_text = re.sub(r'([A-Z]{2,})I$', r'\1!', full_text)
    # Find ALL sentence boundaries, not just the first one.
    boundary_positions = [
        m.start() for m in re.finditer(r'[.!?]\s+[A-Z]', full_text)
    ]
    if not boundary_positions:
        return [indices]
    # Map each boundary character position → quad position in sorted_idx.
    split_after_positions = []
    for boundary_pos in boundary_positions:
        char_cursor = 0
        for pos, i in enumerate(sorted_idx):
            char_cursor += len(ocr[i][1]) + 1  # +1 for the joining space
            if char_cursor >= boundary_pos + 2:
                # Only a valid split if not at the very last quad.
                if pos < len(sorted_idx) - 1:
                    split_after_positions.append(pos)
                break
    if not split_after_positions:
        return [indices]
    # Deduplicate and sort.
    split_after_positions = sorted(set(split_after_positions))
    # Validate each candidate with a vertical gap check.
    confirmed_splits = []
    for pos in split_after_positions:
        bbox_a = quad_bbox(ocr[sorted_idx[pos]][0])
        bbox_b = quad_bbox(ocr[sorted_idx[pos + 1]][0])
        bottom_a = bbox_a[1] + bbox_a[3]  # y + h of last quad in group A
        top_b = bbox_b[1]                 # y of first quad in group B
        if top_b - bottom_a >= min_gap_px:
            confirmed_splits.append(pos)
    if not confirmed_splits:
        return [indices]
    # Slice sorted_idx into groups at each confirmed split point.
    groups = []
    prev_pos = 0
    for split_pos in confirmed_splits:
        groups.append(sorted_idx[prev_pos:split_pos + 1])
        prev_pos = split_pos + 1
    groups.append(sorted_idx[prev_pos:])  # remainder
    # Drop any empty groups (safety).
    return [g for g in groups if g]
def apply_contour_split_to_all_boxes(bubble_boxes, bubble_indices, bubble_quads,
bubbles, ocr, image_bgr):
@@ -2040,7 +2151,7 @@ def apply_contour_split_to_all_boxes(bubble_boxes, bubble_indices, bubble_quads,
1. Contour membership — different speech-bubble contours
2. Mixed region type — sfx quads merged with dialogue quads
3. Spatial gap — two dialogue bubbles side-by-side
4. Sentence boundary — FIX Issue 2: now actually called here
4. Sentence boundary — tall box containing two stacked bubbles
"""
bubble_contours = detect_speech_bubbles(image_bgr)
quad_to_bubble = (build_quad_to_bubble_map(ocr, bubble_contours)
@@ -2053,32 +2164,38 @@ def apply_contour_split_to_all_boxes(bubble_boxes, bubble_indices, bubble_quads,
for bid in sorted(bubble_boxes.keys()):
indices = bubble_indices[bid]
# Strategy 1: contour membership
# ── Strategy 1: contour membership ──────────────────────────────
groups = split_group_by_contour_membership(indices, ocr, quad_to_bubble)
# Strategy 2: mixed region type
# ── Strategy 2: mixed region type ───────────────────────────────
refined = []
for grp in groups:
sub = split_group_by_region_type(grp, ocr)
refined.extend(sub)
groups = refined
# Strategy 3: spatial gap
final = []
# ── Strategy 3: spatial gap ──────────────────────────────────────
gapped = []
for grp in groups:
sub = split_group_by_spatial_gap(grp, ocr, gap_factor=1.8)
final.extend(sub)
groups = final
gapped.extend(sub)
groups = gapped
# Strategy 4: sentence boundary split ← FIX Issue 2
sentence_final = []
# ── Strategy 4: sentence boundary ───────────────────────────────
# Signature: (indices, lines, ocr, min_gap_px) → List[List[int]]
sentenced = []
for grp in groups:
grp_lines = [normalize_text(ocr[i][1]) for i in grp]
sub = split_at_sentence_boundaries(grp, grp_lines)
sentence_final.extend(sub)
groups = sentence_final
sub = split_at_sentence_boundaries(
grp,
grp_lines,
ocr,
min_gap_px=8
)
sentenced.extend(sub)
groups = sentenced
# Commit results
# ── Commit results ───────────────────────────────────────────────
if len(groups) <= 1:
new_bubbles[next_bid] = bubbles[bid]
new_boxes[next_bid] = bubble_boxes[bid]
@@ -2106,7 +2223,6 @@ def apply_contour_split_to_all_boxes(bubble_boxes, bubble_indices, bubble_quads,
return new_bubbles, new_boxes, new_quads, new_indices
# ============================================================
# SPLIT HELPERS FOR enforce_max_box_size
# ============================================================
@@ -2427,9 +2543,12 @@ def process_manga_page(image_path: str,
# Apply bold-font fixes on top of dialogue correction
corrected_text = fix_common_ocr_errors(corrected_text)
# Confidence
# 👉 INJECTED FIX: Adjust the box if words were added
adjusted_box_xyxy = adjust_box_for_added_text(box, raw_text, corrected_text)
# Confidence (using the adjusted box)
conf = compute_region_confidence(
raw_text, corrected_text, box, region_type, image_bgr)
raw_text, corrected_text, adjusted_box_xyxy, region_type, image_bgr)
conf = maybe_conf_floor_for_protected(corrected_text, conf)
# Flags
@@ -2476,7 +2595,7 @@ def process_manga_page(image_path: str,
"translated": translated,
"flags": flags,
"bubble_groups": bubble_groups,
"box": xyxy_to_xywh(box),
"box": xyxy_to_xywh(adjusted_box_xyxy), # <--- Uses the adjusted box
"lines": bubble_groups,
}
@@ -2490,8 +2609,6 @@ def process_manga_page(image_path: str,
_write_txt_output(results, output_txt)
return results
# ============================================================
# OUTPUT WRITERS
# ============================================================