Check new

This commit is contained in:
Guillem Hernandez Sola
2026-04-23 15:46:14 +02:00
parent 037dadd920
commit 3ca01dae8c
2 changed files with 362 additions and 102 deletions

View File

@@ -1661,18 +1661,25 @@ def merge_boxes_by_proximity_and_overlap(bubble_boxes, bubble_indices, bubble_qu
return new_bubbles, new_boxes, new_quads, new_indices
def _majority_contour_id(indices: list, quad_to_bubble: Dict[int, int]) -> int:
"""
FIX B helper: Returns the most common contour ID among all quads
in a box. Falls back to -1 only if truly no quad is inside any contour.
"""
from collections import Counter
ids = [quad_to_bubble.get(i, -1) for i in indices]
valid = [cid for cid in ids if cid != -1]
if not valid:
return -1
return Counter(valid).most_common(1)[0][0]
def merge_continuation_boxes(bubble_boxes, bubble_indices, bubble_quads,
bubbles, ocr, image_bgr):
"""
FIX: Merges boxes that are:
1. Inside the same speech-bubble contour
2. Vertically adjacent (gap ≤ 3.5 × med_h)
3. Both classified as dialogue/reaction/narration
(never merges sfx into dialogue)
This fixes split detections like Box7+Box9 in 001 and
Box9+Box10 in 002 where one bubble was detected as two
separate regions due to an intervening SFX quad.
FIX B: Uses majority contour vote instead of idx[0] only.
Also relaxed vert_gap threshold from med_h*2.5 → med_h*3.5
to catch boxes like 002/box9+10 that have a slightly larger gap.
"""
all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1])
for i in range(len(ocr))]
@@ -1695,7 +1702,6 @@ def merge_continuation_boxes(bubble_boxes, bubble_indices, bubble_quads,
text_i = normalize_text(" ".join(bubbles.get(bid_i, [])))
role_i = region_text_role_hint(text_i)
# Never merge sfx boxes into anything
if role_i == "sfx":
continue
@@ -1711,32 +1717,31 @@ def merge_continuation_boxes(bubble_boxes, bubble_indices, bubble_quads,
if role_j == "sfx":
continue
# Must share the same speech-bubble contour
idx_i = bubble_indices[bid_i]
idx_j = bubble_indices[bid_j]
if not idx_i or not idx_j:
continue
cid_i = quad_to_bubble.get(idx_i[0], -1)
cid_j = quad_to_bubble.get(idx_j[0], -1)
# FIX B: majority vote instead of idx[0]
cid_i = _majority_contour_id(idx_i, quad_to_bubble)
cid_j = _majority_contour_id(idx_j, quad_to_bubble)
if cid_i == -1 or cid_j == -1 or cid_i != cid_j:
continue
# Must be vertically adjacent
# FIX B: relaxed from med_h*2.5 → med_h*3.5
vert_gap = max(0, max(box_i[1], box_j[1]) - min(box_i[3], box_j[3]))
if vert_gap > med_h * 2.5:
if vert_gap > med_h * 3.5:
continue
# Must have horizontal overlap
h_overlap = max(0, min(box_i[2], box_j[2]) - max(box_i[0], box_j[0]))
min_w = min(xyxy_width(box_i), xyxy_width(box_j))
if h_overlap / max(1, min_w) < 0.25:
if h_overlap / max(1, min_w) < 0.20: # FIX B: relaxed from 0.25 → 0.20
continue
merge_pairs.append((bid_i, bid_j))
visited.add(bid_i)
visited.add(bid_j)
break # each box merges with at most one partner
break
if not merge_pairs:
return bubbles, bubble_boxes, bubble_quads, bubble_indices
@@ -1770,15 +1775,116 @@ def merge_continuation_boxes(bubble_boxes, bubble_indices, bubble_quads,
return new_bubbles, new_boxes, new_quads, new_indices
def merge_same_column_dialogue_boxes(bubble_boxes, bubble_indices, bubble_quads,
                                     bubbles, ocr, image_bgr):
    """
    FIX D: Merge non-SFX boxes that share the same horizontal column
    (strong x-overlap) and are vertically close, even when they have
    different contour IDs.

    This catches 004/box2+6 where the speech bubble body and its
    continuation are detected as separate contours.

    Criteria for a pair to be merged:
      - Neither box is classified as "sfx" by region_text_role_hint()
      - Horizontal overlap ratio >= 0.50 of the narrower box (same column)
      - Vertical gap <= med_h * 4.0
      - Combined height <= image_height * 0.35 (not a full-page merge)

    Boxes are scanned top-to-bottom by vertical centre; each box is paired
    with at most one partner (the first later box that meets all criteria).

    Args:
        bubble_boxes: dict box id -> box indexed as [x1, y1, x2, y2].
        bubble_indices: dict box id -> list of OCR indices in that box.
        bubble_quads: dict box id -> list of quads in that box.
        bubbles: dict box id -> list of text lines.
        ocr: sequence of OCR entries; entry[0] is the quad.
        image_bgr: image array; only its height (shape[0]) is used.

    Returns:
        (bubbles, bubble_boxes, bubble_quads, bubble_indices). The inputs
        are returned unchanged when nothing merges; otherwise new dicts are
        returned, renumbered from 1 with merged boxes first, followed by
        the untouched boxes in top-to-bottom order.
    """
    ih = image_bgr.shape[0]

    # Median OCR quad height is the scale unit for the vertical-gap limit.
    quad_heights = []
    for entry in ocr:
        qb = quad_bbox(entry[0])
        quad_heights.append(max(1, qb[3] - qb[1]))
    med_h = float(np.median(quad_heights)) if quad_heights else 14.0

    # Visit boxes top-to-bottom by vertical centre.
    bids = sorted(bubble_boxes.keys(),
                  key=lambda b: (bubble_boxes[b][1] + bubble_boxes[b][3]) / 2.0)

    # Classify each box once instead of once per candidate pair.
    is_sfx = {
        bid: region_text_role_hint(
            normalize_text(" ".join(bubbles.get(bid, [])))) == "sfx"
        for bid in bids
    }

    max_vert_gap = med_h * 4.0
    max_merged_h = ih * 0.35

    merge_pairs = []
    visited = set()
    for pos, bid_i in enumerate(bids):
        if bid_i in visited or is_sfx[bid_i]:
            continue
        box_i = bubble_boxes[bid_i]

        for bid_j in bids[pos + 1:]:
            if bid_j in visited or is_sfx[bid_j]:
                continue
            box_j = bubble_boxes[bid_j]

            # Vertical gap check
            vert_gap = max(0, max(box_i[1], box_j[1]) - min(box_i[3], box_j[3]))
            if vert_gap > max_vert_gap:
                continue

            # Horizontal overlap check (relative to the narrower box)
            h_ov = max(0, min(box_i[2], box_j[2]) - max(box_i[0], box_j[0]))
            min_w = min(xyxy_width(box_i), xyxy_width(box_j))
            if h_ov / max(1, min_w) < 0.50:
                continue

            # Combined height sanity check
            merged_h = max(box_i[3], box_j[3]) - min(box_i[1], box_j[1])
            if merged_h > max_merged_h:
                continue

            merge_pairs.append((bid_i, bid_j))
            visited.update((bid_i, bid_j))
            break  # each box merges with at most one partner

    if not merge_pairs:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices

    print(f"\n📐 Same-column dialogue merge: {len(merge_pairs)} pair(s):")

    def _top_left(k):
        # Order quads by top edge, then left edge.
        qb = quad_bbox(ocr[k][0])
        return (qb[1], qb[0])

    processed = set()
    new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {}
    next_bid = 1

    # Merged pairs are emitted first ...
    for bid_a, bid_b in merge_pairs:
        print(f" ✓ Merging BOX#{bid_a} + BOX#{bid_b}")
        all_idx = sorted(
            set(bubble_indices[bid_a]) | set(bubble_indices[bid_b]),
            key=_top_left,
        )
        new_bubbles[next_bid] = build_lines_from_indices(all_idx, ocr)
        new_boxes[next_bid] = boxes_union_xyxy(
            [quad_bbox(ocr[i][0]) for i in all_idx])
        new_quads[next_bid] = [ocr[i][0] for i in all_idx]
        new_indices[next_bid] = all_idx
        processed.update({bid_a, bid_b})
        next_bid += 1

    # ... then every untouched box is carried over under a fresh id.
    for bid in bids:
        if bid not in processed:
            new_bubbles[next_bid] = bubbles[bid]
            new_boxes[next_bid] = bubble_boxes[bid]
            new_quads[next_bid] = bubble_quads[bid]
            new_indices[next_bid] = bubble_indices[bid]
            next_bid += 1

    return new_bubbles, new_boxes, new_quads, new_indices
def auto_fix_bubble_detection(bubble_boxes, bubble_indices, bubble_quads,
bubbles, ocr, image_bgr):
"""
Full fix pipeline:
1. Split boxes that span multiple speech bubbles.
2. Merge fragments detected inside the same contour.
3. Merge continuation boxes split across same bubble (NEW).
4. Proximity+overlap merge — pass 1.
5. Proximity+overlap merge — pass 2 (chain resolution).
1. Split boxes spanning multiple bubbles.
2. Merge fragments inside the same contour.
3. Merge continuation boxes (same bubble, split detection).
4. FIX D: Merge same-column dialogue boxes.
5. Proximity+overlap merge — pass 1.
6. Proximity+overlap merge — pass 2.
"""
print("\n🔍 Running automatic bubble detection fixes...")
all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1])
@@ -1793,11 +1899,15 @@ def auto_fix_bubble_detection(bubble_boxes, bubble_indices, bubble_quads,
detect_and_merge_fragmented_bubbles(
bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr, image_bgr)
# FIX: merge continuation boxes (same bubble, split detection)
bubbles, bubble_boxes, bubble_quads, bubble_indices = \
merge_continuation_boxes(
bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr, image_bgr)
# FIX D: same-column dialogue merge
bubbles, bubble_boxes, bubble_quads, bubble_indices = \
merge_same_column_dialogue_boxes(
bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr, image_bgr)
# Pass 1
bubbles, bubble_boxes, bubble_quads, bubble_indices = \
merge_boxes_by_proximity_and_overlap(
@@ -1991,14 +2101,16 @@ def split_group_by_region_type(indices: list, ocr: list) -> List[List[int]]:
def split_group_by_spatial_gap(indices: list, ocr: list,
gap_factor: float = 1.8) -> List[List[int]]:
gap_factor: float = 1.2) -> List[List[int]]:
"""
Splits a group of OCR indices where a large spatial gap exists
between clusters — catches Box-22/007 where two dialogue bubbles
sit side-by-side with a visible horizontal gap.
FIX C: Reduced gap_factor from 1.8 → 1.2 and added adaptive
minimum gap based on the actual inter-quad spacing distribution.
Works in both axes: tries horizontal split first, then vertical.
Returns original list if no significant gap is found.
This catches tight splits like:
007/box12: "YOU'RE A BIG MEAN JERK." vs "I HATE YOU, SY-ON BOY."
007/box15: three separate italic caption lines
007/box21: two side-by-side dialogue bubbles
008/box13: "AND I'M TOO CUTE..." vs "I WAS NOT!"
"""
if len(indices) <= 1:
return [indices]
@@ -2006,16 +2118,47 @@ def split_group_by_spatial_gap(indices: list, ocr: list,
all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1])
for i in indices]
med_h = float(np.median(all_h)) if all_h else 14.0
gap_threshold = med_h * gap_factor
# ── Try horizontal split (left / right columns) ───────────
# ── Adaptive gap: use median inter-quad gap as baseline ───
sorted_by_y = sorted(indices, key=lambda i: quad_bbox(ocr[i][0])[1])
inter_gaps_y = []
for k in range(len(sorted_by_y) - 1):
b_curr = quad_bbox(ocr[sorted_by_y[k]][0])
b_next = quad_bbox(ocr[sorted_by_y[k+1]][0])
gap = b_next[1] - b_curr[3]
if gap > 0:
inter_gaps_y.append(gap)
# Adaptive threshold: median_inter_gap * 2.5, floored at med_h * 0.8
# and capped at med_h * gap_factor.
if inter_gaps_y:
median_inter = float(np.median(inter_gaps_y))
gap_threshold_y = min(med_h * gap_factor,
max(med_h * 0.8, median_inter * 2.5))
else:
gap_threshold_y = med_h * gap_factor
# ── Try horizontal split first ────────────────────────────
sorted_by_x = sorted(indices, key=lambda i: quad_bbox(ocr[i][0])[0])
boxes_x = [quad_bbox(ocr[i][0]) for i in sorted_by_x]
inter_gaps_x = []
for k in range(len(sorted_by_x) - 1):
gap = boxes_x[k+1][0] - boxes_x[k][2]
if gap > 0:
inter_gaps_x.append(gap)
if inter_gaps_x:
median_inter_x = float(np.median(inter_gaps_x))
gap_threshold_x = min(med_h * gap_factor,
max(med_h * 0.8, median_inter_x * 2.5))
else:
gap_threshold_x = med_h * gap_factor
best_h_gap, best_h_split = 0.0, None
for k in range(len(sorted_by_x) - 1):
gap = boxes_x[k + 1][0] - boxes_x[k][2]
if gap > gap_threshold and gap > best_h_gap:
if gap > gap_threshold_x and gap > best_h_gap:
best_h_gap = gap
best_h_split = k
@@ -2023,16 +2166,17 @@ def split_group_by_spatial_gap(indices: list, ocr: list,
left = [sorted_by_x[i] for i in range(best_h_split + 1)]
right = [sorted_by_x[i] for i in range(best_h_split + 1, len(sorted_by_x))]
if left and right:
return [left, right]
# Recurse to catch further splits in each half
return (split_group_by_spatial_gap(left, ocr, gap_factor) +
split_group_by_spatial_gap(right, ocr, gap_factor))
# ── Try vertical split (top / bottom rows) ────────────────
sorted_by_y = sorted(indices, key=lambda i: quad_bbox(ocr[i][0])[1])
boxes_y = [quad_bbox(ocr[i][0]) for i in sorted_by_y]
# ── Try vertical split ────────────────────────────────────
boxes_y = [quad_bbox(ocr[i][0]) for i in sorted_by_y]
best_v_gap, best_v_split = 0.0, None
for k in range(len(sorted_by_y) - 1):
gap = boxes_y[k + 1][1] - boxes_y[k][3]
if gap > gap_threshold and gap > best_v_gap:
if gap > gap_threshold_y and gap > best_v_gap:
best_v_gap = gap
best_v_split = k
@@ -2040,11 +2184,12 @@ def split_group_by_spatial_gap(indices: list, ocr: list,
top = [sorted_by_y[i] for i in range(best_v_split + 1)]
bottom = [sorted_by_y[i] for i in range(best_v_split + 1, len(sorted_by_y))]
if top and bottom:
return [top, bottom]
# Recurse to catch further splits in each half
return (split_group_by_spatial_gap(top, ocr, gap_factor) +
split_group_by_spatial_gap(bottom, ocr, gap_factor))
return [indices]
def apply_contour_split_to_all_boxes(bubble_boxes, bubble_indices, bubble_quads,
bubbles, ocr, image_bgr):
"""
@@ -2233,32 +2378,38 @@ class ImprovedMacVisionDetector:
Strategy: use the variant with the most detections as base,
then fill gaps from other variants using IoU matching.
"""
"""
FIX E: Use self.langs[0] locale for is_meaningful_text()
instead of hardcoded "en", so short words like "BUT" and "I"
are protected when source_lang != "en".
"""
if not all_results:
return []
# pick base = most detections
# Derive source_lang string from self.langs[0] (e.g. "en-US" → "en")
lang_code = self.langs[0].split("-")[0].lower()
base_idx = max(range(len(all_results)), key=lambda i: len(all_results[i]))
base = list(all_results[base_idx])
others = [r for i, r in enumerate(all_results) if i != base_idx]
for other in others:
for quad_o, text_o, conf_o in other:
box_o = quad_bbox(quad_o)
box_o = quad_bbox(quad_o)
matched = False
for k, (quad_b, text_b, conf_b) in enumerate(base):
box_b = quad_bbox(quad_b)
if boxes_iou(box_o, box_b) > 0.40:
# keep higher-confidence reading
if conf_o > conf_b:
base[k] = (quad_b, text_o, conf_o)
matched = True
break
if not matched and is_meaningful_text(text_o, "en"):
# FIX E: use lang_code not hardcoded "en"
if not matched and is_meaningful_text(text_o, lang_code):
base.append((quad_o, text_o, conf_o))
return base
# ============================================================
# BUILD LINES FROM INDICES
# ============================================================
@@ -2271,12 +2422,14 @@ def build_lines_from_indices(indices, ocr, reading_mode="ltr"):
return []
return build_text_from_layout(indices, ocr, reading_mode=reading_mode)
def split_indices_into_vertical_blocks(indices, ocr, gap_factor=2.5):
def split_indices_into_vertical_blocks(indices, ocr, gap_factor=4.0):
"""
Split indices into vertically separated blocks.
A new block starts when the vertical gap between consecutive
quads (sorted top-to-bottom) exceeds gap_factor * median_height.
FIX A: Raised gap_factor from 2.5 → 4.0
The old value cut off trailing punctuation tokens ("...!!", "DY",
"ENEMIES.") that sit a few pixels below the main text block.
A larger gap is needed before we consider two groups to be in
separate bubbles — contour splitting handles the real separations.
"""
if not indices:
return []
@@ -2287,7 +2440,7 @@ def split_indices_into_vertical_blocks(indices, ocr, gap_factor=2.5):
sorted_idx = sorted(indices, key=lambda i: (quad_bbox(ocr[i][0])[1],
quad_bbox(ocr[i][0])[0]))
blocks = [[sorted_idx[0]]]
blocks = [[sorted_idx[0]]]
for k in range(1, len(sorted_idx)):
prev_box = quad_bbox(ocr[sorted_idx[k-1]][0])
curr_box = quad_bbox(ocr[sorted_idx[k]][0])
@@ -2298,7 +2451,6 @@ def split_indices_into_vertical_blocks(indices, ocr, gap_factor=2.5):
return blocks
# ============================================================
# SPLIT HELPERS FOR enforce_max_box_size
# ============================================================