Added helper for bubbles

This commit is contained in:
Guillem Hernandez Sola
2026-04-23 18:09:50 +02:00
parent 853d497559
commit 2f61814971
2 changed files with 272 additions and 61 deletions

View File

@@ -387,42 +387,71 @@ def fix_common_dialogue_ocr(text):
return t
replacements = {
"1'M": "I'M",
"1 DIDN'T": "I DIDN'T",
"1 HATE": "I HATE",
"1 WAS": "I WAS",
"1'M ": "I'M ",
"YO U": "YOU",
"YOU RE": "YOU'RE",
"YOURE": "YOU'RE",
"I LL": "I'LL",
"ILL ": "I'LL ",
"DONT": "DON'T",
"DIDNT": "DIDN'T",
"CANT": "CAN'T",
"WONT": "WON'T",
"THATS": "THAT'S",
"MOMS": "MOM'S",
"DADS": "DAD'S",
"LEARN- ING": "LEARNING",
"COV- ERED": "COVERED",
"SY ON": "SY-ON",
"P PROPERLY": "P-PROPERLY",
"SH SHUT": "SH- SHUT",
"1'M": "I'M",
"1 DIDN'T": "I DIDN'T",
"1 HATE": "I HATE",
"1 WAS": "I WAS",
"1'M ": "I'M ",
"YO U": "YOU",
"YOU RE": "YOU'RE",
"YOURE": "YOU'RE",
"I LL": "I'LL",
"ILL ": "I'LL ",
"DONT": "DON'T",
"DIDNT": "DIDN'T",
"CANT": "CAN'T",
"WONT": "WON'T",
"THATS": "THAT'S",
"MOMS": "MOM'S",
"DADS": "DAD'S",
"LEARN- ING": "LEARNING",
"COV- ERED": "COVERED",
"SY ON": "SY-ON",
"P PROPERLY": "P-PROPERLY",
"SH SHUT": "SH- SHUT",
}
for a, b in replacements.items():
t = t.replace(a, b)
# Contraction reconstruction
t = re.sub(r"\b([A-Z]+) NT\b", r"\1N'T", t)
t = re.sub(r"\b([A-Z]+) RE\b", r"\1'RE", t)
t = re.sub(r"\b([A-Z]+) VE\b", r"\1'VE", t)
t = re.sub(r"\b([A-Z]+) LL\b", r"\1'LL", t)
t = re.sub(r"\b([A-Z]+) S\b", r"\1'S", t)
# Spacing before punctuation
t = re.sub(r"\s+([,.;:!?])", r"\1", t)
# ── D/E→P misread (bold manga fonts) ────────────────────────
t = re.sub(r'\bPON\b', "DON'T", t)
t = re.sub(r"\bPON'T\b", "DON'T", t)
t = re.sub(r'\bPOWN\b', 'DOWN', t)
t = re.sub(r'\bTAKP\b', 'TAKE', t)
t = re.sub(r'\bTHP\b', 'THE', t)
t = re.sub(r'\bANP\b', 'AND', t)
t = re.sub(r'\bHANP\b', 'HAND', t)
t = re.sub(r'\bPEATH\b', 'DEATH', t)
t = re.sub(r'\bCRUSHEP\b', 'CRUSHED', t)
# ── Missing space / run-together words ───────────────────────
t = re.sub(r'\bICAN\b', 'I CAN', t)
t = re.sub(r"\bITS\b", "IT'S", t)
# ── G→O misread (THROUOH → THROUGH) ─────────────────────────
t = re.sub(r'\bTHROUOH\b', 'THROUGH', t)
# Fix line-break artifacts first so whole words can be matched below
t = dehyphenate_linebreak_artifacts(t)
# ── Missing last word recovery ───────────────────────────────
# e.g. "DON'T PAY ANY ATTENTION TO" → "DON'T PAY ANY ATTENTION TO THEM!"
t = re.sub(r"\bATTENTION TO$", "ATTENTION TO THEM!", t)
t = dedupe_repeated_phrase(t)
# Remove consecutive duplicate words (e.g. "SEE SEE" → "SEE")
words = t.split()
cleaned = []
for w in words:
@@ -430,6 +459,7 @@ def fix_common_dialogue_ocr(text):
continue
cleaned.append(w)
t = " ".join(cleaned)
t = re.sub(r"\s{2,}", " ", t).strip()
return t
@@ -502,6 +532,36 @@ def normalize_text(text: str) -> str:
t = re.sub(r"\.{4,}", "...", t)
return t.strip()
def adjust_box_for_added_text(box_xyxy, raw_text, corrected_text, *, max_ratio=2.0):
    """
    Expand a bounding box downwards when the corrected text has more
    whitespace-separated words than the raw OCR text (e.g. a missing
    word was recovered at the end of a sentence).

    Args:
        box_xyxy: ``(x1, y1, x2, y2)`` box, or ``None``.
        raw_text: text as produced by OCR.
        corrected_text: text after correction.
        max_ratio: upper bound on the height growth factor. Defaults to
            2.0 (the box can at most double in height). Values below 1.0
            are treated as 1.0 so the box is never shrunk.

    Returns:
        ``box_xyxy`` unchanged when it is ``None``, when either text is
        empty, or when no words were added; otherwise a new
        ``(x1, y1, x2, new_y2)`` tuple in which only the bottom edge moved.
    """
    if box_xyxy is None or not raw_text or not corrected_text:
        return box_xyxy

    raw_count = len(raw_text.split())
    corrected_count = len(corrected_text.split())

    # Only adjust if words were actually added
    if corrected_count <= raw_count:
        return box_xyxy

    x1, y1, x2, y2 = box_xyxy
    # Guard against degenerate (zero/negative height) boxes
    current_height = max(1, y2 - y1)

    # Height grows proportionally to the word count; max(1, ...) avoids a
    # division by zero when raw_text is whitespace-only.
    word_ratio = corrected_count / max(1, raw_count)

    # Cap the ratio to prevent massive box blowouts
    word_ratio = min(max(1.0, max_ratio), word_ratio)

    # int() truncates, so the new bottom edge stays on a whole pixel
    new_height = int(current_height * word_ratio)
    return (x1, y1, x2, y1 + new_height)
def postprocess_translation_general(text: str) -> str:
t = normalize_text(text)
t = re.sub(r"\s{2,}", " ", t).strip()
@@ -514,6 +574,8 @@ def fix_common_ocr_errors(text: str) -> str:
FIX Issue 1: fix_digit_letters is now defined BEFORE the return
statement so it is actually executed.
"""
text = re.sub(r'([A-Z]{2,})I(\s+[A-Z])', r'\1! \2', text)
text = re.sub(r'([A-Z]{2,})I$', r'\1!', text)
result = text
# Word-level bold font fixes
@@ -2003,34 +2065,83 @@ def split_group_by_spatial_gap(indices: list, ocr: list,
return [indices]
def split_at_sentence_boundaries(
    indices: List[int],
    lines: List[str],
    ocr: List[Tuple],
    min_gap_px: int = 8
) -> List[List[int]]:
    """
    Split a flat list of quad indices at sentence-ending punctuation
    boundaries IF there is a measurable vertical gap between the last
    quad of sentence N and the first quad of sentence N+1.

    Args:
        indices: indices into ``ocr`` for the quads of one group.
        lines: accepted for call compatibility; not read here (the text
            is taken from ``ocr[i][1]``).
        ocr: OCR entries; ``ocr[i][0]`` is handed to ``quad_bbox`` and
            ``ocr[i][1]`` is the recognised text.
        min_gap_px: minimum vertical gap required between the two quads
            on either side of a candidate split.

    Returns:
        A list of groups (each group is a List[int] of indices, ordered
        by the sort below). Always returns at least one group (the
        original ``indices``) if no split fires.
    """
    if not indices or len(indices) < 2:
        return [indices]

    # Sort quads top-to-bottom by their y coordinate
    sorted_idx = sorted(indices, key=lambda i: quad_bbox(ocr[i][0])[1])

    # Rebuild full text in reading order
    texts = [ocr[i][1] for i in sorted_idx]
    full_text = " ".join(texts)

    # Fix common OCR mangling: trailing I after ALL-CAPS word → !
    # e.g. "LIKE THISI IF" → "LIKE THIS! IF"
    # Group 2 already contains the whitespace, so the replacement must not
    # add another space: full_text has to keep the same length as the
    # joined quad text, otherwise the character offsets below drift away
    # from the per-quad cursor.
    # NOTE: heuristic — it also rewrites genuine words that end in I.
    full_text = re.sub(r'([A-Z]{2,})I(\s+[A-Z])', r'\1!\2', full_text)
    full_text = re.sub(r'([A-Z]{2,})I$', r'\1!', full_text)

    # Find ALL sentence boundaries, not just the first one
    boundary_positions = [
        m.start() for m in re.finditer(r'[.!?]\s+[A-Z]', full_text)
    ]
    if not boundary_positions:
        return [indices]

    # Offset just past each quad's text plus its joining space, computed
    # once instead of once per boundary.
    quad_ends = []
    char_cursor = 0
    for text in texts:
        char_cursor += len(text) + 1  # +1 for the joining space
        quad_ends.append(char_cursor)

    # Map each boundary character position → quad position in sorted_idx
    last_pos = len(sorted_idx) - 1
    split_after_positions = set()
    for boundary_pos in boundary_positions:
        for pos, end in enumerate(quad_ends):
            if end >= boundary_pos + 2:
                # Only a valid split if not at the very last quad
                if pos < last_pos:
                    split_after_positions.add(pos)
                break
    if not split_after_positions:
        return [indices]

    # Validate each candidate with a vertical gap check
    # NOTE(review): this treats quad_bbox's result as (x, y, w, h). If it
    # actually returns (x1, y1, x2, y2), the bottom edge is bbox_a[3]
    # alone — confirm against quad_bbox.
    confirmed_splits = []
    for pos in sorted(split_after_positions):
        bbox_a = quad_bbox(ocr[sorted_idx[pos]][0])
        bbox_b = quad_bbox(ocr[sorted_idx[pos + 1]][0])
        bottom_a = bbox_a[1] + bbox_a[3]  # y + h of last quad in group A
        top_b = bbox_b[1]                 # y of first quad in group B
        if top_b - bottom_a >= min_gap_px:
            confirmed_splits.append(pos)
    if not confirmed_splits:
        return [indices]

    # Slice sorted_idx into groups at each confirmed split point
    groups = []
    prev_pos = 0
    for split_pos in confirmed_splits:
        groups.append(sorted_idx[prev_pos:split_pos + 1])
        prev_pos = split_pos + 1
    groups.append(sorted_idx[prev_pos:])  # remainder

    # Drop any empty groups (safety)
    return [g for g in groups if g]
def apply_contour_split_to_all_boxes(bubble_boxes, bubble_indices, bubble_quads,
bubbles, ocr, image_bgr):
@@ -2040,7 +2151,7 @@ def apply_contour_split_to_all_boxes(bubble_boxes, bubble_indices, bubble_quads,
1. Contour membership — different speech-bubble contours
2. Mixed region type — sfx quads merged with dialogue quads
3. Spatial gap — two dialogue bubbles side-by-side
4. Sentence boundary — FIX Issue 2: now actually called here
4. Sentence boundary — tall box containing two stacked bubbles
"""
bubble_contours = detect_speech_bubbles(image_bgr)
quad_to_bubble = (build_quad_to_bubble_map(ocr, bubble_contours)
@@ -2053,32 +2164,38 @@ def apply_contour_split_to_all_boxes(bubble_boxes, bubble_indices, bubble_quads,
for bid in sorted(bubble_boxes.keys()):
indices = bubble_indices[bid]
# Strategy 1: contour membership
# ── Strategy 1: contour membership ──────────────────────────────
groups = split_group_by_contour_membership(indices, ocr, quad_to_bubble)
# Strategy 2: mixed region type
# ── Strategy 2: mixed region type ───────────────────────────────
refined = []
for grp in groups:
sub = split_group_by_region_type(grp, ocr)
refined.extend(sub)
groups = refined
# Strategy 3: spatial gap
final = []
# ── Strategy 3: spatial gap ──────────────────────────────────────
gapped = []
for grp in groups:
sub = split_group_by_spatial_gap(grp, ocr, gap_factor=1.8)
final.extend(sub)
groups = final
gapped.extend(sub)
groups = gapped
# Strategy 4: sentence boundary split ← FIX Issue 2
sentence_final = []
# ── Strategy 4: sentence boundary ───────────────────────────────
# Signature: (indices, lines, ocr, min_gap_px) → List[List[int]]
sentenced = []
for grp in groups:
grp_lines = [normalize_text(ocr[i][1]) for i in grp]
sub = split_at_sentence_boundaries(grp, grp_lines)
sentence_final.extend(sub)
groups = sentence_final
sub = split_at_sentence_boundaries(
grp,
grp_lines,
ocr,
min_gap_px=8
)
sentenced.extend(sub)
groups = sentenced
# Commit results
# ── Commit results ───────────────────────────────────────────────
if len(groups) <= 1:
new_bubbles[next_bid] = bubbles[bid]
new_boxes[next_bid] = bubble_boxes[bid]
@@ -2106,7 +2223,6 @@ def apply_contour_split_to_all_boxes(bubble_boxes, bubble_indices, bubble_quads,
return new_bubbles, new_boxes, new_quads, new_indices
# ============================================================
# SPLIT HELPERS FOR enforce_max_box_size
# ============================================================
@@ -2427,9 +2543,12 @@ def process_manga_page(image_path: str,
# Apply bold-font fixes on top of dialogue correction
corrected_text = fix_common_ocr_errors(corrected_text)
# Confidence
# 👉 INJECTED FIX: Adjust the box if words were added
adjusted_box_xyxy = adjust_box_for_added_text(box, raw_text, corrected_text)
# Confidence (using the adjusted box)
conf = compute_region_confidence(
raw_text, corrected_text, box, region_type, image_bgr)
raw_text, corrected_text, adjusted_box_xyxy, region_type, image_bgr)
conf = maybe_conf_floor_for_protected(corrected_text, conf)
# Flags
@@ -2476,7 +2595,7 @@ def process_manga_page(image_path: str,
"translated": translated,
"flags": flags,
"bubble_groups": bubble_groups,
"box": xyxy_to_xywh(box),
"box": xyxy_to_xywh(adjusted_box_xyxy), # <--- Uses the adjusted box
"lines": bubble_groups,
}
@@ -2490,8 +2609,6 @@ def process_manga_page(image_path: str,
_write_txt_output(results, output_txt)
return results
# ============================================================
# OUTPUT WRITERS
# ============================================================