Files
manga-translator/manga-translator.py
Guillem Hernandez Sola 455b4ad82c starting point
2026-04-22 11:49:25 +02:00

2886 lines
121 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import re
import json
import cv2
import numpy as np
import warnings
from typing import List, Tuple, Dict, Any, Optional
from deep_translator import GoogleTranslator
# macOS Native Vision imports
import Vision
import Quartz
from Foundation import NSData
# Silence every UserWarning for the whole process; the filter is global and
# therefore also applies to warnings raised by imported libraries.
warnings.filterwarnings("ignore", category=UserWarning)
# ============================================================
# CONFIG
# ============================================================
# NOTE(review): not referenced in the visible part of this file; presumably
# the fraction of the page height treated as a "top band" — confirm at the
# point of use.
TOP_BAND_RATIO = 0.08
# ============================================================
# HELPERS
# ============================================================
def normalize_text(text: str) -> str:
    """Return *text* upper-cased with typographic and spacing noise removed.

    ``None`` and empty input yield ``""``. Curly quotes and the ellipsis
    character are mapped to ASCII, whitespace runs collapse to one space,
    spaces before ``, . ; : ! ?``, after ``¡ ¿ (`` and before ``)`` are
    dropped, and runs of four or more dots shrink to ``...``.
    """
    cleaned = (text or "").strip().upper()

    # Typographic characters -> plain ASCII equivalents.
    for fancy, plain in (
        ("\u201c", "\""),
        ("\u201d", "\""),
        ("\u2018", "'"),
        ("\u2019", "'"),
        ("\u2026", "..."),
    ):
        cleaned = cleaned.replace(fancy, plain)

    # Order matters: whitespace is collapsed first so the later rules only
    # ever see single spaces.
    for pattern, replacement in (
        (r"\s+", " "),
        (r"\s+([,.;:!?])", r"\1"),
        (r"([¡¿])\s+", r"\1"),
        (r"\(\s+", "("),
        (r"\s+\)", ")"),
        (r"\.{4,}", "..."),
    ):
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()
def postprocess_translation_general(text: str) -> str:
    """Normalise a translated string and tame excessive punctuation.

    The text first goes through ``normalize_text``. Afterwards any run of
    three or more ``!``/``?`` characters is replaced by two copies of the
    run's last character, and runs of four or more dots become ``...``.
    """
    tidy = re.sub(r"\s{2,}", " ", normalize_text(text)).strip()
    # A repeated capture group keeps only its last iteration, so "?!?" -> "??".
    tidy = re.sub(r"([!?]){3,}", r"\1\1", tidy)
    return re.sub(r"\.{4,}", "...", tidy)
def fix_common_ocr_errors(text: str) -> str:
    """Repair a few character confusions that OCR engines commonly produce.

    * a capital ``O`` between a digit and another digit, or between a digit
      and any non-letter character, becomes ``0``;
    * ``|`` becomes ``I``;
    * a backtick becomes an apostrophe.
    """
    fixed = text
    # The digit rules run before the character swaps: turning "|" into "I"
    # first would hide a following non-letter from the second pattern.
    for pattern in (r'(\d)O(\d)', r'(\d)O([^a-zA-Z])'):
        fixed = re.sub(pattern, r'\g<1>0\g<2>', fixed)
    return fixed.replace('|', 'I').replace('`', "'")
def is_valid_language(text: str, source_lang: str) -> bool:
    """Heuristically check that *text* uses the script expected for *source_lang*.

    Only word characters (``\\w``) are considered. For the Latin-script
    languages any Arabic, kana, CJK or Hangul character rejects the text;
    otherwise a minimum share of Latin letters is required (at least one
    letter for up to 3 characters, 55% for up to 6, 45% beyond that). For
    Japanese, Korean and Chinese at least one native character is required
    for up to 3 characters and a 40% share beyond that. Unknown languages
    are always accepted.
    """
    if not text:
        return False
    letters = re.sub(r'[^\w]', '', text)
    if not letters:
        return False

    size = len(letters)
    lang = source_lang.lower()

    if lang in ['en', 'english', 'es', 'spanish', 'fr', 'french',
                'it', 'italian', 'ca', 'catalan', 'de', 'german']:
        # A single character from a non-Latin script disqualifies the text.
        if re.search(
                r'[\u0600-\u06FF\u0750-\u077F\u3040-\u30FF'
                r'\u3400-\u4DBF\u4E00-\u9FFF\uAC00-\uD7AF\u1100-\u11FF]',
                letters):
            return False
        latin = len(re.findall(r'[a-zA-ZÀ-ÿ]', letters))
        if size <= 3:
            return latin >= 1
        needed = 0.55 if size <= 6 else 0.45
        return (latin / size) >= needed

    if lang in ['ja', 'japanese']:
        script = r'[\u3040-\u30FF\u3400-\u4DBF\u4E00-\u9FFF]'
    elif lang in ['ko', 'korean']:
        script = r'[\uAC00-\uD7AF\u1100-\u11FF]'
    elif lang in ['zh', 'chinese']:
        script = r'[\u4E00-\u9FFF\u3400-\u4DBF]'
    else:
        return True

    native = len(re.findall(script, letters))
    if size <= 3:
        return native >= 1
    return (native / size) >= 0.4
_NOISE_TOKENS = {
'P', 'F', 'N', 'M', 'X', 'Z', 'Q',
'FN', 'PF', 'NM', 'XZ', 'FSHOO', 'GRRP',
}
_MANGA_INTERJECTIONS = {
'HUH', 'HUH?', 'HUH??', 'HUH?!',
'OH', 'OH!', 'OOH', 'OOH!',
'AH', 'AH!', 'UH', 'UH...',
'HEY', 'HEY!',
'EH', 'EH?',
'WOW', 'WOW!',
'YES', 'NO', 'NO!',
'RUN', 'GO', 'GO!',
'STOP', 'WAIT',
'WHAT', 'WHAT?', 'WHAT?!',
'WHY', 'WHY?',
'HOW', 'HOW?',
'OK', 'OK!', 'OKAY',
'EEEEP', 'EEEP',
'OMIGOSH',
'BECKY', 'BECKY!',
'HMM', 'HMM...',
'TSK', 'TCH',
'GRRR','I','A',
'FWUP', 'FWAP',
'SHIVER',
'RRRING',
'MORNING', 'MORNING.',
}
def is_meaningful_text(text: str, source_lang: str, min_alpha_chars: int = 2) -> bool:
if not text:
return False
t = text.strip()
t_upper = t.upper()
t_alpha_only = re.sub(r'[^A-Za-zÀ-ÿ]', '', t_upper)
if t_upper in _MANGA_INTERJECTIONS or t_alpha_only in _MANGA_INTERJECTIONS:
return True
alpha_count = sum(c.isalpha() for c in t)
if alpha_count < min_alpha_chars:
return False
if t_upper in _NOISE_TOKENS:
return False
lang = source_lang.lower()
if lang in ['en', 'english', 'es', 'spanish', 'fr', 'french',
'it', 'italian', 'ca', 'catalan', 'de', 'german']:
non_alpha = sum(not c.isalpha() for c in t)
if len(t) > 0 and (non_alpha / len(t)) > 0.60:
return False
if len(t) >= 3 and len(set(t_upper)) == 1:
return False
if lang in ['en', 'english', 'es', 'spanish', 'fr', 'french',
'it', 'italian', 'ca', 'catalan', 'de', 'german']:
if len(t) > 4:
vowels = len(re.findall(r'[AEIOUaeiouÀ-ÿ]', t))
if vowels == 0:
return False
return True
def quad_bbox(quad):
    """Return the axis-aligned box ``(x1, y1, x2, y2)`` around *quad*.

    *quad* is a sequence of points whose first two items are x and y; the
    extremes are truncated to ``int``.
    """
    left = min(point[0] for point in quad)
    top = min(point[1] for point in quad)
    right = max(point[0] for point in quad)
    bottom = max(point[1] for point in quad)
    return (int(left), int(top), int(right), int(bottom))
def quad_center(quad):
    """Return the ``(x, y)`` centre of the integer bounding box of *quad*."""
    left, top, right, bottom = quad_bbox(quad)
    center_x = (left + right) / 2.0
    center_y = (top + bottom) / 2.0
    return (center_x, center_y)
def boxes_union_xyxy(boxes):
    """Return the smallest ``(x1, y1, x2, y2)`` box covering all *boxes*.

    ``None`` entries are ignored; ``None`` is returned when nothing is left.
    """
    present = [box for box in boxes if box is not None]
    if not present:
        return None
    # One list per coordinate: all x1 values, all y1 values, and so on.
    lefts, tops, rights, bottoms = (
        [box[k] for box in present] for k in range(4)
    )
    return (int(min(lefts)), int(min(tops)), int(max(rights)), int(max(bottoms)))
def bbox_area_xyxy(b):
    """Return the integer area of an xyxy box (0 for ``None`` or inverted boxes)."""
    if b is None:
        return 0
    width = max(0, b[2] - b[0])
    height = max(0, b[3] - b[1])
    return int(width * height)
def xyxy_to_xywh(b):
    """Convert an xyxy box into an ``{"x", "y", "w", "h"}`` dict of ints.

    Returns ``None`` for ``None``; negative extents are clamped to 0.
    """
    if b is None:
        return None
    left, top, right, bottom = b
    width = max(0, right - left)
    height = max(0, bottom - top)
    return {"x": int(left), "y": int(top), "w": int(width), "h": int(height)}
def overlap_or_near(a, b, gap=0):
    """Return True when boxes *a* and *b* overlap or lie within *gap* pixels.

    The separation is measured independently on each axis and both must be
    at most *gap*.
    """
    ax1, ay1, ax2, ay2 = a
    bx1, by1, bx2, by2 = b
    horizontal = max(0, max(ax1, bx1) - min(ax2, bx2))
    if horizontal > gap:
        return False
    vertical = max(0, max(ay1, by1) - min(ay2, by2))
    return vertical <= gap
def boxes_iou(a, b):
    """Return the intersection-over-union of two xyxy boxes (0.0 if disjoint)."""
    ax1, ay1, ax2, ay2 = a
    bx1, by1, bx2, by2 = b
    overlap_w = max(0, min(ax2, bx2) - max(ax1, bx1))
    overlap_h = max(0, min(ay2, by2) - max(ay1, by1))
    inter = overlap_w * overlap_h
    if inter == 0:
        return 0.0
    area_a = max(0, ax2 - ax1) * max(0, ay2 - ay1)
    area_b = max(0, bx2 - bx1) * max(0, by2 - by1)
    union = area_a + area_b - inter
    # The divisor is floored at 1 to avoid dividing by a tiny or zero union.
    return inter / max(1, union)
def boxes_overlap_ratio(a, b):
    """Return intersection area divided by the area of the SMALLER box."""
    ax1, ay1, ax2, ay2 = a
    bx1, by1, bx2, by2 = b
    overlap_w = max(0, min(ax2, bx2) - max(ax1, bx1))
    overlap_h = max(0, min(ay2, by2) - max(ay1, by1))
    inter = overlap_w * overlap_h
    if inter == 0:
        return 0.0
    area_a = max(0, ax2 - ax1) * max(0, ay2 - ay1)
    area_b = max(0, bx2 - bx1) * max(0, by2 - by1)
    smaller = min(area_a, area_b)
    return inter / max(1, smaller)
def ocr_candidate_score(text: str) -> float:
    """Score how plausible an OCR string looks, clamped to ``[0.0, 1.0]``.

    The score rewards letters (weight 0.62), spaces (0.10) and common
    punctuation (0.20), each as a share of the stripped length, and
    penalises unexpected symbols (0.45 per share), an isolated capital
    letter (0.05) and any run of two or more digits (0.08).
    """
    if not text:
        return 0.0
    stripped = text.strip()
    length = len(stripped)
    if length == 0:
        return 0.0

    letters = blanks = good_punct = 0
    for ch in stripped:
        if ch.isalpha():
            letters += 1
        if ch.isspace():
            blanks += 1
        if ch in ".,!?'-:;()[]\"¡¿":
            good_punct += 1

    alpha = letters / length
    spaces = blanks / length
    punct_ok = good_punct / length
    bad = len(re.findall(r"[^\w\s\.\,\!\?\-\'\:\;\(\)\[\]\"¡¿]", stripped)) / length

    penalty = 0.0
    if re.search(r"\b[A-Z]\b", stripped):
        penalty += 0.05
    if re.search(r"[0-9]{2,}", stripped):
        penalty += 0.08

    raw = (0.62 * alpha) + (0.10 * spaces) + (0.20 * punct_ok) - (0.45 * bad) - penalty
    return max(0.0, min(1.0, raw))
def quad_is_horizontal(quad, ratio_threshold=1.5) -> bool:
    """Return True when the quad's box is at least *ratio_threshold* times wider than tall."""
    left, top, right, bottom = quad_bbox(quad)
    width = max(1, right - left)
    height = max(1, bottom - top)
    return (width / height) >= ratio_threshold
def quad_is_vertical(quad, ratio_threshold=1.5) -> bool:
    """Return True when the quad's box is at least *ratio_threshold* times taller than wide."""
    left, top, right, bottom = quad_bbox(quad)
    width = max(1, right - left)
    height = max(1, bottom - top)
    return (height / width) >= ratio_threshold
# ============================================================
# ENHANCED IMAGE PREPROCESSING
# ============================================================
def enhance_image_for_ocr(image_bgr, upscale_factor=2.5):
    """Return an upscaled, binarised 3-channel copy of *image_bgr* for OCR.

    Steps, in order: bicubic upscale by *upscale_factor*, grayscale
    conversion, non-local-means denoising, CLAHE contrast equalisation,
    3x3 sharpening, Gaussian adaptive thresholding, a 2x2 morphological
    close, and conversion back to three channels.
    """
    src_h, src_w = image_bgr.shape[:2]
    target_size = (int(src_w * upscale_factor), int(src_h * upscale_factor))

    stage = cv2.resize(image_bgr, target_size, interpolation=cv2.INTER_CUBIC)
    stage = cv2.cvtColor(stage, cv2.COLOR_BGR2GRAY)
    stage = cv2.fastNlMeansDenoising(stage, None, h=10,
                                     templateWindowSize=7, searchWindowSize=21)
    stage = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8)).apply(stage)

    # Kernel sums to 1: boosts the centre pixel against its 8 neighbours.
    sharpen_kernel = np.array([[-1, -1, -1], [-1, 9, -1], [-1, -1, -1]])
    stage = cv2.filter2D(stage, -1, sharpen_kernel)

    stage = cv2.adaptiveThreshold(stage, 255,
                                  cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                  cv2.THRESH_BINARY, 11, 2)
    stage = cv2.morphologyEx(stage, cv2.MORPH_CLOSE, np.ones((2, 2), np.uint8))
    return cv2.cvtColor(stage, cv2.COLOR_GRAY2BGR)
def detect_small_text_regions(image_bgr, existing_quads):
    """Return xyxy boxes of small dark blobs lying outside *existing_quads*.

    The image is Otsu-thresholded (inverted, so dark pixels become
    foreground), the areas covered by the given quads are masked out, and
    the bounding rectangle of every external contour is kept when its area
    is strictly between 50 and 5000 px and its height/width ratio is
    strictly between 0.1 and 10.
    """
    gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)

    # 255 wherever an already-detected quad sits.
    covered = np.zeros(gray.shape, dtype=np.uint8)
    for quad in existing_quads:
        cv2.fillPoly(covered, [np.array(quad, dtype=np.int32)], 255)
    uncovered = cv2.bitwise_not(covered)

    _, ink = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    ink_outside = cv2.bitwise_and(ink, ink, mask=uncovered)
    contours, _ = cv2.findContours(ink_outside, cv2.RETR_EXTERNAL,
                                   cv2.CHAIN_APPROX_SIMPLE)

    rects = (cv2.boundingRect(contour) for contour in contours)
    return [
        (x, y, x + w, y + h)
        for x, y, w, h in rects
        if 50 < w * h < 5000 and 0.1 < h / max(w, 1) < 10
    ]
# ============================================================
# SPEECH BUBBLE DETECTION
# ============================================================
def detect_speech_bubbles(image_bgr: np.ndarray) -> List[np.ndarray]:
    """Return the external contours used as speech-bubble candidates.

    Contours are extracted from an inverted Gaussian adaptive threshold of
    the grayscale image; only those with an area above 500 px are kept.
    """
    gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
    binary = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                   cv2.THRESH_BINARY_INV, 11, 2)
    found, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    large_enough = []
    for contour in found:
        if cv2.contourArea(contour) > 500:
            large_enough.append(contour)
    return large_enough
def is_quad_in_bubble(quad_bbox_xyxy, bubble_contour, tolerance=5):
    """Return True when the centre of a text box lies inside a bubble contour.

    Args:
        quad_bbox_xyxy: ``(x1, y1, x2, y2)`` box of the text quad.
        bubble_contour: contour as accepted by ``cv2.pointPolygonTest``.
        tolerance: how many pixels outside the contour the centre may lie
            and still count as inside.

    The signed distance is requested from OpenCV (``measureDist=True``).
    With ``measureDist=False`` the call only yields -1, 0 or +1, so a
    comparison against ``-tolerance`` would accept every point for any
    tolerance of 1 or more.
    """
    x1, y1, x2, y2 = quad_bbox_xyxy
    center = (float((x1 + x2) // 2), float((y1 + y2) // 2))
    # Positive inside, negative outside, zero on the edge.
    signed_distance = cv2.pointPolygonTest(bubble_contour, center, True)
    return signed_distance >= -tolerance
def split_indices_by_bubble(indices, ocr, bubble_contours):
    """Partition OCR *indices* by the first bubble contour containing each quad.

    Returns a list of index groups, one per contour that received at least
    one quad (in order of first assignment), followed by one group with all
    quads that matched no contour, if any. Empty input yields ``[]``.
    """
    if not indices:
        return []

    by_bubble = {}
    unassigned = []
    for idx in indices:
        box = quad_bbox(ocr[idx][0])
        owner = next(
            (n for n, contour in enumerate(bubble_contours)
             if is_quad_in_bubble(box, contour)),
            None,
        )
        if owner is None:
            unassigned.append(idx)
        else:
            by_bubble.setdefault(owner, []).append(idx)

    groups = list(by_bubble.values())
    if unassigned:
        groups.append(unassigned)
    return groups
def check_vertical_alignment_split(indices, ocr, threshold=20):
    """Split *indices* into groups wherever consecutive quads are far apart vertically.

    Quads are ordered by their top edge; a new group starts when the gap
    between a quad's top and the previous quad's bottom exceeds
    *threshold*. Zero or one index is returned unchanged as ``[indices]``.
    """
    if len(indices) <= 1:
        return [indices]

    ordered = sorted(
        ((idx, quad_bbox(ocr[idx][0])) for idx in indices),
        key=lambda pair: pair[1][1],
    )
    first_idx, previous_box = ordered[0]
    groups = [[first_idx]]
    for idx, box in ordered[1:]:
        if box[1] - previous_box[3] > threshold:
            groups.append([idx])
        else:
            groups[-1].append(idx)
        previous_box = box
    return groups
# ============================================================
# QUAD SIZE VALIDATION AND SPLITTING
# ============================================================
def is_quad_oversized(quad, median_height, width_threshold=8.0):
    """Return True when a quad is suspiciously wide.

    A quad qualifies when its width exceeds ``median_height *
    width_threshold`` or when it is more than 12 times wider than tall.
    """
    left, top, right, bottom = quad_bbox(quad)
    width = right - left
    height = max(1, bottom - top)
    if width > median_height * width_threshold:
        return True
    return width / height > 12.0
def split_oversized_quad_by_content(image_bgr, quad, text, conf, median_height):
    """Try to split one wide OCR quad in two at its widest blank column run.

    Returns a list of ``(quad, text, conf)`` tuples: two entries when a
    split was made, otherwise the single original entry. A blank run is a
    stretch of columns whose ink sum stays below 20% of a full column; it
    must be at least ``max(0.8 * median_height, 15)`` px wide and must be
    followed by ink, so a run touching the right edge is never recorded.
    The text is cut at the space nearest to the proportional position, or
    proportionally when it contains no space.
    """
    unsplit = [(quad, text, conf)]
    x1, y1, x2, y2 = quad_bbox(quad)
    width = x2 - x1
    height = max(1, y2 - y1)
    pad = 2
    img_h, img_w = image_bgr.shape[0], image_bgr.shape[1]
    roi_left = max(0, x1)
    roi = image_bgr[max(0, y1 - pad):min(img_h, y2 + pad), roi_left:min(img_w, x2)]
    if roi.size == 0:
        return unsplit

    gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
    _, ink = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    column_ink = np.sum(ink, axis=0)
    blank_limit = height * 255 * 0.20
    min_run = max(int(median_height * 0.8), 15)

    candidates = []
    run_start = None
    for col in range(len(column_ink)):
        if column_ink[col] < blank_limit:
            if run_start is None:
                run_start = col
        elif run_start is not None:
            run_width = col - run_start
            if run_width >= min_run:
                candidates.append((run_start + run_width // 2, run_width))
            run_start = None
    if not candidates:
        return unsplit

    # Widest run wins; on ties the left-most one.
    gap_center, _ = max(candidates, key=lambda run: run[1])
    split_x_abs = roi_left + gap_center
    offset = split_x_abs - x1

    if ' ' in text:
        char_w = width / max(1, len(text))
        cut = int(offset / max(1e-6, char_w))
        space_positions = [pos for pos, ch in enumerate(text) if ch == ' ']
        if space_positions:
            cut = min(space_positions, key=lambda pos: abs(pos - cut))
    else:
        cut = int(len(text) * offset / width)
    left_text, right_text = text[:cut].strip(), text[cut:].strip()

    if not (left_text and right_text):
        return unsplit
    left_quad = [[x1, y1], [split_x_abs, y1], [split_x_abs, y2], [x1, y2]]
    right_quad = [[split_x_abs, y1], [x2, y1], [x2, y2], [split_x_abs, y2]]
    return [(left_quad, left_text, conf), (right_quad, right_text, conf)]
def validate_and_split_oversized_quads(image_bgr, filtered_ocr):
    """Split every oversized quad in *filtered_ocr* where the image allows it.

    Returns ``(entries, splits_made)``. Quads are judged against the median
    quad height with a width factor of 8.0; entries that are not oversized,
    or that cannot be split, are passed through unchanged.
    """
    if not filtered_ocr:
        return filtered_ocr, 0

    line_heights = []
    for quad, _, _ in filtered_ocr:
        _, top, _, bottom = quad_bbox(quad)
        line_heights.append(max(1, bottom - top))
    median_height = float(np.median(line_heights)) if line_heights else 14.0

    output = []
    splits_made = 0
    for quad, text, conf in filtered_ocr:
        pieces = [(quad, text, conf)]
        if is_quad_oversized(quad, median_height, 8.0):
            candidate = split_oversized_quad_by_content(
                image_bgr, quad, text, conf, median_height)
            if len(candidate) > 1:
                pieces = candidate
                splits_made += 1
        output.extend(pieces)
    return output, splits_made
# ============================================================
# HORIZONTAL GAP DETECTION AT QUAD LEVEL
# ============================================================
def detect_horizontal_gap_in_group(indices, ocr, med_h, gap_factor=2.5):
    """Split a group of quads at its widest horizontal gap, if one is large enough.

    Quads are ordered by centre x. The largest gap between neighbouring
    boxes that exceeds ``med_h * gap_factor`` becomes the split point.
    Returns ``(left_group, right_group)`` or ``None`` when there are fewer
    than two quads or no gap qualifies.
    """
    if len(indices) < 2:
        return None

    ordered = sorted(indices, key=lambda i: quad_center(ocr[i][0])[0])
    boxes = [quad_bbox(ocr[i][0]) for i in ordered]
    min_gap = med_h * gap_factor

    widest, cut = 0.0, None
    for k, (left_box, right_box) in enumerate(zip(boxes, boxes[1:])):
        gap = right_box[0] - left_box[2]
        if gap > min_gap and gap > widest:
            widest, cut = gap, k
    if cut is None:
        return None

    left_group = ordered[:cut + 1]
    right_group = ordered[cut + 1:]
    if not left_group or not right_group:
        return None
    return (left_group, right_group)
def orientation_compatible(idx_a, idx_b, ocr):
    """Return False when one quad is clearly vertical and the other clearly horizontal.

    "Clearly vertical" means width/height below 0.6, "clearly horizontal"
    means width/height above 2.0; any other pairing is compatible.
    """
    def aspect(idx):
        left, top, right, bottom = quad_bbox(ocr[idx][0])
        return max(1, right - left) / max(1, bottom - top)

    ratio_a = aspect(idx_a)
    ratio_b = aspect(idx_b)
    a_tall_b_wide = ratio_a < 0.6 and ratio_b > 2.0
    b_tall_a_wide = ratio_b < 0.6 and ratio_a > 2.0
    return not (a_tall_b_wide or b_tall_a_wide)
# ============================================================
# WIDE QUAD COLUMN SPLIT — pre-grouping
# ============================================================
def split_wide_quad_by_column_gap(image_bgr, quad, text, conf, med_h,
                                  min_gap_factor=1.8):
    """Split a wide OCR quad in two at its widest blank column run.

    Args:
        image_bgr: page image the quad was detected on.
        quad: quad points; only its bounding box is used.
        text: recognised text of the quad.
        conf: recognition confidence, copied to both halves.
        med_h: median text height, used to scale every size threshold.
        min_gap_factor: a blank run must be at least
            ``max(med_h * min_gap_factor, 10)`` px wide.

    Returns:
        A list of ``(quad, text, conf)`` tuples: two entries when a split
        was made, otherwise the single original entry.

    Quads narrower than ``3 * med_h`` are never split. A blank run is a
    stretch of columns whose ink sum stays below 12% of a full column and
    that is followed by ink, so a run touching the right edge is never
    recorded. Both halves must be at least ``med_h`` wide. The text is cut
    at the space nearest to the proportional position, or proportionally
    when it contains no space.
    """
    unsplit = [(quad, text, conf)]
    x1, y1, x2, y2 = quad_bbox(quad)
    w, h = x2 - x1, max(1, y2 - y1)
    if w < med_h * 3.0:
        return unsplit

    pad = 2
    # The ROI is clipped to the image, so its first column is page x
    # ``roi_x0``, which differs from x1 when the quad starts left of the image.
    roi_x0 = max(0, x1)
    roi = image_bgr[max(0, y1 - pad):min(image_bgr.shape[0], y2 + pad),
                    roi_x0:min(image_bgr.shape[1], x2)]
    if roi.size == 0:
        return unsplit

    gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    v_proj = np.sum(binary, axis=0)
    gap_threshold = h * 255 * 0.12
    min_gap_px = max(int(med_h * min_gap_factor), 10)

    gaps = []
    gap_start = None
    for x in range(len(v_proj)):
        if v_proj[x] < gap_threshold:
            if gap_start is None:
                gap_start = x
        elif gap_start is not None:
            gw = x - gap_start
            if gw >= min_gap_px:
                gaps.append((gap_start + gw // 2, gw))
            gap_start = None
    if not gaps:
        return unsplit

    # Widest run wins; on ties the left-most one.
    split_x_rel = max(gaps, key=lambda g: g[1])[0]
    # Convert with the ROI origin (not x1) so the split column is correct
    # even when the ROI was clipped at the left image border.
    split_x_abs = roi_x0 + split_x_rel
    if split_x_abs - x1 < med_h or x2 - split_x_abs < med_h:
        return unsplit

    split_offset = split_x_abs - x1
    if ' ' in text:
        char_w = w / max(1, len(text))
        split_idx = int(split_offset / max(1e-6, char_w))
        spaces = [i for i, c in enumerate(text) if c == ' ']
        if spaces:
            split_idx = min(spaces, key=lambda i: abs(i - split_idx))
    else:
        split_idx = int(len(text) * split_offset / w)
    tl, tr = text[:split_idx].strip(), text[split_idx:].strip()

    if tl and tr:
        return [([[x1, y1], [split_x_abs, y1], [split_x_abs, y2], [x1, y2]], tl, conf),
                ([[split_x_abs, y1], [x2, y1], [x2, y2], [split_x_abs, y2]], tr, conf)]
    return unsplit
def apply_column_gap_splits(image_bgr, ocr_list, med_h):
    """Run the column-gap splitter over every OCR entry.

    Returns ``(entries, splits_made)`` where *entries* has split quads
    replaced by their two halves. A summary line is printed when at least
    one quad was split.
    """
    output = []
    splits_made = 0
    for quad, text, conf in ocr_list:
        pieces = split_wide_quad_by_column_gap(image_bgr, quad, text, conf, med_h)
        splits_made += 1 if len(pieces) > 1 else 0
        output.extend(pieces)
    if splits_made:
        print(f"📐 Column-gap split: {splits_made} wide quad(s) split before grouping")
    return output, splits_made
# ============================================================
# GENERALIZED BOX FIXING FUNCTIONS
# ============================================================
def detect_and_split_multi_bubble_boxes(bubble_boxes, bubble_indices, bubble_quads,
                                        bubbles, ocr, image_bgr):
    """Split text boxes that appear to span more than one speech bubble.

    Args:
        bubble_boxes: ``{bid: (x1, y1, x2, y2)}``.
        bubble_indices: ``{bid: [ocr index, ...]}``.
        bubble_quads: ``{bid: [quad, ...]}``.
        bubbles: ``{bid: lines}`` as built by ``build_lines_from_indices``.
        ocr: sequence of ``(quad, text, conf)`` entries.
        image_bgr: page image, used to detect bubble contours.

    Returns:
        ``(bubbles, boxes, quads, indices)`` dicts renumbered from 1.

    For each box with two or more quads the first applicable rule wins:
      1. its quads fall into different detected bubble contours;
      2. its quads are separated vertically by more than ``int(2 * med_h)``;
      3. the box is wider than ``10 * med_h`` and its quads, divided at the
         median centre x, leave a horizontal gap above ``1.5 * med_h``.
    Boxes that match no rule are carried over unchanged.
    """
    all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1])
             for i in range(len(ocr))]
    med_h = float(np.median(all_h)) if all_h else 14.0
    bubble_contours = detect_speech_bubbles(image_bgr)

    new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {}
    splits_made = []

    def _next_bid():
        # Output ids are consecutive, so the next one follows the current count.
        return len(new_indices) + 1

    def _keep(bid):
        """Copy an existing box to the output under a fresh id."""
        nid = _next_bid()
        new_bubbles[nid] = bubbles[bid]
        new_boxes[nid] = bubble_boxes[bid]
        new_quads[nid] = bubble_quads[bid]
        new_indices[nid] = bubble_indices[bid]

    def _emit(group):
        """Create a new output box from a list of OCR indices."""
        nid = _next_bid()
        new_bubbles[nid] = build_lines_from_indices(group, ocr)
        new_boxes[nid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in group])
        new_quads[nid] = [ocr[i][0] for i in group]
        new_indices[nid] = group

    for bid, indices in bubble_indices.items():
        if len(indices) < 2:
            _keep(bid)
            continue

        # Rule 1: quads belong to different bubble contours.
        split_groups = split_indices_by_bubble(indices, ocr, bubble_contours)
        if len(split_groups) > 1:
            for group in split_groups:
                if group:
                    _emit(group)
            splits_made.append(f"BOX#{bid} → {len(split_groups)} bubbles")
            continue

        # Rule 2: large vertical gaps between consecutive quads.
        vertical_splits = check_vertical_alignment_split(
            indices, ocr, threshold=int(med_h * 2.0))
        if len(vertical_splits) > 1:
            for group in vertical_splits:
                if group:
                    _emit(group)
            splits_made.append(f"BOX#{bid} → {len(vertical_splits)} vertical groups")
            continue

        # Rule 3: very wide box with a clear gap between its two halves.
        x1, _, x2, _ = bubble_boxes[bid]
        if (x2 - x1) > med_h * 10:
            x_centers = [quad_center(ocr[i][0])[0] for i in indices]
            x_median = np.median(x_centers)
            left_group = [i for i in indices if quad_center(ocr[i][0])[0] < x_median]
            right_group = [i for i in indices if quad_center(ocr[i][0])[0] >= x_median]
            if left_group and right_group:
                left_box = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in left_group])
                right_box = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in right_group])
                if right_box[0] - left_box[2] > med_h * 1.5:
                    _emit(left_group)
                    _emit(right_group)
                    splits_made.append(f"BOX#{bid} → 2 horizontal panels")
                    continue

        _keep(bid)

    if splits_made:
        print(f"\n🔧 Split {len(splits_made)} multi-bubble box(es):")
        for s in splits_made:
            print(f"{s}")
    return new_bubbles, new_boxes, new_quads, new_indices
def detect_and_merge_fragmented_bubbles(bubble_boxes, bubble_indices, bubble_quads,
                                        bubbles, ocr, image_bgr):
    """Merge text boxes whose centres lie inside the same bubble contour.

    Args:
        bubble_boxes: ``{bid: (x1, y1, x2, y2)}``.
        bubble_indices: ``{bid: [ocr index, ...]}``.
        bubble_quads: ``{bid: [quad, ...]}``.
        bubbles: ``{bid: lines}``.
        ocr: sequence of ``(quad, text, conf)`` entries.
        image_bgr: page image, used to detect bubble contours.

    Returns:
        ``(bubbles, boxes, quads, indices)``. The inputs are returned as-is
        when nothing merges; otherwise new dicts renumbered from 1, merged
        groups first, then the untouched boxes in their original order.

    Two boxes are paired when one contour contains both centres and the
    centres differ by less than ``3 * med_h`` horizontally and
    ``6 * med_h`` vertically. Pairs are combined transitively, so every box
    ends up in exactly one merged group.
    """
    all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1])
             for i in range(len(ocr))]
    med_h = float(np.median(all_h)) if all_h else 14.0
    bubble_contours = detect_speech_bubbles(image_bgr)
    bids = list(bubble_boxes.keys())

    def _centre(box):
        return ((box[0] + box[2]) / 2.0, (box[1] + box[3]) / 2.0)

    to_merge = []
    for pos, bid_i in enumerate(bids):
        cx_i, cy_i = _centre(bubble_boxes[bid_i])
        for bid_j in bids[pos + 1:]:
            cx_j, cy_j = _centre(bubble_boxes[bid_j])
            in_same_bubble = any(
                cv2.pointPolygonTest(c, (cx_i, cy_i), False) >= 0 and
                cv2.pointPolygonTest(c, (cx_j, cy_j), False) >= 0
                for c in bubble_contours
            )
            if not in_same_bubble:
                continue
            if abs(cx_i - cx_j) < med_h * 3.0 and abs(cy_i - cy_j) < med_h * 6.0:
                to_merge.append((bid_i, bid_j) if cy_i < cy_j else (bid_j, bid_i))
    if not to_merge:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices

    print(f"\n🔗 Merging {len(to_merge)} fragmented bubble(s):")

    # Union-find over the pairs. A simple "join the first group containing
    # either id" pass would leave a box in two groups whenever a pair
    # bridges two groups that already exist.
    parent = {}

    def _find(bid):
        parent.setdefault(bid, bid)
        while parent[bid] != bid:
            parent[bid] = parent[parent[bid]]  # path halving
            bid = parent[bid]
        return bid

    for top, bottom in to_merge:
        root_top, root_bottom = _find(top), _find(bottom)
        if root_top != root_bottom:
            parent[root_bottom] = root_top

    merge_groups = {}
    for bid in list(parent):
        merge_groups.setdefault(_find(bid), set()).add(bid)

    new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {}
    merged_bids, next_bid = set(), 1
    for merge_set in merge_groups.values():
        merge_list = sorted(merge_set)
        print(f" ✓ Merging: {', '.join(f'#{b}' for b in merge_list)}")
        all_indices = sorted({idx for b in merge_list for idx in bubble_indices[b]})
        merged_bids.update(merge_list)
        new_bubbles[next_bid] = build_lines_from_indices(all_indices, ocr)
        new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in all_indices])
        new_quads[next_bid] = [ocr[i][0] for i in all_indices]
        new_indices[next_bid] = all_indices
        next_bid += 1

    for bid in bids:
        if bid in merged_bids:
            continue
        new_bubbles[next_bid] = bubbles[bid]
        new_boxes[next_bid] = bubble_boxes[bid]
        new_quads[next_bid] = bubble_quads[bid]
        new_indices[next_bid] = bubble_indices[bid]
        next_bid += 1
    return new_bubbles, new_boxes, new_quads, new_indices
def merge_boxes_by_proximity_and_overlap(bubble_boxes, bubble_indices, bubble_quads,
                                         bubbles, ocr, med_h):
    """
    Merge boxes that are vertically close AND overlap horizontally.

    Boxes are visited in ascending id order. Each box not yet absorbed acts
    as an anchor and absorbs every later, not-yet-absorbed box that meets
    both criteria against the anchor's ORIGINAL box:
      1. vertical gap <= 1.5 x med_h
      2. horizontal overlap / narrower box width >= 0.35
    Single-quad boxes take part like any other box.

    Returns the inputs unchanged when nothing merges; otherwise new dicts
    renumbered from 1 with merged groups first, then the remaining boxes.
    """
    bids = sorted(bubble_boxes.keys())
    groups = {}
    absorbed = set()

    for pos, anchor in enumerate(bids):
        if anchor in absorbed:
            continue
        anchor_box = bubble_boxes[anchor]
        anchor_w = max(1, anchor_box[2] - anchor_box[0])
        for other in bids[pos + 1:]:
            if other in absorbed:
                continue
            other_box = bubble_boxes[other]
            other_w = max(1, other_box[2] - other_box[0])
            vert_gap = max(0, max(anchor_box[1], other_box[1])
                           - min(anchor_box[3], other_box[3]))
            shared_w = max(0, min(anchor_box[2], other_box[2])
                           - max(anchor_box[0], other_box[0]))
            overlap_ratio = shared_w / max(1, min(anchor_w, other_w))
            if vert_gap <= med_h * 1.5 and overlap_ratio >= 0.35:
                members = groups.setdefault(anchor, [anchor])
                if other not in members:
                    members.append(other)
                absorbed.add(other)

    if not groups:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices

    print(f"\n🔀 Proximity+overlap merge: {len(groups)} group(s):")
    new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {}
    handled = set()
    next_bid = 1
    for members in groups.values():
        unique_members = sorted(set(members))
        print(f" ✓ Merging: {', '.join(f'#{b}' for b in unique_members)}")
        all_indices = sorted(set(idx for b in unique_members for idx in bubble_indices[b]))
        new_bubbles[next_bid] = build_lines_from_indices(all_indices, ocr)
        new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in all_indices])
        new_quads[next_bid] = [ocr[i][0] for i in all_indices]
        new_indices[next_bid] = all_indices
        next_bid += 1
        handled.update(unique_members)

    for bid in bids:
        if bid in handled:
            continue
        new_bubbles[next_bid] = bubbles[bid]
        new_boxes[next_bid] = bubble_boxes[bid]
        new_quads[next_bid] = bubble_quads[bid]
        new_indices[next_bid] = bubble_indices[bid]
        next_bid += 1
    return new_bubbles, new_boxes, new_quads, new_indices
def auto_fix_bubble_detection(bubble_boxes, bubble_indices, bubble_quads,
                              bubbles, ocr, image_bgr):
    """
    Run the full box-repair pipeline and return the updated
    ``(bubbles, bubble_boxes, bubble_quads, bubble_indices)``:
      1. split boxes that span several speech bubbles;
      2. merge fragments found inside the same contour;
      3. merge by proximity + horizontal overlap;
      4. repeat step 3 once more to pick up chains that only become
         mergeable after the first pass.
    """
    print("\n🔍 Running automatic bubble detection fixes...")
    quad_heights = []
    for i in range(len(ocr)):
        box = quad_bbox(ocr[i][0])
        quad_heights.append(max(1, box[3] - box[1]))
    med_h = float(np.median(quad_heights)) if quad_heights else 14.0

    state = (bubbles, bubble_boxes, bubble_quads, bubble_indices)

    # Contour-based stages take the page image.
    for stage in (detect_and_split_multi_bubble_boxes,
                  detect_and_merge_fragmented_bubbles):
        cur_bubbles, cur_boxes, cur_quads, cur_indices = state
        state = stage(cur_boxes, cur_indices, cur_quads, cur_bubbles, ocr, image_bgr)

    # Two geometric merge passes.
    for _ in range(2):
        cur_bubbles, cur_boxes, cur_quads, cur_indices = state
        state = merge_boxes_by_proximity_and_overlap(
            cur_boxes, cur_indices, cur_quads, cur_bubbles, ocr, med_h)

    bubbles, bubble_boxes, bubble_quads, bubble_indices = state
    return bubbles, bubble_boxes, bubble_quads, bubble_indices
def remove_nested_boxes(bubble_boxes, bubble_indices, bubble_quads, bubbles,
                        overlap_threshold=0.50):
    """Drop the smaller box of every overlapping or index-sharing pair.

    Two boxes conflict when their overlap ratio (intersection over the
    smaller area) exceeds *overlap_threshold* or when they share at least
    one OCR index. The box with the smaller area is removed; on equal areas
    the later one goes. The four input dicts are modified IN PLACE and also
    returned as ``(bubbles, bubble_boxes, bubble_quads, bubble_indices)``.
    """
    def _area(box):
        return max(0, box[2] - box[0]) * max(0, box[3] - box[1])

    bids = list(bubble_boxes.keys())
    doomed = set()
    for pos, first in enumerate(bids):
        if first in doomed:
            continue
        first_box = bubble_boxes[first]
        first_area = _area(first_box)
        for second in bids[pos + 1:]:
            if second in doomed:
                continue
            second_box = bubble_boxes[second]
            second_area = _area(second_box)
            shared = set(bubble_indices[first]).intersection(bubble_indices[second])
            overlap = boxes_overlap_ratio(first_box, second_box)
            if not (overlap > overlap_threshold or len(shared) > 0):
                continue
            if first_area >= second_area:
                doomed.add(second)
                print(f" 🗑️ Removing BOX#{second} (overlaps BOX#{first})")
            else:
                doomed.add(first)
                print(f" 🗑️ Removing BOX#{first} (overlaps BOX#{second})")
                # The current box itself is gone; nothing left to compare it with.
                break

    if doomed:
        print(f"\n🧹 Removed {len(doomed)} overlapping/nested box(es)")
        for bid in doomed:
            for table in (bubble_boxes, bubble_indices, bubble_quads, bubbles):
                table.pop(bid, None)
    return bubbles, bubble_boxes, bubble_quads, bubble_indices
def enforce_max_box_size(bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr,
                         max_width_ratio=0.6, max_height_ratio=0.5, image_shape=None):
    """Split boxes that exceed a fraction of the page width or height.

    Without *image_shape* the inputs are returned untouched. Otherwise a
    box wider than ``max_width_ratio * image width`` or taller than
    ``max_height_ratio * image height`` is split by columns (aggressive
    thresholds) or, failing that, by rows; when neither splitter returns
    groups the box is kept. Returns ``(bubbles, boxes, quads, indices)``
    renumbered from 1.
    """
    if image_shape is None:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices

    ih, iw = image_shape[:2]
    max_width = iw * max_width_ratio
    max_height = ih * max_height_ratio

    new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {}
    splits_made = []
    next_bid = 1

    for bid, box in bubble_boxes.items():
        x1, y1, x2, y2 = box
        w, h = x2 - x1, y2 - y1

        pieces = None
        if w > max_width or h > max_height:
            indices = bubble_indices[bid]
            # Rows are only attempted when the column split finds nothing.
            pieces = (
                split_bubble_if_multiple_columns(indices, ocr, bid=bid,
                                                 use_aggressive_thresholds=True)
                or split_bubble_if_multiple_rows(indices, ocr, bid=bid)
            )

        if pieces:
            for grp in pieces:
                new_bubbles[next_bid] = build_lines_from_indices(grp, ocr)
                new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in grp])
                new_quads[next_bid] = [ocr[i][0] for i in grp]
                new_indices[next_bid] = grp
                next_bid += 1
            splits_made.append(f"BOX#{bid} (oversized: {w}x{h}px)")
            continue

        new_bubbles[next_bid] = bubbles[bid]
        new_boxes[next_bid] = box
        new_quads[next_bid] = bubble_quads[bid]
        new_indices[next_bid] = bubble_indices[bid]
        next_bid += 1

    if splits_made:
        print(f"\n📏 Split {len(splits_made)} oversized box(es):")
        for s in splits_made:
            print(f"{s}")
    return new_bubbles, new_boxes, new_quads, new_indices
def should_merge_groups(group1_indices, group2_indices, ocr, median_height,
                        max_vertical_gap=None):
    """Return True when two quad groups look like parts of one text column.

    The groups' union boxes must have centre x values within
    ``1.8 * median_height`` of each other and a vertical gap of at most
    *max_vertical_gap* (default ``2.5 * median_height``). An empty group
    never merges.
    """
    if max_vertical_gap is None:
        max_vertical_gap = median_height * 2.5

    first = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in group1_indices])
    second = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in group2_indices])
    if first is None or second is None:
        return False

    first_cx = (first[0] + first[2]) / 2.0
    second_cx = (second[0] + second[2]) / 2.0
    if abs(first_cx - second_cx) > median_height * 1.8:
        return False

    lower_top = max(first[1], second[1])
    upper_bottom = min(first[3], second[3])
    return max(0, lower_top - upper_bottom) <= max_vertical_gap
# ============================================================
# ENHANCED OCR ENGINE
# ============================================================
class ImprovedMacVisionDetector:
def __init__(self, source_lang="en"):
lang_key = source_lang.lower().strip()
lang_map = {
"en": "en-US", "english": "en-US",
"es": "es-ES", "spanish": "es-ES",
"ca": "ca-ES", "catalan": "ca-ES",
"fr": "fr-FR", "french": "fr-FR",
"ja": "ja-JP", "japanese": "ja-JP",
"it": "it-IT", "italian": "it-IT",
"de": "de-DE", "german": "de-DE",
"ko": "ko-KR", "korean": "ko-KR",
"zh": "zh-Hans", "chinese": "zh-Hans"
}
self.langs = [lang_map.get(lang_key, "en-US")]
print(f"⚡ Using Enhanced Apple Vision OCR (Language: {self.langs[0]})")
def preprocess_variants(self, image_bgr):
variants = [("enhanced", enhance_image_for_ocr(image_bgr, upscale_factor=2.5))]
gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
_, hc = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
variants.append(("high_contrast",
cv2.cvtColor(cv2.resize(hc, None, fx=2.5, fy=2.5,
interpolation=cv2.INTER_CUBIC),
cv2.COLOR_GRAY2BGR)))
variants.append(("bilateral",
cv2.resize(cv2.bilateralFilter(image_bgr, 9, 75, 75),
None, fx=2.5, fy=2.5, interpolation=cv2.INTER_CUBIC)))
variants.append(("inverted",
cv2.resize(cv2.bitwise_not(image_bgr),
None, fx=2.5, fy=2.5, interpolation=cv2.INTER_CUBIC)))
variants.append(("original",
cv2.resize(image_bgr, None, fx=2.5, fy=2.5,
interpolation=cv2.INTER_CUBIC)))
return variants
def run_vision_ocr(self, image_bgr):
if image_bgr is None or image_bgr.size == 0:
return []
ih, iw = image_bgr.shape[:2]
success, buffer = cv2.imencode('.png', image_bgr)
if not success:
return []
ns_data = NSData.dataWithBytes_length_(buffer.tobytes(), len(buffer.tobytes()))
handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(ns_data, None)
results = []
def completion_handler(request, error):
if error: return
for obs in request.results():
candidate = obs.topCandidates_(1)[0]
text, conf = candidate.string(), candidate.confidence()
bbox = obs.boundingBox()
x = bbox.origin.x * iw
y_bl = bbox.origin.y * ih
w = bbox.size.width * iw
h = bbox.size.height * ih
y = ih - y_bl - h
quad = [[int(x),int(y)],[int(x+w),int(y)],
[int(x+w),int(y+h)],[int(x),int(y+h)]]
results.append((quad, text, conf))
req = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(completion_handler)
req.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate)
req.setUsesLanguageCorrection_(False)
req.setRecognitionLanguages_(self.langs)
req.setAutomaticallyDetectsLanguage_(True)
handler.performRequests_error_([req], None)
return results
def merge_multi_pass_results(self, all_results, original_shape):
    """Fuse detections from several OCR passes into one result list.

    Args:
        all_results: Iterable of ``(variant_name, results)`` pairs, where
            ``results`` is a list of ``(quad, text, conf)`` tuples.
        original_shape: Accepted for interface compatibility; not read here.

    Returns:
        A list of ``(quad, text, conf)`` tuples, one per cluster of
        overlapping detections. The quad and confidence come from the most
        confident member; the text is the confidence-weighted vote over the
        members' normalised texts, passed through ``fix_common_ocr_errors``.
    """
    if not all_results:
        return []

    # Quads are divided by a fixed 2.5 before merging.
    downscale = 2.5
    detections = []
    for variant_name, hits in all_results:
        for quad, text, conf in hits:
            shrunk = [[int(pt[0] / downscale), int(pt[1] / downscale)] for pt in quad]
            detections.append((shrunk, text, conf, variant_name))

    def iou_above(quad_a, quad_b, threshold=0.5):
        """Return True when the bounding boxes' IoU exceeds *threshold*."""
        box_a, box_b = quad_bbox(quad_a), quad_bbox(quad_b)
        left, top = max(box_a[0], box_b[0]), max(box_a[1], box_b[1])
        right, bottom = min(box_a[2], box_b[2]), min(box_a[3], box_b[3])
        if right < left or bottom < top:
            return False
        inter = (right - left) * (bottom - top)
        union = ((box_a[2] - box_a[0]) * (box_a[3] - box_a[1]) +
                 (box_b[2] - box_b[0]) * (box_b[3] - box_b[1]) - inter)
        return inter / max(union, 1) > threshold

    # Greedy clustering: each unclaimed detection seeds a cluster and claims
    # every later unclaimed detection that overlaps the seed. Earlier
    # indices are always claimed already, so scanning forward is enough.
    clusters = []
    claimed = set()
    total = len(detections)
    for seed_idx in range(total):
        if seed_idx in claimed:
            continue
        claimed.add(seed_idx)
        seed = detections[seed_idx]
        members = [seed]
        for other_idx in range(seed_idx + 1, total):
            if other_idx in claimed:
                continue
            other = detections[other_idx]
            if iou_above(seed[0], other[0]):
                members.append(other)
                claimed.add(other_idx)
        clusters.append(members)

    merged = []
    for members in clusters:
        ranked = sorted(members, key=lambda m: m[2], reverse=True)
        top_quad, top_text, top_conf, _ = ranked[0]
        tally = {}
        for _, text, conf, _ in ranked:
            key = normalize_text(text)
            if key:
                tally[key] = tally.get(key, 0) + conf
        if tally:
            winner = max(tally, key=tally.get)
            if winner != normalize_text(top_text):
                top_text = winner
        merged.append((top_quad, fix_common_ocr_errors(top_text), top_conf))
    return merged
def read(self, image_path_or_array):
    """OCR an image with every preprocessing variant and merge the passes.

    Args:
        image_path_or_array: A file path (loaded with ``cv2.imread``) or an
            already-decoded image array.

    Returns:
        The merged ``(quad, text, conf)`` list from
        ``merge_multi_pass_results``; ``[]`` when the image is missing or
        empty.
    """
    if isinstance(image_path_or_array, str):
        img = cv2.imread(image_path_or_array)
    else:
        img = image_path_or_array
    if img is None or img.size == 0:
        return []

    passes = []
    for variant_name, variant_img in self.preprocess_variants(img):
        hits = self.run_vision_ocr(variant_img)
        # Passes that found nothing are dropped before merging.
        if hits:
            passes.append((variant_name, hits))
    return self.merge_multi_pass_results(passes, img.shape)
class MacVisionDetector:
    """Single-pass Apple Vision OCR wrapper.

    ``read`` PNG-encodes the image, runs one ``VNRecognizeTextRequest``
    (accurate level, language correction enabled) and returns the
    recognised text with pixel-space bounding quads.
    """

    # Accepted language codes / names mapped to Vision language tags.
    _LANG_MAP = {
        "en": "en-US", "english": "en-US",
        "es": "es-ES", "spanish": "es-ES",
        "ca": "ca-ES", "catalan": "ca-ES",
        "fr": "fr-FR", "french": "fr-FR",
        "ja": "ja-JP", "japanese": "ja-JP",
        "it": "it-IT", "italian": "it-IT",
        "de": "de-DE", "german": "de-DE",
        "ko": "ko-KR", "korean": "ko-KR",
        "zh": "zh-Hans", "chinese": "zh-Hans"
    }

    def __init__(self, source_lang="en"):
        """Resolve *source_lang* to a Vision language tag.

        The lookup is case-insensitive and ignores surrounding whitespace;
        unknown values fall back to ``"en-US"``.
        """
        lang_key = source_lang.lower().strip()
        self.langs = [self._LANG_MAP.get(lang_key, "en-US")]
        print(f"⚡ Using Apple Vision OCR (Language: {self.langs[0]})")

    def read(self, image_path_or_array):
        """Recognise text in an image.

        Args:
            image_path_or_array: A file path (loaded with ``cv2.imread``) or
                an already-decoded image array.

        Returns:
            A list of ``(quad, text, confidence)`` tuples; ``quad`` holds
            four ``[x, y]`` integer corners (top-left, top-right,
            bottom-right, bottom-left). Returns ``[]`` when the image is
            missing, empty, or cannot be PNG-encoded.
        """
        img = cv2.imread(image_path_or_array) if isinstance(image_path_or_array, str) \
            else image_path_or_array
        if img is None or img.size == 0:
            return []
        ih, iw = img.shape[:2]
        success, buffer = cv2.imencode('.png', img)
        if not success:
            return []
        # Serialise the PNG once; calling tobytes() twice copies it twice.
        png_bytes = buffer.tobytes()
        ns_data = NSData.dataWithBytes_length_(png_bytes, len(png_bytes))
        handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(ns_data, None)
        results = []

        def completion_handler(request, error):
            if error:
                return
            # results() may be None when nothing was recognised.
            for obs in request.results() or []:
                candidates = obs.topCandidates_(1)
                # Skip observations without a candidate rather than raising
                # IndexError inside the callback.
                if candidates is None or len(candidates) == 0:
                    continue
                candidate = candidates[0]
                text, conf = candidate.string(), candidate.confidence()
                bbox = obs.boundingBox()
                # Scale to pixels and flip vertically: bbox.origin is
                # treated as a bottom-left origin.
                x = bbox.origin.x * iw
                y_bl = bbox.origin.y * ih
                w = bbox.size.width * iw
                h = bbox.size.height * ih
                y = ih - y_bl - h
                quad = [[int(x), int(y)], [int(x + w), int(y)],
                        [int(x + w), int(y + h)], [int(x), int(y + h)]]
                results.append((quad, text, conf))

        req = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(completion_handler)
        req.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate)
        req.setUsesLanguageCorrection_(True)
        req.setRecognitionLanguages_(self.langs)
        req.setAutomaticallyDetectsLanguage_(True)
        handler.performRequests_error_([req], None)
        return results
# ============================================================
# COLUMN / ROW SPLITTING
# ============================================================
def split_bubble_if_multiple_columns(indices, ocr, bid=None,
                                     use_aggressive_thresholds=False):
    """Split a group of OCR items at its widest horizontal centre gap.

    Args:
        indices: Indices into ``ocr`` that make up the group.
        ocr: Sequence whose items carry a quad at position 0.
        bid: Accepted for interface compatibility; not read here.
        use_aggressive_thresholds: Use the lower gap threshold
            (``max(1.2 * median height, 18)`` instead of
            ``max(1.5 * median height, 22)``).

    Returns:
        ``(left_indices, right_indices)`` or ``None`` when no gap between
        consecutive x-centres exceeds the threshold.
    """
    if len(indices) < 2:
        return None

    bboxes = [quad_bbox(ocr[k][0]) for k in indices]
    heights = [max(1, bb[3] - bb[1]) for bb in bboxes]
    median_h = float(np.median(heights)) if heights else 12.0
    centers_x = [(bb[0] + bb[2]) / 2.0 for bb in bboxes]
    ordered = sorted(centers_x)

    if use_aggressive_thresholds:
        min_gap = max(median_h * 1.2, 18)
    else:
        min_gap = max(median_h * 1.5, 22)

    widest, widest_at = 0.0, None
    for pos, (lo, hi) in enumerate(zip(ordered, ordered[1:])):
        gap = hi - lo
        if gap > min_gap and gap > widest:
            widest, widest_at = gap, pos
    if widest_at is None:
        return None

    split_x = (ordered[widest_at] + ordered[widest_at + 1]) / 2.0
    left_idxs, right_idxs = [], []
    for k, cx in zip(indices, centers_x):
        if cx < split_x:
            left_idxs.append(k)
        else:
            right_idxs.append(k)
    if not left_idxs or not right_idxs:
        return None
    return (left_idxs, right_idxs)
def split_bubble_if_multiple_rows(indices, ocr, bid=None):
    """Split a group of OCR items at its widest vertical centre gap.

    Args:
        indices: Indices into ``ocr`` that make up the group.
        ocr: Sequence whose items carry a quad at position 0.
        bid: Accepted for interface compatibility; not read here.

    Returns:
        ``(top_indices, bottom_indices)`` or ``None`` when no gap between
        consecutive y-centres exceeds ``max(2 * median height, 30)``.
    """
    if len(indices) < 2:
        return None

    bboxes = [quad_bbox(ocr[k][0]) for k in indices]
    heights = [max(1, bb[3] - bb[1]) for bb in bboxes]
    median_h = float(np.median(heights)) if heights else 12.0
    centers_y = [(bb[1] + bb[3]) / 2.0 for bb in bboxes]
    ordered = sorted(centers_y)
    min_gap = max(median_h * 2.0, 30)

    widest, widest_at = 0.0, None
    for pos, (lo, hi) in enumerate(zip(ordered, ordered[1:])):
        gap = hi - lo
        if gap > min_gap and gap > widest:
            widest, widest_at = gap, pos
    if widest_at is None:
        return None

    split_y = (ordered[widest_at] + ordered[widest_at + 1]) / 2.0
    top_idxs, bot_idxs = [], []
    for k, cy in zip(indices, centers_y):
        if cy < split_y:
            top_idxs.append(k)
        else:
            bot_idxs.append(k)
    if not top_idxs or not bot_idxs:
        return None
    return (top_idxs, bot_idxs)
def split_cluster_by_big_vertical_gap(indices, ocr, factor=1.9, min_gap=22):
    """Split a cluster at the largest edge-to-edge vertical gap.

    Items are ordered by vertical centre; the gap between neighbours is the
    next item's top edge minus the current item's bottom edge.

    Args:
        indices: Indices into ``ocr`` that make up the cluster.
        ocr: Sequence whose items carry a quad at position 0.
        factor: Multiplier on the median item height for the gap threshold.
        min_gap: Lower bound of the gap threshold.

    Returns:
        ``(top_indices, bottom_indices)`` in vertical order, or ``None``
        when no gap exceeds ``max(median height * factor, min_gap)``.
    """
    if len(indices) < 2:
        return None

    entries = [(k, quad_bbox(ocr[k][0])) for k in indices]
    heights = [max(1, bb[3] - bb[1]) for _, bb in entries]
    median_h = float(np.median(heights)) if heights else 12.0
    entries.sort(key=lambda e: (e[1][1] + e[1][3]) / 2.0)
    threshold = max(median_h * factor, min_gap)

    largest, cut_after = 0.0, None
    for pos, (upper, lower) in enumerate(zip(entries, entries[1:])):
        gap = lower[1][1] - upper[1][3]
        if gap > threshold and gap > largest:
            largest, cut_after = gap, pos
    if cut_after is None:
        return None

    top_idxs = [k for k, _ in entries[:cut_after + 1]]
    bot_idxs = [k for k, _ in entries[cut_after + 1:]]
    if not top_idxs or not bot_idxs:
        return None
    return (top_idxs, bot_idxs)
def is_vertical_text_like(indices, ocr):
    """Report whether a group of OCR items looks like a vertical stack.

    True only when the median item height is at least 1.2x the median item
    width and the spread of y-centres is at least 1.5x the spread of
    x-centres. Groups with fewer than two items are never vertical.
    """
    if len(indices) < 2:
        return False

    bboxes = [quad_bbox(ocr[k][0]) for k in indices]
    median_h = float(np.median([max(1, bb[3] - bb[1]) for bb in bboxes]))
    median_w = float(np.median([max(1, bb[2] - bb[0]) for bb in bboxes]))
    if median_h < median_w * 1.2:
        return False

    centers_x = [(bb[0] + bb[2]) / 2.0 for bb in bboxes]
    centers_y = [(bb[1] + bb[3]) / 2.0 for bb in bboxes]
    x_spread = max(centers_x) - min(centers_x)
    y_spread = max(centers_y) - min(centers_y)
    return not (y_spread < x_spread * 1.5)
def split_nested_or_side_by_side(indices, ocr):
    """Split a group into left and right halves around its median x-centre.

    The split line is the midpoint between the two middle x-centres of the
    sorted centre list. Items with a centre strictly left of the line go
    left; all others go right.

    Returns:
        ``(left_indices, right_indices)`` or ``None`` when either side
        would be empty or the group has fewer than two items.
    """
    if len(indices) < 2:
        return None

    centers_x = []
    for k in indices:
        bb = quad_bbox(ocr[k][0])
        centers_x.append((bb[0] + bb[2]) / 2.0)

    ordered = sorted(centers_x)
    half = len(ordered) // 2
    split_x = (ordered[half - 1] + ordered[half]) / 2.0

    left_idxs = [k for k, cx in zip(indices, centers_x) if cx < split_x]
    right_idxs = [k for k, cx in zip(indices, centers_x) if cx >= split_x]
    if not left_idxs or not right_idxs:
        return None
    return (left_idxs, right_idxs)
def split_panel_box(image_bgr, box_xyxy, bubble_quads=None):
    """Look for a vertical edge line near the middle of a box.

    The box is clamped to the image, edges are detected with Canny, and the
    edge response is summed per column. Within the central 35%-65% of the
    box width, columns at or above the 85th percentile are candidates; the
    split position is their median x.

    Args:
        image_bgr: Image array; ``shape[:2]`` is read as ``(height, width)``.
        box_xyxy: ``(x1, y1, x2, y2)`` box in image coordinates.
        bubble_quads: Optional quads; when given, the split is rejected
            unless at least one quad centre lies on each side.

    Returns:
        ``(x1, x2, split_x)`` using the clamped box edges, or ``None`` when
        the box is degenerate, narrower than 100 px, or the split leaves
        one side without quads.
    """
    x1, y1, x2, y2 = box_xyxy
    ih, iw = image_bgr.shape[:2]
    x1, y1 = max(0, x1), max(0, y1)
    x2, y2 = min(iw - 1, x2), min(ih - 1, y2)
    if x2 <= x1 or y2 <= y1:
        return None

    # Reject narrow boxes before any image processing; the original ran
    # cvtColor and Canny first and only then discarded them.
    w = x2 - x1
    if w < 100:
        return None
    search_start = int(w * 0.35)
    search_end = int(w * 0.65)
    if search_end <= search_start:
        return None

    crop = image_bgr[y1:y2, x1:x2]
    if crop.size == 0:
        return None
    gray = cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 50, 150)
    # Summing over axis 0 gives one edge total per column (x position).
    h_proj = np.sum(edges, axis=0)

    region = h_proj[search_start:search_end]
    if len(region) == 0:
        return None
    threshold = np.percentile(region, 85)
    # NOTE(review): with no edges in the region the threshold is 0 and every
    # column qualifies, so the split lands mid-box - confirm this is intended.
    candidates = [x1 + search_start + rx
                  for rx, value in enumerate(region) if value >= threshold]
    if not candidates:
        return None
    split_x = int(np.median(candidates))

    if bubble_quads:
        lc = sum(1 for q in bubble_quads if quad_center(q)[0] < split_x)
        rc = len(bubble_quads) - lc
        if lc == 0 or rc == 0:
            return None
    return (x1, x2, split_x)
# ============================================================
# MERGE CLOSE BUBBLES
# ============================================================
def merge_close_bubbles_by_line_height(bubbles, bubble_boxes, bubble_quads,
                                       bubble_indices, ocr):
    """
    Merge bubble boxes that sit within a line-height tolerance of each other
    on both axes and also overlap horizontally.

    Two boxes merge when the x-gap and y-gap are both at most
    ``max(8, 1.4 * median OCR item height)`` and the horizontal overlap
    covers at least 25% of the narrower box. Each box, visited in ascending
    id order, absorbs every later qualifying box, compared against the
    absorbing box's own original extent.

    Returns the four dicts ``(bubbles, boxes, quads, indices)``. If nothing
    merges the inputs are returned unchanged; otherwise the result is
    renumbered from 1 in ascending order of the surviving ids, and merged
    entries are rebuilt from the union of their OCR indices.
    """
    if not bubbles:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices

    heights = []
    for entry in ocr:
        bb = quad_bbox(entry[0])
        heights.append(max(1, bb[3] - bb[1]))
    median_h = float(np.median(heights)) if heights else 14.0
    tolerance = max(8, median_h * 1.4)

    ordered_ids = sorted(bubble_boxes.keys())
    absorbed = set()
    groups = {}
    for pos, anchor in enumerate(ordered_ids):
        if anchor in absorbed:
            continue
        ax1, ay1, ax2, ay2 = bubble_boxes[anchor]
        anchor_w = max(1, ax2 - ax1)
        for other in ordered_ids[pos + 1:]:
            if other in absorbed:
                continue
            ox1, oy1, ox2, oy2 = bubble_boxes[other]
            other_w = max(1, ox2 - ox1)
            inner_left, inner_right = max(ax1, ox1), min(ax2, ox2)
            gap_x = max(0, inner_left - inner_right)
            gap_y = max(0, max(ay1, oy1) - min(ay2, oy2))
            overlap = max(0, inner_right - inner_left)
            overlap_ratio = overlap / max(1, min(anchor_w, other_w))
            if gap_x > tolerance or gap_y > tolerance or overlap_ratio < 0.25:
                continue
            groups.setdefault(anchor, [anchor]).append(other)
            absorbed.add(other)

    if not groups:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices

    out_bubbles, out_boxes, out_quads, out_indices = {}, {}, {}, {}
    survivors = [b for b in ordered_ids if b not in absorbed]
    for new_id, old_id in enumerate(survivors, start=1):
        if old_id in groups:
            pooled = sorted({idx for member in groups[old_id]
                             for idx in bubble_indices[member]})
            out_bubbles[new_id] = build_lines_from_indices(pooled, ocr)
            out_boxes[new_id] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in pooled])
            out_quads[new_id] = [ocr[i][0] for i in pooled]
            out_indices[new_id] = pooled
        else:
            out_bubbles[new_id] = bubbles[old_id]
            out_boxes[new_id] = bubble_boxes[old_id]
            out_quads[new_id] = bubble_quads[old_id]
            out_indices[new_id] = bubble_indices[old_id]
    return out_bubbles, out_boxes, out_quads, out_indices
# ============================================================
# WIDE / BRIDGE QUAD SPLITTING
# ============================================================
def split_wide_ocr_items(image_bgr, ocr_list, width_factor=8.0):
    """Split OCR items that are much wider than the median line height.

    For each item wider than ``median height * width_factor`` the quad's
    crop is binarised (inverted Otsu) and ink is summed per column. A run of
    columns below 15% of the maximum possible column sum counts as a gap if
    it is at least ``max(int(0.6 * median height), 12)`` columns wide and is
    followed by an inked column. The item is cut at the centre of the
    widest gap, and the text is cut at the nearest space (or proportionally
    if it has none).

    Args:
        image_bgr: Source image the quads refer to.
        ocr_list: List of ``(quad, text, conf)`` tuples.
        width_factor: Width-to-median-height ratio above which an item is
            examined.

    Returns:
        ``(items, splits_made)``. Items that are not split are passed
        through unchanged; a split is applied only when both text halves
        are non-empty.
    """
    if not ocr_list:
        return ocr_list, 0

    heights = []
    for q, _, _ in ocr_list:
        qb = quad_bbox(q)
        heights.append(max(1, qb[3] - qb[1]))
    median_h = float(np.median(heights)) if heights else 14.0
    min_gap_width = max(int(median_h * 0.6), 12)

    kept, split_count = [], 0
    for quad, text, conf in ocr_list:
        x1, y1, x2, y2 = quad_bbox(quad)
        width = x2 - x1
        halves = None

        if width > median_h * width_factor:
            margin = 2
            row_span = slice(max(0, y1 - margin), min(image_bgr.shape[0], y2 + margin))
            col_span = slice(max(0, x1), min(image_bgr.shape[1], x2))
            roi = image_bgr[row_span, col_span]
            if roi.size > 0:
                gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
                _, ink = cv2.threshold(gray, 0, 255,
                                       cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
                column_ink = np.sum(ink, axis=0)
                blank_below = roi.shape[0] * 255 * 0.15

                # Collect (centre, width) of blank runs closed by an inked column.
                runs, run_start = [], None
                for col, amount in enumerate(column_ink):
                    if amount < blank_below:
                        if run_start is None:
                            run_start = col
                    elif run_start is not None:
                        run_width = col - run_start
                        if run_width >= min_gap_width:
                            runs.append((run_start + run_width // 2, run_width))
                        run_start = None

                if runs:
                    widest_center = max(runs, key=lambda r: r[1])[0]
                    split_x_abs = max(0, x1) + widest_center
                    if ' ' in text:
                        char_w = width / max(1, len(text))
                        estimate = int((split_x_abs - x1) / max(1e-6, char_w))
                        cut = estimate
                        space_positions = [pos for pos, ch in enumerate(text) if ch == ' ']
                        if space_positions:
                            cut = min(space_positions, key=lambda pos: abs(pos - estimate))
                    else:
                        cut = int(len(text) * (split_x_abs - x1) / width)
                    left_text, right_text = text[:cut].strip(), text[cut:].strip()
                    if left_text and right_text:
                        halves = [
                            ([[x1, y1], [split_x_abs, y1], [split_x_abs, y2], [x1, y2]],
                             left_text, conf),
                            ([[split_x_abs, y1], [x2, y1], [x2, y2], [split_x_abs, y2]],
                             right_text, conf),
                        ]

        if halves is None:
            kept.append((quad, text, conf))
        else:
            kept.extend(halves)
            split_count += 1
    return kept, split_count
def split_abnormal_bridge_quads(image_bgr, ocr_list, aspect_ratio_threshold=6.0):
    """Split OCR items whose width/height ratio is abnormally large.

    For each item with ``width / height > aspect_ratio_threshold`` the
    quad's crop is binarised (inverted Otsu) and ink is summed per column.
    A run of columns below ``height * 255 * 0.20`` counts as a gap if it is
    at least ``max(int(0.8 * median height), 15)`` columns wide and is
    followed by an inked column. The item is cut at the centre of the
    widest gap, and the text is cut at the nearest space (or proportionally
    if it has none).

    Args:
        image_bgr: Source image the quads refer to.
        ocr_list: List of ``(quad, text, conf)`` tuples.
        aspect_ratio_threshold: Width/height ratio above which an item is
            examined.

    Returns:
        ``(items, splits_made)``. Items that are not split are passed
        through unchanged; a split is applied only when both text halves
        are non-empty.
    """
    if not ocr_list:
        return ocr_list, 0

    heights = []
    for q, _, _ in ocr_list:
        qb = quad_bbox(q)
        heights.append(max(1, qb[3] - qb[1]))
    median_h = float(np.median(heights)) if heights else 14.0
    min_gap_width = max(int(median_h * 0.8), 15)

    output, n_splits = [], 0
    for quad, text, conf in ocr_list:
        x1, y1, x2, y2 = quad_bbox(quad)
        width, height = x2 - x1, max(1, y2 - y1)
        pieces = None

        if width / height > aspect_ratio_threshold:
            margin = 2
            roi = image_bgr[max(0, y1 - margin):min(image_bgr.shape[0], y2 + margin),
                            max(0, x1):min(image_bgr.shape[1], x2)]
            if roi.size > 0:
                gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
                _, ink = cv2.threshold(gray, 0, 255,
                                       cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
                column_ink = np.sum(ink, axis=0)
                # The threshold uses the unpadded quad height, not the crop height.
                blank_below = height * 255 * 0.20

                found, open_at = [], None
                for col, amount in enumerate(column_ink):
                    if amount < blank_below:
                        if open_at is None:
                            open_at = col
                    elif open_at is not None:
                        span = col - open_at
                        if span >= min_gap_width:
                            found.append((open_at + span // 2, span))
                        open_at = None

                if found:
                    best_center = max(found, key=lambda g: g[1])[0]
                    split_x_abs = max(0, x1) + best_center
                    if ' ' in text:
                        char_w = width / max(1, len(text))
                        estimate = int((split_x_abs - x1) / max(1e-6, char_w))
                        cut = estimate
                        space_positions = [pos for pos, ch in enumerate(text) if ch == ' ']
                        if space_positions:
                            cut = min(space_positions, key=lambda pos: abs(pos - estimate))
                    else:
                        cut = int(len(text) * (split_x_abs - x1) / width)
                    head, tail = text[:cut].strip(), text[cut:].strip()
                    if head and tail:
                        pieces = [
                            ([[x1, y1], [split_x_abs, y1], [split_x_abs, y2], [x1, y2]],
                             head, conf),
                            ([[split_x_abs, y1], [x2, y1], [x2, y2], [split_x_abs, y2]],
                             tail, conf),
                        ]

        if pieces is None:
            output.append((quad, text, conf))
        else:
            output.extend(pieces)
            n_splits += 1
    return output, n_splits
def normalize_ocr_quads(ocr_list, pad=3):
    """Replace every quad with its axis-aligned bounding box grown by *pad*.

    Args:
        ocr_list: Iterable of ``(quad, text, conf)`` tuples.
        pad: Pixels added on every side of the bounding box. Defaults to 3,
            the value that was previously hard-coded.

    Returns:
        A new list of ``(quad, text, conf)`` tuples whose quads list the
        corners as top-left, top-right, bottom-right, bottom-left. The
        result is not clamped, so coordinates may fall below zero.
    """
    result = []
    for quad, text, conf in ocr_list:
        x1, y1, x2, y2 = quad_bbox(quad)
        left, top = x1 - pad, y1 - pad
        right, bottom = x2 + pad, y2 + pad
        result.append(([[left, top], [right, top], [right, bottom], [left, bottom]],
                       text, conf))
    return result
# ============================================================
# VISION RE-READ
# ============================================================
def preprocess_variant(crop_bgr, mode):
    """Return a single-channel preprocessing variant of a BGR crop.

    Supported modes: ``"raw"`` (grayscale), ``"clahe"``, ``"adaptive"``
    (Gaussian blur + adaptive threshold), ``"otsu"`` (Gaussian blur + Otsu),
    ``"invert"``, ``"bilateral"`` (bilateral filter + Otsu) and
    ``"morph_open"`` (Otsu + 2x2 morphological opening). Any other mode
    yields the plain grayscale image.
    """
    gray = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2GRAY)
    otsu_flags = cv2.THRESH_BINARY + cv2.THRESH_OTSU

    if mode == "clahe":
        equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        return equalizer.apply(gray)

    if mode == "adaptive":
        blurred = cv2.GaussianBlur(gray, (3, 3), 0)
        return cv2.adaptiveThreshold(blurred, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                     cv2.THRESH_BINARY, 35, 11)

    if mode == "invert":
        return 255 - gray

    if mode in ("otsu", "bilateral", "morph_open"):
        # The three Otsu-based modes differ only in the filter applied
        # before thresholding and in the clean-up applied afterwards.
        if mode == "otsu":
            source = cv2.GaussianBlur(gray, (3, 3), 0)
        elif mode == "bilateral":
            source = cv2.bilateralFilter(gray, 7, 60, 60)
        else:
            source = gray
        _, binary = cv2.threshold(source, 0, 255, otsu_flags)
        if mode == "morph_open":
            return cv2.morphologyEx(binary, cv2.MORPH_OPEN, np.ones((2, 2), np.uint8))
        return binary

    # "raw" and unrecognised modes.
    return gray
def rotate_image_keep_bounds(img, angle_deg):
    """Rotate an image about its centre, enlarging the canvas to fit.

    Args:
        img: Image array; ``shape[:2]`` is read as ``(height, width)``.
        angle_deg: Rotation angle in degrees, passed to
            ``cv2.getRotationMatrix2D``.

    Returns:
        The rotated image on a canvas sized to the rotated bounding box,
        with the uncovered border filled white.
    """
    h, w = img.shape[:2]
    center = (w / 2, h / 2)
    matrix = cv2.getRotationMatrix2D(center, angle_deg, 1.0)
    abs_cos, abs_sin = abs(matrix[0, 0]), abs(matrix[0, 1])
    new_w = int((h * abs_sin) + (w * abs_cos))
    new_h = int((h * abs_cos) + (w * abs_sin))
    # Shift the transform so the rotated content is centred on the new canvas.
    matrix[0, 2] += (new_w / 2) - center[0]
    matrix[1, 2] += (new_h / 2) - center[1]
    # A bare 255 becomes Scalar(255, 0, 0, 0), which paints a blue border on
    # 3-channel BGR input; give every channel explicitly so it is white.
    return cv2.warpAffine(img, matrix, (new_w, new_h),
                          flags=cv2.INTER_CUBIC, borderValue=(255, 255, 255))
def rebuild_text_from_vision_result(res):
    """Reassemble OCR fragments into one normalised string in reading order.

    Fragments with blank text are dropped. The rest are grouped into rows
    by vertical centre, with tolerance ``max(6, 0.75 * median height)``
    against each row's running mean. Rows are ordered top to bottom and
    fragments within a row left to right.

    Args:
        res: Iterable of ``(quad, text, conf)`` tuples, or a falsy value.

    Returns:
        The joined, normalised text; ``""`` when nothing usable remains.
    """
    if not res:
        return ""

    fragments = []
    for quad, text, conf in res:
        if not text or not text.strip():
            continue
        bb = quad_bbox(quad)
        x_center = (bb[0] + bb[2]) / 2.0
        y_center = (bb[1] + bb[3]) / 2.0
        height = max(1.0, bb[3] - bb[1])
        fragments.append((bb, text, conf, x_center, y_center, height))
    if not fragments:
        return ""

    median_h = float(np.median([frag[5] for frag in fragments]))
    tolerance = max(6.0, median_h * 0.75)
    fragments.sort(key=lambda frag: frag[4])

    rows = []
    for frag in fragments:
        for row in rows:
            if abs(frag[4] - row["yc"]) <= tolerance:
                row["m"].append(frag)
                row["yc"] = float(np.mean([member[4] for member in row["m"]]))
                break
        else:
            # No existing row is close enough: start a new one.
            rows.append({"yc": frag[4], "m": [frag]})
    rows.sort(key=lambda row: row["yc"])

    row_texts = []
    for row in rows:
        left_to_right = sorted(row["m"], key=lambda frag: frag[3])
        row_texts.append(normalize_text(" ".join(frag[1] for frag in left_to_right)))
    return normalize_text(" ".join(filter(None, row_texts)))
def reread_bubble_with_vision(image_bgr, bbox_xyxy, vision_detector,
                              upscale=3.0, pad=24):
    """Re-OCR one bubble region under several preprocessing modes and tilts.

    The padded box is cropped, upscaled, and run through seven
    ``preprocess_variant`` modes at three small rotation angles. The
    candidate with the highest ``ocr_candidate_score`` wins.

    Args:
        image_bgr: Full page image.
        bbox_xyxy: ``(x1, y1, x2, y2)`` of the bubble.
        vision_detector: Object exposing ``run_vision_ocr`` (preferred when
            present) or ``read``.
        upscale: Resize factor applied to the crop.
        pad: Pixels added around the box before cropping.

    Returns:
        ``(text, score, "vision-reread")`` for the best candidate, or
        ``(None, 0.0, "none")`` when the crop is empty or no candidate
        scored above zero with non-empty text.
    """
    ih, iw = image_bgr.shape[:2]
    bx1, by1, bx2, by2 = bbox_xyxy
    left, top = max(0, int(bx1 - pad)), max(0, int(by1 - pad))
    right, bottom = min(iw, int(bx2 + pad)), min(ih, int(by2 + pad))
    crop = image_bgr[top:bottom, left:right]
    if crop.size == 0:
        return None, 0.0, "none"

    # Pick the OCR entry point once; it does not change between passes.
    if hasattr(vision_detector, 'run_vision_ocr'):
        run_ocr = vision_detector.run_vision_ocr
    else:
        run_ocr = vision_detector.read

    target_size = (int(crop.shape[1] * upscale), int(crop.shape[0] * upscale))
    enlarged = cv2.resize(crop, target_size, interpolation=cv2.INTER_CUBIC)

    top_text, top_score = "", 0.0
    for mode in ("raw", "clahe", "adaptive", "otsu", "invert", "bilateral", "morph_open"):
        processed = preprocess_variant(enlarged, mode)
        if len(processed.shape) == 2:
            processed = cv2.cvtColor(processed, cv2.COLOR_GRAY2BGR)
        for angle in (0.0, 1.5, -1.5):
            rotated = rotate_image_keep_bounds(processed, angle)
            candidate = rebuild_text_from_vision_result(run_ocr(rotated))
            score = ocr_candidate_score(candidate)
            if score > top_score:
                top_text, top_score = candidate, score

    if not top_text:
        return None, 0.0, "none"
    return top_text, top_score, "vision-reread"
# ============================================================
# LINES + BUBBLES
# ============================================================
def build_lines_from_indices(indices, ocr):
    """Turn a set of OCR items into text lines ordered top to bottom.

    Items are grouped into rows by vertical centre, with tolerance
    ``max(6, 0.75 * median height)`` against each row's running mean. Each
    row's texts are joined left to right and normalised.

    Args:
        indices: Indices into ``ocr``.
        ocr: Sequence of items with a quad at position 0 and text at 1.

    Returns:
        A list with one normalised string per row; ``[]`` for no indices.
    """
    if not indices:
        return []

    tokens = []
    for k in indices:
        bb = quad_bbox(ocr[k][0])
        tokens.append((k,
                       bb,
                       (bb[0] + bb[2]) / 2.0,
                       (bb[1] + bb[3]) / 2.0,
                       max(1.0, bb[3] - bb[1])))

    median_h = float(np.median([tok[4] for tok in tokens])) if tokens else 10.0
    tolerance = max(6.0, median_h * 0.75)
    tokens.sort(key=lambda tok: tok[3])

    rows = []
    for tok in tokens:
        for row in rows:
            if abs(tok[3] - row["yc"]) <= tolerance:
                row["m"].append(tok)
                row["yc"] = float(np.mean([member[3] for member in row["m"]]))
                break
        else:
            # No existing row is close enough: start a new one.
            rows.append({"yc": tok[3], "m": [tok]})
    rows.sort(key=lambda row: row["yc"])

    lines = []
    for row in rows:
        if not row["m"]:
            continue
        left_to_right = sorted(row["m"], key=lambda tok: tok[2])
        lines.append(normalize_text(" ".join(ocr[tok[0]][1] for tok in left_to_right)))
    return lines
def auto_gap(image_path, base=18, ref_w=750):
    """Scale the base gap by the image width relative to *ref_w*.

    Returns ``base`` unchanged when the image cannot be loaded.
    """
    page = cv2.imread(image_path)
    if page is None:
        return base
    return base * (page.shape[1] / ref_w)
def group_tokens_vertical(ocr, image_shape, gap_px=18, bbox_padding=1,
                          strict_mode=False):
    """Group OCR items into bubbles by chaining them downward.

    Passes, in order:
      1. Greedy chaining: items are visited by (centre y, centre x); each
         unused item seeds a group and absorbs lower items that pass the
         horizontal-offset, horizontal-gap, orientation and vertical-gap
         checks.
      2. Secondary merge: groups accepted by ``should_merge_groups`` whose
         members are all pairwise ``orientation_compatible`` are fused.
      3. Horizontal split: groups for which
         ``detect_horizontal_gap_in_group`` reports a gap are split in two.

    Args:
        ocr: Sequence of items with a quad at position 0.
        image_shape: Image shape; ``image_shape[:2]`` is read as
            ``(height, width)`` to clamp the boxes.
        gap_px: Not referenced in the body.
        bbox_padding: Not referenced in the body.
        strict_mode: Use ``2.0 * median height`` instead of ``2.5 *`` as
            the maximum vertical gap.

    Returns:
        Four dicts keyed by bubble id (numbered from 1):
        ``(bubbles, bubble_boxes, bubble_quads, bubble_indices)`` holding
        the text lines, padded and clamped ``(x1, y1, x2, y2)`` box, quads,
        and OCR indices of each bubble. All four are empty for empty input.
    """
    n = len(ocr)
    if n == 0: return {}, {}, {}, {}
    boxes = [quad_bbox(r[0]) for r in ocr]
    centers = [quad_center(r[0]) for r in ocr]
    hs = [max(1.0, b[3]-b[1]) for b in boxes]
    med_h = float(np.median(hs)) if hs else 12.0
    # All tolerances below are expressed relative to the median item height.
    max_vertical_gap = med_h * 2.5 if not strict_mode else med_h * 2.0
    max_horizontal_offset = med_h * 1.8
    # Visit items top to bottom, breaking ties left to right.
    sorted_indices = sorted(range(n), key=lambda i: (centers[i][1], centers[i][0]))
    groups, used = [], set()
    for i in sorted_indices:
        if i in used: continue
        current_group = [i]
        used.add(i)
        # Running x reference for the chain; updated as members are added.
        cx_i = centers[i][0]
        for j in sorted_indices:
            if j in used or j == i: continue
            cx_j, cy_j = centers[j]
            # Only items strictly below the seed can join its chain.
            if cy_j <= centers[i][1]: continue
            if abs(cx_i - cx_j) > max_horizontal_offset: continue
            # Horizontal gap guard
            gap_x = max(0, max(boxes[i][0], boxes[j][0]) - min(boxes[i][2], boxes[j][2]))
            if gap_x > med_h * 1.5: continue
            # Orientation compatibility guard
            if not orientation_compatible(i, j, ocr): continue
            # Gap is measured from the bottom of the most recently added member.
            vertical_gap = boxes[j][1] - boxes[current_group[-1]][3]
            if vertical_gap <= max_vertical_gap:
                current_group.append(j)
                used.add(j)
                # Move the reference halfway toward the new member's centre.
                cx_i = (cx_i + cx_j) / 2.0
        if current_group:
            groups.append(current_group)
    # Secondary merge pass
    merged_groups, used_groups = [], set()
    for i, group1 in enumerate(groups):
        if i in used_groups: continue
        merged = list(group1)
        used_groups.add(i)
        for j, group2 in enumerate(groups):
            if i == j or j in used_groups: continue
            if should_merge_groups(merged, group2, ocr, med_h, max_vertical_gap):
                # Every cross pair must be orientation-compatible to merge.
                compat = all(orientation_compatible(a, b, ocr)
                             for a in merged for b in group2)
                if compat:
                    merged.extend(group2)
                    used_groups.add(j)
        merged_groups.append(sorted(merged, key=lambda idx: centers[idx][1]))
    # Horizontal gap split pass
    final_groups = []
    for group in merged_groups:
        h_split = detect_horizontal_gap_in_group(group, ocr, med_h, gap_factor=2.5)
        if h_split:
            lg, rg = h_split
            final_groups.append(sorted(lg, key=lambda idx: centers[idx][1]))
            final_groups.append(sorted(rg, key=lambda idx: centers[idx][1]))
        else:
            final_groups.append(group)
    # Order bubbles by their topmost, then leftmost, member centre.
    final_groups.sort(key=lambda g: (min(centers[i][1] for i in g),
                                     min(centers[i][0] for i in g)))
    bubbles, bubble_boxes, bubble_quads, bubble_indices = {}, {}, {}, {}
    ih, iw = image_shape[:2]
    for bid, idxs in enumerate(final_groups, start=1):
        lines = build_lines_from_indices(idxs, ocr)
        quads = [ocr[k][0] for k in idxs]
        ub = boxes_union_xyxy([quad_bbox(q) for q in quads])
        # A skipped group leaves a hole in the id sequence.
        if ub is None: continue
        x1, y1, x2, y2 = ub
        # Box padding scales with the median height (at least 1 px).
        ap = max(1, int(round(med_h * 0.16)))
        bubbles[bid] = lines
        bubble_boxes[bid] = (max(0,x1-ap), max(0,y1-ap),
                             min(iw-1,x2+ap), min(ih-1,y2+ap))
        bubble_quads[bid] = quads
        bubble_indices[bid] = idxs
    return bubbles, bubble_boxes, bubble_quads, bubble_indices
# ============================================================
# SPLIT HELPER — centralises all split strategies
# ============================================================
def _split_bubble_if_needed(bid, bubble_indices, bubble_quads, bubble_boxes,
                            filtered, image, iw, ih):
    """
    Attempts all split strategies in priority order and stops at the first
    one that yields a split.
    Returns ((part1_indices, part2_indices), reason_str) or (None, None).
    BOX#18 fix: split_cluster_by_big_vertical_gap factor lowered to 1.4
    so the gap between the top speech bubble and the bottom cluster triggers.

    Args:
        bid: Id of the bubble to examine.
        bubble_indices: Dict mapping bubble id to its indices into ``filtered``.
        bubble_quads: Dict mapping bubble id to its quads.
        bubble_boxes: Dict mapping bubble id to its (x1, y1, x2, y2) box.
        filtered: OCR items with a quad at position 0.
        image: Page image passed to ``split_panel_box``.
        iw, ih: Not referenced in the body.
    """
    indices = bubble_indices[bid]
    box = bubble_boxes[bid]
    # 1. Vertical-stack gap (sensitive — catches top-vs-bottom cluster)
    if is_vertical_text_like(indices, filtered):
        vgap = split_cluster_by_big_vertical_gap(indices, filtered,
                                                 factor=1.4, min_gap=18)
        if vgap:
            return vgap, "vertical-stack y-gap"
    # 2. Panel border
    sr = split_panel_box(image, box, bubble_quads=bubble_quads[bid])
    if sr:
        _, _, split_x = sr
        li = [idx for idx in indices if quad_center(filtered[idx][0])[0] < split_x]
        ri = [idx for idx in indices if quad_center(filtered[idx][0])[0] >= split_x]
        if li and ri:
            return (li, ri), "panel border"
    # NOTE(review): the elif is read as belonging to `if sr` (no panel border
    # found); confirm against the original indentation.
    elif len(bubble_quads[bid]) >= 4:
        cs = split_bubble_if_multiple_columns(indices, filtered, bid=bid,
                                              use_aggressive_thresholds=True)
        if cs:
            return cs, "aggressive column"
    # 3. Column gap
    cs = split_bubble_if_multiple_columns(indices, filtered, bid=bid)
    if cs:
        return cs, "vertical column"
    # 4. Nested / side-by-side
    ns = split_nested_or_side_by_side(indices, filtered)
    if ns:
        return ns, "nested/side-by-side"
    # 5. Row split
    rs = split_bubble_if_multiple_rows(indices, filtered, bid=bid)
    if rs:
        return rs, "horizontal row"
    # 6. Large vertical gap (general, less sensitive)
    gy = split_cluster_by_big_vertical_gap(indices, filtered, factor=1.9, min_gap=22)
    if gy:
        return gy, "large vertical-gap"
    return None, None
# ============================================================
# DEBUG / EXPORT
# ============================================================
def save_debug_clusters(image_path, ocr, bubble_boxes, bubble_indices,
                        clean_lines=None, out_path="debug_clusters.png"):
    """
    Writes a debug image showing every OCR quad and every bubble box.

    OCR quads are filled white with a grey outline. Bubble boxes are drawn
    in green; single-quad boxes are drawn thicker and in orange for
    visibility but are NOT labelled as (ISOLATED) — they participate fully
    in merge passes. When ``clean_lines`` has text for a box, it is wrapped
    at roughly 25 characters and drawn below the box.

    Args:
        image_path: Page image to draw on; nothing is written if it cannot
            be loaded.
        ocr: Iterable of (quad, text, conf) tuples.
        bubble_boxes: Dict mapping bubble id to (x1, y1, x2, y2).
        bubble_indices: Dict mapping bubble id to its OCR indices.
        clean_lines: Optional dict mapping bubble id to its text.
        out_path: Destination file for the debug image.
    """
    img = cv2.imread(image_path)
    if img is None:
        return

    # OpenCV colours are (B, G, R). The previous (255, 165, 0) rendered as
    # light blue rather than the orange described above.
    orange_bgr = (0, 165, 255)
    green_bgr = (0, 220, 0)
    font = cv2.FONT_HERSHEY_SIMPLEX

    for bbox, _txt, _conf in ocr:
        pts = np.array(bbox, dtype=np.int32)
        cv2.fillPoly(img, [pts], (255, 255, 255))
        cv2.polylines(img, [pts], True, (180, 180, 180), 1)

    for bid, bb in bubble_boxes.items():
        x1, y1, x2, y2 = bb
        single_quad = len(bubble_indices.get(bid, [])) == 1
        color = orange_bgr if single_quad else green_bgr
        thickness = 3 if single_quad else 2
        cv2.rectangle(img, (x1, y1), (x2, y2), color, thickness)
        cv2.putText(img, f"BOX#{bid}", (x1 + 2, max(15, y1 + 16)),
                    font, 0.5, color, 2)

        if clean_lines and bid in clean_lines:
            # Greedy word wrap at about 25 characters per line.
            wrapped, current = [], ""
            for word in clean_lines[bid].split():
                if len(current) + len(word) < 25:
                    current += word + " "
                else:
                    # Do not emit an empty line when the first word is
                    # itself longer than the limit.
                    if current:
                        wrapped.append(current.strip())
                    current = word + " "
            if current:
                wrapped.append(current.strip())

            y_text = y2 + 18
            for line in wrapped:
                # Thick dark pass first, then a thin coloured pass on top.
                cv2.putText(img, line, (x1, y_text), font, 0.5, (0, 0, 0), 3)
                cv2.putText(img, line, (x1, y_text), font, 0.5, (255, 0, 0), 1)
                y_text += 18

    cv2.imwrite(out_path, img)
def estimate_reading_order(bbox_dict, mode="ltr", row_tol=90):
    """Assign a 1-based reading order to boxes, row by row.

    Boxes are grouped into rows by vertical centre: a box joins the first
    existing row whose running mean centre is within ``row_tol`` pixels,
    otherwise it starts a new row. Rows are read top to bottom; within a
    row boxes are ordered by horizontal centre.

    Args:
        bbox_dict: Dict mapping box id to ``(x1, y1, x2, y2)``.
        mode: ``"rtl"`` reads each row right to left; any other value reads
            left to right.
        row_tol: Vertical tolerance in pixels for two boxes to share a row.
            Defaults to 90, the value that was previously hard-coded.

    Returns:
        Dict mapping box id to its position in reading order, starting at 1.
    """
    centers = [(bid, (bb[0] + bb[2]) / 2.0, (bb[1] + bb[3]) / 2.0)
               for bid, bb in bbox_dict.items()]
    centers.sort(key=lambda item: item[2])

    rows = []
    for item in centers:
        for row in rows:
            if abs(item[2] - row["cy"]) <= row_tol:
                row["items"].append(item)
                row["cy"] = float(np.mean([member[2] for member in row["items"]]))
                break
        else:
            # No existing row is close enough: start a new one.
            rows.append({"cy": item[2], "items": [item]})
    rows.sort(key=lambda row: row["cy"])

    right_to_left = mode == "rtl"
    order = []
    for row in rows:
        row["items"].sort(key=lambda member: member[1], reverse=right_to_left)
        order.extend(member[0] for member in row["items"])
    return {bid: rank for rank, bid in enumerate(order, start=1)}
# ============================================================
# MAIN PIPELINE
# ============================================================
def translate_manga_text(
    image_path="001-page.png",
    source_lang="en",
    target_lang="ca",
    confidence_threshold=0.03,
    min_text_length=1,
    gap_px="auto",
    quality_threshold=0.62,
    export_to_file="output.txt",
    export_bubbles_to="bubbles.json",
    reading_mode="ltr",
    debug=True,
    use_enhanced_ocr=True,
    strict_grouping=True,
    max_box_width_ratio=0.6,
    max_box_height_ratio=0.5,
    auto_fix_bubbles=True
):
    """Run the full OCR -> grouping -> translation pipeline on one page.

    Stages, in order: OCR detection (plus a missed-region rescan when
    ``use_enhanced_ocr`` is set), filtering, pre-grouping quad splits,
    vertical grouping, optional auto-fix, max-box-size enforcement,
    close-proximity merge, a per-bubble split pass, nested-box removal, an
    OCR quality pass that re-reads low-scoring bubbles, translation, and
    export to a pipe-delimited text file and a JSON file.

    Args:
        image_path: Path of the page image.
        source_lang: Source language; passed to the detector, the language
            and meaningfulness filters, and ``GoogleTranslator``.
        target_lang: Target language passed to ``GoogleTranslator``.
        confidence_threshold: Detections with lower confidence are skipped.
        min_text_length: Detections with shorter normalised text are skipped.
        gap_px: ``"auto"`` to derive the gap with ``auto_gap``; otherwise a
            value converted with ``float``.
        quality_threshold: Bubbles whose ``ocr_candidate_score`` is below
            this are re-read with ``reread_bubble_with_vision``.
        export_to_file: Path of the text report.
        export_bubbles_to: Path of the JSON export.
        reading_mode: Passed to ``estimate_reading_order`` as ``mode``.
        debug: When true, writes ``debug_clusters.png``.
        use_enhanced_ocr: Use ``ImprovedMacVisionDetector`` and the
            missed-region rescan instead of ``MacVisionDetector``.
        strict_grouping: Passed to ``group_tokens_vertical`` as
            ``strict_mode``.
        max_box_width_ratio: Passed to ``enforce_max_box_size``.
        max_box_height_ratio: Passed to ``enforce_max_box_size``.
        auto_fix_bubbles: Run ``auto_fix_bubble_detection`` after grouping.

    Returns:
        None. Returns early (after printing a message) when the image
        cannot be loaded or no text survives filtering.
    """
    image = cv2.imread(image_path)
    if image is None:
        print(f"❌ Cannot load image: {image_path}"); return
    resolved_gap = auto_gap(image_path) if gap_px == "auto" else float(gap_px)
    ih, iw = image.shape[:2]
    print("Loading OCR engines...")
    if use_enhanced_ocr:
        detector = ImprovedMacVisionDetector(source_lang=source_lang)
        print("🚀 Using Enhanced Multi-Pass OCR")
    else:
        detector = MacVisionDetector(source_lang=source_lang)
    print("Running detection OCR (Apple Vision)...")
    raw = detector.read(image_path)
    print(f"Raw detections: {len(raw)}")
    if use_enhanced_ocr:
        # Rescan regions the first pass may have missed, at 4x magnification.
        existing_quads = [r[0] for r in raw]
        missed_regions = detect_small_text_regions(image, existing_quads)
        if missed_regions:
            print(f"🔍 Found {len(missed_regions)} potentially missed text regions")
            for region in missed_regions:
                rx1, ry1, rx2, ry2 = region
                pad = 10
                rx1, ry1 = max(0, rx1-pad), max(0, ry1-pad)
                rx2, ry2 = min(iw, rx2+pad), min(ih, ry2+pad)
                crop = image[ry1:ry2, rx1:rx2]
                if crop.size > 0:
                    upscaled = cv2.resize(crop, None, fx=4.0, fy=4.0,
                                          interpolation=cv2.INTER_CUBIC)
                    for quad, text, conf in detector.run_vision_ocr(upscaled):
                        # Map crop coordinates back to page coordinates.
                        raw.append(([[int(p[0]/4.0+rx1), int(p[1]/4.0+ry1)]
                                     for p in quad], text, conf))
            print(f"📝 Total detections after missed region scan: {len(raw)}")
    # ── Filtering ─────────────────────────────────────────────────────────
    filtered, skipped = [], 0
    for bbox, text, conf in raw:
        t = normalize_text(text)
        qb = quad_bbox(bbox)
        if conf < confidence_threshold: skipped += 1; continue
        if len(t) < min_text_length: skipped += 1; continue
        if not is_valid_language(t, source_lang): skipped += 1; continue
        if not is_meaningful_text(t, source_lang): skipped += 1; continue
        # Drop low-confidence, longer text that starts in the top band of the page.
        if qb[1] < int(ih * TOP_BAND_RATIO) and conf < 0.70 and len(t) >= 5:
            skipped += 1; continue
        filtered.append((bbox, t, conf))
    print(f"Kept: {len(filtered)} | Skipped: {skipped}")
    if not filtered:
        print("⚠️ No text after filtering."); return
    # ── Pre-grouping quad splits ──────────────────────────────────────────
    filtered, oversized_splits = validate_and_split_oversized_quads(image, filtered)
    if oversized_splits > 0:
        print(f"📐 Split {oversized_splits} oversized quad(s) before grouping")
    filtered, wide_splits = split_wide_ocr_items(image, filtered)
    if wide_splits > 0:
        print(f"✂️ Split {wide_splits} wide OCR lines across column gaps.")
    filtered, bridge_splits = split_abnormal_bridge_quads(image, filtered)
    if bridge_splits > 0:
        print(f"🧩 Split {bridge_splits} abnormal bridge OCR quad(s).")
    # Column-gap split: catches wide quads spanning two columns (BOX#6 type)
    hs_pre = [max(1, quad_bbox(q)[3]-quad_bbox(q)[1]) for q, _, _ in filtered]
    med_h_pre = float(np.median(hs_pre)) if hs_pre else 14.0
    filtered, col_splits = apply_column_gap_splits(image, filtered, med_h_pre)
    filtered = normalize_ocr_quads(filtered)
    # ── Grouping ──────────────────────────────────────────────────────────
    print("📊 Grouping quads vertically...")
    bubbles, bubble_boxes, bubble_quads, bubble_indices = group_tokens_vertical(
        filtered, image.shape, gap_px=resolved_gap,
        bbox_padding=1, strict_mode=strict_grouping)
    print(f" Created {len(bubbles)} initial box(es)")
    # ── Auto-fix (split + merge) ──────────────────────────────────────────
    if auto_fix_bubbles:
        bubbles, bubble_boxes, bubble_quads, bubble_indices = auto_fix_bubble_detection(
            bubble_boxes, bubble_indices, bubble_quads, bubbles, filtered, image)
    # ── Enforce max box size ──────────────────────────────────────────────
    bubbles, bubble_boxes, bubble_quads, bubble_indices = enforce_max_box_size(
        bubble_boxes, bubble_indices, bubble_quads, bubbles, filtered,
        max_width_ratio=max_box_width_ratio,
        max_height_ratio=max_box_height_ratio,
        image_shape=image.shape)
    # ── Close-proximity merge ─────────────────────────────────────────────
    bubbles, bubble_boxes, bubble_quads, bubble_indices = merge_close_bubbles_by_line_height(
        bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered)
    # ── Per-bubble split pass ─────────────────────────────────────────────
    new_bubbles, new_bubble_boxes, new_bubble_quads, new_bubble_indices = {}, {}, {}, {}
    # New ids for split-off halves start above the highest existing id.
    next_bid = max(bubbles.keys()) + 1 if bubbles else 1
    splits_performed = []
    for bid in list(bubbles.keys()):
        split_result, split_reason = _split_bubble_if_needed(
            bid, bubble_indices, bubble_quads, bubble_boxes, filtered, image, iw, ih)
        if split_result:
            p1, p2 = split_result
            splits_performed.append(f"BOX#{bid} ({split_reason})")
            # The first part keeps the original id; the second gets a fresh one.
            for part_idxs, part_bid in [(p1, bid), (p2, next_bid)]:
                ub = boxes_union_xyxy([quad_bbox(filtered[i][0]) for i in part_idxs])
                new_bubbles[part_bid] = build_lines_from_indices(part_idxs, filtered)
                new_bubble_boxes[part_bid] = (max(0,ub[0]-2), max(0,ub[1]-2),
                                              min(iw-1,ub[2]+2), min(ih-1,ub[3]+2))
                new_bubble_quads[part_bid] = [filtered[i][0] for i in part_idxs]
                new_bubble_indices[part_bid] = part_idxs
            next_bid += 1
        else:
            new_bubbles[bid] = bubbles[bid]
            new_bubble_boxes[bid] = bubble_boxes[bid]
            new_bubble_quads[bid] = bubble_quads[bid]
            new_bubble_indices[bid] = bubble_indices[bid]
    if splits_performed:
        print(f"\n🔀 Splits detected: {len(splits_performed)}")
        for s in splits_performed: print(f"{s}")
    # ── Remove nested / duplicate boxes ──────────────────────────────────
    bubbles, bubble_boxes, bubble_quads, bubble_indices = remove_nested_boxes(
        new_bubble_boxes, new_bubble_indices, new_bubble_quads, new_bubbles,
        overlap_threshold=0.50)
    print(f"✅ Final box count: {len(bubbles)}")
    # ── OCR quality pass ──────────────────────────────────────────────────
    translator = GoogleTranslator(source=source_lang, target=target_lang)
    clean_lines: Dict[int, str] = {}
    sources_used: Dict[int, str] = {}
    translations: Dict[int, str] = {}
    for bid, lines in bubbles.items():
        base_txt = normalize_text(" ".join(lines))
        base_sc = ocr_candidate_score(base_txt)
        txt, src_used = base_txt, "vision-base"
        if base_sc < quality_threshold:
            rr_txt, rr_sc, rr_src = reread_bubble_with_vision(
                image, bubble_boxes[bid], detector, upscale=3.0, pad=24)
            # Accept the re-read only if it scores clearly better and is
            # still valid text for the source language.
            if rr_txt and rr_sc > base_sc + 0.04 and is_valid_language(rr_txt, source_lang):
                txt, src_used = rr_txt, rr_src
        clean_lines[bid] = normalize_text(txt)
        sources_used[bid] = src_used
    reading_map = estimate_reading_order(bubble_boxes, mode=reading_mode)
    # ── Translation ───────────────────────────────────────────────────────
    for bid in sorted(clean_lines.keys(), key=lambda x: reading_map.get(x, x)):
        src_txt = clean_lines[bid].strip()
        if not src_txt: continue
        if not is_valid_language(src_txt, source_lang): continue
        if not is_meaningful_text(src_txt, source_lang): continue
        try:
            tgt = translator.translate(src_txt) or ""
            tgt = postprocess_translation_general(tgt).upper()
        except Exception as e:
            # A failed translation is recorded as an inline error marker.
            tgt = f"[Error: {e}]"
        translations[bid] = tgt
    if debug:
        save_debug_clusters(image_path, filtered, bubble_boxes, bubble_indices,
                            clean_lines, "debug_clusters.png")
    # ── Text output ───────────────────────────────────────────────────────
    # NOTE(review): "" * 120 evaluates to an empty string; a divider
    # character appears to be missing from the literal - confirm.
    divider = "" * 120
    out_lines = ["BUBBLE|ORDER|OCR_SOURCE|ORIGINAL|TRANSLATED|FLAGS", divider]
    print(divider + f"\n{'BUBBLE':<8} {'ORDER':<6} {'SOURCE':<12} "
          f"{'ORIGINAL':<40} {'TRANSLATED':<40} FLAGS\n" + divider)
    translated_count = 0
    for bid in sorted(clean_lines.keys(), key=lambda x: reading_map.get(x, x)):
        src_txt = clean_lines[bid].strip()
        if not src_txt: continue
        if not is_valid_language(src_txt, source_lang): continue
        if not is_meaningful_text(src_txt, source_lang): continue
        flags = []
        tgt = translations.get(bid, "")
        if not tgt: flags.append("NO_TRANSLATION")
        src_u = src_txt.upper()
        src_engine = sources_used.get(bid, "unknown")
        out_lines.append(
            f"#{bid}|{reading_map.get(bid,bid)}|{src_engine}|{src_u}|{tgt}|"
            f"{','.join(flags) if flags else '-'}")
        print(f"#{bid:<7} {reading_map.get(bid,bid):<6} {src_engine:<12} "
              f"{src_u[:40]:<40} {tgt[:40]:<40} {','.join(flags) if flags else '-'}")
        # Counts every reported bubble, including those flagged NO_TRANSLATION.
        translated_count += 1
    out_lines.append(divider + f"\n✅ Done! {translated_count} bubble(s) translated.")
    with open(export_to_file, "w", encoding="utf-8") as f:
        f.write("\n".join(out_lines))
    # ── bubbles.json ──────────────────────────────────────────────────────
    bubbles_payload = {}
    for bid in sorted(clean_lines.keys(), key=lambda x: reading_map.get(x, x)):
        src_txt = clean_lines[bid].strip()
        if not src_txt: continue
        if not is_valid_language(src_txt, source_lang): continue
        if not is_meaningful_text(src_txt, source_lang): continue
        box = bubble_boxes.get(bid)
        tgt = translations.get(bid, "")
        bubbles_payload[str(bid)] = {
            "order": reading_map.get(bid, bid),
            "ocr_source": sources_used.get(bid, "unknown"),
            "original": src_txt.upper(),
            "translated": tgt,
            # The box is exported as x/y/width/height rather than x1/y1/x2/y2.
            "box": {
                "x": box[0] if box else 0,
                "y": box[1] if box else 0,
                "w": (box[2]-box[0]) if box else 0,
                "h": (box[3]-box[1]) if box else 0,
            },
            "lines": [line.upper() for line in bubbles.get(bid, [])],
        }
    with open(export_bubbles_to, "w", encoding="utf-8") as f:
        json.dump(bubbles_payload, f, ensure_ascii=False, indent=2)
    print(divider + f"\nSaved: {export_to_file}\nSaved: {export_bubbles_to}")
# ============================================================
# ENTRY POINT
# ============================================================
if __name__ == "__main__":
    # Script entry point: translate page "19.png" from English to Catalan,
    # reading right to left, with the enhanced multi-pass OCR, and write
    # output.txt and bubbles.json.
    # NOTE(review): this call runs before the definitions that follow it in
    # the file are executed.
    translate_manga_text(
        image_path="19.png",
        source_lang="english",
        target_lang="ca",
        confidence_threshold=0.03,
        min_text_length=1,
        gap_px="auto",
        quality_threshold=0.62,
        export_to_file="output.txt",
        export_bubbles_to="bubbles.json",
        reading_mode="rtl",
        debug=True,
        use_enhanced_ocr=True,
        strict_grouping=True,
        max_box_width_ratio=0.6,
        max_box_height_ratio=0.5,
        auto_fix_bubbles=True
    )
def split_bubble_if_multiple_rows(indices, ocr, bid=None):
    """
    Split a group of OCR items into a top and a bottom part at the widest
    gap between vertical centres.

    A gap only qualifies when it exceeds max(2 x median box height, 30).
    ``bid`` is accepted for call-site symmetry and is not used here.

    Returns (top_indices, bottom_indices), or None when there are fewer than
    two items, no qualifying gap, or one side would be empty.
    """
    if len(indices) < 2:
        return None

    rects = [quad_bbox(ocr[idx][0]) for idx in indices]
    heights = [max(1, r[3] - r[1]) for r in rects]
    median_height = float(np.median(heights)) if heights else 12.0
    centres_y = [(r[1] + r[3]) / 2.0 for r in rects]
    ordered = sorted(centres_y)
    min_gap = max(median_height * 2.0, 30)

    # Find the widest qualifying gap between consecutive sorted centres.
    widest_pos, widest = None, 0.0
    for pos, (upper, lower) in enumerate(zip(ordered, ordered[1:])):
        distance = lower - upper
        if distance > min_gap and distance > widest:
            widest, widest_pos = distance, pos
    if widest_pos is None:
        return None

    cut_y = (ordered[widest_pos] + ordered[widest_pos + 1]) / 2.0
    above, below = [], []
    for idx, cy in zip(indices, centres_y):
        (above if cy < cut_y else below).append(idx)

    if above and below:
        return (above, below)
    return None
def split_cluster_by_big_vertical_gap(indices, ocr, factor=1.9, min_gap=22):
    """
    Split a cluster at the largest empty vertical band between consecutive
    boxes, ordered by vertical centre.

    The band is measured from the bottom edge of one box to the top edge of
    the next and must exceed max(median box height * factor, min_gap).

    Returns (top_indices, bottom_indices) in centre-sorted order, or None
    when nothing qualifies.
    """
    if len(indices) < 2:
        return None

    rects = [quad_bbox(ocr[idx][0]) for idx in indices]
    heights = [max(1, r[3] - r[1]) for r in rects]
    median_height = float(np.median(heights)) if heights else 12.0

    stacked = list(zip(indices, rects))
    stacked.sort(key=lambda pair: (pair[1][1] + pair[1][3]) / 2.0)

    threshold = max(median_height * factor, min_gap)
    widest, cut_after = 0.0, None
    for pos, (current, following) in enumerate(zip(stacked, stacked[1:])):
        band = following[1][1] - current[1][3]
        if band > threshold and band > widest:
            widest, cut_after = band, pos
    if cut_after is None:
        return None

    upper = [idx for idx, _ in stacked[:cut_after + 1]]
    lower = [idx for idx, _ in stacked[cut_after + 1:]]
    if upper and lower:
        return (upper, lower)
    return None
def is_vertical_text_like(indices, ocr):
    """
    Heuristic test for a vertically stacked group of OCR boxes.

    True only when there are at least two boxes, the median box height is at
    least 1.2 x the median width, and the spread of centres along y is at
    least 1.5 x their spread along x.
    """
    if len(indices) < 2:
        return False

    rects = [quad_bbox(ocr[idx][0]) for idx in indices]
    median_h = float(np.median([max(1, r[3] - r[1]) for r in rects]))
    median_w = float(np.median([max(1, r[2] - r[0]) for r in rects]))
    if median_h < median_w * 1.2:
        return False

    centre_xs = [(r[0] + r[2]) / 2.0 for r in rects]
    centre_ys = [(r[1] + r[3]) / 2.0 for r in rects]
    x_spread = max(centre_xs) - min(centre_xs)
    y_spread = max(centre_ys) - min(centre_ys)
    return not (y_spread < x_spread * 1.5)
def split_nested_or_side_by_side(indices, ocr):
    """
    Partition a group into left and right halves around the midpoint of the
    two middle horizontal centres.

    Returns (left_indices, right_indices), or None when there are fewer than
    two items or one side would be empty.
    """
    if len(indices) < 2:
        return None

    centre_x = []
    for idx in indices:
        rect = quad_bbox(ocr[idx][0])
        centre_x.append((rect[0] + rect[2]) / 2.0)

    ordered = sorted(centre_x)
    middle = len(ordered) // 2
    cut_x = (ordered[middle - 1] + ordered[middle]) / 2.0

    left_side, right_side = [], []
    for idx, cx in zip(indices, centre_x):
        if cx < cut_x:
            left_side.append(idx)
        else:
            right_side.append(idx)

    if left_side and right_side:
        return (left_side, right_side)
    return None
def split_panel_box(image_bgr, box_xyxy, bubble_quads=None):
    """
    Look for a vertical edge line (e.g. a panel border) in the middle of a box.

    The box is clamped to the image, Canny edges are summed per column, and
    the columns in the central 35%-65% band whose edge energy reaches the
    band's 85th percentile are taken as border candidates. The split
    position is the median candidate column.

    Args:
        image_bgr: BGR image array.
        box_xyxy: (x1, y1, x2, y2) box in image coordinates.
        bubble_quads: optional quads of the box; when given, the split is
            rejected unless quad centres fall on both sides of it.

    Returns:
        (x1, x2, split_x) using the clamped x-range, or None when the box is
        degenerate, narrower than 100 px, has no edge evidence in the search
        band, or would leave one side without quads.
    """
    x1, y1, x2, y2 = box_xyxy
    ih, iw = image_bgr.shape[:2]
    x1, y1 = max(0, x1), max(0, y1)
    x2, y2 = min(iw - 1, x2), min(ih - 1, y2)
    if x2 <= x1 or y2 <= y1:
        return None

    # Reject narrow boxes before doing any image processing.
    w = x2 - x1
    if w < 100:
        return None
    search_start = int(w * 0.35)
    search_end = int(w * 0.65)
    if search_end <= search_start:
        return None

    crop = image_bgr[y1:y2, x1:x2]
    if crop.size == 0:
        return None

    gray = cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 50, 150)
    column_energy = np.sum(edges, axis=0)

    region = column_energy[search_start:search_end]
    if len(region) == 0:
        return None

    threshold = np.percentile(region, 85)
    # When most columns carry no edges the percentile is 0 and ">= threshold"
    # alone would accept every column, yielding a split at the band centre
    # with no evidence behind it. Require actual edge energy as well.
    strong = np.flatnonzero((region >= threshold) & (region > 0))
    if strong.size == 0:
        return None
    split_x = int(np.median(strong + (x1 + search_start)))

    if bubble_quads:
        lc = sum(1 for q in bubble_quads if quad_center(q)[0] < split_x)
        rc = len(bubble_quads) - lc
        if lc == 0 or rc == 0:
            return None
    return (x1, x2, split_x)
# ============================================================
# MERGE CLOSE BUBBLES
# ============================================================
def merge_close_bubbles_by_line_height(bubbles, bubble_boxes, bubble_quads,
                                       bubble_indices, ocr):
    """
    Merge bubble boxes that sit close together on both axes and overlap
    horizontally (i.e. share a column).

    Tolerance is max(8, 1.4 x median OCR box height). Two boxes merge when
    both axis gaps are within tolerance and their horizontal overlap covers
    at least 25% of the narrower box. Single-quad boxes are treated like any
    other box. Each surviving anchor absorbs later ids directly; absorbed
    ids never act as anchors themselves.

    Returns the four mappings (bubbles, boxes, quads, indices). They are the
    inputs unchanged when nothing merges, otherwise new dicts renumbered
    from 1 in ascending order of the surviving original ids.
    """
    untouched = (bubbles, bubble_boxes, bubble_quads, bubble_indices)
    if not bubbles:
        return untouched

    ocr_rects = [quad_bbox(entry[0]) for entry in ocr]
    line_heights = [max(1, r[3] - r[1]) for r in ocr_rects]
    median_height = float(np.median(line_heights)) if line_heights else 14.0
    tolerance = max(8, median_height * 1.4)

    ordered_ids = sorted(bubble_boxes.keys())
    absorbed = set()
    groups = {}
    for pos, anchor in enumerate(ordered_ids):
        if anchor in absorbed:
            continue
        ax1, ay1, ax2, ay2 = bubble_boxes[anchor]
        anchor_w = max(1, ax2 - ax1)
        for other in ordered_ids[pos + 1:]:
            if other in absorbed:
                continue
            bx1, by1, bx2, by2 = bubble_boxes[other]
            other_w = max(1, bx2 - bx1)

            inner_left = max(ax1, bx1)
            inner_right = min(ax2, bx2)
            gap_x = max(0, inner_left - inner_right)
            gap_y = max(0, max(ay1, by1) - min(ay2, by2))
            overlap_ratio = max(0, inner_right - inner_left) / max(1, min(anchor_w, other_w))

            close_enough = gap_x <= tolerance and gap_y <= tolerance
            if close_enough and overlap_ratio >= 0.25:
                groups.setdefault(anchor, [anchor]).append(other)
                absorbed.add(other)

    if not groups:
        return untouched

    out_bubbles, out_boxes, out_quads, out_indices = {}, {}, {}, {}
    survivors = [bid for bid in ordered_ids if bid not in absorbed]
    for new_id, bid in enumerate(survivors, start=1):
        members = groups.get(bid)
        if members is None:
            # Box was not involved in any merge: carry it over as-is.
            out_bubbles[new_id] = bubbles[bid]
            out_boxes[new_id] = bubble_boxes[bid]
            out_quads[new_id] = bubble_quads[bid]
            out_indices[new_id] = bubble_indices[bid]
            continue
        pooled = sorted({idx for member in members for idx in bubble_indices[member]})
        out_bubbles[new_id] = build_lines_from_indices(pooled, ocr)
        out_boxes[new_id] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in pooled])
        out_quads[new_id] = [ocr[i][0] for i in pooled]
        out_indices[new_id] = pooled
    return out_bubbles, out_boxes, out_quads, out_indices
# ============================================================
# WIDE / BRIDGE QUAD SPLITTING
# ============================================================
def split_wide_ocr_items(image_bgr, ocr_list, width_factor=8.0):
    """
    Split OCR items that are much wider than the median line height at the
    widest ink-free vertical band inside them.

    An item is a candidate when its box width exceeds
    median_height * width_factor. Its region is Otsu-binarised (inverted),
    column sums are scanned for runs below 15% of a full column, and the
    widest run of at least max(int(0.6 * median_height), 12) px is the split
    position. The text is cut at the space nearest the proportional split
    position, or proportionally when it has no space.

    Returns (new_ocr_list, number_of_splits). Items that cannot be split are
    passed through unchanged; split items become two axis-aligned quads that
    both keep the original confidence.
    """
    if not ocr_list: return ocr_list, 0
    hs = [max(1, quad_bbox(q)[3]-quad_bbox(q)[1]) for q, _, _ in ocr_list]
    med_h = float(np.median(hs)) if hs else 14.0
    result, splits_made = [], 0
    for quad, text, conf in ocr_list:
        x1, y1, x2, y2 = quad_bbox(quad)
        w = x2 - x1
        if w > med_h * width_factor:
            pad = 2
            # Region of interest: the box, padded vertically and clipped to the image.
            roi = image_bgr[max(0,y1-pad):min(image_bgr.shape[0],y2+pad),
                            max(0,x1):min(image_bgr.shape[1],x2)]
            if roi.size > 0:
                gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
                _, binary = cv2.threshold(gray, 0, 255,
                                          cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
                # Per-column sum of the inverted binary image.
                v_proj = np.sum(binary, axis=0)
                gap_threshold = roi.shape[0] * 255 * 0.15
                gaps, in_gap, gap_start = [], False, 0
                # Scan columns for runs below the threshold. A run is only
                # recorded when a column at or above the threshold closes it,
                # so a run touching the right edge is never recorded.
                for x in range(len(v_proj)):
                    if v_proj[x] < gap_threshold:
                        if not in_gap: gap_start, in_gap = x, True
                    else:
                        if in_gap:
                            gw = x - gap_start
                            if gw >= max(int(med_h * 0.6), 12):
                                gaps.append((gap_start + gw // 2, gw))
                            in_gap = False
                if gaps:
                    # Widest gap first; split at its centre column.
                    gaps.sort(key=lambda g: g[1], reverse=True)
                    split_x_abs = max(0, x1) + gaps[0][0]
                    if ' ' in text:
                        # Estimate the character index from the pixel position,
                        # then snap to the nearest space.
                        char_w = w / max(1, len(text))
                        split_idx = int((split_x_abs - x1) / max(1e-6, char_w))
                        spaces = [i for i, c in enumerate(text) if c == ' ']
                        if spaces:
                            split_idx = min(spaces, key=lambda i: abs(i - split_idx))
                        tl, tr = text[:split_idx].strip(), text[split_idx:].strip()
                    else:
                        split_idx = int(len(text) * (split_x_abs - x1) / w)
                        tl, tr = text[:split_idx].strip(), text[split_idx:].strip()
                    if tl and tr:
                        result.extend([
                            ([[x1,y1],[split_x_abs,y1],[split_x_abs,y2],[x1,y2]], tl, conf),
                            ([[split_x_abs,y1],[x2,y1],[x2,y2],[split_x_abs,y2]], tr, conf)])
                        splits_made += 1
                        continue
        # Not a candidate, or the split left one side empty: keep as-is.
        result.append((quad, text, conf))
    return result, splits_made
def split_abnormal_bridge_quads(image_bgr, ocr_list, aspect_ratio_threshold=6.0):
    """
    Split OCR items whose own width/height ratio exceeds
    aspect_ratio_threshold at the widest ink-free vertical band inside them.

    The procedure mirrors split_wide_ocr_items with different constants: the
    column threshold is 20% of box_height * 255 and a gap must be at least
    max(int(0.8 * median_height), 15) px wide.

    Returns (new_ocr_list, number_of_splits). Items that cannot be split are
    passed through unchanged; split items become two axis-aligned quads that
    both keep the original confidence.
    """
    if not ocr_list: return ocr_list, 0
    hs = [max(1, quad_bbox(q)[3]-quad_bbox(q)[1]) for q, _, _ in ocr_list]
    med_h = float(np.median(hs)) if hs else 14.0
    result, splits_made = [], 0
    for quad, text, conf in ocr_list:
        x1, y1, x2, y2 = quad_bbox(quad)
        w, h = x2 - x1, max(1, y2 - y1)
        if w / h > aspect_ratio_threshold:
            pad = 2
            # Region of interest: the box, padded vertically and clipped to the image.
            roi = image_bgr[max(0,y1-pad):min(image_bgr.shape[0],y2+pad),
                            max(0,x1):min(image_bgr.shape[1],x2)]
            if roi.size > 0:
                gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
                _, binary = cv2.threshold(gray, 0, 255,
                                          cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
                # Per-column sum of the inverted binary image.
                v_proj = np.sum(binary, axis=0)
                # NOTE(review): the threshold uses the unpadded box height h,
                # while the columns are summed over the padded roi — confirm
                # this is intended.
                gap_threshold = h * 255 * 0.20
                gaps, in_gap, gap_start = [], False, 0
                # Scan columns for runs below the threshold. A run is only
                # recorded when a column at or above the threshold closes it,
                # so a run touching the right edge is never recorded.
                for x in range(len(v_proj)):
                    if v_proj[x] < gap_threshold:
                        if not in_gap: gap_start, in_gap = x, True
                    else:
                        if in_gap:
                            gw = x - gap_start
                            if gw >= max(int(med_h * 0.8), 15):
                                gaps.append((gap_start + gw // 2, gw))
                            in_gap = False
                if gaps:
                    # Widest gap first; split at its centre column.
                    gaps.sort(key=lambda g: g[1], reverse=True)
                    split_x_abs = max(0, x1) + gaps[0][0]
                    if ' ' in text:
                        # Estimate the character index from the pixel position,
                        # then snap to the nearest space.
                        char_w = w / max(1, len(text))
                        split_idx = int((split_x_abs - x1) / max(1e-6, char_w))
                        spaces = [i for i, c in enumerate(text) if c == ' ']
                        if spaces:
                            split_idx = min(spaces, key=lambda i: abs(i - split_idx))
                        tl, tr = text[:split_idx].strip(), text[split_idx:].strip()
                    else:
                        split_idx = int(len(text) * (split_x_abs - x1) / w)
                        tl, tr = text[:split_idx].strip(), text[split_idx:].strip()
                    if tl and tr:
                        result.extend([
                            ([[x1,y1],[split_x_abs,y1],[split_x_abs,y2],[x1,y2]], tl, conf),
                            ([[split_x_abs,y1],[x2,y1],[x2,y2],[split_x_abs,y2]], tr, conf)])
                        splits_made += 1
                        continue
        # Not a candidate, or the split left one side empty: keep as-is.
        result.append((quad, text, conf))
    return result, splits_made
def normalize_ocr_quads(ocr_list):
    """
    Replace each OCR quad with its axis-aligned bounding rectangle grown by
    3 px on every side; text and confidence are carried over unchanged.
    """
    margin = 3

    def _padded_rect(quad):
        left, top, right, bottom = quad_bbox(quad)
        return [
            [left - margin, top - margin],
            [right + margin, top - margin],
            [right + margin, bottom + margin],
            [left - margin, bottom + margin],
        ]

    return [(_padded_rect(quad), text, conf) for quad, text, conf in ocr_list]
# ============================================================
# VISION RE-READ
# ============================================================
def preprocess_variant(crop_bgr, mode):
    """
    Return a single-channel preprocessing variant of a BGR crop.

    Supported modes: "raw", "clahe", "adaptive", "otsu", "invert",
    "bilateral", "morph_open". Any other mode yields the plain grayscale
    image, the same as "raw".
    """
    gray = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2GRAY)

    def _otsu_binarise(img):
        _, binarised = cv2.threshold(img, 0, 255,
                                     cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        return binarised

    if mode == "clahe":
        equaliser = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        return equaliser.apply(gray)
    if mode == "adaptive":
        smoothed = cv2.GaussianBlur(gray, (3, 3), 0)
        return cv2.adaptiveThreshold(smoothed, 255,
                                     cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                     cv2.THRESH_BINARY, 35, 11)
    if mode == "otsu":
        return _otsu_binarise(cv2.GaussianBlur(gray, (3, 3), 0))
    if mode == "invert":
        return 255 - gray
    if mode == "bilateral":
        return _otsu_binarise(cv2.bilateralFilter(gray, 7, 60, 60))
    if mode == "morph_open":
        kernel = np.ones((2, 2), np.uint8)
        return cv2.morphologyEx(_otsu_binarise(gray), cv2.MORPH_OPEN, kernel)
    # "raw" and unrecognised modes.
    return gray
def rotate_image_keep_bounds(img, angle_deg):
    """
    Rotate an image about its centre by angle_deg, enlarging the canvas so
    the rotated content still fits. Uncovered border pixels are filled with
    255 and cubic interpolation is used.
    """
    height, width = img.shape[:2]
    centre = (width / 2, height / 2)
    matrix = cv2.getRotationMatrix2D(centre, angle_deg, 1.0)

    abs_cos = abs(matrix[0, 0])
    abs_sin = abs(matrix[0, 1])
    out_w = int((height * abs_sin) + (width * abs_cos))
    out_h = int((height * abs_cos) + (width * abs_sin))

    # Shift the transform so the rotated image is centred in the new canvas.
    matrix[0, 2] += (out_w / 2) - centre[0]
    matrix[1, 2] += (out_h / 2) - centre[1]

    return cv2.warpAffine(img, matrix, (out_w, out_h),
                          flags=cv2.INTER_CUBIC, borderValue=255)
def rebuild_text_from_vision_result(res):
    """
    Assemble one normalised string from (quad, text, confidence) OCR results.

    Blank texts are dropped. The rest are clustered into rows by vertical
    centre (tolerance max(6, 0.75 x median height)), read left to right
    within a row and top to bottom across rows. Returns "" when nothing
    usable is present.
    """
    if not res:
        return ""

    tokens = []
    for quad, text, conf in res:
        if not text or not text.strip():
            continue
        rect = quad_bbox(quad)
        centre_x = (rect[0] + rect[2]) / 2.0
        centre_y = (rect[1] + rect[3]) / 2.0
        height = max(1.0, rect[3] - rect[1])
        tokens.append((rect, text, conf, centre_x, centre_y, height))
    if not tokens:
        return ""

    median_height = float(np.median([tok[5] for tok in tokens]))
    tolerance = max(6.0, median_height * 0.75)

    tokens.sort(key=lambda tok: tok[4])
    rows = []
    for tok in tokens:
        # First existing row within tolerance wins; its centre is re-averaged.
        home = next((row for row in rows
                     if abs(tok[4] - row["yc"]) <= tolerance), None)
        if home is None:
            rows.append({"yc": tok[4], "m": [tok]})
        else:
            home["m"].append(tok)
            home["yc"] = float(np.mean([member[4] for member in home["m"]]))

    rows.sort(key=lambda row: row["yc"])
    row_texts = []
    for row in rows:
        left_to_right = sorted(row["m"], key=lambda tok: tok[3])
        row_texts.append(normalize_text(" ".join(tok[1] for tok in left_to_right)))

    return normalize_text(" ".join(filter(None, row_texts)))
def reread_bubble_with_vision(image_bgr, bbox_xyxy, vision_detector,
                              upscale=3.0, pad=24):
    """
    Re-run OCR on one padded, upscaled bubble crop under several
    preprocessing modes and small rotations, keeping the best-scoring text.

    The detector's run_vision_ocr is used when it exists, otherwise read.
    Returns (text, score, "vision-reread") for the best candidate, or
    (None, 0.0, "none") when the crop is empty or no candidate scored
    above zero.
    """
    img_h, img_w = image_bgr.shape[:2]
    left, top, right, bottom = bbox_xyxy
    left = max(0, int(left - pad))
    top = max(0, int(top - pad))
    right = min(img_w, int(right + pad))
    bottom = min(img_h, int(bottom + pad))

    crop = image_bgr[top:bottom, left:right]
    if crop.size == 0:
        return None, 0.0, "none"

    def _recognise(picture):
        if hasattr(vision_detector, 'run_vision_ocr'):
            return vision_detector.run_vision_ocr(picture)
        return vision_detector.read(picture)

    enlarged = cv2.resize(
        crop,
        (int(crop.shape[1] * upscale), int(crop.shape[0] * upscale)),
        interpolation=cv2.INTER_CUBIC)

    best_text, best_score = "", 0.0
    for mode in ("raw", "clahe", "adaptive", "otsu", "invert",
                 "bilateral", "morph_open"):
        variant = preprocess_variant(enlarged, mode)
        if len(variant.shape) == 2:
            variant = cv2.cvtColor(variant, cv2.COLOR_GRAY2BGR)
        for angle in (0.0, 1.5, -1.5):
            rotated = rotate_image_keep_bounds(variant, angle)
            candidate = rebuild_text_from_vision_result(_recognise(rotated))
            score = ocr_candidate_score(candidate)
            if score > best_score:
                best_text, best_score = candidate, score

    if not best_text:
        return None, 0.0, "none"
    return best_text, best_score, "vision-reread"
# ============================================================
# LINES + BUBBLES
# ============================================================
def build_lines_from_indices(indices, ocr):
    """
    Build normalised text lines from the OCR entries at the given indices.

    Entries are clustered into rows by vertical centre (tolerance
    max(6, 0.75 x median height)); each row's texts are joined left to
    right, and rows are returned top to bottom. Empty input gives [].
    """
    if not indices:
        return []

    tokens = []
    for idx in indices:
        rect = quad_bbox(ocr[idx][0])
        tokens.append((idx,
                       rect,
                       (rect[0] + rect[2]) / 2.0,
                       (rect[1] + rect[3]) / 2.0,
                       max(1.0, rect[3] - rect[1])))

    median_height = float(np.median([tok[4] for tok in tokens])) if tokens else 10.0
    tolerance = max(6.0, median_height * 0.75)

    tokens.sort(key=lambda tok: tok[3])
    rows = []
    for tok in tokens:
        # First existing row within tolerance wins; its centre is re-averaged.
        home = next((row for row in rows
                     if abs(tok[3] - row["yc"]) <= tolerance), None)
        if home is None:
            rows.append({"yc": tok[3], "m": [tok]})
        else:
            home["m"].append(tok)
            home["yc"] = float(np.mean([member[3] for member in home["m"]]))

    rows.sort(key=lambda row: row["yc"])
    lines = []
    for row in rows:
        if not row["m"]:
            continue
        left_to_right = sorted(row["m"], key=lambda tok: tok[2])
        lines.append(normalize_text(" ".join(ocr[tok[0]][1] for tok in left_to_right)))
    return lines
def auto_gap(image_path, base=18, ref_w=750):
    """
    Scale the base gap by image width relative to ref_w.

    Returns base unchanged when the image cannot be read.
    """
    page = cv2.imread(image_path)
    if page is None:
        return base
    return base * (page.shape[1] / ref_w)
def group_tokens_vertical(ocr, image_shape, gap_px=18, bbox_padding=1,
                          strict_mode=False):
    """
    Group OCR quads into bubble candidates by stacking them vertically.

    Three passes are run:
      1. Greedy pass: each unused quad, taken top to bottom, seeds a group and
         absorbs lower quads that pass the horizontal-offset, horizontal-gap,
         orientation and vertical-gap guards.
      2. Secondary merge: groups accepted by should_merge_groups whose members
         are all pairwise orientation-compatible are fused.
      3. Horizontal split: a group is cut in two when
         detect_horizontal_gap_in_group reports a gap.

    gap_px and bbox_padding are accepted but not read in this body; all
    thresholds derive from the median quad height. strict_mode lowers the
    vertical-gap limit from 2.5 to 2.0 median heights.

    Returns (bubbles, bubble_boxes, bubble_quads, bubble_indices), four dicts
    keyed by 1-based bubble id; all four are empty when ocr is empty.
    """
    n = len(ocr)
    if n == 0: return {}, {}, {}, {}
    boxes = [quad_bbox(r[0]) for r in ocr]
    centers = [quad_center(r[0]) for r in ocr]
    hs = [max(1.0, b[3]-b[1]) for b in boxes]
    med_h = float(np.median(hs)) if hs else 12.0
    max_vertical_gap = med_h * 2.5 if not strict_mode else med_h * 2.0
    max_horizontal_offset = med_h * 1.8
    # Visit quads top to bottom, then left to right.
    sorted_indices = sorted(range(n), key=lambda i: (centers[i][1], centers[i][0]))
    groups, used = [], set()
    for i in sorted_indices:
        if i in used: continue
        current_group = [i]
        used.add(i)
        # Running x reference for the column; it drifts towards each absorbed quad.
        cx_i = centers[i][0]
        for j in sorted_indices:
            if j in used or j == i: continue
            cx_j, cy_j = centers[j]
            # Only quads strictly below the seed are considered.
            if cy_j <= centers[i][1]: continue
            if abs(cx_i - cx_j) > max_horizontal_offset: continue
            # Horizontal gap guard
            gap_x = max(0, max(boxes[i][0], boxes[j][0]) - min(boxes[i][2], boxes[j][2]))
            if gap_x > med_h * 1.5: continue
            # Orientation compatibility guard
            if not orientation_compatible(i, j, ocr): continue
            # Gap is measured from the bottom of the most recently added quad.
            vertical_gap = boxes[j][1] - boxes[current_group[-1]][3]
            if vertical_gap <= max_vertical_gap:
                current_group.append(j)
                used.add(j)
                cx_i = (cx_i + cx_j) / 2.0
        if current_group:
            groups.append(current_group)
    # Secondary merge pass
    merged_groups, used_groups = [], set()
    for i, group1 in enumerate(groups):
        if i in used_groups: continue
        merged = list(group1)
        used_groups.add(i)
        for j, group2 in enumerate(groups):
            if i == j or j in used_groups: continue
            if should_merge_groups(merged, group2, ocr, med_h, max_vertical_gap):
                # Every cross pair must be orientation-compatible before fusing.
                compat = all(orientation_compatible(a, b, ocr)
                             for a in merged for b in group2)
                if compat:
                    merged.extend(group2)
                    used_groups.add(j)
        merged_groups.append(sorted(merged, key=lambda idx: centers[idx][1]))
    # Horizontal gap split pass
    final_groups = []
    for group in merged_groups:
        h_split = detect_horizontal_gap_in_group(group, ocr, med_h, gap_factor=2.5)
        if h_split:
            lg, rg = h_split
            final_groups.append(sorted(lg, key=lambda idx: centers[idx][1]))
            final_groups.append(sorted(rg, key=lambda idx: centers[idx][1]))
        else:
            final_groups.append(group)
    # Bubble ids follow the topmost, then leftmost, member centre of each group.
    final_groups.sort(key=lambda g: (min(centers[i][1] for i in g),
                                     min(centers[i][0] for i in g)))
    bubbles, bubble_boxes, bubble_quads, bubble_indices = {}, {}, {}, {}
    ih, iw = image_shape[:2]
    for bid, idxs in enumerate(final_groups, start=1):
        lines = build_lines_from_indices(idxs, ocr)
        quads = [ocr[k][0] for k in idxs]
        ub = boxes_union_xyxy([quad_bbox(q) for q in quads])
        # A skipped group leaves a hole in the id sequence.
        if ub is None: continue
        x1, y1, x2, y2 = ub
        # Box padding proportional to the median height, at least 1 px.
        ap = max(1, int(round(med_h * 0.16)))
        bubbles[bid] = lines
        bubble_boxes[bid] = (max(0,x1-ap), max(0,y1-ap),
                             min(iw-1,x2+ap), min(ih-1,y2+ap))
        bubble_quads[bid] = quads
        bubble_indices[bid] = idxs
    return bubbles, bubble_boxes, bubble_quads, bubble_indices
# ============================================================
# SPLIT HELPER — centralises all split strategies
# ============================================================
def _split_bubble_if_needed(bid, bubble_indices, bubble_quads, bubble_boxes,
                            filtered, image, iw, ih):
    """
    Attempts all split strategies in priority order.
    Returns ((part1_indices, part2_indices), reason_str) or (None, None).
    BOX#18 fix: split_cluster_by_big_vertical_gap factor lowered to 1.4
    so the gap between the top speech bubble and the bottom cluster triggers.

    The first strategy that yields a split wins; its reason string is one of
    "vertical-stack y-gap", "panel border", "aggressive column",
    "vertical column", "nested/side-by-side", "horizontal row" or
    "large vertical-gap". iw and ih are accepted but not read in this body.
    """
    indices = bubble_indices[bid]
    box = bubble_boxes[bid]
    # 1. Vertical-stack gap (sensitive — catches top-vs-bottom cluster)
    if is_vertical_text_like(indices, filtered):
        vgap = split_cluster_by_big_vertical_gap(indices, filtered,
                                                 factor=1.4, min_gap=18)
        if vgap:
            return vgap, "vertical-stack y-gap"
    # 2. Panel border
    sr = split_panel_box(image, box, bubble_quads=bubble_quads[bid])
    if sr:
        _, _, split_x = sr
        li = [idx for idx in indices if quad_center(filtered[idx][0])[0] < split_x]
        ri = [idx for idx in indices if quad_center(filtered[idx][0])[0] >= split_x]
        if li and ri:
            return (li, ri), "panel border"
    # NOTE(review): this elif is read as the fallback for "no panel border
    # found"; the nesting was reconstructed from context — confirm.
    elif len(bubble_quads[bid]) >= 4:
        cs = split_bubble_if_multiple_columns(indices, filtered, bid=bid,
                                              use_aggressive_thresholds=True)
        if cs:
            return cs, "aggressive column"
    # 3. Column gap
    cs = split_bubble_if_multiple_columns(indices, filtered, bid=bid)
    if cs:
        return cs, "vertical column"
    # 4. Nested / side-by-side
    ns = split_nested_or_side_by_side(indices, filtered)
    if ns:
        return ns, "nested/side-by-side"
    # 5. Row split
    rs = split_bubble_if_multiple_rows(indices, filtered, bid=bid)
    if rs:
        return rs, "horizontal row"
    # 6. Large vertical gap (general, less sensitive)
    gy = split_cluster_by_big_vertical_gap(indices, filtered, factor=1.9, min_gap=22)
    if gy:
        return gy, "large vertical-gap"
    return None, None
# ============================================================
# DEBUG / EXPORT
# ============================================================
def save_debug_clusters(image_path, ocr, bubble_boxes, bubble_indices,
                        clean_lines=None, out_path="debug_clusters.png"):
    """
    Write a debug image showing every OCR quad and every bubble box.

    OCR quads are painted white with a grey outline. Boxes holding exactly
    one quad are drawn thicker and in a different colour from multi-quad
    boxes, but carry no special label; they are ordinary boxes for the merge
    passes. When clean_lines has text for a box, it is word-wrapped and
    printed below the box. Does nothing when the image cannot be read.
    """
    canvas = cv2.imread(image_path)
    if canvas is None:
        return

    for quad, _text, _conf in ocr:
        outline = np.array(quad, dtype=np.int32)
        cv2.fillPoly(canvas, [outline], (255, 255, 255))
        cv2.polylines(canvas, [outline], True, (180, 180, 180), 1)

    font = cv2.FONT_HERSHEY_SIMPLEX
    for bid, (x1, y1, x2, y2) in bubble_boxes.items():
        is_single = len(bubble_indices.get(bid, [])) == 1
        color = (255, 165, 0) if is_single else (0, 220, 0)
        thickness = 3 if is_single else 2
        cv2.rectangle(canvas, (x1, y1), (x2, y2), color, thickness)
        cv2.putText(canvas, f"BOX#{bid}", (x1 + 2, max(15, y1 + 16)),
                    font, 0.5, color, 2)

        if not (clean_lines and bid in clean_lines):
            continue

        # Greedy word wrap: a word joins the current line while the combined
        # length stays under 25 characters.
        wrapped, pending = [], ""
        for word in clean_lines[bid].split():
            if len(pending) + len(word) < 25:
                pending += word + " "
            else:
                wrapped.append(pending.strip())
                pending = word + " "
        if pending:
            wrapped.append(pending.strip())

        for row, caption in enumerate(wrapped):
            anchor = (x1, y2 + 18 * (row + 1))
            # Thick dark pass first, thin coloured pass on top, for legibility.
            cv2.putText(canvas, caption, anchor, font, 0.5, (0, 0, 0), 3)
            cv2.putText(canvas, caption, anchor, font, 0.5, (255, 0, 0), 1)

    cv2.imwrite(out_path, canvas)
def estimate_reading_order(bbox_dict, mode="ltr", row_tol=90):
    """
    Assign a 1-based reading rank to every box.

    Boxes are clustered into rows by vertical centre: a box joins the first
    existing row whose running mean centre lies within row_tol pixels,
    otherwise it starts a new row. Rows are read top to bottom; inside a row,
    boxes are read left to right, or right to left when mode is "rtl".

    Args:
        bbox_dict: mapping of box id to (x1, y1, x2, y2).
        mode: "rtl" for right-to-left rows; any other value reads left to
            right.
        row_tol: maximum vertical distance, in pixels, between a box centre
            and a row centre for the box to join that row. Defaults to 90,
            the value that used to be hard-coded.

    Returns:
        dict mapping box id to its rank, starting at 1.
    """
    centres = [(bid, (bb[0] + bb[2]) / 2.0, (bb[1] + bb[3]) / 2.0)
               for bid, bb in bbox_dict.items()]
    centres.sort(key=lambda item: item[2])

    rows = []
    for item in centres:
        for row in rows:
            if abs(item[2] - row["cy"]) <= row_tol:
                row["items"].append(item)
                # Keep the row centre as the mean of its members.
                row["cy"] = float(np.mean([member[2] for member in row["items"]]))
                break
        else:
            rows.append({"cy": item[2], "items": [item]})

    rows.sort(key=lambda row: row["cy"])
    order = []
    for row in rows:
        row["items"].sort(key=lambda item: item[1], reverse=(mode == "rtl"))
        order.extend(item[0] for item in row["items"])
    return {bid: rank for rank, bid in enumerate(order, start=1)}
# ============================================================
# MAIN PIPELINE
# ============================================================
def translate_manga_text(
    image_path="001-page.png",
    source_lang="en",
    target_lang="ca",
    confidence_threshold=0.03,
    min_text_length=1,
    gap_px="auto",
    quality_threshold=0.62,
    export_to_file="output.txt",
    export_bubbles_to="bubbles.json",
    reading_mode="ltr",
    debug=True,
    use_enhanced_ocr=True,
    strict_grouping=True,
    max_box_width_ratio=0.6,
    max_box_height_ratio=0.5,
    auto_fix_bubbles=True
):
    """
    Run the full page pipeline: OCR, filtering, grouping into bubbles,
    split/merge clean-up, optional re-read of low-quality bubbles,
    translation and export.

    Args:
        image_path: page image to process.
        source_lang, target_lang: languages handed to the OCR detector, the
            text filters and the translator.
        confidence_threshold: detections below this confidence are dropped.
        min_text_length: detections with shorter normalised text are dropped.
        gap_px: "auto" to derive the grouping gap from the image width,
            otherwise a number.
        quality_threshold: bubbles scoring below this are re-read with the
            multi-variant OCR pass.
        export_to_file: path of the pipe-separated text report.
        export_bubbles_to: path of the JSON export.
        reading_mode: "ltr" or "rtl", forwarded to the reading-order step.
        debug: when true, a "debug_clusters.png" overlay is written.
        use_enhanced_ocr: use the enhanced detector and scan for missed
            small text regions.
        strict_grouping: forwarded to the grouping step as strict_mode.
        max_box_width_ratio, max_box_height_ratio: limits forwarded to the
            max-box-size step.
        auto_fix_bubbles: run the automatic bubble split/merge fixer.

    Returns:
        None. Results are printed and written to the two export files. The
        function returns early, without writing anything, when the image
        cannot be loaded or no text survives filtering.
    """
    image = cv2.imread(image_path)
    if image is None:
        print(f"❌ Cannot load image: {image_path}")
        return
    resolved_gap = auto_gap(image_path) if gap_px == "auto" else float(gap_px)
    ih, iw = image.shape[:2]

    print("Loading OCR engines...")
    if use_enhanced_ocr:
        detector = ImprovedMacVisionDetector(source_lang=source_lang)
        print("🚀 Using Enhanced Multi-Pass OCR")
    else:
        detector = MacVisionDetector(source_lang=source_lang)

    print("Running detection OCR (Apple Vision)...")
    raw = detector.read(image_path)
    print(f"Raw detections: {len(raw)}")

    if use_enhanced_ocr:
        existing_quads = [r[0] for r in raw]
        missed_regions = detect_small_text_regions(image, existing_quads)
        if missed_regions:
            print(f"🔍 Found {len(missed_regions)} potentially missed text regions")
            for region in missed_regions:
                rx1, ry1, rx2, ry2 = region
                pad = 10
                rx1, ry1 = max(0, rx1-pad), max(0, ry1-pad)
                rx2, ry2 = min(iw, rx2+pad), min(ih, ry2+pad)
                crop = image[ry1:ry2, rx1:rx2]
                if crop.size > 0:
                    upscaled = cv2.resize(crop, None, fx=4.0, fy=4.0,
                                          interpolation=cv2.INTER_CUBIC)
                    # Map quad points from the 4x crop back to page coordinates.
                    for quad, text, conf in detector.run_vision_ocr(upscaled):
                        raw.append(([[int(p[0]/4.0+rx1), int(p[1]/4.0+ry1)]
                                     for p in quad], text, conf))
            print(f"📝 Total detections after missed region scan: {len(raw)}")

    # ── Filtering ─────────────────────────────────────────────────────────
    filtered, skipped = [], 0
    for bbox, text, conf in raw:
        t = normalize_text(text)
        qb = quad_bbox(bbox)
        if conf < confidence_threshold:
            skipped += 1
            continue
        if len(t) < min_text_length:
            skipped += 1
            continue
        if not is_valid_language(t, source_lang):
            skipped += 1
            continue
        if not is_meaningful_text(t, source_lang):
            skipped += 1
            continue
        # Longer, low-confidence text inside the top band of the page is dropped.
        if qb[1] < int(ih * TOP_BAND_RATIO) and conf < 0.70 and len(t) >= 5:
            skipped += 1
            continue
        filtered.append((bbox, t, conf))
    print(f"Kept: {len(filtered)} | Skipped: {skipped}")
    if not filtered:
        print("⚠️ No text after filtering.")
        return

    # ── Pre-grouping quad splits ──────────────────────────────────────────
    filtered, oversized_splits = validate_and_split_oversized_quads(image, filtered)
    if oversized_splits > 0:
        print(f"📐 Split {oversized_splits} oversized quad(s) before grouping")
    filtered, wide_splits = split_wide_ocr_items(image, filtered)
    if wide_splits > 0:
        print(f"✂️ Split {wide_splits} wide OCR lines across column gaps.")
    filtered, bridge_splits = split_abnormal_bridge_quads(image, filtered)
    if bridge_splits > 0:
        print(f"🧩 Split {bridge_splits} abnormal bridge OCR quad(s).")
    # Column-gap split: catches wide quads spanning two columns (BOX#6 type)
    hs_pre = [max(1, quad_bbox(q)[3]-quad_bbox(q)[1]) for q, _, _ in filtered]
    med_h_pre = float(np.median(hs_pre)) if hs_pre else 14.0
    filtered, _col_splits = apply_column_gap_splits(image, filtered, med_h_pre)
    filtered = normalize_ocr_quads(filtered)

    # ── Grouping ──────────────────────────────────────────────────────────
    print("📊 Grouping quads vertically...")
    bubbles, bubble_boxes, bubble_quads, bubble_indices = group_tokens_vertical(
        filtered, image.shape, gap_px=resolved_gap,
        bbox_padding=1, strict_mode=strict_grouping)
    print(f" Created {len(bubbles)} initial box(es)")

    # ── Auto-fix (split + merge) ──────────────────────────────────────────
    if auto_fix_bubbles:
        bubbles, bubble_boxes, bubble_quads, bubble_indices = auto_fix_bubble_detection(
            bubble_boxes, bubble_indices, bubble_quads, bubbles, filtered, image)

    # ── Enforce max box size ──────────────────────────────────────────────
    bubbles, bubble_boxes, bubble_quads, bubble_indices = enforce_max_box_size(
        bubble_boxes, bubble_indices, bubble_quads, bubbles, filtered,
        max_width_ratio=max_box_width_ratio,
        max_height_ratio=max_box_height_ratio,
        image_shape=image.shape)

    # ── Close-proximity merge ─────────────────────────────────────────────
    bubbles, bubble_boxes, bubble_quads, bubble_indices = merge_close_bubbles_by_line_height(
        bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered)

    # ── Per-bubble split pass ─────────────────────────────────────────────
    new_bubbles, new_bubble_boxes, new_bubble_quads, new_bubble_indices = {}, {}, {}, {}
    next_bid = max(bubbles.keys()) + 1 if bubbles else 1
    splits_performed = []
    for bid in list(bubbles.keys()):
        split_result, split_reason = _split_bubble_if_needed(
            bid, bubble_indices, bubble_quads, bubble_boxes, filtered, image, iw, ih)
        if split_result:
            p1, p2 = split_result
            splits_performed.append(f"BOX#{bid} ({split_reason})")
            # The first part keeps the original id; the second gets a fresh one.
            for part_idxs, part_bid in [(p1, bid), (p2, next_bid)]:
                ub = boxes_union_xyxy([quad_bbox(filtered[i][0]) for i in part_idxs])
                new_bubbles[part_bid] = build_lines_from_indices(part_idxs, filtered)
                new_bubble_boxes[part_bid] = (max(0, ub[0]-2), max(0, ub[1]-2),
                                              min(iw-1, ub[2]+2), min(ih-1, ub[3]+2))
                new_bubble_quads[part_bid] = [filtered[i][0] for i in part_idxs]
                new_bubble_indices[part_bid] = part_idxs
            next_bid += 1
        else:
            new_bubbles[bid] = bubbles[bid]
            new_bubble_boxes[bid] = bubble_boxes[bid]
            new_bubble_quads[bid] = bubble_quads[bid]
            new_bubble_indices[bid] = bubble_indices[bid]
    if splits_performed:
        print(f"\n🔀 Splits detected: {len(splits_performed)}")
        for s in splits_performed:
            print(f"{s}")

    # ── Remove nested / duplicate boxes ──────────────────────────────────
    bubbles, bubble_boxes, bubble_quads, bubble_indices = remove_nested_boxes(
        new_bubble_boxes, new_bubble_indices, new_bubble_quads, new_bubbles,
        overlap_threshold=0.50)
    print(f"✅ Final box count: {len(bubbles)}")

    # ── OCR quality pass ──────────────────────────────────────────────────
    translator = GoogleTranslator(source=source_lang, target=target_lang)
    clean_lines: Dict[int, str] = {}
    sources_used: Dict[int, str] = {}
    translations: Dict[int, str] = {}
    for bid, lines in bubbles.items():
        base_txt = normalize_text(" ".join(lines))
        base_sc = ocr_candidate_score(base_txt)
        txt, src_used = base_txt, "vision-base"
        if base_sc < quality_threshold:
            rr_txt, rr_sc, rr_src = reread_bubble_with_vision(
                image, bubble_boxes[bid], detector, upscale=3.0, pad=24)
            # Accept the re-read only when it is clearly better and still valid.
            if rr_txt and rr_sc > base_sc + 0.04 and is_valid_language(rr_txt, source_lang):
                txt, src_used = rr_txt, rr_src
        clean_lines[bid] = normalize_text(txt)
        sources_used[bid] = src_used
    reading_map = estimate_reading_order(bubble_boxes, mode=reading_mode)

    # Bubbles that survive the export filters, in reading order. Computed once
    # and shared by translation, the text report and the JSON export, which
    # all applied the same three checks. Assumes the two filter helpers give
    # the same answer on repeated calls — verify if they ever become stateful.
    exportable = []
    for bid in sorted(clean_lines.keys(), key=lambda x: reading_map.get(x, x)):
        src_txt = clean_lines[bid].strip()
        if not src_txt:
            continue
        if not is_valid_language(src_txt, source_lang):
            continue
        if not is_meaningful_text(src_txt, source_lang):
            continue
        exportable.append((bid, src_txt))

    # ── Translation ───────────────────────────────────────────────────────
    for bid, src_txt in exportable:
        try:
            tgt = translator.translate(src_txt) or ""
            tgt = postprocess_translation_general(tgt).upper()
        except Exception as e:
            # Best effort: record the failure in place of the translation.
            tgt = f"[Error: {e}]"
        translations[bid] = tgt

    if debug:
        save_debug_clusters(image_path, filtered, bubble_boxes, bubble_indices,
                            clean_lines, "debug_clusters.png")

    # ── Text output ───────────────────────────────────────────────────────
    # The divider was an empty string multiplied by 120, i.e. nothing at all.
    divider = "─" * 120
    out_lines = ["BUBBLE|ORDER|OCR_SOURCE|ORIGINAL|TRANSLATED|FLAGS", divider]
    print(divider + f"\n{'BUBBLE':<8} {'ORDER':<6} {'SOURCE':<12} "
          f"{'ORIGINAL':<40} {'TRANSLATED':<40} FLAGS\n" + divider)
    translated_count = 0
    for bid, src_txt in exportable:
        flags = []
        tgt = translations.get(bid, "")
        if not tgt:
            flags.append("NO_TRANSLATION")
        flag_txt = ','.join(flags) if flags else '-'
        src_u = src_txt.upper()
        src_engine = sources_used.get(bid, "unknown")
        out_lines.append(
            f"#{bid}|{reading_map.get(bid,bid)}|{src_engine}|{src_u}|{tgt}|"
            f"{flag_txt}")
        print(f"#{bid:<7} {reading_map.get(bid,bid):<6} {src_engine:<12} "
              f"{src_u[:40]:<40} {tgt[:40]:<40} {flag_txt}")
        translated_count += 1
    out_lines.append(divider + f"\n✅ Done! {translated_count} bubble(s) translated.")
    with open(export_to_file, "w", encoding="utf-8") as f:
        f.write("\n".join(out_lines))

    # ── bubbles.json ──────────────────────────────────────────────────────
    bubbles_payload = {}
    for bid, src_txt in exportable:
        box = bubble_boxes.get(bid)
        tgt = translations.get(bid, "")
        bubbles_payload[str(bid)] = {
            "order": reading_map.get(bid, bid),
            "ocr_source": sources_used.get(bid, "unknown"),
            "original": src_txt.upper(),
            "translated": tgt,
            "box": {
                "x": box[0] if box else 0,
                "y": box[1] if box else 0,
                "w": (box[2]-box[0]) if box else 0,
                "h": (box[3]-box[1]) if box else 0,
            },
            "lines": [line.upper() for line in bubbles.get(bid, [])],
        }
    with open(export_bubbles_to, "w", encoding="utf-8") as f:
        json.dump(bubbles_payload, f, ensure_ascii=False, indent=2)
    print(divider + f"\nSaved: {export_to_file}\nSaved: {export_bubbles_to}")
# ============================================================
# ENTRY POINT
# ============================================================
if __name__ == "__main__":
    # Options for a direct script run, collected in one mapping and forwarded
    # to the pipeline as keyword arguments.
    _run_options = {
        "image_path": "19.png",
        "source_lang": "english",
        "target_lang": "ca",
        "confidence_threshold": 0.03,
        "min_text_length": 1,
        "gap_px": "auto",
        "quality_threshold": 0.62,
        "export_to_file": "output.txt",
        "export_bubbles_to": "bubbles.json",
        "reading_mode": "rtl",
        "debug": True,
        "use_enhanced_ocr": True,
        "strict_grouping": True,
        "max_box_width_ratio": 0.6,
        "max_box_height_ratio": 0.5,
        "auto_fix_bubbles": True,
    }
    translate_manga_text(**_run_options)