Files
manga-translator/manga-translator.py
Guillem Hernandez Sola b6b0df4774 Added stuff
2026-04-22 10:51:57 +02:00

2000 lines
81 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import re
import json
import cv2
import numpy as np
import warnings
from typing import List, Tuple, Dict, Any, Optional
from deep_translator import GoogleTranslator
# macOS Native Vision imports
import Vision
import Quartz
from Foundation import NSData
warnings.filterwarnings("ignore", category=UserWarning)
# ============================================================
# CONFIG
# ============================================================
TOP_BAND_RATIO = 0.08
# ============================================================
# HELPERS
# ============================================================
def normalize_text(text: str) -> str:
    """Return an upper-cased, whitespace- and punctuation-normalised copy of *text*.

    ``None`` or an empty value yields ``""``. Curly quotes and the ellipsis
    character are replaced by their ASCII forms, whitespace runs collapse to
    a single space, spaces adjacent to punctuation and parentheses are
    removed, and runs of four or more dots shrink to ``...``.
    """
    cleaned = (text or "").strip().upper()

    # Typographic characters -> plain ASCII equivalents.
    for fancy, plain in (
        ("\u201c", "\""),
        ("\u201d", "\""),
        ("\u2018", "'"),
        ("\u2019", "'"),
        ("\u2026", "..."),
    ):
        cleaned = cleaned.replace(fancy, plain)

    # Applied in order; later rules rely on whitespace already being collapsed.
    for pattern, replacement in (
        (r"\s+", " "),
        (r"\s+([,.;:!?])", r"\1"),
        (r"([¡¿])\s+", r"\1"),
        (r"\(\s+", "("),
        (r"\s+\)", ")"),
        (r"\.{4,}", "..."),
    ):
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()
def postprocess_translation_general(text: str) -> str:
    """Normalise a translated string and tame repeated punctuation.

    Runs ``normalize_text`` first, then collapses multi-space runs, shortens
    runs of three or more ``!``/``?`` characters to two copies of the last
    character in the run, and caps dot runs at ``...``.
    """
    result = normalize_text(text)
    result = re.sub(r"\s{2,}", " ", result).strip()
    # The repeated group keeps only its final match, so "?!?" becomes "??".
    result = re.sub(r"([!?]){3,}", r"\1\1", result)
    return re.sub(r"\.{4,}", "...", result)
def fix_common_ocr_errors(text: str) -> str:
    """Repair a few character confusions in OCR output.

    * a capital ``O`` between two digits, or after a digit and before any
      non-letter character, becomes ``0``;
    * ``|`` becomes ``I``;
    * a backtick becomes an apostrophe.
    """
    digits_fixed = re.sub(r'(\d)O(\d)', r'\g<1>0\g<2>', text)
    # Second pass also picks up the overlapping matches the first pass skipped.
    digits_fixed = re.sub(r'(\d)O([^a-zA-Z])', r'\g<1>0\g<2>', digits_fixed)
    return digits_fixed.replace('|', 'I').replace('`', "'")
def is_valid_language(text: str, source_lang: str) -> bool:
    """Return True when *text* plausibly uses the script of *source_lang*.

    Only word characters (``\\w``) are considered. For the Latin-script
    languages the text must contain no Arabic, kana, CJK or Hangul
    characters and a minimum share of Latin letters; for Japanese, Korean
    and Chinese a minimum share of the matching script is required.
    Unrecognised language codes are always accepted.
    """
    if not text:
        return False
    symbols = re.sub(r'[^\w]', '', text)
    if not symbols:
        return False
    size = len(symbols)
    code = source_lang.lower()

    def hits(pattern: str) -> int:
        return len(re.findall(pattern, symbols))

    if code in ('en', 'english', 'es', 'spanish', 'fr', 'french',
                'it', 'italian', 'ca', 'catalan', 'de', 'german'):
        # A single non-Latin script character rejects the whole string.
        if hits(r'[\u0600-\u06FF\u0750-\u077F\u3040-\u30FF'
                r'\u3400-\u4DBF\u4E00-\u9FFF\uAC00-\uD7AF\u1100-\u11FF]') > 0:
            return False
        latin = hits(r'[a-zA-ZÀ-ÿ]')
        if size <= 3:
            return latin >= 1
        # Short strings need a higher Latin share than long ones.
        floor = 0.55 if size <= 6 else 0.45
        return (latin / size) >= floor

    native_script = {
        'ja': r'[\u3040-\u30FF\u3400-\u4DBF\u4E00-\u9FFF]',
        'japanese': r'[\u3040-\u30FF\u3400-\u4DBF\u4E00-\u9FFF]',
        'ko': r'[\uAC00-\uD7AF\u1100-\u11FF]',
        'korean': r'[\uAC00-\uD7AF\u1100-\u11FF]',
        'zh': r'[\u4E00-\u9FFF\u3400-\u4DBF]',
        'chinese': r'[\u4E00-\u9FFF\u3400-\u4DBF]',
    }.get(code)
    if native_script is None:
        return True

    native = hits(native_script)
    if size <= 3:
        return native >= 1
    return (native / size) >= 0.4
_NOISE_TOKENS = {
'P', 'F', 'N', 'M', 'X', 'Z', 'Q',
'FN', 'PF', 'NM', 'XZ', 'FSHOO', 'GRRP',
}
_MANGA_INTERJECTIONS = {
'HUH', 'HUH?', 'HUH??', 'HUH?!',
'OH', 'OH!', 'OOH', 'OOH!',
'AH', 'AH!', 'UH', 'UH...',
'HEY', 'HEY!',
'EH', 'EH?',
'WOW', 'WOW!',
'YES', 'NO', 'NO!',
'RUN', 'GO', 'GO!',
'STOP', 'WAIT',
'WHAT', 'WHAT?', 'WHAT?!',
'WHY', 'WHY?',
'HOW', 'HOW?',
'OK', 'OK!', 'OKAY',
'EEEEP', 'EEEP',
'OMIGOSH',
'HMM', 'HMM...',
'TSK', 'TCH',
'GRRR','I','A',
'FWUP', 'FWAP',
'SHIVER',
'RRRING',
'MORNING', 'MORNING.',
}
def is_meaningful_text(text: str, source_lang: str, min_alpha_chars: int = 2) -> bool:
    """Heuristically decide whether an OCR string is real text rather than noise.

    Known interjections are accepted immediately. Otherwise the text is
    rejected when it has fewer than *min_alpha_chars* letters, is a known
    noise token, consists of one repeated character (3+ long), or - for the
    Latin-script languages - is more than 60 % non-letters or is longer than
    four characters without any vowel.
    """
    if not text:
        return False

    body = text.strip()
    upper = body.upper()
    letters_only = re.sub(r'[^A-Za-zÀ-ÿ]', '', upper)
    if any(form in _MANGA_INTERJECTIONS for form in (upper, letters_only)):
        return True

    letter_total = sum(1 for ch in body if ch.isalpha())
    if letter_total < min_alpha_chars or upper in _NOISE_TOKENS:
        return False

    length = len(body)
    latin_script = source_lang.lower() in (
        'en', 'english', 'es', 'spanish', 'fr', 'french',
        'it', 'italian', 'ca', 'catalan', 'de', 'german',
    )

    # Mostly punctuation/digits/spaces -> noise (Latin-script languages only).
    if latin_script and length > 0 and ((length - letter_total) / length) > 0.60:
        return False

    # The same character repeated, e.g. "AAAA" or "----".
    if length >= 3 and len(set(upper)) == 1:
        return False

    # NOTE(review): "Y" does not count as a vowel here, so strings such as
    # "RHYTHM" are rejected - confirm that this is intended.
    if latin_script and length > 4 and not re.search(r'[AEIOUaeiouÀ-ÿ]', body):
        return False

    return True
def quad_bbox(quad):
    """Return the axis-aligned bounds ``(x1, y1, x2, y2)`` of a point list.

    Each point is indexed as ``p[0]`` (x) and ``p[1]`` (y); the four results
    are truncated to ``int``.
    """
    x_coords = [point[0] for point in quad]
    y_coords = [point[1] for point in quad]
    left, right = min(x_coords), max(x_coords)
    top, bottom = min(y_coords), max(y_coords)
    return (int(left), int(top), int(right), int(bottom))
def quad_center(quad):
    """Return the ``(cx, cy)`` midpoint of the quad's bounding box as floats."""
    left, top, right, bottom = quad_bbox(quad)
    center_x = (left + right) / 2.0
    center_y = (top + bottom) / 2.0
    return (center_x, center_y)
def boxes_union_xyxy(boxes):
    """Return the smallest ``(x1, y1, x2, y2)`` box enclosing all *boxes*.

    ``None`` entries are ignored; ``None`` is returned when nothing remains.
    Coordinates are truncated to ``int``.
    """
    present = [box for box in boxes if box is not None]
    if not present:
        return None

    first = present[0]
    left, top, right, bottom = first[0], first[1], first[2], first[3]
    for box in present[1:]:
        left = min(left, box[0])
        top = min(top, box[1])
        right = max(right, box[2])
        bottom = max(bottom, box[3])
    return (int(left), int(top), int(right), int(bottom))
def bbox_area_xyxy(b):
    """Return the integer area of an ``(x1, y1, x2, y2)`` box.

    ``None`` and inverted boxes both give ``0``.
    """
    if b is None:
        return 0
    width = max(0, b[2] - b[0])
    height = max(0, b[3] - b[1])
    return int(width * height)
def xyxy_to_xywh(b):
    """Convert an ``(x1, y1, x2, y2)`` box to an ``{"x", "y", "w", "h"}`` dict.

    Returns ``None`` for ``None``. Width and height are clamped at zero and
    every value is converted to ``int``.
    """
    if b is None:
        return None
    left, top, right, bottom = b
    width = max(0, right - left)
    height = max(0, bottom - top)
    return {"x": int(left), "y": int(top), "w": int(width), "h": int(height)}
def overlap_or_near(a, b, gap=0):
    """Return True when two xyxy boxes overlap or lie within *gap* on both axes."""
    a_left, a_top, a_right, a_bottom = a
    b_left, b_top, b_right, b_bottom = b
    # Positive values are the empty space between the boxes along that axis.
    space_x = max(a_left, b_left) - min(a_right, b_right)
    space_y = max(a_top, b_top) - min(a_bottom, b_bottom)
    if max(0, space_x) > gap:
        return False
    return max(0, space_y) <= gap
def boxes_iou(a, b):
    """Return intersection-over-union of two xyxy boxes (0.0 when disjoint)."""
    a_left, a_top, a_right, a_bottom = a
    b_left, b_top, b_right, b_bottom = b

    overlap_w = max(0, min(a_right, b_right) - max(a_left, b_left))
    overlap_h = max(0, min(a_bottom, b_bottom) - max(a_top, b_top))
    inter = overlap_w * overlap_h
    if inter == 0:
        return 0.0

    area_a = max(0, a_right - a_left) * max(0, a_bottom - a_top)
    area_b = max(0, b_right - b_left) * max(0, b_bottom - b_top)
    union = area_a + area_b - inter
    # The denominator is floored at 1 to avoid dividing by zero.
    return inter / max(1, union)
def boxes_overlap_ratio(a, b):
    """Return intersection area divided by the area of the smaller xyxy box."""
    a_left, a_top, a_right, a_bottom = a
    b_left, b_top, b_right, b_bottom = b

    overlap_w = max(0, min(a_right, b_right) - max(a_left, b_left))
    overlap_h = max(0, min(a_bottom, b_bottom) - max(a_top, b_top))
    inter = overlap_w * overlap_h
    if inter == 0:
        return 0.0

    area_a = max(0, a_right - a_left) * max(0, a_bottom - a_top)
    area_b = max(0, b_right - b_left) * max(0, b_bottom - b_top)
    smaller = min(area_a, area_b)
    # The denominator is floored at 1 to avoid dividing by zero.
    return inter / max(1, smaller)
def ocr_candidate_score(text: str) -> float:
    """Score an OCR string in ``[0.0, 1.0]`` by how text-like it looks.

    Letters, spaces and common punctuation raise the score; characters
    outside that set lower it. A standalone capital letter and any run of
    two or more digits each add a fixed penalty. Empty or blank input
    scores ``0.0``.
    """
    stripped = text.strip() if text else ""
    length = len(stripped)
    if length == 0:
        return 0.0

    alpha_share = sum(ch.isalpha() for ch in stripped) / length
    space_share = sum(ch.isspace() for ch in stripped) / length
    punct_share = sum(ch in ".,!?'-:;()[]\"¡¿" for ch in stripped) / length
    junk_share = len(re.findall(r"[^\w\s\.\,\!\?\-\'\:\;\(\)\[\]\"¡¿]", stripped)) / length

    penalty = 0.0
    if re.search(r"\b[A-Z]\b", stripped):
        penalty += 0.05
    if re.search(r"[0-9]{2,}", stripped):
        penalty += 0.08

    raw = (0.62 * alpha_share) + (0.10 * space_share) + (0.20 * punct_share) - (0.45 * junk_share) - penalty
    return max(0.0, min(1.0, raw))
def quad_is_horizontal(quad, ratio_threshold=1.5) -> bool:
    """Return True when the quad's bounding box is at least *ratio_threshold* times wider than tall."""
    left, top, right, bottom = quad_bbox(quad)
    # Both sides are floored at 1 so degenerate boxes cannot divide by zero.
    width = max(1, right - left)
    height = max(1, bottom - top)
    aspect = width / height
    return aspect >= ratio_threshold
def quad_is_vertical(quad, ratio_threshold=1.5) -> bool:
    """Return True when the quad's bounding box is at least *ratio_threshold* times taller than wide."""
    left, top, right, bottom = quad_bbox(quad)
    # Both sides are floored at 1 so degenerate boxes cannot divide by zero.
    width = max(1, right - left)
    height = max(1, bottom - top)
    aspect = height / width
    return aspect >= ratio_threshold
# ============================================================
# ENHANCED IMAGE PREPROCESSING
# ============================================================
def enhance_image_for_ocr(image_bgr, upscale_factor=2.5):
    """Build a binarised, upscaled copy of *image_bgr* for OCR.

    Pipeline: bicubic upscale -> grayscale -> non-local-means denoise ->
    CLAHE contrast equalisation -> 3x3 sharpen -> Gaussian adaptive
    threshold -> 2x2 morphological close. The result is converted back to a
    three-channel image.
    """
    src_h, src_w = image_bgr.shape[:2]
    target_size = (int(src_w * upscale_factor), int(src_h * upscale_factor))
    enlarged = cv2.resize(image_bgr, target_size, interpolation=cv2.INTER_CUBIC)

    grey = cv2.cvtColor(enlarged, cv2.COLOR_BGR2GRAY)
    smooth = cv2.fastNlMeansDenoising(grey, None, h=10, templateWindowSize=7, searchWindowSize=21)
    equalised = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8)).apply(smooth)

    sharpen_kernel = np.array([[-1, -1, -1], [-1, 9, -1], [-1, -1, -1]])
    crisp = cv2.filter2D(equalised, -1, sharpen_kernel)

    mono = cv2.adaptiveThreshold(crisp, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                 cv2.THRESH_BINARY, 11, 2)
    closed = cv2.morphologyEx(mono, cv2.MORPH_CLOSE, np.ones((2, 2), np.uint8))
    return cv2.cvtColor(closed, cv2.COLOR_GRAY2BGR)
def detect_small_text_regions(image_bgr, existing_quads):
    """Find small dark blobs that lie outside the already-detected quads.

    The areas covered by *existing_quads* are masked out, the remainder is
    Otsu-thresholded (inverted) and the external contours are collected.
    Returns the ``(x1, y1, x2, y2)`` bounding boxes whose area is strictly
    between 50 and 5000 px and whose height/width ratio is strictly between
    0.1 and 10.
    """
    grey = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)

    # White wherever a known quad already covers the image.
    covered = np.zeros(grey.shape, dtype=np.uint8)
    for quad in existing_quads:
        cv2.fillPoly(covered, [np.array(quad, dtype=np.int32)], 255)
    uncovered = cv2.bitwise_not(covered)

    _, ink = cv2.threshold(grey, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    ink_outside = cv2.bitwise_and(ink, ink, mask=uncovered)
    outlines, _ = cv2.findContours(ink_outside, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    candidates = []
    for outline in outlines:
        bx, by, bw, bh = cv2.boundingRect(outline)
        if not 50 < bw * bh < 5000:
            continue
        if 0.1 < bh / max(bw, 1) < 10:
            candidates.append((bx, by, bx + bw, by + bh))
    return candidates
# ============================================================
# SPEECH BUBBLE DETECTION
# ============================================================
def detect_speech_bubbles(image_bgr: np.ndarray) -> List[np.ndarray]:
    """Return candidate speech-bubble outlines found in *image_bgr*.

    The image is converted to grayscale, binarised with an inverted Gaussian
    adaptive threshold, and its external contours are extracted. Only
    contours whose area exceeds 500 px are returned.
    """
    grey = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
    mono = cv2.adaptiveThreshold(grey, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                 cv2.THRESH_BINARY_INV, 11, 2)
    outlines, _ = cv2.findContours(mono, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    large_outlines = []
    for outline in outlines:
        if cv2.contourArea(outline) > 500:
            large_outlines.append(outline)
    return large_outlines
def is_quad_in_bubble(quad_bbox_xyxy, bubble_contour, tolerance=5):
    """Return True when the box centre lies inside *bubble_contour* or within
    *tolerance* pixels outside it.

    Args:
        quad_bbox_xyxy: ``(x1, y1, x2, y2)`` box of the OCR quad.
        bubble_contour: an OpenCV contour.
        tolerance: how far outside the contour (in pixels) the centre may
            fall and still count as inside.
    """
    x1, y1, x2, y2 = quad_bbox_xyxy
    cx, cy = (x1 + x2) // 2, (y1 + y2) // 2
    # measureDist must be True: with False, pointPolygonTest only returns
    # -1/0/+1, so ">= -tolerance" was true for every point and every contour.
    signed_distance = cv2.pointPolygonTest(bubble_contour, (float(cx), float(cy)), True)
    return signed_distance >= -tolerance
def split_indices_by_bubble(indices, ocr, bubble_contours):
    """Partition OCR indices by the first bubble contour that contains them.

    Each index is assigned to the first contour for which
    ``is_quad_in_bubble`` accepts its bounding box. Returns one list per
    contour that received at least one index, followed by a final list of
    the indices matching no contour (if any). Empty input gives ``[]``.
    """
    if not indices:
        return []

    per_bubble = {}
    unassigned = []
    for idx in indices:
        box = quad_bbox(ocr[idx][0])
        owner = next(
            (number for number, contour in enumerate(bubble_contours)
             if is_quad_in_bubble(box, contour)),
            None,
        )
        if owner is None:
            unassigned.append(idx)
        else:
            per_bubble.setdefault(owner, []).append(idx)

    groups = list(per_bubble.values())
    if unassigned:
        groups.append(unassigned)
    return groups
def check_vertical_alignment_split(indices, ocr, threshold=20):
    """Split *indices* wherever consecutive quads are far apart vertically.

    Quads are ordered by the top edge of their bounding box; a new group
    starts whenever a quad's top is more than *threshold* pixels below the
    bottom of the previous quad in that order. Returns a list of index
    lists (``[indices]`` when there is at most one index).
    """
    if len(indices) <= 1:
        return [indices]

    ordered = sorted(
        ((idx, quad_bbox(ocr[idx][0])) for idx in indices),
        key=lambda entry: entry[1][1],
    )

    groups = [[ordered[0][0]]]
    for (_, prev_box), (idx, box) in zip(ordered, ordered[1:]):
        if box[1] - prev_box[3] > threshold:
            groups.append([idx])
        else:
            groups[-1].append(idx)
    return groups
# ============================================================
# QUAD SIZE VALIDATION AND SPLITTING
# ============================================================
def is_quad_oversized(quad, median_height, width_threshold=8.0):
    """Return True when the quad is wider than a single text line plausibly is.

    A quad counts as oversized if its width exceeds
    ``median_height * width_threshold`` or its width/height ratio exceeds 12.
    """
    left, top, right, bottom = quad_bbox(quad)
    width = right - left
    height = max(1, bottom - top)
    if width > median_height * width_threshold:
        return True
    return width / height > 12.0
def split_oversized_quad_by_content(image_bgr, quad, text, conf, median_height):
    """Try to cut one wide OCR quad in two at its widest blank column run.

    The quad's pixels are Otsu-binarised and summed per column; a run of
    low-ink columns at least ``max(int(median_height * 0.8), 15)`` wide is a
    gap. The text is split at the space nearest the widest gap (or
    proportionally when it has no space). Returns two
    ``(quad, text, conf)`` tuples on success, otherwise a one-element list
    with the original entry.
    """
    x1, y1, x2, y2 = quad_bbox(quad)
    width, height = x2 - x1, max(1, y2 - y1)

    pad = 2
    img_h, img_w = image_bgr.shape[0], image_bgr.shape[1]
    roi = image_bgr[max(0, y1 - pad):min(img_h, y2 + pad),
                    max(0, x1):min(img_w, x2)]
    if roi.size == 0:
        return [(quad, text, conf)]

    grey = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
    _, ink = cv2.threshold(grey, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    column_ink = np.sum(ink, axis=0)
    ink_limit = height * 255 * 0.20

    # Collect (centre, width) of every sufficiently wide low-ink run that is
    # closed by an inked column; a run touching the right edge is not recorded.
    gaps = []
    run_start = None
    for col, amount in enumerate(column_ink):
        if amount < ink_limit:
            if run_start is None:
                run_start = col
            continue
        if run_start is not None:
            run_len = col - run_start
            if run_len >= max(int(median_height * 0.8), 15):
                gaps.append((run_start + run_len // 2, run_len))
            run_start = None

    if not gaps:
        return [(quad, text, conf)]

    # Widest gap wins; on ties the leftmost one is used.
    widest_centre = max(gaps, key=lambda g: g[1])[0]
    split_x_abs = max(0, x1) + widest_centre

    if ' ' in text:
        char_w = width / max(1, len(text))
        estimate = int((split_x_abs - x1) / max(1e-6, char_w))
        space_positions = [pos for pos, ch in enumerate(text) if ch == ' ']
        split_idx = min(space_positions, key=lambda pos: abs(pos - estimate))
    else:
        split_idx = int(len(text) * (split_x_abs - x1) / width)

    left_text = text[:split_idx].strip()
    right_text = text[split_idx:].strip()
    if left_text and right_text:
        return [([[x1, y1], [split_x_abs, y1], [split_x_abs, y2], [x1, y2]], left_text, conf),
                ([[split_x_abs, y1], [x2, y1], [x2, y2], [split_x_abs, y2]], right_text, conf)]
    return [(quad, text, conf)]
def validate_and_split_oversized_quads(image_bgr, filtered_ocr):
    """Split every oversized OCR quad that can be cut at a content gap.

    Returns ``(new_ocr_list, splits_made)``. Quads that are not oversized,
    or that ``split_oversized_quad_by_content`` cannot split, pass through
    unchanged. Empty input is returned as-is with a count of 0.
    """
    if not filtered_ocr:
        return filtered_ocr, 0

    heights = []
    for quad, _, _ in filtered_ocr:
        _, top, _, bottom = quad_bbox(quad)
        heights.append(max(1, bottom - top))
    median_height = float(np.median(heights))

    output, split_count = [], 0
    for quad, text, conf in filtered_ocr:
        pieces = [(quad, text, conf)]
        if is_quad_oversized(quad, median_height, 8.0):
            candidate = split_oversized_quad_by_content(image_bgr, quad, text, conf, median_height)
            if len(candidate) > 1:
                pieces = candidate
                split_count += 1
        output.extend(pieces)
    return output, split_count
# ============================================================
# HORIZONTAL GAP DETECTION AT QUAD LEVEL
# ============================================================
def detect_horizontal_gap_in_group(indices, ocr, med_h, gap_factor=2.5):
    """
    Looks for a large horizontal gap between the quads of one group.

    Quads are ordered by the x coordinate of their centre. The widest gap
    between neighbours in that order that exceeds ``med_h * gap_factor``
    becomes the split point. Returns ``(left_group, right_group)`` or
    ``None`` when the group has fewer than two quads or no gap qualifies.
    """
    if len(indices) < 2:
        return None

    ordered = sorted(indices, key=lambda i: quad_center(ocr[i][0])[0])
    spans = [quad_bbox(ocr[i][0]) for i in ordered]
    min_gap = med_h * gap_factor

    # NOTE(review): the gap is measured only against the immediately
    # preceding quad, not against the right-most edge seen so far - confirm
    # this is intended for stacked lines of differing width.
    widest, cut = 0.0, None
    for pos, (left_box, right_box) in enumerate(zip(spans, spans[1:])):
        space = right_box[0] - left_box[2]
        if space > min_gap and space > widest:
            widest, cut = space, pos

    if cut is None:
        return None

    left_group = ordered[:cut + 1]
    right_group = ordered[cut + 1:]
    if not left_group or not right_group:
        return None
    return (left_group, right_group)
def orientation_compatible(idx_a, idx_b, ocr):
    """
    Tells whether two OCR quads have compatible shapes for merging.

    Returns False only when one bounding box is clearly tall and narrow
    (width/height below 0.6) while the other is clearly wide
    (width/height above 2.0); every other combination returns True.
    """
    def aspect(idx):
        left, top, right, bottom = quad_bbox(ocr[idx][0])
        return max(1, right - left) / max(1, bottom - top)

    ratio_a = aspect(idx_a)
    ratio_b = aspect(idx_b)
    a_tall_b_wide = ratio_a < 0.6 and ratio_b > 2.0
    b_tall_a_wide = ratio_b < 0.6 and ratio_a > 2.0
    return not (a_tall_b_wide or b_tall_a_wide)
# ============================================================
# WIDE QUAD COLUMN SPLIT — pre-grouping
# ============================================================
def split_wide_quad_by_column_gap(image_bgr, quad, text, conf, med_h,
                                  min_gap_factor=1.8):
    """
    Splits a single OCR quad that spans two text columns at the widest
    blank column run in its pixel projection.

    Args:
        image_bgr: source image the quad was detected in.
        quad: list of ``[x, y]`` points.
        text: recognised text of the quad.
        conf: recognition confidence, copied to both halves.
        med_h: median text-line height in pixels, used to scale thresholds.
        min_gap_factor: a gap must be at least
            ``max(int(med_h * min_gap_factor), 10)`` columns wide.

    Returns:
        Two ``(quad, text, conf)`` tuples when a split is made, otherwise a
        one-element list holding the original entry.
    """
    x1, y1, x2, y2 = quad_bbox(quad)
    w, h = x2 - x1, max(1, y2 - y1)

    # Only attempt if the quad is wide enough to plausibly span two columns.
    if w < med_h * 3.0:
        return [(quad, text, conf)]

    pad = 2
    # The ROI is clamped to the image, so its column 0 is max(0, x1), not x1.
    roi_x0 = max(0, x1)
    roi = image_bgr[max(0, y1 - pad):min(image_bgr.shape[0], y2 + pad),
                    roi_x0:min(image_bgr.shape[1], x2)]
    if roi.size == 0:
        return [(quad, text, conf)]

    gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    v_proj = np.sum(binary, axis=0)

    # A column counts as blank when it holds under 12 % of a full-ink column.
    gap_threshold = h * 255 * 0.12
    min_gap_px = max(int(med_h * min_gap_factor), 10)

    # Collect (centre, width) of each wide blank run closed by an inked column.
    gaps, in_gap, gap_start = [], False, 0
    for x in range(len(v_proj)):
        if v_proj[x] < gap_threshold:
            if not in_gap:
                gap_start, in_gap = x, True
        else:
            if in_gap:
                gw = x - gap_start
                if gw >= min_gap_px:
                    gaps.append((gap_start + gw // 2, gw))
            in_gap = False
    if not gaps:
        return [(quad, text, conf)]

    # Widest gap wins; on ties the leftmost one is used.
    split_x_rel = max(gaps, key=lambda g: g[1])[0]
    # Fix: map the ROI-relative column through the clamped ROI origin, as
    # split_oversized_quad_by_content does, instead of adding it to raw x1.
    split_x_abs = roi_x0 + split_x_rel
    split_offset = split_x_abs - x1

    # Ensure the split produces two non-trivial halves.
    if split_offset < med_h or x2 - split_x_abs < med_h:
        return [(quad, text, conf)]

    if ' ' in text:
        char_w = w / max(1, len(text))
        estimate = int(split_offset / max(1e-6, char_w))
        spaces = [i for i, c in enumerate(text) if c == ' ']
        split_idx = min(spaces, key=lambda i: abs(i - estimate))
    else:
        split_idx = int(len(text) * split_offset / w)

    tl, tr = text[:split_idx].strip(), text[split_idx:].strip()
    if tl and tr:
        return [([[x1, y1], [split_x_abs, y1], [split_x_abs, y2], [x1, y2]], tl, conf),
                ([[split_x_abs, y1], [x2, y1], [x2, y2], [split_x_abs, y2]], tr, conf)]
    return [(quad, text, conf)]
def apply_column_gap_splits(image_bgr, ocr_list, med_h):
    """
    Runs split_wide_quad_by_column_gap over every OCR entry.

    Returns ``(new_ocr_list, splits_made)`` where entries that were split
    are replaced by their parts, and prints a summary line when at least
    one split happened.
    """
    expanded = []
    split_count = 0
    for quad, text, conf in ocr_list:
        pieces = split_wide_quad_by_column_gap(image_bgr, quad, text, conf, med_h)
        split_count += 1 if len(pieces) > 1 else 0
        expanded.extend(pieces)

    if split_count:
        print(f"📐 Column-gap split: {split_count} wide quad(s) split before grouping")
    return expanded, split_count
# ============================================================
# GENERALIZED BOX FIXING FUNCTIONS
# ============================================================
def detect_and_split_multi_bubble_boxes(bubble_boxes, bubble_indices, bubble_quads,
                                        bubbles, ocr, image_bgr):
    """Split grouped boxes that appear to cover more than one speech bubble.

    For every box with at least two quads, three splits are tried in order
    and the first that yields more than one group is used:

    1. by detected bubble contour (``split_indices_by_bubble``);
    2. by vertical gaps larger than twice the median quad height;
    3. for boxes wider than ten median heights, into left/right halves
       around the median quad centre, if the halves are separated by more
       than 1.5 median heights.

    Args:
        bubble_boxes, bubble_indices, bubble_quads, bubbles: dicts keyed by
            box id; read only.
        ocr: list of OCR entries whose first element is the quad.
        image_bgr: page image used for contour detection.

    Returns:
        ``(bubbles, boxes, quads, indices)`` as new dicts, renumbered from 1.
    """
    all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1]) for i in range(len(ocr))]
    med_h = float(np.median(all_h)) if all_h else 14.0
    bubble_contours = detect_speech_bubbles(image_bgr)

    new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {}
    splits_made = []

    def add_group(group):
        # Register a freshly built group under the next sequential id.
        nid = len(new_boxes) + 1
        new_bubbles[nid] = build_lines_from_indices(group, ocr)
        new_boxes[nid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in group])
        new_quads[nid] = [ocr[i][0] for i in group]
        new_indices[nid] = group

    def keep_unchanged(bid, indices):
        # Copy an existing box through under the next sequential id.
        nid = len(new_boxes) + 1
        new_bubbles[nid] = bubbles[bid]
        new_boxes[nid] = bubble_boxes[bid]
        new_quads[nid] = bubble_quads[bid]
        new_indices[nid] = indices

    for bid, indices in bubble_indices.items():
        if len(indices) < 2:
            keep_unchanged(bid, indices)
            continue

        split_groups = split_indices_by_bubble(indices, ocr, bubble_contours)
        if len(split_groups) > 1:
            for group in split_groups:
                if group:
                    add_group(group)
            # Fix: the id and the count used to be printed fused together.
            splits_made.append(f"BOX#{bid} → {len(split_groups)} bubbles")
            continue

        vertical_splits = check_vertical_alignment_split(indices, ocr, threshold=int(med_h * 2.0))
        if len(vertical_splits) > 1:
            for group in vertical_splits:
                if group:
                    add_group(group)
            splits_made.append(f"BOX#{bid} → {len(vertical_splits)} vertical groups")
            continue

        x1, _, x2, _ = bubble_boxes[bid]
        if (x2 - x1) > med_h * 10:
            x_centers = [quad_center(ocr[i][0])[0] for i in indices]
            x_median = np.median(x_centers)
            left_group = [i for i in indices if quad_center(ocr[i][0])[0] < x_median]
            right_group = [i for i in indices if quad_center(ocr[i][0])[0] >= x_median]
            if left_group and right_group:
                left_box = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in left_group])
                right_box = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in right_group])
                if right_box[0] - left_box[2] > med_h * 1.5:
                    add_group(left_group)
                    add_group(right_group)
                    splits_made.append(f"BOX#{bid} → 2 horizontal panels")
                    continue

        keep_unchanged(bid, indices)

    if splits_made:
        print(f"\n🔧 Split {len(splits_made)} multi-bubble box(es):")
        for s in splits_made:
            print(f"{s}")
    return new_bubbles, new_boxes, new_quads, new_indices
def detect_and_merge_fragmented_bubbles(bubble_boxes, bubble_indices, bubble_quads,
                                        bubbles, ocr, image_bgr):
    """Merge boxes that are fragments of the same detected speech bubble.

    Two boxes are paired when both centres fall inside one detected contour
    and the centres are closer than 3 median quad heights horizontally and
    6 vertically. Pairs are merged transitively, so every box ends up in at
    most one merged group.

    Args:
        bubble_boxes, bubble_indices, bubble_quads, bubbles: dicts keyed by
            box id; read only.
        ocr: list of OCR entries whose first element is the quad.
        image_bgr: page image used for contour detection.

    Returns:
        ``(bubbles, boxes, quads, indices)``. The input dicts are returned
        unchanged when nothing is merged; otherwise new dicts renumbered
        from 1, merged groups first.
    """
    all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1]) for i in range(len(ocr))]
    med_h = float(np.median(all_h)) if all_h else 14.0
    bubble_contours = detect_speech_bubbles(image_bgr)

    bids = list(bubble_boxes.keys())
    centers = {
        bid: ((box[0] + box[2]) / 2.0, (box[1] + box[3]) / 2.0)
        for bid, box in bubble_boxes.items()
    }

    to_merge = []
    for pos, bid_i in enumerate(bids):
        cx_i, cy_i = centers[bid_i]
        for bid_j in bids[pos + 1:]:
            cx_j, cy_j = centers[bid_j]
            in_same_bubble = any(
                cv2.pointPolygonTest(c, (cx_i, cy_i), False) >= 0 and
                cv2.pointPolygonTest(c, (cx_j, cy_j), False) >= 0
                for c in bubble_contours
            )
            if in_same_bubble:
                if abs(cx_i - cx_j) < med_h * 3.0 and abs(cy_i - cy_j) < med_h * 6.0:
                    to_merge.append((bid_i, bid_j) if cy_i < cy_j else (bid_j, bid_i))

    if not to_merge:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices
    print(f"\n🔗 Merging {len(to_merge)} fragmented bubble(s):")

    # Union-find over the pairs. The previous first-matching-group scan could
    # leave one id in two groups (pairs (1,4), (2,3), (3,4)), which emitted
    # that box's quads twice.
    parent = {}

    def find(bid):
        parent.setdefault(bid, bid)
        while parent[bid] != bid:
            parent[bid] = parent[parent[bid]]
            bid = parent[bid]
        return bid

    for top, bottom in to_merge:
        root_top, root_bottom = find(top), find(bottom)
        if root_top != root_bottom:
            parent[root_bottom] = root_top

    # Keyed by root; insertion order follows first appearance in to_merge.
    merge_groups = {}
    for pair in to_merge:
        for bid in pair:
            merge_groups.setdefault(find(bid), set()).add(bid)

    new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {}
    merged_bids, next_bid = set(), 1
    for merge_set in merge_groups.values():
        merge_list = sorted(merge_set)
        print(f" ✓ Merging: {', '.join(f'#{b}' for b in merge_list)}")
        all_indices = sorted(set(idx for b in merge_list for idx in bubble_indices[b]))
        merged_bids.update(merge_list)
        new_bubbles[next_bid] = build_lines_from_indices(all_indices, ocr)
        new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in all_indices])
        new_quads[next_bid] = [ocr[i][0] for i in all_indices]
        new_indices[next_bid] = all_indices
        next_bid += 1

    for bid in bids:
        if bid not in merged_bids:
            new_bubbles[next_bid] = bubbles[bid]
            new_boxes[next_bid] = bubble_boxes[bid]
            new_quads[next_bid] = bubble_quads[bid]
            new_indices[next_bid] = bubble_indices[bid]
            next_bid += 1
    return new_bubbles, new_boxes, new_quads, new_indices
def merge_boxes_by_proximity_and_overlap(bubble_boxes, bubble_indices, bubble_quads,
                                         bubbles, ocr, med_h):
    """
    Merges boxes that are stacked closely and share a column.

    Box ids are visited in ascending order. Each box not yet absorbed acts
    as an anchor and absorbs every later box whose vertical gap to the
    anchor is at most ``1.5 * med_h`` and whose horizontal overlap is at
    least 35 % of the narrower box's width. Comparisons always use the
    anchor's original rectangle, not the growing union.

    Returns ``(bubbles, boxes, quads, indices)``: the inputs unchanged when
    nothing merges, otherwise new dicts renumbered from 1 with the merged
    groups first.
    """
    bids = sorted(bubble_boxes.keys())
    merge_map: Dict[int, List[int]] = {}
    absorbed = set()

    for pos, anchor in enumerate(bids):
        if anchor in absorbed:
            continue
        anchor_box = bubble_boxes[anchor]
        anchor_w = anchor_box[2] - anchor_box[0]

        for other in bids[pos + 1:]:
            if other in absorbed:
                continue
            other_box = bubble_boxes[other]
            other_w = other_box[2] - other_box[0]

            # Empty space between the boxes along y (0 when they overlap).
            vert_gap = max(0, max(anchor_box[1], other_box[1]) - min(anchor_box[3], other_box[3]))
            # Shared x extent relative to the narrower of the two boxes.
            shared_w = max(0, min(anchor_box[2], other_box[2]) - max(anchor_box[0], other_box[0]))
            shared_ratio = shared_w / max(1, min(anchor_w, other_w))

            if vert_gap <= med_h * 1.5 and shared_ratio >= 0.35:
                merge_map.setdefault(anchor, [anchor]).append(other)
                absorbed.add(other)

    if not merge_map:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices

    print(f"\n🔀 Proximity+overlap merge: {len(merge_map)} group(s):")
    new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {}
    processed, next_bid = set(), 1

    for group in merge_map.values():
        group_unique = sorted(set(group))
        print(f" ✓ Merging: {', '.join(f'#{b}' for b in group_unique)}")
        all_indices = sorted({idx for b in group_unique for idx in bubble_indices[b]})
        new_bubbles[next_bid] = build_lines_from_indices(all_indices, ocr)
        new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in all_indices])
        new_quads[next_bid] = [ocr[i][0] for i in all_indices]
        new_indices[next_bid] = all_indices
        next_bid += 1
        processed.update(group_unique)

    for bid in bids:
        if bid in processed:
            continue
        new_bubbles[next_bid] = bubbles[bid]
        new_boxes[next_bid] = bubble_boxes[bid]
        new_quads[next_bid] = bubble_quads[bid]
        new_indices[next_bid] = bubble_indices[bid]
        next_bid += 1
    return new_bubbles, new_boxes, new_quads, new_indices
def auto_fix_bubble_detection(bubble_boxes, bubble_indices, bubble_quads,
                              bubbles, ocr, image_bgr):
    """Run the three automatic box-repair passes in sequence.

    Order: split multi-bubble boxes, merge fragments that share a detected
    contour, then merge by proximity and horizontal overlap. Returns
    ``(bubbles, boxes, quads, indices)`` from the last pass.
    """
    print("\n🔍 Running automatic bubble detection fixes...")

    quad_heights = []
    for entry in ocr:
        _, top, _, bottom = quad_bbox(entry[0])
        quad_heights.append(max(1, bottom - top))
    med_h = float(np.median(quad_heights)) if quad_heights else 14.0

    state = (bubbles, bubble_boxes, bubble_quads, bubble_indices)
    for contour_pass in (detect_and_split_multi_bubble_boxes,
                         detect_and_merge_fragmented_bubbles):
        cur_bubbles, cur_boxes, cur_quads, cur_indices = state
        state = contour_pass(cur_boxes, cur_indices, cur_quads, cur_bubbles, ocr, image_bgr)

    # Final pass needs no contours: it works from box geometry alone.
    cur_bubbles, cur_boxes, cur_quads, cur_indices = state
    bubbles, bubble_boxes, bubble_quads, bubble_indices = merge_boxes_by_proximity_and_overlap(
        cur_boxes, cur_indices, cur_quads, cur_bubbles, ocr, med_h)
    return bubbles, bubble_boxes, bubble_quads, bubble_indices
def remove_nested_boxes(bubble_boxes, bubble_indices, bubble_quads, bubbles,
                        overlap_threshold=0.50):
    """Drop the smaller of any two boxes that overlap heavily or share quads.

    Two boxes conflict when ``boxes_overlap_ratio`` exceeds
    *overlap_threshold* or their index lists intersect; the one with the
    smaller area is removed (the later one on ties). The four dicts are
    modified in place and also returned as
    ``(bubbles, boxes, quads, indices)``.
    """
    def area_of(box):
        return max(0, box[2] - box[0]) * max(0, box[3] - box[1])

    bids = list(bubble_boxes.keys())
    doomed = set()

    for pos, outer in enumerate(bids):
        if outer in doomed:
            continue
        outer_box = bubble_boxes[outer]
        outer_area = area_of(outer_box)

        for inner in bids[pos + 1:]:
            if inner in doomed:
                continue
            inner_box = bubble_boxes[inner]
            inner_area = area_of(inner_box)
            shares_quads = not set(bubble_indices[outer]).isdisjoint(bubble_indices[inner])
            overlap = boxes_overlap_ratio(outer_box, inner_box)
            if not (overlap > overlap_threshold or shares_quads):
                continue

            if outer_area >= inner_area:
                doomed.add(inner)
                print(f" 🗑️ Removing BOX#{inner} (overlaps BOX#{outer})")
            else:
                doomed.add(outer)
                print(f" 🗑️ Removing BOX#{outer} (overlaps BOX#{inner})")
                # The outer box is gone; stop comparing it with others.
                break

    if doomed:
        print(f"\n🧹 Removed {len(doomed)} overlapping/nested box(es)")
        for bid in doomed:
            for table in (bubble_boxes, bubble_indices, bubble_quads, bubbles):
                table.pop(bid, None)
    return bubbles, bubble_boxes, bubble_quads, bubble_indices
def enforce_max_box_size(bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr,
                         max_width_ratio=0.6, max_height_ratio=0.5, image_shape=None):
    """Try to split boxes that exceed a fraction of the page size.

    A box wider than ``max_width_ratio`` of the image width or taller than
    ``max_height_ratio`` of its height is passed to the column splitter and,
    if that returns nothing, to the row splitter. Boxes that are within
    limits or cannot be split are kept. Without *image_shape* the inputs
    are returned unchanged; otherwise new dicts renumbered from 1 are
    returned as ``(bubbles, boxes, quads, indices)``.
    """
    if image_shape is None:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices

    page_h, page_w = image_shape[:2]
    width_limit = page_w * max_width_ratio
    height_limit = page_h * max_height_ratio

    new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {}
    next_bid, splits_made = 1, []

    for bid, box in bubble_boxes.items():
        x1, y1, x2, y2 = box
        w, h = x2 - x1, y2 - y1

        parts = None
        if w > width_limit or h > height_limit:
            indices = bubble_indices[bid]
            # Columns first; rows are only tried when no column split exists.
            parts = (
                split_bubble_if_multiple_columns(indices, ocr, bid=bid,
                                                 use_aggressive_thresholds=True)
                or split_bubble_if_multiple_rows(indices, ocr, bid=bid)
            )

        if parts:
            for grp in parts:
                new_bubbles[next_bid] = build_lines_from_indices(grp, ocr)
                new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in grp])
                new_quads[next_bid] = [ocr[i][0] for i in grp]
                new_indices[next_bid] = grp
                next_bid += 1
            splits_made.append(f"BOX#{bid} (oversized: {w}x{h}px)")
            continue

        new_bubbles[next_bid] = bubbles[bid]
        new_boxes[next_bid] = box
        new_quads[next_bid] = bubble_quads[bid]
        new_indices[next_bid] = bubble_indices[bid]
        next_bid += 1

    if splits_made:
        print(f"\n📏 Split {len(splits_made)} oversized box(es):")
        for s in splits_made:
            print(f"{s}")
    return new_bubbles, new_boxes, new_quads, new_indices
def should_merge_groups(group1_indices, group2_indices, ocr, median_height,
                        max_vertical_gap=None):
    """Return True when two quad groups are stacked closely in one column.

    The groups' union boxes must have horizontal centres within
    ``1.8 * median_height`` of each other and a vertical gap of at most
    *max_vertical_gap* (default ``2.5 * median_height``). An empty group
    yields False.
    """
    gap_limit = median_height * 2.5 if max_vertical_gap is None else max_vertical_gap

    def union_of(group):
        return boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in group])

    first = union_of(group1_indices)
    second = union_of(group2_indices)
    if first is None or second is None:
        return False

    first_cx = (first[0] + first[2]) / 2.0
    second_cx = (second[0] + second[2]) / 2.0
    if abs(first_cx - second_cx) > median_height * 1.8:
        return False

    vertical_gap = max(0, max(first[1], second[1]) - min(first[3], second[3]))
    return vertical_gap <= gap_limit
# ============================================================
# ENHANCED OCR ENGINE
# ============================================================
class ImprovedMacVisionDetector:
def __init__(self, source_lang="en"):
lang_key = source_lang.lower().strip()
lang_map = {
"en": "en-US", "english": "en-US",
"es": "es-ES", "spanish": "es-ES",
"ca": "ca-ES", "catalan": "ca-ES",
"fr": "fr-FR", "french": "fr-FR",
"ja": "ja-JP", "japanese": "ja-JP",
"it": "it-IT", "italian": "it-IT",
"de": "de-DE", "german": "de-DE",
"ko": "ko-KR", "korean": "ko-KR",
"zh": "zh-Hans", "chinese": "zh-Hans"
}
self.langs = [lang_map.get(lang_key, "en-US")]
print(f"⚡ Using Enhanced Apple Vision OCR (Language: {self.langs[0]})")
def preprocess_variants(self, image_bgr):
variants = [("enhanced", enhance_image_for_ocr(image_bgr, upscale_factor=2.5))]
gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
_, hc = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
variants.append(("high_contrast", cv2.cvtColor(
cv2.resize(hc, None, fx=2.5, fy=2.5, interpolation=cv2.INTER_CUBIC),
cv2.COLOR_GRAY2BGR)))
variants.append(("bilateral", cv2.resize(
cv2.bilateralFilter(image_bgr, 9, 75, 75),
None, fx=2.5, fy=2.5, interpolation=cv2.INTER_CUBIC)))
variants.append(("inverted", cv2.resize(
cv2.bitwise_not(image_bgr),
None, fx=2.5, fy=2.5, interpolation=cv2.INTER_CUBIC)))
variants.append(("original", cv2.resize(
image_bgr, None, fx=2.5, fy=2.5, interpolation=cv2.INTER_CUBIC)))
return variants
def run_vision_ocr(self, image_bgr):
if image_bgr is None or image_bgr.size == 0:
return []
ih, iw = image_bgr.shape[:2]
success, buffer = cv2.imencode('.png', image_bgr)
if not success:
return []
ns_data = NSData.dataWithBytes_length_(buffer.tobytes(), len(buffer.tobytes()))
handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(ns_data, None)
results = []
def completion_handler(request, error):
if error: return
for obs in request.results():
candidate = obs.topCandidates_(1)[0]
text, confidence = candidate.string(), candidate.confidence()
bbox = obs.boundingBox()
x = bbox.origin.x * iw
y_bl = bbox.origin.y * ih
w = bbox.size.width * iw
h = bbox.size.height * ih
y = ih - y_bl - h
quad = [[int(x),int(y)],[int(x+w),int(y)],
[int(x+w),int(y+h)],[int(x),int(y+h)]]
results.append((quad, text, confidence))
req = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(completion_handler)
req.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate)
req.setUsesLanguageCorrection_(False)
req.setRecognitionLanguages_(self.langs)
req.setAutomaticallyDetectsLanguage_(True)
handler.performRequests_error_([req], None)
return results
def merge_multi_pass_results(self, all_results, original_shape):
    """Fuse the detections of several OCR passes into one result list.

    ``all_results`` is a list of ``(variant_name, detections)`` pairs where
    each detection is ``(quad, text, confidence)``. Quad coordinates are
    divided by 2.5 before comparison. Detections whose bounding boxes
    overlap the cluster seed with IoU above 0.5 are grouped; each group
    yields the quad and confidence of its most confident member and the
    text with the highest summed confidence (compared after
    ``normalize_text``), passed through ``fix_common_ocr_errors``.

    ``original_shape`` is accepted but not used by this method.
    """
    if not all_results:
        return []

    scale_factor = 2.5
    detections = []
    for variant_name, results in all_results:
        for quad, text, conf in results:
            scaled_quad = [[int(p[0] / scale_factor), int(p[1] / scale_factor)]
                           for p in quad]
            detections.append((scaled_quad, text, conf, variant_name))

    def iou_exceeds(quad_a, quad_b, threshold=0.5):
        box_a, box_b = quad_bbox(quad_a), quad_bbox(quad_b)
        left, top = max(box_a[0], box_b[0]), max(box_a[1], box_b[1])
        right, bottom = min(box_a[2], box_b[2]), min(box_a[3], box_b[3])
        if right < left or bottom < top:
            return False
        inter = (right - left) * (bottom - top)
        area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
        area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
        union = area_a + area_b - inter
        return inter / max(union, 1) > threshold

    # Greedy clustering: each unclaimed detection seeds a cluster and pulls
    # in every other unclaimed detection that overlaps the seed itself.
    clusters = []
    claimed = set()
    for seed_pos, seed in enumerate(detections):
        if seed_pos in claimed:
            continue
        claimed.add(seed_pos)
        members = [seed]
        for other_pos, other in enumerate(detections):
            if other_pos in claimed or other_pos == seed_pos:
                continue
            if iou_exceeds(seed[0], other[0]):
                members.append(other)
                claimed.add(other_pos)
        clusters.append(members)

    merged = []
    for members in clusters:
        ranked = sorted(members, key=lambda d: d[2], reverse=True)
        top_quad, top_text, top_conf, _ = ranked[0]

        # Confidence-weighted vote over the normalised text readings.
        votes = {}
        for _, text, conf, _ in ranked:
            key = normalize_text(text)
            if key:
                votes[key] = votes.get(key, 0) + conf
        if votes:
            winner = max(votes, key=votes.get)
            if winner != normalize_text(top_text):
                top_text = winner

        merged.append((top_quad, fix_common_ocr_errors(top_text), top_conf))
    return merged
def read(self, image_path_or_array):
    """Run the multi-pass OCR on an image path or an already-loaded array.

    A string argument is loaded with ``cv2.imread``; anything else is used
    as the image directly. Every variant from ``preprocess_variants`` is
    recognised with ``run_vision_ocr``; passes that return nothing are
    dropped and the rest are combined by ``merge_multi_pass_results``.
    Returns ``[]`` when the image is missing or empty.
    """
    if isinstance(image_path_or_array, str):
        img = cv2.imread(image_path_or_array)
    else:
        img = image_path_or_array
    if img is None or img.size == 0:
        return []

    passes = []
    for name, variant in self.preprocess_variants(img):
        detections = self.run_vision_ocr(variant)
        if detections:
            passes.append((name, detections))
    return self.merge_multi_pass_results(passes, img.shape)
class MacVisionDetector:
    """Single-pass text detector built on Apple's Vision framework."""

    def __init__(self, source_lang="en"):
        """Select the Vision recognition language for ``source_lang``.

        ``source_lang`` may be a short code ("en") or an English language
        name ("english"); matching ignores case and surrounding whitespace.
        Unknown, empty or ``None`` values fall back to "en-US".
        """
        # ``or ""`` keeps a None argument from crashing on .lower().
        lang_key = (source_lang or "").lower().strip()
        lang_map = {
            "en": "en-US", "english": "en-US",
            "es": "es-ES", "spanish": "es-ES",
            "ca": "ca-ES", "catalan": "ca-ES",
            "fr": "fr-FR", "french": "fr-FR",
            "ja": "ja-JP", "japanese": "ja-JP",
            "it": "it-IT", "italian": "it-IT",
            "de": "de-DE", "german": "de-DE",
            "ko": "ko-KR", "korean": "ko-KR",
            "zh": "zh-Hans", "chinese": "zh-Hans"
        }
        self.langs = [lang_map.get(lang_key, "en-US")]
        print(f"⚡ Using Apple Vision OCR (Language: {self.langs[0]})")

    def read(self, image_path_or_array):
        """Recognise text in an image path or an already-loaded array.

        Returns a list of ``(quad, text, confidence)`` tuples, where
        ``quad`` is four ``[x, y]`` integer corners (top-left, top-right,
        bottom-right, bottom-left) in pixel coordinates of the image.
        Returns ``[]`` for a missing/empty image or a failed PNG encode.
        """
        if isinstance(image_path_or_array, str):
            img = cv2.imread(image_path_or_array)
        else:
            img = image_path_or_array
        if img is None or img.size == 0:
            return []
        ih, iw = img.shape[:2]
        success, buffer = cv2.imencode('.png', img)
        if not success:
            return []

        # Serialise the encoded buffer once instead of copying it twice.
        png_bytes = buffer.tobytes()
        ns_data = NSData.dataWithBytes_length_(png_bytes, len(png_bytes))
        handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(ns_data, None)
        results = []

        def completion_handler(request, error):
            if error:
                return
            # results() may be None when nothing was produced.
            for obs in request.results() or []:
                candidates = obs.topCandidates_(1)
                # Skip observations without any candidate instead of
                # raising IndexError inside the callback.
                if candidates is None or len(candidates) == 0:
                    continue
                candidate = candidates[0]
                text, confidence = candidate.string(), candidate.confidence()
                bbox = obs.boundingBox()
                # Scale the box by the image size and flip it vertically
                # (it is treated as normalised, bottom-left origin).
                x = bbox.origin.x * iw
                y_bl = bbox.origin.y * ih
                w = bbox.size.width * iw
                h = bbox.size.height * ih
                y = ih - y_bl - h
                quad = [[int(x), int(y)], [int(x + w), int(y)],
                        [int(x + w), int(y + h)], [int(x), int(y + h)]]
                results.append((quad, text, confidence))

        req = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(completion_handler)
        req.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate)
        req.setUsesLanguageCorrection_(True)
        req.setRecognitionLanguages_(self.langs)
        req.setAutomaticallyDetectsLanguage_(True)
        handler.performRequests_error_([req], None)
        return results
def split_bubble_if_multiple_columns(indices, ocr, bid=None, use_aggressive_thresholds=False):
    """Split a group of OCR items at its widest horizontal gap, if any.

    The x-centres of the items are sorted and the largest gap between
    neighbours is looked for. The gap must exceed ``max(1.5 * median
    height, 22)``, or ``max(1.2 * median height, 18)`` when
    ``use_aggressive_thresholds`` is set. ``bid`` is accepted but unused.

    Returns ``(left_indices, right_indices)`` or ``None`` when there are
    fewer than two items, no qualifying gap, or one side would be empty.
    """
    if len(indices) < 2:
        return None

    boxes = [quad_bbox(ocr[idx][0]) for idx in indices]
    heights = [max(1, box[3] - box[1]) for box in boxes]
    median_height = float(np.median(heights)) if heights else 12.0

    centers_x = [(box[0] + box[2]) / 2.0 for box in boxes]
    ordered = sorted(centers_x)

    if use_aggressive_thresholds:
        min_gap = max(median_height * 1.2, 18)
    else:
        min_gap = max(median_height * 1.5, 22)

    # Strict '>' keeps the first of several equally wide gaps.
    widest, widest_at = 0.0, None
    for pos, (lo, hi) in enumerate(zip(ordered, ordered[1:])):
        gap = hi - lo
        if gap > min_gap and gap > widest:
            widest, widest_at = gap, pos
    if widest_at is None:
        return None

    cut_x = (ordered[widest_at] + ordered[widest_at + 1]) / 2.0
    left_side = [idx for idx, cx in zip(indices, centers_x) if cx < cut_x]
    right_side = [idx for idx, cx in zip(indices, centers_x) if cx >= cut_x]
    if not left_side or not right_side:
        return None
    return (left_side, right_side)
def split_bubble_if_multiple_rows(indices, ocr, bid=None):
    """Split a group of OCR items at its widest vertical gap, if any.

    The y-centres of the items are sorted and the largest gap between
    neighbours is looked for; it must exceed ``max(2 * median height, 30)``.
    ``bid`` is accepted but unused.

    Returns ``(top_indices, bottom_indices)`` or ``None`` when there are
    fewer than two items, no qualifying gap, or one side would be empty.
    """
    if len(indices) < 2:
        return None

    boxes = [quad_bbox(ocr[idx][0]) for idx in indices]
    heights = [max(1, box[3] - box[1]) for box in boxes]
    median_height = float(np.median(heights)) if heights else 12.0

    centers_y = [(box[1] + box[3]) / 2.0 for box in boxes]
    ordered = sorted(centers_y)
    min_gap = max(median_height * 2.0, 30)

    # Strict '>' keeps the first of several equally wide gaps.
    widest, widest_at = 0.0, None
    for pos, (lo, hi) in enumerate(zip(ordered, ordered[1:])):
        gap = hi - lo
        if gap > min_gap and gap > widest:
            widest, widest_at = gap, pos
    if widest_at is None:
        return None

    cut_y = (ordered[widest_at] + ordered[widest_at + 1]) / 2.0
    upper = [idx for idx, cy in zip(indices, centers_y) if cy < cut_y]
    lower = [idx for idx, cy in zip(indices, centers_y) if cy >= cut_y]
    if not upper or not lower:
        return None
    return (upper, lower)
def split_cluster_by_big_vertical_gap(indices, ocr, factor=1.9, min_gap=22):
    """Split a cluster where consecutive boxes are separated by a tall gap.

    Items are ordered by vertical centre; the gap between one box's bottom
    edge and the next box's top edge is measured. The largest gap above
    ``max(median height * factor, min_gap)`` becomes the split point.

    Returns ``(top_indices, bottom_indices)`` in top-to-bottom order, or
    ``None`` when there are fewer than two items or no gap qualifies.
    """
    if len(indices) < 2:
        return None

    entries = [(idx, quad_bbox(ocr[idx][0])) for idx in indices]
    heights = [max(1, box[3] - box[1]) for _, box in entries]
    median_height = float(np.median(heights)) if heights else 12.0

    entries.sort(key=lambda entry: (entry[1][1] + entry[1][3]) / 2.0)
    limit = max(median_height * factor, min_gap)

    largest, cut_after = 0.0, None
    for pos in range(len(entries) - 1):
        upper_box = entries[pos][1]
        lower_box = entries[pos + 1][1]
        gap = lower_box[1] - upper_box[3]
        if gap > limit and gap > largest:
            largest, cut_after = gap, pos
    if cut_after is None:
        return None

    upper = [idx for idx, _ in entries[:cut_after + 1]]
    lower = [idx for idx, _ in entries[cut_after + 1:]]
    if not upper or not lower:
        return None
    return (upper, lower)
def is_vertical_text_like(indices, ocr):
    """Heuristic: does this group of OCR items look like a vertical stack?

    Returns True only when there are at least two items, the median box
    height is at least 1.2x the median width, and the vertical spread of
    the box centres is at least 1.5x their horizontal spread.
    """
    if len(indices) < 2:
        return False

    boxes = [quad_bbox(ocr[idx][0]) for idx in indices]
    median_height = float(np.median([max(1, box[3] - box[1]) for box in boxes]))
    median_width = float(np.median([max(1, box[2] - box[0]) for box in boxes]))
    if median_height < median_width * 1.2:
        return False

    centers_x = [(box[0] + box[2]) / 2.0 for box in boxes]
    centers_y = [(box[1] + box[3]) / 2.0 for box in boxes]
    spread_x = max(centers_x) - min(centers_x)
    spread_y = max(centers_y) - min(centers_y)
    return not (spread_y < spread_x * 1.5)
def split_nested_or_side_by_side(indices, ocr):
    """Split a group of OCR items into left/right halves at the median x.

    The cut is placed midway between the two middle x-centres of the
    sorted centres. Returns ``(left_indices, right_indices)`` or ``None``
    when there are fewer than two items or one side would be empty.

    NOTE(review): no minimum gap is required here, so any group whose
    x-centres are not all on one side of the cut is split. Confirm that
    this is the intended behaviour for callers.
    """
    if len(indices) < 2:
        return None

    centers_x = []
    for idx in indices:
        box = quad_bbox(ocr[idx][0])
        centers_x.append((box[0] + box[2]) / 2.0)

    ordered = sorted(centers_x)
    middle = len(ordered) // 2
    cut_x = (ordered[middle - 1] + ordered[middle]) / 2.0

    left_side = [idx for idx, cx in zip(indices, centers_x) if cx < cut_x]
    right_side = [idx for idx, cx in zip(indices, centers_x) if cx >= cut_x]
    if not left_side or not right_side:
        return None
    return (left_side, right_side)
def split_panel_box(image_bgr, box_xyxy, bubble_quads=None):
    """Look for a vertical edge near the middle of a box to split it on.

    The box is clamped to the image, Canny edges are computed on the crop
    and summed per column. Within the central 35%-65% of the box width,
    the columns at or above the 85th percentile of edge energy are
    collected and their median x (in image coordinates) is the split.

    Returns ``(x1, x2, split_x)`` with the clamped x-range, or ``None``
    when the box is degenerate, narrower than 100 px, or (if
    ``bubble_quads`` is given) all quad centres fall on one side.
    """
    x1, y1, x2, y2 = box_xyxy
    img_h, img_w = image_bgr.shape[:2]
    x1 = max(0, x1)
    y1 = max(0, y1)
    x2 = min(img_w - 1, x2)
    y2 = min(img_h - 1, y2)
    if x2 <= x1 or y2 <= y1:
        return None

    crop = image_bgr[y1:y2, x1:x2]
    if crop.size == 0:
        return None

    edges = cv2.Canny(cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY), 50, 150)
    column_energy = np.sum(edges, axis=0)

    width = x2 - x1
    if width < 100:
        return None
    band_start = int(width * 0.35)
    band_end = int(width * 0.65)
    if band_end <= band_start:
        return None
    band = column_energy[band_start:band_end]
    if len(band) == 0:
        return None

    cutoff = np.percentile(band, 85)
    strong_offsets = np.flatnonzero(band >= cutoff)
    if strong_offsets.size == 0:
        return None
    split_x = int(np.median(strong_offsets + (x1 + band_start)))

    if bubble_quads:
        on_left = sum(1 for quad in bubble_quads if quad_center(quad)[0] < split_x)
        if on_left == 0 or on_left == len(bubble_quads):
            return None
    return (x1, x2, split_x)
# ============================================================
# MERGE CLOSE BUBBLES
# ============================================================
def merge_close_bubbles_by_line_height(bubbles, bubble_boxes, bubble_quads,
                                       bubble_indices, ocr):
    """Merge bubble boxes that sit very close together in the same column.

    The tolerance is ``max(8, 1.4 * median OCR line height)``. Two boxes
    are merged when their horizontal and vertical gaps are both within the
    tolerance and their horizontal overlap covers at least 25% of the
    narrower box. Comparison is always against the earlier (lower-id)
    box's original extent; a box absorbed once is not considered again.

    Returns ``(bubbles, bubble_boxes, bubble_quads, bubble_indices)``.
    The inputs are returned unchanged when nothing merges; otherwise new
    dicts are returned with ids renumbered from 1 in ascending order of
    the surviving original ids.
    """
    if not bubbles:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices

    line_heights = []
    for pos in range(len(ocr)):
        box = quad_bbox(ocr[pos][0])
        line_heights.append(max(1, box[3] - box[1]))
    med_h = float(np.median(line_heights)) if line_heights else 14.0
    merge_tol = max(8, med_h * 1.4)

    bids = sorted(bubble_boxes.keys())
    absorbed = set()
    groups = {}
    for pos, anchor in enumerate(bids):
        if anchor in absorbed:
            continue
        ax1, ay1, ax2, ay2 = bubble_boxes[anchor]
        anchor_w = ax2 - ax1
        for other in bids[pos + 1:]:
            if other in absorbed:
                continue
            bx1, by1, bx2, by2 = bubble_boxes[other]
            other_w = bx2 - bx1

            inner_left, inner_right = max(ax1, bx1), min(ax2, bx2)
            gap_x = max(0, inner_left - inner_right)
            gap_y = max(0, max(ay1, by1) - min(ay2, by2))

            # Share of the narrower box covered by the horizontal overlap.
            overlap_w = max(0, inner_right - inner_left)
            overlap_ratio = overlap_w / max(1, min(anchor_w, other_w))

            # All three conditions must hold, so boxes that are merely
            # side by side (no shared column) are never merged.
            if gap_x <= merge_tol and gap_y <= merge_tol and overlap_ratio >= 0.25:
                groups.setdefault(anchor, [anchor]).append(other)
                absorbed.add(other)

    if not groups:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices

    out_bubbles, out_boxes, out_quads, out_indices = {}, {}, {}, {}
    survivors = [bid for bid in bids if bid not in absorbed]
    for new_bid, bid in enumerate(survivors, start=1):
        if bid in groups:
            combined = sorted(set(idx for member in groups[bid]
                                  for idx in bubble_indices[member]))
            out_bubbles[new_bid] = build_lines_from_indices(combined, ocr)
            out_boxes[new_bid] = boxes_union_xyxy(
                [quad_bbox(ocr[idx][0]) for idx in combined])
            out_quads[new_bid] = [ocr[idx][0] for idx in combined]
            out_indices[new_bid] = combined
        else:
            out_bubbles[new_bid] = bubbles[bid]
            out_boxes[new_bid] = bubble_boxes[bid]
            out_quads[new_bid] = bubble_quads[bid]
            out_indices[new_bid] = bubble_indices[bid]
    return out_bubbles, out_boxes, out_quads, out_indices
# ============================================================
# WIDE / BRIDGE QUAD SPLITTING
# ============================================================
def split_wide_ocr_items(image_bgr, ocr_list, width_factor=8.0):
    """Split unusually wide OCR items at their widest internal blank gap.

    An item is a candidate when its box width exceeds ``width_factor``
    times the median box height. Its image region is Otsu-binarised
    (inverted) and summed per column; runs of columns whose sum is below
    15% of full ink count as gaps when at least ``max(int(0.6 * median
    height), 12)`` columns wide and followed by an inked column. The item
    is cut at the centre of the widest gap and its text is divided at the
    nearest space (or proportionally when it has no spaces). The split is
    kept only when both text halves are non-empty.

    Returns ``(new_ocr_list, number_of_splits)``.
    """
    if not ocr_list:
        return ocr_list, 0

    heights = []
    for quad, _, _ in ocr_list:
        box = quad_bbox(quad)
        heights.append(max(1, box[3] - box[1]))
    med_h = float(np.median(heights)) if heights else 14.0
    min_gap_width = max(int(med_h * 0.6), 12)
    pad = 2

    def find_gaps(column_ink, ink_limit):
        # (centre, width) of every wide-enough low-ink run that is closed
        # by an inked column; a run reaching the right edge is ignored.
        found = []
        run_start = None
        for col in range(len(column_ink)):
            if column_ink[col] < ink_limit:
                if run_start is None:
                    run_start = col
            elif run_start is not None:
                run_width = col - run_start
                if run_width >= min_gap_width:
                    found.append((run_start + run_width // 2, run_width))
                run_start = None
        return found

    def cut_text(text, rel_x, width):
        # Map the pixel offset of the cut onto a character position.
        if ' ' in text:
            char_w = width / max(1, len(text))
            guess = int(rel_x / max(1e-6, char_w))
            spaces = [pos for pos, ch in enumerate(text) if ch == ' ']
            if spaces:
                guess = min(spaces, key=lambda pos: abs(pos - guess))
        else:
            guess = int(len(text) * rel_x / width)
        return text[:guess].strip(), text[guess:].strip()

    output = []
    split_count = 0
    for quad, text, conf in ocr_list:
        x1, y1, x2, y2 = quad_bbox(quad)
        w = x2 - x1
        halves = None
        if w > med_h * width_factor:
            roi = image_bgr[max(0, y1 - pad):min(image_bgr.shape[0], y2 + pad),
                            max(0, x1):min(image_bgr.shape[1], x2)]
            if roi.size > 0:
                gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
                _, binary = cv2.threshold(
                    gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
                gaps = find_gaps(np.sum(binary, axis=0),
                                 roi.shape[0] * 255 * 0.15)
                if gaps:
                    # max() keeps the first of several equally wide gaps.
                    widest_center = max(gaps, key=lambda g: g[1])[0]
                    split_x_abs = max(0, x1) + widest_center
                    left_text, right_text = cut_text(text, split_x_abs - x1, w)
                    if left_text and right_text:
                        halves = [
                            ([[x1, y1], [split_x_abs, y1],
                              [split_x_abs, y2], [x1, y2]], left_text, conf),
                            ([[split_x_abs, y1], [x2, y1],
                              [x2, y2], [split_x_abs, y2]], right_text, conf),
                        ]
        if halves is None:
            output.append((quad, text, conf))
        else:
            output.extend(halves)
            split_count += 1
    return output, split_count
def split_abnormal_bridge_quads(image_bgr, ocr_list, aspect_ratio_threshold=6.0):
    """Split OCR items with an extreme width/height ratio at a blank gap.

    An item is a candidate when ``width / height`` exceeds
    ``aspect_ratio_threshold``. Its image region is Otsu-binarised
    (inverted) and summed per column; runs of columns whose sum is below
    ``height * 255 * 0.20`` count as gaps when at least ``max(int(0.8 *
    median height), 15)`` columns wide and followed by an inked column.
    The item is cut at the centre of the widest gap and its text is
    divided at the nearest space (or proportionally when it has no
    spaces). The split is kept only when both text halves are non-empty.

    Returns ``(new_ocr_list, number_of_splits)``.
    """
    if not ocr_list:
        return ocr_list, 0

    heights = []
    for quad, _, _ in ocr_list:
        box = quad_bbox(quad)
        heights.append(max(1, box[3] - box[1]))
    med_h = float(np.median(heights)) if heights else 14.0
    min_gap_width = max(int(med_h * 0.8), 15)
    pad = 2

    def find_gaps(column_ink, ink_limit):
        # (centre, width) of every wide-enough low-ink run that is closed
        # by an inked column; a run reaching the right edge is ignored.
        found = []
        run_start = None
        for col in range(len(column_ink)):
            if column_ink[col] < ink_limit:
                if run_start is None:
                    run_start = col
            elif run_start is not None:
                run_width = col - run_start
                if run_width >= min_gap_width:
                    found.append((run_start + run_width // 2, run_width))
                run_start = None
        return found

    def cut_text(text, rel_x, width):
        # Map the pixel offset of the cut onto a character position.
        if ' ' in text:
            char_w = width / max(1, len(text))
            guess = int(rel_x / max(1e-6, char_w))
            spaces = [pos for pos, ch in enumerate(text) if ch == ' ']
            if spaces:
                guess = min(spaces, key=lambda pos: abs(pos - guess))
        else:
            guess = int(len(text) * rel_x / width)
        return text[:guess].strip(), text[guess:].strip()

    output = []
    split_count = 0
    for quad, text, conf in ocr_list:
        x1, y1, x2, y2 = quad_bbox(quad)
        w, h = x2 - x1, max(1, y2 - y1)
        halves = None
        if w / h > aspect_ratio_threshold:
            roi = image_bgr[max(0, y1 - pad):min(image_bgr.shape[0], y2 + pad),
                            max(0, x1):min(image_bgr.shape[1], x2)]
            if roi.size > 0:
                gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
                _, binary = cv2.threshold(
                    gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
                gaps = find_gaps(np.sum(binary, axis=0), h * 255 * 0.20)
                if gaps:
                    # max() keeps the first of several equally wide gaps.
                    widest_center = max(gaps, key=lambda g: g[1])[0]
                    split_x_abs = max(0, x1) + widest_center
                    left_text, right_text = cut_text(text, split_x_abs - x1, w)
                    if left_text and right_text:
                        halves = [
                            ([[x1, y1], [split_x_abs, y1],
                              [split_x_abs, y2], [x1, y2]], left_text, conf),
                            ([[split_x_abs, y1], [x2, y1],
                              [x2, y2], [split_x_abs, y2]], right_text, conf),
                        ]
        if halves is None:
            output.append((quad, text, conf))
        else:
            output.extend(halves)
            split_count += 1
    return output, split_count
def normalize_ocr_quads(ocr_list):
    """Replace every quad by its bounding box grown by 3 px per side.

    Returns a new list of ``(quad, text, conf)`` tuples whose quad is the
    padded rectangle, ordered top-left, top-right, bottom-right,
    bottom-left. Coordinates are not clamped.
    """
    pad = 3

    def padded_rect(quad):
        x1, y1, x2, y2 = quad_bbox(quad)
        left, top = x1 - pad, y1 - pad
        right, bottom = x2 + pad, y2 + pad
        return [[left, top], [right, top], [right, bottom], [left, bottom]]

    return [(padded_rect(quad), text, conf) for quad, text, conf in ocr_list]
# ============================================================
# VISION RE-READ
# ============================================================
def preprocess_variant(crop_bgr, mode):
    """Return a single-channel preprocessing of ``crop_bgr`` for ``mode``.

    Supported modes: "raw" (plain grayscale), "clahe", "adaptive"
    (Gaussian blur + adaptive threshold), "otsu" (Gaussian blur + Otsu),
    "invert", "bilateral" (bilateral filter + Otsu) and "morph_open"
    (Otsu + 2x2 morphological opening). Any other mode returns the plain
    grayscale image.
    """
    gray = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2GRAY)
    otsu_flags = cv2.THRESH_BINARY + cv2.THRESH_OTSU

    if mode == "clahe":
        equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        return equalizer.apply(gray)

    if mode == "adaptive":
        smoothed = cv2.GaussianBlur(gray, (3, 3), 0)
        return cv2.adaptiveThreshold(smoothed, 255,
                                     cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                     cv2.THRESH_BINARY, 35, 11)

    if mode == "invert":
        return 255 - gray

    if mode in ("otsu", "bilateral", "morph_open"):
        # These three differ only in what is fed to Otsu and in the
        # optional opening applied afterwards.
        if mode == "otsu":
            source = cv2.GaussianBlur(gray, (3, 3), 0)
        elif mode == "bilateral":
            source = cv2.bilateralFilter(gray, 7, 60, 60)
        else:
            source = gray
        _, binary = cv2.threshold(source, 0, 255, otsu_flags)
        if mode == "morph_open":
            return cv2.morphologyEx(binary, cv2.MORPH_OPEN,
                                    np.ones((2, 2), np.uint8))
        return binary

    # "raw" and unrecognised modes.
    return gray
def rotate_image_keep_bounds(img, angle_deg):
    """Rotate ``img`` about its centre, enlarging the canvas to fit.

    The output size is the bounding box of the rotated image, so no
    content is cropped. Uncovered corners are filled with white.

    Args:
        img: image array; ``img.shape[:2]`` is read as (height, width).
        angle_deg: rotation angle in degrees, passed to
            ``cv2.getRotationMatrix2D``.

    Returns:
        The rotated image produced by ``cv2.warpAffine`` (cubic
        interpolation).
    """
    h, w = img.shape[:2]
    c = (w / 2, h / 2)
    M = cv2.getRotationMatrix2D(c, angle_deg, 1.0)
    cos, sin = abs(M[0, 0]), abs(M[0, 1])
    new_w = int((h * sin) + (w * cos))
    new_h = int((h * cos) + (w * sin))
    # Shift the transform so the rotated image is centred on the new canvas.
    M[0, 2] += (new_w / 2) - c[0]
    M[1, 2] += (new_h / 2) - c[1]
    # A bare scalar border value only sets the first channel, which fills
    # the corners of a 3-channel image with (255, 0, 0) instead of white.
    # A full tuple gives white for both grayscale and colour inputs.
    return cv2.warpAffine(img, M, (new_w, new_h), flags=cv2.INTER_CUBIC,
                          borderValue=(255, 255, 255))
def rebuild_text_from_vision_result(res):
    """Reassemble OCR detections into one normalised reading-order string.

    ``res`` is a list of ``(quad, text, conf)``. Blank texts are dropped.
    Detections are grouped into rows: an item joins the first existing
    row whose mean y-centre is within ``max(6, 0.75 * median height)``,
    otherwise it starts a new row. Rows are ordered top to bottom, items
    inside a row left to right, and everything is joined with spaces and
    passed through ``normalize_text``. Returns ``""`` when nothing usable
    remains.
    """
    if not res:
        return ""

    # Each entry: (bbox, text, conf, x_center, y_center, height)
    entries = []
    for quad, txt, conf in res:
        if not txt or not txt.strip():
            continue
        box = quad_bbox(quad)
        x_center = (box[0] + box[2]) / 2.0
        y_center = (box[1] + box[3]) / 2.0
        height = max(1.0, box[3] - box[1])
        entries.append((box, txt, conf, x_center, y_center, height))
    if not entries:
        return ""

    median_height = float(np.median([entry[5] for entry in entries]))
    row_tol = max(6.0, median_height * 0.75)

    entries.sort(key=lambda entry: entry[4])
    rows = []
    for entry in entries:
        for row in rows:
            if abs(entry[4] - row["yc"]) <= row_tol:
                row["m"].append(entry)
                row["yc"] = float(np.mean([member[4] for member in row["m"]]))
                break
        else:
            rows.append({"yc": entry[4], "m": [entry]})
    rows.sort(key=lambda row: row["yc"])

    line_texts = []
    for row in rows:
        left_to_right = sorted(row["m"], key=lambda member: member[3])
        line_texts.append(
            normalize_text(" ".join(member[1] for member in left_to_right)))
    return normalize_text(" ".join(line for line in line_texts if line))
def reread_bubble_with_vision(image_bgr, bbox_xyxy, vision_detector, upscale=3.0, pad=24):
    """Re-run OCR on one bubble region under several preprocessings.

    The box is grown by ``pad`` pixels (clamped to the image), cropped and
    resized by ``upscale``. Each of seven preprocessing modes is tried at
    rotations of 0, +1.5 and -1.5 degrees; the text of each attempt is
    rebuilt with ``rebuild_text_from_vision_result`` and scored with
    ``ocr_candidate_score``. The detector's ``run_vision_ocr`` is used
    when it exists, otherwise its ``read``.

    Returns ``(text, score, "vision-reread")`` for the best-scoring
    non-empty text, or ``(None, 0.0, "none")`` when the crop is empty or
    no attempt scored above zero.
    """
    img_h, img_w = image_bgr.shape[:2]
    bx1, by1, bx2, by2 = bbox_xyxy
    left = max(0, int(bx1 - pad))
    top = max(0, int(by1 - pad))
    right = min(img_w, int(bx2 + pad))
    bottom = min(img_h, int(by2 + pad))

    crop = image_bgr[top:bottom, left:right]
    if crop.size == 0:
        return None, 0.0, "none"

    target_size = (int(crop.shape[1] * upscale), int(crop.shape[0] * upscale))
    enlarged = cv2.resize(crop, target_size, interpolation=cv2.INTER_CUBIC)

    best_text, best_score = "", 0.0
    for mode in ("raw", "clahe", "adaptive", "otsu", "invert", "bilateral", "morph_open"):
        processed = preprocess_variant(enlarged, mode)
        if len(processed.shape) == 2:
            # The OCR entry points are fed 3-channel images.
            processed = cv2.cvtColor(processed, cv2.COLOR_GRAY2BGR)
        for angle in (0.0, 1.5, -1.5):
            rotated = rotate_image_keep_bounds(processed, angle)
            if hasattr(vision_detector, 'run_vision_ocr'):
                detections = vision_detector.run_vision_ocr(rotated)
            else:
                detections = vision_detector.read(rotated)
            candidate = rebuild_text_from_vision_result(detections)
            score = ocr_candidate_score(candidate)
            if score > best_score:
                best_text, best_score = candidate, score

    if best_text:
        return best_text, best_score, "vision-reread"
    return None, 0.0, "none"
# ============================================================
# LINES + BUBBLES
# ============================================================
def build_lines_from_indices(indices, ocr):
    """Assemble the text lines formed by the given OCR items.

    Items are grouped into rows: an item joins the first existing row
    whose mean y-centre is within ``max(6, 0.75 * median height)``,
    otherwise it opens a new row. Rows are returned top to bottom; within
    a row the item texts are joined left to right with spaces and passed
    through ``normalize_text``. Returns ``[]`` for no indices.
    """
    if not indices:
        return []

    # Each entry: (ocr_index, bbox, x_center, y_center, height)
    entries = []
    for idx in indices:
        box = quad_bbox(ocr[idx][0])
        x_center = (box[0] + box[2]) / 2.0
        y_center = (box[1] + box[3]) / 2.0
        entries.append((idx, box, x_center, y_center, max(1.0, box[3] - box[1])))

    median_height = float(np.median([entry[4] for entry in entries])) if entries else 10.0
    row_tol = max(6.0, median_height * 0.75)

    entries.sort(key=lambda entry: entry[3])
    rows = []
    for entry in entries:
        for row in rows:
            if abs(entry[3] - row["yc"]) <= row_tol:
                row["m"].append(entry)
                row["yc"] = float(np.mean([member[3] for member in row["m"]]))
                break
        else:
            rows.append({"yc": entry[3], "m": [entry]})
    rows.sort(key=lambda row: row["yc"])

    lines = []
    for row in rows:
        if not row["m"]:
            continue
        left_to_right = sorted(row["m"], key=lambda member: member[2])
        lines.append(
            normalize_text(" ".join(ocr[member[0]][1] for member in left_to_right)))
    return lines
def auto_gap(image_path, base=18, ref_w=750):
    """Scale ``base`` by the image width relative to ``ref_w``.

    Returns ``base`` unchanged when the image cannot be loaded.
    """
    img = cv2.imread(image_path)
    if img is None:
        return base
    width_ratio = img.shape[1] / ref_w
    return base * width_ratio
def group_tokens_vertical(ocr, image_shape, gap_px=18, bbox_padding=1, strict_mode=False):
    """Group OCR quads into bubble candidates by stacking them vertically.

    Three passes are applied:

    1. Greedy chaining. Items are visited top to bottom (then left to
       right). Each unclaimed item seeds a chain and absorbs lower items
       that are horizontally close to the chain's running x-centre, are
       not separated from the seed by a wide horizontal gap, pass
       ``orientation_compatible`` against the seed, and start within the
       maximum vertical gap below the last item added to the chain.
    2. Chains are merged when ``should_merge_groups`` accepts them and
       every pair of their items passes ``orientation_compatible``.
    3. Each merged group is offered to ``detect_horizontal_gap_in_group``
       and replaced by its two halves when a split is reported.

    The maximum vertical gap is 2.0x the median box height in
    ``strict_mode`` and 2.5x otherwise. ``gap_px`` and ``bbox_padding``
    are accepted but not used by this function.

    Returns four dicts keyed by a 1-based bubble id: text lines, padded
    bounding box (clamped to the image), quads, and OCR indices.
    """
    n = len(ocr)
    if n == 0:
        return {}, {}, {}, {}

    boxes = [quad_bbox(item[0]) for item in ocr]
    centers = [quad_center(item[0]) for item in ocr]
    heights = [max(1.0, box[3] - box[1]) for box in boxes]
    med_h = float(np.median(heights)) if heights else 12.0

    max_vertical_gap = med_h * (2.0 if strict_mode else 2.5)
    max_horizontal_offset = med_h * 1.8
    max_side_gap = med_h * 1.5

    def center_y(idx):
        return centers[idx][1]

    scan_order = sorted(range(n), key=lambda k: (centers[k][1], centers[k][0]))

    # Pass 1: greedy vertical chaining
    chains = []
    claimed = set()
    for seed in scan_order:
        if seed in claimed:
            continue
        claimed.add(seed)
        chain = [seed]
        anchor_x, anchor_y = centers[seed]
        seed_box = boxes[seed]
        for cand in scan_order:
            if cand in claimed or cand == seed:
                continue
            cand_x, cand_y = centers[cand]
            if cand_y <= anchor_y:
                continue
            if abs(anchor_x - cand_x) > max_horizontal_offset:
                continue
            cand_box = boxes[cand]
            # Horizontal gap guard (measured against the seed's box).
            side_gap = max(0, max(seed_box[0], cand_box[0]) - min(seed_box[2], cand_box[2]))
            if side_gap > max_side_gap:
                continue
            # Orientation compatibility guard.
            if not orientation_compatible(seed, cand, ocr):
                continue
            drop = cand_box[1] - boxes[chain[-1]][3]
            if drop <= max_vertical_gap:
                chain.append(cand)
                claimed.add(cand)
                # Running x-centre drifts halfway towards each new member.
                anchor_x = (anchor_x + cand_x) / 2.0
        if chain:
            chains.append(chain)

    # Pass 2: merge compatible chains
    merged_groups = []
    absorbed = set()
    for base_pos, base in enumerate(chains):
        if base_pos in absorbed:
            continue
        combined = list(base)
        absorbed.add(base_pos)
        for other_pos, other in enumerate(chains):
            if base_pos == other_pos or other_pos in absorbed:
                continue
            if not should_merge_groups(combined, other, ocr, med_h, max_vertical_gap):
                continue
            if all(orientation_compatible(a, b, ocr) for a in combined for b in other):
                combined.extend(other)
                absorbed.add(other_pos)
        merged_groups.append(sorted(combined, key=center_y))

    # Pass 3: split groups with a large internal horizontal gap
    final_groups = []
    for group in merged_groups:
        halves = detect_horizontal_gap_in_group(group, ocr, med_h, gap_factor=2.5)
        if halves:
            left_part, right_part = halves
            final_groups.append(sorted(left_part, key=center_y))
            final_groups.append(sorted(right_part, key=center_y))
        else:
            final_groups.append(group)
    final_groups.sort(key=lambda g: (min(centers[k][1] for k in g),
                                     min(centers[k][0] for k in g)))

    # Build the per-bubble outputs
    bubbles, bubble_boxes, bubble_quads, bubble_indices = {}, {}, {}, {}
    img_h, img_w = image_shape[:2]
    for bid, members in enumerate(final_groups, start=1):
        lines = build_lines_from_indices(members, ocr)
        quads = [ocr[k][0] for k in members]
        union = boxes_union_xyxy([quad_bbox(q) for q in quads])
        if union is None:
            continue
        ux1, uy1, ux2, uy2 = union
        margin = max(1, int(round(med_h * 0.16)))
        bubbles[bid] = lines
        bubble_boxes[bid] = (max(0, ux1 - margin), max(0, uy1 - margin),
                             min(img_w - 1, ux2 + margin), min(img_h - 1, uy2 + margin))
        bubble_quads[bid] = quads
        bubble_indices[bid] = members
    return bubbles, bubble_boxes, bubble_quads, bubble_indices
# ============================================================
# DEBUG / EXPORT
# ============================================================
def save_debug_clusters(image_path, ocr, bubble_boxes, bubble_indices,
                        clean_lines=None, out_path="debug_clusters.png"):
    """Write a debug image showing OCR quads and bubble boxes.

    Every OCR quad is painted white with a grey outline. Every bubble box
    is drawn with a ``BOX#<id>`` label, orange and thicker when it holds
    exactly one OCR item and green otherwise. When ``clean_lines`` has
    text for a bubble, it is word-wrapped and drawn below the box.
    Nothing is written when the image cannot be loaded.
    """
    canvas = cv2.imread(image_path)
    if canvas is None:
        return

    font = cv2.FONT_HERSHEY_SIMPLEX

    for quad, _text, _conf in ocr:
        outline = np.array(quad, dtype=np.int32)
        cv2.fillPoly(canvas, [outline], (255, 255, 255))
        cv2.polylines(canvas, [outline], True, (180, 180, 180), 1)

    for bid, box in bubble_boxes.items():
        x1, y1, x2, y2 = box
        single_item = len(bubble_indices.get(bid, [])) == 1
        label = f"BOX#{bid}"
        if single_item:
            colour, thickness = (255, 165, 0), 3
            label += " (ISOLATED)"
        else:
            colour, thickness = (0, 220, 0), 2

        cv2.rectangle(canvas, (x1, y1), (x2, y2), colour, thickness)
        cv2.putText(canvas, label, (x1 + 2, max(15, y1 + 16)), font, 0.5, colour, 2)

        if not (clean_lines and bid in clean_lines):
            continue

        # Greedy wrap: a word stays on the current line while the line
        # length (including trailing space) plus the word is under 25.
        wrapped, current = [], ""
        for word in clean_lines[bid].split():
            if len(current) + len(word) < 25:
                current += word + " "
            else:
                wrapped.append(current.strip())
                current = word + " "
        if current:
            wrapped.append(current.strip())

        baseline = y2 + 18
        for row in wrapped:
            # Thick black pass first, then a thin coloured pass on top.
            cv2.putText(canvas, row, (x1, baseline), font, 0.5, (0, 0, 0), 3)
            cv2.putText(canvas, row, (x1, baseline), font, 0.5, (255, 0, 0), 1)
            baseline += 18

    cv2.imwrite(out_path, canvas)
def estimate_reading_order(bbox_dict, mode="ltr", row_tol=90):
    """Assign a 1-based reading order to bounding boxes.

    Boxes are grouped into rows by vertical centre: a box joins the first
    existing row whose mean centre is within ``row_tol`` pixels, otherwise
    it starts a new row. Rows are read top to bottom; inside a row boxes
    are read by horizontal centre, left to right, or right to left when
    ``mode == "rtl"``.

    Args:
        bbox_dict: mapping of box id to ``(x1, y1, x2, y2)``.
        mode: ``"rtl"`` for right-to-left rows; any other value reads
            left to right.
        row_tol: maximum vertical distance, in pixels, between a box
            centre and a row's mean centre for the box to join that row.
            The default of 90 is the previously hard-coded value.

    Returns:
        Dict mapping each box id to its position in reading order
        (starting at 1). Empty input yields an empty dict.
    """
    # Each item: (box_id, x_center, y_center)
    items = [(bid, (bb[0] + bb[2]) / 2.0, (bb[1] + bb[3]) / 2.0)
             for bid, bb in bbox_dict.items()]
    items.sort(key=lambda item: item[2])

    rows = []
    for item in items:
        for row in rows:
            if abs(item[2] - row["cy"]) <= row_tol:
                row["items"].append(item)
                # Keep the row centre as the mean of its members.
                row["cy"] = float(np.mean([member[2] for member in row["items"]]))
                break
        else:
            rows.append({"cy": item[2], "items": [item]})
    rows.sort(key=lambda row: row["cy"])

    right_to_left = (mode == "rtl")
    order = []
    for row in rows:
        row["items"].sort(key=lambda member: member[1], reverse=right_to_left)
        order.extend(member[0] for member in row["items"])
    return {bid: rank for rank, bid in enumerate(order, start=1)}
# ============================================================
# MAIN PIPELINE
# ============================================================
def translate_manga_text(
image_path="001-page.png",
source_lang="en",
target_lang="ca",
confidence_threshold=0.03,
min_text_length=1,
gap_px="auto",
quality_threshold=0.62,
export_to_file="output.txt",
export_bubbles_to="bubbles.json",
reading_mode="ltr",
debug=True,
use_enhanced_ocr=True,
strict_grouping=True,
max_box_width_ratio=0.6,
max_box_height_ratio=0.5,
auto_fix_bubbles=True
):
image = cv2.imread(image_path)
if image is None:
print(f"❌ Cannot load image: {image_path}"); return
resolved_gap = auto_gap(image_path) if gap_px == "auto" else float(gap_px)
print("Loading OCR engines...")
if use_enhanced_ocr:
detector = ImprovedMacVisionDetector(source_lang=source_lang)
print("🚀 Using Enhanced Multi-Pass OCR")
else:
detector = MacVisionDetector(source_lang=source_lang)
print("Running detection OCR (Apple Vision)...")
raw = detector.read(image_path)
print(f"Raw detections: {len(raw)}")
if use_enhanced_ocr:
existing_quads = [r[0] for r in raw]
missed_regions = detect_small_text_regions(image, existing_quads)
if missed_regions:
print(f"🔍 Found {len(missed_regions)} potentially missed text regions")
for region in missed_regions:
x1, y1, x2, y2 = region
pad = 10
x1, y1 = max(0, x1-pad), max(0, y1-pad)
x2, y2 = min(image.shape[1], x2+pad), min(image.shape[0], y2+pad)
crop = image[y1:y2, x1:x2]
if crop.size > 0:
upscaled = cv2.resize(crop, None, fx=4.0, fy=4.0,
interpolation=cv2.INTER_CUBIC)
for quad, text, conf in detector.run_vision_ocr(upscaled):
raw.append(([[int(p[0]/4.0+x1), int(p[1]/4.0+y1)] for p in quad],
text, conf))
print(f"📝 Total detections after missed region scan: {len(raw)}")
filtered, skipped = [], 0
ih, iw = image.shape[:2]
for bbox, text, conf in raw:
t = normalize_text(text)
qb = quad_bbox(bbox)
if conf < confidence_threshold: skipped += 1; continue
if len(t) < min_text_length: skipped += 1; continue
if not is_valid_language(t, source_lang): skipped += 1; continue
if not is_meaningful_text(t, source_lang):skipped += 1; continue
if qb[1] < int(ih * TOP_BAND_RATIO) and conf < 0.70 and len(t) >= 5:
skipped += 1; continue
filtered.append((bbox, t, conf))
print(f"Kept: {len(filtered)} | Skipped: {skipped}")
if not filtered:
print("⚠️ No text after filtering."); return
# ── Pre-grouping quad splits ──────────────────────────────────────────
filtered, oversized_splits = validate_and_split_oversized_quads(image, filtered)
if oversized_splits > 0:
print(f"📐 Split {oversized_splits} oversized quad(s) before grouping")
filtered, splits_made = split_wide_ocr_items(image, filtered)
if splits_made > 0:
print(f"✂️ Split {splits_made} wide OCR lines across column gaps.")
filtered, bridge_splits = split_abnormal_bridge_quads(image, filtered)
if bridge_splits > 0:
print(f"🧩 Split {bridge_splits} abnormal bridge OCR quad(s).")
# ── Column-gap split: catches BOX#6 type wide quads spanning two columns ──
hs_pre = [max(1, quad_bbox(q)[3]-quad_bbox(q)[1]) for q, _, _ in filtered]
med_h_pre = float(np.median(hs_pre)) if hs_pre else 14.0
filtered, col_splits = apply_column_gap_splits(image, filtered, med_h_pre)
if col_splits > 0:
print(f"📐 Column-gap split: {col_splits} quad(s) split before grouping")
filtered = normalize_ocr_quads(filtered)
print("📊 Grouping quads vertically...")
bubbles, bubble_boxes, bubble_quads, bubble_indices = group_tokens_vertical(
filtered, image.shape, gap_px=resolved_gap, bbox_padding=1, strict_mode=strict_grouping
)
print(f" Created {len(bubbles)} initial box(es)")
if auto_fix_bubbles:
bubbles, bubble_boxes, bubble_quads, bubble_indices = auto_fix_bubble_detection(
bubble_boxes, bubble_indices, bubble_quads, bubbles, filtered, image
)
bubbles, bubble_boxes, bubble_quads, bubble_indices = enforce_max_box_size(
bubble_boxes, bubble_indices, bubble_quads, bubbles, filtered,
max_width_ratio=max_box_width_ratio,
max_height_ratio=max_box_height_ratio,
image_shape=image.shape
)
bubbles, bubble_boxes, bubble_quads, bubble_indices = merge_close_bubbles_by_line_height(
bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered
)
new_bubbles, new_bubble_boxes, new_bubble_quads, new_bubble_indices = {}, {}, {}, {}
next_bid = max(bubbles.keys()) + 1 if bubbles else 1
splits_performed = []
for bid in list(bubbles.keys()):
box = bubble_boxes[bid]
bubble_split = None
if is_vertical_text_like(bubble_indices[bid], filtered):
vgap = split_cluster_by_big_vertical_gap(bubble_indices[bid], filtered,
factor=1.7, min_gap=18)
if vgap:
bubble_split = vgap
splits_performed.append(f"BOX#{bid} (vertical-stack y-gap)")
if bubble_split is None:
sr = split_panel_box(image, box, bubble_quads=bubble_quads[bid])
if sr:
_, _, split_x = sr
li = [idx for idx in bubble_indices[bid]
if quad_center(filtered[idx][0])[0] < split_x]
ri = [idx for idx in bubble_indices[bid]
if quad_center(filtered[idx][0])[0] >= split_x]
if li and ri:
bubble_split = (li, ri)
splits_performed.append(f"BOX#{bid} (panel border)")
elif len(bubble_quads[bid]) >= 4:
cs = split_bubble_if_multiple_columns(bubble_indices[bid], filtered,
bid=bid, use_aggressive_thresholds=True)
if cs:
bubble_split = cs
splits_performed.append(f"BOX#{bid} (aggressive column)")
if bubble_split is None:
cs = split_bubble_if_multiple_columns(bubble_indices[bid], filtered, bid=bid)
if cs:
bubble_split = cs
splits_performed.append(f"BOX#{bid} (vertical column)")
if bubble_split is None:
ns = split_nested_or_side_by_side(bubble_indices[bid], filtered)
if ns:
bubble_split = ns
splits_performed.append(f"BOX#{bid} (nested/side-by-side)")
if bubble_split is None:
rs = split_bubble_if_multiple_rows(bubble_indices[bid], filtered, bid=bid)
if rs:
bubble_split = rs
splits_performed.append(f"BOX#{bid} (horizontal row)")
if bubble_split is None:
gy = split_cluster_by_big_vertical_gap(bubble_indices[bid], filtered,
factor=1.9, min_gap=22)
if gy:
bubble_split = gy
splits_performed.append(f"BOX#{bid} (large vertical-gap)")
if bubble_split:
p1, p2 = bubble_split
for part_idxs, part_bid in [(p1, bid), (p2, next_bid)]:
ub = boxes_union_xyxy([quad_bbox(filtered[i][0]) for i in part_idxs])
new_bubbles[part_bid] = build_lines_from_indices(part_idxs, filtered)
new_bubble_boxes[part_bid] = (max(0,ub[0]-2), max(0,ub[1]-2),
min(iw-1,ub[2]+2), min(ih-1,ub[3]+2))
new_bubble_quads[part_bid] = [filtered[i][0] for i in part_idxs]
new_bubble_indices[part_bid] = part_idxs
next_bid += 1
else:
new_bubbles[bid] = bubbles[bid]
new_bubble_boxes[bid] = bubble_boxes[bid]
new_bubble_quads[bid] = bubble_quads[bid]
new_bubble_indices[bid] = bubble_indices[bid]
if splits_performed:
print(f"\n🔀 Splits detected: {len(splits_performed)}")
bubbles, bubble_boxes, bubble_quads, bubble_indices = remove_nested_boxes(
new_bubble_boxes, new_bubble_indices, new_bubble_quads, new_bubbles,
overlap_threshold=0.50
)
print(f"✅ Final box count: {len(bubbles)}")
# ── OCR quality pass ──────────────────────────────────────────────────
translator = GoogleTranslator(source=source_lang, target=target_lang)
clean_lines: Dict[int, str] = {}
sources_used: Dict[int, str] = {}
translations: Dict[int, str] = {}
for bid, lines in bubbles.items():
base_txt = normalize_text(" ".join(lines))
base_sc = ocr_candidate_score(base_txt)
txt, src_used = base_txt, "vision-base"
if base_sc < quality_threshold:
rr_txt, rr_sc, rr_src = reread_bubble_with_vision(
image, bubble_boxes[bid], detector, upscale=3.0, pad=24)
if rr_txt and rr_sc > base_sc + 0.04 and is_valid_language(rr_txt, source_lang):
txt, src_used = rr_txt, rr_src
clean_lines[bid] = normalize_text(txt)
sources_used[bid] = src_used
reading_map = estimate_reading_order(bubble_boxes, mode=reading_mode)
# ── Single-pass translation cache ────────────────────────────────────
for bid in sorted(clean_lines.keys(), key=lambda x: reading_map.get(x, x)):
src_txt = clean_lines[bid].strip()
if not src_txt: continue
if not is_valid_language(src_txt, source_lang): continue
if not is_meaningful_text(src_txt, source_lang): continue
try:
tgt = translator.translate(src_txt) or ""
tgt = postprocess_translation_general(tgt).upper()
except Exception as e:
tgt = f"[Error: {e}]"
translations[bid] = tgt
if debug:
save_debug_clusters(image_path, filtered, bubble_boxes, bubble_indices,
clean_lines, "debug_clusters.png")
# ── Text output ───────────────────────────────────────────────────────
divider = "" * 120
out_lines = ["BUBBLE|ORDER|OCR_SOURCE|ORIGINAL|TRANSLATED|FLAGS", divider]
print(divider + f"\n{'BUBBLE':<8} {'ORDER':<6} {'SOURCE':<12} "
f"{'ORIGINAL':<40} {'TRANSLATED':<40} FLAGS\n" + divider)
translated_count = 0
for bid in sorted(clean_lines.keys(), key=lambda x: reading_map.get(x, x)):
src_txt = clean_lines[bid].strip()
if not src_txt: continue
if not is_valid_language(src_txt, source_lang): continue
if not is_meaningful_text(src_txt, source_lang): continue
flags = []
tgt = translations.get(bid, "")
if not tgt: flags.append("NO_TRANSLATION")
src_u = src_txt.upper()
src_engine = sources_used.get(bid, "unknown")
out_lines.append(f"#{bid}|{reading_map.get(bid,bid)}|{src_engine}|{src_u}|{tgt}|"
f"{','.join(flags) if flags else '-'}")
print(f"#{bid:<7} {reading_map.get(bid,bid):<6} {src_engine:<12} "
f"{src_u[:40]:<40} {tgt[:40]:<40} {','.join(flags) if flags else '-'}")
translated_count += 1
out_lines.append(divider + f"\n✅ Done! {translated_count} bubble(s) translated.")
with open(export_to_file, "w", encoding="utf-8") as f:
f.write("\n".join(out_lines))
# ── bubbles.json ──────────────────────────────────────────────────────
bubbles_payload = {}
for bid in sorted(clean_lines.keys(), key=lambda x: reading_map.get(x, x)):
src_txt = clean_lines[bid].strip()
if not src_txt: continue
if not is_valid_language(src_txt, source_lang): continue
if not is_meaningful_text(src_txt, source_lang): continue
box = bubble_boxes.get(bid)
tgt = translations.get(bid, "")
bubbles_payload[str(bid)] = {
"order": reading_map.get(bid, bid),
"ocr_source": sources_used.get(bid, "unknown"),
"original": src_txt.upper(),
"translated": tgt,
"box": {
"x": box[0] if box else 0,
"y": box[1] if box else 0,
"w": (box[2]-box[0]) if box else 0,
"h": (box[3]-box[1]) if box else 0,
},
"lines": [line.upper() for line in bubbles.get(bid, [])],
}
with open(export_bubbles_to, "w", encoding="utf-8") as f:
json.dump(bubbles_payload, f, ensure_ascii=False, indent=2)
print(divider + f"\nSaved: {export_to_file}\nSaved: {export_bubbles_to}")
# ============================================================
# ENTRY POINT
# ============================================================
if __name__ == "__main__":
    # Settings used when this file is executed directly as a script; each
    # entry is forwarded unchanged as a keyword argument to
    # translate_manga_text().
    run_options = dict(
        image_path="17.jpg",
        source_lang="english",
        target_lang="ca",
        confidence_threshold=0.03,
        min_text_length=1,
        gap_px="auto",
        quality_threshold=0.62,
        export_to_file="output.txt",
        export_bubbles_to="bubbles.json",
        reading_mode="rtl",
        debug=True,
        use_enhanced_ocr=True,
        strict_grouping=True,
        max_box_width_ratio=0.6,
        max_box_height_ratio=0.5,
        auto_fix_bubbles=True,
    )
    translate_manga_text(**run_options)