2696 lines
106 KiB
Python
2696 lines
106 KiB
Python
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
|
||
import os
|
||
import re
|
||
import json
|
||
import cv2
|
||
import numpy as np
|
||
import warnings
|
||
from typing import List, Tuple, Dict, Any, Optional
|
||
|
||
from deep_translator import GoogleTranslator
|
||
|
||
# macOS Native Vision imports
|
||
import Vision
|
||
import Quartz
|
||
from Foundation import NSData
|
||
|
||
warnings.filterwarnings("ignore", category=UserWarning)
|
||
|
||
# ============================================================
|
||
# CONFIG
|
||
# ============================================================
|
||
# NOTE(review): presumably the fraction of the page height treated as the
# "top band" of the image — its usage is not visible here, confirm against callers.
TOP_BAND_RATIO = 0.08
|
||
|
||
# ============================================================
|
||
# HELPERS
|
||
# ============================================================
|
||
def normalize_text(text: str) -> str:
    """Return an upper-cased, whitespace- and punctuation-normalized copy of *text*.

    ``None``/empty input yields ``""``. Curly quotes and the ellipsis character
    are mapped to ASCII, whitespace runs are collapsed, spacing around
    punctuation/brackets is tightened, and runs of four or more dots become
    ``...``.
    """
    literal_swaps = (
        ("\u201c", "\""),
        ("\u201d", "\""),
        ("\u2018", "'"),
        ("\u2019", "'"),
        ("\u2026", "..."),
    )
    # Applied strictly in this order; later passes rely on earlier ones.
    regex_passes = (
        (r"\s+", " "),
        (r"\s+([,.;:!?])", r"\1"),
        (r"([¡¿])\s+", r"\1"),
        (r"\(\s+", "("),
        (r"\s+\)", ")"),
        (r"\.{4,}", "..."),
    )

    cleaned = (text or "").strip().upper()
    for needle, replacement in literal_swaps:
        cleaned = cleaned.replace(needle, replacement)
    for pattern, replacement in regex_passes:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
|
||
|
||
def postprocess_translation_general(text: str) -> str:
    """Normalize translated text and tame repeated punctuation.

    Runs ``normalize_text`` first, then collapses multi-space runs, reduces
    three or more consecutive ``!``/``?`` characters to two, and shortens
    four or more dots to ``...``.
    """
    cleaned = re.sub(r"\s{2,}", " ", normalize_text(text)).strip()
    for pattern, replacement in (
        (r"([!?]){3,}", r"\1\1"),
        (r"\.{4,}", "..."),
    ):
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned
|
||
|
||
def fix_common_ocr_errors(text: str) -> str:
    """Repair a few frequent OCR character confusions.

    * A capital ``O`` directly after a digit becomes ``0`` when it is
      followed by a non-ASCII-letter character *or by the end of the string*
      (the previous two-pass regex required a following character, so a
      trailing ``"1O"`` was left uncorrected).
    * ``|`` becomes ``I`` and a backtick becomes an apostrophe.

    Args:
        text: Raw OCR string.

    Returns:
        The corrected string.
    """
    # Lookarounds do not consume neighbours, so chains like "1O2O3" are fixed
    # in a single pass.
    result = re.sub(r'(?<=\d)O(?=[^a-zA-Z]|$)', '0', text)
    result = result.replace('|', 'I')
    result = result.replace('`', "'")
    return result
|
||
|
||
def is_valid_language(text: str, source_lang: str) -> bool:
    """Heuristically check that *text* is written in the script of *source_lang*.

    Non-word characters are ignored. For the Latin-script languages any
    Arabic/CJK/Hangul character rejects the text; otherwise a minimum share
    of Latin letters is required (looser for longer strings). For Japanese,
    Korean and Chinese at least 40% of the characters must belong to the
    respective script (one is enough for strings of up to three characters).
    Unknown languages are always accepted.
    """
    if not text:
        return False
    word_chars = re.sub(r'[^\w]', '', text)
    if not word_chars:
        return False

    total = len(word_chars)
    code = source_lang.lower()

    latin_codes = ('en', 'english', 'es', 'spanish', 'fr', 'french',
                   'it', 'italian', 'ca', 'catalan', 'de', 'german')
    if code in latin_codes:
        non_latin_script = (
            r'[\u0600-\u06FF\u0750-\u077F\u3040-\u30FF'
            r'\u3400-\u4DBF\u4E00-\u9FFF\uAC00-\uD7AF\u1100-\u11FF]'
        )
        if re.search(non_latin_script, word_chars):
            return False
        latin_hits = len(re.findall(r'[a-zA-ZÀ-ÿ]', word_chars))
        if total <= 3:
            return latin_hits >= 1
        required_share = 0.55 if total <= 6 else 0.45
        return (latin_hits / total) >= required_share

    japanese = r'[\u3040-\u30FF\u3400-\u4DBF\u4E00-\u9FFF]'
    korean = r'[\uAC00-\uD7AF\u1100-\u11FF]'
    chinese = r'[\u4E00-\u9FFF\u3400-\u4DBF]'
    script_for = {
        'ja': japanese, 'japanese': japanese,
        'ko': korean, 'korean': korean,
        'zh': chinese, 'chinese': chinese,
    }
    script = script_for.get(code)
    if script is None:
        # No rule for this language: accept.
        return True

    script_hits = len(re.findall(script, word_chars))
    if total <= 3:
        return script_hits >= 1
    return (script_hits / total) >= 0.4
|
||
|
||
|
||
_NOISE_TOKENS = {
|
||
'P', 'F', 'N', 'M', 'X', 'Z', 'Q',
|
||
'FN', 'PF', 'NM', 'XZ', 'FSHOO', 'GRRP',
|
||
}
|
||
|
||
_MANGA_INTERJECTIONS = {
|
||
'HUH', 'HUH?', 'HUH??', 'HUH?!',
|
||
'OH', 'OH!', 'OOH', 'OOH!',
|
||
'AH', 'AH!', 'UH', 'UH...',
|
||
'HEY', 'HEY!',
|
||
'EH', 'EH?',
|
||
'WOW', 'WOW!',
|
||
'YES', 'NO', 'NO!',
|
||
'RUN', 'GO', 'GO!',
|
||
'STOP', 'WAIT',
|
||
'WHAT', 'WHAT?', 'WHAT?!',
|
||
'WHY', 'WHY?',
|
||
'HOW', 'HOW?',
|
||
'OK', 'OK!', 'OKAY',
|
||
'EEEEP', 'EEEP',
|
||
'OMIGOSH',
|
||
'BECKY', 'BECKY!',
|
||
'HMM', 'HMM...',
|
||
'TSK', 'TCH',
|
||
'GRRR','I','A',
|
||
'FWUP', 'FWAP',
|
||
'SHIVER',
|
||
'RRRING',
|
||
'MORNING', 'MORNING.',
|
||
}
|
||
|
||
# ============================================================
|
||
# PROTECTED TOKENS / SHORT DIALOGUE SAFETY NET
|
||
# ============================================================
|
||
# Character names; also part of the protected token set below.
KNOWN_NAMES = {
    "BECKY", "DAMIAN", "CECILE", "WALD"
}

# Tokens that is_protected_token() treats as always-keep (compared against
# the normalized, upper-cased text).
PROTECTED_SHORT_TOKENS = KNOWN_NAMES | {
    "HUH", "HUH?", "HUH??", "HUH?!",
    "OH", "OH!", "OOH", "OOH!",
    "AH", "AH!", "UH", "UH...",
    "HEY", "HEY!", "EH", "EH?",
    "WOW", "WOW!",
    "MORNING", "MORNING.",
    "BECKY!",
    "OMIGOSH", "EEEP", "EEEEP",
}
|
||
|
||
def is_protected_token(text: str) -> bool:
    """Return True when *text* is one of ``PROTECTED_SHORT_TOKENS``.

    The text is normalized first; if the exact form is not protected, a
    second lookup is made with everything except upper-case letters removed
    so that stray punctuation does not defeat the match.
    """
    canonical = normalize_text(text or "")
    if not canonical:
        return False
    letters_only = re.sub(r'[^A-ZÀ-Ý]', '', canonical)
    return (canonical in PROTECTED_SHORT_TOKENS
            or letters_only in PROTECTED_SHORT_TOKENS)
|
||
|
||
def maybe_conf_floor_for_protected(text: str, conf: float, floor: float = 0.40) -> float:
    """Raise *conf* to at least *floor* for protected tokens; otherwise return it unchanged."""
    return max(conf, floor) if is_protected_token(text) else conf
|
||
|
||
def is_meaningful_text(text: str, source_lang: str, min_alpha_chars: int = 2) -> bool:
    """Decide whether an OCR string is worth keeping.

    Protected tokens and known interjections are always kept. Otherwise the
    text is rejected when it has too few letters, is a known noise token,
    consists of a single repeated character, or (Latin-script languages
    only) is mostly non-letters or is longer than five characters without
    any vowel.
    """
    if not text:
        return False

    raw = text.strip()
    canonical = normalize_text(raw)

    # 1) Hard keep for protected tokens
    if is_protected_token(canonical):
        return True

    letters_only = re.sub(r'[^A-Za-zÀ-ÿ]', '', canonical)
    if canonical in _MANGA_INTERJECTIONS or letters_only in _MANGA_INTERJECTIONS:
        return True

    letter_total = sum(1 for ch in raw if ch.isalpha())
    if letter_total < min_alpha_chars:
        # allow short punctuated utterances like "Huh?"
        return bool(re.fullmatch(r"[A-Za-zÀ-ÿ]{2,6}[!?\.]{0,3}", raw.strip()))

    if canonical in _NOISE_TOKENS:
        return False

    latin_script = source_lang.lower() in (
        'en', 'english', 'es', 'spanish', 'fr', 'french',
        'it', 'italian', 'ca', 'catalan', 'de', 'german',
    )

    if latin_script and len(raw) > 0:
        symbol_share = (len(raw) - letter_total) / len(raw)
        if symbol_share > 0.72:
            return False

    # Same character repeated three or more times, e.g. "AAAA".
    if len(raw) >= 3 and len(set(canonical)) == 1:
        return False

    if latin_script and len(raw) > 5:
        if not re.search(r'[AEIOUaeiouÀ-ÿ]', raw):
            return False

    return True
|
||
|
||
def quad_bbox(quad):
    """Return the integer axis-aligned bounds ``(x1, y1, x2, y2)`` of a point list."""
    x_values, y_values = zip(*((pt[0], pt[1]) for pt in quad))
    left, right = min(x_values), max(x_values)
    top, bottom = min(y_values), max(y_values)
    return (int(left), int(top), int(right), int(bottom))
|
||
|
||
def quad_center(quad):
    """Return the ``(cx, cy)`` midpoint of the quad's bounding box as floats."""
    left, top, right, bottom = quad_bbox(quad)
    center_x = (left + right) / 2.0
    center_y = (top + bottom) / 2.0
    return (center_x, center_y)
|
||
|
||
def boxes_union_xyxy(boxes):
    """Return the smallest ``(x1, y1, x2, y2)`` box enclosing all non-None boxes.

    Returns ``None`` when there is no usable box.
    """
    present = list(filter(lambda box: box is not None, boxes))
    if not present:
        return None
    left = min(box[0] for box in present)
    top = min(box[1] for box in present)
    right = max(box[2] for box in present)
    bottom = max(box[3] for box in present)
    return (int(left), int(top), int(right), int(bottom))
|
||
|
||
def bbox_area_xyxy(b):
    """Return the integer area of an xyxy box; 0 for ``None`` or inverted boxes."""
    if b is None:
        return 0
    width = max(0, b[2] - b[0])
    height = max(0, b[3] - b[1])
    return int(width * height)
|
||
|
||
def xyxy_to_xywh(b):
    """Convert an xyxy box to an ``{"x", "y", "w", "h"}`` dict of ints (``None`` passes through)."""
    if b is None:
        return None
    left, top, right, bottom = b
    width = max(0, right - left)
    height = max(0, bottom - top)
    return {
        "x": int(left),
        "y": int(top),
        "w": int(width),
        "h": int(height),
    }
|
||
|
||
def overlap_or_near(a, b, gap=0):
    """Return True when two xyxy boxes overlap or lie within *gap* pixels on both axes."""
    ax1, ay1, ax2, ay2 = a
    bx1, by1, bx2, by2 = b
    # Negative separation means the boxes overlap on that axis; clamp to 0.
    horizontal_sep = max(ax1, bx1) - min(ax2, bx2)
    vertical_sep = max(ay1, by1) - min(ay2, by2)
    if max(0, horizontal_sep) > gap:
        return False
    return max(0, vertical_sep) <= gap
|
||
|
||
def boxes_iou(a, b):
    """Return intersection-over-union of two xyxy boxes (0.0 when they do not intersect)."""
    ax1, ay1, ax2, ay2 = a
    bx1, by1, bx2, by2 = b

    overlap_w = max(0, min(ax2, bx2) - max(ax1, bx1))
    overlap_h = max(0, min(ay2, by2) - max(ay1, by1))
    inter = overlap_w * overlap_h
    if inter == 0:
        return 0.0

    area_a = max(0, ax2 - ax1) * max(0, ay2 - ay1)
    area_b = max(0, bx2 - bx1) * max(0, by2 - by1)
    union = area_a + area_b - inter
    return inter / max(1, union)
|
||
|
||
def boxes_overlap_ratio(a, b):
    """Return the intersection area divided by the area of the SMALLER box."""
    ax1, ay1, ax2, ay2 = a
    bx1, by1, bx2, by2 = b

    overlap_w = max(0, min(ax2, bx2) - max(ax1, bx1))
    overlap_h = max(0, min(ay2, by2) - max(ay1, by1))
    inter = overlap_w * overlap_h
    if inter == 0:
        return 0.0

    smaller_area = min(
        max(0, ax2 - ax1) * max(0, ay2 - ay1),
        max(0, bx2 - bx1) * max(0, by2 - by1),
    )
    return inter / max(1, smaller_area)
|
||
|
||
def ocr_candidate_score(text: str) -> float:
    """Score an OCR string in ``[0, 1]`` by how text-like its characters are.

    Letters, spaces and common punctuation raise the score; unexpected
    symbols lower it. Small fixed penalties apply for an isolated capital
    letter and for runs of two or more digits.
    """
    stripped = text.strip() if text else ""
    length = len(stripped)
    if length == 0:
        return 0.0

    alpha = sum(ch.isalpha() for ch in stripped) / length
    spaces = sum(ch.isspace() for ch in stripped) / length
    punct_ok = sum(ch in ".,!?'-:;()[]\"¡¿" for ch in stripped) / length
    bad = len(re.findall(r"[^\w\s\.\,\!\?\-\'\:\;\(\)\[\]\"¡¿]", stripped)) / length

    penalty = 0.0
    for pattern, cost in ((r"\b[A-Z]\b", 0.05), (r"[0-9]{2,}", 0.08)):
        if re.search(pattern, stripped):
            penalty += cost

    score = (0.62 * alpha) + (0.10 * spaces) + (0.20 * punct_ok) - (0.45 * bad) - penalty
    return max(0.0, min(1.0, score))
|
||
|
||
def quad_is_horizontal(quad, ratio_threshold=1.5) -> bool:
    """Return True when the quad's bounding box is at least *ratio_threshold* times wider than tall."""
    left, top, right, bottom = quad_bbox(quad)
    width = max(1, right - left)
    height = max(1, bottom - top)
    return (width / height) >= ratio_threshold
|
||
|
||
def quad_is_vertical(quad, ratio_threshold=1.5) -> bool:
    """Return True when the quad's bounding box is at least *ratio_threshold* times taller than wide."""
    left, top, right, bottom = quad_bbox(quad)
    width = max(1, right - left)
    height = max(1, bottom - top)
    return (height / width) >= ratio_threshold
|
||
|
||
|
||
# ============================================================
|
||
# ENHANCED IMAGE PREPROCESSING
|
||
# ============================================================
|
||
def enhance_image_for_ocr(image_bgr, upscale_factor=2.5):
    """Return an upscaled, binarized BGR copy of *image_bgr* prepared for OCR.

    Pipeline: bicubic upscale, grayscale, non-local-means denoise, CLAHE
    contrast equalization, 3x3 sharpening, Gaussian adaptive threshold,
    2x2 morphological close, then conversion back to three channels.
    """
    src_h, src_w = image_bgr.shape[:2]
    target_size = (int(src_w * upscale_factor), int(src_h * upscale_factor))
    enlarged = cv2.resize(image_bgr, target_size, interpolation=cv2.INTER_CUBIC)

    gray = cv2.cvtColor(enlarged, cv2.COLOR_BGR2GRAY)
    smooth = cv2.fastNlMeansDenoising(
        gray, None, h=10, templateWindowSize=7, searchWindowSize=21)
    equalized = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8)).apply(smooth)

    sharpen_kernel = np.array([[-1, -1, -1], [-1, 9, -1], [-1, -1, -1]])
    crisp = cv2.filter2D(equalized, -1, sharpen_kernel)

    black_white = cv2.adaptiveThreshold(
        crisp, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
    closed = cv2.morphologyEx(black_white, cv2.MORPH_CLOSE, np.ones((2, 2), np.uint8))
    return cv2.cvtColor(closed, cv2.COLOR_GRAY2BGR)
|
||
|
||
def detect_small_text_regions(image_bgr, existing_quads):
    """Find small dark blobs outside the already-detected quads.

    Returns xyxy boxes of external contours whose bounding-box area lies in
    (50, 5000) and whose height/width ratio lies in (0.1, 10).
    """
    gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)

    # Paint every known quad white, then invert to keep only uncovered pixels.
    covered = np.zeros(gray.shape, dtype=np.uint8)
    for quad in existing_quads:
        cv2.fillPoly(covered, [np.array(quad, dtype=np.int32)], 255)
    uncovered = cv2.bitwise_not(covered)

    _, ink = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    ink_outside = cv2.bitwise_and(ink, ink, mask=uncovered)
    contours, _ = cv2.findContours(ink_outside, cv2.RETR_EXTERNAL,
                                   cv2.CHAIN_APPROX_SIMPLE)

    candidates = (cv2.boundingRect(contour) for contour in contours)
    return [
        (x, y, x + w, y + h)
        for x, y, w, h in candidates
        if 50 < w * h < 5000 and 0.1 < h / max(w, 1) < 10
    ]
|
||
|
||
|
||
# ============================================================
|
||
# SPEECH BUBBLE DETECTION
|
||
# ============================================================
|
||
def detect_speech_bubbles(image_bgr: np.ndarray) -> List[np.ndarray]:
    """Return external contours with area above 500 from an inverted adaptive threshold of the image."""
    gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
    inverted = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 2)
    contours, _ = cv2.findContours(inverted, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    large_enough = []
    for contour in contours:
        if cv2.contourArea(contour) > 500:
            large_enough.append(contour)
    return large_enough
|
||
|
||
def is_quad_in_bubble(quad_bbox_xyxy, bubble_contour, tolerance=5):
    """Return True when the box centre lies inside *bubble_contour* or within *tolerance* px of it.

    Args:
        quad_bbox_xyxy: ``(x1, y1, x2, y2)`` box of the text quad.
        bubble_contour: Contour as returned by ``cv2.findContours``.
        tolerance: How many pixels outside the contour still count as inside.

    Returns:
        bool: whether the centre point is accepted.
    """
    x1, y1, x2, y2 = quad_bbox_xyxy
    cx, cy = (x1 + x2) // 2, (y1 + y2) // 2
    # measureDist must be True: with False, pointPolygonTest only returns
    # -1/0/+1, so comparing against -tolerance accepted every point.
    signed_distance = cv2.pointPolygonTest(
        bubble_contour, (float(cx), float(cy)), True)
    return signed_distance >= -tolerance
|
||
|
||
def split_indices_by_bubble(indices, ocr, bubble_contours):
    """Partition OCR indices by the first bubble contour that contains each quad.

    Returns one list per matched contour (in first-seen order) followed by a
    final list of indices that matched no contour, if there are any.
    """
    if not indices:
        return []

    per_bubble = {}
    unassigned = []
    for idx in indices:
        bbox = quad_bbox(ocr[idx][0])
        owner = next(
            (pos for pos, contour in enumerate(bubble_contours)
             if is_quad_in_bubble(bbox, contour)),
            None,
        )
        if owner is None:
            unassigned.append(idx)
        else:
            per_bubble.setdefault(owner, []).append(idx)

    groups = list(per_bubble.values())
    if unassigned:
        groups.append(unassigned)
    return groups
|
||
|
||
def check_vertical_alignment_split(indices, ocr, threshold=20):
    """Split indices into groups wherever consecutive quads are vertically far apart.

    Quads are ordered by their top edge; a new group starts when a quad's top
    is more than *threshold* pixels below the previous quad's bottom.
    """
    if len(indices) <= 1:
        return [indices]

    ordered = sorted(
        ((idx, quad_bbox(ocr[idx][0])) for idx in indices),
        key=lambda pair: pair[1][1],
    )

    groups = [[ordered[0][0]]]
    for (_, prev_box), (idx, box) in zip(ordered, ordered[1:]):
        if box[1] - prev_box[3] > threshold:
            groups.append([idx])
        else:
            groups[-1].append(idx)
    return groups
|
||
|
||
|
||
# ============================================================
|
||
# QUAD SIZE VALIDATION AND SPLITTING
|
||
# ============================================================
|
||
def is_quad_oversized(quad, median_height, width_threshold=8.0):
    """Return True when the quad is wider than ``median_height * width_threshold`` or has aspect ratio above 12."""
    left, top, right, bottom = quad_bbox(quad)
    width = right - left
    height = max(1, bottom - top)
    if width > median_height * width_threshold:
        return True
    return width / height > 12.0
|
||
|
||
def split_oversized_quad_by_content(image_bgr, quad, text, conf, median_height):
    """Split an over-wide quad at its widest blank column run.

    The quad region is Otsu-binarized and projected onto the x axis. The
    widest run of near-empty columns (at least ``max(0.8 * median_height, 15)``
    px wide) becomes the split position, and the text is cut at the space
    nearest to the proportional character position. Returns a list of
    ``(quad, text, conf)`` tuples: two entries on success, otherwise the
    original single entry.
    """
    unchanged = [(quad, text, conf)]

    x1, y1, x2, y2 = quad_bbox(quad)
    w, h = x2 - x1, max(1, y2 - y1)
    pad = 2
    img_h, img_w = image_bgr.shape[0], image_bgr.shape[1]
    roi = image_bgr[max(0, y1 - pad):min(img_h, y2 + pad),
                    max(0, x1):min(img_w, x2)]
    if roi.size == 0:
        return unchanged

    gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    column_ink = np.sum(binary, axis=0)

    gap_threshold = h * 255 * 0.20
    min_gap_width = max(int(median_height * 0.8), 15)

    # Collect (centre, width) of every sufficiently wide blank run that is
    # closed by an inked column; a run reaching the right edge is ignored.
    gaps = []
    run_start = None
    for col, ink in enumerate(column_ink):
        if ink < gap_threshold:
            if run_start is None:
                run_start = col
        elif run_start is not None:
            run_width = col - run_start
            if run_width >= min_gap_width:
                gaps.append((run_start + run_width // 2, run_width))
            run_start = None
    if not gaps:
        return unchanged

    widest_gap = max(gaps, key=lambda g: g[1])
    split_x_abs = max(0, x1) + widest_gap[0]

    if ' ' in text:
        char_w = w / max(1, len(text))
        estimate = int((split_x_abs - x1) / max(1e-6, char_w))
        space_positions = [pos for pos, ch in enumerate(text) if ch == ' ']
        cut = min(space_positions, key=lambda pos: abs(pos - estimate))
    else:
        cut = int(len(text) * (split_x_abs - x1) / w)
    left_text, right_text = text[:cut].strip(), text[cut:].strip()

    if not (left_text and right_text):
        return unchanged

    left_quad = [[x1, y1], [split_x_abs, y1], [split_x_abs, y2], [x1, y2]]
    right_quad = [[split_x_abs, y1], [x2, y1], [x2, y2], [split_x_abs, y2]]
    return [(left_quad, left_text, conf), (right_quad, right_text, conf)]
|
||
|
||
def validate_and_split_oversized_quads(image_bgr, filtered_ocr):
    """Split every oversized quad in *filtered_ocr* where a content gap allows it.

    Returns ``(new_ocr_list, number_of_quads_split)``. An empty input is
    returned unchanged with a count of 0.
    """
    if not filtered_ocr:
        return filtered_ocr, 0

    heights = []
    for quad, _, _ in filtered_ocr:
        box = quad_bbox(quad)
        heights.append(max(1, box[3] - box[1]))
    median_height = float(np.median(heights)) if heights else 14.0

    output = []
    split_count = 0
    for quad, text, conf in filtered_ocr:
        pieces = None
        if is_quad_oversized(quad, median_height, 8.0):
            pieces = split_oversized_quad_by_content(
                image_bgr, quad, text, conf, median_height)
        if pieces is not None and len(pieces) > 1:
            output.extend(pieces)
            split_count += 1
        else:
            output.append((quad, text, conf))
    return output, split_count
|
||
|
||
|
||
# ============================================================
|
||
# HORIZONTAL GAP DETECTION AT QUAD LEVEL
|
||
# ============================================================
|
||
def detect_horizontal_gap_in_group(indices, ocr, med_h, gap_factor=2.5):
    """Find the widest horizontal gap between neighbouring quads of a group.

    Quads are ordered by centre x. If the largest gap between consecutive
    boxes exceeds ``med_h * gap_factor`` the group is split there and
    ``(left_indices, right_indices)`` is returned; otherwise ``None``.
    """
    if len(indices) < 2:
        return None

    ordered = sorted(indices, key=lambda i: quad_center(ocr[i][0])[0])
    boxes = [quad_bbox(ocr[i][0]) for i in ordered]
    gap_threshold = med_h * gap_factor

    best_gap, best_split = 0.0, None
    for pos, (left_box, right_box) in enumerate(zip(boxes, boxes[1:])):
        gap = right_box[0] - left_box[2]
        if gap > gap_threshold and gap > best_gap:
            best_gap, best_split = gap, pos

    if best_split is None:
        return None

    left_group = ordered[:best_split + 1]
    right_group = ordered[best_split + 1:]
    if not left_group or not right_group:
        return None
    return (left_group, right_group)
|
||
|
||
def orientation_compatible(idx_a, idx_b, ocr):
    """Return False only when one quad is clearly vertical (w/h < 0.6) and the other clearly horizontal (w/h > 2.0)."""
    def aspect(idx):
        left, top, right, bottom = quad_bbox(ocr[idx][0])
        return max(1, right - left) / max(1, bottom - top)

    ratio_a = aspect(idx_a)
    ratio_b = aspect(idx_b)
    a_tall_b_wide = ratio_a < 0.6 and ratio_b > 2.0
    b_tall_a_wide = ratio_b < 0.6 and ratio_a > 2.0
    return not (a_tall_b_wide or b_tall_a_wide)
|
||
|
||
|
||
# ============================================================
|
||
# WIDE QUAD COLUMN SPLIT — pre-grouping
|
||
# ============================================================
|
||
def split_wide_quad_by_column_gap(image_bgr, quad, text, conf, med_h,
                                  min_gap_factor=1.8):
    """Split a wide quad in two at its widest blank column run.

    Quads narrower than ``3 * med_h`` are returned untouched. Otherwise the
    quad region is Otsu-binarized, projected onto the x axis, and the widest
    run of near-empty columns (at least ``max(med_h * min_gap_factor, 10)``
    px) is used as split position, provided both halves stay at least
    ``med_h`` wide. The text is cut at the space nearest to the proportional
    character position.

    Args:
        image_bgr: Source image (BGR).
        quad: Four-point quad of the OCR detection.
        text: Recognized text of the quad.
        conf: Confidence, copied to both halves.
        med_h: Median text-line height in pixels.
        min_gap_factor: Minimum gap width as a multiple of ``med_h``.

    Returns:
        A list of ``(quad, text, conf)`` tuples: two entries when split,
        otherwise the original single entry.
    """
    unchanged = [(quad, text, conf)]

    x1, y1, x2, y2 = quad_bbox(quad)
    w, h = x2 - x1, max(1, y2 - y1)
    if w < med_h * 3.0:
        return unchanged

    pad = 2
    roi_x0 = max(0, x1)
    roi = image_bgr[max(0, y1 - pad):min(image_bgr.shape[0], y2 + pad),
                    roi_x0:min(image_bgr.shape[1], x2)]
    if roi.size == 0:
        return unchanged

    gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    v_proj = np.sum(binary, axis=0)

    gap_threshold = h * 255 * 0.12
    min_gap_px = max(int(med_h * min_gap_factor), 10)

    # Collect (centre, width) of blank runs that are closed by an inked
    # column; a run reaching the right edge is ignored.
    gaps, gap_start = [], None
    for x, ink in enumerate(v_proj):
        if ink < gap_threshold:
            if gap_start is None:
                gap_start = x
        elif gap_start is not None:
            gw = x - gap_start
            if gw >= min_gap_px:
                gaps.append((gap_start + gw // 2, gw))
            gap_start = None
    if not gaps:
        return unchanged

    split_x_rel = max(gaps, key=lambda g: g[1])[0]
    # The ROI begins at max(0, x1), not x1: map the column back with the same
    # origin so quads that start left of the image are split where the gap is.
    split_x_abs = roi_x0 + split_x_rel
    offset_in_quad = split_x_abs - x1
    if offset_in_quad < med_h or x2 - split_x_abs < med_h:
        return unchanged

    if ' ' in text:
        char_w = w / max(1, len(text))
        split_idx = int(offset_in_quad / max(1e-6, char_w))
        spaces = [i for i, c in enumerate(text) if c == ' ']
        split_idx = min(spaces, key=lambda i: abs(i - split_idx))
    else:
        split_idx = int(len(text) * offset_in_quad / w)
    tl, tr = text[:split_idx].strip(), text[split_idx:].strip()

    if tl and tr:
        return [([[x1, y1], [split_x_abs, y1], [split_x_abs, y2], [x1, y2]], tl, conf),
                ([[split_x_abs, y1], [x2, y1], [x2, y2], [split_x_abs, y2]], tr, conf)]
    return unchanged
|
||
|
||
def apply_column_gap_splits(image_bgr, ocr_list, med_h):
    """Run ``split_wide_quad_by_column_gap`` over every OCR entry.

    Returns ``(new_ocr_list, number_of_quads_split)`` and prints a summary
    line when at least one quad was split.
    """
    expanded = []
    split_total = 0
    for quad, text, conf in ocr_list:
        pieces = split_wide_quad_by_column_gap(image_bgr, quad, text, conf, med_h)
        split_total += 1 if len(pieces) > 1 else 0
        expanded.extend(pieces)

    if split_total:
        print(f"📐 Column-gap split: {split_total} wide quad(s) split before grouping")
    return expanded, split_total
|
||
|
||
|
||
# ============================================================
|
||
# GENERALIZED BOX FIXING FUNCTIONS
|
||
# ============================================================
|
||
def detect_and_split_multi_bubble_boxes(bubble_boxes, bubble_indices, bubble_quads,
                                        bubbles, ocr, image_bgr):
    """Split grouped boxes that appear to cover more than one speech bubble.

    For every box with two or more quads, three strategies are tried in
    order and the first that yields more than one group wins:
      1. partition by detected bubble contour,
      2. partition by large vertical gaps (more than ``2 * med_h``),
      3. for boxes wider than ``10 * med_h``, a left/right split around the
         median centre x when the halves are more than ``1.5 * med_h`` apart.
    Boxes are renumbered from 1. Returns
    ``(bubbles, bubble_boxes, bubble_quads, bubble_indices)`` dicts.
    """
    heights = []
    for entry in ocr:
        bounds = quad_bbox(entry[0])
        heights.append(max(1, bounds[3] - bounds[1]))
    med_h = float(np.median(heights)) if heights else 14.0
    bubble_contours = detect_speech_bubbles(image_bgr)

    out_bubbles, out_boxes, out_quads, out_indices = {}, {}, {}, {}
    split_log = []
    next_bid = 1

    def keep_unchanged(bid, idxs):
        """Copy the existing box under the next free id."""
        nonlocal next_bid
        out_bubbles[next_bid] = bubbles[bid]
        out_boxes[next_bid] = bubble_boxes[bid]
        out_quads[next_bid] = bubble_quads[bid]
        out_indices[next_bid] = idxs
        next_bid += 1

    def emit_groups(groups):
        """Register each non-empty index group as a freshly built box."""
        nonlocal next_bid
        for grp in groups:
            if not grp:
                continue
            out_bubbles[next_bid] = build_lines_from_indices(grp, ocr)
            out_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in grp])
            out_quads[next_bid] = [ocr[i][0] for i in grp]
            out_indices[next_bid] = grp
            next_bid += 1

    for bid, indices in bubble_indices.items():
        if len(indices) < 2:
            keep_unchanged(bid, indices)
            continue

        by_contour = split_indices_by_bubble(indices, ocr, bubble_contours)
        if len(by_contour) > 1:
            emit_groups(by_contour)
            split_log.append(f"BOX#{bid} → {len(by_contour)} bubbles")
            continue

        by_rows = check_vertical_alignment_split(indices, ocr,
                                                 threshold=int(med_h * 2.0))
        if len(by_rows) > 1:
            emit_groups(by_rows)
            split_log.append(f"BOX#{bid} → {len(by_rows)} vertical groups")
            continue

        x1, y1, x2, y2 = bubble_boxes[bid]
        if (x2 - x1) > med_h * 10:
            x_median = np.median([quad_center(ocr[i][0])[0] for i in indices])
            left_group = [i for i in indices if quad_center(ocr[i][0])[0] < x_median]
            right_group = [i for i in indices if quad_center(ocr[i][0])[0] >= x_median]
            if left_group and right_group:
                left_box = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in left_group])
                right_box = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in right_group])
                if right_box[0] - left_box[2] > med_h * 1.5:
                    emit_groups([left_group, right_group])
                    split_log.append(f"BOX#{bid} → 2 horizontal panels")
                    continue

        keep_unchanged(bid, indices)

    if split_log:
        print(f"\n🔧 Split {len(split_log)} multi-bubble box(es):")
        for entry in split_log:
            print(f" ✓ {entry}")
    return out_bubbles, out_boxes, out_quads, out_indices
|
||
|
||
|
||
def detect_and_merge_fragmented_bubbles(bubble_boxes, bubble_indices, bubble_quads,
                                        bubbles, ocr, image_bgr):
    """Merge boxes whose centres lie in the same bubble contour and are close together.

    Two boxes are merge candidates when some detected contour contains both
    box centres and the centres differ by less than ``3 * med_h``
    horizontally and ``6 * med_h`` vertically. Candidate pairs are combined
    transitively, so each box id ends up in at most one merged group.
    Merged groups are numbered first (from 1), followed by the untouched
    boxes.

    Returns:
        ``(bubbles, bubble_boxes, bubble_quads, bubble_indices)``; the input
        dicts are returned as-is when nothing needs merging.
    """
    all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1])
             for i in range(len(ocr))]
    med_h = float(np.median(all_h)) if all_h else 14.0
    bubble_contours = detect_speech_bubbles(image_bgr)
    bids = list(bubble_boxes.keys())
    to_merge = []

    for i in range(len(bids)):
        for j in range(i + 1, len(bids)):
            bid_i, bid_j = bids[i], bids[j]
            box_i, box_j = bubble_boxes[bid_i], bubble_boxes[bid_j]
            cx_i = (box_i[0] + box_i[2]) / 2.0
            cy_i = (box_i[1] + box_i[3]) / 2.0
            cx_j = (box_j[0] + box_j[2]) / 2.0
            cy_j = (box_j[1] + box_j[3]) / 2.0
            in_same_bubble = any(
                cv2.pointPolygonTest(c, (cx_i, cy_i), False) >= 0 and
                cv2.pointPolygonTest(c, (cx_j, cy_j), False) >= 0
                for c in bubble_contours
            )
            if in_same_bubble:
                if abs(cx_i - cx_j) < med_h * 3.0 and abs(cy_i - cy_j) < med_h * 6.0:
                    to_merge.append((bid_i, bid_j) if cy_i < cy_j else (bid_j, bid_i))

    if not to_merge:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices

    print(f"\n🔗 Merging {len(to_merge)} fragmented bubble(s):")

    # Build connected groups. A pair may bridge several existing groups; all
    # of them are folded into the earliest one so that no id can appear in
    # two groups (which would duplicate its text in the output).
    merge_groups = []
    for top, bottom in to_merge:
        pair = {top, bottom}
        touching = [group for group in merge_groups if group & pair]
        if not touching:
            merge_groups.append(pair)
            continue
        target = touching[0]
        target |= pair
        absorbed = touching[1:]
        for other in absorbed:
            target |= other
        if absorbed:
            merge_groups = [group for group in merge_groups
                            if not any(group is other for other in absorbed)]

    new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {}
    merged_bids, next_bid = set(), 1
    for merge_set in merge_groups:
        merge_list = sorted(merge_set)
        print(f" ✓ Merging: {', '.join(f'#{b}' for b in merge_list)}")
        all_indices = sorted(set(idx for b in merge_list for idx in bubble_indices[b]))
        merged_bids.update(merge_list)
        new_bubbles[next_bid] = build_lines_from_indices(all_indices, ocr)
        new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in all_indices])
        new_quads[next_bid] = [ocr[i][0] for i in all_indices]
        new_indices[next_bid] = all_indices
        next_bid += 1

    for bid in bids:
        if bid not in merged_bids:
            new_bubbles[next_bid] = bubbles[bid]
            new_boxes[next_bid] = bubble_boxes[bid]
            new_quads[next_bid] = bubble_quads[bid]
            new_indices[next_bid] = bubble_indices[bid]
            next_bid += 1
    return new_bubbles, new_boxes, new_quads, new_indices
|
||
|
||
|
||
def merge_boxes_by_proximity_and_overlap(bubble_boxes, bubble_indices, bubble_quads,
                                         bubbles, ocr, med_h):
    """
    Merge boxes that are vertically close AND overlap substantially in x.

    Single-quad boxes take part like any other box, so a one-line detection
    sitting directly above or below a multi-line box gets folded into it.

    A later box is absorbed into an earlier one (ids in ascending order)
    when both hold:
      1. Vertical gap <= 1.5 x med_h
      2. Horizontal overlap / narrower box width >= 0.35

    Comparisons use the ORIGINAL extent of the absorbing box, so chains may
    need another call to resolve. Merged groups are numbered first (from 1),
    then the untouched boxes. When nothing merges, the inputs are returned
    unchanged.
    """
    bids = sorted(bubble_boxes.keys())
    merge_map: Dict[int, List[int]] = {}
    absorbed_by: Dict[int, int] = {}

    for pos, root in enumerate(bids):
        if root in absorbed_by:
            continue
        root_box = bubble_boxes[root]
        root_width = max(1, root_box[2] - root_box[0])

        for other in bids[pos + 1:]:
            if other in absorbed_by:
                continue
            other_box = bubble_boxes[other]
            other_width = max(1, other_box[2] - other_box[0])

            vert_gap = max(0, max(root_box[1], other_box[1])
                           - min(root_box[3], other_box[3]))
            shared_x = max(0, min(root_box[2], other_box[2])
                           - max(root_box[0], other_box[0]))
            shared_ratio = shared_x / max(1, min(root_width, other_width))

            if not (vert_gap <= med_h * 1.5 and shared_ratio >= 0.35):
                continue

            members = merge_map.setdefault(root, [root])
            if other not in members:
                members.append(other)
            absorbed_by[other] = root

    if not merge_map:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices

    print(f"\n🔀 Proximity+overlap merge: {len(merge_map)} group(s):")
    out_bubbles, out_boxes, out_quads, out_indices = {}, {}, {}, {}
    consumed = set()
    next_bid = 1

    for members in merge_map.values():
        member_ids = sorted(set(members))
        print(f" ✓ Merging: {', '.join(f'#{b}' for b in member_ids)}")
        combined = sorted(set(idx for b in member_ids for idx in bubble_indices[b]))
        out_bubbles[next_bid] = build_lines_from_indices(combined, ocr)
        out_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in combined])
        out_quads[next_bid] = [ocr[i][0] for i in combined]
        out_indices[next_bid] = combined
        next_bid += 1
        consumed.update(member_ids)

    for bid in bids:
        if bid in consumed:
            continue
        out_bubbles[next_bid] = bubbles[bid]
        out_boxes[next_bid] = bubble_boxes[bid]
        out_quads[next_bid] = bubble_quads[bid]
        out_indices[next_bid] = bubble_indices[bid]
        next_bid += 1

    return out_bubbles, out_boxes, out_quads, out_indices
|
||
|
||
|
||
def auto_fix_bubble_detection(bubble_boxes, bubble_indices, bubble_quads,
                              bubbles, ocr, image_bgr):
    """
    Run the complete box-repair pipeline and return
    ``(bubbles, bubble_boxes, bubble_quads, bubble_indices)``:

      1. Split boxes spanning several speech bubbles.
      2. Merge fragments found inside the same contour.
      3. Proximity+overlap merge, pass 1.
      4. Proximity+overlap merge, pass 2 (chains only visible after pass 1).
    """
    print("\n🔍 Running automatic bubble detection fixes...")
    heights = []
    for entry in ocr:
        bounds = quad_bbox(entry[0])
        heights.append(max(1, bounds[3] - bounds[1]))
    med_h = float(np.median(heights)) if heights else 14.0

    state = (bubbles, bubble_boxes, bubble_quads, bubble_indices)

    # Contour-based stages need the image.
    for stage in (detect_and_split_multi_bubble_boxes,
                  detect_and_merge_fragmented_bubbles):
        cur_bubbles, cur_boxes, cur_quads, cur_indices = state
        state = stage(cur_boxes, cur_indices, cur_quads, cur_bubbles, ocr, image_bgr)

    # Two geometric passes: the second catches chains created by the first.
    for _ in range(2):
        cur_bubbles, cur_boxes, cur_quads, cur_indices = state
        state = merge_boxes_by_proximity_and_overlap(
            cur_boxes, cur_indices, cur_quads, cur_bubbles, ocr, med_h)

    bubbles, bubble_boxes, bubble_quads, bubble_indices = state
    return bubbles, bubble_boxes, bubble_quads, bubble_indices
|
||
|
||
|
||
def remove_nested_boxes(bubble_boxes, bubble_indices, bubble_quads, bubbles,
                        overlap_threshold=0.50):
    """Drop the smaller of any two boxes that overlap heavily or share OCR indices.

    Two boxes conflict when their overlap ratio (relative to the smaller
    box) exceeds *overlap_threshold* or they have at least one index in
    common. The input dicts are modified in place and also returned as
    ``(bubbles, bubble_boxes, bubble_quads, bubble_indices)``.
    """
    def box_area(box):
        return max(0, box[2] - box[0]) * max(0, box[3] - box[1])

    bids = list(bubble_boxes.keys())
    doomed = set()

    for pos, bid_i in enumerate(bids):
        if bid_i in doomed:
            continue
        box_i = bubble_boxes[bid_i]
        area_i = box_area(box_i)

        for bid_j in bids[pos + 1:]:
            if bid_j in doomed:
                continue
            box_j = bubble_boxes[bid_j]
            area_j = box_area(box_j)
            shared = set(bubble_indices[bid_i]).intersection(bubble_indices[bid_j])
            overlap = boxes_overlap_ratio(box_i, box_j)
            if not (overlap > overlap_threshold or len(shared) > 0):
                continue

            if area_i >= area_j:
                doomed.add(bid_j)
                print(f" 🗑️ Removing BOX#{bid_j} (overlaps BOX#{bid_i})")
            else:
                doomed.add(bid_i)
                print(f" 🗑️ Removing BOX#{bid_i} (overlaps BOX#{bid_j})")
                # bid_i itself is gone; nothing more to compare it against.
                break

    if doomed:
        print(f"\n🧹 Removed {len(doomed)} overlapping/nested box(es)")
    for bid in doomed:
        for table in (bubble_boxes, bubble_indices, bubble_quads, bubbles):
            table.pop(bid, None)
    return bubbles, bubble_boxes, bubble_quads, bubble_indices
|
||
|
||
|
||
def enforce_max_box_size(bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr,
                         max_width_ratio=0.6, max_height_ratio=0.5, image_shape=None):
    """Split boxes that are too large relative to the page.

    A box is oversized when its width exceeds ``max_width_ratio`` of the
    image width or its height exceeds ``max_height_ratio`` of the image
    height. Oversized boxes are split by columns first (aggressive
    thresholds), then by rows; if neither split applies the box is kept.

    Returns ``(bubbles, boxes, quads, indices)``. When ``image_shape`` is
    ``None`` the inputs are returned untouched; otherwise new dictionaries
    are built and every box is renumbered consecutively from 1.
    """
    if image_shape is None:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices

    img_h, img_w = image_shape[:2]
    width_cap = img_w * max_width_ratio
    height_cap = img_h * max_height_ratio

    out_bubbles, out_boxes, out_quads, out_indices = {}, {}, {}, {}
    report = []

    def _emit_parts(parts):
        # Each part becomes its own box under the next free sequential id.
        for part in parts:
            new_id = len(out_boxes) + 1
            out_bubbles[new_id] = build_lines_from_indices(part, ocr)
            out_boxes[new_id] = boxes_union_xyxy([quad_bbox(ocr[k][0]) for k in part])
            out_quads[new_id] = [ocr[k][0] for k in part]
            out_indices[new_id] = part

    for bid, box in bubble_boxes.items():
        left, top, right, bottom = box
        w, h = right - left, bottom - top

        if w > width_cap or h > height_cap:
            members = bubble_indices[bid]

            by_columns = split_bubble_if_multiple_columns(
                members, ocr, bid=bid, use_aggressive_thresholds=True)
            if by_columns:
                _emit_parts(by_columns)
                report.append(f"BOX#{bid} (oversized: {w}x{h}px)")
                continue

            by_rows = split_bubble_if_multiple_rows(members, ocr, bid=bid)
            if by_rows:
                _emit_parts(by_rows)
                report.append(f"BOX#{bid} (oversized: {w}x{h}px)")
                continue

        # Normal-sized (or unsplittable) box: carry it over under a new id.
        new_id = len(out_boxes) + 1
        out_bubbles[new_id] = bubbles[bid]
        out_boxes[new_id] = box
        out_quads[new_id] = bubble_quads[bid]
        out_indices[new_id] = bubble_indices[bid]

    if report:
        print(f"\n📏 Split {len(report)} oversized box(es):")
        for entry in report:
            print(f" ✓ {entry}")

    return out_bubbles, out_boxes, out_quads, out_indices
|
||
|
||
|
||
def should_merge_groups(group1_indices, group2_indices, ocr, median_height,
                        max_vertical_gap=None):
    """Decide whether two index groups belong to the same vertical stack.

    The groups merge when the horizontal centres of their union boxes are at
    most ``1.8 * median_height`` apart and the vertical gap between the boxes
    does not exceed ``max_vertical_gap`` (default ``2.5 * median_height``).
    Returns ``False`` when either union box is ``None``.
    """
    gap_limit = median_height * 2.5 if max_vertical_gap is None else max_vertical_gap

    first = boxes_union_xyxy([quad_bbox(ocr[k][0]) for k in group1_indices])
    second = boxes_union_xyxy([quad_bbox(ocr[k][0]) for k in group2_indices])
    if first is None or second is None:
        return False

    first_cx = (first[0] + first[2]) / 2.0
    second_cx = (second[0] + second[2]) / 2.0
    if abs(first_cx - second_cx) > median_height * 1.8:
        return False

    # Distance between the lower box's top edge and the upper box's bottom
    # edge; clamped to 0 when the boxes overlap vertically.
    lower_top = max(first[1], second[1])
    upper_bottom = min(first[3], second[3])
    return max(0, lower_top - upper_bottom) <= gap_limit
|
||
|
||
|
||
# ============================================================
|
||
# ENHANCED OCR ENGINE
|
||
# ============================================================
|
||
class ImprovedMacVisionDetector:
    """Multi-pass Apple Vision OCR front end.

    ``read`` runs Vision text recognition over several preprocessed,
    upscaled variants of the image and merges the overlapping detections
    into one ``(quad, text, confidence)`` list in source-image pixels.
    """

    # Every preprocessing variant is upscaled by this factor, and merged
    # quads are divided by it again; keeping a single constant prevents the
    # two steps from drifting apart.
    UPSCALE_FACTOR = 2.5

    _LANG_MAP = {
        "en": "en-US", "english": "en-US",
        "es": "es-ES", "spanish": "es-ES",
        "ca": "ca-ES", "catalan": "ca-ES",
        "fr": "fr-FR", "french": "fr-FR",
        "ja": "ja-JP", "japanese": "ja-JP",
        "it": "it-IT", "italian": "it-IT",
        "de": "de-DE", "german": "de-DE",
        "ko": "ko-KR", "korean": "ko-KR",
        "zh": "zh-Hans", "chinese": "zh-Hans",
    }

    def __init__(self, source_lang="en"):
        """Resolve ``source_lang`` (code or English name) to a Vision locale.

        Unknown, empty or ``None`` values fall back to ``"en-US"``.
        """
        lang_key = (source_lang or "en").lower().strip()
        self.langs = [self._LANG_MAP.get(lang_key, "en-US")]
        print(f"⚡ Using Enhanced Apple Vision OCR (Language: {self.langs[0]})")

    def preprocess_variants(self, image_bgr):
        """Return ``[(name, image)]`` variants, each upscaled by UPSCALE_FACTOR."""
        scale = self.UPSCALE_FACTOR

        def upscaled(img):
            return cv2.resize(img, None, fx=scale, fy=scale,
                              interpolation=cv2.INTER_CUBIC)

        variants = [("enhanced", enhance_image_for_ocr(image_bgr, upscale_factor=scale))]

        gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
        _, hc = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        variants.append(("high_contrast",
                         cv2.cvtColor(upscaled(hc), cv2.COLOR_GRAY2BGR)))
        variants.append(("bilateral",
                         upscaled(cv2.bilateralFilter(image_bgr, 9, 75, 75))))
        variants.append(("inverted", upscaled(cv2.bitwise_not(image_bgr))))
        variants.append(("original", upscaled(image_bgr)))
        return variants

    def run_vision_ocr(self, image_bgr):
        """Run one Vision text-recognition pass on ``image_bgr``.

        Returns a list of ``(quad, text, confidence)`` where ``quad`` is an
        axis-aligned rectangle ``[[x, y], ...]`` (top-left origin) in the
        pixel space of ``image_bgr``. Returns ``[]`` for empty input or when
        PNG encoding fails.
        """
        if image_bgr is None or image_bgr.size == 0:
            return []
        ih, iw = image_bgr.shape[:2]
        success, buffer = cv2.imencode('.png', image_bgr)
        if not success:
            return []
        png_bytes = buffer.tobytes()  # serialise once, reuse for the length
        ns_data = NSData.dataWithBytes_length_(png_bytes, len(png_bytes))
        handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(ns_data, None)
        results = []

        def completion_handler(request, error):
            if error:
                return
            # results() may come back as nil (None) when nothing was found.
            for obs in request.results() or []:
                candidates = obs.topCandidates_(1)
                if candidates is None or len(candidates) == 0:
                    continue
                candidate = candidates[0]
                text, conf = candidate.string(), candidate.confidence()
                bbox = obs.boundingBox()
                # Vision boxes are normalised with a bottom-left origin;
                # scale to pixels and flip the y axis.
                x = bbox.origin.x * iw
                y_bl = bbox.origin.y * ih
                w = bbox.size.width * iw
                h = bbox.size.height * ih
                y = ih - y_bl - h
                quad = [[int(x), int(y)], [int(x + w), int(y)],
                        [int(x + w), int(y + h)], [int(x), int(y + h)]]
                results.append((quad, text, conf))

        req = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(completion_handler)
        req.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate)
        req.setUsesLanguageCorrection_(False)
        req.setRecognitionLanguages_(self.langs)
        req.setAutomaticallyDetectsLanguage_(True)
        handler.performRequests_error_([req], None)
        return results

    def merge_multi_pass_results(self, all_results, original_shape):
        """Merge per-variant detections into one de-duplicated list.

        ``all_results`` is ``[(variant_name, [(quad, text, conf), ...])]``
        in upscaled coordinates. Quads are scaled back by UPSCALE_FACTOR,
        clustered by bounding-box IoU > 0.5 against each cluster's first
        member, and each cluster yields the highest-confidence quad with the
        text that collected the largest summed confidence.

        ``original_shape`` is accepted for interface compatibility and is
        not used.
        """
        if not all_results:
            return []

        scale_factor = self.UPSCALE_FACTOR
        normalized = []
        for variant_name, results in all_results:
            for quad, text, conf in results:
                sq = [[int(p[0] / scale_factor), int(p[1] / scale_factor)] for p in quad]
                normalized.append((sq, text, conf, variant_name))

        def quads_overlap(q1, q2, threshold=0.5):
            b1, b2 = quad_bbox(q1), quad_bbox(q2)
            x1, y1 = max(b1[0], b2[0]), max(b1[1], b2[1])
            x2, y2 = min(b1[2], b2[2]), min(b1[3], b2[3])
            if x2 < x1 or y2 < y1:
                return False
            inter = (x2 - x1) * (y2 - y1)
            union = ((b1[2] - b1[0]) * (b1[3] - b1[1]) +
                     (b2[2] - b2[0]) * (b2[3] - b2[1]) - inter)
            return inter / max(union, 1) > threshold

        clusters, used = [], set()
        for i, (q1, t1, c1, v1) in enumerate(normalized):
            if i in used:
                continue
            cluster = [(q1, t1, c1, v1)]
            used.add(i)
            for j, (q2, t2, c2, v2) in enumerate(normalized):
                if j in used or i == j:
                    continue
                if quads_overlap(q1, q2):
                    cluster.append((q2, t2, c2, v2))
                    used.add(j)
            clusters.append(cluster)

        final_results = []
        for cluster in clusters:
            cluster.sort(key=lambda x: x[2], reverse=True)
            best_quad, best_text, best_conf, _ = cluster[0]

            # Confidence-weighted vote over the normalised readings.
            text_votes = {}
            for _, text, conf, _ in cluster:
                n = normalize_text(text)
                if n:
                    text_votes[n] = text_votes.get(n, 0) + conf
            if text_votes:
                voted = max(text_votes.items(), key=lambda x: x[1])[0]
                if voted != normalize_text(best_text):
                    best_text = voted
            final_results.append((best_quad, fix_common_ocr_errors(best_text), best_conf))
        return final_results

    def read(self, image_path_or_array):
        """OCR an image given as a file path or a BGR array.

        Returns merged ``(quad, text, confidence)`` tuples, or ``[]`` when
        the image cannot be loaded or is empty.
        """
        img = cv2.imread(image_path_or_array) if isinstance(image_path_or_array, str) \
            else image_path_or_array
        if img is None or img.size == 0:
            return []
        variants = self.preprocess_variants(img)
        all_results = []
        for vname, vimg in variants:
            r = self.run_vision_ocr(vimg)
            if r:
                all_results.append((vname, r))
        return self.merge_multi_pass_results(all_results, img.shape)
|
||
|
||
|
||
class MacVisionDetector:
    """Single-pass Apple Vision OCR with language correction enabled."""

    _LANG_MAP = {
        "en": "en-US", "english": "en-US",
        "es": "es-ES", "spanish": "es-ES",
        "ca": "ca-ES", "catalan": "ca-ES",
        "fr": "fr-FR", "french": "fr-FR",
        "ja": "ja-JP", "japanese": "ja-JP",
        "it": "it-IT", "italian": "it-IT",
        "de": "de-DE", "german": "de-DE",
        "ko": "ko-KR", "korean": "ko-KR",
        "zh": "zh-Hans", "chinese": "zh-Hans",
    }

    def __init__(self, source_lang="en"):
        """Resolve ``source_lang`` (code or English name) to a Vision locale.

        Unknown, empty or ``None`` values fall back to ``"en-US"``.
        """
        lang_key = (source_lang or "en").lower().strip()
        self.langs = [self._LANG_MAP.get(lang_key, "en-US")]
        print(f"⚡ Using Apple Vision OCR (Language: {self.langs[0]})")

    def read(self, image_path_or_array):
        """OCR an image given as a file path or a BGR array.

        Returns ``[(quad, text, confidence)]`` where ``quad`` is an
        axis-aligned rectangle ``[[x, y], ...]`` in top-left-origin pixel
        coordinates. Returns ``[]`` when the image cannot be loaded, is
        empty, or cannot be PNG-encoded.
        """
        img = cv2.imread(image_path_or_array) if isinstance(image_path_or_array, str) \
            else image_path_or_array
        if img is None or img.size == 0:
            return []
        ih, iw = img.shape[:2]
        success, buffer = cv2.imencode('.png', img)
        if not success:
            return []
        png_bytes = buffer.tobytes()  # serialise once, reuse for the length
        ns_data = NSData.dataWithBytes_length_(png_bytes, len(png_bytes))
        handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(ns_data, None)
        results = []

        def completion_handler(request, error):
            if error:
                return
            # results() may come back as nil (None) when nothing was found.
            for obs in request.results() or []:
                candidates = obs.topCandidates_(1)
                if candidates is None or len(candidates) == 0:
                    continue
                candidate = candidates[0]
                text, conf = candidate.string(), candidate.confidence()
                bbox = obs.boundingBox()
                # Vision boxes are normalised with a bottom-left origin;
                # scale to pixels and flip the y axis.
                x = bbox.origin.x * iw
                y_bl = bbox.origin.y * ih
                w = bbox.size.width * iw
                h = bbox.size.height * ih
                y = ih - y_bl - h
                quad = [[int(x), int(y)], [int(x + w), int(y)],
                        [int(x + w), int(y + h)], [int(x), int(y + h)]]
                results.append((quad, text, conf))

        req = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(completion_handler)
        req.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate)
        req.setUsesLanguageCorrection_(True)
        req.setRecognitionLanguages_(self.langs)
        req.setAutomaticallyDetectsLanguage_(True)
        handler.performRequests_error_([req], None)
        return results
|
||
|
||
|
||
# ============================================================
|
||
# COLUMN / ROW SPLITTING
|
||
# ============================================================
|
||
def split_bubble_if_multiple_columns(indices, ocr, bid=None,
                                     use_aggressive_thresholds=False):
    """Split a group at the widest horizontal gap between quad centres.

    The gap must exceed ``max(1.5 * median_height, 22)`` pixels, or
    ``max(1.2 * median_height, 18)`` with ``use_aggressive_thresholds``.
    Returns ``(left_indices, right_indices)`` or ``None`` when no gap
    qualifies. ``bid`` is accepted but not used.
    """
    if len(indices) < 2:
        return None

    rects = [quad_bbox(ocr[k][0]) for k in indices]
    heights = [max(1, r[3] - r[1]) for r in rects]
    median_h = float(np.median(heights)) if heights else 12.0

    centers_x = [(r[0] + r[2]) / 2.0 for r in rects]
    ordered = sorted(centers_x)

    if use_aggressive_thresholds:
        min_gap = max(median_h * 1.2, 18)
    else:
        min_gap = max(median_h * 1.5, 22)

    widest_pos, widest = None, 0.0
    for pos, (lo, hi) in enumerate(zip(ordered, ordered[1:])):
        distance = hi - lo
        if distance > min_gap and distance > widest:
            widest, widest_pos = distance, pos
    if widest_pos is None:
        return None

    split_x = (ordered[widest_pos] + ordered[widest_pos + 1]) / 2.0
    left_idxs = [k for k, cx in zip(indices, centers_x) if cx < split_x]
    right_idxs = [k for k, cx in zip(indices, centers_x) if cx >= split_x]
    if not left_idxs or not right_idxs:
        return None
    return (left_idxs, right_idxs)
|
||
|
||
def split_bubble_if_multiple_rows(indices, ocr, bid=None):
    """Split a group at the widest vertical gap between quad centres.

    The gap must exceed ``max(2 * median_height, 30)`` pixels. Returns
    ``(top_indices, bottom_indices)`` or ``None`` when no gap qualifies.
    ``bid`` is accepted but not used.
    """
    if len(indices) < 2:
        return None

    rects = [quad_bbox(ocr[k][0]) for k in indices]
    heights = [max(1, r[3] - r[1]) for r in rects]
    median_h = float(np.median(heights)) if heights else 12.0

    centers_y = [(r[1] + r[3]) / 2.0 for r in rects]
    ordered = sorted(centers_y)
    min_gap = max(median_h * 2.0, 30)

    widest_pos, widest = None, 0.0
    for pos, (upper, lower) in enumerate(zip(ordered, ordered[1:])):
        distance = lower - upper
        if distance > min_gap and distance > widest:
            widest, widest_pos = distance, pos
    if widest_pos is None:
        return None

    split_y = (ordered[widest_pos] + ordered[widest_pos + 1]) / 2.0
    top_idxs = [k for k, cy in zip(indices, centers_y) if cy < split_y]
    bot_idxs = [k for k, cy in zip(indices, centers_y) if cy >= split_y]
    if not top_idxs or not bot_idxs:
        return None
    return (top_idxs, bot_idxs)
|
||
|
||
|
||
def split_cluster_by_big_vertical_gap(indices, ocr, factor=1.9, min_gap=22):
    """Split a cluster at its largest edge-to-edge vertical gap.

    Quads are ordered by vertical centre; the gap between one quad's bottom
    edge and the next quad's top edge must exceed
    ``max(median_height * factor, min_gap)``. Returns
    ``(top_indices, bottom_indices)`` in that order, or ``None``.
    """
    if len(indices) < 2:
        return None

    rects = [quad_bbox(ocr[k][0]) for k in indices]
    heights = [max(1, r[3] - r[1]) for r in rects]
    median_h = float(np.median(heights)) if heights else 12.0

    stacked = sorted(zip(indices, rects),
                     key=lambda pair: (pair[1][1] + pair[1][3]) / 2.0)
    required = max(median_h * factor, min_gap)

    largest, cut_after = 0.0, None
    for pos in range(len(stacked) - 1):
        upper_rect = stacked[pos][1]
        lower_rect = stacked[pos + 1][1]
        spacing = lower_rect[1] - upper_rect[3]
        if spacing > required and spacing > largest:
            largest, cut_after = spacing, pos
    if cut_after is None:
        return None

    top_idxs = [idx for idx, _ in stacked[:cut_after + 1]]
    bot_idxs = [idx for idx, _ in stacked[cut_after + 1:]]
    if not top_idxs or not bot_idxs:
        return None
    return (top_idxs, bot_idxs)
|
||
|
||
|
||
def is_vertical_text_like(indices, ocr):
    """Heuristically detect a vertically stacked group of quads.

    True only when the median quad height is at least 1.2x the median
    width and the spread of centres along y is at least 1.5x the spread
    along x. Groups with fewer than two quads are never vertical.
    """
    if len(indices) < 2:
        return False

    rects = [quad_bbox(ocr[k][0]) for k in indices]
    median_h = float(np.median([max(1, r[3] - r[1]) for r in rects]))
    median_w = float(np.median([max(1, r[2] - r[0]) for r in rects]))
    if median_h < median_w * 1.2:
        return False

    centers_x = [(r[0] + r[2]) / 2.0 for r in rects]
    centers_y = [(r[1] + r[3]) / 2.0 for r in rects]
    spread_x = max(centers_x) - min(centers_x)
    spread_y = max(centers_y) - min(centers_y)
    return not (spread_y < spread_x * 1.5)
|
||
|
||
|
||
def split_nested_or_side_by_side(indices, ocr):
    """Split a group in two around the median horizontal centre.

    The cut sits midway between the two middle centre-x values. Returns
    ``(left_indices, right_indices)``, or ``None`` when one side would be
    empty or there are fewer than two quads.
    """
    if len(indices) < 2:
        return None

    def _center_x(k):
        rect = quad_bbox(ocr[k][0])
        return (rect[0] + rect[2]) / 2.0

    ordered = sorted(_center_x(k) for k in indices)
    middle = len(ordered) // 2
    split_x = (ordered[middle - 1] + ordered[middle]) / 2.0

    left_idxs, right_idxs = [], []
    for k in indices:
        (left_idxs if _center_x(k) < split_x else right_idxs).append(k)

    if not left_idxs or not right_idxs:
        return None
    return (left_idxs, right_idxs)
|
||
|
||
|
||
def split_panel_box(image_bgr, box_xyxy, bubble_quads=None):
    """Look for a vertical panel border running through the middle of a box.

    The box is clamped to the image, Canny edges are summed per column, and
    the columns in the central 35%-65% band whose edge energy reaches the
    band's 85th percentile are taken as border candidates; their median x
    is the split position.

    Args:
        image_bgr: BGR image array.
        box_xyxy: ``(x1, y1, x2, y2)`` box; used for array slicing, so the
            coordinates are expected to be integers.
        bubble_quads: optional quads of the box; when given, the split is
            rejected unless quad centres fall on both sides of it.

    Returns:
        ``(x1, x2, split_x)`` using the clamped box edges, or ``None`` when
        the box is degenerate, narrower than 100 px, or no valid split
        exists.
    """
    x1, y1, x2, y2 = box_xyxy
    ih, iw = image_bgr.shape[:2]
    x1, y1 = max(0, x1), max(0, y1)
    x2, y2 = min(iw - 1, x2), min(ih - 1, y2)
    if x2 <= x1 or y2 <= y1:
        return None

    # Reject narrow boxes before doing any image processing: the result
    # would be discarded anyway, so edge detection here is wasted work.
    w = x2 - x1
    if w < 100:
        return None

    crop = image_bgr[y1:y2, x1:x2]
    if crop.size == 0:
        return None

    gray = cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 50, 150)
    column_energy = np.sum(edges, axis=0)  # one value per crop column

    search_start = int(w * 0.35)
    search_end = int(w * 0.65)
    if search_end <= search_start:
        return None
    region = column_energy[search_start:search_end]
    if len(region) == 0:
        return None

    threshold = np.percentile(region, 85)
    candidates = [x1 + search_start + rx
                  for rx in range(len(region)) if region[rx] >= threshold]
    if not candidates:
        return None
    split_x = int(np.median(candidates))

    if bubble_quads:
        lc = sum(1 for q in bubble_quads if quad_center(q)[0] < split_x)
        rc = len(bubble_quads) - lc
        if lc == 0 or rc == 0:
            return None
    return (x1, x2, split_x)
|
||
|
||
|
||
# ============================================================
|
||
# MERGE CLOSE BUBBLES
|
||
# ============================================================
|
||
def merge_close_bubbles_by_line_height(bubbles, bubble_boxes, bubble_quads,
                                       bubble_indices, ocr):
    """Merge boxes that nearly touch and sit in the same column.

    Two boxes merge when their gap on both axes is within
    ``max(8, 1.4 * median_quad_height)`` and their horizontal overlap is at
    least 25% of the narrower box's width, which keeps side-by-side
    bubbles apart. Single-quad boxes are treated like any other box.

    Candidates are compared against the anchor box's original extent only.
    Returns ``(bubbles, boxes, quads, indices)``: the inputs unchanged when
    nothing merges, otherwise new dictionaries renumbered from 1.
    """
    if not bubbles:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices

    quad_heights = [max(1, rect[3] - rect[1])
                    for rect in (quad_bbox(entry[0]) for entry in ocr)]
    med_h = float(np.median(quad_heights)) if quad_heights else 14.0
    merge_tol = max(8, med_h * 1.4)

    ordered_ids = sorted(bubble_boxes.keys())
    absorbed, groups = set(), {}

    for pos, anchor in enumerate(ordered_ids):
        if anchor in absorbed:
            continue
        ax1, ay1, ax2, ay2 = bubble_boxes[anchor]
        anchor_w = max(1, ax2 - ax1)

        for other in ordered_ids[pos + 1:]:
            if other in absorbed:
                continue
            bx1, by1, bx2, by2 = bubble_boxes[other]
            other_w = max(1, bx2 - bx1)

            gap_x = max(0, max(ax1, bx1) - min(ax2, bx2))
            gap_y = max(0, max(ay1, by1) - min(ay2, by2))
            shared_w = max(0, min(ax2, bx2) - max(ax1, bx1))
            overlap_ratio = shared_w / max(1, min(anchor_w, other_w))

            if gap_x > merge_tol or gap_y > merge_tol or overlap_ratio < 0.25:
                continue
            groups.setdefault(anchor, [anchor]).append(other)
            absorbed.add(other)

    if not groups:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices

    new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {}
    survivors = [bid for bid in ordered_ids if bid not in absorbed]
    for new_id, bid in enumerate(survivors, start=1):
        if bid in groups:
            combined = sorted({idx for member in groups[bid]
                               for idx in bubble_indices[member]})
            new_bubbles[new_id] = build_lines_from_indices(combined, ocr)
            new_boxes[new_id] = boxes_union_xyxy([quad_bbox(ocr[k][0]) for k in combined])
            new_quads[new_id] = [ocr[k][0] for k in combined]
            new_indices[new_id] = combined
        else:
            new_bubbles[new_id] = bubbles[bid]
            new_boxes[new_id] = bubble_boxes[bid]
            new_quads[new_id] = bubble_quads[bid]
            new_indices[new_id] = bubble_indices[bid]

    return new_bubbles, new_boxes, new_quads, new_indices
|
||
|
||
|
||
# ============================================================
|
||
# WIDE / BRIDGE QUAD SPLITTING
|
||
# ============================================================
|
||
def split_wide_ocr_items(image_bgr, ocr_list, width_factor=8.0):
    """Split OCR items that are unusually wide for the page's line height.

    An item wider than ``median_height * width_factor`` is binarised
    (Otsu, inverted) and scanned for runs of nearly empty columns. The
    widest interior run of at least ``max(int(0.6 * median_height), 12)``
    columns gives the cut position; the text is divided at the nearest
    space, or proportionally when it has none. The item is replaced by two
    items only if both halves are non-empty.

    Returns ``(new_ocr_list, number_of_splits)``.
    """
    if not ocr_list:
        return ocr_list, 0

    line_heights = [max(1, quad_bbox(q)[3] - quad_bbox(q)[1]) for q, _, _ in ocr_list]
    med_h = float(np.median(line_heights)) if line_heights else 14.0

    output, n_splits = [], 0
    for quad, text, conf in ocr_list:
        x1, y1, x2, y2 = quad_bbox(quad)
        w = x2 - x1

        if w > med_h * width_factor:
            pad = 2
            row_from = max(0, y1 - pad)
            row_to = min(image_bgr.shape[0], y2 + pad)
            col_from = max(0, x1)
            col_to = min(image_bgr.shape[1], x2)
            roi = image_bgr[row_from:row_to, col_from:col_to]

            if roi.size > 0:
                gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
                _, ink = cv2.threshold(gray, 0, 255,
                                       cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
                column_ink = np.sum(ink, axis=0)
                blank_below = roi.shape[0] * 255 * 0.15
                min_run = max(int(med_h * 0.6), 12)

                # Collect (centre, width) of blank runs that are closed by an
                # inked column; a run reaching the right edge is not recorded.
                blank_runs, run_start = [], None
                for col in range(len(column_ink)):
                    if column_ink[col] < blank_below:
                        if run_start is None:
                            run_start = col
                    elif run_start is not None:
                        run_w = col - run_start
                        if run_w >= min_run:
                            blank_runs.append((run_start + run_w // 2, run_w))
                        run_start = None

                if blank_runs:
                    # Widest run wins; the earliest one on ties.
                    widest = max(blank_runs, key=lambda run: run[1])
                    split_x_abs = max(0, x1) + widest[0]

                    if ' ' in text:
                        char_w = w / max(1, len(text))
                        target = int((split_x_abs - x1) / max(1e-6, char_w))
                        space_positions = [pos for pos, ch in enumerate(text) if ch == ' ']
                        cut = target
                        if space_positions:
                            cut = min(space_positions, key=lambda pos: abs(pos - target))
                    else:
                        cut = int(len(text) * (split_x_abs - x1) / w)
                    tl, tr = text[:cut].strip(), text[cut:].strip()

                    if tl and tr:
                        left_quad = [[x1, y1], [split_x_abs, y1],
                                     [split_x_abs, y2], [x1, y2]]
                        right_quad = [[split_x_abs, y1], [x2, y1],
                                      [x2, y2], [split_x_abs, y2]]
                        output.append((left_quad, tl, conf))
                        output.append((right_quad, tr, conf))
                        n_splits += 1
                        continue

        output.append((quad, text, conf))

    return output, n_splits
|
||
|
||
|
||
def split_abnormal_bridge_quads(image_bgr, ocr_list, aspect_ratio_threshold=6.0):
    """Split OCR items whose width/height ratio is abnormally large.

    An item with ``width / height > aspect_ratio_threshold`` is binarised
    (Otsu, inverted) and scanned for runs of nearly empty columns. The
    widest interior run of at least ``max(int(0.8 * median_height), 15)``
    columns gives the cut position; the text is divided at the nearest
    space, or proportionally when it has none. The item is replaced by two
    items only if both halves are non-empty.

    Returns ``(new_ocr_list, number_of_splits)``.
    """
    if not ocr_list:
        return ocr_list, 0

    line_heights = [max(1, quad_bbox(q)[3] - quad_bbox(q)[1]) for q, _, _ in ocr_list]
    med_h = float(np.median(line_heights)) if line_heights else 14.0

    output, n_splits = [], 0
    for quad, text, conf in ocr_list:
        x1, y1, x2, y2 = quad_bbox(quad)
        w = x2 - x1
        h = max(1, y2 - y1)

        if w / h > aspect_ratio_threshold:
            pad = 2
            row_from = max(0, y1 - pad)
            row_to = min(image_bgr.shape[0], y2 + pad)
            col_from = max(0, x1)
            col_to = min(image_bgr.shape[1], x2)
            roi = image_bgr[row_from:row_to, col_from:col_to]

            if roi.size > 0:
                gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
                _, ink = cv2.threshold(gray, 0, 255,
                                       cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
                column_ink = np.sum(ink, axis=0)
                # Threshold is based on the quad height, not the padded ROI.
                blank_below = h * 255 * 0.20
                min_run = max(int(med_h * 0.8), 15)

                # Collect (centre, width) of blank runs that are closed by an
                # inked column; a run reaching the right edge is not recorded.
                blank_runs, run_start = [], None
                for col in range(len(column_ink)):
                    if column_ink[col] < blank_below:
                        if run_start is None:
                            run_start = col
                    elif run_start is not None:
                        run_w = col - run_start
                        if run_w >= min_run:
                            blank_runs.append((run_start + run_w // 2, run_w))
                        run_start = None

                if blank_runs:
                    # Widest run wins; the earliest one on ties.
                    widest = max(blank_runs, key=lambda run: run[1])
                    split_x_abs = max(0, x1) + widest[0]

                    if ' ' in text:
                        char_w = w / max(1, len(text))
                        target = int((split_x_abs - x1) / max(1e-6, char_w))
                        space_positions = [pos for pos, ch in enumerate(text) if ch == ' ']
                        cut = target
                        if space_positions:
                            cut = min(space_positions, key=lambda pos: abs(pos - target))
                    else:
                        cut = int(len(text) * (split_x_abs - x1) / w)
                    tl, tr = text[:cut].strip(), text[cut:].strip()

                    if tl and tr:
                        left_quad = [[x1, y1], [split_x_abs, y1],
                                     [split_x_abs, y2], [x1, y2]]
                        right_quad = [[split_x_abs, y1], [x2, y1],
                                      [x2, y2], [split_x_abs, y2]]
                        output.append((left_quad, tl, conf))
                        output.append((right_quad, tr, conf))
                        n_splits += 1
                        continue

        output.append((quad, text, conf))

    return output, n_splits
|
||
|
||
|
||
def normalize_ocr_quads(ocr_list):
    """Replace each quad with its bounding rectangle grown by 3 px per side.

    Text and confidence are passed through unchanged. Coordinates are not
    clamped, so they can become negative near the image border.
    """
    margin = 3

    def _padded_rect(quad):
        left, top, right, bottom = quad_bbox(quad)
        return [[left - margin, top - margin],
                [right + margin, top - margin],
                [right + margin, bottom + margin],
                [left - margin, bottom + margin]]

    return [(_padded_rect(quad), text, conf) for quad, text, conf in ocr_list]
|
||
|
||
|
||
# ============================================================
|
||
# VISION RE-READ
|
||
# ============================================================
|
||
def preprocess_variant(crop_bgr, mode):
    """Return a single-channel preprocessing variant of a BGR crop.

    Supported modes: ``"raw"``, ``"clahe"``, ``"adaptive"``, ``"otsu"``,
    ``"invert"``, ``"bilateral"`` and ``"morph_open"``. Any other mode
    yields the plain grayscale image, the same as ``"raw"``.
    """
    gray = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2GRAY)

    def _otsu(src):
        _, binary = cv2.threshold(src, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        return binary

    if mode == "clahe":
        out = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)).apply(gray)
    elif mode == "adaptive":
        smoothed = cv2.GaussianBlur(gray, (3, 3), 0)
        out = cv2.adaptiveThreshold(smoothed, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                    cv2.THRESH_BINARY, 35, 11)
    elif mode == "otsu":
        out = _otsu(cv2.GaussianBlur(gray, (3, 3), 0))
    elif mode == "invert":
        out = 255 - gray
    elif mode == "bilateral":
        out = _otsu(cv2.bilateralFilter(gray, 7, 60, 60))
    elif mode == "morph_open":
        out = cv2.morphologyEx(_otsu(gray), cv2.MORPH_OPEN, np.ones((2, 2), np.uint8))
    else:
        # "raw" and every unrecognised mode.
        out = gray
    return out
|
||
|
||
|
||
def rotate_image_keep_bounds(img, angle_deg):
    """Rotate ``img`` about its centre, enlarging the canvas to fit.

    The output size is the bounding box of the rotated image, so no
    content is cropped. Uncovered areas are filled with the value 255 and
    sampling uses bicubic interpolation.
    """
    height, width = img.shape[:2]
    center = (width / 2, height / 2)
    matrix = cv2.getRotationMatrix2D(center, angle_deg, 1.0)

    abs_cos = abs(matrix[0, 0])
    abs_sin = abs(matrix[0, 1])
    out_w = int((height * abs_sin) + (width * abs_cos))
    out_h = int((height * abs_cos) + (width * abs_sin))

    # Shift the transform so the rotated content is centred on the new canvas.
    matrix[0, 2] += (out_w / 2) - center[0]
    matrix[1, 2] += (out_h / 2) - center[1]

    return cv2.warpAffine(img, matrix, (out_w, out_h),
                          flags=cv2.INTER_CUBIC, borderValue=255)
|
||
|
||
|
||
def rebuild_text_from_vision_result(res):
    """Join OCR detections into one normalised string in reading order.

    Detections with blank text are ignored. The rest are grouped into rows
    by vertical centre (tolerance ``max(6, 0.75 * median_height)``), rows
    are ordered top to bottom and items within a row left to right.
    Returns ``""`` when nothing usable is present.
    """
    if not res:
        return ""

    # Each entry: (bbox, text, conf, center_x, center_y, height)
    entries = []
    for quad, text, conf in res:
        if not text or not text.strip():
            continue
        rect = quad_bbox(quad)
        entries.append((rect, text, conf,
                        (rect[0] + rect[2]) / 2.0,
                        (rect[1] + rect[3]) / 2.0,
                        max(1.0, rect[3] - rect[1])))
    if not entries:
        return ""

    median_h = float(np.median([entry[5] for entry in entries]))
    row_tol = max(6.0, median_h * 0.75)

    entries.sort(key=lambda entry: entry[4])
    rows = []
    for entry in entries:
        for row in rows:
            if abs(entry[4] - row["yc"]) <= row_tol:
                row["m"].append(entry)
                # Track the running mean so the row follows its members.
                row["yc"] = float(np.mean([member[4] for member in row["m"]]))
                break
        else:
            rows.append({"yc": entry[4], "m": [entry]})

    rows.sort(key=lambda row: row["yc"])
    row_texts = []
    for row in rows:
        left_to_right = sorted(row["m"], key=lambda member: member[3])
        row_texts.append(normalize_text(" ".join(member[1] for member in left_to_right)))

    return normalize_text(" ".join(filter(None, row_texts)))
|
||
|
||
|
||
def reread_bubble_with_vision(image_bgr, bbox_xyxy, vision_detector,
                              upscale=3.0, pad=24):
    """Re-run OCR on one bubble under several preprocessings and small tilts.

    The padded, clamped crop is upscaled by ``upscale`` and each of seven
    preprocessing modes is tried at 0 and +/-1.5 degrees. The detector's
    ``run_vision_ocr`` is used when it exists, otherwise its ``read``.

    Returns ``(text, score, "vision-reread")`` for the candidate with the
    highest ``ocr_candidate_score``, or ``(None, 0.0, "none")`` when the
    crop is empty or no candidate scores above zero.
    """
    img_h, img_w = image_bgr.shape[:2]
    left, top, right, bottom = bbox_xyxy
    left, top = max(0, int(left - pad)), max(0, int(top - pad))
    right, bottom = min(img_w, int(right + pad)), min(img_h, int(bottom + pad))

    crop = image_bgr[top:bottom, left:right]
    if crop.size == 0:
        return None, 0.0, "none"

    if hasattr(vision_detector, 'run_vision_ocr'):
        run_ocr = vision_detector.run_vision_ocr
    else:
        run_ocr = vision_detector.read

    modes = ["raw", "clahe", "adaptive", "otsu", "invert", "bilateral", "morph_open"]
    angles = [0.0, 1.5, -1.5]

    target_size = (int(crop.shape[1] * upscale), int(crop.shape[0] * upscale))
    enlarged = cv2.resize(crop, target_size, interpolation=cv2.INTER_CUBIC)

    best_text, best_score = "", 0.0
    for mode in modes:
        processed = preprocess_variant(enlarged, mode)
        if len(processed.shape) == 2:
            processed = cv2.cvtColor(processed, cv2.COLOR_GRAY2BGR)
        for angle in angles:
            rotated = rotate_image_keep_bounds(processed, angle)
            candidate = rebuild_text_from_vision_result(run_ocr(rotated))
            score = ocr_candidate_score(candidate)
            if score > best_score:
                best_text, best_score = candidate, score

    if not best_text:
        return None, 0.0, "none"
    return best_text, best_score, "vision-reread"
|
||
|
||
|
||
# ============================================================
|
||
# LINES + BUBBLES
|
||
# ============================================================
|
||
def build_lines_from_indices(indices, ocr):
    """Assemble the OCR items at ``indices`` into normalised text lines.

    Items are grouped into rows by vertical centre (tolerance
    ``max(6, 0.75 * median_height)``); rows are returned top to bottom,
    with each row's texts joined left to right by a single space.
    """
    if not indices:
        return []

    # Each entry: (ocr_index, bbox, center_x, center_y, height)
    entries = []
    for k in indices:
        rect = quad_bbox(ocr[k][0])
        entries.append((k, rect,
                        (rect[0] + rect[2]) / 2.0,
                        (rect[1] + rect[3]) / 2.0,
                        max(1.0, rect[3] - rect[1])))

    median_h = float(np.median([entry[4] for entry in entries])) if entries else 10.0
    row_tol = max(6.0, median_h * 0.75)

    entries.sort(key=lambda entry: entry[3])
    rows = []
    for entry in entries:
        for row in rows:
            if abs(entry[3] - row["yc"]) <= row_tol:
                row["m"].append(entry)
                # Track the running mean so the row follows its members.
                row["yc"] = float(np.mean([member[3] for member in row["m"]]))
                break
        else:
            rows.append({"yc": entry[3], "m": [entry]})

    rows.sort(key=lambda row: row["yc"])

    lines = []
    for row in rows:
        if not row["m"]:
            continue
        left_to_right = sorted(row["m"], key=lambda member: member[2])
        lines.append(normalize_text(" ".join(ocr[member[0]][1] for member in left_to_right)))
    return lines
|
||
|
||
|
||
def auto_gap(image_path, base=18, ref_w=750):
    """Scale the pixel gap ``base`` by image width relative to ``ref_w``.

    Returns ``base`` unchanged when the image cannot be read.
    """
    image = cv2.imread(image_path)
    if image is None:
        return base
    image_width = image.shape[1]
    return base * (image_width / ref_w)
|
||
|
||
|
||
def group_tokens_vertical(ocr, image_shape, gap_px=18, bbox_padding=1,
                          strict_mode=False):
    """Cluster OCR items into bubbles by stacking them vertically.

    Three passes are applied:

    1. Chain each unclaimed item with items below it that stay within the
       horizontal-offset, horizontal-gap, orientation and vertical-gap
       limits (all expressed in multiples of the median quad height).
    2. Merge chains accepted by ``should_merge_groups`` whose members are
       all pairwise orientation-compatible.
    3. Split any merged group where ``detect_horizontal_gap_in_group``
       reports a wide horizontal gap.

    ``strict_mode`` tightens the vertical gap limit from 2.5 to 2.0 median
    heights. ``gap_px`` and ``bbox_padding`` are accepted but not used.

    Returns ``(bubbles, bubble_boxes, bubble_quads, bubble_indices)`` keyed
    by 1-based bubble id, with boxes padded slightly and clamped to the image.
    """
    total = len(ocr)
    if total == 0:
        return {}, {}, {}, {}

    boxes = [quad_bbox(entry[0]) for entry in ocr]
    centers = [quad_center(entry[0]) for entry in ocr]
    heights = [max(1.0, rect[3] - rect[1]) for rect in boxes]
    med_h = float(np.median(heights)) if heights else 12.0

    max_vertical_gap = med_h * 2.0 if strict_mode else med_h * 2.5
    max_horizontal_offset = med_h * 1.8

    def _center_y(idx):
        return centers[idx][1]

    scan_order = sorted(range(total), key=lambda k: (centers[k][1], centers[k][0]))

    # --- Pass 1: grow vertical chains downward from each seed ----------
    groups, claimed = [], set()
    for seed in scan_order:
        if seed in claimed:
            continue
        chain = [seed]
        claimed.add(seed)
        chain_cx = centers[seed][0]

        for cand in scan_order:
            if cand in claimed or cand == seed:
                continue
            cand_cx, cand_cy = centers[cand]
            if cand_cy <= centers[seed][1]:
                continue
            if abs(chain_cx - cand_cx) > max_horizontal_offset:
                continue

            # Horizontal gap guard (measured against the seed box).
            gap_x = max(0, max(boxes[seed][0], boxes[cand][0])
                        - min(boxes[seed][2], boxes[cand][2]))
            if gap_x > med_h * 1.5:
                continue

            # Orientation compatibility guard.
            if not orientation_compatible(seed, cand, ocr):
                continue

            # Vertical gap is measured from the most recently added member.
            if boxes[cand][1] - boxes[chain[-1]][3] <= max_vertical_gap:
                chain.append(cand)
                claimed.add(cand)
                chain_cx = (chain_cx + cand_cx) / 2.0

        if chain:
            groups.append(chain)

    # --- Pass 2: merge chains that belong together ----------------------
    merged_groups, consumed = [], set()
    for gi, base_group in enumerate(groups):
        if gi in consumed:
            continue
        merged = list(base_group)
        consumed.add(gi)
        for gj, other_group in enumerate(groups):
            if gi == gj or gj in consumed:
                continue
            if not should_merge_groups(merged, other_group, ocr, med_h, max_vertical_gap):
                continue
            if all(orientation_compatible(a, b, ocr)
                   for a in merged for b in other_group):
                merged.extend(other_group)
                consumed.add(gj)
        merged_groups.append(sorted(merged, key=_center_y))

    # --- Pass 3: split groups separated by a wide horizontal gap --------
    final_groups = []
    for group in merged_groups:
        halves = detect_horizontal_gap_in_group(group, ocr, med_h, gap_factor=2.5)
        if halves:
            left_part, right_part = halves
            final_groups.append(sorted(left_part, key=_center_y))
            final_groups.append(sorted(right_part, key=_center_y))
        else:
            final_groups.append(group)

    final_groups.sort(key=lambda g: (min(centers[k][1] for k in g),
                                     min(centers[k][0] for k in g)))

    bubbles, bubble_boxes, bubble_quads, bubble_indices = {}, {}, {}, {}
    img_h, img_w = image_shape[:2]
    margin = max(1, int(round(med_h * 0.16)))

    for bid, members in enumerate(final_groups, start=1):
        lines = build_lines_from_indices(members, ocr)
        quads = [ocr[k][0] for k in members]
        union = boxes_union_xyxy([quad_bbox(q) for q in quads])
        if union is None:
            continue
        left, top, right, bottom = union
        bubbles[bid] = lines
        bubble_boxes[bid] = (max(0, left - margin), max(0, top - margin),
                             min(img_w - 1, right + margin),
                             min(img_h - 1, bottom + margin))
        bubble_quads[bid] = quads
        bubble_indices[bid] = members

    return bubbles, bubble_boxes, bubble_quads, bubble_indices
|
||
|
||
|
||
# ============================================================
|
||
# SPLIT HELPER — centralises all split strategies
|
||
# ============================================================
|
||
def _split_bubble_if_needed(bid, bubble_indices, bubble_quads, bubble_boxes,
                            filtered, image, iw, ih):
    """Try every split strategy on one bubble, in priority order.

    Returns ``((part1_indices, part2_indices), reason_str)`` for the first
    strategy that succeeds, or ``(None, None)`` when none applies.

    BOX#18 fix: for vertical-text-like bubbles the vertical-gap split is
    tried first with a lowered factor (1.4) so the gap between a top speech
    bubble and a bottom cluster triggers.

    ``iw`` and ``ih`` are accepted but not used.
    """
    members = bubble_indices[bid]
    box = bubble_boxes[bid]

    # 1. Vertical-stack gap (sensitive — catches top-vs-bottom cluster)
    if is_vertical_text_like(members, filtered):
        stack_split = split_cluster_by_big_vertical_gap(
            members, filtered, factor=1.4, min_gap=18)
        if stack_split:
            return stack_split, "vertical-stack y-gap"

    # 2. Panel border
    panel = split_panel_box(image, box, bubble_quads=bubble_quads[bid])
    if panel:
        split_x = panel[2]
        left_side, right_side = [], []
        for idx in members:
            if quad_center(filtered[idx][0])[0] < split_x:
                left_side.append(idx)
            else:
                right_side.append(idx)
        if left_side and right_side:
            return (left_side, right_side), "panel border"
        # NOTE(review): this fallback is attached to the left/right check
        # above; the original nesting could not be confirmed — verify.
        elif len(bubble_quads[bid]) >= 4:
            aggressive = split_bubble_if_multiple_columns(
                members, filtered, bid=bid, use_aggressive_thresholds=True)
            if aggressive:
                return aggressive, "aggressive column"

    # 3–6. Remaining strategies, each tried only if the previous one failed.
    fallbacks = (
        # 3. Column gap
        (lambda: split_bubble_if_multiple_columns(members, filtered, bid=bid),
         "vertical column"),
        # 4. Nested / side-by-side
        (lambda: split_nested_or_side_by_side(members, filtered),
         "nested/side-by-side"),
        # 5. Row split
        (lambda: split_bubble_if_multiple_rows(members, filtered, bid=bid),
         "horizontal row"),
        # 6. Large vertical gap (general, less sensitive)
        (lambda: split_cluster_by_big_vertical_gap(members, filtered,
                                                   factor=1.9, min_gap=22),
         "large vertical-gap"),
    )
    for attempt, reason in fallbacks:
        parts = attempt()
        if parts:
            return parts, reason

    return None, None
|
||
|
||
|
||
# ============================================================
|
||
# DEBUG / EXPORT
|
||
# ============================================================
|
||
def _wrap_debug_label(text, width=25):
    """Greedily wrap *text* into short lines for the debug overlay.

    A word is appended to the current line while the line (including its
    trailing space) plus the word stays under *width* characters. Empty
    lines are never emitted, even when a single word exceeds *width*.
    """
    wrapped, current = [], ""
    for word in text.split():
        if len(current) + len(word) < width:
            current += word + " "
            continue
        if current:
            wrapped.append(current.strip())
        current = word + " "
    if current:
        wrapped.append(current.strip())
    return wrapped


def save_debug_clusters(image_path, ocr, bubble_boxes, bubble_indices,
                        clean_lines=None, out_path="debug_clusters.png"):
    """
    Write a debug image showing every OCR quad and every grouped box.

    Args:
        image_path: path of the page image; if it cannot be read the
            function returns without writing anything.
        ocr: iterable of (quad, text, conf); each quad is whited out and
            outlined in grey.
        bubble_boxes: mapping box id -> (x1, y1, x2, y2).
        bubble_indices: mapping box id -> list of OCR indices; only its
            length is used, to pick the rectangle colour.
        clean_lines: optional mapping box id -> text drawn under the box.
        out_path: destination file for the annotated image.

    Single-quad boxes are drawn in orange (thicker) for visibility but are
    NOT labelled as (ISOLATED) — they participate fully in merge passes.
    """
    img = cv2.imread(image_path)
    if img is None:
        return

    # OpenCV images are BGR, so orange is (0, 165, 255) — not (255, 165, 0).
    orange_bgr = (0, 165, 255)
    green_bgr = (0, 220, 0)

    for bbox, _text, _conf in ocr:
        pts = np.array(bbox, dtype=np.int32)
        cv2.fillPoly(img, [pts], (255, 255, 255))
        cv2.polylines(img, [pts], True, (180, 180, 180), 1)

    for bid, bb in bubble_boxes.items():
        # cv2 drawing calls need integer pixel coordinates.
        x1, y1, x2, y2 = (int(v) for v in bb)
        single_quad = len(bubble_indices.get(bid, [])) == 1
        color = orange_bgr if single_quad else green_bgr
        thickness = 3 if single_quad else 2
        cv2.rectangle(img, (x1, y1), (x2, y2), color, thickness)
        cv2.putText(img, f"BOX#{bid}", (x1 + 2, max(15, y1 + 16)),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

        if clean_lines and bid in clean_lines:
            y_text = y2 + 18
            for line in _wrap_debug_label(clean_lines[bid]):
                # Thick black pass first, then a thin coloured pass on top,
                # so the label stays legible on any background.
                cv2.putText(img, line, (x1, y_text),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 3)
                cv2.putText(img, line, (x1, y_text),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1)
                y_text += 18

    cv2.imwrite(out_path, img)
|
||
|
||
|
||
def estimate_reading_order(bbox_dict, mode="ltr"):
    """
    Assign a 1-based reading position to every box.

    Boxes are scanned top to bottom by centre y and bucketed into rows: a
    box joins the first existing row whose mean centre y is within 90 px,
    otherwise it opens a new row. Rows are read top to bottom; inside a row
    boxes are read by centre x, descending when ``mode == "rtl"``.

    Returns a dict mapping box id -> reading position.
    """
    row_tolerance = 90

    centres = sorted(
        (
            (bid, (bb[0] + bb[2]) / 2.0, (bb[1] + bb[3]) / 2.0)
            for bid, bb in bbox_dict.items()
        ),
        key=lambda entry: entry[2],
    )

    bands = []
    for entry in centres:
        band = next(
            (cand for cand in bands if abs(entry[2] - cand["cy"]) <= row_tolerance),
            None,
        )
        if band is None:
            bands.append({"cy": entry[2], "items": [entry]})
        else:
            band["items"].append(entry)
            # Keep the row anchor at the mean centre y of its members.
            band["cy"] = float(np.mean([member[2] for member in band["items"]]))

    bands.sort(key=lambda cand: cand["cy"])

    right_to_left = (mode == "rtl")
    sequence = []
    for band in bands:
        band["items"].sort(key=lambda member: member[1], reverse=right_to_left)
        sequence.extend(member[0] for member in band["items"])

    return {bid: position + 1 for position, bid in enumerate(sequence)}
|
||
|
||
# ============================================================
# NAME / SHORT TOKEN RESCUE
# ============================================================
|
||
def _text_key_for_dedup(text: str) -> str:
    """Return a comparison key for *text*: its normalized form with every
    character outside ``A-Z``, ``À-Ý`` and ``0-9`` removed."""
    canonical = normalize_text(text or "")
    return re.sub(r'[^A-ZÀ-Ý0-9]', '', canonical)
|
||
|
||
def rescue_name_and_short_tokens(ocr_list, min_conf=0.20):
    """
    Collect plausible short/name tokens that strict filtering may drop.

    Each (quad, text, conf) item is normalized and tested against three
    rules, in order; the first rule that matches decides the confidence
    floor applied to the rescued item:

    1. letters-only form is in KNOWN_NAMES and conf >= min_conf -> floor 0.45
    2. is_protected_token(text) and conf >= min_conf            -> floor 0.40
    3. 2-8 letters and conf >= 0.25                             -> floor 0.35

    Returns rescued items as a list of (quad, normalized_text, conf).
    """
    rescued = []

    for quad, text, conf in ocr_list:
        token = normalize_text(text or "")
        if not token:
            continue

        letters = re.sub(r'[^A-ZÀ-Ý]', '', token)

        if letters in KNOWN_NAMES and conf >= min_conf:
            floor = 0.45
        elif is_protected_token(token) and conf >= min_conf:
            floor = 0.40
        elif (2 <= len(letters) <= 8 and conf >= 0.25
              and re.fullmatch(r'[A-ZÀ-Ý]{2,8}', letters)):
            floor = 0.35
        else:
            continue

        rescued.append((quad, token, max(conf, floor)))

    return rescued
|
||
|
||
def merge_rescued_items(base_ocr, rescued_ocr, iou_threshold=0.55):
    """
    Merge rescued tokens into the OCR list, skipping duplicates.

    A rescued item is a duplicate when an already-kept item (from
    *base_ocr* or rescued earlier in this call) has the same dedup text key
    and its bounding box overlaps with IoU >= *iou_threshold*.

    Args:
        base_ocr: list of (quad, text, conf) already accepted.
        rescued_ocr: list of (quad, text, conf) candidates to add.
        iou_threshold: minimum IoU for two same-text boxes to be duplicates.

    Returns:
        *base_ocr* itself when there is nothing to merge, otherwise a new
        list with the non-duplicate rescued items appended.
    """
    if not rescued_ocr:
        return base_ocr

    def iou_xyxy(a, b):
        ax1, ay1, ax2, ay2 = a
        bx1, by1, bx2, by2 = b
        ix1, iy1 = max(ax1, bx1), max(ay1, by1)
        ix2, iy2 = min(ax2, bx2), min(ay2, by2)
        inter = max(0, ix2 - ix1) * max(0, iy2 - iy1)
        if inter == 0:
            return 0.0
        area_a = max(0, ax2 - ax1) * max(0, ay2 - ay1)
        area_b = max(0, bx2 - bx1) * max(0, by2 - by1)
        return inter / max(1, area_a + area_b - inter)

    out = list(base_ocr)
    # Box and text key of every kept item, computed once instead of once
    # per (kept item, rescued item) pair.
    kept_signatures = [(quad_bbox(q), _text_key_for_dedup(t)) for q, t, _ in out]

    for rq, rt, rc in rescued_ocr:
        rb = quad_bbox(rq)
        rk = _text_key_for_dedup(rt)

        is_duplicate = any(
            rk == bk and iou_xyxy(rb, bb) >= iou_threshold
            for bb, bk in kept_signatures
        )
        if is_duplicate:
            continue

        out.append((rq, rt, rc))
        kept_signatures.append((rb, rk))

    return out
|
||
|
||
def _joined_text_for_indices(indices, ocr):
|
||
parts = []
|
||
for i in indices:
|
||
if i < 0 or i >= len(ocr):
|
||
continue
|
||
t = normalize_text(ocr[i][1])
|
||
if t:
|
||
parts.append(t)
|
||
s = " ".join(parts).strip()
|
||
return s, len(s)
|
||
|
||
def _in_same_bubble_contour(box_i, box_j, bubble_contours):
|
||
cx_i = (box_i[0] + box_i[2]) / 2.0
|
||
cy_i = (box_i[1] + box_i[3]) / 2.0
|
||
cx_j = (box_j[0] + box_j[2]) / 2.0
|
||
cy_j = (box_j[1] + box_j[3]) / 2.0
|
||
for c in bubble_contours:
|
||
if (cv2.pointPolygonTest(c, (cx_i, cy_i), False) >= 0 and
|
||
cv2.pointPolygonTest(c, (cx_j, cy_j), False) >= 0):
|
||
return True
|
||
return False
|
||
|
||
def merge_micro_boxes_relaxed(bubbles, bubble_boxes, bubble_quads, bubble_indices, ocr, image_bgr):
    """
    Relaxed merge for tiny interjection/name boxes (e.g. HUH? + MORNING).

    Two boxes are linked when both hold at most 12 characters of joined
    text, their vertical gap is <= 2.2 x and horizontal gap <= 2.0 x the
    median OCR quad height, and at least one of these holds: they overlap
    horizontally by >= 10% of the narrower box, one of them is a protected
    token, or both centres fall inside the same detected bubble contour.
    Links are transitive (union-find).

    Args:
        bubbles, bubble_boxes, bubble_quads, bubble_indices: per-box dicts
            keyed by box id (lines, (x1, y1, x2, y2), quads, OCR indices).
        ocr: list of (quad, text, conf) that the indices refer to.
        image_bgr: page image passed to detect_speech_bubbles.

    Returns:
        The four input dicts unchanged when nothing was merged, otherwise
        four new dicts renumbered from 1 with merged groups rebuilt from
        the union of their OCR indices.
    """
    bids = sorted(bubble_boxes.keys())
    if len(bids) < 2:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices

    heights = []
    for item in ocr:
        qb = quad_bbox(item[0])
        heights.append(max(1, qb[3] - qb[1]))
    med_h = float(np.median(heights)) if heights else 14.0
    bubble_contours = detect_speech_bubbles(image_bgr)

    parent = {b: b for b in bids}

    def find(x):
        while parent[x] != x:
            parent[x] = parent[parent[x]]  # path halving
            x = parent[x]
        return x

    def union(a, b):
        ra, rb = find(a), find(b)
        if ra != rb:
            parent[rb] = ra

    SHORT_TEXT_MAX_CHARS = 12

    # Text facts depend on a single box, so compute them once per box
    # rather than once per pair.
    is_short, is_protected = {}, {}
    for b in bids:
        txt, length = _joined_text_for_indices(bubble_indices[b], ocr)
        is_short[b] = length <= SHORT_TEXT_MAX_CHARS
        is_protected[b] = is_protected_token(txt)

    for pos, bi in enumerate(bids):
        for bj in bids[pos + 1:]:
            if not (is_short[bi] and is_short[bj]):
                continue

            box_i, box_j = bubble_boxes[bi], bubble_boxes[bj]
            gap_x = max(0, max(box_i[0], box_j[0]) - min(box_i[2], box_j[2]))
            vert_gap = max(0, max(box_i[1], box_j[1]) - min(box_i[3], box_j[3]))
            if not (vert_gap <= med_h * 2.2 and gap_x <= med_h * 2.0):
                continue

            wi = max(1, box_i[2] - box_i[0])
            wj = max(1, box_j[2] - box_j[0])
            h_overlap = max(0, min(box_i[2], box_j[2]) - max(box_i[0], box_j[0]))
            h_overlap_ratio = h_overlap / max(1, min(wi, wj))

            # The contour test is the only costly check; evaluate it last
            # and only when the cheaper hints did not already decide.
            if (h_overlap_ratio >= 0.10
                    or is_protected[bi] or is_protected[bj]
                    or _in_same_bubble_contour(box_i, box_j, bubble_contours)):
                union(bi, bj)

    groups = {}
    for b in bids:
        groups.setdefault(find(b), []).append(b)

    if all(len(members) == 1 for members in groups.values()):
        return bubbles, bubble_boxes, bubble_quads, bubble_indices

    new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {}
    for next_bid, group in enumerate(groups.values(), start=1):
        if len(group) == 1:
            b = group[0]
            new_bubbles[next_bid] = bubbles[b]
            new_boxes[next_bid] = bubble_boxes[b]
            new_quads[next_bid] = bubble_quads[b]
            new_indices[next_bid] = bubble_indices[b]
        else:
            all_idx = sorted(set(idx for b in group for idx in bubble_indices[b]))
            new_bubbles[next_bid] = build_lines_from_indices(all_idx, ocr)
            new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in all_idx])
            new_quads[next_bid] = [ocr[i][0] for i in all_idx]
            new_indices[next_bid] = all_idx

    return new_bubbles, new_boxes, new_quads, new_indices
|
||
|
||
def reattach_orphan_short_tokens(bubbles, bubble_boxes, bubble_quads, bubble_indices, ocr):
    """
    Reattach tiny orphan token boxes (e.g., single 'HUH?') to nearest plausible bubble.

    An orphan is a box with exactly one OCR item whose normalized text is a
    protected token or has at most 5 letters. Each orphan is merged into
    the other box whose centre is closest (Manhattan distance), provided
    the centres are within 2.2 x the median OCR quad height horizontally
    and 3.0 x vertically. Orphans with no such neighbour are left alone.

    NOTE: the four dicts are modified in place while merging.

    Returns:
        The (mutated) input dicts when nothing was merged; otherwise four
        new dicts renumbered 1..N in ascending order of the old ids.
    """
    bids = sorted(bubble_boxes.keys())
    if len(bids) < 2:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices

    heights = []
    for item in ocr:
        qb = quad_bbox(item[0])
        heights.append(max(1, qb[3] - qb[1]))
    med_h = float(np.median(heights)) if heights else 14.0

    orphan_bids = []
    for b in bids:
        idxs = bubble_indices.get(b, [])
        if len(idxs) != 1:
            continue
        t = normalize_text(ocr[idxs[0]][1])
        if is_protected_token(t) or len(re.sub(r'[^A-ZÀ-Ý]', '', t)) <= 5:
            orphan_bids.append(b)

    if not orphan_bids:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices

    consumed = set()

    for ob in orphan_bids:
        obox = bubble_boxes[ob]
        ocx = (obox[0] + obox[2]) / 2.0
        ocy = (obox[1] + obox[3]) / 2.0

        best_b = None
        best_d = 1e9

        for tb in bids:
            # Already-absorbed boxes stay in the dicts until the end, so
            # they must be skipped explicitly as merge targets.
            if tb == ob or tb in consumed:
                continue
            tbox = bubble_boxes[tb]
            dx = abs(ocx - (tbox[0] + tbox[2]) / 2.0)
            dy = abs(ocy - (tbox[1] + tbox[3]) / 2.0)

            if dx <= med_h * 2.2 and dy <= med_h * 3.0:
                d = dx + dy
                if d < best_d:
                    best_d = d
                    best_b = tb

        if best_b is not None:
            merged = sorted(set(bubble_indices[best_b] + bubble_indices[ob]))
            bubble_indices[best_b] = merged
            bubble_quads[best_b] = [ocr[i][0] for i in merged]
            bubble_boxes[best_b] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in merged])
            bubbles[best_b] = build_lines_from_indices(merged, ocr)
            consumed.add(ob)

    if not consumed:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices

    for b in consumed:
        bubble_indices.pop(b, None)
        bubble_quads.pop(b, None)
        bubble_boxes.pop(b, None)
        bubbles.pop(b, None)

    # Renumber for stable downstream order, using the shared helper.
    return _reindex_boxes(bubbles, bubble_boxes, bubble_quads, bubble_indices)
|
||
|
||
def _bubble_text(indices, ocr):
    """Return the normalized, space-joined text of the lines built from
    the given OCR indices."""
    joined = " ".join(build_lines_from_indices(indices, ocr))
    return normalize_text(joined)
|
||
|
||
def _box_dims(b):
|
||
return max(1, b[2]-b[0]), max(1, b[3]-b[1])
|
||
|
||
def _intersection(a, b):
|
||
ix1, iy1 = max(a[0], b[0]), max(a[1], b[1])
|
||
ix2, iy2 = min(a[2], b[2]), min(a[3], b[3])
|
||
w, h = max(0, ix2-ix1), max(0, iy2-iy1)
|
||
return w*h
|
||
|
||
def _containment_ratio(child, parent):
    """Return the fraction of *child*'s area that lies inside *parent*.

    Both are (x1, y1, x2, y2) boxes; the child area is clamped to at
    least 1 so the division is always defined.
    """
    child_w = child[2] - child[0]
    child_h = child[3] - child[1]
    shared = _intersection(child, parent)
    return shared / max(1, child_w * child_h)
|
||
|
||
def _center_distance(a, b):
|
||
acx, acy = (a[0]+a[2])/2.0, (a[1]+a[3])/2.0
|
||
bcx, bcy = (b[0]+b[2])/2.0, (b[1]+b[3])/2.0
|
||
return ((acx-bcx)**2 + (acy-bcy)**2) ** 0.5
|
||
|
||
def _reindex_boxes(bubbles, bubble_boxes, bubble_quads, bubble_indices):
|
||
new_b, new_bb, new_bq, new_bi = {}, {}, {}, {}
|
||
for nid, old in enumerate(sorted(bubble_boxes.keys()), start=1):
|
||
new_b[nid] = bubbles[old]
|
||
new_bb[nid] = bubble_boxes[old]
|
||
new_bq[nid] = bubble_quads[old]
|
||
new_bi[nid] = bubble_indices[old]
|
||
return new_b, new_bb, new_bq, new_bi
|
||
|
||
def reconcile_final_boxes(bubbles, bubble_boxes, bubble_quads, bubble_indices, ocr, image_bgr):
    """
    Final reconciliation pass over the grouped boxes.

    Repeatedly applies the first rule that finds a pair, one merge per
    iteration, until no rule fires:

    (A) overlap merge: overlap ratio >= 0.55, or IoU >= 0.35 with both
        centres inside the same bubble contour;
    (B) child absorption: >= 70% of the child box lies inside the parent
        and the child is tiny (small box or <= 14 chars) or its centre is
        within 4 x the median quad height of the parent's centre;
    (C) complementary fragments: same contour, vertical gap <= 2.8 x the
        median quad height, horizontal overlap >= 45% of the narrower box,
        and the two texts differ.

    Within each rule, pairs are scanned in ascending box-id order.

    NOTE: the four input dicts are modified in place while merging.

    Returns:
        The inputs unchanged when *bubble_boxes* is empty; otherwise the
        reconciled dicts renumbered 1..N via _reindex_boxes.
    """
    if not bubble_boxes:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices

    heights = []
    for item in ocr:
        qb = quad_bbox(item[0])
        heights.append(max(1, qb[3] - qb[1]))
    med_h = float(np.median(heights)) if heights else 14.0
    bubble_contours = detect_speech_bubbles(image_bgr)

    def absorb(keep, drop):
        """Fold box *drop* into box *keep* and delete *drop*."""
        idx = sorted(set(bubble_indices[keep] + bubble_indices[drop]))
        bubble_indices[keep] = idx
        bubble_quads[keep] = [ocr[k][0] for k in idx]
        bubble_boxes[keep] = boxes_union_xyxy([quad_bbox(ocr[k][0]) for k in idx])
        bubbles[keep] = build_lines_from_indices(idx, ocr)
        for store in (bubble_indices, bubble_quads, bubble_boxes, bubbles):
            store.pop(drop, None)

    def find_overlapping(bids):
        """Rule (A): first pair of highly overlapping boxes, as (keep, drop)."""
        for pos, bi in enumerate(bids):
            for bj in bids[pos + 1:]:
                a, b = bubble_boxes[bi], bubble_boxes[bj]
                iou = boxes_iou(a, b)
                ovs = boxes_overlap_ratio(a, b)  # inter / smaller
                if ovs >= 0.55 or (
                    iou >= 0.35 and _in_same_bubble_contour(a, b, bubble_contours)
                ):
                    return bi, bj
        return None

    def find_contained_child(bids):
        """Rule (B): first child mostly inside a parent, as (parent, child)."""
        for child in bids:
            for parent in bids:
                if child == parent:
                    continue
                cb, pb = bubble_boxes[child], bubble_boxes[parent]
                if not _containment_ratio(cb, pb) >= 0.70:
                    continue

                cw, ch = _box_dims(cb)
                # Text is only built once containment already holds.
                is_tiny = (cw <= med_h * 3.2 and ch <= med_h * 2.2) or (
                    len(_bubble_text(bubble_indices[child], ocr)) <= 14
                )
                if is_tiny or _center_distance(cb, pb) <= med_h * 4.0:
                    return parent, child
        return None

    def find_complementary(bids):
        """Rule (C): first upper/lower fragment pair, as (keep, drop)."""
        for pos, bi in enumerate(bids):
            for bj in bids[pos + 1:]:
                a, b = bubble_boxes[bi], bubble_boxes[bj]
                wi, _ = _box_dims(a)
                wj, _ = _box_dims(b)

                vert_gap = max(0, max(a[1], b[1]) - min(a[3], b[3]))
                h_ix = max(0, min(a[2], b[2]) - max(a[0], b[0]))
                h_overlap_ratio = h_ix / max(1, min(wi, wj))
                if not (vert_gap <= med_h * 2.8 and h_overlap_ratio >= 0.45):
                    continue
                if not _in_same_bubble_contour(a, b, bubble_contours):
                    continue

                # Identical text means a duplicate, not a complementary half.
                if _bubble_text(bubble_indices[bi], ocr) != _bubble_text(bubble_indices[bj], ocr):
                    return bi, bj
        return None

    while True:
        bids = sorted(bubble_boxes.keys())
        # `or` keeps the rule priority: (B) runs only if (A) found nothing,
        # (C) only if (B) found nothing.
        pair = (
            find_overlapping(bids)
            or find_contained_child(bids)
            or find_complementary(bids)
        )
        if pair is None:
            break
        absorb(*pair)

    return _reindex_boxes(bubbles, bubble_boxes, bubble_quads, bubble_indices)
|
||
|
||
def split_box_by_internal_vertical_gaps(bid, bubble_indices, ocr, factor=1.45, min_gap=16):
    """
    Multi-cut vertical splitter.

    Splits one bubble into N vertical groups when there are multiple strong
    y-gaps. Good for 4+4 quad accidental merges.

    The quads of box *bid* are ordered by centre y; a cut is placed between
    two consecutive quads when the gap between the upper quad's bottom and
    the lower quad's top exceeds ``max(min_gap, median_height * factor)``.

    A group is "meaningful" when it has >= 2 quads or >= 12 characters of
    normalized text. Groups that are not meaningful are never discarded:
    they are folded into the preceding meaningful group, or into the next
    one when none precedes them, so every input index appears in exactly
    one returned group.

    Args:
        bid: id of the box to inspect.
        bubble_indices: mapping box id -> list of OCR indices.
        ocr: list of (quad, text, conf).
        factor: multiplier on the median quad height for the gap threshold.
        min_gap: lower bound (pixels) for the gap threshold.

    Returns:
        A list of >= 2 index lists ordered top to bottom, or None when the
        box has fewer than 4 quads or fewer than 2 meaningful groups.
    """
    idxs = bubble_indices.get(bid, [])
    if len(idxs) < 4:
        return None

    items = []
    for i in idxs:
        b = quad_bbox(ocr[i][0])
        items.append((i, b, (b[1] + b[3]) / 2.0, max(1, b[3] - b[1])))
    items.sort(key=lambda entry: entry[2])  # top -> bottom

    med_h = float(np.median([entry[3] for entry in items]))
    threshold = max(min_gap, med_h * factor)

    # Open a new group at every strong gap between consecutive quads.
    groups = [[items[0][0]]]
    for upper, lower in zip(items, items[1:]):
        if lower[1][1] - upper[1][3] > threshold:
            groups.append([])
        groups[-1].append(lower[0])

    if len(groups) <= 1:
        return None

    def is_meaningful(group):
        if len(group) >= 2:
            return True
        txt = normalize_text(" ".join(build_lines_from_indices(group, ocr)))
        return len(txt) >= 12

    merged = []
    leading_weak = []  # weak groups seen before the first meaningful one
    for group in groups:
        if is_meaningful(group):
            merged.append(leading_weak + group)
            leading_weak = []
        elif merged:
            merged[-1].extend(group)
        else:
            leading_weak.extend(group)

    if len(merged) <= 1:
        return None

    return merged
|
||
|
||
def force_split_bridged_boxes(bubbles, bubble_boxes, bubble_quads, bubble_indices, ocr, image_bgr):
    """
    Force-split boxes that accidentally contain multiple vertically separated speech chunks.

    Typical fixes:
    - one detected box actually contains 2 stacked bubbles
    - "4 quads + 4 quads" merged into one cluster
    - mixed contour membership inside one grouped box

    For every box with >= 2 quads the strategies are tried in order and the
    first one that yields >= 2 parts wins:

    (A) split_box_by_internal_vertical_gaps (factor=1.45, min_gap=16);
    (B) with >= 3 quads: group quads by the bubble contour containing their
        centre (quads inside no contour form one group) and keep groups
        with >= 2 quads or >= 10 characters of text;
    (C) with >= 8 quads: halve the quads by centre y and accept when the
        halves are separated by a vertical gap of at least
        max(14, 0.22 x mean half height) and overlap horizontally by
        >= 30% of the narrower half.

    Any quad of the box that the chosen strategy left out of its parts is
    attached to the part holding the vertically nearest quad, so a split
    never drops OCR items.

    Returns:
        The inputs unchanged when *bubble_boxes* is empty; otherwise four
        new dicts (lines, boxes, quads, indices) renumbered from 1 in
        ascending order of the old ids, split parts in the order produced.
    """
    if not bubble_boxes:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices

    bubble_contours = detect_speech_bubbles(image_bgr)

    def center_y(i):
        b = quad_bbox(ocr[i][0])
        return (b[1] + b[3]) / 2.0

    def contour_id_for_idx(i):
        """Index of the first contour containing quad i's centre, or -1."""
        b = quad_bbox(ocr[i][0])
        cx = (b[0] + b[2]) / 2.0
        cy = (b[1] + b[3]) / 2.0
        for ci, c in enumerate(bubble_contours):
            if cv2.pointPolygonTest(c, (cx, cy), False) >= 0:
                return ci
        return -1

    def build_group_payload(g):
        g_sorted = sorted(g, key=lambda i: quad_center(ocr[i][0])[1])
        ub = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in g_sorted])
        return (
            build_lines_from_indices(g_sorted, ocr),  # lines
            ub,                                       # box
            [ocr[i][0] for i in g_sorted],            # quads
            g_sorted,                                 # indices
        )

    def split_by_contour(idxs):
        """Strategy (B): parts by contour membership, or None."""
        by_contour = {}
        for i in idxs:
            by_contour.setdefault(contour_id_for_idx(i), []).append(i)

        contour_groups = list(by_contour.values())
        if len(contour_groups) < 2:
            return None

        # sort groups top->bottom for stable order
        contour_groups.sort(key=lambda g: min(quad_bbox(ocr[i][0])[1] for i in g))

        # sanity: tiny noise-only tails do not justify a part of their own
        valid = []
        for g in contour_groups:
            txt = normalize_text(" ".join(build_lines_from_indices(g, ocr)))
            if len(g) >= 2 or len(txt) >= 10:
                valid.append(g)

        return valid if len(valid) >= 2 else None

    def split_balanced(idxs):
        """Strategy (C): two stacked halves (e.g. 4 + 4 quads), or None."""
        ordered = sorted(idxs, key=center_y)
        mid = len(ordered) // 2
        g1, g2 = ordered[:mid], ordered[mid:]

        b1 = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in g1])
        b2 = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in g2])
        if not (b1 and b2):
            return None

        vgap = max(0, b2[1] - b1[3])
        h1 = max(1, b1[3] - b1[1])
        h2 = max(1, b2[3] - b2[1])
        med_local_h = (h1 + h2) / 2.0

        h_ix = max(0, min(b1[2], b2[2]) - max(b1[0], b2[0]))
        min_w = max(1, min(b1[2] - b1[0], b2[2] - b2[0]))
        h_overlap_ratio = h_ix / min_w

        if vgap >= max(14, 0.22 * med_local_h) and h_overlap_ratio >= 0.30:
            return [g1, g2]
        return None

    def restore_dropped(idxs, parts):
        """Attach indices missing from *parts* to the vertically nearest part."""
        covered = {i for g in parts for i in g}
        missing = [i for i in idxs if i not in covered]
        if not missing:
            return parts

        parts = [list(g) for g in parts if g]
        for i in missing:
            cy = center_y(i)
            nearest = min(
                parts,
                key=lambda g: min(abs(cy - center_y(k)) for k in g),
            )
            nearest.append(i)
        return parts

    # One (lines, box, quads, indices) record per output box, in order.
    records = []

    for bid in sorted(bubble_boxes.keys()):
        idxs = bubble_indices.get(bid, [])

        parts = None
        if len(idxs) >= 2:
            parts = split_box_by_internal_vertical_gaps(
                bid, bubble_indices, ocr, factor=1.45, min_gap=16
            )
            if parts is None and len(idxs) >= 3:
                parts = split_by_contour(idxs)
            if parts is None and len(idxs) >= 8:
                parts = split_balanced(idxs)

        if parts is None or len(parts) <= 1:
            # keep as-is
            records.append(
                (bubbles[bid], bubble_boxes[bid], bubble_quads[bid], bubble_indices[bid])
            )
            continue

        for g in restore_dropped(idxs, parts):
            records.append(build_group_payload(g))

    new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {}
    for next_bid, (lines, box, quads, gidx) in enumerate(records, start=1):
        new_bubbles[next_bid] = lines
        new_boxes[next_bid] = box
        new_quads[next_bid] = quads
        new_indices[next_bid] = gidx

    return new_bubbles, new_boxes, new_quads, new_indices
|
||
|
||
def translate_manga_text(
|
||
image_path="001-page.png",
|
||
source_lang="en",
|
||
target_lang="ca",
|
||
confidence_threshold=0.03,
|
||
min_text_length=1,
|
||
gap_px="auto",
|
||
quality_threshold=0.62,
|
||
export_to_file="output.txt",
|
||
export_bubbles_to="bubbles.json",
|
||
reading_mode="ltr",
|
||
debug=True,
|
||
use_enhanced_ocr=True,
|
||
strict_grouping=True,
|
||
max_box_width_ratio=0.6,
|
||
max_box_height_ratio=0.5,
|
||
auto_fix_bubbles=True
|
||
):
|
||
image = cv2.imread(image_path)
|
||
if image is None:
|
||
print(f"❌ Cannot load image: {image_path}")
|
||
return
|
||
|
||
resolved_gap = auto_gap(image_path) if gap_px == "auto" else float(gap_px)
|
||
ih, iw = image.shape[:2]
|
||
print("Loading OCR engines...")
|
||
|
||
if use_enhanced_ocr:
|
||
detector = ImprovedMacVisionDetector(source_lang=source_lang)
|
||
print("🚀 Using Enhanced Multi-Pass OCR")
|
||
else:
|
||
detector = MacVisionDetector(source_lang=source_lang)
|
||
|
||
print("Running detection OCR (Apple Vision)...")
|
||
raw = detector.read(image_path)
|
||
print(f"Raw detections: {len(raw)}")
|
||
|
||
if use_enhanced_ocr:
|
||
existing_quads = [r[0] for r in raw]
|
||
missed_regions = detect_small_text_regions(image, existing_quads)
|
||
if missed_regions:
|
||
print(f"🔍 Found {len(missed_regions)} potentially missed text regions")
|
||
for region in missed_regions:
|
||
rx1, ry1, rx2, ry2 = region
|
||
pad = 10
|
||
rx1, ry1 = max(0, rx1 - pad), max(0, ry1 - pad)
|
||
rx2, ry2 = min(iw, rx2 + pad), min(ih, ry2 + pad)
|
||
crop = image[ry1:ry2, rx1:rx2]
|
||
if crop.size > 0:
|
||
upscaled = cv2.resize(crop, None, fx=4.0, fy=4.0, interpolation=cv2.INTER_CUBIC)
|
||
for quad, text, conf in detector.run_vision_ocr(upscaled):
|
||
raw.append(([[int(p[0] / 4.0 + rx1), int(p[1] / 4.0 + ry1)] for p in quad], text, conf))
|
||
print(f"📝 Total detections after missed region scan: {len(raw)}")
|
||
|
||
# ── Filtering ─────────────────────────────────────────────────────────
|
||
filtered, skipped = [], 0
|
||
for bbox, text, conf in raw:
|
||
t = normalize_text(text)
|
||
qb = quad_bbox(bbox)
|
||
|
||
if conf < confidence_threshold:
|
||
skipped += 1
|
||
continue
|
||
if len(t) < min_text_length:
|
||
skipped += 1
|
||
continue
|
||
if not is_valid_language(t, source_lang):
|
||
skipped += 1
|
||
continue
|
||
if not is_meaningful_text(t, source_lang):
|
||
skipped += 1
|
||
continue
|
||
if qb[1] < int(ih * TOP_BAND_RATIO) and conf < 0.70 and len(t) >= 5:
|
||
skipped += 1
|
||
continue
|
||
|
||
filtered.append((bbox, t, conf))
|
||
|
||
print(f"Kept: {len(filtered)} | Skipped: {skipped}")
|
||
|
||
# Protect short dialogue token confidence
|
||
tmp = []
|
||
for bbox, t, conf in filtered:
|
||
tmp.append((bbox, t, maybe_conf_floor_for_protected(t, conf, floor=0.40)))
|
||
filtered = tmp
|
||
|
||
# Rescue names/short tokens dropped by strict filters
|
||
rescued = rescue_name_and_short_tokens(raw, min_conf=0.20)
|
||
filtered = merge_rescued_items(filtered, rescued, iou_threshold=0.55)
|
||
|
||
if not filtered:
|
||
print("⚠️ No text after filtering.")
|
||
return
|
||
|
||
# ── Pre-grouping quad splits ──────────────────────────────────────────
|
||
filtered, oversized_splits = validate_and_split_oversized_quads(image, filtered)
|
||
if oversized_splits > 0:
|
||
print(f"📐 Split {oversized_splits} oversized quad(s) before grouping")
|
||
|
||
filtered, wide_splits = split_wide_ocr_items(image, filtered)
|
||
if wide_splits > 0:
|
||
print(f"✂️ Split {wide_splits} wide OCR lines across column gaps.")
|
||
|
||
filtered, bridge_splits = split_abnormal_bridge_quads(image, filtered)
|
||
if bridge_splits > 0:
|
||
print(f"🧩 Split {bridge_splits} abnormal bridge OCR quad(s).")
|
||
|
||
hs_pre = [max(1, quad_bbox(q)[3] - quad_bbox(q)[1]) for q, _, _ in filtered]
|
||
med_h_pre = float(np.median(hs_pre)) if hs_pre else 14.0
|
||
filtered, _ = apply_column_gap_splits(image, filtered, med_h_pre)
|
||
|
||
filtered = normalize_ocr_quads(filtered)
|
||
|
||
# ── Grouping ──────────────────────────────────────────────────────────
|
||
print("📊 Grouping quads vertically...")
|
||
bubbles, bubble_boxes, bubble_quads, bubble_indices = group_tokens_vertical(
|
||
filtered, image.shape, gap_px=resolved_gap,
|
||
bbox_padding=1, strict_mode=strict_grouping
|
||
)
|
||
print(f" Created {len(bubbles)} initial box(es)")
|
||
|
||
# ── Auto-fix (split + merge) ──────────────────────────────────────────
|
||
if auto_fix_bubbles:
|
||
bubbles, bubble_boxes, bubble_quads, bubble_indices = auto_fix_bubble_detection(
|
||
bubble_boxes, bubble_indices, bubble_quads, bubbles, filtered, image
|
||
)
|
||
|
||
bubbles, bubble_boxes, bubble_quads, bubble_indices = merge_micro_boxes_relaxed(
|
||
bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered, image
|
||
)
|
||
|
||
# ── Enforce max box size ──────────────────────────────────────────────
|
||
bubbles, bubble_boxes, bubble_quads, bubble_indices = enforce_max_box_size(
|
||
bubble_boxes, bubble_indices, bubble_quads, bubbles, filtered,
|
||
max_width_ratio=max_box_width_ratio,
|
||
max_height_ratio=max_box_height_ratio,
|
||
image_shape=image.shape
|
||
)
|
||
|
||
# ── Close-proximity merge ─────────────────────────────────────────────
|
||
bubbles, bubble_boxes, bubble_quads, bubble_indices = merge_close_bubbles_by_line_height(
|
||
bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered
|
||
)
|
||
|
||
# ── Per-bubble split pass ─────────────────────────────────────────────
|
||
new_bubbles, new_bubble_boxes, new_bubble_quads, new_bubble_indices = {}, {}, {}, {}
|
||
next_bid = max(bubbles.keys()) + 1 if bubbles else 1
|
||
splits_performed = []
|
||
|
||
for bid in list(bubbles.keys()):
|
||
split_result, split_reason = _split_bubble_if_needed(
|
||
bid, bubble_indices, bubble_quads, bubble_boxes, filtered, image, iw, ih
|
||
)
|
||
|
||
if split_result:
|
||
p1, p2 = split_result
|
||
splits_performed.append(f"BOX#{bid} ({split_reason})")
|
||
for part_idxs, part_bid in [(p1, bid), (p2, next_bid)]:
|
||
ub = boxes_union_xyxy([quad_bbox(filtered[i][0]) for i in part_idxs])
|
||
new_bubbles[part_bid] = build_lines_from_indices(part_idxs, filtered)
|
||
new_bubble_boxes[part_bid] = (
|
||
max(0, ub[0] - 2), max(0, ub[1] - 2),
|
||
min(iw - 1, ub[2] + 2), min(ih - 1, ub[3] + 2)
|
||
)
|
||
new_bubble_quads[part_bid] = [filtered[i][0] for i in part_idxs]
|
||
new_bubble_indices[part_bid] = part_idxs
|
||
next_bid += 1
|
||
else:
|
||
new_bubbles[bid] = bubbles[bid]
|
||
new_bubble_boxes[bid] = bubble_boxes[bid]
|
||
new_bubble_quads[bid] = bubble_quads[bid]
|
||
new_bubble_indices[bid] = bubble_indices[bid]
|
||
|
||
if splits_performed:
|
||
print(f"\n🔀 Splits detected: {len(splits_performed)}")
|
||
for s in splits_performed:
|
||
print(f" ✓ {s}")
|
||
|
||
# IMPORTANT: commit split-pass results
|
||
bubbles = new_bubbles
|
||
bubble_boxes = new_bubble_boxes
|
||
bubble_quads = new_bubble_quads
|
||
bubble_indices = new_bubble_indices
|
||
|
||
# ── Reattach orphan short tokens ──────────────────────────────────────
|
||
bubbles, bubble_boxes, bubble_quads, bubble_indices = reattach_orphan_short_tokens(
|
||
bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered
|
||
)
|
||
|
||
# ── Final reconciliation pass (overlaps, child absorb, complementary merge) ──
|
||
bubbles, bubble_boxes, bubble_quads, bubble_indices = reconcile_final_boxes(
|
||
bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered, image
|
||
)
|
||
|
||
bubbles, bubble_boxes, bubble_quads, bubble_indices = force_split_bridged_boxes(
|
||
bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered, image
|
||
)
|
||
|
||
bubbles, bubble_boxes, bubble_quads, bubble_indices = reconcile_final_boxes(
|
||
bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered, image
|
||
)
|
||
|
||
print(f"✅ Final box count: {len(bubbles)}")
|
||
|
||
# ── OCR quality pass ──────────────────────────────────────────────────
|
||
translator = GoogleTranslator(source=source_lang, target=target_lang)
|
||
clean_lines: Dict[int, str] = {}
|
||
sources_used: Dict[int, str] = {}
|
||
translations: Dict[int, str] = {}
|
||
|
||
for bid, lines in bubbles.items():
|
||
base_txt = normalize_text(" ".join(lines))
|
||
base_sc = ocr_candidate_score(base_txt)
|
||
txt, src_used = base_txt, "vision-base"
|
||
|
||
if base_sc < quality_threshold:
|
||
rr_txt, rr_sc, rr_src = reread_bubble_with_vision(
|
||
image, bubble_boxes[bid], detector, upscale=3.0, pad=24
|
||
)
|
||
if rr_txt and rr_sc > base_sc + 0.04 and is_valid_language(rr_txt, source_lang):
|
||
txt, src_used = rr_txt, rr_src
|
||
|
||
clean_lines[bid] = normalize_text(txt)
|
||
sources_used[bid] = src_used
|
||
|
||
reading_map = estimate_reading_order(bubble_boxes, mode=reading_mode)
|
||
|
||
# ── Translation ───────────────────────────────────────────────────────
|
||
for bid in sorted(clean_lines.keys(), key=lambda x: reading_map.get(x, x)):
|
||
src_txt = clean_lines[bid].strip()
|
||
if not src_txt:
|
||
continue
|
||
if not is_valid_language(src_txt, source_lang):
|
||
continue
|
||
if not is_meaningful_text(src_txt, source_lang):
|
||
continue
|
||
|
||
try:
|
||
tgt = translator.translate(src_txt) or ""
|
||
tgt = postprocess_translation_general(tgt).upper()
|
||
except Exception as e:
|
||
tgt = f"[Error: {e}]"
|
||
|
||
translations[bid] = tgt
|
||
|
||
if debug:
|
||
save_debug_clusters(image_path, filtered, bubble_boxes, bubble_indices, clean_lines, "debug_clusters.png")
|
||
|
||
# ── Text output ───────────────────────────────────────────────────────
|
||
divider = "─" * 120
|
||
out_lines = ["BUBBLE|ORDER|OCR_SOURCE|ORIGINAL|TRANSLATED|FLAGS", divider]
|
||
print(divider + f"\n{'BUBBLE':<8} {'ORDER':<6} {'SOURCE':<12} "
|
||
f"{'ORIGINAL':<40} {'TRANSLATED':<40} FLAGS\n" + divider)
|
||
|
||
translated_count = 0
|
||
for bid in sorted(clean_lines.keys(), key=lambda x: reading_map.get(x, x)):
|
||
src_txt = clean_lines[bid].strip()
|
||
if not src_txt:
|
||
continue
|
||
if not is_valid_language(src_txt, source_lang):
|
||
continue
|
||
if not is_meaningful_text(src_txt, source_lang):
|
||
continue
|
||
|
||
flags = []
|
||
tgt = translations.get(bid, "")
|
||
if not tgt:
|
||
flags.append("NO_TRANSLATION")
|
||
src_u = src_txt.upper()
|
||
src_engine = sources_used.get(bid, "unknown")
|
||
|
||
out_lines.append(
|
||
f"#{bid}|{reading_map.get(bid, bid)}|{src_engine}|{src_u}|{tgt}|"
|
||
f"{','.join(flags) if flags else '-'}"
|
||
)
|
||
print(f"#{bid:<7} {reading_map.get(bid,bid):<6} {src_engine:<12} "
|
||
f"{src_u[:40]:<40} {tgt[:40]:<40} {','.join(flags) if flags else '-'}")
|
||
translated_count += 1
|
||
|
||
out_lines.append(divider + f"\n✅ Done! {translated_count} bubble(s) translated.")
|
||
with open(export_to_file, "w", encoding="utf-8") as f:
|
||
f.write("\n".join(out_lines))
|
||
|
||
# ── bubbles.json ──────────────────────────────────────────────────────
|
||
bubbles_payload = {}
|
||
for bid in sorted(clean_lines.keys(), key=lambda x: reading_map.get(x, x)):
|
||
src_txt = clean_lines[bid].strip()
|
||
if not src_txt:
|
||
continue
|
||
if not is_valid_language(src_txt, source_lang):
|
||
continue
|
||
if not is_meaningful_text(src_txt, source_lang):
|
||
continue
|
||
|
||
box = bubble_boxes.get(bid)
|
||
tgt = translations.get(bid, "")
|
||
bubbles_payload[str(bid)] = {
|
||
"order": reading_map.get(bid, bid),
|
||
"ocr_source": sources_used.get(bid, "unknown"),
|
||
"original": src_txt.upper(),
|
||
"translated": tgt,
|
||
"box": {
|
||
"x": box[0] if box else 0,
|
||
"y": box[1] if box else 0,
|
||
"w": (box[2] - box[0]) if box else 0,
|
||
"h": (box[3] - box[1]) if box else 0,
|
||
},
|
||
"lines": [line.upper() for line in bubbles.get(bid, [])],
|
||
}
|
||
|
||
with open(export_bubbles_to, "w", encoding="utf-8") as f:
|
||
json.dump(bubbles_payload, f, ensure_ascii=False, indent=2)
|
||
|
||
print(divider + f"\nSaved: {export_to_file}\nSaved: {export_bubbles_to}")
|
||
|
||
# ============================================================
|
||
# ENTRY POINT
|
||
# ============================================================
|
||
if __name__ == "__main__":
    # Script entry point: run the OCR + translation pipeline once on a single
    # page image using the settings below. Nothing runs on import.
    translate_manga_text(
        image_path="19.png",
        # Both language values are handed to
        # GoogleTranslator(source=..., target=...) inside the pipeline.
        source_lang="english",
        target_lang="ca",
        confidence_threshold=0.03,
        min_text_length=1,
        gap_px="auto",
        # Bubbles whose OCR candidate score is below this value are re-read
        # via reread_bubble_with_vision before translation.
        quality_threshold=0.62,
        # Pipe-delimited text report (BUBBLE|ORDER|OCR_SOURCE|...).
        export_to_file="output.txt",
        # Per-bubble JSON payload (order, source, text, box, lines).
        export_bubbles_to="bubbles.json",
        # Forwarded to estimate_reading_order(mode=...); presumably
        # right-to-left panel order -- confirm against that helper.
        reading_mode="rtl",
        # When true, the pipeline also writes "debug_clusters.png".
        debug=True,
        use_enhanced_ocr=True,
        strict_grouping=True,
        max_box_width_ratio=0.6,
        max_box_height_ratio=0.5,
        auto_fix_bubbles=True,
    )