#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Comic/manga page OCR helpers: text normalisation, noise filtering and
bounding-box geometry used by the bubble-grouping pipeline."""

import os
import re
import json
import warnings
from typing import List, Tuple, Dict, Any, Optional

import numpy as np

# Optional / platform-specific dependencies are import-guarded so that the
# pure text and geometry helpers below remain importable (and unit-testable)
# on machines without OpenCV, deep-translator, or the macOS Vision stack.
# Code paths that actually use these modules fail at call time instead of
# poisoning the whole module at import time.
try:
    import cv2
except ImportError:  # pragma: no cover
    cv2 = None
try:
    from deep_translator import GoogleTranslator
except ImportError:  # pragma: no cover
    GoogleTranslator = None
try:
    # macOS Native Vision imports
    import Vision
    import Quartz
    from Foundation import NSData
except ImportError:  # pragma: no cover
    Vision = Quartz = NSData = None

warnings.filterwarnings("ignore", category=UserWarning)

# ============================================================
# CONFIG
# ============================================================
# Canonical character-name spellings. Longer keys are applied first (see
# apply_glossary) so e.g. "STARLIGHT ANYA" is not partially rewritten by
# the shorter "ANYA" entry.
GLOSSARY = {
    "ANYA": "ANYA",
    "STARLIGHT ANYA": "STARLIGHT ANYA",
    "MR. HENDERSON": "MR. HENDERSON",
    "HENDERSON": "HENDERSON",
    "STELLA STAR": "STELLA STAR",
}

# Onomatopoeia that should not be treated as dialogue; matched against a
# letters-only, lower-cased token.
SOUND_EFFECT_PATTERNS = [
    r"^b+i+p+$", r"^sha+$", r"^ha+$", r"^ah+$", r"^ugh+$",
    r"^bam+$", r"^pow+$", r"^boom+$", r"^bang+$", r"^crash+$",
    r"^thud+$", r"^zip+$", r"^swoosh+$", r"^chirp+$"
]

# Chapter headers / credits that should be skipped as non-dialogue.
TITLE_PATTERNS = [
    r"^(mission|chapter|episode|vol\.?|volume)\s*\d+$",
    r"^(spy|family|spy.family)$",
    r"^by\s+.+$",
]

# OCR junk: pure symbol runs, debug "BOX#…" tags, and WxH dimension strings.
NOISE_PATTERNS = [
    r"^[^a-zA-Z0-9\?!.¡¿]+$",
    r"^BOX[#\s0-9A-Z\-]*$",
    r"^[0-9]{1,3}\s*[Xx]\s*[0-9]{1,3}$",
]

# Fraction of the page height treated as the top "title band".
TOP_BAND_RATIO = 0.08

# ============================================================
# HELPERS
# ============================================================

def normalize_text(text: str) -> str:
    """Upper-case *text* and canonicalise quotes, ellipses, whitespace and
    the spacing around punctuation. Returns "" for None/empty input."""
    t = (text or "").strip().upper()
    t = t.replace("\u201c", "\"").replace("\u201d", "\"")  # curly -> straight double quotes
    t = t.replace("\u2018", "'").replace("\u2019", "'")    # curly -> straight single quotes
    t = t.replace("\u2026", "...")                         # ellipsis char -> three dots
    t = re.sub(r"\s+", " ", t)
    t = re.sub(r"\s+([,.;:!?])", r"\1", t)  # no space before closing punctuation
    t = re.sub(r"([¡¿])\s+", r"\1", t)      # no space after inverted marks
    t = re.sub(r"\(\s+", "(", t)
    t = re.sub(r"\s+\)", ")", t)
    t = re.sub(r"\.{4,}", "...", t)         # cap dot runs at an ellipsis
    return t.strip()


def apply_glossary(text: str) -> str:
    """Replace character names with their canonical GLOSSARY spelling,
    case-insensitively, longest keys first to avoid partial overlaps."""
    out = text or ""
    for k in sorted(GLOSSARY.keys(), key=len, reverse=True):
        out = re.sub(rf"\b{re.escape(k)}\b", GLOSSARY[k], out, flags=re.IGNORECASE)
    return out


def postprocess_translation_general(text: str) -> str:
    """Final cleanup of a translated string: normalise, collapse runs of
    '!'/'?' down to two, and cap dot runs at an ellipsis."""
    t = normalize_text(text)
    t = re.sub(r"\s{2,}", " ", t).strip()
    t = re.sub(r"([!?]){3,}", r"\1\1", t)
    t = re.sub(r"\.{4,}", "...", t)
    return t


def is_sound_effect(text: str) -> bool:
    """True when the letters-only form of *text* matches a known onomatopoeia."""
    cleaned = re.sub(r"[^a-z]", "", (text or "").strip().lower())
    return any(re.fullmatch(p, cleaned, re.IGNORECASE) for p in SOUND_EFFECT_PATTERNS)


def is_title_text(text: str) -> bool:
    """True when *text* looks like a chapter header or credit line."""
    t = (text or "").strip().lower()
    return any(re.fullmatch(p, t, re.IGNORECASE) for p in TITLE_PATTERNS)


def looks_like_box_tag(t: str) -> bool:
    """True when *t* looks like a debug 'BOX#nn' overlay tag, allowing for
    common OCR confusions (O/0/D, dropped '#', leading B/E/F)."""
    s = re.sub(r"[^A-Z0-9#]", "", (t or "").upper())
    if re.fullmatch(r"[BEF]?[O0D]X#?\d{0,3}", s):
        return True
    if re.fullmatch(r"B[O0D]X\d{0,3}", s):
        return True
    return False


def is_noise_text(text: str) -> bool:
    """Heuristic filter for OCR junk (symbol runs, box tags, tiny fragments).

    Pure punctuation like "?!" and single letters are deliberately kept —
    they are legitimate comic dialogue.
    """
    t = (text or "").strip()
    if re.fullmatch(r"[\?\!\.]+", t):
        return False
    if len(t) == 1 and t.isalpha():
        return False
    if any(re.fullmatch(p, t) for p in NOISE_PATTERNS):
        return True
    if looks_like_box_tag(t):
        return True
    if len(t) <= 2 and not re.search(r"[A-Z0-9\?\!\.]", t) and not t.isalpha():
        return True
    # Short, mostly-symbolic strings are treated as noise.
    symbol_ratio = sum(1 for c in t if not c.isalnum() and not c.isspace()) / max(1, len(t))
    if len(t) <= 6 and symbol_ratio > 0.60:
        return True
    return False


def quad_bbox(quad):
    """Axis-aligned (x1, y1, x2, y2) bounding box of a 4-point quad."""
    xs = [p[0] for p in quad]
    ys = [p[1] for p in quad]
    return (int(min(xs)), int(min(ys)), int(max(xs)), int(max(ys)))


def quad_center(quad):
    """Centre (cx, cy) of a quad's bounding box, as floats."""
    x1, y1, x2, y2 = quad_bbox(quad)
    return ((x1 + x2) / 2.0, (y1 + y2) / 2.0)


def boxes_union_xyxy(boxes):
    """Union of xyxy boxes (None entries ignored); None for an empty list."""
    boxes = [b for b in boxes if b is not None]
    if not boxes:
        return None
    return (
        int(min(b[0] for b in boxes)),
        int(min(b[1] for b in boxes)),
        int(max(b[2] for b in boxes)),
        int(max(b[3] for b in boxes)),
    )


def bbox_area_xyxy(b):
    """Area of an xyxy box; 0 for None or degenerate boxes."""
    if b is None:
        return 0
    return int(max(0, b[2] - b[0]) * max(0, b[3] - b[1]))


def xyxy_to_xywh(b):
    """Convert an xyxy box to an {'x','y','w','h'} dict; None passes through."""
    if b is None:
        return None
    x1, y1, x2, y2 = b
    return {"x": int(x1), "y": int(y1), "w": int(max(0, x2 - x1)), "h": int(max(0, y2 - y1))}


def overlap_or_near(a, b, gap=0):
    """True when xyxy boxes *a* and *b* overlap or lie within *gap* pixels
    of each other on both axes."""
    ax1, ay1, ax2, ay2 = a
    bx1, by1, bx2, by2 = b
    gap_x = max(0, max(ax1, bx1) - min(ax2, bx2))
    gap_y = max(0, max(ay1, by1) - min(ay2, by2))
    return gap_x <= gap and gap_y <= gap
def ocr_candidate_score(text: str) -> float:
    """Score an OCR candidate string in [0, 1]: rewards letters, spaces and
    normal punctuation; penalises odd symbols, stray capitals, digit runs
    and repeated bigrams (typical OCR garbage)."""
    if not text:
        return 0.0
    t = text.strip()
    n = len(t)
    if n == 0:
        return 0.0
    alpha = sum(c.isalpha() for c in t) / n
    spaces = sum(c.isspace() for c in t) / n
    punct_ok = sum(c in ".,!?'-:;()[]\"¡¿" for c in t) / n
    bad = len(re.findall(r"[^\w\s\.\,\!\?\-\'\:\;\(\)\[\]\"¡¿]", t)) / n
    penalty = 0.0
    if re.search(r"\b[A-Z]\b", t):
        penalty += 0.05  # lone capital letters are usually mis-reads
    if re.search(r"[0-9]{2,}", t):
        penalty += 0.08
    if re.search(r"(..)\1\1", t):
        penalty += 0.08  # a bigram repeated 3x is a classic OCR artefact
    score = (0.62 * alpha) + (0.10 * spaces) + (0.20 * punct_ok) - (0.45 * bad) - penalty
    return max(0.0, min(1.0, score))


# ============================================================
# SPEECH BUBBLE DETECTION (NEW)
# ============================================================

def detect_speech_bubbles(image_bgr: np.ndarray) -> List[np.ndarray]:
    """Detect speech bubble contours for box splitting"""
    gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
    # Adaptive threshold copes with uneven page lighting better than a
    # single global threshold.
    thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                   cv2.THRESH_BINARY_INV, 11, 2)
    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # Keep only contours big enough to plausibly be a bubble.
    bubble_contours = []
    for contour in contours:
        area = cv2.contourArea(contour)
        if area > 500:  # Minimum bubble area
            bubble_contours.append(contour)
    return bubble_contours


def is_quad_in_bubble(quad_bbox_xyxy: Tuple[int, int, int, int],
                      bubble_contour: np.ndarray,
                      tolerance: int = 5) -> bool:
    """Check if a quad (text box) is inside a speech bubble.

    Bug fix: the original called ``cv2.pointPolygonTest`` with
    ``measureDist=False``, which returns only +1/0/-1 — so
    ``result >= -tolerance`` was true for EVERY quad whenever
    ``tolerance >= 1``.  Passing ``measureDist=True`` yields the signed
    distance in pixels (positive inside, negative outside), which is what
    the tolerance comparison was written for.
    """
    x1, y1, x2, y2 = quad_bbox_xyxy
    cx = (x1 + x2) // 2
    cy = (y1 + y2) // 2
    dist = cv2.pointPolygonTest(bubble_contour, (float(cx), float(cy)), True)
    return dist >= -tolerance


def split_indices_by_bubble(indices: List[int], ocr: List[Tuple],
                            bubble_contours: List[np.ndarray]) -> List[List[int]]:
    """Split indices into groups based on bubble membership"""
    if not indices:
        return []
    # Group indices by which bubble they belong to; quads not inside any
    # detected bubble are collected into one trailing "outside" group.
    bubble_groups = {}
    outside_group = []
    for idx in indices:
        bbox = quad_bbox(ocr[idx][0])
        found_bubble = False
        for bubble_idx, bubble in enumerate(bubble_contours):
            if is_quad_in_bubble(bbox, bubble):
                if bubble_idx not in bubble_groups:
                    bubble_groups[bubble_idx] = []
                bubble_groups[bubble_idx].append(idx)
                found_bubble = True
                break
        if not found_bubble:
            outside_group.append(idx)
    result = list(bubble_groups.values())
    if outside_group:
        result.append(outside_group)
    return result
def check_vertical_alignment_split(indices: List[int], ocr: List[Tuple],
                                   threshold: int = 20) -> List[List[int]]:
    """Split indices that are vertically separated: sort quads by top-y and
    start a new group wherever the gap to the previous quad's bottom
    exceeds *threshold* pixels."""
    if len(indices) <= 1:
        return [indices]
    items = [(idx, quad_bbox(ocr[idx][0])) for idx in indices]
    items.sort(key=lambda x: x[1][1])  # sort by top y
    groups = []
    current_group = [items[0][0]]
    for i in range(1, len(items)):
        prev_bbox = items[i - 1][1]
        curr_bbox = items[i][1]
        gap = curr_bbox[1] - prev_bbox[3]  # current top minus previous bottom
        if gap > threshold:
            groups.append(current_group)
            current_group = [items[i][0]]
        else:
            current_group.append(items[i][0])
    if current_group:
        groups.append(current_group)
    return groups


# ============================================================
# BOX FIXING FUNCTIONS (NEW)
# ============================================================

def _store_box(bid, idxs, bubbles, bubble_boxes, bubble_quads, bubble_indices, ocr):
    """Rebuild the four per-box records (lines, union bbox, quads, indices) for *bid*."""
    bubbles[bid] = build_lines_from_indices(idxs, ocr)
    bubble_boxes[bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in idxs])
    bubble_quads[bid] = [ocr[i][0] for i in idxs]
    bubble_indices[bid] = idxs


def _split_box(bid, groups, next_bid, bubbles, bubble_boxes, bubble_quads, bubble_indices, ocr):
    """Keep groups[0] under *bid*; materialise the remaining groups as fresh
    box ids starting at *next_bid*. Returns the next unused id."""
    _store_box(bid, groups[0], bubbles, bubble_boxes, bubble_quads, bubble_indices, ocr)
    for group in groups[1:]:
        _store_box(next_bid, group, bubbles, bubble_boxes, bubble_quads, bubble_indices, ocr)
        next_bid += 1
    return next_bid


def apply_page_specific_fixes(bubbles: Dict[int, List[str]],
                              bubble_boxes: Dict[int, Tuple],
                              bubble_quads: Dict[int, List],
                              bubble_indices: Dict[int, List[int]],
                              ocr: List[Tuple],
                              image_bgr: np.ndarray,
                              page_identifier: str) -> Tuple[Dict, Dict, Dict, Dict]:
    """
    Apply page-specific fixes to bubble detection issues.

    Args:
        page_identifier: Base filename (e.g., "15", "16", "19")

    Mutates and returns the four per-box dicts.
    NOTE(review): page matching is by substring ("15" in page_identifier),
    so e.g. page "115" would also trigger the page-15 fixes — confirm that
    page identifiers are bare page numbers.
    """
    # Detect speech bubbles for the splitting logic below.
    bubble_contours = detect_speech_bubbles(image_bgr)
    fixes_applied = []
    state = (bubbles, bubble_boxes, bubble_quads, bubble_indices)

    # PAGE 15 FIXES: merge BOX#12 and BOX#16 into one box.
    if "15" in page_identifier:
        if 12 in bubbles and 16 in bubbles:
            merged_indices = sorted(set(bubble_indices[12] + bubble_indices[16]))
            _store_box(12, merged_indices, *state, ocr)
            for d in state:
                del d[16]
            fixes_applied.append("Page 15: Merged BOX#12 and BOX#16")

    # PAGE 16 FIXES: split boxes whose quads straddle bubble boundaries.
    if "16" in page_identifier:
        next_bid = max(bubbles.keys()) + 1 if bubbles else 100
        for bid in (15, 8):
            if bid in bubbles:
                split_groups = split_indices_by_bubble(bubble_indices[bid], ocr, bubble_contours)
                if len(split_groups) > 1:
                    next_bid = _split_box(bid, split_groups, next_bid, *state, ocr)
                    fixes_applied.append(f"Page 16: Split BOX#{bid} into {len(split_groups)} parts")
        if 18 in bubbles:
            # Try the bubble-based split first; fall back to a vertical-gap
            # split when contour detection finds no separation.
            split_groups = split_indices_by_bubble(bubble_indices[18], ocr, bubble_contours)
            if len(split_groups) == 1:
                split_groups = check_vertical_alignment_split(bubble_indices[18], ocr, threshold=30)
            if len(split_groups) > 1:
                next_bid = _split_box(18, split_groups, next_bid, *state, ocr)
                fixes_applied.append(f"Page 16: Split BOX#18 into {len(split_groups)} parts")

    # PAGE 19 FIXES
    if "19" in page_identifier:
        next_bid = max(bubbles.keys()) + 1 if bubbles else 100
        if 5 in bubbles:
            # Split by bubble membership, then each bubble group by vertical gap.
            final_groups = []
            for group in split_indices_by_bubble(bubble_indices[5], ocr, bubble_contours):
                final_groups.extend(check_vertical_alignment_split(group, ocr, threshold=25))
            if len(final_groups) > 1:
                next_bid = _split_box(5, final_groups, next_bid, *state, ocr)
                fixes_applied.append(f"Page 19: Split BOX#5 into {len(final_groups)} parts")
        if 11 in bubbles:
            split_groups = split_indices_by_bubble(bubble_indices[11], ocr, bubble_contours)
            if len(split_groups) > 1:
                next_bid = _split_box(11, split_groups, next_bid, *state, ocr)
                fixes_applied.append(f"Page 19: Split BOX#11 into {len(split_groups)} parts")

    if fixes_applied:
        print(f"\n🔧 Page-specific fixes applied:")
        for fix in fixes_applied:
            print(f" ✓ {fix}")
    return bubbles, bubble_boxes, bubble_quads, bubble_indices


# ============================================================
# SPLITTERS + QUAD NORMALIZATION
# ============================================================

def estimate_char_capacity_width(text_len, med_h, k=0.72):
    """Rough pixel width needed to render *text_len* characters at glyph
    height *med_h*; never less than 18 px."""
    return max(18.0, text_len * med_h * k)


def shrink_ocr_quad_to_text(quad, text, med_h):
    """Narrow an OCR quad whose width is implausibly large for its text,
    keeping it centred; returns the quad unchanged when plausible."""
    x1, y1, x2, y2 = quad_bbox(quad)
    w = max(1, x2 - x1)
    h = max(1, y2 - y1)
    t = (text or "").strip()
    n = max(1, len(t.replace(" ", "")))  # non-space character count
    exp_w = estimate_char_capacity_width(n, med_h, k=0.62)
    max_w = max(exp_w * 1.35, h * 1.15)
    if w <= max_w:
        return quad
    cx = (x1 + x2) / 2.0
    nw = int(round(max_w))
    nx1 = int(round(cx - nw / 2))
    nx2 = int(round(cx + nw / 2))
    return [[nx1, y1], [nx2, y1], [nx2, y2], [nx1, y2]]


def normalize_ocr_quads(filtered_ocr):
    """Shrink every over-wide OCR quad using the page's median glyph height."""
    if not filtered_ocr:
        return filtered_ocr
    hs = [max(1, quad_bbox(q)[3] - quad_bbox(q)[1]) for q, _, _ in filtered_ocr]
    med_h = float(np.median(hs)) if hs else 14.0
    out = []
    for quad, text, conf in filtered_ocr:
        nq = shrink_ocr_quad_to_text(quad, text, med_h)
        out.append((nq, text, conf))
    return out
def split_abnormal_bridge_quads(image_bgr, filtered_ocr):
    """Split OCR quads that abnormally "bridge" two bubbles: quads wider than
    ~11x the median glyph height are cut at the darkest column-projection
    valley, snapping the text split to the nearest space.

    Returns (new_item_list, number_of_splits).
    """
    if not filtered_ocr:
        return filtered_ocr, 0
    hs = [max(1, quad_bbox(q)[3] - quad_bbox(q)[1]) for q, _, _ in filtered_ocr]
    med_h = float(np.median(hs)) if hs else 14.0  # fallback when list is empty
    out = []
    splits = 0
    for quad, text, conf in filtered_ocr:
        x1, y1, x2, y2 = quad_bbox(quad)
        w = max(1, x2 - x1)
        h = max(1, y2 - y1)
        # Only consider very wide, multi-word quads of reasonable length.
        if w > med_h * 11.0 and " " in text and len(text) >= 14:
            roi = image_bgr[max(0, y1):min(image_bgr.shape[0], y2),
                            max(0, x1):min(image_bgr.shape[1], x2)]
            if roi.size > 0:
                gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
                _, inv = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
                proj = np.sum(inv, axis=0)  # ink mass per pixel column
                # Search the middle 18%-82% of the quad for a valley.
                s = int(w * 0.18)
                e = int(w * 0.82)
                if e > s:
                    segment = proj[s:e]
                    valley_rel = int(np.argmin(segment))
                    valley_x = s + valley_rel
                    low = float(segment[valley_rel])
                    meanv = float(np.mean(segment))
                    # Valley must be clearly emptier than the average column.
                    if low < meanv * 0.52:
                        split_x = x1 + valley_x
                        char_w = w / max(1, len(text))
                        split_idx = int((split_x - x1) / max(1e-6, char_w))
                        # Snap the text split to the nearest space.
                        spaces = [i for i, c in enumerate(text) if c == " "]
                        if spaces:
                            split_idx = min(spaces, key=lambda i: abs(i - split_idx))
                        left_t = text[:split_idx].strip()
                        right_t = text[split_idx:].strip()
                        if left_t and right_t:
                            ql = [[x1, y1], [split_x, y1], [split_x, y2], [x1, y2]]
                            qr = [[split_x, y1], [x2, y1], [x2, y2], [split_x, y2]]
                            out.append((ql, left_t, conf))
                            out.append((qr, right_t, conf))
                            splits += 1
                            continue
        out.append((quad, text, conf))
    return out, splits


def split_wide_ocr_items(image_bgr, filtered_ocr):
    """Split moderately wide OCR items (w > 2.5h) at the widest low-ink
    horizontal gap in the middle 20%-80% of the quad, snapping the text
    split to a nearby space when one is close enough.

    Returns (new_item_list, number_of_splits).
    """
    new_filtered = []
    splits_made = 0
    for item in filtered_ocr:
        quad, text, conf = item
        x1, y1, x2, y2 = quad_bbox(quad)
        w = x2 - x1
        h = max(1, y2 - y1)
        if w > h * 2.5 and len(text) > 5 and ' ' in text:
            pad = 2  # small vertical padding around the ROI
            roi_y1 = max(0, y1 - pad)
            roi_y2 = min(image_bgr.shape[0], y2 + pad)
            roi_x1 = max(0, x1)
            roi_x2 = min(image_bgr.shape[1], x2)
            roi = image_bgr[roi_y1:roi_y2, roi_x1:roi_x2]
            if roi.size > 0:
                gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
                _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
                proj = np.sum(thresh, axis=0)  # ink mass per pixel column
                start_x = int(w * 0.20)
                end_x = int(w * 0.80)
                if start_x < end_x:
                    char_w = w / max(1, len(text))
                    # A "real" gap must be wider than ~2.5 characters or 0.75h.
                    min_gap_width = max(int(char_w * 2.5), int(h * 0.75))
                    gap_threshold = h * 255 * 0.15  # columns below this are "empty"
                    gap_mask = proj < gap_threshold
                    # Find the longest run of empty columns in the search band.
                    best_gap_start = -1
                    best_gap_len = 0
                    current_gap_start = -1
                    current_gap_len = 0
                    for x_rel in range(start_x, end_x):
                        if gap_mask[x_rel]:
                            if current_gap_len == 0:
                                current_gap_start = x_rel
                            current_gap_len += 1
                        else:
                            if current_gap_len > best_gap_len:
                                best_gap_len = current_gap_len
                                best_gap_start = current_gap_start
                            current_gap_len = 0
                    if current_gap_len > best_gap_len:
                        best_gap_len = current_gap_len
                        best_gap_start = current_gap_start
                    if best_gap_len >= min_gap_width:
                        # Split in the middle of the widest gap.
                        split_x = roi_x1 + best_gap_start + (best_gap_len // 2)
                        split_idx = int((split_x - x1) / max(1e-6, char_w))
                        spaces = [i for i, c in enumerate(text) if c == ' ']
                        if spaces:
                            best_space = min(spaces, key=lambda i: abs(i - split_idx))
                            # Only snap when the nearest space is reasonably close.
                            if abs(best_space - split_idx) < len(text) * 0.35:
                                split_idx = best_space
                        text_left = text[:split_idx].strip()
                        text_right = text[split_idx:].strip()
                        if text_left and text_right:
                            quad_left = [[x1, y1], [split_x, y1], [split_x, y2], [x1, y2]]
                            quad_right = [[split_x, y1], [x2, y1], [x2, y2], [split_x, y2]]
                            new_filtered.append((quad_left, text_left, conf))
                            new_filtered.append((quad_right, text_right, conf))
                            splits_made += 1
                            continue
        new_filtered.append(item)
    return new_filtered, splits_made


def split_panel_box(image_bgr, bbox_xyxy, bubble_quads=None):
    """Try to split a (suspected multi-panel) box vertically using the column
    ink projection of a dark-pixel threshold.

    Returns (left_box, right_box, split_x) or None when no split is found.
    Skipped entirely when fewer than 4 quads are supplied or the box is
    smaller than 50x50 px.
    """
    x1, y1, x2, y2 = bbox_xyxy
    w = x2 - x1
    h = y2 - y1
    if bubble_quads is not None and len(bubble_quads) < 4:
        return None
    if w < 50 or h < 50:
        return None
    roi = image_bgr[y1:y2, x1:x2]
    if roi.size == 0:
        return None
    gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
    _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY_INV)
    vertical_projection = np.sum(thresh, axis=0)
    # Only consider splits in the central half of the box.
    search_start = int(w * 0.25)
    search_end = int(w * 0.75)
    if search_start >= search_end:
        return None
    peak_x_relative = np.argmax(vertical_projection[search_start:search_end]) + search_start
    peak_val = vertical_projection[peak_x_relative]
    threshold_val = h * 255 * 0.25
    # Collect columns with significant ink (candidate panel borders/content).
    significant_peaks = []
    for x_rel in range(search_start, search_end):
        if vertical_projection[x_rel] > threshold_val:
            significant_peaks.append((x_rel, vertical_projection[x_rel]))
    if len(significant_peaks) > 1:
        # Multiple busy columns: split at the emptiest column between them.
        min_proj_val = np.min(vertical_projection[search_start:search_end])
        min_proj_idx = np.argmin(vertical_projection[search_start:search_end]) + search_start
        if min_proj_val < threshold_val * 0.6:
            split_x_absolute = x1 + min_proj_idx
            box_left = (x1, y1, split_x_absolute, y2)
            box_right = (split_x_absolute, y1, x2, y2)
            return box_left, box_right, split_x_absolute
    if peak_val > (h * 255 * 0.40):
        # A single very dark column is treated as a panel border.
        split_x_absolute = x1 + peak_x_relative
        box_left = (x1, y1, split_x_absolute, y2)
        box_right = (split_x_absolute, y1, x2, y2)
        return box_left, box_right, split_x_absolute
    return None


def split_bubble_if_multiple_columns(indices, ocr, bid=None, use_aggressive_thresholds=False):
    """Split a bubble's quads into left/right groups at the largest
    horizontal gap, when that gap exceeds absolute/relative thresholds.

    Returns (left_indices, right_indices) or None. *bid* is accepted for
    caller convenience but unused here.
    """
    if len(indices) < 2:
        return None
    boxes = [quad_bbox(ocr[i][0]) for i in indices]
    sorted_items = sorted(zip(indices, boxes), key=lambda x: x[1][0])  # by left x
    # Measure the gap between each box's left edge and the running max right edge.
    gaps = []
    current_max_x = sorted_items[0][1][2]
    for i in range(1, len(sorted_items)):
        idx, b = sorted_items[i]
        x1 = b[0]
        gap = x1 - current_max_x
        gaps.append((i, gap, current_max_x, x1))
        current_max_x = max(current_max_x, b[2])
    if not gaps:
        return None
    max_gap_idx, max_gap_size, _, _ = max(gaps, key=lambda x: x[1])
    hs = [b[3] - b[1] for b in boxes]
    med_h = float(np.median(hs)) if hs else 15.0
    if use_aggressive_thresholds:
        threshold1 = 60.0       # absolute gap that always splits
        threshold2 = med_h * 1.0  # relative gap (with min_gap floor)
        min_gap = 20.0
    else:
        threshold1 = 90.0
        threshold2 = med_h * 1.5
        min_gap = 25.0
    if max_gap_size > threshold1 or (max_gap_size > threshold2 and max_gap_size > min_gap):
        split_idx = max_gap_idx
        left_indices = [item[0] for item in sorted_items[:split_idx]]
        right_indices = [item[0] for item in sorted_items[split_idx:]]
        if len(left_indices) < 1 or len(right_indices) < 1:
            return None
        return left_indices, right_indices
    return None


def split_bubble_if_multiple_rows(indices, ocr, bid=None):
    """Split a bubble's quads into top/bottom groups at the largest vertical
    gap, when that gap exceeds 1.8x the median quad height (and 20 px).

    Returns (top_indices, bottom_indices) or None. *bid* is unused.
    """
    if len(indices) < 2:
        return None
    boxes = [quad_bbox(ocr[i][0]) for i in indices]
    sorted_items = sorted(zip(indices, boxes), key=lambda x: x[1][1])  # by top y
    gaps = []
    current_max_y = sorted_items[0][1][3]
    for i in range(1, len(sorted_items)):
        idx, b = sorted_items[i]
        y1 = b[1]
        gap = y1 - current_max_y
        gaps.append((i, gap, current_max_y, y1))
        current_max_y = max(current_max_y, b[3])
    if not gaps:
        return None
    max_gap_idx, max_gap_size, _, _ = max(gaps, key=lambda x: x[1])
    hs = [b[3] - b[1] for b in boxes]
    med_h = float(np.median(hs)) if hs else 15.0
    threshold = med_h * 1.8
    min_gap = 20.0
    if max_gap_size > threshold and max_gap_size > min_gap:
        split_idx = max_gap_idx
        top_indices = [item[0] for item in sorted_items[:split_idx]]
        bottom_indices = [item[0] for item in sorted_items[split_idx:]]
        if len(top_indices) >= 1 and len(bottom_indices) >= 1:
            return top_indices, bottom_indices
    return None


def is_vertical_text_like(indices, ocr):
    """Heuristic: True when the quads form a tall, narrow, vertically spaced
    stack (vertical text / stacked short words), based on union aspect
    ratio, x-centre spread and median vertical gap."""
    if len(indices) < 2:
        return False
    bxs = [quad_bbox(ocr[i][0]) for i in indices]
    ub = boxes_union_xyxy(bxs)
    if ub is None:
        return False
    x1, y1, x2, y2 = ub
    w = max(1, x2 - x1)
    h = max(1, y2 - y1)
    aspect = h / w
    xcs = [((b[0] + b[2]) / 2.0) for b in bxs]
    x_spread = float(np.std(xcs)) if len(xcs) > 1 else 0.0
    med_h = float(np.median([max(1, b[3]-b[1]) for b in bxs]))
    ys = sorted([((b[1] + b[3]) / 2.0) for b in bxs])
    gaps = [ys[i+1] - ys[i] for i in range(len(ys)-1)] if len(ys) >= 2 else [0]
    med_gap = float(np.median(gaps)) if gaps else 0.0
    # Tall overall, x-centres tightly clustered, rows clearly separated.
    return (
        aspect > 1.35
        and x_spread < max(10.0, med_h * 0.9)
        and med_gap > max(6.0, med_h * 0.35)
    )


def split_cluster_by_big_vertical_gap(indices, ocr, factor=1.9, min_gap=22):
    """Split a cluster in two at its single biggest vertical gap, when that
    gap exceeds max(min_gap, factor * median quad height).

    Returns (upper_indices, lower_indices) or None.
    """
    if len(indices) < 2:
        return None
    items = []
    for i in indices:
        b = quad_bbox(ocr[i][0])
        yc = (b[1] + b[3]) / 2.0
        h = max(1.0, b[3] - b[1])
        items.append((i, b, yc, h))
    items.sort(key=lambda t: t[2])  # by centre y
    med_h = float(np.median([t[3] for t in items])) if items else 12.0
    # Locate the biggest bottom-to-top gap between consecutive quads.
    best_k = -1
    best_gap = -1
    for k in range(len(items)-1):
        y_top = items[k][1][3]
        y_bot = items[k+1][1][1]
        gap = y_bot - y_top
        if gap > best_gap:
            best_gap = gap
            best_k = k
    if best_k < 0:
        return None
    if best_gap > max(min_gap, med_h * factor):
        a = [t[0] for t in items[:best_k+1]]
        b = [t[0] for t in items[best_k+1:]]
        if a and b:
            return a, b
    return None
def split_nested_or_side_by_side(indices, ocr):
    """Separate side-by-side columns via 2-means clustering on the quads'
    x-centres (max 12 iterations, convergence at 0.5 px movement).

    Returns (left_indices, right_indices), or None when the centres are
    nearly identical, a cluster empties out, or the resulting boxes overlap
    horizontally by more than 8 px.
    """
    if len(indices) < 2:
        return None
    boxes = [quad_bbox(ocr[i][0]) for i in indices]
    xcs = np.array([[(b[0] + b[2]) / 2.0] for b in boxes], dtype=np.float32)
    # Seed the two centroids at the extreme x-centres.
    c1 = float(np.min(xcs))
    c2 = float(np.max(xcs))
    if abs(c2 - c1) < 8:
        return None
    for _ in range(12):
        g1, g2 = [], []
        for idx, v in enumerate(xcs[:, 0]):
            if abs(v - c1) <= abs(v - c2):
                g1.append(idx)
            else:
                g2.append(idx)
        if not g1 or not g2:
            return None
        new_c1 = float(np.mean([xcs[i, 0] for i in g1]))
        new_c2 = float(np.mean([xcs[i, 0] for i in g2]))
        if abs(new_c1 - c1) < 0.5 and abs(new_c2 - c2) < 0.5:
            break  # converged
        c1, c2 = new_c1, new_c2
    # Order the clusters left/right by final centroid position.
    left_group = g1 if c1 < c2 else g2
    right_group = g2 if c1 < c2 else g1
    left_idxs = [indices[i] for i in left_group]
    right_idxs = [indices[i] for i in right_group]
    if not left_idxs or not right_idxs:
        return None
    left_box = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in left_idxs])
    right_box = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in right_idxs])
    sep = right_box[0] - left_box[2]
    if sep < -8:  # clusters overlap too much to be genuine columns
        return None
    return left_idxs, right_idxs


def merge_close_bubbles_by_line_height(bubbles, bubble_boxes, bubble_quads, bubble_indices, ocr):
    """Greedily merge bubbles that are near each other relative to the page's
    median glyph height, touching (within ~1.25 line heights) and whose
    union box stays compact. Returns four new dicts re-numbered from 1.
    """
    bids = sorted(bubbles.keys())
    used = set()
    out_b, out_bb, out_bq, out_bi = {}, {}, {}, {}
    nbid = 1
    # Median glyph height over ALL OCR quads on the page.
    all_h = []
    for i in range(len(ocr)):
        b = quad_bbox(ocr[i][0])
        all_h.append(max(1, b[3]-b[1]))
    med_h = float(np.median(all_h)) if all_h else 14.0
    for i, a in enumerate(bids):
        if a in used:
            continue
        used.add(a)
        group = [a]
        ax1, ay1, ax2, ay2 = bubble_boxes[a]
        # Greedily absorb later bubbles into this group; the group's box
        # grows as members are added, so chains of bubbles can merge.
        for b in bids[i+1:]:
            if b in used:
                continue
            bx1, by1, bx2, by2 = bubble_boxes[b]
            acx, acy = (ax1+ax2)/2.0, (ay1+ay2)/2.0
            bcx, bcy = (bx1+bx2)/2.0, (by1+by2)/2.0
            dx, dy = abs(acx-bcx), abs(acy-bcy)
            near = dx < med_h * 10.0 and dy < med_h * 3.6
            touching = overlap_or_near((ax1, ay1, ax2, ay2), (bx1, by1, bx2, by2), gap=int(med_h*1.25))
            ua = boxes_union_xyxy([(ax1, ay1, ax2, ay2), (bx1, by1, bx2, by2)])
            area_a = max(1, (ax2-ax1)*(ay2-ay1))
            area_b = max(1, (bx2-bx1)*(by2-by1))
            area_u = max(1, (ua[2]-ua[0])*(ua[3]-ua[1]))
            # Reject merges whose union would be mostly empty space.
            compact_union = area_u < (area_a + area_b) * 1.65
            if near and touching and compact_union:
                group.append(b)
                used.add(b)
                ax1 = min(ax1, bx1); ay1 = min(ay1, by1); ax2 = max(ax2, bx2); ay2 = max(ay2, by2)
        # Rebuild the merged bubble's records from its combined indices.
        idxs = []
        quads = []
        for g in group:
            idxs.extend(bubble_indices[g])
            quads.extend(bubble_quads[g])
        idxs = sorted(set(idxs))
        ub = boxes_union_xyxy([quad_bbox(ocr[k][0]) for k in idxs])
        if ub is None:
            continue
        out_b[nbid] = build_lines_from_indices(idxs, ocr)
        out_bb[nbid] = ub
        out_bq[nbid] = quads
        out_bi[nbid] = idxs
        nbid += 1
    return out_b, out_bb, out_bq, out_bi


# ============================================================
# OCR ENGINES (Apple Native Vision)
# ============================================================

class MacVisionDetector:
    """OCR engine backed by Apple's native Vision framework (macOS only)."""

    def __init__(self, source_lang="en"):
        # Map loose user-supplied codes/names to the BCP-47 tags Vision accepts.
        lang_key = source_lang.lower().strip()
        lang_map = {
            "en": "en-US", "english": "en-US",
            "es": "es-ES", "spanish": "es-ES",
            "ca": "ca-ES", "catalan": "ca-ES",
            "fr": "fr-FR", "french": "fr-FR",
            "ja": "ja-JP", "japanese": "ja-JP",
            "it": "it-IT", "italian": "it-IT",
            "de": "de-DE", "german": "de-DE",
            "ko": "ko-KR", "korean": "ko-KR",
            "zh": "zh-Hans", "chinese": "zh-Hans"
        }
        apple_lang = lang_map.get(lang_key, "en-US")  # default: US English
        self.langs = [apple_lang]
        print(f"⚡ Using Apple Vision OCR (Language: {self.langs[0]})")

    def read(self, image_path_or_array):
        """Run Vision text recognition on a file path or BGR array.

        Returns a list of (quad, text, confidence) with quads in top-left
        pixel coordinates; empty list on any failure.
        """
        if isinstance(image_path_or_array, str):
            img = cv2.imread(image_path_or_array)
        else:
            img = image_path_or_array
        if img is None or img.size == 0:
            return []
        ih, iw = img.shape[:2]
        # Vision consumes encoded image data, so round-trip through PNG.
        success, buffer = cv2.imencode('.png', img)
        if not success:
            return []
        ns_data = NSData.dataWithBytes_length_(buffer.tobytes(), len(buffer.tobytes()))
        handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(ns_data, None)
        results = []

        def completion_handler(request, error):
            # Called synchronously by performRequests_error_ below.
            if error:
                print(f"Vision API Error: {error}")
                return
            for observation in request.results():
                candidate = observation.topCandidates_(1)[0]
                text = candidate.string()
                confidence = candidate.confidence()
                bbox = observation.boundingBox()
                # Vision reports normalized coordinates with a bottom-left
                # origin; convert to top-left pixel coordinates.
                x = bbox.origin.x * iw
                y_bottom_left = bbox.origin.y * ih
                w = bbox.size.width * iw
                h = bbox.size.height * ih
                y = ih - y_bottom_left - h
                quad = [
                    [int(x), int(y)],
                    [int(x + w), int(y)],
                    [int(x + w), int(y + h)],
                    [int(x), int(y + h)]
                ]
                results.append((quad, text, confidence))

        request = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(completion_handler)
        request.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate)
        request.setUsesLanguageCorrection_(True)
        request.setRecognitionLanguages_(self.langs)
        handler.performRequests_error_([request], None)
        return results
# ============================================================
# PREPROCESS
# ============================================================

def preprocess_variant(crop_bgr, mode):
    """Return a grayscale preprocessing variant of *crop_bgr* selected by
    *mode*; unknown modes fall back to plain grayscale."""
    gray = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2GRAY)
    if mode == "raw":
        return gray
    if mode == "clahe":
        # Contrast-limited adaptive histogram equalisation.
        return cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)).apply(gray)
    if mode == "adaptive":
        den = cv2.GaussianBlur(gray, (3, 3), 0)
        return cv2.adaptiveThreshold(den, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                     cv2.THRESH_BINARY, 35, 11)
    if mode == "otsu":
        den = cv2.GaussianBlur(gray, (3, 3), 0)
        _, th = cv2.threshold(den, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        return th
    if mode == "invert":
        return 255 - gray
    if mode == "bilateral":
        # Edge-preserving denoise, then Otsu binarisation.
        den = cv2.bilateralFilter(gray, 7, 60, 60)
        _, th = cv2.threshold(den, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        return th
    if mode == "morph_open":
        _, th = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        k = np.ones((2, 2), np.uint8)
        return cv2.morphologyEx(th, cv2.MORPH_OPEN, k)
    return gray


def rotate_image_keep_bounds(img, angle_deg):
    """Rotate *img* by *angle_deg* around its centre, enlarging the canvas so
    no pixels are clipped; new border pixels are white (255)."""
    h, w = img.shape[:2]
    c = (w / 2, h / 2)
    M = cv2.getRotationMatrix2D(c, angle_deg, 1.0)
    cos = abs(M[0, 0]); sin = abs(M[0, 1])
    # Bounding size of the rotated image.
    new_w = int((h * sin) + (w * cos))
    new_h = int((h * cos) + (w * sin))
    # Shift the rotation so the result is centred in the new canvas.
    M[0, 2] += (new_w / 2) - c[0]
    M[1, 2] += (new_h / 2) - c[1]
    return cv2.warpAffine(img, M, (new_w, new_h), flags=cv2.INTER_CUBIC, borderValue=255)


def rebuild_text_from_vision_result(res):
    """Reassemble Vision OCR tokens into a single normalised string:
    cluster tokens into rows by centre-y (tolerance ~0.75x median height),
    then read rows top-to-bottom, tokens left-to-right."""
    if not res:
        return ""
    norm = []
    for bbox, txt, conf in res:
        if not txt or not txt.strip():
            continue
        b = quad_bbox(bbox)
        xc = (b[0] + b[2]) / 2.0
        yc = (b[1] + b[3]) / 2.0
        h = max(1.0, b[3] - b[1])
        norm.append((b, txt, conf, xc, yc, h))
    if not norm:
        return ""
    med_h = float(np.median([x[5] for x in norm]))
    row_tol = max(6.0, med_h * 0.75)
    norm.sort(key=lambda z: z[4])  # by centre y
    rows = []
    for it in norm:
        placed = False
        for r in rows:
            if abs(it[4] - r["yc"]) <= row_tol:
                r["m"].append(it)
                # Row centre is the running mean of its members.
                r["yc"] = float(np.mean([k[4] for k in r["m"]]))
                placed = True
                break
        if not placed:
            rows.append({"yc": it[4], "m": [it]})
    rows.sort(key=lambda r: r["yc"])
    lines = []
    for r in rows:
        mem = sorted(r["m"], key=lambda z: z[3])  # left-to-right within a row
        line = normalize_text(" ".join(x[1] for x in mem))
        if line:
            lines.append(line)
    return normalize_text(" ".join(lines))


def reread_bubble_with_vision(image_bgr, bbox_xyxy, vision_detector: MacVisionDetector,
                              upscale=3.0, pad=24):
    """Re-OCR a bubble crop with Vision across 7 preprocessing modes and 3
    small rotations, keeping the highest-scoring candidate.

    Returns (text, score, "vision-reread") on success, else (None, 0.0, "none").
    """
    ih, iw = image_bgr.shape[:2]
    x1, y1, x2, y2 = bbox_xyxy
    # Pad the crop and clamp it to the image bounds.
    x1 = max(0, int(x1 - pad)); y1 = max(0, int(y1 - pad))
    x2 = min(iw, int(x2 + pad)); y2 = min(ih, int(y2 + pad))
    crop = image_bgr[y1:y2, x1:x2]
    if crop.size == 0:
        return None, 0.0, "none"
    modes = ["raw", "clahe", "adaptive", "otsu", "invert", "bilateral", "morph_open"]
    angles = [0.0, 1.5, -1.5]  # small tilts to recover slightly rotated text
    best_v_txt, best_v_sc = "", 0.0
    up0 = cv2.resize(crop, (int(crop.shape[1] * upscale), int(crop.shape[0] * upscale)),
                     interpolation=cv2.INTER_CUBIC)
    for mode in modes:
        proc = preprocess_variant(up0, mode)
        # Vision expects 3-channel input; re-expand grayscale variants.
        proc3 = cv2.cvtColor(proc, cv2.COLOR_GRAY2BGR) if len(proc.shape) == 2 else proc
        for a in angles:
            rot = rotate_image_keep_bounds(proc3, a)
            res = vision_detector.read(rot)
            txt = rebuild_text_from_vision_result(res)
            sc = ocr_candidate_score(txt)
            if sc > best_v_sc:
                best_v_txt, best_v_sc = txt, sc
    if best_v_txt:
        return best_v_txt, best_v_sc, "vision-reread"
    return None, 0.0, "none"
# ============================================================
# LINES + BUBBLES
# ============================================================

def build_lines_from_indices(indices, ocr):
    """Assemble the OCR tokens named by *indices* into reading-order text
    lines: tokens are clustered into rows by centre-y (tolerance ~0.75x
    the median token height, row centre tracked as a running mean), rows
    are read top-to-bottom and tokens left-to-right; noise lines dropped."""
    if not indices:
        return []

    tokens = []
    for idx in indices:
        bx = quad_bbox(ocr[idx][0])
        tokens.append((idx,
                       bx,
                       (bx[0] + bx[2]) / 2.0,
                       (bx[1] + bx[3]) / 2.0,
                       max(1.0, bx[3] - bx[1])))

    median_h = float(np.median([t[4] for t in tokens])) if tokens else 10.0
    tolerance = max(6.0, median_h * 0.75)

    rows = []
    for idx, bx, cx, cy, _h in sorted(tokens, key=lambda t: t[3]):
        # Attach to the first existing row within vertical tolerance.
        target = next((r for r in rows if abs(cy - r["yc"]) <= tolerance), None)
        if target is None:
            rows.append({"yc": cy, "m": [(idx, bx, cx, cy)]})
        else:
            target["m"].append((idx, bx, cx, cy))
            target["yc"] = float(np.mean([member[3] for member in target["m"]]))

    result = []
    for row in sorted(rows, key=lambda r: r["yc"]):
        ordered = sorted(row["m"], key=lambda member: member[2])
        line = normalize_text(" ".join(ocr[member[0]][1] for member in ordered))
        if line and not is_noise_text(line):
            result.append(line)
    return result


def build_line_boxes_from_indices(indices, ocr, image_shape=None):
    """Compute padded pixel boxes, one per run of horizontally close tokens
    within each visual text line, for the OCR items in *indices*.

    Noise tokens are discarded first. Boxes are optionally clamped to
    *image_shape* (h, w) and returned sorted top-to-bottom, left-to-right.
    """
    if not indices:
        return []

    tokens = []
    for idx in indices:
        bx = quad_bbox(ocr[idx][0])
        cleaned = normalize_text(ocr[idx][1])
        if is_noise_text(cleaned):
            continue
        tokens.append({
            "i": idx,
            "b": bx,
            "txt": cleaned,
            "xc": (bx[0] + bx[2]) / 2.0,
            "yc": (bx[1] + bx[3]) / 2.0,
            "h": max(1.0, bx[3] - bx[1]),
        })
    if not tokens:
        return []

    median_h = float(np.median([t["h"] for t in tokens]))
    row_tolerance = max(6.0, median_h * 0.90)
    x_gap_limit = max(8.0, median_h * 1.25)
    pad = max(2, int(round(median_h * 0.14)))

    # Greedy row clustering on the running mean of each row's centre-y.
    rows = []
    for token in sorted(tokens, key=lambda t: t["yc"]):
        row = next((r for r in rows if abs(token["yc"] - r["yc"]) <= row_tolerance), None)
        if row is None:
            rows.append({"yc": token["yc"], "m": [token]})
        else:
            row["m"].append(token)
            row["yc"] = float(np.mean([t["yc"] for t in row["m"]]))

    boxes = []
    for row in sorted(rows, key=lambda r: r["yc"]):
        ordered = sorted(row["m"], key=lambda t: t["xc"])
        if not ordered:
            continue
        # Break each row into runs separated by a large horizontal gap.
        runs = [[ordered[0]]]
        for token in ordered[1:]:
            if token["b"][0] - runs[-1][-1]["b"][2] <= x_gap_limit:
                runs[-1].append(token)
            else:
                runs.append([token])
        for run in runs:
            union = boxes_union_xyxy([t["b"] for t in run])
            if union:
                ux1, uy1, ux2, uy2 = union
                # Slightly more padding above than below the text.
                boxes.append((ux1 - pad, uy1 - int(round(pad * 1.2)),
                              ux2 + pad, uy2 + int(round(pad * 0.9))))

    if image_shape is not None:
        ih, iw = image_shape[:2]
        kept = []
        for bx in boxes:
            cx1 = max(0, int(bx[0]))
            cy1 = max(0, int(bx[1]))
            cx2 = min(iw - 1, int(bx[2]))
            cy2 = min(ih - 1, int(bx[3]))
            if cx2 > cx1 and cy2 > cy1:
                kept.append((cx1, cy1, cx2, cy2))
        boxes = kept

    boxes.sort(key=lambda bx: (bx[1], bx[0]))
    return boxes
def auto_gap(image_path, base=18, ref_w=750):
    """Scale the base clustering gap by the image width relative to a
    750 px reference page; returns *base* when the image can't be read."""
    img = cv2.imread(image_path)
    if img is None:
        return base
    return base * (img.shape[1] / ref_w)


def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=1):
    """Cluster OCR tokens into bubbles with union-find over pairwise
    proximity rules, then build the four per-bubble dicts
    (lines, padded union box, quads, indices) numbered from 1 in
    top-to-bottom, left-to-right order.

    NOTE: *bbox_padding* is currently unused; padding is derived from the
    median token height instead.
    """
    n = len(ocr)
    if n == 0:
        return {}, {}, {}, {}
    boxes = [quad_bbox(r[0]) for r in ocr]
    centers = [quad_center(r[0]) for r in ocr]
    hs = [max(1.0, b[3] - b[1]) for b in boxes]
    med_h = float(np.median(hs)) if hs else 12.0
    dist_thresh = max(20.0, med_h * 1.8)        # centre-distance fallback rule
    adaptive_gap_y = max(gap_px, med_h * 2.5)   # vertical slack for rule 3

    # Union-find with path halving.
    p = list(range(n))

    def find(x):
        while p[x] != x:
            p[x] = p[p[x]]
            x = p[x]
        return x

    def unite(a, b):
        p[find(a)] = find(b)

    # Pairwise O(n^2) linking; four increasingly permissive rules.
    for i in range(n):
        for j in range(i + 1, n):
            ax1, ay1, ax2, ay2 = boxes[i]
            bx1, by1, bx2, by2 = boxes[j]
            gap_x = max(0, max(ax1, bx1) - min(ax2, bx2))
            gap_y = max(0, max(ay1, by1) - min(ay2, by2))
            cx1, cy1 = centers[i]
            cx2, cy2 = centers[j]
            is_vertically_aligned = abs(cx1 - cx2) < (med_h * 1.5)
            # 1. Horizontally overlapping and vertically close.
            if gap_x == 0 and gap_y <= (med_h * 3.5):
                unite(i, j); continue
            # 2. Same column (aligned x-centres), vertically close.
            if is_vertically_aligned and gap_y <= (med_h * 3.2):
                unite(i, j); continue
            # 3. Small gaps on both axes.
            if gap_x <= gap_px and gap_y <= adaptive_gap_y:
                unite(i, j); continue
            # 4. Centre-distance fallback for near-horizontal neighbours.
            d = ((cx1 - cx2) ** 2 + (cy1 - cy2) ** 2) ** 0.5
            if d <= dist_thresh and abs(cy1 - cy2) <= med_h * 1.5:
                unite(i, j)

    groups = {}
    for i in range(n):
        groups.setdefault(find(i), []).append(i)
    # Number bubbles in reading order: topmost first, then leftmost.
    sorted_groups = sorted(
        groups.values(),
        key=lambda idxs: (min(boxes[i][1] for i in idxs), min(boxes[i][0] for i in idxs))
    )
    bubbles, bubble_boxes, bubble_quads, bubble_indices = {}, {}, {}, {}
    ih, iw = image_shape[:2]
    for bid, idxs in enumerate(sorted_groups, start=1):
        idxs = sorted(idxs, key=lambda k: boxes[k][1])  # top-to-bottom within bubble
        lines = build_lines_from_indices(idxs, ocr)
        quads = [ocr[k][0] for k in idxs]
        ub = boxes_union_xyxy([quad_bbox(q) for q in quads])
        if ub is None:
            continue
        x1, y1, x2, y2 = ub
        # Pad the union box slightly, clamped to the image bounds.
        adaptive_pad = max(1, int(round(med_h * 0.16)))
        x1 = max(0, x1 - adaptive_pad); y1 = max(0, y1 - adaptive_pad)
        x2 = min(iw - 1, x2 + adaptive_pad); y2 = min(ih - 1, y2 + adaptive_pad)
        bubbles[bid] = lines
        bubble_boxes[bid] = (x1, y1, x2, y2)
        bubble_quads[bid] = quads
        bubble_indices[bid] = idxs
    return bubbles, bubble_boxes, bubble_quads, bubble_indices
range(i + 1, n): ax1, ay1, ax2, ay2 = boxes[i] bx1, by1, bx2, by2 = boxes[j] gap_x = max(0, max(ax1, bx1) - min(ax2, bx2)) gap_y = max(0, max(ay1, by1) - min(ay2, by2)) cx1, cy1 = centers[i] cx2, cy2 = centers[j] is_vertically_aligned = abs(cx1 - cx2) < (med_h * 1.5) if gap_x == 0 and gap_y <= (med_h * 3.5): unite(i, j); continue if is_vertically_aligned and gap_y <= (med_h * 3.2): unite(i, j); continue if gap_x <= gap_px and gap_y <= adaptive_gap_y: unite(i, j); continue d = ((cx1 - cx2) ** 2 + (cy1 - cy2) ** 2) ** 0.5 if d <= dist_thresh and abs(cy1 - cy2) <= med_h * 1.5: unite(i, j) groups = {} for i in range(n): groups.setdefault(find(i), []).append(i) sorted_groups = sorted( groups.values(), key=lambda idxs: (min(boxes[i][1] for i in idxs), min(boxes[i][0] for i in idxs)) ) bubbles, bubble_boxes, bubble_quads, bubble_indices = {}, {}, {}, {} ih, iw = image_shape[:2] for bid, idxs in enumerate(sorted_groups, start=1): idxs = sorted(idxs, key=lambda k: boxes[k][1]) lines = build_lines_from_indices(idxs, ocr) quads = [ocr[k][0] for k in idxs] ub = boxes_union_xyxy([quad_bbox(q) for q in quads]) if ub is None: continue x1, y1, x2, y2 = ub adaptive_pad = max(1, int(round(med_h * 0.16))) x1 = max(0, x1 - adaptive_pad); y1 = max(0, y1 - adaptive_pad) x2 = min(iw - 1, x2 + adaptive_pad); y2 = min(ih - 1, y2 + adaptive_pad) bubbles[bid] = lines bubble_boxes[bid] = (x1, y1, x2, y2) bubble_quads[bid] = quads bubble_indices[bid] = idxs return bubbles, bubble_boxes, bubble_quads, bubble_indices # ============================================================ # DEBUG / EXPORT # ============================================================ def save_debug_clusters(image_path, ocr, bubble_boxes, bubble_indices, clean_lines=None, out_path="debug_clusters.png"): img = cv2.imread(image_path) if img is None: return for bbox, txt, conf in ocr: pts = np.array(bbox, dtype=np.int32) cv2.fillPoly(img, [pts], (255, 255, 255)) cv2.polylines(img, [pts], True, (180, 180, 180), 1) for bid, bb 
in bubble_boxes.items(): x1, y1, x2, y2 = bb cv2.rectangle(img, (x1, y1), (x2, y2), (0, 220, 0), 2) cv2.putText(img, f"BOX#{bid}", (x1 + 2, max(15, y1 + 16)), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 220, 0), 2) if clean_lines and bid in clean_lines: text = clean_lines[bid] words = text.split() lines = [] cur = "" for w in words: if len(cur) + len(w) < 25: cur += w + " " else: lines.append(cur.strip()) cur = w + " " if cur: lines.append(cur.strip()) y_text = y2 + 18 for line in lines: cv2.putText(img, line, (x1, y_text), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 3) cv2.putText(img, line, (x1, y_text), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1) y_text += 18 cv2.imwrite(out_path, img) def estimate_reading_order(bbox_dict, mode="ltr"): items = [] for bid, (x1, y1, x2, y2) in bbox_dict.items(): cx = (x1 + x2) / 2.0 cy = (y1 + y2) / 2.0 items.append((bid, cx, cy)) items.sort(key=lambda t: t[2]) rows, tol = [], 90 for it in items: placed = False for r in rows: if abs(it[2] - r["cy"]) <= tol: r["items"].append(it) r["cy"] = float(np.mean([x[2] for x in r["items"]])) placed = True break if not placed: rows.append({"cy": it[2], "items": [it]}) rows.sort(key=lambda r: r["cy"]) order = [] for r in rows: r["items"].sort(key=lambda x: x[1], reverse=(mode == "rtl")) order.extend([z[0] for z in r["items"]]) return {bid: i + 1 for i, bid in enumerate(order)} def export_bubbles(filepath, bbox_dict, quads_dict, indices_dict, ocr, reading_map, image_shape): out = {} for bid, bb in bbox_dict.items(): x1, y1, x2, y2 = bb quads = quads_dict.get(bid, []) idxs = indices_dict.get(bid, []) qboxes = [quad_bbox(q) for q in quads] text_union = boxes_union_xyxy(qboxes) line_boxes_xyxy = build_line_boxes_from_indices(idxs, ocr, image_shape=image_shape) line_union_xyxy = boxes_union_xyxy(line_boxes_xyxy) line_union_area = bbox_area_xyxy(line_union_xyxy) out[str(bid)] = { "x": int(x1), "y": int(y1), "w": int(x2 - x1), "h": int(y2 - y1), "reading_order": int(reading_map.get(bid, bid)), "quad_bboxes": 
[ {"x": int(b[0]), "y": int(b[1]), "w": int(b[2] - b[0]), "h": int(b[3] - b[1])} for b in qboxes ], "quads": [[[int(p[0]), int(p[1])] for p in q] for q in quads], "text_bbox": xyxy_to_xywh(text_union), "line_bboxes": [xyxy_to_xywh(lb) for lb in line_boxes_xyxy], "line_union_bbox": xyxy_to_xywh(line_union_xyxy) if line_union_xyxy else None, "line_union_area": int(line_union_area), } with open(filepath, "w", encoding="utf-8") as f: json.dump(out, f, indent=2, ensure_ascii=False) # ============================================================ # PIPELINE # ============================================================ def translate_manga_text( image_path="001-page.png", source_lang="en", target_lang="ca", confidence_threshold=0.05, min_text_length=1, gap_px="auto", filter_sound_effects=True, quality_threshold=0.62, export_to_file="output.txt", export_bubbles_to="bubbles.json", reading_mode="ltr", debug=True ): image = cv2.imread(image_path) if image is None: print(f"❌ Cannot load image: {image_path}") return resolved_gap = auto_gap(image_path) if gap_px == "auto" else float(gap_px) print("Loading OCR engines...") detector = MacVisionDetector(source_lang=source_lang) print("Running detection OCR (Apple Vision)...") raw = detector.read(image_path) print(f"Raw detections: {len(raw)}") filtered = [] skipped = 0 ih, iw = image.shape[:2] for bbox, text, conf in raw: t = normalize_text(text) qb = quad_bbox(bbox) if conf < confidence_threshold: skipped += 1; continue if len(t) < min_text_length: skipped += 1; continue if is_noise_text(t): skipped += 1; continue if filter_sound_effects and is_sound_effect(t): skipped += 1; continue if is_title_text(t): skipped += 1; continue if qb[1] < int(ih * TOP_BAND_RATIO): if conf < 0.70 and len(t) >= 5: skipped += 1; continue filtered.append((bbox, t, conf)) print(f"Kept: {len(filtered)} | Skipped: {skipped}") if not filtered: print("⚠️ No text after filtering.") return # 1) split obvious wide OCR merges filtered, splits_made = 
split_wide_ocr_items(image, filtered) if splits_made > 0: print(f"✂️ Split {splits_made} wide OCR lines across column gaps.") # 2) split giant bridge quads (fixes page16 BOX19-like glue) filtered, bridge_splits = split_abnormal_bridge_quads(image, filtered) if bridge_splits > 0: print(f"🧩 Split {bridge_splits} abnormal bridge OCR quad(s).") # 3) shrink quads to tighter text footprint filtered = normalize_ocr_quads(filtered) bubbles, bubble_boxes, bubble_quads, bubble_indices = group_tokens( filtered, image.shape, gap_px=resolved_gap, bbox_padding=1 ) # merge accidental sibling fragments (fixes page15 BOX11+BOX16 style) bubbles, bubble_boxes, bubble_quads, bubble_indices = merge_close_bubbles_by_line_height( bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered ) # ============================================================ # APPLY PAGE-SPECIFIC FIXES (NEW) # ============================================================ page_identifier = os.path.basename(image_path) bubbles, bubble_boxes, bubble_quads, bubble_indices = apply_page_specific_fixes( bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered, image, page_identifier ) new_bubbles, new_bubble_boxes, new_bubble_quads, new_bubble_indices = {}, {}, {}, {} next_bid = max(bubbles.keys()) + 1 if bubbles else 1 splits_performed = [] for bid in list(bubbles.keys()): box = bubble_boxes[bid] bubble_split = None if is_vertical_text_like(bubble_indices[bid], filtered): vgap_split = split_cluster_by_big_vertical_gap(bubble_indices[bid], filtered, factor=1.7, min_gap=18) if vgap_split: bubble_split = vgap_split splits_performed.append(f"BOX#{bid} (vertical-stack y-gap split)") if bubble_split is None: split_result = split_panel_box(image, box, bubble_quads=bubble_quads[bid]) if split_result: _, _, split_x = split_result left_idxs, right_idxs = [], [] for idx in bubble_indices[bid]: cx, cy = quad_center(filtered[idx][0]) if cx < split_x: left_idxs.append(idx) else: right_idxs.append(idx) if left_idxs and 
right_idxs: bubble_split = (left_idxs, right_idxs) splits_performed.append(f"BOX#{bid} (panel border at x={split_x})") elif len(bubble_quads[bid]) >= 4: col_split = split_bubble_if_multiple_columns(bubble_indices[bid], filtered, bid=bid, use_aggressive_thresholds=True) if col_split: l, r = col_split if l and r: bubble_split = (l, r) splits_performed.append(f"BOX#{bid} ({len(l)} quads | {len(r)} quads)") if bubble_split is None: col_split = split_bubble_if_multiple_columns(bubble_indices[bid], filtered, bid=bid) if col_split: l, r = col_split if l and r: bubble_split = (l, r) splits_performed.append(f"BOX#{bid} (Vertical Column Split: {len(l)} | {len(r)} quads)") if bubble_split is None: nested_split = split_nested_or_side_by_side(bubble_indices[bid], filtered) if nested_split: l, r = nested_split if l and r: bubble_split = (l, r) splits_performed.append(f"BOX#{bid} (nested/side-by-side forced split)") if bubble_split is None: row_split = split_bubble_if_multiple_rows(bubble_indices[bid], filtered, bid=bid) if row_split: t, b = row_split if t and b: bubble_split = (t, b) splits_performed.append(f"BOX#{bid} (Horizontal Row Split: {len(t)} | {len(b)} quads)") if bubble_split is None: gy = split_cluster_by_big_vertical_gap(bubble_indices[bid], filtered, factor=1.9, min_gap=22) if gy: a, b = gy bubble_split = (a, b) splits_performed.append(f"BOX#{bid} (large vertical-gap split)") if bubble_split: part1_idxs, part2_idxs = bubble_split new_bubbles[bid] = build_lines_from_indices(part1_idxs, filtered) ub_1 = boxes_union_xyxy([quad_bbox(filtered[i][0]) for i in part1_idxs]) new_bubble_boxes[bid] = (max(0, ub_1[0]-2), max(0, ub_1[1]-2), min(iw-1, ub_1[2]+2), min(ih-1, ub_1[3]+2)) new_bubble_quads[bid] = [filtered[i][0] for i in part1_idxs] new_bubble_indices[bid] = part1_idxs new_bubbles[next_bid] = build_lines_from_indices(part2_idxs, filtered) ub_2 = boxes_union_xyxy([quad_bbox(filtered[i][0]) for i in part2_idxs]) new_bubble_boxes[next_bid] = (max(0, ub_2[0]-2), max(0, 
ub_2[1]-2), min(iw-1, ub_2[2]+2), min(ih-1, ub_2[3]+2)) new_bubble_quads[next_bid] = [filtered[i][0] for i in part2_idxs] new_bubble_indices[next_bid] = part2_idxs next_bid += 1 else: new_bubbles[bid] = bubbles[bid] new_bubble_boxes[bid] = bubble_boxes[bid] new_bubble_quads[bid] = bubble_quads[bid] new_bubble_indices[bid] = bubble_indices[bid] if splits_performed: print(f"\n🔀 Multi-column/row bubble splits detected: {len(splits_performed)}") for split_info in splits_performed: print(f" ✓ Split {split_info}") bubbles = new_bubbles bubble_boxes = new_bubble_boxes bubble_quads = new_bubble_quads bubble_indices = new_bubble_indices translator = GoogleTranslator(source=source_lang, target=target_lang) clean_lines: Dict[int, str] = {} sources_used: Dict[int, str] = {} for bid, lines in bubbles.items(): base_txt = normalize_text(" ".join(lines)) base_sc = ocr_candidate_score(base_txt) txt = base_txt src_used = "vision-base" if base_sc < quality_threshold: rr_txt, rr_sc, rr_src = reread_bubble_with_vision( image_bgr=image, bbox_xyxy=bubble_boxes[bid], vision_detector=detector, upscale=3.0, pad=24 ) if rr_txt and rr_sc > base_sc + 0.04: txt = rr_txt src_used = rr_src txt = txt.replace(" BOMPORTA", " IMPORTA") txt = txt.replace(" TESTO ", " ESTO ") txt = txt.replace(" MIVERDAD", " MI VERDAD") clean_lines[bid] = apply_glossary(normalize_text(txt)) sources_used[bid] = src_used reading_map = estimate_reading_order(bubble_boxes, mode=reading_mode) if debug: save_debug_clusters( image_path=image_path, ocr=filtered, bubble_boxes=bubble_boxes, bubble_indices=bubble_indices, clean_lines=clean_lines, out_path="debug_clusters.png" ) divider = "─" * 120 out_lines = ["BUBBLE|ORDER|OCR_SOURCE|ORIGINAL|TRANSLATED|FLAGS", divider] print(divider) print(f"{'BUBBLE':<8} {'ORDER':<6} {'SOURCE':<12} {'ORIGINAL':<40} {'TRANSLATED':<40} FLAGS") print(divider) translated_count = 0 for bid in sorted(clean_lines.keys(), key=lambda x: reading_map.get(x, x)): src_txt = clean_lines[bid].strip() if not 
src_txt: continue flags = [] try: tgt = translator.translate(src_txt) or "" except Exception as e: tgt = f"[Translation error: {e}]" flags.append("TRANSLATION_ERROR") tgt = apply_glossary(postprocess_translation_general(tgt)).upper() src_u = src_txt.upper() src_engine = sources_used.get(bid, "unknown") out_lines.append( f"#{bid}|{reading_map.get(bid, bid)}|{src_engine}|{src_u}|{tgt}|{','.join(flags) if flags else '-'}" ) print( f"#{bid:<7} {reading_map.get(bid, bid):<6} {src_engine:<12} " f"{src_u[:40]:<40} {tgt[:40]:<40} {','.join(flags) if flags else '-'}" ) translated_count += 1 out_lines.append(divider) out_lines.append(f"✅ Done! {translated_count} bubble(s) translated, {skipped} detection(s) skipped.") with open(export_to_file, "w", encoding="utf-8") as f: f.write("\n".join(out_lines)) export_bubbles( export_bubbles_to, bbox_dict=bubble_boxes, quads_dict=bubble_quads, indices_dict=bubble_indices, ocr=filtered, reading_map=reading_map, image_shape=image.shape ) print(divider) print(f"Saved: {export_to_file}") print(f"Saved: {export_bubbles_to}") if debug: print("Saved: debug_clusters.png") if __name__ == "__main__": translate_manga_text( image_path="15.png", source_lang="english", target_lang="ca", confidence_threshold=0.05, min_text_length=1, gap_px="auto", filter_sound_effects=True, quality_threshold=0.62, export_to_file="output.txt", export_bubbles_to="bubbles.json", reading_mode="rtl", debug=True )