# manga-translator/manga-translator.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import re
import json
import cv2
import numpy as np
import warnings
from typing import List, Tuple, Dict, Any, Optional

from deep_translator import GoogleTranslator

# macOS Native Vision imports
import Vision
import Quartz
from Foundation import NSData

warnings.filterwarnings("ignore", category=UserWarning)

# ============================================================
# CONFIG
# ============================================================
# NOTE(review): not referenced in the visible part of this file; presumably the
# fraction of the page height treated as a "top band" — confirm against its users.
TOP_BAND_RATIO = 0.08

# ============================================================
# HELPERS
# ============================================================
def normalize_text(text: str) -> str:
    """Return *text* upper-cased with typography and spacing normalised.

    ``None``/empty input yields ``""``. Curly quotes become straight quotes,
    the ellipsis character becomes three dots, whitespace runs collapse to a
    single space, spaces are removed before ``,.;:!?`` and ``)`` and after
    ``¡¿`` and ``(``, and runs of four or more dots shrink to ``...``.
    """
    cleaned = (text or "").strip().upper()

    # Typographic characters -> plain ASCII equivalents.
    for fancy, plain in (
        ("\u201c", "\""),
        ("\u201d", "\""),
        ("\u2018", "'"),
        ("\u2019", "'"),
        ("\u2026", "..."),
    ):
        cleaned = cleaned.replace(fancy, plain)

    # Order matters: whitespace is collapsed before the punctuation rules run.
    for pattern, replacement in (
        (r"\s+", " "),
        (r"\s+([,.;:!?])", r"\1"),
        (r"([¡¿])\s+", r"\1"),
        (r"\(\s+", "("),
        (r"\s+\)", ")"),
        (r"\.{4,}", "..."),
    ):
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()

def postprocess_translation_general(text: str) -> str:
    """Normalise translated text and tone down repeated punctuation.

    Runs ``normalize_text`` first, then collapses multi-space runs, reduces
    any run of three or more ``!``/``?`` to two copies of the run's last
    character, and shortens runs of four or more dots to ``...``.
    """
    normalized = normalize_text(text)
    single_spaced = re.sub(r"\s{2,}", " ", normalized).strip()
    # The repeated group keeps only its LAST character, so "?!?" becomes "??".
    calmer = re.sub(r"([!?]){3,}", r"\1\1", single_spaced)
    return re.sub(r"\.{4,}", "...", calmer)

def fix_common_ocr_errors(text: str) -> str:
    """Repair a few frequent OCR character confusions.

    * a capital ``O`` preceded by a digit and followed by a digit or any
      non-letter character becomes ``0`` (an ``O`` at the very end of the
      string is left alone, since the rules require a following character);
    * ``|`` becomes ``I``;
    * a backtick becomes an apostrophe.
    """
    # Two passes: re.sub matches never overlap, so the second, broader rule
    # also picks up the O's the first pass had to skip (e.g. "1O2O3").
    digit_o_rules = (r'(\d)O(\d)', r'(\d)O([^a-zA-Z])')
    fixed = text
    for rule in digit_o_rules:
        fixed = re.sub(rule, r'\g<1>0\g<2>', fixed)

    lookalikes = str.maketrans({'|': 'I', '`': "'"})
    return fixed.translate(lookalikes)

def is_valid_language(text: str, source_lang: str) -> bool:
    """Decide whether *text* plausibly belongs to the script of *source_lang*.

    Only ``\\w`` characters are considered. Latin-script languages reject any
    Arabic/Japanese/CJK/Hangul character outright and otherwise require a
    minimum share of Latin letters (at least one for up to 3 characters,
    55% for up to 6, 45% beyond). Japanese, Korean and Chinese require at
    least one script character for up to 3 characters, else a 40% share.
    Any other language code is accepted unconditionally.
    """
    if not text:
        return False
    word_chars = re.sub(r'[^\w]', '', text)
    if not word_chars:
        return False

    total = len(word_chars)
    lang = source_lang.lower()

    latin_langs = ('en', 'english', 'es', 'spanish', 'fr', 'french',
                   'it', 'italian', 'ca', 'catalan', 'de', 'german')
    if lang in latin_langs:
        non_latin_script = (
            r'[\u0600-\u06FF\u0750-\u077F\u3040-\u30FF'
            r'\u3400-\u4DBF\u4E00-\u9FFF\uAC00-\uD7AF\u1100-\u11FF]'
        )
        # A single character from another script disqualifies the text.
        if re.search(non_latin_script, word_chars):
            return False
        latin_count = len(re.findall(r'[a-zA-ZÀ-ÿ]', word_chars))
        if total <= 3:
            return latin_count >= 1
        required_share = 0.55 if total <= 6 else 0.45
        return (latin_count / total) >= required_share

    # Character classes that count as "native" for each CJK language.
    japanese = r'[\u3040-\u30FF\u3400-\u4DBF\u4E00-\u9FFF]'
    korean = r'[\uAC00-\uD7AF\u1100-\u11FF]'
    chinese = r'[\u4E00-\u9FFF\u3400-\u4DBF]'
    native_script = {
        'ja': japanese, 'japanese': japanese,
        'ko': korean, 'korean': korean,
        'zh': chinese, 'chinese': chinese,
    }.get(lang)

    if native_script is None:
        # Unknown language: nothing to validate against.
        return True

    native_count = len(re.findall(native_script, word_chars))
    if total <= 3:
        return native_count >= 1
    return (native_count / total) >= 0.4


# Upper-case strings that is_meaningful_text() rejects when the whole
# (stripped, upper-cased) text equals one of them.
_NOISE_TOKENS = {
    'P', 'F', 'N', 'M', 'X', 'Z', 'Q',
    'FN', 'PF', 'NM', 'XZ', 'FSHOO', 'GRRP',
}

# Upper-case strings that is_meaningful_text() accepts immediately, before any
# of its other heuristics run. Matched either against the whole text or
# against the text with every non-letter character removed.
_MANGA_INTERJECTIONS = {
    'HUH', 'HUH?', 'HUH??', 'HUH?!',
    'OH', 'OH!', 'OOH', 'OOH!',
    'AH', 'AH!', 'UH', 'UH...',
    'HEY', 'HEY!',
    'EH', 'EH?',
    'WOW', 'WOW!',
    'YES', 'NO', 'NO!',
    'RUN', 'GO', 'GO!',
    'STOP', 'WAIT',
    'WHAT', 'WHAT?', 'WHAT?!',
    'WHY', 'WHY?',
    'HOW', 'HOW?',
    'OK', 'OK!', 'OKAY',
    'EEEEP', 'EEEP',
    'OMIGOSH',
    'HMM', 'HMM...',
    'TSK', 'TCH',
    'GRRR','I','A',
    'FWUP', 'FWAP',
    'SHIVER',
    'RRRING',
    'MORNING', 'MORNING.',
}

def is_meaningful_text(text: str, source_lang: str, min_alpha_chars: int = 2) -> bool:
    """Heuristically decide whether an OCR string is real text rather than noise.

    Whitelisted interjections are accepted immediately. Otherwise the text is
    rejected when it has fewer than *min_alpha_chars* letters, is a known
    noise token, or is three or more copies of a single character. For
    Latin-script languages it is also rejected when more than 60% of it is
    non-alphabetic, or when it is longer than four characters and has no vowel.
    """
    if not text:
        return False

    stripped = text.strip()
    upper = stripped.upper()
    letters_only = re.sub(r'[^A-Za-zÀ-ÿ]', '', upper)
    if upper in _MANGA_INTERJECTIONS or letters_only in _MANGA_INTERJECTIONS:
        return True

    if sum(1 for ch in stripped if ch.isalpha()) < min_alpha_chars:
        return False
    if upper in _NOISE_TOKENS:
        return False

    is_latin = source_lang.lower() in (
        'en', 'english', 'es', 'spanish', 'fr', 'french',
        'it', 'italian', 'ca', 'catalan', 'de', 'german',
    )
    length = len(stripped)

    # Mostly punctuation/digits -> noise (Latin-script languages only).
    if is_latin and length > 0:
        non_alpha = sum(1 for ch in stripped if not ch.isalpha())
        if (non_alpha / length) > 0.60:
            return False

    # One character repeated three or more times, e.g. "-----".
    if length >= 3 and len(set(upper)) == 1:
        return False

    # Longer Latin-script strings without a single vowel are treated as noise.
    if is_latin and length > 4:
        if not re.search(r'[AEIOUaeiouÀ-ÿ]', stripped):
            return False

    return True


def quad_bbox(quad):
    """Return the axis-aligned bounds of *quad* as integer ``(x1, y1, x2, y2)``.

    *quad* is a sequence of points whose first two items are x and y.
    Coordinates are truncated with ``int()``.
    """
    x_coords = tuple(point[0] for point in quad)
    y_coords = tuple(point[1] for point in quad)
    left, top = min(x_coords), min(y_coords)
    right, bottom = max(x_coords), max(y_coords)
    return (int(left), int(top), int(right), int(bottom))

def quad_center(quad):
    """Return the ``(cx, cy)`` centre of the quad's integer bounding box as floats."""
    left, top, right, bottom = quad_bbox(quad)
    center_x = (left + right) / 2.0
    center_y = (top + bottom) / 2.0
    return (center_x, center_y)

def boxes_union_xyxy(boxes):
    """Return the smallest integer xyxy box enclosing every non-``None`` box.

    Returns ``None`` when *boxes* is empty or contains only ``None`` entries.
    """
    present = [box for box in boxes if box is not None]
    if not present:
        return None

    first, *others = present
    left, top, right, bottom = first[0], first[1], first[2], first[3]
    for box in others:
        left = min(left, box[0])
        top = min(top, box[1])
        right = max(right, box[2])
        bottom = max(bottom, box[3])
    return (int(left), int(top), int(right), int(bottom))

def bbox_area_xyxy(b):
    """Return the integer area of xyxy box *b*; 0 for ``None`` or inverted boxes."""
    if b is None:
        return 0
    width = max(0, b[2] - b[0])
    height = max(0, b[3] - b[1])
    return int(width * height)

def xyxy_to_xywh(b):
    """Convert an xyxy box to a ``{"x", "y", "w", "h"}`` dict of ints.

    ``None`` passes through as ``None``; negative extents are clamped to 0.
    """
    if b is None:
        return None
    left, top, right, bottom = b
    width = max(0, right - left)
    height = max(0, bottom - top)
    return {
        "x": int(left),
        "y": int(top),
        "w": int(width),
        "h": int(height),
    }

def overlap_or_near(a, b, gap=0):
    """Return True when xyxy boxes *a* and *b* overlap or lie within *gap* px.

    The separation is measured independently on each axis (0 when the boxes
    overlap on that axis); both separations must be at most *gap*.
    """
    a_left, a_top, a_right, a_bottom = a
    b_left, b_top, b_right, b_bottom = b
    horizontal_sep = max(a_left, b_left) - min(a_right, b_right)
    vertical_sep = max(a_top, b_top) - min(a_bottom, b_bottom)
    return max(0, horizontal_sep) <= gap and max(0, vertical_sep) <= gap

def boxes_iou(a, b):
    """Return intersection-over-union of two xyxy boxes (0.0 when disjoint)."""
    a_left, a_top, a_right, a_bottom = a
    b_left, b_top, b_right, b_bottom = b

    shared_w = max(0, min(a_right, b_right) - max(a_left, b_left))
    shared_h = max(0, min(a_bottom, b_bottom) - max(a_top, b_top))
    intersection = shared_w * shared_h
    if intersection == 0:
        return 0.0

    size_a = max(0, a_right - a_left) * max(0, a_bottom - a_top)
    size_b = max(0, b_right - b_left) * max(0, b_bottom - b_top)
    union = size_a + size_b - intersection
    # The floor of 1 guards the division.
    return intersection / max(1, union)

def boxes_overlap_ratio(a, b):
    """Return intersection area divided by the SMALLER box's area (0.0 if disjoint)."""
    a_left, a_top, a_right, a_bottom = a
    b_left, b_top, b_right, b_bottom = b

    shared_w = max(0, min(a_right, b_right) - max(a_left, b_left))
    shared_h = max(0, min(a_bottom, b_bottom) - max(a_top, b_top))
    intersection = shared_w * shared_h
    if intersection == 0:
        return 0.0

    size_a = max(0, a_right - a_left) * max(0, a_bottom - a_top)
    size_b = max(0, b_right - b_left) * max(0, b_bottom - b_top)
    smaller = min(size_a, size_b)
    # The floor of 1 guards the division.
    return intersection / max(1, smaller)

def ocr_candidate_score(text: str) -> float:
    """Score an OCR string in ``[0.0, 1.0]`` by how text-like it looks.

    Letters, spaces and common punctuation raise the score; characters
    outside word/space/common-punctuation lower it. A standalone capital
    letter costs 0.05 and a run of two or more digits costs 0.08.
    Empty or whitespace-only input scores 0.0.
    """
    if not text:
        return 0.0
    body = text.strip()
    length = len(body)
    if length == 0:
        return 0.0

    accepted_punct = ".,!?'-:;()[]\"¡¿"
    alpha_share = sum(1 for ch in body if ch.isalpha()) / length
    space_share = sum(1 for ch in body if ch.isspace()) / length
    punct_share = sum(1 for ch in body if ch in accepted_punct) / length
    junk_share = len(re.findall(r"[^\w\s\.\,\!\?\-\'\:\;\(\)\[\]\"¡¿]", body)) / length

    penalty = 0.0
    for pattern, cost in ((r"\b[A-Z]\b", 0.05), (r"[0-9]{2,}", 0.08)):
        if re.search(pattern, body):
            penalty += cost

    raw = (0.62 * alpha_share) + (0.10 * space_share) + (0.20 * punct_share) - (0.45 * junk_share) - penalty
    return max(0.0, min(1.0, raw))

def quad_is_horizontal(quad, ratio_threshold=1.5) -> bool:
    """Return True when the quad's bounding box is at least *ratio_threshold* times wider than tall."""
    left, top, right, bottom = quad_bbox(quad)
    # Both extents are floored at 1 so degenerate boxes cannot divide by zero.
    width = max(1, right - left)
    height = max(1, bottom - top)
    aspect = width / height
    return aspect >= ratio_threshold

def quad_is_vertical(quad, ratio_threshold=1.5) -> bool:
    """Return True when the quad's bounding box is at least *ratio_threshold* times taller than wide."""
    left, top, right, bottom = quad_bbox(quad)
    # Both extents are floored at 1 so degenerate boxes cannot divide by zero.
    width = max(1, right - left)
    height = max(1, bottom - top)
    inverse_aspect = height / width
    return inverse_aspect >= ratio_threshold


# ============================================================
# ENHANCED IMAGE PREPROCESSING
# ============================================================
def enhance_image_for_ocr(image_bgr, upscale_factor=2.5):
    """Return a binarised, upscaled BGR copy of *image_bgr*.

    Steps, in order: bicubic upscale by *upscale_factor*, grayscale,
    non-local-means denoise, CLAHE contrast equalisation, 3x3 sharpening,
    Gaussian adaptive threshold, 2x2 morphological close, then conversion
    back to a 3-channel image.
    """
    src_h, src_w = image_bgr.shape[:2]
    target_size = (int(src_w * upscale_factor), int(src_h * upscale_factor))
    enlarged = cv2.resize(image_bgr, target_size, interpolation=cv2.INTER_CUBIC)

    mono = cv2.cvtColor(enlarged, cv2.COLOR_BGR2GRAY)
    smoothed = cv2.fastNlMeansDenoising(mono, None, h=10, templateWindowSize=7, searchWindowSize=21)

    equalizer = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
    contrasted = equalizer.apply(smoothed)

    # Kernel sums to 1: centre weight 9, each neighbour -1.
    sharpen_kernel = np.array([[-1, -1, -1], [-1, 9, -1], [-1, -1, -1]])
    crisp = cv2.filter2D(contrasted, -1, sharpen_kernel)

    black_white = cv2.adaptiveThreshold(crisp, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                        cv2.THRESH_BINARY, 11, 2)
    closed = cv2.morphologyEx(black_white, cv2.MORPH_CLOSE, np.ones((2, 2), np.uint8))
    return cv2.cvtColor(closed, cv2.COLOR_GRAY2BGR)

def detect_small_text_regions(image_bgr, existing_quads):
    """Find small dark blobs that lie outside the already-detected quads.

    The image is Otsu-thresholded (inverted), the areas covered by
    *existing_quads* are masked out, and the bounding rectangle of every
    remaining external contour is kept when its area is strictly between 50
    and 5000 px and its height/width ratio is strictly between 0.1 and 10.

    Returns a list of ``(x1, y1, x2, y2)`` tuples.
    """
    grayscale = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)

    # Paint every known quad white, then invert to get the "still free" area.
    covered = np.zeros(grayscale.shape, dtype=np.uint8)
    for quad in existing_quads:
        cv2.fillPoly(covered, [np.array(quad, dtype=np.int32)], 255)
    uncovered = cv2.bitwise_not(covered)

    _, ink = cv2.threshold(grayscale, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    ink_outside = cv2.bitwise_and(ink, ink, mask=uncovered)
    contours, _ = cv2.findContours(ink_outside, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    regions = []
    for x, y, w, h in map(cv2.boundingRect, contours):
        size_ok = 50 < w * h < 5000
        shape_ok = 0.1 < h / max(w, 1) < 10
        if size_ok and shape_ok:
            regions.append((x, y, x + w, y + h))
    return regions


# ============================================================
# SPEECH BUBBLE DETECTION
# ============================================================
def detect_speech_bubbles(image_bgr: np.ndarray) -> List[np.ndarray]:
    """Return the external contours of *image_bgr* whose area exceeds 500 px.

    Contours are extracted from an inverted Gaussian adaptive threshold of
    the grayscale image; callers treat them as speech-bubble candidates.
    """
    grayscale = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
    inverted = cv2.adaptiveThreshold(grayscale, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                     cv2.THRESH_BINARY_INV, 11, 2)
    found, _ = cv2.findContours(inverted, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    large_enough = []
    for contour in found:
        if cv2.contourArea(contour) > 500:
            large_enough.append(contour)
    return large_enough

def is_quad_in_bubble(quad_bbox_xyxy, bubble_contour, tolerance=5):
    """Return True when the box centre is inside *bubble_contour* or near it.

    Args:
        quad_bbox_xyxy: ``(x1, y1, x2, y2)`` box of the OCR quad.
        bubble_contour: OpenCV contour describing the bubble outline.
        tolerance: how many pixels OUTSIDE the contour the centre may lie and
            still count as inside.

    The signed distance is requested from ``cv2.pointPolygonTest``
    (``measureDist=True``). With ``measureDist=False`` the call only returns
    -1/0/+1, so comparing against ``-tolerance`` accepted every point for any
    tolerance >= 1 and every quad was reported as inside every contour.
    """
    x1, y1, x2, y2 = quad_bbox_xyxy
    cx, cy = (x1 + x2) // 2, (y1 + y2) // 2
    # Positive inside, negative outside, zero on the edge (distance in px).
    signed_distance = cv2.pointPolygonTest(bubble_contour, (float(cx), float(cy)), True)
    return signed_distance >= -tolerance

def split_indices_by_bubble(indices, ocr, bubble_contours):
    """Partition OCR *indices* by the first bubble contour that contains them.

    Each index is assigned to the first contour (in list order) for which
    ``is_quad_in_bubble`` is true. Indices matching no contour are collected
    into one extra group placed last. Returns a list of index lists, or
    ``[]`` when *indices* is empty.
    """
    if not indices:
        return []

    per_bubble = {}
    unassigned = []
    for idx in indices:
        bbox = quad_bbox(ocr[idx][0])
        owner = next(
            (pos for pos, contour in enumerate(bubble_contours)
             if is_quad_in_bubble(bbox, contour)),
            None,
        )
        if owner is None:
            unassigned.append(idx)
        else:
            per_bubble.setdefault(owner, []).append(idx)

    groups = list(per_bubble.values())
    if unassigned:
        groups.append(unassigned)
    return groups

def check_vertical_alignment_split(indices, ocr, threshold=20):
    """Split *indices* into groups separated by vertical gaps above *threshold*.

    Quads are ordered by the top edge of their bounding box. A new group
    starts when a quad's top lies more than *threshold* pixels below the
    lowest bottom edge seen so far in the current group.

    The gap is measured against the running bottom of the group rather than
    the bottom of the immediately preceding quad. Otherwise a short quad that
    sits beside a taller one (and therefore ends earlier) would make the next
    quad look far away even though it still overlaps the tall one vertically.

    Returns a list of index lists; a single-element or empty input is
    returned as ``[indices]``.
    """
    if len(indices) <= 1:
        return [indices]

    items = sorted(
        ((idx, quad_bbox(ocr[idx][0])) for idx in indices),
        key=lambda item: item[1][1],
    )

    first_idx, first_box = items[0]
    groups = []
    current_group = [first_idx]
    group_bottom = first_box[3]

    for idx, box in items[1:]:
        if box[1] - group_bottom > threshold:
            groups.append(current_group)
            current_group = [idx]
            group_bottom = box[3]
        else:
            current_group.append(idx)
            group_bottom = max(group_bottom, box[3])

    groups.append(current_group)
    return groups


# ============================================================
# QUAD SIZE VALIDATION AND SPLITTING
# ============================================================
def is_quad_oversized(quad, median_height, width_threshold=8.0):
    """Return True when the quad is unusually wide.

    A quad counts as oversized when its width exceeds
    ``median_height * width_threshold`` or its width/height ratio exceeds 12.
    """
    left, top, right, bottom = quad_bbox(quad)
    width = right - left
    height = max(1, bottom - top)
    if width > median_height * width_threshold:
        return True
    return width / height > 12.0

def split_oversized_quad_by_content(image_bgr, quad, text, conf, median_height):
    """Try to cut one OCR quad in two at its widest empty pixel column run.

    The quad's region is Otsu-binarised and projected onto the x axis. A run
    of columns whose ink sum is below 20% of a fully inked column counts as a
    gap when it is at least ``max(0.8 * median_height, 15)`` px wide. The
    widest gap's centre becomes the cut; the text is split at the space
    nearest to the proportional character position (or proportionally when
    it has no space).

    Returns a list of ``(quad, text, conf)`` tuples: two entries when a split
    with non-empty text on both sides was found, otherwise the original entry.
    """
    unsplit = [(quad, text, conf)]

    x1, y1, x2, y2 = quad_bbox(quad)
    w = x2 - x1
    h = max(1, y2 - y1)
    pad = 2
    img_h, img_w = image_bgr.shape[0], image_bgr.shape[1]
    roi = image_bgr[max(0, y1 - pad):min(img_h, y2 + pad),
                    max(0, x1):min(img_w, x2)]
    if roi.size == 0:
        return unsplit

    mono = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
    _, ink = cv2.threshold(mono, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    column_ink = np.sum(ink, axis=0)

    ink_limit = h * 255 * 0.20
    min_gap_width = max(int(median_height * 0.8), 15)

    # Collect (centre, width) of every sufficiently wide empty run that is
    # closed by an inked column; a run still open at the right edge is ignored.
    gaps = []
    run_start = None
    for col, ink_sum in enumerate(column_ink):
        if ink_sum < ink_limit:
            if run_start is None:
                run_start = col
        elif run_start is not None:
            run_width = col - run_start
            if run_width >= min_gap_width:
                gaps.append((run_start + run_width // 2, run_width))
            run_start = None
    if not gaps:
        return unsplit

    # First gap with the maximal width (ties resolved left-to-right).
    widest_center = max(gaps, key=lambda gap: gap[1])[0]
    split_x_abs = max(0, x1) + widest_center
    offset = split_x_abs - x1

    if ' ' in text:
        char_w = w / max(1, len(text))
        approx_idx = int(offset / max(1e-6, char_w))
        split_idx = min(
            (pos for pos, ch in enumerate(text) if ch == ' '),
            key=lambda pos: abs(pos - approx_idx),
        )
    else:
        split_idx = int(len(text) * offset / w)

    left_text = text[:split_idx].strip()
    right_text = text[split_idx:].strip()
    if not (left_text and right_text):
        return unsplit

    left_quad = [[x1, y1], [split_x_abs, y1], [split_x_abs, y2], [x1, y2]]
    right_quad = [[split_x_abs, y1], [x2, y1], [x2, y2], [split_x_abs, y2]]
    return [(left_quad, left_text, conf), (right_quad, right_text, conf)]

def validate_and_split_oversized_quads(image_bgr, filtered_ocr):
    """Split every oversized OCR quad that has a clear internal gap.

    The median quad height of *filtered_ocr* is the size reference. Returns
    ``(new_ocr_list, number_of_quads_split)``; an empty input is returned
    unchanged with a count of 0.
    """
    if not filtered_ocr:
        return filtered_ocr, 0

    heights = []
    for quad, _, _ in filtered_ocr:
        heights.append(max(1, quad_bbox(quad)[3] - quad_bbox(quad)[1]))
    median_height = float(np.median(heights)) if heights else 14.0

    output = []
    split_count = 0
    for quad, text, conf in filtered_ocr:
        if is_quad_oversized(quad, median_height, 8.0):
            pieces = split_oversized_quad_by_content(image_bgr, quad, text, conf, median_height)
            if len(pieces) > 1:
                output.extend(pieces)
                split_count += 1
                continue
        # Not oversized, or no usable gap was found: keep the entry as-is.
        output.append((quad, text, conf))
    return output, split_count


# ============================================================
# HORIZONTAL GAP DETECTION AT QUAD LEVEL
# ============================================================
def detect_horizontal_gap_in_group(indices, ocr, med_h, gap_factor=2.5):
    """Split a group of quads at its widest horizontal gap, if one is large enough.

    Quads are ordered by bounding-box centre x. The gap between each
    neighbouring pair is the next box's left edge minus the current box's
    right edge; only gaps above ``med_h * gap_factor`` qualify.

    Returns ``(left_indices, right_indices)`` for the widest qualifying gap,
    or ``None`` when the group has fewer than two quads or no gap qualifies.
    """
    if len(indices) < 2:
        return None

    ordered = sorted(indices, key=lambda i: quad_center(ocr[i][0])[0])
    boxes = [quad_bbox(ocr[i][0]) for i in ordered]
    min_gap = med_h * gap_factor

    widest_gap, cut = 0.0, None
    for pos, (current_box, next_box) in enumerate(zip(boxes, boxes[1:])):
        gap = next_box[0] - current_box[2]
        if gap > min_gap and gap > widest_gap:
            widest_gap, cut = gap, pos
    if cut is None:
        return None

    left_group = ordered[:cut + 1]
    right_group = ordered[cut + 1:]
    if not left_group or not right_group:
        return None
    return (left_group, right_group)


def orientation_compatible(idx_a, idx_b, ocr):
    """Return False when one quad is tall/narrow and the other wide/flat.

    The aspect ratio is the bounding-box width divided by height, each
    floored at 1. The pair is incompatible when one ratio is below 0.6 while
    the other is above 2.0; every other combination is compatible.
    """
    def aspect(idx):
        box = quad_bbox(ocr[idx][0])
        return max(1, box[2] - box[0]) / max(1, box[3] - box[1])

    ratio_a = aspect(idx_a)
    ratio_b = aspect(idx_b)
    a_tall_b_wide = ratio_a < 0.6 and rb_is_wide(ratio_b) if False else (ratio_a < 0.6 and ratio_b > 2.0)
    b_tall_a_wide = ratio_b < 0.6 and ratio_a > 2.0
    return not (a_tall_b_wide or b_tall_a_wide)


# ============================================================
# WIDE QUAD COLUMN SPLIT — pre-grouping
# ============================================================
def split_wide_quad_by_column_gap(image_bgr, quad, text, conf, med_h,
                                   min_gap_factor=1.8):
    """Split one OCR quad at a near-empty pixel column run, if it has one.

    Only quads at least ``3 * med_h`` wide are examined. The region is
    Otsu-binarised and projected onto the x axis; a run of columns with ink
    below 12% of a fully inked column is a gap when it is at least
    ``max(med_h * min_gap_factor, 10)`` px wide. The widest gap's centre is
    the cut, provided each side is at least ``med_h`` wide. The text is split
    at the space nearest the proportional position (or proportionally when it
    has no space).

    Returns a list of ``(quad, text, conf)`` tuples: two entries on a
    successful split, otherwise the original entry alone.
    """
    unsplit = [(quad, text, conf)]

    x1, y1, x2, y2 = quad_bbox(quad)
    w = x2 - x1
    h = max(1, y2 - y1)

    # Too narrow to plausibly span two columns.
    if w < med_h * 3.0:
        return unsplit

    pad = 2
    img_h, img_w = image_bgr.shape[0], image_bgr.shape[1]
    roi = image_bgr[max(0, y1 - pad):min(img_h, y2 + pad),
                    max(0, x1):min(img_w, x2)]
    if roi.size == 0:
        return unsplit

    mono = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
    _, ink = cv2.threshold(mono, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    column_ink = np.sum(ink, axis=0)

    ink_limit = h * 255 * 0.12
    min_gap_px = max(int(med_h * min_gap_factor), 10)

    # Collect (centre, width) of every wide-enough empty run that is closed by
    # an inked column; a run still open at the right edge is ignored.
    gaps = []
    run_start = None
    for col, ink_sum in enumerate(column_ink):
        if ink_sum < ink_limit:
            if run_start is None:
                run_start = col
        elif run_start is not None:
            run_width = col - run_start
            if run_width >= min_gap_px:
                gaps.append((run_start + run_width // 2, run_width))
            run_start = None
    if not gaps:
        return unsplit

    # First gap with the maximal width (ties resolved left-to-right).
    split_x_rel = max(gaps, key=lambda gap: gap[1])[0]
    split_x_abs = x1 + split_x_rel

    # Both halves must be at least one median line height wide.
    if split_x_abs - x1 < med_h or x2 - split_x_abs < med_h:
        return unsplit

    if ' ' in text:
        char_w = w / max(1, len(text))
        approx_idx = int(split_x_rel / max(1e-6, char_w))
        split_idx = min(
            (pos for pos, ch in enumerate(text) if ch == ' '),
            key=lambda pos: abs(pos - approx_idx),
        )
    else:
        split_idx = int(len(text) * split_x_rel / w)

    left_text = text[:split_idx].strip()
    right_text = text[split_idx:].strip()
    if not (left_text and right_text):
        return unsplit

    left_quad = [[x1, y1], [split_x_abs, y1], [split_x_abs, y2], [x1, y2]]
    right_quad = [[split_x_abs, y1], [x2, y1], [x2, y2], [split_x_abs, y2]]
    return [(left_quad, left_text, conf), (right_quad, right_text, conf)]


def apply_column_gap_splits(image_bgr, ocr_list, med_h):
    """Run ``split_wide_quad_by_column_gap`` over every entry of *ocr_list*.

    Returns ``(flattened_list, number_of_quads_split)`` and prints a one-line
    summary when at least one quad was split.
    """
    flattened = []
    split_count = 0
    for quad, text, conf in ocr_list:
        pieces = split_wide_quad_by_column_gap(image_bgr, quad, text, conf, med_h)
        flattened.extend(pieces)
        if len(pieces) > 1:
            split_count += 1

    if split_count:
        print(f"📐 Column-gap split: {split_count} wide quad(s) split before grouping")
    return flattened, split_count


# ============================================================
# GENERALIZED BOX FIXING FUNCTIONS
# ============================================================
def detect_and_split_multi_bubble_boxes(bubble_boxes, bubble_indices, bubble_quads,
                                        bubbles, ocr, image_bgr):
    """Split text boxes that appear to cover more than one speech bubble.

    For every box with two or more quads the following are tried in order,
    and the first that yields more than one group wins:

    1. grouping by detected bubble contour (``split_indices_by_bubble``);
    2. grouping by vertical gaps larger than twice the median quad height;
    3. for boxes wider than ten median heights, a left/right split around the
       median centre x, accepted when the halves are more than 1.5 median
       heights apart.

    Boxes that are not split are carried over unchanged. All boxes are
    renumbered from 1. Returns ``(bubbles, boxes, quads, indices)`` dicts.
    """
    quad_heights = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1])
                    for i in range(len(ocr))]
    med_h = float(np.median(quad_heights)) if quad_heights else 14.0
    contours = detect_speech_bubbles(image_bgr)

    out_bubbles, out_boxes, out_quads, out_indices = {}, {}, {}, {}
    report = []
    next_id = 1

    def carry_over(bid, members):
        # Copy an untouched box into the output under a fresh id.
        nonlocal next_id
        out_bubbles[next_id] = bubbles[bid]
        out_boxes[next_id] = bubble_boxes[bid]
        out_quads[next_id] = bubble_quads[bid]
        out_indices[next_id] = members
        next_id += 1

    def emit_groups(groups):
        # Rebuild lines/box/quads for each non-empty group under fresh ids.
        nonlocal next_id
        for members in groups:
            if not members:
                continue
            out_bubbles[next_id] = build_lines_from_indices(members, ocr)
            out_boxes[next_id] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in members])
            out_quads[next_id] = [ocr[i][0] for i in members]
            out_indices[next_id] = members
            next_id += 1

    for bid, indices in bubble_indices.items():
        if len(indices) < 2:
            carry_over(bid, indices)
            continue

        by_contour = split_indices_by_bubble(indices, ocr, contours)
        if len(by_contour) > 1:
            emit_groups(by_contour)
            report.append(f"BOX#{bid} → {len(by_contour)} bubbles")
            continue

        by_rows = check_vertical_alignment_split(indices, ocr, threshold=int(med_h * 2.0))
        if len(by_rows) > 1:
            emit_groups(by_rows)
            report.append(f"BOX#{bid} → {len(by_rows)} vertical groups")
            continue

        x1, y1, x2, y2 = bubble_boxes[bid]
        if (x2 - x1) > med_h * 10:
            centers_x = [quad_center(ocr[i][0])[0] for i in indices]
            pivot = np.median(centers_x)
            left_half = [i for i, cx in zip(indices, centers_x) if cx < pivot]
            right_half = [i for i, cx in zip(indices, centers_x) if cx >= pivot]
            if left_half and right_half:
                left_box = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in left_half])
                right_box = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in right_half])
                if right_box[0] - left_box[2] > med_h * 1.5:
                    emit_groups([left_half, right_half])
                    report.append(f"BOX#{bid} → 2 horizontal panels")
                    continue

        carry_over(bid, indices)

    if report:
        print(f"\n🔧 Split {len(report)} multi-bubble box(es):")
        for entry in report:
            print(f"   ✓ {entry}")

    return out_bubbles, out_boxes, out_quads, out_indices


def detect_and_merge_fragmented_bubbles(bubble_boxes, bubble_indices, bubble_quads,
                                        bubbles, ocr, image_bgr):
    """Merge text boxes that are fragments of the same speech bubble.

    Two boxes are merge candidates when their centres lie inside a common
    detected bubble contour and are closer than 3 median quad heights
    horizontally and 6 vertically. Candidate pairs are combined transitively
    into connected groups, so every box ends up in at most one merged group.

    Grouping uses a union-find. The earlier "add the pair to the first group
    that contains either box" approach left two groups overlapping whenever a
    pair bridged them, so the same OCR indices were emitted in two boxes.

    Merged groups are emitted first (in order of their first candidate pair),
    followed by untouched boxes in their original order; everything is
    renumbered from 1. When nothing qualifies the inputs are returned
    unchanged.

    Returns:
        ``(bubbles, boxes, quads, indices)`` dicts.
    """
    all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1]) for i in range(len(ocr))]
    med_h = float(np.median(all_h)) if all_h else 14.0
    bubble_contours = detect_speech_bubbles(image_bgr)
    bids = list(bubble_boxes.keys())

    # Per box: its centre and the set of contours containing that centre.
    # Computed once per box instead of once per pair.
    centers = {}
    containing = {}
    for bid in bids:
        box = bubble_boxes[bid]
        center = ((box[0] + box[2]) / 2.0, (box[1] + box[3]) / 2.0)
        centers[bid] = center
        containing[bid] = {
            k for k, contour in enumerate(bubble_contours)
            if cv2.pointPolygonTest(contour, center, False) >= 0
        }

    to_merge = []
    for pos, bid_i in enumerate(bids):
        cx_i, cy_i = centers[bid_i]
        for bid_j in bids[pos + 1:]:
            cx_j, cy_j = centers[bid_j]
            # Cheap distance test first, contour membership second.
            if abs(cx_i - cx_j) >= med_h * 3.0 or abs(cy_i - cy_j) >= med_h * 6.0:
                continue
            if containing[bid_i].isdisjoint(containing[bid_j]):
                continue
            to_merge.append((bid_i, bid_j) if cy_i < cy_j else (bid_j, bid_i))

    if not to_merge:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices

    print(f"\n🔗 Merging {len(to_merge)} fragmented bubble(s):")

    # Union-find over box ids so that chains of pairs collapse into one group.
    parent = {}

    def find(bid):
        parent.setdefault(bid, bid)
        while parent[bid] != bid:
            parent[bid] = parent[parent[bid]]  # path halving
            bid = parent[bid]
        return bid

    for top, bottom in to_merge:
        root_top, root_bottom = find(top), find(bottom)
        if root_top != root_bottom:
            parent[root_bottom] = root_top

    # Insertion order follows the first pair of each component.
    merge_groups = {}
    for top, bottom in to_merge:
        for bid in (top, bottom):
            merge_groups.setdefault(find(bid), set()).add(bid)

    new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {}
    merged_bids, next_bid = set(), 1

    for merge_set in merge_groups.values():
        merge_list = sorted(merge_set)
        print(f"   ✓ Merging: {', '.join(f'#{b}' for b in merge_list)}")
        all_indices = sorted(set(idx for b in merge_list for idx in bubble_indices[b]))
        merged_bids.update(merge_list)
        new_bubbles[next_bid] = build_lines_from_indices(all_indices, ocr)
        new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in all_indices])
        new_quads[next_bid] = [ocr[i][0] for i in all_indices]
        new_indices[next_bid] = all_indices
        next_bid += 1

    for bid in bids:
        if bid not in merged_bids:
            new_bubbles[next_bid] = bubbles[bid]
            new_boxes[next_bid] = bubble_boxes[bid]
            new_quads[next_bid] = bubble_quads[bid]
            new_indices[next_bid] = bubble_indices[bid]
            next_bid += 1

    return new_bubbles, new_boxes, new_quads, new_indices


def merge_boxes_by_proximity_and_overlap(bubble_boxes, bubble_indices, bubble_quads,
                                          bubbles, ocr, med_h):
    """Merge boxes that are stacked closely and share a column.

    Boxes are visited in ascending id order. Each box not yet absorbed acts
    as an anchor and absorbs every later, not-yet-absorbed box whose vertical
    gap to the anchor is at most ``1.5 * med_h`` and whose horizontal overlap
    is at least 35% of the narrower box's width. Comparisons always use the
    anchor's ORIGINAL box, not the growing union.

    Merged groups are emitted first, then untouched boxes in id order; all
    are renumbered from 1. When nothing merges the inputs are returned
    unchanged. Returns ``(bubbles, boxes, quads, indices)`` dicts.
    """
    ordered_ids = sorted(bubble_boxes.keys())
    clusters: Dict[int, List[int]] = {}
    absorbed = set()

    for pos, anchor in enumerate(ordered_ids):
        if anchor in absorbed:
            continue
        anchor_box = bubble_boxes[anchor]
        anchor_width = anchor_box[2] - anchor_box[0]

        for other in ordered_ids[pos + 1:]:
            if other in absorbed:
                continue
            other_box = bubble_boxes[other]
            other_width = other_box[2] - other_box[0]

            # Vertical distance between the boxes (0 when they overlap in y).
            vert_gap = max(0, max(anchor_box[1], other_box[1]) - min(anchor_box[3], other_box[3]))

            # Shared x extent relative to the narrower box.
            shared_x = max(0, min(anchor_box[2], other_box[2]) - max(anchor_box[0], other_box[0]))
            shared_ratio = shared_x / max(1, min(anchor_width, other_width))

            if vert_gap <= med_h * 1.5 and shared_ratio >= 0.35:
                clusters.setdefault(anchor, [anchor]).append(other)
                absorbed.add(other)

    if not clusters:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices

    print(f"\n🔀 Proximity+overlap merge: {len(clusters)} group(s):")
    out_bubbles, out_boxes, out_quads, out_indices = {}, {}, {}, {}
    consumed = set()
    next_id = 1

    for members in clusters.values():
        member_ids = sorted(set(members))
        print(f"   ✓ Merging: {', '.join(f'#{b}' for b in member_ids)}")
        merged_indices = sorted(set(idx for b in member_ids for idx in bubble_indices[b]))
        out_bubbles[next_id] = build_lines_from_indices(merged_indices, ocr)
        out_boxes[next_id] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in merged_indices])
        out_quads[next_id] = [ocr[i][0] for i in merged_indices]
        out_indices[next_id] = merged_indices
        next_id += 1
        consumed.update(member_ids)

    for bid in ordered_ids:
        if bid in consumed:
            continue
        out_bubbles[next_id] = bubbles[bid]
        out_boxes[next_id] = bubble_boxes[bid]
        out_quads[next_id] = bubble_quads[bid]
        out_indices[next_id] = bubble_indices[bid]
        next_id += 1

    return out_bubbles, out_boxes, out_quads, out_indices


def auto_fix_bubble_detection(bubble_boxes, bubble_indices, bubble_quads,
                               bubbles, ocr, image_bgr):
    """Run the automatic box-repair passes in sequence.

    Order: split boxes spanning several bubbles, merge fragments lying inside
    one detected contour, then merge closely stacked boxes that share a
    column. Returns ``(bubbles, boxes, quads, indices)`` dicts.
    """
    print("\n🔍 Running automatic bubble detection fixes...")

    quad_heights = []
    for i in range(len(ocr)):
        _, top, _, bottom = quad_bbox(ocr[i][0])
        quad_heights.append(max(1, bottom - top))
    med_h = float(np.median(quad_heights)) if quad_heights else 14.0

    # Each pass takes (boxes, indices, quads, bubbles, ocr, extra) and returns
    # (bubbles, boxes, quads, indices); `extra` is the image or the median height.
    passes = (
        (detect_and_split_multi_bubble_boxes, image_bgr),
        (detect_and_merge_fragmented_bubbles, image_bgr),
        (merge_boxes_by_proximity_and_overlap, med_h),
    )
    for run_pass, extra in passes:
        bubbles, bubble_boxes, bubble_quads, bubble_indices = run_pass(
            bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr, extra)

    return bubbles, bubble_boxes, bubble_quads, bubble_indices


def remove_nested_boxes(bubble_boxes, bubble_indices, bubble_quads, bubbles,
                        overlap_threshold=0.50):
    """Drop the smaller box of every overlapping or index-sharing pair.

    Two boxes conflict when their overlap ratio (intersection over the
    smaller area) exceeds *overlap_threshold* or they share an OCR index.
    The box with the smaller area is removed; on a tie the later one goes.

    The four input dicts are modified IN PLACE and also returned as
    ``(bubbles, boxes, quads, indices)``.
    """
    def area_of(box):
        return max(0, box[2] - box[0]) * max(0, box[3] - box[1])

    ids = list(bubble_boxes.keys())
    doomed = set()

    for pos, current in enumerate(ids):
        if current in doomed:
            continue
        current_box = bubble_boxes[current]
        current_area = area_of(current_box)

        for rival in ids[pos + 1:]:
            if rival in doomed:
                continue
            rival_box = bubble_boxes[rival]
            rival_area = area_of(rival_box)

            common = set(bubble_indices[current]).intersection(bubble_indices[rival])
            overlap = boxes_overlap_ratio(current_box, rival_box)
            if not (overlap > overlap_threshold or len(common) > 0):
                continue

            if current_area >= rival_area:
                doomed.add(rival)
                print(f"   🗑️  Removing BOX#{rival} (overlaps BOX#{current})")
            else:
                doomed.add(current)
                print(f"   🗑️  Removing BOX#{current} (overlaps BOX#{rival})")
                # The current box is gone; stop comparing it against others.
                break

    if doomed:
        print(f"\n🧹 Removed {len(doomed)} overlapping/nested box(es)")
        for bid in doomed:
            for table in (bubble_boxes, bubble_indices, bubble_quads, bubbles):
                table.pop(bid, None)

    return bubbles, bubble_boxes, bubble_quads, bubble_indices


def enforce_max_box_size(bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr,
                         max_width_ratio=0.6, max_height_ratio=0.5, image_shape=None):
    """Split boxes that are too large relative to the image.

    A box is oversized when its width exceeds ``max_width_ratio`` of the image
    width or its height exceeds ``max_height_ratio`` of the image height. For
    such a box a column split is attempted first and a row split only if the
    column split yields nothing; when neither succeeds the box is kept as is.

    All boxes in the result are renumbered consecutively from 1. When
    ``image_shape`` is None the inputs are returned untouched.

    Returns:
        ``(bubbles, bubble_boxes, bubble_quads, bubble_indices)``.
    """
    if image_shape is None:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices

    img_h, img_w = image_shape[:2]
    width_limit = img_w * max_width_ratio
    height_limit = img_h * max_height_ratio

    out_bubbles, out_boxes, out_quads, out_indices = {}, {}, {}, {}
    report = []
    fresh_id = 1

    for bid, box in bubble_boxes.items():
        left, top, right, bottom = box
        box_w, box_h = right - left, bottom - top

        parts = None
        if box_w > width_limit or box_h > height_limit:
            members = bubble_indices[bid]
            # `or` keeps the row split lazy: it only runs when no column split exists.
            parts = (
                split_bubble_if_multiple_columns(members, ocr, bid=bid,
                                                 use_aggressive_thresholds=True)
                or split_bubble_if_multiple_rows(members, ocr, bid=bid)
            )

        if parts:
            for part in parts:
                out_bubbles[fresh_id] = build_lines_from_indices(part, ocr)
                out_boxes[fresh_id] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in part])
                out_quads[fresh_id] = [ocr[i][0] for i in part]
                out_indices[fresh_id] = part
                fresh_id += 1
            report.append(f"BOX#{bid} (oversized: {box_w}x{box_h}px)")
        else:
            out_bubbles[fresh_id] = bubbles[bid]
            out_boxes[fresh_id] = box
            out_quads[fresh_id] = bubble_quads[bid]
            out_indices[fresh_id] = bubble_indices[bid]
            fresh_id += 1

    if report:
        print(f"\n📏 Split {len(report)} oversized box(es):")
        for entry in report:
            print(f"   ✓ {entry}")
    return out_bubbles, out_boxes, out_quads, out_indices


def should_merge_groups(group1_indices, group2_indices, ocr, median_height,
                        max_vertical_gap=None):
    """Decide whether two OCR index groups belong to the same bubble.

    The groups merge when the horizontal centres of their union boxes are at
    most ``1.8 * median_height`` apart and the vertical gap between the boxes
    is at most ``max_vertical_gap`` (``2.5 * median_height`` when omitted).
    Returns False when either union box is None.
    """
    gap_limit = median_height * 2.5 if max_vertical_gap is None else max_vertical_gap

    first = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in group1_indices])
    second = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in group2_indices])
    if first is None or second is None:
        return False

    first_cx = (first[0] + first[2]) / 2.0
    second_cx = (second[0] + second[2]) / 2.0
    if abs(first_cx - second_cx) > median_height * 1.8:
        return False

    # Distance between the lower box's top and the upper box's bottom,
    # clamped to zero when the boxes overlap vertically.
    separation = max(first[1], second[1]) - min(first[3], second[3])
    return max(0, separation) <= gap_limit


# ============================================================
# ENHANCED OCR ENGINE
# ============================================================
class ImprovedMacVisionDetector:
    """Multi-pass Apple Vision OCR reader.

    The image is preprocessed into several upscaled variants, each variant is
    run through ``VNRecognizeTextRequest``, and overlapping detections from
    the different passes are merged by confidence-weighted voting.
    """

    # Every preprocessing variant is enlarged by this factor, and
    # merge_multi_pass_results() divides coordinates by the same value, so the
    # two must never drift apart.
    UPSCALE_FACTOR = 2.5

    # Accepted language names/codes -> Vision recognition language identifiers.
    _LANG_MAP = {
        "en": "en-US", "english": "en-US",
        "es": "es-ES", "spanish": "es-ES",
        "ca": "ca-ES", "catalan": "ca-ES",
        "fr": "fr-FR", "french": "fr-FR",
        "ja": "ja-JP", "japanese": "ja-JP",
        "it": "it-IT", "italian": "it-IT",
        "de": "de-DE", "german": "de-DE",
        "ko": "ko-KR", "korean": "ko-KR",
        "zh": "zh-Hans", "chinese": "zh-Hans",
    }

    def __init__(self, source_lang="en"):
        """Select the recognition language; unknown or empty values fall back to en-US."""
        lang_key = (source_lang or "").lower().strip()
        self.langs = [self._LANG_MAP.get(lang_key, "en-US")]
        print(f"⚡ Using Enhanced Apple Vision OCR (Language: {self.langs[0]})")

    @staticmethod
    def _vision_rect_to_quad(nx, ny, nw, nh, iw, ih):
        """Convert a normalized, bottom-left-origin rectangle to a pixel quad.

        ``nx, ny, nw, nh`` are fractions of the image size; the returned quad
        lists the corners clockwise from top-left in top-left-origin pixels.
        """
        x = nx * iw
        w = nw * iw
        h = nh * ih
        y = ih - ny * ih - h  # flip the vertical axis
        return [[int(x), int(y)], [int(x + w), int(y)],
                [int(x + w), int(y + h)], [int(x), int(y + h)]]

    def preprocess_variants(self, image_bgr):
        """Return ``(name, image)`` pairs, each upscaled by ``UPSCALE_FACTOR``."""
        scale = self.UPSCALE_FACTOR

        def upscale(img):
            return cv2.resize(img, None, fx=scale, fy=scale,
                              interpolation=cv2.INTER_CUBIC)

        gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
        _, high_contrast = cv2.threshold(gray, 0, 255,
                                         cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        return [
            ("enhanced", enhance_image_for_ocr(image_bgr, upscale_factor=scale)),
            ("high_contrast", cv2.cvtColor(upscale(high_contrast), cv2.COLOR_GRAY2BGR)),
            ("bilateral", upscale(cv2.bilateralFilter(image_bgr, 9, 75, 75))),
            ("inverted", upscale(cv2.bitwise_not(image_bgr))),
            ("original", upscale(image_bgr)),
        ]

    def run_vision_ocr(self, image_bgr):
        """Run one Vision text-recognition pass.

        Returns a list of ``(quad, text, confidence)`` tuples in the pixel
        coordinates of ``image_bgr``; an empty list for a missing/empty image
        or when PNG encoding fails.
        """
        if image_bgr is None or image_bgr.size == 0:
            return []
        ih, iw = image_bgr.shape[:2]
        success, buffer = cv2.imencode('.png', image_bgr)
        if not success:
            return []
        # Serialize once; the bytes are needed for both the data and its length.
        png_bytes = buffer.tobytes()
        ns_data = NSData.dataWithBytes_length_(png_bytes, len(png_bytes))
        handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(ns_data, None)
        results = []

        def completion_handler(request, error):
            if error:
                return
            # Guard against a missing result list and observations without
            # candidates so the callback never raises.
            for obs in request.results() or []:
                candidates = obs.topCandidates_(1)
                if not candidates:
                    continue
                candidate = candidates[0]
                bbox = obs.boundingBox()
                quad = self._vision_rect_to_quad(
                    bbox.origin.x, bbox.origin.y,
                    bbox.size.width, bbox.size.height, iw, ih)
                results.append((quad, candidate.string(), candidate.confidence()))

        req = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(completion_handler)
        req.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate)
        req.setUsesLanguageCorrection_(False)
        req.setRecognitionLanguages_(self.langs)
        req.setAutomaticallyDetectsLanguage_(True)
        handler.performRequests_error_([req], None)
        return results

    def merge_multi_pass_results(self, all_results, original_shape):
        """Merge detections from several passes into one result list.

        ``all_results`` is a list of ``(variant_name, results)`` pairs whose
        quads are in upscaled coordinates. Quads are scaled back by
        ``UPSCALE_FACTOR``, clustered by bounding-box IoU (> 0.5 against the
        cluster seed), and each cluster yields the highest-confidence quad
        with the text that collected the largest summed confidence.
        ``original_shape`` is accepted for interface compatibility and is not
        used.
        """
        if not all_results:
            return []
        scale_factor = self.UPSCALE_FACTOR
        normalized = []
        for variant_name, results in all_results:
            for quad, text, conf in results:
                sq = [[int(p[0] / scale_factor), int(p[1] / scale_factor)] for p in quad]
                normalized.append((sq, text, conf, variant_name))

        def quads_overlap(q1, q2, threshold=0.5):
            b1, b2 = quad_bbox(q1), quad_bbox(q2)
            x1, y1 = max(b1[0], b2[0]), max(b1[1], b2[1])
            x2, y2 = min(b1[2], b2[2]), min(b1[3], b2[3])
            if x2 < x1 or y2 < y1:
                return False
            inter = (x2 - x1) * (y2 - y1)
            union = ((b1[2] - b1[0]) * (b1[3] - b1[1])
                     + (b2[2] - b2[0]) * (b2[3] - b2[1]) - inter)
            return inter / max(union, 1) > threshold

        clusters, used = [], set()
        for i, seed in enumerate(normalized):
            if i in used:
                continue
            cluster = [seed]
            used.add(i)
            for j, other in enumerate(normalized):
                if j in used or i == j:
                    continue
                if quads_overlap(seed[0], other[0]):
                    cluster.append(other)
                    used.add(j)
            clusters.append(cluster)

        final_results = []
        for cluster in clusters:
            cluster.sort(key=lambda x: x[2], reverse=True)
            best_quad, best_text, best_conf, _ = cluster[0]
            # Confidence-weighted vote over normalized text.
            text_votes = {}
            for _, text, conf, _ in cluster:
                n = normalize_text(text)
                if n:
                    text_votes[n] = text_votes.get(n, 0) + conf
            if text_votes:
                voted = max(text_votes.items(), key=lambda x: x[1])[0]
                if voted != normalize_text(best_text):
                    best_text = voted
            final_results.append((best_quad, fix_common_ocr_errors(best_text), best_conf))
        return final_results

    def read(self, image_path_or_array):
        """OCR an image given as a file path or a BGR array; returns merged results."""
        if isinstance(image_path_or_array, str):
            img = cv2.imread(image_path_or_array)
        else:
            img = image_path_or_array
        if img is None or img.size == 0:
            return []
        all_results = []
        for vname, vimg in self.preprocess_variants(img):
            r = self.run_vision_ocr(vimg)
            if r:
                all_results.append((vname, r))
        return self.merge_multi_pass_results(all_results, img.shape)


class MacVisionDetector:
    """Single-pass Apple Vision OCR reader with language correction enabled."""

    # Accepted language names/codes -> Vision recognition language identifiers.
    _LANG_MAP = {
        "en": "en-US", "english": "en-US",
        "es": "es-ES", "spanish": "es-ES",
        "ca": "ca-ES", "catalan": "ca-ES",
        "fr": "fr-FR", "french": "fr-FR",
        "ja": "ja-JP", "japanese": "ja-JP",
        "it": "it-IT", "italian": "it-IT",
        "de": "de-DE", "german": "de-DE",
        "ko": "ko-KR", "korean": "ko-KR",
        "zh": "zh-Hans", "chinese": "zh-Hans",
    }

    def __init__(self, source_lang="en"):
        """Select the recognition language; unknown or empty values fall back to en-US."""
        lang_key = (source_lang or "").lower().strip()
        self.langs = [self._LANG_MAP.get(lang_key, "en-US")]
        print(f"⚡ Using Apple Vision OCR (Language: {self.langs[0]})")

    @staticmethod
    def _vision_rect_to_quad(nx, ny, nw, nh, iw, ih):
        """Convert a normalized, bottom-left-origin rectangle to a pixel quad.

        ``nx, ny, nw, nh`` are fractions of the image size; the returned quad
        lists the corners clockwise from top-left in top-left-origin pixels.
        """
        x = nx * iw
        w = nw * iw
        h = nh * ih
        y = ih - ny * ih - h  # flip the vertical axis
        return [[int(x), int(y)], [int(x + w), int(y)],
                [int(x + w), int(y + h)], [int(x), int(y + h)]]

    def read(self, image_path_or_array):
        """OCR an image given as a file path or a BGR array.

        Returns a list of ``(quad, text, confidence)`` tuples in image pixel
        coordinates; an empty list for a missing/empty image or when PNG
        encoding fails.
        """
        if isinstance(image_path_or_array, str):
            img = cv2.imread(image_path_or_array)
        else:
            img = image_path_or_array
        if img is None or img.size == 0:
            return []
        ih, iw = img.shape[:2]
        success, buffer = cv2.imencode('.png', img)
        if not success:
            return []
        # Serialize once; the bytes are needed for both the data and its length.
        png_bytes = buffer.tobytes()
        ns_data = NSData.dataWithBytes_length_(png_bytes, len(png_bytes))
        handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(ns_data, None)
        results = []

        def completion_handler(request, error):
            if error:
                return
            # Guard against a missing result list and observations without
            # candidates so the callback never raises.
            for obs in request.results() or []:
                candidates = obs.topCandidates_(1)
                if not candidates:
                    continue
                candidate = candidates[0]
                bbox = obs.boundingBox()
                quad = self._vision_rect_to_quad(
                    bbox.origin.x, bbox.origin.y,
                    bbox.size.width, bbox.size.height, iw, ih)
                results.append((quad, candidate.string(), candidate.confidence()))

        req = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(completion_handler)
        req.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate)
        req.setUsesLanguageCorrection_(True)
        req.setRecognitionLanguages_(self.langs)
        req.setAutomaticallyDetectsLanguage_(True)
        handler.performRequests_error_([req], None)
        return results

def split_bubble_if_multiple_columns(indices, ocr, bid=None, use_aggressive_thresholds=False):
    """Split a group of OCR indices at its widest horizontal centre gap.

    The x-centres of the quads are sorted and the largest gap between
    neighbours is located. A split happens only when that gap exceeds
    ``max(1.5 * median_height, 22)`` (``max(1.2 * median_height, 18)`` with
    ``use_aggressive_thresholds``). ``bid`` is accepted but not used.

    Returns ``(left_indices, right_indices)`` or None when no split applies.
    """
    if len(indices) < 2:
        return None

    bboxes = [quad_bbox(ocr[i][0]) for i in indices]
    line_h = float(np.median([max(1, bb[3] - bb[1]) for bb in bboxes]))
    centers_x = [(bb[0] + bb[2]) / 2.0 for bb in bboxes]
    ordered = sorted(centers_x)

    if use_aggressive_thresholds:
        min_gap = max(line_h * 1.2, 18)
    else:
        min_gap = max(line_h * 1.5, 22)

    widest, cut_at = 0.0, None
    for pos, (lo, hi) in enumerate(zip(ordered, ordered[1:])):
        gap = hi - lo
        if gap > min_gap and gap > widest:
            widest, cut_at = gap, pos
    if cut_at is None:
        return None

    split_x = (ordered[cut_at] + ordered[cut_at + 1]) / 2.0
    left_idxs = [i for i, cx in zip(indices, centers_x) if cx < split_x]
    right_idxs = [i for i, cx in zip(indices, centers_x) if cx >= split_x]

    if not left_idxs or not right_idxs:
        return None
    return (left_idxs, right_idxs)


def split_bubble_if_multiple_rows(indices, ocr, bid=None):
    """Split a group of OCR indices at its widest vertical centre gap.

    The y-centres of the quads are sorted and the largest gap between
    neighbours is located; a split happens only when that gap exceeds
    ``max(2.0 * median_height, 30)``. ``bid`` is accepted but not used.

    Returns ``(top_indices, bottom_indices)`` or None when no split applies.
    """
    if len(indices) < 2:
        return None

    bboxes = [quad_bbox(ocr[i][0]) for i in indices]
    line_h = float(np.median([max(1, bb[3] - bb[1]) for bb in bboxes]))
    centers_y = [(bb[1] + bb[3]) / 2.0 for bb in bboxes]
    ordered = sorted(centers_y)

    min_gap = max(line_h * 2.0, 30)
    widest, cut_at = 0.0, None
    for pos, (lo, hi) in enumerate(zip(ordered, ordered[1:])):
        gap = hi - lo
        if gap > min_gap and gap > widest:
            widest, cut_at = gap, pos
    if cut_at is None:
        return None

    split_y = (ordered[cut_at] + ordered[cut_at + 1]) / 2.0
    top_idxs = [i for i, cy in zip(indices, centers_y) if cy < split_y]
    bottom_idxs = [i for i, cy in zip(indices, centers_y) if cy >= split_y]

    if not top_idxs or not bottom_idxs:
        return None
    return (top_idxs, bottom_idxs)


def split_cluster_by_big_vertical_gap(indices, ocr, factor=1.9, min_gap=22):
    """Split a cluster at its largest edge-to-edge vertical gap.

    Quads are ordered by vertical centre; the gap between one quad's bottom
    and the next quad's top must exceed ``max(median_height * factor,
    min_gap)`` to qualify, and the largest qualifying gap wins.

    Returns ``(top_indices, bottom_indices)`` or None when no gap qualifies.
    """
    if len(indices) < 2:
        return None

    bboxes = [quad_bbox(ocr[i][0]) for i in indices]
    line_h = float(np.median([max(1, bb[3] - bb[1]) for bb in bboxes]))

    ordered = sorted(zip(indices, bboxes),
                     key=lambda pair: (pair[1][1] + pair[1][3]) / 2.0)
    required = max(line_h * factor, min_gap)

    widest, cut_at = 0.0, None
    for pos, ((_, upper), (_, lower)) in enumerate(zip(ordered, ordered[1:])):
        gap = lower[1] - upper[3]
        if gap > required and gap > widest:
            widest, cut_at = gap, pos
    if cut_at is None:
        return None

    top_idxs = [idx for idx, _ in ordered[:cut_at + 1]]
    bottom_idxs = [idx for idx, _ in ordered[cut_at + 1:]]
    if not top_idxs or not bottom_idxs:
        return None
    return (top_idxs, bottom_idxs)


def is_vertical_text_like(indices, ocr):
    """Heuristically report whether a group of quads looks like vertical text.

    True requires at least two quads, a median quad height of at least 1.2x
    the median width, and a spread of centres that is at least 1.5x larger
    vertically than horizontally.
    """
    if len(indices) < 2:
        return False

    bboxes = [quad_bbox(ocr[i][0]) for i in indices]
    heights = [max(1, bb[3] - bb[1]) for bb in bboxes]
    widths = [max(1, bb[2] - bb[0]) for bb in bboxes]
    if float(np.median(heights)) < float(np.median(widths)) * 1.2:
        return False

    centers_x = [(bb[0] + bb[2]) / 2.0 for bb in bboxes]
    centers_y = [(bb[1] + bb[3]) / 2.0 for bb in bboxes]
    spread_x = max(centers_x) - min(centers_x)
    spread_y = max(centers_y) - min(centers_y)
    if spread_y < spread_x * 1.5:
        return False
    return True


def split_nested_or_side_by_side(indices, ocr):
    """Split a group into left/right halves around the median x-centre.

    The split line is the midpoint between the two middle x-centres of the
    sorted quads. Returns ``(left_indices, right_indices)``, or None when
    there are fewer than two quads or one side would be empty.
    """
    if len(indices) < 2:
        return None

    centers_x = []
    for i in indices:
        bb = quad_bbox(ocr[i][0])
        centers_x.append((bb[0] + bb[2]) / 2.0)

    ordered = sorted(centers_x)
    middle = len(ordered) // 2
    split_x = (ordered[middle - 1] + ordered[middle]) / 2.0

    left_idxs = [i for i, cx in zip(indices, centers_x) if cx < split_x]
    right_idxs = [i for i, cx in zip(indices, centers_x) if cx >= split_x]

    if not left_idxs or not right_idxs:
        return None
    return (left_idxs, right_idxs)


def split_panel_box(image_bgr, box_xyxy, bubble_quads=None):
    """Look for a vertical divider near the middle of a box.

    The box is clamped to the image, Canny edges are summed per column, and
    the columns inside the central 35%-65% band whose edge sum reaches the
    band's 85th percentile are collected; their median x is the split line.
    When ``bubble_quads`` is given, the split is rejected unless quad centres
    fall on both sides of it.

    Returns ``(x1, x2, split_x)`` using the clamped box edges, or None when
    the box is degenerate, narrower than 100 px, or no valid split exists.
    """
    left, top, right, bottom = box_xyxy
    img_h, img_w = image_bgr.shape[:2]
    left, top = max(0, left), max(0, top)
    right, bottom = min(img_w - 1, right), min(img_h - 1, bottom)
    if right <= left or bottom <= top:
        return None
    patch = image_bgr[top:bottom, left:right]
    if patch.size == 0:
        return None

    edge_map = cv2.Canny(cv2.cvtColor(patch, cv2.COLOR_BGR2GRAY), 50, 150)
    column_energy = np.sum(edge_map, axis=0)
    span = right - left
    if span < 100:
        return None

    band_lo = int(span * 0.35)
    band_hi = int(span * 0.65)
    if band_hi <= band_lo:
        return None
    band = column_energy[band_lo:band_hi]
    if len(band) == 0:
        return None

    cutoff = np.percentile(band, 85)
    strong_columns = [left + band_lo + offset
                      for offset, energy in enumerate(band) if energy >= cutoff]
    if not strong_columns:
        return None
    split_x = int(np.median(strong_columns))

    if bubble_quads:
        on_left = sum(1 for q in bubble_quads if quad_center(q)[0] < split_x)
        # Reject a split that leaves every quad on the same side.
        if on_left == 0 or on_left == len(bubble_quads):
            return None

    return (left, right, split_x)


# ============================================================
# MERGE CLOSE BUBBLES
# ============================================================
def merge_close_bubbles_by_line_height(bubbles, bubble_boxes, bubble_quads,
                                        bubble_indices, ocr):
    """
    Merge boxes that sit within roughly 1.4x the median line height of each
    other on BOTH axes and share at least 25% horizontal overlap (relative to
    the narrower box). The dual-axis test plus the overlap requirement keeps
    horizontally adjacent but distinct bubbles apart.

    Each box is compared against the later boxes in ascending id order; a box
    absorbed into an earlier one is not considered again. When anything was
    merged, the result is renumbered consecutively from 1; otherwise the
    inputs are returned unchanged.
    """
    if not bubbles:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices

    line_heights = []
    for entry in ocr:
        bb = quad_bbox(entry[0])
        line_heights.append(max(1, bb[3] - bb[1]))
    med_h = float(np.median(line_heights)) if line_heights else 14.0
    tolerance = max(8, med_h * 1.4)

    ordered_ids = sorted(bubble_boxes.keys())
    absorbed = set()
    groups = {}

    for pos, anchor in enumerate(ordered_ids):
        if anchor in absorbed:
            continue
        ax1, ay1, ax2, ay2 = bubble_boxes[anchor]
        anchor_w = ax2 - ax1

        for candidate in ordered_ids[pos + 1:]:
            if candidate in absorbed:
                continue
            cx1, cy1, cx2, cy2 = bubble_boxes[candidate]
            candidate_w = cx2 - cx1

            gap_x = max(0, max(ax1, cx1) - min(ax2, cx2))
            gap_y = max(0, max(ay1, cy1) - min(ay2, cy2))

            # Shared horizontal extent relative to the narrower box.
            shared_w = max(0, min(ax2, cx2) - max(ax1, cx1))
            shared_ratio = shared_w / max(1, min(anchor_w, candidate_w))

            close_enough = gap_x <= tolerance and gap_y <= tolerance
            if close_enough and shared_ratio >= 0.25:
                groups.setdefault(anchor, [anchor]).append(candidate)
                absorbed.add(candidate)

    if not groups:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices

    out_bubbles, out_boxes, out_quads, out_indices = {}, {}, {}, {}
    survivors = [bid for bid in ordered_ids if bid not in absorbed]

    for new_id, bid in enumerate(survivors, start=1):
        if bid not in groups:
            out_bubbles[new_id] = bubbles[bid]
            out_boxes[new_id] = bubble_boxes[bid]
            out_quads[new_id] = bubble_quads[bid]
            out_indices[new_id] = bubble_indices[bid]
            continue

        pooled = sorted({idx for member in groups[bid] for idx in bubble_indices[member]})
        out_bubbles[new_id] = build_lines_from_indices(pooled, ocr)
        out_boxes[new_id] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in pooled])
        out_quads[new_id] = [ocr[i][0] for i in pooled]
        out_indices[new_id] = pooled

    return out_bubbles, out_boxes, out_quads, out_indices


# ============================================================
# WIDE / BRIDGE QUAD SPLITTING
# ============================================================
def split_wide_ocr_items(image_bgr, ocr_list, width_factor=8.0):
    """Split OCR items that are unusually wide at their largest ink gap.

    An item is a candidate when its bounding-box width exceeds
    ``width_factor`` times the median item height. Its region is binarized
    (inverted Otsu), ink is summed per column, and runs of low-ink columns at
    least ``max(int(0.6 * median_height), 12)`` wide are treated as gaps. The
    widest gap gives the split position; the text is cut at the nearest space
    (or proportionally when it has none). Items are only split when both text
    halves are non-empty.

    Returns ``(new_ocr_list, number_of_splits)``.
    """
    if not ocr_list:
        return ocr_list, 0

    heights = []
    for q, _, _ in ocr_list:
        bb = quad_bbox(q)
        heights.append(max(1, bb[3] - bb[1]))
    med_h = float(np.median(heights)) if heights else 14.0
    min_gap_width = max(int(med_h * 0.6), 12)

    output, split_count = [], 0

    for quad, text, conf in ocr_list:
        x1, y1, x2, y2 = quad_bbox(quad)
        width = x2 - x1
        pieces = None

        if width > med_h * width_factor:
            pad = 2
            roi = image_bgr[max(0, y1 - pad):min(image_bgr.shape[0], y2 + pad),
                            max(0, x1):min(image_bgr.shape[1], x2)]
            if roi.size > 0:
                gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
                _, binary = cv2.threshold(gray, 0, 255,
                                          cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
                column_ink = np.sum(binary, axis=0)
                ink_limit = roi.shape[0] * 255 * 0.15

                # Collect (centre, width) of every sufficiently wide low-ink run
                # that is closed by an inked column.
                gaps, run_start = [], None
                for col, ink in enumerate(column_ink):
                    if ink < ink_limit:
                        if run_start is None:
                            run_start = col
                    elif run_start is not None:
                        run_width = col - run_start
                        if run_width >= min_gap_width:
                            gaps.append((run_start + run_width // 2, run_width))
                        run_start = None

                if gaps:
                    # First gap among those with the greatest width.
                    gap_center = max(gaps, key=lambda g: g[1])[0]
                    split_x_abs = max(0, x1) + gap_center
                    if ' ' in text:
                        char_w = width / max(1, len(text))
                        guess = int((split_x_abs - x1) / max(1e-6, char_w))
                        space_positions = [i for i, c in enumerate(text) if c == ' ']
                        cut = min(space_positions, key=lambda i: abs(i - guess))
                    else:
                        cut = int(len(text) * (split_x_abs - x1) / width)
                    left_text, right_text = text[:cut].strip(), text[cut:].strip()
                    if left_text and right_text:
                        pieces = [
                            ([[x1, y1], [split_x_abs, y1], [split_x_abs, y2], [x1, y2]],
                             left_text, conf),
                            ([[split_x_abs, y1], [x2, y1], [x2, y2], [split_x_abs, y2]],
                             right_text, conf),
                        ]

        if pieces:
            output.extend(pieces)
            split_count += 1
        else:
            output.append((quad, text, conf))

    return output, split_count


def split_abnormal_bridge_quads(image_bgr, ocr_list, aspect_ratio_threshold=6.0):
    """Split OCR items whose width/height ratio is abnormally large.

    An item is a candidate when ``width / height`` exceeds
    ``aspect_ratio_threshold``. Its region is binarized (inverted Otsu), ink
    is summed per column, and runs of low-ink columns at least
    ``max(int(0.8 * median_height), 15)`` wide are treated as gaps. The widest
    gap gives the split position; the text is cut at the nearest space (or
    proportionally when it has none). Items are only split when both text
    halves are non-empty.

    Returns ``(new_ocr_list, number_of_splits)``.
    """
    if not ocr_list:
        return ocr_list, 0

    heights = []
    for q, _, _ in ocr_list:
        bb = quad_bbox(q)
        heights.append(max(1, bb[3] - bb[1]))
    med_h = float(np.median(heights)) if heights else 14.0
    min_gap_width = max(int(med_h * 0.8), 15)

    output, split_count = [], 0

    for quad, text, conf in ocr_list:
        x1, y1, x2, y2 = quad_bbox(quad)
        width, height = x2 - x1, max(1, y2 - y1)
        pieces = None

        if width / height > aspect_ratio_threshold:
            pad = 2
            roi = image_bgr[max(0, y1 - pad):min(image_bgr.shape[0], y2 + pad),
                            max(0, x1):min(image_bgr.shape[1], x2)]
            if roi.size > 0:
                gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
                _, binary = cv2.threshold(gray, 0, 255,
                                          cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
                column_ink = np.sum(binary, axis=0)
                # Threshold is based on the box height, not the padded ROI height.
                ink_limit = height * 255 * 0.20

                # Collect (centre, width) of every sufficiently wide low-ink run
                # that is closed by an inked column.
                gaps, run_start = [], None
                for col, ink in enumerate(column_ink):
                    if ink < ink_limit:
                        if run_start is None:
                            run_start = col
                    elif run_start is not None:
                        run_width = col - run_start
                        if run_width >= min_gap_width:
                            gaps.append((run_start + run_width // 2, run_width))
                        run_start = None

                if gaps:
                    # First gap among those with the greatest width.
                    gap_center = max(gaps, key=lambda g: g[1])[0]
                    split_x_abs = max(0, x1) + gap_center
                    if ' ' in text:
                        char_w = width / max(1, len(text))
                        guess = int((split_x_abs - x1) / max(1e-6, char_w))
                        space_positions = [i for i, c in enumerate(text) if c == ' ']
                        cut = min(space_positions, key=lambda i: abs(i - guess))
                    else:
                        cut = int(len(text) * (split_x_abs - x1) / width)
                    left_text, right_text = text[:cut].strip(), text[cut:].strip()
                    if left_text and right_text:
                        pieces = [
                            ([[x1, y1], [split_x_abs, y1], [split_x_abs, y2], [x1, y2]],
                             left_text, conf),
                            ([[split_x_abs, y1], [x2, y1], [x2, y2], [split_x_abs, y2]],
                             right_text, conf),
                        ]

        if pieces:
            output.extend(pieces)
            split_count += 1
        else:
            output.append((quad, text, conf))

    return output, split_count


def normalize_ocr_quads(ocr_list):
    """Replace every quad with its axis-aligned bounding box grown by 3 px.

    Text and confidence are passed through unchanged; the new quad lists the
    corners clockwise starting at the top-left.
    """
    margin = 3

    def _padded_rect(quad):
        left, top, right, bottom = quad_bbox(quad)
        return [[left - margin, top - margin], [right + margin, top - margin],
                [right + margin, bottom + margin], [left - margin, bottom + margin]]

    return [(_padded_rect(quad), text, conf) for quad, text, conf in ocr_list]


# ============================================================
# VISION RE-READ
# ============================================================
def preprocess_variant(crop_bgr, mode):
    """Return a single-channel preprocessing of ``crop_bgr`` for OCR.

    Supported modes: ``raw``, ``clahe``, ``adaptive``, ``otsu``, ``invert``,
    ``bilateral`` and ``morph_open``. Any other mode yields the plain
    grayscale image.
    """
    gray = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2GRAY)

    def _otsu(src):
        return cv2.threshold(src, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

    def _soften(src):
        return cv2.GaussianBlur(src, (3, 3), 0)

    recipes = {
        "raw": lambda: gray,
        "clahe": lambda: cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)).apply(gray),
        "adaptive": lambda: cv2.adaptiveThreshold(
            _soften(gray), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 35, 11),
        "otsu": lambda: _otsu(_soften(gray)),
        "invert": lambda: 255 - gray,
        "bilateral": lambda: _otsu(cv2.bilateralFilter(gray, 7, 60, 60)),
        "morph_open": lambda: cv2.morphologyEx(
            _otsu(gray), cv2.MORPH_OPEN, np.ones((2, 2), np.uint8)),
    }
    recipe = recipes.get(mode)
    return gray if recipe is None else recipe()


def rotate_image_keep_bounds(img, angle_deg):
    """Rotate ``img`` about its centre, enlarging the canvas to fit the result.

    Args:
        img: image array (single-channel or multi-channel).
        angle_deg: rotation angle in degrees, as passed to
            ``cv2.getRotationMatrix2D``.

    Returns:
        The rotated image; the uncovered border is filled with white.
    """
    h, w = img.shape[:2]
    c = (w / 2, h / 2)
    M = cv2.getRotationMatrix2D(c, angle_deg, 1.0)
    cos, sin = abs(M[0, 0]), abs(M[0, 1])
    new_w = int((h * sin) + (w * cos))
    new_h = int((h * cos) + (w * sin))
    # Shift the transform so the rotated content is centred in the new canvas.
    M[0, 2] += (new_w / 2) - c[0]
    M[1, 2] += (new_h / 2) - c[1]
    # A scalar 255 only sets the first channel, which paints the border blue on
    # BGR input; give every channel explicitly so the fill is white.
    return cv2.warpAffine(img, M, (new_w, new_h), flags=cv2.INTER_CUBIC,
                          borderValue=(255, 255, 255, 255))


def rebuild_text_from_vision_result(res):
    """Reassemble OCR fragments into one normalized string in reading order.

    Fragments with blank text are ignored. The rest are grouped into rows by
    vertical centre (tolerance ``max(6, 0.75 * median_height)``), rows are
    read top to bottom and fragments within a row left to right. Returns an
    empty string when nothing usable is present.
    """
    if not res:
        return ""

    # Each fragment becomes (x_center, y_center, height, text).
    fragments = []
    for bbox, txt, conf in res:
        if not txt or not txt.strip():
            continue
        bb = quad_bbox(bbox)
        fragments.append(((bb[0] + bb[2]) / 2.0, (bb[1] + bb[3]) / 2.0,
                          max(1.0, bb[3] - bb[1]), txt))
    if not fragments:
        return ""

    median_height = float(np.median([frag[2] for frag in fragments]))
    row_tolerance = max(6.0, median_height * 0.75)
    fragments.sort(key=lambda frag: frag[1])

    rows = []  # each row is [running_mean_y, members]
    for frag in fragments:
        for row in rows:
            if abs(frag[1] - row[0]) <= row_tolerance:
                row[1].append(frag)
                row[0] = float(np.mean([member[1] for member in row[1]]))
                break
        else:
            rows.append([frag[1], [frag]])
    rows.sort(key=lambda row: row[0])

    line_texts = []
    for _, members in rows:
        left_to_right = sorted(members, key=lambda member: member[0])
        line_texts.append(normalize_text(" ".join(member[3] for member in left_to_right)))
    return normalize_text(" ".join(filter(None, line_texts)))


def reread_bubble_with_vision(image_bgr, bbox_xyxy, vision_detector, upscale=3.0, pad=24):
    """Re-run OCR on one padded, upscaled bubble crop and keep the best text.

    The crop is tried under seven preprocessing modes and three small
    rotations (0, +1.5, -1.5 degrees). Every candidate text is scored with
    ``ocr_candidate_score`` and the highest strictly positive score wins.
    ``vision_detector.run_vision_ocr`` is used when available, otherwise
    ``vision_detector.read``.

    Returns ``(text, score, "vision-reread")`` on success, or
    ``(None, 0.0, "none")`` when the crop is empty or nothing scored above 0.
    """
    img_h, img_w = image_bgr.shape[:2]
    left, top, right, bottom = bbox_xyxy
    left, top = max(0, int(left - pad)), max(0, int(top - pad))
    right, bottom = min(img_w, int(right + pad)), min(img_h, int(bottom + pad))
    crop = image_bgr[top:bottom, left:right]
    if crop.size == 0:
        return None, 0.0, "none"

    enlarged = cv2.resize(
        crop,
        (int(crop.shape[1] * upscale), int(crop.shape[0] * upscale)),
        interpolation=cv2.INTER_CUBIC,
    )

    if hasattr(vision_detector, 'run_vision_ocr'):
        recognize = vision_detector.run_vision_ocr
    else:
        recognize = vision_detector.read

    winner_text, winner_score = "", 0.0
    for mode in ("raw", "clahe", "adaptive", "otsu", "invert", "bilateral", "morph_open"):
        processed = preprocess_variant(enlarged, mode)
        if len(processed.shape) == 2:
            # The detector expects a 3-channel image.
            processed = cv2.cvtColor(processed, cv2.COLOR_GRAY2BGR)
        for angle in (0.0, 1.5, -1.5):
            rotated = rotate_image_keep_bounds(processed, angle)
            candidate = rebuild_text_from_vision_result(recognize(rotated))
            score = ocr_candidate_score(candidate)
            if score > winner_score:
                winner_text, winner_score = candidate, score

    if not winner_text:
        return None, 0.0, "none"
    return winner_text, winner_score, "vision-reread"


# ============================================================
# LINES + BUBBLES
# ============================================================
def build_lines_from_indices(indices, ocr):
    """Group OCR items into text rows and return one normalized string per row.

    Items are clustered by vertical centre (tolerance
    ``max(6, 0.75 * median_height)``); rows are ordered top to bottom and the
    items inside a row left to right. Returns an empty list for no indices.
    """
    if not indices:
        return []

    # Each entry is (ocr_index, x_center, y_center, height).
    entries = []
    for idx in indices:
        bb = quad_bbox(ocr[idx][0])
        entries.append((idx, (bb[0] + bb[2]) / 2.0, (bb[1] + bb[3]) / 2.0,
                        max(1.0, bb[3] - bb[1])))

    median_height = float(np.median([entry[3] for entry in entries])) if entries else 10.0
    row_tolerance = max(6.0, median_height * 0.75)
    entries.sort(key=lambda entry: entry[2])

    rows = []  # each row is [running_mean_y, members]
    for entry in entries:
        for row in rows:
            if abs(entry[2] - row[0]) <= row_tolerance:
                row[1].append(entry)
                row[0] = float(np.mean([member[2] for member in row[1]]))
                break
        else:
            rows.append([entry[2], [entry]])
    rows.sort(key=lambda row: row[0])

    lines = []
    for _, members in rows:
        if not members:
            continue
        left_to_right = sorted(members, key=lambda member: member[1])
        lines.append(normalize_text(" ".join(ocr[member[0]][1] for member in left_to_right)))
    return lines


def auto_gap(image_path, base=18, ref_w=750):
    """Scale ``base`` by the image width relative to ``ref_w``.

    Returns ``base`` unchanged when the image cannot be read.
    """
    image = cv2.imread(image_path)
    if image is None:
        return base
    width_ratio = image.shape[1] / ref_w
    return base * width_ratio


def group_tokens_vertical(ocr, image_shape, gap_px=18, bbox_padding=1, strict_mode=False):
    """
    Groups OCR quads into bubble candidates.

    Generic protections applied:
      - orientation_compatible(): prevents tall/narrow glyphs merging with wide text lines.
      - Horizontal gap guard: prevents side-by-side column quads from merging.
      - detect_horizontal_gap_in_group(): post-merge split for groups with large internal gaps.
      - Orientation check in secondary merge pass.

    Args:
        ocr: sequence of OCR entries; element 0 of each entry is the quad and
            element 1 the text (as consumed by build_lines_from_indices).
        image_shape: image shape; only the first two values (height, width)
            are read, to clamp the padded boxes.
        gap_px: not referenced in the body.
        bbox_padding: not referenced in the body.
        strict_mode: when true, the allowed vertical gap shrinks from
            2.5x to 2.0x the median quad height.

    Returns:
        (bubbles, bubble_boxes, bubble_quads, bubble_indices), four dicts
        keyed by a 1-based bubble id: text lines, padded xyxy box, member
        quads and member OCR indices. Four empty dicts when ocr is empty.
    """
    n = len(ocr)
    if n == 0: return {}, {}, {}, {}

    boxes   = [quad_bbox(r[0]) for r in ocr]
    centers = [quad_center(r[0]) for r in ocr]
    hs      = [max(1.0, b[3]-b[1]) for b in boxes]
    med_h   = float(np.median(hs)) if hs else 12.0

    # All distance thresholds are expressed relative to the median quad height.
    max_vertical_gap      = med_h * 2.5 if not strict_mode else med_h * 2.0
    max_horizontal_offset = med_h * 1.8

    # Visit quads top-to-bottom, then left-to-right, by centre.
    sorted_indices = sorted(range(n), key=lambda i: (centers[i][1], centers[i][0]))
    groups, used   = [], set()

    # Primary pass: grow a group downward from each unused seed quad.
    for i in sorted_indices:
        if i in used: continue
        current_group = [i]
        used.add(i)
        cx_i, cy_i = centers[i]

        for j in sorted_indices:
            if j in used or j == i: continue
            cx_j, cy_j = centers[j]
            # Only candidates whose centre lies strictly below the seed's centre.
            if cy_j <= cy_i: continue
            if abs(cx_i - cx_j) > max_horizontal_offset: continue

            # Horizontal gap guard
            gap_x = max(0, max(boxes[i][0], boxes[j][0]) - min(boxes[i][2], boxes[j][2]))
            if gap_x > med_h * 1.5: continue

            # Orientation compatibility guard
            if not orientation_compatible(i, j, ocr): continue

            # Gap is measured from the bottom of the most recently added member.
            vertical_gap = boxes[j][1] - boxes[current_group[-1]][3]
            if vertical_gap <= max_vertical_gap:
                current_group.append(j)
                used.add(j)
                # Running centre: midpoint of the previous value and the new member.
                cx_i = (cx_i + cx_j) / 2.0

        if current_group:
            groups.append(current_group)

    # Secondary merge pass
    merged_groups, used_groups = [], set()
    for i, group1 in enumerate(groups):
        if i in used_groups: continue
        merged = list(group1)
        used_groups.add(i)
        for j, group2 in enumerate(groups):
            if i == j or j in used_groups: continue
            # `merged` grows as groups are absorbed, so later comparisons use
            # the enlarged group.
            if should_merge_groups(merged, group2, ocr, med_h, max_vertical_gap):
                compat = all(orientation_compatible(a, b, ocr)
                             for a in merged for b in group2)
                if compat:
                    merged.extend(group2)
                    used_groups.add(j)
        merged_groups.append(sorted(merged, key=lambda idx: centers[idx][1]))

    # Horizontal gap split pass
    final_groups = []
    for group in merged_groups:
        h_split = detect_horizontal_gap_in_group(group, ocr, med_h, gap_factor=2.5)
        if h_split:
            lg, rg = h_split
            final_groups.append(sorted(lg, key=lambda idx: centers[idx][1]))
            final_groups.append(sorted(rg, key=lambda idx: centers[idx][1]))
        else:
            final_groups.append(group)

    # Order bubbles by their topmost member centre, then their leftmost one.
    final_groups.sort(key=lambda g: (min(centers[i][1] for i in g), min(centers[i][0] for i in g)))

    bubbles, bubble_boxes, bubble_quads, bubble_indices = {}, {}, {}, {}
    ih, iw = image_shape[:2]

    for bid, idxs in enumerate(final_groups, start=1):
        lines = build_lines_from_indices(idxs, ocr)
        quads = [ocr[k][0] for k in idxs]
        ub    = boxes_union_xyxy([quad_bbox(q) for q in quads])
        # A skipped group leaves a hole in the id sequence (ids come from enumerate).
        if ub is None: continue
        x1, y1, x2, y2 = ub
        # Pad the union box by ~16% of the median height (at least 1 px),
        # clamped to the image.
        ap = max(1, int(round(med_h * 0.16)))
        bubbles[bid]       = lines
        bubble_boxes[bid]  = (max(0,x1-ap), max(0,y1-ap), min(iw-1,x2+ap), min(ih-1,y2+ap))
        bubble_quads[bid]  = quads
        bubble_indices[bid]= idxs

    return bubbles, bubble_boxes, bubble_quads, bubble_indices


# ============================================================
# DEBUG / EXPORT
# ============================================================
def _wrap_debug_caption(text, width=25):
    """Greedily wrap *text* into lines for the debug overlay.

    A word is added to the current line while the line (including its
    trailing space) plus the word stays shorter than *width* characters.
    A single word that is itself *width* characters or longer gets a line
    of its own; no empty line is emitted in front of it.
    """
    wrapped, current = [], ""
    for word in text.split():
        if len(current) + len(word) < width:
            current += word + " "
            continue
        # Only flush when something was accumulated; otherwise an over-long
        # first word would produce a blank caption row.
        if current:
            wrapped.append(current.strip())
        current = word + " "
    if current:
        wrapped.append(current.strip())
    return wrapped


def save_debug_clusters(image_path, ocr, bubble_boxes, bubble_indices,
                        clean_lines=None, out_path="debug_clusters.png"):
    """Write an annotated copy of the page showing OCR quads and grouped boxes.

    Args:
        image_path: Image to load with ``cv2.imread``; nothing is written
            when it cannot be read.
        ocr: Iterable of ``(quad, text, confidence)`` triples. Each quad is
            filled with white and outlined in grey.
        bubble_boxes: Mapping of box id to ``(x1, y1, x2, y2)``.
        bubble_indices: Mapping of box id to the OCR indices in that box.
            A box with exactly one index is drawn thicker, in a different
            colour, and labelled ``(ISOLATED)``.
        clean_lines: Optional mapping of box id to its cleaned text, which
            is wrapped and drawn underneath the box.
        out_path: Destination passed to ``cv2.imwrite``.

    Returns:
        None.
    """
    img = cv2.imread(image_path)
    if img is None:
        return

    # Blank out every detected quad so the overlays are readable.
    for quad, _text, _conf in ocr:
        pts = np.array(quad, dtype=np.int32)
        cv2.fillPoly(img, [pts], (255, 255, 255))
        cv2.polylines(img, [pts], True, (180, 180, 180), 1)

    font = cv2.FONT_HERSHEY_SIMPLEX
    line_step = 18  # vertical distance between caption rows, in pixels

    for bid, (x1, y1, x2, y2) in bubble_boxes.items():
        is_isolated = len(bubble_indices.get(bid, [])) == 1
        color = (255, 165, 0) if is_isolated else (0, 220, 0)
        thickness = 3 if is_isolated else 2
        cv2.rectangle(img, (x1, y1), (x2, y2), color, thickness)

        label = f"BOX#{bid}" + (" (ISOLATED)" if is_isolated else "")
        cv2.putText(img, label, (x1 + 2, max(15, y1 + 16)), font, 0.5, color, 2)

        if not (clean_lines and bid in clean_lines):
            continue

        y_text = y2 + line_step
        for caption in _wrap_debug_caption(clean_lines[bid]):
            # Thick dark pass first, then a thin coloured pass on top, so
            # the caption stays legible over artwork.
            cv2.putText(img, caption, (x1, y_text), font, 0.5, (0, 0, 0), 3)
            cv2.putText(img, caption, (x1, y_text), font, 0.5, (255, 0, 0), 1)
            y_text += line_step

    cv2.imwrite(out_path, img)


def estimate_reading_order(bbox_dict, mode="ltr", row_tolerance=90):
    """Assign a 1-based reading rank to every box.

    Boxes are visited top to bottom by vertical centre and bucketed into
    rows: a box joins the first existing row whose running mean centre-y is
    within *row_tolerance* pixels, otherwise it starts a new row. Rows are
    then read top to bottom, and boxes inside a row by horizontal centre.

    Args:
        bbox_dict: Mapping of box id to ``(x1, y1, x2, y2)``.
        mode: ``"rtl"`` reads each row right-to-left; any other value reads
            left-to-right.
        row_tolerance: Maximum vertical distance, in pixels, between a box
            centre and a row centre for the box to join that row. The
            default of 90 keeps the previous fixed behaviour; scale it for
            pages of a very different resolution.

    Returns:
        Dict mapping each box id to its position in reading order
        (starting at 1).
    """
    centers = [
        (bid, (bb[0] + bb[2]) / 2.0, (bb[1] + bb[3]) / 2.0)
        for bid, bb in bbox_dict.items()
    ]
    centers.sort(key=lambda entry: entry[2])

    rows = []
    for entry in centers:
        for row in rows:
            if abs(entry[2] - row["cy"]) <= row_tolerance:
                row["items"].append(entry)
                # Keep the row centre at the mean of its members so later
                # boxes are compared against the whole row, not its first box.
                row["cy"] = float(np.mean([member[2] for member in row["items"]]))
                break
        else:
            rows.append({"cy": entry[2], "items": [entry]})

    rows.sort(key=lambda row: row["cy"])

    right_to_left = (mode == "rtl")
    order = []
    for row in rows:
        row["items"].sort(key=lambda member: member[1], reverse=right_to_left)
        order.extend(member[0] for member in row["items"])

    return {bid: rank for rank, bid in enumerate(order, start=1)}


# ============================================================
# MAIN PIPELINE
# ============================================================
def translate_manga_text(
    image_path="001-page.png",
    source_lang="en",
    target_lang="ca",
    confidence_threshold=0.03,
    min_text_length=1,
    gap_px="auto",
    quality_threshold=0.62,
    export_to_file="output.txt",
    export_bubbles_to="bubbles.json",
    reading_mode="ltr",
    debug=True,
    use_enhanced_ocr=True,
    strict_grouping=True,
    max_box_width_ratio=0.6,
    max_box_height_ratio=0.5,
    auto_fix_bubbles=True
):
    """Run the whole page pipeline: OCR, grouping, translation and export.

    Steps, in order: load the image, run the Vision detector, filter the
    detections, split and group quads into boxes, re-read low-scoring boxes,
    translate every box with ``GoogleTranslator``, then write a
    pipe-delimited text report and a JSON file.

    Args:
        image_path: Page image, read with ``cv2.imread``.
        source_lang: Source language; passed to the detector, to the
            language/meaningfulness filters and to ``GoogleTranslator``.
        target_lang: Target language passed to ``GoogleTranslator``.
        confidence_threshold: Detections with a lower confidence are skipped.
        min_text_length: Detections whose normalized text is shorter are
            skipped.
        gap_px: ``"auto"`` to use ``auto_gap(image_path)``; any other value
            is converted with ``float()``. Forwarded to
            ``group_tokens_vertical``.
        quality_threshold: Boxes whose ``ocr_candidate_score`` is below this
            value are re-read with ``reread_bubble_with_vision``.
        export_to_file: Path of the text report.
        export_bubbles_to: Path of the JSON export.
        reading_mode: Forwarded to ``estimate_reading_order`` as ``mode``.
        debug: When truthy, ``save_debug_clusters`` writes
            ``debug_clusters.png``.
        use_enhanced_ocr: Selects ``ImprovedMacVisionDetector`` and enables
            the missed-region scan; otherwise ``MacVisionDetector`` is used.
        strict_grouping: Forwarded to ``group_tokens_vertical`` as
            ``strict_mode``.
        max_box_width_ratio: Forwarded to ``enforce_max_box_size``.
        max_box_height_ratio: Forwarded to ``enforce_max_box_size``.
        auto_fix_bubbles: When truthy, ``auto_fix_bubble_detection`` is run
            on the initial grouping.

    Returns:
        None. The function also returns early, after printing a message,
        when the image cannot be loaded or no detection survives filtering.
    """
    image = cv2.imread(image_path)
    if image is None:
        print(f"❌ Cannot load image: {image_path}"); return

    resolved_gap = auto_gap(image_path) if gap_px == "auto" else float(gap_px)
    print("Loading OCR engines...")

    if use_enhanced_ocr:
        detector = ImprovedMacVisionDetector(source_lang=source_lang)
        print("🚀 Using Enhanced Multi-Pass OCR")
    else:
        detector = MacVisionDetector(source_lang=source_lang)

    print("Running detection OCR (Apple Vision)...")
    raw = detector.read(image_path)
    print(f"Raw detections: {len(raw)}")

    # Second OCR pass over regions the first pass may have missed: each
    # region is padded by 10 px, upscaled 4x and OCR'd on its own.
    if use_enhanced_ocr:
        existing_quads = [r[0] for r in raw]
        missed_regions = detect_small_text_regions(image, existing_quads)
        if missed_regions:
            print(f"🔍 Found {len(missed_regions)} potentially missed text regions")
            for region in missed_regions:
                x1, y1, x2, y2 = region
                pad = 10
                x1, y1 = max(0, x1-pad), max(0, y1-pad)
                x2, y2 = min(image.shape[1], x2+pad), min(image.shape[0], y2+pad)
                crop = image[y1:y2, x1:x2]
                if crop.size > 0:
                    upscaled = cv2.resize(crop, None, fx=4.0, fy=4.0,
                                          interpolation=cv2.INTER_CUBIC)
                    # Undo the 4x upscale and add the crop origin so the
                    # quad is expressed in full-page coordinates again.
                    for quad, text, conf in detector.run_vision_ocr(upscaled):
                        raw.append(([[int(p[0]/4.0+x1), int(p[1]/4.0+y1)] for p in quad],
                                    text, conf))
            print(f"📝 Total detections after missed region scan: {len(raw)}")

    filtered, skipped = [], 0
    ih, iw = image.shape[:2]

    # Keep only detections that pass every confidence / length / language test.
    for bbox, text, conf in raw:
        t  = normalize_text(text)
        qb = quad_bbox(bbox)
        if conf < confidence_threshold:           skipped += 1; continue
        if len(t) < min_text_length:              skipped += 1; continue
        if not is_valid_language(t, source_lang): skipped += 1; continue
        if not is_meaningful_text(t, source_lang):skipped += 1; continue
        # Text starting inside the top TOP_BAND_RATIO band of the page is
        # dropped when it is both low-confidence (< 0.70) and at least 5
        # characters long. NOTE(review): presumably targets page headers /
        # titles -- confirm.
        if qb[1] < int(ih * TOP_BAND_RATIO) and conf < 0.70 and len(t) >= 5:
            skipped += 1; continue
        filtered.append((bbox, t, conf))

    print(f"Kept: {len(filtered)} | Skipped: {skipped}")
    if not filtered:
        print("⚠️ No text after filtering."); return

    # ── Pre-grouping quad splits ──────────────────────────────────────────
    filtered, oversized_splits = validate_and_split_oversized_quads(image, filtered)
    if oversized_splits > 0:
        print(f"📐 Split {oversized_splits} oversized quad(s) before grouping")

    filtered, splits_made = split_wide_ocr_items(image, filtered)
    if splits_made > 0:
        print(f"✂️  Split {splits_made} wide OCR lines across column gaps.")

    filtered, bridge_splits = split_abnormal_bridge_quads(image, filtered)
    if bridge_splits > 0:
        print(f"🧩 Split {bridge_splits} abnormal bridge OCR quad(s).")

    # ── Column-gap split: catches BOX#6 type wide quads spanning two columns ──
    # Median quad height (fallback 14.0 when there are no quads) is the
    # scale handed to the column-gap splitter.
    hs_pre  = [max(1, quad_bbox(q)[3]-quad_bbox(q)[1]) for q, _, _ in filtered]
    med_h_pre = float(np.median(hs_pre)) if hs_pre else 14.0
    filtered, col_splits = apply_column_gap_splits(image, filtered, med_h_pre)
    if col_splits > 0:
        print(f"📐 Column-gap split: {col_splits} quad(s) split before grouping")

    filtered = normalize_ocr_quads(filtered)

    print("📊 Grouping quads vertically...")
    bubbles, bubble_boxes, bubble_quads, bubble_indices = group_tokens_vertical(
        filtered, image.shape, gap_px=resolved_gap, bbox_padding=1, strict_mode=strict_grouping
    )
    print(f"   Created {len(bubbles)} initial box(es)")

    if auto_fix_bubbles:
        bubbles, bubble_boxes, bubble_quads, bubble_indices = auto_fix_bubble_detection(
            bubble_boxes, bubble_indices, bubble_quads, bubbles, filtered, image
        )

    bubbles, bubble_boxes, bubble_quads, bubble_indices = enforce_max_box_size(
        bubble_boxes, bubble_indices, bubble_quads, bubbles, filtered,
        max_width_ratio=max_box_width_ratio,
        max_height_ratio=max_box_height_ratio,
        image_shape=image.shape
    )

    bubbles, bubble_boxes, bubble_quads, bubble_indices = merge_close_bubbles_by_line_height(
        bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered
    )

    # Post-grouping split pass: every box is offered to a sequence of split
    # heuristics and the first one that returns a split is applied. A box is
    # split at most once here (into two parts).
    new_bubbles, new_bubble_boxes, new_bubble_quads, new_bubble_indices = {}, {}, {}, {}
    next_bid         = max(bubbles.keys()) + 1 if bubbles else 1  # id for the 2nd half of a split
    splits_performed = []

    for bid in list(bubbles.keys()):
        box          = bubble_boxes[bid]
        bubble_split = None

        if is_vertical_text_like(bubble_indices[bid], filtered):
            vgap = split_cluster_by_big_vertical_gap(bubble_indices[bid], filtered,
                                                     factor=1.7, min_gap=18)
            if vgap:
                bubble_split = vgap
                splits_performed.append(f"BOX#{bid} (vertical-stack y-gap)")

        if bubble_split is None:
            sr = split_panel_box(image, box, bubble_quads=bubble_quads[bid])
            if sr:
                # Partition the box's quads by which side of split_x their
                # centre falls on.
                _, _, split_x = sr
                li = [idx for idx in bubble_indices[bid]
                      if quad_center(filtered[idx][0])[0] < split_x]
                ri = [idx for idx in bubble_indices[bid]
                      if quad_center(filtered[idx][0])[0] >= split_x]
                if li and ri:
                    bubble_split = (li, ri)
                    splits_performed.append(f"BOX#{bid} (panel border)")
                elif len(bubble_quads[bid]) >= 4:
                    # One side came out empty: fall back to the column
                    # splitter with its aggressive thresholds.
                    cs = split_bubble_if_multiple_columns(bubble_indices[bid], filtered,
                                                          bid=bid, use_aggressive_thresholds=True)
                    if cs:
                        bubble_split = cs
                        splits_performed.append(f"BOX#{bid} (aggressive column)")

        if bubble_split is None:
            cs = split_bubble_if_multiple_columns(bubble_indices[bid], filtered, bid=bid)
            if cs:
                bubble_split = cs
                splits_performed.append(f"BOX#{bid} (vertical column)")

        if bubble_split is None:
            ns = split_nested_or_side_by_side(bubble_indices[bid], filtered)
            if ns:
                bubble_split = ns
                splits_performed.append(f"BOX#{bid} (nested/side-by-side)")

        if bubble_split is None:
            rs = split_bubble_if_multiple_rows(bubble_indices[bid], filtered, bid=bid)
            if rs:
                bubble_split = rs
                splits_performed.append(f"BOX#{bid} (horizontal row)")

        if bubble_split is None:
            gy = split_cluster_by_big_vertical_gap(bubble_indices[bid], filtered,
                                                   factor=1.9, min_gap=22)
            if gy:
                bubble_split = gy
                splits_performed.append(f"BOX#{bid} (large vertical-gap)")

        if bubble_split:
            # The first part keeps the original id, the second part takes
            # next_bid. Each part's box is the union of its quads, padded by
            # 2 px and clamped to the image.
            p1, p2 = bubble_split
            for part_idxs, part_bid in [(p1, bid), (p2, next_bid)]:
                ub = boxes_union_xyxy([quad_bbox(filtered[i][0]) for i in part_idxs])
                new_bubbles[part_bid]        = build_lines_from_indices(part_idxs, filtered)
                new_bubble_boxes[part_bid]   = (max(0,ub[0]-2), max(0,ub[1]-2),
                                                min(iw-1,ub[2]+2), min(ih-1,ub[3]+2))
                new_bubble_quads[part_bid]   = [filtered[i][0] for i in part_idxs]
                new_bubble_indices[part_bid] = part_idxs
            next_bid += 1
        else:
            new_bubbles[bid]        = bubbles[bid]
            new_bubble_boxes[bid]   = bubble_boxes[bid]
            new_bubble_quads[bid]   = bubble_quads[bid]
            new_bubble_indices[bid] = bubble_indices[bid]

    if splits_performed:
        print(f"\n🔀 Splits detected: {len(splits_performed)}")

    bubbles, bubble_boxes, bubble_quads, bubble_indices = remove_nested_boxes(
        new_bubble_boxes, new_bubble_indices, new_bubble_quads, new_bubbles,
        overlap_threshold=0.50
    )
    print(f"✅ Final box count: {len(bubbles)}")

    # ── OCR quality pass ──────────────────────────────────────────────────
    translator    = GoogleTranslator(source=source_lang, target=target_lang)
    clean_lines:  Dict[int, str] = {}
    sources_used: Dict[int, str] = {}
    translations: Dict[int, str] = {}

    for bid, lines in bubbles.items():
        base_txt = normalize_text(" ".join(lines))
        base_sc  = ocr_candidate_score(base_txt)
        txt, src_used = base_txt, "vision-base"
        if base_sc < quality_threshold:
            rr_txt, rr_sc, rr_src = reread_bubble_with_vision(
                image, bubble_boxes[bid], detector, upscale=3.0, pad=24)
            # Accept the re-read only if it beats the base score by more
            # than 0.04 and still passes the source-language check.
            if rr_txt and rr_sc > base_sc + 0.04 and is_valid_language(rr_txt, source_lang):
                txt, src_used = rr_txt, rr_src
        clean_lines[bid]  = normalize_text(txt)
        sources_used[bid] = src_used

    reading_map = estimate_reading_order(bubble_boxes, mode=reading_mode)

    # ── Translate each box once; both exports below reuse `translations` ──
    for bid in sorted(clean_lines.keys(), key=lambda x: reading_map.get(x, x)):
        src_txt = clean_lines[bid].strip()
        if not src_txt: continue
        if not is_valid_language(src_txt, source_lang): continue
        if not is_meaningful_text(src_txt, source_lang): continue
        try:
            tgt = translator.translate(src_txt) or ""
            tgt = postprocess_translation_general(tgt).upper()
        except Exception as e:
            # Best effort: a failed translation is recorded as an
            # "[Error: ...]" string instead of aborting the page.
            tgt = f"[Error: {e}]"
        translations[bid] = tgt

    if debug:
        save_debug_clusters(image_path, filtered, bubble_boxes, bubble_indices,
                            clean_lines, "debug_clusters.png")

    # ── Text output ───────────────────────────────────────────────────────
    divider   = "─" * 120
    out_lines = ["BUBBLE|ORDER|OCR_SOURCE|ORIGINAL|TRANSLATED|FLAGS", divider]
    print(divider + f"\n{'BUBBLE':<8} {'ORDER':<6} {'SOURCE':<12} "
          f"{'ORIGINAL':<40} {'TRANSLATED':<40} FLAGS\n" + divider)

    # Same eligibility tests as the translation loop, so the report lists
    # exactly the boxes that were sent for translation.
    translated_count = 0
    for bid in sorted(clean_lines.keys(), key=lambda x: reading_map.get(x, x)):
        src_txt = clean_lines[bid].strip()
        if not src_txt: continue
        if not is_valid_language(src_txt, source_lang): continue
        if not is_meaningful_text(src_txt, source_lang): continue

        flags      = []
        tgt        = translations.get(bid, "")
        if not tgt: flags.append("NO_TRANSLATION")
        src_u      = src_txt.upper()
        src_engine = sources_used.get(bid, "unknown")

        # The file gets full text; the console row truncates both text
        # columns to 40 characters.
        out_lines.append(f"#{bid}|{reading_map.get(bid,bid)}|{src_engine}|{src_u}|{tgt}|"
                         f"{','.join(flags) if flags else '-'}")
        print(f"#{bid:<7} {reading_map.get(bid,bid):<6} {src_engine:<12} "
              f"{src_u[:40]:<40} {tgt[:40]:<40} {','.join(flags) if flags else '-'}")
        translated_count += 1

    out_lines.append(divider + f"\n✅ Done! {translated_count} bubble(s) translated.")
    with open(export_to_file, "w", encoding="utf-8") as f:
        f.write("\n".join(out_lines))

    # ── bubbles.json ──────────────────────────────────────────────────────
    # Keys are the box ids as strings; "box" is stored as x/y/width/height.
    bubbles_payload = {}
    for bid in sorted(clean_lines.keys(), key=lambda x: reading_map.get(x, x)):
        src_txt = clean_lines[bid].strip()
        if not src_txt: continue
        if not is_valid_language(src_txt, source_lang): continue
        if not is_meaningful_text(src_txt, source_lang): continue
        box = bubble_boxes.get(bid)
        tgt = translations.get(bid, "")
        bubbles_payload[str(bid)] = {
            "order":      reading_map.get(bid, bid),
            "ocr_source": sources_used.get(bid, "unknown"),
            "original":   src_txt.upper(),
            "translated": tgt,
            "box": {
                "x": box[0] if box else 0,
                "y": box[1] if box else 0,
                "w": (box[2]-box[0]) if box else 0,
                "h": (box[3]-box[1]) if box else 0,
            },
            "lines": [line.upper() for line in bubbles.get(bid, [])],
        }

    with open(export_bubbles_to, "w", encoding="utf-8") as f:
        json.dump(bubbles_payload, f, ensure_ascii=False, indent=2)

    print(divider + f"\nSaved: {export_to_file}\nSaved: {export_bubbles_to}")


# ============================================================
# ENTRY POINT
# ============================================================
if __name__ == "__main__":
    # Settings for a stand-alone run of the pipeline on a single page.
    # Every option is spelled out so the run does not silently change if the
    # defaults of translate_manga_text are edited.
    run_options = {
        # Input and languages
        "image_path": "17.jpg",
        "source_lang": "english",
        "target_lang": "ca",
        # OCR filtering and grouping
        "confidence_threshold": 0.03,
        "min_text_length": 1,
        "gap_px": "auto",
        "quality_threshold": 0.62,
        "use_enhanced_ocr": True,
        "strict_grouping": True,
        "max_box_width_ratio": 0.6,
        "max_box_height_ratio": 0.5,
        "auto_fix_bubbles": True,
        # Output
        "export_to_file": "output.txt",
        "export_bubbles_to": "bubbles.json",
        "reading_mode": "rtl",
        "debug": True,
    }
    translate_manga_text(**run_options)