From b6b0df47740b648b7fabc785e8d90f31ece7cba9 Mon Sep 17 00:00:00 2001 From: Guillem Hernandez Sola Date: Wed, 22 Apr 2026 10:51:57 +0200 Subject: [PATCH] Added stuff --- .gitignore | 4 + manga-translator.py | 2895 +++++++++++++++++++--------------------- pipeline-translator.py | 197 ++- requirements | 79 -- 4 files changed, 1543 insertions(+), 1632 deletions(-) delete mode 100644 requirements diff --git a/.gitignore b/.gitignore index 646c941..b40dbe3 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,10 @@ .venv311/ +Spy_x_Family_076/ + +Dandadan_059/ + # Icon must end with two \r Icon diff --git a/manga-translator.py b/manga-translator.py index 5610251..fc53f22 100644 --- a/manga-translator.py +++ b/manga-translator.py @@ -21,35 +21,8 @@ warnings.filterwarnings("ignore", category=UserWarning) # ============================================================ # CONFIG # ============================================================ -GLOSSARY = { - "ANYA": "ANYA", - "STARLIGHT ANYA": "STARLIGHT ANYA", - "MR. HENDERSON": "MR. HENDERSON", - "HENDERSON": "HENDERSON", - "STELLA STAR": "STELLA STAR", -} - -SOUND_EFFECT_PATTERNS = [ - r"^b+i+p+$", r"^sha+$", r"^ha+$", r"^ah+$", - r"^ugh+$", r"^bam+$", r"^pow+$", r"^boom+$", r"^bang+$", - r"^Grr+$", r"^grrp+$", r"^fshoo+$", r"^fwuip+$", - r"^crash+$", r"^thud+$", r"^zip+$", r"^swoosh+$", r"^chirp+$" -] - -TITLE_PATTERNS = [ - r"^(chapter|episode|vol\.?|volume)\s*\d+$", - r"^by\s+.+$", -] - -NOISE_PATTERNS = [ - r"^[^a-zA-Z0-9\?!.¡¿]+$", - r"^BOX[#\s0-9A-Z\-]*$", - r"^[0-9]{1,3}\s*[Xx]\s*[0-9]{1,3}$", -] - TOP_BAND_RATIO = 0.08 - # ============================================================ # HELPERS # ============================================================ @@ -66,14 +39,6 @@ def normalize_text(text: str) -> str: t = re.sub(r"\.{4,}", "...", t) return t.strip() - -def apply_glossary(text: str) -> str: - out = text or "" - for k in sorted(GLOSSARY.keys(), key=len, reverse=True): - out = re.sub(rf"\b{re.escape(k)}\b", GLOSSARY[k], out, flags=re.IGNORECASE) - return out - - def postprocess_translation_general(text: str) -> str: t = normalize_text(text) t = re.sub(r"\s{2,}", " ", t).strip() @@ -81,63 +46,124 @@ def postprocess_translation_general(text: str) -> str: t = re.sub(r"\.{4,}", "...", t) return t - def fix_common_ocr_errors(text: str) -> str: - """Fix common OCR mistakes in manga text""" result = text - - # Apply context-aware fixes - # Fix "O" to "0" only if surrounded by numbers result = re.sub(r'(\d)O(\d)', r'\g<1>0\g<2>', result) result = re.sub(r'(\d)O([^a-zA-Z])', r'\g<1>0\g<2>', result) - - # Fix common character confusions result = result.replace('|', 'I') result = result.replace('`', "'") - return result - -def is_sound_effect(text: str) -> bool: - cleaned = re.sub(r"[^a-z]", "", (text or "").strip().lower()) - return any(re.fullmatch(p, cleaned, re.IGNORECASE) for p in SOUND_EFFECT_PATTERNS) - - -def is_title_text(text: str) -> bool: - t = (text or "").strip().lower() - return any(re.fullmatch(p, t, re.IGNORECASE) for p in TITLE_PATTERNS) - - -def looks_like_box_tag(t: str) -> bool: - s = re.sub(r"[^A-Z0-9#]", "", (t or "").upper()) - if re.fullmatch(r"[BEF]?[O0D]X#?\d{0,3}", s): - return True - if re.fullmatch(r"B[O0D]X\d{0,3}", s): - return True - return False - - -def is_noise_text(text: str) -> bool: - t = (text or "").strip() - - if re.fullmatch(r"[\?\!\.]+", t): +def is_valid_language(text: str, source_lang: str) -> bool: + if not text: + return False + clean_text = re.sub(r'[^\w]', '', text) + if not clean_text: return False - if len(t) == 1 and t.isalpha(): + lang = source_lang.lower() + + if lang in ['en', 'english', 'es', 'spanish', 'fr', 'french', + 'it', 'italian', 'ca', 'catalan', 'de', 'german']: + foreign_chars = len(re.findall( + r'[\u0600-\u06FF\u0750-\u077F\u3040-\u30FF' + r'\u3400-\u4DBF\u4E00-\u9FFF\uAC00-\uD7AF\u1100-\u11FF]', + clean_text + )) + if foreign_chars > 0: + return False + latin_chars = len(re.findall(r'[a-zA-ZÀ-ÿ]', clean_text)) + total = len(clean_text) + if total <= 3: + return latin_chars >= 1 + if total <= 6: + return (latin_chars / total) >= 0.55 + return (latin_chars / total) >= 0.45 + + elif lang in ['ja', 'japanese']: + ja_chars = len(re.findall(r'[\u3040-\u30FF\u3400-\u4DBF\u4E00-\u9FFF]', clean_text)) + if len(clean_text) <= 3: + return ja_chars >= 1 + return (ja_chars / len(clean_text)) >= 0.4 + + elif lang in ['ko', 'korean']: + ko_chars = len(re.findall(r'[\uAC00-\uD7AF\u1100-\u11FF]', clean_text)) + if len(clean_text) <= 3: + return ko_chars >= 1 + return (ko_chars / len(clean_text)) >= 0.4 + + elif lang in ['zh', 'chinese']: + zh_chars = len(re.findall(r'[\u4E00-\u9FFF\u3400-\u4DBF]', clean_text)) + if len(clean_text) <= 3: + return zh_chars >= 1 + return (zh_chars / len(clean_text)) >= 0.4 + + return True + + +_NOISE_TOKENS = { + 'P', 'F', 'N', 'M', 'X', 'Z', 'Q', + 'FN', 'PF', 'NM', 'XZ', 'FSHOO', 'GRRP', +} + +_MANGA_INTERJECTIONS = { + 'HUH', 'HUH?', 'HUH??', 'HUH?!', + 'OH', 'OH!', 'OOH', 'OOH!', + 'AH', 'AH!', 'UH', 'UH...', + 'HEY', 'HEY!', + 'EH', 'EH?', + 'WOW', 'WOW!', + 'YES', 'NO', 'NO!', + 'RUN', 'GO', 'GO!', + 'STOP', 'WAIT', + 'WHAT', 'WHAT?', 'WHAT?!', + 'WHY', 'WHY?', + 'HOW', 'HOW?', + 'OK', 'OK!', 'OKAY', + 'EEEEP', 'EEEP', + 'OMIGOSH', + 'HMM', 'HMM...', + 'TSK', 'TCH', + 'GRRR','I','A', + 'FWUP', 'FWAP', + 'SHIVER', + 'RRRING', + 'MORNING', 'MORNING.', +} + +def is_meaningful_text(text: str, source_lang: str, min_alpha_chars: int = 2) -> bool: + if not text: + return False + t = text.strip() + t_upper = t.upper() + t_alpha_only = re.sub(r'[^A-Za-zÀ-ÿ]', '', t_upper) + if t_upper in _MANGA_INTERJECTIONS or t_alpha_only in _MANGA_INTERJECTIONS: + return True + + alpha_count = sum(c.isalpha() for c in t) + if alpha_count < min_alpha_chars: + return False + if t_upper in _NOISE_TOKENS: return False - if any(re.fullmatch(p, t) for p in NOISE_PATTERNS): - return True - if looks_like_box_tag(t): - return True + lang = source_lang.lower() + if lang in ['en', 'english', 'es', 'spanish', 'fr', 'french', + 'it', 'italian', 'ca', 'catalan', 'de', 'german']: + non_alpha = sum(not c.isalpha() for c in t) + if len(t) > 0 and (non_alpha / len(t)) > 0.60: + return False - if len(t) <= 2 and not re.search(r"[A-Z0-9\?\!\.]", t) and not t.isalpha(): - return True + if len(t) >= 3 and len(set(t_upper)) == 1: + return False - symbol_ratio = sum(1 for c in t if not c.isalnum() and not c.isspace()) / max(1, len(t)) - if len(t) <= 6 and symbol_ratio > 0.60: - return True - return False + if lang in ['en', 'english', 'es', 'spanish', 'fr', 'french', + 'it', 'italian', 'ca', 'catalan', 'de', 'german']: + if len(t) > 4: + vowels = len(re.findall(r'[AEIOUaeiouÀ-ÿ]', t)) + if vowels == 0: + return False + + return True def quad_bbox(quad): @@ -145,12 +171,10 @@ def quad_bbox(quad): ys = [p[1] for p in quad] return (int(min(xs)), int(min(ys)), int(max(xs)), int(max(ys))) - def quad_center(quad): x1, y1, x2, y2 = quad_bbox(quad) return ((x1 + x2) / 2.0, (y1 + y2) / 2.0) - def boxes_union_xyxy(boxes): boxes = [b for b in boxes if b is not None] if not boxes: @@ -162,20 +186,17 @@ def boxes_union_xyxy(boxes): int(max(b[3] for b in boxes)), ) - def bbox_area_xyxy(b): if b is None: return 0 return int(max(0, b[2] - b[0]) * max(0, b[3] - b[1])) - def xyxy_to_xywh(b): if b is None: return None x1, y1, x2, y2 = b return {"x": int(x1), "y": int(y1), "w": int(max(0, x2 - x1)), "h": int(max(0, y2 - y1))} - def overlap_or_near(a, b, gap=0): ax1, ay1, ax2, ay2 = a bx1, by1, bx2, by2 = b @@ -183,6 +204,35 @@ def overlap_or_near(a, b, gap=0): gap_y = max(0, max(ay1, by1) - min(ay2, by2)) return gap_x <= gap and gap_y <= gap +def boxes_iou(a, b): + """Intersection over Union for two xyxy boxes.""" + ax1, ay1, ax2, ay2 = a + bx1, by1, bx2, by2 = b + ix1 = max(ax1, bx1) + iy1 = max(ay1, by1) + ix2 = min(ax2, bx2) + iy2 = min(ay2, by2) + inter = max(0, ix2 - ix1) * max(0, iy2 - iy1) + if inter == 0: + return 0.0 + area_a = max(0, ax2 - ax1) * max(0, ay2 - ay1) + area_b = max(0, bx2 - bx1) * max(0, by2 - by1) + return inter / max(1, area_a + area_b - inter) + +def boxes_overlap_ratio(a, b): + """Ratio of intersection to the SMALLER box area.""" + ax1, ay1, ax2, ay2 = a + bx1, by1, bx2, by2 = b + ix1 = max(ax1, bx1) + iy1 = max(ay1, by1) + ix2 = min(ax2, bx2) + iy2 = min(ay2, by2) + inter = max(0, ix2 - ix1) * max(0, iy2 - iy1) + if inter == 0: + return 0.0 + area_a = max(0, ax2 - ax1) * max(0, ay2 - ay1) + area_b = max(0, bx2 - bx1) * max(0, by2 - by1) + return inter / max(1, min(area_a, area_b)) def ocr_candidate_score(text: str) -> float: if not text: @@ -191,94 +241,66 @@ def ocr_candidate_score(text: str) -> float: n = len(t) if n == 0: return 0.0 - alpha = sum(c.isalpha() for c in t) / n spaces = sum(c.isspace() for c in t) / n punct_ok = sum(c in ".,!?'-:;()[]\"¡¿" for c in t) / n bad = len(re.findall(r"[^\w\s\.\,\!\?\-\'\:\;\(\)\[\]\"¡¿]", t)) / n - penalty = 0.0 if re.search(r"\b[A-Z]\b", t): penalty += 0.05 if re.search(r"[0-9]{2,}", t): penalty += 0.08 - if re.search(r"(..)\1\1", t): - penalty += 0.08 - score = (0.62 * alpha) + (0.10 * spaces) + (0.20 * punct_ok) - (0.45 * bad) - penalty return max(0.0, min(1.0, score)) +def quad_is_horizontal(quad, ratio_threshold=1.5) -> bool: + x1, y1, x2, y2 = quad_bbox(quad) + w = max(1, x2 - x1) + h = max(1, y2 - y1) + return (w / h) >= ratio_threshold + +def quad_is_vertical(quad, ratio_threshold=1.5) -> bool: + x1, y1, x2, y2 = quad_bbox(quad) + w = max(1, x2 - x1) + h = max(1, y2 - y1) + return (h / w) >= ratio_threshold + # ============================================================ # ENHANCED IMAGE PREPROCESSING # ============================================================ def enhance_image_for_ocr(image_bgr, upscale_factor=2.5): - """Enhanced preprocessing for better OCR results""" - - # Upscale first h, w = image_bgr.shape[:2] - new_w = int(w * upscale_factor) - new_h = int(h * upscale_factor) - upscaled = cv2.resize(image_bgr, (new_w, new_h), interpolation=cv2.INTER_CUBIC) - - # Convert to grayscale + upscaled = cv2.resize(image_bgr, (int(w * upscale_factor), int(h * upscale_factor)), + interpolation=cv2.INTER_CUBIC) gray = cv2.cvtColor(upscaled, cv2.COLOR_BGR2GRAY) - - # Denoise denoised = cv2.fastNlMeansDenoising(gray, None, h=10, templateWindowSize=7, searchWindowSize=21) - - # Increase contrast with CLAHE clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8)) enhanced = clahe.apply(denoised) - - # Sharpen - kernel_sharpen = np.array([[-1,-1,-1], - [-1, 9,-1], - [-1,-1,-1]]) + kernel_sharpen = np.array([[-1,-1,-1], [-1, 9,-1], [-1,-1,-1]]) sharpened = cv2.filter2D(enhanced, -1, kernel_sharpen) - - # Adaptive thresholding for clean text binary = cv2.adaptiveThreshold(sharpened, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2) - - # Morphological operations to clean up kernel = np.ones((2, 2), np.uint8) cleaned = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel) - - # Convert back to BGR for Vision API return cv2.cvtColor(cleaned, cv2.COLOR_GRAY2BGR) - def detect_small_text_regions(image_bgr, existing_quads): - """Detect small text regions that might have been missed""" gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY) - - # Create mask of existing detections mask = np.zeros(gray.shape, dtype=np.uint8) for quad in existing_quads: pts = np.array(quad, dtype=np.int32) cv2.fillPoly(mask, [pts], 255) - - # Invert mask to find undetected regions mask_inv = cv2.bitwise_not(mask) - - # Find text-like regions _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU) binary_masked = cv2.bitwise_and(binary, binary, mask=mask_inv) - - # Find contours in undetected regions contours, _ = cv2.findContours(binary_masked, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - - # Filter for text-like contours text_regions = [] for contour in contours: x, y, w, h = cv2.boundingRect(contour) area = w * h - - # Filter by size and aspect ratio if 50 < area < 5000 and 0.1 < h/max(w, 1) < 10: text_regions.append((x, y, x+w, y+h)) - return text_regions @@ -286,281 +308,598 @@ def detect_small_text_regions(image_bgr, existing_quads): # SPEECH BUBBLE DETECTION # ============================================================ def detect_speech_bubbles(image_bgr: np.ndarray) -> List[np.ndarray]: - """Detect speech bubble contours for box splitting""" gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY) - - # Apply adaptive thresholding thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 2) - - # Find contours contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - - # Filter contours by area - bubble_contours = [] - for contour in contours: - area = cv2.contourArea(contour) - if area > 500: # Minimum bubble area - bubble_contours.append(contour) - - return bubble_contours + return [c for c in contours if cv2.contourArea(c) > 500] - -def is_quad_in_bubble(quad_bbox_xyxy: Tuple[int, int, int, int], - bubble_contour: np.ndarray, - tolerance: int = 5) -> bool: - """Check if a quad (text box) is inside a speech bubble""" +def is_quad_in_bubble(quad_bbox_xyxy, bubble_contour, tolerance=5): x1, y1, x2, y2 = quad_bbox_xyxy - cx = (x1 + x2) // 2 - cy = (y1 + y2) // 2 - - # Check if center point is inside contour - result = cv2.pointPolygonTest(bubble_contour, (float(cx), float(cy)), False) - - return result >= -tolerance + cx, cy = (x1 + x2) // 2, (y1 + y2) // 2 + return cv2.pointPolygonTest(bubble_contour, (float(cx), float(cy)), False) >= -tolerance - -def split_indices_by_bubble(indices: List[int], - ocr: List[Tuple], - bubble_contours: List[np.ndarray]) -> List[List[int]]: - """Split indices into groups based on bubble membership""" +def split_indices_by_bubble(indices, ocr, bubble_contours): if not indices: return [] - - # Group indices by which bubble they belong to bubble_groups = {} outside_group = [] - for idx in indices: bbox = quad_bbox(ocr[idx][0]) - found_bubble = False - - for bubble_idx, bubble in enumerate(bubble_contours): + found = False + for bidx, bubble in enumerate(bubble_contours): if is_quad_in_bubble(bbox, bubble): - if bubble_idx not in bubble_groups: - bubble_groups[bubble_idx] = [] - bubble_groups[bubble_idx].append(idx) - found_bubble = True + bubble_groups.setdefault(bidx, []).append(idx) + found = True break - - if not found_bubble: + if not found: outside_group.append(idx) - - # Create result list result = list(bubble_groups.values()) - - # Add outside quads as separate groups if outside_group: result.append(outside_group) - return result - -def check_vertical_alignment_split(indices: List[int], - ocr: List[Tuple], - threshold: int = 20) -> List[List[int]]: - """Split indices that are vertically separated""" +def check_vertical_alignment_split(indices, ocr, threshold=20): if len(indices) <= 1: return [indices] - - # Sort by y-coordinate - items = [(idx, quad_bbox(ocr[idx][0])) for idx in indices] - items.sort(key=lambda x: x[1][1]) - - groups = [] - current_group = [items[0][0]] - + items = sorted([(idx, quad_bbox(ocr[idx][0])) for idx in indices], key=lambda x: x[1][1]) + groups, current_group = [], [items[0][0]] for i in range(1, len(items)): - prev_bbox = items[i-1][1] - curr_bbox = items[i][1] - - # Check vertical gap - gap = curr_bbox[1] - prev_bbox[3] - - if gap > threshold: - # Start new group + if items[i][1][1] - items[i-1][1][3] > threshold: groups.append(current_group) current_group = [items[i][0]] else: current_group.append(items[i][0]) - if current_group: groups.append(current_group) - return groups # ============================================================ -# BOX FIXING FUNCTIONS +# QUAD SIZE VALIDATION AND SPLITTING # ============================================================ -def apply_page_specific_fixes(bubbles: Dict[int, List[str]], - bubble_boxes: Dict[int, Tuple], - bubble_quads: Dict[int, List], - bubble_indices: Dict[int, List[int]], - ocr: List[Tuple], - image_bgr: np.ndarray, - page_identifier: str) -> Tuple[Dict, Dict, Dict, Dict]: - """Apply page-specific fixes to bubble detection issues""" - - # Detect speech bubbles for splitting logic +def is_quad_oversized(quad, median_height, width_threshold=8.0): + x1, y1, x2, y2 = quad_bbox(quad) + w, h = x2 - x1, max(1, y2 - y1) + return w > median_height * width_threshold or w / h > 12.0 + +def split_oversized_quad_by_content(image_bgr, quad, text, conf, median_height): + x1, y1, x2, y2 = quad_bbox(quad) + w, h = x2 - x1, max(1, y2 - y1) + pad = 2 + roi = image_bgr[max(0,y1-pad):min(image_bgr.shape[0],y2+pad), + max(0,x1):min(image_bgr.shape[1],x2)] + if roi.size == 0: + return [(quad, text, conf)] + gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY) + _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU) + v_proj = np.sum(binary, axis=0) + gap_threshold = h * 255 * 0.20 + gaps, in_gap, gap_start = [], False, 0 + for x in range(len(v_proj)): + if v_proj[x] < gap_threshold: + if not in_gap: gap_start, in_gap = x, True + else: + if in_gap: + gw = x - gap_start + if gw >= max(int(median_height * 0.8), 15): + gaps.append((gap_start + gw // 2, gw)) + in_gap = False + if not gaps: + return [(quad, text, conf)] + gaps.sort(key=lambda g: g[1], reverse=True) + split_x_abs = max(0, x1) + gaps[0][0] + if ' ' in text: + char_w = w / max(1, len(text)) + split_idx = int((split_x_abs - x1) / max(1e-6, char_w)) + spaces = [i for i, c in enumerate(text) if c == ' '] + if spaces: + split_idx = min(spaces, key=lambda i: abs(i - split_idx)) + tl, tr = text[:split_idx].strip(), text[split_idx:].strip() + else: + split_idx = int(len(text) * (split_x_abs - x1) / w) + tl, tr = text[:split_idx].strip(), text[split_idx:].strip() + if tl and tr: + return [([[x1,y1],[split_x_abs,y1],[split_x_abs,y2],[x1,y2]], tl, conf), + ([[split_x_abs,y1],[x2,y1],[x2,y2],[split_x_abs,y2]], tr, conf)] + return [(quad, text, conf)] + +def validate_and_split_oversized_quads(image_bgr, filtered_ocr): + if not filtered_ocr: + return filtered_ocr, 0 + heights = [max(1, quad_bbox(q)[3] - quad_bbox(q)[1]) for q, _, _ in filtered_ocr] + median_height = float(np.median(heights)) if heights else 14.0 + result, splits_made = [], 0 + for quad, text, conf in filtered_ocr: + if is_quad_oversized(quad, median_height, 8.0): + sr = split_oversized_quad_by_content(image_bgr, quad, text, conf, median_height) + if len(sr) > 1: + result.extend(sr); splits_made += 1 + else: + result.append((quad, text, conf)) + else: + result.append((quad, text, conf)) + return result, splits_made + + +# ============================================================ +# HORIZONTAL GAP DETECTION AT QUAD LEVEL +# ============================================================ +def detect_horizontal_gap_in_group(indices, ocr, med_h, gap_factor=2.5): + """ + Detects a large horizontal gap between quads within a group and splits them. + Fixes cases like BOX#8 in debug_clusters_016 where two column groups + are incorrectly merged into one box. + """ + if len(indices) < 2: + return None + items = sorted(indices, key=lambda i: quad_center(ocr[i][0])[0]) + boxes = [quad_bbox(ocr[i][0]) for i in items] + gap_threshold = med_h * gap_factor + best_gap, best_split = 0.0, None + for k in range(len(items) - 1): + gap = boxes[k + 1][0] - boxes[k][2] + if gap > gap_threshold and gap > best_gap: + best_gap, best_split = gap, k + if best_split is None: + return None + left_group = [items[i] for i in range(best_split + 1)] + right_group = [items[i] for i in range(best_split + 1, len(items))] + if not left_group or not right_group: + return None + return (left_group, right_group) + + +def orientation_compatible(idx_a, idx_b, ocr): + """ + Prevents merging a tall/narrow isolated glyph with wide horizontal text lines. + Fixes BOX#1 type problems in debug_clusters_015. + """ + ba = quad_bbox(ocr[idx_a][0]) + bb = quad_bbox(ocr[idx_b][0]) + wa, ha = max(1, ba[2]-ba[0]), max(1, ba[3]-ba[1]) + wb, hb = max(1, bb[2]-bb[0]), max(1, bb[3]-bb[1]) + ra, rb = wa/ha, wb/hb + if (ra < 0.6 and rb > 2.0) or (rb < 0.6 and ra > 2.0): + return False + return True + + +# ============================================================ +# WIDE QUAD COLUMN SPLIT — pre-grouping +# ============================================================ +def split_wide_quad_by_column_gap(image_bgr, quad, text, conf, med_h, + min_gap_factor=1.8): + """ + FIX for BOX#6 type problem: + Splits a single OCR quad that spans two distinct text columns by finding + the largest vertical gap in its pixel projection. More aggressive than + split_oversized_quad_by_content — targets column-level gaps specifically. + """ + x1, y1, x2, y2 = quad_bbox(quad) + w, h = x2 - x1, max(1, y2 - y1) + + # Only attempt if the quad is wide enough to plausibly span two columns + if w < med_h * 3.0: + return [(quad, text, conf)] + + pad = 2 + roi = image_bgr[max(0, y1-pad):min(image_bgr.shape[0], y2+pad), + max(0, x1):min(image_bgr.shape[1], x2)] + if roi.size == 0: + return [(quad, text, conf)] + + gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY) + _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU) + v_proj = np.sum(binary, axis=0) + + # Threshold: column gap must be nearly empty + gap_threshold = h * 255 * 0.12 + min_gap_px = max(int(med_h * min_gap_factor), 10) + + gaps, in_gap, gap_start = [], False, 0 + for x in range(len(v_proj)): + if v_proj[x] < gap_threshold: + if not in_gap: gap_start, in_gap = x, True + else: + if in_gap: + gw = x - gap_start + if gw >= min_gap_px: + gaps.append((gap_start + gw // 2, gw)) + in_gap = False + + if not gaps: + return [(quad, text, conf)] + + # Use the widest gap as the split point + gaps.sort(key=lambda g: g[1], reverse=True) + split_x_rel = gaps[0][0] + split_x_abs = x1 + split_x_rel + + # Ensure the split produces two non-trivial halves + if split_x_abs - x1 < med_h or x2 - split_x_abs < med_h: + return [(quad, text, conf)] + + if ' ' in text: + char_w = w / max(1, len(text)) + split_idx = int(split_x_rel / max(1e-6, char_w)) + spaces = [i for i, c in enumerate(text) if c == ' '] + if spaces: + split_idx = min(spaces, key=lambda i: abs(i - split_idx)) + tl, tr = text[:split_idx].strip(), text[split_idx:].strip() + else: + split_idx = int(len(text) * split_x_rel / w) + tl, tr = text[:split_idx].strip(), text[split_idx:].strip() + + if tl and tr: + return [([[x1,y1],[split_x_abs,y1],[split_x_abs,y2],[x1,y2]], tl, conf), + ([[split_x_abs,y1],[x2,y1],[x2,y2],[split_x_abs,y2]], tr, conf)] + return [(quad, text, conf)] + + +def apply_column_gap_splits(image_bgr, ocr_list, med_h): + """ + Applies split_wide_quad_by_column_gap to every quad in the list. + Run this BEFORE grouping so column-spanning quads never seed bad groups. + """ + result, splits_made = [], 0 + for quad, text, conf in ocr_list: + parts = split_wide_quad_by_column_gap(image_bgr, quad, text, conf, med_h) + if len(parts) > 1: + splits_made += 1 + result.extend(parts) + if splits_made: + print(f"📐 Column-gap split: {splits_made} wide quad(s) split before grouping") + return result, splits_made + + +# ============================================================ +# GENERALIZED BOX FIXING FUNCTIONS +# ============================================================ +def detect_and_split_multi_bubble_boxes(bubble_boxes, bubble_indices, bubble_quads, + bubbles, ocr, image_bgr): + all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1]) for i in range(len(ocr))] + med_h = float(np.median(all_h)) if all_h else 14.0 bubble_contours = detect_speech_bubbles(image_bgr) - - fixes_applied = [] - - # PAGE 15 FIXES - if "15" in page_identifier: - # Fix: Merge Box 12 and Box 16 into one box - if 12 in bubbles and 16 in bubbles: - # Merge indices - merged_indices = sorted(set(bubble_indices[12] + bubble_indices[16])) - - # Rebuild merged box - bubbles[12] = build_lines_from_indices(merged_indices, ocr) - bubble_boxes[12] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in merged_indices]) - bubble_quads[12] = [ocr[i][0] for i in merged_indices] - bubble_indices[12] = merged_indices - - # Remove box 16 - del bubbles[16] - del bubble_boxes[16] - del bubble_quads[16] - del bubble_indices[16] - - fixes_applied.append("Page 15: Merged BOX#12 and BOX#16") - - # PAGE 16 FIXES - if "16" in page_identifier: - next_bid = max(bubbles.keys()) + 1 if bubbles else 100 - - # Fix Box 15: Split quads outside bubble - if 15 in bubbles: - split_groups = split_indices_by_bubble(bubble_indices[15], ocr, bubble_contours) - - if len(split_groups) > 1: - # Keep main group in BOX#15 - bubbles[15] = build_lines_from_indices(split_groups[0], ocr) - bubble_boxes[15] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in split_groups[0]]) - bubble_quads[15] = [ocr[i][0] for i in split_groups[0]] - bubble_indices[15] = split_groups[0] - - # Create new boxes for other groups - for group in split_groups[1:]: - bubbles[next_bid] = build_lines_from_indices(group, ocr) - bubble_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in group]) - bubble_quads[next_bid] = [ocr[i][0] for i in group] - bubble_indices[next_bid] = group - next_bid += 1 - - fixes_applied.append(f"Page 16: Split BOX#15 into {len(split_groups)} parts") - - # Fix Box 8: Split bubble vs outside quads - if 8 in bubbles: - split_groups = split_indices_by_bubble(bubble_indices[8], ocr, bubble_contours) - - if len(split_groups) > 1: - bubbles[8] = build_lines_from_indices(split_groups[0], ocr) - bubble_boxes[8] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in split_groups[0]]) - bubble_quads[8] = [ocr[i][0] for i in split_groups[0]] - bubble_indices[8] = split_groups[0] - - for group in split_groups[1:]: - bubbles[next_bid] = build_lines_from_indices(group, ocr) - bubble_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in group]) - bubble_quads[next_bid] = [ocr[i][0] for i in group] - bubble_indices[next_bid] = group - next_bid += 1 - - fixes_applied.append(f"Page 16: Split BOX#8 into {len(split_groups)} parts") - - # Fix Box 18: Split into 2 separate boxes - if 18 in bubbles: - # Try bubble-based split first - split_groups = split_indices_by_bubble(bubble_indices[18], ocr, bubble_contours) - - if len(split_groups) == 1: - # If bubble detection doesn't work, try vertical alignment - split_groups = check_vertical_alignment_split(bubble_indices[18], ocr, threshold=30) - - if len(split_groups) > 1: - bubbles[18] = build_lines_from_indices(split_groups[0], ocr) - bubble_boxes[18] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in split_groups[0]]) - bubble_quads[18] = [ocr[i][0] for i in split_groups[0]] - bubble_indices[18] = split_groups[0] - - for group in split_groups[1:]: - bubbles[next_bid] = build_lines_from_indices(group, ocr) - bubble_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in group]) - bubble_quads[next_bid] = [ocr[i][0] for i in group] - bubble_indices[next_bid] = group - next_bid += 1 - - fixes_applied.append(f"Page 16: Split BOX#18 into {len(split_groups)} parts") - - # PAGE 19 FIXES - if "19" in page_identifier: - next_bid = max(bubbles.keys()) + 1 if bubbles else 100 - - # Fix Box 5: Split into 4 different boxes - if 5 in bubbles: - # First split by bubble - split_groups = split_indices_by_bubble(bubble_indices[5], ocr, bubble_contours) - - # Then split each group by vertical alignment - final_groups = [] + + new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {} + next_bid = 1 + splits_made = [] + + for bid, indices in bubble_indices.items(): + if len(indices) < 2: + new_bubbles[next_bid] = bubbles[bid] + new_boxes[next_bid] = bubble_boxes[bid] + new_quads[next_bid] = bubble_quads[bid] + new_indices[next_bid] = indices + next_bid += 1 + continue + + split_groups = split_indices_by_bubble(indices, ocr, bubble_contours) + if len(split_groups) > 1: for group in split_groups: - vertical_splits = check_vertical_alignment_split(group, ocr, threshold=25) - final_groups.extend(vertical_splits) - - if len(final_groups) > 1: - bubbles[5] = build_lines_from_indices(final_groups[0], ocr) - bubble_boxes[5] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in final_groups[0]]) - bubble_quads[5] = [ocr[i][0] for i in final_groups[0]] - bubble_indices[5] = final_groups[0] - - for group in final_groups[1:]: - bubbles[next_bid] = build_lines_from_indices(group, ocr) - bubble_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in group]) - bubble_quads[next_bid] = [ocr[i][0] for i in group] - bubble_indices[next_bid] = group + if group: + new_bubbles[next_bid] = build_lines_from_indices(group, ocr) + new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in group]) + new_quads[next_bid] = [ocr[i][0] for i in group] + new_indices[next_bid] = group next_bid += 1 - - fixes_applied.append(f"Page 19: Split BOX#5 into {len(final_groups)} parts") - - # Fix Box 11: Split into 2 boxes - if 11 in bubbles: - split_groups = split_indices_by_bubble(bubble_indices[11], ocr, bubble_contours) - - if len(split_groups) > 1: - bubbles[11] = build_lines_from_indices(split_groups[0], ocr) - bubble_boxes[11] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in split_groups[0]]) - bubble_quads[11] = [ocr[i][0] for i in split_groups[0]] - bubble_indices[11] = split_groups[0] - - for group in split_groups[1:]: - bubbles[next_bid] = build_lines_from_indices(group, ocr) - bubble_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in group]) - bubble_quads[next_bid] = [ocr[i][0] for i in group] - bubble_indices[next_bid] = group + splits_made.append(f"BOX#{bid} → {len(split_groups)} bubbles") + continue + + vertical_splits = check_vertical_alignment_split(indices, ocr, threshold=int(med_h * 2.0)) + if len(vertical_splits) > 1: + for group in vertical_splits: + if group: + new_bubbles[next_bid] = build_lines_from_indices(group, ocr) + new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in group]) + new_quads[next_bid] = [ocr[i][0] for i in group] + new_indices[next_bid] = group next_bid += 1 - - fixes_applied.append(f"Page 19: Split BOX#11 into {len(split_groups)} parts") - - # Print fixes applied - if fixes_applied: - print(f"\n🔧 Page-specific fixes applied:") - for fix in fixes_applied: - print(f" ✓ {fix}") - + splits_made.append(f"BOX#{bid} → {len(vertical_splits)} vertical groups") + continue + + box = bubble_boxes[bid] + x1, y1, x2, y2 = box + if (x2 - x1) > med_h * 10: + x_centers = [quad_center(ocr[i][0])[0] for i in indices] + x_median = np.median(x_centers) + left_group = [i for i in indices if quad_center(ocr[i][0])[0] < x_median] + right_group = [i for i in indices if quad_center(ocr[i][0])[0] >= x_median] + if left_group and right_group: + left_box = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in left_group]) + right_box = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in right_group]) + if right_box[0] - left_box[2] > med_h * 1.5: + for grp in [left_group, right_group]: + new_bubbles[next_bid] = build_lines_from_indices(grp, ocr) + new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in grp]) + new_quads[next_bid] = [ocr[i][0] for i in grp] + new_indices[next_bid] = grp + next_bid += 1 + splits_made.append(f"BOX#{bid} → 2 horizontal panels") + continue + + new_bubbles[next_bid] = bubbles[bid] + new_boxes[next_bid] = bubble_boxes[bid] + new_quads[next_bid] = bubble_quads[bid] + new_indices[next_bid] = indices + next_bid += 1 + + if splits_made: + print(f"\n🔧 Split {len(splits_made)} multi-bubble box(es):") + for s in splits_made: print(f" ✓ {s}") + + return new_bubbles, new_boxes, new_quads, new_indices + + +def detect_and_merge_fragmented_bubbles(bubble_boxes, bubble_indices, bubble_quads, + bubbles, ocr, image_bgr): + all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1]) for i in range(len(ocr))] + med_h = float(np.median(all_h)) if all_h else 14.0 + bubble_contours = detect_speech_bubbles(image_bgr) + bids = list(bubble_boxes.keys()) + to_merge = [] + + for i in range(len(bids)): + for j in range(i + 1, len(bids)): + bid_i, bid_j = bids[i], bids[j] + box_i, box_j = bubble_boxes[bid_i], bubble_boxes[bid_j] + cx_i = (box_i[0] + box_i[2]) / 2.0 + cy_i = (box_i[1] + box_i[3]) / 2.0 + cx_j = (box_j[0] + box_j[2]) / 2.0 + cy_j = (box_j[1] + box_j[3]) / 2.0 + + in_same_bubble = any( + cv2.pointPolygonTest(c, (cx_i, cy_i), False) >= 0 and + cv2.pointPolygonTest(c, (cx_j, cy_j), False) >= 0 + for c in bubble_contours + ) + + if in_same_bubble: + if abs(cx_i - cx_j) < med_h * 3.0 and abs(cy_i - cy_j) < med_h * 6.0: + to_merge.append((bid_i, bid_j) if cy_i < cy_j else (bid_j, bid_i)) + + if not to_merge: + return bubbles, bubble_boxes, bubble_quads, bubble_indices + + print(f"\n🔗 Merging {len(to_merge)} fragmented bubble(s):") + merge_groups = {} + for top, bottom in to_merge: + found = False + for key in merge_groups: + if top in merge_groups[key] or bottom in merge_groups[key]: + merge_groups[key].update({top, bottom}) + found = True + break + if not found: + merge_groups[len(merge_groups)] = {top, bottom} + + new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {} + merged_bids, next_bid = set(), 1 + + for merge_set in merge_groups.values(): + merge_list = sorted(merge_set) + print(f" ✓ Merging: {', '.join(f'#{b}' for b in merge_list)}") + all_indices = sorted(set(idx for b in merge_list for idx in bubble_indices[b])) + for b in merge_list: merged_bids.add(b) + new_bubbles[next_bid] = build_lines_from_indices(all_indices, ocr) + new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in all_indices]) + new_quads[next_bid] = [ocr[i][0] for i in all_indices] + new_indices[next_bid] = all_indices + next_bid += 1 + + for bid in bids: + if bid not in merged_bids: + new_bubbles[next_bid] = bubbles[bid] + new_boxes[next_bid] = bubble_boxes[bid] + new_quads[next_bid] = bubble_quads[bid] + new_indices[next_bid] = bubble_indices[bid] + next_bid += 1 + + return new_bubbles, new_boxes, new_quads, new_indices + + +def merge_boxes_by_proximity_and_overlap(bubble_boxes, bubble_indices, bubble_quads, + bubbles, ocr, med_h): + """ + FIX for BOX#2+BOX#14 and BOX#7+BOX#18 type problems: + Merges boxes whose bounding rectangles are very close vertically AND + share significant horizontal overlap — indicating they belong to the + same speech bubble that the contour detector missed (e.g. dashed outlines). + + Unlike merge_close_bubbles_by_line_height, this checks BOTH axes strictly + to avoid merging boxes from adjacent but distinct bubbles. + """ + bids = sorted(bubble_boxes.keys()) + merge_map: Dict[int, List[int]] = {} + merged_into: Dict[int, int] = {} + + for i, bid_i in enumerate(bids): + if bid_i in merged_into: + continue + box_i = bubble_boxes[bid_i] + wi = box_i[2] - box_i[0] + + for j in range(i + 1, len(bids)): + bid_j = bids[j] + if bid_j in merged_into: + continue + box_j = bubble_boxes[bid_j] + wj = box_j[2] - box_j[0] + + # Vertical gap between the two boxes + vert_gap = max(0, max(box_i[1], box_j[1]) - min(box_i[3], box_j[3])) + + # Horizontal overlap ratio (intersection / min width) + h_ix1 = max(box_i[0], box_j[0]) + h_ix2 = min(box_i[2], box_j[2]) + h_overlap = max(0, h_ix2 - h_ix1) + h_overlap_ratio = h_overlap / max(1, min(wi, wj)) + + # Merge only when: + # 1. Vertical gap is small (boxes are stacked closely) + # 2. Horizontal overlap is significant (same column) + if vert_gap <= med_h * 1.5 and h_overlap_ratio >= 0.35: + root = merged_into.get(bid_i, bid_i) + merge_map.setdefault(root, [root]) + if bid_j not in merge_map[root]: + merge_map[root].append(bid_j) + merged_into[bid_j] = root + + if not merge_map: + return bubbles, bubble_boxes, bubble_quads, bubble_indices + + print(f"\n🔀 Proximity+overlap merge: {len(merge_map)} group(s):") + new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {} + processed, next_bid = set(), 1 + + for root, group in merge_map.items(): + group_unique = sorted(set(group)) + print(f" ✓ Merging: {', '.join(f'#{b}' for b in group_unique)}") + all_indices = sorted(set(idx for b in group_unique for idx in bubble_indices[b])) + new_bubbles[next_bid] = build_lines_from_indices(all_indices, ocr) + new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in all_indices]) + new_quads[next_bid] = [ocr[i][0] for i in all_indices] + new_indices[next_bid] = all_indices + next_bid += 1 + processed.update(group_unique) + + for bid in bids: + if bid not in processed: + new_bubbles[next_bid] = bubbles[bid] + new_boxes[next_bid] = bubble_boxes[bid] + new_quads[next_bid] = bubble_quads[bid] + new_indices[next_bid] = bubble_indices[bid] + next_bid += 1 + + return new_bubbles, new_boxes, new_quads, new_indices + + +def auto_fix_bubble_detection(bubble_boxes, bubble_indices, bubble_quads, + bubbles, ocr, image_bgr): + print("\n🔍 Running automatic bubble detection fixes...") + all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1]) for i in range(len(ocr))] + med_h = float(np.median(all_h)) if all_h else 14.0 + + bubbles, bubble_boxes, bubble_quads, bubble_indices = detect_and_split_multi_bubble_boxes( + bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr, image_bgr) + bubbles, bubble_boxes, bubble_quads, bubble_indices = detect_and_merge_fragmented_bubbles( + bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr, image_bgr) + # Second pass: catch fragments missed by contour detection (dashed bubbles, etc.) + bubbles, bubble_boxes, bubble_quads, bubble_indices = merge_boxes_by_proximity_and_overlap( + bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr, med_h) return bubbles, bubble_boxes, bubble_quads, bubble_indices +def remove_nested_boxes(bubble_boxes, bubble_indices, bubble_quads, bubbles, + overlap_threshold=0.50): + bids = list(bubble_boxes.keys()) + to_remove = set() + + for i in range(len(bids)): + bid_i = bids[i] + if bid_i in to_remove: continue + box_i = bubble_boxes[bid_i] + area_i = max(0, box_i[2]-box_i[0]) * max(0, box_i[3]-box_i[1]) + + for j in range(i + 1, len(bids)): + bid_j = bids[j] + if bid_j in to_remove: continue + box_j = bubble_boxes[bid_j] + area_j = max(0, box_j[2]-box_j[0]) * max(0, box_j[3]-box_j[1]) + + shared = set(bubble_indices[bid_i]).intersection(bubble_indices[bid_j]) + overlap = boxes_overlap_ratio(box_i, box_j) + + if overlap > overlap_threshold or len(shared) > 0: + if area_i >= area_j: + to_remove.add(bid_j) + print(f" 🗑️ Removing BOX#{bid_j} (overlaps BOX#{bid_i})") + else: + to_remove.add(bid_i) + print(f" 🗑️ Removing BOX#{bid_i} (overlaps BOX#{bid_j})") + break + + if to_remove: + print(f"\n🧹 Removed {len(to_remove)} overlapping/nested box(es)") + for bid in to_remove: + bubble_boxes.pop(bid, None) + bubble_indices.pop(bid, None) + bubble_quads.pop(bid, None) + bubbles.pop(bid, None) + + return bubbles, bubble_boxes, bubble_quads, bubble_indices + + +def enforce_max_box_size(bubble_boxes, bubble_indices, bubble_quads, bubbles, ocr, + max_width_ratio=0.6, max_height_ratio=0.5, image_shape=None): + if image_shape is None: + return bubbles, bubble_boxes, bubble_quads, bubble_indices + ih, iw = image_shape[:2] + max_width, max_height = iw * max_width_ratio, ih * max_height_ratio + new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {} + next_bid, splits_made = 1, [] + + for bid, box in bubble_boxes.items(): + x1, y1, x2, y2 = box + w, h = x2 - x1, y2 - y1 + if w > max_width or h > max_height: + indices = bubble_indices[bid] + col_split = split_bubble_if_multiple_columns(indices, ocr, bid=bid, + use_aggressive_thresholds=True) + if col_split: + for grp in col_split: + new_bubbles[next_bid] = build_lines_from_indices(grp, ocr) + new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in grp]) + new_quads[next_bid] = [ocr[i][0] for i in grp] + new_indices[next_bid] = grp + next_bid += 1 + splits_made.append(f"BOX#{bid} (oversized: {w}x{h}px)") + continue + row_split = split_bubble_if_multiple_rows(indices, ocr, bid=bid) + if row_split: + for grp in row_split: + new_bubbles[next_bid] = build_lines_from_indices(grp, ocr) + new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in grp]) + new_quads[next_bid] = [ocr[i][0] for i in grp] + new_indices[next_bid] = grp + next_bid += 1 + splits_made.append(f"BOX#{bid} (oversized: {w}x{h}px)") + continue + new_bubbles[next_bid] = bubbles[bid] + new_boxes[next_bid] = box + new_quads[next_bid] = bubble_quads[bid] + new_indices[next_bid] = bubble_indices[bid] + next_bid += 1 + + if splits_made: + print(f"\n📏 Split {len(splits_made)} oversized box(es):") + for s in splits_made: print(f" ✓ {s}") + return new_bubbles, new_boxes, new_quads, new_indices + + +def should_merge_groups(group1_indices, group2_indices, ocr, median_height, + max_vertical_gap=None): + if max_vertical_gap is None: + max_vertical_gap = median_height * 2.5 + box1 = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in group1_indices]) + box2 = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in group2_indices]) + if box1 is None or box2 is None: + return False + cx1 = (box1[0] + box1[2]) / 2.0 + cx2 = (box2[0] + box2[2]) / 2.0 + if abs(cx1 - cx2) > median_height * 1.8: + return False + vertical_gap = max(0, max(box1[1], box2[1]) - min(box1[3], box2[3])) + return vertical_gap <= max_vertical_gap + + # ============================================================ # ENHANCED OCR ENGINE # ============================================================ class ImprovedMacVisionDetector: def __init__(self, source_lang="en"): lang_key = source_lang.lower().strip() - lang_map = { "en": "en-US", "english": "en-US", "es": "es-ES", "spanish": "es-ES", @@ -572,213 +911,122 @@ class ImprovedMacVisionDetector: "ko": "ko-KR", "korean": "ko-KR", "zh": "zh-Hans", "chinese": "zh-Hans" } - - apple_lang = lang_map.get(lang_key, "en-US") - self.langs = [apple_lang] + self.langs = [lang_map.get(lang_key, "en-US")] print(f"⚡ Using Enhanced Apple Vision OCR (Language: {self.langs[0]})") - + def preprocess_variants(self, image_bgr): - """Generate multiple preprocessing variants""" - variants = [] - - # Variant 1: Enhanced standard - variants.append(("enhanced", enhance_image_for_ocr(image_bgr, upscale_factor=2.5))) - - # Variant 2: High contrast + variants = [("enhanced", enhance_image_for_ocr(image_bgr, upscale_factor=2.5))] gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY) - _, high_contrast = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) - upscaled_hc = cv2.resize(high_contrast, None, fx=2.5, fy=2.5, interpolation=cv2.INTER_CUBIC) - variants.append(("high_contrast", cv2.cvtColor(upscaled_hc, cv2.COLOR_GRAY2BGR))) - - # Variant 3: Bilateral filter (preserves edges) - bilateral = cv2.bilateralFilter(image_bgr, 9, 75, 75) - upscaled_bil = cv2.resize(bilateral, None, fx=2.5, fy=2.5, interpolation=cv2.INTER_CUBIC) - variants.append(("bilateral", upscaled_bil)) - - # Variant 4: Inverted (for white text on black) - inverted = cv2.bitwise_not(image_bgr) - upscaled_inv = cv2.resize(inverted, None, fx=2.5, fy=2.5, interpolation=cv2.INTER_CUBIC) - variants.append(("inverted", upscaled_inv)) - - # Variant 5: Original upscaled - upscaled_orig = cv2.resize(image_bgr, None, fx=2.5, fy=2.5, interpolation=cv2.INTER_CUBIC) - variants.append(("original", upscaled_orig)) - + _, hc = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) + variants.append(("high_contrast", cv2.cvtColor( + cv2.resize(hc, None, fx=2.5, fy=2.5, interpolation=cv2.INTER_CUBIC), + cv2.COLOR_GRAY2BGR))) + variants.append(("bilateral", cv2.resize( + cv2.bilateralFilter(image_bgr, 9, 75, 75), + None, fx=2.5, fy=2.5, interpolation=cv2.INTER_CUBIC))) + variants.append(("inverted", cv2.resize( + cv2.bitwise_not(image_bgr), + None, fx=2.5, fy=2.5, interpolation=cv2.INTER_CUBIC))) + variants.append(("original", cv2.resize( + image_bgr, None, fx=2.5, fy=2.5, interpolation=cv2.INTER_CUBIC))) return variants - + def run_vision_ocr(self, image_bgr): - """Run Vision OCR on a single image""" if image_bgr is None or image_bgr.size == 0: return [] - ih, iw = image_bgr.shape[:2] - success, buffer = cv2.imencode('.png', image_bgr) if not success: return [] - ns_data = NSData.dataWithBytes_length_(buffer.tobytes(), len(buffer.tobytes())) handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(ns_data, None) results = [] - + def completion_handler(request, error): - if error: - return - - for observation in request.results(): - candidate = observation.topCandidates_(1)[0] - text = candidate.string() - confidence = candidate.confidence() - - bbox = observation.boundingBox() + if error: return + for obs in request.results(): + candidate = obs.topCandidates_(1)[0] + text, confidence = candidate.string(), candidate.confidence() + bbox = obs.boundingBox() x = bbox.origin.x * iw - y_bottom_left = bbox.origin.y * ih + y_bl = bbox.origin.y * ih w = bbox.size.width * iw h = bbox.size.height * ih - - y = ih - y_bottom_left - h - - quad = [ - [int(x), int(y)], - [int(x + w), int(y)], - [int(x + w), int(y + h)], - [int(x), int(y + h)] - ] - + y = ih - y_bl - h + quad = [[int(x),int(y)],[int(x+w),int(y)], + [int(x+w),int(y+h)],[int(x),int(y+h)]] results.append((quad, text, confidence)) - - request = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(completion_handler) - request.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate) - request.setUsesLanguageCorrection_(False) # Disable for manga - request.setRecognitionLanguages_(self.langs) - request.setAutomaticallyDetectsLanguage_(True) - - handler.performRequests_error_([request], None) + + req = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(completion_handler) + req.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate) + req.setUsesLanguageCorrection_(False) + req.setRecognitionLanguages_(self.langs) + req.setAutomaticallyDetectsLanguage_(True) + handler.performRequests_error_([req], None) return results - + def merge_multi_pass_results(self, all_results, original_shape): - """Merge results from multiple preprocessing passes""" if not all_results: return [] - - # Scale factor to normalize coordinates back to original scale_factor = 2.5 - - # Normalize all quads to original image coordinates - normalized_results = [] + normalized = [] for variant_name, results in all_results: for quad, text, conf in results: - # Scale quad back to original size - scaled_quad = [[int(p[0] / scale_factor), int(p[1] / scale_factor)] for p in quad] - normalized_results.append((scaled_quad, text, conf, variant_name)) - - # Group similar detections (same location, similar text) + sq = [[int(p[0]/scale_factor), int(p[1]/scale_factor)] for p in quad] + normalized.append((sq, text, conf, variant_name)) + def quads_overlap(q1, q2, threshold=0.5): - b1 = quad_bbox(q1) - b2 = quad_bbox(q2) - - # Calculate IoU - x1 = max(b1[0], b2[0]) - y1 = max(b1[1], b2[1]) - x2 = min(b1[2], b2[2]) - y2 = min(b1[3], b2[3]) - - if x2 < x1 or y2 < y1: - return False - - intersection = (x2 - x1) * (y2 - y1) - area1 = (b1[2] - b1[0]) * (b1[3] - b1[1]) - area2 = (b2[2] - b2[0]) * (b2[3] - b2[1]) - union = area1 + area2 - intersection - - iou = intersection / max(union, 1) - return iou > threshold - - # Cluster overlapping detections - clusters = [] - used = set() - - for i, (quad1, text1, conf1, var1) in enumerate(normalized_results): - if i in used: - continue - - cluster = [(quad1, text1, conf1, var1)] + b1, b2 = quad_bbox(q1), quad_bbox(q2) + x1, y1 = max(b1[0],b2[0]), max(b1[1],b2[1]) + x2, y2 = min(b1[2],b2[2]), min(b1[3],b2[3]) + if x2 < x1 or y2 < y1: return False + inter = (x2-x1)*(y2-y1) + union = (b1[2]-b1[0])*(b1[3]-b1[1]) + (b2[2]-b2[0])*(b2[3]-b2[1]) - inter + return inter / max(union, 1) > threshold + + clusters, used = [], set() + for i, (q1, t1, c1, v1) in enumerate(normalized): + if i in used: continue + cluster = [(q1, t1, c1, v1)] used.add(i) - - for j, (quad2, text2, conf2, var2) in enumerate(normalized_results): - if j in used or i == j: - continue - - if quads_overlap(quad1, quad2, threshold=0.5): - cluster.append((quad2, text2, conf2, var2)) + for j, (q2, t2, c2, v2) in enumerate(normalized): + if j in used or i == j: continue + if quads_overlap(q1, q2): + cluster.append((q2, t2, c2, v2)) used.add(j) - clusters.append(cluster) - - # Vote on best result per cluster + final_results = [] for cluster in clusters: - # Sort by confidence cluster.sort(key=lambda x: x[2], reverse=True) - - # Take highest confidence result - best_quad, best_text, best_conf, best_var = cluster[0] - - # If multiple variants agree on text, boost confidence + best_quad, best_text, best_conf, _ = cluster[0] text_votes = {} for _, text, conf, _ in cluster: - normalized = normalize_text(text) - if normalized: - text_votes[normalized] = text_votes.get(normalized, 0) + conf - + n = normalize_text(text) + if n: text_votes[n] = text_votes.get(n, 0) + conf if text_votes: - best_voted_text = max(text_votes.items(), key=lambda x: x[1])[0] - if best_voted_text != normalize_text(best_text): - # Use voted text if it has more support - best_text = best_voted_text - - # Apply OCR error fixes - best_text = fix_common_ocr_errors(best_text) - - final_results.append((best_quad, best_text, best_conf)) - + voted = max(text_votes.items(), key=lambda x: x[1])[0] + if voted != normalize_text(best_text): + best_text = voted + final_results.append((best_quad, fix_common_ocr_errors(best_text), best_conf)) return final_results - + def read(self, image_path_or_array): - """Enhanced multi-pass OCR""" - if isinstance(image_path_or_array, str): - img = cv2.imread(image_path_or_array) - else: - img = image_path_or_array - + img = cv2.imread(image_path_or_array) if isinstance(image_path_or_array, str) \ + else image_path_or_array if img is None or img.size == 0: return [] - - original_shape = img.shape - - # Generate preprocessing variants variants = self.preprocess_variants(img) - - # Run OCR on each variant all_results = [] - for variant_name, variant_img in variants: - results = self.run_vision_ocr(variant_img) - if results: - all_results.append((variant_name, results)) - - # Merge and vote on results - final_results = self.merge_multi_pass_results(all_results, original_shape) - - return final_results + for vname, vimg in variants: + r = self.run_vision_ocr(vimg) + if r: all_results.append((vname, r)) + return self.merge_multi_pass_results(all_results, img.shape) -# ============================================================ -# ORIGINAL OCR ENGINE (Fallback) -# ============================================================ class MacVisionDetector: def __init__(self, source_lang="en"): lang_key = source_lang.lower().strip() - lang_map = { "en": "en-US", "english": "en-US", "es": "es-ES", "spanish": "es-ES", @@ -790,621 +1038,410 @@ class MacVisionDetector: "ko": "ko-KR", "korean": "ko-KR", "zh": "zh-Hans", "chinese": "zh-Hans" } - - apple_lang = lang_map.get(lang_key, "en-US") - self.langs = [apple_lang] + self.langs = [lang_map.get(lang_key, "en-US")] print(f"⚡ Using Apple Vision OCR (Language: {self.langs[0]})") def read(self, image_path_or_array): - if isinstance(image_path_or_array, str): - img = cv2.imread(image_path_or_array) - else: - img = image_path_or_array - + img = cv2.imread(image_path_or_array) if isinstance(image_path_or_array, str) \ + else image_path_or_array if img is None or img.size == 0: return [] - ih, iw = img.shape[:2] - success, buffer = cv2.imencode('.png', img) - if not success: - return [] - + if not success: return [] ns_data = NSData.dataWithBytes_length_(buffer.tobytes(), len(buffer.tobytes())) handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(ns_data, None) results = [] def completion_handler(request, error): - if error: - print(f"Vision API Error: {error}") - return - - for observation in request.results(): - candidate = observation.topCandidates_(1)[0] - text = candidate.string() - confidence = candidate.confidence() - - bbox = observation.boundingBox() + if error: return + for obs in request.results(): + candidate = obs.topCandidates_(1)[0] + text, confidence = candidate.string(), candidate.confidence() + bbox = obs.boundingBox() x = bbox.origin.x * iw - y_bottom_left = bbox.origin.y * ih + y_bl = bbox.origin.y * ih w = bbox.size.width * iw h = bbox.size.height * ih - - y = ih - y_bottom_left - h - - quad = [ - [int(x), int(y)], - [int(x + w), int(y)], - [int(x + w), int(y + h)], - [int(x), int(y + h)] - ] - + y = ih - y_bl - h + quad = [[int(x),int(y)],[int(x+w),int(y)], + [int(x+w),int(y+h)],[int(x),int(y+h)]] results.append((quad, text, confidence)) - request = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(completion_handler) - request.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate) - request.setUsesLanguageCorrection_(True) - request.setRecognitionLanguages_(self.langs) - - handler.performRequests_error_([request], None) + req = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(completion_handler) + req.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate) + req.setUsesLanguageCorrection_(True) + req.setRecognitionLanguages_(self.langs) + req.setAutomaticallyDetectsLanguage_(True) + handler.performRequests_error_([req], None) return results - -# ============================================================ -# SPLITTERS + QUAD NORMALIZATION -# ============================================================ -def estimate_char_capacity_width(text_len, med_h, k=0.72): - return max(18.0, text_len * med_h * k) - - -def shrink_ocr_quad_to_text(quad, text, med_h): - x1, y1, x2, y2 = quad_bbox(quad) - w = max(1, x2 - x1) - h = max(1, y2 - y1) - - t = (text or "").strip() - n = max(1, len(t.replace(" ", ""))) - exp_w = estimate_char_capacity_width(n, med_h, k=0.62) - max_w = max(exp_w * 1.35, h * 1.15) - - if w <= max_w: - return quad - - cx = (x1 + x2) / 2.0 - nw = int(round(max_w)) - nx1 = int(round(cx - nw / 2)) - nx2 = int(round(cx + nw / 2)) - - return [[nx1, y1], [nx2, y1], [nx2, y2], [nx1, y2]] - - -def normalize_ocr_quads(filtered_ocr): - if not filtered_ocr: - return filtered_ocr - - hs = [max(1, quad_bbox(q)[3] - quad_bbox(q)[1]) for q, _, _ in filtered_ocr] - med_h = float(np.median(hs)) if hs else 14.0 - - out = [] - for quad, text, conf in filtered_ocr: - nq = shrink_ocr_quad_to_text(quad, text, med_h) - out.append((nq, text, conf)) - return out - - -def split_abnormal_bridge_quads(image_bgr, filtered_ocr): - if not filtered_ocr: - return filtered_ocr, 0 - - hs = [max(1, quad_bbox(q)[3] - quad_bbox(q)[1]) for q, _, _ in filtered_ocr] - med_h = float(np.median(hs)) if hs else 14.0 - - out = [] - splits = 0 - - for quad, text, conf in filtered_ocr: - x1, y1, x2, y2 = quad_bbox(quad) - w = max(1, x2 - x1) - h = max(1, y2 - y1) - - if w > med_h * 11.0 and " " in text and len(text) >= 14: - roi = image_bgr[max(0, y1):min(image_bgr.shape[0], y2), max(0, x1):min(image_bgr.shape[1], x2)] - if roi.size > 0: - gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY) - _, inv = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU) - proj = np.sum(inv, axis=0) - - s = int(w * 0.18) - e = int(w * 0.82) - if e > s: - segment = proj[s:e] - valley_rel = int(np.argmin(segment)) - valley_x = s + valley_rel - - low = float(segment[valley_rel]) - meanv = float(np.mean(segment)) - if low < meanv * 0.52: - split_x = x1 + valley_x - - char_w = w / max(1, len(text)) - split_idx = int((split_x - x1) / max(1e-6, char_w)) - spaces = [i for i, c in enumerate(text) if c == " "] - if spaces: - split_idx = min(spaces, key=lambda i: abs(i - split_idx)) - - left_t = text[:split_idx].strip() - right_t = text[split_idx:].strip() - - if left_t and right_t: - ql = [[x1, y1], [split_x, y1], [split_x, y2], [x1, y2]] - qr = [[split_x, y1], [x2, y1], [x2, y2], [split_x, y2]] - out.append((ql, left_t, conf)) - out.append((qr, right_t, conf)) - splits += 1 - continue - - out.append((quad, text, conf)) - - return out, splits - - -def split_wide_ocr_items(image_bgr, filtered_ocr): - new_filtered = [] - splits_made = 0 - - for item in filtered_ocr: - quad, text, conf = item - x1, y1, x2, y2 = quad_bbox(quad) - w = x2 - x1 - h = max(1, y2 - y1) - - if w > h * 2.5 and len(text) > 5 and ' ' in text: - pad = 2 - roi_y1 = max(0, y1 - pad) - roi_y2 = min(image_bgr.shape[0], y2 + pad) - roi_x1 = max(0, x1) - roi_x2 = min(image_bgr.shape[1], x2) - - roi = image_bgr[roi_y1:roi_y2, roi_x1:roi_x2] - if roi.size > 0: - gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY) - _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU) - proj = np.sum(thresh, axis=0) - - start_x = int(w * 0.20) - end_x = int(w * 0.80) - - if start_x < end_x: - char_w = w / max(1, len(text)) - min_gap_width = max(int(char_w * 2.5), int(h * 0.75)) - - gap_threshold = h * 255 * 0.15 - gap_mask = proj < gap_threshold - - best_gap_start = -1 - best_gap_len = 0 - current_gap_start = -1 - current_gap_len = 0 - - for x_rel in range(start_x, end_x): - if gap_mask[x_rel]: - if current_gap_len == 0: - current_gap_start = x_rel - current_gap_len += 1 - else: - if current_gap_len > best_gap_len: - best_gap_len = current_gap_len - best_gap_start = current_gap_start - current_gap_len = 0 - - if current_gap_len > best_gap_len: - best_gap_len = current_gap_len - best_gap_start = current_gap_start - - if best_gap_len >= min_gap_width: - split_x = roi_x1 + best_gap_start + (best_gap_len // 2) - - split_idx = int((split_x - x1) / max(1e-6, char_w)) - spaces = [i for i, c in enumerate(text) if c == ' '] - if spaces: - best_space = min(spaces, key=lambda i: abs(i - split_idx)) - if abs(best_space - split_idx) < len(text) * 0.35: - split_idx = best_space - - text_left = text[:split_idx].strip() - text_right = text[split_idx:].strip() - - if text_left and text_right: - quad_left = [[x1, y1], [split_x, y1], [split_x, y2], [x1, y2]] - quad_right = [[split_x, y1], [x2, y1], [x2, y2], [split_x, y2]] - new_filtered.append((quad_left, text_left, conf)) - new_filtered.append((quad_right, text_right, conf)) - splits_made += 1 - continue - - new_filtered.append(item) - - return new_filtered, splits_made - - -def split_panel_box(image_bgr, bbox_xyxy, bubble_quads=None): - x1, y1, x2, y2 = bbox_xyxy - w = x2 - x1 - h = y2 - y1 - - if bubble_quads is not None and len(bubble_quads) < 4: - return None - - if w < 50 or h < 50: - return None - - roi = image_bgr[y1:y2, x1:x2] - if roi.size == 0: - return None - - gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY) - _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY_INV) - - vertical_projection = np.sum(thresh, axis=0) - - search_start = int(w * 0.25) - search_end = int(w * 0.75) - - if search_start >= search_end: - return None - - peak_x_relative = np.argmax(vertical_projection[search_start:search_end]) + search_start - peak_val = vertical_projection[peak_x_relative] - - threshold_val = h * 255 * 0.25 - significant_peaks = [] - - for x_rel in range(search_start, search_end): - if vertical_projection[x_rel] > threshold_val: - significant_peaks.append((x_rel, vertical_projection[x_rel])) - - if len(significant_peaks) > 1: - min_proj_val = np.min(vertical_projection[search_start:search_end]) - min_proj_idx = np.argmin(vertical_projection[search_start:search_end]) + search_start - - if min_proj_val < threshold_val * 0.6: - split_x_absolute = x1 + min_proj_idx - box_left = (x1, y1, split_x_absolute, y2) - box_right = (split_x_absolute, y1, x2, y2) - return box_left, box_right, split_x_absolute - - if peak_val > (h * 255 * 0.40): - split_x_absolute = x1 + peak_x_relative - box_left = (x1, y1, split_x_absolute, y2) - box_right = (split_x_absolute, y1, x2, y2) - return box_left, box_right, split_x_absolute - - return None - - def split_bubble_if_multiple_columns(indices, ocr, bid=None, use_aggressive_thresholds=False): - if len(indices) < 2: - return None - + if len(indices) < 2: return None boxes = [quad_bbox(ocr[i][0]) for i in indices] - sorted_items = sorted(zip(indices, boxes), key=lambda x: x[1][0]) + hs = [max(1, b[3] - b[1]) for b in boxes] + med_h = float(np.median(hs)) if hs else 12.0 + xs = [(b[0] + b[2]) / 2.0 for b in boxes] + xs_sorted = sorted(xs) - gaps = [] - current_max_x = sorted_items[0][1][2] + gap_thresh = max(med_h * 1.2, 18) if use_aggressive_thresholds else max(med_h * 1.5, 22) + best_gap_idx, best_gap_size = None, 0.0 - for i in range(1, len(sorted_items)): - idx, b = sorted_items[i] - x1 = b[0] - gap = x1 - current_max_x - gaps.append((i, gap, current_max_x, x1)) - current_max_x = max(current_max_x, b[2]) + for i in range(len(xs_sorted) - 1): + gap = xs_sorted[i + 1] - xs_sorted[i] + if gap > gap_thresh and gap > best_gap_size: + best_gap_size, best_gap_idx = gap, i - if not gaps: - return None + if best_gap_idx is None: return None + split_x = (xs_sorted[best_gap_idx] + xs_sorted[best_gap_idx + 1]) / 2.0 - max_gap_idx, max_gap_size, _, _ = max(gaps, key=lambda x: x[1]) + left_idxs = [i for i in indices if (quad_bbox(ocr[i][0])[0] + quad_bbox(ocr[i][0])[2]) / 2.0 < split_x] + right_idxs = [i for i in indices if (quad_bbox(ocr[i][0])[0] + quad_bbox(ocr[i][0])[2]) / 2.0 >= split_x] - hs = [b[3] - b[1] for b in boxes] - med_h = float(np.median(hs)) if hs else 15.0 - - if use_aggressive_thresholds: - threshold1 = 60.0 - threshold2 = med_h * 1.0 - min_gap = 20.0 - else: - threshold1 = 90.0 - threshold2 = med_h * 1.5 - min_gap = 25.0 - - if max_gap_size > threshold1 or (max_gap_size > threshold2 and max_gap_size > min_gap): - split_idx = max_gap_idx - left_indices = [item[0] for item in sorted_items[:split_idx]] - right_indices = [item[0] for item in sorted_items[split_idx:]] - - if len(left_indices) < 1 or len(right_indices) < 1: - return None - - return left_indices, right_indices - - return None + if not left_idxs or not right_idxs: return None + return (left_idxs, right_idxs) def split_bubble_if_multiple_rows(indices, ocr, bid=None): - if len(indices) < 2: - return None - + if len(indices) < 2: return None boxes = [quad_bbox(ocr[i][0]) for i in indices] - sorted_items = sorted(zip(indices, boxes), key=lambda x: x[1][1]) + hs = [max(1, b[3] - b[1]) for b in boxes] + med_h = float(np.median(hs)) if hs else 12.0 + ys = [(b[1] + b[3]) / 2.0 for b in boxes] + ys_sorted = sorted(ys) - gaps = [] - current_max_y = sorted_items[0][1][3] + gap_thresh = max(med_h * 2.0, 30) + best_gap_idx, best_gap_size = None, 0.0 - for i in range(1, len(sorted_items)): - idx, b = sorted_items[i] - y1 = b[1] - gap = y1 - current_max_y - gaps.append((i, gap, current_max_y, y1)) - current_max_y = max(current_max_y, b[3]) + for i in range(len(ys_sorted) - 1): + gap = ys_sorted[i + 1] - ys_sorted[i] + if gap > gap_thresh and gap > best_gap_size: + best_gap_size, best_gap_idx = gap, i - if not gaps: - return None + if best_gap_idx is None: return None + split_y = (ys_sorted[best_gap_idx] + ys_sorted[best_gap_idx + 1]) / 2.0 - max_gap_idx, max_gap_size, _, _ = max(gaps, key=lambda x: x[1]) + top_idxs = [i for i in indices if (quad_bbox(ocr[i][0])[1] + quad_bbox(ocr[i][0])[3]) / 2.0 < split_y] + bottom_idxs = [i for i in indices if (quad_bbox(ocr[i][0])[1] + quad_bbox(ocr[i][0])[3]) / 2.0 >= split_y] - hs = [b[3] - b[1] for b in boxes] - med_h = float(np.median(hs)) if hs else 15.0 - - threshold = med_h * 1.8 - min_gap = 20.0 - - if max_gap_size > threshold and max_gap_size > min_gap: - split_idx = max_gap_idx - top_indices = [item[0] for item in sorted_items[:split_idx]] - bottom_indices = [item[0] for item in sorted_items[split_idx:]] - - if len(top_indices) >= 1 and len(bottom_indices) >= 1: - return top_indices, bottom_indices - - return None - - -def is_vertical_text_like(indices, ocr): - if len(indices) < 2: - return False - - bxs = [quad_bbox(ocr[i][0]) for i in indices] - ub = boxes_union_xyxy(bxs) - if ub is None: - return False - - x1, y1, x2, y2 = ub - w = max(1, x2 - x1) - h = max(1, y2 - y1) - - aspect = h / w - xcs = [((b[0] + b[2]) / 2.0) for b in bxs] - x_spread = float(np.std(xcs)) if len(xcs) > 1 else 0.0 - med_h = float(np.median([max(1, b[3]-b[1]) for b in bxs])) - - ys = sorted([((b[1] + b[3]) / 2.0) for b in bxs]) - gaps = [ys[i+1] - ys[i] for i in range(len(ys)-1)] if len(ys) >= 2 else [0] - med_gap = float(np.median(gaps)) if gaps else 0.0 - - return ( - aspect > 1.35 and - x_spread < max(10.0, med_h * 0.9) and - med_gap > max(6.0, med_h * 0.35) - ) + if not top_idxs or not bottom_idxs: return None + return (top_idxs, bottom_idxs) def split_cluster_by_big_vertical_gap(indices, ocr, factor=1.9, min_gap=22): - if len(indices) < 2: - return None + if len(indices) < 2: return None + boxes = [quad_bbox(ocr[i][0]) for i in indices] + hs = [max(1, b[3] - b[1]) for b in boxes] + med_h = float(np.median(hs)) if hs else 12.0 - items = [] - for i in indices: - b = quad_bbox(ocr[i][0]) - yc = (b[1] + b[3]) / 2.0 - h = max(1.0, b[3] - b[1]) - items.append((i, b, yc, h)) + items = sorted([(i, quad_bbox(ocr[i][0])) for i in indices], + key=lambda x: (x[1][1] + x[1][3]) / 2.0) + gap_thresh = max(med_h * factor, min_gap) + best_gap, best_split_idx = 0.0, None - items.sort(key=lambda t: t[2]) - med_h = float(np.median([t[3] for t in items])) if items else 12.0 + for k in range(len(items) - 1): + gap = items[k + 1][1][1] - items[k][1][3] + if gap > gap_thresh and gap > best_gap: + best_gap, best_split_idx = gap, k - best_k = -1 - best_gap = -1 - for k in range(len(items)-1): - y_top = items[k][1][3] - y_bot = items[k+1][1][1] - gap = y_bot - y_top - if gap > best_gap: - best_gap = gap - best_k = k + if best_split_idx is None: return None + top_idxs = [it[0] for it in items[:best_split_idx + 1]] + bottom_idxs = [it[0] for it in items[best_split_idx + 1:]] + if not top_idxs or not bottom_idxs: return None + return (top_idxs, bottom_idxs) - if best_k < 0: - return None - if best_gap > max(min_gap, med_h * factor): - a = [t[0] for t in items[:best_k+1]] - b = [t[0] for t in items[best_k+1:]] - if a and b: - return a, b - return None +def is_vertical_text_like(indices, ocr): + if len(indices) < 2: return False + boxes = [quad_bbox(ocr[i][0]) for i in indices] + med_h = float(np.median([max(1, b[3]-b[1]) for b in boxes])) + med_w = float(np.median([max(1, b[2]-b[0]) for b in boxes])) + if med_h < med_w * 1.2: return False + xs = [(b[0]+b[2])/2.0 for b in boxes] + ys = [(b[1]+b[3])/2.0 for b in boxes] + if (max(ys)-min(ys)) < (max(xs)-min(xs)) * 1.5: return False + return True def split_nested_or_side_by_side(indices, ocr): - if len(indices) < 2: - return None + if len(indices) < 2: return None + xs = sorted([(quad_bbox(ocr[i][0])[0] + quad_bbox(ocr[i][0])[2]) / 2.0 for i in indices]) + mid_idx = len(xs) // 2 + split_x = (xs[mid_idx - 1] + xs[mid_idx]) / 2.0 - boxes = [quad_bbox(ocr[i][0]) for i in indices] - xcs = np.array([[(b[0] + b[2]) / 2.0] for b in boxes], dtype=np.float32) + left_idxs = [i for i in indices if (quad_bbox(ocr[i][0])[0]+quad_bbox(ocr[i][0])[2])/2.0 < split_x] + right_idxs = [i for i in indices if (quad_bbox(ocr[i][0])[0]+quad_bbox(ocr[i][0])[2])/2.0 >= split_x] - c1 = float(np.min(xcs)) - c2 = float(np.max(xcs)) - if abs(c2 - c1) < 8: - return None - - for _ in range(12): - g1, g2 = [], [] - for idx, v in enumerate(xcs[:, 0]): - if abs(v - c1) <= abs(v - c2): - g1.append(idx) - else: - g2.append(idx) - if not g1 or not g2: - return None - new_c1 = float(np.mean([xcs[i, 0] for i in g1])) - new_c2 = float(np.mean([xcs[i, 0] for i in g2])) - if abs(new_c1 - c1) < 0.5 and abs(new_c2 - c2) < 0.5: - break - c1, c2 = new_c1, new_c2 - - left_group = g1 if c1 < c2 else g2 - right_group = g2 if c1 < c2 else g1 - - left_idxs = [indices[i] for i in left_group] - right_idxs = [indices[i] for i in right_group] - if not left_idxs or not right_idxs: - return None - - left_box = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in left_idxs]) - right_box = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in right_idxs]) - - sep = right_box[0] - left_box[2] - if sep < -8: - return None - - return left_idxs, right_idxs + if not left_idxs or not right_idxs: return None + return (left_idxs, right_idxs) -def merge_close_bubbles_by_line_height(bubbles, bubble_boxes, bubble_quads, bubble_indices, ocr): - bids = sorted(bubbles.keys()) - used = set() - out_b, out_bb, out_bq, out_bi = {}, {}, {}, {} - nbid = 1 +def split_panel_box(image_bgr, box_xyxy, bubble_quads=None): + x1, y1, x2, y2 = box_xyxy + ih, iw = image_bgr.shape[:2] + x1, y1 = max(0, x1), max(0, y1) + x2, y2 = min(iw-1, x2), min(ih-1, y2) + if x2 <= x1 or y2 <= y1: return None + crop = image_bgr[y1:y2, x1:x2] + if crop.size == 0: return None - all_h = [] - for i in range(len(ocr)): - b = quad_bbox(ocr[i][0]) - all_h.append(max(1, b[3]-b[1])) - med_h = float(np.median(all_h)) if all_h else 14.0 + gray = cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY) + edges = cv2.Canny(gray, 50, 150) + h_proj = np.sum(edges, axis=0) + w = x2 - x1 + if w < 100: return None - for i, a in enumerate(bids): - if a in used: - continue - used.add(a) - group = [a] + search_start = int(w * 0.35) + search_end = int(w * 0.65) + if search_end <= search_start: return None + region = h_proj[search_start:search_end] + if len(region) == 0: return None - ax1, ay1, ax2, ay2 = bubble_boxes[a] + threshold = np.percentile(region, 85) + candidates = [x1 + search_start + rx for rx in range(len(region)) if region[rx] >= threshold] + if not candidates: return None + split_x = int(np.median(candidates)) - for b in bids[i+1:]: - if b in used: - continue - bx1, by1, bx2, by2 = bubble_boxes[b] + if bubble_quads: + left_count = sum(1 for q in bubble_quads if quad_center(q)[0] < split_x) + right_count = len(bubble_quads) - left_count + if left_count == 0 or right_count == 0: return None - acx, acy = (ax1+ax2)/2.0, (ay1+ay2)/2.0 - bcx, bcy = (bx1+bx2)/2.0, (by1+by2)/2.0 - dx, dy = abs(acx-bcx), abs(acy-bcy) - - near = dx < med_h * 10.0 and dy < med_h * 3.6 - touching = overlap_or_near((ax1, ay1, ax2, ay2), (bx1, by1, bx2, by2), gap=int(med_h*1.25)) - - ua = boxes_union_xyxy([(ax1, ay1, ax2, ay2), (bx1, by1, bx2, by2)]) - area_a = max(1, (ax2-ax1)*(ay2-ay1)) - area_b = max(1, (bx2-bx1)*(by2-by1)) - area_u = max(1, (ua[2]-ua[0])*(ua[3]-ua[1])) - compact_union = area_u < (area_a + area_b) * 1.65 - - if near and touching and compact_union: - group.append(b) - used.add(b) - ax1 = min(ax1, bx1); ay1 = min(ay1, by1); ax2 = max(ax2, bx2); ay2 = max(ay2, by2) - - idxs = [] - quads = [] - for g in group: - idxs.extend(bubble_indices[g]) - quads.extend(bubble_quads[g]) - - idxs = sorted(set(idxs)) - ub = boxes_union_xyxy([quad_bbox(ocr[k][0]) for k in idxs]) - if ub is None: - continue - - out_b[nbid] = build_lines_from_indices(idxs, ocr) - out_bb[nbid] = ub - out_bq[nbid] = quads - out_bi[nbid] = idxs - nbid += 1 - - return out_b, out_bb, out_bq, out_bi + return (x1, x2, split_x) # ============================================================ -# PREPROCESS +# MERGE CLOSE BUBBLES +# ============================================================ +def merge_close_bubbles_by_line_height(bubbles, bubble_boxes, bubble_quads, + bubble_indices, ocr): + """ + Merges boxes that are spatially very close (within ~1.4× line height on + BOTH axes simultaneously). Strict dual-axis check prevents merging boxes + from adjacent but distinct bubbles — fixing the BOX#5+BOX#16 overlap problem. + """ + if not bubbles: + return bubbles, bubble_boxes, bubble_quads, bubble_indices + + all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1]) for i in range(len(ocr))] + med_h = float(np.median(all_h)) if all_h else 14.0 + merge_tol = max(8, med_h * 1.4) + + bids = sorted(bubble_boxes.keys()) + merged_set, merge_map = set(), {} + + for i, bid_i in enumerate(bids): + if bid_i in merged_set: continue + x1_i, y1_i, x2_i, y2_i = bubble_boxes[bid_i] + wi = x2_i - x1_i + + for j in range(i + 1, len(bids)): + bid_j = bids[j] + if bid_j in merged_set: continue + x1_j, y1_j, x2_j, y2_j = bubble_boxes[bid_j] + wj = x2_j - x1_j + + gap_x = max(0, max(x1_i, x1_j) - min(x2_i, x2_j)) + gap_y = max(0, max(y1_i, y1_j) - min(y2_i, y2_j)) + + # Horizontal overlap ratio — must be significant to merge + h_ix1 = max(x1_i, x1_j) + h_ix2 = min(x2_i, x2_j) + h_overlap = max(0, h_ix2 - h_ix1) + h_overlap_ratio = h_overlap / max(1, min(wi, wj)) + + # STRICT: both gap_x AND gap_y must be small, AND boxes must + # share meaningful horizontal overlap (same column). + # This prevents merging horizontally adjacent distinct bubbles. + if gap_x <= merge_tol and gap_y <= merge_tol and h_overlap_ratio >= 0.25: + if bid_i not in merge_map: + merge_map[bid_i] = [bid_i] + merge_map[bid_i].append(bid_j) + merged_set.add(bid_j) + + if not merge_map: + return bubbles, bubble_boxes, bubble_quads, bubble_indices + + new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {} + next_bid = 1 + + for bid in bids: + if bid in merged_set: continue + if bid in merge_map: + group = merge_map[bid] + all_indices = sorted(set(idx for b in group for idx in bubble_indices[b])) + new_bubbles[next_bid] = build_lines_from_indices(all_indices, ocr) + new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in all_indices]) + new_quads[next_bid] = [ocr[i][0] for i in all_indices] + new_indices[next_bid] = all_indices + else: + new_bubbles[next_bid] = bubbles[bid] + new_boxes[next_bid] = bubble_boxes[bid] + new_quads[next_bid] = bubble_quads[bid] + new_indices[next_bid] = bubble_indices[bid] + next_bid += 1 + + return new_bubbles, new_boxes, new_quads, new_indices + + +# ============================================================ +# WIDE / BRIDGE QUAD SPLITTING +# ============================================================ +def split_wide_ocr_items(image_bgr, ocr_list, width_factor=8.0): + if not ocr_list: return ocr_list, 0 + hs = [max(1, quad_bbox(q)[3]-quad_bbox(q)[1]) for q, _, _ in ocr_list] + med_h = float(np.median(hs)) if hs else 14.0 + result, splits_made = [], 0 + + for quad, text, conf in ocr_list: + x1, y1, x2, y2 = quad_bbox(quad) + w = x2 - x1 + if w > med_h * width_factor: + pad = 2 + roi = image_bgr[max(0,y1-pad):min(image_bgr.shape[0],y2+pad), + max(0,x1):min(image_bgr.shape[1],x2)] + if roi.size > 0: + gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY) + _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU) + v_proj = np.sum(binary, axis=0) + gap_threshold = roi.shape[0] * 255 * 0.15 + gaps, in_gap, gap_start = [], False, 0 + for x in range(len(v_proj)): + if v_proj[x] < gap_threshold: + if not in_gap: gap_start, in_gap = x, True + else: + if in_gap: + gw = x - gap_start + if gw >= max(int(med_h * 0.6), 12): + gaps.append((gap_start + gw // 2, gw)) + in_gap = False + if gaps: + gaps.sort(key=lambda g: g[1], reverse=True) + split_x_abs = max(0, x1) + gaps[0][0] + if ' ' in text: + char_w = w / max(1, len(text)) + split_idx = int((split_x_abs - x1) / max(1e-6, char_w)) + spaces = [i for i, c in enumerate(text) if c == ' '] + if spaces: split_idx = min(spaces, key=lambda i: abs(i - split_idx)) + tl, tr = text[:split_idx].strip(), text[split_idx:].strip() + else: + split_idx = int(len(text) * (split_x_abs - x1) / w) + tl, tr = text[:split_idx].strip(), text[split_idx:].strip() + if tl and tr: + result.extend([([[x1,y1],[split_x_abs,y1],[split_x_abs,y2],[x1,y2]], tl, conf), + ([[split_x_abs,y1],[x2,y1],[x2,y2],[split_x_abs,y2]], tr, conf)]) + splits_made += 1 + continue + result.append((quad, text, conf)) + return result, splits_made + + +def split_abnormal_bridge_quads(image_bgr, ocr_list, aspect_ratio_threshold=6.0): + if not ocr_list: return ocr_list, 0 + hs = [max(1, quad_bbox(q)[3]-quad_bbox(q)[1]) for q, _, _ in ocr_list] + med_h = float(np.median(hs)) if hs else 14.0 + result, splits_made = [], 0 + + for quad, text, conf in ocr_list: + x1, y1, x2, y2 = quad_bbox(quad) + w, h = x2 - x1, max(1, y2 - y1) + if w / h > aspect_ratio_threshold: + pad = 2 + roi = image_bgr[max(0,y1-pad):min(image_bgr.shape[0],y2+pad), + max(0,x1):min(image_bgr.shape[1],x2)] + if roi.size > 0: + gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY) + _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU) + v_proj = np.sum(binary, axis=0) + gap_threshold = h * 255 * 0.20 + gaps, in_gap, gap_start = [], False, 0 + for x in range(len(v_proj)): + if v_proj[x] < gap_threshold: + if not in_gap: gap_start, in_gap = x, True + else: + if in_gap: + gw = x - gap_start + if gw >= max(int(med_h * 0.8), 15): + gaps.append((gap_start + gw // 2, gw)) + in_gap = False + if gaps: + gaps.sort(key=lambda g: g[1], reverse=True) + split_x_abs = max(0, x1) + gaps[0][0] + if ' ' in text: + char_w = w / max(1, len(text)) + split_idx = int((split_x_abs - x1) / max(1e-6, char_w)) + spaces = [i for i, c in enumerate(text) if c == ' '] + if spaces: split_idx = min(spaces, key=lambda i: abs(i - split_idx)) + tl, tr = text[:split_idx].strip(), text[split_idx:].strip() + else: + split_idx = int(len(text) * (split_x_abs - x1) / w) + tl, tr = text[:split_idx].strip(), text[split_idx:].strip() + if tl and tr: + result.extend([([[x1,y1],[split_x_abs,y1],[split_x_abs,y2],[x1,y2]], tl, conf), + ([[split_x_abs,y1],[x2,y1],[x2,y2],[split_x_abs,y2]], tr, conf)]) + splits_made += 1 + continue + result.append((quad, text, conf)) + return result, splits_made + + +def normalize_ocr_quads(ocr_list): + result = [] + for quad, text, conf in ocr_list: + x1, y1, x2, y2 = quad_bbox(quad) + pad = 3 + new_quad = [[x1-pad, y1-pad], [x2+pad, y1-pad], [x2+pad, y2+pad], [x1-pad, y2+pad]] + result.append((new_quad, text, conf)) + return result + + +# ============================================================ +# VISION RE-READ # ============================================================ def preprocess_variant(crop_bgr, mode): gray = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2GRAY) - - if mode == "raw": - return gray - if mode == "clahe": - return cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)).apply(gray) + if mode == "raw": return gray + if mode == "clahe": return cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8)).apply(gray) if mode == "adaptive": - den = cv2.GaussianBlur(gray, (3, 3), 0) + den = cv2.GaussianBlur(gray, (3,3), 0) return cv2.adaptiveThreshold(den, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 35, 11) if mode == "otsu": - den = cv2.GaussianBlur(gray, (3, 3), 0) + den = cv2.GaussianBlur(gray, (3,3), 0) _, th = cv2.threshold(den, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) return th - if mode == "invert": - return 255 - gray + if mode == "invert": return 255 - gray if mode == "bilateral": den = cv2.bilateralFilter(gray, 7, 60, 60) _, th = cv2.threshold(den, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) return th if mode == "morph_open": _, th = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) - k = np.ones((2, 2), np.uint8) - return cv2.morphologyEx(th, cv2.MORPH_OPEN, k) - + return cv2.morphologyEx(th, cv2.MORPH_OPEN, np.ones((2,2), np.uint8)) return gray def rotate_image_keep_bounds(img, angle_deg): h, w = img.shape[:2] - c = (w / 2, h / 2) + c = (w/2, h/2) M = cv2.getRotationMatrix2D(c, angle_deg, 1.0) - cos = abs(M[0, 0]); sin = abs(M[0, 1]) - - new_w = int((h * sin) + (w * cos)) - new_h = int((h * cos) + (w * sin)) - M[0, 2] += (new_w / 2) - c[0] - M[1, 2] += (new_h / 2) - c[1] - + cos, sin = abs(M[0,0]), abs(M[0,1]) + new_w = int((h*sin) + (w*cos)) + new_h = int((h*cos) + (w*sin)) + M[0,2] += (new_w/2) - c[0] + M[1,2] += (new_h/2) - c[1] return cv2.warpAffine(img, M, (new_w, new_h), flags=cv2.INTER_CUBIC, borderValue=255) def rebuild_text_from_vision_result(res): - if not res: - return "" - + if not res: return "" norm = [] for bbox, txt, conf in res: - if not txt or not txt.strip(): - continue + if not txt or not txt.strip(): continue b = quad_bbox(bbox) - xc = (b[0] + b[2]) / 2.0 - yc = (b[1] + b[3]) / 2.0 - h = max(1.0, b[3] - b[1]) - norm.append((b, txt, conf, xc, yc, h)) - - if not norm: - return "" - - med_h = float(np.median([x[5] for x in norm])) + norm.append((b, txt, conf, (b[0]+b[2])/2.0, (b[1]+b[3])/2.0, max(1.0, b[3]-b[1]))) + if not norm: return "" + med_h = float(np.median([x[5] for x in norm])) row_tol = max(6.0, med_h * 0.75) - norm.sort(key=lambda z: z[4]) rows = [] for it in norm: @@ -1413,58 +1450,41 @@ def rebuild_text_from_vision_result(res): if abs(it[4] - r["yc"]) <= row_tol: r["m"].append(it) r["yc"] = float(np.mean([k[4] for k in r["m"]])) - placed = True - break - if not placed: - rows.append({"yc": it[4], "m": [it]}) - + placed = True; break + if not placed: rows.append({"yc": it[4], "m": [it]}) rows.sort(key=lambda r: r["yc"]) - lines = [] - for r in rows: - mem = sorted(r["m"], key=lambda z: z[3]) - line = normalize_text(" ".join(x[1] for x in mem)) - if line: - lines.append(line) - - return normalize_text(" ".join(lines)) + lines = [normalize_text(" ".join(x[1] for x in sorted(r["m"], key=lambda z: z[3]))) for r in rows] + return normalize_text(" ".join(filter(None, lines))) def reread_bubble_with_vision(image_bgr, bbox_xyxy, vision_detector, upscale=3.0, pad=24): ih, iw = image_bgr.shape[:2] x1, y1, x2, y2 = bbox_xyxy - x1 = max(0, int(x1 - pad)); y1 = max(0, int(y1 - pad)) - x2 = min(iw, int(x2 + pad)); y2 = min(ih, int(y2 + pad)) - + x1, y1 = max(0, int(x1-pad)), max(0, int(y1-pad)) + x2, y2 = min(iw, int(x2+pad)), min(ih, int(y2+pad)) crop = image_bgr[y1:y2, x1:x2] - if crop.size == 0: - return None, 0.0, "none" + if crop.size == 0: return None, 0.0, "none" - modes = ["raw", "clahe", "adaptive", "otsu", "invert", "bilateral", "morph_open"] + modes = ["raw", "clahe", "adaptive", "otsu", "invert", "bilateral", "morph_open"] angles = [0.0, 1.5, -1.5] - best_v_txt, best_v_sc = "", 0.0 - up0 = cv2.resize(crop, (int(crop.shape[1] * upscale), int(crop.shape[0] * upscale)), interpolation=cv2.INTER_CUBIC) + up0 = cv2.resize(crop, (int(crop.shape[1]*upscale), int(crop.shape[0]*upscale)), + interpolation=cv2.INTER_CUBIC) for mode in modes: - proc = preprocess_variant(up0, mode) + proc = preprocess_variant(up0, mode) proc3 = cv2.cvtColor(proc, cv2.COLOR_GRAY2BGR) if len(proc.shape) == 2 else proc for a in angles: rot = rotate_image_keep_bounds(proc3, a) - - # Use run_vision_ocr if available (enhanced detector) - if hasattr(vision_detector, 'run_vision_ocr'): - res = vision_detector.run_vision_ocr(rot) - else: - res = vision_detector.read(rot) - + res = (vision_detector.run_vision_ocr(rot) + if hasattr(vision_detector, 'run_vision_ocr') + else vision_detector.read(rot)) txt = rebuild_text_from_vision_result(res) - sc = ocr_candidate_score(txt) + sc = ocr_candidate_score(txt) if sc > best_v_sc: best_v_txt, best_v_sc = txt, sc - if best_v_txt: - return best_v_txt, best_v_sc, "vision-reread" - + if best_v_txt: return best_v_txt, best_v_sc, "vision-reread" return None, 0.0, "none" @@ -1472,202 +1492,128 @@ def reread_bubble_with_vision(image_bgr, bbox_xyxy, vision_detector, upscale=3.0 # LINES + BUBBLES # ============================================================ def build_lines_from_indices(indices, ocr): - if not indices: - return [] + if not indices: return [] items = [] for i in indices: b = quad_bbox(ocr[i][0]) - xc = (b[0] + b[2]) / 2.0 - yc = (b[1] + b[3]) / 2.0 - h = max(1.0, b[3] - b[1]) - items.append((i, b, xc, yc, h)) - - med_h = float(np.median([it[4] for it in items])) if items else 10.0 + items.append((i, b, (b[0]+b[2])/2.0, (b[1]+b[3])/2.0, max(1.0, b[3]-b[1]))) + med_h = float(np.median([it[4] for it in items])) if items else 10.0 row_tol = max(6.0, med_h * 0.75) - items.sort(key=lambda x: x[3]) rows = [] for it in items: - i, b, xc, yc, h = it placed = False for r in rows: - if abs(yc - r["yc"]) <= row_tol: - r["m"].append((i, b, xc, yc)) - r["yc"] = float(np.mean([k[3] for k in r["m"]])) - placed = True - break - if not placed: - rows.append({"yc": yc, "m": [(i, b, xc, yc)]}) - - rows.sort(key=lambda r: r["yc"]) - lines = [] - for r in rows: - mem = sorted(r["m"], key=lambda z: z[2]) - txt = normalize_text(" ".join(ocr[i][1] for i, _, _, _ in mem)) - if txt and not is_noise_text(txt): - lines.append(txt) - return lines - - -def build_line_boxes_from_indices(indices, ocr, image_shape=None): - if not indices: - return [] - - items = [] - for i in indices: - b = quad_bbox(ocr[i][0]) - txt = normalize_text(ocr[i][1]) - if is_noise_text(txt): - continue - xc = (b[0] + b[2]) / 2.0 - yc = (b[1] + b[3]) / 2.0 - h = max(1.0, b[3] - b[1]) - items.append({"i": i, "b": b, "txt": txt, "xc": xc, "yc": yc, "h": h}) - - if not items: - return [] - - med_h = float(np.median([it["h"] for it in items])) - row_tol = max(6.0, med_h * 0.90) - gap_x_tol = max(8.0, med_h * 1.25) - pad = max(2, int(round(med_h * 0.14))) - - rows = [] - for it in sorted(items, key=lambda x: x["yc"]): - placed = False - for r in rows: - if abs(it["yc"] - r["yc"]) <= row_tol: + if abs(it[3] - r["yc"]) <= row_tol: r["m"].append(it) - r["yc"] = float(np.mean([k["yc"] for k in r["m"]])) - placed = True - break - if not placed: - rows.append({"yc": it["yc"], "m": [it]}) - + r["yc"] = float(np.mean([k[3] for k in r["m"]])) + placed = True; break + if not placed: rows.append({"yc": it[3], "m": [it]}) rows.sort(key=lambda r: r["yc"]) - out_boxes = [] - - for r in rows: - mem = sorted(r["m"], key=lambda z: z["xc"]) - if not mem: - continue - - chunks = [] - cur = [mem[0]] - for t in mem[1:]: - prev = cur[-1]["b"] - b = t["b"] - gap = b[0] - prev[2] - if gap <= gap_x_tol: - cur.append(t) - else: - chunks.append(cur) - cur = [t] - chunks.append(cur) - - for ch in chunks: - ub = boxes_union_xyxy([x["b"] for x in ch]) - if ub: - x1, y1, x2, y2 = ub - out_boxes.append((x1 - pad, y1 - int(round(pad * 1.2)), x2 + pad, y2 + int(round(pad * 0.9)))) - - if image_shape is not None: - ih, iw = image_shape[:2] - clamped = [] - for b in out_boxes: - x1 = max(0, int(b[0])); y1 = max(0, int(b[1])) - x2 = min(iw - 1, int(b[2])); y2 = min(ih - 1, int(b[3])) - if x2 > x1 and y2 > y1: - clamped.append((x1, y1, x2, y2)) - out_boxes = clamped - - out_boxes.sort(key=lambda z: (z[1], z[0])) - return out_boxes + return [normalize_text(" ".join(ocr[i][1] for i,_,_,_,_ in sorted(r["m"], key=lambda z: z[2]))) + for r in rows if r["m"]] def auto_gap(image_path, base=18, ref_w=750): img = cv2.imread(image_path) - if img is None: - return base - return base * (img.shape[1] / ref_w) + return base * (img.shape[1] / ref_w) if img is not None else base -def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=1): +def group_tokens_vertical(ocr, image_shape, gap_px=18, bbox_padding=1, strict_mode=False): + """ + Groups OCR quads into bubble candidates. + + Generic protections applied: + - orientation_compatible(): prevents tall/narrow glyphs merging with wide text lines. + - Horizontal gap guard: prevents side-by-side column quads from merging. + - detect_horizontal_gap_in_group(): post-merge split for groups with large internal gaps. + - Orientation check in secondary merge pass. + """ n = len(ocr) - if n == 0: - return {}, {}, {}, {} + if n == 0: return {}, {}, {}, {} - boxes = [quad_bbox(r[0]) for r in ocr] + boxes = [quad_bbox(r[0]) for r in ocr] centers = [quad_center(r[0]) for r in ocr] - hs = [max(1.0, b[3] - b[1]) for b in boxes] - med_h = float(np.median(hs)) if hs else 12.0 - dist_thresh = max(20.0, med_h * 1.8) - adaptive_gap_y = max(gap_px, med_h * 2.5) + hs = [max(1.0, b[3]-b[1]) for b in boxes] + med_h = float(np.median(hs)) if hs else 12.0 - p = list(range(n)) + max_vertical_gap = med_h * 2.5 if not strict_mode else med_h * 2.0 + max_horizontal_offset = med_h * 1.8 - def find(x): - while p[x] != x: - p[x] = p[p[x]] - x = p[x] - return x + sorted_indices = sorted(range(n), key=lambda i: (centers[i][1], centers[i][0])) + groups, used = [], set() - def unite(a, b): - p[find(a)] = find(b) + for i in sorted_indices: + if i in used: continue + current_group = [i] + used.add(i) + cx_i, cy_i = centers[i] - for i in range(n): - for j in range(i + 1, n): - ax1, ay1, ax2, ay2 = boxes[i] - bx1, by1, bx2, by2 = boxes[j] - gap_x = max(0, max(ax1, bx1) - min(ax2, bx2)) - gap_y = max(0, max(ay1, by1) - min(ay2, by2)) + for j in sorted_indices: + if j in used or j == i: continue + cx_j, cy_j = centers[j] + if cy_j <= cy_i: continue + if abs(cx_i - cx_j) > max_horizontal_offset: continue - cx1, cy1 = centers[i] - cx2, cy2 = centers[j] - is_vertically_aligned = abs(cx1 - cx2) < (med_h * 1.5) + # Horizontal gap guard + gap_x = max(0, max(boxes[i][0], boxes[j][0]) - min(boxes[i][2], boxes[j][2])) + if gap_x > med_h * 1.5: continue - if gap_x == 0 and gap_y <= (med_h * 3.5): - unite(i, j); continue + # Orientation compatibility guard + if not orientation_compatible(i, j, ocr): continue - if is_vertically_aligned and gap_y <= (med_h * 3.2): - unite(i, j); continue + vertical_gap = boxes[j][1] - boxes[current_group[-1]][3] + if vertical_gap <= max_vertical_gap: + current_group.append(j) + used.add(j) + cx_i = (cx_i + cx_j) / 2.0 - if gap_x <= gap_px and gap_y <= adaptive_gap_y: - unite(i, j); continue + if current_group: + groups.append(current_group) - d = ((cx1 - cx2) ** 2 + (cy1 - cy2) ** 2) ** 0.5 - if d <= dist_thresh and abs(cy1 - cy2) <= med_h * 1.5: - unite(i, j) + # Secondary merge pass + merged_groups, used_groups = [], set() + for i, group1 in enumerate(groups): + if i in used_groups: continue + merged = list(group1) + used_groups.add(i) + for j, group2 in enumerate(groups): + if i == j or j in used_groups: continue + if should_merge_groups(merged, group2, ocr, med_h, max_vertical_gap): + compat = all(orientation_compatible(a, b, ocr) + for a in merged for b in group2) + if compat: + merged.extend(group2) + used_groups.add(j) + merged_groups.append(sorted(merged, key=lambda idx: centers[idx][1])) - groups = {} - for i in range(n): - groups.setdefault(find(i), []).append(i) + # Horizontal gap split pass + final_groups = [] + for group in merged_groups: + h_split = detect_horizontal_gap_in_group(group, ocr, med_h, gap_factor=2.5) + if h_split: + lg, rg = h_split + final_groups.append(sorted(lg, key=lambda idx: centers[idx][1])) + final_groups.append(sorted(rg, key=lambda idx: centers[idx][1])) + else: + final_groups.append(group) - sorted_groups = sorted( - groups.values(), - key=lambda idxs: (min(boxes[i][1] for i in idxs), min(boxes[i][0] for i in idxs)) - ) + final_groups.sort(key=lambda g: (min(centers[i][1] for i in g), min(centers[i][0] for i in g))) bubbles, bubble_boxes, bubble_quads, bubble_indices = {}, {}, {}, {} ih, iw = image_shape[:2] - for bid, idxs in enumerate(sorted_groups, start=1): - idxs = sorted(idxs, key=lambda k: boxes[k][1]) + for bid, idxs in enumerate(final_groups, start=1): lines = build_lines_from_indices(idxs, ocr) quads = [ocr[k][0] for k in idxs] - ub = boxes_union_xyxy([quad_bbox(q) for q in quads]) - if ub is None: - continue - + ub = boxes_union_xyxy([quad_bbox(q) for q in quads]) + if ub is None: continue x1, y1, x2, y2 = ub - adaptive_pad = max(1, int(round(med_h * 0.16))) - x1 = max(0, x1 - adaptive_pad); y1 = max(0, y1 - adaptive_pad) - x2 = min(iw - 1, x2 + adaptive_pad); y2 = min(ih - 1, y2 + adaptive_pad) - - bubbles[bid] = lines - bubble_boxes[bid] = (x1, y1, x2, y2) - bubble_quads[bid] = quads - bubble_indices[bid] = idxs + ap = max(1, int(round(med_h * 0.16))) + bubbles[bid] = lines + bubble_boxes[bid] = (max(0,x1-ap), max(0,y1-ap), min(iw-1,x2+ap), min(ih-1,y2+ap)) + bubble_quads[bid] = quads + bubble_indices[bid]= idxs return bubbles, bubble_boxes, bubble_quads, bubble_indices @@ -1675,54 +1621,46 @@ def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=1): # ============================================================ # DEBUG / EXPORT # ============================================================ -def save_debug_clusters(image_path, ocr, bubble_boxes, bubble_indices, clean_lines=None, out_path="debug_clusters.png"): +def save_debug_clusters(image_path, ocr, bubble_boxes, bubble_indices, + clean_lines=None, out_path="debug_clusters.png"): img = cv2.imread(image_path) - if img is None: - return + if img is None: return for bbox, txt, conf in ocr: pts = np.array(bbox, dtype=np.int32) - cv2.fillPoly(img, [pts], (255, 255, 255)) - cv2.polylines(img, [pts], True, (180, 180, 180), 1) + cv2.fillPoly(img, [pts], (255,255,255)) + cv2.polylines(img, [pts], True, (180,180,180), 1) for bid, bb in bubble_boxes.items(): x1, y1, x2, y2 = bb - cv2.rectangle(img, (x1, y1), (x2, y2), (0, 220, 0), 2) - cv2.putText(img, f"BOX#{bid}", (x1 + 2, max(15, y1 + 16)), - cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 220, 0), 2) + is_isolated = len(bubble_indices.get(bid, [])) == 1 + color = (255,165,0) if is_isolated else (0,220,0) + thickness = 3 if is_isolated else 2 + cv2.rectangle(img, (x1,y1), (x2,y2), color, thickness) + label = f"BOX#{bid}" + (" (ISOLATED)" if is_isolated else "") + cv2.putText(img, label, (x1+2, max(15, y1+16)), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2) if clean_lines and bid in clean_lines: - text = clean_lines[bid] + text = clean_lines[bid] words = text.split() - lines = [] - cur = "" + lines, cur = [], "" for w in words: - if len(cur) + len(w) < 25: - cur += w + " " - else: - lines.append(cur.strip()) - cur = w + " " - if cur: - lines.append(cur.strip()) - + if len(cur) + len(w) < 25: cur += w + " " + else: lines.append(cur.strip()); cur = w + " " + if cur: lines.append(cur.strip()) y_text = y2 + 18 for line in lines: - cv2.putText(img, line, (x1, y_text), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 3) - cv2.putText(img, line, (x1, y_text), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1) + cv2.putText(img, line, (x1, y_text), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0,0,0), 3) + cv2.putText(img, line, (x1, y_text), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255,0,0), 1) y_text += 18 cv2.imwrite(out_path, img) def estimate_reading_order(bbox_dict, mode="ltr"): - items = [] - for bid, (x1, y1, x2, y2) in bbox_dict.items(): - cx = (x1 + x2) / 2.0 - cy = (y1 + y2) / 2.0 - items.append((bid, cx, cy)) - + items = [(bid, (bb[0]+bb[2])/2.0, (bb[1]+bb[3])/2.0) for bid, bb in bbox_dict.items()] items.sort(key=lambda t: t[2]) - rows, tol = [], 90 for it in items: placed = False @@ -1730,50 +1668,14 @@ def estimate_reading_order(bbox_dict, mode="ltr"): if abs(it[2] - r["cy"]) <= tol: r["items"].append(it) r["cy"] = float(np.mean([x[2] for x in r["items"]])) - placed = True - break - if not placed: - rows.append({"cy": it[2], "items": [it]}) - + placed = True; break + if not placed: rows.append({"cy": it[2], "items": [it]}) rows.sort(key=lambda r: r["cy"]) order = [] for r in rows: r["items"].sort(key=lambda x: x[1], reverse=(mode == "rtl")) order.extend([z[0] for z in r["items"]]) - - return {bid: i + 1 for i, bid in enumerate(order)} - - -def export_bubbles(filepath, bbox_dict, quads_dict, indices_dict, ocr, reading_map, image_shape): - out = {} - for bid, bb in bbox_dict.items(): - x1, y1, x2, y2 = bb - quads = quads_dict.get(bid, []) - idxs = indices_dict.get(bid, []) - - qboxes = [quad_bbox(q) for q in quads] - text_union = boxes_union_xyxy(qboxes) - - line_boxes_xyxy = build_line_boxes_from_indices(idxs, ocr, image_shape=image_shape) - line_union_xyxy = boxes_union_xyxy(line_boxes_xyxy) - line_union_area = bbox_area_xyxy(line_union_xyxy) - - out[str(bid)] = { - "x": int(x1), "y": int(y1), "w": int(x2 - x1), "h": int(y2 - y1), - "reading_order": int(reading_map.get(bid, bid)), - "quad_bboxes": [ - {"x": int(b[0]), "y": int(b[1]), "w": int(b[2] - b[0]), "h": int(b[3] - b[1])} - for b in qboxes - ], - "quads": [[[int(p[0]), int(p[1])] for p in q] for q in quads], - "text_bbox": xyxy_to_xywh(text_union), - "line_bboxes": [xyxy_to_xywh(lb) for lb in line_boxes_xyxy], - "line_union_bbox": xyxy_to_xywh(line_union_xyxy) if line_union_xyxy else None, - "line_union_area": int(line_union_area), - } - - with open(filepath, "w", encoding="utf-8") as f: - json.dump(out, f, indent=2, ensure_ascii=False) + return {bid: i+1 for i, bid in enumerate(order)} # ============================================================ @@ -1786,24 +1688,24 @@ def translate_manga_text( confidence_threshold=0.03, min_text_length=1, gap_px="auto", - filter_sound_effects=True, quality_threshold=0.62, export_to_file="output.txt", export_bubbles_to="bubbles.json", reading_mode="ltr", debug=True, - use_enhanced_ocr=True + use_enhanced_ocr=True, + strict_grouping=True, + max_box_width_ratio=0.6, + max_box_height_ratio=0.5, + auto_fix_bubbles=True ): image = cv2.imread(image_path) if image is None: - print(f"❌ Cannot load image: {image_path}") - return + print(f"❌ Cannot load image: {image_path}"); return resolved_gap = auto_gap(image_path) if gap_px == "auto" else float(gap_px) - print("Loading OCR engines...") - - # Use enhanced detector + if use_enhanced_ocr: detector = ImprovedMacVisionDetector(source_lang=source_lang) print("🚀 Using Enhanced Multi-Pass OCR") @@ -1813,308 +1715,285 @@ def translate_manga_text( print("Running detection OCR (Apple Vision)...") raw = detector.read(image_path) print(f"Raw detections: {len(raw)}") - - # Secondary pass for missed regions + if use_enhanced_ocr: existing_quads = [r[0] for r in raw] missed_regions = detect_small_text_regions(image, existing_quads) - if missed_regions: print(f"🔍 Found {len(missed_regions)} potentially missed text regions") - - # Re-run OCR on missed regions with higher upscaling for region in missed_regions: x1, y1, x2, y2 = region - # Add padding pad = 10 - x1 = max(0, x1 - pad) - y1 = max(0, y1 - pad) - x2 = min(image.shape[1], x2 + pad) - y2 = min(image.shape[0], y2 + pad) - + x1, y1 = max(0, x1-pad), max(0, y1-pad) + x2, y2 = min(image.shape[1], x2+pad), min(image.shape[0], y2+pad) crop = image[y1:y2, x1:x2] if crop.size > 0: - # Aggressive upscaling for small text - upscaled = cv2.resize(crop, None, fx=4.0, fy=4.0, interpolation=cv2.INTER_CUBIC) - region_results = detector.run_vision_ocr(upscaled) - - # Scale back and offset coordinates - for quad, text, conf in region_results: - scaled_quad = [[int(p[0]/4.0 + x1), int(p[1]/4.0 + y1)] for p in quad] - raw.append((scaled_quad, text, conf)) - + upscaled = cv2.resize(crop, None, fx=4.0, fy=4.0, + interpolation=cv2.INTER_CUBIC) + for quad, text, conf in detector.run_vision_ocr(upscaled): + raw.append(([[int(p[0]/4.0+x1), int(p[1]/4.0+y1)] for p in quad], + text, conf)) print(f"📝 Total detections after missed region scan: {len(raw)}") - filtered = [] - skipped = 0 + filtered, skipped = [], 0 ih, iw = image.shape[:2] for bbox, text, conf in raw: - t = normalize_text(text) + t = normalize_text(text) qb = quad_bbox(bbox) - - if conf < confidence_threshold: + if conf < confidence_threshold: skipped += 1; continue + if len(t) < min_text_length: skipped += 1; continue + if not is_valid_language(t, source_lang): skipped += 1; continue + if not is_meaningful_text(t, source_lang):skipped += 1; continue + if qb[1] < int(ih * TOP_BAND_RATIO) and conf < 0.70 and len(t) >= 5: skipped += 1; continue - if len(t) < min_text_length: - skipped += 1; continue - if is_noise_text(t): - skipped += 1; continue - if filter_sound_effects and is_sound_effect(t): - skipped += 1; continue - if is_title_text(t): - skipped += 1; continue - if qb[1] < int(ih * TOP_BAND_RATIO): - if conf < 0.70 and len(t) >= 5: - skipped += 1; continue - filtered.append((bbox, t, conf)) print(f"Kept: {len(filtered)} | Skipped: {skipped}") if not filtered: - print("⚠️ No text after filtering.") - return + print("⚠️ No text after filtering."); return + + # ── Pre-grouping quad splits ────────────────────────────────────────── + filtered, oversized_splits = validate_and_split_oversized_quads(image, filtered) + if oversized_splits > 0: + print(f"📐 Split {oversized_splits} oversized quad(s) before grouping") - # 1) split obvious wide OCR merges filtered, splits_made = split_wide_ocr_items(image, filtered) if splits_made > 0: print(f"✂️ Split {splits_made} wide OCR lines across column gaps.") - # 2) split giant bridge quads filtered, bridge_splits = split_abnormal_bridge_quads(image, filtered) if bridge_splits > 0: print(f"🧩 Split {bridge_splits} abnormal bridge OCR quad(s).") - # 3) shrink quads to tighter text footprint + # ── Column-gap split: catches BOX#6 type wide quads spanning two columns ── + hs_pre = [max(1, quad_bbox(q)[3]-quad_bbox(q)[1]) for q, _, _ in filtered] + med_h_pre = float(np.median(hs_pre)) if hs_pre else 14.0 + filtered, col_splits = apply_column_gap_splits(image, filtered, med_h_pre) + if col_splits > 0: + print(f"📐 Column-gap split: {col_splits} quad(s) split before grouping") + filtered = normalize_ocr_quads(filtered) - bubbles, bubble_boxes, bubble_quads, bubble_indices = group_tokens( - filtered, image.shape, gap_px=resolved_gap, bbox_padding=1 + print("📊 Grouping quads vertically...") + bubbles, bubble_boxes, bubble_quads, bubble_indices = group_tokens_vertical( + filtered, image.shape, gap_px=resolved_gap, bbox_padding=1, strict_mode=strict_grouping + ) + print(f" Created {len(bubbles)} initial box(es)") + + if auto_fix_bubbles: + bubbles, bubble_boxes, bubble_quads, bubble_indices = auto_fix_bubble_detection( + bubble_boxes, bubble_indices, bubble_quads, bubbles, filtered, image + ) + + bubbles, bubble_boxes, bubble_quads, bubble_indices = enforce_max_box_size( + bubble_boxes, bubble_indices, bubble_quads, bubbles, filtered, + max_width_ratio=max_box_width_ratio, + max_height_ratio=max_box_height_ratio, + image_shape=image.shape ) - # merge accidental sibling fragments bubbles, bubble_boxes, bubble_quads, bubble_indices = merge_close_bubbles_by_line_height( bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered ) - # Apply page-specific fixes - page_identifier = os.path.basename(image_path) - bubbles, bubble_boxes, bubble_quads, bubble_indices = apply_page_specific_fixes( - bubbles, bubble_boxes, bubble_quads, bubble_indices, - filtered, image, page_identifier - ) - new_bubbles, new_bubble_boxes, new_bubble_quads, new_bubble_indices = {}, {}, {}, {} - next_bid = max(bubbles.keys()) + 1 if bubbles else 1 + next_bid = max(bubbles.keys()) + 1 if bubbles else 1 splits_performed = [] for bid in list(bubbles.keys()): - box = bubble_boxes[bid] + box = bubble_boxes[bid] bubble_split = None if is_vertical_text_like(bubble_indices[bid], filtered): - vgap_split = split_cluster_by_big_vertical_gap(bubble_indices[bid], filtered, factor=1.7, min_gap=18) - if vgap_split: - bubble_split = vgap_split - splits_performed.append(f"BOX#{bid} (vertical-stack y-gap split)") + vgap = split_cluster_by_big_vertical_gap(bubble_indices[bid], filtered, + factor=1.7, min_gap=18) + if vgap: + bubble_split = vgap + splits_performed.append(f"BOX#{bid} (vertical-stack y-gap)") if bubble_split is None: - split_result = split_panel_box(image, box, bubble_quads=bubble_quads[bid]) - if split_result: - _, _, split_x = split_result - left_idxs, right_idxs = [], [] - for idx in bubble_indices[bid]: - cx, cy = quad_center(filtered[idx][0]) - if cx < split_x: - left_idxs.append(idx) - else: - right_idxs.append(idx) - - if left_idxs and right_idxs: - bubble_split = (left_idxs, right_idxs) - splits_performed.append(f"BOX#{bid} (panel border at x={split_x})") + sr = split_panel_box(image, box, bubble_quads=bubble_quads[bid]) + if sr: + _, _, split_x = sr + li = [idx for idx in bubble_indices[bid] + if quad_center(filtered[idx][0])[0] < split_x] + ri = [idx for idx in bubble_indices[bid] + if quad_center(filtered[idx][0])[0] >= split_x] + if li and ri: + bubble_split = (li, ri) + splits_performed.append(f"BOX#{bid} (panel border)") elif len(bubble_quads[bid]) >= 4: - col_split = split_bubble_if_multiple_columns(bubble_indices[bid], filtered, bid=bid, use_aggressive_thresholds=True) - if col_split: - l, r = col_split - if l and r: - bubble_split = (l, r) - splits_performed.append(f"BOX#{bid} ({len(l)} quads | {len(r)} quads)") + cs = split_bubble_if_multiple_columns(bubble_indices[bid], filtered, + bid=bid, use_aggressive_thresholds=True) + if cs: + bubble_split = cs + splits_performed.append(f"BOX#{bid} (aggressive column)") if bubble_split is None: - col_split = split_bubble_if_multiple_columns(bubble_indices[bid], filtered, bid=bid) - if col_split: - l, r = col_split - if l and r: - bubble_split = (l, r) - splits_performed.append(f"BOX#{bid} (Vertical Column Split: {len(l)} | {len(r)} quads)") + cs = split_bubble_if_multiple_columns(bubble_indices[bid], filtered, bid=bid) + if cs: + bubble_split = cs + splits_performed.append(f"BOX#{bid} (vertical column)") if bubble_split is None: - nested_split = split_nested_or_side_by_side(bubble_indices[bid], filtered) - if nested_split: - l, r = nested_split - if l and r: - bubble_split = (l, r) - splits_performed.append(f"BOX#{bid} (nested/side-by-side forced split)") + ns = split_nested_or_side_by_side(bubble_indices[bid], filtered) + if ns: + bubble_split = ns + splits_performed.append(f"BOX#{bid} (nested/side-by-side)") if bubble_split is None: - row_split = split_bubble_if_multiple_rows(bubble_indices[bid], filtered, bid=bid) - if row_split: - t, b = row_split - if t and b: - bubble_split = (t, b) - splits_performed.append(f"BOX#{bid} (Horizontal Row Split: {len(t)} | {len(b)} quads)") + rs = split_bubble_if_multiple_rows(bubble_indices[bid], filtered, bid=bid) + if rs: + bubble_split = rs + splits_performed.append(f"BOX#{bid} (horizontal row)") if bubble_split is None: - gy = split_cluster_by_big_vertical_gap(bubble_indices[bid], filtered, factor=1.9, min_gap=22) + gy = split_cluster_by_big_vertical_gap(bubble_indices[bid], filtered, + factor=1.9, min_gap=22) if gy: - a, b = gy - bubble_split = (a, b) - splits_performed.append(f"BOX#{bid} (large vertical-gap split)") + bubble_split = gy + splits_performed.append(f"BOX#{bid} (large vertical-gap)") if bubble_split: - part1_idxs, part2_idxs = bubble_split - - new_bubbles[bid] = build_lines_from_indices(part1_idxs, filtered) - ub_1 = boxes_union_xyxy([quad_bbox(filtered[i][0]) for i in part1_idxs]) - new_bubble_boxes[bid] = (max(0, ub_1[0]-2), max(0, ub_1[1]-2), min(iw-1, ub_1[2]+2), min(ih-1, ub_1[3]+2)) - new_bubble_quads[bid] = [filtered[i][0] for i in part1_idxs] - new_bubble_indices[bid] = part1_idxs - - new_bubbles[next_bid] = build_lines_from_indices(part2_idxs, filtered) - ub_2 = boxes_union_xyxy([quad_bbox(filtered[i][0]) for i in part2_idxs]) - new_bubble_boxes[next_bid] = (max(0, ub_2[0]-2), max(0, ub_2[1]-2), min(iw-1, ub_2[2]+2), min(ih-1, ub_2[3]+2)) - new_bubble_quads[next_bid] = [filtered[i][0] for i in part2_idxs] - new_bubble_indices[next_bid] = part2_idxs + p1, p2 = bubble_split + for part_idxs, part_bid in [(p1, bid), (p2, next_bid)]: + ub = boxes_union_xyxy([quad_bbox(filtered[i][0]) for i in part_idxs]) + new_bubbles[part_bid] = build_lines_from_indices(part_idxs, filtered) + new_bubble_boxes[part_bid] = (max(0,ub[0]-2), max(0,ub[1]-2), + min(iw-1,ub[2]+2), min(ih-1,ub[3]+2)) + new_bubble_quads[part_bid] = [filtered[i][0] for i in part_idxs] + new_bubble_indices[part_bid] = part_idxs next_bid += 1 else: - new_bubbles[bid] = bubbles[bid] - new_bubble_boxes[bid] = bubble_boxes[bid] - new_bubble_quads[bid] = bubble_quads[bid] + new_bubbles[bid] = bubbles[bid] + new_bubble_boxes[bid] = bubble_boxes[bid] + new_bubble_quads[bid] = bubble_quads[bid] new_bubble_indices[bid] = bubble_indices[bid] if splits_performed: - print(f"\n🔀 Multi-column/row bubble splits detected: {len(splits_performed)}") - for split_info in splits_performed: - print(f" ✓ Split {split_info}") + print(f"\n🔀 Splits detected: {len(splits_performed)}") - bubbles = new_bubbles - bubble_boxes = new_bubble_boxes - bubble_quads = new_bubble_quads - bubble_indices = new_bubble_indices + bubbles, bubble_boxes, bubble_quads, bubble_indices = remove_nested_boxes( + new_bubble_boxes, new_bubble_indices, new_bubble_quads, new_bubbles, + overlap_threshold=0.50 + ) + print(f"✅ Final box count: {len(bubbles)}") - translator = GoogleTranslator(source=source_lang, target=target_lang) - - clean_lines: Dict[int, str] = {} + # ── OCR quality pass ────────────────────────────────────────────────── + translator = GoogleTranslator(source=source_lang, target=target_lang) + clean_lines: Dict[int, str] = {} sources_used: Dict[int, str] = {} + translations: Dict[int, str] = {} for bid, lines in bubbles.items(): base_txt = normalize_text(" ".join(lines)) - base_sc = ocr_candidate_score(base_txt) - - txt = base_txt - src_used = "vision-base" - + base_sc = ocr_candidate_score(base_txt) + txt, src_used = base_txt, "vision-base" if base_sc < quality_threshold: rr_txt, rr_sc, rr_src = reread_bubble_with_vision( - image_bgr=image, - bbox_xyxy=bubble_boxes[bid], - vision_detector=detector, - upscale=3.0, - pad=24 - ) - if rr_txt and rr_sc > base_sc + 0.04: - txt = rr_txt - src_used = rr_src - - txt = txt.replace(" BOMPORTA", " IMPORTA") - txt = txt.replace(" TESTO ", " ESTO ") - txt = txt.replace(" MIVERDAD", " MI VERDAD") - - clean_lines[bid] = apply_glossary(normalize_text(txt)) + image, bubble_boxes[bid], detector, upscale=3.0, pad=24) + if rr_txt and rr_sc > base_sc + 0.04 and is_valid_language(rr_txt, source_lang): + txt, src_used = rr_txt, rr_src + clean_lines[bid] = normalize_text(txt) sources_used[bid] = src_used reading_map = estimate_reading_order(bubble_boxes, mode=reading_mode) + # ── Single-pass translation cache ──────────────────────────────────── + for bid in sorted(clean_lines.keys(), key=lambda x: reading_map.get(x, x)): + src_txt = clean_lines[bid].strip() + if not src_txt: continue + if not is_valid_language(src_txt, source_lang): continue + if not is_meaningful_text(src_txt, source_lang): continue + try: + tgt = translator.translate(src_txt) or "" + tgt = postprocess_translation_general(tgt).upper() + except Exception as e: + tgt = f"[Error: {e}]" + translations[bid] = tgt + if debug: - save_debug_clusters( - image_path=image_path, - ocr=filtered, - bubble_boxes=bubble_boxes, - bubble_indices=bubble_indices, - clean_lines=clean_lines, - out_path="debug_clusters.png" - ) + save_debug_clusters(image_path, filtered, bubble_boxes, bubble_indices, + clean_lines, "debug_clusters.png") - divider = "─" * 120 + # ── Text output ─────────────────────────────────────────────────────── + divider = "─" * 120 out_lines = ["BUBBLE|ORDER|OCR_SOURCE|ORIGINAL|TRANSLATED|FLAGS", divider] - - print(divider) - print(f"{'BUBBLE':<8} {'ORDER':<6} {'SOURCE':<12} {'ORIGINAL':<40} {'TRANSLATED':<40} FLAGS") - print(divider) + print(divider + f"\n{'BUBBLE':<8} {'ORDER':<6} {'SOURCE':<12} " + f"{'ORIGINAL':<40} {'TRANSLATED':<40} FLAGS\n" + divider) translated_count = 0 for bid in sorted(clean_lines.keys(), key=lambda x: reading_map.get(x, x)): src_txt = clean_lines[bid].strip() - if not src_txt: - continue + if not src_txt: continue + if not is_valid_language(src_txt, source_lang): continue + if not is_meaningful_text(src_txt, source_lang): continue - flags = [] - try: - tgt = translator.translate(src_txt) or "" - except Exception as e: - tgt = f"[Translation error: {e}]" - flags.append("TRANSLATION_ERROR") - - tgt = apply_glossary(postprocess_translation_general(tgt)).upper() - src_u = src_txt.upper() + flags = [] + tgt = translations.get(bid, "") + if not tgt: flags.append("NO_TRANSLATION") + src_u = src_txt.upper() src_engine = sources_used.get(bid, "unknown") - out_lines.append( - f"#{bid}|{reading_map.get(bid, bid)}|{src_engine}|{src_u}|{tgt}|{','.join(flags) if flags else '-'}" - ) - - print( - f"#{bid:<7} {reading_map.get(bid, bid):<6} {src_engine:<12} " - f"{src_u[:40]:<40} {tgt[:40]:<40} {','.join(flags) if flags else '-'}" - ) + out_lines.append(f"#{bid}|{reading_map.get(bid,bid)}|{src_engine}|{src_u}|{tgt}|" + f"{','.join(flags) if flags else '-'}") + print(f"#{bid:<7} {reading_map.get(bid,bid):<6} {src_engine:<12} " + f"{src_u[:40]:<40} {tgt[:40]:<40} {','.join(flags) if flags else '-'}") translated_count += 1 - out_lines.append(divider) - out_lines.append(f"✅ Done! {translated_count} bubble(s) translated, {skipped} detection(s) skipped.") - + out_lines.append(divider + f"\n✅ Done! {translated_count} bubble(s) translated.") with open(export_to_file, "w", encoding="utf-8") as f: f.write("\n".join(out_lines)) - export_bubbles( - export_bubbles_to, - bbox_dict=bubble_boxes, - quads_dict=bubble_quads, - indices_dict=bubble_indices, - ocr=filtered, - reading_map=reading_map, - image_shape=image.shape - ) + # ── bubbles.json ────────────────────────────────────────────────────── + bubbles_payload = {} + for bid in sorted(clean_lines.keys(), key=lambda x: reading_map.get(x, x)): + src_txt = clean_lines[bid].strip() + if not src_txt: continue + if not is_valid_language(src_txt, source_lang): continue + if not is_meaningful_text(src_txt, source_lang): continue + box = bubble_boxes.get(bid) + tgt = translations.get(bid, "") + bubbles_payload[str(bid)] = { + "order": reading_map.get(bid, bid), + "ocr_source": sources_used.get(bid, "unknown"), + "original": src_txt.upper(), + "translated": tgt, + "box": { + "x": box[0] if box else 0, + "y": box[1] if box else 0, + "w": (box[2]-box[0]) if box else 0, + "h": (box[3]-box[1]) if box else 0, + }, + "lines": [line.upper() for line in bubbles.get(bid, [])], + } - print(divider) - print(f"Saved: {export_to_file}") - print(f"Saved: {export_bubbles_to}") - if debug: - print("Saved: debug_clusters.png") + with open(export_bubbles_to, "w", encoding="utf-8") as f: + json.dump(bubbles_payload, f, ensure_ascii=False, indent=2) + + print(divider + f"\nSaved: {export_to_file}\nSaved: {export_bubbles_to}") +# ============================================================ +# ENTRY POINT +# ============================================================ if __name__ == "__main__": translate_manga_text( - image_path="16.jpg", + image_path="17.jpg", source_lang="english", target_lang="ca", - confidence_threshold=0.03, # Lower threshold for better detection + confidence_threshold=0.03, min_text_length=1, gap_px="auto", - filter_sound_effects=True, quality_threshold=0.62, export_to_file="output.txt", export_bubbles_to="bubbles.json", - reading_mode="ltr", #rtl or + reading_mode="rtl", debug=True, - use_enhanced_ocr=True # Enable enhanced multi-pass OCR - ) \ No newline at end of file + use_enhanced_ocr=True, + strict_grouping=True, + max_box_width_ratio=0.6, + max_box_height_ratio=0.5, + auto_fix_bubbles=True + ) diff --git a/pipeline-translator.py b/pipeline-translator.py index c1b9e9d..a64ad6f 100644 --- a/pipeline-translator.py +++ b/pipeline-translator.py @@ -14,10 +14,32 @@ import argparse import importlib.util from pathlib import Path -# ───────────────────────────────────────────── +# ───────────────────────────────────────────────────────────── +# PIPELINE CONFIGURATION +# Single source of truth — mirrors the __main__ block in +# manga-translator.py so both entry points stay in sync. +# ───────────────────────────────────────────────────────────── +PIPELINE_CONFIG = dict( + source_lang = "english", + target_lang = "ca", + confidence_threshold = 0.03, + min_text_length = 1, + gap_px = "auto", + quality_threshold = 0.62, + reading_mode = "rtl", + debug = True, + use_enhanced_ocr = True, + strict_grouping = True, + max_box_width_ratio = 0.6, + max_box_height_ratio = 0.5, + auto_fix_bubbles = True, +) + + +# ───────────────────────────────────────────────────────────── # DYNAMIC MODULE LOADER -# ───────────────────────────────────────────── -def load_module(name, filepath): +# ───────────────────────────────────────────────────────────── +def load_module(name: str, filepath: str): spec = importlib.util.spec_from_file_location(name, filepath) if spec is None or spec.loader is None: raise FileNotFoundError(f"Cannot load spec for {filepath}") @@ -25,103 +47,188 @@ def load_module(name, filepath): spec.loader.exec_module(module) return module -# ───────────────────────────────────────────── + +# ───────────────────────────────────────────────────────────── # HELPERS -# ───────────────────────────────────────────── -def sorted_pages(chapter_dir): +# ───────────────────────────────────────────────────────────── +def sorted_pages(chapter_dir: Path): exts = {".jpg", ".jpeg", ".png", ".webp"} pages = [ - p for p in Path(chapter_dir).iterdir() + p for p in chapter_dir.iterdir() if p.is_file() and p.suffix.lower() in exts ] return sorted(pages, key=lambda p: p.stem) -def make_page_workdir(chapter_dir, page_stem): - workdir = Path(chapter_dir) / "translated" / page_stem + +def make_page_workdir(chapter_dir: Path, page_stem: str) -> Path: + workdir = chapter_dir / "translated" / page_stem workdir.mkdir(parents=True, exist_ok=True) return workdir -# ───────────────────────────────────────────── + +def verify_translator_api(module) -> bool: + """ + Checks that the loaded module exposes translate_manga_text() + and that it accepts all keys defined in PIPELINE_CONFIG. + Prints a warning for any missing parameter so mismatches are + caught immediately rather than silently falling back to defaults. + """ + import inspect + + fn = getattr(module, "translate_manga_text", None) + if fn is None: + print("❌ manga-translator.py does not expose translate_manga_text()") + return False + + sig = inspect.signature(fn) + params = set(sig.parameters.keys()) + ok = True + + for key in PIPELINE_CONFIG: + if key not in params: + print(f"⚠️ PIPELINE_CONFIG key '{key}' not found in " + f"translate_manga_text() — update pipeline or translator.") + ok = False + + return ok + + +# ───────────────────────────────────────────────────────────── # PER-PAGE PIPELINE -# ───────────────────────────────────────────── -def process_page(page_path, workdir, translator_module): +# ───────────────────────────────────────────────────────────── +def process_page(page_path: Path, workdir: Path, translator_module) -> bool: print(f"\n{'─' * 70}") - print(f"PAGE: {page_path.name}") + print(f" PAGE : {page_path.name}") print(f"{'─' * 70}") orig_dir = os.getcwd() try: - # Isolate execution to the specific page's folder + # Run inside the page's own workdir so debug images and + # output files land there automatically. os.chdir(workdir) print(" ⏳ Extracting text and translating...") - - # 1) Translate using ONLY the required path arguments. - # This forces the function to use its own internal default variables - # (like source_lang, target_lang, confidence_threshold) directly from manga-translator.py - translator_module.translate_manga_text( - image_path=str(page_path.resolve()), - export_to_file="output.txt", - export_bubbles_to="bubbles.json" - ) - print(" ✅ Translation and OCR data saved successfully") + translator_module.translate_manga_text( + image_path = str(page_path.resolve()), + export_to_file = "output.txt", + export_bubbles_to= "bubbles.json", + **PIPELINE_CONFIG, # ← all settings from the single config dict + ) + + # Sanity-check that the expected outputs were actually written + for fname in ("output.txt", "bubbles.json"): + fpath = workdir / fname + if not fpath.exists() or fpath.stat().st_size == 0: + print(f" ⚠️ {fname} is missing or empty after processing.") + + print(" ✅ Translation and OCR data saved successfully") return True except Exception as e: + import traceback print(f" ❌ Failed: {e}") + traceback.print_exc() return False finally: os.chdir(orig_dir) -# ───────────────────────────────────────────── + +# ───────────────────────────────────────────────────────────── # MAIN -# ───────────────────────────────────────────── +# ───────────────────────────────────────────────────────────── def main(): - parser = argparse.ArgumentParser(description="Manga Translation OCR Batch Pipeline") - parser.add_argument("chapter_dir", help="Path to the folder containing manga pages") + parser = argparse.ArgumentParser( + description="Manga Translation OCR Batch Pipeline" + ) + parser.add_argument( + "chapter_dir", + help="Path to the folder containing manga page images" + ) + parser.add_argument( + "--start", type=int, default=1, + help="Start from this page number (1-based, default: 1)" + ) + parser.add_argument( + "--end", type=int, default=None, + help="Stop after this page number inclusive (default: all)" + ) args = parser.parse_args() chapter_dir = Path(args.chapter_dir).resolve() + if not chapter_dir.is_dir(): + print(f"❌ Not a directory: {chapter_dir}") + sys.exit(1) - print("Loading translator module...") - script_dir = Path(__file__).parent - + # ── Load translator module ──────────────────────────────── + script_dir = Path(__file__).parent + module_path = script_dir / "manga-translator.py" + + if not module_path.exists(): + print(f"❌ manga-translator.py not found in {script_dir}") + sys.exit(1) + + print(f"📦 Loading translator from: {module_path}") try: - translator = load_module("manga_translator", str(script_dir / "manga-translator.py")) + translator = load_module("manga_translator", str(module_path)) except Exception as e: print(f"❌ Could not load manga-translator.py: {e}") sys.exit(1) - pages = sorted_pages(chapter_dir) - if not pages: + # ── API compatibility check ─────────────────────────────── + if not verify_translator_api(translator): + print("❌ Aborting — fix the parameter mismatch above first.") + sys.exit(1) + + # ── Discover pages ──────────────────────────────────────── + all_pages = sorted_pages(chapter_dir) + if not all_pages: print(f"❌ No images found in: {chapter_dir}") sys.exit(1) - print(f"\n📖 Chapter : {chapter_dir.name}") - print(f" Pages : {len(pages)}") - print(" Note : Using translation settings directly from manga-translator.py\n") + # Apply --start / --end slice (1-based, inclusive) + start_idx = max(0, args.start - 1) + end_idx = args.end if args.end is not None else len(all_pages) + pages = all_pages[start_idx:end_idx] + + if not pages: + print(f"❌ No pages in range [{args.start}, {args.end}]") + sys.exit(1) + + # ── Summary header ──────────────────────────────────────── + print(f"\n{'═' * 70}") + print(f" 📖 Chapter : {chapter_dir.name}") + print(f" 📄 Pages : {len(pages)} " + f"(of {len(all_pages)} total, " + f"range {args.start}–{end_idx})") + print(f" 🌐 Lang : {PIPELINE_CONFIG['source_lang']} → " + f"{PIPELINE_CONFIG['target_lang']}") + print(f" 📖 Read order : {PIPELINE_CONFIG['reading_mode'].upper()}") + print(f" 🔍 Enhanced : {PIPELINE_CONFIG['use_enhanced_ocr']}") + print(f"{'═' * 70}\n") succeeded, failed = [], [] for i, page_path in enumerate(pages, start=1): - print(f"[{i}/{len(pages)}] Processing...") + print(f"[{i}/{len(pages)}] {page_path.name}") workdir = make_page_workdir(chapter_dir, page_path.stem) - + if process_page(page_path, workdir, translator): succeeded.append(page_path.name) else: failed.append(page_path.name) + # ── Final report ────────────────────────────────────────── print(f"\n{'═' * 70}") - print("PIPELINE COMPLETE") - print(f"✅ {len(succeeded)} page(s) succeeded") + print(" PIPELINE COMPLETE") + print(f" ✅ {len(succeeded)} page(s) succeeded") if failed: - print(f"❌ {len(failed)} page(s) failed:") - for f in failed: - print(f" • {f}") + print(f" ❌ {len(failed)} page(s) failed:") + for name in failed: + print(f" • {name}") print(f"{'═' * 70}\n") + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/requirements b/requirements deleted file mode 100644 index 492c48d..0000000 --- a/requirements +++ /dev/null @@ -1,79 +0,0 @@ -aistudio-sdk==0.3.8 -annotated-doc==0.0.4 -annotated-types==0.7.0 -anyio==4.13.0 -bce-python-sdk==0.9.70 -beautifulsoup4==4.14.3 -certifi==2026.2.25 -chardet==7.4.3 -charset-normalizer==3.4.7 -click==8.3.2 -colorlog==6.10.1 -crc32c==2.8 -deep-translator==1.11.4 -easyocr==1.7.2 -filelock==3.28.0 -fsspec==2026.3.0 -future==1.0.0 -h11==0.16.0 -hf-xet==1.4.3 -httpcore==1.0.9 -httpx==0.28.1 -huggingface_hub==1.10.2 -idna==3.11 -ImageIO==2.37.3 -imagesize==2.0.0 -Jinja2==3.1.6 -lazy-loader==0.5 -markdown-it-py==4.0.0 -MarkupSafe==3.0.3 -mdurl==0.1.2 -modelscope==1.35.4 -mpmath==1.3.0 -networkx==3.6.1 -ninja==1.13.0 -numpy==1.26.4 -opencv-contrib-python==4.10.0.84 -opencv-python==4.11.0.86 -opencv-python-headless==4.11.0.86 -opt-einsum==3.3.0 -packaging==26.1 -paddleocr==3.4.1 -paddlepaddle==3.3.1 -paddlex==3.4.3 -pandas==3.0.2 -pillow==12.2.0 -prettytable==3.17.0 -protobuf==7.34.1 -psutil==7.2.2 -py-cpuinfo==9.0.0 -pyclipper==1.4.0 -pycryptodome==3.23.0 -pydantic==2.13.1 -pydantic_core==2.46.1 -Pygments==2.20.0 -pypdfium2==5.7.0 -python-bidi==0.6.7 -python-dateutil==2.9.0.post0 -PyYAML==6.0.2 -requests==2.33.1 -rich==15.0.0 -ruamel.yaml==0.19.1 -safetensors==0.7.0 -scikit-image==0.26.0 -scipy==1.17.1 -shapely==2.1.2 -shellingham==1.5.4 -six==1.17.0 -soupsieve==2.8.3 -sympy==1.14.0 -tifffile==2026.3.3 -torch==2.11.0 -torchvision==0.26.0 -tqdm==4.67.3 -typer==0.24.1 -typing-inspection==0.4.2 -typing_extensions==4.15.0 -ujson==5.12.0 -urllib3==2.6.3 -wcwidth==0.6.0