diff --git a/manga-translator.py b/manga-translator.py index 941cc3d..56adde1 100644 --- a/manga-translator.py +++ b/manga-translator.py @@ -125,35 +125,85 @@ _MANGA_INTERJECTIONS = { 'MORNING', 'MORNING.', } +# ============================================================ +# PROTECTED TOKENS / SHORT DIALOGUE SAFETY NET +# ============================================================ +PROTECTED_SHORT_TOKENS = { + "HUH", "HUH?", "HUH??", "HUH?!", + "OH", "OH!", "OOH", "OOH!", + "AH", "AH!", "UH", "UH...", + "HEY", "HEY!", "EH", "EH?", + "WOW", "WOW!", + "MORNING", "MORNING.", + "BECKY", "BECKY!", + "DAMIAN", "CECILE", "WALD", + "OMIGOSH", "EEEP", "EEEEP" +} + +KNOWN_NAMES = { + "BECKY", "DAMIAN", "CECILE", "WALD" +} + +def is_protected_token(text: str) -> bool: + t = normalize_text(text or "") + if not t: + return False + if t in PROTECTED_SHORT_TOKENS: + return True + # punctuation-insensitive fallback + t_alpha = re.sub(r'[^A-ZÀ-Ý]', '', t) + return t_alpha in PROTECTED_SHORT_TOKENS + +def maybe_conf_floor_for_protected(text: str, conf: float, floor: float = 0.40) -> float: + if is_protected_token(text): + return max(conf, floor) + return conf + def is_meaningful_text(text: str, source_lang: str, min_alpha_chars: int = 2) -> bool: if not text: return False + t = text.strip() - t_upper = t.upper() + t_upper = normalize_text(t) + + # 1) Hard keep for protected tokens + if is_protected_token(t_upper): + return True + t_alpha_only = re.sub(r'[^A-Za-zÀ-ÿ]', '', t_upper) if t_upper in _MANGA_INTERJECTIONS or t_alpha_only in _MANGA_INTERJECTIONS: return True + alpha_count = sum(c.isalpha() for c in t) if alpha_count < min_alpha_chars: + # allow short punctuated utterances like "Huh?" 
+ if re.fullmatch(r"[A-Za-zÀ-ÿ]{2,6}[!?\.]{0,3}", t.strip()): + return True return False + if t_upper in _NOISE_TOKENS: return False + lang = source_lang.lower() + if lang in ['en', 'english', 'es', 'spanish', 'fr', 'french', 'it', 'italian', 'ca', 'catalan', 'de', 'german']: non_alpha = sum(not c.isalpha() for c in t) - if len(t) > 0 and (non_alpha / len(t)) > 0.60: + # slightly less aggressive than before + if len(t) > 0 and (non_alpha / len(t)) > 0.72: return False + if len(t) >= 3 and len(set(t_upper)) == 1: return False + if lang in ['en', 'english', 'es', 'spanish', 'fr', 'french', 'it', 'italian', 'ca', 'catalan', 'de', 'german']: - if len(t) > 4: + if len(t) > 5: vowels = len(re.findall(r'[AEIOUaeiouÀ-ÿ]', t)) if vowels == 0: return False - return True + return True def quad_bbox(quad): xs = [p[0] for p in quad] @@ -1697,924 +1747,616 @@ def estimate_reading_order(bbox_dict, mode="ltr"): order.extend([z[0] for z in r["items"]]) return {bid: i+1 for i, bid in enumerate(order)} - # ============================================================ -# MAIN PIPELINE +# NAME / SHORT TOKEN RESCUE # ============================================================ -def translate_manga_text( - image_path="001-page.png", - source_lang="en", - target_lang="ca", - confidence_threshold=0.03, - min_text_length=1, - gap_px="auto", - quality_threshold=0.62, - export_to_file="output.txt", - export_bubbles_to="bubbles.json", - reading_mode="ltr", - debug=True, - use_enhanced_ocr=True, - strict_grouping=True, - max_box_width_ratio=0.6, - max_box_height_ratio=0.5, - auto_fix_bubbles=True -): - image = cv2.imread(image_path) - if image is None: - print(f"❌ Cannot load image: {image_path}"); return +def _text_key_for_dedup(text: str) -> str: + return re.sub(r'[^A-ZÀ-Ý0-9]', '', normalize_text(text or "")) - resolved_gap = auto_gap(image_path) if gap_px == "auto" else float(gap_px) - ih, iw = image.shape[:2] - print("Loading OCR engines...") - - if use_enhanced_ocr: - detector = 
ImprovedMacVisionDetector(source_lang=source_lang) - print("🚀 Using Enhanced Multi-Pass OCR") - else: - detector = MacVisionDetector(source_lang=source_lang) - - print("Running detection OCR (Apple Vision)...") - raw = detector.read(image_path) - print(f"Raw detections: {len(raw)}") - - if use_enhanced_ocr: - existing_quads = [r[0] for r in raw] - missed_regions = detect_small_text_regions(image, existing_quads) - if missed_regions: - print(f"🔍 Found {len(missed_regions)} potentially missed text regions") - for region in missed_regions: - rx1, ry1, rx2, ry2 = region - pad = 10 - rx1, ry1 = max(0, rx1-pad), max(0, ry1-pad) - rx2, ry2 = min(iw, rx2+pad), min(ih, ry2+pad) - crop = image[ry1:ry2, rx1:rx2] - if crop.size > 0: - upscaled = cv2.resize(crop, None, fx=4.0, fy=4.0, - interpolation=cv2.INTER_CUBIC) - for quad, text, conf in detector.run_vision_ocr(upscaled): - raw.append(([[int(p[0]/4.0+rx1), int(p[1]/4.0+ry1)] - for p in quad], text, conf)) - print(f"📝 Total detections after missed region scan: {len(raw)}") - - # ── Filtering ───────────────────────────────────────────────────────── - filtered, skipped = [], 0 - for bbox, text, conf in raw: - t = normalize_text(text) - qb = quad_bbox(bbox) - if conf < confidence_threshold: skipped += 1; continue - if len(t) < min_text_length: skipped += 1; continue - if not is_valid_language(t, source_lang): skipped += 1; continue - if not is_meaningful_text(t, source_lang): skipped += 1; continue - if qb[1] < int(ih * TOP_BAND_RATIO) and conf < 0.70 and len(t) >= 5: - skipped += 1; continue - filtered.append((bbox, t, conf)) - - print(f"Kept: {len(filtered)} | Skipped: {skipped}") - if not filtered: - print("⚠️ No text after filtering."); return - - # ── Pre-grouping quad splits ────────────────────────────────────────── - filtered, oversized_splits = validate_and_split_oversized_quads(image, filtered) - if oversized_splits > 0: - print(f"📐 Split {oversized_splits} oversized quad(s) before grouping") - - filtered, 
wide_splits = split_wide_ocr_items(image, filtered) - if wide_splits > 0: - print(f"✂️ Split {wide_splits} wide OCR lines across column gaps.") - - filtered, bridge_splits = split_abnormal_bridge_quads(image, filtered) - if bridge_splits > 0: - print(f"🧩 Split {bridge_splits} abnormal bridge OCR quad(s).") - - # Column-gap split: catches wide quads spanning two columns (BOX#6 type) - hs_pre = [max(1, quad_bbox(q)[3]-quad_bbox(q)[1]) for q, _, _ in filtered] - med_h_pre = float(np.median(hs_pre)) if hs_pre else 14.0 - filtered, col_splits = apply_column_gap_splits(image, filtered, med_h_pre) - - filtered = normalize_ocr_quads(filtered) - - # ── Grouping ────────────────────────────────────────────────────────── - print("📊 Grouping quads vertically...") - bubbles, bubble_boxes, bubble_quads, bubble_indices = group_tokens_vertical( - filtered, image.shape, gap_px=resolved_gap, - bbox_padding=1, strict_mode=strict_grouping) - print(f" Created {len(bubbles)} initial box(es)") - - # ── Auto-fix (split + merge) ────────────────────────────────────────── - if auto_fix_bubbles: - bubbles, bubble_boxes, bubble_quads, bubble_indices = auto_fix_bubble_detection( - bubble_boxes, bubble_indices, bubble_quads, bubbles, filtered, image) - - # ── Enforce max box size ────────────────────────────────────────────── - bubbles, bubble_boxes, bubble_quads, bubble_indices = enforce_max_box_size( - bubble_boxes, bubble_indices, bubble_quads, bubbles, filtered, - max_width_ratio=max_box_width_ratio, - max_height_ratio=max_box_height_ratio, - image_shape=image.shape) - - # ── Close-proximity merge ───────────────────────────────────────────── - bubbles, bubble_boxes, bubble_quads, bubble_indices = merge_close_bubbles_by_line_height( - bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered) - - # ── Per-bubble split pass ───────────────────────────────────────────── - new_bubbles, new_bubble_boxes, new_bubble_quads, new_bubble_indices = {}, {}, {}, {} - next_bid = max(bubbles.keys()) 
+ 1 if bubbles else 1 - splits_performed = [] - - for bid in list(bubbles.keys()): - split_result, split_reason = _split_bubble_if_needed( - bid, bubble_indices, bubble_quads, bubble_boxes, filtered, image, iw, ih) - - if split_result: - p1, p2 = split_result - splits_performed.append(f"BOX#{bid} ({split_reason})") - for part_idxs, part_bid in [(p1, bid), (p2, next_bid)]: - ub = boxes_union_xyxy([quad_bbox(filtered[i][0]) for i in part_idxs]) - new_bubbles[part_bid] = build_lines_from_indices(part_idxs, filtered) - new_bubble_boxes[part_bid] = (max(0,ub[0]-2), max(0,ub[1]-2), - min(iw-1,ub[2]+2), min(ih-1,ub[3]+2)) - new_bubble_quads[part_bid] = [filtered[i][0] for i in part_idxs] - new_bubble_indices[part_bid] = part_idxs - next_bid += 1 - else: - new_bubbles[bid] = bubbles[bid] - new_bubble_boxes[bid] = bubble_boxes[bid] - new_bubble_quads[bid] = bubble_quads[bid] - new_bubble_indices[bid] = bubble_indices[bid] - - if splits_performed: - print(f"\n🔀 Splits detected: {len(splits_performed)}") - for s in splits_performed: print(f" ✓ {s}") - - # ── Remove nested / duplicate boxes ────────────────────────────────── - bubbles, bubble_boxes, bubble_quads, bubble_indices = remove_nested_boxes( - new_bubble_boxes, new_bubble_indices, new_bubble_quads, new_bubbles, - overlap_threshold=0.50) - print(f"✅ Final box count: {len(bubbles)}") - - # ── OCR quality pass ────────────────────────────────────────────────── - translator = GoogleTranslator(source=source_lang, target=target_lang) - clean_lines: Dict[int, str] = {} - sources_used: Dict[int, str] = {} - translations: Dict[int, str] = {} - - for bid, lines in bubbles.items(): - base_txt = normalize_text(" ".join(lines)) - base_sc = ocr_candidate_score(base_txt) - txt, src_used = base_txt, "vision-base" - if base_sc < quality_threshold: - rr_txt, rr_sc, rr_src = reread_bubble_with_vision( - image, bubble_boxes[bid], detector, upscale=3.0, pad=24) - if rr_txt and rr_sc > base_sc + 0.04 and is_valid_language(rr_txt, 
source_lang): - txt, src_used = rr_txt, rr_src - clean_lines[bid] = normalize_text(txt) - sources_used[bid] = src_used - - reading_map = estimate_reading_order(bubble_boxes, mode=reading_mode) - - # ── Translation ─────────────────────────────────────────────────────── - for bid in sorted(clean_lines.keys(), key=lambda x: reading_map.get(x, x)): - src_txt = clean_lines[bid].strip() - if not src_txt: continue - if not is_valid_language(src_txt, source_lang): continue - if not is_meaningful_text(src_txt, source_lang): continue - try: - tgt = translator.translate(src_txt) or "" - tgt = postprocess_translation_general(tgt).upper() - except Exception as e: - tgt = f"[Error: {e}]" - translations[bid] = tgt - - if debug: - save_debug_clusters(image_path, filtered, bubble_boxes, bubble_indices, - clean_lines, "debug_clusters.png") - - # ── Text output ─────────────────────────────────────────────────────── - divider = "─" * 120 - out_lines = ["BUBBLE|ORDER|OCR_SOURCE|ORIGINAL|TRANSLATED|FLAGS", divider] - print(divider + f"\n{'BUBBLE':<8} {'ORDER':<6} {'SOURCE':<12} " - f"{'ORIGINAL':<40} {'TRANSLATED':<40} FLAGS\n" + divider) - - translated_count = 0 - for bid in sorted(clean_lines.keys(), key=lambda x: reading_map.get(x, x)): - src_txt = clean_lines[bid].strip() - if not src_txt: continue - if not is_valid_language(src_txt, source_lang): continue - if not is_meaningful_text(src_txt, source_lang): continue - - flags = [] - tgt = translations.get(bid, "") - if not tgt: flags.append("NO_TRANSLATION") - src_u = src_txt.upper() - src_engine = sources_used.get(bid, "unknown") - - out_lines.append( - f"#{bid}|{reading_map.get(bid,bid)}|{src_engine}|{src_u}|{tgt}|" - f"{','.join(flags) if flags else '-'}") - print(f"#{bid:<7} {reading_map.get(bid,bid):<6} {src_engine:<12} " - f"{src_u[:40]:<40} {tgt[:40]:<40} {','.join(flags) if flags else '-'}") - translated_count += 1 - - out_lines.append(divider + f"\n✅ Done! 
{translated_count} bubble(s) translated.") - with open(export_to_file, "w", encoding="utf-8") as f: - f.write("\n".join(out_lines)) - - # ── bubbles.json ────────────────────────────────────────────────────── - bubbles_payload = {} - for bid in sorted(clean_lines.keys(), key=lambda x: reading_map.get(x, x)): - src_txt = clean_lines[bid].strip() - if not src_txt: continue - if not is_valid_language(src_txt, source_lang): continue - if not is_meaningful_text(src_txt, source_lang): continue - box = bubble_boxes.get(bid) - tgt = translations.get(bid, "") - bubbles_payload[str(bid)] = { - "order": reading_map.get(bid, bid), - "ocr_source": sources_used.get(bid, "unknown"), - "original": src_txt.upper(), - "translated": tgt, - "box": { - "x": box[0] if box else 0, - "y": box[1] if box else 0, - "w": (box[2]-box[0]) if box else 0, - "h": (box[3]-box[1]) if box else 0, - }, - "lines": [line.upper() for line in bubbles.get(bid, [])], - } - - with open(export_bubbles_to, "w", encoding="utf-8") as f: - json.dump(bubbles_payload, f, ensure_ascii=False, indent=2) - - print(divider + f"\nSaved: {export_to_file}\nSaved: {export_bubbles_to}") - - -# ============================================================ -# ENTRY POINT -# ============================================================ -if __name__ == "__main__": - translate_manga_text( - image_path="19.png", - source_lang="english", - target_lang="ca", - confidence_threshold=0.03, - min_text_length=1, - gap_px="auto", - quality_threshold=0.62, - export_to_file="output.txt", - export_bubbles_to="bubbles.json", - reading_mode="rtl", - debug=True, - use_enhanced_ocr=True, - strict_grouping=True, - max_box_width_ratio=0.6, - max_box_height_ratio=0.5, - auto_fix_bubbles=True - ) - -def split_bubble_if_multiple_rows(indices, ocr, bid=None): - if len(indices) < 2: return None - boxes = [quad_bbox(ocr[i][0]) for i in indices] - hs = [max(1, b[3]-b[1]) for b in boxes] - med_h = float(np.median(hs)) if hs else 12.0 - ys = [(b[1]+b[3])/2.0 
for b in boxes] - ys_sorted = sorted(ys) - gap_thresh = max(med_h * 2.0, 30) - best_gap_idx, best_gap_size = None, 0.0 - for i in range(len(ys_sorted) - 1): - gap = ys_sorted[i+1] - ys_sorted[i] - if gap > gap_thresh and gap > best_gap_size: - best_gap_size, best_gap_idx = gap, i - if best_gap_idx is None: return None - split_y = (ys_sorted[best_gap_idx] + ys_sorted[best_gap_idx+1]) / 2.0 - top_idxs = [i for i in indices - if (quad_bbox(ocr[i][0])[1]+quad_bbox(ocr[i][0])[3])/2.0 < split_y] - bot_idxs = [i for i in indices - if (quad_bbox(ocr[i][0])[1]+quad_bbox(ocr[i][0])[3])/2.0 >= split_y] - if not top_idxs or not bot_idxs: return None - return (top_idxs, bot_idxs) - - -def split_cluster_by_big_vertical_gap(indices, ocr, factor=1.9, min_gap=22): - if len(indices) < 2: return None - boxes = [quad_bbox(ocr[i][0]) for i in indices] - hs = [max(1, b[3]-b[1]) for b in boxes] - med_h = float(np.median(hs)) if hs else 12.0 - items = sorted([(i, quad_bbox(ocr[i][0])) for i in indices], - key=lambda x: (x[1][1]+x[1][3])/2.0) - gap_thresh = max(med_h * factor, min_gap) - best_gap, best_split_idx = 0.0, None - for k in range(len(items) - 1): - gap = items[k+1][1][1] - items[k][1][3] - if gap > gap_thresh and gap > best_gap: - best_gap, best_split_idx = gap, k - if best_split_idx is None: return None - top_idxs = [it[0] for it in items[:best_split_idx+1]] - bot_idxs = [it[0] for it in items[best_split_idx+1:]] - if not top_idxs or not bot_idxs: return None - return (top_idxs, bot_idxs) - - -def is_vertical_text_like(indices, ocr): - if len(indices) < 2: return False - boxes = [quad_bbox(ocr[i][0]) for i in indices] - med_h = float(np.median([max(1, b[3]-b[1]) for b in boxes])) - med_w = float(np.median([max(1, b[2]-b[0]) for b in boxes])) - if med_h < med_w * 1.2: return False - xs = [(b[0]+b[2])/2.0 for b in boxes] - ys = [(b[1]+b[3])/2.0 for b in boxes] - if (max(ys)-min(ys)) < (max(xs)-min(xs)) * 1.5: return False - return True - - -def 
split_nested_or_side_by_side(indices, ocr): - if len(indices) < 2: return None - xs = sorted([(quad_bbox(ocr[i][0])[0]+quad_bbox(ocr[i][0])[2])/2.0 - for i in indices]) - mid_idx = len(xs) // 2 - split_x = (xs[mid_idx-1] + xs[mid_idx]) / 2.0 - left_idxs = [i for i in indices - if (quad_bbox(ocr[i][0])[0]+quad_bbox(ocr[i][0])[2])/2.0 < split_x] - right_idxs = [i for i in indices - if (quad_bbox(ocr[i][0])[0]+quad_bbox(ocr[i][0])[2])/2.0 >= split_x] - if not left_idxs or not right_idxs: return None - return (left_idxs, right_idxs) - - -def split_panel_box(image_bgr, box_xyxy, bubble_quads=None): - x1, y1, x2, y2 = box_xyxy - ih, iw = image_bgr.shape[:2] - x1, y1 = max(0, x1), max(0, y1) - x2, y2 = min(iw-1, x2), min(ih-1, y2) - if x2 <= x1 or y2 <= y1: return None - crop = image_bgr[y1:y2, x1:x2] - if crop.size == 0: return None - gray = cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY) - edges = cv2.Canny(gray, 50, 150) - h_proj = np.sum(edges, axis=0) - w = x2 - x1 - if w < 100: return None - search_start = int(w * 0.35) - search_end = int(w * 0.65) - if search_end <= search_start: return None - region = h_proj[search_start:search_end] - if len(region) == 0: return None - threshold = np.percentile(region, 85) - candidates = [x1 + search_start + rx - for rx in range(len(region)) if region[rx] >= threshold] - if not candidates: return None - split_x = int(np.median(candidates)) - if bubble_quads: - lc = sum(1 for q in bubble_quads if quad_center(q)[0] < split_x) - rc = len(bubble_quads) - lc - if lc == 0 or rc == 0: return None - return (x1, x2, split_x) - - -# ============================================================ -# MERGE CLOSE BUBBLES -# ============================================================ -def merge_close_bubbles_by_line_height(bubbles, bubble_boxes, bubble_quads, - bubble_indices, ocr): +def rescue_name_and_short_tokens(ocr_list, min_conf=0.20): """ - Merges boxes that are spatially very close on BOTH axes AND share - meaningful horizontal overlap (same 
column). - - Single-quad boxes participate fully — no special isolation treatment. - The h_overlap_ratio >= 0.25 guard prevents merging horizontally - adjacent distinct bubbles. + Keep plausible short/name tokens that OCR found but strict filtering may drop. + Returns rescued items as (quad, text, conf). """ - if not bubbles: + rescued = [] + + for quad, text, conf in ocr_list: + t = normalize_text(text or "") + if not t: + continue + + t_alpha = re.sub(r'[^A-ZÀ-Ý]', '', t) + + if t_alpha in KNOWN_NAMES and conf >= min_conf: + rescued.append((quad, t, max(conf, 0.45))) + continue + + if is_protected_token(t) and conf >= min_conf: + rescued.append((quad, t, max(conf, 0.40))) + continue + + if 2 <= len(t_alpha) <= 8 and conf >= 0.25: + if re.fullmatch(r'[A-ZÀ-Ý]{2,8}', t_alpha): + rescued.append((quad, t, max(conf, 0.35))) + + return rescued + +def merge_rescued_items(base_ocr, rescued_ocr, iou_threshold=0.55): + """ + Merge rescued tokens into OCR list if not duplicate by text+overlap. + """ + if not rescued_ocr: + return base_ocr + + def iou_xyxy(a, b): + ax1, ay1, ax2, ay2 = a + bx1, by1, bx2, by2 = b + ix1, iy1 = max(ax1, bx1), max(ay1, by1) + ix2, iy2 = min(ax2, bx2), min(ay2, by2) + inter = max(0, ix2 - ix1) * max(0, iy2 - iy1) + if inter == 0: + return 0.0 + area_a = max(0, ax2 - ax1) * max(0, ay2 - ay1) + area_b = max(0, bx2 - bx1) * max(0, by2 - by1) + return inter / max(1, area_a + area_b - inter) + + out = list(base_ocr) + for rq, rt, rc in rescued_ocr: + rb = quad_bbox(rq) + rk = _text_key_for_dedup(rt) + duplicate = False + + for bq, bt, _ in out: + bb = quad_bbox(bq) + bk = _text_key_for_dedup(bt) + if rk == bk and iou_xyxy(rb, bb) >= iou_threshold: + duplicate = True + break + + if not duplicate: + out.append((rq, rt, rc)) + + return out + +def _joined_text_for_indices(indices, ocr): + parts = [] + for i in indices: + if i < 0 or i >= len(ocr): + continue + t = normalize_text(ocr[i][1]) + if t: + parts.append(t) + s = " ".join(parts).strip() + return 
s, len(s) + +def _in_same_bubble_contour(box_i, box_j, bubble_contours): + cx_i = (box_i[0] + box_i[2]) / 2.0 + cy_i = (box_i[1] + box_i[3]) / 2.0 + cx_j = (box_j[0] + box_j[2]) / 2.0 + cy_j = (box_j[1] + box_j[3]) / 2.0 + for c in bubble_contours: + if (cv2.pointPolygonTest(c, (cx_i, cy_i), False) >= 0 and + cv2.pointPolygonTest(c, (cx_j, cy_j), False) >= 0): + return True + return False + +def merge_micro_boxes_relaxed(bubbles, bubble_boxes, bubble_quads, bubble_indices, ocr, image_bgr): + """ + Relaxed merge for tiny interjection/name boxes (e.g. HUH? + MORNING). + """ + bids = sorted(bubble_boxes.keys()) + if len(bids) < 2: return bubbles, bubble_boxes, bubble_quads, bubble_indices - all_h = [max(1, quad_bbox(ocr[i][0])[3]-quad_bbox(ocr[i][0])[1]) - for i in range(len(ocr))] - med_h = float(np.median(all_h)) if all_h else 14.0 - merge_tol = max(8, med_h * 1.4) + all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1]) for i in range(len(ocr))] + med_h = float(np.median(all_h)) if all_h else 14.0 + bubble_contours = detect_speech_bubbles(image_bgr) - bids = sorted(bubble_boxes.keys()) - merged_set, merge_map = set(), {} + parent = {b: b for b in bids} - for i, bid_i in enumerate(bids): - if bid_i in merged_set: continue - x1_i, y1_i, x2_i, y2_i = bubble_boxes[bid_i] - wi = max(1, x2_i - x1_i) + def find(x): + while parent[x] != x: + parent[x] = parent[parent[x]] + x = parent[x] + return x + def union(a, b): + ra, rb = find(a), find(b) + if ra != rb: + parent[rb] = ra + + SHORT_TEXT_MAX_CHARS = 12 + + for i in range(len(bids)): for j in range(i + 1, len(bids)): - bid_j = bids[j] - if bid_j in merged_set: continue - x1_j, y1_j, x2_j, y2_j = bubble_boxes[bid_j] - wj = max(1, x2_j - x1_j) + bi, bj = bids[i], bids[j] + box_i, box_j = bubble_boxes[bi], bubble_boxes[bj] - gap_x = max(0, max(x1_i, x1_j) - min(x2_i, x2_j)) - gap_y = max(0, max(y1_i, y1_j) - min(y2_i, y2_j)) + wi = max(1, box_i[2] - box_i[0]) + wj = max(1, box_j[2] - box_j[0]) - h_ix1 = 
max(x1_i, x1_j) - h_ix2 = min(x2_i, x2_j) - h_overlap = max(0, h_ix2 - h_ix1) + gap_x = max(0, max(box_i[0], box_j[0]) - min(box_i[2], box_j[2])) + vert_gap = max(0, max(box_i[1], box_j[1]) - min(box_i[3], box_j[3])) + + h_ix1 = max(box_i[0], box_j[0]) + h_ix2 = min(box_i[2], box_j[2]) + h_overlap = max(0, h_ix2 - h_ix1) h_overlap_ratio = h_overlap / max(1, min(wi, wj)) - if gap_x <= merge_tol and gap_y <= merge_tol and h_overlap_ratio >= 0.25: - if bid_i not in merge_map: - merge_map[bid_i] = [bid_i] - merge_map[bid_i].append(bid_j) - merged_set.add(bid_j) + txt_i, len_i = _joined_text_for_indices(bubble_indices[bi], ocr) + txt_j, len_j = _joined_text_for_indices(bubble_indices[bj], ocr) - if not merge_map: + micro_pair = (len_i <= SHORT_TEXT_MAX_CHARS and len_j <= SHORT_TEXT_MAX_CHARS) + protected_hint = is_protected_token(txt_i) or is_protected_token(txt_j) + same_contour = _in_same_bubble_contour(box_i, box_j, bubble_contours) + + if micro_pair and vert_gap <= med_h * 2.2 and gap_x <= med_h * 2.0: + if h_overlap_ratio >= 0.10 or same_contour or protected_hint: + union(bi, bj) + + groups = {} + for b in bids: + r = find(b) + groups.setdefault(r, []).append(b) + + if all(len(v) == 1 for v in groups.values()): return bubbles, bubble_boxes, bubble_quads, bubble_indices new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {} next_bid = 1 - for bid in bids: - if bid in merged_set: continue - if bid in merge_map: - group = merge_map[bid] - all_indices = sorted(set(idx for b in group for idx in bubble_indices[b])) - new_bubbles[next_bid] = build_lines_from_indices(all_indices, ocr) - new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in all_indices]) - new_quads[next_bid] = [ocr[i][0] for i in all_indices] - new_indices[next_bid] = all_indices + + for _, group in groups.items(): + if len(group) == 1: + b = group[0] + new_bubbles[next_bid] = bubbles[b] + new_boxes[next_bid] = bubble_boxes[b] + new_quads[next_bid] = bubble_quads[b] + 
new_indices[next_bid] = bubble_indices[b] else: - new_bubbles[next_bid] = bubbles[bid] - new_boxes[next_bid] = bubble_boxes[bid] - new_quads[next_bid] = bubble_quads[bid] - new_indices[next_bid] = bubble_indices[bid] + all_idx = sorted(set(idx for b in group for idx in bubble_indices[b])) + new_bubbles[next_bid] = build_lines_from_indices(all_idx, ocr) + new_boxes[next_bid] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in all_idx]) + new_quads[next_bid] = [ocr[i][0] for i in all_idx] + new_indices[next_bid] = all_idx next_bid += 1 return new_bubbles, new_boxes, new_quads, new_indices +def reattach_orphan_short_tokens(bubbles, bubble_boxes, bubble_quads, bubble_indices, ocr): + """ + Reattach tiny orphan token boxes (e.g., single 'HUH?') to nearest plausible bubble. + """ + bids = sorted(bubble_boxes.keys()) + if len(bids) < 2: + return bubbles, bubble_boxes, bubble_quads, bubble_indices -# ============================================================ -# WIDE / BRIDGE QUAD SPLITTING -# ============================================================ -def split_wide_ocr_items(image_bgr, ocr_list, width_factor=8.0): - if not ocr_list: return ocr_list, 0 - hs = [max(1, quad_bbox(q)[3]-quad_bbox(q)[1]) for q, _, _ in ocr_list] - med_h = float(np.median(hs)) if hs else 14.0 - result, splits_made = [], 0 + all_h = [max(1, quad_bbox(ocr[i][0])[3] - quad_bbox(ocr[i][0])[1]) for i in range(len(ocr))] + med_h = float(np.median(all_h)) if all_h else 14.0 - for quad, text, conf in ocr_list: - x1, y1, x2, y2 = quad_bbox(quad) - w = x2 - x1 - if w > med_h * width_factor: - pad = 2 - roi = image_bgr[max(0,y1-pad):min(image_bgr.shape[0],y2+pad), - max(0,x1):min(image_bgr.shape[1],x2)] - if roi.size > 0: - gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY) - _, binary = cv2.threshold(gray, 0, 255, - cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU) - v_proj = np.sum(binary, axis=0) - gap_threshold = roi.shape[0] * 255 * 0.15 - gaps, in_gap, gap_start = [], False, 0 - for x in range(len(v_proj)): - if 
v_proj[x] < gap_threshold: - if not in_gap: gap_start, in_gap = x, True - else: - if in_gap: - gw = x - gap_start - if gw >= max(int(med_h * 0.6), 12): - gaps.append((gap_start + gw // 2, gw)) - in_gap = False - if gaps: - gaps.sort(key=lambda g: g[1], reverse=True) - split_x_abs = max(0, x1) + gaps[0][0] - if ' ' in text: - char_w = w / max(1, len(text)) - split_idx = int((split_x_abs - x1) / max(1e-6, char_w)) - spaces = [i for i, c in enumerate(text) if c == ' '] - if spaces: - split_idx = min(spaces, key=lambda i: abs(i - split_idx)) - tl, tr = text[:split_idx].strip(), text[split_idx:].strip() - else: - split_idx = int(len(text) * (split_x_abs - x1) / w) - tl, tr = text[:split_idx].strip(), text[split_idx:].strip() - if tl and tr: - result.extend([ - ([[x1,y1],[split_x_abs,y1],[split_x_abs,y2],[x1,y2]], tl, conf), - ([[split_x_abs,y1],[x2,y1],[x2,y2],[split_x_abs,y2]], tr, conf)]) - splits_made += 1 - continue - result.append((quad, text, conf)) - return result, splits_made + orphan_bids = [] + for b in bids: + idxs = bubble_indices.get(b, []) + if len(idxs) != 1: + continue + t = normalize_text(ocr[idxs[0]][1]) + if is_protected_token(t) or len(re.sub(r'[^A-ZÀ-Ý]', '', t)) <= 5: + orphan_bids.append(b) + if not orphan_bids: + return bubbles, bubble_boxes, bubble_quads, bubble_indices -def split_abnormal_bridge_quads(image_bgr, ocr_list, aspect_ratio_threshold=6.0): - if not ocr_list: return ocr_list, 0 - hs = [max(1, quad_bbox(q)[3]-quad_bbox(q)[1]) for q, _, _ in ocr_list] - med_h = float(np.median(hs)) if hs else 14.0 - result, splits_made = [], 0 + consumed = set() - for quad, text, conf in ocr_list: - x1, y1, x2, y2 = quad_bbox(quad) - w, h = x2 - x1, max(1, y2 - y1) - if w / h > aspect_ratio_threshold: - pad = 2 - roi = image_bgr[max(0,y1-pad):min(image_bgr.shape[0],y2+pad), - max(0,x1):min(image_bgr.shape[1],x2)] - if roi.size > 0: - gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY) - _, binary = cv2.threshold(gray, 0, 255, - cv2.THRESH_BINARY_INV + 
cv2.THRESH_OTSU) - v_proj = np.sum(binary, axis=0) - gap_threshold = h * 255 * 0.20 - gaps, in_gap, gap_start = [], False, 0 - for x in range(len(v_proj)): - if v_proj[x] < gap_threshold: - if not in_gap: gap_start, in_gap = x, True - else: - if in_gap: - gw = x - gap_start - if gw >= max(int(med_h * 0.8), 15): - gaps.append((gap_start + gw // 2, gw)) - in_gap = False - if gaps: - gaps.sort(key=lambda g: g[1], reverse=True) - split_x_abs = max(0, x1) + gaps[0][0] - if ' ' in text: - char_w = w / max(1, len(text)) - split_idx = int((split_x_abs - x1) / max(1e-6, char_w)) - spaces = [i for i, c in enumerate(text) if c == ' '] - if spaces: - split_idx = min(spaces, key=lambda i: abs(i - split_idx)) - tl, tr = text[:split_idx].strip(), text[split_idx:].strip() - else: - split_idx = int(len(text) * (split_x_abs - x1) / w) - tl, tr = text[:split_idx].strip(), text[split_idx:].strip() - if tl and tr: - result.extend([ - ([[x1,y1],[split_x_abs,y1],[split_x_abs,y2],[x1,y2]], tl, conf), - ([[split_x_abs,y1],[x2,y1],[x2,y2],[split_x_abs,y2]], tr, conf)]) - splits_made += 1 - continue - result.append((quad, text, conf)) - return result, splits_made + for ob in orphan_bids: + if ob in consumed: + continue + obox = bubble_boxes[ob] + ocx = (obox[0] + obox[2]) / 2.0 + ocy = (obox[1] + obox[3]) / 2.0 -def normalize_ocr_quads(ocr_list): - result = [] - for quad, text, conf in ocr_list: - x1, y1, x2, y2 = quad_bbox(quad) - pad = 3 - new_quad = [[x1-pad,y1-pad],[x2+pad,y1-pad],[x2+pad,y2+pad],[x1-pad,y2+pad]] - result.append((new_quad, text, conf)) - return result + best_b = None + best_d = 1e9 + for tb in bids: + if tb == ob or tb in consumed: + continue + tbox = bubble_boxes[tb] + tcx = (tbox[0] + tbox[2]) / 2.0 + tcy = (tbox[1] + tbox[3]) / 2.0 -# ============================================================ -# VISION RE-READ -# ============================================================ -def preprocess_variant(crop_bgr, mode): - gray = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2GRAY) - 
if mode == "raw": return gray - if mode == "clahe": return cv2.createCLAHE(clipLimit=2.0, - tileGridSize=(8,8)).apply(gray) - if mode == "adaptive": - den = cv2.GaussianBlur(gray, (3,3), 0) - return cv2.adaptiveThreshold(den, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, - cv2.THRESH_BINARY, 35, 11) - if mode == "otsu": - den = cv2.GaussianBlur(gray, (3,3), 0) - _, th = cv2.threshold(den, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) - return th - if mode == "invert": return 255 - gray - if mode == "bilateral": - den = cv2.bilateralFilter(gray, 7, 60, 60) - _, th = cv2.threshold(den, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) - return th - if mode == "morph_open": - _, th = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) - return cv2.morphologyEx(th, cv2.MORPH_OPEN, np.ones((2,2), np.uint8)) - return gray + dx = abs(ocx - tcx) + dy = abs(ocy - tcy) + if dx <= med_h * 2.2 and dy <= med_h * 3.0: + d = dx + dy + if d < best_d: + best_d = d + best_b = tb -def rotate_image_keep_bounds(img, angle_deg): - h, w = img.shape[:2] - c = (w/2, h/2) - M = cv2.getRotationMatrix2D(c, angle_deg, 1.0) - cos, sin = abs(M[0,0]), abs(M[0,1]) - new_w = int((h*sin) + (w*cos)) - new_h = int((h*cos) + (w*sin)) - M[0,2] += (new_w/2) - c[0] - M[1,2] += (new_h/2) - c[1] - return cv2.warpAffine(img, M, (new_w, new_h), - flags=cv2.INTER_CUBIC, borderValue=255) + if best_b is not None: + merged = sorted(set(bubble_indices[best_b] + bubble_indices[ob])) + bubble_indices[best_b] = merged + bubble_quads[best_b] = [ocr[i][0] for i in merged] + bubble_boxes[best_b] = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in merged]) + bubbles[best_b] = build_lines_from_indices(merged, ocr) + consumed.add(ob) + if consumed: + for b in consumed: + bubble_indices.pop(b, None) + bubble_quads.pop(b, None) + bubble_boxes.pop(b, None) + bubbles.pop(b, None) -def rebuild_text_from_vision_result(res): - if not res: return "" - norm = [] - for bbox, txt, conf in res: - if not txt or not txt.strip(): continue - 
b = quad_bbox(bbox) - norm.append((b, txt, conf, - (b[0]+b[2])/2.0, (b[1]+b[3])/2.0, max(1.0, b[3]-b[1]))) - if not norm: return "" - med_h = float(np.median([x[5] for x in norm])) - row_tol = max(6.0, med_h * 0.75) - norm.sort(key=lambda z: z[4]) - rows = [] - for it in norm: - placed = False - for r in rows: - if abs(it[4] - r["yc"]) <= row_tol: - r["m"].append(it) - r["yc"] = float(np.mean([k[4] for k in r["m"]])) - placed = True; break - if not placed: rows.append({"yc": it[4], "m": [it]}) - rows.sort(key=lambda r: r["yc"]) - lines = [normalize_text(" ".join(x[1] for x in sorted(r["m"], key=lambda z: z[3]))) - for r in rows] - return normalize_text(" ".join(filter(None, lines))) - - -def reread_bubble_with_vision(image_bgr, bbox_xyxy, vision_detector, - upscale=3.0, pad=24): - ih, iw = image_bgr.shape[:2] - x1, y1, x2, y2 = bbox_xyxy - x1, y1 = max(0, int(x1-pad)), max(0, int(y1-pad)) - x2, y2 = min(iw, int(x2+pad)), min(ih, int(y2+pad)) - crop = image_bgr[y1:y2, x1:x2] - if crop.size == 0: return None, 0.0, "none" - - modes = ["raw", "clahe", "adaptive", "otsu", "invert", "bilateral", "morph_open"] - angles = [0.0, 1.5, -1.5] - best_v_txt, best_v_sc = "", 0.0 - up0 = cv2.resize(crop, - (int(crop.shape[1]*upscale), int(crop.shape[0]*upscale)), - interpolation=cv2.INTER_CUBIC) - - for mode in modes: - proc = preprocess_variant(up0, mode) - proc3 = cv2.cvtColor(proc, cv2.COLOR_GRAY2BGR) if len(proc.shape) == 2 else proc - for a in angles: - rot = rotate_image_keep_bounds(proc3, a) - res = (vision_detector.run_vision_ocr(rot) - if hasattr(vision_detector, 'run_vision_ocr') - else vision_detector.read(rot)) - txt = rebuild_text_from_vision_result(res) - sc = ocr_candidate_score(txt) - if sc > best_v_sc: - best_v_txt, best_v_sc = txt, sc - - if best_v_txt: return best_v_txt, best_v_sc, "vision-reread" - return None, 0.0, "none" - - -# ============================================================ -# LINES + BUBBLES -# 
============================================================ -def build_lines_from_indices(indices, ocr): - if not indices: return [] - items = [] - for i in indices: - b = quad_bbox(ocr[i][0]) - items.append((i, b, (b[0]+b[2])/2.0, (b[1]+b[3])/2.0, max(1.0, b[3]-b[1]))) - med_h = float(np.median([it[4] for it in items])) if items else 10.0 - row_tol = max(6.0, med_h * 0.75) - items.sort(key=lambda x: x[3]) - rows = [] - for it in items: - placed = False - for r in rows: - if abs(it[3] - r["yc"]) <= row_tol: - r["m"].append(it) - r["yc"] = float(np.mean([k[3] for k in r["m"]])) - placed = True; break - if not placed: rows.append({"yc": it[3], "m": [it]}) - rows.sort(key=lambda r: r["yc"]) - return [normalize_text( - " ".join(ocr[i][1] - for i, _, _, _, _ in sorted(r["m"], key=lambda z: z[2]))) - for r in rows if r["m"]] - - -def auto_gap(image_path, base=18, ref_w=750): - img = cv2.imread(image_path) - return base * (img.shape[1] / ref_w) if img is not None else base - - -def group_tokens_vertical(ocr, image_shape, gap_px=18, bbox_padding=1, - strict_mode=False): - n = len(ocr) - if n == 0: return {}, {}, {}, {} - - boxes = [quad_bbox(r[0]) for r in ocr] - centers = [quad_center(r[0]) for r in ocr] - hs = [max(1.0, b[3]-b[1]) for b in boxes] - med_h = float(np.median(hs)) if hs else 12.0 - - max_vertical_gap = med_h * 2.5 if not strict_mode else med_h * 2.0 - max_horizontal_offset = med_h * 1.8 - - sorted_indices = sorted(range(n), key=lambda i: (centers[i][1], centers[i][0])) - groups, used = [], set() - - for i in sorted_indices: - if i in used: continue - current_group = [i] - used.add(i) - cx_i = centers[i][0] - - for j in sorted_indices: - if j in used or j == i: continue - cx_j, cy_j = centers[j] - if cy_j <= centers[i][1]: continue - if abs(cx_i - cx_j) > max_horizontal_offset: continue - - # Horizontal gap guard - gap_x = max(0, max(boxes[i][0], boxes[j][0]) - min(boxes[i][2], boxes[j][2])) - if gap_x > med_h * 1.5: continue - - # Orientation compatibility 
guard - if not orientation_compatible(i, j, ocr): continue - - vertical_gap = boxes[j][1] - boxes[current_group[-1]][3] - if vertical_gap <= max_vertical_gap: - current_group.append(j) - used.add(j) - cx_i = (cx_i + cx_j) / 2.0 - - if current_group: - groups.append(current_group) - - # Secondary merge pass - merged_groups, used_groups = [], set() - for i, group1 in enumerate(groups): - if i in used_groups: continue - merged = list(group1) - used_groups.add(i) - for j, group2 in enumerate(groups): - if i == j or j in used_groups: continue - if should_merge_groups(merged, group2, ocr, med_h, max_vertical_gap): - compat = all(orientation_compatible(a, b, ocr) - for a in merged for b in group2) - if compat: - merged.extend(group2) - used_groups.add(j) - merged_groups.append(sorted(merged, key=lambda idx: centers[idx][1])) - - # Horizontal gap split pass - final_groups = [] - for group in merged_groups: - h_split = detect_horizontal_gap_in_group(group, ocr, med_h, gap_factor=2.5) - if h_split: - lg, rg = h_split - final_groups.append(sorted(lg, key=lambda idx: centers[idx][1])) - final_groups.append(sorted(rg, key=lambda idx: centers[idx][1])) - else: - final_groups.append(group) - - final_groups.sort(key=lambda g: (min(centers[i][1] for i in g), - min(centers[i][0] for i in g))) - - bubbles, bubble_boxes, bubble_quads, bubble_indices = {}, {}, {}, {} - ih, iw = image_shape[:2] - - for bid, idxs in enumerate(final_groups, start=1): - lines = build_lines_from_indices(idxs, ocr) - quads = [ocr[k][0] for k in idxs] - ub = boxes_union_xyxy([quad_bbox(q) for q in quads]) - if ub is None: continue - x1, y1, x2, y2 = ub - ap = max(1, int(round(med_h * 0.16))) - bubbles[bid] = lines - bubble_boxes[bid] = (max(0,x1-ap), max(0,y1-ap), - min(iw-1,x2+ap), min(ih-1,y2+ap)) - bubble_quads[bid] = quads - bubble_indices[bid] = idxs + # reindex for stable downstream order + new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {} + for new_id, old_id in 
def _bubble_text(indices, ocr):
    """Return the normalized, space-joined text of the OCR items at *indices*."""
    return normalize_text(" ".join(build_lines_from_indices(indices, ocr)))


def _box_dims(b):
    """Return ``(width, height)`` of an ``(x1, y1, x2, y2)`` box, each clamped to >= 1."""
    return max(1, b[2] - b[0]), max(1, b[3] - b[1])


def _intersection(a, b):
    """Return the overlap area of two ``(x1, y1, x2, y2)`` boxes (0 when disjoint)."""
    w = max(0, min(a[2], b[2]) - max(a[0], b[0]))
    h = max(0, min(a[3], b[3]) - max(a[1], b[1]))
    return w * h


def _containment_ratio(child, parent):
    """Return the fraction of *child*'s area that lies inside *parent*."""
    # Area is clamped to 1 so a degenerate child box cannot divide by zero.
    c_area = max(1, (child[2] - child[0]) * (child[3] - child[1]))
    return _intersection(child, parent) / c_area


def _center_distance(a, b):
    """Return the Euclidean distance between the centers of two boxes."""
    dx = (a[0] + a[2]) / 2.0 - (b[0] + b[2]) / 2.0
    dy = (a[1] + a[3]) / 2.0 - (b[1] + b[3]) / 2.0
    return (dx ** 2 + dy ** 2) ** 0.5


def _reindex_boxes(bubbles, bubble_boxes, bubble_quads, bubble_indices):
    """Renumber the four parallel bubble dicts to consecutive ids starting at 1.

    Ids are assigned in ascending order of the old ids found in
    *bubble_boxes*; fresh dicts are returned and the inputs are not modified.
    """
    new_b, new_bb, new_bq, new_bi = {}, {}, {}, {}
    for nid, old in enumerate(sorted(bubble_boxes), start=1):
        new_b[nid] = bubbles[old]
        new_bb[nid] = bubble_boxes[old]
        new_bq[nid] = bubble_quads[old]
        new_bi[nid] = bubble_indices[old]
    return new_b, new_bb, new_bq, new_bi


def _merge_bubble_into(dst, src, bubbles, bubble_boxes, bubble_quads, bubble_indices, ocr):
    """Merge bubble *src* into bubble *dst* in place and delete *src*.

    The merged bubble's indices are the sorted union of both; its quads,
    bounding box and text lines are rebuilt from those indices.
    """
    idx = sorted(set(bubble_indices[dst] + bubble_indices[src]))
    bubble_indices[dst] = idx
    bubble_quads[dst] = [ocr[k][0] for k in idx]
    bubble_boxes[dst] = boxes_union_xyxy([quad_bbox(ocr[k][0]) for k in idx])
    bubbles[dst] = build_lines_from_indices(idx, ocr)
    for table in (bubble_indices, bubble_quads, bubble_boxes, bubbles):
        table.pop(src, None)


def _fold_small_groups(groups, is_meaningful):
    """Fold groups that fail *is_meaningful* into a neighbouring group.

    *groups* is an ordered list of index lists. A group that is not
    meaningful on its own is appended to the closest preceding kept group;
    leading undersized groups are prepended to the first kept group. No index
    is ever discarded unless no group at all is meaningful, in which case an
    empty list is returned so the caller can refuse to split.
    """
    kept = []
    pending = []  # undersized groups seen before the first meaningful one
    for g in groups:
        if is_meaningful(g):
            kept.append(pending + list(g))
            pending = []
        elif kept:
            kept[-1].extend(g)
        else:
            pending.extend(g)
    return kept


def reconcile_final_boxes(bubbles, bubble_boxes, bubble_quads, bubble_indices, ocr, image_bgr):
    """
    Final reconciliation pass over the detected boxes.

    Repeats, until nothing changes, three passes in priority order (at most
    one merge per pass, then restart from the first pass):
      (A) merge pairs that overlap heavily;
      (B) absorb a child box that is mostly contained in a parent box;
      (C) merge vertically adjacent fragments that lie in the same bubble
          contour and carry different text.

    The four input dicts are mutated in place while merging; the returned
    dicts are fresh and renumbered from 1.
    """
    if not bubble_boxes:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices

    quad_boxes = [quad_bbox(item[0]) for item in ocr]
    all_h = [max(1, qb[3] - qb[1]) for qb in quad_boxes]
    med_h = float(np.median(all_h)) if all_h else 14.0
    bubble_contours = detect_speech_bubbles(image_bgr)

    def merge(dst, src):
        _merge_bubble_into(dst, src, bubbles, bubble_boxes, bubble_quads, bubble_indices, ocr)

    def text_of(bid):
        return _bubble_text(bubble_indices[bid], ocr)

    def merge_overlapping():
        # (A) overlap relative to the smaller box, or IoU backed by a shared contour.
        bids = sorted(bubble_boxes)
        for pos, bi in enumerate(bids):
            for bj in bids[pos + 1:]:
                a, b = bubble_boxes[bi], bubble_boxes[bj]
                if boxes_overlap_ratio(a, b) >= 0.55 or (
                    boxes_iou(a, b) >= 0.35
                    and _in_same_bubble_contour(a, b, bubble_contours)
                ):
                    merge(bi, bj)
                    return True
        return False

    def absorb_children():
        # (B) a child is absorbed when >= 70% of it sits inside the parent and
        # it is either tiny/short or its center is close to the parent's.
        bids = sorted(bubble_boxes)
        for child in bids:
            cb = bubble_boxes[child]
            cw, ch = _box_dims(cb)
            for parent in bids:
                if parent == child:
                    continue
                pb = bubble_boxes[parent]
                if _containment_ratio(cb, pb) < 0.70:
                    continue  # cheap geometric reject before any text is built
                is_tiny = (cw <= med_h * 3.2 and ch <= med_h * 2.2) or len(text_of(child)) <= 14
                if is_tiny or _center_distance(cb, pb) <= med_h * 4.0:
                    merge(parent, child)
                    return True
        return False

    def merge_fragments():
        # (C) upper/lower fragments of one bubble: small vertical gap, similar
        # x-span, same contour, and not an identical duplicate text.
        bids = sorted(bubble_boxes)
        for pos, bi in enumerate(bids):
            for bj in bids[pos + 1:]:
                a, b = bubble_boxes[bi], bubble_boxes[bj]
                vert_gap = max(0, max(a[1], b[1]) - min(a[3], b[3]))
                if vert_gap > med_h * 2.8:
                    continue
                h_ix = max(0, min(a[2], b[2]) - max(a[0], b[0]))
                min_w = min(_box_dims(a)[0], _box_dims(b)[0])
                if h_ix / max(1, min_w) < 0.45:
                    continue
                if not _in_same_bubble_contour(a, b, bubble_contours):
                    continue
                if text_of(bi) != text_of(bj):
                    merge(bi, bj)
                    return True
        return False

    # Short-circuit `or` reproduces "restart from (A) after any change".
    while merge_overlapping() or absorb_children() or merge_fragments():
        pass

    return _reindex_boxes(bubbles, bubble_boxes, bubble_quads, bubble_indices)


def split_box_by_internal_vertical_gaps(bid, bubble_indices, ocr, factor=1.45, min_gap=16):
    """
    Multi-cut vertical splitter.

    Splits bubble *bid* into N top-to-bottom groups of OCR indices wherever
    the vertical gap between consecutive quads (ordered by center y) exceeds
    ``max(min_gap, median_quad_height * factor)``.

    A group counts as meaningful when it has at least 2 quads or at least 12
    characters of text. Groups that are not meaningful are folded into the
    neighbouring group rather than dropped, so every input index appears in
    exactly one returned group.

    Returns a list of index lists (length >= 2), or ``None`` when the bubble
    has fewer than 4 quads or no worthwhile split exists.
    """
    idxs = bubble_indices.get(bid, [])
    if len(idxs) < 4:
        return None

    items = []
    for i in idxs:
        b = quad_bbox(ocr[i][0])
        items.append((i, b, (b[1] + b[3]) / 2.0, max(1, b[3] - b[1])))
    items.sort(key=lambda it: it[2])  # top -> bottom

    med_h = float(np.median([it[3] for it in items]))
    th = max(min_gap, med_h * factor)

    # Start a new group every time the gap to the previous quad is too large.
    groups = [[items[0][0]]]
    for prev, cur in zip(items, items[1:]):
        if cur[1][1] - prev[1][3] > th:
            groups.append([])
        groups[-1].append(cur[0])

    if len(groups) <= 1:
        return None

    clean_groups = _fold_small_groups(
        groups,
        lambda g: len(g) >= 2 or len(_bubble_text(g, ocr)) >= 12,
    )
    if len(clean_groups) <= 1:
        return None

    return clean_groups


def force_split_bridged_boxes(bubbles, bubble_boxes, bubble_quads, bubble_indices, ocr, image_bgr):
    """
    Force-split boxes that accidentally contain multiple vertically separated speech chunks.

    For every box with at least 2 quads the following strategies are tried in
    order, and the first that yields a split wins:
      (A) internal vertical-gap multi-split;
      (B) split by speech-bubble contour membership (>= 3 quads);
      (C) balanced top/bottom halves separated by a visible gap (>= 8 quads).

    Returns four fresh dicts (lines, boxes, quads, indices) numbered from 1 in
    ascending order of the original ids; unsplit boxes are copied through
    unchanged. Split parts get the raw union box of their quads.
    """
    if not bubble_boxes:
        return bubbles, bubble_boxes, bubble_quads, bubble_indices

    bubble_contours = detect_speech_bubbles(image_bgr)

    def contour_id_for_idx(i):
        # Index of the first contour containing the quad's center, else -1.
        b = quad_bbox(ocr[i][0])
        cx = (b[0] + b[2]) / 2.0
        cy = (b[1] + b[3]) / 2.0
        for ci, c in enumerate(bubble_contours):
            if cv2.pointPolygonTest(c, (cx, cy), False) >= 0:
                return ci
        return -1

    def build_group_payload(g):
        g_sorted = sorted(g, key=lambda i: quad_center(ocr[i][0])[1])
        ub = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in g_sorted])
        return (
            build_lines_from_indices(g_sorted, ocr),  # lines
            ub,                                       # box
            [ocr[i][0] for i in g_sorted],            # quads
            g_sorted,                                 # indices
        )

    def split_by_contour(idxs):
        by_contour = {}
        for i in idxs:
            by_contour.setdefault(contour_id_for_idx(i), []).append(i)
        contour_groups = list(by_contour.values())
        if len(contour_groups) < 2:
            return None
        # Top -> bottom for a stable order.
        contour_groups.sort(key=lambda g: min(quad_bbox(ocr[i][0])[1] for i in g))
        # Tiny tails are not split off on their own; they are folded into the
        # neighbouring group so their text is kept.
        valid = _fold_small_groups(
            contour_groups,
            lambda g: len(g) >= 2 or len(_bubble_text(g, ocr)) >= 10,
        )
        return valid if len(valid) >= 2 else None

    def split_balanced_halves(idxs):
        ordered = sorted(
            idxs,
            key=lambda i: (quad_bbox(ocr[i][0])[1] + quad_bbox(ocr[i][0])[3]) / 2.0,
        )
        mid = len(ordered) // 2
        g1, g2 = ordered[:mid], ordered[mid:]
        b1 = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in g1])
        b2 = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in g2])
        if not (b1 and b2):
            return None

        vgap = max(0, b2[1] - b1[3])
        med_local_h = (max(1, b1[3] - b1[1]) + max(1, b2[3] - b2[1])) / 2.0
        h_ix = max(0, min(b1[2], b2[2]) - max(b1[0], b2[0]))
        min_w = max(1, min(b1[2] - b1[0], b2[2] - b2[0]))
        if vgap >= max(14, 0.22 * med_local_h) and h_ix / min_w >= 0.30:
            return [g1, g2]
        return None

    def find_parts(bid, idxs):
        if len(idxs) < 2:
            return None
        parts = split_box_by_internal_vertical_gaps(
            bid, bubble_indices, ocr, factor=1.45, min_gap=16
        )
        if parts is None and len(idxs) >= 3:
            parts = split_by_contour(idxs)
        if parts is None and len(idxs) >= 8:
            parts = split_balanced_halves(idxs)
        return parts

    new_bubbles, new_boxes, new_quads, new_indices = {}, {}, {}, {}
    next_bid = 1

    for bid in sorted(bubble_boxes):
        parts = find_parts(bid, bubble_indices.get(bid, []))
        if parts is None or len(parts) <= 1:
            payloads = [(bubbles[bid], bubble_boxes[bid], bubble_quads[bid], bubble_indices[bid])]
        else:
            payloads = [build_group_payload(g) for g in parts]

        for lines, box, quads, gidx in payloads:
            new_bubbles[next_bid] = lines
            new_boxes[next_bid] = box
            new_quads[next_bid] = quads
            new_indices[next_bid] = gidx
            next_bid += 1

    return new_bubbles, new_boxes, new_quads, new_indices
new_quads, new_indices -# ============================================================ -# MAIN PIPELINE -# ============================================================ def translate_manga_text( image_path="001-page.png", source_lang="en", @@ -2635,10 +2377,11 @@ def translate_manga_text( ): image = cv2.imread(image_path) if image is None: - print(f"❌ Cannot load image: {image_path}"); return + print(f"❌ Cannot load image: {image_path}") + return resolved_gap = auto_gap(image_path) if gap_px == "auto" else float(gap_px) - ih, iw = image.shape[:2] + ih, iw = image.shape[:2] print("Loading OCR engines...") if use_enhanced_ocr: @@ -2659,33 +2402,54 @@ def translate_manga_text( for region in missed_regions: rx1, ry1, rx2, ry2 = region pad = 10 - rx1, ry1 = max(0, rx1-pad), max(0, ry1-pad) - rx2, ry2 = min(iw, rx2+pad), min(ih, ry2+pad) + rx1, ry1 = max(0, rx1 - pad), max(0, ry1 - pad) + rx2, ry2 = min(iw, rx2 + pad), min(ih, ry2 + pad) crop = image[ry1:ry2, rx1:rx2] if crop.size > 0: - upscaled = cv2.resize(crop, None, fx=4.0, fy=4.0, - interpolation=cv2.INTER_CUBIC) + upscaled = cv2.resize(crop, None, fx=4.0, fy=4.0, interpolation=cv2.INTER_CUBIC) for quad, text, conf in detector.run_vision_ocr(upscaled): - raw.append(([[int(p[0]/4.0+rx1), int(p[1]/4.0+ry1)] - for p in quad], text, conf)) + raw.append(([[int(p[0] / 4.0 + rx1), int(p[1] / 4.0 + ry1)] for p in quad], text, conf)) print(f"📝 Total detections after missed region scan: {len(raw)}") # ── Filtering ───────────────────────────────────────────────────────── filtered, skipped = [], 0 for bbox, text, conf in raw: - t = normalize_text(text) + t = normalize_text(text) qb = quad_bbox(bbox) - if conf < confidence_threshold: skipped += 1; continue - if len(t) < min_text_length: skipped += 1; continue - if not is_valid_language(t, source_lang): skipped += 1; continue - if not is_meaningful_text(t, source_lang): skipped += 1; continue + + if conf < confidence_threshold: + skipped += 1 + continue + if len(t) < 
min_text_length: + skipped += 1 + continue + if not is_valid_language(t, source_lang): + skipped += 1 + continue + if not is_meaningful_text(t, source_lang): + skipped += 1 + continue if qb[1] < int(ih * TOP_BAND_RATIO) and conf < 0.70 and len(t) >= 5: - skipped += 1; continue + skipped += 1 + continue + filtered.append((bbox, t, conf)) print(f"Kept: {len(filtered)} | Skipped: {skipped}") + + # Protect short dialogue token confidence + tmp = [] + for bbox, t, conf in filtered: + tmp.append((bbox, t, maybe_conf_floor_for_protected(t, conf, floor=0.40))) + filtered = tmp + + # Rescue names/short tokens dropped by strict filters + rescued = rescue_name_and_short_tokens(raw, min_conf=0.20) + filtered = merge_rescued_items(filtered, rescued, iou_threshold=0.55) + if not filtered: - print("⚠️ No text after filtering."); return + print("⚠️ No text after filtering.") + return # ── Pre-grouping quad splits ────────────────────────────────────────── filtered, oversized_splits = validate_and_split_oversized_quads(image, filtered) @@ -2700,10 +2464,9 @@ def translate_manga_text( if bridge_splits > 0: print(f"🧩 Split {bridge_splits} abnormal bridge OCR quad(s).") - # Column-gap split: catches wide quads spanning two columns (BOX#6 type) - hs_pre = [max(1, quad_bbox(q)[3]-quad_bbox(q)[1]) for q, _, _ in filtered] + hs_pre = [max(1, quad_bbox(q)[3] - quad_bbox(q)[1]) for q, _, _ in filtered] med_h_pre = float(np.median(hs_pre)) if hs_pre else 14.0 - filtered, col_splits = apply_column_gap_splits(image, filtered, med_h_pre) + filtered, _ = apply_column_gap_splits(image, filtered, med_h_pre) filtered = normalize_ocr_quads(filtered) @@ -2711,77 +2474,112 @@ def translate_manga_text( print("📊 Grouping quads vertically...") bubbles, bubble_boxes, bubble_quads, bubble_indices = group_tokens_vertical( filtered, image.shape, gap_px=resolved_gap, - bbox_padding=1, strict_mode=strict_grouping) + bbox_padding=1, strict_mode=strict_grouping + ) print(f" Created {len(bubbles)} initial 
box(es)") # ── Auto-fix (split + merge) ────────────────────────────────────────── if auto_fix_bubbles: bubbles, bubble_boxes, bubble_quads, bubble_indices = auto_fix_bubble_detection( - bubble_boxes, bubble_indices, bubble_quads, bubbles, filtered, image) + bubble_boxes, bubble_indices, bubble_quads, bubbles, filtered, image + ) + + bubbles, bubble_boxes, bubble_quads, bubble_indices = merge_micro_boxes_relaxed( + bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered, image + ) # ── Enforce max box size ────────────────────────────────────────────── bubbles, bubble_boxes, bubble_quads, bubble_indices = enforce_max_box_size( bubble_boxes, bubble_indices, bubble_quads, bubbles, filtered, max_width_ratio=max_box_width_ratio, max_height_ratio=max_box_height_ratio, - image_shape=image.shape) + image_shape=image.shape + ) # ── Close-proximity merge ───────────────────────────────────────────── bubbles, bubble_boxes, bubble_quads, bubble_indices = merge_close_bubbles_by_line_height( - bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered) + bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered + ) # ── Per-bubble split pass ───────────────────────────────────────────── new_bubbles, new_bubble_boxes, new_bubble_quads, new_bubble_indices = {}, {}, {}, {} - next_bid = max(bubbles.keys()) + 1 if bubbles else 1 + next_bid = max(bubbles.keys()) + 1 if bubbles else 1 splits_performed = [] for bid in list(bubbles.keys()): split_result, split_reason = _split_bubble_if_needed( - bid, bubble_indices, bubble_quads, bubble_boxes, filtered, image, iw, ih) + bid, bubble_indices, bubble_quads, bubble_boxes, filtered, image, iw, ih + ) if split_result: p1, p2 = split_result splits_performed.append(f"BOX#{bid} ({split_reason})") for part_idxs, part_bid in [(p1, bid), (p2, next_bid)]: ub = boxes_union_xyxy([quad_bbox(filtered[i][0]) for i in part_idxs]) - new_bubbles[part_bid] = build_lines_from_indices(part_idxs, filtered) - new_bubble_boxes[part_bid] = 
(max(0,ub[0]-2), max(0,ub[1]-2), - min(iw-1,ub[2]+2), min(ih-1,ub[3]+2)) - new_bubble_quads[part_bid] = [filtered[i][0] for i in part_idxs] + new_bubbles[part_bid] = build_lines_from_indices(part_idxs, filtered) + new_bubble_boxes[part_bid] = ( + max(0, ub[0] - 2), max(0, ub[1] - 2), + min(iw - 1, ub[2] + 2), min(ih - 1, ub[3] + 2) + ) + new_bubble_quads[part_bid] = [filtered[i][0] for i in part_idxs] new_bubble_indices[part_bid] = part_idxs next_bid += 1 else: - new_bubbles[bid] = bubbles[bid] - new_bubble_boxes[bid] = bubble_boxes[bid] - new_bubble_quads[bid] = bubble_quads[bid] + new_bubbles[bid] = bubbles[bid] + new_bubble_boxes[bid] = bubble_boxes[bid] + new_bubble_quads[bid] = bubble_quads[bid] new_bubble_indices[bid] = bubble_indices[bid] if splits_performed: print(f"\n🔀 Splits detected: {len(splits_performed)}") - for s in splits_performed: print(f" ✓ {s}") + for s in splits_performed: + print(f" ✓ {s}") + + # IMPORTANT: commit split-pass results + bubbles = new_bubbles + bubble_boxes = new_bubble_boxes + bubble_quads = new_bubble_quads + bubble_indices = new_bubble_indices + + # ── Reattach orphan short tokens ────────────────────────────────────── + bubbles, bubble_boxes, bubble_quads, bubble_indices = reattach_orphan_short_tokens( + bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered + ) + + # ── Final reconciliation pass (overlaps, child absorb, complementary merge) ── + bubbles, bubble_boxes, bubble_quads, bubble_indices = reconcile_final_boxes( + bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered, image + ) + + bubbles, bubble_boxes, bubble_quads, bubble_indices = force_split_bridged_boxes( + bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered, image + ) + + bubbles, bubble_boxes, bubble_quads, bubble_indices = reconcile_final_boxes( + bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered, image + ) - # ── Remove nested / duplicate boxes ────────────────────────────────── - bubbles, bubble_boxes, bubble_quads, 
bubble_indices = remove_nested_boxes( - new_bubble_boxes, new_bubble_indices, new_bubble_quads, new_bubbles, - overlap_threshold=0.50) print(f"✅ Final box count: {len(bubbles)}") # ── OCR quality pass ────────────────────────────────────────────────── - translator = GoogleTranslator(source=source_lang, target=target_lang) - clean_lines: Dict[int, str] = {} + translator = GoogleTranslator(source=source_lang, target=target_lang) + clean_lines: Dict[int, str] = {} sources_used: Dict[int, str] = {} translations: Dict[int, str] = {} for bid, lines in bubbles.items(): base_txt = normalize_text(" ".join(lines)) - base_sc = ocr_candidate_score(base_txt) + base_sc = ocr_candidate_score(base_txt) txt, src_used = base_txt, "vision-base" + if base_sc < quality_threshold: rr_txt, rr_sc, rr_src = reread_bubble_with_vision( - image, bubble_boxes[bid], detector, upscale=3.0, pad=24) + image, bubble_boxes[bid], detector, upscale=3.0, pad=24 + ) if rr_txt and rr_sc > base_sc + 0.04 and is_valid_language(rr_txt, source_lang): txt, src_used = rr_txt, rr_src - clean_lines[bid] = normalize_text(txt) + + clean_lines[bid] = normalize_text(txt) sources_used[bid] = src_used reading_map = estimate_reading_order(bubble_boxes, mode=reading_mode) @@ -2789,22 +2587,26 @@ def translate_manga_text( # ── Translation ─────────────────────────────────────────────────────── for bid in sorted(clean_lines.keys(), key=lambda x: reading_map.get(x, x)): src_txt = clean_lines[bid].strip() - if not src_txt: continue - if not is_valid_language(src_txt, source_lang): continue - if not is_meaningful_text(src_txt, source_lang): continue + if not src_txt: + continue + if not is_valid_language(src_txt, source_lang): + continue + if not is_meaningful_text(src_txt, source_lang): + continue + try: tgt = translator.translate(src_txt) or "" tgt = postprocess_translation_general(tgt).upper() except Exception as e: tgt = f"[Error: {e}]" + translations[bid] = tgt if debug: - save_debug_clusters(image_path, filtered, 
bubble_boxes, bubble_indices, - clean_lines, "debug_clusters.png") + save_debug_clusters(image_path, filtered, bubble_boxes, bubble_indices, clean_lines, "debug_clusters.png") # ── Text output ─────────────────────────────────────────────────────── - divider = "─" * 120 + divider = "─" * 120 out_lines = ["BUBBLE|ORDER|OCR_SOURCE|ORIGINAL|TRANSLATED|FLAGS", divider] print(divider + f"\n{'BUBBLE':<8} {'ORDER':<6} {'SOURCE':<12} " f"{'ORIGINAL':<40} {'TRANSLATED':<40} FLAGS\n" + divider) @@ -2812,19 +2614,24 @@ def translate_manga_text( translated_count = 0 for bid in sorted(clean_lines.keys(), key=lambda x: reading_map.get(x, x)): src_txt = clean_lines[bid].strip() - if not src_txt: continue - if not is_valid_language(src_txt, source_lang): continue - if not is_meaningful_text(src_txt, source_lang): continue + if not src_txt: + continue + if not is_valid_language(src_txt, source_lang): + continue + if not is_meaningful_text(src_txt, source_lang): + continue - flags = [] - tgt = translations.get(bid, "") - if not tgt: flags.append("NO_TRANSLATION") - src_u = src_txt.upper() + flags = [] + tgt = translations.get(bid, "") + if not tgt: + flags.append("NO_TRANSLATION") + src_u = src_txt.upper() src_engine = sources_used.get(bid, "unknown") out_lines.append( - f"#{bid}|{reading_map.get(bid,bid)}|{src_engine}|{src_u}|{tgt}|" - f"{','.join(flags) if flags else '-'}") + f"#{bid}|{reading_map.get(bid, bid)}|{src_engine}|{src_u}|{tgt}|" + f"{','.join(flags) if flags else '-'}" + ) print(f"#{bid:<7} {reading_map.get(bid,bid):<6} {src_engine:<12} " f"{src_u[:40]:<40} {tgt[:40]:<40} {','.join(flags) if flags else '-'}") translated_count += 1 @@ -2837,21 +2644,25 @@ def translate_manga_text( bubbles_payload = {} for bid in sorted(clean_lines.keys(), key=lambda x: reading_map.get(x, x)): src_txt = clean_lines[bid].strip() - if not src_txt: continue - if not is_valid_language(src_txt, source_lang): continue - if not is_meaningful_text(src_txt, source_lang): continue + if not 
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""One-shot source patcher for ``manga-translator.py``.

Each ``ensure_*`` / ``replace_*`` step is a pure ``str -> str`` transform that
returns its input unchanged when its anchor text is absent or the fix is
already present, so the script can be re-run safely.
"""

import re
from pathlib import Path

TARGET = Path("manga-translator.py")

# Matches a top-level ``if __name__ == "__main__":`` guard written with either
# quote style, optionally followed by trailing blanks and/or a comment.
_ENTRYPOINT_RE = re.compile(
    r"""^if[ \t]+__name__[ \t]*==[ \t]*(["'])__main__\1[ \t]*:[ \t]*(?:#.*)?\r?$""",
    re.MULTILINE,
)

# NOTE(review): the leading indentation inside the string literals below was
# reconstructed as 4/8/12 spaces (function body of translate_manga_text);
# confirm against the real target file before running.
_INDENT = "    "


def cut_after_first_entrypoint(text: str) -> str:
    """
    Keep only first full __main__ block and remove duplicated tail if present.

    Everything up to and including the first entry-point guard is kept,
    followed by the guard's body: blank lines and lines that start with a
    space or tab. The first non-blank line back at column 0 ends the block
    and it, plus everything after it, is dropped. Text without a guard is
    returned unchanged.
    """
    m = _ENTRYPOINT_RE.search(text)
    if not m:
        return text

    start = m.start()
    tail = text[start:].splitlines(True)
    keep = [tail[0]]  # the `if __name__ ...` line itself
    for ln in tail[1:]:
        # A non-blank line dedented back to column 0 ends the block.
        if ln.strip() and not ln.startswith((" ", "\t")):
            break
        keep.append(ln)

    return text[:start] + "".join(keep)


def replace_bad_vars(text: str) -> str:
    """Rewrite calls that pass ``ocr`` / ``image_bgr`` to use ``filtered`` / ``image``."""
    replacements = (
        (
            "merge_micro_boxes_relaxed(bubbles, bubble_boxes, bubble_quads, bubble_indices, ocr, image_bgr)",
            "merge_micro_boxes_relaxed(bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered, image)",
        ),
        (
            "reattach_orphan_short_tokens(bubbles, bubble_boxes, bubble_quads, bubble_indices, ocr)",
            "reattach_orphan_short_tokens(bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered)",
        ),
    )
    for old, new in replacements:
        text = text.replace(old, new)
    return text


def ensure_autofix_chain(text: str) -> str:
    """Make the auto-fix step call ``auto_fix_bubble_detection`` before the micro-box merge."""
    header = (
        _INDENT + "# ── Auto-fix (split + merge) ──────────────────────────────────────────\n"
        + _INDENT + "if auto_fix_bubbles:\n"
    )
    old = (
        header
        + _INDENT * 2 + "bubbles, bubble_boxes, bubble_quads, bubble_indices = merge_micro_boxes_relaxed(bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered, image)\n"
    )
    new = (
        header
        + _INDENT * 2 + "bubbles, bubble_boxes, bubble_quads, bubble_indices = auto_fix_bubble_detection(\n"
        + _INDENT * 3 + "bubble_boxes, bubble_indices, bubble_quads, bubbles, filtered, image)\n"
        + _INDENT * 2 + "bubbles, bubble_boxes, bubble_quads, bubble_indices = merge_micro_boxes_relaxed(\n"
        + _INDENT * 3 + "bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered, image)\n"
    )
    return text.replace(old, new)


def ensure_split_commit(text: str) -> str:
    """Insert the ``bubbles = new_bubbles`` commit lines before the nested-box removal marker.

    No-op when the marker is missing or a commit line is already present.
    """
    marker = _INDENT + "# ── Remove nested / duplicate boxes ──────────────────────────────────\n"
    if marker not in text:
        return text

    if "bubbles = new_bubbles" in text:
        return text

    inject = (
        _INDENT + "bubbles = new_bubbles\n"
        + _INDENT + "bubble_boxes = new_bubble_boxes\n"
        + _INDENT + "bubble_quads = new_bubble_quads\n"
        + _INDENT + "bubble_indices = new_bubble_indices\n\n"
    )
    return text.replace(marker, inject + marker)


def ensure_rescue_pipeline(text: str) -> str:
    """Append the confidence-floor and token-rescue steps after the "Kept/Skipped" print.

    No-op when the anchor line is missing or the rescue call already exists.
    """
    anchor = _INDENT + 'print(f"Kept: {len(filtered)} | Skipped: {skipped}")\n'
    if anchor not in text:
        return text

    if "rescue_name_and_short_tokens(raw" in text:
        return text

    block = (
        anchor
        + _INDENT + "# Protect short dialogue tokens confidence\n"
        + _INDENT + "tmp = []\n"
        + _INDENT + "for bbox, t, conf in filtered:\n"
        + _INDENT * 2 + "tmp.append((bbox, t, maybe_conf_floor_for_protected(t, conf, floor=0.40)))\n"
        + _INDENT + "filtered = tmp\n"
        + _INDENT + "# Rescue names/short tokens dropped by strict filters\n"
        + _INDENT + "rescued = rescue_name_and_short_tokens(raw, min_conf=0.20)\n"
        + _INDENT + "filtered = merge_rescued_items(filtered, rescued, iou_threshold=0.55)\n"
    )
    return text.replace(anchor, block)


def apply_patches(text: str) -> str:
    """Run every patch step, in order, over *text* and return the result."""
    steps = (
        cut_after_first_entrypoint,
        replace_bad_vars,
        ensure_autofix_chain,
        ensure_split_commit,
        ensure_rescue_pipeline,
    )
    for step in steps:
        text = step(text)
    return text


def main():
    """Patch ``TARGET`` in place.

    Raises:
        FileNotFoundError: if ``TARGET`` does not exist.
    """
    if not TARGET.exists():
        raise FileNotFoundError(f"Not found: {TARGET}")

    src = TARGET.read_text(encoding="utf-8")
    out = apply_patches(src)

    # Do not rewrite the file (or claim success) when no step changed anything.
    if out == src:
        print(f"ℹ️ No changes needed for {TARGET}")
        return

    TARGET.write_text(out, encoding="utf-8")
    print("✅ Patched manga-translator.py")


if __name__ == "__main__":
    main()