diff --git a/manga-translator.py b/manga-translator.py index 4b49525..5209481 100644 --- a/manga-translator.py +++ b/manga-translator.py @@ -114,7 +114,7 @@ def is_noise_text(text: str) -> bool: return True if looks_like_box_tag(t): return True - + if len(t) <= 2 and not re.search(r"[A-Z0-9\?\!\.]", t) and not t.isalpha(): return True @@ -194,55 +194,141 @@ def ocr_candidate_score(text: str) -> float: # ============================================================ -# SPLITTERS +# SPLITTERS + QUAD NORMALIZATION # ============================================================ +def estimate_char_capacity_width(text_len, med_h, k=0.72): + return max(18.0, text_len * med_h * k) + + +def shrink_ocr_quad_to_text(quad, text, med_h): + x1, y1, x2, y2 = quad_bbox(quad) + w = max(1, x2 - x1) + h = max(1, y2 - y1) + + t = (text or "").strip() + n = max(1, len(t.replace(" ", ""))) + exp_w = estimate_char_capacity_width(n, med_h, k=0.62) + max_w = max(exp_w * 1.35, h * 1.15) + + if w <= max_w: + return quad + + cx = (x1 + x2) / 2.0 + nw = int(round(max_w)) + nx1 = int(round(cx - nw / 2)) + nx2 = int(round(cx + nw / 2)) + + return [[nx1, y1], [nx2, y1], [nx2, y2], [nx1, y2]] + + +def normalize_ocr_quads(filtered_ocr): + if not filtered_ocr: + return filtered_ocr + + hs = [max(1, quad_bbox(q)[3] - quad_bbox(q)[1]) for q, _, _ in filtered_ocr] + med_h = float(np.median(hs)) if hs else 14.0 + + out = [] + for quad, text, conf in filtered_ocr: + nq = shrink_ocr_quad_to_text(quad, text, med_h) + out.append((nq, text, conf)) + return out + + +def split_abnormal_bridge_quads(image_bgr, filtered_ocr): + if not filtered_ocr: + return filtered_ocr, 0 + + hs = [max(1, quad_bbox(q)[3] - quad_bbox(q)[1]) for q, _, _ in filtered_ocr] + med_h = float(np.median(hs)) if hs else 14.0 + + out = [] + splits = 0 + + for quad, text, conf in filtered_ocr: + x1, y1, x2, y2 = quad_bbox(quad) + w = max(1, x2 - x1) + h = max(1, y2 - y1) + + if w > med_h * 11.0 and " " in text and len(text) >= 14: + roi = image_bgr[max(0, y1):min(image_bgr.shape[0], y2), max(0, x1):min(image_bgr.shape[1], x2)] + if roi.size > 0: + gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY) + _, inv = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU) + proj = np.sum(inv, axis=0) + + s = int(w * 0.18) + e = int(w * 0.82) + if e > s: + segment = proj[s:e] + valley_rel = int(np.argmin(segment)) + valley_x = s + valley_rel + + low = float(segment[valley_rel]) + meanv = float(np.mean(segment)) + if low < meanv * 0.52: + split_x = x1 + valley_x + + char_w = w / max(1, len(text)) + split_idx = int((split_x - x1) / max(1e-6, char_w)) + spaces = [i for i, c in enumerate(text) if c == " "] + if spaces: + split_idx = min(spaces, key=lambda i: abs(i - split_idx)) + + left_t = text[:split_idx].strip() + right_t = text[split_idx:].strip() + + if left_t and right_t: + ql = [[x1, y1], [split_x, y1], [split_x, y2], [x1, y2]] + qr = [[split_x, y1], [x2, y1], [x2, y2], [split_x, y2]] + out.append((ql, left_t, conf)) + out.append((qr, right_t, conf)) + splits += 1 + continue + + out.append((quad, text, conf)) + + return out, splits + + def split_wide_ocr_items(image_bgr, filtered_ocr): - """ - Detects if Apple Vision incorrectly merged two columns into a single wide line. - It measures the width of the white gaps and only splits if the gap is - significantly wider than a normal space between words. - """ new_filtered = [] splits_made = 0 - + for item in filtered_ocr: quad, text, conf = item x1, y1, x2, y2 = quad_bbox(quad) w = x2 - x1 h = max(1, y2 - y1) - - # Check if it's abnormally wide + if w > h * 2.5 and len(text) > 5 and ' ' in text: pad = 2 roi_y1 = max(0, y1 - pad) roi_y2 = min(image_bgr.shape[0], y2 + pad) roi_x1 = max(0, x1) roi_x2 = min(image_bgr.shape[1], x2) - + roi = image_bgr[roi_y1:roi_y2, roi_x1:roi_x2] if roi.size > 0: gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY) _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU) proj = np.sum(thresh, axis=0) - + start_x = int(w * 0.20) end_x = int(w * 0.80) - + if start_x < end_x: - # Calculate expected character width char_w = w / max(1, len(text)) - # A real column gap should be at least 2.5 chars wide or 75% of line height min_gap_width = max(int(char_w * 2.5), int(h * 0.75)) - + gap_threshold = h * 255 * 0.15 gap_mask = proj < gap_threshold - - # Find the widest continuous gap + best_gap_start = -1 best_gap_len = 0 current_gap_start = -1 current_gap_len = 0 - + for x_rel in range(start_x, end_x): if gap_mask[x_rel]: if current_gap_len == 0: @@ -253,26 +339,24 @@ def split_wide_ocr_items(image_bgr, filtered_ocr): best_gap_len = current_gap_len best_gap_start = current_gap_start current_gap_len = 0 - + if current_gap_len > best_gap_len: best_gap_len = current_gap_len best_gap_start = current_gap_start - - # ONLY split if the gap is wide enough to be a gutter between bubbles + if best_gap_len >= min_gap_width: split_x = roi_x1 + best_gap_start + (best_gap_len // 2) - - split_idx = int((split_x - x1) / char_w) - + + split_idx = int((split_x - x1) / max(1e-6, char_w)) spaces = [i for i, c in enumerate(text) if c == ' '] if spaces: best_space = min(spaces, key=lambda i: abs(i - split_idx)) if abs(best_space - split_idx) < len(text) * 0.35: split_idx = best_space - + text_left = text[:split_idx].strip() text_right = text[split_idx:].strip() - + if text_left and text_right: quad_left = [[x1, y1], [split_x, y1], [split_x, y2], [x1, y2]] quad_right = [[split_x, y1], [x2, y1], [x2, y2], [split_x, y2]] @@ -280,10 +364,9 @@ def split_wide_ocr_items(image_bgr, filtered_ocr): new_filtered.append((quad_right, text_right, conf)) splits_made += 1 continue - - # If no split was made, keep the original item + new_filtered.append(item) - + return new_filtered, splits_made @@ -291,82 +374,82 @@ def split_panel_box(image_bgr, bbox_xyxy, bubble_quads=None): x1, y1, x2, y2 = bbox_xyxy w = x2 - x1 h = y2 - y1 - + if bubble_quads is not None and len(bubble_quads) < 4: return None - + if w < 50 or h < 50: return None - + roi = image_bgr[y1:y2, x1:x2] if roi.size == 0: return None - + gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY) _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY_INV) - + vertical_projection = np.sum(thresh, axis=0) - + search_start = int(w * 0.25) search_end = int(w * 0.75) - + if search_start >= search_end: return None - + peak_x_relative = np.argmax(vertical_projection[search_start:search_end]) + search_start peak_val = vertical_projection[peak_x_relative] - + threshold_val = h * 255 * 0.25 significant_peaks = [] - + for x_rel in range(search_start, search_end): if vertical_projection[x_rel] > threshold_val: significant_peaks.append((x_rel, vertical_projection[x_rel])) - + if len(significant_peaks) > 1: min_proj_val = np.min(vertical_projection[search_start:search_end]) min_proj_idx = np.argmin(vertical_projection[search_start:search_end]) + search_start - + if min_proj_val < threshold_val * 0.6: split_x_absolute = x1 + min_proj_idx box_left = (x1, y1, split_x_absolute, y2) box_right = (split_x_absolute, y1, x2, y2) return box_left, box_right, split_x_absolute - + if peak_val > (h * 255 * 0.40): split_x_absolute = x1 + peak_x_relative box_left = (x1, y1, split_x_absolute, y2) box_right = (split_x_absolute, y1, x2, y2) return box_left, box_right, split_x_absolute - + return None def split_bubble_if_multiple_columns(indices, ocr, bid=None, use_aggressive_thresholds=False): if len(indices) < 2: return None - + boxes = [quad_bbox(ocr[i][0]) for i in indices] sorted_items = sorted(zip(indices, boxes), key=lambda x: x[1][0]) - + gaps = [] current_max_x = sorted_items[0][1][2] - + for i in range(1, len(sorted_items)): idx, b = sorted_items[i] x1 = b[0] gap = x1 - current_max_x gaps.append((i, gap, current_max_x, x1)) current_max_x = max(current_max_x, b[2]) - + if not gaps: return None - + max_gap_idx, max_gap_size, _, _ = max(gaps, key=lambda x: x[1]) - + hs = [b[3] - b[1] for b in boxes] med_h = float(np.median(hs)) if hs else 15.0 - + if use_aggressive_thresholds: threshold1 = 60.0 threshold2 = med_h * 1.0 @@ -375,66 +458,238 @@ def split_bubble_if_multiple_columns(indices, ocr, bid=None, use_aggressive_thre threshold1 = 90.0 threshold2 = med_h * 1.5 min_gap = 25.0 - + if max_gap_size > threshold1 or (max_gap_size > threshold2 and max_gap_size > min_gap): split_idx = max_gap_idx left_indices = [item[0] for item in sorted_items[:split_idx]] right_indices = [item[0] for item in sorted_items[split_idx:]] - + if len(left_indices) < 1 or len(right_indices) < 1: return None - + return left_indices, right_indices - + return None def split_bubble_if_multiple_rows(indices, ocr, bid=None): if len(indices) < 2: return None - + boxes = [quad_bbox(ocr[i][0]) for i in indices] sorted_items = sorted(zip(indices, boxes), key=lambda x: x[1][1]) - + gaps = [] current_max_y = sorted_items[0][1][3] - + for i in range(1, len(sorted_items)): idx, b = sorted_items[i] y1 = b[1] gap = y1 - current_max_y gaps.append((i, gap, current_max_y, y1)) current_max_y = max(current_max_y, b[3]) - + if not gaps: return None - + max_gap_idx, max_gap_size, _, _ = max(gaps, key=lambda x: x[1]) - + hs = [b[3] - b[1] for b in boxes] med_h = float(np.median(hs)) if hs else 15.0 - + threshold = med_h * 1.8 min_gap = 20.0 - + if max_gap_size > threshold and max_gap_size > min_gap: split_idx = max_gap_idx top_indices = [item[0] for item in sorted_items[:split_idx]] bottom_indices = [item[0] for item in sorted_items[split_idx:]] - + if len(top_indices) >= 1 and len(bottom_indices) >= 1: return top_indices, bottom_indices - + return None +def is_vertical_text_like(indices, ocr): + if len(indices) < 2: + return False + + bxs = [quad_bbox(ocr[i][0]) for i in indices] + ub = boxes_union_xyxy(bxs) + if ub is None: + return False + + x1, y1, x2, y2 = ub + w = max(1, x2 - x1) + h = max(1, y2 - y1) + + aspect = h / w + xcs = [((b[0] + b[2]) / 2.0) for b in bxs] + x_spread = float(np.std(xcs)) if len(xcs) > 1 else 0.0 + med_h = float(np.median([max(1, b[3]-b[1]) for b in bxs])) + + ys = sorted([((b[1] + b[3]) / 2.0) for b in bxs]) + gaps = [ys[i+1] - ys[i] for i in range(len(ys)-1)] if len(ys) >= 2 else [0] + med_gap = float(np.median(gaps)) if gaps else 0.0 + + return ( + aspect > 1.35 and + x_spread < max(10.0, med_h * 0.9) and + med_gap > max(6.0, med_h * 0.35) + ) + + +def split_cluster_by_big_vertical_gap(indices, ocr, factor=1.9, min_gap=22): + if len(indices) < 2: + return None + + items = [] + for i in indices: + b = quad_bbox(ocr[i][0]) + yc = (b[1] + b[3]) / 2.0 + h = max(1.0, b[3] - b[1]) + items.append((i, b, yc, h)) + + items.sort(key=lambda t: t[2]) + med_h = float(np.median([t[3] for t in items])) if items else 12.0 + + best_k = -1 + best_gap = -1 + for k in range(len(items)-1): + y_top = items[k][1][3] + y_bot = items[k+1][1][1] + gap = y_bot - y_top + if gap > best_gap: + best_gap = gap + best_k = k + + if best_k < 0: + return None + + if best_gap > max(min_gap, med_h * factor): + a = [t[0] for t in items[:best_k+1]] + b = [t[0] for t in items[best_k+1:]] + if a and b: + return a, b + return None + + +def split_nested_or_side_by_side(indices, ocr): + if len(indices) < 2: + return None + + boxes = [quad_bbox(ocr[i][0]) for i in indices] + xcs = np.array([[(b[0] + b[2]) / 2.0] for b in boxes], dtype=np.float32) + + c1 = float(np.min(xcs)) + c2 = float(np.max(xcs)) + if abs(c2 - c1) < 8: + return None + + for _ in range(12): + g1, g2 = [], [] + for idx, v in enumerate(xcs[:, 0]): + if abs(v - c1) <= abs(v - c2): + g1.append(idx) + else: + g2.append(idx) + if not g1 or not g2: + return None + new_c1 = float(np.mean([xcs[i, 0] for i in g1])) + new_c2 = float(np.mean([xcs[i, 0] for i in g2])) + if abs(new_c1 - c1) < 0.5 and abs(new_c2 - c2) < 0.5: + break + c1, c2 = new_c1, new_c2 + + left_group = g1 if c1 < c2 else g2 + right_group = g2 if c1 < c2 else g1 + + left_idxs = [indices[i] for i in left_group] + right_idxs = [indices[i] for i in right_group] + if not left_idxs or not right_idxs: + return None + + left_box = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in left_idxs]) + right_box = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in right_idxs]) + + sep = right_box[0] - left_box[2] + if sep < -8: + return None + + return left_idxs, right_idxs + + +def merge_close_bubbles_by_line_height(bubbles, bubble_boxes, bubble_quads, bubble_indices, ocr): + bids = sorted(bubbles.keys()) + used = set() + out_b, out_bb, out_bq, out_bi = {}, {}, {}, {} + nbid = 1 + + all_h = [] + for i in range(len(ocr)): + b = quad_bbox(ocr[i][0]) + all_h.append(max(1, b[3]-b[1])) + med_h = float(np.median(all_h)) if all_h else 14.0 + + for i, a in enumerate(bids): + if a in used: + continue + used.add(a) + group = [a] + + ax1, ay1, ax2, ay2 = bubble_boxes[a] + + for b in bids[i+1:]: + if b in used: + continue + bx1, by1, bx2, by2 = bubble_boxes[b] + + acx, acy = (ax1+ax2)/2.0, (ay1+ay2)/2.0 + bcx, bcy = (bx1+bx2)/2.0, (by1+by2)/2.0 + dx, dy = abs(acx-bcx), abs(acy-bcy) + + near = dx < med_h * 10.0 and dy < med_h * 3.6 + touching = overlap_or_near((ax1, ay1, ax2, ay2), (bx1, by1, bx2, by2), gap=int(med_h*1.25)) + + ua = boxes_union_xyxy([(ax1, ay1, ax2, ay2), (bx1, by1, bx2, by2)]) + area_a = max(1, (ax2-ax1)*(ay2-ay1)) + area_b = max(1, (bx2-bx1)*(by2-by1)) + area_u = max(1, (ua[2]-ua[0])*(ua[3]-ua[1])) + compact_union = area_u < (area_a + area_b) * 1.65 + + if near and touching and compact_union: + group.append(b) + used.add(b) + ax1 = min(ax1, bx1); ay1 = min(ay1, by1); ax2 = max(ax2, bx2); ay2 = max(ay2, by2) + + idxs = [] + quads = [] + for g in group: + idxs.extend(bubble_indices[g]) + quads.extend(bubble_quads[g]) + + idxs = sorted(set(idxs)) + ub = boxes_union_xyxy([quad_bbox(ocr[k][0]) for k in idxs]) + if ub is None: + continue + + out_b[nbid] = build_lines_from_indices(idxs, ocr) + out_bb[nbid] = ub + out_bq[nbid] = quads + out_bi[nbid] = idxs + nbid += 1 + + return out_b, out_bb, out_bq, out_bi + + # ============================================================ # OCR ENGINES (Apple Native Vision) # ============================================================ class MacVisionDetector: def __init__(self, source_lang="en"): lang_key = source_lang.lower().strip() - + lang_map = { "en": "en-US", "english": "en-US", "es": "es-ES", "spanish": "es-ES", @@ -446,7 +701,7 @@ class MacVisionDetector: "ko": "ko-KR", "korean": "ko-KR", "zh": "zh-Hans", "chinese": "zh-Hans" } - + apple_lang = lang_map.get(lang_key, "en-US") self.langs = [apple_lang] print(f"⚡ Using Apple Vision OCR (Language: {self.langs[0]})") @@ -503,7 +758,6 @@ class MacVisionDetector: request.setRecognitionLanguages_(self.langs) handler.performRequests_error_([request], None) - return results @@ -596,13 +850,7 @@ def rebuild_text_from_vision_result(res): return normalize_text(" ".join(lines)) -def reread_bubble_with_vision( - image_bgr, - bbox_xyxy, - vision_detector: MacVisionDetector, - upscale=3.0, - pad=24 -): +def reread_bubble_with_vision(image_bgr, bbox_xyxy, vision_detector: MacVisionDetector, upscale=3.0, pad=24): ih, iw = image_bgr.shape[:2] x1, y1, x2, y2 = bbox_xyxy x1 = max(0, int(x1 - pad)); y1 = max(0, int(y1 - pad)) @@ -616,7 +864,7 @@ def reread_bubble_with_vision( angles = [0.0, 1.5, -1.5] best_v_txt, best_v_sc = "", 0.0 - up0 = cv2.resize(crop, (int(crop.shape[1]*upscale), int(crop.shape[0]*upscale)), interpolation=cv2.INTER_CUBIC) + up0 = cv2.resize(crop, (int(crop.shape[1] * upscale), int(crop.shape[0] * upscale)), interpolation=cv2.INTER_CUBIC) for mode in modes: proc = preprocess_variant(up0, mode) @@ -697,7 +945,7 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None): med_h = float(np.median([it["h"] for it in items])) row_tol = max(6.0, med_h * 0.90) gap_x_tol = max(8.0, med_h * 1.25) - pad = max(3, int(round(med_h * 0.22))) + pad = max(2, int(round(med_h * 0.14))) rows = [] for it in sorted(items, key=lambda x: x["yc"]): @@ -736,7 +984,7 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None): ub = boxes_union_xyxy([x["b"] for x in ch]) if ub: x1, y1, x2, y2 = ub - out_boxes.append((x1 - pad, y1 - int(round(pad*1.35)), x2 + pad, y2 + int(round(pad*0.95)))) + out_boxes.append((x1 - pad, y1 - int(round(pad * 1.2)), x2 + pad, y2 + int(round(pad * 0.9)))) if image_shape is not None: ih, iw = image_shape[:2] @@ -759,7 +1007,7 @@ def auto_gap(image_path, base=18, ref_w=750): return base * (img.shape[1] / ref_w) -def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=3): +def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=1): n = len(ocr) if n == 0: return {}, {}, {}, {} @@ -769,7 +1017,6 @@ def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=3): hs = [max(1.0, b[3] - b[1]) for b in boxes] med_h = float(np.median(hs)) if hs else 12.0 dist_thresh = max(20.0, med_h * 1.8) - adaptive_gap_y = max(gap_px, med_h * 2.5) p = list(range(n)) @@ -789,23 +1036,20 @@ def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=3): bx1, by1, bx2, by2 = boxes[j] gap_x = max(0, max(ax1, bx1) - min(ax2, bx2)) gap_y = max(0, max(ay1, by1) - min(ay2, by2)) - + cx1, cy1 = centers[i] cx2, cy2 = centers[j] is_vertically_aligned = abs(cx1 - cx2) < (med_h * 1.5) - - if gap_x == 0 and gap_y <= (med_h * 3.5): - unite(i, j) - continue - if is_vertically_aligned and gap_y <= (med_h * 3.5): - unite(i, j) - continue + if gap_x == 0 and gap_y <= (med_h * 3.5): + unite(i, j); continue + + if is_vertically_aligned and gap_y <= (med_h * 3.2): + unite(i, j); continue if gap_x <= gap_px and gap_y <= adaptive_gap_y: - unite(i, j) - continue - + unite(i, j); continue + d = ((cx1 - cx2) ** 2 + (cy1 - cy2) ** 2) ** 0.5 if d <= dist_thresh and abs(cy1 - cy2) <= med_h * 1.5: unite(i, j) @@ -831,8 +1075,7 @@ def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=3): continue x1, y1, x2, y2 = ub - - adaptive_pad = max(bbox_padding, int(round(med_h * 0.35))) + adaptive_pad = max(1, int(round(med_h * 0.16))) x1 = max(0, x1 - adaptive_pad); y1 = max(0, y1 - adaptive_pad) x2 = min(iw - 1, x2 + adaptive_pad); y2 = min(ih - 1, y2 + adaptive_pad) @@ -847,14 +1090,7 @@ def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=3): # ============================================================ # DEBUG / EXPORT # ============================================================ -def save_debug_clusters( - image_path, - ocr, - bubble_boxes, - bubble_indices, - clean_lines=None, - out_path="debug_clusters.png" -): +def save_debug_clusters(image_path, ocr, bubble_boxes, bubble_indices, clean_lines=None, out_path="debug_clusters.png"): img = cv2.imread(image_path) if img is None: return @@ -874,23 +1110,20 @@ def save_debug_clusters( text = clean_lines[bid] words = text.split() lines = [] - current_line = "" - - for word in words: - if len(current_line) + len(word) < 25: - current_line += word + " " + cur = "" + for w in words: + if len(cur) + len(w) < 25: + cur += w + " " else: - lines.append(current_line.strip()) - current_line = word + " " - if current_line: - lines.append(current_line.strip()) + lines.append(cur.strip()) + cur = w + " " + if cur: + lines.append(cur.strip()) y_text = y2 + 18 for line in lines: - cv2.putText(img, line, (x1, y_text), - cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 3) - cv2.putText(img, line, (x1, y_text), - cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1) + cv2.putText(img, line, (x1, y_text), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 3) + cv2.putText(img, line, (x1, y_text), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1) y_text += 18 cv2.imwrite(out_path, img) @@ -1018,12 +1251,26 @@ def translate_manga_text( print("⚠️ No text after filtering.") return + # 1) split obvious wide OCR merges filtered, splits_made = split_wide_ocr_items(image, filtered) if splits_made > 0: print(f"✂️ Split {splits_made} wide OCR lines across column gaps.") + # 2) split giant bridge quads (fixes page16 BOX19-like glue) + filtered, bridge_splits = split_abnormal_bridge_quads(image, filtered) + if bridge_splits > 0: + print(f"🧩 Split {bridge_splits} abnormal bridge OCR quad(s).") + + # 3) shrink quads to tighter text footprint + filtered = normalize_ocr_quads(filtered) + bubbles, bubble_boxes, bubble_quads, bubble_indices = group_tokens( - filtered, image.shape, gap_px=resolved_gap, bbox_padding=3 + filtered, image.shape, gap_px=resolved_gap, bbox_padding=1 + ) + + # merge accidental sibling fragments (fixes page15 BOX11+BOX16 style) + bubbles, bubble_boxes, bubble_quads, bubble_indices = merge_close_bubbles_by_line_height( + bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered ) new_bubbles, new_bubble_boxes, new_bubble_quads, new_bubble_indices = {}, {}, {}, {} @@ -1033,57 +1280,79 @@ def translate_manga_text( for bid in list(bubbles.keys()): box = bubble_boxes[bid] bubble_split = None - - split_result = split_panel_box(image, box, bubble_quads=bubble_quads[bid]) - if split_result: - box_left, box_right, split_x = split_result - left_idxs, right_idxs = [], [] - - for idx in bubble_indices[bid]: - cx, cy = quad_center(filtered[idx][0]) - if cx < split_x: - left_idxs.append(idx) - else: - right_idxs.append(idx) - - if left_idxs and right_idxs: - bubble_split = (left_idxs, right_idxs) - splits_performed.append(f"BOX#{bid} (panel border at x={split_x})") - elif len(bubble_quads[bid]) >= 4: - col_split = split_bubble_if_multiple_columns(bubble_indices[bid], filtered, bid=bid, use_aggressive_thresholds=True) - if col_split: - left_idxs, right_idxs = col_split - if left_idxs and right_idxs: - bubble_split = (left_idxs, right_idxs) - splits_performed.append(f"BOX#{bid} ({len(left_idxs)} quads | {len(right_idxs)} quads)") - + + if is_vertical_text_like(bubble_indices[bid], filtered): + vgap_split = split_cluster_by_big_vertical_gap(bubble_indices[bid], filtered, factor=1.7, min_gap=18) + if vgap_split: + bubble_split = vgap_split + splits_performed.append(f"BOX#{bid} (vertical-stack y-gap split)") + + if bubble_split is None: + split_result = split_panel_box(image, box, bubble_quads=bubble_quads[bid]) + if split_result: + _, _, split_x = split_result + left_idxs, right_idxs = [], [] + for idx in bubble_indices[bid]: + cx, cy = quad_center(filtered[idx][0]) + if cx < split_x: + left_idxs.append(idx) + else: + right_idxs.append(idx) + + if left_idxs and right_idxs: + bubble_split = (left_idxs, right_idxs) + splits_performed.append(f"BOX#{bid} (panel border at x={split_x})") + elif len(bubble_quads[bid]) >= 4: + col_split = split_bubble_if_multiple_columns(bubble_indices[bid], filtered, bid=bid, use_aggressive_thresholds=True) + if col_split: + l, r = col_split + if l and r: + bubble_split = (l, r) + splits_performed.append(f"BOX#{bid} ({len(l)} quads | {len(r)} quads)") + if bubble_split is None: col_split = split_bubble_if_multiple_columns(bubble_indices[bid], filtered, bid=bid) if col_split: - left_idxs, right_idxs = col_split - if left_idxs and right_idxs: - bubble_split = (left_idxs, right_idxs) - splits_performed.append(f"BOX#{bid} (Vertical Column Split: {len(left_idxs)} | {len(right_idxs)} quads)") - + l, r = col_split + if l and r: + bubble_split = (l, r) + splits_performed.append(f"BOX#{bid} (Vertical Column Split: {len(l)} | {len(r)} quads)") + + if bubble_split is None: + nested_split = split_nested_or_side_by_side(bubble_indices[bid], filtered) + if nested_split: + l, r = nested_split + if l and r: + bubble_split = (l, r) + splits_performed.append(f"BOX#{bid} (nested/side-by-side forced split)") + if bubble_split is None: row_split = split_bubble_if_multiple_rows(bubble_indices[bid], filtered, bid=bid) if row_split: - top_idxs, bottom_idxs = row_split - if top_idxs and bottom_idxs: - bubble_split = (top_idxs, bottom_idxs) - splits_performed.append(f"BOX#{bid} (Horizontal Row Split: {len(top_idxs)} | {len(bottom_idxs)} quads)") - + t, b = row_split + if t and b: + bubble_split = (t, b) + splits_performed.append(f"BOX#{bid} (Horizontal Row Split: {len(t)} | {len(b)} quads)") + + if bubble_split is None: + gy = split_cluster_by_big_vertical_gap(bubble_indices[bid], filtered, factor=1.9, min_gap=22) + if gy: + a, b = gy + bubble_split = (a, b) + splits_performed.append(f"BOX#{bid} (large vertical-gap split)") + if bubble_split: part1_idxs, part2_idxs = bubble_split + new_bubbles[bid] = build_lines_from_indices(part1_idxs, filtered) ub_1 = boxes_union_xyxy([quad_bbox(filtered[i][0]) for i in part1_idxs]) - new_bubble_boxes[bid] = (max(0, ub_1[0]-3), max(0, ub_1[1]-3), min(iw-1, ub_1[2]+3), min(ih-1, ub_1[3]+3)) + new_bubble_boxes[bid] = (max(0, ub_1[0]-2), max(0, ub_1[1]-2), min(iw-1, ub_1[2]+2), min(ih-1, ub_1[3]+2)) new_bubble_quads[bid] = [filtered[i][0] for i in part1_idxs] new_bubble_indices[bid] = part1_idxs - + new_bubbles[next_bid] = build_lines_from_indices(part2_idxs, filtered) ub_2 = boxes_union_xyxy([quad_bbox(filtered[i][0]) for i in part2_idxs]) - new_bubble_boxes[next_bid] = (max(0, ub_2[0]-3), max(0, ub_2[1]-3), min(iw-1, ub_2[2]+3), min(ih-1, ub_2[3]+3)) + new_bubble_boxes[next_bid] = (max(0, ub_2[0]-2), max(0, ub_2[1]-2), min(iw-1, ub_2[2]+2), min(ih-1, ub_2[3]+2)) new_bubble_quads[next_bid] = [filtered[i][0] for i in part2_idxs] new_bubble_indices[next_bid] = part2_idxs next_bid += 1 @@ -1092,12 +1361,12 @@ def translate_manga_text( new_bubble_boxes[bid] = bubble_boxes[bid] new_bubble_quads[bid] = bubble_quads[bid] new_bubble_indices[bid] = bubble_indices[bid] - + if splits_performed: print(f"\n🔀 Multi-column/row bubble splits detected: {len(splits_performed)}") for split_info in splits_performed: print(f" ✓ Split {split_info}") - + bubbles = new_bubbles bubble_boxes = new_bubble_boxes bubble_quads = new_bubble_quads @@ -1128,7 +1397,7 @@ def translate_manga_text( src_used = rr_src txt = txt.replace(" BOMPORTA", " IMPORTA") - txt = txt.replace(" TESTO ", " ESTO ") + txt = txt.replace(" TESTO ", " ESTO ") txt = txt.replace(" MIVERDAD", " MI VERDAD") clean_lines[bid] = apply_glossary(normalize_text(txt)) @@ -1171,11 +1440,11 @@ def translate_manga_text( src_engine = sources_used.get(bid, "unknown") out_lines.append( - f"#{bid}|{reading_map.get(bid,bid)}|{src_engine}|{src_u}|{tgt}|{','.join(flags) if flags else '-'}" + f"#{bid}|{reading_map.get(bid, bid)}|{src_engine}|{src_u}|{tgt}|{','.join(flags) if flags else '-'}" ) print( - f"#{bid:<7} {reading_map.get(bid,bid):<6} {src_engine:<12} " + f"#{bid:<7} {reading_map.get(bid, bid):<6} {src_engine:<12} " f"{src_u[:40]:<40} {tgt[:40]:<40} {','.join(flags) if flags else '-'}" ) translated_count += 1 @@ -1205,8 +1474,8 @@ def translate_manga_text( if __name__ == "__main__": translate_manga_text( - image_path="004.png", - source_lang="english", + image_path="16.jpg", + source_lang="english", target_lang="ca", confidence_threshold=0.05, min_text_length=1, @@ -1215,6 +1484,6 @@ if __name__ == "__main__": quality_threshold=0.62, export_to_file="output.txt", export_bubbles_to="bubbles.json", - reading_mode="rtl", + reading_mode="rtl", debug=True ) \ No newline at end of file