diff --git a/002-page.jpg b/002-page.jpg deleted file mode 100755 index e8c059b..0000000 Binary files a/002-page.jpg and /dev/null differ diff --git a/manga-renderer.py b/manga-renderer.py index 860ca62..5dbd36d 100644 --- a/manga-renderer.py +++ b/manga-renderer.py @@ -1,29 +1,13 @@ -""" -manga-renderer.py -───────────────────────────────────────────────────────────────── -Pipeline: - 1. Detect panel boundaries - 2. Assign bubble -> panel - 3. Detect/fallback bubble ellipse - 4. Clean original text region: - - OCR union mask (default) - - Hybrid mask fallback - - Ellipse mode optional - 5. Render translated text with ellipse-aware wrapping -""" - import os -import math import json import re - import cv2 import numpy as np from PIL import Image, ImageDraw, ImageFont # ───────────────────────────────────────────── -# CONSTANTS +# CONFIG # ───────────────────────────────────────────── DEFAULT_FONT_CANDIDATES = [ "fonts/AnimeAce2_reg.ttf", @@ -36,33 +20,27 @@ DEFAULT_STROKE_COLOR = (255, 255, 255) MAX_FONT_SIZE = 20 MIN_FONT_SIZE = 6 -FONT_SIZE_STEP = 1 -TEXT_RATIO = 0.76 -FLOOD_TOLERANCE = 30 -BORDER_SHRINK_PX = 4 -MIN_PANEL_AREA_RATIO = 0.02 -MAX_NUDGE_RATIO = 0.30 +# Guarantee full wipe of yellow squares +YELLOW_BOX_PAD_X = 1 +YELLOW_BOX_PAD_Y = 1 +YELLOW_UNION_PAD_X = 4 +YELLOW_UNION_PAD_Y = 4 -# Cleaning mode: -# "ocr_union" -> precise cleanup from OCR quad boxes (recommended) -# "hybrid" -> rounded-rect + inner ellipse -# "ellipse" -> legacy large ellipse fill -CLEAN_MODE = "ocr_union" +# Optional extra cleanup expansion +ENABLE_EXTRA_CLEAN = True +EXTRA_DILATE_ITERS = 1 +EXTRA_CLOSE_ITERS = 1 -# OCR-union cleaning tuning -OCR_CLEAN_PAD_X = 12 -OCR_CLEAN_PAD_Y = 10 -OCR_CLEAN_MIN_W = 24 -OCR_CLEAN_MIN_H = 24 -OCR_CLEAN_CLOSE_KERNEL = 5 -OCR_CLEAN_DILATE = 1 +# Bubble detection (for optional extra mask / border preservation) +FLOOD_TOL = 30 -# Hybrid cleanup mask tuning -CLEAN_MASK_RECT_SCALE_W = 1.08 -CLEAN_MASK_RECT_SCALE_H = 1.20 -CLEAN_MASK_ELLIPSE_SCALE = 0.84 -CLEAN_MASK_BLUR = 0 +# Border restoration: keep very conservative +ENABLE_EDGE_RESTORE = True +EDGE_RESTORE_DILATE = 1 + +# Text layout inside yellow-union +TEXT_INSET = 0.92 # ───────────────────────────────────────────── @@ -78,10 +56,8 @@ def parse_translations(translations_file): line = line.strip() if not line.startswith("#"): continue + parts = line.split("|") - # Format: - # #ID|ORDER|ORIGINAL|TRANSLATED|FLAGS - # backward-compatible with older variants try: bubble_id = int(parts[0].lstrip("#")) except Exception: @@ -121,6 +97,40 @@ def parse_bubbles(bubbles_file): # ───────────────────────────────────────────── # HELPERS # ───────────────────────────────────────────── +def clamp(v, lo, hi): + return max(lo, min(hi, v)) + + +def xywh_to_xyxy(box): + if not box: + return None + x = int(box.get("x", 0)) + y = int(box.get("y", 0)) + w = int(box.get("w", 0)) + h = int(box.get("h", 0)) + return (x, y, x + w, y + h) + + +def union_xyxy(boxes): + boxes = [b for b in boxes if b is not None] + if not boxes: + return None + x1 = min(b[0] for b in boxes) + y1 = min(b[1] for b in boxes) + x2 = max(b[2] for b in boxes) + y2 = max(b[3] for b in boxes) + if x2 <= x1 or y2 <= y1: + return None + return (x1, y1, x2, y2) + + +def bbox_from_mask(mask): + ys, xs = np.where(mask > 0) + if len(xs) == 0: + return None + return (int(xs.min()), int(ys.min()), int(xs.max()) + 1, int(ys.max()) + 1) + + def normalize_text(s): t = s.upper().strip() t = re.sub(r"[^\w]+", "", t) @@ -129,13 +139,11 @@ def normalize_text(s): def is_sfx_like(text): t = normalize_text(text) - if len(t) <= 8 and re.fullmatch(r"(SHA+|BIP+|BEEP+|HN+|AH+|OH+)", t): - return True - return False + return bool(len(t) <= 8 and re.fullmatch(r"(SHA+|BIP+|BEEP+|HN+|AH+|OH+)", t)) # ───────────────────────────────────────────── -# FONT HELPERS +# FONT # ───────────────────────────────────────────── def load_font_from_candidates(candidates, size): for path in candidates: @@ -148,129 +156,60 @@ def load_font_from_candidates(candidates, size): def measure_text(draw, text, font): - bbox = draw.textbbox((0, 0), text, font=font) - return bbox[2] - bbox[0], bbox[3] - bbox[1] - - -def ellipse_line_max_width(y_offset, a, b): - if b <= 0: - return 0 - t = 1.0 - (y_offset * y_offset) / (b * b) - t = max(0.0, t) - return 2.0 * a * math.sqrt(t) + bb = draw.textbbox((0, 0), text, font=font) + return bb[2] - bb[0], bb[3] - bb[1] def wrap_text(draw, text, font, max_width): words = text.split() lines = [] - current = "" - for word in words: - test = (current + " " + word).strip() - w, _ = measure_text(draw, test, font) - if w <= max_width or not current: - current = test + cur = "" + + for w in words: + test = (cur + " " + w).strip() + tw, _ = measure_text(draw, test, font) + if tw <= max_width or not cur: + cur = test else: - lines.append(current) - current = word - if current: - lines.append(current) + lines.append(cur) + cur = w + if cur: + lines.append(cur) if not lines: return [""], 0, 0 - widths, heights = [], [] - for ln in lines: - w, h = measure_text(draw, ln, font) - widths.append(w) - heights.append(h) - - line_gap = max(heights[0] // 5, 2) if heights else 2 - total_h = sum(heights) + line_gap * (len(lines) - 1) - return lines, total_h, max(widths) if widths else 0 - - -def wrap_text_ellipse_aware(draw, text, font, safe_w, safe_h, tall_bubble=False): - target_w = safe_w * (0.85 if tall_bubble else 1.0) - lines, total_h, _ = wrap_text(draw, text, font, target_w) - if not lines: - return lines, total_h - + widths = [] heights = [] for ln in lines: - _, h = measure_text(draw, ln, font) - heights.append(h) + lw, lh = measure_text(draw, ln, font) + widths.append(lw) + heights.append(lh) - line_gap = max(heights[0] // 5, 2) if heights else 2 - if tall_bubble: - line_gap += 1 - - block_h = sum(heights) + line_gap * (len(lines) - 1) - if block_h > safe_h: - return lines, block_h - - a = target_w / 2.0 - b = safe_h / 2.0 - - words = text.split() - refined = [] - cursor_y = -block_h / 2.0 - current = "" - idx_h = 0 - - for word in words: - h_line = heights[min(idx_h, len(heights) - 1)] if heights else 12 - y_mid = cursor_y + h_line / 2.0 - row_max = ellipse_line_max_width(y_mid, a, b) * 0.95 - row_max = max(20, row_max) - - candidate = (current + " " + word).strip() - w, _ = measure_text(draw, candidate, font) - - if (w <= row_max) or (not current): - current = candidate - else: - refined.append(current) - cursor_y += h_line + line_gap - idx_h += 1 - current = word - - if current: - refined.append(current) - - hs = [] - for ln in refined: - _, h = measure_text(draw, ln, font) - hs.append(h) - - total = sum(hs) + (max(hs[0] // 5, 2) + (1 if tall_bubble else 0)) * (len(refined) - 1) if hs else 0 - return refined, total + gap = max(2, heights[0] // 5) + total_h = sum(heights) + gap * (len(lines) - 1) + return lines, total_h, max(widths) -def best_fit_font(draw, text, font_candidates, safe_w, safe_h, tall_bubble=False): - for size in range(MAX_FONT_SIZE, MIN_FONT_SIZE - 1, -FONT_SIZE_STEP): - font, path_used = load_font_from_candidates(font_candidates, size) - lines, total_h = wrap_text_ellipse_aware(draw, text, font, safe_w, safe_h, tall_bubble=tall_bubble) +def fit_font(draw, text, font_candidates, safe_w, safe_h): + for size in range(MAX_FONT_SIZE, MIN_FONT_SIZE - 1, -1): + font, _ = load_font_from_candidates(font_candidates, size) + lines, total_h, max_w = wrap_text(draw, text, font, safe_w) + if total_h <= safe_h and max_w <= safe_w: + return font, lines, total_h - max_lw = 0 - for ln in lines: - lw, _ = measure_text(draw, ln, font) - max_lw = max(max_lw, lw) - - if total_h <= safe_h and max_lw <= safe_w: - return font, lines, total_h, path_used - - font, path_used = load_font_from_candidates(font_candidates, MIN_FONT_SIZE) - lines, total_h = wrap_text_ellipse_aware(draw, text, font, safe_w, safe_h, tall_bubble=tall_bubble) - return font, lines, total_h, path_used + font, _ = load_font_from_candidates(font_candidates, MIN_FONT_SIZE) + lines, total_h, _ = wrap_text(draw, text, font, safe_w) + return font, lines, total_h def draw_text_with_stroke(draw, pos, text, font, fill, stroke_fill): x, y = pos _, h = measure_text(draw, text, font) - stroke_width = 2 if h <= 11 else 1 + sw = 2 if h <= 11 else 1 - for dx in range(-stroke_width, stroke_width + 1): - for dy in range(-stroke_width, stroke_width + 1): + for dx in range(-sw, sw + 1): + for dy in range(-sw, sw + 1): if dx == 0 and dy == 0: continue draw.text((x + dx, y + dy), text, font=font, fill=stroke_fill) @@ -279,489 +218,229 @@ def draw_text_with_stroke(draw, pos, text, font, fill, stroke_fill): # ───────────────────────────────────────────── -# CLEAN MASK BUILDERS +# MASK BUILDERS # ───────────────────────────────────────────── -def draw_rounded_rect_mask(mask, x1, y1, x2, y2, radius, color=255): - x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2) - if x2 <= x1 or y2 <= y1: - return mask - - r = int(max(1, min(radius, (x2 - x1) // 2, (y2 - y1) // 2))) - - cv2.rectangle(mask, (x1 + r, y1), (x2 - r, y2), color, -1) - cv2.rectangle(mask, (x1, y1 + r), (x2, y2 - r), color, -1) - - cv2.circle(mask, (x1 + r, y1 + r), r, color, -1) - cv2.circle(mask, (x2 - r, y1 + r), r, color, -1) - cv2.circle(mask, (x1 + r, y2 - r), r, color, -1) - cv2.circle(mask, (x2 - r, y2 - r), r, color, -1) - return mask - - -def build_hybrid_clean_mask(img_h, img_w, cx, cy, sa_fill, sb_fill, angle, safe_w, safe_h, panel): - px1, py1, px2, py2 = panel +def build_yellow_mask(bubble_data, img_h, img_w): + """ + HARD GUARANTEE: + Returned mask always covers all yellow squares (line_bboxes). + """ mask = np.zeros((img_h, img_w), dtype=np.uint8) - rw = max(8, int(safe_w * CLEAN_MASK_RECT_SCALE_W)) - rh = max(8, int(safe_h * CLEAN_MASK_RECT_SCALE_H)) - x1 = int(cx - rw / 2) - y1 = int(cy - rh / 2) - x2 = int(cx + rw / 2) - y2 = int(cy + rh / 2) - rr = int(min(rw, rh) * 0.22) - - draw_rounded_rect_mask(mask, x1, y1, x2, y2, rr, color=255) - - e_sa = max(3, int(sa_fill * CLEAN_MASK_ELLIPSE_SCALE)) - e_sb = max(3, int(sb_fill * CLEAN_MASK_ELLIPSE_SCALE)) - cv2.ellipse(mask, (int(round(cx)), int(round(cy))), (e_sa, e_sb), angle, 0, 360, 255, -1) - - kernel = np.ones((3, 3), np.uint8) - mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel, iterations=1) - - clip = np.zeros_like(mask) - clip[py1:py2, px1:px2] = 255 - mask = cv2.bitwise_and(mask, clip) - - if CLEAN_MASK_BLUR > 0: - mask = cv2.GaussianBlur(mask, (0, 0), CLEAN_MASK_BLUR) - _, mask = cv2.threshold(mask, 127, 255, cv2.THRESH_BINARY) - - return mask - - -def build_ocr_union_clean_mask(img_h, img_w, bubble_data, panel): - """ - Build precise text cleanup mask from OCR quad bounding boxes. - """ - px1, py1, px2, py2 = panel - quad_boxes = bubble_data.get("quad_bboxes", []) - mask = np.zeros((img_h, img_w), dtype=np.uint8) - - if not quad_boxes: - return mask - - for qb in quad_boxes: - x = int(qb.get("x", 0)) - y = int(qb.get("y", 0)) - w = int(qb.get("w", 0)) - h = int(qb.get("h", 0)) - - if w <= 0 or h <= 0: + # Preferred: exact line boxes + line_boxes = bubble_data.get("line_bboxes", []) + for lb in line_boxes: + b = xywh_to_xyxy(lb) + if not b: continue - - if w < OCR_CLEAN_MIN_W: - extra = (OCR_CLEAN_MIN_W - w) // 2 - x -= extra - w += 2 * extra - - if h < OCR_CLEAN_MIN_H: - extra = (OCR_CLEAN_MIN_H - h) // 2 - y -= extra - h += 2 * extra - - x1 = max(px1, x - OCR_CLEAN_PAD_X) - y1 = max(py1, y - OCR_CLEAN_PAD_Y) - x2 = min(px2, x + w + OCR_CLEAN_PAD_X) - y2 = min(py2, y + h + OCR_CLEAN_PAD_Y) - + x1, y1, x2, y2 = b + x1 -= YELLOW_BOX_PAD_X + y1 -= YELLOW_BOX_PAD_Y + x2 += YELLOW_BOX_PAD_X + y2 += YELLOW_BOX_PAD_Y + x1 = clamp(x1, 0, img_w - 1) + y1 = clamp(y1, 0, img_h - 1) + x2 = clamp(x2, 1, img_w) + y2 = clamp(y2, 1, img_h) if x2 > x1 and y2 > y1: cv2.rectangle(mask, (x1, y1), (x2, y2), 255, -1) - # Merge nearby fragments - ksize = max(3, int(OCR_CLEAN_CLOSE_KERNEL) | 1) # ensure odd and >=3 - k = np.ones((ksize, ksize), np.uint8) - mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, k, iterations=1) + # If no line boxes available, use line_union fallback + if np.count_nonzero(mask) == 0: + ub = xywh_to_xyxy(bubble_data.get("line_union_bbox")) + if ub: + x1, y1, x2, y2 = ub + x1 -= YELLOW_UNION_PAD_X + y1 -= YELLOW_UNION_PAD_Y + x2 += YELLOW_UNION_PAD_X + y2 += YELLOW_UNION_PAD_Y + x1 = clamp(x1, 0, img_w - 1) + y1 = clamp(y1, 0, img_h - 1) + x2 = clamp(x2, 1, img_w) + y2 = clamp(y2, 1, img_h) + if x2 > x1 and y2 > y1: + cv2.rectangle(mask, (x1, y1), (x2, y2), 255, -1) - if OCR_CLEAN_DILATE > 0: - mask = cv2.dilate(mask, np.ones((3, 3), np.uint8), iterations=OCR_CLEAN_DILATE) - - # Clip to panel bounds - clip = np.zeros_like(mask) - clip[py1:py2, px1:px2] = 255 - mask = cv2.bitwise_and(mask, clip) + # Last fallback: text_bbox + if np.count_nonzero(mask) == 0: + tb = xywh_to_xyxy(bubble_data.get("text_bbox")) + if tb: + x1, y1, x2, y2 = tb + x1 -= YELLOW_UNION_PAD_X + y1 -= YELLOW_UNION_PAD_Y + x2 += YELLOW_UNION_PAD_X + y2 += YELLOW_UNION_PAD_Y + x1 = clamp(x1, 0, img_w - 1) + y1 = clamp(y1, 0, img_h - 1) + x2 = clamp(x2, 1, img_w) + y2 = clamp(y2, 1, img_h) + if x2 > x1 and y2 > y1: + cv2.rectangle(mask, (x1, y1), (x2, y2), 255, -1) return mask -# ───────────────────────────────────────────── -# PANEL DETECTION -# ───────────────────────────────────────────── -def merge_nested_panels(panels): - if len(panels) <= 1: - return panels +def bubble_interior_mask(img_bgr, bubble_data): + """ + Optional helper to expand clean region safely; never used to shrink yellow coverage. + """ + h, w = img_bgr.shape[:2] - panels_sorted = sorted(panels, key=lambda p: (p[2] - p[0]) * (p[3] - p[1]), reverse=True) - keep = [] - - for panel in panels_sorted: - px1, py1, px2, py2 = panel - p_area = (px2 - px1) * (py2 - py1) - dominated = False - - for kept in keep: - kx1, ky1, kx2, ky2 = kept - ix1 = max(px1, kx1) - iy1 = max(py1, ky1) - ix2 = min(px2, kx2) - iy2 = min(py2, ky2) - if ix2 > ix1 and iy2 > iy1: - inter = (ix2 - ix1) * (iy2 - iy1) - if inter / max(1, p_area) > 0.80: - dominated = True - break - - if not dominated: - keep.append(panel) - - return keep - - -def split_panels_on_internal_borders(panels, v_lines, img_w, img_h): - result = [] - for (px1, py1, px2, py2) in panels: - pw = px2 - px1 - if pw < img_w * 0.30: - result.append((px1, py1, px2, py2)) - continue - - margin = int(pw * 0.20) - search_x1 = px1 + margin - search_x2 = px2 - margin - if search_x2 <= search_x1: - result.append((px1, py1, px2, py2)) - continue - - panel_vlines = v_lines[py1:py2, search_x1:search_x2] - col_sums = panel_vlines.sum(axis=0) - - panel_h = py2 - py1 - threshold = panel_h * 255 * 0.40 - split_cols = np.where(col_sums > threshold)[0] - - if len(split_cols) == 0: - result.append((px1, py1, px2, py2)) - continue - - split_x = int(np.median(split_cols)) + search_x1 - left_w = split_x - px1 - right_w = px2 - split_x - - if left_w > img_w * 0.10 and right_w > img_w * 0.10: - result.append((px1, py1, split_x, py2)) - result.append((split_x, py1, px2, py2)) - else: - result.append((px1, py1, px2, py2)) - - return result - - -def detect_panels(img_bgr): - img_h, img_w = img_bgr.shape[:2] - total_area = img_h * img_w - min_area = total_area * MIN_PANEL_AREA_RATIO - - gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) - _, dark_mask = cv2.threshold(gray, 80, 255, cv2.THRESH_BINARY_INV) - - h_len = max(40, img_w // 25) - h_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (h_len, 1)) - h_lines = cv2.morphologyEx(dark_mask, cv2.MORPH_OPEN, h_kernel) - - v_len = max(40, img_h // 25) - v_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, v_len)) - v_lines = cv2.morphologyEx(dark_mask, cv2.MORPH_OPEN, v_kernel) - - borders = cv2.bitwise_or(h_lines, v_lines) - borders = cv2.dilate(borders, np.ones((5, 5), np.uint8), iterations=2) - - panel_interior = cv2.bitwise_not(borders) - num_labels, labels, stats, centroids = cv2.connectedComponentsWithStats(panel_interior, connectivity=8) - - panels = [] - for label_id in range(1, num_labels): - area = stats[label_id, cv2.CC_STAT_AREA] - if area < min_area: - continue - - x = stats[label_id, cv2.CC_STAT_LEFT] - y = stats[label_id, cv2.CC_STAT_TOP] - w = stats[label_id, cv2.CC_STAT_WIDTH] - h = stats[label_id, cv2.CC_STAT_HEIGHT] - x2 = x + w - y2 = y + h - - if w * h > total_area * 0.90: - continue - - aspect = max(w, h) / max(min(w, h), 1) - if aspect > 15: - continue - - if w < img_w * 0.05 or h < img_h * 0.05: - continue - - panels.append((x, y, x2, y2)) - - panels = merge_nested_panels(panels) - panels = split_panels_on_internal_borders(panels, v_lines, img_w, img_h) - panels.sort(key=lambda p: (p[1] // 100, p[0])) - - if not panels: - panels = [(0, 0, img_w, img_h)] - - return panels - - -# ───────────────────────────────────────────── -# BUBBLE GEOMETRY -# ───────────────────────────────────────────── -def assign_panel(bubble_data, panels, img_w, img_h): - bx, bw = bubble_data["x"], bubble_data["w"] - by, bh = bubble_data["y"], bubble_data["h"] - bcx, bcy = bx + bw / 2.0, by + bh / 2.0 - - best_panel, best_overlap = None, 0 - for (px1, py1, px2, py2) in panels: - ix1 = max(bx, px1) - iy1 = max(by, py1) - ix2 = min(bx + bw, px2) - iy2 = min(by + bh, py2) - if ix2 > ix1 and iy2 > iy1: - overlap = (ix2 - ix1) * (iy2 - iy1) - if overlap > best_overlap: - best_overlap = overlap - best_panel = (px1, py1, px2, py2) - - if best_panel is None: - for (px1, py1, px2, py2) in panels: - if px1 <= bcx <= px2 and py1 <= bcy <= py2: - return (px1, py1, px2, py2) - return (0, 0, img_w, img_h) - - return best_panel - - -def detect_bubble_ellipse(img_bgr, bubble_data, panel): - x, w = bubble_data["x"], bubble_data["w"] - y, h = bubble_data["y"], bubble_data["h"] - - img_h, img_w = img_bgr.shape[:2] + panel = xywh_to_xyxy(bubble_data.get("panel_bbox")) + if panel is None: + panel = (0, 0, w, h) px1, py1, px2, py2 = panel - seed_x = max(1, min(img_w - 2, int(x + w / 2.0))) - seed_y = max(1, min(img_h - 2, int(y + h / 2.0))) + seed = bubble_data.get("seed_point", {}) + sx = int(seed.get("x", bubble_data.get("x", 0) + bubble_data.get("w", 1) // 2)) + sy = int(seed.get("y", bubble_data.get("y", 0) + bubble_data.get("h", 1) // 2)) + sx = clamp(sx, 1, w - 2) + sy = clamp(sy, 1, h - 2) gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) _, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY) - panel_mask = np.zeros_like(binary) - panel_mask[py1:py2, px1:px2] = binary[py1:py2, px1:px2] + panel_bin = np.zeros_like(binary) + panel_bin[py1:py2, px1:px2] = binary[py1:py2, px1:px2] - if gray[seed_y, seed_x] < 150: + # if seed on dark pixel, search nearby white + if gray[sy, sx] < 150: found = False - for r in range(1, max(2, min(w, h) // 3)): + search_r = max(2, min(bubble_data.get("w", 20), bubble_data.get("h", 20)) // 3) + for r in range(1, search_r + 1): for dy in range(-r, r + 1): for dx in range(-r, r + 1): - nx, ny = seed_x + dx, seed_y + dy + nx, ny = sx + dx, sy + dy if px1 <= nx < px2 and py1 <= ny < py2 and gray[ny, nx] >= 200: - seed_x, seed_y = nx, ny + sx, sy = nx, ny found = True break if found: break if found: break - if not found: - return None - flood_mask = np.zeros((img_h + 2, img_w + 2), dtype=np.uint8) - flood_fill_img = panel_mask.copy() + if not found: + m = np.zeros((h, w), dtype=np.uint8) + bx = bubble_data.get("x", 0) + by = bubble_data.get("y", 0) + bw = bubble_data.get("w", 20) + bh = bubble_data.get("h", 20) + cv2.ellipse(m, (bx + bw // 2, by + bh // 2), (max(4, bw // 2), max(4, bh // 2)), 0, 0, 360, 255, -1) + return m + + ff_mask = np.zeros((h + 2, w + 2), dtype=np.uint8) + flood = panel_bin.copy() cv2.floodFill( - flood_fill_img, - flood_mask, - (seed_x, seed_y), - 255, - loDiff=FLOOD_TOLERANCE, - upDiff=FLOOD_TOLERANCE, + flood, ff_mask, (sx, sy), 255, + loDiff=FLOOD_TOL, upDiff=FLOOD_TOL, flags=cv2.FLOODFILL_FIXED_RANGE ) - filled_region = flood_mask[1:-1, 1:-1] * 255 - filled_region = cv2.morphologyEx(filled_region, cv2.MORPH_CLOSE, np.ones((3, 3), np.uint8), iterations=1) - - contours, _ = cv2.findContours(filled_region, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - if not contours: - return None - - bubble_contour = max(contours, key=cv2.contourArea) - if len(bubble_contour) < 5: - return None - if cv2.contourArea(bubble_contour) < 100: - return None - - (ecx, ecy), (ew, eh), angle = cv2.fitEllipse(bubble_contour) - return float(ecx), float(ecy), float(ew / 2), float(eh / 2), float(angle) + m = (ff_mask[1:-1, 1:-1] * 255).astype(np.uint8) + m = cv2.morphologyEx(m, cv2.MORPH_CLOSE, np.ones((3, 3), np.uint8), iterations=1) + return m -def clip_ellipse_to_panel(cx, cy, sa, sb, angle, panel, shrink=BORDER_SHRINK_PX): - px1, py1, px2, py2 = panel - inner_x1, inner_y1 = px1 + shrink, py1 + shrink - inner_x2, inner_y2 = px2 - shrink, py2 - shrink +def build_clean_mask(img_bgr, bubble_data): + """ + FINAL RULE: + clean_mask MUST cover yellow_mask completely. + """ + h, w = img_bgr.shape[:2] + yellow = build_yellow_mask(bubble_data, h, w) - sa_s = max(sa - shrink, 1.0) - sb_s = max(sb - shrink, 1.0) + # start with guaranteed yellow + clean = yellow.copy() - for _ in range(3): - rad = math.radians(angle) - hw = math.sqrt((sa_s * math.cos(rad))**2 + (sb_s * math.sin(rad))**2) - hh = math.sqrt((sa_s * math.sin(rad))**2 + (sb_s * math.cos(rad))**2) + if ENABLE_EXTRA_CLEAN: + bubble_m = bubble_interior_mask(img_bgr, bubble_data) + extra = cv2.dilate(yellow, np.ones((3, 3), np.uint8), iterations=EXTRA_DILATE_ITERS) + extra = cv2.morphologyEx(extra, cv2.MORPH_CLOSE, np.ones((3, 3), np.uint8), iterations=EXTRA_CLOSE_ITERS) + extra = cv2.bitwise_and(extra, bubble_m) - ovf_l = max(0, inner_x1 - (cx - hw)) - ovf_r = max(0, (cx + hw) - inner_x2) - ovf_t = max(0, inner_y1 - (cy - hh)) - ovf_b = max(0, (cy + hh) - inner_y2) + # IMPORTANT: union with yellow (never subtract yellow) + clean = cv2.bitwise_or(yellow, extra) - if max(ovf_l, ovf_r, ovf_t, ovf_b) == 0: - break + # final guarantee (defensive) + clean = cv2.bitwise_or(clean, yellow) - max_nx = sa_s * MAX_NUDGE_RATIO - max_ny = sb_s * MAX_NUDGE_RATIO - cx += min(ovf_l, max_nx) - min(ovf_r, max_nx) - cy += min(ovf_t, max_ny) - min(ovf_b, max_ny) - - rad = math.radians(angle) - hw = math.sqrt((sa_s * math.cos(rad))**2 + (sb_s * math.sin(rad))**2) - hh = math.sqrt((sa_s * math.sin(rad))**2 + (sb_s * math.cos(rad))**2) - - ovf_l = max(0, inner_x1 - (cx - hw)) - ovf_r = max(0, (cx + hw) - inner_x2) - ovf_t = max(0, inner_y1 - (cy - hh)) - ovf_b = max(0, (cy + hh) - inner_y2) - max_ovf = max(ovf_l, ovf_r, ovf_t, ovf_b) - - if max_ovf > 0: - sa_s = max(sa_s - max_ovf, 1.0) - sb_s = max(sb_s - max_ovf, 1.0) - - return cx, cy, sa_s, sb_s - - -def get_render_ellipse(img_bgr, bubble_data, panel): - x, w = bubble_data["x"], bubble_data["w"] - y, h = bubble_data["y"], bubble_data["h"] - - detected = detect_bubble_ellipse(img_bgr, bubble_data, panel) - if detected is not None: - ecx, ecy, sa, sb, angle = detected - ecx, ecy, sa_fill, sb_fill = clip_ellipse_to_panel(ecx, ecy, sa, sb, angle, panel) - safe_w = sa_fill * math.sqrt(2) * TEXT_RATIO - safe_h = sb_fill * math.sqrt(2) * TEXT_RATIO - return (ecx, ecy, sa_fill, sb_fill, angle, safe_w, safe_h, "detected") - else: - cx, cy = x + w / 2.0, y + h / 2.0 - sa, sb = w / 2.0, h / 2.0 - cx, cy, sa_fill, sb_fill = clip_ellipse_to_panel(cx, cy, sa, sb, 0.0, panel) - safe_w = sa_fill * math.sqrt(2) * TEXT_RATIO - safe_h = sb_fill * math.sqrt(2) * TEXT_RATIO - return (cx, cy, sa_fill, sb_fill, 0.0, safe_w, safe_h, "fallback") + return clean, yellow # ───────────────────────────────────────────── -# DRAW ONE BUBBLE +# DRAW BUBBLE # ───────────────────────────────────────────── def draw_bubble( pil_img, - img_bgr, + img_bgr_ref, bubble_data, original_text, translated_text, - flags, font_candidates, font_color, - stroke_color, - panel + stroke_color ): - # skip unchanged SFX if original_text and translated_text: if normalize_text(original_text) == normalize_text(translated_text) and is_sfx_like(original_text): - return "skip_sfx", "NO_FONT" + return "skip_sfx" - (cx, cy, sa_fill, sb_fill, angle, safe_w, safe_h, method) = get_render_ellipse(img_bgr, bubble_data, panel) + rgb = np.array(pil_img) + h, w = rgb.shape[:2] - cx_i, cy_i = int(round(cx)), int(round(cy)) - img_h, img_w = img_bgr.shape[:2] + clean_mask, yellow_mask = build_clean_mask(img_bgr_ref, bubble_data) + if np.count_nonzero(clean_mask) == 0: + return "skip_no_area" - # choose cleaning mask - if CLEAN_MODE == "ocr_union": - mask = build_ocr_union_clean_mask(img_h, img_w, bubble_data, panel) - # robust fallback - if mask is None or int(mask.sum()) == 0: - mask = build_hybrid_clean_mask( - img_h=img_h, img_w=img_w, - cx=cx, cy=cy, - sa_fill=sa_fill, sb_fill=sb_fill, angle=angle, - safe_w=safe_w, safe_h=safe_h, - panel=panel - ) - elif CLEAN_MODE == "hybrid": - mask = build_hybrid_clean_mask( - img_h=img_h, img_w=img_w, - cx=cx, cy=cy, - sa_fill=sa_fill, sb_fill=sb_fill, angle=angle, - safe_w=safe_w, safe_h=safe_h, - panel=panel - ) - else: # ellipse - mask = np.zeros((img_h, img_w), dtype=np.uint8) - cv2.ellipse(mask, (cx_i, cy_i), (int(math.ceil(sa_fill)), int(math.ceil(sb_fill))), angle, 0, 360, 255, -1) + # 1) FORCE white fill on clean mask (includes full yellow by guarantee) + rgb[clean_mask == 255] = [255, 255, 255] - # paint white over mask - img_np = np.array(pil_img) - img_np[mask == 255] = [255, 255, 255] - pil_img.paste(Image.fromarray(img_np)) + # 2) Optional edge restore, but NEVER overwrite yellow coverage + if ENABLE_EDGE_RESTORE: + bubble_m = bubble_interior_mask(img_bgr_ref, bubble_data) + edge = cv2.morphologyEx(bubble_m, cv2.MORPH_GRADIENT, np.ones((3, 3), np.uint8)) + edge = cv2.dilate(edge, np.ones((3, 3), np.uint8), iterations=EDGE_RESTORE_DILATE) + + # Don't restore where yellow exists (hard guarantee) + edge[yellow_mask == 255] = 0 + + orig_rgb = cv2.cvtColor(img_bgr_ref, cv2.COLOR_BGR2RGB) + rgb[edge == 255] = orig_rgb[edge == 255] + + pil_img.paste(Image.fromarray(rgb)) if not translated_text: - return method, "NO_FONT" + return "clean_only" + + # text region based on yellow area (exact requirement) + text_bbox = bbox_from_mask(yellow_mask) + if text_bbox is None: + text_bbox = bbox_from_mask(clean_mask) + if text_bbox is None: + return "skip_no_area" + + x1, y1, x2, y2 = text_bbox draw = ImageDraw.Draw(pil_img) + text_cx = int((x1 + x2) / 2) + text_cy = int((y1 + y2) / 2) + safe_w = max(16, int((x2 - x1) * TEXT_INSET)) + safe_h = max(16, int((y2 - y1) * TEXT_INSET)) - # Center text in the cleaned region bbox (red-box style target) - ys, xs = np.where(mask > 0) - if len(xs) > 0 and len(ys) > 0: - mx1, my1, mx2, my2 = xs.min(), ys.min(), xs.max(), ys.max() - text_cx = int((mx1 + mx2) / 2) - text_cy = int((my1 + my2) / 2) - sw = max(20, int((mx2 - mx1) * 0.92)) - sh = max(20, int((my2 - my1) * 0.92)) - else: - text_cx, text_cy = cx_i, cy_i - sw, sh = max(int(safe_w), 1), max(int(safe_h), 1) - - bw = max(1, bubble_data.get("w", 1)) - bh = max(1, bubble_data.get("h", 1)) - tall_bubble = (bh / bw) > 1.25 - - font, lines, total_h, font_used = best_fit_font( - draw, translated_text, font_candidates, sw, sh, tall_bubble=tall_bubble - ) - - if not lines: - return method, font_used - - y_cursor = int(round(text_cy - total_h / 2.0 - 0.02 * sh)) + font, lines, total_h = fit_font(draw, translated_text, font_candidates, safe_w, safe_h) + y_cursor = int(round(text_cy - total_h / 2.0)) for line in lines: lw, lh = measure_text(draw, line, font) x = text_cx - lw // 2 draw_text_with_stroke(draw, (x, y_cursor), line, font, fill=font_color, stroke_fill=stroke_color) - y_cursor += lh + max(lh // 5, 2) + (1 if tall_bubble else 0) + y_cursor += lh + max(lh // 5, 2) - return method, font_used + return "rendered" # ───────────────────────────────────────────── -# MAIN RENDER FUNCTION +# MAIN # ───────────────────────────────────────────── def render_translations( input_image, @@ -770,21 +449,18 @@ def render_translations( bubbles_file, font_candidates=DEFAULT_FONT_CANDIDATES, font_color=DEFAULT_FONT_COLOR, - stroke_color=DEFAULT_STROKE_COLOR, + stroke_color=DEFAULT_STROKE_COLOR ): img_bgr = cv2.imread(input_image) if img_bgr is None: raise FileNotFoundError(f"Cannot load image: {input_image}") - img_h, img_w = img_bgr.shape[:2] img_pil = Image.fromarray(cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)) translations, originals, flags_map = parse_translations(translations_file) bubbles = parse_bubbles(bubbles_file) - panels = detect_panels(img_bgr) - rendered = 0 - skipped = 0 + rendered, skipped = 0, 0 def sort_key(item): bid, _ = item @@ -797,47 +473,39 @@ def render_translations( continue bubble_data = bubbles[bubble_id] - panel = assign_panel(bubble_data, panels, img_w, img_h) - original_text = originals.get(bubble_id, "") - flags = flags_map.get(bubble_id, "-") - method, font_used = draw_bubble( + status = draw_bubble( pil_img=img_pil, - img_bgr=img_bgr, + img_bgr_ref=img_bgr, bubble_data=bubble_data, original_text=original_text, translated_text=translated_text, - flags=flags, font_candidates=font_candidates, font_color=font_color, - stroke_color=stroke_color, - panel=panel + stroke_color=stroke_color ) - if method == "skip_sfx": + if status.startswith("skip"): skipped += 1 else: rendered += 1 - result_cv = cv2.cvtColor(np.array(img_pil), cv2.COLOR_RGB2BGR) - cv2.imwrite(output_image, result_cv) + out_bgr = cv2.cvtColor(np.array(img_pil), cv2.COLOR_RGB2BGR) + cv2.imwrite(output_image, out_bgr) print(f"✅ Done — {rendered} rendered, {skipped} skipped.") print(f"📄 Output → {output_image}") - print(f"🧼 Clean mode: {CLEAN_MODE}") + print("Guarantee: full yellow-square area is always white-cleaned before drawing text.") -# ───────────────────────────────────────────── -# ENTRY POINT -# ───────────────────────────────────────────── if __name__ == "__main__": render_translations( - input_image="001-page.png", + input_image="002-page.png", output_image="page_translated.png", translations_file="output.txt", bubbles_file="bubbles.json", font_candidates=DEFAULT_FONT_CANDIDATES, font_color=DEFAULT_FONT_COLOR, - stroke_color=DEFAULT_STROKE_COLOR, + stroke_color=DEFAULT_STROKE_COLOR ) diff --git a/manga-translator.py b/manga-translator.py index 8869f2b..8ce9f3b 100644 --- a/manga-translator.py +++ b/manga-translator.py @@ -1,7 +1,6 @@ import re import os import json -import difflib import cv2 import numpy as np import easyocr @@ -9,181 +8,40 @@ from deep_translator import GoogleTranslator # ───────────────────────────────────────────── -# LANGUAGE CODE REFERENCE -# ───────────────────────────────────────────── -SUPPORTED_LANGUAGES = { - "Vietnamese" : "vi", - "Japanese" : "ja", - "English" : "en", - "Spanish" : "es", - "Korean" : "ko", - "Chinese (Simplified)" : "ch_sim", - "Chinese (Traditional)": "ch_tra", - "French" : "fr", - "German" : "de", - "Italian" : "it", - "Portuguese" : "pt", - "Arabic" : "ar", - "Russian" : "ru", - "Thai" : "th", - "Catalan" : "ca", -} - -# ───────────────────────────────────────────── -# DOMAIN GLOSSARY +# CONFIG # ───────────────────────────────────────────── GLOSSARY = { "ANYA": "ANYA", - "STELLA STAR": "STELLA STAR", - "MR. HENDERSON": "MR. HENDERSON", "STARLIGHT ANYA": "STARLIGHT ANYA", + "MR. HENDERSON": "MR. HENDERSON", + "HENDERSON": "HENDERSON", + "STELLA STAR": "STELLA STAR", } -# Phrase-level fallback (source IT -> target CA) -PHRASE_MAP_IT_CA = { - "LA BAMBINA È ILLESA!": "LA NENA ESTÀ IL·LESA!", - "L'UOMO E LA DONNA SONO MORTI!": "L'HOME I LA DONA SÓN MORTS!", - "IL BAMBINO È FERITO GRAVEMENTE, MA È ANCORA VIVO!!": "EL NEN ESTÀ GREUMENT FERIT, PERÒ ENCARA ÉS VIU!!", - "UN CASO URGENTE...?": "UN CAS URGENT...?", - "UN CASO URGENTE,?": "UN CAS URGENT?", -} - -ITALIAN_OCR_FIXES = [ - (r"\bL'LOMO\b", "L'UOMO"), - (r"\bLOMO\b", "UOMO"), - (r"\bMORT I\b", "MORTI"), - (r"\bI[L1]LESA\b", "ILLESA"), - (r"\bBAM8INA\b", "BAMBINA"), - (r"\bBAM8INO\b", "BAMBINO"), - (r",\?", "?"), - (r"\?{2,}", "?"), - (r"\!{3,}", "!!"), -] - - -# ───────────────────────────────────────────── -# SOUND EFFECT FILTER -# ───────────────────────────────────────────── SOUND_EFFECT_PATTERNS = [ - r"^b+i+p+$", r"^sha+$", r"^ha+$", r"^ah+$", - r"^oh+$", r"^ugh+$", r"^gr+$", r"^bam+$", - r"^pow+$", r"^crash+$", r"^boom+$", r"^bang+$", - r"^crack+$", r"^whoosh+$", r"^thud+$", r"^snap+$", - r"^zip+$", r"^swoosh+$", r"^chirp+$", r"^tweet+$", + r"^b+i+p+$", r"^sha+$", r"^ha+$", r"^ah+$", r"^oh+$", + r"^ugh+$", r"^bam+$", r"^pow+$", r"^boom+$", r"^bang+$", + r"^crash+$", r"^thud+$", r"^zip+$", r"^swoosh+$", r"^chirp+$" ] -def is_sound_effect(text): - cleaned = re.sub(r"[^a-z]", "", text.strip().lower()) - return any(re.fullmatch(p, cleaned, re.IGNORECASE) for p in SOUND_EFFECT_PATTERNS) - - -# ───────────────────────────────────────────── -# TITLE / LOGO / AUTHOR FILTER -# ───────────────────────────────────────────── TITLE_PATTERNS = [ r"^(mission|chapter|episode|vol\.?|volume)\s*\d+$", r"^(spy|family|spy.family)$", r"^by\s+.+$", - r"^[a-z]{1,4}\s+[a-z]+\s+[a-z]+$", ] -def is_title_text(text): - cleaned = text.strip().lower() - return any(re.fullmatch(p, cleaned, re.IGNORECASE) for p in TITLE_PATTERNS) - - -# ───────────────────────────────────────────── -# GARBAGE TOKEN FILTER -# ───────────────────────────────────────────── -GARBAGE_PATTERNS = [ - r"^[^a-zA-Z]*$", - r"^.{1,2}$", - r".*\d+.*", - r"^[A-Z]{1,4}$", +NOISE_PATTERNS = [ + r"^[^a-zA-Z0-9\?!.]+$", + r"^BOX[0-9A-Z]*$", ] -def is_garbage(text): - t = text.strip() - return any(re.fullmatch(p, t) for p in GARBAGE_PATTERNS) +TOP_BAND_RATIO = 0.08 # ───────────────────────────────────────────── -# TOKEN CLASSIFIER +# TEXT HELPERS # ───────────────────────────────────────────── -def classify_token(text, confidence, confidence_threshold, min_text_length, filter_sound_effects): - cleaned = text.strip() - - if confidence < confidence_threshold: - return "noise" - if len(cleaned) < min_text_length: - return "noise" - if re.fullmatch(r"\d+", cleaned): - return "noise" - if len(cleaned) == 1 and not cleaned.isalpha(): - return "noise" - if filter_sound_effects and is_sound_effect(cleaned): - return "noise" - if is_title_text(cleaned): - return "noise" - if is_garbage(cleaned): - return "noise" - if not any(ch.isalpha() for ch in cleaned): - return "punct" - return "alpha" - -def should_keep_token(text, confidence, confidence_threshold, min_text_length, filter_sound_effects): - cat = classify_token(text, confidence, confidence_threshold, min_text_length, filter_sound_effects) - return cat != "noise", cat - - -# ───────────────────────────────────────────── -# QUAD / BBOX HELPERS -# ───────────────────────────────────────────── -def quad_bbox(quad): - xs = [pt[0] for pt in quad] - ys = [pt[1] for pt in quad] - return min(xs), min(ys), max(xs), max(ys) - -def quad_center(quad): - x1, y1, x2, y2 = quad_bbox(quad) - return (x1 + x2) / 2.0, (y1 + y2) / 2.0 - -def quad_h(quad): - x1, y1, x2, y2 = quad_bbox(quad) - return max(1.0, y2 - y1) - -def bbox_center(b): - x1, y1, x2, y2 = b - return (x1 + x2) / 2.0, (y1 + y2) / 2.0 - -def bbox_h(b): - return max(1.0, b[3] - b[1]) - -def distance_pt(a, b): - return ((a[0]-b[0])**2 + (a[1]-b[1])**2) ** 0.5 - -def quads_bbox(quads, image_shape, padding_px=10): - img_h, img_w = image_shape[:2] - all_x = [pt[0] for quad in quads for pt in quad] - all_y = [pt[1] for quad in quads for pt in quad] - x1 = max(0, min(all_x) - padding_px) - y1 = max(0, min(all_y) - padding_px) - x2 = min(img_w, max(all_x) + padding_px) - y2 = min(img_h, max(all_y) + padding_px) - return x1, y1, x2, y2 - -def bboxes_overlap_or_touch(a, b, gap_px=0): - ax1, ay1, ax2, ay2 = a - bx1, by1, bx2, by2 = b - gap_x = max(0, max(ax1, bx1) - min(ax2, bx2)) - gap_y = max(0, max(ay1, by1) - min(ay2, by2)) - return gap_x <= gap_px and gap_y <= gap_px - - -# ───────────────────────────────────────────── -# TEXT NORMALIZATION -# ───────────────────────────────────────────── -def normalize_ocr_text(text): +def normalize_text(text): t = text.strip().upper() t = t.replace("“", "\"").replace("”", "\"") t = t.replace("’", "'").replace("‘", "'") @@ -196,690 +54,658 @@ def normalize_ocr_text(text): t = re.sub(r",\?", "?", t) return t.strip() -def italian_post_ocr_cleanup(text): - t = normalize_ocr_text(text) - for pat, rep in ITALIAN_OCR_FIXES: - t = re.sub(pat, rep, t, flags=re.IGNORECASE) - t = re.sub(r"\s{2,}", " ", t).strip().upper() +def apply_glossary(text): + out = text + for k in sorted(GLOSSARY.keys(), key=len, reverse=True): + out = re.sub(rf"\b{re.escape(k)}\b", GLOSSARY[k], out, flags=re.IGNORECASE) + return out + +def postprocess_translation_general(text): + t = normalize_text(text) + t = re.sub(r"\s{2,}", " ", t).strip() + t = re.sub(r"([!?]){3,}", r"\1\1", t) + t = re.sub(r"\.{4,}", "...", t) return t -def fix_hyphens(lines): - if not lines: - return "" - merged = lines[0] - for line in lines[1:]: - line = line.strip() - if merged.endswith("-"): - merged = merged[:-1] + line - else: - merged = merged + " " + line - merged = re.sub(r" {2,}", " ", merged).strip() - return normalize_ocr_text(merged) -def apply_glossary(text, glossary): - out = text - keys = sorted(glossary.keys(), key=len, reverse=True) - for k in keys: - v = glossary[k] - out = re.sub(rf"\b{re.escape(k)}\b", v, out, flags=re.IGNORECASE) - return out +# ───────────────────────────────────────────── +# FILTERS +# ───────────────────────────────────────────── +def is_sound_effect(text): + cleaned = re.sub(r"[^a-z]", "", text.strip().lower()) + return any(re.fullmatch(p, cleaned, re.IGNORECASE) for p in SOUND_EFFECT_PATTERNS) + +def is_title_text(text): + t = text.strip().lower() + return any(re.fullmatch(p, t, re.IGNORECASE) for p in TITLE_PATTERNS) + +def is_noise_text(text): + t = text.strip() + return any(re.fullmatch(p, t) for p in NOISE_PATTERNS) # ───────────────────────────────────────────── -# TRANSLATION SAFETY +# GEOMETRY # ───────────────────────────────────────────── -def fuzzy_phrase_match(source_text, phrase_map, min_ratio=0.88): - if source_text in phrase_map: - return phrase_map[source_text], 1.0, source_text +def quad_bbox(quad): + xs = [p[0] for p in quad] + ys = [p[1] for p in quad] + return (int(min(xs)), int(min(ys)), int(max(xs)), int(max(ys))) - best_key, best_ratio = None, 0.0 - for k in phrase_map.keys(): - ratio = difflib.SequenceMatcher(None, source_text, k).ratio() - if ratio > best_ratio: - best_ratio = ratio - best_key = k +def quad_center(quad): + x1, y1, x2, y2 = quad_bbox(quad) + return ((x1 + x2) / 2.0, (y1 + y2) / 2.0) - if best_key and best_ratio >= min_ratio: - return phrase_map[best_key], best_ratio, best_key - return None, best_ratio, best_key +def boxes_union_xyxy(boxes): + boxes = [b for b in boxes if b is not None] + if not boxes: + return None + return ( + int(min(b[0] for b in boxes)), + int(min(b[1] for b in boxes)), + int(max(b[2] for b in boxes)), + int(max(b[3] for b in boxes)), + ) -def looks_suspicious_translation(src, tgt): - t = normalize_ocr_text(tgt) - bad_tokens = ["NEETA", "LOMO", "MORT I", "ESTA IL", "MORT I LA"] - if any(b in t for b in bad_tokens): - return True - if len(t) < 3: - return True - return False +def bbox_area_xyxy(b): + if b is None: + return 0 + return int(max(0, b[2] - b[0]) * max(0, b[3] - b[1])) + +def xyxy_to_xywh(b): + if b is None: + return None + x1, y1, x2, y2 = b + return {"x": int(x1), "y": int(y1), "w": int(max(0, x2 - x1)), "h": int(max(0, y2 - y1))} + +def overlap_or_near(a, b, gap=0): + ax1, ay1, ax2, ay2 = a + bx1, by1, bx2, by2 = b + gap_x = max(0, max(ax1, bx1) - min(ax2, bx2)) + gap_y = max(0, max(ay1, by1) - min(ay2, by2)) + return gap_x <= gap and gap_y <= gap # ───────────────────────────────────────────── -# LINE REBUILD (shared) -# ───────────────────────────────────────────── -def rebuild_bubble_lines_from_indices(indices, ocr_results): - if not indices: - return [] - - token_bboxes = [quad_bbox(ocr_results[i][0]) for i in indices] - items = [] - for i, bx in zip(indices, token_bboxes): - xc = (bx[0] + bx[2]) / 2.0 - yc = (bx[1] + bx[3]) / 2.0 - h = max(1.0, bx[3] - bx[1]) - items.append((i, xc, yc, h)) - - line_tol = max(6.0, float(np.median([it[3] for it in items])) * 0.6) - items.sort(key=lambda t: t[2]) - - lines = [] - for it in items: - i, xc, yc, h = it - placed = False - for ln in lines: - if abs(yc - ln["yc"]) <= line_tol: - ln["members"].append((i, xc, yc)) - ln["yc"] = np.mean([m[2] for m in ln["members"]]) - placed = True - break - if not placed: - lines.append({"yc": yc, "members": [(i, xc, yc)]}) - - lines.sort(key=lambda ln: ln["yc"]) - out = [] - for ln in lines: - mem = sorted(ln["members"], key=lambda m: m[1]) - toks = [ocr_results[i][1] for i, _, _ in mem] - line = " ".join(toks) - line = re.sub(r"\s+([,.;:!?])", r"\1", line) - line = re.sub(r"\(\s+", "(", line) - line = re.sub(r"\s+\)", ")", line) - out.append(normalize_ocr_text(line)) - return out - - -# ───────────────────────────────────────────── -# GROUPING (pass 1) -# ───────────────────────────────────────────── -def group_quads_by_overlap(ocr_results, image_shape, gap_px=18, bbox_padding=10): - n = len(ocr_results) - if n == 0: - return {}, {}, {} - - token_bboxes = [quad_bbox(r[0]) for r in ocr_results] - token_centers = [quad_center(r[0]) for r in ocr_results] - token_heights = [quad_h(r[0]) for r in ocr_results] - median_h = float(np.median(token_heights)) if token_heights else 12.0 - dist_thresh = max(20.0, median_h * 2.2) - - parent = list(range(n)) - - def find(x): - while parent[x] != x: - parent[x] = parent[parent[x]] - x = parent[x] - return x - - def union(x, y): - parent[find(x)] = find(y) - - for i in range(n): - for j in range(i + 1, n): - ov = bboxes_overlap_or_touch(token_bboxes[i], token_bboxes[j], gap_px=gap_px) - if ov: - union(i, j) - continue - cx1, cy1 = token_centers[i] - cx2, cy2 = token_centers[j] - d = ((cx1 - cx2) ** 2 + (cy1 - cy2) ** 2) ** 0.5 - if d <= dist_thresh and abs(cy1 - cy2) <= median_h * 3.0: - union(i, j) - - groups = {} - for i in range(n): - root = find(i) - groups.setdefault(root, []).append(i) - - def group_sort_key(indices): - ys = [token_bboxes[i][1] for i in indices] - xs = [token_bboxes[i][0] for i in indices] - return (min(ys) // 150, min(xs)) - - sorted_groups = sorted(groups.values(), key=group_sort_key) - - bubble_dict = {} - bbox_dict = {} - ocr_quads = {} - bubble_indices = {} - - for gid, indices in enumerate(sorted_groups, start=1): - idxs = sorted(indices, key=lambda k: token_bboxes[k][1]) - lines = rebuild_bubble_lines_from_indices(idxs, ocr_results) - quads = [ocr_results[k][0] for k in idxs] - bb = quads_bbox(quads, image_shape, padding_px=bbox_padding) - - bubble_dict[gid] = lines - ocr_quads[gid] = quads - bbox_dict[gid] = bb - bubble_indices[gid] = idxs - - return bubble_dict, bbox_dict, ocr_quads, bubble_indices - - -# ───────────────────────────────────────────── -# ORPHAN ABSORPTION (pass 2) -# ───────────────────────────────────────────── -def absorb_orphan_tokens_into_bubbles( - ocr_results, - bubble_dict, - bbox_dict, - ocr_quads, - bubble_indices, - image_shape, - bbox_padding=2, - gap_factor=1.9, - max_center_dist_factor=3.2, -): - n = len(ocr_results) - token_bboxes = [quad_bbox(r[0]) for r in ocr_results] - token_centers = [bbox_center(b) for b in token_bboxes] - token_heights = [bbox_h(b) for b in token_bboxes] - median_h = float(np.median(token_heights)) if token_heights else 12.0 - - used = set() - for bid, idxs in bubble_indices.items(): - for i in idxs: - used.add(i) - - orphan_indices = [i for i in range(n) if i not in used] - - for i in orphan_indices: - tb = token_bboxes[i] - tc = token_centers[i] - - best_bid = None - best_score = 1e18 - - for bid, bb in bbox_dict.items(): - bc = bbox_center(bb) - dist = distance_pt(tc, bc) - bh = bbox_h(bb) - - max_dist = max(60.0, median_h * max_center_dist_factor + bh * 0.15) - if dist > max_dist: - continue - - near = bboxes_overlap_or_touch(tb, bb, gap_px=int(median_h * gap_factor)) - y_ok = abs(tc[1] - bc[1]) <= max(bh * 0.65, median_h * 4.0) - - if near or y_ok: - score = dist - (25.0 if near else 0.0) - if score < best_score: - best_score = score - best_bid = bid - - if best_bid is not None: - bubble_indices.setdefault(best_bid, []) - bubble_indices[best_bid].append(i) - - # rebuild bubbles after absorption - new_bubble_dict = {} - new_ocr_quads = {} - new_bbox_dict = {} - new_bubble_indices = {} - - for bid in sorted(bubble_dict.keys()): - idxs = sorted(set(bubble_indices.get(bid, [])), key=lambda k: token_bboxes[k][1]) - if not idxs: - idxs = [] - - lines = rebuild_bubble_lines_from_indices(idxs, ocr_results) if idxs else bubble_dict.get(bid, []) - quads = [ocr_results[k][0] for k in idxs] if idxs else ocr_quads.get(bid, []) - - if quads: - bb = quads_bbox(quads, image_shape, padding_px=bbox_padding) - else: - bb = bbox_dict[bid] - - new_bubble_dict[bid] = lines - new_ocr_quads[bid] = quads - new_bbox_dict[bid] = bb - new_bubble_indices[bid] = idxs - - return new_bubble_dict, new_bbox_dict, new_ocr_quads, new_bubble_indices - - -# ───────────────────────────────────────────── -# OCR QUALITY SCORE +# QUALITY # ───────────────────────────────────────────── def ocr_quality_score(text): if not text or len(text) < 2: return 0.0 alpha_ratio = sum(1 for c in text if c.isalpha()) / max(1, len(text)) penalty = 0.0 - for p in [r",,", r"\.\.-", r"[^\w\s\'\!\?\.,\-]{2,}"]: - if re.search(p, text): - penalty += 0.2 + if re.search(r"[^\w\s\'\!\?\.,\-]{2,}", text): + penalty += 0.2 + if re.search(r",,", text): + penalty += 0.2 bonus = 0.05 if re.search(r"[.!?]$", text) else 0.0 return max(0.0, min(1.0, alpha_ratio - penalty + bonus)) # ───────────────────────────────────────────── -# OCR VARIANTS +# OCR RE-READ # ───────────────────────────────────────────── def preprocess_variant(crop_bgr, mode): gray = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2GRAY) if mode == "raw": return gray if mode == "clahe": - clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)) - return clahe.apply(gray) + return cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)).apply(gray) if mode == "adaptive": den = cv2.GaussianBlur(gray, (3, 3), 0) return cv2.adaptiveThreshold(den, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 35, 11) return gray -def run_ocr_on_img_array(reader, img_arr): - temp_path = "_temp_crop_ocr.png" - cv2.imwrite(temp_path, img_arr) +def run_ocr_on_array(reader, arr): + tmp = "_tmp_ocr.png" + cv2.imwrite(tmp, arr) try: - return reader.readtext(temp_path, paragraph=False) + return reader.readtext(tmp, paragraph=False) finally: - if os.path.exists(temp_path): - os.remove(temp_path) + if os.path.exists(tmp): + os.remove(tmp) -def reread_cluster_crop(image, bbox, reader, source_lang="en", padding_px=20, upscale_factor=2.5): - img_h, img_w = image.shape[:2] +def reread_crop(image, bbox, reader, upscale=2.5, pad=18): + ih, iw = image.shape[:2] x1, y1, x2, y2 = bbox - x1 = max(0, int(x1) - padding_px) - y1 = max(0, int(y1) - padding_px) - x2 = min(img_w, int(x2) + padding_px) - y2 = min(img_h, int(y2) + padding_px) - + x1 = max(0, int(x1 - pad)); y1 = max(0, int(y1 - pad)) + x2 = min(iw, int(x2 + pad)); y2 = min(ih, int(y2 + pad)) crop = image[y1:y2, x1:x2] if crop.size == 0: return None - new_w = int(crop.shape[1] * upscale_factor) - new_h = int(crop.shape[0] * upscale_factor) - upscaled = cv2.resize(crop, (new_w, new_h), interpolation=cv2.INTER_CUBIC) + up = cv2.resize(crop, (int(crop.shape[1] * upscale), int(crop.shape[0] * upscale)), interpolation=cv2.INTER_CUBIC) - candidates = [] + best = None for mode in ("raw", "clahe", "adaptive"): - proc = preprocess_variant(upscaled, mode) - res = run_ocr_on_img_array(reader, proc) + proc = preprocess_variant(up, mode) + res = run_ocr_on_array(reader, proc) if not res: continue res.sort(key=lambda r: (r[0][0][1], r[0][0][0])) - lines = [normalize_ocr_text(t) for _, t, _ in res if t.strip()] - merged = fix_hyphens(lines) if lines else "" - if source_lang == "it": - merged = italian_post_ocr_cleanup(merged) - score = ocr_quality_score(merged) - candidates.append((score, mode, merged)) + lines = [normalize_text(t) for _, t, _ in res if t.strip()] + merged = re.sub(r"\s{2,}", " ", " ".join(lines)).strip() + s = ocr_quality_score(merged) + if best is None or s > best[0]: + best = (s, merged) - if not candidates: - return None - candidates.sort(key=lambda x: x[0], reverse=True) - return candidates[0][2] if candidates[0][2] else None + return best[1] if best else None # ───────────────────────────────────────────── -# AUTO GAP +# LINES + YELLOW BOXES # ───────────────────────────────────────────── -def compute_auto_gap(image_path, base_gap=18, reference_width=750): - image = cv2.imread(image_path) - if image is None: - return base_gap - img_w = image.shape[1] - return base_gap * (img_w / reference_width) +def build_lines_from_indices(indices, ocr): + if not indices: + return [] + + items = [] + for i in indices: + b = quad_bbox(ocr[i][0]) + xc = (b[0] + b[2]) / 2.0 + yc = (b[1] + b[3]) / 2.0 + h = max(1.0, b[3] - b[1]) + items.append((i, b, xc, yc, h)) + + med_h = float(np.median([it[4] for it in items])) if items else 10.0 + row_tol = max(6.0, med_h * 0.75) + + items.sort(key=lambda x: x[3]) + rows = [] + for it in items: + i, b, xc, yc, h = it + placed = False + for r in rows: + if abs(yc - r["yc"]) <= row_tol: + r["m"].append((i, b, xc, yc)) + r["yc"] = float(np.mean([k[3] for k in r["m"]])) + placed = True + break + if not placed: + rows.append({"yc": yc, "m": [(i, b, xc, yc)]}) + + rows.sort(key=lambda r: r["yc"]) + lines = [] + for r in rows: + mem = sorted(r["m"], key=lambda z: z[2]) + txt = normalize_text(" ".join(ocr[i][1] for i, _, _, _ in mem)) + lines.append(txt) + + return lines + + +def build_line_boxes_from_indices(indices, ocr): + """ + Robust yellow-box generation with punctuation attachment: + - row grouping + - chunking by x gap + - attach tiny punctuation/special tokens to nearest chunk + - coverage guarantee + """ + if not indices: + return [] + + items = [] + for i in indices: + b = quad_bbox(ocr[i][0]) + txt = normalize_text(ocr[i][1]) + xc = (b[0] + b[2]) / 2.0 + yc = (b[1] + b[3]) / 2.0 + w = max(1.0, b[2] - b[0]) + h = max(1.0, b[3] - b[1]) + items.append({ + "i": i, "b": b, "txt": txt, + "xc": xc, "yc": yc, "w": w, "h": h + }) + + med_h = float(np.median([it["h"] for it in items])) if items else 10.0 + row_tol = max(6.0, med_h * 0.90) + gap_x_tol = max(8.0, med_h * 1.25) + pad = max(1, int(round(med_h * 0.12))) + + def is_punct_like(t): + raw = t.strip() + if raw == "": + return True + punct_ratio = sum(1 for c in raw if not c.isalnum()) / max(1, len(raw)) + return punct_ratio >= 0.5 or len(raw) <= 2 + + # 1) rows + items_sorted = sorted(items, key=lambda x: x["yc"]) + rows = [] + for it in items_sorted: + placed = False + for r in rows: + if abs(it["yc"] - r["yc"]) <= row_tol: + r["m"].append(it) + r["yc"] = float(np.mean([k["yc"] for k in r["m"]])) + placed = True + break + if not placed: + rows.append({"yc": it["yc"], "m": [it]}) + + rows.sort(key=lambda r: r["yc"]) + out_boxes = [] + + for r in rows: + mem = sorted(r["m"], key=lambda z: z["xc"]) + normal = [t for t in mem if not is_punct_like(t["txt"])] + punct = [t for t in mem if is_punct_like(t["txt"])] + + if not normal: + normal = mem + punct = [] + + # 2) chunk normal tokens + chunks = [] + cur = [normal[0]] + for t in normal[1:]: + prev = cur[-1]["b"] + b = t["b"] + gap = b[0] - prev[2] + if gap <= gap_x_tol: + cur.append(t) + else: + chunks.append(cur) + cur = [t] + chunks.append(cur) + + # 3) attach punctuation tokens + for p in punct: + pb = p["b"] + pxc, pyc = p["xc"], p["yc"] + + best_k = -1 + best_score = 1e18 + for k, ch in enumerate(chunks): + ub = boxes_union_xyxy([x["b"] for x in ch]) + cx = (ub[0] + ub[2]) / 2.0 + cy = (ub[1] + ub[3]) / 2.0 + + dx = abs(pxc - cx) + dy = abs(pyc - cy) + score = dx + 1.8 * dy + + near = overlap_or_near(pb, ub, gap=int(med_h * 0.9)) + if near: + score -= med_h * 2.0 + + if score < best_score: + best_score = score + best_k = k + + if best_k >= 0: + chunks[best_k].append(p) + else: + chunks.append([p]) + + # 4) chunk boxes + for ch in chunks: + ub = boxes_union_xyxy([x["b"] for x in ch]) + if ub: + x1, y1, x2, y2 = ub + out_boxes.append((x1 - pad, y1 - pad, x2 + pad, y2 + pad)) + + # 5) guarantee all tokens included + token_boxes = [it["b"] for it in items] + + def inside(tb, lb): + return tb[0] >= lb[0] and tb[1] >= lb[1] and tb[2] <= lb[2] and tb[3] <= lb[3] + + for tb in token_boxes: + ok = any(inside(tb, lb) for lb in out_boxes) + if not ok: + x1, y1, x2, y2 = tb + out_boxes.append((x1 - pad, y1 - pad, x2 + pad, y2 + pad)) + + # 6) merge heavy overlaps + merged = [] + for b in out_boxes: + merged_into = False + for i, m in enumerate(merged): + ix1 = max(b[0], m[0]); iy1 = max(b[1], m[1]) + ix2 = min(b[2], m[2]); iy2 = min(b[3], m[3]) + inter = max(0, ix2 - ix1) * max(0, iy2 - iy1) + a1 = max(1, (b[2]-b[0])*(b[3]-b[1])) + a2 = max(1, (m[2]-m[0])*(m[3]-m[1])) + iou = inter / float(a1 + a2 - inter) if (a1 + a2 - inter) > 0 else 0.0 + if iou > 0.72: + merged[i] = boxes_union_xyxy([b, m]) + merged_into = True + break + if not merged_into: + merged.append(b) + + merged.sort(key=lambda z: (z[1], z[0])) + return merged # ───────────────────────────────────────────── -# READING ORDER +# GROUPING +# ───────────────────────────────────────────── +def auto_gap(image_path, base=18, ref_w=750): + img = cv2.imread(image_path) + if img is None: + return base + return base * (img.shape[1] / ref_w) + +def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=3): + n = len(ocr) + if n == 0: + return {}, {}, {}, {} + + boxes = [quad_bbox(r[0]) for r in ocr] + centers = [quad_center(r[0]) for r in ocr] + hs = [max(1.0, b[3] - b[1]) for b in boxes] + med_h = float(np.median(hs)) if hs else 12.0 + dist_thresh = max(20.0, med_h * 2.2) + + p = list(range(n)) + + def find(x): + while p[x] != x: + p[x] = p[p[x]] + x = p[x] + return x + + def unite(a, b): + p[find(a)] = find(b) + + for i in range(n): + for j in range(i + 1, n): + if overlap_or_near(boxes[i], boxes[j], gap=gap_px): + unite(i, j) + continue + cx1, cy1 = centers[i] + cx2, cy2 = centers[j] + d = ((cx1 - cx2) ** 2 + (cy1 - cy2) ** 2) ** 0.5 + if d <= dist_thresh and abs(cy1 - cy2) <= med_h * 3.0: + unite(i, j) + + groups = {} + for i in range(n): + groups.setdefault(find(i), []).append(i) + + sorted_groups = sorted(groups.values(), key=lambda idxs: (min(boxes[i][1] for i in idxs), min(boxes[i][0] for i in idxs))) + + bubbles = {} + bubble_boxes = {} + bubble_quads = {} + bubble_indices = {} + + ih, iw = image_shape[:2] + for bid, idxs in enumerate(sorted_groups, start=1): + idxs = sorted(idxs, key=lambda k: boxes[k][1]) + lines = build_lines_from_indices(idxs, ocr) + quads = [ocr[k][0] for k in idxs] + ub = boxes_union_xyxy([quad_bbox(q) for q in quads]) + if ub is None: + continue + + x1, y1, x2, y2 = ub + x1 = max(0, x1 - bbox_padding); y1 = max(0, y1 - bbox_padding) + x2 = min(iw, x2 + bbox_padding); y2 = min(ih, y2 + bbox_padding) + + bubbles[bid] = lines + bubble_boxes[bid] = (x1, y1, x2, y2) + bubble_quads[bid] = quads + bubble_indices[bid] = idxs + + return bubbles, bubble_boxes, bubble_quads, bubble_indices + + +# ───────────────────────────────────────────── +# DEBUG +# ───────────────────────────────────────────── +def save_debug_clusters(image_path, ocr, bubble_boxes, bubble_indices, out_path="debug_clusters.png"): + img = cv2.imread(image_path) + if img is None: + return + + # token quads + for bbox, txt, conf in ocr: + pts = np.array(bbox, dtype=np.int32) + cv2.polylines(img, [pts], True, (180, 180, 180), 1) + + # bubble boxes + yellow line boxes + for bid, bb in bubble_boxes.items(): + x1, y1, x2, y2 = bb + cv2.rectangle(img, (x1, y1), (x2, y2), (0, 220, 0), 2) + cv2.putText(img, f"BOX#{bid}", (x1 + 2, y1 + 16), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 220, 0), 2) + + idxs = bubble_indices.get(bid, []) + line_boxes = build_line_boxes_from_indices(idxs, ocr) + for lb in line_boxes: + lx1, ly1, lx2, ly2 = lb + lx1 = max(0, int(lx1)); ly1 = max(0, int(ly1)) + lx2 = min(img.shape[1] - 1, int(lx2)); ly2 = min(img.shape[0] - 1, int(ly2)) + cv2.rectangle(img, (lx1, ly1), (lx2, ly2), (0, 255, 255), 3) + + cv2.imwrite(out_path, img) + + +# ───────────────────────────────────────────── +# EXPORT # ───────────────────────────────────────────── def estimate_reading_order(bbox_dict, mode="ltr"): items = [] for bid, (x1, y1, x2, y2) in bbox_dict.items(): cx = (x1 + x2) / 2.0 cy = (y1 + y2) / 2.0 - items.append((bid, x1, y1, x2, y2, cx, cy)) + items.append((bid, cx, cy)) + + items.sort(key=lambda t: t[2]) - items.sort(key=lambda t: t[6]) rows = [] - row_tol = 90 + tol = 90 for it in items: placed = False - for row in rows: - if abs(it[6] - row["cy"]) <= row_tol: - row["items"].append(it) - row["cy"] = np.mean([x[6] for x in row["items"]]) + for r in rows: + if abs(it[2] - r["cy"]) <= tol: + r["items"].append(it) + r["cy"] = float(np.mean([x[2] for x in r["items"]])) placed = True break if not placed: - rows.append({"cy": it[6], "items": [it]}) + rows.append({"cy": it[2], "items": [it]}) rows.sort(key=lambda r: r["cy"]) order = [] for r in rows: - if mode == "rtl": - r["items"].sort(key=lambda t: t[5], reverse=True) - else: - r["items"].sort(key=lambda t: t[5]) - order.extend([it[0] for it in r["items"]]) + r["items"].sort(key=lambda x: x[1], reverse=(mode == "rtl")) + order.extend([z[0] for z in r["items"]]) - return {bid: idx + 1 for idx, bid in enumerate(order)} + return {bid: i + 1 for i, bid in enumerate(order)} -# ───────────────────────────────────────────── -# EXPORTERS -# ───────────────────────────────────────────── -def export_bubble_boxes( - bbox_dict, - ocr_quads_dict, - reading_order_map, - filepath="bubbles.json", - bbox_expand_ratio=0.16, - image_shape=None, -): - export = {} - for bubble_id, (x1, y1, x2, y2) in bbox_dict.items(): - quads = ocr_quads_dict.get(bubble_id, []) +def export_bubbles(filepath, bbox_dict, quads_dict, indices_dict, ocr, reading_map, image_shape): + out = {} - w_orig = x2 - x1 - h_orig = y2 - y1 - pad_x = int(w_orig * bbox_expand_ratio) - pad_y = int(h_orig * bbox_expand_ratio) + for bid, bb in bbox_dict.items(): + x1, y1, x2, y2 = bb + quads = quads_dict.get(bid, []) + idxs = indices_dict.get(bid, []) - if image_shape is not None: - img_h, img_w = image_shape[:2] - ex1 = max(0, x1 - pad_x) - ey1 = max(0, y1 - pad_y) - ex2 = min(img_w, x2 + pad_x) - ey2 = min(img_h, y2 + pad_y) - else: - ex1, ey1, ex2, ey2 = x1 - pad_x, y1 - pad_y, x2 + pad_x, y2 + pad_y + qboxes = [quad_bbox(q) for q in quads] + text_union = boxes_union_xyxy(qboxes) - export[str(bubble_id)] = { - "x": int(ex1), - "y": int(ey1), - "w": int(ex2 - ex1), - "h": int(ey2 - ey1), - "x_tight": int(x1), - "y_tight": int(y1), - "w_tight": int(w_orig), - "h_tight": int(h_orig), - "reading_order": int(reading_order_map.get(bubble_id, bubble_id)), - "quad_bboxes": [ - { - "x": int(quad_bbox(q)[0]), - "y": int(quad_bbox(q)[1]), - "w": int(quad_bbox(q)[2] - quad_bbox(q)[0]), - "h": int(quad_bbox(q)[3] - quad_bbox(q)[1]), - } - for q in quads - ], - "quads": [[[int(pt[0]), int(pt[1])] for pt in quad] for quad in quads], + line_boxes_xyxy = build_line_boxes_from_indices(idxs, ocr) + line_union_xyxy = boxes_union_xyxy(line_boxes_xyxy) + line_union_area = bbox_area_xyxy(line_union_xyxy) + + out[str(bid)] = { + "x": int(x1), "y": int(y1), "w": int(x2 - x1), "h": int(y2 - y1), + "reading_order": int(reading_map.get(bid, bid)), + "quad_bboxes": [{"x": int(b[0]), "y": int(b[1]), "w": int(b[2]-b[0]), "h": int(b[3]-b[1])} for b in qboxes], + "quads": [[[int(p[0]), int(p[1])] for p in q] for q in quads], + "text_bbox": xyxy_to_xywh(text_union), + + # yellow geometry + "line_bboxes": [xyxy_to_xywh(lb) for lb in line_boxes_xyxy], + "line_union_bbox": xyxy_to_xywh(line_union_xyxy) if line_union_xyxy else None, + "line_union_area": int(line_union_area), } with open(filepath, "w", encoding="utf-8") as f: - json.dump(export, f, indent=2, ensure_ascii=False) - -def write_output(output_lines, filepath): - with open(filepath, "w", encoding="utf-8") as f: - f.write("\n".join(output_lines)) + json.dump(out, f, indent=2, ensure_ascii=False) # ───────────────────────────────────────────── -# DEBUG IMAGE -# ───────────────────────────────────────────── -def save_debug_clusters(image_path, ocr_results, bubble_dict, bbox_dict): - image = cv2.imread(image_path) - if image is None: - return - - np.random.seed(42) - num_bubbles = max(bubble_dict.keys(), default=1) - colors = [tuple(int(c) for c in col) for col in np.random.randint(50, 230, size=(num_bubbles + 2, 3))] - - # draw all OCR quads lightly - for bbox, text, _ in ocr_results: - pts = np.array(bbox, dtype=np.int32) - cv2.polylines(image, [pts], isClosed=True, color=(180, 180, 180), thickness=1) - - # draw bubble bboxes - for bubble_id, (x1, y1, x2, y2) in bbox_dict.items(): - color = colors[(bubble_id - 1) % len(colors)] - cv2.rectangle(image, (int(x1), int(y1)), (int(x2), int(y2)), color, 2) - cv2.putText(image, f"BOX#{bubble_id}", (int(x1) + 2, int(y1) + 16), - cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2) - - cv2.imwrite("debug_clusters.png", image) - - -# ───────────────────────────────────────────── -# CORE FUNCTION +# MAIN # ───────────────────────────────────────────── def translate_manga_text( image_path, source_lang="en", target_lang="ca", confidence_threshold=0.12, - export_to_file=None, - export_bubbles_to="bubbles.json", - min_text_length=2, + min_text_length=1, gap_px="auto", filter_sound_effects=True, quality_threshold=0.62, - upscale_factor=2.5, - bbox_padding=3, - debug=False, + export_to_file="output.txt", + export_bubbles_to="bubbles.json", reading_mode="ltr", + debug=True ): - # gap resolve - if gap_px == "auto": - resolved_gap = compute_auto_gap(image_path) - else: - resolved_gap = float(gap_px) - - full_image = cv2.imread(image_path) - if full_image is None: - print(f"❌ Could not load image: {image_path}") + image = cv2.imread(image_path) + if image is None: + print(f"❌ Cannot load image: {image_path}") return - # OCR init - print("\nLoading OCR model...") + resolved_gap = auto_gap(image_path) if gap_px == "auto" else float(gap_px) + + print("Loading OCR...") ocr_lang_list = ["en", "es"] if source_lang == "ca" else [source_lang] reader = easyocr.Reader(ocr_lang_list) - # Translator init - translator = GoogleTranslator(source=source_lang, target=target_lang) + print("Running OCR...") + raw = reader.readtext(image_path, paragraph=False) + print(f"Raw detections: {len(raw)}") - # OCR full image - print(f"\nRunning OCR on: {image_path}") - results = reader.readtext(image_path, paragraph=False) - print(f" Raw detections: {len(results)}") - - # Filter tokens filtered = [] skipped = 0 - for bbox, text, confidence in results: - cleaned = normalize_ocr_text(text) - keep, _ = should_keep_token(cleaned, confidence, confidence_threshold, min_text_length, filter_sound_effects) - if keep: - filtered.append((bbox, cleaned, confidence)) - else: - skipped += 1 + ih, iw = image.shape[:2] - print(f" ✅ {len(filtered)} kept, {skipped} skipped.\n") + for bbox, text, conf in raw: + t = normalize_text(text) + qb = quad_bbox(bbox) + + if conf < confidence_threshold: + skipped += 1 + continue + if len(t) < min_text_length: + skipped += 1 + continue + if is_noise_text(t): + skipped += 1 + continue + if filter_sound_effects and is_sound_effect(t): + skipped += 1 + continue + if is_title_text(t): + skipped += 1 + continue + + if qb[1] < int(ih * TOP_BAND_RATIO): + if conf < 0.70 and len(t) >= 5: + skipped += 1 + continue + + filtered.append((bbox, t, conf)) + + print(f"Kept: {len(filtered)} | Skipped: {skipped}") if not filtered: - print("⚠️ No text detected after filtering.") + print("⚠️ No text after filtering.") return - # Pass 1 grouping - bubble_dict, bbox_dict, ocr_quads, bubble_indices = group_quads_by_overlap( - filtered, - image_shape=full_image.shape, - gap_px=resolved_gap, - bbox_padding=bbox_padding, + bubbles, bubble_boxes, bubble_quads, bubble_indices = group_tokens( + filtered, image.shape, gap_px=resolved_gap, bbox_padding=3 ) - # Pass 2 orphan absorption - bubble_dict, bbox_dict, ocr_quads, bubble_indices = absorb_orphan_tokens_into_bubbles( - ocr_results=filtered, - bubble_dict=bubble_dict, - bbox_dict=bbox_dict, - ocr_quads=ocr_quads, - bubble_indices=bubble_indices, - image_shape=full_image.shape, - bbox_padding=bbox_padding, - ) - - print(f" ✅ {len(bubble_dict)} bubble(s) detected after absorption.\n") - if debug: - save_debug_clusters(image_path, filtered, bubble_dict, bbox_dict) + save_debug_clusters( + image_path=image_path, + ocr=filtered, + bubble_boxes=bubble_boxes, + bubble_indices=bubble_indices, + out_path="debug_clusters.png" + ) - # merge lines - clean_bubbles = {i: fix_hyphens(lines) for i, lines in bubble_dict.items() if lines} + translator = GoogleTranslator(source=source_lang, target=target_lang) - # OCR quality + reread - print("Checking OCR quality per bubble...") - for i, text in clean_bubbles.items(): - if source_lang == "it": - text = italian_post_ocr_cleanup(text) - clean_bubbles[i] = text - - score = ocr_quality_score(text) - status = "✅" if score >= quality_threshold else "🔁" - print(f" #{i}: score={score:.2f} {status} '{text[:65]}'") - - if score < quality_threshold: - reread = reread_cluster_crop( - full_image, - bbox_dict[i], - reader, - source_lang=source_lang, - upscale_factor=upscale_factor, - ) + clean_lines = {} + for bid, lines in bubbles.items(): + txt = normalize_text(" ".join(lines)) + q = ocr_quality_score(txt) + if q < quality_threshold: + reread = reread_crop(image, bubble_boxes[bid], reader, upscale=2.5, pad=18) if reread: - clean_bubbles[i] = reread + txt = normalize_text(reread) + clean_lines[bid] = apply_glossary(txt) - # Reading order + glossary prepass - reading_order_map = estimate_reading_order(bbox_dict, mode=reading_mode) - for i in list(clean_bubbles.keys()): - clean_bubbles[i] = apply_glossary(clean_bubbles[i], GLOSSARY) + reading_map = estimate_reading_order(bubble_boxes, mode=reading_mode) - # Translate - header = "BUBBLE|ORDER|ORIGINAL|TRANSLATED|FLAGS" divider = "─" * 120 - output_lines = [header, divider] + out_lines = ["BUBBLE|ORDER|ORIGINAL|TRANSLATED|FLAGS", divider] - print() + print(divider) print(f"{'BUBBLE':<8} {'ORDER':<6} {'ORIGINAL':<50} {'TRANSLATED':<50} FLAGS") print(divider) - ordered_ids = sorted(clean_bubbles.keys(), key=lambda b: reading_order_map.get(b, b)) translated_count = 0 - - for i in ordered_ids: - src = clean_bubbles[i].strip() + for bid in sorted(clean_lines.keys(), key=lambda x: reading_map.get(x, x)): + src = clean_lines[bid].strip() if not src: continue - flags = [] - forced_translation = None - # phrase-map pass - if source_lang == "it" and target_lang == "ca": - exact = PHRASE_MAP_IT_CA.get(src) - if exact: - forced_translation = exact - flags.append("PHRASE_EXACT") - else: - fuzzy, ratio, _ = fuzzy_phrase_match(src, PHRASE_MAP_IT_CA, min_ratio=0.88) - if fuzzy: - forced_translation = fuzzy - flags.append(f"PHRASE_FUZZY:{ratio:.2f}") + try: + tgt = translator.translate(src) or "" + except Exception as e: + tgt = f"[Translation error: {e}]" - if forced_translation is not None: - tgt = forced_translation - else: - try: - tgt = translator.translate(src) - except Exception as e: - tgt = f"[Translation error: {e}]" + tgt = apply_glossary(postprocess_translation_general(tgt)).upper() + src_u = src.upper() - if tgt is None: - tgt = "[No translation returned]" - - tgt = normalize_ocr_text(tgt) - tgt = apply_glossary(tgt, GLOSSARY) - - # suspicious retry - if looks_suspicious_translation(src, tgt): - flags.append("SUSPICIOUS_RETRY") - retry_src = italian_post_ocr_cleanup(src) if source_lang == "it" else src - try: - retry_tgt = translator.translate(retry_src) - if retry_tgt: - retry_tgt = normalize_ocr_text(retry_tgt) - retry_tgt = apply_glossary(retry_tgt, GLOSSARY) - if not looks_suspicious_translation(src, retry_tgt): - tgt = retry_tgt - flags.append("RETRY_OK") - else: - if source_lang == "it" and target_lang == "ca": - fallback, ratio, _ = fuzzy_phrase_match(src, PHRASE_MAP_IT_CA, min_ratio=0.80) - if fallback: - tgt = fallback - flags.append(f"FALLBACK_MAP:{ratio:.2f}") - except Exception: - pass - - tgt = tgt.upper() + out_lines.append(f"#{bid}|{reading_map.get(bid,bid)}|{src_u}|{tgt}|{','.join(flags) if flags else '-'}") + print(f"#{bid:<7} {reading_map.get(bid,bid):<6} {src_u[:50]:<50} {tgt[:50]:<50} {','.join(flags) if flags else '-'}") translated_count += 1 - ro = reading_order_map.get(i, i) - output_lines.append(f"#{i}|{ro}|{src}|{tgt}|{','.join(flags) if flags else '-'}") - print(f"#{i:<7} {ro:<6} {src:<50} {tgt:<50} {','.join(flags) if flags else '-'}") + out_lines.append(divider) + out_lines.append(f"✅ Done! {translated_count} bubble(s) translated, {skipped} detection(s) skipped.") + + with open(export_to_file, "w", encoding="utf-8") as f: + f.write("\n".join(out_lines)) + + export_bubbles( + export_bubbles_to, + bbox_dict=bubble_boxes, + quads_dict=bubble_quads, + indices_dict=bubble_indices, + ocr=filtered, + reading_map=reading_map, + image_shape=image.shape + ) - output_lines.append(divider) - summary = f"✅ Done! {translated_count} bubble(s) translated, {skipped} detection(s) skipped." - output_lines.append(summary) print(divider) - print(summary) - - if export_to_file: - write_output(output_lines, export_to_file) - - if export_bubbles_to: - export_bubble_boxes( - bbox_dict, - ocr_quads, - reading_order_map=reading_order_map, - filepath=export_bubbles_to, - bbox_expand_ratio=0.16, - image_shape=full_image.shape, - ) + print(f"Saved: {export_to_file}") + print(f"Saved: {export_bubbles_to}") + if debug: + print("Saved: debug_clusters.png (special chars included in yellow boxes)") -# ───────────────────────────────────────────── -# ENTRY POINT -# ───────────────────────────────────────────── if __name__ == "__main__": translate_manga_text( - image_path="001-page.png", - source_lang="it", + image_path="002-page.png", + source_lang="en", target_lang="ca", confidence_threshold=0.12, - min_text_length=2, - export_to_file="output.txt", - export_bubbles_to="bubbles.json", + min_text_length=1, gap_px="auto", filter_sound_effects=True, quality_threshold=0.62, - upscale_factor=2.5, - bbox_padding=3, - debug=True, + export_to_file="output.txt", + export_bubbles_to="bubbles.json", reading_mode="ltr", + debug=True )