diff --git a/bubbles.json b/bubbles.json index 184c79f..d96941c 100644 --- a/bubbles.json +++ b/bubbles.json @@ -1,9 +1,9 @@ { "1": { - "x": 204, - "y": 137, - "w": 153, - "h": 82, + "x": 201, + "y": 134, + "w": 159, + "h": 88, "quads": [ [ [ @@ -26,10 +26,10 @@ ] }, "2": { - "x": 1167, - "y": 240, - "w": 132, - "h": 134, + "x": 1164, + "y": 237, + "w": 138, + "h": 140, "quads": [ [ [ @@ -106,10 +106,10 @@ ] }, "3": { - "x": 930, - "y": 378, - "w": 136, - "h": 132, + "x": 927, + "y": 375, + "w": 142, + "h": 138, "quads": [ [ [ @@ -186,10 +186,10 @@ ] }, "4": { - "x": 220, - "y": 486, - "w": 150, - "h": 210, + "x": 217, + "y": 483, + "w": 156, + "h": 216, "quads": [ [ [ @@ -320,10 +320,10 @@ ] }, "5": { - "x": 354, - "y": 1132, - "w": 92, - "h": 102, + "x": 351, + "y": 1129, + "w": 98, + "h": 108, "quads": [ [ [ @@ -382,10 +382,10 @@ ] }, "6": { - "x": 740, - "y": 1324, - "w": 38, - "h": 24, + "x": 737, + "y": 1321, + "w": 44, + "h": 30, "quads": [ [ [ diff --git a/fonts/ComicRelief-Bold.ttf b/fonts/ComicRelief-Bold.ttf new file mode 100755 index 0000000..7b86246 Binary files /dev/null and b/fonts/ComicRelief-Bold.ttf differ diff --git a/fonts/ComicRelief-Regular.ttf b/fonts/ComicRelief-Regular.ttf new file mode 100755 index 0000000..d49aabc Binary files /dev/null and b/fonts/ComicRelief-Regular.ttf differ diff --git a/manga-renderer.py b/manga-renderer.py index 99e50a4..0acb369 100644 --- a/manga-renderer.py +++ b/manga-renderer.py @@ -18,29 +18,101 @@ FONT_FALLBACK = "/System/Library/Fonts/Helvetica.ttc" FONT_COLOR = (0, 0, 0) +# ───────────────────────────────────────────── +# WORD-ONLY WRAP +# +# Breaks ONLY at space boundaries. +# Returns (lines, overflow) where overflow=True +# means a single word is wider than max_w at +# this font size → caller must try smaller. +# ───────────────────────────────────────────── +def wrap_text_words(draw, text, max_w, font): + """ + Word-wraps text to fit within max_w pixels. + Never inserts hyphens or breaks mid-word. + + Returns: + (lines, overflow) + lines : list of strings, each ≤ max_w px wide + overflow : True if any single word exceeds max_w + """ + def measure(s): + bb = draw.textbbox((0, 0), s, font=font) + return bb[2] - bb[0] + + words = text.split() + lines = [] + current = "" + overflow = False + + for word in words: + if measure(word) > max_w: + overflow = True + break + test = (current + " " + word).strip() + if measure(test) <= max_w: + current = test + else: + if current: + lines.append(current) + current = word + + if not overflow and current: + lines.append(current) + + return lines, overflow + + # ───────────────────────────────────────────── # PARSE output.txt # ───────────────────────────────────────────── def parse_translations(filepath): """ Parses output.txt → {bubble_id: translated_text}. - Only bubbles present in the file are returned. - Absent IDs are left completely untouched on the page. + Uses header line as column ruler to find the exact + char position of the TRANSLATED column. + Immune to commas, ellipses, spaces in translated text. """ translations = {} + header_pos = None + with open(filepath, "r", encoding="utf-8") as f: - for line in f: - line = line.rstrip("\n") - if not re.match(r"^\s*#\d+", line): - continue - parts = re.split(r" {2,}", line.strip()) - if len(parts) < 3: - continue - bubble_id = int(re.sub(r"[^0-9]", "", parts[0])) - translated = parts[-1].strip() - if translated.startswith("["): - continue - translations[bubble_id] = translated + lines = f.readlines() + + for raw_line in lines: + line = raw_line.rstrip("\n") + + if re.match(r"^BUBBLE\s+ORIGINAL", line): + m = re.search(r"TRANSLATED", line) + if m: + header_pos = m.start() + print(f" ℹ️ TRANSLATED column at char {header_pos}") + continue + + stripped = line.strip() + if re.match(r"^[─\-=]{3,}$", stripped): + continue + if stripped.startswith("✅") or stripped.startswith("Done"): + continue + if not re.match(r"^\s*#\d+", line): + continue + + m_id = re.match(r"^\s*#(\d+)", line) + if not m_id: + continue + bubble_id = int(m_id.group(1)) + + if header_pos is not None and len(line) > header_pos: + translated = line[header_pos:].strip() + else: + parts = re.split(r" {2,}", stripped) + translated = parts[-1].strip() if len(parts) >= 3 else "" + + if not translated or translated.startswith("["): + print(f" ⚠️ #{bubble_id}: no translation found") + continue + + translations[bubble_id] = translated print(f" ✅ {len(translations)} bubble(s) to translate: " f"{sorted(translations.keys())}") @@ -67,11 +139,6 @@ def load_bubble_boxes(filepath): # SAMPLE BACKGROUND COLOR # ───────────────────────────────────────────── def sample_bubble_background(cv_image, bubble_data): - """ - Samples the dominant background color inside the bbox - by averaging the brightest 10% of pixels. - Returns (B, G, R). - """ x = max(0, bubble_data["x"]) y = max(0, bubble_data["y"]) x2 = min(cv_image.shape[1], x + bubble_data["w"]) @@ -92,21 +159,9 @@ def sample_bubble_background(cv_image, bubble_data): # ───────────────────────────────────────────── # ERASE ORIGINAL TEXT -# Fills the tight OCR bbox with the sampled -# background color. No extra expansion — -# the bbox from bubbles.json is already the -# exact size of the red squares. # ───────────────────────────────────────────── def erase_bubble_text(cv_image, bubble_data, bg_color=(255, 255, 255)): - """ - Fills the bubble bounding box with bg_color. - - Args: - cv_image : BGR numpy array (modified in place) - bubble_data : Dict with 'x','y','w','h' - bg_color : (B,G,R) fill color - """ img_h, img_w = cv_image.shape[:2] x = max(0, bubble_data["x"]) y = max(0, bubble_data["y"]) @@ -116,14 +171,61 @@ def erase_bubble_text(cv_image, bubble_data, # ───────────────────────────────────────────── -# FIT FONT SIZE +# LINE HEIGHT (tight) +# +# Uses actual ascender+descender of the font +# at the given size, with a minimal 1px gap. +# Much tighter than the old flat "+2" approach. # ───────────────────────────────────────────── +def get_line_height(draw, font): + """ + Returns the line height in pixels for the given font. + Measured from actual glyph bounds of "Ay" (covers + ascenders and descenders) plus 1px breathing room. + """ + bb = draw.textbbox((0, 0), "Ay", font=font) + return (bb[3] - bb[1]) + 1 + + +# ───────────────────────────────────────────── +# FIT FONT SIZE (dynamic ceiling, word-wrap) +# +# max_size is derived from the box itself: +# min(MAX_FONT_CAP, inner_h) +# so a tall box can use a large font and a +# small box won't waste iterations on huge sizes. +# +# Rejects a size if: +# • any single word is wider than inner_w, OR +# • total wrapped height exceeds inner_h +# ───────────────────────────────────────────── +MAX_FONT_CAP = 120 # absolute ceiling across all boxes + def fit_font_size(draw, text, max_w, max_h, font_path, - min_size=7, max_size=48): + min_size=7): """ Finds the largest font size where word-wrapped text - fits inside (max_w × max_h). + fits inside max_w × max_h with NO mid-word breaking. + + max_size is computed dynamically as min(MAX_FONT_CAP, max_h) + so the search always starts from a sensible upper bound + relative to the actual box height. + + Args: + draw : ImageDraw instance + text : Full text string + max_w : Available width in pixels + max_h : Available height in pixels + font_path : Path to .ttf (or None for PIL default) + min_size : Minimum font pt (default: 7) + + Returns: + (font, lines) """ + # Dynamic ceiling: no point trying a font taller than the box + max_size = min(MAX_FONT_CAP, max_h) + max_size = max(max_size, min_size) # safety: never below min + best_font = None best_lines = [text] @@ -134,38 +236,48 @@ def fit_font_size(draw, text, max_w, max_h, font_path, except Exception: font = ImageFont.load_default() - words, lines, current = text.split(), [], "" - for word in words: - test = (current + " " + word).strip() - bb = draw.textbbox((0, 0), test, font=font) - if (bb[2] - bb[0]) <= max_w: - current = test - else: - if current: - lines.append(current) - current = word - if current: - lines.append(current) + lines, overflow = wrap_text_words(draw, text, max_w, font) - lh = draw.textbbox((0, 0), "Ay", font=font) - line_h = (lh[3] - lh[1]) + 2 - if line_h * len(lines) <= max_h: + if overflow: + continue # a word is wider than the box → too big + + line_h = get_line_height(draw, font) + total_h = line_h * len(lines) + + if total_h <= max_h: best_font = font best_lines = lines - break + break # largest size that fits — done - return best_font or ImageFont.load_default(), best_lines + # Guaranteed fallback at min_size + if best_font is None: + try: + best_font = (ImageFont.truetype(font_path, min_size) + if font_path else ImageFont.load_default()) + except Exception: + best_font = ImageFont.load_default() + best_lines, _ = wrap_text_words( + draw, text, max_w, best_font) + if not best_lines: + best_lines = [text] + + return best_font, best_lines # ───────────────────────────────────────────── # RENDER TEXT INTO BUBBLE +# +# Text is centered both horizontally and +# vertically inside the padded bbox. +# Line height uses get_line_height() (tight). # ───────────────────────────────────────────── def render_text_in_bubble(pil_image, bubble_data, text, - font_path, padding=8, + font_path, padding=6, font_color=(0, 0, 0)): """ - Renders translated text centered inside the tight bbox. - Font auto-sizes to fill the same w×h the original occupied. + Renders translated text centered inside the bbox. + Font auto-sizes to fill the box as much as possible. + Word-wrap only — no mid-word hyphens. """ x, y = bubble_data["x"], bubble_data["y"] w, h = bubble_data["w"], bubble_data["h"] @@ -174,17 +286,20 @@ def render_text_in_bubble(pil_image, bubble_data, text, inner_w = max(1, w - padding * 2) inner_h = max(1, h - padding * 2) - font, lines = fit_font_size(draw, text, inner_w, inner_h, - font_path) + font, lines = fit_font_size( + draw, text, inner_w, inner_h, font_path + ) - lh_bb = draw.textbbox((0, 0), "Ay", font=font) - line_h = (lh_bb[3] - lh_bb[1]) + 2 + line_h = get_line_height(draw, font) total_h = line_h * len(lines) + + # Center block vertically start_y = y + padding + max(0, (inner_h - total_h) // 2) for line in lines: - lb = draw.textbbox((0, 0), line, font=font) - line_w = lb[2] - lb[0] + bb = draw.textbbox((0, 0), line, font=font) + line_w = bb[2] - bb[0] + # Center each line horizontally start_x = x + padding + max(0, (inner_w - line_w) // 2) draw.text((start_x, start_y), line, font=font, fill=font_color) @@ -216,19 +331,9 @@ def render_translated_page( font_path = FONT_PATH, font_fallback = FONT_FALLBACK, font_color = FONT_COLOR, - text_padding = 8, + text_padding = 6, debug = False, ): - """ - Pipeline: - 1. Parse translations (only present IDs processed) - 2. Load bubble boxes from bubbles.json - 3. Cross-check IDs — absent ones left untouched - 4. Sample background color per bubble - 5. Erase original text (fill tight bbox) - 6. Render translated text sized to fit the bbox - 7. Save output - """ print("=" * 55) print(" MANGA TRANSLATOR — RENDERER") print("=" * 55) @@ -271,7 +376,7 @@ def render_translated_page( print("\n🎨 Sampling backgrounds...") bg_colors = {} for bid in to_process: - bg_bgr = sample_bubble_background( + bg_bgr = sample_bubble_background( cv_image, bubble_boxes[bid]) bg_colors[bid] = bg_bgr bg_rgb = (bg_bgr[2], bg_bgr[1], bg_bgr[0]) @@ -344,9 +449,9 @@ if __name__ == "__main__": output_image = "page_translated.png", translations_file = "output.txt", bubbles_file = "bubbles.json", - font_path = "font.ttf", + font_path = "fonts/ComicRelief-Regular.ttf", font_fallback = "/System/Library/Fonts/Helvetica.ttc", font_color = (0, 0, 0), - text_padding = 8, + text_padding = 6, debug = True, - ) \ No newline at end of file + ) diff --git a/manga-translator.py b/manga-translator.py index f997515..775a154 100644 --- a/manga-translator.py +++ b/manga-translator.py @@ -48,57 +48,88 @@ def is_sound_effect(text): # ───────────────────────────────────────────── -# TOKEN FILTER +# TOKEN CLASSIFIER +# +# Three categories: +# "alpha" — contains at least one letter (È, é, A-Z etc.) +# "punct" — 2+ chars, all punctuation (... ?? !! ?! …) +# "noise" — everything else (single symbols, pure digits, +# low-confidence, sound effects) +# +# Both "alpha" and "punct" tokens are KEPT: +# - "alpha" → contributes to translation text AND bbox +# - "punct" → contributes to bbox only (not translation text) +# unless it immediately follows alpha text +# in the same cluster (handled in clustering) # ───────────────────────────────────────────── -def should_keep_token(text, confidence, confidence_threshold, - min_text_length, filter_sound_effects): +def classify_token(text, confidence, confidence_threshold, + min_text_length, filter_sound_effects): """ - Returns (keep: bool, reason: str). + Returns one of: "alpha" | "punct" | "noise" + + "alpha" : has at least one letter → keep for text + bbox + "punct" : 2+ chars, no letters → keep for bbox only + "noise" : drop entirely Rules: - 1. Drop if confidence below threshold - 2. Drop if shorter than min_text_length - 3. Drop pure digit strings - 4. Drop single non-alpha characters - 5. Drop sound effects if filter enabled - 6. Keep everything else + 1. Drop if confidence below threshold → noise + 2. Drop if shorter than min_text_length → noise + 3. Drop pure digit strings → noise + 4. Drop single non-alpha characters → noise + 5. Drop sound effects if filter enabled → noise + 6. 2+ char string with no letters → punct + 7. Has at least one letter → alpha """ cleaned = text.strip() if confidence < confidence_threshold: - return False, f"low confidence ({confidence:.2f})" + return "noise" if len(cleaned) < min_text_length: - return False, "too short" + return "noise" if re.fullmatch(r"\d+", cleaned): - return False, "pure digits" + return "noise" if len(cleaned) == 1 and not cleaned.isalpha(): - return False, "single symbol" + return "noise" if filter_sound_effects and is_sound_effect(cleaned): - return False, "sound effect" + return "noise" - return True, "ok" + # 2+ chars with no letters at all → punctuation token + # Examples: "..." "??" "!!" "?!" "…" ".." + if not any(ch.isalpha() for ch in cleaned): + return "punct" + + return "alpha" + + +def should_keep_token(text, confidence, confidence_threshold, + min_text_length, filter_sound_effects): + """ + Backward-compatible wrapper. + Returns (keep: bool, category: str). + """ + cat = classify_token(text, confidence, confidence_threshold, + min_text_length, filter_sound_effects) + return cat != "noise", cat # ───────────────────────────────────────────── # BOUNDING BOX # -# Rules (match the red square exactly): -# Width = widest single quad's width -# Height = sum of ALL quad heights stacked -# X = centered on the widest quad's CX -# Y = topmost Y1 of all quads +# Width = widest single quad's width +# Height = sum of ALL quad heights stacked +# X = centered on the widest quad's CX +# Y = topmost Y1 of all quads # ───────────────────────────────────────────── def get_cluster_bbox_from_ocr(ocr_bboxes, image_shape, padding_px=10): """ Computes the bubble erase bbox: - 1. Per-quad: measure w, h, cx for every OCR detection + 1. Per-quad: measure w, h, cx 2. Width = width of the widest single quad 3. Height = sum of every quad's height 4. X = widest quad's center ± max_w/2 - (all lines sit symmetrically inside) - 5. Y = top of topmost quad, bottom = Y + total_h + 5. Y = top of topmost quad → Y + total_h Args: ocr_bboxes : List of EasyOCR quad bboxes @@ -113,7 +144,6 @@ def get_cluster_bbox_from_ocr(ocr_bboxes, image_shape, if not ocr_bboxes: return 0, 0, 0, 0 - # ── Per-quad metrics ────────────────────────────────────────── quad_metrics = [] for quad in ocr_bboxes: xs = [pt[0] for pt in quad] @@ -121,30 +151,23 @@ def get_cluster_bbox_from_ocr(ocr_bboxes, image_shape, qx1, qx2 = min(xs), max(xs) qy1, qy2 = min(ys), max(ys) quad_metrics.append({ - "x1" : qx1, - "x2" : qx2, - "y1" : qy1, - "y2" : qy2, + "x1" : qx1, "x2" : qx2, + "y1" : qy1, "y2" : qy2, "w" : qx2 - qx1, "h" : qy2 - qy1, "cx" : (qx1 + qx2) / 2.0, }) - # ── Width: widest single quad ───────────────────────────────── widest = max(quad_metrics, key=lambda q: q["w"]) max_w = widest["w"] center_x = widest["cx"] + total_h = sum(q["h"] for q in quad_metrics) - # ── Height: sum of all quad heights ────────────────────────── - total_h = sum(q["h"] for q in quad_metrics) - - # ── Box edges ───────────────────────────────────────────────── box_x1 = center_x - max_w / 2.0 box_x2 = center_x + max_w / 2.0 box_y1 = min(q["y1"] for q in quad_metrics) box_y2 = box_y1 + total_h - # ── Padding + clamp ─────────────────────────────────────────── x1 = max(0, box_x1 - padding_px) y1 = max(0, box_y1 - padding_px) x2 = min(img_w, box_x2 + padding_px) @@ -171,17 +194,6 @@ def boxes_are_close(bbox_a, bbox_b, proximity_px=80): return not (ax2 < bx1 or bx2 < ax1 or ay2 < by1 or by2 < ay1) -# ───────────────────────────────────────────── -# TEXT LINE FILTER -# ───────────────────────────────────────────── -def has_translatable_content(text): - """ - True if text contains at least one letter. - ch.isalpha() handles È, é, ñ, ü etc. - """ - return any(ch.isalpha() for ch in text) - - # ───────────────────────────────────────────── # POST-CLUSTER MERGE (Union-Find) # ───────────────────────────────────────────── @@ -270,11 +282,17 @@ def cluster_into_bubbles(ocr_results, image_shape, Pass 1 — DBSCAN on center points Pass 2 — Bounding-box proximity merge + Token categories per cluster: + "alpha" tokens → translation text + bbox + "punct" tokens → bbox only (e.g. "..." after "HN") + "noise" tokens → already filtered before this function + Bbox: widest-line width (centered) × stacked height. - All quads contribute to bbox regardless of content. Returns: - bubble_dict : cluster_id → list of translatable text lines + bubble_dict : cluster_id → list of text lines + (alpha tokens only, punct appended + to last alpha line if spatially adjacent) bbox_dict : cluster_id → (x1, y1, x2, y2) ocr_quads : cluster_id → list of ALL raw EasyOCR quads """ @@ -303,6 +321,8 @@ def cluster_into_bubbles(ocr_results, image_shape, raw_clusters.setdefault(label, []) raw_quads.setdefault(label, []) bbox, text, _ = ocr_results[idx] + # Store (cy, cx, text, category) + cat = ocr_results[idx][2] # confidence stored as category below raw_clusters[label].append( (centers[idx][1], centers[idx][0], text)) raw_quads[label].append(bbox) @@ -335,15 +355,40 @@ def cluster_into_bubbles(ocr_results, image_shape, items_sorted = sorted(items, key=lambda t: t[0]) - text_lines = [ - text for _, _, text in items_sorted - if has_translatable_content(text) - ] + # ── Build text lines ────────────────────────────────────── + # Alpha tokens become text lines. + # Punct tokens (... ?? etc.) are appended to the + # nearest preceding alpha token on the same Y level. + alpha_lines = [] # (cy, text) for alpha tokens + punct_tokens = [] # (cy, text) for punct tokens + + for cy, cx, text in items_sorted: + if any(ch.isalpha() for ch in text): + alpha_lines.append((cy, text)) + else: + punct_tokens.append((cy, text)) + + # Append each punct token to the closest alpha line by Y + for pcy, ptext in punct_tokens: + if alpha_lines: + # Find alpha line with closest cy + closest_idx = min( + range(len(alpha_lines)), + key=lambda k: abs(alpha_lines[k][0] - pcy) + ) + cy_a, text_a = alpha_lines[closest_idx] + alpha_lines[closest_idx] = (cy_a, text_a + ptext) + # If no alpha lines at all, punct still contributes + # to bbox but not to translation text + + text_lines = [t for _, t in alpha_lines] + + # Fallback: if no alpha at all, keep everything if not text_lines: text_lines = [text for _, _, text in items_sorted] bubble_dict[i] = text_lines - ocr_quads[i] = quads + ocr_quads[i] = quads # ALL quads → full bbox bbox_dict[i] = get_cluster_bbox_from_ocr( quads, image_shape, padding_px=bbox_padding @@ -353,7 +398,8 @@ def cluster_into_bubbles(ocr_results, image_shape, print(f" Cluster #{i}: {len(quads)} quad(s) " f"bbox=({int(b[0])},{int(b[1])})→" f"({int(b[2])},{int(b[3])}) " - f"w={int(b[2]-b[0])} h={int(b[3]-b[1])}") + f"w={int(b[2]-b[0])} h={int(b[3]-b[1])} " + f"text={text_lines}") return bubble_dict, bbox_dict, ocr_quads @@ -519,15 +565,17 @@ def translate_manga_text( for bbox, text, confidence in results: cleaned = text.strip() - keep, reason = should_keep_token( + keep, category = should_keep_token( cleaned, confidence, confidence_threshold, min_text_length, filter_sound_effects ) if keep: filtered.append((bbox, cleaned, confidence)) + if category == "punct": + print(f" ✔ Punct kept: '{cleaned}'") else: - if reason == "sound effect": + if category == "sound effect": print(f" 🔇 SFX skipped: '{cleaned}'") skipped += 1 @@ -656,6 +704,6 @@ if __name__ == "__main__": filter_sound_effects = True, quality_threshold = 0.5, upscale_factor = 2.5, - bbox_padding = 0, + bbox_padding = 3, debug = True, - ) \ No newline at end of file + )