# manga-translator/manga-renderer.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
manga-renderer.py

Inputs:  page image (e.g. 003.jpg)  +  bubbles.json  +  translations (e.g. output_003.txt)
Output:  translated page (e.g. translated_page_003.png)

Strategy:
  1. For every bubble that has a translation and is not in the skip list,
     white-fill all its OCR quads (erases original text cleanly).
  2. Estimate the original font size from the OCR line bounding boxes.
  3. Wrap the translated text and scale it down if it exceeds the bubble dimensions.
  4. Render the translated text centered inside the bubble bounding box.
  5. Use uniform line heights to prevent accent collisions.
  6. Add a white stroke to the text to cover any residual original characters.
"""

import json
import textwrap
import cv2
import numpy as np
from PIL import Image, ImageDraw, ImageFont
from typing import Dict, List, Tuple, Optional, Set, Any

# ============================================================
# CONFIG  — edit these paths to match your setup
# ============================================================
# Files for the page being processed.
IMAGE_PATH = "003.jpg"
BUBBLES_PATH = "bubbles.json"
TRANSLATIONS_PATH = "output_003.txt"
OUTPUT_PATH = "translated_page_003.png"

# Font candidates, grouped by where they normally live.
_BUNDLED_FONTS = ["fonts/ComicNeue-Bold.ttf"]
_MACOS_FONTS = [
    "/System/Library/Fonts/Supplemental/Comic Sans MS Bold.ttf",
    "/System/Library/Fonts/Supplemental/Arial Bold.ttf",
]
_WINDOWS_FONTS = [
    "C:\\Windows\\Fonts\\comicbd.ttf",
    "C:\\Windows\\Fonts\\arialbd.ttf",
]
_LINUX_FONTS = [
    "/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf",
]

# Tried in this order; the first candidate that loads is used.
FONT_CANDIDATES = _BUNDLED_FONTS + _MACOS_FONTS + _WINDOWS_FONTS + _LINUX_FONTS

DEFAULT_FONT_SIZE = 24  # used when a bubble has no OCR line boxes to measure
MIN_FONT_SIZE = 12      # text is never shrunk below this size
QUAD_PAD = 4            # extra pixels added around each quad before white-fill

# ============================================================
# SKIP LIST
# ============================================================
# Bubble IDs that must be left untouched (neither erased nor re-rendered).
# NOTE: a brace pair containing only a comment is an empty *dict*, not a set,
# so the empty default is spelled ``set()``. To skip bubbles, replace it with
# a set literal, e.g. ``SKIP_BUBBLE_IDS: Set[int] = {3, 7}``.
SKIP_BUBBLE_IDS: Set[int] = set()

# ============================================================
# FONT LOADER
# ============================================================
def load_font(path: str, size: int) -> Optional[ImageFont.FreeTypeFont]:
    """
    Load a usable TrueType face from *path* at *size*.

    For ``.ttc`` collections the first four face indices are probed; any
    other file is opened at index 0 only. A face counts as usable when it
    both opens and can measure a glyph. Returns None if no face qualifies.
    """
    is_collection = path.lower().endswith(".ttc")
    face_count = 4 if is_collection else 1

    for face in range(face_count):
        try:
            candidate = ImageFont.truetype(path, size, index=face)
            # Measuring a glyph raises if the face metrics are broken.
            candidate.getbbox("A")
        except Exception:
            # Best effort: any failure just means "try the next face".
            continue
        return candidate

    return None

def resolve_font_path() -> str:
    """
    Pick the first entry of FONT_CANDIDATES that loads at DEFAULT_FONT_SIZE.

    Prints which font was chosen. Returns an empty string (after printing a
    warning) when none of the candidates can be loaded.
    """
    loadable = (
        path
        for path in FONT_CANDIDATES
        if load_font(path, DEFAULT_FONT_SIZE) is not None
    )
    chosen = next(loadable, None)

    if chosen is None:
        print("   ⚠️  No TrueType font found — using Pillow bitmap fallback")
        return ""

    print(f"   ✅ Font: {chosen}")
    return chosen

# ============================================================
# PARSERS
# ============================================================
def parse_translations(filepath: str) -> Dict[int, str]:
    """
    Read the translation table and return ``{bubble_id: translated_text}``.

    Data rows look like::

        #2|1|vision-base|ORIGINAL|TRANSLATED|FLAGS

    Only lines starting with ``#`` that have at least five ``|``-separated
    fields are considered. The bubble id is the first field without its
    leading ``#``; the translation is the fifth field, stripped. Rows whose
    id is not an integer, or whose translation is empty or ``-``, are
    ignored. If an id appears more than once, the last row wins.

    Raises:
        OSError: if *filepath* cannot be opened.
    """
    translations: Dict[int, str] = {}
    # "utf-8-sig" drops a leading byte-order mark if present. With plain
    # "utf-8" the BOM stays glued to the first line, which then no longer
    # starts with "#" and is silently skipped.
    with open(filepath, "r", encoding="utf-8-sig") as f:
        for line in f:
            line = line.strip()
            if not line.startswith("#"):
                continue
            parts = line.split("|")
            if len(parts) < 5:
                continue
            try:
                bid = int(parts[0].lstrip("#"))
            except ValueError:
                # Not a data row (e.g. a "#"-prefixed header or comment).
                continue
            translated = parts[4].strip()
            if translated and translated != "-":
                translations[bid] = translated
    return translations

def parse_bubbles(filepath: str):
    """
    Load the bubble description file and return its decoded JSON content
    unchanged.
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        return json.loads(handle.read())

# ============================================================
# ERASE  — white-fill every OCR quad (with small padding)
# ============================================================
def erase_quads(
    image_bgr,
    bubbles_data: Dict[str, dict],
    translations: Dict[int, str],
    skip_ids: Set[int],
    pad: int = QUAD_PAD
):
    """
    White-fill the OCR quads of every bubble that will be re-lettered.

    A bubble is processed only if it:
      - has a translation in *translations*  AND
      - is NOT in *skip_ids*

    For each quad of such a bubble the polygon itself is filled, and then
    its axis-aligned bounding box, grown by *pad* pixels and clipped to the
    image, is filled as well.

    Args:
        image_bgr: source image array; it is copied, not modified.
        bubbles_data: mapping of bubble id (as a string) to bubble data;
            the quads are read from the ``"quads"`` key.
        translations: mapping of bubble id to translated text.
        skip_ids: bubble ids that must be left untouched.
        pad: extra pixels added around each quad's bounding box.

    Returns:
        A copy of *image_bgr* with the selected quads painted white.
    """
    ih, iw = image_bgr.shape[:2]
    result = image_bgr.copy()
    white = (255, 255, 255)

    erased_count  = 0
    skipped_count = 0

    for bid_str, val in bubbles_data.items():
        bid = int(bid_str)

        if bid in skip_ids or bid not in translations:
            skipped_count += 1
            continue

        # ``or []`` also covers a "quads" key that is present but null.
        for quad in val.get("quads") or []:
            if not quad:
                # Nothing to erase; min()/max() below would fail on it.
                continue

            # Convert once to int32 and reuse it for both drawing calls:
            # cv2.rectangle rejects float corner coordinates, so the box
            # must not be computed from the raw JSON numbers.
            pts = np.array(quad, dtype=np.int32)
            cv2.fillPoly(result, [pts], white)

            xs = pts[:, 0]
            ys = pts[:, 1]
            x1 = max(0,      int(xs.min()) - pad)
            y1 = max(0,      int(ys.min()) - pad)
            x2 = min(iw - 1, int(xs.max()) + pad)
            y2 = min(ih - 1, int(ys.max()) + pad)
            cv2.rectangle(result, (x1, y1), (x2, y2), white, -1)

        erased_count += 1

    print(f"   Erased : {erased_count} bubbles")
    print(f"   Ignored: {skipped_count} bubbles (no translation or in skip list)")
    return result

# ============================================================
# DYNAMIC TEXT FITTING
# ============================================================
def get_original_font_size(bubble_data: dict, fallback_size: int = DEFAULT_FONT_SIZE) -> int:
    """
    Estimate the font size of the original lettering.

    Takes the median ``"h"`` of the bubble's ``"line_bboxes"``, scales it by
    0.85, and clamps the result between MIN_FONT_SIZE and 60. Returns
    *fallback_size* when the bubble has no line boxes.
    """
    boxes = bubble_data.get("line_bboxes", [])
    if not boxes:
        return fallback_size

    typical_height = int(np.median([entry["h"] for entry in boxes]))
    size = int(typical_height * 0.85)

    # Clamp: cap at 60 first, then enforce the minimum.
    if size > 60:
        size = 60
    if size < MIN_FONT_SIZE:
        size = MIN_FONT_SIZE
    return size

def _wrap_for_bitmap_font(text: str, max_w: int) -> Tuple[List[str], Any, int, int]:
    """Wrap *text* for Pillow's built-in fallback font, assuming 6 px per character."""
    font = ImageFont.load_default()
    chars_per_line = max(1, int(max_w / 6))
    return textwrap.wrap(text, width=chars_per_line), font, 4, 10


def fit_text_dynamically(
    text: str,
    font_path: str,
    max_w: int,
    max_h: int,
    target_font_size: int
) -> Tuple[List[str], Any, int, int]:
    """
    Wrap *text* and shrink the font until it fits a ``max_w`` x ``max_h`` box.

    Starting at *target_font_size*, the text is wrapped by an estimated
    characters-per-line count (derived from the width of "A") and measured.
    The size is lowered in steps of 2 until both the widest line and the
    total height fit, or until the size would drop below MIN_FONT_SIZE. In
    the latter case the text is wrapped at MIN_FONT_SIZE without a fit check.

    If *font_path* is empty, or the font cannot be loaded at some size,
    Pillow's built-in fallback font is used and the text is wrapped for it.

    Returns: (wrapped_lines, font_object, line_spacing, final_font_size)
    """
    if not font_path:
        return _wrap_for_bitmap_font(text, max_w)

    font_size = target_font_size

    while font_size >= MIN_FONT_SIZE:
        font = load_font(font_path, font_size)
        if font is None:
            # Wrap here too, exactly like the no-font case; returning the
            # text as one unwrapped line would overflow the bubble.
            return _wrap_for_bitmap_font(text, max_w)

        char_bbox = font.getbbox("A")
        char_w = (char_bbox[2] - char_bbox[0]) or 10
        # Keep a 5% margin because the per-character width is an estimate.
        chars_per_line = max(1, int((max_w * 0.95) / char_w))

        wrapped_lines = textwrap.wrap(text, width=chars_per_line)

        # Use uniform font metrics for height instead of per-line bounding boxes
        line_spacing = max(2, int(font_size * 0.15))
        if hasattr(font, 'getmetrics'):
            ascent, descent = font.getmetrics()
            line_h = ascent + descent
        else:
            line_h = font_size

        total_h = (line_h * len(wrapped_lines)) + (line_spacing * max(0, len(wrapped_lines) - 1))

        max_line_w = 0
        for line in wrapped_lines:
            bbox = font.getbbox(line)
            max_line_w = max(max_line_w, bbox[2] - bbox[0])

        if max_line_w <= max_w and total_h <= max_h:
            return wrapped_lines, font, line_spacing, font_size

        font_size -= 2

    # Nothing fit: settle for the minimum size and accept possible overflow.
    font = load_font(font_path, MIN_FONT_SIZE) or ImageFont.load_default()
    char_bbox = font.getbbox("A") if hasattr(font, 'getbbox') else (0, 0, 6, 10)
    char_w = (char_bbox[2] - char_bbox[0]) or 6
    chars_per_line = max(1, int(max_w / char_w))
    wrapped_lines = textwrap.wrap(text, width=chars_per_line)

    return wrapped_lines, font, max(2, int(MIN_FONT_SIZE * 0.15)), MIN_FONT_SIZE

# ============================================================
# RENDER
# ============================================================
def render_text(
    image_bgr,
    bubbles_data: Dict[str, dict],
    translations: Dict[int, str],
    font_path: str,
    skip_ids: Set[int]
):
    """
    Letter each translated bubble and return the result as a BGR array.

    The text goes into the bubble's ``line_union_bbox`` (or ``text_bbox``
    when that is missing), enlarged by 10% in each dimension, and is
    centered both horizontally and vertically. Bubbles that are in
    *skip_ids*, have no translation, or have neither box are left alone.
    Every line is drawn in black with a white stroke to cover any residual
    original characters.
    """
    canvas = Image.fromarray(cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB))
    pen = ImageDraw.Draw(canvas)

    rendered_count = 0

    for key, bubble in bubbles_data.items():
        bubble_id = int(key)
        if bubble_id not in translations or bubble_id in skip_ids:
            continue

        message = translations[bubble_id]

        box = bubble.get("line_union_bbox") or bubble.get("text_bbox")
        if not box:
            continue

        left, top = box["x"], box["y"]
        width, height = box["w"], box["h"]

        # Grow the box by 10% per axis, keeping it centered on the original.
        grow_x = int(width * 0.1)
        grow_y = int(height * 0.1)
        left -= grow_x // 2
        top -= grow_y // 2
        width += grow_x
        height += grow_y

        wanted_size = get_original_font_size(bubble)
        lines, font, gap, size = fit_text_dynamically(
            message, font_path, width, height, wanted_size
        )

        # One typographic row height for every line keeps the rows evenly spaced.
        if hasattr(font, 'getmetrics'):
            rise, drop = font.getmetrics()
            row_h = rise + drop
        else:
            row_h = size

        line_count = len(lines)
        block_h = (row_h * line_count) + (gap * max(0, line_count - 1))

        y = top + (height - block_h) // 2
        stroke = max(2, int(size * 0.10))
        can_measure = hasattr(font, 'getbbox')

        for line in lines:
            if can_measure:
                x_start, _, x_end, _ = font.getbbox(line)
                line_w = x_end - x_start
            else:
                line_w = len(line) * 6

            x = left + (width - line_w) // 2
            pen.text(
                (x, y),
                line,
                fill=(0, 0, 0),
                font=font,
                stroke_width=stroke,
                stroke_fill=(255, 255, 255)
            )

            # Step down by one uniform row plus the inter-line gap.
            y += row_h + gap

        rendered_count += 1

    print(f"   Rendered: {rendered_count} bubbles (with uniform line spacing & outlines)")
    return cv2.cvtColor(np.array(canvas), cv2.COLOR_RGB2BGR)

# ============================================================
# MAIN
# ============================================================
def main():
    """
    Run the pipeline for the page configured at the top of the file.

    Loads the image, translations, and bubble data, resolves a font, erases
    the original text, renders the translations, and writes OUTPUT_PATH.
    Prints an error and returns early if the image cannot be loaded or the
    result cannot be written.
    """
    print(f"Loading image: {IMAGE_PATH}")
    image_bgr = cv2.imread(IMAGE_PATH)
    # cv2.imread signals failure by returning None rather than raising.
    if image_bgr is None:
        print(f"❌ Error: Could not load {IMAGE_PATH}")
        return

    print(f"Loading translations: {TRANSLATIONS_PATH}")
    translations = parse_translations(TRANSLATIONS_PATH)

    print(f"Loading bubble data: {BUBBLES_PATH}")
    bubbles_data = parse_bubbles(BUBBLES_PATH)

    print("Resolving font...")
    font_path = resolve_font_path()

    print("\n--- Step 1: Erasing original text ---")
    erased_bgr = erase_quads(
        image_bgr=image_bgr,
        bubbles_data=bubbles_data,
        translations=translations,
        skip_ids=SKIP_BUBBLE_IDS,
        pad=QUAD_PAD
    )

    print("\n--- Step 2: Rendering translated text ---")
    final_bgr = render_text(
        image_bgr=erased_bgr,
        bubbles_data=bubbles_data,
        translations=translations,
        font_path=font_path,
        skip_ids=SKIP_BUBBLE_IDS
    )

    print(f"\nSaving final image to: {OUTPUT_PATH}")
    # cv2.imwrite reports a failed save through its boolean return value,
    # so success must not be announced without checking it.
    if not cv2.imwrite(OUTPUT_PATH, final_bgr):
        print(f"❌ Error: Could not write {OUTPUT_PATH}")
        return
    print("✅ Done!")

if __name__ == "__main__":
    main()