Files
manga-translator/manga-translator.py
Guillem Hernandez Sola ead32cef24 Ellipses
2026-04-12 18:47:30 +02:00

629 lines
24 KiB
Python
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import re
import os
import json
import cv2
import numpy as np
import easyocr
from deep_translator import GoogleTranslator
# ─────────────────────────────────────────────
# LANGUAGE CODE REFERENCE
# Human-readable language name → short language code.
# NOTE(review): "ch_sim"/"ch_tra" are easyocr's Chinese model codes;
# GoogleTranslator uses different codes for Chinese — confirm before
# passing these values to the translator.
# This table is not referenced elsewhere in the visible file; it
# serves as a reference for choosing source_lang / target_lang.
# ─────────────────────────────────────────────
SUPPORTED_LANGUAGES = {
    "Vietnamese" : "vi",
    "Japanese" : "ja",
    "English" : "en",
    "Spanish" : "es",
    "Korean" : "ko",
    "Chinese (Simplified)" : "ch_sim",
    "Chinese (Traditional)": "ch_tra",
    "French" : "fr",
    "German" : "de",
    "Italian" : "it",
    "Portuguese" : "pt",
    "Arabic" : "ar",
    "Russian" : "ru",
    "Thai" : "th",
    "Catalan" : "ca",
}
# ─────────────────────────────────────────────
# SOUND EFFECT FILTER
# ─────────────────────────────────────────────
SOUND_EFFECT_PATTERNS = [
    r"^b+i+p+$", r"^sha+$", r"^ha+$", r"^ah+$",
    r"^oh+$", r"^ugh+$", r"^gr+$", r"^bam+$",
    r"^pow+$", r"^crash+$", r"^boom+$", r"^bang+$",
    r"^crack+$", r"^whoosh+$", r"^thud+$", r"^snap+$",
    r"^zip+$", r"^swoosh+$", r"^chirp+$", r"^tweet+$",
]


def is_sound_effect(text):
    """Return True when *text* reads like an onomatopoeic sound effect."""
    # Reduce the token to lowercase letters only, then try each pattern.
    letters_only = re.sub(r"[^a-z]", "", text.strip().lower())
    for pattern in SOUND_EFFECT_PATTERNS:
        if re.fullmatch(pattern, letters_only, re.IGNORECASE):
            return True
    return False
# ─────────────────────────────────────────────
# TITLE / LOGO / AUTHOR FILTER
# ─────────────────────────────────────────────
TITLE_PATTERNS = [
    r"^(mission|chapter|episode|vol\.?|volume)\s*\d+$",
    r"^(spy|family|spy.family)$",
    r"^by\s+.+$", # "BY TATSUYA ENDO"
    r"^[a-z]{1,4}\s+[a-z]+\s+[a-z]+$", # short author-style lines
]


def is_title_text(text):
    """Return True when *text* looks like a chapter title, logo, or author credit."""
    normalized = text.strip().lower()
    for pattern in TITLE_PATTERNS:
        if re.fullmatch(pattern, normalized, re.IGNORECASE):
            return True
    return False
# ─────────────────────────────────────────────
# GARBAGE TOKEN FILTER
# Catches OCR misreads that are mostly
# non-alpha or suspiciously short/mangled
# ─────────────────────────────────────────────
GARBAGE_PATTERNS = [
    r"^[^a-zA-Z]*$", # no letters at all
    r"^.{1,2}$", # 1-2 char tokens
    r".*\d+.*", # contains digits (YO4, HLNGRY etc.)
    r"^[A-Z]{1,4}$", # isolated caps abbreviations (IILK)
]


def is_garbage(text):
    """Return True when *text* looks like an OCR misread rather than dialogue."""
    # NOTE(review): callers uppercase tokens before filtering, so the
    # `^[A-Z]{1,4}$` rule also rejects real 3-4 letter words such as
    # "YES" or "WAIT" — confirm this aggressiveness is intended.
    token = text.strip()
    matches = (re.fullmatch(pattern, token) for pattern in GARBAGE_PATTERNS)
    return any(match is not None for match in matches)
# ─────────────────────────────────────────────
# TOKEN CLASSIFIER
# ─────────────────────────────────────────────
def classify_token(text, confidence, confidence_threshold,
                   min_text_length, filter_sound_effects):
    """
    Classify one OCR token.

    Returns one of: "alpha" (contains letters), "punct" (kept but
    letter-free), or "noise" (should be discarded).
    """
    token = text.strip()
    # Any of these conditions marks the token as noise; evaluation
    # order matches the cost of each check (cheap ones first).
    is_noise = (
        confidence < confidence_threshold
        or len(token) < min_text_length
        or re.fullmatch(r"\d+", token) is not None
        or (len(token) == 1 and not token.isalpha())
        or (filter_sound_effects and is_sound_effect(token))
        or is_title_text(token)
        or is_garbage(token)
    )
    if is_noise:
        return "noise"
    has_letters = any(ch.isalpha() for ch in token)
    return "alpha" if has_letters else "punct"
def should_keep_token(text, confidence, confidence_threshold,
                      min_text_length, filter_sound_effects):
    """Return (keep, category) for one OCR token; keep is False for noise."""
    category = classify_token(text, confidence, confidence_threshold,
                              min_text_length, filter_sound_effects)
    keep = category != "noise"
    return keep, category
# ─────────────────────────────────────────────
# QUAD HELPERS
# ─────────────────────────────────────────────
def quad_bbox(quad):
    """Axis-aligned bounds of a 4-point quad as (x1, y1, x2, y2)."""
    xs, ys = zip(*[(pt[0], pt[1]) for pt in quad])
    return min(xs), min(ys), max(xs), max(ys)
def quads_bbox(quads, image_shape, padding_px=10):
    """Padded union bbox of several quads, clamped to the image bounds."""
    img_h, img_w = image_shape[:2]
    points = [pt for quad in quads for pt in quad]
    left = min(p[0] for p in points)
    top = min(p[1] for p in points)
    right = max(p[0] for p in points)
    bottom = max(p[1] for p in points)
    return (max(0, left - padding_px),
            max(0, top - padding_px),
            min(img_w, right + padding_px),
            min(img_h, bottom + padding_px))
def bboxes_overlap_or_touch(a, b, gap_px=0):
    """True when boxes a and b overlap or lie within gap_px on both axes."""
    ax1, ay1, ax2, ay2 = a
    bx1, by1, bx2, by2 = b
    # Separation per axis; negative means overlap, so clamp at zero.
    horizontal_gap = max(0, max(ax1, bx1) - min(ax2, bx2))
    vertical_gap = max(0, max(ay1, by1) - min(ay2, by2))
    return horizontal_gap <= gap_px and vertical_gap <= gap_px
# ─────────────────────────────────────────────
# OVERLAP-BASED GROUPING (Union-Find)
# ─────────────────────────────────────────────
def group_quads_by_overlap(ocr_results, image_shape,
                           gap_px=18, bbox_padding=10,
                           row_band_px=150):
    """
    Cluster OCR tokens into speech bubbles by bbox proximity.

    Two tokens join the same group when their bboxes overlap or sit
    within ``gap_px`` pixels of each other (union-find over all pairs,
    O(n^2) — fine for a page's worth of tokens).

    Parameters:
        ocr_results: list of (quad, text, confidence) tuples.
        image_shape: full image shape, used to clamp group bboxes.
        gap_px: maximum pixel gap for two tokens to be grouped.
        bbox_padding: padding applied to each group's bbox.
        row_band_px: height of the coarse rows used to order groups
            top-to-bottom (was a hard-coded 150; now tunable, same
            default for backward compatibility).

    Returns:
        (bubble_dict, bbox_dict, ocr_quads) — all keyed by 1-based
        group id, ordered by coarse row then left-to-right.
    """
    n = len(ocr_results)
    if n == 0:
        return {}, {}, {}
    token_bboxes = [quad_bbox(r[0]) for r in ocr_results]

    # --- Union-find with path halving ---
    parent = list(range(n))

    def find(x):
        while parent[x] != x:
            parent[x] = parent[parent[x]]  # path halving
            x = parent[x]
        return x

    def union(x, y):
        parent[find(x)] = find(y)

    for i in range(n):
        for j in range(i + 1, n):
            if bboxes_overlap_or_touch(
                    token_bboxes[i], token_bboxes[j],
                    gap_px=gap_px):
                union(i, j)

    groups = {}
    for i in range(n):
        groups.setdefault(find(i), []).append(i)

    # Order groups by coarse row (row_band_px-tall bands), then by x,
    # so bubble ids follow natural reading order.
    def group_sort_key(indices):
        ys = [token_bboxes[i][1] for i in indices]
        xs = [token_bboxes[i][0] for i in indices]
        return (min(ys) // row_band_px, min(xs))

    sorted_groups = sorted(groups.values(), key=group_sort_key)

    bubble_dict = {}
    bbox_dict = {}
    ocr_quads = {}
    for gid, indices in enumerate(sorted_groups, start=1):
        indices_sorted = sorted(
            indices, key=lambda i: token_bboxes[i][1])
        quads = [ocr_results[i][0] for i in indices_sorted]
        raw_texts = [ocr_results[i][1] for i in indices_sorted]

        # Split tokens into text lines vs. bare punctuation.
        alpha_lines = []   # (y-center, text) for tokens with letters
        punct_tokens = []  # (y-center, text) for letter-free tokens
        for i in indices_sorted:
            _, text, _ = ocr_results[i]
            yc = (token_bboxes[i][1] + token_bboxes[i][3]) / 2.0
            if any(ch.isalpha() for ch in text):
                alpha_lines.append((yc, text))
            else:
                punct_tokens.append((yc, text))

        # Append each punctuation token to its vertically closest line
        # (e.g. a detached "!!" rejoins the sentence it belongs to).
        for pcy, ptext in punct_tokens:
            if alpha_lines:
                closest = min(
                    range(len(alpha_lines)),
                    key=lambda k: abs(alpha_lines[k][0] - pcy)
                )
                yc_a, text_a = alpha_lines[closest]
                alpha_lines[closest] = (yc_a, text_a + ptext)

        # Fall back to the raw tokens when a group is punctuation-only.
        text_lines = [t for _, t in alpha_lines] or raw_texts
        bubble_dict[gid] = text_lines
        ocr_quads[gid] = quads
        bbox_dict[gid] = quads_bbox(quads, image_shape,
                                    padding_px=bbox_padding)
        b = bbox_dict[gid]
        print(f" Group #{gid}: {len(quads)} quad(s) "
              f"bbox=({int(b[0])},{int(b[1])})→"
              f"({int(b[2])},{int(b[3])}) "
              f"w={int(b[2]-b[0])} h={int(b[3]-b[1])} "
              f"text={text_lines}")
    return bubble_dict, bbox_dict, ocr_quads
# ─────────────────────────────────────────────
# HYPHEN REMOVAL
# ─────────────────────────────────────────────
def fix_hyphens(lines):
    """Join OCR lines into one sentence, merging hyphen-broken words; UPPERCASE the result."""
    if not lines:
        return ""
    sentence = lines[0]
    for raw_line in lines[1:]:
        piece = raw_line.strip()
        if sentence.endswith("-"):
            # A trailing hyphen means the word continues on this line.
            sentence = sentence[:-1] + piece
        else:
            sentence = f"{sentence} {piece}"
    collapsed = re.sub(r" {2,}", " ", sentence)
    return collapsed.strip().upper()
# ─────────────────────────────────────────────
# CROP-BASED OCR RE-READ
# ─────────────────────────────────────────────
def reread_cluster_crop(image, bbox, reader,
                        padding_px=20, upscale_factor=2.5):
    """
    Re-run OCR on an upscaled, sharpened crop of *image* around *bbox*.

    Parameters:
        image: full page image (numpy array, as loaded by cv2).
        bbox: (x1, y1, x2, y2) region to re-read.
        reader: easyocr.Reader instance.
        padding_px: extra pixels added around the bbox before cropping.
        upscale_factor: crop enlargement factor before OCR.

    Returns the merged bubble text (via fix_hyphens) or None when the
    crop is empty or OCR finds nothing.
    """
    img_h, img_w = image.shape[:2]
    x1, y1, x2, y2 = bbox
    x1 = max(0, int(x1) - padding_px)
    y1 = max(0, int(y1) - padding_px)
    x2 = min(img_w, int(x2) + padding_px)
    y2 = min(img_h, int(y2) + padding_px)
    crop = image[y1:y2, x1:x2]
    if crop.size == 0:
        return None
    new_w = int(crop.shape[1] * upscale_factor)
    new_h = int(crop.shape[0] * upscale_factor)
    upscaled = cv2.resize(crop, (new_w, new_h),
                          interpolation=cv2.INTER_CUBIC)
    # Sharpening kernel to crisp glyph edges before OCR.
    kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])
    sharpened = cv2.filter2D(upscaled, -1, kernel)
    # BUGFIX/cleanup: easyocr's readtext accepts numpy arrays directly,
    # so the previous fixed-name temp file ("_temp_crop_ocr.png") is
    # unnecessary — it cost a disk round-trip and raced with any
    # concurrent run using the same working directory.
    crop_results = reader.readtext(sharpened, paragraph=False)
    if not crop_results:
        return None
    crop_results.sort(key=lambda r: r[0][0][1])  # top-to-bottom by y
    lines = [t.strip().upper() for _, t, _ in crop_results
             if t.strip()]
    return fix_hyphens(lines) if lines else None
# ─────────────────────────────────────────────
# AUTO GAP
# ─────────────────────────────────────────────
def compute_auto_gap(image_path, base_gap=18,
                     reference_width=750):
    """Scale base_gap with the image width relative to reference_width."""
    img = cv2.imread(image_path)
    if img is None:
        # Unreadable image: fall back to the unscaled default.
        return base_gap
    width = img.shape[1]
    auto_gap = base_gap * (width / reference_width)
    print(f" Image width: {width}px → auto gap: {auto_gap:.1f}px")
    return auto_gap
# ─────────────────────────────────────────────
# OCR QUALITY SCORE
# ─────────────────────────────────────────────
def ocr_quality_score(text):
    """Heuristic 0..1 score: letter density minus 0.2 per garbage pattern found."""
    if not text or len(text) < 2:
        return 0.0
    letter_count = sum(c.isalpha() for c in text)
    alpha_ratio = letter_count / len(text)
    garbage = [r",,", r"\.\.-", r"[^\w\s\'\!\?\.,-]{2,}"]
    penalty = 0.0
    for pattern in garbage:
        if re.search(pattern, text):
            penalty += 0.2
    score = alpha_ratio - penalty
    return max(0.0, min(1.0, score))
# ─────────────────────────────────────────────
# BUBBLE JSON EXPORT
# bbox_expand_ratio: grow bbox by this fraction
# of its own size in each direction to better
# approximate the full speech bubble boundary.
# ─────────────────────────────────────────────
def export_bubble_boxes(bbox_dict, ocr_quads_dict,
                        filepath="bubbles.json",
                        bbox_expand_ratio=0.35,
                        image_shape=None):
    """
    Write per-bubble bounding boxes to a JSON file.

    Parameters:
        bbox_dict: {bubble_id: (x1, y1, x2, y2)} tight bboxes.
        ocr_quads_dict: {bubble_id: [quad, ...]} raw OCR quads.
        filepath: output JSON path.
        bbox_expand_ratio: expansion fraction per side (see header).
        image_shape: when given, expanded boxes are clamped to it.

    Each JSON entry holds the expanded box (x/y/w/h), the original
    tight box, and the per-token quads for reference.
    """
    export = {}
    for bubble_id, (x1, y1, x2, y2) in bbox_dict.items():
        quads = ocr_quads_dict.get(bubble_id, [])
        # ── Expand bbox to approximate full bubble ────────────────
        w_orig = x2 - x1
        h_orig = y2 - y1
        pad_x = int(w_orig * bbox_expand_ratio)
        pad_y = int(h_orig * bbox_expand_ratio)
        # Clamp to image bounds if image_shape provided
        if image_shape is not None:
            img_h, img_w = image_shape[:2]
            ex1 = max(0, x1 - pad_x)
            ey1 = max(0, y1 - pad_y)
            ex2 = min(img_w, x2 + pad_x)
            ey2 = min(img_h, y2 + pad_y)
        else:
            ex1 = x1 - pad_x
            ey1 = y1 - pad_y
            ex2 = x2 + pad_x
            ey2 = y2 + pad_y
        # Compute each quad's tight bbox once (the original version
        # recomputed it four times per quad via quad_bbox()).
        quad_bboxes = []
        for q in quads:
            qxs = [pt[0] for pt in q]
            qys = [pt[1] for pt in q]
            qx1, qy1 = min(qxs), min(qys)
            qx2, qy2 = max(qxs), max(qys)
            quad_bboxes.append({
                "x": int(qx1),
                "y": int(qy1),
                "w": int(qx2 - qx1),
                "h": int(qy2 - qy1),
            })
        export[str(bubble_id)] = {
            "x" : int(ex1),
            "y" : int(ey1),
            "w" : int(ex2 - ex1),
            "h" : int(ey2 - ey1),
            # Original tight bbox kept for reference
            "x_tight" : int(x1),
            "y_tight" : int(y1),
            "w_tight" : int(w_orig),
            "h_tight" : int(h_orig),
            "quad_bboxes" : quad_bboxes,
            "quads": [[[int(pt[0]), int(pt[1])] for pt in quad]
                      for quad in quads],
        }
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(export, f, indent=2, ensure_ascii=False)
    print(f"\n📦 Bubble boxes saved → {filepath}")
    for bid, v in export.items():
        print(f" #{bid}: expanded=({v['x']},{v['y']}) "
              f"{v['w']}×{v['h']}px "
              f"tight={v['w_tight']}×{v['h_tight']}px "
              f"[{len(v['quads'])} quad(s)]")
# ─────────────────────────────────────────────
# OUTPUT.TXT WRITER
# Uses a pipe | as unambiguous delimiter
# Format: #ID|ORIGINAL|TRANSLATED
# ─────────────────────────────────────────────
def write_output(output_lines, filepath):
    """Write the pipe-delimited translation lines to *filepath* (UTF-8)."""
    joined = "\n".join(output_lines)
    with open(filepath, "w", encoding="utf-8") as handle:
        handle.write(joined)
    print(f"📄 Translations saved → {filepath}")
# ─────────────────────────────────────────────
# DEBUG IMAGE
# ─────────────────────────────────────────────
def save_debug_clusters(image_path, ocr_results,
                        bubble_dict, bbox_dict):
    """Render token quads and bubble bboxes onto the page → debug_clusters.png."""
    canvas = cv2.imread(image_path)
    if canvas is None:
        return
    # Fixed seed so the palette is stable across runs.
    np.random.seed(42)
    num_bubbles = max(bubble_dict.keys(), default=1)
    palette = [
        tuple(int(channel) for channel in row)
        for row in np.random.randint(50, 230, size=(num_bubbles + 2, 3))
    ]
    # Map each OCR line back to its bubble id (0 when unmatched).
    text_to_bubble = {
        line: bubble_id
        for bubble_id, lines in bubble_dict.items()
        for line in lines
    }
    for bbox, text, _ in ocr_results:
        bubble_id = text_to_bubble.get(text, 0)
        color = palette[(bubble_id - 1) % len(palette)]
        pts = np.array(bbox, dtype=np.int32)
        cv2.polylines(canvas, [pts], isClosed=True,
                      color=color, thickness=1)
    for bubble_id, (x1, y1, x2, y2) in bbox_dict.items():
        color = palette[(bubble_id - 1) % len(palette)]
        cv2.rectangle(canvas,
                      (int(x1), int(y1)),
                      (int(x2), int(y2)),
                      color, 2)
        cv2.putText(canvas, f"BOX#{bubble_id}",
                    (int(x1) + 2, int(y1) + 16),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.5, color, 2)
    cv2.imwrite("debug_clusters.png", canvas)
    print(" 🐛 debug_clusters.png saved")
# ─────────────────────────────────────────────
# CORE FUNCTION
# ─────────────────────────────────────────────
def translate_manga_text(
    image_path,
    source_lang="en",
    target_lang="ca",
    confidence_threshold=0.10,
    export_to_file=None,
    export_bubbles_to="bubbles.json",
    min_text_length=2,
    gap_px="auto",
    filter_sound_effects=True,
    quality_threshold=0.5,
    upscale_factor=2.5,
    bbox_padding=10,
    debug=False,
):
    """
    OCR a manga page, group detections into speech bubbles, translate
    each bubble, and optionally export the results.

    Parameters:
        image_path: page image to process.
        source_lang / target_lang: language codes for OCR/translation.
        confidence_threshold: detections below this confidence dropped.
        export_to_file: when set, write pipe-delimited translations here.
        export_bubbles_to: when set, write bubble bboxes as JSON here.
        min_text_length: shorter tokens are treated as noise.
        gap_px: grouping gap in pixels, or "auto" to scale with width.
        filter_sound_effects: drop onomatopoeia tokens when True.
        quality_threshold: bubbles scoring below this get a crop re-read.
        upscale_factor: crop upscale used by the re-read pass.
        bbox_padding: padding added around each bubble's bbox.
        debug: when True, also save debug_clusters.png.
    """
    # ── 1. Resolve gap ────────────────────────────────────────────
    if gap_px == "auto":
        resolved_gap = compute_auto_gap(image_path)
    else:
        resolved_gap = float(gap_px)
    # ── 2. Load full image ────────────────────────────────────────
    full_image = cv2.imread(image_path)
    if full_image is None:
        print(f"❌ Could not load image: {image_path}")
        return
    # ── 3. Initialize OCR ─────────────────────────────────────────
    print("\nLoading OCR model...")
    # easyocr has no Catalan model — fall back to English + Spanish.
    ocr_lang_list = ["en", "es"] if source_lang == "ca" \
        else [source_lang]
    reader = easyocr.Reader(ocr_lang_list)
    # ── 4. Initialize translator ──────────────────────────────────
    translator = GoogleTranslator(source=source_lang,
                                  target=target_lang)
    # ── 5. Run OCR ────────────────────────────────────────────────
    print(f"\nRunning OCR on: {image_path}")
    results = reader.readtext(image_path, paragraph=False)
    print(f" Raw detections: {len(results)}")
    # ── 6. Filter tokens ──────────────────────────────────────────
    filtered = []
    skipped = 0
    for bbox, text, confidence in results:
        cleaned = text.strip().upper()
        keep, category = should_keep_token(
            cleaned, confidence,
            confidence_threshold, min_text_length,
            filter_sound_effects
        )
        if keep:
            filtered.append((bbox, cleaned, confidence))
            if category == "punct":
                print(f" ✔ Punct kept: '{cleaned}'")
        else:
            # Label the first filter that rejects the token.
            tag = ("🔇 SFX" if is_sound_effect(cleaned) else
                   "🏷 Title" if is_title_text(cleaned) else
                   "🗑 Garbage" if is_garbage(cleaned) else
                   "✂️ Low-conf")
            print(f" {tag} skipped: '{cleaned}'")
            skipped += 1
    print(f"{len(filtered)} kept, {skipped} skipped.\n")
    if not filtered:
        print("⚠️ No text detected after filtering.")
        return
    # ── 7. Group by overlap ───────────────────────────────────────
    print(f"Grouping by overlap "
          f"(gap_px={resolved_gap:.1f}, "
          f"bbox_padding={bbox_padding}px)...")
    bubble_dict, bbox_dict, ocr_quads = group_quads_by_overlap(
        filtered,
        image_shape = full_image.shape,
        gap_px = resolved_gap,
        bbox_padding = bbox_padding,
    )
    print(f"{len(bubble_dict)} bubble(s) detected.\n")
    # ── 8. Debug ──────────────────────────────────────────────────
    if debug:
        save_debug_clusters(image_path, filtered,
                            bubble_dict, bbox_dict)
    # ── 9. Fix hyphens ────────────────────────────────────────────
    clean_bubbles = {
        i: fix_hyphens(lines)
        for i, lines in bubble_dict.items()
        if lines
    }
    # ── 10. Quality check + crop re-read ──────────────────────────
    print("Checking OCR quality per bubble...")
    for i, text in clean_bubbles.items():
        score = ocr_quality_score(text)
        status = "" if score >= quality_threshold else "🔁"
        print(f" #{i}: score={score:.2f} {status} "
              f"'{text[:55]}'")
        if score < quality_threshold:
            print(f" → Re-reading #{i} from crop...")
            reread = reread_cluster_crop(
                full_image, bbox_dict[i], reader,
                upscale_factor=upscale_factor,
            )
            if reread:
                print(f"'{reread}'")
                clean_bubbles[i] = reread
            else:
                print(" → Nothing found, keeping original.")
    # ── 11. Translate ─────────────────────────────────────────────
    # Output format (pipe-delimited, unambiguous):
    # #ID|ORIGINAL TEXT|TRANSLATED TEXT
    print()
    header = "BUBBLE|ORIGINAL|TRANSLATED"
    # BUGFIX: this was `"" * 80` (always the empty string) — the
    # divider character had been stripped as an invisible/unicode
    # artifact. Restored to an 80-char rule matching the file's
    # box-drawing style.
    divider = "─" * 80
    output_lines = [header, divider]
    translations = {}
    translated_count = 0
    print(f"{'BUBBLE':<8} {'ORIGINAL':<45} {'TRANSLATED'}")
    print(divider)
    for i in sorted(clean_bubbles.keys()):
        bubble_text = clean_bubbles[i].strip()
        if not bubble_text:
            continue
        try:
            result = translator.translate(bubble_text)
        except Exception as e:
            # Best-effort: record the failure instead of aborting the page.
            result = f"[Translation error: {e}]"
        if result is None:
            result = "[No translation returned]"
        result = result.upper()
        translations[i] = result
        translated_count += 1
        # Pipe-delimited line — safe regardless of text content
        output_lines.append(f"#{i}|{bubble_text}|{result}")
        print(f"#{i:<7} {bubble_text:<45} {result}")
    output_lines.append(divider)
    summary = (f"✅ Done! {translated_count} bubble(s) "
               f"translated, {skipped} detection(s) skipped.")
    output_lines.append(summary)
    print(divider)
    print(summary)
    # ── 12. Export translations ───────────────────────────────────
    if export_to_file:
        write_output(output_lines, export_to_file)
    # ── 13. Export bubble boxes ───────────────────────────────────
    if export_bubbles_to:
        export_bubble_boxes(
            bbox_dict,
            ocr_quads,
            filepath = export_bubbles_to,
            bbox_expand_ratio = 0.1, # ← tune this
            image_shape = full_image.shape,
        )
# ─────────────────────────────────────────────
# ENTRY POINT
# ─────────────────────────────────────────────
if __name__ == "__main__":
    # Translate one hard-coded page from English to Catalan.
    translate_manga_text(
        image_path = "002-page.jpg",
        source_lang = "en",
        target_lang = "ca",
        confidence_threshold = 0.10,
        min_text_length = 2,
        export_to_file = "output.txt",     # pipe-delimited translations
        export_bubbles_to = "bubbles.json",  # bubble bbox JSON
        gap_px = "auto",                   # scale grouping gap with image width
        filter_sound_effects = True,
        quality_threshold = 0.5,
        upscale_factor = 2.5,
        bbox_padding = 1,                  # tighter than the function default (10)
        debug = True,                      # also writes debug_clusters.png
    )