diff --git a/README.md b/README.md index e69de29..61328c9 100644 --- a/README.md +++ b/README.md @@ -0,0 +1,54 @@ +# Manga Translator OCR Pipeline + +A robust manga/comic OCR + translation pipeline with: + +- EasyOCR (default, reliable on macOS M1) +- Optional PaddleOCR (auto-fallback if unavailable) +- Bubble clustering and line-level boxes +- Robust reread pass (multi-preprocessing + slight rotation) +- Translation export + debug overlays + +--- + +## ✨ Features + +- OCR from raw manga pages +- Noise filtering (`BOX` debug artifacts, tiny garbage tokens, symbols) +- Speech bubble grouping +- Reading order estimation (`ltr` / `rtl`) +- Translation output (`output.txt`) +- Structured bubble metadata (`bubbles.json`) +- Visual debug output (`debug_clusters.png`) + +--- + +## 🧰 Requirements + +- macOS (Apple Silicon supported) +- Python **3.11** recommended +- Homebrew (for Python install) + +--- + +## 🚀 Setup (Python 3.11 venv) + +```bash +cd /path/to/manga-translator + +# 1) Create venv with 3.11 +/opt/homebrew/bin/python3.11 -m venv venv + +# 2) Activate +source venv/bin/activate + +# 3) Verify interpreter +python -V +# expected: Python 3.11.x + +# 4) Install dependencies +python -m pip install --upgrade pip setuptools wheel +python -m pip install -r requirements.txt + +# Optional Paddle runtime +python -m pip install paddlepaddle || true +``` diff --git a/fonts/ComicNeue-Bold.ttf b/fonts/ComicNeue-Bold.ttf new file mode 100755 index 0000000..91d871e Binary files /dev/null and b/fonts/ComicNeue-Bold.ttf differ diff --git a/fonts/ComicRelief-Bold.ttf b/fonts/ComicRelief-Bold.ttf new file mode 100755 index 0000000..7b86246 Binary files /dev/null and b/fonts/ComicRelief-Bold.ttf differ diff --git a/manga-renderer.py b/manga-renderer.py index ebc2c11..33da988 100644 --- a/manga-renderer.py +++ b/manga-renderer.py @@ -1,509 +1,412 @@ -import os +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +manga-renderer.py + +Inputs: 001.jpg + bubbles.json + output.txt 
+Output: translated_page.png + +Strategy: + 1. For every bubble, white-fill all its OCR quads (erases original text cleanly) + 2. Render the translated text centered inside the bubble bounding box + 3. Bubbles in SKIP_BUBBLE_IDS are erased but NOT re-rendered (left blank) +""" + import json -import re +import textwrap import cv2 import numpy as np from PIL import Image, ImageDraw, ImageFont +from typing import Dict, List, Tuple, Optional, Set +# ============================================================ +# CONFIG โ€” edit these paths to match your setup +# ============================================================ +IMAGE_PATH = "003.jpg" +BUBBLES_PATH = "bubbles.json" +TRANSLATIONS_PATH = "output.txt" +OUTPUT_PATH = "translated_page_003.png" -# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ -# CONFIG -# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ -DEFAULT_FONT_CANDIDATES = [ - "fonts/ComicRelief-Regular.ttf", - "fonts/ComicNeue-Regular.ttf", +# Font candidates โ€” first one that loads wins +FONT_CANDIDATES = [ + "fonts/ComicNeue-Bold.ttf", ] -DEFAULT_FONT_COLOR = (0, 0, 0) -DEFAULT_STROKE_COLOR = (255, 255, 255) -MAX_FONT_SIZE = 20 -MIN_FONT_SIZE = 6 +FONT_SIZE = 20 +MIN_FONT_SIZE = 10 +QUAD_PAD = 4 # extra pixels added around each quad before white-fill -# Guarantee full wipe of yellow squares -YELLOW_BOX_PAD_X = 1 -YELLOW_BOX_PAD_Y = 1 -YELLOW_UNION_PAD_X = 4 -YELLOW_UNION_PAD_Y = 4 - -# Optional extra cleanup expansion -ENABLE_EXTRA_CLEAN = True -EXTRA_DILATE_ITERS = 1 -EXTRA_CLOSE_ITERS = 1 - -# Bubble detection (for optional extra mask / border preservation) -FLOOD_TOL = 30 - -# Border restoration: keep very conservative -ENABLE_EDGE_RESTORE = True -EDGE_RESTORE_DILATE = 1 - -# Text layout inside yellow-union -TEXT_INSET = 0.92 +# 
============================================================ +# SKIP LIST +# โ”€โ”€ Add any bubble IDs you do NOT want rendered here. +# โ”€โ”€ The quads will still be erased (white-filled) but no +# โ”€โ”€ translated text will be drawn inside them. +# โ”€โ”€ +# โ”€โ”€ Examples of why you'd skip a bubble: +# โ”€โ”€ โ€ข Sound effects (BURP, BAM, POW โ€ฆ) +# โ”€โ”€ โ€ข Untranslatable single characters +# โ”€โ”€ โ€ข Bubbles with bad OCR you want to fix manually later +# โ”€โ”€ โ€ข Narrator boxes you want to leave in the source language +# ============================================================ +SKIP_BUBBLE_IDS: Set[int] = { + # 8, # BURP BURP โ€” sound effect + # 2, # example: bad OCR, fix manually +} -# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ -# PARSERS -# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ -def parse_translations(translations_file): +# ============================================================ +# FONT LOADER +# ============================================================ +def load_font(path: str, size: int) -> Optional[ImageFont.FreeTypeFont]: + """Try every face index in a .ttc collection. 
Validate with getbbox.""" + indices = range(4) if path.lower().endswith(".ttc") else [0] + for idx in indices: + try: + font = ImageFont.truetype(path, size, index=idx) + font.getbbox("A") # raises if face metrics are broken + return font + except Exception: + continue + return None + + +def resolve_font() -> Tuple[str, ImageFont.FreeTypeFont]: + """Return (path, font) for the first working candidate.""" + for candidate in FONT_CANDIDATES: + font = load_font(candidate, FONT_SIZE) + if font is not None: + print(f" โœ… Font: {candidate}") + return candidate, font + print(" โš ๏ธ No TrueType font found โ€” using Pillow bitmap fallback") + return "", ImageFont.load_default() + + +# ============================================================ +# PARSE output.txt โ†’ {bid: translated_string} +# ============================================================ +def parse_translations(filepath: str) -> Dict[int, str]: + """ + Reads output.txt and returns {bubble_id: translated_text}. + Lines look like: #2|1|vision-base|ORIGINAL|TRANSLATED|FLAGS + """ translations = {} - originals = {} - flags_map = {} - - with open(translations_file, "r", encoding="utf-8") as f: + with open(filepath, "r", encoding="utf-8") as f: for line in f: line = line.strip() if not line.startswith("#"): continue - parts = line.split("|") + if len(parts) < 5: + continue try: - bubble_id = int(parts[0].lstrip("#")) - except Exception: + bid = int(parts[0].lstrip("#")) + translated = parts[4].strip() + if translated and translated != "-": + translations[bid] = translated + except ValueError: continue - - if len(parts) >= 5: - original = parts[2].strip() - translated = parts[3].strip() - flags = parts[4].strip() - elif len(parts) >= 4: - original = parts[2].strip() - translated = parts[3].strip() - flags = "-" - elif len(parts) >= 3: - original = "" - translated = parts[2].strip() - flags = "-" - else: - continue - - if translated.startswith("["): - continue - - translations[bubble_id] = translated - 
originals[bubble_id] = original - flags_map[bubble_id] = flags - - return translations, originals, flags_map + return translations -def parse_bubbles(bubbles_file): - with open(bubbles_file, "r", encoding="utf-8") as f: - raw = json.load(f) - return {int(k): v for k, v in raw.items()} - - -# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ -# HELPERS -# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ -def clamp(v, lo, hi): - return max(lo, min(hi, v)) - - -def xywh_to_xyxy(box): - if not box: - return None - x = int(box.get("x", 0)) - y = int(box.get("y", 0)) - w = int(box.get("w", 0)) - h = int(box.get("h", 0)) - return (x, y, x + w, y + h) - - -def union_xyxy(boxes): - boxes = [b for b in boxes if b is not None] - if not boxes: - return None - x1 = min(b[0] for b in boxes) - y1 = min(b[1] for b in boxes) - x2 = max(b[2] for b in boxes) - y2 = max(b[3] for b in boxes) - if x2 <= x1 or y2 <= y1: - return None - return (x1, y1, x2, y2) - - -def bbox_from_mask(mask): - ys, xs = np.where(mask > 0) - if len(xs) == 0: - return None - return (int(xs.min()), int(ys.min()), int(xs.max()) + 1, int(ys.max()) + 1) - - -def normalize_text(s): - t = s.upper().strip() - t = re.sub(r"[^\w]+", "", t) - return t - - -def is_sfx_like(text): - t = normalize_text(text) - return bool(len(t) <= 8 and re.fullmatch(r"(SHA+|BIP+|BEEP+|HN+|AH+|OH+)", t)) - - -# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ -# FONT -# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ -def load_font_from_candidates(candidates, size): - for path in candidates: - if path and os.path.exists(path): - try: - return ImageFont.truetype(path, size), path - except 
Exception: - continue - return ImageFont.load_default(), "PIL_DEFAULT" - - -def measure_text(draw, text, font): - bb = draw.textbbox((0, 0), text, font=font) - return bb[2] - bb[0], bb[3] - bb[1] - - -def wrap_text(draw, text, font, max_width): - words = text.split() - lines = [] - cur = "" - - for w in words: - test = (cur + " " + w).strip() - tw, _ = measure_text(draw, test, font) - if tw <= max_width or not cur: - cur = test - else: - lines.append(cur) - cur = w - if cur: - lines.append(cur) - - if not lines: - return [""], 0, 0 - - widths = [] - heights = [] - for ln in lines: - lw, lh = measure_text(draw, ln, font) - widths.append(lw) - heights.append(lh) - - gap = max(2, heights[0] // 5) - total_h = sum(heights) + gap * (len(lines) - 1) - return lines, total_h, max(widths) - - -def fit_font(draw, text, font_candidates, safe_w, safe_h): - for size in range(MAX_FONT_SIZE, MIN_FONT_SIZE - 1, -1): - font, _ = load_font_from_candidates(font_candidates, size) - lines, total_h, max_w = wrap_text(draw, text, font, safe_w) - if total_h <= safe_h and max_w <= safe_w: - return font, lines, total_h - - font, _ = load_font_from_candidates(font_candidates, MIN_FONT_SIZE) - lines, total_h, _ = wrap_text(draw, text, font, safe_w) - return font, lines, total_h - - -def draw_text_with_stroke(draw, pos, text, font, fill, stroke_fill): - x, y = pos - _, h = measure_text(draw, text, font) - sw = 2 if h <= 11 else 1 - - for dx in range(-sw, sw + 1): - for dy in range(-sw, sw + 1): - if dx == 0 and dy == 0: - continue - draw.text((x + dx, y + dy), text, font=font, fill=stroke_fill) - - draw.text((x, y), text, font=font, fill=fill) - - -# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ -# MASK BUILDERS -# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ -def build_yellow_mask(bubble_data, img_h, img_w): +# 
============================================================ +# PARSE bubbles.json โ†’ bubble_boxes, quads_per_bubble +# ============================================================ +def parse_bubbles(filepath: str): """ - HARD GUARANTEE: - Returned mask always covers all yellow squares (line_bboxes). + Returns: + bubble_boxes : {bid: (x1, y1, x2, y2)} + quads_per_bubble : {bid: [ [[x,y],[x,y],[x,y],[x,y]], ... ]} """ - mask = np.zeros((img_h, img_w), dtype=np.uint8) + with open(filepath, "r", encoding="utf-8") as f: + data = json.load(f) - # Preferred: exact line boxes - line_boxes = bubble_data.get("line_bboxes", []) - for lb in line_boxes: - b = xywh_to_xyxy(lb) - if not b: + bubble_boxes = {} + quads_per_bubble = {} + + for key, val in data.items(): + bid = int(key) + + x1 = val["x"]; y1 = val["y"] + x2 = x1 + val["w"]; y2 = y1 + val["h"] + bubble_boxes[bid] = (x1, y1, x2, y2) + + quads_per_bubble[bid] = val.get("quads", []) + + return bubble_boxes, quads_per_bubble + + +# ============================================================ +# ERASE โ€” white-fill every OCR quad (with small padding) +# ============================================================ +def erase_quads( + image_bgr, + quads_per_bubble: Dict[int, List], + translations: Dict[int, str], # โ† NEW: only erase what we'll render + skip_ids: Set[int], + pad: int = QUAD_PAD +): + """ + White-fills OCR quads ONLY for bubbles that: + - have a translation in output.txt AND + - are NOT in SKIP_BUBBLE_IDS + Everything else is left completely untouched. 
+ """ + ih, iw = image_bgr.shape[:2] + result = image_bgr.copy() + + erased_count = 0 + skipped_count = 0 + + for bid, quads in quads_per_bubble.items(): + + # ignore if explicitly skipped + if bid in skip_ids: + skipped_count += 1 continue - x1, y1, x2, y2 = b - x1 -= YELLOW_BOX_PAD_X - y1 -= YELLOW_BOX_PAD_Y - x2 += YELLOW_BOX_PAD_X - y2 += YELLOW_BOX_PAD_Y - x1 = clamp(x1, 0, img_w - 1) - y1 = clamp(y1, 0, img_h - 1) - x2 = clamp(x2, 1, img_w) - y2 = clamp(y2, 1, img_h) - if x2 > x1 and y2 > y1: - cv2.rectangle(mask, (x1, y1), (x2, y2), 255, -1) - # If no line boxes available, use line_union fallback - if np.count_nonzero(mask) == 0: - ub = xywh_to_xyxy(bubble_data.get("line_union_bbox")) - if ub: - x1, y1, x2, y2 = ub - x1 -= YELLOW_UNION_PAD_X - y1 -= YELLOW_UNION_PAD_Y - x2 += YELLOW_UNION_PAD_X - y2 += YELLOW_UNION_PAD_Y - x1 = clamp(x1, 0, img_w - 1) - y1 = clamp(y1, 0, img_h - 1) - x2 = clamp(x2, 1, img_w) - y2 = clamp(y2, 1, img_h) - if x2 > x1 and y2 > y1: - cv2.rectangle(mask, (x1, y1), (x2, y2), 255, -1) + # ignore if no translation exists (deleted from output.txt) + if bid not in translations: + skipped_count += 1 + continue - # Last fallback: text_bbox - if np.count_nonzero(mask) == 0: - tb = xywh_to_xyxy(bubble_data.get("text_bbox")) - if tb: - x1, y1, x2, y2 = tb - x1 -= YELLOW_UNION_PAD_X - y1 -= YELLOW_UNION_PAD_Y - x2 += YELLOW_UNION_PAD_X - y2 += YELLOW_UNION_PAD_Y - x1 = clamp(x1, 0, img_w - 1) - y1 = clamp(y1, 0, img_h - 1) - x2 = clamp(x2, 1, img_w) - y2 = clamp(y2, 1, img_h) - if x2 > x1 and y2 > y1: - cv2.rectangle(mask, (x1, y1), (x2, y2), 255, -1) + for quad in quads: + pts = np.array(quad, dtype=np.int32) + cv2.fillPoly(result, [pts], (255, 255, 255)) - return mask + xs = [p[0] for p in quad]; ys = [p[1] for p in quad] + x1 = max(0, min(xs) - pad) + y1 = max(0, min(ys) - pad) + x2 = min(iw - 1, max(xs) + pad) + y2 = min(ih - 1, max(ys) + pad) + cv2.rectangle(result, (x1, y1), (x2, y2), (255, 255, 255), -1) + + erased_count += 1 + + 
print(f" Erased : {erased_count} bubbles") + print(f" Ignored: {skipped_count} bubbles (no translation or in skip list)") + return result -def bubble_interior_mask(img_bgr, bubble_data): +# ============================================================ +# FONT SIZING + TEXT WRAP +# ============================================================ +def fit_text( + text: str, + box_w: int, + box_h: int, + font_path: str, + max_size: int = FONT_SIZE, + min_size: int = MIN_FONT_SIZE +) -> Tuple[int, ImageFont.FreeTypeFont, List[str]]: """ - Optional helper to expand clean region safely; never used to shrink yellow coverage. + Returns (fitted_size, font, wrapped_lines) โ€” largest size where + the text block fits inside box_w ร— box_h. """ - h, w = img_bgr.shape[:2] + for size in range(max_size, min_size - 1, -1): + font = load_font(font_path, size) if font_path else None + if font is None: + return min_size, ImageFont.load_default(), [text] - panel = xywh_to_xyxy(bubble_data.get("panel_bbox")) - if panel is None: - panel = (0, 0, w, h) - px1, py1, px2, py2 = panel + chars_per_line = max(1, int(box_w / (size * 0.62))) + wrapped = textwrap.fill(text, width=chars_per_line) + lines = wrapped.split("\n") + total_h = (size + 8) * len(lines) - seed = bubble_data.get("seed_point", {}) - sx = int(seed.get("x", bubble_data.get("x", 0) + bubble_data.get("w", 1) // 2)) - sy = int(seed.get("y", bubble_data.get("y", 0) + bubble_data.get("h", 1) // 2)) - sx = clamp(sx, 1, w - 2) - sy = clamp(sy, 1, h - 2) + if total_h <= box_h - 8: + return size, font, lines - gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) - _, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY) + # Nothing fit โ€” use minimum size + font = load_font(font_path, min_size) if font_path else None + if font is None: + font = ImageFont.load_default() + chars_per_line = max(1, int(box_w / (min_size * 0.62))) + lines = textwrap.fill(text, width=chars_per_line).split("\n") + return min_size, font, lines - panel_bin = 
np.zeros_like(binary) - panel_bin[py1:py2, px1:px2] = binary[py1:py2, px1:px2] - # if seed on dark pixel, search nearby white - if gray[sy, sx] < 150: - found = False - search_r = max(2, min(bubble_data.get("w", 20), bubble_data.get("h", 20)) // 3) - for r in range(1, search_r + 1): - for dy in range(-r, r + 1): - for dx in range(-r, r + 1): - nx, ny = sx + dx, sy + dy - if px1 <= nx < px2 and py1 <= ny < py2 and gray[ny, nx] >= 200: - sx, sy = nx, ny - found = True - break - if found: - break - if found: - break - - if not found: - m = np.zeros((h, w), dtype=np.uint8) - bx = bubble_data.get("x", 0) - by = bubble_data.get("y", 0) - bw = bubble_data.get("w", 20) - bh = bubble_data.get("h", 20) - cv2.ellipse(m, (bx + bw // 2, by + bh // 2), (max(4, bw // 2), max(4, bh // 2)), 0, 0, 360, 255, -1) - return m - - ff_mask = np.zeros((h + 2, w + 2), dtype=np.uint8) - flood = panel_bin.copy() - cv2.floodFill( - flood, ff_mask, (sx, sy), 255, - loDiff=FLOOD_TOL, upDiff=FLOOD_TOL, - flags=cv2.FLOODFILL_FIXED_RANGE +# ============================================================ +# COLOR HELPERS +# ============================================================ +def sample_bg_color( + image_bgr, + x1: int, y1: int, + x2: int, y2: int +) -> Tuple[int, int, int]: + """Sample four corners of a bubble to estimate background color (R, G, B).""" + ih, iw = image_bgr.shape[:2] + samples = [] + for sx, sy in [(x1+4, y1+4), (x2-4, y1+4), (x1+4, y2-4), (x2-4, y2-4)]: + sx = max(0, min(iw-1, sx)); sy = max(0, min(ih-1, sy)) + b, g, r = image_bgr[sy, sx] + samples.append((int(r), int(g), int(b))) + return ( + int(np.median([s[0] for s in samples])), + int(np.median([s[1] for s in samples])), + int(np.median([s[2] for s in samples])), ) - m = (ff_mask[1:-1, 1:-1] * 255).astype(np.uint8) - m = cv2.morphologyEx(m, cv2.MORPH_CLOSE, np.ones((3, 3), np.uint8), iterations=1) - return m + +def pick_fg_color(bg: Tuple[int, int, int]) -> Tuple[int, int, int]: + lum = 0.299 * bg[0] + 0.587 * bg[1] + 
0.114 * bg[2] + return (0, 0, 0) if lum > 128 else (255, 255, 255) -def build_clean_mask(img_bgr, bubble_data): - """ - FINAL RULE: - clean_mask MUST cover yellow_mask completely. - """ - h, w = img_bgr.shape[:2] - yellow = build_yellow_mask(bubble_data, h, w) - - # start with guaranteed yellow - clean = yellow.copy() - - if ENABLE_EXTRA_CLEAN: - bubble_m = bubble_interior_mask(img_bgr, bubble_data) - extra = cv2.dilate(yellow, np.ones((3, 3), np.uint8), iterations=EXTRA_DILATE_ITERS) - extra = cv2.morphologyEx(extra, cv2.MORPH_CLOSE, np.ones((3, 3), np.uint8), iterations=EXTRA_CLOSE_ITERS) - extra = cv2.bitwise_and(extra, bubble_m) - - # IMPORTANT: union with yellow (never subtract yellow) - clean = cv2.bitwise_or(yellow, extra) - - # final guarantee (defensive) - clean = cv2.bitwise_or(clean, yellow) - - return clean, yellow +def safe_textbbox( + draw, pos, text, font +) -> Tuple[int, int, int, int]: + try: + return draw.textbbox(pos, text, font=font) + except Exception: + size = getattr(font, "size", 12) + return ( + pos[0], pos[1], + pos[0] + int(len(text) * size * 0.6), + pos[1] + int(size * 1.2) + ) -# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ -# DRAW BUBBLE -# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ -def draw_bubble( - pil_img, - img_bgr_ref, - bubble_data, - original_text, - translated_text, - font_candidates, - font_color, - stroke_color -): - if original_text and translated_text: - if normalize_text(original_text) == normalize_text(translated_text) and is_sfx_like(original_text): - return "skip_sfx" - - rgb = np.array(pil_img) - h, w = rgb.shape[:2] - - clean_mask, yellow_mask = build_clean_mask(img_bgr_ref, bubble_data) - if np.count_nonzero(clean_mask) == 0: - return "skip_no_area" - - # 1) FORCE white fill on clean mask (includes full yellow by guarantee) - 
rgb[clean_mask == 255] = [255, 255, 255] - - # 2) Optional edge restore, but NEVER overwrite yellow coverage - if ENABLE_EDGE_RESTORE: - bubble_m = bubble_interior_mask(img_bgr_ref, bubble_data) - edge = cv2.morphologyEx(bubble_m, cv2.MORPH_GRADIENT, np.ones((3, 3), np.uint8)) - edge = cv2.dilate(edge, np.ones((3, 3), np.uint8), iterations=EDGE_RESTORE_DILATE) - - # Don't restore where yellow exists (hard guarantee) - edge[yellow_mask == 255] = 0 - - orig_rgb = cv2.cvtColor(img_bgr_ref, cv2.COLOR_BGR2RGB) - rgb[edge == 255] = orig_rgb[edge == 255] - - pil_img.paste(Image.fromarray(rgb)) - - if not translated_text: - return "clean_only" - - # text region based on yellow area (exact requirement) - text_bbox = bbox_from_mask(yellow_mask) - if text_bbox is None: - text_bbox = bbox_from_mask(clean_mask) - if text_bbox is None: - return "skip_no_area" - - x1, y1, x2, y2 = text_bbox - - draw = ImageDraw.Draw(pil_img) - text_cx = int((x1 + x2) / 2) - text_cy = int((y1 + y2) / 2) - safe_w = max(16, int((x2 - x1) * TEXT_INSET)) - safe_h = max(16, int((y2 - y1) * TEXT_INSET)) - - font, lines, total_h = fit_font(draw, translated_text, font_candidates, safe_w, safe_h) - - y_cursor = int(round(text_cy - total_h / 2.0)) - for line in lines: - lw, lh = measure_text(draw, line, font) - x = text_cx - lw // 2 - draw_text_with_stroke(draw, (x, y_cursor), line, font, fill=font_color, stroke_fill=stroke_color) - y_cursor += lh + max(lh // 5, 2) - - return "rendered" - - -# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ -# MAIN -# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +# ============================================================ +# RENDER +# ============================================================ def render_translations( - input_image, - output_image, - translations_file, - bubbles_file, - 
font_candidates=DEFAULT_FONT_CANDIDATES, - font_color=DEFAULT_FONT_COLOR, - stroke_color=DEFAULT_STROKE_COLOR + image_bgr, + bubble_boxes: Dict[int, Tuple], + translations: Dict[int, str], + skip_ids: Set[int], + font_path: str, + font_size: int = FONT_SIZE, + bold_outline: bool = True, + auto_color: bool = True, + output_path: str = OUTPUT_PATH ): - img_bgr = cv2.imread(input_image) - if img_bgr is None: - raise FileNotFoundError(f"Cannot load image: {input_image}") + image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB) + pil_img = Image.fromarray(image_rgb) + draw = ImageDraw.Draw(pil_img) - img_pil = Image.fromarray(cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)) + rendered = 0 + skipped = 0 + missing = 0 - translations, originals, flags_map = parse_translations(translations_file) - bubbles = parse_bubbles(bubbles_file) + for bid, (x1, y1, x2, y2) in sorted(bubble_boxes.items()): - rendered, skipped = 0, 0 - - def sort_key(item): - bid, _ = item - b = bubbles.get(bid, {}) - return int(b.get("reading_order", bid)) - - for bubble_id, translated_text in sorted(translations.items(), key=sort_key): - if bubble_id not in bubbles: + # โ”€โ”€ skip list check โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + if bid in skip_ids: + print(f" โญ๏ธ Bubble #{bid:<3} โ€” skipped (in SKIP_BUBBLE_IDS)") skipped += 1 continue - bubble_data = bubbles[bubble_id] - original_text = originals.get(bubble_id, "") + text = translations.get(bid, "").strip() + if not text: + print(f" โš ๏ธ Bubble #{bid:<3} โ€” no translation found, left blank") + missing += 1 + continue - status = draw_bubble( - pil_img=img_pil, - img_bgr_ref=img_bgr, - bubble_data=bubble_data, - original_text=original_text, - translated_text=translated_text, - font_candidates=font_candidates, - font_color=font_color, - stroke_color=stroke_color + box_w = x2 - x1 + box_h = y2 - y1 + if box_w < 10 or box_h < 10: + continue + + # โ”€โ”€ fit font + wrap 
โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + size, font, lines = fit_text( + text, box_w, box_h, font_path, max_size=font_size ) - if status.startswith("skip"): - skipped += 1 + # โ”€โ”€ colors โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + if auto_color: + bg = sample_bg_color(image_bgr, x1, y1, x2, y2) + fg = pick_fg_color(bg) + ol = (255, 255, 255) if fg == (0, 0, 0) else (0, 0, 0) else: - rendered += 1 + fg, ol = (0, 0, 0), (255, 255, 255) - out_bgr = cv2.cvtColor(np.array(img_pil), cv2.COLOR_RGB2BGR) - cv2.imwrite(output_image, out_bgr) + # โ”€โ”€ vertical center โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + line_h = size + 8 + total_h = line_h * len(lines) + y_cur = y1 + max(4, (box_h - total_h) // 2) - print(f"โœ… Done โ€” {rendered} rendered, {skipped} skipped.") - print(f"๐Ÿ“„ Output โ†’ {output_image}") - print("Guarantee: full yellow-square area is always white-cleaned before drawing text.") + for line in lines: + bb = safe_textbbox(draw, (0, 0), line, font) + line_w = bb[2] - bb[0] + x_cur = x1 + max(2, (box_w - line_w) // 2) + + if bold_outline: + for dx, dy in [(-1, 0), (1, 0), (0, -1), (0, 1)]: + try: + draw.text((x_cur + dx, y_cur + dy), line, font=font, fill=ol) + except Exception: + pass + + try: + draw.text((x_cur, y_cur), line, font=font, fill=fg) + except Exception as e: + print(f" โŒ Draw error bubble #{bid}: {e}") + + y_cur += line_h + + print(f" โœ… Bubble #{bid:<3} โ€” rendered ({len(lines)} lines, size {size}px)") + rendered += 1 + + pil_img.save(output_path) + + print() + print(f"{'โ”€'*50}") + print(f" Rendered : {rendered}") + print(f" Skipped : {skipped} (SKIP_BUBBLE_IDS)") + print(f" No text : {missing} (not in output.txt)") + print(f"{'โ”€'*50}") + print(f"โœ… Saved โ†’ 
{output_path}") + + return pil_img + + +# ============================================================ +# MAIN +# ============================================================ +def main(): + print(f"๐Ÿ“– Loading image : {IMAGE_PATH}") + image = cv2.imread(IMAGE_PATH) + if image is None: + print(f"โŒ Cannot load: {IMAGE_PATH}"); return + + print(f"๐Ÿ“ฆ Loading bubbles : {BUBBLES_PATH}") + bubble_boxes, quads_per_bubble = parse_bubbles(BUBBLES_PATH) + print(f" {len(bubble_boxes)} bubbles | " + f"{sum(len(v) for v in quads_per_bubble.values())} quads total") + + print(f"๐ŸŒ Loading translations : {TRANSLATIONS_PATH}") + translations = parse_translations(TRANSLATIONS_PATH) + print(f" {len(translations)} translations found") + + if SKIP_BUBBLE_IDS: + print(f"โญ๏ธ Skip list : bubbles {sorted(SKIP_BUBBLE_IDS)}") + else: + print(f"โญ๏ธ Skip list : (empty โ€” all bubbles will be rendered)") + + print("๐Ÿ”ค Resolving font...") + font_path, _ = resolve_font() + + print(f"๐Ÿงน Erasing original text (quad fill + pad={QUAD_PAD}px)...") + clean_image = erase_quads( + image, + quads_per_bubble, + translations = translations, # โ† pass translations here + skip_ids = SKIP_BUBBLE_IDS, + pad = QUAD_PAD + ) + + print("โœ๏ธ Rendering translated text...") + render_translations( + image_bgr = clean_image, + bubble_boxes = bubble_boxes, + translations = translations, + skip_ids = SKIP_BUBBLE_IDS, + font_path = font_path, + font_size = FONT_SIZE, + bold_outline = True, + auto_color = True, + output_path = OUTPUT_PATH + ) if __name__ == "__main__": - render_translations( - input_image="001-page.png", - output_image="page_translated.png", - translations_file="output.txt", - bubbles_file="bubbles.json", - font_candidates=DEFAULT_FONT_CANDIDATES, - font_color=DEFAULT_FONT_COLOR, - stroke_color=DEFAULT_STROKE_COLOR - ) + main() diff --git a/manga-translator.py b/manga-translator.py index 7a95073..8aee6dc 100644 --- a/manga-translator.py +++ b/manga-translator.py @@ -6,13 +6,17 @@ import 
re import json import cv2 import numpy as np +import warnings +from typing import List, Tuple, Dict, Any, Optional from deep_translator import GoogleTranslator -# OCR engines -import easyocr -from paddleocr import PaddleOCR +# macOS Native Vision imports +import Vision +import Quartz +from Foundation import NSData +warnings.filterwarnings("ignore", category=UserWarning) # ============================================================ # CONFIG @@ -26,7 +30,7 @@ GLOSSARY = { } SOUND_EFFECT_PATTERNS = [ - r"^b+i+p+$", r"^sha+$", r"^ha+$", r"^ah+$", r"^oh+$", + r"^b+i+p+$", r"^sha+$", r"^ha+$", r"^ah+$", r"^ugh+$", r"^bam+$", r"^pow+$", r"^boom+$", r"^bang+$", r"^crash+$", r"^thud+$", r"^zip+$", r"^swoosh+$", r"^chirp+$" ] @@ -47,13 +51,13 @@ TOP_BAND_RATIO = 0.08 # ============================================================ -# TEXT HELPERS +# HELPERS # ============================================================ def normalize_text(text: str) -> str: t = (text or "").strip().upper() - t = t.replace("โ€œ", "\"").replace("โ€", "\"") - t = t.replace("โ€™", "'").replace("โ€˜", "'") - t = t.replace("โ€ฆ", "...") + t = t.replace("\u201c", "\"").replace("\u201d", "\"") + t = t.replace("\u2018", "'").replace("\u2019", "'") + t = t.replace("\u2026", "...") t = re.sub(r"\s+", " ", t) t = re.sub(r"\s+([,.;:!?])", r"\1", t) t = re.sub(r"([ยกยฟ])\s+", r"\1", t) @@ -88,24 +92,35 @@ def is_title_text(text: str) -> bool: return any(re.fullmatch(p, t, re.IGNORECASE) for p in TITLE_PATTERNS) +def looks_like_box_tag(t: str) -> bool: + s = re.sub(r"[^A-Z0-9#]", "", (t or "").upper()) + if re.fullmatch(r"[BEF]?[O0D]X#?\d{0,3}", s): + return True + if re.fullmatch(r"B[O0D]X\d{0,3}", s): + return True + return False + + def is_noise_text(text: str) -> bool: t = (text or "").strip() + + # Explicitly allow standalone punctuation like ? or ! 
+ if re.fullmatch(r"[\?\!]+", t): + return False + if any(re.fullmatch(p, t) for p in NOISE_PATTERNS): return True - - if len(t) <= 2 and not re.search(r"[A-Z0-9]", t): + if looks_like_box_tag(t): + return True + if len(t) <= 2 and not re.search(r"[A-Z0-9\?\!]", t): return True symbol_ratio = sum(1 for c in t if not c.isalnum() and not c.isspace()) / max(1, len(t)) if len(t) <= 6 and symbol_ratio > 0.60: return True - return False -# ============================================================ -# GEOMETRY HELPERS -# ============================================================ def quad_bbox(quad): xs = [p[0] for p in quad] ys = [p[1] for p in quad] @@ -150,9 +165,6 @@ def overlap_or_near(a, b, gap=0): return gap_x <= gap and gap_y <= gap -# ============================================================ -# QUALITY -# ============================================================ def ocr_candidate_score(text: str) -> float: if not text: return 0.0 @@ -179,204 +191,98 @@ def ocr_candidate_score(text: str) -> float: # ============================================================ -# OCR ENGINE WRAPPER (PADDLE + EASYOCR HYBRID) +# OCR ENGINES (Apple Native Vision) # ============================================================ -class HybridOCR: - def __init__(self, source_lang="en", use_gpu=False): - self.source_lang = source_lang +class MacVisionDetector: + def __init__(self, source_lang="en"): + lang_map = {"en": "en-US", "es": "es-ES", "ca": "ca-ES", "fr": "fr-FR", "ja": "ja-JP"} + apple_lang = lang_map.get(source_lang, "en-US") + self.langs = [apple_lang] + print(f"โšก Using Apple Vision OCR (Language: {self.langs})") - # Paddle language choice (single lang for Paddle) - # For manga EN/ES pages, latin model is robust. 
- if source_lang in ("en", "es", "ca", "fr", "de", "it", "pt"): - paddle_lang = "latin" - elif source_lang in ("ja",): - paddle_lang = "japan" - elif source_lang in ("ko",): - paddle_lang = "korean" - elif source_lang in ("ch", "zh", "zh-cn", "zh-tw"): - paddle_lang = "ch" + def read(self, image_path_or_array): + if isinstance(image_path_or_array, str): + img = cv2.imread(image_path_or_array) else: - paddle_lang = "latin" + img = image_path_or_array - # EasyOCR language list - if source_lang == "ca": - easy_langs = ["es", "en"] - elif source_lang == "en": - easy_langs = ["en", "es"] - elif source_lang == "es": - easy_langs = ["es", "en"] - else: - easy_langs = [source_lang] + if img is None or img.size == 0: + return [] - self.paddle = PaddleOCR( - use_angle_cls=True, - lang=paddle_lang, - use_gpu=use_gpu, - show_log=False - ) - self.easy = easyocr.Reader(easy_langs, gpu=use_gpu) + ih, iw = img.shape[:2] - @staticmethod - def _paddle_to_std(result): - """ - Convert Paddle result to Easy-like: - [ (quad, text, conf), ... ] - """ - out = [] - # paddle.ocr(...) 
returns list per image - # each item line: [ [ [x,y],...4pts ], (text, conf) ] - if not result: - return out - # result can be [None] or nested list - blocks = result if isinstance(result, list) else [result] - for blk in blocks: - if blk is None: - continue - if len(blk) == 0: - continue - # some versions wrap once more - if isinstance(blk[0], list) and len(blk[0]) > 0 and isinstance(blk[0][0], (list, tuple)) and len(blk[0]) == 2: - lines = blk - elif isinstance(blk[0], (list, tuple)) and len(blk[0]) >= 2: - lines = blk - else: - # maybe nested once more - if len(blk) == 1 and isinstance(blk[0], list): - lines = blk[0] - else: - lines = [] + success, buffer = cv2.imencode('.png', img) + if not success: + return [] - for ln in lines: - try: - pts, rec = ln - txt, conf = rec[0], float(rec[1]) - quad = [[float(p[0]), float(p[1])] for p in pts] - out.append((quad, txt, conf)) - except Exception: - continue - return out + ns_data = NSData.dataWithBytes_length_(buffer.tobytes(), len(buffer.tobytes())) + handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(ns_data, None) + results = [] - def read_full_image(self, image_path): - """ - Primary: Paddle - Fallback merge: EasyOCR - Returns merged standardized detections. 
- """ - # Paddle - pr = self.paddle.ocr(image_path, cls=True) - paddle_det = self._paddle_to_std(pr) + def completion_handler(request, error): + if error: + print(f"Vision API Error: {error}") + return - # Easy - easy_det = self.easy.readtext(image_path, paragraph=False) + for observation in request.results(): + candidate = observation.topCandidates_(1)[0] + text = candidate.string() + confidence = candidate.confidence() - # Merge by IOU/text proximity - merged = list(paddle_det) - for eb in easy_det: - eq, et, ec = eb - ebox = quad_bbox(eq) - keep = True - for pb in paddle_det: - pq, pt, pc = pb - pbox = quad_bbox(pq) + bbox = observation.boundingBox() + x = bbox.origin.x * iw + y_bottom_left = bbox.origin.y * ih + w = bbox.size.width * iw + h = bbox.size.height * ih - ix1 = max(ebox[0], pbox[0]); iy1 = max(ebox[1], pbox[1]) - ix2 = min(ebox[2], pbox[2]); iy2 = min(ebox[3], pbox[3]) - inter = max(0, ix2 - ix1) * max(0, iy2 - iy1) - a1 = max(1, (ebox[2] - ebox[0]) * (ebox[3] - ebox[1])) - a2 = max(1, (pbox[2] - pbox[0]) * (pbox[3] - pbox[1])) - iou = inter / float(a1 + a2 - inter) if (a1 + a2 - inter) > 0 else 0.0 + y = ih - y_bottom_left - h - if iou > 0.55: - # if overlapped and paddle exists, keep paddle unless easy much higher conf - if float(ec) > float(pc) + 0.20: - # replace paddle with easy-like entry - try: - merged.remove(pb) - except Exception: - pass - merged.append((eq, et, float(ec))) - keep = False - break + quad = [ + [int(x), int(y)], + [int(x + w), int(y)], + [int(x + w), int(y + h)], + [int(x), int(y + h)] + ] - if keep: - merged.append((eq, et, float(ec))) + results.append((quad, text, confidence)) - return merged + request = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(completion_handler) + request.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate) + request.setUsesLanguageCorrection_(True) + request.setRecognitionLanguages_(self.langs) - def read_array_with_both(self, arr_gray_or_bgr): - """ - OCR from array 
(used in robust reread pass). - Returns merged detections in standardized format. - """ - tmp = "_tmp_ocr_hybrid.png" - cv2.imwrite(tmp, arr_gray_or_bgr) - try: - pr = self.paddle.ocr(tmp, cls=True) - paddle_det = self._paddle_to_std(pr) - easy_det = self.easy.readtext(tmp, paragraph=False) + handler.performRequests_error_([request], None) - merged = list(paddle_det) - - for eb in easy_det: - eq, et, ec = eb - ebox = quad_bbox(eq) - keep = True - for pb in paddle_det: - pq, pt, pc = pb - pbox = quad_bbox(pq) - - ix1 = max(ebox[0], pbox[0]); iy1 = max(ebox[1], pbox[1]) - ix2 = min(ebox[2], pbox[2]); iy2 = min(ebox[3], pbox[3]) - inter = max(0, ix2 - ix1) * max(0, iy2 - iy1) - a1 = max(1, (ebox[2] - ebox[0]) * (ebox[3] - ebox[1])) - a2 = max(1, (pbox[2] - pbox[0]) * (pbox[3] - pbox[1])) - iou = inter / float(a1 + a2 - inter) if (a1 + a2 - inter) > 0 else 0.0 - - if iou > 0.55: - if float(ec) > float(pc) + 0.20: - try: - merged.remove(pb) - except Exception: - pass - merged.append((eq, et, float(ec))) - keep = False - break - - if keep: - merged.append((eq, et, float(ec))) - - return merged - finally: - if os.path.exists(tmp): - os.remove(tmp) + return results # ============================================================ -# PREPROCESS + ROBUST REREAD +# PREPROCESS # ============================================================ def preprocess_variant(crop_bgr, mode): gray = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2GRAY) if mode == "raw": return gray - if mode == "clahe": return cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)).apply(gray) - if mode == "adaptive": den = cv2.GaussianBlur(gray, (3, 3), 0) - return cv2.adaptiveThreshold( - den, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, - cv2.THRESH_BINARY, 35, 11 - ) - + return cv2.adaptiveThreshold(den, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 35, 11) if mode == "otsu": den = cv2.GaussianBlur(gray, (3, 3), 0) _, th = cv2.threshold(den, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) return th - if mode == "invert": 
return 255 - gray + if mode == "bilateral": + den = cv2.bilateralFilter(gray, 7, 60, 60) + _, th = cv2.threshold(den, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) + return th + if mode == "morph_open": + _, th = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) + k = np.ones((2, 2), np.uint8) + return cv2.morphologyEx(th, cv2.MORPH_OPEN, k) return gray @@ -389,22 +295,18 @@ def rotate_image_keep_bounds(img, angle_deg): new_w = int((h * sin) + (w * cos)) new_h = int((h * cos) + (w * sin)) - M[0, 2] += (new_w / 2) - c[0] M[1, 2] += (new_h / 2) - c[1] return cv2.warpAffine(img, M, (new_w, new_h), flags=cv2.INTER_CUBIC, borderValue=255) -def rebuild_text_from_ocr_result(res): +def rebuild_text_from_vision_result(res): if not res: return "" norm = [] - for item in res: - if len(item) != 3: - continue - bbox, txt, conf = item + for bbox, txt, conf in res: if not txt or not txt.strip(): continue b = quad_bbox(bbox) @@ -419,7 +321,7 @@ def rebuild_text_from_ocr_result(res): med_h = float(np.median([x[5] for x in norm])) row_tol = max(6.0, med_h * 0.75) - norm.sort(key=lambda z: z[4]) # y + norm.sort(key=lambda z: z[4]) rows = [] for it in norm: placed = False @@ -435,7 +337,7 @@ def rebuild_text_from_ocr_result(res): rows.sort(key=lambda r: r["yc"]) lines = [] for r in rows: - mem = sorted(r["m"], key=lambda z: z[3]) # x + mem = sorted(r["m"], key=lambda z: z[3]) line = normalize_text(" ".join(x[1] for x in mem)) if line: lines.append(line) @@ -443,57 +345,51 @@ def rebuild_text_from_ocr_result(res): return normalize_text(" ".join(lines)) -def reread_crop_robust(image, bbox, hybrid_ocr: HybridOCR, upscale=3.0, pad=24): - ih, iw = image.shape[:2] - x1, y1, x2, y2 = bbox - x1 = max(0, int(x1 - pad)) - y1 = max(0, int(y1 - pad)) - x2 = min(iw, int(x2 + pad)) - y2 = min(ih, int(y2 + pad)) - crop = image[y1:y2, x1:x2] +def reread_bubble_with_vision( + image_bgr, + bbox_xyxy, + vision_detector: MacVisionDetector, + upscale=3.0, + pad=24 +): + ih, iw = 
image_bgr.shape[:2] + x1, y1, x2, y2 = bbox_xyxy + x1 = max(0, int(x1 - pad)); y1 = max(0, int(y1 - pad)) + x2 = min(iw, int(x2 + pad)); y2 = min(ih, int(y2 + pad)) + + crop = image_bgr[y1:y2, x1:x2] if crop.size == 0: - return None, 0.0 + return None, 0.0, "none" - up = cv2.resize( - crop, - (int(crop.shape[1] * upscale), int(crop.shape[0] * upscale)), - interpolation=cv2.INTER_CUBIC - ) - - modes = ["raw", "clahe", "adaptive", "otsu", "invert"] + modes = ["raw", "clahe", "adaptive", "otsu", "invert", "bilateral", "morph_open"] angles = [0.0, 1.5, -1.5] - best_text, best_score = "", 0.0 + best_v_txt, best_v_sc = "", 0.0 + up0 = cv2.resize(crop, (int(crop.shape[1]*upscale), int(crop.shape[0]*upscale)), interpolation=cv2.INTER_CUBIC) for mode in modes: - proc = preprocess_variant(up, mode) - - if len(proc.shape) == 2: - proc3 = cv2.cvtColor(proc, cv2.COLOR_GRAY2BGR) - else: - proc3 = proc - + proc = preprocess_variant(up0, mode) + proc3 = cv2.cvtColor(proc, cv2.COLOR_GRAY2BGR) if len(proc.shape) == 2 else proc for a in angles: rot = rotate_image_keep_bounds(proc3, a) - res = hybrid_ocr.read_array_with_both(rot) - txt = rebuild_text_from_ocr_result(res) + res = vision_detector.read(rot) + txt = rebuild_text_from_vision_result(res) sc = ocr_candidate_score(txt) + if sc > best_v_sc: + best_v_txt, best_v_sc = txt, sc - if sc > best_score: - best_text, best_score = txt, sc + if best_v_txt: + return best_v_txt, best_v_sc, "vision-reread" - if not best_text: - return None, 0.0 - return best_text, best_score + return None, 0.0, "none" # ============================================================ -# LINE REBUILD + YELLOW BOXES +# LINES + BUBBLES # ============================================================ def build_lines_from_indices(indices, ocr): if not indices: return [] - items = [] for i in indices: b = quad_bbox(ocr[i][0]) @@ -526,7 +422,6 @@ def build_lines_from_indices(indices, ocr): txt = normalize_text(" ".join(ocr[i][1] for i, _, _, _ in mem)) if txt and not 
is_noise_text(txt): lines.append(txt) - return lines @@ -540,16 +435,10 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None): txt = normalize_text(ocr[i][1]) if is_noise_text(txt): continue - xc = (b[0] + b[2]) / 2.0 yc = (b[1] + b[3]) / 2.0 - w = max(1.0, b[2] - b[0]) h = max(1.0, b[3] - b[1]) - - items.append({ - "i": i, "b": b, "txt": txt, - "xc": xc, "yc": yc, "w": w, "h": h - }) + items.append({"i": i, "b": b, "txt": txt, "xc": xc, "yc": yc, "h": h}) if not items: return [] @@ -559,16 +448,8 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None): gap_x_tol = max(8.0, med_h * 1.25) pad = max(3, int(round(med_h * 0.22))) - def is_punct_like(t): - raw = (t or "").strip() - if raw == "": - return True - punct_ratio = sum(1 for c in raw if not c.isalnum()) / max(1, len(raw)) - return punct_ratio >= 0.5 or len(raw) <= 2 - - items_sorted = sorted(items, key=lambda x: x["yc"]) rows = [] - for it in items_sorted: + for it in sorted(items, key=lambda x: x["yc"]): placed = False for r in rows: if abs(it["yc"] - r["yc"]) <= row_tol: @@ -584,16 +465,12 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None): for r in rows: mem = sorted(r["m"], key=lambda z: z["xc"]) - normal = [t for t in mem if not is_punct_like(t["txt"])] - punct = [t for t in mem if is_punct_like(t["txt"])] - - if not normal: - normal = mem - punct = [] + if not mem: + continue chunks = [] - cur = [normal[0]] - for t in normal[1:]: + cur = [mem[0]] + for t in mem[1:]: prev = cur[-1]["b"] b = t["b"] gap = b[0] - prev[2] @@ -604,106 +481,26 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None): cur = [t] chunks.append(cur) - for p in punct: - pb = p["b"] - pxc, pyc = p["xc"], p["yc"] - best_k = -1 - best_score = 1e18 - - for k, ch in enumerate(chunks): - ub = boxes_union_xyxy([x["b"] for x in ch]) - cx = (ub[0] + ub[2]) / 2.0 - cy = (ub[1] + ub[3]) / 2.0 - dx = abs(pxc - cx) - dy = abs(pyc - cy) - score = dx + 1.8 * dy - - near = 
overlap_or_near(pb, ub, gap=int(med_h * 1.25)) - if near: - score -= med_h * 2.0 - - if score < best_score: - best_score = score - best_k = k - - if best_k >= 0: - chunks[best_k].append(p) - else: - chunks.append([p]) - for ch in chunks: ub = boxes_union_xyxy([x["b"] for x in ch]) if ub: x1, y1, x2, y2 = ub - pad_x = pad - pad_top = int(round(pad * 1.35)) - pad_bot = int(round(pad * 0.95)) - out_boxes.append((x1 - pad_x, y1 - pad_top, x2 + pad_x, y2 + pad_bot)) - - token_boxes = [it["b"] for it in items] - - def inside(tb, lb): - return tb[0] >= lb[0] and tb[1] >= lb[1] and tb[2] <= lb[2] and tb[3] <= lb[3] - - for tb in token_boxes: - if not any(inside(tb, lb) for lb in out_boxes): - x1, y1, x2, y2 = tb - pad_x = pad - pad_top = int(round(pad * 1.35)) - pad_bot = int(round(pad * 0.95)) - out_boxes.append((x1 - pad_x, y1 - pad_top, x2 + pad_x, y2 + pad_bot)) - - merged = [] - for b in out_boxes: - merged_into = False - for i, m in enumerate(merged): - ix1 = max(b[0], m[0]); iy1 = max(b[1], m[1]) - ix2 = min(b[2], m[2]); iy2 = min(b[3], m[3]) - inter = max(0, ix2 - ix1) * max(0, iy2 - iy1) - a1 = max(1, (b[2] - b[0]) * (b[3] - b[1])) - a2 = max(1, (m[2] - m[0]) * (m[3] - m[1])) - iou = inter / float(a1 + a2 - inter) if (a1 + a2 - inter) > 0 else 0.0 - if iou > 0.72: - merged[i] = boxes_union_xyxy([b, m]) - merged_into = True - break - if not merged_into: - merged.append(b) - - safe = [] - for (x1, y1, x2, y2) in merged: - w = x2 - x1 - h = y2 - y1 - if w < 28: - d = (28 - w) // 2 + 2 - x1 -= d; x2 += d - if h < 18: - d = (18 - h) // 2 + 2 - y1 -= d; y2 += d - safe.append((x1, y1, x2, y2)) - merged = safe + out_boxes.append((x1 - pad, y1 - int(round(pad*1.35)), x2 + pad, y2 + int(round(pad*0.95)))) if image_shape is not None: ih, iw = image_shape[:2] clamped = [] - for b in merged: - x1 = max(0, int(b[0])) - y1 = max(0, int(b[1])) - x2 = min(iw - 1, int(b[2])) - y2 = min(ih - 1, int(b[3])) + for b in out_boxes: + x1 = max(0, int(b[0])); y1 = max(0, int(b[1])) + x2 = 
min(iw - 1, int(b[2])); y2 = min(ih - 1, int(b[3])) if x2 > x1 and y2 > y1: clamped.append((x1, y1, x2, y2)) - merged = clamped - else: - merged = [(int(b[0]), int(b[1]), int(b[2]), int(b[3])) for b in merged] + out_boxes = clamped - merged.sort(key=lambda z: (z[1], z[0])) - return merged + out_boxes.sort(key=lambda z: (z[1], z[0])) + return out_boxes -# ============================================================ -# GROUPING -# ============================================================ def auto_gap(image_path, base=18, ref_w=750): img = cv2.imread(image_path) if img is None: @@ -750,21 +547,14 @@ def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=3): sorted_groups = sorted( groups.values(), - key=lambda idxs: ( - min(boxes[i][1] for i in idxs), - min(boxes[i][0] for i in idxs) - ) + key=lambda idxs: (min(boxes[i][1] for i in idxs), min(boxes[i][0] for i in idxs)) ) - bubbles = {} - bubble_boxes = {} - bubble_quads = {} - bubble_indices = {} - + bubbles, bubble_boxes, bubble_quads, bubble_indices = {}, {}, {}, {} ih, iw = image_shape[:2] + for bid, idxs in enumerate(sorted_groups, start=1): idxs = sorted(idxs, key=lambda k: boxes[k][1]) - lines = build_lines_from_indices(idxs, ocr) quads = [ocr[k][0] for k in idxs] ub = boxes_union_xyxy([quad_bbox(q) for q in quads]) @@ -772,10 +562,8 @@ def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=3): continue x1, y1, x2, y2 = ub - x1 = max(0, x1 - bbox_padding) - y1 = max(0, y1 - bbox_padding) - x2 = min(iw - 1, x2 + bbox_padding) - y2 = min(ih - 1, y2 + bbox_padding) + x1 = max(0, x1 - bbox_padding); y1 = max(0, y1 - bbox_padding) + x2 = min(iw - 1, x2 + bbox_padding); y2 = min(ih - 1, y2 + bbox_padding) bubbles[bid] = lines bubble_boxes[bid] = (x1, y1, x2, y2) @@ -786,37 +574,63 @@ def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=3): # ============================================================ -# DEBUG +# DEBUG / EXPORT # ============================================================ -def 
save_debug_clusters(image_path, ocr, bubble_boxes, bubble_indices, out_path="debug_clusters.png"): +def save_debug_clusters( + image_path, + ocr, + bubble_boxes, + bubble_indices, + clean_lines=None, + out_path="debug_clusters.png" +): img = cv2.imread(image_path) if img is None: return + # โ”€โ”€ FIX 1: white-fill each OCR quad before drawing its outline โ”€โ”€ for bbox, txt, conf in ocr: pts = np.array(bbox, dtype=np.int32) - cv2.polylines(img, [pts], True, (180, 180, 180), 1) + cv2.fillPoly(img, [pts], (255, 255, 255)) # โ† white background + cv2.polylines(img, [pts], True, (180, 180, 180), 1) # โ† grey outline for bid, bb in bubble_boxes.items(): x1, y1, x2, y2 = bb - cv2.rectangle(img, (x1, y1), (x2, y2), (0, 220, 0), 2) - cv2.putText( - img, f"BOX#{bid}", (x1 + 2, max(15, y1 + 16)), - cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 220, 0), 2 - ) - idxs = bubble_indices.get(bid, []) - line_boxes = build_line_boxes_from_indices(idxs, ocr, image_shape=img.shape) - for lb in line_boxes: - lx1, ly1, lx2, ly2 = lb - cv2.rectangle(img, (lx1, ly1), (lx2, ly2), (0, 255, 255), 3) + # Draw green bubble bounding box + ID label + cv2.rectangle(img, (x1, y1), (x2, y2), (0, 220, 0), 2) + cv2.putText(img, f"BOX#{bid}", (x1 + 2, max(15, y1 + 16)), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 220, 0), 2) + + # โ”€โ”€ FIX 2: yellow line-box drawing loop removed entirely โ”€โ”€โ”€โ”€ + + # Draw translated text overlay below each bubble box + if clean_lines and bid in clean_lines: + text = clean_lines[bid] + words = text.split() + lines = [] + current_line = "" + + for word in words: + if len(current_line) + len(word) < 25: + current_line += word + " " + else: + lines.append(current_line.strip()) + current_line = word + " " + if current_line: + lines.append(current_line.strip()) + + y_text = y2 + 18 + for line in lines: + cv2.putText(img, line, (x1, y_text), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 3) + cv2.putText(img, line, (x1, y_text), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1) + 
y_text += 18 cv2.imwrite(out_path, img) -# ============================================================ -# EXPORT -# ============================================================ def estimate_reading_order(bbox_dict, mode="ltr"): items = [] for bid, (x1, y1, x2, y2) in bbox_dict.items(): @@ -826,8 +640,7 @@ def estimate_reading_order(bbox_dict, mode="ltr"): items.sort(key=lambda t: t[2]) - rows = [] - tol = 90 + rows, tol = [], 90 for it in items: placed = False for r in rows: @@ -850,7 +663,6 @@ def estimate_reading_order(bbox_dict, mode="ltr"): def export_bubbles(filepath, bbox_dict, quads_dict, indices_dict, ocr, reading_map, image_shape): out = {} - for bid, bb in bbox_dict.items(): x1, y1, x2, y2 = bb quads = quads_dict.get(bid, []) @@ -870,9 +682,7 @@ def export_bubbles(filepath, bbox_dict, quads_dict, indices_dict, ocr, reading_m {"x": int(b[0]), "y": int(b[1]), "w": int(b[2] - b[0]), "h": int(b[3] - b[1])} for b in qboxes ], - "quads": [ - [[int(p[0]), int(p[1])] for p in q] for q in quads - ], + "quads": [[[int(p[0]), int(p[1])] for p in q] for q in quads], "text_bbox": xyxy_to_xywh(text_union), "line_bboxes": [xyxy_to_xywh(lb) for lb in line_boxes_xyxy], "line_union_bbox": xyxy_to_xywh(line_union_xyxy) if line_union_xyxy else None, @@ -884,10 +694,10 @@ def export_bubbles(filepath, bbox_dict, quads_dict, indices_dict, ocr, reading_m # ============================================================ -# MAIN PIPELINE +# PIPELINE # ============================================================ def translate_manga_text( - image_path, + image_path="001-page.png", source_lang="en", target_lang="ca", confidence_threshold=0.12, @@ -898,8 +708,7 @@ def translate_manga_text( export_to_file="output.txt", export_bubbles_to="bubbles.json", reading_mode="ltr", - debug=True, - use_gpu=False + debug=True ): image = cv2.imread(image_path) if image is None: @@ -908,12 +717,12 @@ def translate_manga_text( resolved_gap = auto_gap(image_path) if gap_px == "auto" else float(gap_px) - 
print("Loading Hybrid OCR (Paddle + EasyOCR)...") - hybrid = HybridOCR(source_lang=source_lang, use_gpu=use_gpu) + print("Loading OCR engines...") + detector = MacVisionDetector(source_lang=source_lang) - print("Running OCR...") - raw = hybrid.read_full_image(image_path) - print(f"Raw detections (merged): {len(raw)}") + print("Running detection OCR (Apple Vision)...") + raw = detector.read(image_path) + print(f"Raw detections: {len(raw)}") filtered = [] skipped = 0 @@ -924,25 +733,18 @@ def translate_manga_text( qb = quad_bbox(bbox) if conf < confidence_threshold: - skipped += 1 - continue + skipped += 1; continue if len(t) < min_text_length: - skipped += 1 - continue + skipped += 1; continue if is_noise_text(t): - skipped += 1 - continue + skipped += 1; continue if filter_sound_effects and is_sound_effect(t): - skipped += 1 - continue + skipped += 1; continue if is_title_text(t): - skipped += 1 - continue - + skipped += 1; continue if qb[1] < int(ih * TOP_BAND_RATIO): if conf < 0.70 and len(t) >= 5: - skipped += 1 - continue + skipped += 1; continue filtered.append((bbox, t, conf)) @@ -955,75 +757,80 @@ def translate_manga_text( filtered, image.shape, gap_px=resolved_gap, bbox_padding=3 ) + translator = GoogleTranslator(source=source_lang, target=target_lang) + + clean_lines: Dict[int, str] = {} + sources_used: Dict[int, str] = {} + + for bid, lines in bubbles.items(): + base_txt = normalize_text(" ".join(lines)) + base_sc = ocr_candidate_score(base_txt) + + txt = base_txt + src_used = "vision-base" + + if base_sc < quality_threshold: + rr_txt, rr_sc, rr_src = reread_bubble_with_vision( + image_bgr=image, + bbox_xyxy=bubble_boxes[bid], + vision_detector=detector, + upscale=3.0, + pad=24 + ) + if rr_txt and rr_sc > base_sc + 0.04: + txt = rr_txt + src_used = rr_src + + txt = txt.replace(" BOMPORTA", " IMPORTA") + txt = txt.replace(" TESTO ", " ESTO ") + txt = txt.replace(" MIVERDAD", " MI VERDAD") + + clean_lines[bid] = apply_glossary(normalize_text(txt)) + 
sources_used[bid] = src_used + + reading_map = estimate_reading_order(bubble_boxes, mode=reading_mode) + if debug: save_debug_clusters( image_path=image_path, ocr=filtered, bubble_boxes=bubble_boxes, bubble_indices=bubble_indices, + clean_lines=clean_lines, out_path="debug_clusters.png" ) - translator = GoogleTranslator(source=source_lang, target=target_lang) - - clean_lines = {} - for bid, lines in bubbles.items(): - base_txt = normalize_text(" ".join(lines)) - base_sc = ocr_candidate_score(base_txt) - - if base_sc < quality_threshold: - rr_txt, rr_sc = reread_crop_robust( - image, - bubble_boxes[bid], - hybrid, - upscale=3.0, - pad=24 - ) - if rr_txt and rr_sc > base_sc + 0.06: - txt = rr_txt - else: - txt = base_txt - else: - txt = base_txt - - txt = txt.replace(" BOMPORTA", " IMPORTA") - txt = txt.replace(" TESTO ", " ESTO ") - txt = txt.replace(" MIVERDAD", " MI VERDAD") - - clean_lines[bid] = apply_glossary(normalize_text(txt)) - - reading_map = estimate_reading_order(bubble_boxes, mode=reading_mode) - divider = "โ”€" * 120 - out_lines = ["BUBBLE|ORDER|ORIGINAL|TRANSLATED|FLAGS", divider] + out_lines = ["BUBBLE|ORDER|OCR_SOURCE|ORIGINAL|TRANSLATED|FLAGS", divider] print(divider) - print(f"{'BUBBLE':<8} {'ORDER':<6} {'ORIGINAL':<50} {'TRANSLATED':<50} FLAGS") + print(f"{'BUBBLE':<8} {'ORDER':<6} {'SOURCE':<12} {'ORIGINAL':<40} {'TRANSLATED':<40} FLAGS") print(divider) translated_count = 0 for bid in sorted(clean_lines.keys(), key=lambda x: reading_map.get(x, x)): - src = clean_lines[bid].strip() - if not src: + src_txt = clean_lines[bid].strip() + if not src_txt: continue flags = [] try: - tgt = translator.translate(src) or "" + tgt = translator.translate(src_txt) or "" except Exception as e: tgt = f"[Translation error: {e}]" flags.append("TRANSLATION_ERROR") tgt = apply_glossary(postprocess_translation_general(tgt)).upper() - src_u = src.upper() + src_u = src_txt.upper() + src_engine = sources_used.get(bid, "unknown") out_lines.append( - 
f"#{bid}|{reading_map.get(bid,bid)}|{src_u}|{tgt}|{','.join(flags) if flags else '-'}" + f"#{bid}|{reading_map.get(bid,bid)}|{src_engine}|{src_u}|{tgt}|{','.join(flags) if flags else '-'}" ) print( - f"#{bid:<7} {reading_map.get(bid,bid):<6} " - f"{src_u[:50]:<50} {tgt[:50]:<50} {','.join(flags) if flags else '-'}" + f"#{bid:<7} {reading_map.get(bid,bid):<6} {src_engine:<12} " + f"{src_u[:40]:<40} {tgt[:40]:<40} {','.join(flags) if flags else '-'}" ) translated_count += 1 @@ -1050,22 +857,18 @@ def translate_manga_text( print("Saved: debug_clusters.png") -# ============================================================ -# ENTRYPOINT -# ============================================================ if __name__ == "__main__": translate_manga_text( - image_path="001-page.png", - source_lang="it", + image_path="003.jpg", + source_lang="es", target_lang="ca", confidence_threshold=0.12, - min_text_length=1, + min_text_length=2, gap_px="auto", filter_sound_effects=True, quality_threshold=0.62, export_to_file="output.txt", export_bubbles_to="bubbles.json", reading_mode="ltr", - debug=True, - use_gpu=False + debug=True ) diff --git a/requirements b/requirements new file mode 100644 index 0000000..492c48d --- /dev/null +++ b/requirements @@ -0,0 +1,79 @@ +aistudio-sdk==0.3.8 +annotated-doc==0.0.4 +annotated-types==0.7.0 +anyio==4.13.0 +bce-python-sdk==0.9.70 +beautifulsoup4==4.14.3 +certifi==2026.2.25 +chardet==7.4.3 +charset-normalizer==3.4.7 +click==8.3.2 +colorlog==6.10.1 +crc32c==2.8 +deep-translator==1.11.4 +easyocr==1.7.2 +filelock==3.28.0 +fsspec==2026.3.0 +future==1.0.0 +h11==0.16.0 +hf-xet==1.4.3 +httpcore==1.0.9 +httpx==0.28.1 +huggingface_hub==1.10.2 +idna==3.11 +ImageIO==2.37.3 +imagesize==2.0.0 +Jinja2==3.1.6 +lazy-loader==0.5 +markdown-it-py==4.0.0 +MarkupSafe==3.0.3 +mdurl==0.1.2 +modelscope==1.35.4 +mpmath==1.3.0 +networkx==3.6.1 +ninja==1.13.0 +numpy==1.26.4 +opencv-contrib-python==4.10.0.84 +opencv-python==4.11.0.86 +opencv-python-headless==4.11.0.86 
+opt-einsum==3.3.0 +packaging==26.1 +paddleocr==3.4.1 +paddlepaddle==3.3.1 +paddlex==3.4.3 +pandas==3.0.2 +pillow==12.2.0 +prettytable==3.17.0 +protobuf==7.34.1 +psutil==7.2.2 +py-cpuinfo==9.0.0 +pyclipper==1.4.0 +pycryptodome==3.23.0 +pydantic==2.13.1 +pydantic_core==2.46.1 +Pygments==2.20.0 +pypdfium2==5.7.0 +python-bidi==0.6.7 +python-dateutil==2.9.0.post0 +PyYAML==6.0.2 +requests==2.33.1 +rich==15.0.0 +ruamel.yaml==0.19.1 +safetensors==0.7.0 +scikit-image==0.26.0 +scipy==1.17.1 +shapely==2.1.2 +shellingham==1.5.4 +six==1.17.0 +soupsieve==2.8.3 +sympy==1.14.0 +tifffile==2026.3.3 +torch==2.11.0 +torchvision==0.26.0 +tqdm==4.67.3 +typer==0.24.1 +typing-inspection==0.4.2 +typing_extensions==4.15.0 +ujson==5.12.0 +urllib3==2.6.3 +wcwidth==0.6.0 diff --git a/requirements.txt b/requirements.txt index eefc6d0..62d7a7d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,19 +1,12 @@ -# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ -# manga-translator + manga-renderer -# Python >= 3.9 recommended -# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ - -# Computer vision + image processing -opencv-python>=4.8.0 -numpy>=1.24.0 -Pillow>=10.0.0 - -# OCR engine (manga-translator) -manga-ocr>=0.1.8 - -# Translation (manga-translator) -deep-translator>=1.11.0 - -# HTTP / file handling used internally by manga-ocr -requests>=2.31.0 +numpy<2.0 +opencv-python>=4.8 +easyocr>=1.7.1 +deep-translator>=1.11.4 +manga-ocr>=0.1.14 +torch +torchvision +Pillow +transformers +fugashi +unidic-lite