First beta

This commit is contained in:
Guillem Hernandez Sola
2026-04-15 21:12:41 +02:00
parent 5ef8c39f69
commit dd1cf54f86
7 changed files with 736 additions and 905 deletions

View File

@@ -0,0 +1,53 @@
# Manga Translator OCR Pipeline
A robust manga/comic OCR + translation pipeline with:
- EasyOCR (default, reliable on macOS M1)
- Optional PaddleOCR (auto-fallback if unavailable)
- Bubble clustering and line-level boxes
- Robust reread pass (multi-preprocessing + slight rotation)
- Translation export + debug overlays
---
## ✨ Features
- OCR from raw manga pages
- Noise filtering (`BOX` debug artifacts, tiny garbage tokens, symbols)
- Speech bubble grouping
- Reading order estimation (`ltr` / `rtl`)
- Translation output (`output.txt`)
- Structured bubble metadata (`bubbles.json`)
- Visual debug output (`debug_clusters.png`)
---
## 🧰 Requirements
- macOS (Apple Silicon supported)
- Python **3.11** recommended
- Homebrew (for Python install)
---
## 🚀 Setup (Python 3.11 venv)
```bash
cd /path/to/manga-translator
# 1) Create venv with 3.11
/opt/homebrew/bin/python3.11 -m venv venv
# 2) Activate
source venv/bin/activate
# 3) Verify interpreter
python -V
# expected: Python 3.11.x
# 4) Install dependencies
python -m pip install --upgrade pip setuptools wheel
python -m pip install -r requirements.txt
# Optional Paddle runtime
python -m pip install paddlepaddle || true
```

BIN
fonts/ComicNeue-Bold.ttf Executable file

Binary file not shown.

BIN
fonts/ComicRelief-Bold.ttf Executable file

Binary file not shown.

View File

@@ -1,509 +1,412 @@
import os #!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
manga-renderer.py
Inputs: 001.jpg + bubbles.json + output.txt
Output: translated_page.png
Strategy:
1. For every bubble, white-fill all its OCR quads (erases original text cleanly)
2. Render the translated text centered inside the bubble bounding box
3. Bubbles in SKIP_BUBBLE_IDS are erased but NOT re-rendered (left blank)
"""
import json import json
import re import textwrap
import cv2 import cv2
import numpy as np import numpy as np
from PIL import Image, ImageDraw, ImageFont from PIL import Image, ImageDraw, ImageFont
from typing import Dict, List, Tuple, Optional, Set
# ============================================================
# CONFIG — edit these paths to match your setup
# ============================================================
IMAGE_PATH = "003.jpg"
BUBBLES_PATH = "bubbles.json"
TRANSLATIONS_PATH = "output.txt"
OUTPUT_PATH = "translated_page_003.png"
# ───────────────────────────────────────────── # Font candidates — first one that loads wins
# CONFIG FONT_CANDIDATES = [
# ───────────────────────────────────────────── "fonts/ComicNeue-Bold.ttf",
DEFAULT_FONT_CANDIDATES = [
"fonts/ComicRelief-Regular.ttf",
"fonts/ComicNeue-Regular.ttf",
] ]
DEFAULT_FONT_COLOR = (0, 0, 0)
DEFAULT_STROKE_COLOR = (255, 255, 255)
MAX_FONT_SIZE = 20 FONT_SIZE = 20
MIN_FONT_SIZE = 6 MIN_FONT_SIZE = 10
QUAD_PAD = 4 # extra pixels added around each quad before white-fill
# Guarantee full wipe of yellow squares # ============================================================
YELLOW_BOX_PAD_X = 1 # SKIP LIST
YELLOW_BOX_PAD_Y = 1 # ── Add any bubble IDs you do NOT want rendered here.
YELLOW_UNION_PAD_X = 4 # ── The quads will still be erased (white-filled) but no
YELLOW_UNION_PAD_Y = 4 # ── translated text will be drawn inside them.
# ──
# Optional extra cleanup expansion # ── Examples of why you'd skip a bubble:
ENABLE_EXTRA_CLEAN = True # ── • Sound effects (BURP, BAM, POW …)
EXTRA_DILATE_ITERS = 1 # ── • Untranslatable single characters
EXTRA_CLOSE_ITERS = 1 # ── • Bubbles with bad OCR you want to fix manually later
# ── • Narrator boxes you want to leave in the source language
# Bubble detection (for optional extra mask / border preservation) # ============================================================
FLOOD_TOL = 30 SKIP_BUBBLE_IDS: Set[int] = {
# 8, # BURP BURP — sound effect
# Border restoration: keep very conservative # 2, # example: bad OCR, fix manually
ENABLE_EDGE_RESTORE = True }
EDGE_RESTORE_DILATE = 1
# Text layout inside yellow-union
TEXT_INSET = 0.92
# ───────────────────────────────────────────── # ============================================================
# PARSERS # FONT LOADER
# ───────────────────────────────────────────── # ============================================================
def parse_translations(translations_file): def load_font(path: str, size: int) -> Optional[ImageFont.FreeTypeFont]:
"""Try every face index in a .ttc collection. Validate with getbbox."""
indices = range(4) if path.lower().endswith(".ttc") else [0]
for idx in indices:
try:
font = ImageFont.truetype(path, size, index=idx)
font.getbbox("A") # raises if face metrics are broken
return font
except Exception:
continue
return None
def resolve_font() -> Tuple[str, ImageFont.FreeTypeFont]:
    """Pick the first entry of FONT_CANDIDATES that loads at FONT_SIZE.

    Returns:
        A ``(path, font)`` pair. When no candidate loads, the path is the
        empty string and the font is Pillow's built-in default font.
    """
    for font_file in FONT_CANDIDATES:
        loaded = load_font(font_file, FONT_SIZE)
        if loaded is None:
            # This candidate could not be loaded; try the next one.
            continue
        print(f" ✅ Font: {font_file}")
        return font_file, loaded

    print(" ⚠️ No TrueType font found — using Pillow bitmap fallback")
    return "", ImageFont.load_default()
# ============================================================
# PARSE output.txt → {bid: translated_string}
# ============================================================
def parse_translations(filepath: str) -> Dict[int, str]:
"""
Reads output.txt and returns {bubble_id: translated_text}.
Lines look like: #2|1|vision-base|ORIGINAL|TRANSLATED|FLAGS
"""
translations = {} translations = {}
originals = {} with open(filepath, "r", encoding="utf-8") as f:
flags_map = {}
with open(translations_file, "r", encoding="utf-8") as f:
for line in f: for line in f:
line = line.strip() line = line.strip()
if not line.startswith("#"): if not line.startswith("#"):
continue continue
parts = line.split("|") parts = line.split("|")
if len(parts) < 5:
continue
try: try:
bubble_id = int(parts[0].lstrip("#")) bid = int(parts[0].lstrip("#"))
except Exception: translated = parts[4].strip()
if translated and translated != "-":
translations[bid] = translated
except ValueError:
continue continue
return translations
if len(parts) >= 5:
original = parts[2].strip()
translated = parts[3].strip()
flags = parts[4].strip()
elif len(parts) >= 4:
original = parts[2].strip()
translated = parts[3].strip()
flags = "-"
elif len(parts) >= 3:
original = ""
translated = parts[2].strip()
flags = "-"
else:
continue
if translated.startswith("["):
continue
translations[bubble_id] = translated
originals[bubble_id] = original
flags_map[bubble_id] = flags
return translations, originals, flags_map
def parse_bubbles(bubbles_file): # ============================================================
with open(bubbles_file, "r", encoding="utf-8") as f: # PARSE bubbles.json → bubble_boxes, quads_per_bubble
raw = json.load(f) # ============================================================
return {int(k): v for k, v in raw.items()} def parse_bubbles(filepath: str):
# ─────────────────────────────────────────────
# HELPERS
# ─────────────────────────────────────────────
def clamp(v, lo, hi):
    """Limit ``v`` to the range ``[lo, hi]``.

    The upper bound is applied first and the lower bound second, so when
    ``lo`` is greater than ``hi`` the result is ``lo``.
    """
    upper_bounded = min(hi, v)
    return max(lo, upper_bounded)
def xywh_to_xyxy(box):
    """Convert an ``{"x", "y", "w", "h"}`` mapping to ``(x1, y1, x2, y2)``.

    Missing keys default to 0 and every value is coerced with ``int``.
    Returns ``None`` when ``box`` is falsy (``None`` or an empty mapping).
    """
    if not box:
        return None
    left, top, width, height = (
        int(box.get(key, 0)) for key in ("x", "y", "w", "h")
    )
    return (left, top, left + width, top + height)
def union_xyxy(boxes):
    """Return the smallest ``(x1, y1, x2, y2)`` box enclosing all ``boxes``.

    ``None`` entries are ignored. Returns ``None`` when nothing remains or
    when the enclosing box has no positive width or height.
    """
    present = [box for box in boxes if box is not None]
    if not present:
        return None

    left = min(box[0] for box in present)
    top = min(box[1] for box in present)
    right = max(box[2] for box in present)
    bottom = max(box[3] for box in present)

    is_degenerate = right <= left or bottom <= top
    return None if is_degenerate else (left, top, right, bottom)
def bbox_from_mask(mask):
    """Return the bounding box of all positive pixels in ``mask``.

    The result is ``(x1, y1, x2, y2)`` with exclusive right/bottom edges
    (max index + 1), or ``None`` when no element of ``mask`` is > 0.
    """
    rows, cols = np.where(mask > 0)
    if cols.size == 0:
        return None
    left, right = int(cols.min()), int(cols.max())
    top, bottom = int(rows.min()), int(rows.max())
    return (left, top, right + 1, bottom + 1)
def normalize_text(s):
    """Upper-case ``s``, trim it, and drop every run of non-word characters.

    What survives is whatever ``\\w`` matches (letters, digits, underscore).
    """
    return re.sub(r"[^\w]+", "", s.upper().strip())
def is_sfx_like(text):
    """Tell whether ``text`` normalizes to a short onomatopoeia token.

    The normalized text must be at most 8 characters long and fully match
    one of the SHA+/BIP+/BEEP+/HN+/AH+/OH+ patterns.
    """
    token = normalize_text(text)
    if len(token) > 8:
        return False
    return re.fullmatch(r"(SHA+|BIP+|BEEP+|HN+|AH+|OH+)", token) is not None
# ─────────────────────────────────────────────
# FONT
# ─────────────────────────────────────────────
def load_font_from_candidates(candidates, size):
    """Load the first usable TrueType font from ``candidates`` at ``size``.

    Returns:
        ``(font, path)`` for the first candidate that exists on disk and
        loads without error; otherwise Pillow's default font paired with
        the marker string ``"PIL_DEFAULT"``.
    """
    for font_path in candidates:
        # Skip empty entries and files that are not present.
        if not font_path or not os.path.exists(font_path):
            continue
        try:
            loaded = ImageFont.truetype(font_path, size)
        except Exception:
            # Best effort: an unreadable font just means "try the next one".
            continue
        return loaded, font_path
    return ImageFont.load_default(), "PIL_DEFAULT"
def measure_text(draw, text, font):
    """Return ``(width, height)`` of ``text`` as reported by ``draw.textbbox``.

    The bounding box is requested at origin ``(0, 0)`` and the size is the
    difference between its opposite edges.
    """
    bounds = draw.textbbox((0, 0), text, font=font)
    width = bounds[2] - bounds[0]
    height = bounds[3] - bounds[1]
    return width, height
def wrap_text(draw, text, font, max_width):
    """Greedily wrap ``text`` on whitespace so lines stay within ``max_width``.

    A word that is too wide on its own still gets a line to itself.

    Returns:
        ``(lines, total_height, widest_line)``. The height includes a gap
        between lines of ``max(2, first_line_height // 5)``. Text with no
        words yields ``([""], 0, 0)``.
    """
    rows = []
    pending = ""
    for word in text.split():
        candidate = f"{pending} {word}".strip()
        candidate_w, _ = measure_text(draw, candidate, font)
        if pending and candidate_w > max_width:
            # Adding the word would overflow: close the current line.
            rows.append(pending)
            pending = word
        else:
            pending = candidate
    if pending:
        rows.append(pending)

    if not rows:
        return [""], 0, 0

    sizes = [measure_text(draw, row, font) for row in rows]
    row_widths = [size[0] for size in sizes]
    row_heights = [size[1] for size in sizes]

    spacing = max(2, row_heights[0] // 5)
    block_h = sum(row_heights) + spacing * (len(rows) - 1)
    return rows, block_h, max(row_widths)
def fit_font(draw, text, font_candidates, safe_w, safe_h):
    """Find the largest font size whose wrapped text fits ``safe_w`` x ``safe_h``.

    Sizes are tried from MAX_FONT_SIZE down to MIN_FONT_SIZE. If none fit,
    the text is wrapped at MIN_FONT_SIZE and returned even though it
    overflows.

    Returns:
        ``(font, lines, total_height)``.
    """
    for px in reversed(range(MIN_FONT_SIZE, MAX_FONT_SIZE + 1)):
        face, _ = load_font_from_candidates(font_candidates, px)
        rows, block_h, widest = wrap_text(draw, text, face, safe_w)
        fits = block_h <= safe_h and widest <= safe_w
        if fits:
            return face, rows, block_h

    # Nothing fit: fall back to the smallest allowed size.
    smallest, _ = load_font_from_candidates(font_candidates, MIN_FONT_SIZE)
    rows, block_h, _ = wrap_text(draw, text, smallest, safe_w)
    return smallest, rows, block_h
def draw_text_with_stroke(draw, pos, text, font, fill, stroke_fill):
    """Draw ``text`` at ``pos`` with an outline made of offset copies.

    The outline is painted first in ``stroke_fill`` at every offset within
    the stroke radius (2 px when the measured text height is <= 11,
    otherwise 1 px), then the text itself is drawn on top in ``fill``.
    """
    base_x, base_y = pos
    _, glyph_h = measure_text(draw, text, font)
    if glyph_h <= 11:
        radius = 2
    else:
        radius = 1

    span = range(-radius, radius + 1)
    for off_x in span:
        for off_y in span:
            if (off_x, off_y) != (0, 0):
                draw.text(
                    (base_x + off_x, base_y + off_y),
                    text,
                    font=font,
                    fill=stroke_fill,
                )
    draw.text((base_x, base_y), text, font=font, fill=fill)
# ─────────────────────────────────────────────
# MASK BUILDERS
# ─────────────────────────────────────────────
def build_yellow_mask(bubble_data, img_h, img_w):
""" """
HARD GUARANTEE: Returns:
Returned mask always covers all yellow squares (line_bboxes). bubble_boxes : {bid: (x1, y1, x2, y2)}
quads_per_bubble : {bid: [ [[x,y],[x,y],[x,y],[x,y]], ... ]}
""" """
mask = np.zeros((img_h, img_w), dtype=np.uint8) with open(filepath, "r", encoding="utf-8") as f:
data = json.load(f)
# Preferred: exact line boxes bubble_boxes = {}
line_boxes = bubble_data.get("line_bboxes", []) quads_per_bubble = {}
for lb in line_boxes:
b = xywh_to_xyxy(lb) for key, val in data.items():
if not b: bid = int(key)
x1 = val["x"]; y1 = val["y"]
x2 = x1 + val["w"]; y2 = y1 + val["h"]
bubble_boxes[bid] = (x1, y1, x2, y2)
quads_per_bubble[bid] = val.get("quads", [])
return bubble_boxes, quads_per_bubble
# ============================================================
# ERASE — white-fill every OCR quad (with small padding)
# ============================================================
def erase_quads(
image_bgr,
quads_per_bubble: Dict[int, List],
translations: Dict[int, str], # ← NEW: only erase what we'll render
skip_ids: Set[int],
pad: int = QUAD_PAD
):
"""
White-fills OCR quads ONLY for bubbles that:
- have a translation in output.txt AND
- are NOT in SKIP_BUBBLE_IDS
Everything else is left completely untouched.
"""
ih, iw = image_bgr.shape[:2]
result = image_bgr.copy()
erased_count = 0
skipped_count = 0
for bid, quads in quads_per_bubble.items():
# ignore if explicitly skipped
if bid in skip_ids:
skipped_count += 1
continue continue
x1, y1, x2, y2 = b
x1 -= YELLOW_BOX_PAD_X
y1 -= YELLOW_BOX_PAD_Y
x2 += YELLOW_BOX_PAD_X
y2 += YELLOW_BOX_PAD_Y
x1 = clamp(x1, 0, img_w - 1)
y1 = clamp(y1, 0, img_h - 1)
x2 = clamp(x2, 1, img_w)
y2 = clamp(y2, 1, img_h)
if x2 > x1 and y2 > y1:
cv2.rectangle(mask, (x1, y1), (x2, y2), 255, -1)
# If no line boxes available, use line_union fallback # ignore if no translation exists (deleted from output.txt)
if np.count_nonzero(mask) == 0: if bid not in translations:
ub = xywh_to_xyxy(bubble_data.get("line_union_bbox")) skipped_count += 1
if ub: continue
x1, y1, x2, y2 = ub
x1 -= YELLOW_UNION_PAD_X
y1 -= YELLOW_UNION_PAD_Y
x2 += YELLOW_UNION_PAD_X
y2 += YELLOW_UNION_PAD_Y
x1 = clamp(x1, 0, img_w - 1)
y1 = clamp(y1, 0, img_h - 1)
x2 = clamp(x2, 1, img_w)
y2 = clamp(y2, 1, img_h)
if x2 > x1 and y2 > y1:
cv2.rectangle(mask, (x1, y1), (x2, y2), 255, -1)
# Last fallback: text_bbox for quad in quads:
if np.count_nonzero(mask) == 0: pts = np.array(quad, dtype=np.int32)
tb = xywh_to_xyxy(bubble_data.get("text_bbox")) cv2.fillPoly(result, [pts], (255, 255, 255))
if tb:
x1, y1, x2, y2 = tb
x1 -= YELLOW_UNION_PAD_X
y1 -= YELLOW_UNION_PAD_Y
x2 += YELLOW_UNION_PAD_X
y2 += YELLOW_UNION_PAD_Y
x1 = clamp(x1, 0, img_w - 1)
y1 = clamp(y1, 0, img_h - 1)
x2 = clamp(x2, 1, img_w)
y2 = clamp(y2, 1, img_h)
if x2 > x1 and y2 > y1:
cv2.rectangle(mask, (x1, y1), (x2, y2), 255, -1)
return mask xs = [p[0] for p in quad]; ys = [p[1] for p in quad]
x1 = max(0, min(xs) - pad)
y1 = max(0, min(ys) - pad)
x2 = min(iw - 1, max(xs) + pad)
y2 = min(ih - 1, max(ys) + pad)
cv2.rectangle(result, (x1, y1), (x2, y2), (255, 255, 255), -1)
erased_count += 1
print(f" Erased : {erased_count} bubbles")
print(f" Ignored: {skipped_count} bubbles (no translation or in skip list)")
return result
def bubble_interior_mask(img_bgr, bubble_data): # ============================================================
# FONT SIZING + TEXT WRAP
# ============================================================
def fit_text(
text: str,
box_w: int,
box_h: int,
font_path: str,
max_size: int = FONT_SIZE,
min_size: int = MIN_FONT_SIZE
) -> Tuple[int, ImageFont.FreeTypeFont, List[str]]:
""" """
Optional helper to expand clean region safely; never used to shrink yellow coverage. Returns (fitted_size, font, wrapped_lines) — largest size where
the text block fits inside box_w × box_h.
""" """
h, w = img_bgr.shape[:2] for size in range(max_size, min_size - 1, -1):
font = load_font(font_path, size) if font_path else None
if font is None:
return min_size, ImageFont.load_default(), [text]
panel = xywh_to_xyxy(bubble_data.get("panel_bbox")) chars_per_line = max(1, int(box_w / (size * 0.62)))
if panel is None: wrapped = textwrap.fill(text, width=chars_per_line)
panel = (0, 0, w, h) lines = wrapped.split("\n")
px1, py1, px2, py2 = panel total_h = (size + 8) * len(lines)
seed = bubble_data.get("seed_point", {}) if total_h <= box_h - 8:
sx = int(seed.get("x", bubble_data.get("x", 0) + bubble_data.get("w", 1) // 2)) return size, font, lines
sy = int(seed.get("y", bubble_data.get("y", 0) + bubble_data.get("h", 1) // 2))
sx = clamp(sx, 1, w - 2)
sy = clamp(sy, 1, h - 2)
gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) # Nothing fit — use minimum size
_, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY) font = load_font(font_path, min_size) if font_path else None
if font is None:
font = ImageFont.load_default()
chars_per_line = max(1, int(box_w / (min_size * 0.62)))
lines = textwrap.fill(text, width=chars_per_line).split("\n")
return min_size, font, lines
panel_bin = np.zeros_like(binary)
panel_bin[py1:py2, px1:px2] = binary[py1:py2, px1:px2]
# if seed on dark pixel, search nearby white # ============================================================
if gray[sy, sx] < 150: # COLOR HELPERS
found = False # ============================================================
search_r = max(2, min(bubble_data.get("w", 20), bubble_data.get("h", 20)) // 3) def sample_bg_color(
for r in range(1, search_r + 1): image_bgr,
for dy in range(-r, r + 1): x1: int, y1: int,
for dx in range(-r, r + 1): x2: int, y2: int
nx, ny = sx + dx, sy + dy ) -> Tuple[int, int, int]:
if px1 <= nx < px2 and py1 <= ny < py2 and gray[ny, nx] >= 200: """Sample four corners of a bubble to estimate background color (R, G, B)."""
sx, sy = nx, ny ih, iw = image_bgr.shape[:2]
found = True samples = []
break for sx, sy in [(x1+4, y1+4), (x2-4, y1+4), (x1+4, y2-4), (x2-4, y2-4)]:
if found: sx = max(0, min(iw-1, sx)); sy = max(0, min(ih-1, sy))
break b, g, r = image_bgr[sy, sx]
if found: samples.append((int(r), int(g), int(b)))
break return (
int(np.median([s[0] for s in samples])),
if not found: int(np.median([s[1] for s in samples])),
m = np.zeros((h, w), dtype=np.uint8) int(np.median([s[2] for s in samples])),
bx = bubble_data.get("x", 0)
by = bubble_data.get("y", 0)
bw = bubble_data.get("w", 20)
bh = bubble_data.get("h", 20)
cv2.ellipse(m, (bx + bw // 2, by + bh // 2), (max(4, bw // 2), max(4, bh // 2)), 0, 0, 360, 255, -1)
return m
ff_mask = np.zeros((h + 2, w + 2), dtype=np.uint8)
flood = panel_bin.copy()
cv2.floodFill(
flood, ff_mask, (sx, sy), 255,
loDiff=FLOOD_TOL, upDiff=FLOOD_TOL,
flags=cv2.FLOODFILL_FIXED_RANGE
) )
m = (ff_mask[1:-1, 1:-1] * 255).astype(np.uint8)
m = cv2.morphologyEx(m, cv2.MORPH_CLOSE, np.ones((3, 3), np.uint8), iterations=1) def pick_fg_color(bg: Tuple[int, int, int]) -> Tuple[int, int, int]:
return m lum = 0.299 * bg[0] + 0.587 * bg[1] + 0.114 * bg[2]
return (0, 0, 0) if lum > 128 else (255, 255, 255)
def build_clean_mask(img_bgr, bubble_data): def safe_textbbox(
""" draw, pos, text, font
FINAL RULE: ) -> Tuple[int, int, int, int]:
clean_mask MUST cover yellow_mask completely. try:
""" return draw.textbbox(pos, text, font=font)
h, w = img_bgr.shape[:2] except Exception:
yellow = build_yellow_mask(bubble_data, h, w) size = getattr(font, "size", 12)
return (
# start with guaranteed yellow pos[0], pos[1],
clean = yellow.copy() pos[0] + int(len(text) * size * 0.6),
pos[1] + int(size * 1.2)
if ENABLE_EXTRA_CLEAN: )
bubble_m = bubble_interior_mask(img_bgr, bubble_data)
extra = cv2.dilate(yellow, np.ones((3, 3), np.uint8), iterations=EXTRA_DILATE_ITERS)
extra = cv2.morphologyEx(extra, cv2.MORPH_CLOSE, np.ones((3, 3), np.uint8), iterations=EXTRA_CLOSE_ITERS)
extra = cv2.bitwise_and(extra, bubble_m)
# IMPORTANT: union with yellow (never subtract yellow)
clean = cv2.bitwise_or(yellow, extra)
# final guarantee (defensive)
clean = cv2.bitwise_or(clean, yellow)
return clean, yellow
# ───────────────────────────────────────────── # ============================================================
# DRAW BUBBLE # RENDER
# ───────────────────────────────────────────── # ============================================================
def draw_bubble(
    pil_img,
    img_bgr_ref,
    bubble_data,
    original_text,
    translated_text,
    font_candidates,
    font_color,
    stroke_color
):
    """White-clean one bubble's text area and draw the translation into it.

    ``pil_img`` is modified in place (via ``paste``); ``img_bgr_ref`` is
    only read, as the source for mask building and edge restoration.

    Returns a status string:
        "skip_sfx"     - original and translation normalize to the same
                         SFX-like token; nothing is touched.
        "skip_no_area" - no mask area / no bounding box could be derived.
        "clean_only"   - area was cleaned but ``translated_text`` is empty.
        "rendered"     - area was cleaned and the text was drawn.
    """
    # Leave untranslated sound effects exactly as they are on the page.
    if original_text and translated_text:
        if normalize_text(original_text) == normalize_text(translated_text) and is_sfx_like(original_text):
            return "skip_sfx"
    rgb = np.array(pil_img)
    # NOTE(review): h and w are not used anywhere below.
    h, w = rgb.shape[:2]
    clean_mask, yellow_mask = build_clean_mask(img_bgr_ref, bubble_data)
    if np.count_nonzero(clean_mask) == 0:
        return "skip_no_area"
    # 1) FORCE white fill on clean mask (includes full yellow by guarantee)
    rgb[clean_mask == 255] = [255, 255, 255]
    # 2) Optional edge restore, but NEVER overwrite yellow coverage
    if ENABLE_EDGE_RESTORE:
        bubble_m = bubble_interior_mask(img_bgr_ref, bubble_data)
        # Morphological gradient of the interior mask yields its outline.
        edge = cv2.morphologyEx(bubble_m, cv2.MORPH_GRADIENT, np.ones((3, 3), np.uint8))
        edge = cv2.dilate(edge, np.ones((3, 3), np.uint8), iterations=EDGE_RESTORE_DILATE)
        # Don't restore where yellow exists (hard guarantee)
        edge[yellow_mask == 255] = 0
        # Copy the original pixels back along the outline.
        orig_rgb = cv2.cvtColor(img_bgr_ref, cv2.COLOR_BGR2RGB)
        rgb[edge == 255] = orig_rgb[edge == 255]
    # Write the cleaned pixels back into the caller's image.
    pil_img.paste(Image.fromarray(rgb))
    if not translated_text:
        return "clean_only"
    # text region based on yellow area (exact requirement)
    text_bbox = bbox_from_mask(yellow_mask)
    if text_bbox is None:
        # Fall back to the (possibly larger) clean mask.
        text_bbox = bbox_from_mask(clean_mask)
    if text_bbox is None:
        return "skip_no_area"
    x1, y1, x2, y2 = text_bbox
    draw = ImageDraw.Draw(pil_img)
    text_cx = int((x1 + x2) / 2)
    text_cy = int((y1 + y2) / 2)
    # Shrink the usable area by TEXT_INSET, but never below 16 px per side.
    safe_w = max(16, int((x2 - x1) * TEXT_INSET))
    safe_h = max(16, int((y2 - y1) * TEXT_INSET))
    font, lines, total_h = fit_font(draw, translated_text, font_candidates, safe_w, safe_h)
    # Start so that the whole text block is vertically centered.
    y_cursor = int(round(text_cy - total_h / 2.0))
    for line in lines:
        lw, lh = measure_text(draw, line, font)
        # Center each line horizontally on the box center.
        x = text_cx - lw // 2
        draw_text_with_stroke(draw, (x, y_cursor), line, font, fill=font_color, stroke_fill=stroke_color)
        y_cursor += lh + max(lh // 5, 2)
    return "rendered"
# ─────────────────────────────────────────────
# MAIN
# ─────────────────────────────────────────────
def render_translations( def render_translations(
input_image, image_bgr,
output_image, bubble_boxes: Dict[int, Tuple],
translations_file, translations: Dict[int, str],
bubbles_file, skip_ids: Set[int],
font_candidates=DEFAULT_FONT_CANDIDATES, font_path: str,
font_color=DEFAULT_FONT_COLOR, font_size: int = FONT_SIZE,
stroke_color=DEFAULT_STROKE_COLOR bold_outline: bool = True,
auto_color: bool = True,
output_path: str = OUTPUT_PATH
): ):
img_bgr = cv2.imread(input_image) image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
if img_bgr is None: pil_img = Image.fromarray(image_rgb)
raise FileNotFoundError(f"Cannot load image: {input_image}") draw = ImageDraw.Draw(pil_img)
img_pil = Image.fromarray(cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)) rendered = 0
skipped = 0
missing = 0
translations, originals, flags_map = parse_translations(translations_file) for bid, (x1, y1, x2, y2) in sorted(bubble_boxes.items()):
bubbles = parse_bubbles(bubbles_file)
rendered, skipped = 0, 0 # ── skip list check ────────────────────────────────────────
if bid in skip_ids:
def sort_key(item): print(f" ⏭️ Bubble #{bid:<3} — skipped (in SKIP_BUBBLE_IDS)")
bid, _ = item
b = bubbles.get(bid, {})
return int(b.get("reading_order", bid))
for bubble_id, translated_text in sorted(translations.items(), key=sort_key):
if bubble_id not in bubbles:
skipped += 1 skipped += 1
continue continue
bubble_data = bubbles[bubble_id] text = translations.get(bid, "").strip()
original_text = originals.get(bubble_id, "") if not text:
print(f" ⚠️ Bubble #{bid:<3} — no translation found, left blank")
missing += 1
continue
status = draw_bubble( box_w = x2 - x1
pil_img=img_pil, box_h = y2 - y1
img_bgr_ref=img_bgr, if box_w < 10 or box_h < 10:
bubble_data=bubble_data, continue
original_text=original_text,
translated_text=translated_text, # ── fit font + wrap ────────────────────────────────────────
font_candidates=font_candidates, size, font, lines = fit_text(
font_color=font_color, text, box_w, box_h, font_path, max_size=font_size
stroke_color=stroke_color
) )
if status.startswith("skip"): # ── colors ─────────────────────────────────────────────────
skipped += 1 if auto_color:
bg = sample_bg_color(image_bgr, x1, y1, x2, y2)
fg = pick_fg_color(bg)
ol = (255, 255, 255) if fg == (0, 0, 0) else (0, 0, 0)
else: else:
fg, ol = (0, 0, 0), (255, 255, 255)
# ── vertical center ────────────────────────────────────────
line_h = size + 8
total_h = line_h * len(lines)
y_cur = y1 + max(4, (box_h - total_h) // 2)
for line in lines:
bb = safe_textbbox(draw, (0, 0), line, font)
line_w = bb[2] - bb[0]
x_cur = x1 + max(2, (box_w - line_w) // 2)
if bold_outline:
for dx, dy in [(-1, 0), (1, 0), (0, -1), (0, 1)]:
try:
draw.text((x_cur + dx, y_cur + dy), line, font=font, fill=ol)
except Exception:
pass
try:
draw.text((x_cur, y_cur), line, font=font, fill=fg)
except Exception as e:
print(f" ❌ Draw error bubble #{bid}: {e}")
y_cur += line_h
print(f" ✅ Bubble #{bid:<3} — rendered ({len(lines)} lines, size {size}px)")
rendered += 1 rendered += 1
out_bgr = cv2.cvtColor(np.array(img_pil), cv2.COLOR_RGB2BGR) pil_img.save(output_path)
cv2.imwrite(output_image, out_bgr)
print(f"✅ Done — {rendered} rendered, {skipped} skipped.") print()
print(f"📄 Output → {output_image}") print(f"{''*50}")
print("Guarantee: full yellow-square area is always white-cleaned before drawing text.") print(f" Rendered : {rendered}")
print(f" Skipped : {skipped} (SKIP_BUBBLE_IDS)")
print(f" No text : {missing} (not in output.txt)")
print(f"{''*50}")
print(f"✅ Saved → {output_path}")
return pil_img
# ============================================================
# MAIN
# ============================================================
def main():
    """Run the full render: load inputs, erase source text, draw translations.

    Reads the page image, bubble metadata and translations from the
    module-level paths, resolves a font, white-fills the OCR quads of the
    bubbles that will be rendered, then writes the page to OUTPUT_PATH.
    Returns early (after printing an error) if the image cannot be loaded.
    """
    print(f"📖 Loading image : {IMAGE_PATH}")
    page_bgr = cv2.imread(IMAGE_PATH)
    if page_bgr is None:
        print(f"❌ Cannot load: {IMAGE_PATH}")
        return

    print(f"📦 Loading bubbles : {BUBBLES_PATH}")
    bubble_boxes, quads_per_bubble = parse_bubbles(BUBBLES_PATH)
    bubble_count = len(bubble_boxes)
    quad_count = sum(len(quads) for quads in quads_per_bubble.values())
    print(f" {bubble_count} bubbles | {quad_count} quads total")

    print(f"🌐 Loading translations : {TRANSLATIONS_PATH}")
    translations = parse_translations(TRANSLATIONS_PATH)
    print(f" {len(translations)} translations found")

    if SKIP_BUBBLE_IDS:
        print(f"⏭️ Skip list : bubbles {sorted(SKIP_BUBBLE_IDS)}")
    else:
        print(f"⏭️ Skip list : (empty — all bubbles will be rendered)")

    print("🔤 Resolving font...")
    font_path, _ = resolve_font()

    print(f"🧹 Erasing original text (quad fill + pad={QUAD_PAD}px)...")
    erased_bgr = erase_quads(
        page_bgr,
        quads_per_bubble,
        translations=translations,
        skip_ids=SKIP_BUBBLE_IDS,
        pad=QUAD_PAD,
    )

    print("✍️ Rendering translated text...")
    render_translations(
        image_bgr=erased_bgr,
        bubble_boxes=bubble_boxes,
        translations=translations,
        skip_ids=SKIP_BUBBLE_IDS,
        font_path=font_path,
        font_size=FONT_SIZE,
        bold_outline=True,
        auto_color=True,
        output_path=OUTPUT_PATH,
    )
if __name__ == "__main__": if __name__ == "__main__":
render_translations( main()
input_image="001-page.png",
output_image="page_translated.png",
translations_file="output.txt",
bubbles_file="bubbles.json",
font_candidates=DEFAULT_FONT_CANDIDATES,
font_color=DEFAULT_FONT_COLOR,
stroke_color=DEFAULT_STROKE_COLOR
)

View File

@@ -6,13 +6,17 @@ import re
import json import json
import cv2 import cv2
import numpy as np import numpy as np
import warnings
from typing import List, Tuple, Dict, Any, Optional
from deep_translator import GoogleTranslator from deep_translator import GoogleTranslator
# OCR engines # macOS Native Vision imports
import easyocr import Vision
from paddleocr import PaddleOCR import Quartz
from Foundation import NSData
warnings.filterwarnings("ignore", category=UserWarning)
# ============================================================ # ============================================================
# CONFIG # CONFIG
@@ -26,7 +30,7 @@ GLOSSARY = {
} }
SOUND_EFFECT_PATTERNS = [ SOUND_EFFECT_PATTERNS = [
r"^b+i+p+$", r"^sha+$", r"^ha+$", r"^ah+$", r"^oh+$", r"^b+i+p+$", r"^sha+$", r"^ha+$", r"^ah+$",
r"^ugh+$", r"^bam+$", r"^pow+$", r"^boom+$", r"^bang+$", r"^ugh+$", r"^bam+$", r"^pow+$", r"^boom+$", r"^bang+$",
r"^crash+$", r"^thud+$", r"^zip+$", r"^swoosh+$", r"^chirp+$" r"^crash+$", r"^thud+$", r"^zip+$", r"^swoosh+$", r"^chirp+$"
] ]
@@ -47,13 +51,13 @@ TOP_BAND_RATIO = 0.08
# ============================================================ # ============================================================
# TEXT HELPERS # HELPERS
# ============================================================ # ============================================================
def normalize_text(text: str) -> str: def normalize_text(text: str) -> str:
t = (text or "").strip().upper() t = (text or "").strip().upper()
t = t.replace("", "\"").replace("", "\"") t = t.replace("\u201c", "\"").replace("\u201d", "\"")
t = t.replace("", "'").replace("", "'") t = t.replace("\u2018", "'").replace("\u2019", "'")
t = t.replace("", "...") t = t.replace("\u2026", "...")
t = re.sub(r"\s+", " ", t) t = re.sub(r"\s+", " ", t)
t = re.sub(r"\s+([,.;:!?])", r"\1", t) t = re.sub(r"\s+([,.;:!?])", r"\1", t)
t = re.sub(r"([¡¿])\s+", r"\1", t) t = re.sub(r"([¡¿])\s+", r"\1", t)
@@ -88,24 +92,35 @@ def is_title_text(text: str) -> bool:
return any(re.fullmatch(p, t, re.IGNORECASE) for p in TITLE_PATTERNS) return any(re.fullmatch(p, t, re.IGNORECASE) for p in TITLE_PATTERNS)
def looks_like_box_tag(t: str) -> bool:
    """Tell whether ``t`` looks like a leftover ``BOX#12``-style debug label.

    The text is upper-cased and stripped of everything except ``A-Z``,
    ``0-9`` and ``#``. What remains must consist of an optional ``B``/``E``/
    ``F``, one of ``O``/``0``/``D``, an ``X``, an optional ``#`` and up to
    three digits. The loose character classes tolerate common OCR misreads
    of "BOX".

    Args:
        t: Raw OCR text; ``None`` or empty is treated as "not a tag".

    Returns:
        ``True`` when the cleaned text fully matches the tag pattern.
    """
    cleaned = re.sub(r"[^A-Z0-9#]", "", (t or "").upper())
    # A separate check for r"B[O0D]X\d{0,3}" used to follow this one, but
    # every string it accepts is already accepted here, so it was dead code.
    return re.fullmatch(r"[BEF]?[O0D]X#?\d{0,3}", cleaned) is not None
def is_noise_text(text: str) -> bool: def is_noise_text(text: str) -> bool:
t = (text or "").strip() t = (text or "").strip()
# Explicitly allow standalone punctuation like ? or !
if re.fullmatch(r"[\?\!]+", t):
return False
if any(re.fullmatch(p, t) for p in NOISE_PATTERNS): if any(re.fullmatch(p, t) for p in NOISE_PATTERNS):
return True return True
if looks_like_box_tag(t):
if len(t) <= 2 and not re.search(r"[A-Z0-9]", t): return True
if len(t) <= 2 and not re.search(r"[A-Z0-9\?\!]", t):
return True return True
symbol_ratio = sum(1 for c in t if not c.isalnum() and not c.isspace()) / max(1, len(t)) symbol_ratio = sum(1 for c in t if not c.isalnum() and not c.isspace()) / max(1, len(t))
if len(t) <= 6 and symbol_ratio > 0.60: if len(t) <= 6 and symbol_ratio > 0.60:
return True return True
return False return False
# ============================================================
# GEOMETRY HELPERS
# ============================================================
def quad_bbox(quad): def quad_bbox(quad):
xs = [p[0] for p in quad] xs = [p[0] for p in quad]
ys = [p[1] for p in quad] ys = [p[1] for p in quad]
@@ -150,9 +165,6 @@ def overlap_or_near(a, b, gap=0):
return gap_x <= gap and gap_y <= gap return gap_x <= gap and gap_y <= gap
# ============================================================
# QUALITY
# ============================================================
def ocr_candidate_score(text: str) -> float: def ocr_candidate_score(text: str) -> float:
if not text: if not text:
return 0.0 return 0.0
@@ -179,204 +191,98 @@ def ocr_candidate_score(text: str) -> float:
# ============================================================ # ============================================================
# OCR ENGINE WRAPPER (PADDLE + EASYOCR HYBRID) # OCR ENGINES (Apple Native Vision)
# ============================================================ # ============================================================
class HybridOCR: class MacVisionDetector:
def __init__(self, source_lang="en", use_gpu=False): def __init__(self, source_lang="en"):
self.source_lang = source_lang lang_map = {"en": "en-US", "es": "es-ES", "ca": "ca-ES", "fr": "fr-FR", "ja": "ja-JP"}
apple_lang = lang_map.get(source_lang, "en-US")
self.langs = [apple_lang]
print(f"⚡ Using Apple Vision OCR (Language: {self.langs})")
# Paddle language choice (single lang for Paddle) def read(self, image_path_or_array):
# For manga EN/ES pages, latin model is robust. if isinstance(image_path_or_array, str):
if source_lang in ("en", "es", "ca", "fr", "de", "it", "pt"): img = cv2.imread(image_path_or_array)
paddle_lang = "latin"
elif source_lang in ("ja",):
paddle_lang = "japan"
elif source_lang in ("ko",):
paddle_lang = "korean"
elif source_lang in ("ch", "zh", "zh-cn", "zh-tw"):
paddle_lang = "ch"
else: else:
paddle_lang = "latin" img = image_path_or_array
# EasyOCR language list if img is None or img.size == 0:
if source_lang == "ca": return []
easy_langs = ["es", "en"]
elif source_lang == "en":
easy_langs = ["en", "es"]
elif source_lang == "es":
easy_langs = ["es", "en"]
else:
easy_langs = [source_lang]
self.paddle = PaddleOCR( ih, iw = img.shape[:2]
use_angle_cls=True,
lang=paddle_lang,
use_gpu=use_gpu,
show_log=False
)
self.easy = easyocr.Reader(easy_langs, gpu=use_gpu)
@staticmethod success, buffer = cv2.imencode('.png', img)
def _paddle_to_std(result): if not success:
""" return []
Convert Paddle result to Easy-like:
[ (quad, text, conf), ... ]
"""
out = []
# paddle.ocr(...) returns list per image
# each item line: [ [ [x,y],...4pts ], (text, conf) ]
if not result:
return out
# result can be [None] or nested list
blocks = result if isinstance(result, list) else [result]
for blk in blocks:
if blk is None:
continue
if len(blk) == 0:
continue
# some versions wrap once more
if isinstance(blk[0], list) and len(blk[0]) > 0 and isinstance(blk[0][0], (list, tuple)) and len(blk[0]) == 2:
lines = blk
elif isinstance(blk[0], (list, tuple)) and len(blk[0]) >= 2:
lines = blk
else:
# maybe nested once more
if len(blk) == 1 and isinstance(blk[0], list):
lines = blk[0]
else:
lines = []
for ln in lines: ns_data = NSData.dataWithBytes_length_(buffer.tobytes(), len(buffer.tobytes()))
try: handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(ns_data, None)
pts, rec = ln results = []
txt, conf = rec[0], float(rec[1])
quad = [[float(p[0]), float(p[1])] for p in pts]
out.append((quad, txt, conf))
except Exception:
continue
return out
def read_full_image(self, image_path): def completion_handler(request, error):
""" if error:
Primary: Paddle print(f"Vision API Error: {error}")
Fallback merge: EasyOCR return
Returns merged standardized detections.
"""
# Paddle
pr = self.paddle.ocr(image_path, cls=True)
paddle_det = self._paddle_to_std(pr)
# Easy for observation in request.results():
easy_det = self.easy.readtext(image_path, paragraph=False) candidate = observation.topCandidates_(1)[0]
text = candidate.string()
confidence = candidate.confidence()
# Merge by IOU/text proximity bbox = observation.boundingBox()
merged = list(paddle_det) x = bbox.origin.x * iw
for eb in easy_det: y_bottom_left = bbox.origin.y * ih
eq, et, ec = eb w = bbox.size.width * iw
ebox = quad_bbox(eq) h = bbox.size.height * ih
keep = True
for pb in paddle_det:
pq, pt, pc = pb
pbox = quad_bbox(pq)
ix1 = max(ebox[0], pbox[0]); iy1 = max(ebox[1], pbox[1]) y = ih - y_bottom_left - h
ix2 = min(ebox[2], pbox[2]); iy2 = min(ebox[3], pbox[3])
inter = max(0, ix2 - ix1) * max(0, iy2 - iy1)
a1 = max(1, (ebox[2] - ebox[0]) * (ebox[3] - ebox[1]))
a2 = max(1, (pbox[2] - pbox[0]) * (pbox[3] - pbox[1]))
iou = inter / float(a1 + a2 - inter) if (a1 + a2 - inter) > 0 else 0.0
if iou > 0.55: quad = [
# if overlapped and paddle exists, keep paddle unless easy much higher conf [int(x), int(y)],
if float(ec) > float(pc) + 0.20: [int(x + w), int(y)],
# replace paddle with easy-like entry [int(x + w), int(y + h)],
try: [int(x), int(y + h)]
merged.remove(pb) ]
except Exception:
pass
merged.append((eq, et, float(ec)))
keep = False
break
if keep: results.append((quad, text, confidence))
merged.append((eq, et, float(ec)))
return merged request = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(completion_handler)
request.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate)
request.setUsesLanguageCorrection_(True)
request.setRecognitionLanguages_(self.langs)
def read_array_with_both(self, arr_gray_or_bgr): handler.performRequests_error_([request], None)
"""
OCR from array (used in robust reread pass).
Returns merged detections in standardized format.
"""
tmp = "_tmp_ocr_hybrid.png"
cv2.imwrite(tmp, arr_gray_or_bgr)
try:
pr = self.paddle.ocr(tmp, cls=True)
paddle_det = self._paddle_to_std(pr)
easy_det = self.easy.readtext(tmp, paragraph=False)
merged = list(paddle_det) return results
for eb in easy_det:
eq, et, ec = eb
ebox = quad_bbox(eq)
keep = True
for pb in paddle_det:
pq, pt, pc = pb
pbox = quad_bbox(pq)
ix1 = max(ebox[0], pbox[0]); iy1 = max(ebox[1], pbox[1])
ix2 = min(ebox[2], pbox[2]); iy2 = min(ebox[3], pbox[3])
inter = max(0, ix2 - ix1) * max(0, iy2 - iy1)
a1 = max(1, (ebox[2] - ebox[0]) * (ebox[3] - ebox[1]))
a2 = max(1, (pbox[2] - pbox[0]) * (pbox[3] - pbox[1]))
iou = inter / float(a1 + a2 - inter) if (a1 + a2 - inter) > 0 else 0.0
if iou > 0.55:
if float(ec) > float(pc) + 0.20:
try:
merged.remove(pb)
except Exception:
pass
merged.append((eq, et, float(ec)))
keep = False
break
if keep:
merged.append((eq, et, float(ec)))
return merged
finally:
if os.path.exists(tmp):
os.remove(tmp)
# ============================================================ # ============================================================
# PREPROCESS + ROBUST REREAD # PREPROCESS
# ============================================================ # ============================================================
def preprocess_variant(crop_bgr, mode):
    """Return a single-channel version of a crop, preprocessed for OCR.

    Args:
        crop_bgr: Image crop as a NumPy array. A 3-channel array is converted
            with ``cv2.COLOR_BGR2GRAY``; a 2-D array is treated as already
            grayscale and used unchanged.
        mode: Name of the preprocessing recipe:
            ``"raw"``        plain grayscale;
            ``"clahe"``      CLAHE (clip limit 2.0, 8x8 tiles);
            ``"adaptive"``   3x3 Gaussian blur, then Gaussian adaptive
                             threshold (block size 35, constant 11);
            ``"otsu"``       3x3 Gaussian blur, then Otsu threshold;
            ``"invert"``     ``255 - gray``;
            ``"bilateral"``  bilateral filter (d=7, sigmas 60/60), then Otsu;
            ``"morph_open"`` Otsu threshold, then 2x2 morphological opening.
            Any other value falls back to plain grayscale.

    Returns:
        The processed single-channel image.
    """
    # cvtColor(BGR2GRAY) rejects single-channel input, so pass 2-D arrays through.
    if crop_bgr.ndim == 2:
        gray = crop_bgr
    else:
        gray = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2GRAY)

    otsu_flags = cv2.THRESH_BINARY + cv2.THRESH_OTSU

    if mode == "clahe":
        return cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)).apply(gray)
    if mode == "adaptive":
        den = cv2.GaussianBlur(gray, (3, 3), 0)
        return cv2.adaptiveThreshold(
            den, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 35, 11
        )
    if mode == "otsu":
        den = cv2.GaussianBlur(gray, (3, 3), 0)
        _, th = cv2.threshold(den, 0, 255, otsu_flags)
        return th
    if mode == "invert":
        return 255 - gray
    if mode == "bilateral":
        den = cv2.bilateralFilter(gray, 7, 60, 60)
        _, th = cv2.threshold(den, 0, 255, otsu_flags)
        return th
    if mode == "morph_open":
        _, th = cv2.threshold(gray, 0, 255, otsu_flags)
        kernel = np.ones((2, 2), np.uint8)
        return cv2.morphologyEx(th, cv2.MORPH_OPEN, kernel)

    # "raw" and any unrecognised mode: plain grayscale.
    return gray
@@ -389,22 +295,18 @@ def rotate_image_keep_bounds(img, angle_deg):
new_w = int((h * sin) + (w * cos)) new_w = int((h * sin) + (w * cos))
new_h = int((h * cos) + (w * sin)) new_h = int((h * cos) + (w * sin))
M[0, 2] += (new_w / 2) - c[0] M[0, 2] += (new_w / 2) - c[0]
M[1, 2] += (new_h / 2) - c[1] M[1, 2] += (new_h / 2) - c[1]
return cv2.warpAffine(img, M, (new_w, new_h), flags=cv2.INTER_CUBIC, borderValue=255) return cv2.warpAffine(img, M, (new_w, new_h), flags=cv2.INTER_CUBIC, borderValue=255)
def rebuild_text_from_ocr_result(res): def rebuild_text_from_vision_result(res):
if not res: if not res:
return "" return ""
norm = [] norm = []
for item in res: for bbox, txt, conf in res:
if len(item) != 3:
continue
bbox, txt, conf = item
if not txt or not txt.strip(): if not txt or not txt.strip():
continue continue
b = quad_bbox(bbox) b = quad_bbox(bbox)
@@ -419,7 +321,7 @@ def rebuild_text_from_ocr_result(res):
med_h = float(np.median([x[5] for x in norm])) med_h = float(np.median([x[5] for x in norm]))
row_tol = max(6.0, med_h * 0.75) row_tol = max(6.0, med_h * 0.75)
norm.sort(key=lambda z: z[4]) # y norm.sort(key=lambda z: z[4])
rows = [] rows = []
for it in norm: for it in norm:
placed = False placed = False
@@ -435,7 +337,7 @@ def rebuild_text_from_ocr_result(res):
rows.sort(key=lambda r: r["yc"]) rows.sort(key=lambda r: r["yc"])
lines = [] lines = []
for r in rows: for r in rows:
mem = sorted(r["m"], key=lambda z: z[3]) # x mem = sorted(r["m"], key=lambda z: z[3])
line = normalize_text(" ".join(x[1] for x in mem)) line = normalize_text(" ".join(x[1] for x in mem))
if line: if line:
lines.append(line) lines.append(line)
@@ -443,57 +345,51 @@ def rebuild_text_from_ocr_result(res):
return normalize_text(" ".join(lines)) return normalize_text(" ".join(lines))
def reread_bubble_with_vision(
    image_bgr,
    bbox_xyxy,
    vision_detector: MacVisionDetector,
    upscale=3.0,
    pad=24
):
    """Re-run OCR on one bubble region under several preprocessing variants.

    The region is padded, cropped, upscaled with cubic interpolation, and then
    read once per (preprocessing mode, rotation angle) combination. The
    candidate with the highest ``ocr_candidate_score`` wins.

    Args:
        image_bgr: Full page image as a NumPy array.
        bbox_xyxy: ``(x1, y1, x2, y2)`` box of the bubble in page pixels.
        vision_detector: Object whose ``read(image)`` returns a list of
            ``(quad, text, confidence)`` tuples.
        upscale: Scale factor applied to the crop before preprocessing.
        pad: Pixels added on every side of the box, clamped to the image.

    Returns:
        ``(text, score, "vision-reread")`` for the best candidate, or
        ``(None, 0.0, "none")`` when the crop is empty or no candidate scored
        above zero.
    """
    ih, iw = image_bgr.shape[:2]
    x1, y1, x2, y2 = bbox_xyxy
    x1 = max(0, int(x1 - pad))
    y1 = max(0, int(y1 - pad))
    x2 = min(iw, int(x2 + pad))
    y2 = min(ih, int(y2 + pad))

    crop = image_bgr[y1:y2, x1:x2]
    if crop.size == 0:
        return None, 0.0, "none"

    modes = ["raw", "clahe", "adaptive", "otsu", "invert", "bilateral", "morph_open"]
    angles = [0.0, 1.5, -1.5]

    # Clamp to at least 1x1: cv2.resize rejects a zero-sized target, which a
    # very small `upscale` would otherwise produce.
    up_w = max(1, int(crop.shape[1] * upscale))
    up_h = max(1, int(crop.shape[0] * upscale))
    up0 = cv2.resize(crop, (up_w, up_h), interpolation=cv2.INTER_CUBIC)

    best_v_txt, best_v_sc = "", 0.0
    for mode in modes:
        proc = preprocess_variant(up0, mode)
        # The rotation/OCR step is fed a 3-channel image, so expand
        # single-channel preprocessing output back to BGR.
        if len(proc.shape) == 2:
            proc3 = cv2.cvtColor(proc, cv2.COLOR_GRAY2BGR)
        else:
            proc3 = proc

        for a in angles:
            rot = rotate_image_keep_bounds(proc3, a)
            res = vision_detector.read(rot)
            txt = rebuild_text_from_vision_result(res)
            sc = ocr_candidate_score(txt)
            # Strict ">" against an initial 0.0: zero-score text is never kept.
            if sc > best_v_sc:
                best_v_txt, best_v_sc = txt, sc

    if best_v_txt:
        return best_v_txt, best_v_sc, "vision-reread"
    return None, 0.0, "none"
# ============================================================ # ============================================================
# LINE REBUILD + YELLOW BOXES # LINES + BUBBLES
# ============================================================ # ============================================================
def build_lines_from_indices(indices, ocr): def build_lines_from_indices(indices, ocr):
if not indices: if not indices:
return [] return []
items = [] items = []
for i in indices: for i in indices:
b = quad_bbox(ocr[i][0]) b = quad_bbox(ocr[i][0])
@@ -526,7 +422,6 @@ def build_lines_from_indices(indices, ocr):
txt = normalize_text(" ".join(ocr[i][1] for i, _, _, _ in mem)) txt = normalize_text(" ".join(ocr[i][1] for i, _, _, _ in mem))
if txt and not is_noise_text(txt): if txt and not is_noise_text(txt):
lines.append(txt) lines.append(txt)
return lines return lines
@@ -540,16 +435,10 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None):
txt = normalize_text(ocr[i][1]) txt = normalize_text(ocr[i][1])
if is_noise_text(txt): if is_noise_text(txt):
continue continue
xc = (b[0] + b[2]) / 2.0 xc = (b[0] + b[2]) / 2.0
yc = (b[1] + b[3]) / 2.0 yc = (b[1] + b[3]) / 2.0
w = max(1.0, b[2] - b[0])
h = max(1.0, b[3] - b[1]) h = max(1.0, b[3] - b[1])
items.append({"i": i, "b": b, "txt": txt, "xc": xc, "yc": yc, "h": h})
items.append({
"i": i, "b": b, "txt": txt,
"xc": xc, "yc": yc, "w": w, "h": h
})
if not items: if not items:
return [] return []
@@ -559,16 +448,8 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None):
gap_x_tol = max(8.0, med_h * 1.25) gap_x_tol = max(8.0, med_h * 1.25)
pad = max(3, int(round(med_h * 0.22))) pad = max(3, int(round(med_h * 0.22)))
def is_punct_like(t):
raw = (t or "").strip()
if raw == "":
return True
punct_ratio = sum(1 for c in raw if not c.isalnum()) / max(1, len(raw))
return punct_ratio >= 0.5 or len(raw) <= 2
items_sorted = sorted(items, key=lambda x: x["yc"])
rows = [] rows = []
for it in items_sorted: for it in sorted(items, key=lambda x: x["yc"]):
placed = False placed = False
for r in rows: for r in rows:
if abs(it["yc"] - r["yc"]) <= row_tol: if abs(it["yc"] - r["yc"]) <= row_tol:
@@ -584,16 +465,12 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None):
for r in rows: for r in rows:
mem = sorted(r["m"], key=lambda z: z["xc"]) mem = sorted(r["m"], key=lambda z: z["xc"])
normal = [t for t in mem if not is_punct_like(t["txt"])] if not mem:
punct = [t for t in mem if is_punct_like(t["txt"])] continue
if not normal:
normal = mem
punct = []
chunks = [] chunks = []
cur = [normal[0]] cur = [mem[0]]
for t in normal[1:]: for t in mem[1:]:
prev = cur[-1]["b"] prev = cur[-1]["b"]
b = t["b"] b = t["b"]
gap = b[0] - prev[2] gap = b[0] - prev[2]
@@ -604,106 +481,26 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None):
cur = [t] cur = [t]
chunks.append(cur) chunks.append(cur)
for p in punct:
pb = p["b"]
pxc, pyc = p["xc"], p["yc"]
best_k = -1
best_score = 1e18
for k, ch in enumerate(chunks):
ub = boxes_union_xyxy([x["b"] for x in ch])
cx = (ub[0] + ub[2]) / 2.0
cy = (ub[1] + ub[3]) / 2.0
dx = abs(pxc - cx)
dy = abs(pyc - cy)
score = dx + 1.8 * dy
near = overlap_or_near(pb, ub, gap=int(med_h * 1.25))
if near:
score -= med_h * 2.0
if score < best_score:
best_score = score
best_k = k
if best_k >= 0:
chunks[best_k].append(p)
else:
chunks.append([p])
for ch in chunks: for ch in chunks:
ub = boxes_union_xyxy([x["b"] for x in ch]) ub = boxes_union_xyxy([x["b"] for x in ch])
if ub: if ub:
x1, y1, x2, y2 = ub x1, y1, x2, y2 = ub
pad_x = pad out_boxes.append((x1 - pad, y1 - int(round(pad*1.35)), x2 + pad, y2 + int(round(pad*0.95))))
pad_top = int(round(pad * 1.35))
pad_bot = int(round(pad * 0.95))
out_boxes.append((x1 - pad_x, y1 - pad_top, x2 + pad_x, y2 + pad_bot))
token_boxes = [it["b"] for it in items]
def inside(tb, lb):
return tb[0] >= lb[0] and tb[1] >= lb[1] and tb[2] <= lb[2] and tb[3] <= lb[3]
for tb in token_boxes:
if not any(inside(tb, lb) for lb in out_boxes):
x1, y1, x2, y2 = tb
pad_x = pad
pad_top = int(round(pad * 1.35))
pad_bot = int(round(pad * 0.95))
out_boxes.append((x1 - pad_x, y1 - pad_top, x2 + pad_x, y2 + pad_bot))
merged = []
for b in out_boxes:
merged_into = False
for i, m in enumerate(merged):
ix1 = max(b[0], m[0]); iy1 = max(b[1], m[1])
ix2 = min(b[2], m[2]); iy2 = min(b[3], m[3])
inter = max(0, ix2 - ix1) * max(0, iy2 - iy1)
a1 = max(1, (b[2] - b[0]) * (b[3] - b[1]))
a2 = max(1, (m[2] - m[0]) * (m[3] - m[1]))
iou = inter / float(a1 + a2 - inter) if (a1 + a2 - inter) > 0 else 0.0
if iou > 0.72:
merged[i] = boxes_union_xyxy([b, m])
merged_into = True
break
if not merged_into:
merged.append(b)
safe = []
for (x1, y1, x2, y2) in merged:
w = x2 - x1
h = y2 - y1
if w < 28:
d = (28 - w) // 2 + 2
x1 -= d; x2 += d
if h < 18:
d = (18 - h) // 2 + 2
y1 -= d; y2 += d
safe.append((x1, y1, x2, y2))
merged = safe
if image_shape is not None: if image_shape is not None:
ih, iw = image_shape[:2] ih, iw = image_shape[:2]
clamped = [] clamped = []
for b in merged: for b in out_boxes:
x1 = max(0, int(b[0])) x1 = max(0, int(b[0])); y1 = max(0, int(b[1]))
y1 = max(0, int(b[1])) x2 = min(iw - 1, int(b[2])); y2 = min(ih - 1, int(b[3]))
x2 = min(iw - 1, int(b[2]))
y2 = min(ih - 1, int(b[3]))
if x2 > x1 and y2 > y1: if x2 > x1 and y2 > y1:
clamped.append((x1, y1, x2, y2)) clamped.append((x1, y1, x2, y2))
merged = clamped out_boxes = clamped
else:
merged = [(int(b[0]), int(b[1]), int(b[2]), int(b[3])) for b in merged]
merged.sort(key=lambda z: (z[1], z[0])) out_boxes.sort(key=lambda z: (z[1], z[0]))
return merged return out_boxes
# ============================================================
# GROUPING
# ============================================================
def auto_gap(image_path, base=18, ref_w=750): def auto_gap(image_path, base=18, ref_w=750):
img = cv2.imread(image_path) img = cv2.imread(image_path)
if img is None: if img is None:
@@ -750,21 +547,14 @@ def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=3):
sorted_groups = sorted( sorted_groups = sorted(
groups.values(), groups.values(),
key=lambda idxs: ( key=lambda idxs: (min(boxes[i][1] for i in idxs), min(boxes[i][0] for i in idxs))
min(boxes[i][1] for i in idxs),
min(boxes[i][0] for i in idxs)
)
) )
bubbles = {} bubbles, bubble_boxes, bubble_quads, bubble_indices = {}, {}, {}, {}
bubble_boxes = {}
bubble_quads = {}
bubble_indices = {}
ih, iw = image_shape[:2] ih, iw = image_shape[:2]
for bid, idxs in enumerate(sorted_groups, start=1): for bid, idxs in enumerate(sorted_groups, start=1):
idxs = sorted(idxs, key=lambda k: boxes[k][1]) idxs = sorted(idxs, key=lambda k: boxes[k][1])
lines = build_lines_from_indices(idxs, ocr) lines = build_lines_from_indices(idxs, ocr)
quads = [ocr[k][0] for k in idxs] quads = [ocr[k][0] for k in idxs]
ub = boxes_union_xyxy([quad_bbox(q) for q in quads]) ub = boxes_union_xyxy([quad_bbox(q) for q in quads])
@@ -772,10 +562,8 @@ def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=3):
continue continue
x1, y1, x2, y2 = ub x1, y1, x2, y2 = ub
x1 = max(0, x1 - bbox_padding) x1 = max(0, x1 - bbox_padding); y1 = max(0, y1 - bbox_padding)
y1 = max(0, y1 - bbox_padding) x2 = min(iw - 1, x2 + bbox_padding); y2 = min(ih - 1, y2 + bbox_padding)
x2 = min(iw - 1, x2 + bbox_padding)
y2 = min(ih - 1, y2 + bbox_padding)
bubbles[bid] = lines bubbles[bid] = lines
bubble_boxes[bid] = (x1, y1, x2, y2) bubble_boxes[bid] = (x1, y1, x2, y2)
@@ -786,37 +574,63 @@ def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=3):
# ============================================================ # ============================================================
# DEBUG # DEBUG / EXPORT
# ============================================================ # ============================================================
def _wrap_overlay_text(text, max_chars=25):
    """Greedily split *text* on whitespace into lines for the debug overlay.

    A word joins the current line while the line's running length (including
    the trailing space kept after each word) plus the word's length stays
    below *max_chars*. A single over-long word gets a line of its own.
    """
    lines = []
    current = ""
    for word in text.split():
        if len(current) + len(word) < max_chars:
            current += word + " "
        else:
            # Skip the flush when nothing is buffered, so an over-long first
            # word does not produce an empty leading line.
            if current:
                lines.append(current.strip())
            current = word + " "
    if current:
        lines.append(current.strip())
    return lines


def save_debug_clusters(
    image_path,
    ocr,
    bubble_boxes,
    bubble_indices,
    clean_lines=None,
    out_path="debug_clusters.png"
):
    """Write a debug image showing OCR quads, bubble boxes and bubble text.

    Args:
        image_path: Path of the page image to annotate.
        ocr: Iterable of ``(quad, text, confidence)`` detections; only the
            quad is used here.
        bubble_boxes: Mapping of bubble id to an ``(x1, y1, x2, y2)`` box.
        bubble_indices: Accepted for call compatibility; not used by this
            function.
        clean_lines: Optional mapping of bubble id to a text string that is
            drawn, word-wrapped, underneath that bubble's box.
        out_path: Destination path of the annotated image.

    Returns:
        None. Does nothing when the image cannot be read.
    """
    img = cv2.imread(image_path)
    if img is None:
        return

    # Paint every OCR quad white first, then trace it with a thin grey outline.
    for bbox, _txt, _conf in ocr:
        pts = np.array(bbox, dtype=np.int32)
        cv2.fillPoly(img, [pts], (255, 255, 255))
        cv2.polylines(img, [pts], True, (180, 180, 180), 1)

    font = cv2.FONT_HERSHEY_SIMPLEX
    line_step = 18  # vertical distance between overlay text lines, in pixels

    for bid, bb in bubble_boxes.items():
        x1, y1, x2, y2 = bb

        # Bubble bounding box and its id label.
        cv2.rectangle(img, (x1, y1), (x2, y2), (0, 220, 0), 2)
        cv2.putText(img, f"BOX#{bid}", (x1 + 2, max(15, y1 + 16)),
                    font, 0.5, (0, 220, 0), 2)

        if clean_lines and bid in clean_lines:
            y_text = y2 + line_step
            for line in _wrap_overlay_text(clean_lines[bid]):
                # Thick black pass, then a thin coloured pass on top, so the
                # text reads as outlined against the artwork.
                cv2.putText(img, line, (x1, y_text), font, 0.5, (0, 0, 0), 3)
                cv2.putText(img, line, (x1, y_text), font, 0.5, (255, 0, 0), 1)
                y_text += line_step

    # imwrite reports failure through its return value rather than raising.
    if not cv2.imwrite(out_path, img):
        print(f"Warning: could not write debug image to {out_path}")
# ============================================================
# EXPORT
# ============================================================
def estimate_reading_order(bbox_dict, mode="ltr"): def estimate_reading_order(bbox_dict, mode="ltr"):
items = [] items = []
for bid, (x1, y1, x2, y2) in bbox_dict.items(): for bid, (x1, y1, x2, y2) in bbox_dict.items():
@@ -826,8 +640,7 @@ def estimate_reading_order(bbox_dict, mode="ltr"):
items.sort(key=lambda t: t[2]) items.sort(key=lambda t: t[2])
rows = [] rows, tol = [], 90
tol = 90
for it in items: for it in items:
placed = False placed = False
for r in rows: for r in rows:
@@ -850,7 +663,6 @@ def estimate_reading_order(bbox_dict, mode="ltr"):
def export_bubbles(filepath, bbox_dict, quads_dict, indices_dict, ocr, reading_map, image_shape): def export_bubbles(filepath, bbox_dict, quads_dict, indices_dict, ocr, reading_map, image_shape):
out = {} out = {}
for bid, bb in bbox_dict.items(): for bid, bb in bbox_dict.items():
x1, y1, x2, y2 = bb x1, y1, x2, y2 = bb
quads = quads_dict.get(bid, []) quads = quads_dict.get(bid, [])
@@ -870,9 +682,7 @@ def export_bubbles(filepath, bbox_dict, quads_dict, indices_dict, ocr, reading_m
{"x": int(b[0]), "y": int(b[1]), "w": int(b[2] - b[0]), "h": int(b[3] - b[1])} {"x": int(b[0]), "y": int(b[1]), "w": int(b[2] - b[0]), "h": int(b[3] - b[1])}
for b in qboxes for b in qboxes
], ],
"quads": [ "quads": [[[int(p[0]), int(p[1])] for p in q] for q in quads],
[[int(p[0]), int(p[1])] for p in q] for q in quads
],
"text_bbox": xyxy_to_xywh(text_union), "text_bbox": xyxy_to_xywh(text_union),
"line_bboxes": [xyxy_to_xywh(lb) for lb in line_boxes_xyxy], "line_bboxes": [xyxy_to_xywh(lb) for lb in line_boxes_xyxy],
"line_union_bbox": xyxy_to_xywh(line_union_xyxy) if line_union_xyxy else None, "line_union_bbox": xyxy_to_xywh(line_union_xyxy) if line_union_xyxy else None,
@@ -884,10 +694,10 @@ def export_bubbles(filepath, bbox_dict, quads_dict, indices_dict, ocr, reading_m
# ============================================================ # ============================================================
# MAIN PIPELINE # PIPELINE
# ============================================================ # ============================================================
def translate_manga_text( def translate_manga_text(
image_path, image_path="001-page.png",
source_lang="en", source_lang="en",
target_lang="ca", target_lang="ca",
confidence_threshold=0.12, confidence_threshold=0.12,
@@ -898,8 +708,7 @@ def translate_manga_text(
export_to_file="output.txt", export_to_file="output.txt",
export_bubbles_to="bubbles.json", export_bubbles_to="bubbles.json",
reading_mode="ltr", reading_mode="ltr",
debug=True, debug=True
use_gpu=False
): ):
image = cv2.imread(image_path) image = cv2.imread(image_path)
if image is None: if image is None:
@@ -908,12 +717,12 @@ def translate_manga_text(
resolved_gap = auto_gap(image_path) if gap_px == "auto" else float(gap_px) resolved_gap = auto_gap(image_path) if gap_px == "auto" else float(gap_px)
print("Loading Hybrid OCR (Paddle + EasyOCR)...") print("Loading OCR engines...")
hybrid = HybridOCR(source_lang=source_lang, use_gpu=use_gpu) detector = MacVisionDetector(source_lang=source_lang)
print("Running OCR...") print("Running detection OCR (Apple Vision)...")
raw = hybrid.read_full_image(image_path) raw = detector.read(image_path)
print(f"Raw detections (merged): {len(raw)}") print(f"Raw detections: {len(raw)}")
filtered = [] filtered = []
skipped = 0 skipped = 0
@@ -924,25 +733,18 @@ def translate_manga_text(
qb = quad_bbox(bbox) qb = quad_bbox(bbox)
if conf < confidence_threshold: if conf < confidence_threshold:
skipped += 1 skipped += 1; continue
continue
if len(t) < min_text_length: if len(t) < min_text_length:
skipped += 1 skipped += 1; continue
continue
if is_noise_text(t): if is_noise_text(t):
skipped += 1 skipped += 1; continue
continue
if filter_sound_effects and is_sound_effect(t): if filter_sound_effects and is_sound_effect(t):
skipped += 1 skipped += 1; continue
continue
if is_title_text(t): if is_title_text(t):
skipped += 1 skipped += 1; continue
continue
if qb[1] < int(ih * TOP_BAND_RATIO): if qb[1] < int(ih * TOP_BAND_RATIO):
if conf < 0.70 and len(t) >= 5: if conf < 0.70 and len(t) >= 5:
skipped += 1 skipped += 1; continue
continue
filtered.append((bbox, t, conf)) filtered.append((bbox, t, conf))
@@ -955,75 +757,80 @@ def translate_manga_text(
filtered, image.shape, gap_px=resolved_gap, bbox_padding=3 filtered, image.shape, gap_px=resolved_gap, bbox_padding=3
) )
if debug:
save_debug_clusters(
image_path=image_path,
ocr=filtered,
bubble_boxes=bubble_boxes,
bubble_indices=bubble_indices,
out_path="debug_clusters.png"
)
translator = GoogleTranslator(source=source_lang, target=target_lang) translator = GoogleTranslator(source=source_lang, target=target_lang)
clean_lines = {} clean_lines: Dict[int, str] = {}
sources_used: Dict[int, str] = {}
for bid, lines in bubbles.items(): for bid, lines in bubbles.items():
base_txt = normalize_text(" ".join(lines)) base_txt = normalize_text(" ".join(lines))
base_sc = ocr_candidate_score(base_txt) base_sc = ocr_candidate_score(base_txt)
txt = base_txt
src_used = "vision-base"
if base_sc < quality_threshold: if base_sc < quality_threshold:
rr_txt, rr_sc = reread_crop_robust( rr_txt, rr_sc, rr_src = reread_bubble_with_vision(
image, image_bgr=image,
bubble_boxes[bid], bbox_xyxy=bubble_boxes[bid],
hybrid, vision_detector=detector,
upscale=3.0, upscale=3.0,
pad=24 pad=24
) )
if rr_txt and rr_sc > base_sc + 0.06: if rr_txt and rr_sc > base_sc + 0.04:
txt = rr_txt txt = rr_txt
else: src_used = rr_src
txt = base_txt
else:
txt = base_txt
txt = txt.replace(" BOMPORTA", " IMPORTA") txt = txt.replace(" BOMPORTA", " IMPORTA")
txt = txt.replace(" TESTO ", " ESTO ") txt = txt.replace(" TESTO ", " ESTO ")
txt = txt.replace(" MIVERDAD", " MI VERDAD") txt = txt.replace(" MIVERDAD", " MI VERDAD")
clean_lines[bid] = apply_glossary(normalize_text(txt)) clean_lines[bid] = apply_glossary(normalize_text(txt))
sources_used[bid] = src_used
reading_map = estimate_reading_order(bubble_boxes, mode=reading_mode) reading_map = estimate_reading_order(bubble_boxes, mode=reading_mode)
if debug:
save_debug_clusters(
image_path=image_path,
ocr=filtered,
bubble_boxes=bubble_boxes,
bubble_indices=bubble_indices,
clean_lines=clean_lines,
out_path="debug_clusters.png"
)
divider = "" * 120 divider = "" * 120
out_lines = ["BUBBLE|ORDER|ORIGINAL|TRANSLATED|FLAGS", divider] out_lines = ["BUBBLE|ORDER|OCR_SOURCE|ORIGINAL|TRANSLATED|FLAGS", divider]
print(divider) print(divider)
print(f"{'BUBBLE':<8} {'ORDER':<6} {'ORIGINAL':<50} {'TRANSLATED':<50} FLAGS") print(f"{'BUBBLE':<8} {'ORDER':<6} {'SOURCE':<12} {'ORIGINAL':<40} {'TRANSLATED':<40} FLAGS")
print(divider) print(divider)
translated_count = 0 translated_count = 0
for bid in sorted(clean_lines.keys(), key=lambda x: reading_map.get(x, x)): for bid in sorted(clean_lines.keys(), key=lambda x: reading_map.get(x, x)):
src = clean_lines[bid].strip() src_txt = clean_lines[bid].strip()
if not src: if not src_txt:
continue continue
flags = [] flags = []
try: try:
tgt = translator.translate(src) or "" tgt = translator.translate(src_txt) or ""
except Exception as e: except Exception as e:
tgt = f"[Translation error: {e}]" tgt = f"[Translation error: {e}]"
flags.append("TRANSLATION_ERROR") flags.append("TRANSLATION_ERROR")
tgt = apply_glossary(postprocess_translation_general(tgt)).upper() tgt = apply_glossary(postprocess_translation_general(tgt)).upper()
src_u = src.upper() src_u = src_txt.upper()
src_engine = sources_used.get(bid, "unknown")
out_lines.append( out_lines.append(
f"#{bid}|{reading_map.get(bid,bid)}|{src_u}|{tgt}|{','.join(flags) if flags else '-'}" f"#{bid}|{reading_map.get(bid,bid)}|{src_engine}|{src_u}|{tgt}|{','.join(flags) if flags else '-'}"
) )
print( print(
f"#{bid:<7} {reading_map.get(bid,bid):<6} " f"#{bid:<7} {reading_map.get(bid,bid):<6} {src_engine:<12} "
f"{src_u[:50]:<50} {tgt[:50]:<50} {','.join(flags) if flags else '-'}" f"{src_u[:40]:<40} {tgt[:40]:<40} {','.join(flags) if flags else '-'}"
) )
translated_count += 1 translated_count += 1
@@ -1050,22 +857,18 @@ def translate_manga_text(
print("Saved: debug_clusters.png") print("Saved: debug_clusters.png")
# ============================================================
# ENTRYPOINT
# ============================================================
if __name__ == "__main__":
    # Script entry point: run the pipeline on a single page with every
    # setting spelled out explicitly.
    translate_manga_text(
        image_path="003.jpg",
        source_lang="es",
        target_lang="ca",
        confidence_threshold=0.12,
        min_text_length=2,
        gap_px="auto",
        filter_sound_effects=True,
        quality_threshold=0.62,
        export_to_file="output.txt",
        export_bubbles_to="bubbles.json",
        reading_mode="ltr",
        debug=True,
    )

79
requirements Normal file
View File

@@ -0,0 +1,79 @@
aistudio-sdk==0.3.8
annotated-doc==0.0.4
annotated-types==0.7.0
anyio==4.13.0
bce-python-sdk==0.9.70
beautifulsoup4==4.14.3
certifi==2026.2.25
chardet==7.4.3
charset-normalizer==3.4.7
click==8.3.2
colorlog==6.10.1
crc32c==2.8
deep-translator==1.11.4
easyocr==1.7.2
filelock==3.28.0
fsspec==2026.3.0
future==1.0.0
h11==0.16.0
hf-xet==1.4.3
httpcore==1.0.9
httpx==0.28.1
huggingface_hub==1.10.2
idna==3.11
ImageIO==2.37.3
imagesize==2.0.0
Jinja2==3.1.6
lazy-loader==0.5
markdown-it-py==4.0.0
MarkupSafe==3.0.3
mdurl==0.1.2
modelscope==1.35.4
mpmath==1.3.0
networkx==3.6.1
ninja==1.13.0
numpy==1.26.4
opencv-contrib-python==4.10.0.84
opencv-python==4.11.0.86
opencv-python-headless==4.11.0.86
opt-einsum==3.3.0
packaging==26.1
paddleocr==3.4.1
paddlepaddle==3.3.1
paddlex==3.4.3
pandas==3.0.2
pillow==12.2.0
prettytable==3.17.0
protobuf==7.34.1
psutil==7.2.2
py-cpuinfo==9.0.0
pyclipper==1.4.0
pycryptodome==3.23.0
pydantic==2.13.1
pydantic_core==2.46.1
Pygments==2.20.0
pypdfium2==5.7.0
python-bidi==0.6.7
python-dateutil==2.9.0.post0
PyYAML==6.0.2
requests==2.33.1
rich==15.0.0
ruamel.yaml==0.19.1
safetensors==0.7.0
scikit-image==0.26.0
scipy==1.17.1
shapely==2.1.2
shellingham==1.5.4
six==1.17.0
soupsieve==2.8.3
sympy==1.14.0
tifffile==2026.3.3
torch==2.11.0
torchvision==0.26.0
tqdm==4.67.3
typer==0.24.1
typing-inspection==0.4.2
typing_extensions==4.15.0
ujson==5.12.0
urllib3==2.6.3
wcwidth==0.6.0

View File

@@ -1,19 +1,12 @@
# ───────────────────────────────────────────── numpy<2.0
# manga-translator + manga-renderer opencv-python>=4.8
# Python >= 3.9 recommended easyocr>=1.7.1
# ───────────────────────────────────────────── deep-translator>=1.11.4
manga-ocr>=0.1.14
# Computer vision + image processing torch
opencv-python>=4.8.0 torchvision
numpy>=1.24.0 Pillow
Pillow>=10.0.0 transformers
fugashi
# OCR engine (manga-translator) unidic-lite
manga-ocr>=0.1.8
# Translation (manga-translator)
deep-translator>=1.11.0
# HTTP / file handling used internally by manga-ocr
requests>=2.31.0