First beta
This commit is contained in:
53
README.md
53
README.md
@@ -0,0 +1,53 @@
|
|||||||
|
# Manga Translator OCR Pipeline
|
||||||
|
|
||||||
|
A robust manga/comic OCR + translation pipeline with:
|
||||||
|
|
||||||
|
- EasyOCR (default, reliable on macOS M1)
|
||||||
|
- Optional PaddleOCR (auto-fallback if unavailable)
|
||||||
|
- Bubble clustering and line-level boxes
|
||||||
|
- Robust reread pass (multi-preprocessing + slight rotation)
|
||||||
|
- Translation export + debug overlays
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## ✨ Features
|
||||||
|
|
||||||
|
- OCR from raw manga pages
|
||||||
|
- Noise filtering (`BOX` debug artifacts, tiny garbage tokens, symbols)
|
||||||
|
- Speech bubble grouping
|
||||||
|
- Reading order estimation (`ltr` / `rtl`)
|
||||||
|
- Translation output (`output.txt`)
|
||||||
|
- Structured bubble metadata (`bubbles.json`)
|
||||||
|
- Visual debug output (`debug_clusters.png`)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🧰 Requirements
|
||||||
|
|
||||||
|
- macOS (Apple Silicon supported)
|
||||||
|
- Python **3.11** recommended
|
||||||
|
- Homebrew (for Python install)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🚀 Setup (Python 3.11 venv)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /path/to/manga-translator
|
||||||
|
|
||||||
|
# 1) Create venv with 3.11
|
||||||
|
/opt/homebrew/bin/python3.11 -m venv venv
|
||||||
|
|
||||||
|
# 2) Activate
|
||||||
|
source venv/bin/activate
|
||||||
|
|
||||||
|
# 3) Verify interpreter
|
||||||
|
python -V
|
||||||
|
# expected: Python 3.11.x
|
||||||
|
|
||||||
|
# 4) Install dependencies
|
||||||
|
python -m pip install --upgrade pip setuptools wheel
|
||||||
|
python -m pip install -r requirements.txt
|
||||||
|
|
||||||
|
# Optional Paddle runtime
|
||||||
|
python -m pip install paddlepaddle || true
|
||||||
|
|||||||
BIN
fonts/ComicNeue-Bold.ttf
Executable file
BIN
fonts/ComicNeue-Bold.ttf
Executable file
Binary file not shown.
BIN
fonts/ComicRelief-Bold.ttf
Executable file
BIN
fonts/ComicRelief-Bold.ttf
Executable file
Binary file not shown.
@@ -1,509 +1,412 @@
|
|||||||
import os
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
manga-renderer.py
|
||||||
|
|
||||||
|
Inputs: page image (IMAGE_PATH, e.g. 003.jpg) + bubbles.json + output.txt
|
||||||
|
Output: rendered page written to OUTPUT_PATH (e.g. translated_page_003.png)
|
||||||
|
|
||||||
|
Strategy:
|
||||||
|
1. For every bubble that has a translation and is not in SKIP_BUBBLE_IDS, white-fill all its OCR quads (erases original text cleanly)
|
||||||
|
2. Render the translated text centered inside the bubble bounding box
|
||||||
|
3. Bubbles in SKIP_BUBBLE_IDS are left untouched: they are neither erased nor re-rendered
|
||||||
|
"""
|
||||||
|
|
||||||
import json
|
import json
|
||||||
import re
|
import textwrap
|
||||||
import cv2
|
import cv2
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from PIL import Image, ImageDraw, ImageFont
|
from PIL import Image, ImageDraw, ImageFont
|
||||||
|
from typing import Dict, List, Tuple, Optional, Set
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# CONFIG — edit these paths to match your setup
|
||||||
|
# ============================================================
|
||||||
|
IMAGE_PATH = "003.jpg"
|
||||||
|
BUBBLES_PATH = "bubbles.json"
|
||||||
|
TRANSLATIONS_PATH = "output.txt"
|
||||||
|
OUTPUT_PATH = "translated_page_003.png"
|
||||||
|
|
||||||
# ─────────────────────────────────────────────
|
# Font candidates — first one that loads wins
|
||||||
# CONFIG
|
FONT_CANDIDATES = [
|
||||||
# ─────────────────────────────────────────────
|
"fonts/ComicNeue-Bold.ttf",
|
||||||
DEFAULT_FONT_CANDIDATES = [
|
|
||||||
"fonts/ComicRelief-Regular.ttf",
|
|
||||||
"fonts/ComicNeue-Regular.ttf",
|
|
||||||
]
|
]
|
||||||
DEFAULT_FONT_COLOR = (0, 0, 0)
|
|
||||||
DEFAULT_STROKE_COLOR = (255, 255, 255)
|
|
||||||
|
|
||||||
MAX_FONT_SIZE = 20
|
FONT_SIZE = 20
|
||||||
MIN_FONT_SIZE = 6
|
MIN_FONT_SIZE = 10
|
||||||
|
QUAD_PAD = 4 # extra pixels added around each quad before white-fill
|
||||||
|
|
||||||
# Guarantee full wipe of yellow squares
|
# ============================================================
|
||||||
YELLOW_BOX_PAD_X = 1
|
# SKIP LIST
|
||||||
YELLOW_BOX_PAD_Y = 1
|
# ── Add any bubble IDs you do NOT want rendered here.
|
||||||
YELLOW_UNION_PAD_X = 4
|
# ── The quads are NOT erased for these IDs (erase_quads skips them) and no
|
||||||
YELLOW_UNION_PAD_Y = 4
|
# ── translated text will be drawn inside them.
|
||||||
|
# ──
|
||||||
# Optional extra cleanup expansion
|
# ── Examples of why you'd skip a bubble:
|
||||||
ENABLE_EXTRA_CLEAN = True
|
# ── • Sound effects (BURP, BAM, POW …)
|
||||||
EXTRA_DILATE_ITERS = 1
|
# ── • Untranslatable single characters
|
||||||
EXTRA_CLOSE_ITERS = 1
|
# ── • Bubbles with bad OCR you want to fix manually later
|
||||||
|
# ── • Narrator boxes you want to leave in the source language
|
||||||
# Bubble detection (for optional extra mask / border preservation)
|
# ============================================================
|
||||||
FLOOD_TOL = 30
|
SKIP_BUBBLE_IDS: Set[int] = {
|
||||||
|
# 8, # BURP BURP — sound effect
|
||||||
# Border restoration: keep very conservative
|
# 2, # example: bad OCR, fix manually
|
||||||
ENABLE_EDGE_RESTORE = True
|
}
|
||||||
EDGE_RESTORE_DILATE = 1
|
|
||||||
|
|
||||||
# Text layout inside yellow-union
|
|
||||||
TEXT_INSET = 0.92
|
|
||||||
|
|
||||||
|
|
||||||
# ─────────────────────────────────────────────
|
# ============================================================
|
||||||
# PARSERS
|
# FONT LOADER
|
||||||
# ─────────────────────────────────────────────
|
# ============================================================
|
||||||
def parse_translations(translations_file):
|
def load_font(path: str, size: int) -> Optional[ImageFont.FreeTypeFont]:
    """
    Load a TrueType face from `path` at `size`, or return None.

    For a `.ttc` path, face indices 0..3 are tried in order; for any other
    path only index 0 is tried. A face is accepted only if it loads and
    `getbbox("A")` succeeds on it. Any exception raised while loading or
    probing a face makes that face be skipped.
    """
    is_collection = path.lower().endswith(".ttc")
    face_indices = range(4) if is_collection else [0]

    for face in face_indices:
        try:
            candidate = ImageFont.truetype(path, size, index=face)
            # Probe the metrics: a broken face raises here instead of later.
            candidate.getbbox("A")
        except Exception:
            continue
        else:
            return candidate

    return None
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_font() -> Tuple[str, ImageFont.FreeTypeFont]:
    """
    Pick the first entry of FONT_CANDIDATES that `load_font` can load.

    Returns (path, font). When no candidate loads, a warning is printed and
    ("", ImageFont.load_default()) is returned instead.
    """
    for font_file in FONT_CANDIDATES:
        loaded = load_font(font_file, FONT_SIZE)
        if loaded is None:
            continue
        print(f" ✅ Font: {font_file}")
        return font_file, loaded

    print(" ⚠️ No TrueType font found — using Pillow bitmap fallback")
    return "", ImageFont.load_default()
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# PARSE output.txt → {bid: translated_string}
|
||||||
|
# ============================================================
|
||||||
|
def parse_translations(filepath: str) -> Dict[int, str]:
|
||||||
|
"""
|
||||||
|
Reads output.txt and returns {bubble_id: translated_text}.
|
||||||
|
Lines look like: #2|1|vision-base|ORIGINAL|TRANSLATED|FLAGS
|
||||||
|
"""
|
||||||
translations = {}
|
translations = {}
|
||||||
originals = {}
|
with open(filepath, "r", encoding="utf-8") as f:
|
||||||
flags_map = {}
|
|
||||||
|
|
||||||
with open(translations_file, "r", encoding="utf-8") as f:
|
|
||||||
for line in f:
|
for line in f:
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
if not line.startswith("#"):
|
if not line.startswith("#"):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
parts = line.split("|")
|
parts = line.split("|")
|
||||||
|
if len(parts) < 5:
|
||||||
|
continue
|
||||||
try:
|
try:
|
||||||
bubble_id = int(parts[0].lstrip("#"))
|
bid = int(parts[0].lstrip("#"))
|
||||||
except Exception:
|
translated = parts[4].strip()
|
||||||
|
if translated and translated != "-":
|
||||||
|
translations[bid] = translated
|
||||||
|
except ValueError:
|
||||||
continue
|
continue
|
||||||
|
return translations
|
||||||
if len(parts) >= 5:
|
|
||||||
original = parts[2].strip()
|
|
||||||
translated = parts[3].strip()
|
|
||||||
flags = parts[4].strip()
|
|
||||||
elif len(parts) >= 4:
|
|
||||||
original = parts[2].strip()
|
|
||||||
translated = parts[3].strip()
|
|
||||||
flags = "-"
|
|
||||||
elif len(parts) >= 3:
|
|
||||||
original = ""
|
|
||||||
translated = parts[2].strip()
|
|
||||||
flags = "-"
|
|
||||||
else:
|
|
||||||
continue
|
|
||||||
|
|
||||||
if translated.startswith("["):
|
|
||||||
continue
|
|
||||||
|
|
||||||
translations[bubble_id] = translated
|
|
||||||
originals[bubble_id] = original
|
|
||||||
flags_map[bubble_id] = flags
|
|
||||||
|
|
||||||
return translations, originals, flags_map
|
|
||||||
|
|
||||||
|
|
||||||
def parse_bubbles(bubbles_file):
|
# ============================================================
|
||||||
with open(bubbles_file, "r", encoding="utf-8") as f:
|
# PARSE bubbles.json → bubble_boxes, quads_per_bubble
|
||||||
raw = json.load(f)
|
# ============================================================
|
||||||
return {int(k): v for k, v in raw.items()}
|
def parse_bubbles(filepath: str):
|
||||||
|
|
||||||
|
|
||||||
# ─────────────────────────────────────────────
|
|
||||||
# HELPERS
|
|
||||||
# ─────────────────────────────────────────────
|
|
||||||
def clamp(v, lo, hi):
    """Return `v` limited to the range [lo, hi]."""
    capped = min(hi, v)
    return max(lo, capped)
|
|
||||||
|
|
||||||
|
|
||||||
def xywh_to_xyxy(box):
    """
    Convert a mapping with "x", "y", "w", "h" keys into an (x1, y1, x2, y2) tuple.

    Missing keys default to 0 and every value is passed through int().
    A falsy `box` (None, empty mapping) yields None.
    """
    if not box:
        return None

    left, top, width, height = (int(box.get(key, 0)) for key in ("x", "y", "w", "h"))
    return (left, top, left + width, top + height)
|
|
||||||
|
|
||||||
|
|
||||||
def union_xyxy(boxes):
    """
    Return the bounding (x1, y1, x2, y2) box enclosing all non-None boxes.

    Returns None when there are no usable boxes, or when the resulting box
    has zero or negative width or height.
    """
    present = [box for box in boxes if box is not None]
    if not present:
        return None

    left = min(box[0] for box in present)
    top = min(box[1] for box in present)
    right = max(box[2] for box in present)
    bottom = max(box[3] for box in present)

    degenerate = right <= left or bottom <= top
    return None if degenerate else (left, top, right, bottom)
|
|
||||||
|
|
||||||
|
|
||||||
def bbox_from_mask(mask):
    """
    Return the (x1, y1, x2, y2) box around all elements of `mask` that are > 0.

    x2 and y2 are exclusive (max index + 1). Returns None when no element
    is positive.
    """
    rows, cols = np.nonzero(mask > 0)
    if cols.size == 0:
        return None

    left, right = int(cols.min()), int(cols.max()) + 1
    top, bottom = int(rows.min()), int(rows.max()) + 1
    return (left, top, right, bottom)
|
|
||||||
|
|
||||||
|
|
||||||
def normalize_text(s):
    """Upper-case and strip `s`, then delete every run of non-word characters."""
    return re.sub(r"[^\w]+", "", s.upper().strip())
|
|
||||||
|
|
||||||
|
|
||||||
def is_sfx_like(text):
    """
    Tell whether `text` looks like one of the known sound-effect spellings.

    The text is first passed through normalize_text(); the result must be at
    most 8 characters long and fully match one of SHA+, BIP+, BEEP+, HN+,
    AH+ or OH+.
    """
    normalized = normalize_text(text)
    if len(normalized) > 8:
        return False
    return re.fullmatch(r"(SHA+|BIP+|BEEP+|HN+|AH+|OH+)", normalized) is not None
|
|
||||||
|
|
||||||
|
|
||||||
# ─────────────────────────────────────────────
|
|
||||||
# FONT
|
|
||||||
# ─────────────────────────────────────────────
|
|
||||||
def load_font_from_candidates(candidates, size):
    """
    Return (font, path) for the first candidate path that exists and loads.

    Empty or non-existent paths are skipped, as are paths for which
    ImageFont.truetype raises. If nothing loads, Pillow's default font is
    returned together with the marker string "PIL_DEFAULT".
    """
    for font_file in candidates:
        if not font_file or not os.path.exists(font_file):
            continue
        try:
            loaded = ImageFont.truetype(font_file, size)
        except Exception:
            continue
        return loaded, font_file

    return ImageFont.load_default(), "PIL_DEFAULT"
|
|
||||||
|
|
||||||
|
|
||||||
def measure_text(draw, text, font):
    """Return (width, height) of `text` as reported by draw.textbbox at origin (0, 0)."""
    left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
    return right - left, bottom - top
|
|
||||||
|
|
||||||
|
|
||||||
def wrap_text(draw, text, font, max_width):
    """
    Greedily wrap `text` on whitespace so lines measure at most `max_width`.

    A word is appended to the current line while the measured line width
    stays within `max_width`; a word that starts a line is always accepted,
    even if it alone is wider than `max_width`.

    Returns (lines, total_height, widest_line). The total height is the sum
    of the measured line heights plus a gap between lines, where the gap is
    max(2, first_line_height // 5). Text without any words yields
    ([""], 0, 0).
    """
    wrapped = []
    pending = ""

    for word in text.split():
        candidate = f"{pending} {word}".strip()
        candidate_w = measure_text(draw, candidate, font)[0]
        if pending and candidate_w > max_width:
            # The line is full: flush it and start a new one with this word.
            wrapped.append(pending)
            pending = word
        else:
            pending = candidate

    if pending:
        wrapped.append(pending)

    if not wrapped:
        return [""], 0, 0

    sizes = [measure_text(draw, row, font) for row in wrapped]
    row_widths = [row_w for row_w, _ in sizes]
    row_heights = [row_h for _, row_h in sizes]

    spacing = max(2, row_heights[0] // 5)
    block_h = sum(row_heights) + spacing * (len(wrapped) - 1)
    return wrapped, block_h, max(row_widths)
|
|
||||||
|
|
||||||
|
|
||||||
def fit_font(draw, text, font_candidates, safe_w, safe_h):
    """
    Find the largest font size whose wrapped text fits in safe_w x safe_h.

    Sizes are tried from MAX_FONT_SIZE down to MIN_FONT_SIZE. The first size
    for which the wrapped block is no taller than `safe_h` and no wider than
    `safe_w` wins. If no size fits, the text is wrapped at MIN_FONT_SIZE and
    returned anyway.

    Returns (font, lines, total_height).
    """
    size = MAX_FONT_SIZE
    while size >= MIN_FONT_SIZE:
        font, _ = load_font_from_candidates(font_candidates, size)
        lines, block_h, widest = wrap_text(draw, text, font, safe_w)
        if block_h <= safe_h and widest <= safe_w:
            return font, lines, block_h
        size -= 1

    # Nothing fit: fall back to the smallest allowed size.
    font, _ = load_font_from_candidates(font_candidates, MIN_FONT_SIZE)
    lines, block_h, _ = wrap_text(draw, text, font, safe_w)
    return font, lines, block_h
|
|
||||||
|
|
||||||
|
|
||||||
def draw_text_with_stroke(draw, pos, text, font, fill, stroke_fill):
    """
    Draw `text` at `pos` with a hand-made outline.

    The text is first drawn in `stroke_fill` at every integer offset within
    the stroke radius (excluding the zero offset), then once in `fill` at the
    exact position. The radius is 2 when the measured text height is 11 or
    less, otherwise 1.
    """
    base_x, base_y = pos
    text_h = measure_text(draw, text, font)[1]
    radius = 1 if text_h > 11 else 2
    offsets = range(-radius, radius + 1)

    for off_x in offsets:
        for off_y in offsets:
            if (off_x, off_y) != (0, 0):
                draw.text((base_x + off_x, base_y + off_y), text, font=font, fill=stroke_fill)

    draw.text((base_x, base_y), text, font=font, fill=fill)
|
|
||||||
|
|
||||||
|
|
||||||
# ─────────────────────────────────────────────
|
|
||||||
# MASK BUILDERS
|
|
||||||
# ─────────────────────────────────────────────
|
|
||||||
def build_yellow_mask(bubble_data, img_h, img_w):
|
|
||||||
"""
|
"""
|
||||||
HARD GUARANTEE:
|
Returns:
|
||||||
Returned mask always covers all yellow squares (line_bboxes).
|
bubble_boxes : {bid: (x1, y1, x2, y2)}
|
||||||
|
quads_per_bubble : {bid: [ [[x,y],[x,y],[x,y],[x,y]], ... ]}
|
||||||
"""
|
"""
|
||||||
mask = np.zeros((img_h, img_w), dtype=np.uint8)
|
with open(filepath, "r", encoding="utf-8") as f:
|
||||||
|
data = json.load(f)
|
||||||
|
|
||||||
# Preferred: exact line boxes
|
bubble_boxes = {}
|
||||||
line_boxes = bubble_data.get("line_bboxes", [])
|
quads_per_bubble = {}
|
||||||
for lb in line_boxes:
|
|
||||||
b = xywh_to_xyxy(lb)
|
for key, val in data.items():
|
||||||
if not b:
|
bid = int(key)
|
||||||
|
|
||||||
|
x1 = val["x"]; y1 = val["y"]
|
||||||
|
x2 = x1 + val["w"]; y2 = y1 + val["h"]
|
||||||
|
bubble_boxes[bid] = (x1, y1, x2, y2)
|
||||||
|
|
||||||
|
quads_per_bubble[bid] = val.get("quads", [])
|
||||||
|
|
||||||
|
return bubble_boxes, quads_per_bubble
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# ERASE — white-fill every OCR quad (with small padding)
|
||||||
|
# ============================================================
|
||||||
|
def erase_quads(
|
||||||
|
image_bgr,
|
||||||
|
quads_per_bubble: Dict[int, List],
|
||||||
|
translations: Dict[int, str], # ← NEW: only erase what we'll render
|
||||||
|
skip_ids: Set[int],
|
||||||
|
pad: int = QUAD_PAD
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
White-fills OCR quads ONLY for bubbles that:
|
||||||
|
- have a translation in output.txt AND
|
||||||
|
- are NOT in SKIP_BUBBLE_IDS
|
||||||
|
Everything else is left completely untouched.
|
||||||
|
"""
|
||||||
|
ih, iw = image_bgr.shape[:2]
|
||||||
|
result = image_bgr.copy()
|
||||||
|
|
||||||
|
erased_count = 0
|
||||||
|
skipped_count = 0
|
||||||
|
|
||||||
|
for bid, quads in quads_per_bubble.items():
|
||||||
|
|
||||||
|
# ignore if explicitly skipped
|
||||||
|
if bid in skip_ids:
|
||||||
|
skipped_count += 1
|
||||||
continue
|
continue
|
||||||
x1, y1, x2, y2 = b
|
|
||||||
x1 -= YELLOW_BOX_PAD_X
|
|
||||||
y1 -= YELLOW_BOX_PAD_Y
|
|
||||||
x2 += YELLOW_BOX_PAD_X
|
|
||||||
y2 += YELLOW_BOX_PAD_Y
|
|
||||||
x1 = clamp(x1, 0, img_w - 1)
|
|
||||||
y1 = clamp(y1, 0, img_h - 1)
|
|
||||||
x2 = clamp(x2, 1, img_w)
|
|
||||||
y2 = clamp(y2, 1, img_h)
|
|
||||||
if x2 > x1 and y2 > y1:
|
|
||||||
cv2.rectangle(mask, (x1, y1), (x2, y2), 255, -1)
|
|
||||||
|
|
||||||
# If no line boxes available, use line_union fallback
|
# ignore if no translation exists (deleted from output.txt)
|
||||||
if np.count_nonzero(mask) == 0:
|
if bid not in translations:
|
||||||
ub = xywh_to_xyxy(bubble_data.get("line_union_bbox"))
|
skipped_count += 1
|
||||||
if ub:
|
continue
|
||||||
x1, y1, x2, y2 = ub
|
|
||||||
x1 -= YELLOW_UNION_PAD_X
|
|
||||||
y1 -= YELLOW_UNION_PAD_Y
|
|
||||||
x2 += YELLOW_UNION_PAD_X
|
|
||||||
y2 += YELLOW_UNION_PAD_Y
|
|
||||||
x1 = clamp(x1, 0, img_w - 1)
|
|
||||||
y1 = clamp(y1, 0, img_h - 1)
|
|
||||||
x2 = clamp(x2, 1, img_w)
|
|
||||||
y2 = clamp(y2, 1, img_h)
|
|
||||||
if x2 > x1 and y2 > y1:
|
|
||||||
cv2.rectangle(mask, (x1, y1), (x2, y2), 255, -1)
|
|
||||||
|
|
||||||
# Last fallback: text_bbox
|
for quad in quads:
|
||||||
if np.count_nonzero(mask) == 0:
|
pts = np.array(quad, dtype=np.int32)
|
||||||
tb = xywh_to_xyxy(bubble_data.get("text_bbox"))
|
cv2.fillPoly(result, [pts], (255, 255, 255))
|
||||||
if tb:
|
|
||||||
x1, y1, x2, y2 = tb
|
|
||||||
x1 -= YELLOW_UNION_PAD_X
|
|
||||||
y1 -= YELLOW_UNION_PAD_Y
|
|
||||||
x2 += YELLOW_UNION_PAD_X
|
|
||||||
y2 += YELLOW_UNION_PAD_Y
|
|
||||||
x1 = clamp(x1, 0, img_w - 1)
|
|
||||||
y1 = clamp(y1, 0, img_h - 1)
|
|
||||||
x2 = clamp(x2, 1, img_w)
|
|
||||||
y2 = clamp(y2, 1, img_h)
|
|
||||||
if x2 > x1 and y2 > y1:
|
|
||||||
cv2.rectangle(mask, (x1, y1), (x2, y2), 255, -1)
|
|
||||||
|
|
||||||
return mask
|
xs = [p[0] for p in quad]; ys = [p[1] for p in quad]
|
||||||
|
x1 = max(0, min(xs) - pad)
|
||||||
|
y1 = max(0, min(ys) - pad)
|
||||||
|
x2 = min(iw - 1, max(xs) + pad)
|
||||||
|
y2 = min(ih - 1, max(ys) + pad)
|
||||||
|
cv2.rectangle(result, (x1, y1), (x2, y2), (255, 255, 255), -1)
|
||||||
|
|
||||||
|
erased_count += 1
|
||||||
|
|
||||||
|
print(f" Erased : {erased_count} bubbles")
|
||||||
|
print(f" Ignored: {skipped_count} bubbles (no translation or in skip list)")
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
def bubble_interior_mask(img_bgr, bubble_data):
|
# ============================================================
|
||||||
|
# FONT SIZING + TEXT WRAP
|
||||||
|
# ============================================================
|
||||||
|
def fit_text(
|
||||||
|
text: str,
|
||||||
|
box_w: int,
|
||||||
|
box_h: int,
|
||||||
|
font_path: str,
|
||||||
|
max_size: int = FONT_SIZE,
|
||||||
|
min_size: int = MIN_FONT_SIZE
|
||||||
|
) -> Tuple[int, ImageFont.FreeTypeFont, List[str]]:
|
||||||
"""
|
"""
|
||||||
Optional helper to expand clean region safely; never used to shrink yellow coverage.
|
Returns (fitted_size, font, wrapped_lines) — largest size where
|
||||||
|
the text block fits inside box_w × box_h.
|
||||||
"""
|
"""
|
||||||
h, w = img_bgr.shape[:2]
|
for size in range(max_size, min_size - 1, -1):
|
||||||
|
font = load_font(font_path, size) if font_path else None
|
||||||
|
if font is None:
|
||||||
|
return min_size, ImageFont.load_default(), [text]
|
||||||
|
|
||||||
panel = xywh_to_xyxy(bubble_data.get("panel_bbox"))
|
chars_per_line = max(1, int(box_w / (size * 0.62)))
|
||||||
if panel is None:
|
wrapped = textwrap.fill(text, width=chars_per_line)
|
||||||
panel = (0, 0, w, h)
|
lines = wrapped.split("\n")
|
||||||
px1, py1, px2, py2 = panel
|
total_h = (size + 8) * len(lines)
|
||||||
|
|
||||||
seed = bubble_data.get("seed_point", {})
|
if total_h <= box_h - 8:
|
||||||
sx = int(seed.get("x", bubble_data.get("x", 0) + bubble_data.get("w", 1) // 2))
|
return size, font, lines
|
||||||
sy = int(seed.get("y", bubble_data.get("y", 0) + bubble_data.get("h", 1) // 2))
|
|
||||||
sx = clamp(sx, 1, w - 2)
|
|
||||||
sy = clamp(sy, 1, h - 2)
|
|
||||||
|
|
||||||
gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
|
# Nothing fit — use minimum size
|
||||||
_, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)
|
font = load_font(font_path, min_size) if font_path else None
|
||||||
|
if font is None:
|
||||||
|
font = ImageFont.load_default()
|
||||||
|
chars_per_line = max(1, int(box_w / (min_size * 0.62)))
|
||||||
|
lines = textwrap.fill(text, width=chars_per_line).split("\n")
|
||||||
|
return min_size, font, lines
|
||||||
|
|
||||||
panel_bin = np.zeros_like(binary)
|
|
||||||
panel_bin[py1:py2, px1:px2] = binary[py1:py2, px1:px2]
|
|
||||||
|
|
||||||
# if seed on dark pixel, search nearby white
|
# ============================================================
|
||||||
if gray[sy, sx] < 150:
|
# COLOR HELPERS
|
||||||
found = False
|
# ============================================================
|
||||||
search_r = max(2, min(bubble_data.get("w", 20), bubble_data.get("h", 20)) // 3)
|
def sample_bg_color(
|
||||||
for r in range(1, search_r + 1):
|
image_bgr,
|
||||||
for dy in range(-r, r + 1):
|
x1: int, y1: int,
|
||||||
for dx in range(-r, r + 1):
|
x2: int, y2: int
|
||||||
nx, ny = sx + dx, sy + dy
|
) -> Tuple[int, int, int]:
|
||||||
if px1 <= nx < px2 and py1 <= ny < py2 and gray[ny, nx] >= 200:
|
"""Sample four corners of a bubble to estimate background color (R, G, B)."""
|
||||||
sx, sy = nx, ny
|
ih, iw = image_bgr.shape[:2]
|
||||||
found = True
|
samples = []
|
||||||
break
|
for sx, sy in [(x1+4, y1+4), (x2-4, y1+4), (x1+4, y2-4), (x2-4, y2-4)]:
|
||||||
if found:
|
sx = max(0, min(iw-1, sx)); sy = max(0, min(ih-1, sy))
|
||||||
break
|
b, g, r = image_bgr[sy, sx]
|
||||||
if found:
|
samples.append((int(r), int(g), int(b)))
|
||||||
break
|
return (
|
||||||
|
int(np.median([s[0] for s in samples])),
|
||||||
if not found:
|
int(np.median([s[1] for s in samples])),
|
||||||
m = np.zeros((h, w), dtype=np.uint8)
|
int(np.median([s[2] for s in samples])),
|
||||||
bx = bubble_data.get("x", 0)
|
|
||||||
by = bubble_data.get("y", 0)
|
|
||||||
bw = bubble_data.get("w", 20)
|
|
||||||
bh = bubble_data.get("h", 20)
|
|
||||||
cv2.ellipse(m, (bx + bw // 2, by + bh // 2), (max(4, bw // 2), max(4, bh // 2)), 0, 0, 360, 255, -1)
|
|
||||||
return m
|
|
||||||
|
|
||||||
ff_mask = np.zeros((h + 2, w + 2), dtype=np.uint8)
|
|
||||||
flood = panel_bin.copy()
|
|
||||||
cv2.floodFill(
|
|
||||||
flood, ff_mask, (sx, sy), 255,
|
|
||||||
loDiff=FLOOD_TOL, upDiff=FLOOD_TOL,
|
|
||||||
flags=cv2.FLOODFILL_FIXED_RANGE
|
|
||||||
)
|
)
|
||||||
|
|
||||||
m = (ff_mask[1:-1, 1:-1] * 255).astype(np.uint8)
|
|
||||||
m = cv2.morphologyEx(m, cv2.MORPH_CLOSE, np.ones((3, 3), np.uint8), iterations=1)
|
def pick_fg_color(bg: Tuple[int, int, int]) -> Tuple[int, int, int]:
|
||||||
return m
|
lum = 0.299 * bg[0] + 0.587 * bg[1] + 0.114 * bg[2]
|
||||||
|
return (0, 0, 0) if lum > 128 else (255, 255, 255)
|
||||||
|
|
||||||
|
|
||||||
def build_clean_mask(img_bgr, bubble_data):
|
def safe_textbbox(
|
||||||
"""
|
draw, pos, text, font
|
||||||
FINAL RULE:
|
) -> Tuple[int, int, int, int]:
|
||||||
clean_mask MUST cover yellow_mask completely.
|
try:
|
||||||
"""
|
return draw.textbbox(pos, text, font=font)
|
||||||
h, w = img_bgr.shape[:2]
|
except Exception:
|
||||||
yellow = build_yellow_mask(bubble_data, h, w)
|
size = getattr(font, "size", 12)
|
||||||
|
return (
|
||||||
# start with guaranteed yellow
|
pos[0], pos[1],
|
||||||
clean = yellow.copy()
|
pos[0] + int(len(text) * size * 0.6),
|
||||||
|
pos[1] + int(size * 1.2)
|
||||||
if ENABLE_EXTRA_CLEAN:
|
)
|
||||||
bubble_m = bubble_interior_mask(img_bgr, bubble_data)
|
|
||||||
extra = cv2.dilate(yellow, np.ones((3, 3), np.uint8), iterations=EXTRA_DILATE_ITERS)
|
|
||||||
extra = cv2.morphologyEx(extra, cv2.MORPH_CLOSE, np.ones((3, 3), np.uint8), iterations=EXTRA_CLOSE_ITERS)
|
|
||||||
extra = cv2.bitwise_and(extra, bubble_m)
|
|
||||||
|
|
||||||
# IMPORTANT: union with yellow (never subtract yellow)
|
|
||||||
clean = cv2.bitwise_or(yellow, extra)
|
|
||||||
|
|
||||||
# final guarantee (defensive)
|
|
||||||
clean = cv2.bitwise_or(clean, yellow)
|
|
||||||
|
|
||||||
return clean, yellow
|
|
||||||
|
|
||||||
|
|
||||||
# ─────────────────────────────────────────────
|
# ============================================================
|
||||||
# DRAW BUBBLE
|
# RENDER
|
||||||
# ─────────────────────────────────────────────
|
# ============================================================
|
||||||
def draw_bubble(
    pil_img,
    img_bgr_ref,
    bubble_data,
    original_text,
    translated_text,
    font_candidates,
    font_color,
    stroke_color
):
    """
    Clean one bubble on `pil_img` and draw its translated text.

    Steps, in order:
      1. If original and translation normalize to the same string and the
         original is SFX-like, do nothing.
      2. Build the clean/yellow masks and white-fill the clean mask.
      3. If ENABLE_EDGE_RESTORE is set, copy original pixels back along the
         bubble-interior edge, except where the yellow mask is set.
      4. Paste the cleaned pixels into `pil_img` (modified in place).
      5. Fit, wrap and draw the translated text centered on the bounding box
         of the yellow mask (or of the clean mask if the yellow one is empty).

    Returns a status string: "skip_sfx", "skip_no_area", "clean_only"
    (cleaned, but no translated text to draw) or "rendered".
    """
    if (
        original_text
        and translated_text
        and normalize_text(original_text) == normalize_text(translated_text)
        and is_sfx_like(original_text)
    ):
        return "skip_sfx"

    rgb = np.array(pil_img)
    h, w = rgb.shape[:2]

    clean_mask, yellow_mask = build_clean_mask(img_bgr_ref, bubble_data)
    if np.count_nonzero(clean_mask) == 0:
        return "skip_no_area"

    # 1) Force a white fill over the whole clean mask.
    rgb[clean_mask == 255] = [255, 255, 255]

    # 2) Optional edge restore; never touches pixels covered by the yellow mask.
    if ENABLE_EDGE_RESTORE:
        kernel = np.ones((3, 3), np.uint8)
        interior = bubble_interior_mask(img_bgr_ref, bubble_data)
        border = cv2.morphologyEx(interior, cv2.MORPH_GRADIENT, kernel)
        border = cv2.dilate(border, np.ones((3, 3), np.uint8), iterations=EDGE_RESTORE_DILATE)
        border[yellow_mask == 255] = 0

        source_rgb = cv2.cvtColor(img_bgr_ref, cv2.COLOR_BGR2RGB)
        rgb[border == 255] = source_rgb[border == 255]

    pil_img.paste(Image.fromarray(rgb))

    if not translated_text:
        return "clean_only"

    # The text region is taken from the yellow area, falling back to the clean area.
    region = bbox_from_mask(yellow_mask)
    if region is None:
        region = bbox_from_mask(clean_mask)
    if region is None:
        return "skip_no_area"

    left, top, right, bottom = region

    draw = ImageDraw.Draw(pil_img)
    center_x = int((left + right) / 2)
    center_y = int((top + bottom) / 2)
    safe_w = max(16, int((right - left) * TEXT_INSET))
    safe_h = max(16, int((bottom - top) * TEXT_INSET))

    font, lines, total_h = fit_font(draw, translated_text, font_candidates, safe_w, safe_h)

    cursor_y = int(round(center_y - total_h / 2.0))
    for row in lines:
        row_w, row_h = measure_text(draw, row, font)
        row_x = center_x - row_w // 2
        draw_text_with_stroke(draw, (row_x, cursor_y), row, font, fill=font_color, stroke_fill=stroke_color)
        cursor_y += row_h + max(row_h // 5, 2)

    return "rendered"
|
|
||||||
|
|
||||||
|
|
||||||
# ─────────────────────────────────────────────
|
|
||||||
# MAIN
|
|
||||||
# ─────────────────────────────────────────────
|
|
||||||
def render_translations(
|
def render_translations(
|
||||||
input_image,
|
image_bgr,
|
||||||
output_image,
|
bubble_boxes: Dict[int, Tuple],
|
||||||
translations_file,
|
translations: Dict[int, str],
|
||||||
bubbles_file,
|
skip_ids: Set[int],
|
||||||
font_candidates=DEFAULT_FONT_CANDIDATES,
|
font_path: str,
|
||||||
font_color=DEFAULT_FONT_COLOR,
|
font_size: int = FONT_SIZE,
|
||||||
stroke_color=DEFAULT_STROKE_COLOR
|
bold_outline: bool = True,
|
||||||
|
auto_color: bool = True,
|
||||||
|
output_path: str = OUTPUT_PATH
|
||||||
):
|
):
|
||||||
img_bgr = cv2.imread(input_image)
|
image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
|
||||||
if img_bgr is None:
|
pil_img = Image.fromarray(image_rgb)
|
||||||
raise FileNotFoundError(f"Cannot load image: {input_image}")
|
draw = ImageDraw.Draw(pil_img)
|
||||||
|
|
||||||
img_pil = Image.fromarray(cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB))
|
rendered = 0
|
||||||
|
skipped = 0
|
||||||
|
missing = 0
|
||||||
|
|
||||||
translations, originals, flags_map = parse_translations(translations_file)
|
for bid, (x1, y1, x2, y2) in sorted(bubble_boxes.items()):
|
||||||
bubbles = parse_bubbles(bubbles_file)
|
|
||||||
|
|
||||||
rendered, skipped = 0, 0
|
# ── skip list check ────────────────────────────────────────
|
||||||
|
if bid in skip_ids:
|
||||||
def sort_key(item):
|
print(f" ⏭️ Bubble #{bid:<3} — skipped (in SKIP_BUBBLE_IDS)")
|
||||||
bid, _ = item
|
|
||||||
b = bubbles.get(bid, {})
|
|
||||||
return int(b.get("reading_order", bid))
|
|
||||||
|
|
||||||
for bubble_id, translated_text in sorted(translations.items(), key=sort_key):
|
|
||||||
if bubble_id not in bubbles:
|
|
||||||
skipped += 1
|
skipped += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
bubble_data = bubbles[bubble_id]
|
text = translations.get(bid, "").strip()
|
||||||
original_text = originals.get(bubble_id, "")
|
if not text:
|
||||||
|
print(f" ⚠️ Bubble #{bid:<3} — no translation found, left blank")
|
||||||
|
missing += 1
|
||||||
|
continue
|
||||||
|
|
||||||
status = draw_bubble(
|
box_w = x2 - x1
|
||||||
pil_img=img_pil,
|
box_h = y2 - y1
|
||||||
img_bgr_ref=img_bgr,
|
if box_w < 10 or box_h < 10:
|
||||||
bubble_data=bubble_data,
|
continue
|
||||||
original_text=original_text,
|
|
||||||
translated_text=translated_text,
|
# ── fit font + wrap ────────────────────────────────────────
|
||||||
font_candidates=font_candidates,
|
size, font, lines = fit_text(
|
||||||
font_color=font_color,
|
text, box_w, box_h, font_path, max_size=font_size
|
||||||
stroke_color=stroke_color
|
|
||||||
)
|
)
|
||||||
|
|
||||||
if status.startswith("skip"):
|
# ── colors ─────────────────────────────────────────────────
|
||||||
skipped += 1
|
if auto_color:
|
||||||
|
bg = sample_bg_color(image_bgr, x1, y1, x2, y2)
|
||||||
|
fg = pick_fg_color(bg)
|
||||||
|
ol = (255, 255, 255) if fg == (0, 0, 0) else (0, 0, 0)
|
||||||
else:
|
else:
|
||||||
|
fg, ol = (0, 0, 0), (255, 255, 255)
|
||||||
|
|
||||||
|
# ── vertical center ────────────────────────────────────────
|
||||||
|
line_h = size + 8
|
||||||
|
total_h = line_h * len(lines)
|
||||||
|
y_cur = y1 + max(4, (box_h - total_h) // 2)
|
||||||
|
|
||||||
|
for line in lines:
|
||||||
|
bb = safe_textbbox(draw, (0, 0), line, font)
|
||||||
|
line_w = bb[2] - bb[0]
|
||||||
|
x_cur = x1 + max(2, (box_w - line_w) // 2)
|
||||||
|
|
||||||
|
if bold_outline:
|
||||||
|
for dx, dy in [(-1, 0), (1, 0), (0, -1), (0, 1)]:
|
||||||
|
try:
|
||||||
|
draw.text((x_cur + dx, y_cur + dy), line, font=font, fill=ol)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
draw.text((x_cur, y_cur), line, font=font, fill=fg)
|
||||||
|
except Exception as e:
|
||||||
|
print(f" ❌ Draw error bubble #{bid}: {e}")
|
||||||
|
|
||||||
|
y_cur += line_h
|
||||||
|
|
||||||
|
print(f" ✅ Bubble #{bid:<3} — rendered ({len(lines)} lines, size {size}px)")
|
||||||
rendered += 1
|
rendered += 1
|
||||||
|
|
||||||
out_bgr = cv2.cvtColor(np.array(img_pil), cv2.COLOR_RGB2BGR)
|
pil_img.save(output_path)
|
||||||
cv2.imwrite(output_image, out_bgr)
|
|
||||||
|
|
||||||
print(f"✅ Done — {rendered} rendered, {skipped} skipped.")
|
print()
|
||||||
print(f"📄 Output → {output_image}")
|
print(f"{'─'*50}")
|
||||||
print("Guarantee: full yellow-square area is always white-cleaned before drawing text.")
|
print(f" Rendered : {rendered}")
|
||||||
|
print(f" Skipped : {skipped} (SKIP_BUBBLE_IDS)")
|
||||||
|
print(f" No text : {missing} (not in output.txt)")
|
||||||
|
print(f"{'─'*50}")
|
||||||
|
print(f"✅ Saved → {output_path}")
|
||||||
|
|
||||||
|
return pil_img
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# MAIN
|
||||||
|
# ============================================================
|
||||||
|
def main():
|
||||||
|
print(f"📖 Loading image : {IMAGE_PATH}")
|
||||||
|
image = cv2.imread(IMAGE_PATH)
|
||||||
|
if image is None:
|
||||||
|
print(f"❌ Cannot load: {IMAGE_PATH}"); return
|
||||||
|
|
||||||
|
print(f"📦 Loading bubbles : {BUBBLES_PATH}")
|
||||||
|
bubble_boxes, quads_per_bubble = parse_bubbles(BUBBLES_PATH)
|
||||||
|
print(f" {len(bubble_boxes)} bubbles | "
|
||||||
|
f"{sum(len(v) for v in quads_per_bubble.values())} quads total")
|
||||||
|
|
||||||
|
print(f"🌐 Loading translations : {TRANSLATIONS_PATH}")
|
||||||
|
translations = parse_translations(TRANSLATIONS_PATH)
|
||||||
|
print(f" {len(translations)} translations found")
|
||||||
|
|
||||||
|
if SKIP_BUBBLE_IDS:
|
||||||
|
print(f"⏭️ Skip list : bubbles {sorted(SKIP_BUBBLE_IDS)}")
|
||||||
|
else:
|
||||||
|
print(f"⏭️ Skip list : (empty — all bubbles will be rendered)")
|
||||||
|
|
||||||
|
print("🔤 Resolving font...")
|
||||||
|
font_path, _ = resolve_font()
|
||||||
|
|
||||||
|
print(f"🧹 Erasing original text (quad fill + pad={QUAD_PAD}px)...")
|
||||||
|
clean_image = erase_quads(
|
||||||
|
image,
|
||||||
|
quads_per_bubble,
|
||||||
|
translations = translations, # ← pass translations here
|
||||||
|
skip_ids = SKIP_BUBBLE_IDS,
|
||||||
|
pad = QUAD_PAD
|
||||||
|
)
|
||||||
|
|
||||||
|
print("✍️ Rendering translated text...")
|
||||||
|
render_translations(
|
||||||
|
image_bgr = clean_image,
|
||||||
|
bubble_boxes = bubble_boxes,
|
||||||
|
translations = translations,
|
||||||
|
skip_ids = SKIP_BUBBLE_IDS,
|
||||||
|
font_path = font_path,
|
||||||
|
font_size = FONT_SIZE,
|
||||||
|
bold_outline = True,
|
||||||
|
auto_color = True,
|
||||||
|
output_path = OUTPUT_PATH
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
render_translations(
|
main()
|
||||||
input_image="001-page.png",
|
|
||||||
output_image="page_translated.png",
|
|
||||||
translations_file="output.txt",
|
|
||||||
bubbles_file="bubbles.json",
|
|
||||||
font_candidates=DEFAULT_FONT_CANDIDATES,
|
|
||||||
font_color=DEFAULT_FONT_COLOR,
|
|
||||||
stroke_color=DEFAULT_STROKE_COLOR
|
|
||||||
)
|
|
||||||
|
|||||||
@@ -6,13 +6,17 @@ import re
|
|||||||
import json
|
import json
|
||||||
import cv2
|
import cv2
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import warnings
|
||||||
|
from typing import List, Tuple, Dict, Any, Optional
|
||||||
|
|
||||||
from deep_translator import GoogleTranslator
|
from deep_translator import GoogleTranslator
|
||||||
|
|
||||||
# OCR engines
|
# macOS Native Vision imports
|
||||||
import easyocr
|
import Vision
|
||||||
from paddleocr import PaddleOCR
|
import Quartz
|
||||||
|
from Foundation import NSData
|
||||||
|
|
||||||
|
warnings.filterwarnings("ignore", category=UserWarning)
|
||||||
|
|
||||||
# ============================================================
|
# ============================================================
|
||||||
# CONFIG
|
# CONFIG
|
||||||
@@ -26,7 +30,7 @@ GLOSSARY = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
SOUND_EFFECT_PATTERNS = [
|
SOUND_EFFECT_PATTERNS = [
|
||||||
r"^b+i+p+$", r"^sha+$", r"^ha+$", r"^ah+$", r"^oh+$",
|
r"^b+i+p+$", r"^sha+$", r"^ha+$", r"^ah+$",
|
||||||
r"^ugh+$", r"^bam+$", r"^pow+$", r"^boom+$", r"^bang+$",
|
r"^ugh+$", r"^bam+$", r"^pow+$", r"^boom+$", r"^bang+$",
|
||||||
r"^crash+$", r"^thud+$", r"^zip+$", r"^swoosh+$", r"^chirp+$"
|
r"^crash+$", r"^thud+$", r"^zip+$", r"^swoosh+$", r"^chirp+$"
|
||||||
]
|
]
|
||||||
@@ -47,13 +51,13 @@ TOP_BAND_RATIO = 0.08
|
|||||||
|
|
||||||
|
|
||||||
# ============================================================
|
# ============================================================
|
||||||
# TEXT HELPERS
|
# HELPERS
|
||||||
# ============================================================
|
# ============================================================
|
||||||
def normalize_text(text: str) -> str:
|
def normalize_text(text: str) -> str:
|
||||||
t = (text or "").strip().upper()
|
t = (text or "").strip().upper()
|
||||||
t = t.replace("“", "\"").replace("”", "\"")
|
t = t.replace("\u201c", "\"").replace("\u201d", "\"")
|
||||||
t = t.replace("’", "'").replace("‘", "'")
|
t = t.replace("\u2018", "'").replace("\u2019", "'")
|
||||||
t = t.replace("…", "...")
|
t = t.replace("\u2026", "...")
|
||||||
t = re.sub(r"\s+", " ", t)
|
t = re.sub(r"\s+", " ", t)
|
||||||
t = re.sub(r"\s+([,.;:!?])", r"\1", t)
|
t = re.sub(r"\s+([,.;:!?])", r"\1", t)
|
||||||
t = re.sub(r"([¡¿])\s+", r"\1", t)
|
t = re.sub(r"([¡¿])\s+", r"\1", t)
|
||||||
@@ -88,24 +92,35 @@ def is_title_text(text: str) -> bool:
|
|||||||
return any(re.fullmatch(p, t, re.IGNORECASE) for p in TITLE_PATTERNS)
|
return any(re.fullmatch(p, t, re.IGNORECASE) for p in TITLE_PATTERNS)
|
||||||
|
|
||||||
|
|
||||||
|
def looks_like_box_tag(t: str) -> bool:
|
||||||
|
s = re.sub(r"[^A-Z0-9#]", "", (t or "").upper())
|
||||||
|
if re.fullmatch(r"[BEF]?[O0D]X#?\d{0,3}", s):
|
||||||
|
return True
|
||||||
|
if re.fullmatch(r"B[O0D]X\d{0,3}", s):
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
def is_noise_text(text: str) -> bool:
|
def is_noise_text(text: str) -> bool:
|
||||||
t = (text or "").strip()
|
t = (text or "").strip()
|
||||||
|
|
||||||
|
# Explicitly allow standalone punctuation like ? or !
|
||||||
|
if re.fullmatch(r"[\?\!]+", t):
|
||||||
|
return False
|
||||||
|
|
||||||
if any(re.fullmatch(p, t) for p in NOISE_PATTERNS):
|
if any(re.fullmatch(p, t) for p in NOISE_PATTERNS):
|
||||||
return True
|
return True
|
||||||
|
if looks_like_box_tag(t):
|
||||||
if len(t) <= 2 and not re.search(r"[A-Z0-9]", t):
|
return True
|
||||||
|
if len(t) <= 2 and not re.search(r"[A-Z0-9\?\!]", t):
|
||||||
return True
|
return True
|
||||||
|
|
||||||
symbol_ratio = sum(1 for c in t if not c.isalnum() and not c.isspace()) / max(1, len(t))
|
symbol_ratio = sum(1 for c in t if not c.isalnum() and not c.isspace()) / max(1, len(t))
|
||||||
if len(t) <= 6 and symbol_ratio > 0.60:
|
if len(t) <= 6 and symbol_ratio > 0.60:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
# ============================================================
|
|
||||||
# GEOMETRY HELPERS
|
|
||||||
# ============================================================
|
|
||||||
def quad_bbox(quad):
|
def quad_bbox(quad):
|
||||||
xs = [p[0] for p in quad]
|
xs = [p[0] for p in quad]
|
||||||
ys = [p[1] for p in quad]
|
ys = [p[1] for p in quad]
|
||||||
@@ -150,9 +165,6 @@ def overlap_or_near(a, b, gap=0):
|
|||||||
return gap_x <= gap and gap_y <= gap
|
return gap_x <= gap and gap_y <= gap
|
||||||
|
|
||||||
|
|
||||||
# ============================================================
|
|
||||||
# QUALITY
|
|
||||||
# ============================================================
|
|
||||||
def ocr_candidate_score(text: str) -> float:
|
def ocr_candidate_score(text: str) -> float:
|
||||||
if not text:
|
if not text:
|
||||||
return 0.0
|
return 0.0
|
||||||
@@ -179,204 +191,98 @@ def ocr_candidate_score(text: str) -> float:
|
|||||||
|
|
||||||
|
|
||||||
# ============================================================
|
# ============================================================
|
||||||
# OCR ENGINE WRAPPER (PADDLE + EASYOCR HYBRID)
|
# OCR ENGINES (Apple Native Vision)
|
||||||
# ============================================================
|
# ============================================================
|
||||||
class HybridOCR:
|
class MacVisionDetector:
|
||||||
def __init__(self, source_lang="en", use_gpu=False):
|
def __init__(self, source_lang="en"):
|
||||||
self.source_lang = source_lang
|
lang_map = {"en": "en-US", "es": "es-ES", "ca": "ca-ES", "fr": "fr-FR", "ja": "ja-JP"}
|
||||||
|
apple_lang = lang_map.get(source_lang, "en-US")
|
||||||
|
self.langs = [apple_lang]
|
||||||
|
print(f"⚡ Using Apple Vision OCR (Language: {self.langs})")
|
||||||
|
|
||||||
# Paddle language choice (single lang for Paddle)
|
def read(self, image_path_or_array):
|
||||||
# For manga EN/ES pages, latin model is robust.
|
if isinstance(image_path_or_array, str):
|
||||||
if source_lang in ("en", "es", "ca", "fr", "de", "it", "pt"):
|
img = cv2.imread(image_path_or_array)
|
||||||
paddle_lang = "latin"
|
|
||||||
elif source_lang in ("ja",):
|
|
||||||
paddle_lang = "japan"
|
|
||||||
elif source_lang in ("ko",):
|
|
||||||
paddle_lang = "korean"
|
|
||||||
elif source_lang in ("ch", "zh", "zh-cn", "zh-tw"):
|
|
||||||
paddle_lang = "ch"
|
|
||||||
else:
|
else:
|
||||||
paddle_lang = "latin"
|
img = image_path_or_array
|
||||||
|
|
||||||
# EasyOCR language list
|
if img is None or img.size == 0:
|
||||||
if source_lang == "ca":
|
return []
|
||||||
easy_langs = ["es", "en"]
|
|
||||||
elif source_lang == "en":
|
|
||||||
easy_langs = ["en", "es"]
|
|
||||||
elif source_lang == "es":
|
|
||||||
easy_langs = ["es", "en"]
|
|
||||||
else:
|
|
||||||
easy_langs = [source_lang]
|
|
||||||
|
|
||||||
self.paddle = PaddleOCR(
|
ih, iw = img.shape[:2]
|
||||||
use_angle_cls=True,
|
|
||||||
lang=paddle_lang,
|
|
||||||
use_gpu=use_gpu,
|
|
||||||
show_log=False
|
|
||||||
)
|
|
||||||
self.easy = easyocr.Reader(easy_langs, gpu=use_gpu)
|
|
||||||
|
|
||||||
@staticmethod
|
success, buffer = cv2.imencode('.png', img)
|
||||||
def _paddle_to_std(result):
|
if not success:
|
||||||
"""
|
return []
|
||||||
Convert Paddle result to Easy-like:
|
|
||||||
[ (quad, text, conf), ... ]
|
|
||||||
"""
|
|
||||||
out = []
|
|
||||||
# paddle.ocr(...) returns list per image
|
|
||||||
# each item line: [ [ [x,y],...4pts ], (text, conf) ]
|
|
||||||
if not result:
|
|
||||||
return out
|
|
||||||
# result can be [None] or nested list
|
|
||||||
blocks = result if isinstance(result, list) else [result]
|
|
||||||
for blk in blocks:
|
|
||||||
if blk is None:
|
|
||||||
continue
|
|
||||||
if len(blk) == 0:
|
|
||||||
continue
|
|
||||||
# some versions wrap once more
|
|
||||||
if isinstance(blk[0], list) and len(blk[0]) > 0 and isinstance(blk[0][0], (list, tuple)) and len(blk[0]) == 2:
|
|
||||||
lines = blk
|
|
||||||
elif isinstance(blk[0], (list, tuple)) and len(blk[0]) >= 2:
|
|
||||||
lines = blk
|
|
||||||
else:
|
|
||||||
# maybe nested once more
|
|
||||||
if len(blk) == 1 and isinstance(blk[0], list):
|
|
||||||
lines = blk[0]
|
|
||||||
else:
|
|
||||||
lines = []
|
|
||||||
|
|
||||||
for ln in lines:
|
ns_data = NSData.dataWithBytes_length_(buffer.tobytes(), len(buffer.tobytes()))
|
||||||
try:
|
handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(ns_data, None)
|
||||||
pts, rec = ln
|
results = []
|
||||||
txt, conf = rec[0], float(rec[1])
|
|
||||||
quad = [[float(p[0]), float(p[1])] for p in pts]
|
|
||||||
out.append((quad, txt, conf))
|
|
||||||
except Exception:
|
|
||||||
continue
|
|
||||||
return out
|
|
||||||
|
|
||||||
def read_full_image(self, image_path):
|
def completion_handler(request, error):
|
||||||
"""
|
if error:
|
||||||
Primary: Paddle
|
print(f"Vision API Error: {error}")
|
||||||
Fallback merge: EasyOCR
|
return
|
||||||
Returns merged standardized detections.
|
|
||||||
"""
|
|
||||||
# Paddle
|
|
||||||
pr = self.paddle.ocr(image_path, cls=True)
|
|
||||||
paddle_det = self._paddle_to_std(pr)
|
|
||||||
|
|
||||||
# Easy
|
for observation in request.results():
|
||||||
easy_det = self.easy.readtext(image_path, paragraph=False)
|
candidate = observation.topCandidates_(1)[0]
|
||||||
|
text = candidate.string()
|
||||||
|
confidence = candidate.confidence()
|
||||||
|
|
||||||
# Merge by IOU/text proximity
|
bbox = observation.boundingBox()
|
||||||
merged = list(paddle_det)
|
x = bbox.origin.x * iw
|
||||||
for eb in easy_det:
|
y_bottom_left = bbox.origin.y * ih
|
||||||
eq, et, ec = eb
|
w = bbox.size.width * iw
|
||||||
ebox = quad_bbox(eq)
|
h = bbox.size.height * ih
|
||||||
keep = True
|
|
||||||
for pb in paddle_det:
|
|
||||||
pq, pt, pc = pb
|
|
||||||
pbox = quad_bbox(pq)
|
|
||||||
|
|
||||||
ix1 = max(ebox[0], pbox[0]); iy1 = max(ebox[1], pbox[1])
|
y = ih - y_bottom_left - h
|
||||||
ix2 = min(ebox[2], pbox[2]); iy2 = min(ebox[3], pbox[3])
|
|
||||||
inter = max(0, ix2 - ix1) * max(0, iy2 - iy1)
|
|
||||||
a1 = max(1, (ebox[2] - ebox[0]) * (ebox[3] - ebox[1]))
|
|
||||||
a2 = max(1, (pbox[2] - pbox[0]) * (pbox[3] - pbox[1]))
|
|
||||||
iou = inter / float(a1 + a2 - inter) if (a1 + a2 - inter) > 0 else 0.0
|
|
||||||
|
|
||||||
if iou > 0.55:
|
quad = [
|
||||||
# if overlapped and paddle exists, keep paddle unless easy much higher conf
|
[int(x), int(y)],
|
||||||
if float(ec) > float(pc) + 0.20:
|
[int(x + w), int(y)],
|
||||||
# replace paddle with easy-like entry
|
[int(x + w), int(y + h)],
|
||||||
try:
|
[int(x), int(y + h)]
|
||||||
merged.remove(pb)
|
]
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
merged.append((eq, et, float(ec)))
|
|
||||||
keep = False
|
|
||||||
break
|
|
||||||
|
|
||||||
if keep:
|
results.append((quad, text, confidence))
|
||||||
merged.append((eq, et, float(ec)))
|
|
||||||
|
|
||||||
return merged
|
request = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(completion_handler)
|
||||||
|
request.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate)
|
||||||
|
request.setUsesLanguageCorrection_(True)
|
||||||
|
request.setRecognitionLanguages_(self.langs)
|
||||||
|
|
||||||
def read_array_with_both(self, arr_gray_or_bgr):
|
handler.performRequests_error_([request], None)
|
||||||
"""
|
|
||||||
OCR from array (used in robust reread pass).
|
|
||||||
Returns merged detections in standardized format.
|
|
||||||
"""
|
|
||||||
tmp = "_tmp_ocr_hybrid.png"
|
|
||||||
cv2.imwrite(tmp, arr_gray_or_bgr)
|
|
||||||
try:
|
|
||||||
pr = self.paddle.ocr(tmp, cls=True)
|
|
||||||
paddle_det = self._paddle_to_std(pr)
|
|
||||||
easy_det = self.easy.readtext(tmp, paragraph=False)
|
|
||||||
|
|
||||||
merged = list(paddle_det)
|
return results
|
||||||
|
|
||||||
for eb in easy_det:
|
|
||||||
eq, et, ec = eb
|
|
||||||
ebox = quad_bbox(eq)
|
|
||||||
keep = True
|
|
||||||
for pb in paddle_det:
|
|
||||||
pq, pt, pc = pb
|
|
||||||
pbox = quad_bbox(pq)
|
|
||||||
|
|
||||||
ix1 = max(ebox[0], pbox[0]); iy1 = max(ebox[1], pbox[1])
|
|
||||||
ix2 = min(ebox[2], pbox[2]); iy2 = min(ebox[3], pbox[3])
|
|
||||||
inter = max(0, ix2 - ix1) * max(0, iy2 - iy1)
|
|
||||||
a1 = max(1, (ebox[2] - ebox[0]) * (ebox[3] - ebox[1]))
|
|
||||||
a2 = max(1, (pbox[2] - pbox[0]) * (pbox[3] - pbox[1]))
|
|
||||||
iou = inter / float(a1 + a2 - inter) if (a1 + a2 - inter) > 0 else 0.0
|
|
||||||
|
|
||||||
if iou > 0.55:
|
|
||||||
if float(ec) > float(pc) + 0.20:
|
|
||||||
try:
|
|
||||||
merged.remove(pb)
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
merged.append((eq, et, float(ec)))
|
|
||||||
keep = False
|
|
||||||
break
|
|
||||||
|
|
||||||
if keep:
|
|
||||||
merged.append((eq, et, float(ec)))
|
|
||||||
|
|
||||||
return merged
|
|
||||||
finally:
|
|
||||||
if os.path.exists(tmp):
|
|
||||||
os.remove(tmp)
|
|
||||||
|
|
||||||
|
|
||||||
# ============================================================
|
# ============================================================
|
||||||
# PREPROCESS + ROBUST REREAD
|
# PREPROCESS
|
||||||
# ============================================================
|
# ============================================================
|
||||||
def preprocess_variant(crop_bgr, mode):
|
def preprocess_variant(crop_bgr, mode):
|
||||||
gray = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2GRAY)
|
gray = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2GRAY)
|
||||||
|
|
||||||
if mode == "raw":
|
if mode == "raw":
|
||||||
return gray
|
return gray
|
||||||
|
|
||||||
if mode == "clahe":
|
if mode == "clahe":
|
||||||
return cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)).apply(gray)
|
return cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)).apply(gray)
|
||||||
|
|
||||||
if mode == "adaptive":
|
if mode == "adaptive":
|
||||||
den = cv2.GaussianBlur(gray, (3, 3), 0)
|
den = cv2.GaussianBlur(gray, (3, 3), 0)
|
||||||
return cv2.adaptiveThreshold(
|
return cv2.adaptiveThreshold(den, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 35, 11)
|
||||||
den, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
|
|
||||||
cv2.THRESH_BINARY, 35, 11
|
|
||||||
)
|
|
||||||
|
|
||||||
if mode == "otsu":
|
if mode == "otsu":
|
||||||
den = cv2.GaussianBlur(gray, (3, 3), 0)
|
den = cv2.GaussianBlur(gray, (3, 3), 0)
|
||||||
_, th = cv2.threshold(den, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
|
_, th = cv2.threshold(den, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
|
||||||
return th
|
return th
|
||||||
|
|
||||||
if mode == "invert":
|
if mode == "invert":
|
||||||
return 255 - gray
|
return 255 - gray
|
||||||
|
if mode == "bilateral":
|
||||||
|
den = cv2.bilateralFilter(gray, 7, 60, 60)
|
||||||
|
_, th = cv2.threshold(den, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
|
||||||
|
return th
|
||||||
|
if mode == "morph_open":
|
||||||
|
_, th = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
|
||||||
|
k = np.ones((2, 2), np.uint8)
|
||||||
|
return cv2.morphologyEx(th, cv2.MORPH_OPEN, k)
|
||||||
|
|
||||||
return gray
|
return gray
|
||||||
|
|
||||||
@@ -389,22 +295,18 @@ def rotate_image_keep_bounds(img, angle_deg):
|
|||||||
|
|
||||||
new_w = int((h * sin) + (w * cos))
|
new_w = int((h * sin) + (w * cos))
|
||||||
new_h = int((h * cos) + (w * sin))
|
new_h = int((h * cos) + (w * sin))
|
||||||
|
|
||||||
M[0, 2] += (new_w / 2) - c[0]
|
M[0, 2] += (new_w / 2) - c[0]
|
||||||
M[1, 2] += (new_h / 2) - c[1]
|
M[1, 2] += (new_h / 2) - c[1]
|
||||||
|
|
||||||
return cv2.warpAffine(img, M, (new_w, new_h), flags=cv2.INTER_CUBIC, borderValue=255)
|
return cv2.warpAffine(img, M, (new_w, new_h), flags=cv2.INTER_CUBIC, borderValue=255)
|
||||||
|
|
||||||
|
|
||||||
def rebuild_text_from_ocr_result(res):
|
def rebuild_text_from_vision_result(res):
|
||||||
if not res:
|
if not res:
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
norm = []
|
norm = []
|
||||||
for item in res:
|
for bbox, txt, conf in res:
|
||||||
if len(item) != 3:
|
|
||||||
continue
|
|
||||||
bbox, txt, conf = item
|
|
||||||
if not txt or not txt.strip():
|
if not txt or not txt.strip():
|
||||||
continue
|
continue
|
||||||
b = quad_bbox(bbox)
|
b = quad_bbox(bbox)
|
||||||
@@ -419,7 +321,7 @@ def rebuild_text_from_ocr_result(res):
|
|||||||
med_h = float(np.median([x[5] for x in norm]))
|
med_h = float(np.median([x[5] for x in norm]))
|
||||||
row_tol = max(6.0, med_h * 0.75)
|
row_tol = max(6.0, med_h * 0.75)
|
||||||
|
|
||||||
norm.sort(key=lambda z: z[4]) # y
|
norm.sort(key=lambda z: z[4])
|
||||||
rows = []
|
rows = []
|
||||||
for it in norm:
|
for it in norm:
|
||||||
placed = False
|
placed = False
|
||||||
@@ -435,7 +337,7 @@ def rebuild_text_from_ocr_result(res):
|
|||||||
rows.sort(key=lambda r: r["yc"])
|
rows.sort(key=lambda r: r["yc"])
|
||||||
lines = []
|
lines = []
|
||||||
for r in rows:
|
for r in rows:
|
||||||
mem = sorted(r["m"], key=lambda z: z[3]) # x
|
mem = sorted(r["m"], key=lambda z: z[3])
|
||||||
line = normalize_text(" ".join(x[1] for x in mem))
|
line = normalize_text(" ".join(x[1] for x in mem))
|
||||||
if line:
|
if line:
|
||||||
lines.append(line)
|
lines.append(line)
|
||||||
@@ -443,57 +345,51 @@ def rebuild_text_from_ocr_result(res):
|
|||||||
return normalize_text(" ".join(lines))
|
return normalize_text(" ".join(lines))
|
||||||
|
|
||||||
|
|
||||||
def reread_crop_robust(image, bbox, hybrid_ocr: HybridOCR, upscale=3.0, pad=24):
|
def reread_bubble_with_vision(
|
||||||
ih, iw = image.shape[:2]
|
image_bgr,
|
||||||
x1, y1, x2, y2 = bbox
|
bbox_xyxy,
|
||||||
x1 = max(0, int(x1 - pad))
|
vision_detector: MacVisionDetector,
|
||||||
y1 = max(0, int(y1 - pad))
|
upscale=3.0,
|
||||||
x2 = min(iw, int(x2 + pad))
|
pad=24
|
||||||
y2 = min(ih, int(y2 + pad))
|
):
|
||||||
crop = image[y1:y2, x1:x2]
|
ih, iw = image_bgr.shape[:2]
|
||||||
|
x1, y1, x2, y2 = bbox_xyxy
|
||||||
|
x1 = max(0, int(x1 - pad)); y1 = max(0, int(y1 - pad))
|
||||||
|
x2 = min(iw, int(x2 + pad)); y2 = min(ih, int(y2 + pad))
|
||||||
|
|
||||||
|
crop = image_bgr[y1:y2, x1:x2]
|
||||||
if crop.size == 0:
|
if crop.size == 0:
|
||||||
return None, 0.0
|
return None, 0.0, "none"
|
||||||
|
|
||||||
up = cv2.resize(
|
modes = ["raw", "clahe", "adaptive", "otsu", "invert", "bilateral", "morph_open"]
|
||||||
crop,
|
|
||||||
(int(crop.shape[1] * upscale), int(crop.shape[0] * upscale)),
|
|
||||||
interpolation=cv2.INTER_CUBIC
|
|
||||||
)
|
|
||||||
|
|
||||||
modes = ["raw", "clahe", "adaptive", "otsu", "invert"]
|
|
||||||
angles = [0.0, 1.5, -1.5]
|
angles = [0.0, 1.5, -1.5]
|
||||||
|
|
||||||
best_text, best_score = "", 0.0
|
best_v_txt, best_v_sc = "", 0.0
|
||||||
|
up0 = cv2.resize(crop, (int(crop.shape[1]*upscale), int(crop.shape[0]*upscale)), interpolation=cv2.INTER_CUBIC)
|
||||||
|
|
||||||
for mode in modes:
|
for mode in modes:
|
||||||
proc = preprocess_variant(up, mode)
|
proc = preprocess_variant(up0, mode)
|
||||||
|
proc3 = cv2.cvtColor(proc, cv2.COLOR_GRAY2BGR) if len(proc.shape) == 2 else proc
|
||||||
if len(proc.shape) == 2:
|
|
||||||
proc3 = cv2.cvtColor(proc, cv2.COLOR_GRAY2BGR)
|
|
||||||
else:
|
|
||||||
proc3 = proc
|
|
||||||
|
|
||||||
for a in angles:
|
for a in angles:
|
||||||
rot = rotate_image_keep_bounds(proc3, a)
|
rot = rotate_image_keep_bounds(proc3, a)
|
||||||
res = hybrid_ocr.read_array_with_both(rot)
|
res = vision_detector.read(rot)
|
||||||
txt = rebuild_text_from_ocr_result(res)
|
txt = rebuild_text_from_vision_result(res)
|
||||||
sc = ocr_candidate_score(txt)
|
sc = ocr_candidate_score(txt)
|
||||||
|
if sc > best_v_sc:
|
||||||
|
best_v_txt, best_v_sc = txt, sc
|
||||||
|
|
||||||
if sc > best_score:
|
if best_v_txt:
|
||||||
best_text, best_score = txt, sc
|
return best_v_txt, best_v_sc, "vision-reread"
|
||||||
|
|
||||||
if not best_text:
|
return None, 0.0, "none"
|
||||||
return None, 0.0
|
|
||||||
return best_text, best_score
|
|
||||||
|
|
||||||
|
|
||||||
# ============================================================
|
# ============================================================
|
||||||
# LINE REBUILD + YELLOW BOXES
|
# LINES + BUBBLES
|
||||||
# ============================================================
|
# ============================================================
|
||||||
def build_lines_from_indices(indices, ocr):
|
def build_lines_from_indices(indices, ocr):
|
||||||
if not indices:
|
if not indices:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
items = []
|
items = []
|
||||||
for i in indices:
|
for i in indices:
|
||||||
b = quad_bbox(ocr[i][0])
|
b = quad_bbox(ocr[i][0])
|
||||||
@@ -526,7 +422,6 @@ def build_lines_from_indices(indices, ocr):
|
|||||||
txt = normalize_text(" ".join(ocr[i][1] for i, _, _, _ in mem))
|
txt = normalize_text(" ".join(ocr[i][1] for i, _, _, _ in mem))
|
||||||
if txt and not is_noise_text(txt):
|
if txt and not is_noise_text(txt):
|
||||||
lines.append(txt)
|
lines.append(txt)
|
||||||
|
|
||||||
return lines
|
return lines
|
||||||
|
|
||||||
|
|
||||||
@@ -540,16 +435,10 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None):
|
|||||||
txt = normalize_text(ocr[i][1])
|
txt = normalize_text(ocr[i][1])
|
||||||
if is_noise_text(txt):
|
if is_noise_text(txt):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
xc = (b[0] + b[2]) / 2.0
|
xc = (b[0] + b[2]) / 2.0
|
||||||
yc = (b[1] + b[3]) / 2.0
|
yc = (b[1] + b[3]) / 2.0
|
||||||
w = max(1.0, b[2] - b[0])
|
|
||||||
h = max(1.0, b[3] - b[1])
|
h = max(1.0, b[3] - b[1])
|
||||||
|
items.append({"i": i, "b": b, "txt": txt, "xc": xc, "yc": yc, "h": h})
|
||||||
items.append({
|
|
||||||
"i": i, "b": b, "txt": txt,
|
|
||||||
"xc": xc, "yc": yc, "w": w, "h": h
|
|
||||||
})
|
|
||||||
|
|
||||||
if not items:
|
if not items:
|
||||||
return []
|
return []
|
||||||
@@ -559,16 +448,8 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None):
|
|||||||
gap_x_tol = max(8.0, med_h * 1.25)
|
gap_x_tol = max(8.0, med_h * 1.25)
|
||||||
pad = max(3, int(round(med_h * 0.22)))
|
pad = max(3, int(round(med_h * 0.22)))
|
||||||
|
|
||||||
def is_punct_like(t):
|
|
||||||
raw = (t or "").strip()
|
|
||||||
if raw == "":
|
|
||||||
return True
|
|
||||||
punct_ratio = sum(1 for c in raw if not c.isalnum()) / max(1, len(raw))
|
|
||||||
return punct_ratio >= 0.5 or len(raw) <= 2
|
|
||||||
|
|
||||||
items_sorted = sorted(items, key=lambda x: x["yc"])
|
|
||||||
rows = []
|
rows = []
|
||||||
for it in items_sorted:
|
for it in sorted(items, key=lambda x: x["yc"]):
|
||||||
placed = False
|
placed = False
|
||||||
for r in rows:
|
for r in rows:
|
||||||
if abs(it["yc"] - r["yc"]) <= row_tol:
|
if abs(it["yc"] - r["yc"]) <= row_tol:
|
||||||
@@ -584,16 +465,12 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None):
|
|||||||
|
|
||||||
for r in rows:
|
for r in rows:
|
||||||
mem = sorted(r["m"], key=lambda z: z["xc"])
|
mem = sorted(r["m"], key=lambda z: z["xc"])
|
||||||
normal = [t for t in mem if not is_punct_like(t["txt"])]
|
if not mem:
|
||||||
punct = [t for t in mem if is_punct_like(t["txt"])]
|
continue
|
||||||
|
|
||||||
if not normal:
|
|
||||||
normal = mem
|
|
||||||
punct = []
|
|
||||||
|
|
||||||
chunks = []
|
chunks = []
|
||||||
cur = [normal[0]]
|
cur = [mem[0]]
|
||||||
for t in normal[1:]:
|
for t in mem[1:]:
|
||||||
prev = cur[-1]["b"]
|
prev = cur[-1]["b"]
|
||||||
b = t["b"]
|
b = t["b"]
|
||||||
gap = b[0] - prev[2]
|
gap = b[0] - prev[2]
|
||||||
@@ -604,106 +481,26 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None):
|
|||||||
cur = [t]
|
cur = [t]
|
||||||
chunks.append(cur)
|
chunks.append(cur)
|
||||||
|
|
||||||
for p in punct:
|
|
||||||
pb = p["b"]
|
|
||||||
pxc, pyc = p["xc"], p["yc"]
|
|
||||||
best_k = -1
|
|
||||||
best_score = 1e18
|
|
||||||
|
|
||||||
for k, ch in enumerate(chunks):
|
|
||||||
ub = boxes_union_xyxy([x["b"] for x in ch])
|
|
||||||
cx = (ub[0] + ub[2]) / 2.0
|
|
||||||
cy = (ub[1] + ub[3]) / 2.0
|
|
||||||
dx = abs(pxc - cx)
|
|
||||||
dy = abs(pyc - cy)
|
|
||||||
score = dx + 1.8 * dy
|
|
||||||
|
|
||||||
near = overlap_or_near(pb, ub, gap=int(med_h * 1.25))
|
|
||||||
if near:
|
|
||||||
score -= med_h * 2.0
|
|
||||||
|
|
||||||
if score < best_score:
|
|
||||||
best_score = score
|
|
||||||
best_k = k
|
|
||||||
|
|
||||||
if best_k >= 0:
|
|
||||||
chunks[best_k].append(p)
|
|
||||||
else:
|
|
||||||
chunks.append([p])
|
|
||||||
|
|
||||||
for ch in chunks:
|
for ch in chunks:
|
||||||
ub = boxes_union_xyxy([x["b"] for x in ch])
|
ub = boxes_union_xyxy([x["b"] for x in ch])
|
||||||
if ub:
|
if ub:
|
||||||
x1, y1, x2, y2 = ub
|
x1, y1, x2, y2 = ub
|
||||||
pad_x = pad
|
out_boxes.append((x1 - pad, y1 - int(round(pad*1.35)), x2 + pad, y2 + int(round(pad*0.95))))
|
||||||
pad_top = int(round(pad * 1.35))
|
|
||||||
pad_bot = int(round(pad * 0.95))
|
|
||||||
out_boxes.append((x1 - pad_x, y1 - pad_top, x2 + pad_x, y2 + pad_bot))
|
|
||||||
|
|
||||||
token_boxes = [it["b"] for it in items]
|
|
||||||
|
|
||||||
def inside(tb, lb):
|
|
||||||
return tb[0] >= lb[0] and tb[1] >= lb[1] and tb[2] <= lb[2] and tb[3] <= lb[3]
|
|
||||||
|
|
||||||
for tb in token_boxes:
|
|
||||||
if not any(inside(tb, lb) for lb in out_boxes):
|
|
||||||
x1, y1, x2, y2 = tb
|
|
||||||
pad_x = pad
|
|
||||||
pad_top = int(round(pad * 1.35))
|
|
||||||
pad_bot = int(round(pad * 0.95))
|
|
||||||
out_boxes.append((x1 - pad_x, y1 - pad_top, x2 + pad_x, y2 + pad_bot))
|
|
||||||
|
|
||||||
merged = []
|
|
||||||
for b in out_boxes:
|
|
||||||
merged_into = False
|
|
||||||
for i, m in enumerate(merged):
|
|
||||||
ix1 = max(b[0], m[0]); iy1 = max(b[1], m[1])
|
|
||||||
ix2 = min(b[2], m[2]); iy2 = min(b[3], m[3])
|
|
||||||
inter = max(0, ix2 - ix1) * max(0, iy2 - iy1)
|
|
||||||
a1 = max(1, (b[2] - b[0]) * (b[3] - b[1]))
|
|
||||||
a2 = max(1, (m[2] - m[0]) * (m[3] - m[1]))
|
|
||||||
iou = inter / float(a1 + a2 - inter) if (a1 + a2 - inter) > 0 else 0.0
|
|
||||||
if iou > 0.72:
|
|
||||||
merged[i] = boxes_union_xyxy([b, m])
|
|
||||||
merged_into = True
|
|
||||||
break
|
|
||||||
if not merged_into:
|
|
||||||
merged.append(b)
|
|
||||||
|
|
||||||
safe = []
|
|
||||||
for (x1, y1, x2, y2) in merged:
|
|
||||||
w = x2 - x1
|
|
||||||
h = y2 - y1
|
|
||||||
if w < 28:
|
|
||||||
d = (28 - w) // 2 + 2
|
|
||||||
x1 -= d; x2 += d
|
|
||||||
if h < 18:
|
|
||||||
d = (18 - h) // 2 + 2
|
|
||||||
y1 -= d; y2 += d
|
|
||||||
safe.append((x1, y1, x2, y2))
|
|
||||||
merged = safe
|
|
||||||
|
|
||||||
if image_shape is not None:
|
if image_shape is not None:
|
||||||
ih, iw = image_shape[:2]
|
ih, iw = image_shape[:2]
|
||||||
clamped = []
|
clamped = []
|
||||||
for b in merged:
|
for b in out_boxes:
|
||||||
x1 = max(0, int(b[0]))
|
x1 = max(0, int(b[0])); y1 = max(0, int(b[1]))
|
||||||
y1 = max(0, int(b[1]))
|
x2 = min(iw - 1, int(b[2])); y2 = min(ih - 1, int(b[3]))
|
||||||
x2 = min(iw - 1, int(b[2]))
|
|
||||||
y2 = min(ih - 1, int(b[3]))
|
|
||||||
if x2 > x1 and y2 > y1:
|
if x2 > x1 and y2 > y1:
|
||||||
clamped.append((x1, y1, x2, y2))
|
clamped.append((x1, y1, x2, y2))
|
||||||
merged = clamped
|
out_boxes = clamped
|
||||||
else:
|
|
||||||
merged = [(int(b[0]), int(b[1]), int(b[2]), int(b[3])) for b in merged]
|
|
||||||
|
|
||||||
merged.sort(key=lambda z: (z[1], z[0]))
|
out_boxes.sort(key=lambda z: (z[1], z[0]))
|
||||||
return merged
|
return out_boxes
|
||||||
|
|
||||||
|
|
||||||
# ============================================================
|
|
||||||
# GROUPING
|
|
||||||
# ============================================================
|
|
||||||
def auto_gap(image_path, base=18, ref_w=750):
|
def auto_gap(image_path, base=18, ref_w=750):
|
||||||
img = cv2.imread(image_path)
|
img = cv2.imread(image_path)
|
||||||
if img is None:
|
if img is None:
|
||||||
@@ -750,21 +547,14 @@ def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=3):
|
|||||||
|
|
||||||
sorted_groups = sorted(
|
sorted_groups = sorted(
|
||||||
groups.values(),
|
groups.values(),
|
||||||
key=lambda idxs: (
|
key=lambda idxs: (min(boxes[i][1] for i in idxs), min(boxes[i][0] for i in idxs))
|
||||||
min(boxes[i][1] for i in idxs),
|
|
||||||
min(boxes[i][0] for i in idxs)
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
bubbles = {}
|
bubbles, bubble_boxes, bubble_quads, bubble_indices = {}, {}, {}, {}
|
||||||
bubble_boxes = {}
|
|
||||||
bubble_quads = {}
|
|
||||||
bubble_indices = {}
|
|
||||||
|
|
||||||
ih, iw = image_shape[:2]
|
ih, iw = image_shape[:2]
|
||||||
|
|
||||||
for bid, idxs in enumerate(sorted_groups, start=1):
|
for bid, idxs in enumerate(sorted_groups, start=1):
|
||||||
idxs = sorted(idxs, key=lambda k: boxes[k][1])
|
idxs = sorted(idxs, key=lambda k: boxes[k][1])
|
||||||
|
|
||||||
lines = build_lines_from_indices(idxs, ocr)
|
lines = build_lines_from_indices(idxs, ocr)
|
||||||
quads = [ocr[k][0] for k in idxs]
|
quads = [ocr[k][0] for k in idxs]
|
||||||
ub = boxes_union_xyxy([quad_bbox(q) for q in quads])
|
ub = boxes_union_xyxy([quad_bbox(q) for q in quads])
|
||||||
@@ -772,10 +562,8 @@ def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=3):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
x1, y1, x2, y2 = ub
|
x1, y1, x2, y2 = ub
|
||||||
x1 = max(0, x1 - bbox_padding)
|
x1 = max(0, x1 - bbox_padding); y1 = max(0, y1 - bbox_padding)
|
||||||
y1 = max(0, y1 - bbox_padding)
|
x2 = min(iw - 1, x2 + bbox_padding); y2 = min(ih - 1, y2 + bbox_padding)
|
||||||
x2 = min(iw - 1, x2 + bbox_padding)
|
|
||||||
y2 = min(ih - 1, y2 + bbox_padding)
|
|
||||||
|
|
||||||
bubbles[bid] = lines
|
bubbles[bid] = lines
|
||||||
bubble_boxes[bid] = (x1, y1, x2, y2)
|
bubble_boxes[bid] = (x1, y1, x2, y2)
|
||||||
@@ -786,37 +574,63 @@ def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=3):
|
|||||||
|
|
||||||
|
|
||||||
# ============================================================
|
# ============================================================
|
||||||
# DEBUG
|
# DEBUG / EXPORT
|
||||||
# ============================================================
|
# ============================================================
|
||||||
def save_debug_clusters(image_path, ocr, bubble_boxes, bubble_indices, out_path="debug_clusters.png"):
|
def save_debug_clusters(
|
||||||
|
image_path,
|
||||||
|
ocr,
|
||||||
|
bubble_boxes,
|
||||||
|
bubble_indices,
|
||||||
|
clean_lines=None,
|
||||||
|
out_path="debug_clusters.png"
|
||||||
|
):
|
||||||
img = cv2.imread(image_path)
|
img = cv2.imread(image_path)
|
||||||
if img is None:
|
if img is None:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# ── FIX 1: white-fill each OCR quad before drawing its outline ──
|
||||||
for bbox, txt, conf in ocr:
|
for bbox, txt, conf in ocr:
|
||||||
pts = np.array(bbox, dtype=np.int32)
|
pts = np.array(bbox, dtype=np.int32)
|
||||||
cv2.polylines(img, [pts], True, (180, 180, 180), 1)
|
cv2.fillPoly(img, [pts], (255, 255, 255)) # ← white background
|
||||||
|
cv2.polylines(img, [pts], True, (180, 180, 180), 1) # ← grey outline
|
||||||
|
|
||||||
for bid, bb in bubble_boxes.items():
|
for bid, bb in bubble_boxes.items():
|
||||||
x1, y1, x2, y2 = bb
|
x1, y1, x2, y2 = bb
|
||||||
cv2.rectangle(img, (x1, y1), (x2, y2), (0, 220, 0), 2)
|
|
||||||
cv2.putText(
|
|
||||||
img, f"BOX#{bid}", (x1 + 2, max(15, y1 + 16)),
|
|
||||||
cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 220, 0), 2
|
|
||||||
)
|
|
||||||
|
|
||||||
idxs = bubble_indices.get(bid, [])
|
# Draw green bubble bounding box + ID label
|
||||||
line_boxes = build_line_boxes_from_indices(idxs, ocr, image_shape=img.shape)
|
cv2.rectangle(img, (x1, y1), (x2, y2), (0, 220, 0), 2)
|
||||||
for lb in line_boxes:
|
cv2.putText(img, f"BOX#{bid}", (x1 + 2, max(15, y1 + 16)),
|
||||||
lx1, ly1, lx2, ly2 = lb
|
cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 220, 0), 2)
|
||||||
cv2.rectangle(img, (lx1, ly1), (lx2, ly2), (0, 255, 255), 3)
|
|
||||||
|
# ── FIX 2: yellow line-box drawing loop removed entirely ────
|
||||||
|
|
||||||
|
# Draw translated text overlay below each bubble box
|
||||||
|
if clean_lines and bid in clean_lines:
|
||||||
|
text = clean_lines[bid]
|
||||||
|
words = text.split()
|
||||||
|
lines = []
|
||||||
|
current_line = ""
|
||||||
|
|
||||||
|
for word in words:
|
||||||
|
if len(current_line) + len(word) < 25:
|
||||||
|
current_line += word + " "
|
||||||
|
else:
|
||||||
|
lines.append(current_line.strip())
|
||||||
|
current_line = word + " "
|
||||||
|
if current_line:
|
||||||
|
lines.append(current_line.strip())
|
||||||
|
|
||||||
|
y_text = y2 + 18
|
||||||
|
for line in lines:
|
||||||
|
cv2.putText(img, line, (x1, y_text),
|
||||||
|
cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 3)
|
||||||
|
cv2.putText(img, line, (x1, y_text),
|
||||||
|
cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1)
|
||||||
|
y_text += 18
|
||||||
|
|
||||||
cv2.imwrite(out_path, img)
|
cv2.imwrite(out_path, img)
|
||||||
|
|
||||||
|
|
||||||
# ============================================================
|
|
||||||
# EXPORT
|
|
||||||
# ============================================================
|
|
||||||
def estimate_reading_order(bbox_dict, mode="ltr"):
|
def estimate_reading_order(bbox_dict, mode="ltr"):
|
||||||
items = []
|
items = []
|
||||||
for bid, (x1, y1, x2, y2) in bbox_dict.items():
|
for bid, (x1, y1, x2, y2) in bbox_dict.items():
|
||||||
@@ -826,8 +640,7 @@ def estimate_reading_order(bbox_dict, mode="ltr"):
|
|||||||
|
|
||||||
items.sort(key=lambda t: t[2])
|
items.sort(key=lambda t: t[2])
|
||||||
|
|
||||||
rows = []
|
rows, tol = [], 90
|
||||||
tol = 90
|
|
||||||
for it in items:
|
for it in items:
|
||||||
placed = False
|
placed = False
|
||||||
for r in rows:
|
for r in rows:
|
||||||
@@ -850,7 +663,6 @@ def estimate_reading_order(bbox_dict, mode="ltr"):
|
|||||||
|
|
||||||
def export_bubbles(filepath, bbox_dict, quads_dict, indices_dict, ocr, reading_map, image_shape):
|
def export_bubbles(filepath, bbox_dict, quads_dict, indices_dict, ocr, reading_map, image_shape):
|
||||||
out = {}
|
out = {}
|
||||||
|
|
||||||
for bid, bb in bbox_dict.items():
|
for bid, bb in bbox_dict.items():
|
||||||
x1, y1, x2, y2 = bb
|
x1, y1, x2, y2 = bb
|
||||||
quads = quads_dict.get(bid, [])
|
quads = quads_dict.get(bid, [])
|
||||||
@@ -870,9 +682,7 @@ def export_bubbles(filepath, bbox_dict, quads_dict, indices_dict, ocr, reading_m
|
|||||||
{"x": int(b[0]), "y": int(b[1]), "w": int(b[2] - b[0]), "h": int(b[3] - b[1])}
|
{"x": int(b[0]), "y": int(b[1]), "w": int(b[2] - b[0]), "h": int(b[3] - b[1])}
|
||||||
for b in qboxes
|
for b in qboxes
|
||||||
],
|
],
|
||||||
"quads": [
|
"quads": [[[int(p[0]), int(p[1])] for p in q] for q in quads],
|
||||||
[[int(p[0]), int(p[1])] for p in q] for q in quads
|
|
||||||
],
|
|
||||||
"text_bbox": xyxy_to_xywh(text_union),
|
"text_bbox": xyxy_to_xywh(text_union),
|
||||||
"line_bboxes": [xyxy_to_xywh(lb) for lb in line_boxes_xyxy],
|
"line_bboxes": [xyxy_to_xywh(lb) for lb in line_boxes_xyxy],
|
||||||
"line_union_bbox": xyxy_to_xywh(line_union_xyxy) if line_union_xyxy else None,
|
"line_union_bbox": xyxy_to_xywh(line_union_xyxy) if line_union_xyxy else None,
|
||||||
@@ -884,10 +694,10 @@ def export_bubbles(filepath, bbox_dict, quads_dict, indices_dict, ocr, reading_m
|
|||||||
|
|
||||||
|
|
||||||
# ============================================================
|
# ============================================================
|
||||||
# MAIN PIPELINE
|
# PIPELINE
|
||||||
# ============================================================
|
# ============================================================
|
||||||
def translate_manga_text(
|
def translate_manga_text(
|
||||||
image_path,
|
image_path="001-page.png",
|
||||||
source_lang="en",
|
source_lang="en",
|
||||||
target_lang="ca",
|
target_lang="ca",
|
||||||
confidence_threshold=0.12,
|
confidence_threshold=0.12,
|
||||||
@@ -898,8 +708,7 @@ def translate_manga_text(
|
|||||||
export_to_file="output.txt",
|
export_to_file="output.txt",
|
||||||
export_bubbles_to="bubbles.json",
|
export_bubbles_to="bubbles.json",
|
||||||
reading_mode="ltr",
|
reading_mode="ltr",
|
||||||
debug=True,
|
debug=True
|
||||||
use_gpu=False
|
|
||||||
):
|
):
|
||||||
image = cv2.imread(image_path)
|
image = cv2.imread(image_path)
|
||||||
if image is None:
|
if image is None:
|
||||||
@@ -908,12 +717,12 @@ def translate_manga_text(
|
|||||||
|
|
||||||
resolved_gap = auto_gap(image_path) if gap_px == "auto" else float(gap_px)
|
resolved_gap = auto_gap(image_path) if gap_px == "auto" else float(gap_px)
|
||||||
|
|
||||||
print("Loading Hybrid OCR (Paddle + EasyOCR)...")
|
print("Loading OCR engines...")
|
||||||
hybrid = HybridOCR(source_lang=source_lang, use_gpu=use_gpu)
|
detector = MacVisionDetector(source_lang=source_lang)
|
||||||
|
|
||||||
print("Running OCR...")
|
print("Running detection OCR (Apple Vision)...")
|
||||||
raw = hybrid.read_full_image(image_path)
|
raw = detector.read(image_path)
|
||||||
print(f"Raw detections (merged): {len(raw)}")
|
print(f"Raw detections: {len(raw)}")
|
||||||
|
|
||||||
filtered = []
|
filtered = []
|
||||||
skipped = 0
|
skipped = 0
|
||||||
@@ -924,25 +733,18 @@ def translate_manga_text(
|
|||||||
qb = quad_bbox(bbox)
|
qb = quad_bbox(bbox)
|
||||||
|
|
||||||
if conf < confidence_threshold:
|
if conf < confidence_threshold:
|
||||||
skipped += 1
|
skipped += 1; continue
|
||||||
continue
|
|
||||||
if len(t) < min_text_length:
|
if len(t) < min_text_length:
|
||||||
skipped += 1
|
skipped += 1; continue
|
||||||
continue
|
|
||||||
if is_noise_text(t):
|
if is_noise_text(t):
|
||||||
skipped += 1
|
skipped += 1; continue
|
||||||
continue
|
|
||||||
if filter_sound_effects and is_sound_effect(t):
|
if filter_sound_effects and is_sound_effect(t):
|
||||||
skipped += 1
|
skipped += 1; continue
|
||||||
continue
|
|
||||||
if is_title_text(t):
|
if is_title_text(t):
|
||||||
skipped += 1
|
skipped += 1; continue
|
||||||
continue
|
|
||||||
|
|
||||||
if qb[1] < int(ih * TOP_BAND_RATIO):
|
if qb[1] < int(ih * TOP_BAND_RATIO):
|
||||||
if conf < 0.70 and len(t) >= 5:
|
if conf < 0.70 and len(t) >= 5:
|
||||||
skipped += 1
|
skipped += 1; continue
|
||||||
continue
|
|
||||||
|
|
||||||
filtered.append((bbox, t, conf))
|
filtered.append((bbox, t, conf))
|
||||||
|
|
||||||
@@ -955,75 +757,80 @@ def translate_manga_text(
|
|||||||
filtered, image.shape, gap_px=resolved_gap, bbox_padding=3
|
filtered, image.shape, gap_px=resolved_gap, bbox_padding=3
|
||||||
)
|
)
|
||||||
|
|
||||||
if debug:
|
|
||||||
save_debug_clusters(
|
|
||||||
image_path=image_path,
|
|
||||||
ocr=filtered,
|
|
||||||
bubble_boxes=bubble_boxes,
|
|
||||||
bubble_indices=bubble_indices,
|
|
||||||
out_path="debug_clusters.png"
|
|
||||||
)
|
|
||||||
|
|
||||||
translator = GoogleTranslator(source=source_lang, target=target_lang)
|
translator = GoogleTranslator(source=source_lang, target=target_lang)
|
||||||
|
|
||||||
clean_lines = {}
|
clean_lines: Dict[int, str] = {}
|
||||||
|
sources_used: Dict[int, str] = {}
|
||||||
|
|
||||||
for bid, lines in bubbles.items():
|
for bid, lines in bubbles.items():
|
||||||
base_txt = normalize_text(" ".join(lines))
|
base_txt = normalize_text(" ".join(lines))
|
||||||
base_sc = ocr_candidate_score(base_txt)
|
base_sc = ocr_candidate_score(base_txt)
|
||||||
|
|
||||||
|
txt = base_txt
|
||||||
|
src_used = "vision-base"
|
||||||
|
|
||||||
if base_sc < quality_threshold:
|
if base_sc < quality_threshold:
|
||||||
rr_txt, rr_sc = reread_crop_robust(
|
rr_txt, rr_sc, rr_src = reread_bubble_with_vision(
|
||||||
image,
|
image_bgr=image,
|
||||||
bubble_boxes[bid],
|
bbox_xyxy=bubble_boxes[bid],
|
||||||
hybrid,
|
vision_detector=detector,
|
||||||
upscale=3.0,
|
upscale=3.0,
|
||||||
pad=24
|
pad=24
|
||||||
)
|
)
|
||||||
if rr_txt and rr_sc > base_sc + 0.06:
|
if rr_txt and rr_sc > base_sc + 0.04:
|
||||||
txt = rr_txt
|
txt = rr_txt
|
||||||
else:
|
src_used = rr_src
|
||||||
txt = base_txt
|
|
||||||
else:
|
|
||||||
txt = base_txt
|
|
||||||
|
|
||||||
txt = txt.replace(" BOMPORTA", " IMPORTA")
|
txt = txt.replace(" BOMPORTA", " IMPORTA")
|
||||||
txt = txt.replace(" TESTO ", " ESTO ")
|
txt = txt.replace(" TESTO ", " ESTO ")
|
||||||
txt = txt.replace(" MIVERDAD", " MI VERDAD")
|
txt = txt.replace(" MIVERDAD", " MI VERDAD")
|
||||||
|
|
||||||
clean_lines[bid] = apply_glossary(normalize_text(txt))
|
clean_lines[bid] = apply_glossary(normalize_text(txt))
|
||||||
|
sources_used[bid] = src_used
|
||||||
|
|
||||||
reading_map = estimate_reading_order(bubble_boxes, mode=reading_mode)
|
reading_map = estimate_reading_order(bubble_boxes, mode=reading_mode)
|
||||||
|
|
||||||
|
if debug:
|
||||||
|
save_debug_clusters(
|
||||||
|
image_path=image_path,
|
||||||
|
ocr=filtered,
|
||||||
|
bubble_boxes=bubble_boxes,
|
||||||
|
bubble_indices=bubble_indices,
|
||||||
|
clean_lines=clean_lines,
|
||||||
|
out_path="debug_clusters.png"
|
||||||
|
)
|
||||||
|
|
||||||
divider = "─" * 120
|
divider = "─" * 120
|
||||||
out_lines = ["BUBBLE|ORDER|ORIGINAL|TRANSLATED|FLAGS", divider]
|
out_lines = ["BUBBLE|ORDER|OCR_SOURCE|ORIGINAL|TRANSLATED|FLAGS", divider]
|
||||||
|
|
||||||
print(divider)
|
print(divider)
|
||||||
print(f"{'BUBBLE':<8} {'ORDER':<6} {'ORIGINAL':<50} {'TRANSLATED':<50} FLAGS")
|
print(f"{'BUBBLE':<8} {'ORDER':<6} {'SOURCE':<12} {'ORIGINAL':<40} {'TRANSLATED':<40} FLAGS")
|
||||||
print(divider)
|
print(divider)
|
||||||
|
|
||||||
translated_count = 0
|
translated_count = 0
|
||||||
for bid in sorted(clean_lines.keys(), key=lambda x: reading_map.get(x, x)):
|
for bid in sorted(clean_lines.keys(), key=lambda x: reading_map.get(x, x)):
|
||||||
src = clean_lines[bid].strip()
|
src_txt = clean_lines[bid].strip()
|
||||||
if not src:
|
if not src_txt:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
flags = []
|
flags = []
|
||||||
try:
|
try:
|
||||||
tgt = translator.translate(src) or ""
|
tgt = translator.translate(src_txt) or ""
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
tgt = f"[Translation error: {e}]"
|
tgt = f"[Translation error: {e}]"
|
||||||
flags.append("TRANSLATION_ERROR")
|
flags.append("TRANSLATION_ERROR")
|
||||||
|
|
||||||
tgt = apply_glossary(postprocess_translation_general(tgt)).upper()
|
tgt = apply_glossary(postprocess_translation_general(tgt)).upper()
|
||||||
src_u = src.upper()
|
src_u = src_txt.upper()
|
||||||
|
src_engine = sources_used.get(bid, "unknown")
|
||||||
|
|
||||||
out_lines.append(
|
out_lines.append(
|
||||||
f"#{bid}|{reading_map.get(bid,bid)}|{src_u}|{tgt}|{','.join(flags) if flags else '-'}"
|
f"#{bid}|{reading_map.get(bid,bid)}|{src_engine}|{src_u}|{tgt}|{','.join(flags) if flags else '-'}"
|
||||||
)
|
)
|
||||||
|
|
||||||
print(
|
print(
|
||||||
f"#{bid:<7} {reading_map.get(bid,bid):<6} "
|
f"#{bid:<7} {reading_map.get(bid,bid):<6} {src_engine:<12} "
|
||||||
f"{src_u[:50]:<50} {tgt[:50]:<50} {','.join(flags) if flags else '-'}"
|
f"{src_u[:40]:<40} {tgt[:40]:<40} {','.join(flags) if flags else '-'}"
|
||||||
)
|
)
|
||||||
translated_count += 1
|
translated_count += 1
|
||||||
|
|
||||||
@@ -1050,22 +857,18 @@ def translate_manga_text(
|
|||||||
print("Saved: debug_clusters.png")
|
print("Saved: debug_clusters.png")
|
||||||
|
|
||||||
|
|
||||||
# ============================================================
|
|
||||||
# ENTRYPOINT
|
|
||||||
# ============================================================
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
translate_manga_text(
|
translate_manga_text(
|
||||||
image_path="001-page.png",
|
image_path="003.jpg",
|
||||||
source_lang="it",
|
source_lang="es",
|
||||||
target_lang="ca",
|
target_lang="ca",
|
||||||
confidence_threshold=0.12,
|
confidence_threshold=0.12,
|
||||||
min_text_length=1,
|
min_text_length=2,
|
||||||
gap_px="auto",
|
gap_px="auto",
|
||||||
filter_sound_effects=True,
|
filter_sound_effects=True,
|
||||||
quality_threshold=0.62,
|
quality_threshold=0.62,
|
||||||
export_to_file="output.txt",
|
export_to_file="output.txt",
|
||||||
export_bubbles_to="bubbles.json",
|
export_bubbles_to="bubbles.json",
|
||||||
reading_mode="ltr",
|
reading_mode="ltr",
|
||||||
debug=True,
|
debug=True
|
||||||
use_gpu=False
|
|
||||||
)
|
)
|
||||||
|
|||||||
79
requirements
Normal file
79
requirements
Normal file
@@ -0,0 +1,79 @@
|
|||||||
|
aistudio-sdk==0.3.8
|
||||||
|
annotated-doc==0.0.4
|
||||||
|
annotated-types==0.7.0
|
||||||
|
anyio==4.13.0
|
||||||
|
bce-python-sdk==0.9.70
|
||||||
|
beautifulsoup4==4.14.3
|
||||||
|
certifi==2026.2.25
|
||||||
|
chardet==7.4.3
|
||||||
|
charset-normalizer==3.4.7
|
||||||
|
click==8.3.2
|
||||||
|
colorlog==6.10.1
|
||||||
|
crc32c==2.8
|
||||||
|
deep-translator==1.11.4
|
||||||
|
easyocr==1.7.2
|
||||||
|
filelock==3.28.0
|
||||||
|
fsspec==2026.3.0
|
||||||
|
future==1.0.0
|
||||||
|
h11==0.16.0
|
||||||
|
hf-xet==1.4.3
|
||||||
|
httpcore==1.0.9
|
||||||
|
httpx==0.28.1
|
||||||
|
huggingface_hub==1.10.2
|
||||||
|
idna==3.11
|
||||||
|
ImageIO==2.37.3
|
||||||
|
imagesize==2.0.0
|
||||||
|
Jinja2==3.1.6
|
||||||
|
lazy-loader==0.5
|
||||||
|
markdown-it-py==4.0.0
|
||||||
|
MarkupSafe==3.0.3
|
||||||
|
mdurl==0.1.2
|
||||||
|
modelscope==1.35.4
|
||||||
|
mpmath==1.3.0
|
||||||
|
networkx==3.6.1
|
||||||
|
ninja==1.13.0
|
||||||
|
numpy==1.26.4
|
||||||
|
opencv-contrib-python==4.10.0.84
|
||||||
|
opencv-python==4.11.0.86
|
||||||
|
opencv-python-headless==4.11.0.86
|
||||||
|
opt-einsum==3.3.0
|
||||||
|
packaging==26.1
|
||||||
|
paddleocr==3.4.1
|
||||||
|
paddlepaddle==3.3.1
|
||||||
|
paddlex==3.4.3
|
||||||
|
pandas==3.0.2
|
||||||
|
pillow==12.2.0
|
||||||
|
prettytable==3.17.0
|
||||||
|
protobuf==7.34.1
|
||||||
|
psutil==7.2.2
|
||||||
|
py-cpuinfo==9.0.0
|
||||||
|
pyclipper==1.4.0
|
||||||
|
pycryptodome==3.23.0
|
||||||
|
pydantic==2.13.1
|
||||||
|
pydantic_core==2.46.1
|
||||||
|
Pygments==2.20.0
|
||||||
|
pypdfium2==5.7.0
|
||||||
|
python-bidi==0.6.7
|
||||||
|
python-dateutil==2.9.0.post0
|
||||||
|
PyYAML==6.0.2
|
||||||
|
requests==2.33.1
|
||||||
|
rich==15.0.0
|
||||||
|
ruamel.yaml==0.19.1
|
||||||
|
safetensors==0.7.0
|
||||||
|
scikit-image==0.26.0
|
||||||
|
scipy==1.17.1
|
||||||
|
shapely==2.1.2
|
||||||
|
shellingham==1.5.4
|
||||||
|
six==1.17.0
|
||||||
|
soupsieve==2.8.3
|
||||||
|
sympy==1.14.0
|
||||||
|
tifffile==2026.3.3
|
||||||
|
torch==2.11.0
|
||||||
|
torchvision==0.26.0
|
||||||
|
tqdm==4.67.3
|
||||||
|
typer==0.24.1
|
||||||
|
typing-inspection==0.4.2
|
||||||
|
typing_extensions==4.15.0
|
||||||
|
ujson==5.12.0
|
||||||
|
urllib3==2.6.3
|
||||||
|
wcwidth==0.6.0
|
||||||
@@ -1,19 +1,12 @@
|
|||||||
# ─────────────────────────────────────────────
|
numpy<2.0
|
||||||
# manga-translator + manga-renderer
|
opencv-python>=4.8
|
||||||
# Python >= 3.9 recommended
|
easyocr>=1.7.1
|
||||||
# ─────────────────────────────────────────────
|
deep-translator>=1.11.4
|
||||||
|
manga-ocr>=0.1.14
|
||||||
# Computer vision + image processing
|
torch
|
||||||
opencv-python>=4.8.0
|
torchvision
|
||||||
numpy>=1.24.0
|
Pillow
|
||||||
Pillow>=10.0.0
|
transformers
|
||||||
|
fugashi
|
||||||
# OCR engine (manga-translator)
|
unidic-lite
|
||||||
manga-ocr>=0.1.8
|
|
||||||
|
|
||||||
# Translation (manga-translator)
|
|
||||||
deep-translator>=1.11.0
|
|
||||||
|
|
||||||
# HTTP / file handling used internally by manga-ocr
|
|
||||||
requests>=2.31.0
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user