Added some fixes

This commit is contained in:
Guillem Hernandez Sola
2026-04-14 20:08:51 +02:00
parent 0069da706b
commit f95b7d32d4
5 changed files with 359 additions and 171 deletions

BIN
fonts/ComicNeue-Regular.ttf Executable file

Binary file not shown.

Binary file not shown.

View File

@@ -10,10 +10,8 @@ from PIL import Image, ImageDraw, ImageFont
# CONFIG # CONFIG
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
DEFAULT_FONT_CANDIDATES = [ DEFAULT_FONT_CANDIDATES = [
"fonts/AnimeAce2_reg.ttf",
"fonts/WildWordsRoman.ttf",
"fonts/ComicRelief-Regular.ttf", "fonts/ComicRelief-Regular.ttf",
"fonts/NotoSans-Regular.ttf", "fonts/ComicNeue-Regular.ttf",
] ]
DEFAULT_FONT_COLOR = (0, 0, 0) DEFAULT_FONT_COLOR = (0, 0, 0)
DEFAULT_STROKE_COLOR = (255, 255, 255) DEFAULT_STROKE_COLOR = (255, 255, 255)
@@ -501,7 +499,7 @@ def render_translations(
if __name__ == "__main__": if __name__ == "__main__":
render_translations( render_translations(
input_image="002-page.png", input_image="001-page.png",
output_image="page_translated.png", output_image="page_translated.png",
translations_file="output.txt", translations_file="output.txt",
bubbles_file="bubbles.json", bubbles_file="bubbles.json",

View File

@@ -1,3 +1,6 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re import re
import os import os
import json import json
@@ -8,7 +11,7 @@ from deep_translator import GoogleTranslator
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
# CONFIG # CONFIG
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
GLOSSARY = { GLOSSARY = {
"ANYA": "ANYA", "ANYA": "ANYA",
@@ -32,17 +35,17 @@ TITLE_PATTERNS = [
NOISE_PATTERNS = [ NOISE_PATTERNS = [
r"^[^a-zA-Z0-9\?!.]+$", r"^[^a-zA-Z0-9\?!.]+$",
r"^BOX[0-9A-Z]*$", r"^BOX[0-9A-Z#\s]*$",
] ]
TOP_BAND_RATIO = 0.08 TOP_BAND_RATIO = 0.08
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
# TEXT HELPERS # TEXT HELPERS
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
def normalize_text(text): def normalize_text(text: str) -> str:
t = text.strip().upper() t = (text or "").strip().upper()
t = t.replace("", "\"").replace("", "\"") t = t.replace("", "\"").replace("", "\"")
t = t.replace("", "'").replace("", "'") t = t.replace("", "'").replace("", "'")
t = t.replace("", "...") t = t.replace("", "...")
@@ -54,13 +57,13 @@ def normalize_text(text):
t = re.sub(r",\?", "?", t) t = re.sub(r",\?", "?", t)
return t.strip() return t.strip()
def apply_glossary(text: str) -> str:
    """Replace glossary terms found in *text* with their ``GLOSSARY`` value.

    Each key of the module-level ``GLOSSARY`` mapping is matched
    case-insensitively between word boundaries. ``None`` or an empty
    string yields ``""``.
    """
    out = text or ""
    # Longest keys are applied first (presumably so a longer entry is not
    # pre-empted by a shorter key it contains).
    for key in sorted(GLOSSARY, key=len, reverse=True):
        replacement = GLOSSARY[key]
        # A callable replacement is inserted literally; a plain string would be
        # parsed as a regex template, so a backslash or "\1" in a glossary
        # value would be misinterpreted or raise.
        out = re.sub(
            rf"\b{re.escape(key)}\b",
            lambda _match, value=replacement: value,
            out,
            flags=re.IGNORECASE,
        )
    return out
def postprocess_translation_general(text): def postprocess_translation_general(text: str) -> str:
t = normalize_text(text) t = normalize_text(text)
t = re.sub(r"\s{2,}", " ", t).strip() t = re.sub(r"\s{2,}", " ", t).strip()
t = re.sub(r"([!?]){3,}", r"\1\1", t) t = re.sub(r"([!?]){3,}", r"\1\1", t)
@@ -69,23 +72,23 @@ def postprocess_translation_general(text):
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
# FILTERS # FILTERS
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
def is_sound_effect(text: str) -> bool:
    """Return True when *text* fully matches a ``SOUND_EFFECT_PATTERNS`` entry.

    The text is stripped, lower-cased and reduced to the letters a-z before
    matching, so digits, spaces and punctuation never affect the result.
    ``None`` is treated as an empty string.
    """
    cleaned = re.sub(r"[^a-z]", "", (text or "").strip().lower())
    return any(
        re.fullmatch(pattern, cleaned, re.IGNORECASE)
        for pattern in SOUND_EFFECT_PATTERNS
    )
def is_title_text(text: str) -> bool:
    """Return True when the stripped, lower-cased *text* fully matches a
    ``TITLE_PATTERNS`` entry. ``None`` is treated as an empty string."""
    candidate = (text or "").strip().lower()
    return any(
        re.fullmatch(pattern, candidate, re.IGNORECASE)
        for pattern in TITLE_PATTERNS
    )
def is_noise_text(text: str) -> bool:
    """Return True when the stripped *text* fully matches a ``NOISE_PATTERNS``
    entry (matching is case-sensitive). ``None`` is treated as an empty string."""
    candidate = (text or "").strip()
    return any(re.fullmatch(pattern, candidate) for pattern in NOISE_PATTERNS)
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
# GEOMETRY # GEOMETRY
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
def quad_bbox(quad): def quad_bbox(quad):
xs = [p[0] for p in quad] xs = [p[0] for p in quad]
@@ -127,9 +130,9 @@ def overlap_or_near(a, b, gap=0):
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
# QUALITY # QUALITY / SCORING
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
def ocr_quality_score(text): def ocr_quality_score(text: str) -> float:
if not text or len(text) < 2: if not text or len(text) < 2:
return 0.0 return 0.0
alpha_ratio = sum(1 for c in text if c.isalpha()) / max(1, len(text)) alpha_ratio = sum(1 for c in text if c.isalpha()) / max(1, len(text))
@@ -141,21 +144,75 @@ def ocr_quality_score(text):
bonus = 0.05 if re.search(r"[.!?]$", text) else 0.0 bonus = 0.05 if re.search(r"[.!?]$", text) else 0.0
return max(0.0, min(1.0, alpha_ratio - penalty + bonus)) return max(0.0, min(1.0, alpha_ratio - penalty + bonus))
def ocr_candidate_score(text: str) -> float:
    """Score an OCR candidate string in the range [0.0, 1.0].

    The score is a weighted sum of character-class ratios over the stripped
    text (letters, whitespace, accepted punctuation) minus a weighted ratio
    of unexpected symbols and fixed penalties for suspicious patterns.
    Empty, whitespace-only or ``None`` input scores 0.0.
    """
    if not text:
        return 0.0

    t = text.strip()
    n = len(t)
    if n == 0:
        return 0.0

    # Character-class ratios over the stripped text.
    alpha = sum(c.isalpha() for c in t) / n
    spaces = sum(c.isspace() for c in t) / n
    punct_ok = sum(c in ".,!?'-:;()[]\"" for c in t) / n
    # Anything that is neither a word character, whitespace nor accepted
    # punctuation counts as a bad symbol.
    bad = len(re.findall(r"[^\w\s\.\,\!\?\-\'\:\;\(\)\[\]\"]", t)) / n

    penalty = 0.0
    if re.search(r"\b[A-Z]\b", t):      # isolated single capital letter
        penalty += 0.05
    if re.search(r"[0-9]{2,}", t):      # run of two or more digits
        penalty += 0.08
    if re.search(r"(..)\1\1", t):       # same two characters three times in a row
        penalty += 0.08

    score = (0.62 * alpha) + (0.10 * spaces) + (0.20 * punct_ok) - (0.45 * bad) - penalty
    return max(0.0, min(1.0, score))
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
# OCR RE-READ # OCR MULTI-PASS
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
def preprocess_variant(crop_bgr, mode):
    """Return a single-channel variant of *crop_bgr* for one OCR pass.

    Supported modes:
      - "raw":      plain grayscale
      - "clahe":    CLAHE contrast equalisation
      - "adaptive": Gaussian blur + adaptive Gaussian threshold
      - "otsu":     Gaussian blur + Otsu threshold
      - "invert":   inverted grayscale
    Any other mode falls back to plain grayscale.
    """
    # Accept an already-grayscale (2-D) crop instead of failing in cvtColor.
    if crop_bgr.ndim == 2:
        gray = crop_bgr
    else:
        gray = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2GRAY)

    if mode == "raw":
        return gray

    if mode == "clahe":
        return cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)).apply(gray)

    if mode == "adaptive":
        den = cv2.GaussianBlur(gray, (3, 3), 0)
        return cv2.adaptiveThreshold(
            den, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY, 35, 11
        )

    if mode == "otsu":
        den = cv2.GaussianBlur(gray, (3, 3), 0)
        _, th = cv2.threshold(den, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        return th

    if mode == "invert":
        return 255 - gray

    return gray
def rotate_image_keep_bounds(img, angle_deg):
    """Rotate *img* by *angle_deg* degrees about its centre on an enlarged canvas.

    The output canvas is sized to the rotated bounding rectangle and the
    rotation matrix is shifted so the image stays centred. Uncovered areas
    are filled with white.
    """
    h, w = img.shape[:2]
    center = (w / 2, h / 2)

    M = cv2.getRotationMatrix2D(center, angle_deg, 1.0)
    cos = abs(M[0, 0])
    sin = abs(M[0, 1])

    # Size of the axis-aligned rectangle that encloses the rotated image.
    new_w = int((h * sin) + (w * cos))
    new_h = int((h * cos) + (w * sin))

    # Re-centre the rotated content on the new canvas.
    M[0, 2] += (new_w / 2) - center[0]
    M[1, 2] += (new_h / 2) - center[1]

    # A scalar border value only fills the first channel, which turns the
    # border of a 3-channel image into (255, 0, 0) rather than white. Passing
    # one value per channel gives white for both 1- and 3-channel inputs.
    return cv2.warpAffine(
        img, M, (new_w, new_h),
        flags=cv2.INTER_CUBIC,
        borderValue=(255, 255, 255),
    )
def run_ocr_on_array(reader, arr): def run_ocr_on_array(reader, arr):
tmp = "_tmp_ocr.png" tmp = "_tmp_ocr.png"
cv2.imwrite(tmp, arr) cv2.imwrite(tmp, arr)
@@ -165,35 +222,104 @@ def run_ocr_on_array(reader, arr):
if os.path.exists(tmp): if os.path.exists(tmp):
os.remove(tmp) os.remove(tmp)
def rebuild_text_from_ocr_result(res):
    """Rebuild one normalized string from raw OCR detections in reading order.

    *res* is an iterable of ``(quad, text, confidence)`` triples; entries of
    any other length or with blank text are ignored. Detections are grouped
    into rows by vertical centre, rows are ordered top to bottom, and the
    detections inside a row left to right. Returns ``""`` when nothing
    usable remains.
    """
    if not res:
        return ""

    # (text, x-centre, y-centre, height) for every usable detection.
    tokens = []
    for item in res:
        if len(item) != 3:
            continue
        bbox, txt, _conf = item
        if not txt or not txt.strip():
            continue
        b = quad_bbox(bbox)
        xc = (b[0] + b[2]) / 2.0
        yc = (b[1] + b[3]) / 2.0
        h = max(1.0, b[3] - b[1])
        tokens.append((txt, xc, yc, h))

    if not tokens:
        return ""

    # Two detections share a row when their vertical centres differ by at
    # most 75% of the median detection height (never less than 6 px).
    med_h = float(np.median([tok[3] for tok in tokens]))
    row_tol = max(6.0, med_h * 0.75)

    tokens.sort(key=lambda tok: tok[2])
    rows = []
    for tok in tokens:
        for row in rows:
            if abs(tok[2] - row["yc"]) <= row_tol:
                row["m"].append(tok)
                # Keep the row centre at the mean of its members.
                row["yc"] = float(np.mean([member[2] for member in row["m"]]))
                break
        else:
            rows.append({"yc": tok[2], "m": [tok]})

    rows.sort(key=lambda row: row["yc"])

    lines = []
    for row in rows:
        members = sorted(row["m"], key=lambda tok: tok[1])
        line = normalize_text(" ".join(tok[0] for tok in members))
        if line:
            lines.append(line)

    return normalize_text(" ".join(lines))
def reread_crop_robust(image, bbox, reader, upscale=3.0, pad=22):
    """Re-run OCR on a padded, upscaled crop and keep the best-scoring reading.

    The crop around *bbox* (x1, y1, x2, y2), padded by *pad* pixels and
    clamped to the image, is resized by *upscale*. Every preprocessing mode
    is tried at three small rotation angles; each OCR result is rebuilt into
    text and rated with ``ocr_candidate_score``.

    Returns ``(text, score)`` for the best candidate, or ``(None, 0.0)`` when
    the crop is empty or no pass produced text with a positive score.
    """
    ih, iw = image.shape[:2]
    x1, y1, x2, y2 = bbox
    x1 = max(0, int(x1 - pad))
    y1 = max(0, int(y1 - pad))
    x2 = min(iw, int(x2 + pad))
    y2 = min(ih, int(y2 + pad))

    crop = image[y1:y2, x1:x2]
    if crop.size == 0:
        return None, 0.0

    up = cv2.resize(
        crop,
        (int(crop.shape[1] * upscale), int(crop.shape[0] * upscale)),
        interpolation=cv2.INTER_CUBIC
    )

    modes = ["raw", "clahe", "adaptive", "otsu", "invert"]
    angles = [0.0, 1.5, -1.5]

    best_text, best_score = "", 0.0

    for mode in modes:
        proc = preprocess_variant(up, mode)
        if proc.ndim == 3:
            proc = cv2.cvtColor(proc, cv2.COLOR_BGR2GRAY)

        for angle in angles:
            # Rotate the single-channel image directly. Converting to BGR
            # first made the scalar white border fill only one channel, which
            # showed up as dark corners once converted back to grayscale.
            rotated = rotate_image_keep_bounds(proc, angle)
            res = run_ocr_on_array(reader, rotated)
            txt = rebuild_text_from_ocr_result(res)
            sc = ocr_candidate_score(txt)
            if sc > best_score:
                best_text, best_score = txt, sc

    if not best_text:
        return None, 0.0
    return best_text, best_score
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
# LINES + YELLOW BOXES # LINES + YELLOW BOXES
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
def build_lines_from_indices(indices, ocr): def build_lines_from_indices(indices, ocr):
if not indices: if not indices:
@@ -233,14 +359,13 @@ def build_lines_from_indices(indices, ocr):
return lines return lines
def build_line_boxes_from_indices(indices, ocr): def build_line_boxes_from_indices(indices, ocr):
""" """
Robust yellow-box generation with punctuation attachment: Robust yellow-box generation with punctuation attachment:
- row grouping - row grouping
- chunking by x gap - chunking by x gap
- attach tiny punctuation/special tokens to nearest chunk - attach tiny punctuation/special tokens to nearest chunk
- coverage guarantee - token coverage guarantee
""" """
if not indices: if not indices:
return [] return []
@@ -264,13 +389,13 @@ def build_line_boxes_from_indices(indices, ocr):
pad = max(1, int(round(med_h * 0.12))) pad = max(1, int(round(med_h * 0.12)))
def is_punct_like(t): def is_punct_like(t):
raw = t.strip() raw = (t or "").strip()
if raw == "": if raw == "":
return True return True
punct_ratio = sum(1 for c in raw if not c.isalnum()) / max(1, len(raw)) punct_ratio = sum(1 for c in raw if not c.isalnum()) / max(1, len(raw))
return punct_ratio >= 0.5 or len(raw) <= 2 return punct_ratio >= 0.5 or len(raw) <= 2
# 1) rows # 1) row grouping
items_sorted = sorted(items, key=lambda x: x["yc"]) items_sorted = sorted(items, key=lambda x: x["yc"])
rows = [] rows = []
for it in items_sorted: for it in items_sorted:
@@ -296,7 +421,7 @@ def build_line_boxes_from_indices(indices, ocr):
normal = mem normal = mem
punct = [] punct = []
# 2) chunk normal tokens # 2) chunk normal by x-gap
chunks = [] chunks = []
cur = [normal[0]] cur = [normal[0]]
for t in normal[1:]: for t in normal[1:]:
@@ -310,18 +435,17 @@ def build_line_boxes_from_indices(indices, ocr):
cur = [t] cur = [t]
chunks.append(cur) chunks.append(cur)
# 3) attach punctuation tokens # 3) attach punct tokens to nearest chunk
for p in punct: for p in punct:
pb = p["b"] pb = p["b"]
pxc, pyc = p["xc"], p["yc"] pxc, pyc = p["xc"], p["yc"]
best_k = -1 best_k = -1
best_score = 1e18 best_score = 1e18
for k, ch in enumerate(chunks): for k, ch in enumerate(chunks):
ub = boxes_union_xyxy([x["b"] for x in ch]) ub = boxes_union_xyxy([x["b"] for x in ch])
cx = (ub[0] + ub[2]) / 2.0 cx = (ub[0] + ub[2]) / 2.0
cy = (ub[1] + ub[3]) / 2.0 cy = (ub[1] + ub[3]) / 2.0
dx = abs(pxc - cx) dx = abs(pxc - cx)
dy = abs(pyc - cy) dy = abs(pyc - cy)
score = dx + 1.8 * dy score = dx + 1.8 * dy
@@ -339,22 +463,21 @@ def build_line_boxes_from_indices(indices, ocr):
else: else:
chunks.append([p]) chunks.append([p])
# 4) chunk boxes # 4) emit chunk boxes
for ch in chunks: for ch in chunks:
ub = boxes_union_xyxy([x["b"] for x in ch]) ub = boxes_union_xyxy([x["b"] for x in ch])
if ub: if ub:
x1, y1, x2, y2 = ub x1, y1, x2, y2 = ub
out_boxes.append((x1 - pad, y1 - pad, x2 + pad, y2 + pad)) out_boxes.append((x1 - pad, y1 - pad, x2 + pad, y2 + pad))
# 5) guarantee all tokens included # 5) guarantee every token is inside some yellow box
token_boxes = [it["b"] for it in items] token_boxes = [it["b"] for it in items]
def inside(tb, lb): def inside(tb, lb):
return tb[0] >= lb[0] and tb[1] >= lb[1] and tb[2] <= lb[2] and tb[3] <= lb[3] return tb[0] >= lb[0] and tb[1] >= lb[1] and tb[2] <= lb[2] and tb[3] <= lb[3]
for tb in token_boxes: for tb in token_boxes:
ok = any(inside(tb, lb) for lb in out_boxes) if not any(inside(tb, lb) for lb in out_boxes):
if not ok:
x1, y1, x2, y2 = tb x1, y1, x2, y2 = tb
out_boxes.append((x1 - pad, y1 - pad, x2 + pad, y2 + pad)) out_boxes.append((x1 - pad, y1 - pad, x2 + pad, y2 + pad))
@@ -366,8 +489,8 @@ def build_line_boxes_from_indices(indices, ocr):
ix1 = max(b[0], m[0]); iy1 = max(b[1], m[1]) ix1 = max(b[0], m[0]); iy1 = max(b[1], m[1])
ix2 = min(b[2], m[2]); iy2 = min(b[3], m[3]) ix2 = min(b[2], m[2]); iy2 = min(b[3], m[3])
inter = max(0, ix2 - ix1) * max(0, iy2 - iy1) inter = max(0, ix2 - ix1) * max(0, iy2 - iy1)
a1 = max(1, (b[2]-b[0])*(b[3]-b[1])) a1 = max(1, (b[2] - b[0]) * (b[3] - b[1]))
a2 = max(1, (m[2]-m[0])*(m[3]-m[1])) a2 = max(1, (m[2] - m[0]) * (m[3] - m[1]))
iou = inter / float(a1 + a2 - inter) if (a1 + a2 - inter) > 0 else 0.0 iou = inter / float(a1 + a2 - inter) if (a1 + a2 - inter) > 0 else 0.0
if iou > 0.72: if iou > 0.72:
merged[i] = boxes_union_xyxy([b, m]) merged[i] = boxes_union_xyxy([b, m])
@@ -381,7 +504,7 @@ def build_line_boxes_from_indices(indices, ocr):
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
# GROUPING # GROUPING
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
def auto_gap(image_path, base=18, ref_w=750): def auto_gap(image_path, base=18, ref_w=750):
img = cv2.imread(image_path) img = cv2.imread(image_path)
@@ -426,7 +549,13 @@ def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=3):
for i in range(n): for i in range(n):
groups.setdefault(find(i), []).append(i) groups.setdefault(find(i), []).append(i)
sorted_groups = sorted(groups.values(), key=lambda idxs: (min(boxes[i][1] for i in idxs), min(boxes[i][0] for i in idxs))) sorted_groups = sorted(
groups.values(),
key=lambda idxs: (
min(boxes[i][1] for i in idxs),
min(boxes[i][0] for i in idxs)
)
)
bubbles = {} bubbles = {}
bubble_boxes = {} bubble_boxes = {}
@@ -436,6 +565,7 @@ def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=3):
ih, iw = image_shape[:2] ih, iw = image_shape[:2]
for bid, idxs in enumerate(sorted_groups, start=1): for bid, idxs in enumerate(sorted_groups, start=1):
idxs = sorted(idxs, key=lambda k: boxes[k][1]) idxs = sorted(idxs, key=lambda k: boxes[k][1])
lines = build_lines_from_indices(idxs, ocr) lines = build_lines_from_indices(idxs, ocr)
quads = [ocr[k][0] for k in idxs] quads = [ocr[k][0] for k in idxs]
ub = boxes_union_xyxy([quad_bbox(q) for q in quads]) ub = boxes_union_xyxy([quad_bbox(q) for q in quads])
@@ -443,8 +573,10 @@ def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=3):
continue continue
x1, y1, x2, y2 = ub x1, y1, x2, y2 = ub
x1 = max(0, x1 - bbox_padding); y1 = max(0, y1 - bbox_padding) x1 = max(0, x1 - bbox_padding)
x2 = min(iw, x2 + bbox_padding); y2 = min(ih, y2 + bbox_padding) y1 = max(0, y1 - bbox_padding)
x2 = min(iw, x2 + bbox_padding)
y2 = min(ih, y2 + bbox_padding)
bubbles[bid] = lines bubbles[bid] = lines
bubble_boxes[bid] = (x1, y1, x2, y2) bubble_boxes[bid] = (x1, y1, x2, y2)
@@ -455,23 +587,24 @@ def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=3):
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
# DEBUG # DEBUG
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
def save_debug_clusters(image_path, ocr, bubble_boxes, bubble_indices, out_path="debug_clusters.png"): def save_debug_clusters(image_path, ocr, bubble_boxes, bubble_indices, out_path="debug_clusters.png"):
img = cv2.imread(image_path) img = cv2.imread(image_path)
if img is None: if img is None:
return return
# token quads # OCR token quads
for bbox, txt, conf in ocr: for bbox, txt, conf in ocr:
pts = np.array(bbox, dtype=np.int32) pts = np.array(bbox, dtype=np.int32)
cv2.polylines(img, [pts], True, (180, 180, 180), 1) cv2.polylines(img, [pts], True, (180, 180, 180), 1)
# bubble boxes + yellow line boxes # Bubble + line boxes
for bid, bb in bubble_boxes.items(): for bid, bb in bubble_boxes.items():
x1, y1, x2, y2 = bb x1, y1, x2, y2 = bb
cv2.rectangle(img, (x1, y1), (x2, y2), (0, 220, 0), 2) cv2.rectangle(img, (x1, y1), (x2, y2), (0, 220, 0), 2)
cv2.putText(img, f"BOX#{bid}", (x1 + 2, y1 + 16), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 220, 0), 2) cv2.putText(img, f"BOX#{bid}", (x1 + 2, y1 + 16),
cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 220, 0), 2)
idxs = bubble_indices.get(bid, []) idxs = bubble_indices.get(bid, [])
line_boxes = build_line_boxes_from_indices(idxs, ocr) line_boxes = build_line_boxes_from_indices(idxs, ocr)
@@ -485,7 +618,7 @@ def save_debug_clusters(image_path, ocr, bubble_boxes, bubble_indices, out_path=
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
# EXPORT # EXPORT
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
def estimate_reading_order(bbox_dict, mode="ltr"): def estimate_reading_order(bbox_dict, mode="ltr"):
items = [] items = []
@@ -494,7 +627,7 @@ def estimate_reading_order(bbox_dict, mode="ltr"):
cy = (y1 + y2) / 2.0 cy = (y1 + y2) / 2.0
items.append((bid, cx, cy)) items.append((bid, cx, cy))
items.sort(key=lambda t: t[2]) items.sort(key=lambda t: t[2]) # top to bottom
rows = [] rows = []
tol = 90 tol = 90
@@ -517,7 +650,6 @@ def estimate_reading_order(bbox_dict, mode="ltr"):
return {bid: i + 1 for i, bid in enumerate(order)} return {bid: i + 1 for i, bid in enumerate(order)}
def export_bubbles(filepath, bbox_dict, quads_dict, indices_dict, ocr, reading_map, image_shape): def export_bubbles(filepath, bbox_dict, quads_dict, indices_dict, ocr, reading_map, image_shape):
out = {} out = {}
@@ -536,11 +668,15 @@ def export_bubbles(filepath, bbox_dict, quads_dict, indices_dict, ocr, reading_m
out[str(bid)] = { out[str(bid)] = {
"x": int(x1), "y": int(y1), "w": int(x2 - x1), "h": int(y2 - y1), "x": int(x1), "y": int(y1), "w": int(x2 - x1), "h": int(y2 - y1),
"reading_order": int(reading_map.get(bid, bid)), "reading_order": int(reading_map.get(bid, bid)),
"quad_bboxes": [{"x": int(b[0]), "y": int(b[1]), "w": int(b[2]-b[0]), "h": int(b[3]-b[1])} for b in qboxes], "quad_bboxes": [
"quads": [[[int(p[0]), int(p[1])] for p in q] for q in quads], {"x": int(b[0]), "y": int(b[1]), "w": int(b[2] - b[0]), "h": int(b[3] - b[1])}
for b in qboxes
],
"quads": [
[[int(p[0]), int(p[1])] for p in q] for q in quads
],
"text_bbox": xyxy_to_xywh(text_union), "text_bbox": xyxy_to_xywh(text_union),
# yellow geometry
"line_bboxes": [xyxy_to_xywh(lb) for lb in line_boxes_xyxy], "line_bboxes": [xyxy_to_xywh(lb) for lb in line_boxes_xyxy],
"line_union_bbox": xyxy_to_xywh(line_union_xyxy) if line_union_xyxy else None, "line_union_bbox": xyxy_to_xywh(line_union_xyxy) if line_union_xyxy else None,
"line_union_area": int(line_union_area), "line_union_area": int(line_union_area),
@@ -551,7 +687,7 @@ def export_bubbles(filepath, bbox_dict, quads_dict, indices_dict, ocr, reading_m
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
# MAIN # MAIN
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
def translate_manga_text( def translate_manga_text(
image_path, image_path,
@@ -606,6 +742,7 @@ def translate_manga_text(
skipped += 1 skipped += 1
continue continue
# reduce false positives in very top strip
if qb[1] < int(ih * TOP_BAND_RATIO): if qb[1] < int(ih * TOP_BAND_RATIO):
if conf < 0.70 and len(t) >= 5: if conf < 0.70 and len(t) >= 5:
skipped += 1 skipped += 1
@@ -633,14 +770,28 @@ def translate_manga_text(
translator = GoogleTranslator(source=source_lang, target=target_lang) translator = GoogleTranslator(source=source_lang, target=target_lang)
# robust bubble text cleanup
clean_lines = {} clean_lines = {}
for bid, lines in bubbles.items(): for bid, lines in bubbles.items():
txt = normalize_text(" ".join(lines)) base_txt = normalize_text(" ".join(lines))
q = ocr_quality_score(txt) base_sc = ocr_candidate_score(base_txt)
if q < quality_threshold:
reread = reread_crop(image, bubble_boxes[bid], reader, upscale=2.5, pad=18) # only robust reread on low quality
if reread: if base_sc < quality_threshold:
txt = normalize_text(reread) rr_txt, rr_sc = reread_crop_robust(
image,
bubble_boxes[bid],
reader,
upscale=3.0,
pad=22
)
if rr_txt and rr_sc > base_sc + 0.06:
txt = rr_txt
else:
txt = base_txt
else:
txt = base_txt
clean_lines[bid] = apply_glossary(txt) clean_lines[bid] = apply_glossary(txt)
reading_map = estimate_reading_order(bubble_boxes, mode=reading_mode) reading_map = estimate_reading_order(bubble_boxes, mode=reading_mode)
@@ -657,18 +808,24 @@ def translate_manga_text(
src = clean_lines[bid].strip() src = clean_lines[bid].strip()
if not src: if not src:
continue continue
flags = []
flags = []
try: try:
tgt = translator.translate(src) or "" tgt = translator.translate(src) or ""
except Exception as e: except Exception as e:
tgt = f"[Translation error: {e}]" tgt = f"[Translation error: {e}]"
flags.append("TRANSLATION_ERROR")
tgt = apply_glossary(postprocess_translation_general(tgt)).upper() tgt = apply_glossary(postprocess_translation_general(tgt)).upper()
src_u = src.upper() src_u = src.upper()
out_lines.append(f"#{bid}|{reading_map.get(bid,bid)}|{src_u}|{tgt}|{','.join(flags) if flags else '-'}") out_lines.append(
print(f"#{bid:<7} {reading_map.get(bid,bid):<6} {src_u[:50]:<50} {tgt[:50]:<50} {','.join(flags) if flags else '-'}") f"#{bid}|{reading_map.get(bid,bid)}|{src_u}|{tgt}|{','.join(flags) if flags else '-'}"
)
print(
f"#{bid:<7} {reading_map.get(bid,bid):<6} "
f"{src_u[:50]:<50} {tgt[:50]:<50} {','.join(flags) if flags else '-'}"
)
translated_count += 1 translated_count += 1
out_lines.append(divider) out_lines.append(divider)
@@ -691,13 +848,13 @@ def translate_manga_text(
print(f"Saved: {export_to_file}") print(f"Saved: {export_to_file}")
print(f"Saved: {export_bubbles_to}") print(f"Saved: {export_bubbles_to}")
if debug: if debug:
print("Saved: debug_clusters.png (special chars included in yellow boxes)") print("Saved: debug_clusters.png")
if __name__ == "__main__": if __name__ == "__main__":
translate_manga_text( translate_manga_text(
image_path="002-page.png", image_path="001-page.png",
source_lang="en", source_lang="it",
target_lang="ca", target_lang="ca",
confidence_threshold=0.12, confidence_threshold=0.12,
min_text_length=1, min_text_length=1,

View File

@@ -2,60 +2,76 @@
""" """
pipeline.py pipeline.py
─────────────────────────────────────────────────────────────── ───────────────────────────────────────────────────────────────
Translation-only pipeline for Dandadan_059_2022_Digital Translation + render pipeline
Flow per page: Flow per page:
1. Run translate_manga_text() output.txt + bubbles.json 1) translate_manga_text() -> output.txt + bubbles.json (+ debug_clusters.png if DEBUG)
2. Copy original image to workdir for reference 2) render_translations() -> page_translated.png
3) Pack CBZ with originals + rendered pages + text outputs
Folder structure produced: Folder structure:
Dandadan_059_2022_Digital_1r0n/ <CHAPTER_DIR>/
├── 000.png
├── 001.png
└── translated/ └── translated/
├── 00/ ├── 000/
│ ├── output.txt ← translations to review │ ├── output.txt
│ ├── bubbles.json ← bubble boxes │ ├── bubbles.json
── debug_clusters.png ← cluster debug (if DEBUG=True) ── page_translated.png
├── 01/ │ └── debug_clusters.png (optional)
├── 001/
│ └── ... │ └── ...
└── ... └── ...
Dandadan_059_translated.cbz ← original pages + translations CBZ:
zipped for reference - pages/<original pages>
- rendered/<page_stem>_translated.png
- translations/<page_stem>_output.txt
""" """
import os import os
import sys import sys
import shutil
import zipfile import zipfile
import importlib.util import importlib.util
from pathlib import Path from pathlib import Path
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
# CONFIG — edit these as needed # CONFIG
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
CHAPTER_DIR = "/Users/guillemhernandezsola/Downloads/Spy_x_Family_076_2023_Digital_1r0n" CHAPTER_DIR = "/Users/guillemhernandezsola/Downloads/Dandadan_059_2022_Digital_1r0n"
OUTPUT_CBZ = "/Users/guillemhernandezsola/Downloads/Spy_x_Family_076_2023_Digital_1r0n_translated.cbz" OUTPUT_CBZ = "/Users/guillemhernandezsola/Downloads/Dandadan_059_2022_Digital_1r0n_translated.cbz"
SOURCE_LANG = "en"
TARGET_LANG = "ca"
# manga-translator.py settings SOURCE_LANG = "en"
TARGET_LANG = "ca"
# translator (NEW signature-compatible)
CONFIDENCE_THRESHOLD = 0.10 CONFIDENCE_THRESHOLD = 0.10
MIN_TEXT_LENGTH = 2 MIN_TEXT_LENGTH = 1
CLUSTER_EPS = "auto" GAP_PX = "auto" # was cluster/proximity in old version
PROXIMITY_PX = 80
FILTER_SFX = True FILTER_SFX = True
QUALITY_THRESHOLD = 0.5 QUALITY_THRESHOLD = 0.50
UPSCALE_FACTOR = 2.5 READING_MODE = "ltr"
BBOX_PADDING = 5
DEBUG = True DEBUG = True
# renderer
RENDER_ENABLED = True
RENDER_OUTPUT_NAME = "page_translated.png"
# optional custom font list for renderer
FONT_CANDIDATES = [
"fonts/ComicNeue-Regular.ttf",
"fonts/ComicRelief-Regular.ttf"
]
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
# DYNAMIC MODULE LOADER # DYNAMIC MODULE LOADER
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
def load_module(name, filepath):
    """Import the Python source file at *filepath* as a module called *name*.

    This allows loading scripts whose file names are not valid identifiers.
    The module is executed and returned; it is not registered in
    ``sys.modules``.

    Raises:
        FileNotFoundError: if no import spec or loader can be built for
            *filepath*. Errors raised while executing the module propagate.
    """
    spec = importlib.util.spec_from_file_location(name, filepath)
    if spec is None or spec.loader is None:
        raise FileNotFoundError(f"Cannot load spec for {filepath}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module
@@ -65,10 +81,10 @@ def load_module(name, filepath):
# HELPERS # HELPERS
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
def sorted_pages(chapter_dir):
    """Return the image files directly inside *chapter_dir* in page order.

    Only regular files with a .jpg, .jpeg, .png or .webp suffix (any case)
    are returned. Pages are ordered by a natural sort of their stem, so
    "2" comes before "10" even when names are not zero-padded; zero-padded
    names sort exactly as they would with a plain string sort.
    """
    # Local import: `re` is not among the file-level imports visible here.
    import re

    exts = {".jpg", ".jpeg", ".png", ".webp"}

    def natural_key(path):
        # re.split with a capturing group alternates text / digit-run, so odd
        # positions are always digit runs and keys compare type-consistently.
        parts = re.split(r"(\d+)", path.stem)
        key = [int(tok) if idx % 2 else tok for idx, tok in enumerate(parts)]
        # File name as tie-breaker keeps the order deterministic.
        return key, path.name

    pages = [
        p for p in Path(chapter_dir).iterdir()
        if p.is_file() and p.suffix.lower() in exts
    ]
    return sorted(pages, key=natural_key)
@@ -80,82 +96,97 @@ def make_page_workdir(chapter_dir, page_stem):
def pack_cbz(chapter_dir, translated_dir, output_cbz): def pack_cbz(chapter_dir, translated_dir, output_cbz):
""" exts = {".jpg", ".jpeg", ".png", ".webp"}
Packs into CBZ:
- All original pages (from chapter_dir root) pages = sorted(
- All output.txt (one per page subfolder)
Sorted by page stem for correct reading order.
"""
exts = {".jpg", ".jpeg", ".png", ".webp"}
pages = sorted(
[p for p in Path(chapter_dir).iterdir() [p for p in Path(chapter_dir).iterdir()
if p.suffix.lower() in exts], if p.is_file() and p.suffix.lower() in exts],
key=lambda p: p.stem key=lambda p: p.stem
) )
txts = sorted(
txts = sorted(
translated_dir.rglob("output.txt"), translated_dir.rglob("output.txt"),
key=lambda p: p.parent.name key=lambda p: p.parent.name
) )
rendered = sorted(
translated_dir.rglob(RENDER_OUTPUT_NAME),
key=lambda p: p.parent.name
)
if not pages: if not pages:
print("⚠️ No original pages found — CBZ not created.") print("⚠️ No original pages found — CBZ not created.")
return return
with zipfile.ZipFile(output_cbz, "w", with zipfile.ZipFile(output_cbz, "w", compression=zipfile.ZIP_STORED) as zf:
compression=zipfile.ZIP_STORED) as zf: # original pages
# Original pages
for img in pages: for img in pages:
arcname = f"pages/{img.name}" arcname = f"pages/{img.name}"
zf.write(img, arcname) zf.write(img, arcname)
print(f" 🖼 {arcname}") print(f" 🖼 {arcname}")
# Translation text files # rendered pages
for rp in rendered:
arcname = f"rendered/{rp.parent.name}_translated.png"
zf.write(rp, arcname)
print(f" 🎨 {arcname}")
# text outputs
for txt in txts: for txt in txts:
arcname = f"translations/{txt.parent.name}_output.txt" arcname = f"translations/{txt.parent.name}_output.txt"
zf.write(txt, arcname) zf.write(txt, arcname)
print(f" 📄 {arcname}") print(f" 📄 {arcname}")
print(f"\n✅ CBZ saved → {output_cbz} " print(
f"({len(pages)} page(s), {len(txts)} translation(s))") f"\n✅ CBZ saved → {output_cbz} "
f"({len(pages)} original, {len(rendered)} rendered, {len(txts)} text)"
)
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
# PER-PAGE PIPELINE # PER-PAGE PIPELINE
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
def process_page(page_path, workdir, translator_module): def process_page(page_path, workdir, translator_module, renderer_module):
""" """
Runs translator for a single page. Runs translator + renderer for one page.
All output files land in workdir. All generated files are written inside workdir.
Returns True on success, False on failure.
""" """
print(f"\n{''*60}") print(f"\n{'' * 70}")
print(f" PAGE: {page_path.name}") print(f"PAGE: {page_path.name}")
print(f"{''*60}") print(f"{'' * 70}")
orig_dir = os.getcwd() orig_dir = os.getcwd()
try: try:
# chdir into workdir so debug_clusters.png,
# temp files etc. all land there
os.chdir(workdir) os.chdir(workdir)
# 1) translate
translator_module.translate_manga_text( translator_module.translate_manga_text(
image_path = str(page_path.resolve()), image_path= str(page_path.resolve()),
source_lang = SOURCE_LANG, source_lang=SOURCE_LANG,
target_lang = TARGET_LANG, target_lang=TARGET_LANG,
confidence_threshold = CONFIDENCE_THRESHOLD, confidence_threshold=CONFIDENCE_THRESHOLD,
export_to_file = "output.txt", min_text_length=MIN_TEXT_LENGTH,
export_bubbles_to = "bubbles.json", gap_px=GAP_PX,
min_text_length = MIN_TEXT_LENGTH, filter_sound_effects=FILTER_SFX,
cluster_eps = CLUSTER_EPS, quality_threshold=QUALITY_THRESHOLD,
proximity_px = PROXIMITY_PX, export_to_file="output.txt",
filter_sound_effects = FILTER_SFX, export_bubbles_to="bubbles.json",
quality_threshold = QUALITY_THRESHOLD, reading_mode=READING_MODE,
upscale_factor = UPSCALE_FACTOR, debug=DEBUG
bbox_padding = BBOX_PADDING,
debug = DEBUG,
) )
print(" ✅ translator done")
# 2) render
if RENDER_ENABLED:
renderer_module.render_translations(
input_image=str(page_path.resolve()),
output_image=RENDER_OUTPUT_NAME,
translations_file="output.txt",
bubbles_file="bubbles.json",
font_candidates=FONT_CANDIDATES
)
print(" ✅ renderer done")
print(f" ✅ Translated → {workdir}")
return True return True
except Exception as e: except Exception as e:
@@ -170,16 +201,20 @@ def process_page(page_path, workdir, translator_module):
# MAIN # MAIN
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
def main(): def main():
# ── Load translator module ──────────────────────────────────── print("Loading modules...")
print("Loading manga-translator.py...")
try: try:
translator = load_module( translator = load_module("manga_translator", "manga-translator.py")
"manga_translator", "manga-translator.py") except Exception as e:
except FileNotFoundError as e: print(f"❌ Could not load manga-translator.py: {e}")
print(f"❌ Could not load module: {e}") sys.exit(1)
try:
renderer = load_module("manga_renderer", "manga-renderer.py")
except Exception as e:
print(f"❌ Could not load manga-renderer.py: {e}")
sys.exit(1) sys.exit(1)
# ── Discover pages ────────────────────────────────────────────
pages = sorted_pages(CHAPTER_DIR) pages = sorted_pages(CHAPTER_DIR)
if not pages: if not pages:
print(f"❌ No images found in: {CHAPTER_DIR}") print(f"❌ No images found in: {CHAPTER_DIR}")
@@ -187,33 +222,31 @@ def main():
print(f"\n📖 Chapter : {CHAPTER_DIR}") print(f"\n📖 Chapter : {CHAPTER_DIR}")
print(f" Pages : {len(pages)}") print(f" Pages : {len(pages)}")
print(f" Source : {SOURCE_LANG} Target: {TARGET_LANG}\n") print(f" Source : {SOURCE_LANG} Target: {TARGET_LANG}")
print(f" Render : {'ON' if RENDER_ENABLED else 'OFF'}\n")
# ── Process each page ─────────────────────────────────────────
translated_dir = Path(CHAPTER_DIR) / "translated" translated_dir = Path(CHAPTER_DIR) / "translated"
succeeded = [] succeeded = []
failed = [] failed = []
for i, page_path in enumerate(pages, start=1): for i, page_path in enumerate(pages, start=1):
print(f"\n[{i}/{len(pages)}] {page_path.name}") print(f"[{i}/{len(pages)}] {page_path.name}")
workdir = make_page_workdir(CHAPTER_DIR, page_path.stem) workdir = make_page_workdir(CHAPTER_DIR, page_path.stem)
ok = process_page(page_path, workdir, translator) ok = process_page(page_path, workdir, translator, renderer)
if ok: if ok:
succeeded.append(page_path.name) succeeded.append(page_path.name)
else: else:
failed.append(page_path.name) failed.append(page_path.name)
# ── Summary ─────────────────────────────────────────────────── print(f"\n{'' * 70}")
print(f"\n{''*60}") print("PIPELINE COMPLETE")
print(f" PIPELINE COMPLETE") print(f"{len(succeeded)} page(s) succeeded")
print(f"{len(succeeded)} page(s) succeeded")
if failed: if failed:
print(f" {len(failed)} page(s) failed:") print(f"{len(failed)} page(s) failed:")
for f in failed: for f in failed:
print(f" {f}") print(f"{f}")
print(f"{''*60}\n") print(f"{'' * 70}\n")
# ── Pack CBZ ──────────────────────────────────────────────────
print("Packing CBZ...") print("Packing CBZ...")
pack_cbz(CHAPTER_DIR, translated_dir, OUTPUT_CBZ) pack_cbz(CHAPTER_DIR, translated_dir, OUTPUT_CBZ)