Files
manga-translator/manga-translator.py
Guillem Hernandez Sola 0069da706b stable version
2026-04-14 19:25:22 +02:00

712 lines
23 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import json
import os
import re
import tempfile

import cv2
import easyocr
import numpy as np
from deep_translator import GoogleTranslator
# ─────────────────────────────────────────────
# CONFIG
# ─────────────────────────────────────────────
# Canonical spellings for proper nouns; apply_glossary() rewrites
# case-insensitive matches back to these exact forms.
GLOSSARY = {
    "ANYA": "ANYA",
    "STARLIGHT ANYA": "STARLIGHT ANYA",
    "MR. HENDERSON": "MR. HENDERSON",
    "HENDERSON": "HENDERSON",
    "STELLA STAR": "STELLA STAR",
}
# Onomatopoeia, matched by is_sound_effect() against a letters-only
# lowercase version of the detection.
SOUND_EFFECT_PATTERNS = [
    r"^b+i+p+$", r"^sha+$", r"^ha+$", r"^ah+$", r"^oh+$",
    r"^ugh+$", r"^bam+$", r"^pow+$", r"^boom+$", r"^bang+$",
    r"^crash+$", r"^thud+$", r"^zip+$", r"^swoosh+$", r"^chirp+$"
]
# Chapter/series headers that should be skipped, not translated (is_title_text).
TITLE_PATTERNS = [
    r"^(mission|chapter|episode|vol\.?|volume)\s*\d+$",
    r"^(spy|family|spy.family)$",
    r"^by\s+.+$",
]
# Detections with no alphanumeric content, or internal BOX markers (is_noise_text).
NOISE_PATTERNS = [
    r"^[^a-zA-Z0-9\?!.]+$",
    r"^BOX[0-9A-Z]*$",
]
# Fraction of the page height treated as the top title band; dubious
# detections inside it get extra filtering in translate_manga_text().
TOP_BAND_RATIO = 0.08
# ─────────────────────────────────────────────
# TEXT HELPERS
# ─────────────────────────────────────────────
def normalize_text(text):
    """Uppercase and canonicalize OCR text.

    Straightens typographic quotes/ellipsis, collapses whitespace, and fixes
    spacing around punctuation. Returns the cleaned, stripped string.
    """
    t = text.strip().upper()
    # BUG FIX: these were replace("", ...) calls (the curly-quote characters
    # were lost); str.replace("", x) inserts x between EVERY character,
    # destroying the text. Restore the intended typographic originals,
    # written as escapes so they cannot be silently stripped again.
    t = t.replace("\u201c", "\"").replace("\u201d", "\"")  # curly double quotes
    t = t.replace("\u2018", "'").replace("\u2019", "'")    # curly single quotes
    t = t.replace("\u2026", "...")                          # horizontal ellipsis
    t = re.sub(r"\s+", " ", t)
    t = re.sub(r"\s+([,.;:!?])", r"\1", t)  # no space before punctuation
    t = re.sub(r"\(\s+", "(", t)
    t = re.sub(r"\s+\)", ")", t)
    t = re.sub(r"\.{4,}", "...", t)  # clamp long dot runs to an ellipsis
    t = re.sub(r",\?", "?", t)       # common OCR artifact ",?" -> "?"
    return t.strip()
def apply_glossary(text):
    """Rewrite case-insensitive glossary matches to their canonical spelling.

    Longer keys are applied first so that multi-word entries win over any
    single-word entries they contain.
    """
    result = text
    for key in sorted(GLOSSARY, key=len, reverse=True):
        pattern = rf"\b{re.escape(key)}\b"
        result = re.sub(pattern, GLOSSARY[key], result, flags=re.IGNORECASE)
    return result
def postprocess_translation_general(text):
    """Normalize a translated string and tame repeated punctuation."""
    result = normalize_text(text)
    result = re.sub(r"\s{2,}", " ", result).strip()
    result = re.sub(r"([!?]){3,}", r"\1\1", result)  # "!!!!" -> "!!"
    result = re.sub(r"\.{4,}", "...", result)        # clamp long dot runs
    return result
# ─────────────────────────────────────────────
# FILTERS
# ─────────────────────────────────────────────
def is_sound_effect(text):
    """True when the letters-only text matches a known onomatopoeia pattern."""
    letters_only = re.sub(r"[^a-z]", "", text.strip().lower())
    for pattern in SOUND_EFFECT_PATTERNS:
        if re.fullmatch(pattern, letters_only, re.IGNORECASE):
            return True
    return False
def is_title_text(text):
    """True when the text looks like a chapter/series heading (TITLE_PATTERNS)."""
    lowered = text.strip().lower()
    for pattern in TITLE_PATTERNS:
        if re.fullmatch(pattern, lowered, re.IGNORECASE):
            return True
    return False
def is_noise_text(text):
    """True when the text has no usable characters or is an internal BOX marker."""
    stripped = text.strip()
    for pattern in NOISE_PATTERNS:
        if re.fullmatch(pattern, stripped):
            return True
    return False
# ─────────────────────────────────────────────
# GEOMETRY
# ─────────────────────────────────────────────
def quad_bbox(quad):
    """Axis-aligned integer (x1, y1, x2, y2) bounding box of a 4-point quad."""
    min_x = min(point[0] for point in quad)
    min_y = min(point[1] for point in quad)
    max_x = max(point[0] for point in quad)
    max_y = max(point[1] for point in quad)
    return (int(min_x), int(min_y), int(max_x), int(max_y))
def quad_center(quad):
    """Center of the quad's integer bounding box (same truncation as quad_bbox)."""
    xs = [point[0] for point in quad]
    ys = [point[1] for point in quad]
    left, right = int(min(xs)), int(max(xs))
    top, bottom = int(min(ys)), int(max(ys))
    return ((left + right) / 2.0, (top + bottom) / 2.0)
def boxes_union_xyxy(boxes):
    """Union of xyxy boxes, ignoring None entries; None when nothing remains."""
    valid = [box for box in boxes if box is not None]
    if not valid:
        return None
    x1s, y1s, x2s, y2s = zip(*valid)
    return (int(min(x1s)), int(min(y1s)), int(max(x2s)), int(max(y2s)))
def bbox_area_xyxy(b):
    """Area of an xyxy box; 0 for None or degenerate (inverted) boxes."""
    if b is None:
        return 0
    width = b[2] - b[0]
    height = b[3] - b[1]
    if width < 0:
        width = 0
    if height < 0:
        height = 0
    return int(width * height)
def xyxy_to_xywh(b):
    """Convert an xyxy tuple to an {x, y, w, h} dict; passes None through."""
    if b is None:
        return None
    x1, y1, x2, y2 = b
    width = x2 - x1
    height = y2 - y1
    return {
        "x": int(x1),
        "y": int(y1),
        "w": int(width if width > 0 else 0),
        "h": int(height if height > 0 else 0),
    }
def overlap_or_near(a, b, gap=0):
    """True when xyxy boxes a and b overlap or lie within `gap` px on BOTH axes."""
    horizontal_gap = max(a[0], b[0]) - min(a[2], b[2])
    vertical_gap = max(a[1], b[1]) - min(a[3], b[3])
    if horizontal_gap < 0:
        horizontal_gap = 0  # overlapping on x
    if vertical_gap < 0:
        vertical_gap = 0  # overlapping on y
    return horizontal_gap <= gap and vertical_gap <= gap
# ─────────────────────────────────────────────
# QUALITY
# ─────────────────────────────────────────────
def ocr_quality_score(text):
    """Heuristic 0..1 plausibility score for an OCR string.

    Base score is the fraction of alphabetic characters; runs of odd symbols
    and doubled commas are penalized, sentence-final punctuation gets a small
    bonus. Strings shorter than 2 characters score 0.
    """
    if not text or len(text) < 2:
        return 0.0
    letter_count = sum(c.isalpha() for c in text)
    score = letter_count / max(1, len(text))
    if re.search(r"[^\w\s\'\!\?\.,\-]{2,}", text):
        score -= 0.2  # runs of unexpected symbols
    if re.search(r",,", text):
        score -= 0.2  # ",," is a classic OCR artifact
    if re.search(r"[.!?]$", text):
        score += 0.05  # ends like a sentence
    return max(0.0, min(1.0, score))
# ─────────────────────────────────────────────
# OCR RE-READ
# ─────────────────────────────────────────────
def preprocess_variant(crop_bgr, mode):
    """Return one grayscale OCR-input variant of a BGR crop.

    mode: "raw" (plain gray), "clahe" (contrast-equalized), or
    "adaptive" (blur + adaptive threshold). Unknown modes fall back to gray.
    """
    gray = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2GRAY)
    if mode == "clahe":
        equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        return equalizer.apply(gray)
    if mode == "adaptive":
        denoised = cv2.GaussianBlur(gray, (3, 3), 0)
        return cv2.adaptiveThreshold(
            denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 35, 11
        )
    return gray
def run_ocr_on_array(reader, arr):
    """Run EasyOCR on an in-memory image by round-tripping through a temp PNG.

    BUG FIX: the original wrote a fixed "_tmp_ocr.png" in the working
    directory, which collides when two runs share a CWD; a unique
    tempfile.mkstemp path avoids the race. The file is always removed,
    even when OCR raises.
    """
    fd, tmp_path = tempfile.mkstemp(suffix=".png")
    os.close(fd)  # cv2.imwrite opens the path itself
    try:
        cv2.imwrite(tmp_path, arr)
        return reader.readtext(tmp_path, paragraph=False)
    finally:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
def reread_crop(image, bbox, reader, upscale=2.5, pad=18):
    """Re-OCR a padded, upscaled crop of `image` around `bbox`.

    Tries three preprocessing variants ("raw", "clahe", "adaptive") and keeps
    the merged line text with the highest ocr_quality_score. Returns the best
    text, or None when the crop is empty or nothing is read.
    """
    ih, iw = image.shape[:2]
    x1, y1, x2, y2 = bbox
    # Expand the box by `pad` pixels, clamped to the image bounds.
    x1 = max(0, int(x1 - pad)); y1 = max(0, int(y1 - pad))
    x2 = min(iw, int(x2 + pad)); y2 = min(ih, int(y2 + pad))
    crop = image[y1:y2, x1:x2]
    if crop.size == 0:
        return None
    # Upscale so small lettering becomes legible to the OCR model.
    up = cv2.resize(crop, (int(crop.shape[1] * upscale), int(crop.shape[0] * upscale)), interpolation=cv2.INTER_CUBIC)
    best = None
    for mode in ("raw", "clahe", "adaptive"):
        proc = preprocess_variant(up, mode)
        res = run_ocr_on_array(reader, proc)
        if not res:
            continue
        # Sort detections top-to-bottom, then left-to-right, by first quad corner.
        res.sort(key=lambda r: (r[0][0][1], r[0][0][0]))
        lines = [normalize_text(t) for _, t, _ in res if t.strip()]
        merged = re.sub(r"\s{2,}", " ", " ".join(lines)).strip()
        s = ocr_quality_score(merged)
        # Keep the highest-scoring variant.
        if best is None or s > best[0]:
            best = (s, merged)
    return best[1] if best else None
# ─────────────────────────────────────────────
# LINES + YELLOW BOXES
# ─────────────────────────────────────────────
def build_lines_from_indices(indices, ocr):
    """Assemble the text lines of one bubble from its token indices.

    Tokens are grouped into rows by vertical center (tolerance scaled by the
    median token height), rows sorted top-to-bottom, and each row's tokens
    joined left-to-right into one normalized line.
    """
    if not indices:
        return []
    items = []
    for i in indices:
        b = quad_bbox(ocr[i][0])
        xc = (b[0] + b[2]) / 2.0
        yc = (b[1] + b[3]) / 2.0
        h = max(1.0, b[3] - b[1])
        items.append((i, b, xc, yc, h))
    # Row tolerance scales with the median token height.
    med_h = float(np.median([it[4] for it in items])) if items else 10.0
    row_tol = max(6.0, med_h * 0.75)
    items.sort(key=lambda x: x[3])
    rows = []
    for it in items:
        i, b, xc, yc, h = it
        placed = False
        for r in rows:
            if abs(yc - r["yc"]) <= row_tol:
                r["m"].append((i, b, xc, yc))
                # Recenter the row on its members' mean y.
                r["yc"] = float(np.mean([k[3] for k in r["m"]]))
                placed = True
                break
        if not placed:
            rows.append({"yc": yc, "m": [(i, b, xc, yc)]})
    rows.sort(key=lambda r: r["yc"])
    lines = []
    for r in rows:
        # Left-to-right within the row, then join the raw OCR texts.
        mem = sorted(r["m"], key=lambda z: z[2])
        txt = normalize_text(" ".join(ocr[i][1] for i, _, _, _ in mem))
        lines.append(txt)
    return lines
def build_line_boxes_from_indices(indices, ocr):
    """
    Robust yellow-box generation with punctuation attachment:
    - row grouping
    - chunking by x gap
    - attach tiny punctuation/special tokens to nearest chunk
    - coverage guarantee

    Returns padded (x1, y1, x2, y2) boxes sorted top-to-bottom, left-to-right.
    """
    if not indices:
        return []
    items = []
    for i in indices:
        b = quad_bbox(ocr[i][0])
        txt = normalize_text(ocr[i][1])
        xc = (b[0] + b[2]) / 2.0
        yc = (b[1] + b[3]) / 2.0
        w = max(1.0, b[2] - b[0])
        h = max(1.0, b[3] - b[1])
        items.append({
            "i": i, "b": b, "txt": txt,
            "xc": xc, "yc": yc, "w": w, "h": h
        })
    # All tolerances scale with the median token height.
    med_h = float(np.median([it["h"] for it in items])) if items else 10.0
    row_tol = max(6.0, med_h * 0.90)
    gap_x_tol = max(8.0, med_h * 1.25)
    pad = max(1, int(round(med_h * 0.12)))
    def is_punct_like(t):
        # Tokens that are mostly punctuation (or very short) get attached to
        # a neighboring chunk instead of forming their own.
        raw = t.strip()
        if raw == "":
            return True
        punct_ratio = sum(1 for c in raw if not c.isalnum()) / max(1, len(raw))
        return punct_ratio >= 0.5 or len(raw) <= 2
    # 1) rows: group tokens by vertical center, recentering as members join
    items_sorted = sorted(items, key=lambda x: x["yc"])
    rows = []
    for it in items_sorted:
        placed = False
        for r in rows:
            if abs(it["yc"] - r["yc"]) <= row_tol:
                r["m"].append(it)
                r["yc"] = float(np.mean([k["yc"] for k in r["m"]]))
                placed = True
                break
        if not placed:
            rows.append({"yc": it["yc"], "m": [it]})
    rows.sort(key=lambda r: r["yc"])
    out_boxes = []
    for r in rows:
        mem = sorted(r["m"], key=lambda z: z["xc"])
        normal = [t for t in mem if not is_punct_like(t["txt"])]
        punct = [t for t in mem if is_punct_like(t["txt"])]
        if not normal:
            # Row of pure punctuation: treat all tokens as normal.
            normal = mem
            punct = []
        # 2) chunk normal tokens: split the row where the horizontal gap
        #    between consecutive boxes exceeds gap_x_tol
        chunks = []
        cur = [normal[0]]
        for t in normal[1:]:
            prev = cur[-1]["b"]
            b = t["b"]
            gap = b[0] - prev[2]
            if gap <= gap_x_tol:
                cur.append(t)
            else:
                chunks.append(cur)
                cur = [t]
        chunks.append(cur)
        # 3) attach punctuation tokens to the best-scoring chunk
        #    (weighted center distance, with a bonus for being adjacent)
        for p in punct:
            pb = p["b"]
            pxc, pyc = p["xc"], p["yc"]
            best_k = -1
            best_score = 1e18
            for k, ch in enumerate(chunks):
                ub = boxes_union_xyxy([x["b"] for x in ch])
                cx = (ub[0] + ub[2]) / 2.0
                cy = (ub[1] + ub[3]) / 2.0
                dx = abs(pxc - cx)
                dy = abs(pyc - cy)
                # Vertical offset is penalized more heavily than horizontal.
                score = dx + 1.8 * dy
                near = overlap_or_near(pb, ub, gap=int(med_h * 0.9))
                if near:
                    score -= med_h * 2.0
                if score < best_score:
                    best_score = score
                    best_k = k
            if best_k >= 0:
                chunks[best_k].append(p)
            else:
                chunks.append([p])
        # 4) chunk boxes: padded union of each chunk's member boxes
        for ch in chunks:
            ub = boxes_union_xyxy([x["b"] for x in ch])
            if ub:
                x1, y1, x2, y2 = ub
                out_boxes.append((x1 - pad, y1 - pad, x2 + pad, y2 + pad))
    # 5) guarantee all tokens included: any token box not fully inside an
    #    output box gets its own padded box
    token_boxes = [it["b"] for it in items]
    def inside(tb, lb):
        return tb[0] >= lb[0] and tb[1] >= lb[1] and tb[2] <= lb[2] and tb[3] <= lb[3]
    for tb in token_boxes:
        ok = any(inside(tb, lb) for lb in out_boxes)
        if not ok:
            x1, y1, x2, y2 = tb
            out_boxes.append((x1 - pad, y1 - pad, x2 + pad, y2 + pad))
    # 6) merge heavy overlaps (IoU > 0.72) into their union box
    merged = []
    for b in out_boxes:
        merged_into = False
        for i, m in enumerate(merged):
            ix1 = max(b[0], m[0]); iy1 = max(b[1], m[1])
            ix2 = min(b[2], m[2]); iy2 = min(b[3], m[3])
            inter = max(0, ix2 - ix1) * max(0, iy2 - iy1)
            a1 = max(1, (b[2]-b[0])*(b[3]-b[1]))
            a2 = max(1, (m[2]-m[0])*(m[3]-m[1]))
            iou = inter / float(a1 + a2 - inter) if (a1 + a2 - inter) > 0 else 0.0
            if iou > 0.72:
                merged[i] = boxes_union_xyxy([b, m])
                merged_into = True
                break
        if not merged_into:
            merged.append(b)
    merged.sort(key=lambda z: (z[1], z[0]))
    return merged
# ─────────────────────────────────────────────
# GROUPING
# ─────────────────────────────────────────────
def auto_gap(image_path, base=18, ref_w=750):
    """Scale the clustering gap by the image width relative to ref_w pixels.

    Falls back to `base` when the image cannot be loaded.
    """
    img = cv2.imread(image_path)
    if img is None:
        return base
    width = img.shape[1]
    return base * (width / ref_w)
def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=3):
    """Cluster OCR tokens into speech bubbles via union-find.

    Two tokens are joined when their boxes overlap or nearly touch (within
    gap_px), or when their centers are close relative to the median token
    height. Returns four dicts keyed by 1-based bubble id: text lines,
    padded bounding box (xyxy, clamped to the image), member quads, and
    member token indices.
    """
    n = len(ocr)
    if n == 0:
        return {}, {}, {}, {}
    boxes = [quad_bbox(r[0]) for r in ocr]
    centers = [quad_center(r[0]) for r in ocr]
    hs = [max(1.0, b[3] - b[1]) for b in boxes]
    med_h = float(np.median(hs)) if hs else 12.0
    # Center-distance threshold scales with typical glyph height.
    dist_thresh = max(20.0, med_h * 2.2)
    p = list(range(n))
    def find(x):
        # Union-find root lookup with path halving.
        while p[x] != x:
            p[x] = p[p[x]]
            x = p[x]
        return x
    def unite(a, b):
        p[find(a)] = find(b)
    for i in range(n):
        for j in range(i + 1, n):
            if overlap_or_near(boxes[i], boxes[j], gap=gap_px):
                unite(i, j)
                continue  # already joined; skip the center-distance test
            cx1, cy1 = centers[i]
            cx2, cy2 = centers[j]
            d = ((cx1 - cx2) ** 2 + (cy1 - cy2) ** 2) ** 0.5
            # Join close centers, but never across a large vertical offset.
            if d <= dist_thresh and abs(cy1 - cy2) <= med_h * 3.0:
                unite(i, j)
    groups = {}
    for i in range(n):
        groups.setdefault(find(i), []).append(i)
    # Stable bubble ids: clusters ordered top-to-bottom, then left-to-right.
    sorted_groups = sorted(groups.values(), key=lambda idxs: (min(boxes[i][1] for i in idxs), min(boxes[i][0] for i in idxs)))
    bubbles = {}
    bubble_boxes = {}
    bubble_quads = {}
    bubble_indices = {}
    ih, iw = image_shape[:2]
    for bid, idxs in enumerate(sorted_groups, start=1):
        idxs = sorted(idxs, key=lambda k: boxes[k][1])
        lines = build_lines_from_indices(idxs, ocr)
        quads = [ocr[k][0] for k in idxs]
        ub = boxes_union_xyxy([quad_bbox(q) for q in quads])
        if ub is None:
            continue
        x1, y1, x2, y2 = ub
        # Pad the union box, clamped to the image bounds.
        x1 = max(0, x1 - bbox_padding); y1 = max(0, y1 - bbox_padding)
        x2 = min(iw, x2 + bbox_padding); y2 = min(ih, y2 + bbox_padding)
        bubbles[bid] = lines
        bubble_boxes[bid] = (x1, y1, x2, y2)
        bubble_quads[bid] = quads
        bubble_indices[bid] = idxs
    return bubbles, bubble_boxes, bubble_quads, bubble_indices
# ─────────────────────────────────────────────
# DEBUG
# ─────────────────────────────────────────────
def save_debug_clusters(image_path, ocr, bubble_boxes, bubble_indices, out_path="debug_clusters.png"):
    """Write a debug overlay image: gray token quads, green bubble boxes with
    BOX#id labels, and yellow per-line boxes. Silently returns if the image
    cannot be loaded.
    """
    img = cv2.imread(image_path)
    if img is None:
        return
    # token quads
    for bbox, txt, conf in ocr:
        pts = np.array(bbox, dtype=np.int32)
        cv2.polylines(img, [pts], True, (180, 180, 180), 1)
    # bubble boxes + yellow line boxes
    for bid, bb in bubble_boxes.items():
        x1, y1, x2, y2 = bb
        cv2.rectangle(img, (x1, y1), (x2, y2), (0, 220, 0), 2)
        cv2.putText(img, f"BOX#{bid}", (x1 + 2, y1 + 16), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 220, 0), 2)
        idxs = bubble_indices.get(bid, [])
        line_boxes = build_line_boxes_from_indices(idxs, ocr)
        for lb in line_boxes:
            lx1, ly1, lx2, ly2 = lb
            # Clamp to the image bounds before drawing.
            lx1 = max(0, int(lx1)); ly1 = max(0, int(ly1))
            lx2 = min(img.shape[1] - 1, int(lx2)); ly2 = min(img.shape[0] - 1, int(ly2))
            cv2.rectangle(img, (lx1, ly1), (lx2, ly2), (0, 255, 255), 3)
    cv2.imwrite(out_path, img)
# ─────────────────────────────────────────────
# EXPORT
# ─────────────────────────────────────────────
def estimate_reading_order(bbox_dict, mode="ltr"):
    """Assign a 1-based reading order to bubbles.

    Bubbles are banded into rows by vertical center (90 px tolerance,
    recentered as members join), rows read top-to-bottom, and each row read
    left-to-right — or right-to-left when mode == "rtl".
    """
    centers = []
    for bid, (x1, y1, x2, y2) in bbox_dict.items():
        centers.append((bid, (x1 + x2) / 2.0, (y1 + y2) / 2.0))
    centers.sort(key=lambda entry: entry[2])
    band_tol = 90  # px: bubbles within this vertical distance share a row
    bands = []
    for entry in centers:
        for band in bands:
            if abs(entry[2] - band["cy"]) <= band_tol:
                band["items"].append(entry)
                band["cy"] = float(np.mean([e[2] for e in band["items"]]))
                break
        else:
            bands.append({"cy": entry[2], "items": [entry]})
    bands.sort(key=lambda band: band["cy"])
    reverse_x = (mode == "rtl")
    ordered_ids = []
    for band in bands:
        band["items"].sort(key=lambda e: e[1], reverse=reverse_x)
        ordered_ids.extend(e[0] for e in band["items"])
    return {bid: pos + 1 for pos, bid in enumerate(ordered_ids)}
def export_bubbles(filepath, bbox_dict, quads_dict, indices_dict, ocr, reading_map, image_shape):
    """Dump per-bubble geometry to a JSON file keyed by bubble id (as string).

    Each entry carries the padded bubble box (x/y/w/h), reading order, the
    member token quads and their bboxes, the tight text union box, and the
    yellow line-box geometry. `image_shape` is currently unused.
    """
    out = {}
    for bid, bb in bbox_dict.items():
        x1, y1, x2, y2 = bb
        quads = quads_dict.get(bid, [])
        idxs = indices_dict.get(bid, [])
        qboxes = [quad_bbox(q) for q in quads]
        text_union = boxes_union_xyxy(qboxes)
        line_boxes_xyxy = build_line_boxes_from_indices(idxs, ocr)
        line_union_xyxy = boxes_union_xyxy(line_boxes_xyxy)
        line_union_area = bbox_area_xyxy(line_union_xyxy)
        out[str(bid)] = {
            "x": int(x1), "y": int(y1), "w": int(x2 - x1), "h": int(y2 - y1),
            "reading_order": int(reading_map.get(bid, bid)),
            "quad_bboxes": [{"x": int(b[0]), "y": int(b[1]), "w": int(b[2]-b[0]), "h": int(b[3]-b[1])} for b in qboxes],
            "quads": [[[int(p[0]), int(p[1])] for p in q] for q in quads],
            "text_bbox": xyxy_to_xywh(text_union),
            # yellow geometry
            "line_bboxes": [xyxy_to_xywh(lb) for lb in line_boxes_xyxy],
            "line_union_bbox": xyxy_to_xywh(line_union_xyxy) if line_union_xyxy else None,
            "line_union_area": int(line_union_area),
        }
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(out, f, indent=2, ensure_ascii=False)
# ─────────────────────────────────────────────
# MAIN
# ─────────────────────────────────────────────
def translate_manga_text(
    image_path,
    source_lang="en",
    target_lang="ca",
    confidence_threshold=0.12,
    min_text_length=1,
    gap_px="auto",
    filter_sound_effects=True,
    quality_threshold=0.62,
    export_to_file="output.txt",
    export_bubbles_to="bubbles.json",
    reading_mode="ltr",
    debug=True
):
    """OCR a manga page, cluster text into bubbles, translate, and export.

    Pipeline: EasyOCR detection -> confidence/noise/SFX/title filtering ->
    bubble clustering -> low-quality bubbles re-OCRed from an upscaled crop ->
    Google translation -> pipe-delimited text table (export_to_file) +
    geometry JSON (export_bubbles_to) + optional debug overlay image.

    `confidence_threshold` and `min_text_length` gate raw detections;
    `gap_px` ("auto" scales with page width) controls clustering;
    `quality_threshold` triggers the re-OCR pass; `reading_mode` is
    "ltr" or "rtl". Returns None; all output goes to files and stdout.
    """
    image = cv2.imread(image_path)
    if image is None:
        print(f"❌ Cannot load image: {image_path}")
        return
    resolved_gap = auto_gap(image_path) if gap_px == "auto" else float(gap_px)
    print("Loading OCR...")
    # Catalan has no EasyOCR model; fall back to en+es readers.
    ocr_lang_list = ["en", "es"] if source_lang == "ca" else [source_lang]
    reader = easyocr.Reader(ocr_lang_list)
    print("Running OCR...")
    raw = reader.readtext(image_path, paragraph=False)
    print(f"Raw detections: {len(raw)}")
    filtered = []
    skipped = 0
    ih, iw = image.shape[:2]
    for bbox, text, conf in raw:
        t = normalize_text(text)
        qb = quad_bbox(bbox)
        if conf < confidence_threshold:
            skipped += 1
            continue
        if len(t) < min_text_length:
            skipped += 1
            continue
        if is_noise_text(t):
            skipped += 1
            continue
        if filter_sound_effects and is_sound_effect(t):
            skipped += 1
            continue
        if is_title_text(t):
            skipped += 1
            continue
        # Top title band: drop long, low-confidence detections.
        if qb[1] < int(ih * TOP_BAND_RATIO):
            if conf < 0.70 and len(t) >= 5:
                skipped += 1
                continue
        filtered.append((bbox, t, conf))
    print(f"Kept: {len(filtered)} | Skipped: {skipped}")
    if not filtered:
        print("⚠️ No text after filtering.")
        return
    bubbles, bubble_boxes, bubble_quads, bubble_indices = group_tokens(
        filtered, image.shape, gap_px=resolved_gap, bbox_padding=3
    )
    if debug:
        save_debug_clusters(
            image_path=image_path,
            ocr=filtered,
            bubble_boxes=bubble_boxes,
            bubble_indices=bubble_indices,
            out_path="debug_clusters.png"
        )
    translator = GoogleTranslator(source=source_lang, target=target_lang)
    clean_lines = {}
    for bid, lines in bubbles.items():
        txt = normalize_text(" ".join(lines))
        q = ocr_quality_score(txt)
        if q < quality_threshold:
            # Low-quality read: retry OCR on an upscaled crop of the bubble.
            reread = reread_crop(image, bubble_boxes[bid], reader, upscale=2.5, pad=18)
            if reread:
                txt = normalize_text(reread)
        clean_lines[bid] = apply_glossary(txt)
    reading_map = estimate_reading_order(bubble_boxes, mode=reading_mode)
    # BUG FIX: the divider character had been lost, leaving "" * 120 (an
    # empty string). Restore a visible horizontal rule, written as an
    # escape (U+2500 BOX DRAWINGS LIGHT HORIZONTAL) so it cannot vanish.
    divider = "\u2500" * 120
    out_lines = ["BUBBLE|ORDER|ORIGINAL|TRANSLATED|FLAGS", divider]
    print(divider)
    print(f"{'BUBBLE':<8} {'ORDER':<6} {'ORIGINAL':<50} {'TRANSLATED':<50} FLAGS")
    print(divider)
    translated_count = 0
    for bid in sorted(clean_lines.keys(), key=lambda x: reading_map.get(x, x)):
        src = clean_lines[bid].strip()
        if not src:
            continue
        flags = []
        try:
            tgt = translator.translate(src) or ""
        except Exception as e:
            # Best-effort: record the failure in the output row instead of aborting.
            tgt = f"[Translation error: {e}]"
        tgt = apply_glossary(postprocess_translation_general(tgt)).upper()
        src_u = src.upper()
        out_lines.append(f"#{bid}|{reading_map.get(bid,bid)}|{src_u}|{tgt}|{','.join(flags) if flags else '-'}")
        print(f"#{bid:<7} {reading_map.get(bid,bid):<6} {src_u[:50]:<50} {tgt[:50]:<50} {','.join(flags) if flags else '-'}")
        translated_count += 1
    out_lines.append(divider)
    out_lines.append(f"✅ Done! {translated_count} bubble(s) translated, {skipped} detection(s) skipped.")
    with open(export_to_file, "w", encoding="utf-8") as f:
        f.write("\n".join(out_lines))
    export_bubbles(
        export_bubbles_to,
        bbox_dict=bubble_boxes,
        quads_dict=bubble_quads,
        indices_dict=bubble_indices,
        ocr=filtered,
        reading_map=reading_map,
        image_shape=image.shape
    )
    print(divider)
    print(f"Saved: {export_to_file}")
    print(f"Saved: {export_bubbles_to}")
    if debug:
        print("Saved: debug_clusters.png (special chars included in yellow boxes)")
# Script entry point: run the full pipeline on a sample page with the
# defaults spelled out explicitly for easy tweaking.
if __name__ == "__main__":
    translate_manga_text(
        image_path="002-page.png",
        source_lang="en",
        target_lang="ca",
        confidence_threshold=0.12,
        min_text_length=1,
        gap_px="auto",
        filter_sound_effects=True,
        quality_threshold=0.62,
        export_to_file="output.txt",
        export_bubbles_to="bubbles.json",
        reading_mode="ltr",
        debug=True
    )