Files
manga-translator/manga-translator.py
Guillem Hernandez Sola ead32cef24 Ellipses
2026-04-12 18:47:30 +02:00

629 lines
24 KiB
Python
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import re
import os
import json
import cv2
import numpy as np
import easyocr
from deep_translator import GoogleTranslator
# ─────────────────────────────────────────────
# LANGUAGE CODE REFERENCE
# Human-readable language name → short language code.
# NOTE(review): "ch_sim"/"ch_tra" are easyocr's Chinese model codes;
# GoogleTranslator uses different codes for Chinese — confirm before
# passing these values to the translator.
# This table is not referenced elsewhere in the visible file; it
# serves as a reference for choosing source_lang / target_lang.
# ─────────────────────────────────────────────
SUPPORTED_LANGUAGES = {
    "Vietnamese" : "vi",
    "Japanese" : "ja",
    "English" : "en",
    "Spanish" : "es",
    "Korean" : "ko",
    "Chinese (Simplified)" : "ch_sim",
    "Chinese (Traditional)": "ch_tra",
    "French" : "fr",
    "German" : "de",
    "Italian" : "it",
    "Portuguese" : "pt",
    "Arabic" : "ar",
    "Russian" : "ru",
    "Thai" : "th",
    "Catalan" : "ca",
}
# ─────────────────────────────────────────────
# SOUND EFFECT FILTER
# ─────────────────────────────────────────────
SOUND_EFFECT_PATTERNS = [
    r"^b+i+p+$", r"^sha+$", r"^ha+$", r"^ah+$",
    r"^oh+$", r"^ugh+$", r"^gr+$", r"^bam+$",
    r"^pow+$", r"^crash+$", r"^boom+$", r"^bang+$",
    r"^crack+$", r"^whoosh+$", r"^thud+$", r"^snap+$",
    r"^zip+$", r"^swoosh+$", r"^chirp+$", r"^tweet+$",
]


def is_sound_effect(text):
    """Return True when *text* reads like an onomatopoeic sound effect."""
    # Reduce the token to lowercase letters only, then try each pattern.
    letters_only = re.sub(r"[^a-z]", "", text.strip().lower())
    for pattern in SOUND_EFFECT_PATTERNS:
        if re.fullmatch(pattern, letters_only, re.IGNORECASE):
            return True
    return False
# ─────────────────────────────────────────────
# TITLE / LOGO / AUTHOR FILTER
# ─────────────────────────────────────────────
TITLE_PATTERNS = [
    r"^(mission|chapter|episode|vol\.?|volume)\s*\d+$",
    r"^(spy|family|spy.family)$",
    r"^by\s+.+$", # "BY TATSUYA ENDO"
    r"^[a-z]{1,4}\s+[a-z]+\s+[a-z]+$", # short author-style lines
]


def is_title_text(text):
    """Return True when *text* looks like a chapter title, logo, or author credit."""
    normalized = text.strip().lower()
    for pattern in TITLE_PATTERNS:
        if re.fullmatch(pattern, normalized, re.IGNORECASE):
            return True
    return False
# ─────────────────────────────────────────────
# GARBAGE TOKEN FILTER
# Catches OCR misreads that are mostly
# non-alpha or suspiciously short/mangled
# ─────────────────────────────────────────────
GARBAGE_PATTERNS = [
    r"^[^a-zA-Z]*$", # no letters at all
    r"^.{1,2}$", # 1-2 char tokens
    r".*\d+.*", # contains digits (YO4, HLNGRY etc.)
    r"^[A-Z]{1,4}$", # isolated caps abbreviations (IILK)
]


def is_garbage(text):
    """Return True when *text* looks like an OCR misread rather than dialogue."""
    # NOTE(review): callers uppercase tokens before filtering, so the
    # `^[A-Z]{1,4}$` rule also rejects real 3-4 letter words such as
    # "YES" or "WAIT" — confirm this aggressiveness is intended.
    token = text.strip()
    matches = (re.fullmatch(pattern, token) for pattern in GARBAGE_PATTERNS)
    return any(match is not None for match in matches)
# ─────────────────────────────────────────────
# TOKEN CLASSIFIER
# ─────────────────────────────────────────────
def classify_token(text, confidence, confidence_threshold,
                   min_text_length, filter_sound_effects):
    """
    Classify one OCR token.

    Returns one of: "alpha" (contains letters), "punct" (kept but
    letter-free), or "noise" (should be discarded).
    """
    token = text.strip()
    # Any of these conditions marks the token as noise; evaluation
    # order matches the cost of each check (cheap ones first).
    is_noise = (
        confidence < confidence_threshold
        or len(token) < min_text_length
        or re.fullmatch(r"\d+", token) is not None
        or (len(token) == 1 and not token.isalpha())
        or (filter_sound_effects and is_sound_effect(token))
        or is_title_text(token)
        or is_garbage(token)
    )
    if is_noise:
        return "noise"
    has_letters = any(ch.isalpha() for ch in token)
    return "alpha" if has_letters else "punct"
def should_keep_token(text, confidence, confidence_threshold,
                      min_text_length, filter_sound_effects):
    """Return (keep, category) for one OCR token; keep is False for noise."""
    category = classify_token(text, confidence, confidence_threshold,
                              min_text_length, filter_sound_effects)
    keep = category != "noise"
    return keep, category
# ─────────────────────────────────────────────
# QUAD HELPERS
# ─────────────────────────────────────────────
def quad_bbox(quad):
    """Axis-aligned bounds of a 4-point quad as (x1, y1, x2, y2)."""
    xs, ys = zip(*[(pt[0], pt[1]) for pt in quad])
    return min(xs), min(ys), max(xs), max(ys)
def quads_bbox(quads, image_shape, padding_px=10):
    """Padded union bbox of several quads, clamped to the image bounds."""
    img_h, img_w = image_shape[:2]
    points = [pt for quad in quads for pt in quad]
    left = min(p[0] for p in points)
    top = min(p[1] for p in points)
    right = max(p[0] for p in points)
    bottom = max(p[1] for p in points)
    return (max(0, left - padding_px),
            max(0, top - padding_px),
            min(img_w, right + padding_px),
            min(img_h, bottom + padding_px))
def bboxes_overlap_or_touch(a, b, gap_px=0):
    """True when boxes a and b overlap or lie within gap_px on both axes."""
    ax1, ay1, ax2, ay2 = a
    bx1, by1, bx2, by2 = b
    # Separation per axis; negative means overlap, so clamp at zero.
    horizontal_gap = max(0, max(ax1, bx1) - min(ax2, bx2))
    vertical_gap = max(0, max(ay1, by1) - min(ay2, by2))
    return horizontal_gap <= gap_px and vertical_gap <= gap_px
# ─────────────────────────────────────────────
# OVERLAP-BASED GROUPING (Union-Find)
# ─────────────────────────────────────────────
def group_quads_by_overlap(ocr_results, image_shape,
                           gap_px=18, bbox_padding=10,
                           row_band_px=150):
    """
    Cluster OCR tokens into speech bubbles by bbox proximity.

    Two tokens join the same group when their bboxes overlap or sit
    within ``gap_px`` pixels of each other (union-find over all pairs,
    O(n^2) — fine for a page's worth of tokens).

    Parameters:
        ocr_results: list of (quad, text, confidence) tuples.
        image_shape: full image shape, used to clamp group bboxes.
        gap_px: maximum pixel gap for two tokens to be grouped.
        bbox_padding: padding applied to each group's bbox.
        row_band_px: height of the coarse rows used to order groups
            top-to-bottom (was a hard-coded 150; now tunable, same
            default for backward compatibility).

    Returns:
        (bubble_dict, bbox_dict, ocr_quads) — all keyed by 1-based
        group id, ordered by coarse row then left-to-right.
    """
    n = len(ocr_results)
    if n == 0:
        return {}, {}, {}
    token_bboxes = [quad_bbox(r[0]) for r in ocr_results]

    # --- Union-find with path halving ---
    parent = list(range(n))

    def find(x):
        while parent[x] != x:
            parent[x] = parent[parent[x]]  # path halving
            x = parent[x]
        return x

    def union(x, y):
        parent[find(x)] = find(y)

    for i in range(n):
        for j in range(i + 1, n):
            if bboxes_overlap_or_touch(
                    token_bboxes[i], token_bboxes[j],
                    gap_px=gap_px):
                union(i, j)

    groups = {}
    for i in range(n):
        groups.setdefault(find(i), []).append(i)

    # Order groups by coarse row (row_band_px-tall bands), then by x,
    # so bubble ids follow natural reading order.
    def group_sort_key(indices):
        ys = [token_bboxes[i][1] for i in indices]
        xs = [token_bboxes[i][0] for i in indices]
        return (min(ys) // row_band_px, min(xs))

    sorted_groups = sorted(groups.values(), key=group_sort_key)

    bubble_dict = {}
    bbox_dict = {}
    ocr_quads = {}
    for gid, indices in enumerate(sorted_groups, start=1):
        indices_sorted = sorted(
            indices, key=lambda i: token_bboxes[i][1])
        quads = [ocr_results[i][0] for i in indices_sorted]
        raw_texts = [ocr_results[i][1] for i in indices_sorted]

        # Split tokens into text lines vs. bare punctuation.
        alpha_lines = []   # (y-center, text) for tokens with letters
        punct_tokens = []  # (y-center, text) for letter-free tokens
        for i in indices_sorted:
            _, text, _ = ocr_results[i]
            yc = (token_bboxes[i][1] + token_bboxes[i][3]) / 2.0
            if any(ch.isalpha() for ch in text):
                alpha_lines.append((yc, text))
            else:
                punct_tokens.append((yc, text))

        # Append each punctuation token to its vertically closest line
        # (e.g. a detached "!!" rejoins the sentence it belongs to).
        for pcy, ptext in punct_tokens:
            if alpha_lines:
                closest = min(
                    range(len(alpha_lines)),
                    key=lambda k: abs(alpha_lines[k][0] - pcy)
                )
                yc_a, text_a = alpha_lines[closest]
                alpha_lines[closest] = (yc_a, text_a + ptext)

        # Fall back to the raw tokens when a group is punctuation-only.
        text_lines = [t for _, t in alpha_lines] or raw_texts
        bubble_dict[gid] = text_lines
        ocr_quads[gid] = quads
        bbox_dict[gid] = quads_bbox(quads, image_shape,
                                    padding_px=bbox_padding)
        b = bbox_dict[gid]
        print(f" Group #{gid}: {len(quads)} quad(s) "
              f"bbox=({int(b[0])},{int(b[1])})→"
              f"({int(b[2])},{int(b[3])}) "
              f"w={int(b[2]-b[0])} h={int(b[3]-b[1])} "
              f"text={text_lines}")
    return bubble_dict, bbox_dict, ocr_quads
# ─────────────────────────────────────────────
# HYPHEN REMOVAL
# ─────────────────────────────────────────────
def fix_hyphens(lines):
    """Join OCR lines into one sentence, merging hyphen-broken words; UPPERCASE the result."""
    if not lines:
        return ""
    sentence = lines[0]
    for raw_line in lines[1:]:
        piece = raw_line.strip()
        if sentence.endswith("-"):
            # A trailing hyphen means the word continues on this line.
            sentence = sentence[:-1] + piece
        else:
            sentence = f"{sentence} {piece}"
    collapsed = re.sub(r" {2,}", " ", sentence)
    return collapsed.strip().upper()
# ─────────────────────────────────────────────
# CROP-BASED OCR RE-READ
# ─────────────────────────────────────────────
def reread_cluster_crop(image, bbox, reader,
                        padding_px=20, upscale_factor=2.5):
    """
    Re-run OCR on an upscaled, sharpened crop of *image* around *bbox*.

    Parameters:
        image: full page image (numpy array, as loaded by cv2).
        bbox: (x1, y1, x2, y2) region to re-read.
        reader: easyocr.Reader instance.
        padding_px: extra pixels added around the bbox before cropping.
        upscale_factor: crop enlargement factor before OCR.

    Returns the merged bubble text (via fix_hyphens) or None when the
    crop is empty or OCR finds nothing.
    """
    img_h, img_w = image.shape[:2]
    x1, y1, x2, y2 = bbox
    x1 = max(0, int(x1) - padding_px)
    y1 = max(0, int(y1) - padding_px)
    x2 = min(img_w, int(x2) + padding_px)
    y2 = min(img_h, int(y2) + padding_px)
    crop = image[y1:y2, x1:x2]
    if crop.size == 0:
        return None
    new_w = int(crop.shape[1] * upscale_factor)
    new_h = int(crop.shape[0] * upscale_factor)
    upscaled = cv2.resize(crop, (new_w, new_h),
                          interpolation=cv2.INTER_CUBIC)
    # Sharpening kernel to crisp glyph edges before OCR.
    kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])
    sharpened = cv2.filter2D(upscaled, -1, kernel)
    # BUGFIX/cleanup: easyocr's readtext accepts numpy arrays directly,
    # so the previous fixed-name temp file ("_temp_crop_ocr.png") is
    # unnecessary — it cost a disk round-trip and raced with any
    # concurrent run using the same working directory.
    crop_results = reader.readtext(sharpened, paragraph=False)
    if not crop_results:
        return None
    crop_results.sort(key=lambda r: r[0][0][1])  # top-to-bottom by y
    lines = [t.strip().upper() for _, t, _ in crop_results
             if t.strip()]
    return fix_hyphens(lines) if lines else None
# ─────────────────────────────────────────────
# AUTO GAP
# ─────────────────────────────────────────────
def compute_auto_gap(image_path, base_gap=18,
                     reference_width=750):
    """Scale base_gap with the image width relative to reference_width."""
    img = cv2.imread(image_path)
    if img is None:
        # Unreadable image: fall back to the unscaled default.
        return base_gap
    width = img.shape[1]
    auto_gap = base_gap * (width / reference_width)
    print(f" Image width: {width}px → auto gap: {auto_gap:.1f}px")
    return auto_gap
# ─────────────────────────────────────────────
# OCR QUALITY SCORE
# ─────────────────────────────────────────────
def ocr_quality_score(text):
    """Heuristic 0..1 score: letter density minus 0.2 per garbage pattern found."""
    if not text or len(text) < 2:
        return 0.0
    letter_count = sum(c.isalpha() for c in text)
    alpha_ratio = letter_count / len(text)
    garbage = [r",,", r"\.\.-", r"[^\w\s\'\!\?\.,-]{2,}"]
    penalty = 0.0
    for pattern in garbage:
        if re.search(pattern, text):
            penalty += 0.2
    score = alpha_ratio - penalty
    return max(0.0, min(1.0, score))
# ─────────────────────────────────────────────
# BUBBLE JSON EXPORT
# bbox_expand_ratio: grow bbox by this fraction
# of its own size in each direction to better
# approximate the full speech bubble boundary.
# ─────────────────────────────────────────────
def export_bubble_boxes(bbox_dict, ocr_quads_dict,
                        filepath="bubbles.json",
                        bbox_expand_ratio=0.35,
                        image_shape=None):
    """
    Write per-bubble bounding boxes to a JSON file.

    Parameters:
        bbox_dict: {bubble_id: (x1, y1, x2, y2)} tight bboxes.
        ocr_quads_dict: {bubble_id: [quad, ...]} raw OCR quads.
        filepath: output JSON path.
        bbox_expand_ratio: expansion fraction per side (see header).
        image_shape: when given, expanded boxes are clamped to it.

    Each JSON entry holds the expanded box (x/y/w/h), the original
    tight box, and the per-token quads for reference.
    """
    export = {}
    for bubble_id, (x1, y1, x2, y2) in bbox_dict.items():
        quads = ocr_quads_dict.get(bubble_id, [])
        # ── Expand bbox to approximate full bubble ────────────────
        w_orig = x2 - x1
        h_orig = y2 - y1
        pad_x = int(w_orig * bbox_expand_ratio)
        pad_y = int(h_orig * bbox_expand_ratio)
        # Clamp to image bounds if image_shape provided
        if image_shape is not None:
            img_h, img_w = image_shape[:2]
            ex1 = max(0, x1 - pad_x)
            ey1 = max(0, y1 - pad_y)
            ex2 = min(img_w, x2 + pad_x)
            ey2 = min(img_h, y2 + pad_y)
        else:
            ex1 = x1 - pad_x
            ey1 = y1 - pad_y
            ex2 = x2 + pad_x
            ey2 = y2 + pad_y
        # Compute each quad's tight bbox once (the original version
        # recomputed it four times per quad via quad_bbox()).
        quad_bboxes = []
        for q in quads:
            qxs = [pt[0] for pt in q]
            qys = [pt[1] for pt in q]
            qx1, qy1 = min(qxs), min(qys)
            qx2, qy2 = max(qxs), max(qys)
            quad_bboxes.append({
                "x": int(qx1),
                "y": int(qy1),
                "w": int(qx2 - qx1),
                "h": int(qy2 - qy1),
            })
        export[str(bubble_id)] = {
            "x" : int(ex1),
            "y" : int(ey1),
            "w" : int(ex2 - ex1),
            "h" : int(ey2 - ey1),
            # Original tight bbox kept for reference
            "x_tight" : int(x1),
            "y_tight" : int(y1),
            "w_tight" : int(w_orig),
            "h_tight" : int(h_orig),
            "quad_bboxes" : quad_bboxes,
            "quads": [[[int(pt[0]), int(pt[1])] for pt in quad]
                      for quad in quads],
        }
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(export, f, indent=2, ensure_ascii=False)
    print(f"\n📦 Bubble boxes saved → {filepath}")
    for bid, v in export.items():
        print(f" #{bid}: expanded=({v['x']},{v['y']}) "
              f"{v['w']}×{v['h']}px "
              f"tight={v['w_tight']}×{v['h_tight']}px "
              f"[{len(v['quads'])} quad(s)]")
# ─────────────────────────────────────────────
# OUTPUT.TXT WRITER
# Uses a pipe | as unambiguous delimiter
# Format: #ID|ORIGINAL|TRANSLATED
# ─────────────────────────────────────────────
def write_output(output_lines, filepath):
    """Write the pipe-delimited translation lines to *filepath* (UTF-8)."""
    joined = "\n".join(output_lines)
    with open(filepath, "w", encoding="utf-8") as handle:
        handle.write(joined)
    print(f"📄 Translations saved → {filepath}")
# ─────────────────────────────────────────────
# DEBUG IMAGE
# ─────────────────────────────────────────────
def save_debug_clusters(image_path, ocr_results,
                        bubble_dict, bbox_dict):
    """Render token quads and bubble bboxes onto the page → debug_clusters.png."""
    canvas = cv2.imread(image_path)
    if canvas is None:
        return
    # Fixed seed so the palette is stable across runs.
    np.random.seed(42)
    num_bubbles = max(bubble_dict.keys(), default=1)
    palette = [
        tuple(int(channel) for channel in row)
        for row in np.random.randint(50, 230, size=(num_bubbles + 2, 3))
    ]
    # Map each OCR line back to its bubble id (0 when unmatched).
    text_to_bubble = {
        line: bubble_id
        for bubble_id, lines in bubble_dict.items()
        for line in lines
    }
    for bbox, text, _ in ocr_results:
        bubble_id = text_to_bubble.get(text, 0)
        color = palette[(bubble_id - 1) % len(palette)]
        pts = np.array(bbox, dtype=np.int32)
        cv2.polylines(canvas, [pts], isClosed=True,
                      color=color, thickness=1)
    for bubble_id, (x1, y1, x2, y2) in bbox_dict.items():
        color = palette[(bubble_id - 1) % len(palette)]
        cv2.rectangle(canvas,
                      (int(x1), int(y1)),
                      (int(x2), int(y2)),
                      color, 2)
        cv2.putText(canvas, f"BOX#{bubble_id}",
                    (int(x1) + 2, int(y1) + 16),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.5, color, 2)
    cv2.imwrite("debug_clusters.png", canvas)
    print(" 🐛 debug_clusters.png saved")
# ─────────────────────────────────────────────
# CORE FUNCTION
# ─────────────────────────────────────────────
def translate_manga_text(
    image_path,
    source_lang="en",
    target_lang="ca",
    confidence_threshold=0.10,
    export_to_file=None,
    export_bubbles_to="bubbles.json",
    min_text_length=2,
    gap_px="auto",
    filter_sound_effects=True,
    quality_threshold=0.5,
    upscale_factor=2.5,
    bbox_padding=10,
    debug=False,
):
    """
    OCR a manga page, group detections into speech bubbles, translate
    each bubble, and optionally export the results.

    Parameters:
        image_path: page image to process.
        source_lang / target_lang: language codes for OCR/translation.
        confidence_threshold: detections below this confidence dropped.
        export_to_file: when set, write pipe-delimited translations here.
        export_bubbles_to: when set, write bubble bboxes as JSON here.
        min_text_length: shorter tokens are treated as noise.
        gap_px: grouping gap in pixels, or "auto" to scale with width.
        filter_sound_effects: drop onomatopoeia tokens when True.
        quality_threshold: bubbles scoring below this get a crop re-read.
        upscale_factor: crop upscale used by the re-read pass.
        bbox_padding: padding added around each bubble's bbox.
        debug: when True, also save debug_clusters.png.
    """
    # ── 1. Resolve gap ────────────────────────────────────────────
    if gap_px == "auto":
        resolved_gap = compute_auto_gap(image_path)
    else:
        resolved_gap = float(gap_px)
    # ── 2. Load full image ────────────────────────────────────────
    full_image = cv2.imread(image_path)
    if full_image is None:
        print(f"❌ Could not load image: {image_path}")
        return
    # ── 3. Initialize OCR ─────────────────────────────────────────
    print("\nLoading OCR model...")
    # easyocr has no Catalan model — fall back to English + Spanish.
    ocr_lang_list = ["en", "es"] if source_lang == "ca" \
        else [source_lang]
    reader = easyocr.Reader(ocr_lang_list)
    # ── 4. Initialize translator ──────────────────────────────────
    translator = GoogleTranslator(source=source_lang,
                                  target=target_lang)
    # ── 5. Run OCR ────────────────────────────────────────────────
    print(f"\nRunning OCR on: {image_path}")
    results = reader.readtext(image_path, paragraph=False)
    print(f" Raw detections: {len(results)}")
    # ── 6. Filter tokens ──────────────────────────────────────────
    filtered = []
    skipped = 0
    for bbox, text, confidence in results:
        cleaned = text.strip().upper()
        keep, category = should_keep_token(
            cleaned, confidence,
            confidence_threshold, min_text_length,
            filter_sound_effects
        )
        if keep:
            filtered.append((bbox, cleaned, confidence))
            if category == "punct":
                print(f" ✔ Punct kept: '{cleaned}'")
        else:
            # Label the first filter that rejects the token.
            tag = ("🔇 SFX" if is_sound_effect(cleaned) else
                   "🏷 Title" if is_title_text(cleaned) else
                   "🗑 Garbage" if is_garbage(cleaned) else
                   "✂️ Low-conf")
            print(f" {tag} skipped: '{cleaned}'")
            skipped += 1
    print(f"{len(filtered)} kept, {skipped} skipped.\n")
    if not filtered:
        print("⚠️ No text detected after filtering.")
        return
    # ── 7. Group by overlap ───────────────────────────────────────
    print(f"Grouping by overlap "
          f"(gap_px={resolved_gap:.1f}, "
          f"bbox_padding={bbox_padding}px)...")
    bubble_dict, bbox_dict, ocr_quads = group_quads_by_overlap(
        filtered,
        image_shape = full_image.shape,
        gap_px = resolved_gap,
        bbox_padding = bbox_padding,
    )
    print(f"{len(bubble_dict)} bubble(s) detected.\n")
    # ── 8. Debug ──────────────────────────────────────────────────
    if debug:
        save_debug_clusters(image_path, filtered,
                            bubble_dict, bbox_dict)
    # ── 9. Fix hyphens ────────────────────────────────────────────
    clean_bubbles = {
        i: fix_hyphens(lines)
        for i, lines in bubble_dict.items()
        if lines
    }
    # ── 10. Quality check + crop re-read ──────────────────────────
    print("Checking OCR quality per bubble...")
    for i, text in clean_bubbles.items():
        score = ocr_quality_score(text)
        status = "" if score >= quality_threshold else "🔁"
        print(f" #{i}: score={score:.2f} {status} "
              f"'{text[:55]}'")
        if score < quality_threshold:
            print(f" → Re-reading #{i} from crop...")
            reread = reread_cluster_crop(
                full_image, bbox_dict[i], reader,
                upscale_factor=upscale_factor,
            )
            if reread:
                print(f"'{reread}'")
                clean_bubbles[i] = reread
            else:
                print(" → Nothing found, keeping original.")
    # ── 11. Translate ─────────────────────────────────────────────
    # Output format (pipe-delimited, unambiguous):
    # #ID|ORIGINAL TEXT|TRANSLATED TEXT
    print()
    header = "BUBBLE|ORIGINAL|TRANSLATED"
    # BUGFIX: this was `"" * 80` (always the empty string) — the
    # divider character had been stripped as an invisible/unicode
    # artifact. Restored to an 80-char rule matching the file's
    # box-drawing style.
    divider = "─" * 80
    output_lines = [header, divider]
    translations = {}
    translated_count = 0
    print(f"{'BUBBLE':<8} {'ORIGINAL':<45} {'TRANSLATED'}")
    print(divider)
    for i in sorted(clean_bubbles.keys()):
        bubble_text = clean_bubbles[i].strip()
        if not bubble_text:
            continue
        try:
            result = translator.translate(bubble_text)
        except Exception as e:
            # Best-effort: record the failure instead of aborting the page.
            result = f"[Translation error: {e}]"
        if result is None:
            result = "[No translation returned]"
        result = result.upper()
        translations[i] = result
        translated_count += 1
        # Pipe-delimited line — safe regardless of text content
        output_lines.append(f"#{i}|{bubble_text}|{result}")
        print(f"#{i:<7} {bubble_text:<45} {result}")
    output_lines.append(divider)
    summary = (f"✅ Done! {translated_count} bubble(s) "
               f"translated, {skipped} detection(s) skipped.")
    output_lines.append(summary)
    print(divider)
    print(summary)
    # ── 12. Export translations ───────────────────────────────────
    if export_to_file:
        write_output(output_lines, export_to_file)
    # ── 13. Export bubble boxes ───────────────────────────────────
    if export_bubbles_to:
        export_bubble_boxes(
            bbox_dict,
            ocr_quads,
            filepath = export_bubbles_to,
            bbox_expand_ratio = 0.1, # ← tune this
            image_shape = full_image.shape,
        )
# ─────────────────────────────────────────────
# ENTRY POINT
# ─────────────────────────────────────────────
if __name__ == "__main__":
    # Translate one hard-coded page from English to Catalan.
    translate_manga_text(
        image_path = "002-page.jpg",
        source_lang = "en",
        target_lang = "ca",
        confidence_threshold = 0.10,
        min_text_length = 2,
        export_to_file = "output.txt",     # pipe-delimited translations
        export_bubbles_to = "bubbles.json",  # bubble bbox JSON
        gap_px = "auto",                   # scale grouping gap with image width
        filter_sound_effects = True,
        quality_threshold = 0.5,
        upscale_factor = 2.5,
        bbox_padding = 1,                  # tighter than the function default (10)
        debug = True,                      # also writes debug_clusters.png
    )