Files
manga-translator/manga-translator.py
Guillem Hernandez Sola f92ea8410b Bona primera versió
2026-04-10 18:05:17 +02:00

559 lines
21 KiB
Python
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import re
import os
import cv2
import numpy as np
import easyocr
from deep_translator import GoogleTranslator
from sklearn.cluster import DBSCAN
# ─────────────────────────────────────────────
# LANGUAGE CODE REFERENCE
# ─────────────────────────────────────────────
# Human-readable language name → language code.  Codes follow the
# conventions used by EasyOCR (note "ch_sim"/"ch_tra" for Chinese),
# and mostly coincide with the ISO-639-1 codes the translator expects.
# Consulted by list_languages() as a quick reference table.
SUPPORTED_LANGUAGES = {
    "Vietnamese" : "vi",
    "Japanese" : "ja",
    "English" : "en",
    "Spanish" : "es",
    "Korean" : "ko",
    "Chinese (Simplified)" : "ch_sim",
    "Chinese (Traditional)": "ch_tra",
    "French" : "fr",
    "German" : "de",
    "Italian" : "it",
    "Portuguese" : "pt",
    "Arabic" : "ar",
    "Russian" : "ru",
    "Thai" : "th",
    "Catalan" : "ca",
}
# ─────────────────────────────────────────────
# SOUND EFFECT FILTER
# ─────────────────────────────────────────────
# Regexes matched against a lowercased, letters-only rendering of a
# detection.  Anything that matches is onomatopoeia/SFX lettering and
# should not be sent to the translator.
SOUND_EFFECT_PATTERNS = [
    r"^b+i+p+$",
    r"^sha+$",
    r"^ha+$",
    r"^ah+$",
    r"^oh+$",
    r"^ugh+$",
    r"^gr+$",
    r"^bam+$",
    r"^pow+$",
    r"^crash+$",
    r"^boom+$",
    r"^bang+$",
    r"^crack+$",
    r"^whoosh+$",
    r"^thud+$",
    r"^snap+$",
    r"^zip+$",
    r"^swoosh+$",
]


def is_sound_effect(text):
    """Return True when *text* reduces to a known sound-effect pattern."""
    # Keep only the letters, lowercased: "BOOM!!" → "boom".
    letters = re.sub(r"[^a-z]", "", text.strip().lower())
    for pattern in SOUND_EFFECT_PATTERNS:
        if re.fullmatch(pattern, letters, re.IGNORECASE):
            return True
    return False
# ─────────────────────────────────────────────
# BOUNDING BOX HELPERS
# ─────────────────────────────────────────────
def get_cluster_bbox(items):
    """
    Tight (x1, y1, x2, y2) box around all (cy, cx, text) center points
    of a cluster, grown by a fixed 30px half-size per text block
    (approximates the extent of each detection around its center).
    """
    half = 30
    ys = [cy for cy, _, _ in items]
    xs = [cx for _, cx, _ in items]
    return (min(xs) - half, min(ys) - half, max(xs) + half, max(ys) + half)
def boxes_are_close(bbox_a, bbox_b, proximity_px=80):
    """
    True when two (x1, y1, x2, y2) boxes overlap or lie within
    proximity_px pixels of each other on both axes.
    """
    ax1, ay1, ax2, ay2 = bbox_a
    bx1, by1, bx2, by2 = bbox_b
    # Signed gap on each axis; negative means the boxes overlap there.
    gap_x = max(bx1 - ax2, ax1 - bx2)
    gap_y = max(by1 - ay2, ay1 - by2)
    return gap_x <= proximity_px and gap_y <= proximity_px
# ─────────────────────────────────────────────
# POST-CLUSTER MERGE (Union-Find)
# ─────────────────────────────────────────────
def merge_nearby_clusters(raw_clusters, proximity_px=80):
    """
    Union clusters whose bounding boxes lie within proximity_px of each
    other.  Repairs speech bubbles that DBSCAN split, without having to
    raise eps globally (which would over-merge elsewhere).
    """
    labels = list(raw_clusters)
    bboxes = {lbl: get_cluster_bbox(raw_clusters[lbl]) for lbl in labels}
    parent = {lbl: lbl for lbl in labels}

    def find_root(node):
        # Union-find lookup with path halving.
        while parent[node] != node:
            parent[node] = parent[parent[node]]
            node = parent[node]
        return node

    # Union every pair of clusters whose boxes are close enough.
    for i, a in enumerate(labels):
        for b in labels[i + 1:]:
            if boxes_are_close(bboxes[a], bboxes[b], proximity_px):
                parent[find_root(a)] = find_root(b)

    # Gather members under each surviving root.
    merged = {}
    for lbl in labels:
        merged.setdefault(find_root(lbl), []).extend(raw_clusters[lbl])
    return merged
# ─────────────────────────────────────────────
# CROP-BASED OCR RE-READ
# For each cluster bounding box, crop the
# original image with padding and re-run OCR
# at higher quality. This fixes garbled text
# in small or low-contrast bubbles.
# ─────────────────────────────────────────────
def reread_cluster_crop(
    image,
    bbox,
    reader,
    source_lang,
    padding_px=20,
    upscale_factor=2.5,
):
    """
    Crops a cluster region from the full image, upscales it,
    and re-runs OCR for higher accuracy on small text.

    Args:
        image         : Full-page image as numpy array (BGR)
        bbox          : (x1, y1, x2, y2) cluster bounding box
        reader        : Initialized EasyOCR Reader
        source_lang   : Language code string (unused here; kept for
                        interface compatibility with callers)
        padding_px    : Pixels of padding around the crop (default: 20)
        upscale_factor: How much to enlarge the crop before OCR (default: 2.5)

    Returns:
        Single cleaned string with all OCR lines merged top-to-bottom,
        or None if OCR found nothing.
    """
    img_h, img_w = image.shape[:2]
    x1, y1, x2, y2 = bbox

    # Add padding, clamped to the image bounds.
    x1 = max(0, int(x1) - padding_px)
    y1 = max(0, int(y1) - padding_px)
    x2 = min(img_w, int(x2) + padding_px)
    y2 = min(img_h, int(y2) + padding_px)
    crop = image[y1:y2, x1:x2]
    if crop.size == 0:
        return None

    # Upscale for better OCR on small text.
    new_w = int(crop.shape[1] * upscale_factor)
    new_h = int(crop.shape[0] * upscale_factor)
    upscaled = cv2.resize(crop, (new_w, new_h), interpolation=cv2.INTER_CUBIC)

    # Light sharpening to improve OCR on manga fonts.
    kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])
    sharpened = cv2.filter2D(upscaled, -1, kernel)

    # FIX: EasyOCR's readtext accepts numpy arrays directly, so there is
    # no need for the previous temp-file round-trip — which used a fixed
    # filename ("_temp_crop_ocr.png"), risking collisions between
    # concurrent runs, plus needless disk I/O.
    crop_results = reader.readtext(sharpened, paragraph=False)
    if not crop_results:
        return None

    # Sort detections top-to-bottom (by top-left Y of each quad) and join.
    crop_results.sort(key=lambda det: det[0][0][1])
    lines = [text.strip() for _, text, _ in crop_results if text.strip()]
    return fix_hyphens(lines) if lines else None
# ─────────────────────────────────────────────
# DBSCAN BUBBLE CLUSTERING
# ─────────────────────────────────────────────
def cluster_into_bubbles(ocr_results, eps=80, min_samples=1, proximity_px=80):
    """
    Two-pass clustering:
        Pass 1 — DBSCAN on center points
        Pass 2 — Bounding-box proximity merge
    Returns:
        bubble_dict : cluster_id → list of (cy, cx, text)
        bbox_dict   : cluster_id → (x1, y1, x2, y2)
    """
    if not ocr_results:
        return {}, {}

    # Center of each detection quad (mean of its four corners), as [x, y].
    centers = []
    for quad, _, _ in ocr_results:
        cx = sum(pt[0] for pt in quad) / 4
        cy = sum(pt[1] for pt in quad) / 4
        centers.append([cx, cy])

    model = DBSCAN(eps=eps, min_samples=min_samples, metric="euclidean")
    labels = model.fit_predict(np.array(centers, dtype=np.float32))

    # Group detections by label; DBSCAN noise (-1) becomes a fresh
    # singleton cluster so nothing is dropped.
    raw_clusters = {}
    noise_counter = int(max(labels, default=0)) + 1
    for idx, label in enumerate(labels):
        if label == -1:
            label = noise_counter
            noise_counter += 1
        text = ocr_results[idx][1]
        raw_clusters.setdefault(label, []).append(
            (centers[idx][1], centers[idx][0], text)
        )
    print(f" DBSCAN pass: {len(raw_clusters)} cluster(s)")

    merged_clusters = merge_nearby_clusters(raw_clusters, proximity_px=proximity_px)
    print(f" After merge: {len(merged_clusters)} cluster(s)")

    # Reading order: top→bottom in 150px row bands, then left→right.
    row_band_px = 150

    def reading_order(items):
        top = min(cy for cy, _, _ in items)
        left = min(cx for _, cx, _ in items)
        return (top // row_band_px, left)

    bubble_dict = {}
    bbox_dict = {}
    ordered = sorted(merged_clusters.values(), key=reading_order)
    for bubble_id, items in enumerate(ordered, start=1):
        by_y = sorted(items, key=lambda t: t[0])
        bubble_dict[bubble_id] = [text for _, _, text in by_y]
        bbox_dict[bubble_id] = get_cluster_bbox(items)
    return bubble_dict, bbox_dict
# ─────────────────────────────────────────────
# HYPHEN REMOVAL
# ─────────────────────────────────────────────
def fix_hyphens(lines):
    """
    Merge OCR'd lines into one string, joining hyphenated line breaks.

    A line ending in "-" is glued to the next line with the hyphen
    removed ("hel-" + "lo" → "hello"); otherwise lines are joined by a
    single space.  Runs of spaces are collapsed at the end.

    Args:
        lines: List of strings (one per OCR text line), top to bottom.

    Returns:
        Single merged string; "" for an empty list.
    """
    if not lines:
        return ""
    # FIX: strip the first line too — previously only lines[1:] were
    # stripped, so trailing whitespace on a line could hide its line-break
    # hyphen and prevent the join.
    merged = lines[0].strip()
    for line in lines[1:]:
        line = line.strip()
        merged = merged[:-1] + line if merged.endswith("-") else merged + " " + line
    return re.sub(r" {2,}", " ", merged).strip()
# ─────────────────────────────────────────────
# AUTO EPS
# ─────────────────────────────────────────────
def compute_auto_eps(image_path, base_eps=80, reference_width=750):
    """
    Scale the DBSCAN eps linearly with page width, so clustering behaves
    the same on pages of different resolutions.  Falls back to base_eps
    when the image cannot be read.
    """
    page = cv2.imread(image_path)
    if page is None:
        return base_eps
    width = page.shape[1]
    auto_eps = base_eps * (width / reference_width)
    print(f" Image width: {width}px → auto eps: {auto_eps:.1f}px")
    return auto_eps
# ─────────────────────────────────────────────
# OCR QUALITY SCORE
# Heuristic to detect garbled OCR output.
# Low score = likely garbage, trigger re-read.
# ─────────────────────────────────────────────
def ocr_quality_score(text):
    """
    Heuristic quality score in [0.0, 1.0] for an OCR result.

    The score is the ratio of alphabetic characters, minus 0.2 for each
    suspicious garbage pattern found (",," / "..-" / runs of unusual
    symbols), clamped to [0, 1].  Empty or single-character text scores
    0.0.  Callers treat low scores as "garbled — re-read from a crop".
    """
    if not text or len(text) < 2:
        return 0.0

    alpha_ratio = sum(c.isalpha() for c in text) / len(text)

    # Each matched garbage pattern knocks 0.2 off the score.
    garbage_patterns = (r",,", r"\.\.-", r"[^\w\s\'\!\?\.,-]{2,}")
    penalty = 0.2 * sum(1 for p in garbage_patterns if re.search(p, text))

    return max(0.0, min(1.0, alpha_ratio - penalty))
# ─────────────────────────────────────────────
# DEBUG CLUSTER IMAGE
# ─────────────────────────────────────────────
def save_debug_clusters(image_path, ocr_results, bubble_dict):
    """
    Draw each OCR detection outlined in its bubble's colour, tagged with
    the bubble id, and save the result to debug_clusters.png.
    Silently does nothing when the image cannot be read.
    """
    canvas = cv2.imread(image_path)
    if canvas is None:
        return

    # Deterministic palette (seeded) so colours are stable across runs.
    np.random.seed(42)
    num_bubbles = max(bubble_dict.keys(), default=1)
    palette = [
        tuple(int(channel) for channel in rgb)
        for rgb in np.random.randint(50, 230, size=(num_bubbles + 2, 3))
    ]

    # Reverse lookup: detection text → bubble id (unmatched text → 0).
    text_to_bubble = {
        line: bubble_id
        for bubble_id, lines in bubble_dict.items()
        for line in lines
    }

    for quad, text, _ in ocr_results:
        bubble_id = text_to_bubble.get(text, 0)
        color = palette[(bubble_id - 1) % len(palette)]
        pts = np.array(quad, dtype=np.int32)
        cv2.polylines(canvas, [pts], isClosed=True, color=color, thickness=2)
        # Label just above the quad's first corner, clamped inside the image.
        label_x = int(pts[0][0])
        label_y = max(int(pts[0][1]) - 5, 12)
        cv2.putText(canvas, f"#{bubble_id}", (label_x, label_y),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

    cv2.imwrite("debug_clusters.png", canvas)
    print(" 🐛 Cluster debug saved → debug_clusters.png")
# ─────────────────────────────────────────────
# CORE FUNCTION
# ─────────────────────────────────────────────
def translate_manga_text(
    image_path,
    source_lang="it",
    target_lang="ca",
    confidence_threshold=0.15,
    export_to_file=None,
    min_text_length=2,
    cluster_eps="auto",
    proximity_px=80,
    filter_sound_effects=True,
    quality_threshold=0.5,  # below this → trigger crop re-read
    upscale_factor=2.5,     # crop upscale multiplier for re-read
    debug=False,
):
    """
    Full pipeline:
        OCR → filter → DBSCAN cluster → proximity merge
        → quality check → crop re-read if needed
        → fix hyphens → translate
    Args:
        image_path           : Path to your image file
        source_lang          : Source language code (default: 'it')
        target_lang          : Target language code (default: 'ca')
        confidence_threshold : Min OCR confidence (default: 0.15)
        export_to_file       : Save output to .txt (default: None)
        min_text_length      : Min characters per detection (default: 2)
        cluster_eps          : DBSCAN eps or 'auto' (default: 'auto')
        proximity_px         : Post-merge proximity px (default: 80)
        filter_sound_effects : Skip onomatopoeia/SFX (default: True)
        quality_threshold    : Min quality score 0-1 before re-read (default: 0.5)
        upscale_factor       : Crop upscale for re-read (default: 2.5)
        debug                : Save debug_clusters.png (default: False)
    """
    def _lang_name(code):
        # Reverse-lookup a display name for a language code; fall back
        # to the raw code when it is not in SUPPORTED_LANGUAGES.
        for name, lang_code in SUPPORTED_LANGUAGES.items():
            if lang_code == code:
                return name
        return code

    # ── 1. Resolve eps ────────────────────────────────────────────────────────
    if cluster_eps == "auto":
        print("Computing auto eps...")
        eps = compute_auto_eps(image_path)
    else:
        eps = float(cluster_eps)

    # ── 2. Load full image (needed for crop re-reads) ─────────────────────────
    full_image = cv2.imread(image_path)
    if full_image is None:
        print(f"❌ Could not load image: {image_path}")
        return

    # ── 3. Initialize OCR ─────────────────────────────────────────────────────
    print("\nLoading OCR model...")
    # NOTE(review): Catalan seems to be approximated with English+Spanish
    # OCR models — presumably because EasyOCR lacks a "ca" model; confirm.
    ocr_lang_list = ["en", "es"] if source_lang == "ca" else [source_lang]
    reader = easyocr.Reader(ocr_lang_list)

    # ── 4. Initialize translator ──────────────────────────────────────────────
    translator = GoogleTranslator(source=source_lang, target=target_lang)

    # ── 5. Run OCR on full image ──────────────────────────────────────────────
    print(f"\nRunning OCR on: {image_path}")
    results = reader.readtext(image_path, paragraph=False)
    print(f" Raw detections: {len(results)}")

    # ── 6. Filter detections ──────────────────────────────────────────────────
    filtered = []
    skipped = 0
    for bbox, text, confidence in results:
        cleaned = text.strip()
        if confidence < confidence_threshold:
            skipped += 1
            continue
        if len(cleaned) < min_text_length:
            skipped += 1
            continue
        # Pure digits/punctuation (page numbers, stray marks): no dialogue.
        if re.fullmatch(r"[\d\W]+", cleaned):
            skipped += 1
            continue
        if filter_sound_effects and is_sound_effect(cleaned):
            print(f" 🔇 SFX skipped: '{cleaned}'")
            skipped += 1
            continue
        filtered.append((bbox, cleaned, confidence))
    print(f"{len(filtered)} detection(s) kept, {skipped} skipped.\n")
    if not filtered:
        print("⚠️ No text detected after filtering.")
        return

    # ── 7. Cluster + merge ────────────────────────────────────────────────────
    print(f"Clustering detections (eps={eps:.1f}px, proximity={proximity_px}px)...")
    bubble_dict, bbox_dict = cluster_into_bubbles(
        filtered, eps=eps, proximity_px=proximity_px
    )
    print(f"{len(bubble_dict)} bubble(s) after merge.\n")

    # ── 8. Debug image ────────────────────────────────────────────────────────
    if debug:
        save_debug_clusters(image_path, filtered, bubble_dict)

    # ── 9. Fix hyphens → first-pass text ─────────────────────────────────────
    clean_bubbles = {
        i: fix_hyphens(lines)
        for i, lines in bubble_dict.items()
        if lines
    }

    # ── 10. Quality check → crop re-read for low-quality bubbles ─────────────
    print("Checking OCR quality per bubble...")
    for i, text in clean_bubbles.items():
        score = ocr_quality_score(text)
        # FIX: the good-bubble marker was an empty string (character lost
        # in the file); restore a visible checkmark alongside the 🔁 flag.
        status = "✅" if score >= quality_threshold else "🔁"
        print(f" Bubble #{i}: score={score:.2f} {status} '{text[:60]}'")
        if score < quality_threshold:
            print(f" → Re-reading bubble #{i} from crop...")
            reread = reread_cluster_crop(
                full_image,
                bbox_dict[i],
                reader,
                source_lang,
                upscale_factor=upscale_factor,
            )
            if reread:
                print(f" → Re-read result: '{reread}'")
                clean_bubbles[i] = reread
            else:
                print(" → Re-read returned nothing, keeping original.")

    # ── 11. Translate & print ─────────────────────────────────────────────────
    print()
    # FIX: the header previously hard-coded "(Italian)"/"(Catalan)"
    # regardless of the source_lang/target_lang arguments.
    orig_col = f"ORIGINAL ({_lang_name(source_lang)})"
    trans_col = f"TRANSLATED ({_lang_name(target_lang)})"
    header = f"{'BUBBLE':<8} {orig_col:<50} {trans_col}"
    # FIX: the divider was an empty string repeated 105 times (the "─"
    # character was lost from the source); restore a visible rule.
    divider = "─" * 105
    output_lines = [header, divider]
    print(header)
    print(divider)
    translated_count = 0
    for i in sorted(clean_bubbles.keys()):
        bubble_text = clean_bubbles[i].strip()
        if not bubble_text:
            continue
        try:
            translated = translator.translate(bubble_text)
        except Exception as e:
            translated = f"[Translation error: {e}]"
        else:
            if translated is None:
                translated = "[No translation returned]"
            else:
                # FIX: only count bubbles that actually got a translation;
                # previously errored bubbles inflated the summary count.
                translated_count += 1
        line = f"#{i:<7} {bubble_text:<50} {translated}"
        print(line)
        output_lines.append(line)
    output_lines.append(divider)
    summary = (
        f"✅ Done! {translated_count} bubble(s) translated, "
        f"{skipped} detection(s) skipped."
    )
    output_lines.append(summary)
    print(divider)
    print(summary)

    # ── 12. Export ────────────────────────────────────────────────────────────
    if export_to_file:
        with open(export_to_file, "w", encoding="utf-8") as f:
            f.write("\n".join(output_lines))
        print(f"📄 Output saved to: {export_to_file}")
# ─────────────────────────────────────────────
# HELPER
# ─────────────────────────────────────────────
def list_languages():
    """Print the supported language name → code reference table."""
    print(f"\n{'LANGUAGE':<30} {'CODE'}")
    # FIX: the separator was an empty string repeated 40 times (the "─"
    # character was lost from the source); restore a visible rule.
    divider = "─" * 40
    print(divider)
    for name, code in SUPPORTED_LANGUAGES.items():
        print(f"{name:<30} {code}")
    print(divider)
# ─────────────────────────────────────────────
# ENTRY POINT
# ─────────────────────────────────────────────
if __name__ == "__main__":
    # Demo invocation: OCR an Italian manga page and translate the
    # dialogue to Catalan, exporting the transcript to output.txt and
    # (debug=True) a cluster-visualisation image to debug_clusters.png.
    translate_manga_text(
        image_path = "page.png",
        source_lang = "it",
        target_lang = "ca",
        confidence_threshold = 0.15,
        min_text_length = 2,
        export_to_file = "output.txt",
        cluster_eps = "auto",
        proximity_px = 80,
        filter_sound_effects = True,
        quality_threshold = 0.5,  # bubbles scoring below this get re-read
        upscale_factor = 2.5,  # how much to enlarge the crop for re-read
        debug = True,
    )