From f92ea8410b02bd07f31c6607c574502075980a75 Mon Sep 17 00:00:00 2001 From: Guillem Hernandez Sola Date: Fri, 10 Apr 2026 18:05:17 +0200 Subject: [PATCH] =?UTF-8?q?Bona=20primera=20versi=C3=B3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 3 + manga-translator.py | 558 ++++++++++++++++++++++++++++++++++++++------ 2 files changed, 492 insertions(+), 69 deletions(-) diff --git a/.gitignore b/.gitignore index d0a7b9e..abcceac 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,9 @@ Icon # Thumbnails ._* +*.txt +*.csv + # Files that might appear in the root of a volume .DocumentRevisions-V100 .fseventsd diff --git a/manga-translator.py b/manga-translator.py index f32d6fe..ae787a1 100644 --- a/manga-translator.py +++ b/manga-translator.py @@ -1,107 +1,526 @@ +import re +import os +import cv2 +import numpy as np import easyocr from deep_translator import GoogleTranslator +from sklearn.cluster import DBSCAN # ───────────────────────────────────────────── # LANGUAGE CODE REFERENCE # ───────────────────────────────────────────── SUPPORTED_LANGUAGES = { - "Vietnamese" : "vi", - "Japanese" : "ja", - "English" : "en", - "Spanish" : "es", - "Korean" : "ko", - "Chinese (Simplified)": "ch_sim", + "Vietnamese" : "vi", + "Japanese" : "ja", + "English" : "en", + "Spanish" : "es", + "Korean" : "ko", + "Chinese (Simplified)" : "ch_sim", "Chinese (Traditional)": "ch_tra", - "French" : "fr", - "German" : "de", - "Italian" : "it", - "Portuguese" : "pt", - "Arabic" : "ar", - "Russian" : "ru", - "Thai" : "th", - "Catalan" : "ca", # ← Added! + "French" : "fr", + "German" : "de", + "Italian" : "it", + "Portuguese" : "pt", + "Arabic" : "ar", + "Russian" : "ru", + "Thai" : "th", + "Catalan" : "ca", } +# ───────────────────────────────────────────── +# SOUND EFFECT FILTER +# ───────────────────────────────────────────── +SOUND_EFFECT_PATTERNS = [ + r"^b+i+p+$", + r"^sha+$", + r"^ha+$", + r"^ah+$", + r"^oh+$", + r"^ugh+$", + r"^gr+$", + r"^bam+$", + r"^pow+$", + r"^crash+$", + r"^boom+$", + r"^bang+$", + r"^crack+$", + r"^whoosh+$", + r"^thud+$", + r"^snap+$", + r"^zip+$", + r"^swoosh+$", +] + +def is_sound_effect(text): + cleaned = re.sub(r"[^a-z]", "", text.strip().lower()) + return any(re.fullmatch(p, cleaned, re.IGNORECASE) for p in SOUND_EFFECT_PATTERNS) + + +# ───────────────────────────────────────────── +# BOUNDING BOX HELPERS +# ───────────────────────────────────────────── +def get_cluster_bbox(items): + """ + Returns (x1, y1, x2, y2) tight bounding box around + all (cy, cx, text) center points in a cluster. + Uses a fixed half-size approximation per text block. + """ + half = 30 + x1 = min(cx for _, cx, _ in items) - half + y1 = min(cy for cy, _, _ in items) - half + x2 = max(cx for _, cx, _ in items) + half + y2 = max(cy for cy, _, _ in items) + half + return x1, y1, x2, y2 + + +def boxes_are_close(bbox_a, bbox_b, proximity_px=80): + """ + Returns True if two (x1,y1,x2,y2) boxes are within + proximity_px pixels of each other (or overlapping). + """ + ax1, ay1, ax2, ay2 = bbox_a + bx1, by1, bx2, by2 = bbox_b + ax1 -= proximity_px; ay1 -= proximity_px + ax2 += proximity_px; ay2 += proximity_px + return not (ax2 < bx1 or bx2 < ax1 or ay2 < by1 or by2 < ay1) + + +# ───────────────────────────────────────────── +# POST-CLUSTER MERGE (Union-Find) +# ───────────────────────────────────────────── +def merge_nearby_clusters(raw_clusters, proximity_px=80): + """ + Merges clusters whose bounding boxes are within + proximity_px pixels of each other. + Fixes split bubbles without changing eps globally. + """ + labels = list(raw_clusters.keys()) + bboxes = {lbl: get_cluster_bbox(raw_clusters[lbl]) for lbl in labels} + + parent = {lbl: lbl for lbl in labels} + + def find(x): + while parent[x] != x: + parent[x] = parent[parent[x]] + x = parent[x] + return x + + def union(x, y): + parent[find(x)] = find(y) + + for i in range(len(labels)): + for j in range(i + 1, len(labels)): + a, b = labels[i], labels[j] + if boxes_are_close(bboxes[a], bboxes[b], proximity_px): + union(a, b) + + merged = {} + for lbl in labels: + root = find(lbl) + merged.setdefault(root, []) + merged[root].extend(raw_clusters[lbl]) + + return merged + + +# ───────────────────────────────────────────── +# CROP-BASED OCR RE-READ +# For each cluster bounding box, crop the +# original image with padding and re-run OCR +# at higher quality. This fixes garbled text +# in small or low-contrast bubbles. +# ───────────────────────────────────────────── +def reread_cluster_crop( + image, + bbox, + reader, + source_lang, + padding_px=20, + upscale_factor=2.5, +): + """ + Crops a cluster region from the full image, upscales it, + and re-runs OCR for higher accuracy on small text. + + Args: + image : Full-page image as numpy array (BGR) + bbox : (x1, y1, x2, y2) cluster bounding box + reader : Initialized EasyOCR Reader + source_lang : Language code string + padding_px : Pixels of padding around the crop (default: 20) + upscale_factor: How much to enlarge the crop before OCR (default: 2.5) + + Returns: + Single cleaned string with all OCR lines merged top-to-bottom, + or None if OCR found nothing. + """ + img_h, img_w = image.shape[:2] + x1, y1, x2, y2 = bbox + + # Add padding, clamp to image bounds + x1 = max(0, int(x1) - padding_px) + y1 = max(0, int(y1) - padding_px) + x2 = min(img_w, int(x2) + padding_px) + y2 = min(img_h, int(y2) + padding_px) + + crop = image[y1:y2, x1:x2] + if crop.size == 0: + return None + + # Upscale for better OCR on small text + new_w = int(crop.shape[1] * upscale_factor) + new_h = int(crop.shape[0] * upscale_factor) + upscaled = cv2.resize(crop, (new_w, new_h), interpolation=cv2.INTER_CUBIC) + + # Light sharpening to improve OCR on manga fonts + kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]]) + sharpened = cv2.filter2D(upscaled, -1, kernel) + + # Save temp crop and OCR it + temp_path = "_temp_crop_ocr.png" + cv2.imwrite(temp_path, sharpened) + + try: + crop_results = reader.readtext(temp_path, paragraph=False) + finally: + if os.path.exists(temp_path): + os.remove(temp_path) + + if not crop_results: + return None + + # Sort detections top-to-bottom and join lines + crop_results.sort(key=lambda r: r[0][0][1]) # sort by top-left Y + lines = [text.strip() for _, text, conf in crop_results if text.strip()] + + return fix_hyphens(lines) if lines else None + + +# ───────────────────────────────────────────── +# DBSCAN BUBBLE CLUSTERING +# ───────────────────────────────────────────── +def cluster_into_bubbles(ocr_results, eps=80, min_samples=1, proximity_px=80): + """ + Two-pass clustering: + Pass 1 — DBSCAN on center points + Pass 2 — Bounding-box proximity merge + + Returns: + bubble_dict : cluster_id → list of (cy, cx, text) + bbox_dict : cluster_id → (x1, y1, x2, y2) + """ + if not ocr_results: + return {}, {} + + centers = [] + for bbox, text, confidence in ocr_results: + xs = [pt[0] for pt in bbox] + ys = [pt[1] for pt in bbox] + centers.append([sum(xs) / 4, sum(ys) / 4]) + + centers_array = np.array(centers, dtype=np.float32) + + db = DBSCAN(eps=eps, min_samples=min_samples, metric="euclidean") + labels = db.fit_predict(centers_array) + + raw_clusters = {} + noise_counter = int(max(labels, default=0)) + 1 + + for idx, label in enumerate(labels): + if label == -1: + label = noise_counter + noise_counter += 1 + raw_clusters.setdefault(label, []) + bbox, text, _ = ocr_results[idx] + raw_clusters[label].append((centers[idx][1], centers[idx][0], text)) + + print(f" DBSCAN pass: {len(raw_clusters)} cluster(s)") + + merged_clusters = merge_nearby_clusters(raw_clusters, proximity_px=proximity_px) + print(f" After merge: {len(merged_clusters)} cluster(s)") + + # Sort in reading order + row_band_px = 150 + + def cluster_sort_key(items): + return (min(cy for cy, cx, _ in items) // row_band_px, + min(cx for cy, cx, _ in items)) + + sorted_clusters = sorted(merged_clusters.values(), key=cluster_sort_key) + + bubble_dict = {} + bbox_dict = {} + + for i, items in enumerate(sorted_clusters, start=1): + items_sorted = sorted(items, key=lambda t: t[0]) + bubble_dict[i] = [text for _, _, text in items_sorted] + bbox_dict[i] = get_cluster_bbox(items) + + return bubble_dict, bbox_dict + + +# ───────────────────────────────────────────── +# HYPHEN REMOVAL +# ───────────────────────────────────────────── +def fix_hyphens(lines): + if not lines: + return "" + merged = lines[0] + for line in lines[1:]: + line = line.strip() + merged = merged[:-1] + line if merged.endswith("-") else merged + " " + line + return re.sub(r" {2,}", " ", merged).strip() + + +# ───────────────────────────────────────────── +# AUTO EPS +# ───────────────────────────────────────────── +def compute_auto_eps(image_path, base_eps=80, reference_width=750): + image = cv2.imread(image_path) + if image is None: + return base_eps + img_w = image.shape[1] + scaled = base_eps * (img_w / reference_width) + print(f" ℹ️ Image width: {img_w}px → auto eps: {scaled:.1f}px") + return scaled + + +# ───────────────────────────────────────────── +# OCR QUALITY SCORE +# Heuristic to detect garbled OCR output. +# Low score = likely garbage, trigger re-read. +# ───────────────────────────────────────────── +def ocr_quality_score(text): + """ + Returns a quality score 0.0–1.0 for an OCR result. + + Penalises: + - High ratio of non-alphabetic characters + - Very short text (< 4 chars) + - Suspicious character combos (,,- etc.) + + A score below 0.5 triggers a crop re-read. + """ + if not text or len(text) < 2: + return 0.0 + + alpha_chars = sum(1 for c in text if c.isalpha()) + total_chars = len(text) + alpha_ratio = alpha_chars / total_chars + + # Penalise suspicious patterns + garbage_patterns = [r",,", r"\.\.-", r"[^\w\s\'\!\?\.,-]{2,}"] + penalty = sum(0.2 for p in garbage_patterns if re.search(p, text)) + + score = alpha_ratio - penalty + return max(0.0, min(1.0, score)) + + +# ───────────────────────────────────────────── +# DEBUG CLUSTER IMAGE +# ───────────────────────────────────────────── +def save_debug_clusters(image_path, ocr_results, bubble_dict): + image = cv2.imread(image_path) + if image is None: + return + + np.random.seed(42) + num_bubbles = max(bubble_dict.keys(), default=1) + colors = [ + tuple(int(c) for c in col) + for col in np.random.randint(50, 230, size=(num_bubbles + 2, 3)) + ] + + text_to_bubble = {} + for bubble_id, lines in bubble_dict.items(): + for line in lines: + text_to_bubble[line] = bubble_id + + for bbox, text, _ in ocr_results: + bubble_id = text_to_bubble.get(text, 0) + color = colors[(bubble_id - 1) % len(colors)] + pts = np.array(bbox, dtype=np.int32) + cv2.polylines(image, [pts], isClosed=True, color=color, thickness=2) + x = int(pts[0][0]) + y = max(int(pts[0][1]) - 5, 12) + cv2.putText(image, f"#{bubble_id}", (x, y), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2) + + cv2.imwrite("debug_clusters.png", image) + print(" 🐛 Cluster debug saved → debug_clusters.png") + # ───────────────────────────────────────────── # CORE FUNCTION # ───────────────────────────────────────────── def translate_manga_text( image_path, - source_lang="vi", - target_lang="en", - confidence_threshold=0.3, + source_lang="it", + target_lang="ca", + confidence_threshold=0.15, export_to_file=None, - paragraph_mode=False, + min_text_length=2, + cluster_eps="auto", + proximity_px=80, + filter_sound_effects=True, + quality_threshold=0.5, # below this → trigger crop re-read + upscale_factor=2.5, # crop upscale multiplier for re-read + debug=False, ): """ - Reads text from a manga/comic image using OCR and translates it. + Full pipeline: + OCR → filter → DBSCAN cluster → proximity merge + → quality check → crop re-read if needed + → fix hyphens → translate Args: - image_path : Path to your image file (PNG, JPG, etc.) - source_lang : Language code of the original text (default: 'vi') - target_lang : Language code to translate into (default: 'en') - confidence_threshold : Minimum OCR confidence to keep a result (default: 0.3) - export_to_file : Optional path to save output as .txt (default: None) - paragraph_mode : Group nearby text into paragraphs (default: False) + image_path : Path to your image file + source_lang : Source language code (default: 'it') + target_lang : Target language code (default: 'ca') + confidence_threshold : Min OCR confidence (default: 0.15) + export_to_file : Save output to .txt (default: None) + min_text_length : Min characters per detection(default: 2) + cluster_eps : DBSCAN eps or 'auto' (default: 'auto') + proximity_px : Post-merge proximity px (default: 80) + filter_sound_effects : Skip onomatopoeia/SFX (default: True) + quality_threshold : Min quality score 0–1 before re-read (default: 0.5) + upscale_factor : Crop upscale for re-read (default: 2.5) + debug : Save debug_clusters.png (default: False) """ - # ── 1. Initialize OCR reader ────────────────────────────────────────────── - print("Loading OCR model (first run downloads ~100MB, cached after)...") - - # EasyOCR doesn't support 'ca' (Catalan) natively for OCR — - # but Catalan uses the Latin alphabet, so 'en' + 'es' covers it perfectly. - ocr_lang = source_lang - if source_lang == "ca": - print(" ℹ️ Catalan detected: using Latin-script OCR (en+es) for best results.") - ocr_lang_list = ["en", "es"] + # ── 1. Resolve eps ──────────────────────────────────────────────────────── + if cluster_eps == "auto": + print("Computing auto eps...") + eps = compute_auto_eps(image_path) else: - ocr_lang_list = [source_lang] + eps = float(cluster_eps) + # ── 2. Load full image (needed for crop re-reads) ───────────────────────── + full_image = cv2.imread(image_path) + if full_image is None: + print(f"❌ Could not load image: {image_path}") + return + + # ── 3. Initialize OCR ───────────────────────────────────────────────────── + print("\nLoading OCR model...") + ocr_lang_list = ["en", "es"] if source_lang == "ca" else [source_lang] reader = easyocr.Reader(ocr_lang_list) - # ── 2. Initialize translator ────────────────────────────────────────────── + # ── 4. Initialize translator ────────────────────────────────────────────── translator = GoogleTranslator(source=source_lang, target=target_lang) - # ── 3. Run OCR ──────────────────────────────────────────────────────────── - print(f"Scanning image: {image_path}\n") - results = reader.readtext(image_path, paragraph=paragraph_mode) + # ── 5. Run OCR on full image ────────────────────────────────────────────── + print(f"\nRunning OCR on: {image_path}") + results = reader.readtext(image_path, paragraph=False) + print(f" Raw detections: {len(results)}") - # ── 4. Filter & translate ───────────────────────────────────────────────── - header = f"{'#':<5} {'ORIGINAL TEXT':<45} {'TRANSLATED TEXT'}" - divider = "─" * 90 + # ── 6. Filter detections ────────────────────────────────────────────────── + filtered = [] + skipped = 0 + + for bbox, text, confidence in results: + cleaned = text.strip() + if confidence < confidence_threshold: + skipped += 1 + continue + if len(cleaned) < min_text_length: + skipped += 1 + continue + if re.fullmatch(r"[\d\W]+", cleaned): + skipped += 1 + continue + if filter_sound_effects and is_sound_effect(cleaned): + print(f" 🔇 SFX skipped: '{cleaned}'") + skipped += 1 + continue + filtered.append((bbox, cleaned, confidence)) + + print(f" ✅ {len(filtered)} detection(s) kept, {skipped} skipped.\n") + + if not filtered: + print("⚠️ No text detected after filtering.") + return + + # ── 7. Cluster + merge ──────────────────────────────────────────────────── + print(f"Clustering detections (eps={eps:.1f}px, proximity={proximity_px}px)...") + bubble_dict, bbox_dict = cluster_into_bubbles( + filtered, eps=eps, proximity_px=proximity_px + ) + print(f" ✅ {len(bubble_dict)} bubble(s) after merge.\n") + + # ── 8. Debug image ──────────────────────────────────────────────────────── + if debug: + save_debug_clusters(image_path, filtered, bubble_dict) + + # ── 9. Fix hyphens → first-pass text ───────────────────────────────────── + clean_bubbles = { + i: fix_hyphens(lines) + for i, lines in bubble_dict.items() + if lines + } + + # ── 10. Quality check → crop re-read for low-quality bubbles ───────────── + print("Checking OCR quality per bubble...") + for i, text in clean_bubbles.items(): + score = ocr_quality_score(text) + status = "✅" if score >= quality_threshold else "🔁" + print(f" Bubble #{i}: score={score:.2f} {status} '{text[:60]}'") + + if score < quality_threshold: + print(f" → Re-reading bubble #{i} from crop...") + reread = reread_cluster_crop( + full_image, + bbox_dict[i], + reader, + source_lang, + upscale_factor=upscale_factor, + ) + if reread: + print(f" → Re-read result: '{reread}'") + clean_bubbles[i] = reread + else: + print(f" → Re-read returned nothing, keeping original.") + + # ── 11. Translate & print ───────────────────────────────────────────────── + print() + header = f"{'BUBBLE':<8} {'ORIGINAL (Italian)':<50} {'TRANSLATED (Catalan)'}" + divider = "─" * 105 output_lines = [header, divider] - print(header) print(divider) - count = 0 - for i, (bbox, text, confidence) in enumerate(results, start=1): + translated_count = 0 - # Skip low-confidence detections (noise, borders, artifacts) - if confidence < confidence_threshold: + for i in sorted(clean_bubbles.keys()): + bubble_text = clean_bubbles[i].strip() + if not bubble_text: continue - count += 1 - try: - translated = translator.translate(text) + translated = translator.translate(bubble_text) except Exception as e: translated = f"[Translation error: {e}]" - line = f"{count:<5} {text:<45} {translated}" + if translated is None: + translated = "[No translation returned]" + + translated_count += 1 + line = f"#{i:<7} {bubble_text:<50} {translated}" print(line) output_lines.append(line) output_lines.append(divider) - output_lines.append(f"✅ Done! {count} text block(s) detected and translated.") - + summary = ( + f"✅ Done! {translated_count} bubble(s) translated, " + f"{skipped} detection(s) skipped." + ) + output_lines.append(summary) print(divider) - print(f"✅ Done! {count} text block(s) detected and translated.") + print(summary) - # ── 5. Optional: export to file ─────────────────────────────────────────── + # ── 12. Export ──────────────────────────────────────────────────────────── if export_to_file: with open(export_to_file, "w", encoding="utf-8") as f: f.write("\n".join(output_lines)) @@ -109,10 +528,9 @@ def translate_manga_text( # ───────────────────────────────────────────── -# HELPER: print all supported languages +# HELPER # ───────────────────────────────────────────── def list_languages(): - """Prints all supported language names and their codes.""" print(f"\n{'LANGUAGE':<30} {'CODE'}") print("─" * 40) for name, code in SUPPORTED_LANGUAGES.items(): @@ -121,19 +539,21 @@ def list_languages(): # ───────────────────────────────────────────── -# ENTRY POINT — edit these values and run! +# ENTRY POINT # ───────────────────────────────────────────── if __name__ == "__main__": - # 🔧 Configure your run here: translate_manga_text( - image_path = "page.png", # ← your image here - source_lang = "vi", # ← original language - target_lang = "en", # ← target language - confidence_threshold = 0.3, # ← raise to 0.5 for noisy images - export_to_file = "output.txt", # ← set None to skip saving - paragraph_mode = False, # ← True groups nearby lines - ) - - # Uncomment to see all supported languages: - # list_languages() \ No newline at end of file + image_path = "page.png", + source_lang = "it", + target_lang = "ca", + confidence_threshold = 0.15, + min_text_length = 2, + export_to_file = "output.txt", + cluster_eps = "auto", + proximity_px = 80, + filter_sound_effects = True, + quality_threshold = 0.5, # bubbles scoring below this get re-read + upscale_factor = 2.5, # how much to enlarge the crop for re-read + debug = True, + ) \ No newline at end of file