first commit

2026-04-10 17:27:04 +02:00
commit 60e9894771
3 changed files with 367 additions and 0 deletions
--- a/manga-translator.py
+++ b/manga-translator.py
@@ -0,0 +1,139 @@
+import easyocr
+from deep_translator import GoogleTranslator
+
+
+# ─────────────────────────────────────────────
+#  LANGUAGE CODE REFERENCE
+# ─────────────────────────────────────────────
+# Display name → language code, in the order list_languages() prints them.
+SUPPORTED_LANGUAGES = {
+    "Vietnamese": "vi",
+    "Japanese": "ja",
+    "English": "en",
+    "Spanish": "es",
+    "Korean": "ko",
+    # NOTE(review): these two are EasyOCR's spellings; Google Translate
+    # spells the same languages "zh-CN" and "zh-TW".
+    "Chinese (Simplified)": "ch_sim",
+    "Chinese (Traditional)": "ch_tra",
+    "French": "fr",
+    "German": "de",
+    "Italian": "it",
+    "Portuguese": "pt",
+    "Arabic": "ar",
+    "Russian": "ru",
+    "Thai": "th",
+    # translate_manga_text() special-cases "ca" when choosing the OCR models.
+    "Catalan": "ca",
+}
+
+
+# ─────────────────────────────────────────────
+#  CORE FUNCTION
+# ─────────────────────────────────────────────
+def translate_manga_text(
+    image_path,
+    source_lang="vi",
+    target_lang="en",
+    confidence_threshold=0.3,
+    export_to_file=None,
+    paragraph_mode=False,
+):
+    """
+    Reads text from a manga/comic image using OCR and translates it.
+
+    Every kept text block is printed as one row of a table; the same table
+    is optionally written to a UTF-8 text file.
+
+    Args:
+        image_path           : Path to your image file (PNG, JPG, etc.)
+        source_lang          : Language code of the original text (default: 'vi')
+        target_lang          : Language code to translate into  (default: 'en')
+        confidence_threshold : Minimum OCR confidence to keep a result (default: 0.3).
+                               Not applied in paragraph mode, where EasyOCR
+                               reports no confidence value.
+        export_to_file       : Optional path to save output as .txt (default: None)
+        paragraph_mode       : Group nearby text into paragraphs (default: False)
+
+    Returns:
+        A list of (original_text, translated_text) tuples, one per kept text
+        block, in the order the OCR engine returned them. When a block could
+        not be translated, translated_text is a "[Translation error: ...]"
+        placeholder instead.
+    """
+
+    # EasyOCR and Google Translate spell the Chinese codes differently, so
+    # each library is handed its own spelling whichever one the caller used.
+    translator_codes = {"ch_sim": "zh-CN", "ch_tra": "zh-TW"}
+    ocr_codes = {"zh-CN": "ch_sim", "zh-TW": "ch_tra"}
+
+    # ── 1. Initialize OCR reader ──────────────────────────────────────────────
+    print("Loading OCR model (first run downloads ~100MB, cached after)...")
+
+    # EasyOCR has no Catalan ('ca') model. Catalan is written in the Latin
+    # alphabet, so the English + Spanish models are used as the closest fit.
+    if source_lang == "ca":
+        print("  ℹ️  Catalan detected: using Latin-script OCR (en+es) for best results.")
+        ocr_lang_list = ["en", "es"]
+    else:
+        ocr_lang_list = [ocr_codes.get(source_lang, source_lang)]
+
+    reader = easyocr.Reader(ocr_lang_list)
+
+    # ── 2. Initialize translator ──────────────────────────────────────────────
+    translator = GoogleTranslator(
+        source=translator_codes.get(source_lang, source_lang),
+        target=translator_codes.get(target_lang, target_lang),
+    )
+
+    # ── 3. Run OCR ────────────────────────────────────────────────────────────
+    print(f"Scanning image: {image_path}\n")
+    results = reader.readtext(image_path, paragraph=paragraph_mode)
+
+    # ── 4. Filter & translate ─────────────────────────────────────────────────
+    header = f"{'#':<5} {'ORIGINAL TEXT':<45} {'TRANSLATED TEXT'}"
+    divider = "─" * 90
+
+    output_lines = [header, divider]
+
+    print(header)
+    print(divider)
+
+    translations = []
+    for detection in results:
+        # Normal mode yields (bbox, text, confidence); paragraph mode yields
+        # only (bbox, text), so there is nothing to compare to the threshold.
+        text = detection[1]
+
+        # Skip low-confidence detections (noise, borders, artifacts)
+        if len(detection) > 2 and detection[2] < confidence_threshold:
+            continue
+
+        # Nothing to translate in an empty or whitespace-only detection.
+        if not text.strip():
+            continue
+
+        # Best effort per row: one failed request (network hiccup, rejected
+        # payload) must not abort the rest of the page.
+        try:
+            translated = translator.translate(text)
+        except Exception as e:
+            translated = f"[Translation error: {e}]"
+
+        translations.append((text, translated))
+
+        line = f"{len(translations):<5} {text:<45} {translated}"
+        print(line)
+        output_lines.append(line)
+
+    summary = f"✅ Done! {len(translations)} text block(s) detected and translated."
+    output_lines.append(divider)
+    output_lines.append(summary)
+
+    print(divider)
+    print(summary)
+
+    # ── 5. Optional: export to file ───────────────────────────────────────────
+    if export_to_file:
+        with open(export_to_file, "w", encoding="utf-8") as f:
+            f.write("\n".join(output_lines))
+        print(f"📄 Output saved to: {export_to_file}")
+
+    return translations
+
+
+# ─────────────────────────────────────────────
+#  HELPER: print all supported languages
+# ─────────────────────────────────────────────
+def list_languages():
+    """Prints a two-column table of every language name and its code."""
+    rule = "─" * 40
+
+    # Leading "\n" keeps the blank line that separates the table from
+    # whatever was printed before it.
+    table = [f"\n{'LANGUAGE':<30} {'CODE'}", rule]
+    table.extend(
+        f"{language:<30} {lang_code}"
+        for language, lang_code in SUPPORTED_LANGUAGES.items()
+    )
+    table.append(rule)
+
+    print("\n".join(table))
+
+
+# ─────────────────────────────────────────────
+#  ENTRY POINT — edit these values and run!
+# ─────────────────────────────────────────────
+if __name__ == "__main__":
+
+    # 🔧 Configure your run here: edit the values, then run the script.
+    run_settings = {
+        "image_path": "page.png",       # ← your image here
+        "source_lang": "vi",            # ← original language
+        "target_lang": "en",            # ← target language
+        "confidence_threshold": 0.3,    # ← raise to 0.5 for noisy images
+        "export_to_file": "output.txt", # ← set None to skip saving
+        "paragraph_mode": False,        # ← True groups nearby lines
+    }
+    translate_manga_text(**run_settings)
+
+    # Uncomment to see all supported languages:
+    # list_languages()