first commit

2026-04-10 17:27:04 +02:00
commit 60e9894771
3 changed files with 367 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,228 @@
+# Created by https://www.toptal.com/developers/gitignore/api/python,visualstudiocode,macos
+# Edit at https://www.toptal.com/developers/gitignore?templates=python,visualstudiocode,macos
+
+### macOS ###
+# General
+.DS_Store
+.AppleDouble
+.LSOverride
+
+# Icon must end with two \r
+Icon
+
+
+# Thumbnails
+._*
+
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+.com.apple.timemachine.donotpresent
+
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+
+### macOS Patch ###
+# iCloud generated files
+*.icloud
+
+### Python ###
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+### Python Patch ###
+# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
+poetry.toml
+
+# ruff
+.ruff_cache/
+
+# LSP config files
+pyrightconfig.json
+
+### VisualStudioCode ###
+.vscode/*
+!.vscode/settings.json
+!.vscode/tasks.json
+!.vscode/launch.json
+!.vscode/extensions.json
+!.vscode/*.code-snippets
+
+# Local History for Visual Studio Code
+.history/
+
+# Built Visual Studio Code Extensions
+*.vsix
+
+### VisualStudioCode Patch ###
+# Ignore all local history of files
+.history
+.ionide
+
+# End of https://www.toptal.com/developers/gitignore/api/python,visualstudiocode,macos
--- a/README.md
+++ b/README.md
--- a/manga-translator.py
+++ b/manga-translator.py
@@ -0,0 +1,139 @@
+import easyocr
+from deep_translator import GoogleTranslator
+
+
+# ─────────────────────────────────────────────
+#  LANGUAGE CODE REFERENCE
+# ─────────────────────────────────────────────
+SUPPORTED_LANGUAGES = {
+    "Vietnamese"          : "vi",
+    "Japanese"            : "ja",
+    "English"             : "en",
+    "Spanish"             : "es",
+    "Korean"              : "ko",
+    "Chinese (Simplified)": "ch_sim",
+    "Chinese (Traditional)": "ch_tra",
+    "French"              : "fr",
+    "German"              : "de",
+    "Italian"             : "it",
+    "Portuguese"          : "pt",
+    "Arabic"              : "ar",
+    "Russian"             : "ru",
+    "Thai"                : "th",
+    "Catalan"             : "ca",   # ← Added!
+}
+
+
+# ─────────────────────────────────────────────
+#  CORE FUNCTION
+# ─────────────────────────────────────────────
+def translate_manga_text(
+    image_path,
+    source_lang="vi",
+    target_lang="en",
+    confidence_threshold=0.3,
+    export_to_file=None,
+    paragraph_mode=False,
+):
+    """
+    Reads text from a manga/comic image using OCR and translates it.
+
+    Args:
+        image_path           : Path to your image file (PNG, JPG, etc.)
+        source_lang          : Language code of the original text (default: 'vi')
+        target_lang          : Language code to translate into  (default: 'en')
+        confidence_threshold : Minimum OCR confidence to keep a result (default: 0.3)
+        export_to_file       : Optional path to save output as .txt (default: None)
+        paragraph_mode       : Group nearby text into paragraphs (default: False)
+    """
+
+    # ── 1. Initialize OCR reader ──────────────────────────────────────────────
+    print("Loading OCR model (first run downloads ~100MB, cached after)...")
+
+    # EasyOCR doesn't support 'ca' (Catalan) natively for OCR —
+    # but Catalan uses the Latin alphabet, so 'en' + 'es' covers it perfectly.
+    ocr_lang = source_lang
+    if source_lang == "ca":
+        print("  ℹ️  Catalan detected: using Latin-script OCR (en+es) for best results.")
+        ocr_lang_list = ["en", "es"]
+    else:
+        ocr_lang_list = [source_lang]
+
+    reader = easyocr.Reader(ocr_lang_list)
+
+    # ── 2. Initialize translator ──────────────────────────────────────────────
+    translator = GoogleTranslator(source=source_lang, target=target_lang)
+
+    # ── 3. Run OCR ────────────────────────────────────────────────────────────
+    print(f"Scanning image: {image_path}\n")
+    results = reader.readtext(image_path, paragraph=paragraph_mode)
+
+    # ── 4. Filter & translate ─────────────────────────────────────────────────
+    header = f"{'#':<5} {'ORIGINAL TEXT':<45} {'TRANSLATED TEXT'}"
+    divider = "─" * 90
+
+    output_lines = [header, divider]
+
+    print(header)
+    print(divider)
+
+    count = 0
+    for i, (bbox, text, confidence) in enumerate(results, start=1):
+
+        # Skip low-confidence detections (noise, borders, artifacts)
+        if confidence < confidence_threshold:
+            continue
+
+        count += 1
+
+        try:
+            translated = translator.translate(text)
+        except Exception as e:
+            translated = f"[Translation error: {e}]"
+
+        line = f"{count:<5} {text:<45} {translated}"
+        print(line)
+        output_lines.append(line)
+
+    output_lines.append(divider)
+    output_lines.append(f"✅ Done! {count} text block(s) detected and translated.")
+
+    print(divider)
+    print(f"✅ Done! {count} text block(s) detected and translated.")
+
+    # ── 5. Optional: export to file ───────────────────────────────────────────
+    if export_to_file:
+        with open(export_to_file, "w", encoding="utf-8") as f:
+            f.write("\n".join(output_lines))
+        print(f"📄 Output saved to: {export_to_file}")
+
+
+# ─────────────────────────────────────────────
+#  HELPER: print all supported languages
+# ─────────────────────────────────────────────
+def list_languages():
+    """Prints all supported language names and their codes."""
+    print(f"\n{'LANGUAGE':<30} {'CODE'}")
+    print("─" * 40)
+    for name, code in SUPPORTED_LANGUAGES.items():
+        print(f"{name:<30} {code}")
+    print("─" * 40)
+
+
+# ─────────────────────────────────────────────
+#  ENTRY POINT — edit these values and run!
+# ─────────────────────────────────────────────
+if __name__ == "__main__":
+
+    # 🔧 Configure your run here:
+    translate_manga_text(
+        image_path           = "page.png",  # ← your image here
+        source_lang          = "vi",                   # ← original language
+        target_lang          = "en",                   # ← target language
+        confidence_threshold = 0.3,                    # ← raise to 0.5 for noisy images
+        export_to_file       = "output.txt",           # ← set None to skip saving
+        paragraph_mode       = False,                  # ← True groups nearby lines
+    )
+
+    # Uncomment to see all supported languages:
+    # list_languages()