Added all

2026-04-23 16:20:37 +02:00
parent 3ca01dae8c
commit 243e5bad47
5 changed files with 500 additions and 579 deletions
--- a/manga-translator.py
+++ b/manga-translator.py
@@ -47,7 +47,6 @@ SHORT_ENGLISH_WORDS_2 = {
 # Combined protected set used by is_meaningful_text()
 SHORT_ENGLISH_PROTECTED = SHORT_ENGLISH_WORDS_1 | SHORT_ENGLISH_WORDS_2

-
 DIALOGUE_STOPWORDS = {
    "I", "YOU", "HE", "SHE", "WE", "THEY", "IT", "ME", "MY", "YOUR", "OUR",
    "IS", "ARE", "WAS", "WERE", "AM", "DO", "DID", "DON'T", "DIDN'T", "NOT",
@@ -55,6 +54,38 @@ DIALOGUE_STOPWORDS = {
    "AND", "BUT", "SO", "THAT", "THIS", "THERE", "HERE", "THAN", "ALL", "RIGHT"
 }

+PROTECTED_SHORT_TOKENS = {
+    # TODO(review): this line was a "... existing entries ..." stub; confirm none were dropped.
+    "HUH", "HUH?", "HUH??", "HUH?!",
+    "OH", "OH!", "OOH", "OOH!",
+    "AH", "AH!", "UH", "UH...",
+    "HEY", "HEY!", "EH", "EH?",
+    "WOW", "WOW!",
+    "MORNING", "MORNING.",
+    "BECKY", "BECKY!",
+    "DAMIAN", "CECILE", "WALD",
+    "OMIGOSH", "EEEP", "EEEEP",
+    # Common short words that can stand alone on a manga line (same 25 as in _MANGA_INTERJECTIONS).
+    "GOOD", "WELL", "YEAH", "OKAY", "SURE",
+    "WAIT", "STOP", "LOOK", "COME", "BACK",
+    "HERE", "OVER", "JUST", "EVEN", "ONLY",
+    "ALSO", "THEN", "WHEN", "WHAT", "THAT",
+    "THIS", "WITH", "FROM", "HAVE", "WILL",
+}
+
+_MANGA_INTERJECTIONS = {
+    # TODO(review): this line was a "... existing entries ..." stub; confirm none were dropped.
+    # Short words seen isolated on their own OCR line; first 5 rows mirror PROTECTED_SHORT_TOKENS.
+    'GOOD', 'WELL', 'YEAH', 'OKAY', 'SURE',
+    'WAIT', 'STOP', 'LOOK', 'COME', 'BACK',
+    'HERE', 'OVER', 'JUST', 'EVEN', 'ONLY',
+    'ALSO', 'THEN', 'WHEN', 'WHAT', 'THAT',
+    'THIS', 'WITH', 'FROM', 'HAVE', 'WILL',
+    'TRUE', 'REAL', 'FINE', 'DONE', 'GONE',
+    'HELP', 'MOVE', 'STAY', 'CALM', 'COOL',
+}
+
+
 # FIX: SFX_HINTS contains ONLY pure onomatopoeia — no words
 # that could appear in dialogue (MORNING, GOOD, etc. removed)
 SFX_HINTS = {
@@ -520,10 +551,39 @@ def postprocess_translation_general(text: str) -> str:

 def fix_common_ocr_errors(text: str) -> str:
+    """Repair frequent OCR misreads in ``text`` and return the result.
+
+    Handles ``O`` read for zero next to digits, ``|`` for ``I``, a backtick
+    for an apostrophe, and look-alike digits inside words (``G00D`` ->
+    ``GOOD``). Purely numeric tokens such as ``100`` are never rewritten,
+    so the zero restored by the first rules (``1O5`` -> ``105``) is kept.
+    Replacement letters are always upper-case.
+    """
     result = text
+
+    # An O that follows a digit and precedes a digit/non-letter is a zero.
     result = re.sub(r'(\d)O(\d)', r'\g<1>0\g<2>', result)
     result = re.sub(r'(\d)O([^a-zA-Z])', r'\g<1>0\g<2>', result)
     result = result.replace('|', 'I')
     result = result.replace('`', "'")
+
+    # Vision OCR sometimes reads letters as look-alike digits (O -> 0) in
+    # bold/stylised manga fonts; this table maps them back.
+    digit_to_letter = str.maketrans('013458', 'OIEASB')
+
+    def fix_digit_letters(m):
+        word = m.group(0)
+        # A token with no letters is a real number ("100", "15"), and one
+        # with no digits needs no repair: leave both untouched.
+        if word.isdigit() or word.isalpha():
+            return word
+        fixed = word.translate(digit_to_letter)
+        # Digits without a look-alike (2, 6, 7, 9) survive translation, so
+        # a non-alphabetic result means the token was not a word: keep it.
+        return fixed if fixed.isalpha() else word
+
+    # Tokens of 2-12 ASCII letters/digits are candidates for repair.
+    result = re.sub(r'\b[A-Za-z0-9]{2,12}\b', fix_digit_letters, result)
+
     return result

 def is_valid_language(text: str, source_lang: str) -> bool:
@@ -1173,15 +1233,24 @@ def ocr_candidate_score(text: str) -> float:
    n = len(t)
    if n == 0:
        return 0.0
+
    alpha    = sum(c.isalpha() for c in t) / n
    spaces   = sum(c.isspace() for c in t) / n
    punct_ok = sum(c in ".,!?'-:;()[]\"¡¿" for c in t) / n
    bad      = len(re.findall(r"[^\w\s\.\,\!\?\-\'\:\;\(\)\[\]\"¡¿]", t)) / n
-    penalty  = 0.0
-    if re.search(r"\b[A-Z]\b", t):
+
+    penalty = 0.0
+
+    # FIX: Only penalise isolated single letters when the WHOLE token
+    # is a single letter — not when a word like "I" or "A" appears
+    # inside a longer sentence. Old pattern \b[A-Z]\b fired on "I"
+    # inside "I CAN'T" which incorrectly penalised valid dialogue.
+    if re.fullmatch(r"[A-Z]", t.strip()):
        penalty += 0.05
+
    if re.search(r"[0-9]{2,}", t):
        penalty += 0.08
+
    score = (0.62 * alpha) + (0.10 * spaces) + (0.20 * punct_ok) - (0.45 * bad) - penalty
    return max(0.0, min(1.0, score))