Added all

This commit is contained in:
Guillem Hernandez Sola
2026-04-23 16:20:37 +02:00
parent 3ca01dae8c
commit 243e5bad47
5 changed files with 500 additions and 579 deletions

View File

@@ -47,7 +47,6 @@ SHORT_ENGLISH_WORDS_2 = {
# Combined protected set used by is_meaningful_text():
# the union (`|`) of SHORT_ENGLISH_WORDS_1 and SHORT_ENGLISH_WORDS_2,
# both of which are defined elsewhere in this module.
SHORT_ENGLISH_PROTECTED = SHORT_ENGLISH_WORDS_1 | SHORT_ENGLISH_WORDS_2
DIALOGUE_STOPWORDS = {
"I", "YOU", "HE", "SHE", "WE", "THEY", "IT", "ME", "MY", "YOUR", "OUR",
"IS", "ARE", "WAS", "WERE", "AM", "DO", "DID", "DON'T", "DIDN'T", "NOT",
@@ -55,6 +54,38 @@ DIALOGUE_STOPWORDS = {
"AND", "BUT", "SO", "THAT", "THIS", "THERE", "HERE", "THAN", "ALL", "RIGHT"
}
PROTECTED_SHORT_TOKENS = {
    # NOTE(review): the previous literal opened with an
    # "... existing entries ..." placeholder comment; confirm that no
    # earlier entries were meant to be carried over into this set.

    # Interjections, one base word per row followed by its punctuated variants.
    "HUH", "HUH?", "HUH??", "HUH?!",
    "OH", "OH!",
    "OOH", "OOH!",
    "AH", "AH!",
    "UH", "UH...",
    "HEY", "HEY!",
    "EH", "EH?",
    "WOW", "WOW!",
    "OMIGOSH", "EEEP", "EEEEP",

    # Greeting, with and without the trailing full stop.
    "MORNING", "MORNING.",

    # Presumably character names — verify against the source material.
    "BECKY", "BECKY!",
    "DAMIAN", "CECILE", "WALD",

    # Common short words that can appear alone on a manga line
    # (alphabetical order; a set is unordered, so order carries no meaning).
    "ALSO", "BACK", "COME", "EVEN", "FROM",
    "GOOD", "HAVE", "HERE", "JUST", "LOOK",
    "OKAY", "ONLY", "OVER", "STOP", "SURE",
    "THAT", "THEN", "THIS", "WAIT", "WELL",
    "WHAT", "WHEN", "WILL", "WITH", "YEAH",
}
_MANGA_INTERJECTIONS = {
# ... existing entries ...
# FIX: short words that appear isolated on their own OCR line
'GOOD', 'WELL', 'YEAH', 'OKAY', 'SURE',
'WAIT', 'STOP', 'LOOK', 'COME', 'BACK',
'HERE', 'OVER', 'JUST', 'EVEN', 'ONLY',
'ALSO', 'THEN', 'WHEN', 'WHAT', 'THAT',
'THIS', 'WITH', 'FROM', 'HAVE', 'WILL',
'TRUE', 'REAL', 'FINE', 'DONE', 'GONE',
'HELP', 'MOVE', 'STAY', 'CALM', 'COOL',
}
# FIX: SFX_HINTS contains ONLY pure onomatopoeia — no words
# that could appear in dialogue (MORNING, GOOD, etc. removed)
SFX_HINTS = {
@@ -520,10 +551,39 @@ def postprocess_translation_general(text: str) -> str:
def fix_common_ocr_errors(text: str) -> str:
    """Repair character confusions commonly produced by OCR.

    The fixes are applied in this order:

    1. A capital ``O`` that directly follows a digit becomes ``0`` when it
       is followed by another digit, or by any character that is not an
       ASCII letter (``"1O5"`` -> ``"105"``).
    2. ``|`` becomes ``I`` and a backtick becomes an apostrophe.
    3. Inside 2-12 character ASCII alphanumeric tokens that mix letters and
       digits, look-alike digits are mapped to letters (``0``->``O``,
       ``1``->``I``, ``3``->``E``, ``4``->``A``, ``5``->``S``, ``8``->``B``),
       but only when the resulting token is purely alphabetic
       (``"G00D"`` -> ``"GOOD"``, ``"M0RNING"`` -> ``"MORNING"``).

    Tokens that contain no letter at all are never touched by step 3, so
    genuine numbers such as ``"10"`` or ``"105"`` are preserved. Without
    that guard step 3 would rewrite them to ``"IO"`` / ``"IOS"`` and undo
    the repair made in step 1.

    Args:
        text: Raw OCR text.

    Returns:
        The text with the substitutions above applied.
    """
    result = text

    # Letter "O" read in place of zero inside a number.
    result = re.sub(r'(\d)O(\d)', r'\g<1>0\g<2>', result)
    # Same confusion after a digit when the next char is not an ASCII letter.
    result = re.sub(r'(\d)O([^a-zA-Z])', r'\g<1>0\g<2>', result)
    result = result.replace('|', 'I')
    result = result.replace('`', "'")

    # Digits that resemble capital letters in bold/stylised fonts.
    # A translation table applies all substitutions in a single pass.
    digit_as_letter = str.maketrans({
        '0': 'O',
        '1': 'I',
        '3': 'E',
        '4': 'A',
        '5': 'S',
        '8': 'B',
    })

    def fix_digit_letters(m):
        word = m.group(0)
        # Nothing to do for plain words; and a token with no letters is a
        # real number, not a word with misread characters — leave it alone.
        if word.isalpha() or word.isdigit():
            return word
        fixed = word.translate(digit_as_letter)
        # Accept the rewrite only if every digit was a known look-alike,
        # i.e. the result is entirely alphabetic.
        return fixed if fixed.isalpha() else word

    result = re.sub(r'\b[A-Za-z0-9]{2,12}\b', fix_digit_letters, result)
    return result
def is_valid_language(text: str, source_lang: str) -> bool:
@@ -1173,15 +1233,24 @@ def ocr_candidate_score(text: str) -> float:
n = len(t)
if n == 0:
return 0.0
alpha = sum(c.isalpha() for c in t) / n
spaces = sum(c.isspace() for c in t) / n
punct_ok = sum(c in ".,!?'-:;()[]\"¡¿" for c in t) / n
bad = len(re.findall(r"[^\w\s\.\,\!\?\-\'\:\;\(\)\[\]\"¡¿]", t)) / n
penalty = 0.0
if re.search(r"\b[A-Z]\b", t):
penalty = 0.0
# FIX: Only penalise isolated single letters when the WHOLE token
# is a single letter — not when a word like "I" or "A" appears
# inside a longer sentence. Old pattern \b[A-Z]\b fired on "I"
# inside "I CAN'T" which incorrectly penalised valid dialogue.
if re.fullmatch(r"[A-Z]", t.strip()):
penalty += 0.05
if re.search(r"[0-9]{2,}", t):
penalty += 0.08
score = (0.62 * alpha) + (0.10 * spaces) + (0.20 * punct_ok) - (0.45 * bad) - penalty
return max(0.0, min(1.0, score))