Added all
This commit is contained in:
@@ -47,7 +47,6 @@ SHORT_ENGLISH_WORDS_2 = {
|
||||
# Combined protected set used by is_meaningful_text()
|
||||
SHORT_ENGLISH_PROTECTED = SHORT_ENGLISH_WORDS_1 | SHORT_ENGLISH_WORDS_2
|
||||
|
||||
|
||||
DIALOGUE_STOPWORDS = {
|
||||
"I", "YOU", "HE", "SHE", "WE", "THEY", "IT", "ME", "MY", "YOUR", "OUR",
|
||||
"IS", "ARE", "WAS", "WERE", "AM", "DO", "DID", "DON'T", "DIDN'T", "NOT",
|
||||
@@ -55,6 +54,38 @@ DIALOGUE_STOPWORDS = {
|
||||
"AND", "BUT", "SO", "THAT", "THIS", "THERE", "HERE", "THAN", "ALL", "RIGHT"
|
||||
}
|
||||
|
||||
# Short tokens stored as a set for membership checks: interjections (bare and
# with trailing punctuation), a greeting, proper names, and common short
# words that can stand alone on a manga line.
# NOTE(review): presumably consulted so these tokens are not discarded as
# noise — confirm against the call sites.
PROTECTED_SHORT_TOKENS = {
    # ... existing entries ...

    # Interjections, each followed by its punctuated variants
    "HUH", "HUH?", "HUH??", "HUH?!",
    "OH", "OH!",
    "OOH", "OOH!",
    "AH", "AH!",
    "UH", "UH...",
    "HEY", "HEY!",
    "EH", "EH?",
    "WOW", "WOW!",
    "OMIGOSH",
    "EEEP", "EEEEP",

    # Greeting
    "MORNING", "MORNING.",

    # Proper names
    "BECKY", "BECKY!",
    "DAMIAN",
    "CECILE",
    "WALD",

    # Common short words that appear alone on a manga line (alphabetical)
    "ALSO", "BACK", "COME", "EVEN", "FROM",
    "GOOD", "HAVE", "HERE", "JUST", "LOOK",
    "OKAY", "ONLY", "OVER", "STOP", "SURE",
    "THAT", "THEN", "THIS", "WAIT", "WELL",
    "WHAT", "WHEN", "WILL", "WITH", "YEAH",
}
|
||||
|
||||
_MANGA_INTERJECTIONS = {
|
||||
# ... existing entries ...
|
||||
# FIX: short words that appear isolated on their own OCR line
|
||||
'GOOD', 'WELL', 'YEAH', 'OKAY', 'SURE',
|
||||
'WAIT', 'STOP', 'LOOK', 'COME', 'BACK',
|
||||
'HERE', 'OVER', 'JUST', 'EVEN', 'ONLY',
|
||||
'ALSO', 'THEN', 'WHEN', 'WHAT', 'THAT',
|
||||
'THIS', 'WITH', 'FROM', 'HAVE', 'WILL',
|
||||
'TRUE', 'REAL', 'FINE', 'DONE', 'GONE',
|
||||
'HELP', 'MOVE', 'STAY', 'CALM', 'COOL',
|
||||
}
|
||||
|
||||
|
||||
# FIX: SFX_HINTS contains ONLY pure onomatopoeia — no words
|
||||
# that could appear in dialogue (MORNING, GOOD, etc. removed)
|
||||
SFX_HINTS = {
|
||||
@@ -520,10 +551,39 @@ def postprocess_translation_general(text: str) -> str:
|
||||
|
||||
# Translation table mapping look-alike digits to the letters they are
# mistaken for: 0->O, 1->I, 3->E, 4->A, 5->S, 8->B.
_DIGIT_LOOKALIKE_TABLE = str.maketrans("013458", "OIEASB")

# Standalone ASCII alphanumeric tokens of 2 to 12 characters.
_ALNUM_TOKEN_RE = re.compile(r'\b[A-Za-z0-9]{2,12}\b')


def _restore_digit_lookalikes(match):
    """Return the matched token with look-alike digits turned back into letters.

    The token is returned unchanged when it consists only of digits (a real
    number such as "10" or "105"), or when the substitution would still leave
    a non-letter character in it (e.g. "R2D2", since 2 has no letter mapping).
    """
    word = match.group(0)
    # A purely numeric token is a number, not a misread word. Without this
    # guard "10" would become "IO" and "15" would become "IS".
    if word.isdigit():
        return word
    candidate = word.translate(_DIGIT_LOOKALIKE_TABLE)
    # Only accept the substitution if the result is entirely alphabetic.
    return candidate if candidate.isalpha() else word


def fix_common_ocr_errors(text: str) -> str:
    """Repair common character-level OCR confusions in ``text``.

    Applied in order:

    1. A capital ``O`` that follows a digit and precedes a digit or any
       non-letter character becomes ``0`` ("1O5" -> "105").
    2. ``|`` becomes ``I`` and a backtick becomes an apostrophe.
    3. In standalone alphanumeric tokens of 2-12 characters that contain at
       least one letter, the digits 0/1/3/4/5/8 become O/I/E/A/S/B, provided
       the whole token is alphabetic afterwards ("G00D" -> "GOOD",
       "M0RNING" -> "MORNING").

    Args:
        text: The OCR output to clean up.

    Returns:
        The corrected string; an empty string is returned unchanged.
    """
    # Letter O read in place of zero inside or at the end of a number.
    result = re.sub(r'(\d)O(\d)', r'\g<1>0\g<2>', text)
    result = re.sub(r'(\d)O([^a-zA-Z])', r'\g<1>0\g<2>', result)

    # Single-character confusions.
    result = result.replace('|', 'I').replace('`', "'")

    # Digits read in place of letters in bold/stylised fonts. Purely numeric
    # tokens are skipped so the zero repairs above are not undone.
    return _ALNUM_TOKEN_RE.sub(_restore_digit_lookalikes, result)
|
||||
|
||||
def is_valid_language(text: str, source_lang: str) -> bool:
|
||||
@@ -1173,15 +1233,24 @@ def ocr_candidate_score(text: str) -> float:
|
||||
n = len(t)
|
||||
if n == 0:
|
||||
return 0.0
|
||||
|
||||
alpha = sum(c.isalpha() for c in t) / n
|
||||
spaces = sum(c.isspace() for c in t) / n
|
||||
punct_ok = sum(c in ".,!?'-:;()[]\"¡¿" for c in t) / n
|
||||
bad = len(re.findall(r"[^\w\s\.\,\!\?\-\'\:\;\(\)\[\]\"¡¿]", t)) / n
|
||||
penalty = 0.0
|
||||
if re.search(r"\b[A-Z]\b", t):
|
||||
|
||||
penalty = 0.0
|
||||
|
||||
# FIX: Only penalise isolated single letters when the WHOLE token
|
||||
# is a single letter — not when a word like "I" or "A" appears
|
||||
# inside a longer sentence. Old pattern \b[A-Z]\b fired on "I"
|
||||
# inside "I CAN'T" which incorrectly penalised valid dialogue.
|
||||
if re.fullmatch(r"[A-Z]", t.strip()):
|
||||
penalty += 0.05
|
||||
|
||||
if re.search(r"[0-9]{2,}", t):
|
||||
penalty += 0.08
|
||||
|
||||
score = (0.62 * alpha) + (0.10 * spaces) + (0.20 * punct_ok) - (0.45 * bad) - penalty
|
||||
return max(0.0, min(1.0, score))
|
||||
|
||||
|
||||
Reference in New Issue
Block a user