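Add word-only text wrapping, box-relative font sizing, punctuation-aware token classification, and Comic Relief fonts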
This commit is contained in:
48
bubbles.json
48
bubbles.json
@@ -1,9 +1,9 @@
|
|||||||
{
|
{
|
||||||
"1": {
|
"1": {
|
||||||
"x": 204,
|
"x": 201,
|
||||||
"y": 137,
|
"y": 134,
|
||||||
"w": 153,
|
"w": 159,
|
||||||
"h": 82,
|
"h": 88,
|
||||||
"quads": [
|
"quads": [
|
||||||
[
|
[
|
||||||
[
|
[
|
||||||
@@ -26,10 +26,10 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
"2": {
|
"2": {
|
||||||
"x": 1167,
|
"x": 1164,
|
||||||
"y": 240,
|
"y": 237,
|
||||||
"w": 132,
|
"w": 138,
|
||||||
"h": 134,
|
"h": 140,
|
||||||
"quads": [
|
"quads": [
|
||||||
[
|
[
|
||||||
[
|
[
|
||||||
@@ -106,10 +106,10 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
"3": {
|
"3": {
|
||||||
"x": 930,
|
"x": 927,
|
||||||
"y": 378,
|
"y": 375,
|
||||||
"w": 136,
|
"w": 142,
|
||||||
"h": 132,
|
"h": 138,
|
||||||
"quads": [
|
"quads": [
|
||||||
[
|
[
|
||||||
[
|
[
|
||||||
@@ -186,10 +186,10 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
"4": {
|
"4": {
|
||||||
"x": 220,
|
"x": 217,
|
||||||
"y": 486,
|
"y": 483,
|
||||||
"w": 150,
|
"w": 156,
|
||||||
"h": 210,
|
"h": 216,
|
||||||
"quads": [
|
"quads": [
|
||||||
[
|
[
|
||||||
[
|
[
|
||||||
@@ -320,10 +320,10 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
"5": {
|
"5": {
|
||||||
"x": 354,
|
"x": 351,
|
||||||
"y": 1132,
|
"y": 1129,
|
||||||
"w": 92,
|
"w": 98,
|
||||||
"h": 102,
|
"h": 108,
|
||||||
"quads": [
|
"quads": [
|
||||||
[
|
[
|
||||||
[
|
[
|
||||||
@@ -382,10 +382,10 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
"6": {
|
"6": {
|
||||||
"x": 740,
|
"x": 737,
|
||||||
"y": 1324,
|
"y": 1321,
|
||||||
"w": 38,
|
"w": 44,
|
||||||
"h": 24,
|
"h": 30,
|
||||||
"quads": [
|
"quads": [
|
||||||
[
|
[
|
||||||
[
|
[
|
||||||
|
|||||||
BIN
fonts/ComicRelief-Bold.ttf
Executable file
BIN
fonts/ComicRelief-Bold.ttf
Executable file
Binary file not shown.
BIN
fonts/ComicRelief-Regular.ttf
Executable file
BIN
fonts/ComicRelief-Regular.ttf
Executable file
Binary file not shown.
@@ -18,28 +18,100 @@ FONT_FALLBACK = "/System/Library/Fonts/Helvetica.ttc"
|
|||||||
FONT_COLOR = (0, 0, 0)
|
FONT_COLOR = (0, 0, 0)
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────
|
||||||
|
# WORD-ONLY WRAP
|
||||||
|
#
|
||||||
|
# Breaks ONLY at space boundaries.
|
||||||
|
# Returns (lines, overflow) where overflow=True
|
||||||
|
# means a single word is wider than max_w at
|
||||||
|
# this font size → caller must try smaller.
|
||||||
|
# ─────────────────────────────────────────────
|
||||||
|
def wrap_text_words(draw, text, max_w, font):
    """
    Word-wraps text to fit within max_w pixels.
    Never inserts hyphens or breaks mid-word.

    A word that is wider than max_w on its own cannot be wrapped; it is
    placed on a line by itself and reported through the overflow flag.
    Wrapping continues past such a word, so the returned lines always
    contain every word of the input. A caller that renders the result
    as a last resort (ignoring the flag) therefore never loses text.

    Args:
        draw  : object exposing textbbox((x, y), s, font=...)
                returning (left, top, right, bottom)
        text  : string to wrap; split on any whitespace
        max_w : maximum line width in pixels
        font  : font object passed through to draw.textbbox

    Returns:
        (lines, overflow)
        lines    : list of strings holding all words in order; each
                   line is ≤ max_w px wide unless it consists of a
                   single oversized word
        overflow : True if any single word exceeds max_w
    """
    def measure(s):
        left, _, right, _ = draw.textbbox((0, 0), s, font=font)
        return right - left

    lines = []
    current = ""
    overflow = False

    for word in text.split():
        if measure(word) > max_w:
            # The word alone is too wide: flag it, flush whatever was
            # pending, and give the word its own line instead of
            # discarding it together with the rest of the text.
            overflow = True
            if current:
                lines.append(current)
                current = ""
            lines.append(word)
            continue

        candidate = f"{current} {word}" if current else word
        if measure(candidate) <= max_w:
            current = candidate
        else:
            # candidate only fails when current is non-empty (the word
            # alone fits), but guard anyway to never emit "" lines.
            if current:
                lines.append(current)
            current = word

    if current:
        lines.append(current)

    return lines, overflow
|
||||||
|
|
||||||
|
|
||||||
# ─────────────────────────────────────────────
|
# ─────────────────────────────────────────────
|
||||||
# PARSE output.txt
|
# PARSE output.txt
|
||||||
# ─────────────────────────────────────────────
|
# ─────────────────────────────────────────────
|
||||||
def parse_translations(filepath):
|
def parse_translations(filepath):
|
||||||
"""
|
"""
|
||||||
Parses output.txt → {bubble_id: translated_text}.
|
Parses output.txt → {bubble_id: translated_text}.
|
||||||
Only bubbles present in the file are returned.
|
Uses header line as column ruler to find the exact
|
||||||
Absent IDs are left completely untouched on the page.
|
char position of the TRANSLATED column.
|
||||||
|
Immune to commas, ellipses, spaces in translated text.
|
||||||
"""
|
"""
|
||||||
translations = {}
|
translations = {}
|
||||||
|
header_pos = None
|
||||||
|
|
||||||
with open(filepath, "r", encoding="utf-8") as f:
|
with open(filepath, "r", encoding="utf-8") as f:
|
||||||
for line in f:
|
lines = f.readlines()
|
||||||
line = line.rstrip("\n")
|
|
||||||
|
for raw_line in lines:
|
||||||
|
line = raw_line.rstrip("\n")
|
||||||
|
|
||||||
|
if re.match(r"^BUBBLE\s+ORIGINAL", line):
|
||||||
|
m = re.search(r"TRANSLATED", line)
|
||||||
|
if m:
|
||||||
|
header_pos = m.start()
|
||||||
|
print(f" ℹ️ TRANSLATED column at char {header_pos}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
stripped = line.strip()
|
||||||
|
if re.match(r"^[─\-=]{3,}$", stripped):
|
||||||
|
continue
|
||||||
|
if stripped.startswith("✅") or stripped.startswith("Done"):
|
||||||
|
continue
|
||||||
if not re.match(r"^\s*#\d+", line):
|
if not re.match(r"^\s*#\d+", line):
|
||||||
continue
|
continue
|
||||||
parts = re.split(r" {2,}", line.strip())
|
|
||||||
if len(parts) < 3:
|
m_id = re.match(r"^\s*#(\d+)", line)
|
||||||
|
if not m_id:
|
||||||
continue
|
continue
|
||||||
bubble_id = int(re.sub(r"[^0-9]", "", parts[0]))
|
bubble_id = int(m_id.group(1))
|
||||||
translated = parts[-1].strip()
|
|
||||||
if translated.startswith("["):
|
if header_pos is not None and len(line) > header_pos:
|
||||||
|
translated = line[header_pos:].strip()
|
||||||
|
else:
|
||||||
|
parts = re.split(r" {2,}", stripped)
|
||||||
|
translated = parts[-1].strip() if len(parts) >= 3 else ""
|
||||||
|
|
||||||
|
if not translated or translated.startswith("["):
|
||||||
|
print(f" ⚠️ #{bubble_id}: no translation found")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
translations[bubble_id] = translated
|
translations[bubble_id] = translated
|
||||||
|
|
||||||
print(f" ✅ {len(translations)} bubble(s) to translate: "
|
print(f" ✅ {len(translations)} bubble(s) to translate: "
|
||||||
@@ -67,11 +139,6 @@ def load_bubble_boxes(filepath):
|
|||||||
# SAMPLE BACKGROUND COLOR
|
# SAMPLE BACKGROUND COLOR
|
||||||
# ─────────────────────────────────────────────
|
# ─────────────────────────────────────────────
|
||||||
def sample_bubble_background(cv_image, bubble_data):
|
def sample_bubble_background(cv_image, bubble_data):
|
||||||
"""
|
|
||||||
Samples the dominant background color inside the bbox
|
|
||||||
by averaging the brightest 10% of pixels.
|
|
||||||
Returns (B, G, R).
|
|
||||||
"""
|
|
||||||
x = max(0, bubble_data["x"])
|
x = max(0, bubble_data["x"])
|
||||||
y = max(0, bubble_data["y"])
|
y = max(0, bubble_data["y"])
|
||||||
x2 = min(cv_image.shape[1], x + bubble_data["w"])
|
x2 = min(cv_image.shape[1], x + bubble_data["w"])
|
||||||
@@ -92,21 +159,9 @@ def sample_bubble_background(cv_image, bubble_data):
|
|||||||
|
|
||||||
# ─────────────────────────────────────────────
|
# ─────────────────────────────────────────────
|
||||||
# ERASE ORIGINAL TEXT
|
# ERASE ORIGINAL TEXT
|
||||||
# Fills the tight OCR bbox with the sampled
|
|
||||||
# background color. No extra expansion —
|
|
||||||
# the bbox from bubbles.json is already the
|
|
||||||
# exact size of the red squares.
|
|
||||||
# ─────────────────────────────────────────────
|
# ─────────────────────────────────────────────
|
||||||
def erase_bubble_text(cv_image, bubble_data,
|
def erase_bubble_text(cv_image, bubble_data,
|
||||||
bg_color=(255, 255, 255)):
|
bg_color=(255, 255, 255)):
|
||||||
"""
|
|
||||||
Fills the bubble bounding box with bg_color.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
cv_image : BGR numpy array (modified in place)
|
|
||||||
bubble_data : Dict with 'x','y','w','h'
|
|
||||||
bg_color : (B,G,R) fill color
|
|
||||||
"""
|
|
||||||
img_h, img_w = cv_image.shape[:2]
|
img_h, img_w = cv_image.shape[:2]
|
||||||
x = max(0, bubble_data["x"])
|
x = max(0, bubble_data["x"])
|
||||||
y = max(0, bubble_data["y"])
|
y = max(0, bubble_data["y"])
|
||||||
@@ -116,14 +171,61 @@ def erase_bubble_text(cv_image, bubble_data,
|
|||||||
|
|
||||||
|
|
||||||
# ─────────────────────────────────────────────
|
# ─────────────────────────────────────────────
|
||||||
# FIT FONT SIZE
|
# LINE HEIGHT (tight)
|
||||||
|
#
|
||||||
|
# Uses the rendered glyph bounds of "Ay" (ascender + descender)
|
||||||
|
# at the given size, with a minimal 1px gap.
|
||||||
|
# Much tighter than the old flat "+2" approach.
|
||||||
# ─────────────────────────────────────────────
|
# ─────────────────────────────────────────────
|
||||||
|
def get_line_height(draw, font):
    """
    Computes the per-line pixel height used when stacking text lines.

    The height is taken from the bounding box of the sample string
    "Ay" (one tall glyph, one glyph with a descender) as reported by
    draw.textbbox, with 1px added as spacing between lines.

    Args:
        draw : object exposing textbbox((x, y), s, font=...)
        font : font object passed through to draw.textbbox

    Returns:
        Line height in pixels (bbox height of "Ay" + 1).
    """
    _, top, _, bottom = draw.textbbox((0, 0), "Ay", font=font)
    glyph_span = bottom - top
    return glyph_span + 1
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────
|
||||||
|
# FIT FONT SIZE (dynamic ceiling, word-wrap)
|
||||||
|
#
|
||||||
|
# max_size is derived from the box itself:
|
||||||
|
# min(MAX_FONT_CAP, inner_h)
|
||||||
|
# so a tall box can use a large font and a
|
||||||
|
# small box won't waste iterations on huge sizes.
|
||||||
|
#
|
||||||
|
# Rejects a size if:
|
||||||
|
# • any single word is wider than inner_w, OR
|
||||||
|
# • total wrapped height exceeds inner_h
|
||||||
|
# ─────────────────────────────────────────────
|
||||||
|
MAX_FONT_CAP = 120   # hard upper bound on the font size fit_font_size will try, however tall the box is
|
||||||
|
|
||||||
def fit_font_size(draw, text, max_w, max_h, font_path,
|
def fit_font_size(draw, text, max_w, max_h, font_path,
|
||||||
min_size=7, max_size=48):
|
min_size=7):
|
||||||
"""
|
"""
|
||||||
Finds the largest font size where word-wrapped text
|
Finds the largest font size where word-wrapped text
|
||||||
fits inside (max_w × max_h).
|
fits inside max_w × max_h with NO mid-word breaking.
|
||||||
|
|
||||||
|
max_size is computed dynamically as min(MAX_FONT_CAP, max_h)
|
||||||
|
so the search always starts from a sensible upper bound
|
||||||
|
relative to the actual box height.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
draw : ImageDraw instance
|
||||||
|
text : Full text string
|
||||||
|
max_w : Available width in pixels
|
||||||
|
max_h : Available height in pixels
|
||||||
|
font_path : Path to .ttf (or None for PIL default)
|
||||||
|
min_size : Minimum font pt (default: 7)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
(font, lines)
|
||||||
"""
|
"""
|
||||||
|
# Dynamic ceiling: no point trying a font taller than the box
|
||||||
|
max_size = min(MAX_FONT_CAP, max_h)
|
||||||
|
max_size = max(max_size, min_size) # safety: never below min
|
||||||
|
|
||||||
best_font = None
|
best_font = None
|
||||||
best_lines = [text]
|
best_lines = [text]
|
||||||
|
|
||||||
@@ -134,38 +236,48 @@ def fit_font_size(draw, text, max_w, max_h, font_path,
|
|||||||
except Exception:
|
except Exception:
|
||||||
font = ImageFont.load_default()
|
font = ImageFont.load_default()
|
||||||
|
|
||||||
words, lines, current = text.split(), [], ""
|
lines, overflow = wrap_text_words(draw, text, max_w, font)
|
||||||
for word in words:
|
|
||||||
test = (current + " " + word).strip()
|
|
||||||
bb = draw.textbbox((0, 0), test, font=font)
|
|
||||||
if (bb[2] - bb[0]) <= max_w:
|
|
||||||
current = test
|
|
||||||
else:
|
|
||||||
if current:
|
|
||||||
lines.append(current)
|
|
||||||
current = word
|
|
||||||
if current:
|
|
||||||
lines.append(current)
|
|
||||||
|
|
||||||
lh = draw.textbbox((0, 0), "Ay", font=font)
|
if overflow:
|
||||||
line_h = (lh[3] - lh[1]) + 2
|
continue # a word is wider than the box → too big
|
||||||
if line_h * len(lines) <= max_h:
|
|
||||||
|
line_h = get_line_height(draw, font)
|
||||||
|
total_h = line_h * len(lines)
|
||||||
|
|
||||||
|
if total_h <= max_h:
|
||||||
best_font = font
|
best_font = font
|
||||||
best_lines = lines
|
best_lines = lines
|
||||||
break
|
break # largest size that fits — done
|
||||||
|
|
||||||
return best_font or ImageFont.load_default(), best_lines
|
# Guaranteed fallback at min_size
|
||||||
|
if best_font is None:
|
||||||
|
try:
|
||||||
|
best_font = (ImageFont.truetype(font_path, min_size)
|
||||||
|
if font_path else ImageFont.load_default())
|
||||||
|
except Exception:
|
||||||
|
best_font = ImageFont.load_default()
|
||||||
|
best_lines, _ = wrap_text_words(
|
||||||
|
draw, text, max_w, best_font)
|
||||||
|
if not best_lines:
|
||||||
|
best_lines = [text]
|
||||||
|
|
||||||
|
return best_font, best_lines
|
||||||
|
|
||||||
|
|
||||||
# ─────────────────────────────────────────────
|
# ─────────────────────────────────────────────
|
||||||
# RENDER TEXT INTO BUBBLE
|
# RENDER TEXT INTO BUBBLE
|
||||||
|
#
|
||||||
|
# Text is centered both horizontally and
|
||||||
|
# vertically inside the padded bbox.
|
||||||
|
# Line height uses get_line_height() (tight).
|
||||||
# ─────────────────────────────────────────────
|
# ─────────────────────────────────────────────
|
||||||
def render_text_in_bubble(pil_image, bubble_data, text,
|
def render_text_in_bubble(pil_image, bubble_data, text,
|
||||||
font_path, padding=8,
|
font_path, padding=6,
|
||||||
font_color=(0, 0, 0)):
|
font_color=(0, 0, 0)):
|
||||||
"""
|
"""
|
||||||
Renders translated text centered inside the tight bbox.
|
Renders translated text centered inside the bbox.
|
||||||
Font auto-sizes to fill the same w×h the original occupied.
|
Font auto-sizes to fill the box as much as possible.
|
||||||
|
Word-wrap only — no mid-word hyphens.
|
||||||
"""
|
"""
|
||||||
x, y = bubble_data["x"], bubble_data["y"]
|
x, y = bubble_data["x"], bubble_data["y"]
|
||||||
w, h = bubble_data["w"], bubble_data["h"]
|
w, h = bubble_data["w"], bubble_data["h"]
|
||||||
@@ -174,17 +286,20 @@ def render_text_in_bubble(pil_image, bubble_data, text,
|
|||||||
inner_w = max(1, w - padding * 2)
|
inner_w = max(1, w - padding * 2)
|
||||||
inner_h = max(1, h - padding * 2)
|
inner_h = max(1, h - padding * 2)
|
||||||
|
|
||||||
font, lines = fit_font_size(draw, text, inner_w, inner_h,
|
font, lines = fit_font_size(
|
||||||
font_path)
|
draw, text, inner_w, inner_h, font_path
|
||||||
|
)
|
||||||
|
|
||||||
lh_bb = draw.textbbox((0, 0), "Ay", font=font)
|
line_h = get_line_height(draw, font)
|
||||||
line_h = (lh_bb[3] - lh_bb[1]) + 2
|
|
||||||
total_h = line_h * len(lines)
|
total_h = line_h * len(lines)
|
||||||
|
|
||||||
|
# Center block vertically
|
||||||
start_y = y + padding + max(0, (inner_h - total_h) // 2)
|
start_y = y + padding + max(0, (inner_h - total_h) // 2)
|
||||||
|
|
||||||
for line in lines:
|
for line in lines:
|
||||||
lb = draw.textbbox((0, 0), line, font=font)
|
bb = draw.textbbox((0, 0), line, font=font)
|
||||||
line_w = lb[2] - lb[0]
|
line_w = bb[2] - bb[0]
|
||||||
|
# Center each line horizontally
|
||||||
start_x = x + padding + max(0, (inner_w - line_w) // 2)
|
start_x = x + padding + max(0, (inner_w - line_w) // 2)
|
||||||
draw.text((start_x, start_y), line,
|
draw.text((start_x, start_y), line,
|
||||||
font=font, fill=font_color)
|
font=font, fill=font_color)
|
||||||
@@ -216,19 +331,9 @@ def render_translated_page(
|
|||||||
font_path = FONT_PATH,
|
font_path = FONT_PATH,
|
||||||
font_fallback = FONT_FALLBACK,
|
font_fallback = FONT_FALLBACK,
|
||||||
font_color = FONT_COLOR,
|
font_color = FONT_COLOR,
|
||||||
text_padding = 8,
|
text_padding = 6,
|
||||||
debug = False,
|
debug = False,
|
||||||
):
|
):
|
||||||
"""
|
|
||||||
Pipeline:
|
|
||||||
1. Parse translations (only present IDs processed)
|
|
||||||
2. Load bubble boxes from bubbles.json
|
|
||||||
3. Cross-check IDs — absent ones left untouched
|
|
||||||
4. Sample background color per bubble
|
|
||||||
5. Erase original text (fill tight bbox)
|
|
||||||
6. Render translated text sized to fit the bbox
|
|
||||||
7. Save output
|
|
||||||
"""
|
|
||||||
print("=" * 55)
|
print("=" * 55)
|
||||||
print(" MANGA TRANSLATOR — RENDERER")
|
print(" MANGA TRANSLATOR — RENDERER")
|
||||||
print("=" * 55)
|
print("=" * 55)
|
||||||
@@ -344,9 +449,9 @@ if __name__ == "__main__":
|
|||||||
output_image = "page_translated.png",
|
output_image = "page_translated.png",
|
||||||
translations_file = "output.txt",
|
translations_file = "output.txt",
|
||||||
bubbles_file = "bubbles.json",
|
bubbles_file = "bubbles.json",
|
||||||
font_path = "font.ttf",
|
font_path = "fonts/ComicRelief-Regular.ttf",
|
||||||
font_fallback = "/System/Library/Fonts/Helvetica.ttc",
|
font_fallback = "/System/Library/Fonts/Helvetica.ttc",
|
||||||
font_color = (0, 0, 0),
|
font_color = (0, 0, 0),
|
||||||
text_padding = 8,
|
text_padding = 6,
|
||||||
debug = True,
|
debug = True,
|
||||||
)
|
)
|
||||||
@@ -48,41 +48,73 @@ def is_sound_effect(text):
|
|||||||
|
|
||||||
|
|
||||||
# ─────────────────────────────────────────────
|
# ─────────────────────────────────────────────
|
||||||
# TOKEN FILTER
|
# TOKEN CLASSIFIER
|
||||||
|
#
|
||||||
|
# Three categories:
|
||||||
|
# "alpha" — contains at least one letter (È, é, A-Z etc.)
|
||||||
|
# "punct" — 2+ chars with no letters (... ?? !! ?!); a lone "…" is a single symbol → noise
|
||||||
|
# "noise" — everything else (single symbols, pure digits,
|
||||||
|
# low-confidence, sound effects)
|
||||||
|
#
|
||||||
|
# Both "alpha" and "punct" tokens are KEPT:
|
||||||
|
# - "alpha" → contributes to translation text AND bbox
|
||||||
|
# - "punct" → contributes to bbox only (not translation text)
|
||||||
|
# unless it immediately follows alpha text
|
||||||
|
# in the same cluster (handled in clustering)
|
||||||
# ─────────────────────────────────────────────
|
# ─────────────────────────────────────────────
|
||||||
def _classify_with_reason(text, confidence, confidence_threshold,
                          min_text_length, filter_sound_effects):
    """
    Shared rule engine for classify_token / should_keep_token.

    Returns (category, reason):
        category : "alpha" | "punct" | "noise"
        reason   : for "noise", a short string naming the rule that
                   dropped the token; otherwise equal to category
    """
    cleaned = text.strip()

    if confidence < confidence_threshold:
        return "noise", f"low confidence ({confidence:.2f})"
    # An empty token can never be text or punctuation, even when
    # min_text_length is 0 — without this it would fall through to "punct".
    if not cleaned or len(cleaned) < min_text_length:
        return "noise", "too short"
    if re.fullmatch(r"\d+", cleaned):
        return "noise", "pure digits"
    if len(cleaned) == 1 and not cleaned.isalpha():
        return "noise", "single symbol"
    if filter_sound_effects and is_sound_effect(cleaned):
        return "noise", "sound effect"

    # 2+ chars with no letters at all → punctuation token.
    # Examples: "..." "??" "!!" "?!" ".."
    # (a lone "…" is one character and was dropped above as a single symbol)
    if not any(ch.isalpha() for ch in cleaned):
        return "punct", "punct"

    return "alpha", "alpha"


def classify_token(text, confidence, confidence_threshold,
                   min_text_length, filter_sound_effects):
    """
    Returns one of: "alpha" | "punct" | "noise"

    "alpha" : has at least one letter       → keep for text + bbox
    "punct" : 2+ chars, no letters          → keep for bbox only
    "noise" : drop entirely

    Rules (applied in order to the stripped text):
        1. Confidence below threshold           → noise
        2. Empty or shorter than min_text_length → noise
        3. Pure digit string                    → noise
        4. Single non-alpha character           → noise
        5. Sound effect, if filter enabled      → noise
        6. 2+ char string with no letters       → punct
        7. Has at least one letter              → alpha

    Letters are detected with str.isalpha(), so È, é, ñ etc. count.
    """
    category, _ = _classify_with_reason(
        text, confidence, confidence_threshold,
        min_text_length, filter_sound_effects)
    return category


def should_keep_token(text, confidence, confidence_threshold,
                      min_text_length, filter_sound_effects):
    """
    Backward-compatible wrapper around the token classifier.

    Returns (keep: bool, info: str).
        keep=True  → info is the category, "alpha" or "punct"
        keep=False → info is the drop reason, one of
                     "low confidence (x.xx)", "too short",
                     "pure digits", "single symbol", "sound effect"

    The drop reason is reported (rather than a flat "noise") so that
    callers which branch on it — e.g. to log skipped sound effects —
    keep working as they did before classify_token was introduced.
    """
    category, reason = _classify_with_reason(
        text, confidence, confidence_threshold,
        min_text_length, filter_sound_effects)
    if category == "noise":
        return False, reason
    return True, category
|
||||||
|
|
||||||
|
|
||||||
# ─────────────────────────────────────────────
|
# ─────────────────────────────────────────────
|
||||||
# BOUNDING BOX
|
# BOUNDING BOX
|
||||||
#
|
#
|
||||||
# Rules (match the red square exactly):
|
|
||||||
# Width = widest single quad's width
|
# Width = widest single quad's width
|
||||||
# Height = sum of ALL quad heights stacked
|
# Height = sum of ALL quad heights stacked
|
||||||
# X = centered on the widest quad's CX
|
# X = centered on the widest quad's CX
|
||||||
@@ -93,12 +125,11 @@ def get_cluster_bbox_from_ocr(ocr_bboxes, image_shape,
|
|||||||
"""
|
"""
|
||||||
Computes the bubble erase bbox:
|
Computes the bubble erase bbox:
|
||||||
|
|
||||||
1. Per-quad: measure w, h, cx for every OCR detection
|
1. Per-quad: measure w, h, cx
|
||||||
2. Width = width of the widest single quad
|
2. Width = width of the widest single quad
|
||||||
3. Height = sum of every quad's height
|
3. Height = sum of every quad's height
|
||||||
4. X = widest quad's center ± max_w/2
|
4. X = widest quad's center ± max_w/2
|
||||||
(all lines sit symmetrically inside)
|
5. Y = top of topmost quad → Y + total_h
|
||||||
5. Y = top of topmost quad, bottom = Y + total_h
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
ocr_bboxes : List of EasyOCR quad bboxes
|
ocr_bboxes : List of EasyOCR quad bboxes
|
||||||
@@ -113,7 +144,6 @@ def get_cluster_bbox_from_ocr(ocr_bboxes, image_shape,
|
|||||||
if not ocr_bboxes:
|
if not ocr_bboxes:
|
||||||
return 0, 0, 0, 0
|
return 0, 0, 0, 0
|
||||||
|
|
||||||
# ── Per-quad metrics ──────────────────────────────────────────
|
|
||||||
quad_metrics = []
|
quad_metrics = []
|
||||||
for quad in ocr_bboxes:
|
for quad in ocr_bboxes:
|
||||||
xs = [pt[0] for pt in quad]
|
xs = [pt[0] for pt in quad]
|
||||||
@@ -121,30 +151,23 @@ def get_cluster_bbox_from_ocr(ocr_bboxes, image_shape,
|
|||||||
qx1, qx2 = min(xs), max(xs)
|
qx1, qx2 = min(xs), max(xs)
|
||||||
qy1, qy2 = min(ys), max(ys)
|
qy1, qy2 = min(ys), max(ys)
|
||||||
quad_metrics.append({
|
quad_metrics.append({
|
||||||
"x1" : qx1,
|
"x1" : qx1, "x2" : qx2,
|
||||||
"x2" : qx2,
|
"y1" : qy1, "y2" : qy2,
|
||||||
"y1" : qy1,
|
|
||||||
"y2" : qy2,
|
|
||||||
"w" : qx2 - qx1,
|
"w" : qx2 - qx1,
|
||||||
"h" : qy2 - qy1,
|
"h" : qy2 - qy1,
|
||||||
"cx" : (qx1 + qx2) / 2.0,
|
"cx" : (qx1 + qx2) / 2.0,
|
||||||
})
|
})
|
||||||
|
|
||||||
# ── Width: widest single quad ─────────────────────────────────
|
|
||||||
widest = max(quad_metrics, key=lambda q: q["w"])
|
widest = max(quad_metrics, key=lambda q: q["w"])
|
||||||
max_w = widest["w"]
|
max_w = widest["w"]
|
||||||
center_x = widest["cx"]
|
center_x = widest["cx"]
|
||||||
|
|
||||||
# ── Height: sum of all quad heights ──────────────────────────
|
|
||||||
total_h = sum(q["h"] for q in quad_metrics)
|
total_h = sum(q["h"] for q in quad_metrics)
|
||||||
|
|
||||||
# ── Box edges ─────────────────────────────────────────────────
|
|
||||||
box_x1 = center_x - max_w / 2.0
|
box_x1 = center_x - max_w / 2.0
|
||||||
box_x2 = center_x + max_w / 2.0
|
box_x2 = center_x + max_w / 2.0
|
||||||
box_y1 = min(q["y1"] for q in quad_metrics)
|
box_y1 = min(q["y1"] for q in quad_metrics)
|
||||||
box_y2 = box_y1 + total_h
|
box_y2 = box_y1 + total_h
|
||||||
|
|
||||||
# ── Padding + clamp ───────────────────────────────────────────
|
|
||||||
x1 = max(0, box_x1 - padding_px)
|
x1 = max(0, box_x1 - padding_px)
|
||||||
y1 = max(0, box_y1 - padding_px)
|
y1 = max(0, box_y1 - padding_px)
|
||||||
x2 = min(img_w, box_x2 + padding_px)
|
x2 = min(img_w, box_x2 + padding_px)
|
||||||
@@ -171,17 +194,6 @@ def boxes_are_close(bbox_a, bbox_b, proximity_px=80):
|
|||||||
return not (ax2 < bx1 or bx2 < ax1 or ay2 < by1 or by2 < ay1)
|
return not (ax2 < bx1 or bx2 < ax1 or ay2 < by1 or by2 < ay1)
|
||||||
|
|
||||||
|
|
||||||
# ─────────────────────────────────────────────
|
|
||||||
# TEXT LINE FILTER
|
|
||||||
# ─────────────────────────────────────────────
|
|
||||||
def has_translatable_content(text):
    """
    Reports whether text holds at least one alphabetic character.
    Relies on str.isalpha(), so accented letters (È, é, ñ, ü) count.
    """
    for ch in text:
        if ch.isalpha():
            return True
    return False
|
|
||||||
|
|
||||||
|
|
||||||
# ─────────────────────────────────────────────
|
# ─────────────────────────────────────────────
|
||||||
# POST-CLUSTER MERGE (Union-Find)
|
# POST-CLUSTER MERGE (Union-Find)
|
||||||
# ─────────────────────────────────────────────
|
# ─────────────────────────────────────────────
|
||||||
@@ -270,11 +282,17 @@ def cluster_into_bubbles(ocr_results, image_shape,
|
|||||||
Pass 1 — DBSCAN on center points
|
Pass 1 — DBSCAN on center points
|
||||||
Pass 2 — Bounding-box proximity merge
|
Pass 2 — Bounding-box proximity merge
|
||||||
|
|
||||||
|
Token categories per cluster:
|
||||||
|
"alpha" tokens → translation text + bbox
|
||||||
|
"punct" tokens → bbox only (e.g. "..." after "HN")
|
||||||
|
"noise" tokens → already filtered before this function
|
||||||
|
|
||||||
Bbox: widest-line width (centered) × stacked height.
|
Bbox: widest-line width (centered) × stacked height.
|
||||||
All quads contribute to bbox regardless of content.
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
bubble_dict : cluster_id → list of translatable text lines
|
bubble_dict : cluster_id → list of text lines
|
||||||
|
(alpha tokens only, punct appended
|
||||||
|
to last alpha line if spatially adjacent)
|
||||||
bbox_dict : cluster_id → (x1, y1, x2, y2)
|
bbox_dict : cluster_id → (x1, y1, x2, y2)
|
||||||
ocr_quads : cluster_id → list of ALL raw EasyOCR quads
|
ocr_quads : cluster_id → list of ALL raw EasyOCR quads
|
||||||
"""
|
"""
|
||||||
@@ -303,6 +321,8 @@ def cluster_into_bubbles(ocr_results, image_shape,
|
|||||||
raw_clusters.setdefault(label, [])
|
raw_clusters.setdefault(label, [])
|
||||||
raw_quads.setdefault(label, [])
|
raw_quads.setdefault(label, [])
|
||||||
bbox, text, _ = ocr_results[idx]
|
bbox, text, _ = ocr_results[idx]
|
||||||
|
# Store (cy, cx, text, category)
|
||||||
|
cat = ocr_results[idx][2] # confidence stored as category below
|
||||||
raw_clusters[label].append(
|
raw_clusters[label].append(
|
||||||
(centers[idx][1], centers[idx][0], text))
|
(centers[idx][1], centers[idx][0], text))
|
||||||
raw_quads[label].append(bbox)
|
raw_quads[label].append(bbox)
|
||||||
@@ -335,15 +355,40 @@ def cluster_into_bubbles(ocr_results, image_shape,
|
|||||||
|
|
||||||
items_sorted = sorted(items, key=lambda t: t[0])
|
items_sorted = sorted(items, key=lambda t: t[0])
|
||||||
|
|
||||||
text_lines = [
|
# ── Build text lines ──────────────────────────────────────
|
||||||
text for _, _, text in items_sorted
|
# Alpha tokens become text lines.
|
||||||
if has_translatable_content(text)
|
# Punct tokens (... ?? etc.) are appended to the
|
||||||
]
|
# nearest preceding alpha token on the same Y level.
|
||||||
|
alpha_lines = [] # (cy, text) for alpha tokens
|
||||||
|
punct_tokens = [] # (cy, text) for punct tokens
|
||||||
|
|
||||||
|
for cy, cx, text in items_sorted:
|
||||||
|
if any(ch.isalpha() for ch in text):
|
||||||
|
alpha_lines.append((cy, text))
|
||||||
|
else:
|
||||||
|
punct_tokens.append((cy, text))
|
||||||
|
|
||||||
|
# Append each punct token to the closest alpha line by Y
|
||||||
|
for pcy, ptext in punct_tokens:
|
||||||
|
if alpha_lines:
|
||||||
|
# Find alpha line with closest cy
|
||||||
|
closest_idx = min(
|
||||||
|
range(len(alpha_lines)),
|
||||||
|
key=lambda k: abs(alpha_lines[k][0] - pcy)
|
||||||
|
)
|
||||||
|
cy_a, text_a = alpha_lines[closest_idx]
|
||||||
|
alpha_lines[closest_idx] = (cy_a, text_a + ptext)
|
||||||
|
# If no alpha lines at all, punct still contributes
|
||||||
|
# to bbox but not to translation text
|
||||||
|
|
||||||
|
text_lines = [t for _, t in alpha_lines]
|
||||||
|
|
||||||
|
# Fallback: if no alpha at all, keep everything
|
||||||
if not text_lines:
|
if not text_lines:
|
||||||
text_lines = [text for _, _, text in items_sorted]
|
text_lines = [text for _, _, text in items_sorted]
|
||||||
|
|
||||||
bubble_dict[i] = text_lines
|
bubble_dict[i] = text_lines
|
||||||
ocr_quads[i] = quads
|
ocr_quads[i] = quads # ALL quads → full bbox
|
||||||
|
|
||||||
bbox_dict[i] = get_cluster_bbox_from_ocr(
|
bbox_dict[i] = get_cluster_bbox_from_ocr(
|
||||||
quads, image_shape, padding_px=bbox_padding
|
quads, image_shape, padding_px=bbox_padding
|
||||||
@@ -353,7 +398,8 @@ def cluster_into_bubbles(ocr_results, image_shape,
|
|||||||
print(f" Cluster #{i}: {len(quads)} quad(s) "
|
print(f" Cluster #{i}: {len(quads)} quad(s) "
|
||||||
f"bbox=({int(b[0])},{int(b[1])})→"
|
f"bbox=({int(b[0])},{int(b[1])})→"
|
||||||
f"({int(b[2])},{int(b[3])}) "
|
f"({int(b[2])},{int(b[3])}) "
|
||||||
f"w={int(b[2]-b[0])} h={int(b[3]-b[1])}")
|
f"w={int(b[2]-b[0])} h={int(b[3]-b[1])} "
|
||||||
|
f"text={text_lines}")
|
||||||
|
|
||||||
return bubble_dict, bbox_dict, ocr_quads
|
return bubble_dict, bbox_dict, ocr_quads
|
||||||
|
|
||||||
@@ -519,15 +565,17 @@ def translate_manga_text(
|
|||||||
|
|
||||||
for bbox, text, confidence in results:
|
for bbox, text, confidence in results:
|
||||||
cleaned = text.strip()
|
cleaned = text.strip()
|
||||||
keep, reason = should_keep_token(
|
keep, category = should_keep_token(
|
||||||
cleaned, confidence,
|
cleaned, confidence,
|
||||||
confidence_threshold, min_text_length,
|
confidence_threshold, min_text_length,
|
||||||
filter_sound_effects
|
filter_sound_effects
|
||||||
)
|
)
|
||||||
if keep:
|
if keep:
|
||||||
filtered.append((bbox, cleaned, confidence))
|
filtered.append((bbox, cleaned, confidence))
|
||||||
|
if category == "punct":
|
||||||
|
print(f" ✔ Punct kept: '{cleaned}'")
|
||||||
else:
|
else:
|
||||||
if reason == "sound effect":
|
if category == "sound effect":
|
||||||
print(f" 🔇 SFX skipped: '{cleaned}'")
|
print(f" 🔇 SFX skipped: '{cleaned}'")
|
||||||
skipped += 1
|
skipped += 1
|
||||||
|
|
||||||
@@ -656,6 +704,6 @@ if __name__ == "__main__":
|
|||||||
filter_sound_effects = True,
|
filter_sound_effects = True,
|
||||||
quality_threshold = 0.5,
|
quality_threshold = 0.5,
|
||||||
upscale_factor = 2.5,
|
upscale_factor = 2.5,
|
||||||
bbox_padding = 0,
|
bbox_padding = 3,
|
||||||
debug = True,
|
debug = True,
|
||||||
)
|
)
|
||||||
Reference in New Issue
Block a user