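Add Comic Relief fonts, word-only text wrapping with dynamic font sizing, and alpha/punct/noise OCR token classification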
This commit is contained in:
48
bubbles.json
48
bubbles.json
@@ -1,9 +1,9 @@
|
||||
{
|
||||
"1": {
|
||||
"x": 204,
|
||||
"y": 137,
|
||||
"w": 153,
|
||||
"h": 82,
|
||||
"x": 201,
|
||||
"y": 134,
|
||||
"w": 159,
|
||||
"h": 88,
|
||||
"quads": [
|
||||
[
|
||||
[
|
||||
@@ -26,10 +26,10 @@
|
||||
]
|
||||
},
|
||||
"2": {
|
||||
"x": 1167,
|
||||
"y": 240,
|
||||
"w": 132,
|
||||
"h": 134,
|
||||
"x": 1164,
|
||||
"y": 237,
|
||||
"w": 138,
|
||||
"h": 140,
|
||||
"quads": [
|
||||
[
|
||||
[
|
||||
@@ -106,10 +106,10 @@
|
||||
]
|
||||
},
|
||||
"3": {
|
||||
"x": 930,
|
||||
"y": 378,
|
||||
"w": 136,
|
||||
"h": 132,
|
||||
"x": 927,
|
||||
"y": 375,
|
||||
"w": 142,
|
||||
"h": 138,
|
||||
"quads": [
|
||||
[
|
||||
[
|
||||
@@ -186,10 +186,10 @@
|
||||
]
|
||||
},
|
||||
"4": {
|
||||
"x": 220,
|
||||
"y": 486,
|
||||
"w": 150,
|
||||
"h": 210,
|
||||
"x": 217,
|
||||
"y": 483,
|
||||
"w": 156,
|
||||
"h": 216,
|
||||
"quads": [
|
||||
[
|
||||
[
|
||||
@@ -320,10 +320,10 @@
|
||||
]
|
||||
},
|
||||
"5": {
|
||||
"x": 354,
|
||||
"y": 1132,
|
||||
"w": 92,
|
||||
"h": 102,
|
||||
"x": 351,
|
||||
"y": 1129,
|
||||
"w": 98,
|
||||
"h": 108,
|
||||
"quads": [
|
||||
[
|
||||
[
|
||||
@@ -382,10 +382,10 @@
|
||||
]
|
||||
},
|
||||
"6": {
|
||||
"x": 740,
|
||||
"y": 1324,
|
||||
"w": 38,
|
||||
"h": 24,
|
||||
"x": 737,
|
||||
"y": 1321,
|
||||
"w": 44,
|
||||
"h": 30,
|
||||
"quads": [
|
||||
[
|
||||
[
|
||||
|
||||
BIN
fonts/ComicRelief-Bold.ttf
Executable file
BIN
fonts/ComicRelief-Bold.ttf
Executable file
Binary file not shown.
BIN
fonts/ComicRelief-Regular.ttf
Executable file
BIN
fonts/ComicRelief-Regular.ttf
Executable file
Binary file not shown.
@@ -18,29 +18,101 @@ FONT_FALLBACK = "/System/Library/Fonts/Helvetica.ttc"
|
||||
FONT_COLOR = (0, 0, 0)
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────
|
||||
# WORD-ONLY WRAP
|
||||
#
|
||||
# Breaks ONLY at space boundaries.
|
||||
# Returns (lines, overflow) where overflow=True
|
||||
# means a single word is wider than max_w at
|
||||
# this font size → caller must try smaller.
|
||||
# ─────────────────────────────────────────────
|
||||
def wrap_text_words(draw, text, max_w, font):
    """
    Word-wraps text to fit within max_w pixels.
    Breaks only at whitespace; never inserts hyphens or splits a word.

    Args:
        draw  : object exposing textbbox((x, y), text, font=...) whose
                result is indexed as (left, top, right, bottom)
        text  : string to wrap (split on any whitespace)
        max_w : maximum line width in pixels
        font  : font handed straight through to draw.textbbox

    Returns:
        (lines, overflow)
        lines    : list of strings containing every word of text, in
                   order. Each line is ≤ max_w px wide, except a line
                   made of one word that is itself wider than max_w.
        overflow : True if any single word exceeds max_w
    """
    def measure(s):
        bb = draw.textbbox((0, 0), s, font=font)
        return bb[2] - bb[0]

    lines = []
    current = ""
    overflow = False

    for word in text.split():
        if measure(word) > max_w:
            # The word cannot fit at this font size. Flag it so the
            # caller can retry with a smaller font, but keep wrapping:
            # stopping here would silently drop the pending line and
            # every remaining word when the caller uses `lines` anyway
            # (e.g. as a last-resort fallback at the minimum size).
            overflow = True
            if current:
                lines.append(current)
            lines.append(word)
            current = ""
            continue

        candidate = f"{current} {word}" if current else word
        if measure(candidate) <= max_w:
            current = candidate
        else:
            if current:
                lines.append(current)
            current = word

    if current:
        lines.append(current)

    return lines, overflow
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────
|
||||
# PARSE output.txt
|
||||
# ─────────────────────────────────────────────
|
||||
def parse_translations(filepath):
|
||||
"""
|
||||
Parses output.txt → {bubble_id: translated_text}.
|
||||
Only bubbles present in the file are returned.
|
||||
Absent IDs are left completely untouched on the page.
|
||||
Uses header line as column ruler to find the exact
|
||||
char position of the TRANSLATED column.
|
||||
Immune to commas, ellipses, spaces in translated text.
|
||||
"""
|
||||
translations = {}
|
||||
header_pos = None
|
||||
|
||||
with open(filepath, "r", encoding="utf-8") as f:
|
||||
for line in f:
|
||||
line = line.rstrip("\n")
|
||||
if not re.match(r"^\s*#\d+", line):
|
||||
continue
|
||||
parts = re.split(r" {2,}", line.strip())
|
||||
if len(parts) < 3:
|
||||
continue
|
||||
bubble_id = int(re.sub(r"[^0-9]", "", parts[0]))
|
||||
translated = parts[-1].strip()
|
||||
if translated.startswith("["):
|
||||
continue
|
||||
translations[bubble_id] = translated
|
||||
lines = f.readlines()
|
||||
|
||||
for raw_line in lines:
|
||||
line = raw_line.rstrip("\n")
|
||||
|
||||
if re.match(r"^BUBBLE\s+ORIGINAL", line):
|
||||
m = re.search(r"TRANSLATED", line)
|
||||
if m:
|
||||
header_pos = m.start()
|
||||
print(f" ℹ️ TRANSLATED column at char {header_pos}")
|
||||
continue
|
||||
|
||||
stripped = line.strip()
|
||||
if re.match(r"^[─\-=]{3,}$", stripped):
|
||||
continue
|
||||
if stripped.startswith("✅") or stripped.startswith("Done"):
|
||||
continue
|
||||
if not re.match(r"^\s*#\d+", line):
|
||||
continue
|
||||
|
||||
m_id = re.match(r"^\s*#(\d+)", line)
|
||||
if not m_id:
|
||||
continue
|
||||
bubble_id = int(m_id.group(1))
|
||||
|
||||
if header_pos is not None and len(line) > header_pos:
|
||||
translated = line[header_pos:].strip()
|
||||
else:
|
||||
parts = re.split(r" {2,}", stripped)
|
||||
translated = parts[-1].strip() if len(parts) >= 3 else ""
|
||||
|
||||
if not translated or translated.startswith("["):
|
||||
print(f" ⚠️ #{bubble_id}: no translation found")
|
||||
continue
|
||||
|
||||
translations[bubble_id] = translated
|
||||
|
||||
print(f" ✅ {len(translations)} bubble(s) to translate: "
|
||||
f"{sorted(translations.keys())}")
|
||||
@@ -67,11 +139,6 @@ def load_bubble_boxes(filepath):
|
||||
# SAMPLE BACKGROUND COLOR
|
||||
# ─────────────────────────────────────────────
|
||||
def sample_bubble_background(cv_image, bubble_data):
|
||||
"""
|
||||
Samples the dominant background color inside the bbox
|
||||
by averaging the brightest 10% of pixels.
|
||||
Returns (B, G, R).
|
||||
"""
|
||||
x = max(0, bubble_data["x"])
|
||||
y = max(0, bubble_data["y"])
|
||||
x2 = min(cv_image.shape[1], x + bubble_data["w"])
|
||||
@@ -92,21 +159,9 @@ def sample_bubble_background(cv_image, bubble_data):
|
||||
|
||||
# ─────────────────────────────────────────────
|
||||
# ERASE ORIGINAL TEXT
|
||||
# Fills the tight OCR bbox with the sampled
|
||||
# background color. No extra expansion —
|
||||
# the bbox from bubbles.json is already the
|
||||
# exact size of the red squares.
|
||||
# ─────────────────────────────────────────────
|
||||
def erase_bubble_text(cv_image, bubble_data,
|
||||
bg_color=(255, 255, 255)):
|
||||
"""
|
||||
Fills the bubble bounding box with bg_color.
|
||||
|
||||
Args:
|
||||
cv_image : BGR numpy array (modified in place)
|
||||
bubble_data : Dict with 'x','y','w','h'
|
||||
bg_color : (B,G,R) fill color
|
||||
"""
|
||||
img_h, img_w = cv_image.shape[:2]
|
||||
x = max(0, bubble_data["x"])
|
||||
y = max(0, bubble_data["y"])
|
||||
@@ -116,14 +171,61 @@ def erase_bubble_text(cv_image, bubble_data,
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────
|
||||
# FIT FONT SIZE
|
||||
# LINE HEIGHT (tight)
|
||||
#
|
||||
# Uses the rendered bounding box of "Ay" (one ascender,
# one descender) at the given size, plus a 1px gap.
|
||||
# Much tighter than the old flat "+2" approach.
|
||||
# ─────────────────────────────────────────────
|
||||
def get_line_height(draw, font):
    """
    Computes the per-line pixel height used when stacking text lines.

    The height is the vertical extent of the sample string "Ay" as
    reported by draw.textbbox (one tall glyph, one glyph with a
    descender), plus a single pixel of spacing.

    Args:
        draw : object exposing textbbox((x, y), text, font=...)
        font : font handed straight through to draw.textbbox

    Returns:
        Line height in pixels.
    """
    _, top, _, bottom = draw.textbbox((0, 0), "Ay", font=font)
    glyph_height = bottom - top
    return glyph_height + 1
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────
|
||||
# FIT FONT SIZE (dynamic ceiling, word-wrap)
|
||||
#
|
||||
# max_size is derived from the box itself:
|
||||
# min(MAX_FONT_CAP, inner_h)
|
||||
# so a tall box can use a large font and a
|
||||
# small box won't waste iterations on huge sizes.
|
||||
#
|
||||
# Rejects a size if:
|
||||
# • any single word is wider than inner_w, OR
|
||||
# • total wrapped height exceeds inner_h
|
||||
# ─────────────────────────────────────────────
|
||||
MAX_FONT_CAP = 120 # absolute ceiling across all boxes
|
||||
|
||||
def fit_font_size(draw, text, max_w, max_h, font_path,
|
||||
min_size=7, max_size=48):
|
||||
min_size=7):
|
||||
"""
|
||||
Finds the largest font size where word-wrapped text
|
||||
fits inside (max_w × max_h).
|
||||
fits inside max_w × max_h with NO mid-word breaking.
|
||||
|
||||
max_size is computed dynamically as min(MAX_FONT_CAP, max_h)
|
||||
so the search always starts from a sensible upper bound
|
||||
relative to the actual box height.
|
||||
|
||||
Args:
|
||||
draw : ImageDraw instance
|
||||
text : Full text string
|
||||
max_w : Available width in pixels
|
||||
max_h : Available height in pixels
|
||||
font_path : Path to .ttf (or None for PIL default)
|
||||
min_size : Minimum font pt (default: 7)
|
||||
|
||||
Returns:
|
||||
(font, lines)
|
||||
"""
|
||||
# Dynamic ceiling: no point trying a font taller than the box
|
||||
max_size = min(MAX_FONT_CAP, max_h)
|
||||
max_size = max(max_size, min_size) # safety: never below min
|
||||
|
||||
best_font = None
|
||||
best_lines = [text]
|
||||
|
||||
@@ -134,38 +236,48 @@ def fit_font_size(draw, text, max_w, max_h, font_path,
|
||||
except Exception:
|
||||
font = ImageFont.load_default()
|
||||
|
||||
words, lines, current = text.split(), [], ""
|
||||
for word in words:
|
||||
test = (current + " " + word).strip()
|
||||
bb = draw.textbbox((0, 0), test, font=font)
|
||||
if (bb[2] - bb[0]) <= max_w:
|
||||
current = test
|
||||
else:
|
||||
if current:
|
||||
lines.append(current)
|
||||
current = word
|
||||
if current:
|
||||
lines.append(current)
|
||||
lines, overflow = wrap_text_words(draw, text, max_w, font)
|
||||
|
||||
lh = draw.textbbox((0, 0), "Ay", font=font)
|
||||
line_h = (lh[3] - lh[1]) + 2
|
||||
if line_h * len(lines) <= max_h:
|
||||
if overflow:
|
||||
continue # a word is wider than the box → too big
|
||||
|
||||
line_h = get_line_height(draw, font)
|
||||
total_h = line_h * len(lines)
|
||||
|
||||
if total_h <= max_h:
|
||||
best_font = font
|
||||
best_lines = lines
|
||||
break
|
||||
break # largest size that fits — done
|
||||
|
||||
return best_font or ImageFont.load_default(), best_lines
|
||||
# Guaranteed fallback at min_size
|
||||
if best_font is None:
|
||||
try:
|
||||
best_font = (ImageFont.truetype(font_path, min_size)
|
||||
if font_path else ImageFont.load_default())
|
||||
except Exception:
|
||||
best_font = ImageFont.load_default()
|
||||
best_lines, _ = wrap_text_words(
|
||||
draw, text, max_w, best_font)
|
||||
if not best_lines:
|
||||
best_lines = [text]
|
||||
|
||||
return best_font, best_lines
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────
|
||||
# RENDER TEXT INTO BUBBLE
|
||||
#
|
||||
# Text is centered both horizontally and
|
||||
# vertically inside the padded bbox.
|
||||
# Line height uses get_line_height() (tight).
|
||||
# ─────────────────────────────────────────────
|
||||
def render_text_in_bubble(pil_image, bubble_data, text,
|
||||
font_path, padding=8,
|
||||
font_path, padding=6,
|
||||
font_color=(0, 0, 0)):
|
||||
"""
|
||||
Renders translated text centered inside the tight bbox.
|
||||
Font auto-sizes to fill the same w×h the original occupied.
|
||||
Renders translated text centered inside the bbox.
|
||||
Font auto-sizes to fill the box as much as possible.
|
||||
Word-wrap only — no mid-word hyphens.
|
||||
"""
|
||||
x, y = bubble_data["x"], bubble_data["y"]
|
||||
w, h = bubble_data["w"], bubble_data["h"]
|
||||
@@ -174,17 +286,20 @@ def render_text_in_bubble(pil_image, bubble_data, text,
|
||||
inner_w = max(1, w - padding * 2)
|
||||
inner_h = max(1, h - padding * 2)
|
||||
|
||||
font, lines = fit_font_size(draw, text, inner_w, inner_h,
|
||||
font_path)
|
||||
font, lines = fit_font_size(
|
||||
draw, text, inner_w, inner_h, font_path
|
||||
)
|
||||
|
||||
lh_bb = draw.textbbox((0, 0), "Ay", font=font)
|
||||
line_h = (lh_bb[3] - lh_bb[1]) + 2
|
||||
line_h = get_line_height(draw, font)
|
||||
total_h = line_h * len(lines)
|
||||
|
||||
# Center block vertically
|
||||
start_y = y + padding + max(0, (inner_h - total_h) // 2)
|
||||
|
||||
for line in lines:
|
||||
lb = draw.textbbox((0, 0), line, font=font)
|
||||
line_w = lb[2] - lb[0]
|
||||
bb = draw.textbbox((0, 0), line, font=font)
|
||||
line_w = bb[2] - bb[0]
|
||||
# Center each line horizontally
|
||||
start_x = x + padding + max(0, (inner_w - line_w) // 2)
|
||||
draw.text((start_x, start_y), line,
|
||||
font=font, fill=font_color)
|
||||
@@ -216,19 +331,9 @@ def render_translated_page(
|
||||
font_path = FONT_PATH,
|
||||
font_fallback = FONT_FALLBACK,
|
||||
font_color = FONT_COLOR,
|
||||
text_padding = 8,
|
||||
text_padding = 6,
|
||||
debug = False,
|
||||
):
|
||||
"""
|
||||
Pipeline:
|
||||
1. Parse translations (only present IDs processed)
|
||||
2. Load bubble boxes from bubbles.json
|
||||
3. Cross-check IDs — absent ones left untouched
|
||||
4. Sample background color per bubble
|
||||
5. Erase original text (fill tight bbox)
|
||||
6. Render translated text sized to fit the bbox
|
||||
7. Save output
|
||||
"""
|
||||
print("=" * 55)
|
||||
print(" MANGA TRANSLATOR — RENDERER")
|
||||
print("=" * 55)
|
||||
@@ -271,7 +376,7 @@ def render_translated_page(
|
||||
print("\n🎨 Sampling backgrounds...")
|
||||
bg_colors = {}
|
||||
for bid in to_process:
|
||||
bg_bgr = sample_bubble_background(
|
||||
bg_bgr = sample_bubble_background(
|
||||
cv_image, bubble_boxes[bid])
|
||||
bg_colors[bid] = bg_bgr
|
||||
bg_rgb = (bg_bgr[2], bg_bgr[1], bg_bgr[0])
|
||||
@@ -344,9 +449,9 @@ if __name__ == "__main__":
|
||||
output_image = "page_translated.png",
|
||||
translations_file = "output.txt",
|
||||
bubbles_file = "bubbles.json",
|
||||
font_path = "font.ttf",
|
||||
font_path = "fonts/ComicRelief-Regular.ttf",
|
||||
font_fallback = "/System/Library/Fonts/Helvetica.ttc",
|
||||
font_color = (0, 0, 0),
|
||||
text_padding = 8,
|
||||
text_padding = 6,
|
||||
debug = True,
|
||||
)
|
||||
)
|
||||
|
||||
@@ -48,57 +48,88 @@ def is_sound_effect(text):
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────
|
||||
# TOKEN FILTER
|
||||
# TOKEN CLASSIFIER
|
||||
#
|
||||
# Three categories:
|
||||
# "alpha" — contains at least one letter (È, é, A-Z etc.)
|
||||
#   "punct"  — 2+ chars containing no letters  (e.g. ... ?? !! ?!)
|
||||
# "noise" — everything else (single symbols, pure digits,
|
||||
# low-confidence, sound effects)
|
||||
#
|
||||
# Both "alpha" and "punct" tokens are KEPT:
|
||||
# - "alpha" → contributes to translation text AND bbox
|
||||
# - "punct" → contributes to bbox only (not translation text)
|
||||
# unless it immediately follows alpha text
|
||||
# in the same cluster (handled in clustering)
|
||||
# ─────────────────────────────────────────────
|
||||
def should_keep_token(text, confidence, confidence_threshold,
|
||||
min_text_length, filter_sound_effects):
|
||||
def _classify_with_reason(text, confidence, confidence_threshold,
                          min_text_length, filter_sound_effects):
    """
    Shared rule engine behind classify_token() and should_keep_token().

    Returns:
        (category, label)
        category : "alpha" | "punct" | "noise"
        label    : for kept tokens, the category itself; for dropped
                   tokens, the reason it was dropped
                   ("low confidence (x.xx)", "too short", "pure digits",
                   "single symbol" or "sound effect")
    """
    cleaned = text.strip()

    # Rules are evaluated in order; the first match wins.
    if confidence < confidence_threshold:
        return "noise", f"low confidence ({confidence:.2f})"
    # An empty token carries no content; without this guard it would fall
    # through to "punct" whenever min_text_length is 0 or negative.
    if not cleaned or len(cleaned) < min_text_length:
        return "noise", "too short"
    if re.fullmatch(r"\d+", cleaned):
        return "noise", "pure digits"
    if len(cleaned) == 1 and not cleaned.isalpha():
        return "noise", "single symbol"
    if filter_sound_effects and is_sound_effect(cleaned):
        return "noise", "sound effect"

    # Reaching here with no letters means 2+ characters of symbols and/or
    # digits, e.g. "..." "??" "!!" "?!" ".."
    if not any(ch.isalpha() for ch in cleaned):
        return "punct", "punct"

    return "alpha", "alpha"


def classify_token(text, confidence, confidence_threshold,
                   min_text_length, filter_sound_effects):
    """
    Returns one of: "alpha" | "punct" | "noise"

      "alpha" : has at least one letter      → keep for text + bbox
      "punct" : 2+ chars, no letters         → keep for bbox only
      "noise" : drop entirely

    Args:
        text                 : raw token text (stripped before testing)
        confidence           : confidence score of the token
        confidence_threshold : tokens scoring below this are noise
        min_text_length      : tokens shorter than this are noise
        filter_sound_effects : when true, tokens for which
                               is_sound_effect() is true are noise

    Rules (first match wins):
      1. Confidence below threshold            → noise
      2. Empty, or shorter than min_text_length → noise
      3. Pure digit string                     → noise
      4. Single non-alpha character            → noise
      5. Sound effect, if filter enabled       → noise
      6. 2+ char string with no letters        → punct
      7. Has at least one letter               → alpha
    """
    category, _ = _classify_with_reason(
        text, confidence, confidence_threshold,
        min_text_length, filter_sound_effects)
    return category


def should_keep_token(text, confidence, confidence_threshold,
                      min_text_length, filter_sound_effects):
    """
    Backward-compatible wrapper around the token classifier.

    Returns:
        (keep, label)
        keep  : True for "alpha" and "punct" tokens, False for noise
        label : "alpha" or "punct" when kept; when dropped, the reason
                string ("low confidence (x.xx)", "too short",
                "pure digits", "single symbol", "sound effect").
                Returning the reason rather than a bare "noise" keeps
                callers that test for "sound effect" working.
    """
    category, label = _classify_with_reason(
        text, confidence, confidence_threshold,
        min_text_length, filter_sound_effects)
    return category != "noise", label
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────
|
||||
# BOUNDING BOX
|
||||
#
|
||||
# Rules (match the red square exactly):
|
||||
# Width = widest single quad's width
|
||||
# Height = sum of ALL quad heights stacked
|
||||
# X = centered on the widest quad's CX
|
||||
# Y = topmost Y1 of all quads
|
||||
# Width = widest single quad's width
|
||||
# Height = sum of ALL quad heights stacked
|
||||
# X = centered on the widest quad's CX
|
||||
# Y = topmost Y1 of all quads
|
||||
# ─────────────────────────────────────────────
|
||||
def get_cluster_bbox_from_ocr(ocr_bboxes, image_shape,
|
||||
padding_px=10):
|
||||
"""
|
||||
Computes the bubble erase bbox:
|
||||
|
||||
1. Per-quad: measure w, h, cx for every OCR detection
|
||||
1. Per-quad: measure w, h, cx
|
||||
2. Width = width of the widest single quad
|
||||
3. Height = sum of every quad's height
|
||||
4. X = widest quad's center ± max_w/2
|
||||
(all lines sit symmetrically inside)
|
||||
5. Y = top of topmost quad, bottom = Y + total_h
|
||||
5. Y = top of topmost quad → Y + total_h
|
||||
|
||||
Args:
|
||||
ocr_bboxes : List of EasyOCR quad bboxes
|
||||
@@ -113,7 +144,6 @@ def get_cluster_bbox_from_ocr(ocr_bboxes, image_shape,
|
||||
if not ocr_bboxes:
|
||||
return 0, 0, 0, 0
|
||||
|
||||
# ── Per-quad metrics ──────────────────────────────────────────
|
||||
quad_metrics = []
|
||||
for quad in ocr_bboxes:
|
||||
xs = [pt[0] for pt in quad]
|
||||
@@ -121,30 +151,23 @@ def get_cluster_bbox_from_ocr(ocr_bboxes, image_shape,
|
||||
qx1, qx2 = min(xs), max(xs)
|
||||
qy1, qy2 = min(ys), max(ys)
|
||||
quad_metrics.append({
|
||||
"x1" : qx1,
|
||||
"x2" : qx2,
|
||||
"y1" : qy1,
|
||||
"y2" : qy2,
|
||||
"x1" : qx1, "x2" : qx2,
|
||||
"y1" : qy1, "y2" : qy2,
|
||||
"w" : qx2 - qx1,
|
||||
"h" : qy2 - qy1,
|
||||
"cx" : (qx1 + qx2) / 2.0,
|
||||
})
|
||||
|
||||
# ── Width: widest single quad ─────────────────────────────────
|
||||
widest = max(quad_metrics, key=lambda q: q["w"])
|
||||
max_w = widest["w"]
|
||||
center_x = widest["cx"]
|
||||
total_h = sum(q["h"] for q in quad_metrics)
|
||||
|
||||
# ── Height: sum of all quad heights ──────────────────────────
|
||||
total_h = sum(q["h"] for q in quad_metrics)
|
||||
|
||||
# ── Box edges ─────────────────────────────────────────────────
|
||||
box_x1 = center_x - max_w / 2.0
|
||||
box_x2 = center_x + max_w / 2.0
|
||||
box_y1 = min(q["y1"] for q in quad_metrics)
|
||||
box_y2 = box_y1 + total_h
|
||||
|
||||
# ── Padding + clamp ───────────────────────────────────────────
|
||||
x1 = max(0, box_x1 - padding_px)
|
||||
y1 = max(0, box_y1 - padding_px)
|
||||
x2 = min(img_w, box_x2 + padding_px)
|
||||
@@ -171,17 +194,6 @@ def boxes_are_close(bbox_a, bbox_b, proximity_px=80):
|
||||
return not (ax2 < bx1 or bx2 < ax1 or ay2 < by1 or by2 < ay1)
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────
|
||||
# TEXT LINE FILTER
|
||||
# ─────────────────────────────────────────────
|
||||
def has_translatable_content(text):
    """
    Reports whether text holds at least one alphabetic character.

    The check uses str.isalpha() on each character, so accented
    letters such as È, é, ñ and ü count as letters.

    Returns:
        True if any character is alphabetic, otherwise False
        (including for an empty string).
    """
    for ch in text:
        if ch.isalpha():
            return True
    return False
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────
|
||||
# POST-CLUSTER MERGE (Union-Find)
|
||||
# ─────────────────────────────────────────────
|
||||
@@ -270,11 +282,17 @@ def cluster_into_bubbles(ocr_results, image_shape,
|
||||
Pass 1 — DBSCAN on center points
|
||||
Pass 2 — Bounding-box proximity merge
|
||||
|
||||
Token categories per cluster:
|
||||
"alpha" tokens → translation text + bbox
|
||||
"punct" tokens → bbox only (e.g. "..." after "HN")
|
||||
"noise" tokens → already filtered before this function
|
||||
|
||||
Bbox: widest-line width (centered) × stacked height.
|
||||
All quads contribute to bbox regardless of content.
|
||||
|
||||
Returns:
|
||||
bubble_dict : cluster_id → list of translatable text lines
|
||||
bubble_dict : cluster_id → list of text lines
|
||||
(alpha tokens only, punct appended
|
||||
to last alpha line if spatially adjacent)
|
||||
bbox_dict : cluster_id → (x1, y1, x2, y2)
|
||||
ocr_quads : cluster_id → list of ALL raw EasyOCR quads
|
||||
"""
|
||||
@@ -303,6 +321,8 @@ def cluster_into_bubbles(ocr_results, image_shape,
|
||||
raw_clusters.setdefault(label, [])
|
||||
raw_quads.setdefault(label, [])
|
||||
bbox, text, _ = ocr_results[idx]
|
||||
# Store (cy, cx, text, category)
|
||||
cat = ocr_results[idx][2] # confidence stored as category below
|
||||
raw_clusters[label].append(
|
||||
(centers[idx][1], centers[idx][0], text))
|
||||
raw_quads[label].append(bbox)
|
||||
@@ -335,15 +355,40 @@ def cluster_into_bubbles(ocr_results, image_shape,
|
||||
|
||||
items_sorted = sorted(items, key=lambda t: t[0])
|
||||
|
||||
text_lines = [
|
||||
text for _, _, text in items_sorted
|
||||
if has_translatable_content(text)
|
||||
]
|
||||
# ── Build text lines ──────────────────────────────────────
|
||||
# Alpha tokens become text lines.
|
||||
# Punct tokens (... ?? etc.) are appended to the
|
||||
# nearest preceding alpha token on the same Y level.
|
||||
alpha_lines = [] # (cy, text) for alpha tokens
|
||||
punct_tokens = [] # (cy, text) for punct tokens
|
||||
|
||||
for cy, cx, text in items_sorted:
|
||||
if any(ch.isalpha() for ch in text):
|
||||
alpha_lines.append((cy, text))
|
||||
else:
|
||||
punct_tokens.append((cy, text))
|
||||
|
||||
# Append each punct token to the closest alpha line by Y
|
||||
for pcy, ptext in punct_tokens:
|
||||
if alpha_lines:
|
||||
# Find alpha line with closest cy
|
||||
closest_idx = min(
|
||||
range(len(alpha_lines)),
|
||||
key=lambda k: abs(alpha_lines[k][0] - pcy)
|
||||
)
|
||||
cy_a, text_a = alpha_lines[closest_idx]
|
||||
alpha_lines[closest_idx] = (cy_a, text_a + ptext)
|
||||
# If no alpha lines at all, punct still contributes
|
||||
# to bbox but not to translation text
|
||||
|
||||
text_lines = [t for _, t in alpha_lines]
|
||||
|
||||
# Fallback: if no alpha at all, keep everything
|
||||
if not text_lines:
|
||||
text_lines = [text for _, _, text in items_sorted]
|
||||
|
||||
bubble_dict[i] = text_lines
|
||||
ocr_quads[i] = quads
|
||||
ocr_quads[i] = quads # ALL quads → full bbox
|
||||
|
||||
bbox_dict[i] = get_cluster_bbox_from_ocr(
|
||||
quads, image_shape, padding_px=bbox_padding
|
||||
@@ -353,7 +398,8 @@ def cluster_into_bubbles(ocr_results, image_shape,
|
||||
print(f" Cluster #{i}: {len(quads)} quad(s) "
|
||||
f"bbox=({int(b[0])},{int(b[1])})→"
|
||||
f"({int(b[2])},{int(b[3])}) "
|
||||
f"w={int(b[2]-b[0])} h={int(b[3]-b[1])}")
|
||||
f"w={int(b[2]-b[0])} h={int(b[3]-b[1])} "
|
||||
f"text={text_lines}")
|
||||
|
||||
return bubble_dict, bbox_dict, ocr_quads
|
||||
|
||||
@@ -519,15 +565,17 @@ def translate_manga_text(
|
||||
|
||||
for bbox, text, confidence in results:
|
||||
cleaned = text.strip()
|
||||
keep, reason = should_keep_token(
|
||||
keep, category = should_keep_token(
|
||||
cleaned, confidence,
|
||||
confidence_threshold, min_text_length,
|
||||
filter_sound_effects
|
||||
)
|
||||
if keep:
|
||||
filtered.append((bbox, cleaned, confidence))
|
||||
if category == "punct":
|
||||
print(f" ✔ Punct kept: '{cleaned}'")
|
||||
else:
|
||||
if reason == "sound effect":
|
||||
if category == "sound effect":
|
||||
print(f" 🔇 SFX skipped: '{cleaned}'")
|
||||
skipped += 1
|
||||
|
||||
@@ -656,6 +704,6 @@ if __name__ == "__main__":
|
||||
filter_sound_effects = True,
|
||||
quality_threshold = 0.5,
|
||||
upscale_factor = 2.5,
|
||||
bbox_padding = 0,
|
||||
bbox_padding = 3,
|
||||
debug = True,
|
||||
)
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user