Added good stuff

This commit is contained in:
Guillem Hernandez Sola
2026-04-11 14:34:18 +02:00
parent 555892348f
commit 727b052e93
5 changed files with 310 additions and 157 deletions

View File

@@ -1,9 +1,9 @@
{
"1": {
"x": 204,
"y": 137,
"w": 153,
"h": 82,
"x": 201,
"y": 134,
"w": 159,
"h": 88,
"quads": [
[
[
@@ -26,10 +26,10 @@
]
},
"2": {
"x": 1167,
"y": 240,
"w": 132,
"h": 134,
"x": 1164,
"y": 237,
"w": 138,
"h": 140,
"quads": [
[
[
@@ -106,10 +106,10 @@
]
},
"3": {
"x": 930,
"y": 378,
"w": 136,
"h": 132,
"x": 927,
"y": 375,
"w": 142,
"h": 138,
"quads": [
[
[
@@ -186,10 +186,10 @@
]
},
"4": {
"x": 220,
"y": 486,
"w": 150,
"h": 210,
"x": 217,
"y": 483,
"w": 156,
"h": 216,
"quads": [
[
[
@@ -320,10 +320,10 @@
]
},
"5": {
"x": 354,
"y": 1132,
"w": 92,
"h": 102,
"x": 351,
"y": 1129,
"w": 98,
"h": 108,
"quads": [
[
[
@@ -382,10 +382,10 @@
]
},
"6": {
"x": 740,
"y": 1324,
"w": 38,
"h": 24,
"x": 737,
"y": 1321,
"w": 44,
"h": 30,
"quads": [
[
[

BIN
fonts/ComicRelief-Bold.ttf Executable file

Binary file not shown.

BIN
fonts/ComicRelief-Regular.ttf Executable file

Binary file not shown.

View File

@@ -18,29 +18,101 @@ FONT_FALLBACK = "/System/Library/Fonts/Helvetica.ttc"
FONT_COLOR = (0, 0, 0)
# ─────────────────────────────────────────────
# WORD-ONLY WRAP
#
# Breaks ONLY at space boundaries.
# Returns (lines, overflow) where overflow=True
# means a single word is wider than max_w at
# this font size → caller must try smaller.
# ─────────────────────────────────────────────
def wrap_text_words(draw, text, max_w, font):
    """
    Word-wraps text to fit within max_w pixels.
    Never inserts hyphens or breaks mid-word.

    Args:
        draw  : object exposing textbbox((x, y), s, font=...)
        text  : string to wrap (split on whitespace)
        max_w : maximum line width in pixels
        font  : font handed through to draw.textbbox

    Returns:
        (lines, overflow)
        lines    : list of strings containing every word of text,
                   in order. Each line is ≤ max_w px wide, except
                   a line holding one word that is itself wider
                   than max_w.
        overflow : True if any single word exceeds max_w
    """
    def measure(s):
        bb = draw.textbbox((0, 0), s, font=font)
        return bb[2] - bb[0]

    lines = []
    current = ""
    overflow = False

    for word in text.split():
        if measure(word) > max_w:
            # The word cannot fit at this font size. Flag it, but keep
            # wrapping: place it alone on a line so that a caller which
            # ignores the flag still receives the complete text instead
            # of a silently truncated one.
            overflow = True
            if current:
                lines.append(current)
                current = ""
            lines.append(word)
            continue

        candidate = f"{current} {word}" if current else word
        if measure(candidate) <= max_w:
            current = candidate
        else:
            # candidate != word here, so current is non-empty
            lines.append(current)
            current = word

    if current:
        lines.append(current)
    return lines, overflow
# ─────────────────────────────────────────────
# PARSE output.txt
# ─────────────────────────────────────────────
def parse_translations(filepath):
"""
Parses output.txt → {bubble_id: translated_text}.
Only bubbles present in the file are returned.
Absent IDs are left completely untouched on the page.
Uses header line as column ruler to find the exact
char position of the TRANSLATED column.
Immune to commas, ellipses, spaces in translated text.
"""
translations = {}
header_pos = None
with open(filepath, "r", encoding="utf-8") as f:
for line in f:
line = line.rstrip("\n")
if not re.match(r"^\s*#\d+", line):
continue
parts = re.split(r" {2,}", line.strip())
if len(parts) < 3:
continue
bubble_id = int(re.sub(r"[^0-9]", "", parts[0]))
translated = parts[-1].strip()
if translated.startswith("["):
continue
translations[bubble_id] = translated
lines = f.readlines()
for raw_line in lines:
line = raw_line.rstrip("\n")
if re.match(r"^BUBBLE\s+ORIGINAL", line):
m = re.search(r"TRANSLATED", line)
if m:
header_pos = m.start()
print(f" TRANSLATED column at char {header_pos}")
continue
stripped = line.strip()
if re.match(r"^[─\-=]{3,}$", stripped):
continue
if stripped.startswith("") or stripped.startswith("Done"):
continue
if not re.match(r"^\s*#\d+", line):
continue
m_id = re.match(r"^\s*#(\d+)", line)
if not m_id:
continue
bubble_id = int(m_id.group(1))
if header_pos is not None and len(line) > header_pos:
translated = line[header_pos:].strip()
else:
parts = re.split(r" {2,}", stripped)
translated = parts[-1].strip() if len(parts) >= 3 else ""
if not translated or translated.startswith("["):
print(f" ⚠️ #{bubble_id}: no translation found")
continue
translations[bubble_id] = translated
print(f"{len(translations)} bubble(s) to translate: "
f"{sorted(translations.keys())}")
@@ -67,11 +139,6 @@ def load_bubble_boxes(filepath):
# SAMPLE BACKGROUND COLOR
# ─────────────────────────────────────────────
def sample_bubble_background(cv_image, bubble_data):
"""
Samples the dominant background color inside the bbox
by averaging the brightest 10% of pixels.
Returns (B, G, R).
"""
x = max(0, bubble_data["x"])
y = max(0, bubble_data["y"])
x2 = min(cv_image.shape[1], x + bubble_data["w"])
@@ -92,21 +159,9 @@ def sample_bubble_background(cv_image, bubble_data):
# ─────────────────────────────────────────────
# ERASE ORIGINAL TEXT
# Fills the tight OCR bbox with the sampled
# background color. No extra expansion —
# the bbox from bubbles.json is already the
# exact size of the red squares.
# ─────────────────────────────────────────────
def erase_bubble_text(cv_image, bubble_data,
bg_color=(255, 255, 255)):
"""
Fills the bubble bounding box with bg_color.
Args:
cv_image : BGR numpy array (modified in place)
bubble_data : Dict with 'x','y','w','h'
bg_color : (B,G,R) fill color
"""
img_h, img_w = cv_image.shape[:2]
x = max(0, bubble_data["x"])
y = max(0, bubble_data["y"])
@@ -116,14 +171,61 @@ def erase_bubble_text(cv_image, bubble_data,
# ─────────────────────────────────────────────
# FIT FONT SIZE
# LINE HEIGHT (tight)
#
# Uses actual ascender+descender of the font
# at the given size, with a minimal 1px gap.
# Much tighter than the old flat "+2" approach.
# ─────────────────────────────────────────────
def get_line_height(draw, font):
    """
    Line height in pixels for the given font.

    Measures the bounding box of the sample string "Ay" with
    draw.textbbox and returns its height plus a 1px gap.
    """
    _, top, _, bottom = draw.textbbox((0, 0), "Ay", font=font)
    return bottom - top + 1
# ─────────────────────────────────────────────
# FIT FONT SIZE (dynamic ceiling, word-wrap)
#
# max_size is derived from the box itself:
# min(MAX_FONT_CAP, inner_h)
# so a tall box can use a large font and a
# small box won't waste iterations on huge sizes.
#
# Rejects a size if:
# • any single word is wider than inner_w, OR
# • total wrapped height exceeds inner_h
# ─────────────────────────────────────────────
MAX_FONT_CAP = 120 # absolute ceiling across all boxes
def fit_font_size(draw, text, max_w, max_h, font_path,
min_size=7, max_size=48):
min_size=7):
"""
Finds the largest font size where word-wrapped text
fits inside (max_w × max_h).
fits inside max_w × max_h with NO mid-word breaking.
max_size is computed dynamically as min(MAX_FONT_CAP, max_h)
so the search always starts from a sensible upper bound
relative to the actual box height.
Args:
draw : ImageDraw instance
text : Full text string
max_w : Available width in pixels
max_h : Available height in pixels
font_path : Path to .ttf (or None for PIL default)
min_size : Minimum font pt (default: 7)
Returns:
(font, lines)
"""
# Dynamic ceiling: no point trying a font taller than the box
max_size = min(MAX_FONT_CAP, max_h)
max_size = max(max_size, min_size) # safety: never below min
best_font = None
best_lines = [text]
@@ -134,38 +236,48 @@ def fit_font_size(draw, text, max_w, max_h, font_path,
except Exception:
font = ImageFont.load_default()
words, lines, current = text.split(), [], ""
for word in words:
test = (current + " " + word).strip()
bb = draw.textbbox((0, 0), test, font=font)
if (bb[2] - bb[0]) <= max_w:
current = test
else:
if current:
lines.append(current)
current = word
if current:
lines.append(current)
lines, overflow = wrap_text_words(draw, text, max_w, font)
lh = draw.textbbox((0, 0), "Ay", font=font)
line_h = (lh[3] - lh[1]) + 2
if line_h * len(lines) <= max_h:
if overflow:
continue # a word is wider than the box → too big
line_h = get_line_height(draw, font)
total_h = line_h * len(lines)
if total_h <= max_h:
best_font = font
best_lines = lines
break
break # largest size that fits — done
return best_font or ImageFont.load_default(), best_lines
# Guaranteed fallback at min_size
if best_font is None:
try:
best_font = (ImageFont.truetype(font_path, min_size)
if font_path else ImageFont.load_default())
except Exception:
best_font = ImageFont.load_default()
best_lines, _ = wrap_text_words(
draw, text, max_w, best_font)
if not best_lines:
best_lines = [text]
return best_font, best_lines
# ─────────────────────────────────────────────
# RENDER TEXT INTO BUBBLE
#
# Text is centered both horizontally and
# vertically inside the padded bbox.
# Line height uses get_line_height() (tight).
# ─────────────────────────────────────────────
def render_text_in_bubble(pil_image, bubble_data, text,
font_path, padding=8,
font_path, padding=6,
font_color=(0, 0, 0)):
"""
Renders translated text centered inside the tight bbox.
Font auto-sizes to fill the same w×h the original occupied.
Renders translated text centered inside the bbox.
Font auto-sizes to fill the box as much as possible.
Word-wrap only — no mid-word hyphens.
"""
x, y = bubble_data["x"], bubble_data["y"]
w, h = bubble_data["w"], bubble_data["h"]
@@ -174,17 +286,20 @@ def render_text_in_bubble(pil_image, bubble_data, text,
inner_w = max(1, w - padding * 2)
inner_h = max(1, h - padding * 2)
font, lines = fit_font_size(draw, text, inner_w, inner_h,
font_path)
font, lines = fit_font_size(
draw, text, inner_w, inner_h, font_path
)
lh_bb = draw.textbbox((0, 0), "Ay", font=font)
line_h = (lh_bb[3] - lh_bb[1]) + 2
line_h = get_line_height(draw, font)
total_h = line_h * len(lines)
# Center block vertically
start_y = y + padding + max(0, (inner_h - total_h) // 2)
for line in lines:
lb = draw.textbbox((0, 0), line, font=font)
line_w = lb[2] - lb[0]
bb = draw.textbbox((0, 0), line, font=font)
line_w = bb[2] - bb[0]
# Center each line horizontally
start_x = x + padding + max(0, (inner_w - line_w) // 2)
draw.text((start_x, start_y), line,
font=font, fill=font_color)
@@ -216,19 +331,9 @@ def render_translated_page(
font_path = FONT_PATH,
font_fallback = FONT_FALLBACK,
font_color = FONT_COLOR,
text_padding = 8,
text_padding = 6,
debug = False,
):
"""
Pipeline:
1. Parse translations (only present IDs processed)
2. Load bubble boxes from bubbles.json
3. Cross-check IDs — absent ones left untouched
4. Sample background color per bubble
5. Erase original text (fill tight bbox)
6. Render translated text sized to fit the bbox
7. Save output
"""
print("=" * 55)
print(" MANGA TRANSLATOR — RENDERER")
print("=" * 55)
@@ -271,7 +376,7 @@ def render_translated_page(
print("\n🎨 Sampling backgrounds...")
bg_colors = {}
for bid in to_process:
bg_bgr = sample_bubble_background(
bg_bgr = sample_bubble_background(
cv_image, bubble_boxes[bid])
bg_colors[bid] = bg_bgr
bg_rgb = (bg_bgr[2], bg_bgr[1], bg_bgr[0])
@@ -344,9 +449,9 @@ if __name__ == "__main__":
output_image = "page_translated.png",
translations_file = "output.txt",
bubbles_file = "bubbles.json",
font_path = "font.ttf",
font_path = "fonts/ComicRelief-Regular.ttf",
font_fallback = "/System/Library/Fonts/Helvetica.ttc",
font_color = (0, 0, 0),
text_padding = 8,
text_padding = 6,
debug = True,
)
)

View File

@@ -48,57 +48,88 @@ def is_sound_effect(text):
# ─────────────────────────────────────────────
# TOKEN FILTER
# TOKEN CLASSIFIER
#
# Three categories:
# "alpha" — contains at least one letter (È, é, A-Z etc.)
# "punct" — 2+ chars, no letters (... ?? !! ?!); a lone "…" is one char → noise
# "noise" — everything else (single symbols, pure digits,
# low-confidence, sound effects)
#
# Both "alpha" and "punct" tokens are KEPT:
# - "alpha" → contributes to translation text AND bbox
# - "punct" → contributes to bbox only (not translation text)
# unless it immediately follows alpha text
# in the same cluster (handled in clustering)
# ─────────────────────────────────────────────
def classify_token(text, confidence, confidence_threshold,
                   min_text_length, filter_sound_effects):
    """
    Returns one of: "alpha" | "punct" | "noise"

    "alpha" : has at least one letter → keep for text + bbox
    "punct" : no letters, survived the noise rules → keep for bbox only
    "noise" : drop entirely

    Rules (evaluated in this order, on the stripped text):
    1. Confidence below threshold               → noise
    2. Shorter than min_text_length             → noise
    3. Pure digit string                        → noise
    4. Single non-alpha character               → noise
    5. Sound effect, if the filter is enabled   → noise
    6. No letters at all                        → punct
    7. Otherwise (at least one letter)          → alpha

    Args:
        text                 : raw OCR token text
        confidence           : OCR confidence for the token
        confidence_threshold : minimum confidence to keep a token
        min_text_length      : minimum stripped length to keep a token
        filter_sound_effects : when truthy, is_sound_effect() matches
                               are dropped
    """
    token = text.strip()

    # Short-circuit order matters: is_sound_effect() is consulted only
    # when the filter is on and no earlier rule has already fired.
    is_noise = (
        confidence < confidence_threshold
        or len(token) < min_text_length
        or re.fullmatch(r"\d+", token) is not None
        or (len(token) == 1 and not token.isalpha())
        or (filter_sound_effects and is_sound_effect(token))
    )
    if is_noise:
        return "noise"

    # No letters at all → punctuation-like token.
    # Examples: "..." "??" "!!" "?!" ".."
    # (a lone "…" is one character and was already dropped by rule 4)
    if not any(ch.isalpha() for ch in token):
        return "punct"

    return "alpha"
def should_keep_token(text, confidence, confidence_threshold,
                      min_text_length, filter_sound_effects):
    """
    Backward-compatible wrapper around classify_token().

    Returns (keep: bool, category: str).
    keep is True whenever the category is anything other than "noise".
    The second element is the classify_token() category, not a
    human-readable drop reason.
    """
    category = classify_token(
        text, confidence, confidence_threshold,
        min_text_length, filter_sound_effects,
    )
    keep = category != "noise"
    return keep, category
# ─────────────────────────────────────────────
# BOUNDING BOX
#
# Rules (match the red square exactly):
# Width = widest single quad's width
# Height = sum of ALL quad heights stacked
# X = centered on the widest quad's CX
# Y = topmost Y1 of all quads
# Width = widest single quad's width
# Height = sum of ALL quad heights stacked
# X = centered on the widest quad's CX
# Y = topmost Y1 of all quads
# ─────────────────────────────────────────────
def get_cluster_bbox_from_ocr(ocr_bboxes, image_shape,
padding_px=10):
"""
Computes the bubble erase bbox:
1. Per-quad: measure w, h, cx for every OCR detection
1. Per-quad: measure w, h, cx
2. Width = width of the widest single quad
3. Height = sum of every quad's height
4. X = widest quad's center ± max_w/2
(all lines sit symmetrically inside)
5. Y = top of topmost quad, bottom = Y + total_h
5. Y = top of topmost quad → Y + total_h
Args:
ocr_bboxes : List of EasyOCR quad bboxes
@@ -113,7 +144,6 @@ def get_cluster_bbox_from_ocr(ocr_bboxes, image_shape,
if not ocr_bboxes:
return 0, 0, 0, 0
# ── Per-quad metrics ──────────────────────────────────────────
quad_metrics = []
for quad in ocr_bboxes:
xs = [pt[0] for pt in quad]
@@ -121,30 +151,23 @@ def get_cluster_bbox_from_ocr(ocr_bboxes, image_shape,
qx1, qx2 = min(xs), max(xs)
qy1, qy2 = min(ys), max(ys)
quad_metrics.append({
"x1" : qx1,
"x2" : qx2,
"y1" : qy1,
"y2" : qy2,
"x1" : qx1, "x2" : qx2,
"y1" : qy1, "y2" : qy2,
"w" : qx2 - qx1,
"h" : qy2 - qy1,
"cx" : (qx1 + qx2) / 2.0,
})
# ── Width: widest single quad ─────────────────────────────────
widest = max(quad_metrics, key=lambda q: q["w"])
max_w = widest["w"]
center_x = widest["cx"]
total_h = sum(q["h"] for q in quad_metrics)
# ── Height: sum of all quad heights ──────────────────────────
total_h = sum(q["h"] for q in quad_metrics)
# ── Box edges ─────────────────────────────────────────────────
box_x1 = center_x - max_w / 2.0
box_x2 = center_x + max_w / 2.0
box_y1 = min(q["y1"] for q in quad_metrics)
box_y2 = box_y1 + total_h
# ── Padding + clamp ───────────────────────────────────────────
x1 = max(0, box_x1 - padding_px)
y1 = max(0, box_y1 - padding_px)
x2 = min(img_w, box_x2 + padding_px)
@@ -171,17 +194,6 @@ def boxes_are_close(bbox_a, bbox_b, proximity_px=80):
return not (ax2 < bx1 or bx2 < ax1 or ay2 < by1 or by2 < ay1)
# ─────────────────────────────────────────────
# TEXT LINE FILTER
# ─────────────────────────────────────────────
def has_translatable_content(text):
    """
    Report whether text holds at least one alphabetic character.
    Relies on str.isalpha(), so È, é, ñ, ü etc. count as letters.
    """
    for ch in text:
        if ch.isalpha():
            return True
    return False
# ─────────────────────────────────────────────
# POST-CLUSTER MERGE (Union-Find)
# ─────────────────────────────────────────────
@@ -270,11 +282,17 @@ def cluster_into_bubbles(ocr_results, image_shape,
Pass 1 — DBSCAN on center points
Pass 2 — Bounding-box proximity merge
Token categories per cluster:
"alpha" tokens → translation text + bbox
"punct" tokens → bbox only (e.g. "..." after "HN")
"noise" tokens → already filtered before this function
Bbox: widest-line width (centered) × stacked height.
All quads contribute to bbox regardless of content.
Returns:
bubble_dict : cluster_id → list of translatable text lines
bubble_dict : cluster_id → list of text lines
(alpha tokens only, punct appended
to last alpha line if spatially adjacent)
bbox_dict : cluster_id → (x1, y1, x2, y2)
ocr_quads : cluster_id → list of ALL raw EasyOCR quads
"""
@@ -303,6 +321,8 @@ def cluster_into_bubbles(ocr_results, image_shape,
raw_clusters.setdefault(label, [])
raw_quads.setdefault(label, [])
bbox, text, _ = ocr_results[idx]
# Store (cy, cx, text, category)
cat = ocr_results[idx][2] # confidence stored as category below
raw_clusters[label].append(
(centers[idx][1], centers[idx][0], text))
raw_quads[label].append(bbox)
@@ -335,15 +355,40 @@ def cluster_into_bubbles(ocr_results, image_shape,
items_sorted = sorted(items, key=lambda t: t[0])
text_lines = [
text for _, _, text in items_sorted
if has_translatable_content(text)
]
# ── Build text lines ──────────────────────────────────────
# Alpha tokens become text lines.
# Punct tokens (... ?? etc.) are appended to the
# nearest preceding alpha token on the same Y level.
alpha_lines = [] # (cy, text) for alpha tokens
punct_tokens = [] # (cy, text) for punct tokens
for cy, cx, text in items_sorted:
if any(ch.isalpha() for ch in text):
alpha_lines.append((cy, text))
else:
punct_tokens.append((cy, text))
# Append each punct token to the closest alpha line by Y
for pcy, ptext in punct_tokens:
if alpha_lines:
# Find alpha line with closest cy
closest_idx = min(
range(len(alpha_lines)),
key=lambda k: abs(alpha_lines[k][0] - pcy)
)
cy_a, text_a = alpha_lines[closest_idx]
alpha_lines[closest_idx] = (cy_a, text_a + ptext)
# If no alpha lines at all, punct still contributes
# to bbox but not to translation text
text_lines = [t for _, t in alpha_lines]
# Fallback: if no alpha at all, keep everything
if not text_lines:
text_lines = [text for _, _, text in items_sorted]
bubble_dict[i] = text_lines
ocr_quads[i] = quads
ocr_quads[i] = quads # ALL quads → full bbox
bbox_dict[i] = get_cluster_bbox_from_ocr(
quads, image_shape, padding_px=bbox_padding
@@ -353,7 +398,8 @@ def cluster_into_bubbles(ocr_results, image_shape,
print(f" Cluster #{i}: {len(quads)} quad(s) "
f"bbox=({int(b[0])},{int(b[1])})→"
f"({int(b[2])},{int(b[3])}) "
f"w={int(b[2]-b[0])} h={int(b[3]-b[1])}")
f"w={int(b[2]-b[0])} h={int(b[3]-b[1])} "
f"text={text_lines}")
return bubble_dict, bbox_dict, ocr_quads
@@ -519,15 +565,17 @@ def translate_manga_text(
for bbox, text, confidence in results:
cleaned = text.strip()
keep, reason = should_keep_token(
keep, category = should_keep_token(
cleaned, confidence,
confidence_threshold, min_text_length,
filter_sound_effects
)
if keep:
filtered.append((bbox, cleaned, confidence))
if category == "punct":
print(f" ✔ Punct kept: '{cleaned}'")
else:
if reason == "sound effect":
if category == "sound effect":
print(f" 🔇 SFX skipped: '{cleaned}'")
skipped += 1
@@ -656,6 +704,6 @@ if __name__ == "__main__":
filter_sound_effects = True,
quality_threshold = 0.5,
upscale_factor = 2.5,
bbox_padding = 0,
bbox_padding = 3,
debug = True,
)
)