Added new

This commit is contained in:
Guillem Hernandez Sola
2026-04-11 14:00:07 +02:00
parent 458915278e
commit 555892348f
3 changed files with 836 additions and 399 deletions

View File

@@ -1,38 +1,410 @@
{
"1": {
"x": 251,
"y": 149,
"w": 60,
"h": 60
"x": 204,
"y": 137,
"w": 153,
"h": 82,
"quads": [
[
[
204,
172
],
[
348,
137
],
[
358,
185
],
[
215,
220
]
]
]
},
"2": {
"x": 1202,
"y": 226,
"w": 61,
"h": 159
"x": 1167,
"y": 240,
"w": 132,
"h": 134,
"quads": [
[
[
1214,
240
],
[
1252,
240
],
[
1252,
272
],
[
1214,
272
]
],
[
[
1167,
271
],
[
1299,
271
],
[
1299,
307
],
[
1167,
307
]
],
[
[
1175,
303
],
[
1289,
303
],
[
1289,
339
],
[
1175,
339
]
],
[
[
1206,
340
],
[
1260,
340
],
[
1260,
370
],
[
1206,
370
]
]
]
},
"3": {
"x": 966,
"y": 364,
"w": 62,
"h": 156
"x": 930,
"y": 378,
"w": 136,
"h": 132,
"quads": [
[
[
930,
378
],
[
1062,
378
],
[
1062,
410
],
[
930,
410
]
],
[
[
930,
410
],
[
1066,
410
],
[
1066,
442
],
[
930,
442
]
],
[
[
954,
439
],
[
1041,
439
],
[
1041,
475
],
[
954,
475
]
],
[
[
946,
474
],
[
1050,
474
],
[
1050,
506
],
[
946,
506
]
]
]
},
"4": {
"x": 265,
"y": 471,
"w": 62,
"h": 230
"x": 220,
"y": 486,
"w": 150,
"h": 210,
"quads": [
[
[
278,
486
],
[
312,
486
],
[
312,
516
],
[
278,
516
]
],
[
[
236,
514
],
[
356,
514
],
[
356,
544
],
[
236,
544
]
],
[
[
236,
542
],
[
358,
542
],
[
358,
572
],
[
236,
572
]
],
[
[
220,
572
],
[
370,
572
],
[
370,
600
],
[
220,
600
]
],
[
[
240,
598
],
[
350,
598
],
[
350,
630
],
[
240,
630
]
],
[
[
246,
628
],
[
346,
628
],
[
346,
658
],
[
246,
658
]
],
[
[
250,
656
],
[
340,
656
],
[
340,
686
],
[
250,
686
]
]
]
},
"5": {
"x": 359,
"y": 1114,
"w": 72,
"h": 134
"x": 354,
"y": 1132,
"w": 92,
"h": 102,
"quads": [
[
[
384,
1132
],
[
418,
1132
],
[
418,
1156
],
[
384,
1156
]
],
[
[
354,
1154
],
[
446,
1154
],
[
446,
1208
],
[
354,
1208
]
],
[
[
366,
1206
],
[
412,
1206
],
[
412,
1230
],
[
366,
1230
]
]
]
},
"6": {
"x": 729,
"y": 1306,
"w": 60,
"h": 60
"x": 740,
"y": 1324,
"w": 38,
"h": 24,
"quads": [
[
[
740,
1324
],
[
778,
1324
],
[
778,
1348
],
[
740,
1348
]
]
]
}
}

View File

@@ -13,218 +13,172 @@ INPUT_IMAGE = "page.png"
OUTPUT_IMAGE = "page_translated.png"
TRANSLATIONS_FILE = "output.txt"
BUBBLES_FILE = "bubbles.json"
FONT_PATH = "font.ttf"
FONT_FALLBACK = "/System/Library/Fonts/Helvetica.ttc"
FONT_COLOR = (0, 0, 0)
BUBBLE_FILL = (255, 255, 255)
# ─────────────────────────────────────────────
# STEP 1: PARSE output.txt
# Robust parser: always takes the LAST
# whitespace-separated column as translation.
# PARSE output.txt
# ─────────────────────────────────────────────
def parse_translations(filepath):
    """
    Parses output.txt into {bubble_id: translated_text}.

    Only bubbles present in the file are returned; absent IDs
    are left completely untouched on the page.

    Args:
        filepath : Path to output.txt

    Returns:
        Dict {bubble_id (int): translated text (str)}
    """
    translations = {}
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            line = line.rstrip("\n")
            # Only lines starting with "#N" carry a translation row.
            if not re.match(r"^\s*#\d+", line):
                continue
            # Columns are separated by runs of 2+ spaces; the LAST
            # column is always the translation, which stays robust
            # even when the text itself contains single spaces.
            parts = re.split(r" {2,}", line.strip())
            if len(parts) < 3:
                continue
            bubble_id = int(re.sub(r"[^0-9]", "", parts[0]))
            translated = parts[-1].strip()
            # Bracketed entries (e.g. "[skipped]") are placeholders,
            # not real translations.
            if translated.startswith("["):
                continue
            translations[bubble_id] = translated
    print(f"{len(translations)} bubble(s) to translate: "
          f"{sorted(translations.keys())}")
    for bid, text in sorted(translations.items()):
        print(f" #{bid}: {text}")
    return translations
# ─────────────────────────────────────────────
# STEP 2: LOAD BUBBLE BOXES from bubbles.json
# These were saved by manga-translator.py
# and are guaranteed to match the clusters.
# LOAD bubbles.json
# ─────────────────────────────────────────────
def load_bubble_boxes(filepath):
    """
    Loads bubble geometry from bubbles.json.

    Expected format (as written by export_bubble_boxes):
        {
          "1": {"x": 120, "y": 45, "w": 180, "h": 210,
                "quads": [[[x, y], ...], ...]},
          ...
        }

    Args:
        filepath : Path to bubbles.json

    Returns:
        Dict {bubble_id (int): per-bubble dict with at least
        'x', 'y', 'w', 'h' keys}
    """
    with open(filepath, "r", encoding="utf-8") as f:
        raw = json.load(f)
    # JSON keys are strings; the rest of the pipeline uses int IDs.
    boxes = {int(k): v for k, v in raw.items()}
    print(f" ✅ Loaded {len(boxes)} bubble(s)")
    for bid, val in sorted(boxes.items()):
        print(f" #{bid}: ({val['x']},{val['y']}) "
              f"{val['w']}×{val['h']}px")
    return boxes
# ─────────────────────────────────────────────
# STEP 3: ERASE BUBBLE CONTENT
# Fills a rectangular region with white.
# Uses a slightly inset rect to preserve
# the bubble border.
# SAMPLE BACKGROUND COLOR
# ─────────────────────────────────────────────
def sample_bubble_background(cv_image, bubble_data):
    """
    Samples the dominant background color inside a bubble bbox
    by averaging the brightest 10% of its pixels (text ink is
    darker, so the top-brightness decile approximates the paper).

    Args:
        cv_image    : BGR numpy array (full page)
        bubble_data : Dict with 'x', 'y', 'w', 'h'

    Returns:
        (B, G, R) tuple of ints; white (255, 255, 255) when the
        clamped region is empty or no pixel passes the threshold.
    """
    x = max(0, bubble_data["x"])
    y = max(0, bubble_data["y"])
    x2 = min(cv_image.shape[1], x + bubble_data["w"])
    y2 = min(cv_image.shape[0], y + bubble_data["h"])
    region = cv_image[y:y2, x:x2]
    if region.size == 0:
        return (255, 255, 255)
    gray = cv2.cvtColor(region, cv2.COLOR_BGR2GRAY)
    threshold = np.percentile(gray, 90)
    bg_mask = gray >= threshold
    if not np.any(bg_mask):
        return (255, 255, 255)
    return tuple(int(c) for c in region[bg_mask].mean(axis=0))
# ─────────────────────────────────────────────
# ERASE ORIGINAL TEXT
# Fills the tight OCR bbox with the sampled
# background color. No extra expansion —
# the bbox from bubbles.json is already the
# exact size of the red squares.
# ─────────────────────────────────────────────
def erase_bubble_text(cv_image, bubble_data,
                      bg_color=(255, 255, 255)):
    """
    Fills the bubble bounding box with bg_color.

    No extra expansion — the bbox from bubbles.json is already
    the exact size of the detected text region.

    Args:
        cv_image    : BGR numpy array (modified in place)
        bubble_data : Dict with 'x', 'y', 'w', 'h'
        bg_color    : (B, G, R) fill color
    """
    img_h, img_w = cv_image.shape[:2]
    # Clamp the rectangle to the image bounds before slicing.
    x = max(0, bubble_data["x"])
    y = max(0, bubble_data["y"])
    x2 = min(img_w, bubble_data["x"] + bubble_data["w"])
    y2 = min(img_h, bubble_data["y"] + bubble_data["h"])
    cv_image[y:y2, x:x2] = list(bg_color)
# ─────────────────────────────────────────────
# STEP 4: FIT FONT SIZE
# Finds the largest font size where the text
# fits inside (max_w × max_h) with word wrap.
# FIT FONT SIZE
# ─────────────────────────────────────────────
def fit_font_size(draw, text, max_w, max_h, font_path,
                  min_size=7, max_size=48):
    """
    Finds the largest font size where word-wrapped text fits
    inside (max_w × max_h), scanning sizes from max_size down.

    Args:
        draw      : PIL ImageDraw instance (used for measuring)
        text      : Text string to fit
        max_w     : Available width in pixels
        max_h     : Available height in pixels
        font_path : Path to .ttf font (or None for PIL default)
        min_size  : Smallest font size to try (default: 7)
        max_size  : Largest font size to try (default: 48)

    Returns:
        (font, list_of_wrapped_lines); falls back to the PIL
        default font when nothing fits.
    """
    best_font = None
    best_lines = [text]
    for size in range(max_size, min_size - 1, -1):
        try:
            font = (ImageFont.truetype(font_path, size)
                    if font_path else ImageFont.load_default())
        except Exception:
            font = ImageFont.load_default()
        # Greedy word-wrap at this candidate size.
        words, lines, current = text.split(), [], ""
        for word in words:
            test = (current + " " + word).strip()
            bb = draw.textbbox((0, 0), test, font=font)
            if (bb[2] - bb[0]) <= max_w:
                current = test
            else:
                if current:
                    lines.append(current)
                current = word
        if current:
            lines.append(current)
        # "Ay" spans ascender + descender, so it gives a stable
        # per-line height; +2 px of leading between lines.
        lh = draw.textbbox((0, 0), "Ay", font=font)
        line_h = (lh[3] - lh[1]) + 2
        if line_h * len(lines) <= max_h:
            best_font = font
            best_lines = lines
            break
    return best_font or ImageFont.load_default(), best_lines
# ─────────────────────────────────────────────
# STEP 5: RENDER TEXT INTO BUBBLE
# Draws translated text centered inside
# the bubble bounding box.
# RENDER TEXT INTO BUBBLE
# ─────────────────────────────────────────────
def render_text_in_bubble(pil_image, x, y, w, h, text,
font_path, padding=12,
def render_text_in_bubble(pil_image, bubble_data, text,
font_path, padding=8,
font_color=(0, 0, 0)):
"""
Renders text centered (horizontally + vertically)
inside a bubble bounding box.
Args:
pil_image : PIL Image (modified in place)
x,y,w,h : Bubble bounding box
text : Translated text to render
font_path : Path to .ttf font (or None)
padding : Inner padding in pixels (default: 12)
font_color : RGB color tuple (default: black)
Renders translated text centered inside the tight bbox.
Font auto-sizes to fill the same w×h the original occupied.
"""
x, y = bubble_data["x"], bubble_data["y"]
w, h = bubble_data["w"], bubble_data["h"]
draw = ImageDraw.Draw(pil_image)
inner_w = max(1, w - padding * 2)
inner_h = max(1, h - padding * 2)
font, lines = fit_font_size(draw, text, inner_w, inner_h, font_path)
lh_bbox = draw.textbbox((0, 0), "Ay", font=font)
line_h = (lh_bbox[3] - lh_bbox[1]) + 3
font, lines = fit_font_size(draw, text, inner_w, inner_h,
font_path)
lh_bb = draw.textbbox((0, 0), "Ay", font=font)
line_h = (lh_bb[3] - lh_bb[1]) + 2
total_h = line_h * len(lines)
start_y = y + padding + max(0, (inner_h - total_h) // 2)
@@ -232,7 +186,8 @@ def render_text_in_bubble(pil_image, x, y, w, h, text,
lb = draw.textbbox((0, 0), line, font=font)
line_w = lb[2] - lb[0]
start_x = x + padding + max(0, (inner_w - line_w) // 2)
draw.text((start_x, start_y), line, font=font, fill=font_color)
draw.text((start_x, start_y), line,
font=font, fill=font_color)
start_y += line_h
@@ -244,7 +199,7 @@ def resolve_font(font_path, fallback):
print(f" ✅ Using font: {font_path}")
return font_path
if fallback and os.path.exists(fallback):
print(f" ⚠️ '{font_path}' not found → fallback: {fallback}")
print(f" ⚠️ Fallback: {fallback}")
return fallback
print(" ⚠️ No font found. Using PIL default.")
return None
@@ -261,104 +216,122 @@ def render_translated_page(
font_path = FONT_PATH,
font_fallback = FONT_FALLBACK,
font_color = FONT_COLOR,
erase_padding = 6,
text_padding = 12,
text_padding = 8,
debug = False,
):
"""
Full rendering pipeline:
1. Parse translations from output.txt
Pipeline:
1. Parse translations (only present IDs processed)
2. Load bubble boxes from bubbles.json
3. Load original manga page
4. Erase original text from each bubble
5. Render translated text into each bubble
6. Save output image
Args:
input_image : Source manga page (default: 'page.png')
output_image : Output path (default: 'page_translated.png')
translations_file : Path to output.txt (default: 'output.txt')
bubbles_file : Path to bubbles.json (default: 'bubbles.json')
font_path : Primary .ttf font path
font_fallback : Fallback font path
font_color : RGB text color (default: black)
erase_padding : Border px when erasing (default: 6)
text_padding : Inner padding for text (default: 12)
debug : Save debug_render.png (default: False)
3. Cross-check IDs — absent ones left untouched
4. Sample background color per bubble
5. Erase original text (fill tight bbox)
6. Render translated text sized to fit the bbox
7. Save output
"""
print("=" * 55)
print(" MANGA TRANSLATOR — RENDERER")
print("=" * 55)
# ── 1. Parse translations ─────────────────────────────────────────────────
print("\n📄 Parsing translations...")
translations = parse_translations(translations_file)
if not translations:
print("❌ No translations found. Aborting.")
return
# ── 2. Load bubble boxes ──────────────────────────────────────────────────
print(f"\n📦 Loading bubble boxes from {bubbles_file}...")
print(f"\n📦 Loading bubble data...")
bubble_boxes = load_bubble_boxes(bubbles_file)
if not bubble_boxes:
print("❌ No bubble boxes found. Re-run manga-translator.py first.")
print("❌ No bubble data. Re-run manga-translator.py.")
return
# ── 3. Load image ─────────────────────────────────────────────────────────
print(f"\n🖼️ Loading image: {input_image}")
translate_ids = set(translations.keys())
box_ids = set(bubble_boxes.keys())
to_process = sorted(translate_ids & box_ids)
untouched = sorted(box_ids - translate_ids)
missing = sorted(translate_ids - box_ids)
print(f"\n🔗 To process : {to_process}")
print(f" Untouched : {untouched}")
if missing:
print(f" ⚠️ In output.txt but no box: {missing}")
if not to_process:
print("❌ No matching IDs. Aborting.")
return
print(f"\n🖼️ Loading: {input_image}")
cv_image = cv2.imread(input_image)
if cv_image is None:
print(f"❌ Could not load: {input_image}")
return
print(f" Image size: {cv_image.shape[1]}×{cv_image.shape[0]}px")
print(f" {cv_image.shape[1]}×{cv_image.shape[0]}px")
# ── 4. Erase original text ────────────────────────────────────────────────
print("\n🧹 Erasing original bubble text...")
for bubble_id in sorted(translations.keys()):
if bubble_id not in bubble_boxes:
print(f" ⚠️ #{bubble_id}: no box in bubbles.json, skipping")
continue
x, y, w, h = bubble_boxes[bubble_id]
erase_bubble_rect(cv_image, x, y, w, h, padding=erase_padding)
print(f" Erased #{bubble_id} at ({x},{y}) {w}×{h}px")
# Sample backgrounds BEFORE erasing
print("\n🎨 Sampling backgrounds...")
bg_colors = {}
for bid in to_process:
bg_bgr = sample_bubble_background(
cv_image, bubble_boxes[bid])
bg_colors[bid] = bg_bgr
bg_rgb = (bg_bgr[2], bg_bgr[1], bg_bgr[0])
brightness = sum(bg_rgb) / 3
ink = "black" if brightness > 128 else "white"
print(f" #{bid}: RGB{bg_rgb} ink→{ink}")
# ── 5. Convert to PIL ─────────────────────────────────────────────────────
pil_image = Image.fromarray(cv2.cvtColor(cv_image, cv2.COLOR_BGR2RGB))
# Erase
print("\n🧹 Erasing original text...")
for bid in to_process:
bd = bubble_boxes[bid]
erase_bubble_text(cv_image, bd, bg_color=bg_colors[bid])
print(f" ✅ #{bid} ({bd['w']}×{bd['h']}px)")
pil_image = Image.fromarray(
cv2.cvtColor(cv_image, cv2.COLOR_BGR2RGB))
# ── 6. Resolve font ───────────────────────────────────────────────────────
print("\n🔤 Resolving font...")
resolved_font = resolve_font(font_path, font_fallback)
# ── 7. Render translated text ─────────────────────────────────────────────
print("\n✍️ Rendering translated text...")
for bubble_id, text in sorted(translations.items()):
if bubble_id not in bubble_boxes:
continue
x, y, w, h = bubble_boxes[bubble_id]
# Render
print("\n✍️ Rendering...")
for bid in to_process:
text = translations[bid]
bd = bubble_boxes[bid]
bg_rgb = (bg_colors[bid][2],
bg_colors[bid][1],
bg_colors[bid][0])
brightness = sum(bg_rgb) / 3
txt_color = (0, 0, 0) if brightness > 128 \
else (255, 255, 255)
render_text_in_bubble(
pil_image, x, y, w, h, text,
pil_image, bd, text,
font_path = resolved_font,
padding = text_padding,
font_color = font_color,
font_color = txt_color,
)
print(f" #{bubble_id}: '{text}' → ({x},{y}) {w}×{h}px")
print(f" #{bid}: '{text}' "
f"({bd['x']},{bd['y']}) {bd['w']}×{bd['h']}px")
# ── 8. Debug overlay ──────────────────────────────────────────────────────
if debug:
dbg = pil_image.copy()
dbg_draw = ImageDraw.Draw(dbg)
for bubble_id, (x, y, w, h) in sorted(bubble_boxes.items()):
dbg_draw.rectangle([x, y, x + w, y + h], outline=(255, 0, 0), width=2)
dbg_draw.text((x + 4, y + 4), f"#{bubble_id}", fill=(255, 0, 0))
for bid, bd in sorted(bubble_boxes.items()):
color = (0, 200, 0) if bid in translate_ids \
else (160, 160, 160)
dbg_draw.rectangle(
[bd["x"], bd["y"],
bd["x"] + bd["w"], bd["y"] + bd["h"]],
outline=color, width=2)
dbg_draw.text((bd["x"] + 3, bd["y"] + 3),
f"#{bid}", fill=color)
dbg.save("debug_render.png")
print("\n 🐛 Debug render saved → debug_render.png")
print("\n 🐛 debug_render.png saved "
"(green=translated, grey=untouched)")
# ── 9. Save output ────────────────────────────────────────────────────────
print(f"\n💾 Saving → {output_image}")
pil_image.save(output_image, "PNG")
print(f" ✅ Done! Open: {output_image}")
print(" ✅ Done!")
print("=" * 55)
@@ -366,7 +339,6 @@ def render_translated_page(
# ENTRY POINT
# ─────────────────────────────────────────────
if __name__ == "__main__":
render_translated_page(
input_image = "page.png",
output_image = "page_translated.png",
@@ -375,7 +347,6 @@ if __name__ == "__main__":
font_path = "font.ttf",
font_fallback = "/System/Library/Fonts/Helvetica.ttc",
font_color = (0, 0, 0),
erase_padding = 6,
text_padding = 12,
text_padding = 8,
debug = True,
)

View File

@@ -29,44 +29,132 @@ SUPPORTED_LANGUAGES = {
"Catalan" : "ca",
}
# ─────────────────────────────────────────────
# SOUND EFFECT FILTER
# ─────────────────────────────────────────────
SOUND_EFFECT_PATTERNS = [
r"^b+i+p+$",
r"^sha+$",
r"^ha+$",
r"^ah+$",
r"^oh+$",
r"^ugh+$",
r"^gr+$",
r"^bam+$",
r"^pow+$",
r"^crash+$",
r"^boom+$",
r"^bang+$",
r"^crack+$",
r"^whoosh+$",
r"^thud+$",
r"^snap+$",
r"^zip+$",
r"^swoosh+$",
r"^b+i+p+$", r"^sha+$", r"^ha+$", r"^ah+$",
r"^oh+$", r"^ugh+$", r"^gr+$", r"^bam+$",
r"^pow+$", r"^crash+$", r"^boom+$", r"^bang+$",
r"^crack+$", r"^whoosh+$", r"^thud+$", r"^snap+$",
r"^zip+$", r"^swoosh+$",
]
def is_sound_effect(text):
    """
    True when *text*, reduced to lowercase letters only, matches
    one of the known onomatopoeia patterns (SOUND_EFFECT_PATTERNS).
    """
    cleaned = re.sub(r"[^a-z]", "", text.strip().lower())
    return any(re.fullmatch(p, cleaned, re.IGNORECASE)
               for p in SOUND_EFFECT_PATTERNS)
# ─────────────────────────────────────────────
# BOUNDING BOX HELPERS
# TOKEN FILTER
# ─────────────────────────────────────────────
def should_keep_token(text, confidence, confidence_threshold,
                      min_text_length, filter_sound_effects):
    """
    Decides whether a single OCR detection survives filtering.

    Rejection rules, checked in order:
        1. Confidence below threshold
        2. Shorter than min_text_length
        3. Pure digit string
        4. Single non-alphabetic character
        5. Sound effect (only when the filter is enabled)

    Returns:
        (keep, reason) — keep is a bool, reason a short string
        explaining the verdict ("ok" when kept).
    """
    token = text.strip()
    reason = None
    if confidence < confidence_threshold:
        reason = f"low confidence ({confidence:.2f})"
    elif len(token) < min_text_length:
        reason = "too short"
    elif re.fullmatch(r"\d+", token):
        reason = "pure digits"
    elif len(token) == 1 and not token.isalpha():
        reason = "single symbol"
    elif filter_sound_effects and is_sound_effect(token):
        reason = "sound effect"
    if reason is not None:
        return False, reason
    return True, "ok"
# ─────────────────────────────────────────────
# BOUNDING BOX
#
# Rules (match the red square exactly):
# Width = widest single quad's width
# Height = sum of ALL quad heights stacked
# X = centered on the widest quad's CX
# Y = topmost Y1 of all quads
# ─────────────────────────────────────────────
def get_cluster_bbox_from_ocr(ocr_bboxes, image_shape,
                              padding_px=10):
    """
    Computes the bubble erase bbox from raw OCR quads.

    Geometry rules:
        width  = width of the widest single quad
        height = sum of ALL quad heights stacked
        x      = centered on the widest quad's center-x
        y      = topmost y of all quads

    Args:
        ocr_bboxes  : List of 4-point OCR quads
        image_shape : (height, width) used for clamping
        padding_px  : Expansion on each side (default: 10)

    Returns:
        (x1, y1, x2, y2) clamped to image bounds;
        (0, 0, 0, 0) when no quads are given.
    """
    img_h, img_w = image_shape[:2]
    if not ocr_bboxes:
        return 0, 0, 0, 0

    # One (w, h, cx, top) record per quad.
    metrics = []
    for quad in ocr_bboxes:
        xs = [pt[0] for pt in quad]
        ys = [pt[1] for pt in quad]
        lo_x, hi_x = min(xs), max(xs)
        lo_y, hi_y = min(ys), max(ys)
        metrics.append((hi_x - lo_x, hi_y - lo_y,
                        (lo_x + hi_x) / 2.0, lo_y))

    widest_w, _, widest_cx, _ = max(metrics, key=lambda m: m[0])
    stacked_h = sum(m[1] for m in metrics)
    top = min(m[3] for m in metrics)

    left = widest_cx - widest_w / 2.0
    right = widest_cx + widest_w / 2.0
    bottom = top + stacked_h

    # Pad, then clamp to the page.
    return (max(0, left - padding_px),
            max(0, top - padding_px),
            min(img_w, right + padding_px),
            min(img_h, bottom + padding_px))
def get_cluster_bbox(items):
"""
Returns (x1, y1, x2, y2) tight bounding box around
all (cy, cx, text) center points in a cluster.
Uses a fixed half-size approximation per text block.
"""
"""Fallback center-point bbox — used only during merge step."""
half = 30
x1 = min(cx for _, cx, _ in items) - half
y1 = min(cy for cy, _, _ in items) - half
@@ -76,10 +164,6 @@ def get_cluster_bbox(items):
def boxes_are_close(bbox_a, bbox_b, proximity_px=80):
"""
Returns True if two (x1,y1,x2,y2) boxes are within
proximity_px pixels of each other (or overlapping).
"""
ax1, ay1, ax2, ay2 = bbox_a
bx1, by1, bx2, by2 = bbox_b
ax1 -= proximity_px; ay1 -= proximity_px
@@ -87,18 +171,25 @@ def boxes_are_close(bbox_a, bbox_b, proximity_px=80):
return not (ax2 < bx1 or bx2 < ax1 or ay2 < by1 or by2 < ay1)
# ─────────────────────────────────────────────
# TEXT LINE FILTER
# ─────────────────────────────────────────────
def has_translatable_content(text):
    """
    True when *text* contains at least one letter.
    str.isalpha() covers accented characters (È, é, ñ, ü, ...).
    """
    for ch in text:
        if ch.isalpha():
            return True
    return False
# ─────────────────────────────────────────────
# POST-CLUSTER MERGE (Union-Find)
# ─────────────────────────────────────────────
def merge_nearby_clusters(raw_clusters, proximity_px=80):
"""
Merges clusters whose bounding boxes are within
proximity_px pixels of each other.
Fixes split bubbles without changing eps globally.
"""
def merge_nearby_clusters(raw_clusters, raw_quads,
proximity_px=80):
labels = list(raw_clusters.keys())
bboxes = {lbl: get_cluster_bbox(raw_clusters[lbl]) for lbl in labels}
bboxes = {lbl: get_cluster_bbox(raw_clusters[lbl])
for lbl in labels}
parent = {lbl: lbl for lbl in labels}
def find(x):
@@ -116,30 +207,23 @@ def merge_nearby_clusters(raw_clusters, proximity_px=80):
if boxes_are_close(bboxes[a], bboxes[b], proximity_px):
union(a, b)
merged = {}
merged_clusters = {}
merged_quads = {}
for lbl in labels:
root = find(lbl)
merged.setdefault(root, [])
merged[root].extend(raw_clusters[lbl])
merged_clusters.setdefault(root, [])
merged_quads.setdefault(root, [])
merged_clusters[root].extend(raw_clusters[lbl])
merged_quads[root].extend(raw_quads[lbl])
return merged
return merged_clusters, merged_quads
# ─────────────────────────────────────────────
# CROP-BASED OCR RE-READ
# ─────────────────────────────────────────────
def reread_cluster_crop(
image,
bbox,
reader,
source_lang,
padding_px=20,
upscale_factor=2.5,
):
"""
Crops a cluster region from the full image, upscales it,
and re-runs OCR for higher accuracy on small text.
"""
def reread_cluster_crop(image, bbox, reader, source_lang,
padding_px=20, upscale_factor=2.5):
img_h, img_w = image.shape[:2]
x1, y1, x2, y2 = bbox
@@ -154,13 +238,13 @@ def reread_cluster_crop(
new_w = int(crop.shape[1] * upscale_factor)
new_h = int(crop.shape[0] * upscale_factor)
upscaled = cv2.resize(crop, (new_w, new_h), interpolation=cv2.INTER_CUBIC)
upscaled = cv2.resize(crop, (new_w, new_h),
interpolation=cv2.INTER_CUBIC)
kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])
sharpened = cv2.filter2D(upscaled, -1, kernel)
temp_path = "_temp_crop_ocr.png"
cv2.imwrite(temp_path, sharpened)
try:
crop_results = reader.readtext(temp_path, paragraph=False)
finally:
@@ -171,26 +255,31 @@ def reread_cluster_crop(
return None
crop_results.sort(key=lambda r: r[0][0][1])
lines = [text.strip() for _, text, conf in crop_results if text.strip()]
lines = [t.strip() for _, t, _ in crop_results if t.strip()]
return fix_hyphens(lines) if lines else None
# ─────────────────────────────────────────────
# DBSCAN BUBBLE CLUSTERING
# ─────────────────────────────────────────────
def cluster_into_bubbles(ocr_results, eps=80, min_samples=1, proximity_px=80):
def cluster_into_bubbles(ocr_results, image_shape,
eps=80, min_samples=1,
proximity_px=80, bbox_padding=10):
"""
Two-pass clustering:
Pass 1 — DBSCAN on center points
Pass 2 — Bounding-box proximity merge
Bbox: widest-line width (centered) × stacked height.
All quads contribute to bbox regardless of content.
Returns:
bubble_dict : cluster_id → list of (cy, cx, text)
bubble_dict : cluster_id → list of translatable text lines
bbox_dict : cluster_id → (x1, y1, x2, y2)
ocr_quads : cluster_id → list of ALL raw EasyOCR quads
"""
if not ocr_results:
return {}, {}
return {}, {}, {}
centers = []
for bbox, text, confidence in ocr_results:
@@ -199,11 +288,12 @@ def cluster_into_bubbles(ocr_results, eps=80, min_samples=1, proximity_px=80):
centers.append([sum(xs) / 4, sum(ys) / 4])
centers_array = np.array(centers, dtype=np.float32)
db = DBSCAN(eps=eps, min_samples=min_samples, metric="euclidean")
db = DBSCAN(eps=eps, min_samples=min_samples,
metric="euclidean")
labels = db.fit_predict(centers_array)
raw_clusters = {}
raw_quads = {}
noise_counter = int(max(labels, default=0)) + 1
for idx, label in enumerate(labels):
@@ -211,12 +301,17 @@ def cluster_into_bubbles(ocr_results, eps=80, min_samples=1, proximity_px=80):
label = noise_counter
noise_counter += 1
raw_clusters.setdefault(label, [])
raw_quads.setdefault(label, [])
bbox, text, _ = ocr_results[idx]
raw_clusters[label].append((centers[idx][1], centers[idx][0], text))
raw_clusters[label].append(
(centers[idx][1], centers[idx][0], text))
raw_quads[label].append(bbox)
print(f" DBSCAN pass: {len(raw_clusters)} cluster(s)")
merged_clusters = merge_nearby_clusters(raw_clusters, proximity_px=proximity_px)
merged_clusters, merged_quads = merge_nearby_clusters(
raw_clusters, raw_quads, proximity_px=proximity_px
)
print(f" After merge: {len(merged_clusters)} cluster(s)")
row_band_px = 150
@@ -225,17 +320,42 @@ def cluster_into_bubbles(ocr_results, eps=80, min_samples=1, proximity_px=80):
return (min(cy for cy, cx, _ in items) // row_band_px,
min(cx for cy, cx, _ in items))
sorted_clusters = sorted(merged_clusters.values(), key=cluster_sort_key)
sorted_labels = sorted(
merged_clusters.keys(),
key=lambda lbl: cluster_sort_key(merged_clusters[lbl])
)
bubble_dict = {}
bbox_dict = {}
ocr_quads = {}
for i, lbl in enumerate(sorted_labels, start=1):
items = merged_clusters[lbl]
quads = merged_quads[lbl]
for i, items in enumerate(sorted_clusters, start=1):
items_sorted = sorted(items, key=lambda t: t[0])
bubble_dict[i] = [text for _, _, text in items_sorted]
bbox_dict[i] = get_cluster_bbox(items)
return bubble_dict, bbox_dict
text_lines = [
text for _, _, text in items_sorted
if has_translatable_content(text)
]
if not text_lines:
text_lines = [text for _, _, text in items_sorted]
bubble_dict[i] = text_lines
ocr_quads[i] = quads
bbox_dict[i] = get_cluster_bbox_from_ocr(
quads, image_shape, padding_px=bbox_padding
)
b = bbox_dict[i]
print(f" Cluster #{i}: {len(quads)} quad(s) "
f"bbox=({int(b[0])},{int(b[1])})→"
f"({int(b[2])},{int(b[3])}) "
f"w={int(b[2]-b[0])} h={int(b[3]-b[1])}")
return bubble_dict, bbox_dict, ocr_quads
# ─────────────────────────────────────────────
@@ -247,7 +367,8 @@ def fix_hyphens(lines):
merged = lines[0]
for line in lines[1:]:
line = line.strip()
merged = merged[:-1] + line if merged.endswith("-") else merged + " " + line
merged = (merged[:-1] + line if merged.endswith("-")
else merged + " " + line)
return re.sub(r" {2,}", " ", merged).strip()
@@ -268,63 +389,45 @@ def compute_auto_eps(image_path, base_eps=80, reference_width=750):
# OCR QUALITY SCORE
# ─────────────────────────────────────────────
def ocr_quality_score(text):
    """
    Returns a quality score in [0.0, 1.0] for an OCR result.
    A low score triggers a crop re-read upstream.

    Score = fraction of alphabetic characters, minus a 0.2
    penalty for each garbage pattern found, clamped to [0, 1].
    """
    if not text or len(text) < 2:
        return 0.0
    alpha_ratio = sum(1 for c in text if c.isalpha()) / len(text)
    garbage = [r",,", r"\.\.-", r"[^\w\s\'\!\?\.,-]{2,}"]
    penalty = sum(0.2 for p in garbage if re.search(p, text))
    return max(0.0, min(1.0, alpha_ratio - penalty))
# ─────────────────────────────────────────────
# BUBBLE JSON EXPORT
# Saves bbox_dict to bubbles.json so the
# renderer can load exact cluster positions.
# ─────────────────────────────────────────────
def export_bubble_boxes(bbox_dict, ocr_quads_dict,
                        filepath="bubbles.json"):
    """
    Serialises bubble geometry to a JSON file for the renderer.

    Format written:
        {
          "1": {"x": 120, "y": 45, "w": 180, "h": 210,
                "quads": [[[x, y], [x, y], [x, y], [x, y]], ...]},
          ...
        }

    Args:
        bbox_dict      : Dict {bubble_id (int): (x1, y1, x2, y2)}
        ocr_quads_dict : Dict {bubble_id (int): list of OCR quads}
        filepath       : Output path (default: 'bubbles.json')
    """
    export = {}
    for bubble_id, (x1, y1, x2, y2) in bbox_dict.items():
        quads = ocr_quads_dict.get(bubble_id, [])
        export[str(bubble_id)] = {
            "x" : int(x1),
            "y" : int(y1),
            "w" : int(x2 - x1),
            "h" : int(y2 - y1),
            # Raw quads are kept so the renderer can reconstruct
            # the exact per-line geometry if it needs to.
            "quads": [[[int(pt[0]), int(pt[1])] for pt in quad]
                      for quad in quads],
        }
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(export, f, indent=2, ensure_ascii=False)
    print(f"\n📦 Bubble boxes saved → {filepath}")
    for bid, v in export.items():
        print(f" #{bid}: ({v['x']},{v['y']}) "
              f"{v['w']}×{v['h']}px [{len(v['quads'])} quad(s)]")
# ─────────────────────────────────────────────
# DEBUG CLUSTER IMAGE
# ─────────────────────────────────────────────
def save_debug_clusters(image_path, ocr_results, bubble_dict):
def save_debug_clusters(image_path, ocr_results,
bubble_dict, bbox_dict):
image = cv2.imread(image_path)
if image is None:
return
@@ -333,7 +436,8 @@ def save_debug_clusters(image_path, ocr_results, bubble_dict):
num_bubbles = max(bubble_dict.keys(), default=1)
colors = [
tuple(int(c) for c in col)
for col in np.random.randint(50, 230, size=(num_bubbles + 2, 3))
for col in np.random.randint(50, 230,
size=(num_bubbles + 2, 3))
]
text_to_bubble = {}
@@ -345,14 +449,21 @@ def save_debug_clusters(image_path, ocr_results, bubble_dict):
bubble_id = text_to_bubble.get(text, 0)
color = colors[(bubble_id - 1) % len(colors)]
pts = np.array(bbox, dtype=np.int32)
cv2.polylines(image, [pts], isClosed=True, color=color, thickness=2)
x = int(pts[0][0])
y = max(int(pts[0][1]) - 5, 12)
cv2.putText(image, f"#{bubble_id}", (x, y),
cv2.polylines(image, [pts], isClosed=True,
color=color, thickness=1)
for bubble_id, (x1, y1, x2, y2) in bbox_dict.items():
color = colors[(bubble_id - 1) % len(colors)]
cv2.rectangle(image,
(int(x1), int(y1)),
(int(x2), int(y2)),
color, 2)
cv2.putText(image, f"BOX#{bubble_id}",
(int(x1) + 2, int(y1) + 16),
cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
cv2.imwrite("debug_clusters.png", image)
print(" 🐛 Cluster debug saved → debug_clusters.png")
print(" 🐛 debug_clusters.png saved")
# ─────────────────────────────────────────────
@@ -362,39 +473,18 @@ def translate_manga_text(
image_path,
source_lang="it",
target_lang="ca",
confidence_threshold=0.15,
confidence_threshold=0.10,
export_to_file=None,
export_bubbles_to="bubbles.json", # ← NEW: path for bubble boxes JSON
export_bubbles_to="bubbles.json",
min_text_length=2,
cluster_eps="auto",
proximity_px=80,
filter_sound_effects=True,
quality_threshold=0.5,
upscale_factor=2.5,
bbox_padding=10,
debug=False,
):
"""
Full pipeline:
OCR → filter → DBSCAN cluster → proximity merge
→ quality check → crop re-read if needed
→ fix hyphens → translate → export txt + json
Args:
image_path : Path to your image file
source_lang : Source language code (default: 'it')
target_lang : Target language code (default: 'ca')
confidence_threshold : Min OCR confidence (default: 0.15)
export_to_file : Save translations to .txt (default: None)
export_bubbles_to : Save bubble boxes to .json (default: 'bubbles.json')
min_text_length : Min characters per detection(default: 2)
cluster_eps : DBSCAN eps or 'auto' (default: 'auto')
proximity_px : Post-merge proximity px (default: 80)
filter_sound_effects : Skip onomatopoeia/SFX (default: True)
quality_threshold : Min quality score 01 (default: 0.5)
upscale_factor : Crop upscale for re-read (default: 2.5)
debug : Save debug_clusters.png (default: False)
"""
# ── 1. Resolve eps ────────────────────────────────────────────────────────
if cluster_eps == "auto":
print("Computing auto eps...")
@@ -410,54 +500,61 @@ def translate_manga_text(
# ── 3. Initialize OCR ─────────────────────────────────────────────────────
print("\nLoading OCR model...")
ocr_lang_list = ["en", "es"] if source_lang == "ca" else [source_lang]
ocr_lang_list = ["en", "es"] if source_lang == "ca" \
else [source_lang]
reader = easyocr.Reader(ocr_lang_list)
# ── 4. Initialize translator ──────────────────────────────────────────────
translator = GoogleTranslator(source=source_lang, target=target_lang)
translator = GoogleTranslator(source=source_lang,
target=target_lang)
# ── 5. Run OCR ────────────────────────────────────────────────────────────
print(f"\nRunning OCR on: {image_path}")
results = reader.readtext(image_path, paragraph=False)
print(f" Raw detections: {len(results)}")
# ── 6. Filter detections ──────────────────────────────────────────────────
# ── 6. Filter ─────────────────────────────────────────────────────────────
filtered = []
skipped = 0
for bbox, text, confidence in results:
cleaned = text.strip()
if confidence < confidence_threshold:
skipped += 1
continue
if len(cleaned) < min_text_length:
skipped += 1
continue
if re.fullmatch(r"[\d\W]+", cleaned):
skipped += 1
continue
if filter_sound_effects and is_sound_effect(cleaned):
keep, reason = should_keep_token(
cleaned, confidence,
confidence_threshold, min_text_length,
filter_sound_effects
)
if keep:
filtered.append((bbox, cleaned, confidence))
else:
if reason == "sound effect":
print(f" 🔇 SFX skipped: '{cleaned}'")
skipped += 1
continue
filtered.append((bbox, cleaned, confidence))
print(f"{len(filtered)} detection(s) kept, {skipped} skipped.\n")
print(f"{len(filtered)} kept, {skipped} skipped.\n")
if not filtered:
print("⚠️ No text detected after filtering.")
return
# ── 7. Cluster + merge ────────────────────────────────────────────────────
print(f"Clustering detections (eps={eps:.1f}px, proximity={proximity_px}px)...")
bubble_dict, bbox_dict = cluster_into_bubbles(
filtered, eps=eps, proximity_px=proximity_px
print(f"Clustering (eps={eps:.1f}px, "
f"proximity={proximity_px}px, "
f"bbox_padding={bbox_padding}px)...")
bubble_dict, bbox_dict, ocr_quads = cluster_into_bubbles(
filtered,
image_shape = full_image.shape,
eps = eps,
proximity_px = proximity_px,
bbox_padding = bbox_padding,
)
print(f"{len(bubble_dict)} bubble(s) after merge.\n")
# ── 8. Debug image ────────────────────────────────────────────────────────
# ── 8. Debug ──────────────────────────────────────────────────────────────
if debug:
save_debug_clusters(image_path, filtered, bubble_dict)
save_debug_clusters(image_path, filtered,
bubble_dict, bbox_dict)
# ── 9. Fix hyphens ────────────────────────────────────────────────────────
clean_bubbles = {
@@ -471,41 +568,39 @@ def translate_manga_text(
for i, text in clean_bubbles.items():
score = ocr_quality_score(text)
status = "" if score >= quality_threshold else "🔁"
print(f" Bubble #{i}: score={score:.2f} {status} '{text[:60]}'")
print(f" #{i}: score={score:.2f} {status} '{text[:55]}'")
if score < quality_threshold:
print(f" → Re-reading bubble #{i} from crop...")
print(f" → Re-reading #{i} from crop...")
reread = reread_cluster_crop(
full_image, bbox_dict[i], reader, source_lang,
upscale_factor=upscale_factor,
)
if reread:
print(f" Re-read result: '{reread}'")
print(f"'{reread}'")
clean_bubbles[i] = reread
else:
print(f"Re-read returned nothing, keeping original.")
print(f"Nothing found, keeping original.")
# ── 11. Translate & print ─────────────────────────────────────────────────
print()
header = f"{'BUBBLE':<8} {'ORIGINAL (Italian)':<50} {'TRANSLATED (Catalan)'}"
header = (f"{'BUBBLE':<8} "
f"{'ORIGINAL (Italian)':<50} "
f"{'TRANSLATED (Catalan)'}")
divider = "" * 105
output_lines = [header, divider]
print(header)
print(divider)
translated_count = 0
for i in sorted(clean_bubbles.keys()):
bubble_text = clean_bubbles[i].strip()
if not bubble_text:
continue
try:
translated = translator.translate(bubble_text)
except Exception as e:
translated = f"[Translation error: {e}]"
if translated is None:
translated = "[No translation returned]"
@@ -515,23 +610,22 @@ def translate_manga_text(
output_lines.append(line)
output_lines.append(divider)
summary = (
f"✅ Done! {translated_count} bubble(s) translated, "
f"{skipped} detection(s) skipped."
)
summary = (f"✅ Done! {translated_count} bubble(s) translated, "
f"{skipped} detection(s) skipped.")
output_lines.append(summary)
print(divider)
print(summary)
# ── 12. Export translations .txt ──────────────────────────────────────────
# ── 12. Export translations ───────────────────────────────────────────────
if export_to_file:
with open(export_to_file, "w", encoding="utf-8") as f:
f.write("\n".join(output_lines))
print(f"📄 Translations saved → {export_to_file}")
# ── 13. Export bubble boxes .json ─────────────────────────────────────────
# ── 13. Export bubble boxes ───────────────────────────────────────────────
if export_bubbles_to:
export_bubble_boxes(bbox_dict, filepath=export_bubbles_to)
export_bubble_boxes(bbox_dict, ocr_quads,
filepath=export_bubbles_to)
# ─────────────────────────────────────────────
@@ -549,19 +643,19 @@ def list_languages():
# ENTRY POINT
# ─────────────────────────────────────────────
if __name__ == "__main__":
translate_manga_text(
image_path = "page.png",
source_lang = "it",
target_lang = "ca",
confidence_threshold = 0.15,
confidence_threshold = 0.10,
min_text_length = 2,
export_to_file = "output.txt",
export_bubbles_to = "bubbles.json", # ← NEW
export_bubbles_to = "bubbles.json",
cluster_eps = "auto",
proximity_px = 80,
filter_sound_effects = True,
quality_threshold = 0.5,
upscale_factor = 2.5,
bbox_padding = 0,
debug = True,
)