Added new

This commit is contained in:
Guillem Hernandez Sola
2026-04-11 14:00:07 +02:00
parent 458915278e
commit 555892348f
3 changed files with 836 additions and 399 deletions

View File

@@ -1,38 +1,410 @@
{ {
"1": { "1": {
"x": 251, "x": 204,
"y": 149, "y": 137,
"w": 60, "w": 153,
"h": 60 "h": 82,
"quads": [
[
[
204,
172
],
[
348,
137
],
[
358,
185
],
[
215,
220
]
]
]
}, },
"2": { "2": {
"x": 1202, "x": 1167,
"y": 226, "y": 240,
"w": 61, "w": 132,
"h": 159 "h": 134,
"quads": [
[
[
1214,
240
],
[
1252,
240
],
[
1252,
272
],
[
1214,
272
]
],
[
[
1167,
271
],
[
1299,
271
],
[
1299,
307
],
[
1167,
307
]
],
[
[
1175,
303
],
[
1289,
303
],
[
1289,
339
],
[
1175,
339
]
],
[
[
1206,
340
],
[
1260,
340
],
[
1260,
370
],
[
1206,
370
]
]
]
}, },
"3": { "3": {
"x": 966, "x": 930,
"y": 364, "y": 378,
"w": 62, "w": 136,
"h": 156 "h": 132,
"quads": [
[
[
930,
378
],
[
1062,
378
],
[
1062,
410
],
[
930,
410
]
],
[
[
930,
410
],
[
1066,
410
],
[
1066,
442
],
[
930,
442
]
],
[
[
954,
439
],
[
1041,
439
],
[
1041,
475
],
[
954,
475
]
],
[
[
946,
474
],
[
1050,
474
],
[
1050,
506
],
[
946,
506
]
]
]
}, },
"4": { "4": {
"x": 265, "x": 220,
"y": 471, "y": 486,
"w": 62, "w": 150,
"h": 230 "h": 210,
"quads": [
[
[
278,
486
],
[
312,
486
],
[
312,
516
],
[
278,
516
]
],
[
[
236,
514
],
[
356,
514
],
[
356,
544
],
[
236,
544
]
],
[
[
236,
542
],
[
358,
542
],
[
358,
572
],
[
236,
572
]
],
[
[
220,
572
],
[
370,
572
],
[
370,
600
],
[
220,
600
]
],
[
[
240,
598
],
[
350,
598
],
[
350,
630
],
[
240,
630
]
],
[
[
246,
628
],
[
346,
628
],
[
346,
658
],
[
246,
658
]
],
[
[
250,
656
],
[
340,
656
],
[
340,
686
],
[
250,
686
]
]
]
}, },
"5": { "5": {
"x": 359, "x": 354,
"y": 1114, "y": 1132,
"w": 72, "w": 92,
"h": 134 "h": 102,
"quads": [
[
[
384,
1132
],
[
418,
1132
],
[
418,
1156
],
[
384,
1156
]
],
[
[
354,
1154
],
[
446,
1154
],
[
446,
1208
],
[
354,
1208
]
],
[
[
366,
1206
],
[
412,
1206
],
[
412,
1230
],
[
366,
1230
]
]
]
}, },
"6": { "6": {
"x": 729, "x": 740,
"y": 1306, "y": 1324,
"w": 60, "w": 38,
"h": 60 "h": 24,
"quads": [
[
[
740,
1324
],
[
778,
1324
],
[
778,
1348
],
[
740,
1348
]
]
]
} }
} }

View File

@@ -13,218 +13,172 @@ INPUT_IMAGE = "page.png"
OUTPUT_IMAGE = "page_translated.png" OUTPUT_IMAGE = "page_translated.png"
TRANSLATIONS_FILE = "output.txt" TRANSLATIONS_FILE = "output.txt"
BUBBLES_FILE = "bubbles.json" BUBBLES_FILE = "bubbles.json"
FONT_PATH = "font.ttf" FONT_PATH = "font.ttf"
FONT_FALLBACK = "/System/Library/Fonts/Helvetica.ttc" FONT_FALLBACK = "/System/Library/Fonts/Helvetica.ttc"
FONT_COLOR = (0, 0, 0) FONT_COLOR = (0, 0, 0)
BUBBLE_FILL = (255, 255, 255)
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
# STEP 1: PARSE output.txt # PARSE output.txt
# Robust parser: always takes the LAST
# whitespace-separated column as translation.
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
def parse_translations(filepath): def parse_translations(filepath):
""" """
Parses output.txt and returns {bubble_id: translated_text}. Parses output.txt {bubble_id: translated_text}.
Only bubbles present in the file are returned.
Strategy: split each #N line on 2+ consecutive spaces, Absent IDs are left completely untouched on the page.
then always take the LAST token as the translation.
This is robust even when original or translated text
contains internal spaces.
Args:
filepath : Path to output.txt
Returns:
Dict {1: "LA NOIA ESTÀ IL·LESA!", ...}
""" """
translations = {} translations = {}
with open(filepath, "r", encoding="utf-8") as f: with open(filepath, "r", encoding="utf-8") as f:
for line in f: for line in f:
line = line.rstrip("\n") line = line.rstrip("\n")
if not re.match(r"^\s*#\d+", line):
# Must start with #N
if not re.match(r"^#\d+", line.strip()):
continue continue
# Split on 2+ spaces → [bubble_id_col, original_col, translated_col]
parts = re.split(r" {2,}", line.strip()) parts = re.split(r" {2,}", line.strip())
if len(parts) < 3: if len(parts) < 3:
continue continue
bubble_id = int(re.sub(r"[^0-9]", "", parts[0])) bubble_id = int(re.sub(r"[^0-9]", "", parts[0]))
translated = parts[-1].strip() # always last column translated = parts[-1].strip()
if translated.startswith("["):
continue
translations[bubble_id] = translated translations[bubble_id] = translated
print(f" Parsed {len(translations)} translation(s) from {filepath}") print(f"{len(translations)} bubble(s) to translate: "
f"{sorted(translations.keys())}")
for bid, text in sorted(translations.items()): for bid, text in sorted(translations.items()):
print(f" #{bid}: {text}") print(f" #{bid}: {text}")
return translations return translations
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
# STEP 2: LOAD BUBBLE BOXES from bubbles.json # LOAD bubbles.json
# These were saved by manga-translator.py
# and are guaranteed to match the clusters.
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
def load_bubble_boxes(filepath): def load_bubble_boxes(filepath):
"""
Loads bubble bounding boxes from bubbles.json.
Expected format:
{
"1": {"x": 120, "y": 45, "w": 180, "h": 210},
"2": { ... },
...
}
Args:
filepath : Path to bubbles.json
Returns:
Dict {bubble_id (int): (x, y, w, h)}
"""
with open(filepath, "r", encoding="utf-8") as f: with open(filepath, "r", encoding="utf-8") as f:
raw = json.load(f) raw = json.load(f)
boxes = {int(k): v for k, v in raw.items()}
boxes = {} print(f" ✅ Loaded {len(boxes)} bubble(s)")
for key, val in raw.items(): for bid, val in sorted(boxes.items()):
bubble_id = int(key) print(f" #{bid}: ({val['x']},{val['y']}) "
boxes[bubble_id] = (val["x"], val["y"], val["w"], val["h"]) f"{val['w']}×{val['h']}px")
print(f" ✅ Loaded {len(boxes)} bubble box(es) from {filepath}")
for bid, (x, y, w, h) in sorted(boxes.items()):
print(f" #{bid}: ({x},{y}) {w}×{h}px")
return boxes return boxes
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
# STEP 3: ERASE BUBBLE CONTENT # SAMPLE BACKGROUND COLOR
# Fills a rectangular region with white.
# Uses a slightly inset rect to preserve
# the bubble border.
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
def erase_bubble_rect(image, x, y, w, h, padding=6): def sample_bubble_background(cv_image, bubble_data):
""" """
Fills the interior of a bounding box with white, Samples the dominant background color inside the bbox
leaving a border of `padding` pixels intact. by averaging the brightest 10% of pixels.
Returns (B, G, R).
"""
x = max(0, bubble_data["x"])
y = max(0, bubble_data["y"])
x2 = min(cv_image.shape[1], x + bubble_data["w"])
y2 = min(cv_image.shape[0], y + bubble_data["h"])
region = cv_image[y:y2, x:x2]
if region.size == 0:
return (255, 255, 255)
gray = cv2.cvtColor(region, cv2.COLOR_BGR2GRAY)
threshold = np.percentile(gray, 90)
bg_mask = gray >= threshold
if not np.any(bg_mask):
return (255, 255, 255)
return tuple(int(c) for c in region[bg_mask].mean(axis=0))
# ─────────────────────────────────────────────
# ERASE ORIGINAL TEXT
# Fills the tight OCR bbox with the sampled
# background color. No extra expansion —
# the bbox from bubbles.json is already the
# exact size of the red squares.
# ─────────────────────────────────────────────
def erase_bubble_text(cv_image, bubble_data,
bg_color=(255, 255, 255)):
"""
Fills the bubble bounding box with bg_color.
Args: Args:
image : BGR numpy array (modified in place) cv_image : BGR numpy array (modified in place)
x,y,w,h : Bounding box bubble_data : Dict with 'x','y','w','h'
padding : Pixels to leave as border (default: 6) bg_color : (B,G,R) fill color
""" """
x1 = max(0, x + padding) img_h, img_w = cv_image.shape[:2]
y1 = max(0, y + padding) x = max(0, bubble_data["x"])
x2 = min(image.shape[1], x + w - padding) y = max(0, bubble_data["y"])
y2 = min(image.shape[0], y + h - padding) x2 = min(img_w, bubble_data["x"] + bubble_data["w"])
y2 = min(img_h, bubble_data["y"] + bubble_data["h"])
if x2 > x1 and y2 > y1: cv_image[y:y2, x:x2] = list(bg_color)
image[y1:y2, x1:x2] = 255
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
# STEP 4: FIT FONT SIZE # FIT FONT SIZE
# Finds the largest font size where the text
# fits inside (max_w × max_h) with word wrap.
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
def fit_font_size(draw, text, max_w, max_h, font_path, def fit_font_size(draw, text, max_w, max_h, font_path,
min_size=8, max_size=48): min_size=7, max_size=48):
""" """
Binary-searches for the largest font size where Finds the largest font size where word-wrapped text
word-wrapped text fits within the given box. fits inside (max_w × max_h).
Args:
draw : PIL ImageDraw instance
text : Text string to fit
max_w : Available width in pixels
max_h : Available height in pixels
font_path : Path to .ttf font (or None for default)
min_size : Smallest font size to try (default: 8)
max_size : Largest font size to try (default: 48)
Returns:
(font, list_of_wrapped_lines)
""" """
best_font = None best_font = None
best_lines = [text] best_lines = [text]
for size in range(max_size, min_size - 1, -1): for size in range(max_size, min_size - 1, -1):
try: try:
font = ImageFont.truetype(font_path, size) if font_path else ImageFont.load_default() font = (ImageFont.truetype(font_path, size)
if font_path else ImageFont.load_default())
except Exception: except Exception:
font = ImageFont.load_default() font = ImageFont.load_default()
# Word-wrap words, lines, current = text.split(), [], ""
words = text.split()
lines = []
current = ""
for word in words: for word in words:
test = (current + " " + word).strip() test = (current + " " + word).strip()
bbox = draw.textbbox((0, 0), test, font=font) bb = draw.textbbox((0, 0), test, font=font)
if (bbox[2] - bbox[0]) <= max_w: if (bb[2] - bb[0]) <= max_w:
current = test current = test
else: else:
if current: if current:
lines.append(current) lines.append(current)
current = word current = word
if current: if current:
lines.append(current) lines.append(current)
# Measure total block height lh = draw.textbbox((0, 0), "Ay", font=font)
lh_bbox = draw.textbbox((0, 0), "Ay", font=font) line_h = (lh[3] - lh[1]) + 2
line_h = (lh_bbox[3] - lh_bbox[1]) + 3 if line_h * len(lines) <= max_h:
total_h = line_h * len(lines)
if total_h <= max_h:
best_font = font best_font = font
best_lines = lines best_lines = lines
break break
if best_font is None: return best_font or ImageFont.load_default(), best_lines
best_font = ImageFont.load_default()
return best_font, best_lines
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
# STEP 5: RENDER TEXT INTO BUBBLE # RENDER TEXT INTO BUBBLE
# Draws translated text centered inside
# the bubble bounding box.
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
def render_text_in_bubble(pil_image, x, y, w, h, text, def render_text_in_bubble(pil_image, bubble_data, text,
font_path, padding=12, font_path, padding=8,
font_color=(0, 0, 0)): font_color=(0, 0, 0)):
""" """
Renders text centered (horizontally + vertically) Renders translated text centered inside the tight bbox.
inside a bubble bounding box. Font auto-sizes to fill the same w×h the original occupied.
Args:
pil_image : PIL Image (modified in place)
x,y,w,h : Bubble bounding box
text : Translated text to render
font_path : Path to .ttf font (or None)
padding : Inner padding in pixels (default: 12)
font_color : RGB color tuple (default: black)
""" """
x, y = bubble_data["x"], bubble_data["y"]
w, h = bubble_data["w"], bubble_data["h"]
draw = ImageDraw.Draw(pil_image) draw = ImageDraw.Draw(pil_image)
inner_w = max(1, w - padding * 2) inner_w = max(1, w - padding * 2)
inner_h = max(1, h - padding * 2) inner_h = max(1, h - padding * 2)
font, lines = fit_font_size(draw, text, inner_w, inner_h, font_path) font, lines = fit_font_size(draw, text, inner_w, inner_h,
font_path)
lh_bbox = draw.textbbox((0, 0), "Ay", font=font)
line_h = (lh_bbox[3] - lh_bbox[1]) + 3
lh_bb = draw.textbbox((0, 0), "Ay", font=font)
line_h = (lh_bb[3] - lh_bb[1]) + 2
total_h = line_h * len(lines) total_h = line_h * len(lines)
start_y = y + padding + max(0, (inner_h - total_h) // 2) start_y = y + padding + max(0, (inner_h - total_h) // 2)
@@ -232,7 +186,8 @@ def render_text_in_bubble(pil_image, x, y, w, h, text,
lb = draw.textbbox((0, 0), line, font=font) lb = draw.textbbox((0, 0), line, font=font)
line_w = lb[2] - lb[0] line_w = lb[2] - lb[0]
start_x = x + padding + max(0, (inner_w - line_w) // 2) start_x = x + padding + max(0, (inner_w - line_w) // 2)
draw.text((start_x, start_y), line, font=font, fill=font_color) draw.text((start_x, start_y), line,
font=font, fill=font_color)
start_y += line_h start_y += line_h
@@ -244,7 +199,7 @@ def resolve_font(font_path, fallback):
print(f" ✅ Using font: {font_path}") print(f" ✅ Using font: {font_path}")
return font_path return font_path
if fallback and os.path.exists(fallback): if fallback and os.path.exists(fallback):
print(f" ⚠️ '{font_path}' not found → fallback: {fallback}") print(f" ⚠️ Fallback: {fallback}")
return fallback return fallback
print(" ⚠️ No font found. Using PIL default.") print(" ⚠️ No font found. Using PIL default.")
return None return None
@@ -261,104 +216,122 @@ def render_translated_page(
font_path = FONT_PATH, font_path = FONT_PATH,
font_fallback = FONT_FALLBACK, font_fallback = FONT_FALLBACK,
font_color = FONT_COLOR, font_color = FONT_COLOR,
erase_padding = 6, text_padding = 8,
text_padding = 12,
debug = False, debug = False,
): ):
""" """
Full rendering pipeline: Pipeline:
1. Parse translations from output.txt 1. Parse translations (only present IDs processed)
2. Load bubble boxes from bubbles.json 2. Load bubble boxes from bubbles.json
3. Load original manga page 3. Cross-check IDs — absent ones left untouched
4. Erase original text from each bubble 4. Sample background color per bubble
5. Render translated text into each bubble 5. Erase original text (fill tight bbox)
6. Save output image 6. Render translated text sized to fit the bbox
7. Save output
Args:
input_image : Source manga page (default: 'page.png')
output_image : Output path (default: 'page_translated.png')
translations_file : Path to output.txt (default: 'output.txt')
bubbles_file : Path to bubbles.json (default: 'bubbles.json')
font_path : Primary .ttf font path
font_fallback : Fallback font path
font_color : RGB text color (default: black)
erase_padding : Border px when erasing (default: 6)
text_padding : Inner padding for text (default: 12)
debug : Save debug_render.png (default: False)
""" """
print("=" * 55) print("=" * 55)
print(" MANGA TRANSLATOR — RENDERER") print(" MANGA TRANSLATOR — RENDERER")
print("=" * 55) print("=" * 55)
# ── 1. Parse translations ─────────────────────────────────────────────────
print("\n📄 Parsing translations...") print("\n📄 Parsing translations...")
translations = parse_translations(translations_file) translations = parse_translations(translations_file)
if not translations: if not translations:
print("❌ No translations found. Aborting.") print("❌ No translations found. Aborting.")
return return
# ── 2. Load bubble boxes ────────────────────────────────────────────────── print(f"\n📦 Loading bubble data...")
print(f"\n📦 Loading bubble boxes from {bubbles_file}...")
bubble_boxes = load_bubble_boxes(bubbles_file) bubble_boxes = load_bubble_boxes(bubbles_file)
if not bubble_boxes: if not bubble_boxes:
print("❌ No bubble boxes found. Re-run manga-translator.py first.") print("❌ No bubble data. Re-run manga-translator.py.")
return return
# ── 3. Load image ───────────────────────────────────────────────────────── translate_ids = set(translations.keys())
print(f"\n🖼️ Loading image: {input_image}") box_ids = set(bubble_boxes.keys())
to_process = sorted(translate_ids & box_ids)
untouched = sorted(box_ids - translate_ids)
missing = sorted(translate_ids - box_ids)
print(f"\n🔗 To process : {to_process}")
print(f" Untouched : {untouched}")
if missing:
print(f" ⚠️ In output.txt but no box: {missing}")
if not to_process:
print("❌ No matching IDs. Aborting.")
return
print(f"\n🖼️ Loading: {input_image}")
cv_image = cv2.imread(input_image) cv_image = cv2.imread(input_image)
if cv_image is None: if cv_image is None:
print(f"❌ Could not load: {input_image}") print(f"❌ Could not load: {input_image}")
return return
print(f" Image size: {cv_image.shape[1]}×{cv_image.shape[0]}px") print(f" {cv_image.shape[1]}×{cv_image.shape[0]}px")
# ── 4. Erase original text ──────────────────────────────────────────────── # Sample backgrounds BEFORE erasing
print("\n🧹 Erasing original bubble text...") print("\n🎨 Sampling backgrounds...")
for bubble_id in sorted(translations.keys()): bg_colors = {}
if bubble_id not in bubble_boxes: for bid in to_process:
print(f" ⚠️ #{bubble_id}: no box in bubbles.json, skipping") bg_bgr = sample_bubble_background(
continue cv_image, bubble_boxes[bid])
x, y, w, h = bubble_boxes[bubble_id] bg_colors[bid] = bg_bgr
erase_bubble_rect(cv_image, x, y, w, h, padding=erase_padding) bg_rgb = (bg_bgr[2], bg_bgr[1], bg_bgr[0])
print(f" Erased #{bubble_id} at ({x},{y}) {w}×{h}px") brightness = sum(bg_rgb) / 3
ink = "black" if brightness > 128 else "white"
print(f" #{bid}: RGB{bg_rgb} ink→{ink}")
# ── 5. Convert to PIL ───────────────────────────────────────────────────── # Erase
pil_image = Image.fromarray(cv2.cvtColor(cv_image, cv2.COLOR_BGR2RGB)) print("\n🧹 Erasing original text...")
for bid in to_process:
bd = bubble_boxes[bid]
erase_bubble_text(cv_image, bd, bg_color=bg_colors[bid])
print(f" ✅ #{bid} ({bd['w']}×{bd['h']}px)")
pil_image = Image.fromarray(
cv2.cvtColor(cv_image, cv2.COLOR_BGR2RGB))
# ── 6. Resolve font ───────────────────────────────────────────────────────
print("\n🔤 Resolving font...") print("\n🔤 Resolving font...")
resolved_font = resolve_font(font_path, font_fallback) resolved_font = resolve_font(font_path, font_fallback)
# ── 7. Render translated text ───────────────────────────────────────────── # Render
print("\n✍️ Rendering translated text...") print("\n✍️ Rendering...")
for bubble_id, text in sorted(translations.items()): for bid in to_process:
if bubble_id not in bubble_boxes: text = translations[bid]
continue bd = bubble_boxes[bid]
x, y, w, h = bubble_boxes[bubble_id] bg_rgb = (bg_colors[bid][2],
bg_colors[bid][1],
bg_colors[bid][0])
brightness = sum(bg_rgb) / 3
txt_color = (0, 0, 0) if brightness > 128 \
else (255, 255, 255)
render_text_in_bubble( render_text_in_bubble(
pil_image, x, y, w, h, text, pil_image, bd, text,
font_path = resolved_font, font_path = resolved_font,
padding = text_padding, padding = text_padding,
font_color = font_color, font_color = txt_color,
) )
print(f" #{bubble_id}: '{text}' → ({x},{y}) {w}×{h}px") print(f" #{bid}: '{text}' "
f"({bd['x']},{bd['y']}) {bd['w']}×{bd['h']}px")
# ── 8. Debug overlay ──────────────────────────────────────────────────────
if debug: if debug:
dbg = pil_image.copy() dbg = pil_image.copy()
dbg_draw = ImageDraw.Draw(dbg) dbg_draw = ImageDraw.Draw(dbg)
for bubble_id, (x, y, w, h) in sorted(bubble_boxes.items()): for bid, bd in sorted(bubble_boxes.items()):
dbg_draw.rectangle([x, y, x + w, y + h], outline=(255, 0, 0), width=2) color = (0, 200, 0) if bid in translate_ids \
dbg_draw.text((x + 4, y + 4), f"#{bubble_id}", fill=(255, 0, 0)) else (160, 160, 160)
dbg_draw.rectangle(
[bd["x"], bd["y"],
bd["x"] + bd["w"], bd["y"] + bd["h"]],
outline=color, width=2)
dbg_draw.text((bd["x"] + 3, bd["y"] + 3),
f"#{bid}", fill=color)
dbg.save("debug_render.png") dbg.save("debug_render.png")
print("\n 🐛 Debug render saved → debug_render.png") print("\n 🐛 debug_render.png saved "
"(green=translated, grey=untouched)")
# ── 9. Save output ────────────────────────────────────────────────────────
print(f"\n💾 Saving → {output_image}") print(f"\n💾 Saving → {output_image}")
pil_image.save(output_image, "PNG") pil_image.save(output_image, "PNG")
print(f" ✅ Done! Open: {output_image}") print(" ✅ Done!")
print("=" * 55) print("=" * 55)
@@ -366,7 +339,6 @@ def render_translated_page(
# ENTRY POINT # ENTRY POINT
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
if __name__ == "__main__": if __name__ == "__main__":
render_translated_page( render_translated_page(
input_image = "page.png", input_image = "page.png",
output_image = "page_translated.png", output_image = "page_translated.png",
@@ -375,7 +347,6 @@ if __name__ == "__main__":
font_path = "font.ttf", font_path = "font.ttf",
font_fallback = "/System/Library/Fonts/Helvetica.ttc", font_fallback = "/System/Library/Fonts/Helvetica.ttc",
font_color = (0, 0, 0), font_color = (0, 0, 0),
erase_padding = 6, text_padding = 8,
text_padding = 12,
debug = True, debug = True,
) )

View File

@@ -29,44 +29,132 @@ SUPPORTED_LANGUAGES = {
"Catalan" : "ca", "Catalan" : "ca",
} }
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
# SOUND EFFECT FILTER # SOUND EFFECT FILTER
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
SOUND_EFFECT_PATTERNS = [ SOUND_EFFECT_PATTERNS = [
r"^b+i+p+$", r"^b+i+p+$", r"^sha+$", r"^ha+$", r"^ah+$",
r"^sha+$", r"^oh+$", r"^ugh+$", r"^gr+$", r"^bam+$",
r"^ha+$", r"^pow+$", r"^crash+$", r"^boom+$", r"^bang+$",
r"^ah+$", r"^crack+$", r"^whoosh+$", r"^thud+$", r"^snap+$",
r"^oh+$", r"^zip+$", r"^swoosh+$",
r"^ugh+$",
r"^gr+$",
r"^bam+$",
r"^pow+$",
r"^crash+$",
r"^boom+$",
r"^bang+$",
r"^crack+$",
r"^whoosh+$",
r"^thud+$",
r"^snap+$",
r"^zip+$",
r"^swoosh+$",
] ]
def is_sound_effect(text): def is_sound_effect(text):
cleaned = re.sub(r"[^a-z]", "", text.strip().lower()) cleaned = re.sub(r"[^a-z]", "", text.strip().lower())
return any(re.fullmatch(p, cleaned, re.IGNORECASE) for p in SOUND_EFFECT_PATTERNS) return any(re.fullmatch(p, cleaned, re.IGNORECASE)
for p in SOUND_EFFECT_PATTERNS)
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
# BOUNDING BOX HELPERS # TOKEN FILTER
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
def should_keep_token(text, confidence, confidence_threshold,
                      min_text_length, filter_sound_effects):
    """
    Decide whether a single OCR token should be kept.

    Rules are applied in order; the first one that matches wins:
        1. Drop if confidence is below confidence_threshold
        2. Drop if the stripped text is shorter than min_text_length
        3. Drop if the stripped text is empty
        4. Drop pure digit strings
        5. Drop single non-alphabetic characters
        6. Drop sound effects when filter_sound_effects is true
        7. Keep everything else

    Args:
        text                 : Raw token text
        confidence           : Confidence score for the token
        confidence_threshold : Minimum confidence required to keep it
        min_text_length      : Minimum length of the stripped text
        filter_sound_effects : When true, tokens matched by
                               is_sound_effect() are dropped

    Returns:
        (keep: bool, reason: str) — reason is "ok" when keep is True,
        otherwise a short description of the rule that rejected it.
    """
    cleaned = text.strip()

    if confidence < confidence_threshold:
        return False, f"low confidence ({confidence:.2f})"

    if len(cleaned) < min_text_length:
        return False, "too short"

    # With min_text_length <= 0 the length rule never fires, so an
    # empty / whitespace-only token would otherwise fall through to "ok".
    if not cleaned:
        return False, "empty"

    if re.fullmatch(r"\d+", cleaned):
        return False, "pure digits"

    if len(cleaned) == 1 and not cleaned.isalpha():
        return False, "single symbol"

    # Short-circuit: the sound-effect check only runs when enabled.
    if filter_sound_effects and is_sound_effect(cleaned):
        return False, "sound effect"

    return True, "ok"
# ─────────────────────────────────────────────
# BOUNDING BOX
#
# Rules (match the red square exactly):
# Width = widest single quad's width
# Height = sum of ALL quad heights stacked
# X = centered on the widest quad's CX
# Y = topmost Y1 of all quads
# ─────────────────────────────────────────────
def get_cluster_bbox_from_ocr(ocr_bboxes, image_shape,
                              padding_px=10):
    """
    Computes the bubble erase bbox from a cluster's OCR quads.

    Steps:
        1. Per-quad: measure w, h, cx from the quad's min/max x and y
        2. Width  = width of the widest single quad
        3. Height = sum of every quad's height
        4. X      = widest quad's center ± max_w/2
        5. Y      = top of topmost quad, bottom = Y + total_h
        6. Expand by padding_px on each side, then clamp every edge
           into [0, img_w] / [0, img_h]

    Args:
        ocr_bboxes  : List of quads; each quad is a sequence of
                      (x, y) points
        image_shape : (height, width, ...) — only the first two
                      entries are read, for clamping
        padding_px  : Expansion on each side (default: 10)

    Returns:
        (x1, y1, x2, y2) clamped to image bounds, or (0, 0, 0, 0)
        when ocr_bboxes is empty. The x values are derived from a
        float center, so they may be floats.
    """
    img_h, img_w = image_shape[:2]

    if not ocr_bboxes:
        return 0, 0, 0, 0

    # ── Per-quad metrics ──────────────────────────────────────────
    quad_metrics = []
    for quad in ocr_bboxes:
        xs = [pt[0] for pt in quad]
        ys = [pt[1] for pt in quad]
        qx1, qx2 = min(xs), max(xs)
        qy1, qy2 = min(ys), max(ys)
        quad_metrics.append({
            "x1": qx1,
            "x2": qx2,
            "y1": qy1,
            "y2": qy2,
            "w": qx2 - qx1,
            "h": qy2 - qy1,
            "cx": (qx1 + qx2) / 2.0,
        })

    # ── Width: widest single quad ─────────────────────────────────
    widest = max(quad_metrics, key=lambda q: q["w"])
    max_w = widest["w"]
    center_x = widest["cx"]

    # ── Height: sum of all quad heights ──────────────────────────
    total_h = sum(q["h"] for q in quad_metrics)

    # ── Box edges ─────────────────────────────────────────────────
    box_x1 = center_x - max_w / 2.0
    box_x2 = center_x + max_w / 2.0
    box_y1 = min(q["y1"] for q in quad_metrics)
    box_y2 = box_y1 + total_h

    # ── Padding + clamp ───────────────────────────────────────────
    # Every edge is clamped on BOTH sides so the result can never be
    # inverted (x1 > x2) or lie outside the image, even when a quad
    # extends past the image bounds.
    x1 = min(img_w, max(0, box_x1 - padding_px))
    y1 = min(img_h, max(0, box_y1 - padding_px))
    x2 = min(img_w, max(0, box_x2 + padding_px))
    y2 = min(img_h, max(0, box_y2 + padding_px))

    return x1, y1, x2, y2
def get_cluster_bbox(items): def get_cluster_bbox(items):
""" """Fallback center-point bbox — used only during merge step."""
Returns (x1, y1, x2, y2) tight bounding box around
all (cy, cx, text) center points in a cluster.
Uses a fixed half-size approximation per text block.
"""
half = 30 half = 30
x1 = min(cx for _, cx, _ in items) - half x1 = min(cx for _, cx, _ in items) - half
y1 = min(cy for cy, _, _ in items) - half y1 = min(cy for cy, _, _ in items) - half
@@ -76,10 +164,6 @@ def get_cluster_bbox(items):
def boxes_are_close(bbox_a, bbox_b, proximity_px=80): def boxes_are_close(bbox_a, bbox_b, proximity_px=80):
"""
Returns True if two (x1,y1,x2,y2) boxes are within
proximity_px pixels of each other (or overlapping).
"""
ax1, ay1, ax2, ay2 = bbox_a ax1, ay1, ax2, ay2 = bbox_a
bx1, by1, bx2, by2 = bbox_b bx1, by1, bx2, by2 = bbox_b
ax1 -= proximity_px; ay1 -= proximity_px ax1 -= proximity_px; ay1 -= proximity_px
@@ -87,18 +171,25 @@ def boxes_are_close(bbox_a, bbox_b, proximity_px=80):
return not (ax2 < bx1 or bx2 < ax1 or ay2 < by1 or by2 < ay1) return not (ax2 < bx1 or bx2 < ax1 or ay2 < by1 or by2 < ay1)
# ─────────────────────────────────────────────
# TEXT LINE FILTER
# ─────────────────────────────────────────────
def has_translatable_content(text):
    """
    Return True if text contains at least one letter.

    Uses str.isalpha() per character, so accented and other
    non-ASCII letters (È, é, ñ, ü, ...) count as letters.
    Digits, punctuation and whitespace alone yield False,
    as does the empty string.
    """
    return any(ch.isalpha() for ch in text)
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
# POST-CLUSTER MERGE (Union-Find) # POST-CLUSTER MERGE (Union-Find)
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
def merge_nearby_clusters(raw_clusters, proximity_px=80): def merge_nearby_clusters(raw_clusters, raw_quads,
""" proximity_px=80):
Merges clusters whose bounding boxes are within
proximity_px pixels of each other.
Fixes split bubbles without changing eps globally.
"""
labels = list(raw_clusters.keys()) labels = list(raw_clusters.keys())
bboxes = {lbl: get_cluster_bbox(raw_clusters[lbl]) for lbl in labels} bboxes = {lbl: get_cluster_bbox(raw_clusters[lbl])
for lbl in labels}
parent = {lbl: lbl for lbl in labels} parent = {lbl: lbl for lbl in labels}
def find(x): def find(x):
@@ -116,30 +207,23 @@ def merge_nearby_clusters(raw_clusters, proximity_px=80):
if boxes_are_close(bboxes[a], bboxes[b], proximity_px): if boxes_are_close(bboxes[a], bboxes[b], proximity_px):
union(a, b) union(a, b)
merged = {} merged_clusters = {}
merged_quads = {}
for lbl in labels: for lbl in labels:
root = find(lbl) root = find(lbl)
merged.setdefault(root, []) merged_clusters.setdefault(root, [])
merged[root].extend(raw_clusters[lbl]) merged_quads.setdefault(root, [])
merged_clusters[root].extend(raw_clusters[lbl])
merged_quads[root].extend(raw_quads[lbl])
return merged return merged_clusters, merged_quads
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
# CROP-BASED OCR RE-READ # CROP-BASED OCR RE-READ
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
def reread_cluster_crop( def reread_cluster_crop(image, bbox, reader, source_lang,
image, padding_px=20, upscale_factor=2.5):
bbox,
reader,
source_lang,
padding_px=20,
upscale_factor=2.5,
):
"""
Crops a cluster region from the full image, upscales it,
and re-runs OCR for higher accuracy on small text.
"""
img_h, img_w = image.shape[:2] img_h, img_w = image.shape[:2]
x1, y1, x2, y2 = bbox x1, y1, x2, y2 = bbox
@@ -154,13 +238,13 @@ def reread_cluster_crop(
new_w = int(crop.shape[1] * upscale_factor) new_w = int(crop.shape[1] * upscale_factor)
new_h = int(crop.shape[0] * upscale_factor) new_h = int(crop.shape[0] * upscale_factor)
upscaled = cv2.resize(crop, (new_w, new_h), interpolation=cv2.INTER_CUBIC) upscaled = cv2.resize(crop, (new_w, new_h),
interpolation=cv2.INTER_CUBIC)
kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]]) kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])
sharpened = cv2.filter2D(upscaled, -1, kernel) sharpened = cv2.filter2D(upscaled, -1, kernel)
temp_path = "_temp_crop_ocr.png" temp_path = "_temp_crop_ocr.png"
cv2.imwrite(temp_path, sharpened) cv2.imwrite(temp_path, sharpened)
try: try:
crop_results = reader.readtext(temp_path, paragraph=False) crop_results = reader.readtext(temp_path, paragraph=False)
finally: finally:
@@ -171,26 +255,31 @@ def reread_cluster_crop(
return None return None
crop_results.sort(key=lambda r: r[0][0][1]) crop_results.sort(key=lambda r: r[0][0][1])
lines = [text.strip() for _, text, conf in crop_results if text.strip()] lines = [t.strip() for _, t, _ in crop_results if t.strip()]
return fix_hyphens(lines) if lines else None return fix_hyphens(lines) if lines else None
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
# DBSCAN BUBBLE CLUSTERING # DBSCAN BUBBLE CLUSTERING
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
def cluster_into_bubbles(ocr_results, eps=80, min_samples=1, proximity_px=80): def cluster_into_bubbles(ocr_results, image_shape,
eps=80, min_samples=1,
proximity_px=80, bbox_padding=10):
""" """
Two-pass clustering: Two-pass clustering:
Pass 1 — DBSCAN on center points Pass 1 — DBSCAN on center points
Pass 2 — Bounding-box proximity merge Pass 2 — Bounding-box proximity merge
Bbox: widest-line width (centered) × stacked height.
All quads contribute to bbox regardless of content.
Returns: Returns:
bubble_dict : cluster_id → list of (cy, cx, text) bubble_dict : cluster_id → list of translatable text lines
bbox_dict : cluster_id → (x1, y1, x2, y2) bbox_dict : cluster_id → (x1, y1, x2, y2)
ocr_quads : cluster_id → list of ALL raw EasyOCR quads
""" """
if not ocr_results: if not ocr_results:
return {}, {} return {}, {}, {}
centers = [] centers = []
for bbox, text, confidence in ocr_results: for bbox, text, confidence in ocr_results:
@@ -199,11 +288,12 @@ def cluster_into_bubbles(ocr_results, eps=80, min_samples=1, proximity_px=80):
centers.append([sum(xs) / 4, sum(ys) / 4]) centers.append([sum(xs) / 4, sum(ys) / 4])
centers_array = np.array(centers, dtype=np.float32) centers_array = np.array(centers, dtype=np.float32)
db = DBSCAN(eps=eps, min_samples=min_samples,
db = DBSCAN(eps=eps, min_samples=min_samples, metric="euclidean") metric="euclidean")
labels = db.fit_predict(centers_array) labels = db.fit_predict(centers_array)
raw_clusters = {} raw_clusters = {}
raw_quads = {}
noise_counter = int(max(labels, default=0)) + 1 noise_counter = int(max(labels, default=0)) + 1
for idx, label in enumerate(labels): for idx, label in enumerate(labels):
@@ -211,12 +301,17 @@ def cluster_into_bubbles(ocr_results, eps=80, min_samples=1, proximity_px=80):
label = noise_counter label = noise_counter
noise_counter += 1 noise_counter += 1
raw_clusters.setdefault(label, []) raw_clusters.setdefault(label, [])
raw_quads.setdefault(label, [])
bbox, text, _ = ocr_results[idx] bbox, text, _ = ocr_results[idx]
raw_clusters[label].append((centers[idx][1], centers[idx][0], text)) raw_clusters[label].append(
(centers[idx][1], centers[idx][0], text))
raw_quads[label].append(bbox)
print(f" DBSCAN pass: {len(raw_clusters)} cluster(s)") print(f" DBSCAN pass: {len(raw_clusters)} cluster(s)")
merged_clusters = merge_nearby_clusters(raw_clusters, proximity_px=proximity_px) merged_clusters, merged_quads = merge_nearby_clusters(
raw_clusters, raw_quads, proximity_px=proximity_px
)
print(f" After merge: {len(merged_clusters)} cluster(s)") print(f" After merge: {len(merged_clusters)} cluster(s)")
row_band_px = 150 row_band_px = 150
@@ -225,17 +320,42 @@ def cluster_into_bubbles(ocr_results, eps=80, min_samples=1, proximity_px=80):
return (min(cy for cy, cx, _ in items) // row_band_px, return (min(cy for cy, cx, _ in items) // row_band_px,
min(cx for cy, cx, _ in items)) min(cx for cy, cx, _ in items))
sorted_clusters = sorted(merged_clusters.values(), key=cluster_sort_key) sorted_labels = sorted(
merged_clusters.keys(),
key=lambda lbl: cluster_sort_key(merged_clusters[lbl])
)
bubble_dict = {} bubble_dict = {}
bbox_dict = {} bbox_dict = {}
ocr_quads = {}
for i, lbl in enumerate(sorted_labels, start=1):
items = merged_clusters[lbl]
quads = merged_quads[lbl]
for i, items in enumerate(sorted_clusters, start=1):
items_sorted = sorted(items, key=lambda t: t[0]) items_sorted = sorted(items, key=lambda t: t[0])
bubble_dict[i] = [text for _, _, text in items_sorted]
bbox_dict[i] = get_cluster_bbox(items)
return bubble_dict, bbox_dict text_lines = [
text for _, _, text in items_sorted
if has_translatable_content(text)
]
if not text_lines:
text_lines = [text for _, _, text in items_sorted]
bubble_dict[i] = text_lines
ocr_quads[i] = quads
bbox_dict[i] = get_cluster_bbox_from_ocr(
quads, image_shape, padding_px=bbox_padding
)
b = bbox_dict[i]
print(f" Cluster #{i}: {len(quads)} quad(s) "
f"bbox=({int(b[0])},{int(b[1])})→"
f"({int(b[2])},{int(b[3])}) "
f"w={int(b[2]-b[0])} h={int(b[3]-b[1])}")
return bubble_dict, bbox_dict, ocr_quads
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
@@ -247,7 +367,8 @@ def fix_hyphens(lines):
merged = lines[0] merged = lines[0]
for line in lines[1:]: for line in lines[1:]:
line = line.strip() line = line.strip()
merged = merged[:-1] + line if merged.endswith("-") else merged + " " + line merged = (merged[:-1] + line if merged.endswith("-")
else merged + " " + line)
return re.sub(r" {2,}", " ", merged).strip() return re.sub(r" {2,}", " ", merged).strip()
@@ -268,63 +389,45 @@ def compute_auto_eps(image_path, base_eps=80, reference_width=750):
# OCR QUALITY SCORE # OCR QUALITY SCORE
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
# Patterns that indicate OCR noise. Compiled once at import time so the
# per-bubble scoring loop does not rebuild the list on every call.
_GARBAGE_PATTERNS = tuple(
    re.compile(p)
    for p in (r",,", r"\.\.-", r"[^\w\s\'\!\?\.,-]{2,}")
)


def ocr_quality_score(text):
    """
    Return a quality score in the range 0.0–1.0 for an OCR result.

    The score is the fraction of alphabetic characters in ``text``
    (every character, including whitespace and punctuation, counts
    towards the total), minus a 0.2 penalty for each garbage pattern
    that occurs in the text, clamped to [0.0, 1.0].

    Args:
        text : OCR'd string (may be empty or None).

    Returns:
        float in [0.0, 1.0]; 0.0 for empty, None or single-character input.
    """
    # Too short to judge: treat as lowest quality.
    if not text or len(text) < 2:
        return 0.0

    alpha_ratio = sum(1 for c in text if c.isalpha()) / len(text)

    # Each distinct garbage pattern costs 0.2, however often it appears.
    penalty = sum(
        0.2 for pattern in _GARBAGE_PATTERNS if pattern.search(text)
    )
    return max(0.0, min(1.0, alpha_ratio - penalty))
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
# BUBBLE JSON EXPORT # BUBBLE JSON EXPORT
# Saves bbox_dict to bubbles.json so the
# renderer can load exact cluster positions.
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
def export_bubble_boxes(bbox_dict, filepath="bubbles.json"): def export_bubble_boxes(bbox_dict, ocr_quads_dict,
""" filepath="bubbles.json"):
Serialises bbox_dict to a JSON file.
Format written:
{
"1": {"x": 120, "y": 45, "w": 180, "h": 210},
...
}
Args:
bbox_dict : Dict {bubble_id (int): (x1, y1, x2, y2)}
filepath : Output path (default: 'bubbles.json')
"""
export = {} export = {}
for bubble_id, (x1, y1, x2, y2) in bbox_dict.items(): for bubble_id, (x1, y1, x2, y2) in bbox_dict.items():
quads = ocr_quads_dict.get(bubble_id, [])
export[str(bubble_id)] = { export[str(bubble_id)] = {
"x": int(x1), "x" : int(x1),
"y": int(y1), "y" : int(y1),
"w": int(x2 - x1), "w" : int(x2 - x1),
"h": int(y2 - y1), "h" : int(y2 - y1),
"quads": [[[int(pt[0]), int(pt[1])] for pt in quad]
for quad in quads],
} }
with open(filepath, "w", encoding="utf-8") as f: with open(filepath, "w", encoding="utf-8") as f:
json.dump(export, f, indent=2, ensure_ascii=False) json.dump(export, f, indent=2, ensure_ascii=False)
print(f"📦 Bubble boxes saved → {filepath}") print(f"\n📦 Bubble boxes saved → {filepath}")
for bubble_id, vals in export.items(): for bid, v in export.items():
print(f" #{bubble_id}: ({vals['x']},{vals['y']}) {vals['w']}×{vals['h']}px") print(f" #{bid}: ({v['x']},{v['y']}) "
f"{v['w']}×{v['h']}px [{len(v['quads'])} quad(s)]")
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
# DEBUG CLUSTER IMAGE # DEBUG CLUSTER IMAGE
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
def save_debug_clusters(image_path, ocr_results, bubble_dict): def save_debug_clusters(image_path, ocr_results,
bubble_dict, bbox_dict):
image = cv2.imread(image_path) image = cv2.imread(image_path)
if image is None: if image is None:
return return
@@ -333,7 +436,8 @@ def save_debug_clusters(image_path, ocr_results, bubble_dict):
num_bubbles = max(bubble_dict.keys(), default=1) num_bubbles = max(bubble_dict.keys(), default=1)
colors = [ colors = [
tuple(int(c) for c in col) tuple(int(c) for c in col)
for col in np.random.randint(50, 230, size=(num_bubbles + 2, 3)) for col in np.random.randint(50, 230,
size=(num_bubbles + 2, 3))
] ]
text_to_bubble = {} text_to_bubble = {}
@@ -345,14 +449,21 @@ def save_debug_clusters(image_path, ocr_results, bubble_dict):
bubble_id = text_to_bubble.get(text, 0) bubble_id = text_to_bubble.get(text, 0)
color = colors[(bubble_id - 1) % len(colors)] color = colors[(bubble_id - 1) % len(colors)]
pts = np.array(bbox, dtype=np.int32) pts = np.array(bbox, dtype=np.int32)
cv2.polylines(image, [pts], isClosed=True, color=color, thickness=2) cv2.polylines(image, [pts], isClosed=True,
x = int(pts[0][0]) color=color, thickness=1)
y = max(int(pts[0][1]) - 5, 12)
cv2.putText(image, f"#{bubble_id}", (x, y), for bubble_id, (x1, y1, x2, y2) in bbox_dict.items():
color = colors[(bubble_id - 1) % len(colors)]
cv2.rectangle(image,
(int(x1), int(y1)),
(int(x2), int(y2)),
color, 2)
cv2.putText(image, f"BOX#{bubble_id}",
(int(x1) + 2, int(y1) + 16),
cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2) cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
cv2.imwrite("debug_clusters.png", image) cv2.imwrite("debug_clusters.png", image)
print(" 🐛 Cluster debug saved → debug_clusters.png") print(" 🐛 debug_clusters.png saved")
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
@@ -362,39 +473,18 @@ def translate_manga_text(
image_path, image_path,
source_lang="it", source_lang="it",
target_lang="ca", target_lang="ca",
confidence_threshold=0.15, confidence_threshold=0.10,
export_to_file=None, export_to_file=None,
export_bubbles_to="bubbles.json", # ← NEW: path for bubble boxes JSON export_bubbles_to="bubbles.json",
min_text_length=2, min_text_length=2,
cluster_eps="auto", cluster_eps="auto",
proximity_px=80, proximity_px=80,
filter_sound_effects=True, filter_sound_effects=True,
quality_threshold=0.5, quality_threshold=0.5,
upscale_factor=2.5, upscale_factor=2.5,
bbox_padding=10,
debug=False, debug=False,
): ):
"""
Full pipeline:
OCR → filter → DBSCAN cluster → proximity merge
→ quality check → crop re-read if needed
→ fix hyphens → translate → export txt + json
Args:
image_path : Path to your image file
source_lang : Source language code (default: 'it')
target_lang : Target language code (default: 'ca')
confidence_threshold : Min OCR confidence (default: 0.15)
export_to_file : Save translations to .txt (default: None)
export_bubbles_to : Save bubble boxes to .json (default: 'bubbles.json')
min_text_length : Min characters per detection(default: 2)
cluster_eps : DBSCAN eps or 'auto' (default: 'auto')
proximity_px : Post-merge proximity px (default: 80)
filter_sound_effects : Skip onomatopoeia/SFX (default: True)
quality_threshold : Min quality score 01 (default: 0.5)
upscale_factor : Crop upscale for re-read (default: 2.5)
debug : Save debug_clusters.png (default: False)
"""
# ── 1. Resolve eps ──────────────────────────────────────────────────────── # ── 1. Resolve eps ────────────────────────────────────────────────────────
if cluster_eps == "auto": if cluster_eps == "auto":
print("Computing auto eps...") print("Computing auto eps...")
@@ -410,54 +500,61 @@ def translate_manga_text(
# ── 3. Initialize OCR ───────────────────────────────────────────────────── # ── 3. Initialize OCR ─────────────────────────────────────────────────────
print("\nLoading OCR model...") print("\nLoading OCR model...")
ocr_lang_list = ["en", "es"] if source_lang == "ca" else [source_lang] ocr_lang_list = ["en", "es"] if source_lang == "ca" \
else [source_lang]
reader = easyocr.Reader(ocr_lang_list) reader = easyocr.Reader(ocr_lang_list)
# ── 4. Initialize translator ────────────────────────────────────────────── # ── 4. Initialize translator ──────────────────────────────────────────────
translator = GoogleTranslator(source=source_lang, target=target_lang) translator = GoogleTranslator(source=source_lang,
target=target_lang)
# ── 5. Run OCR ──────────────────────────────────────────────────────────── # ── 5. Run OCR ────────────────────────────────────────────────────────────
print(f"\nRunning OCR on: {image_path}") print(f"\nRunning OCR on: {image_path}")
results = reader.readtext(image_path, paragraph=False) results = reader.readtext(image_path, paragraph=False)
print(f" Raw detections: {len(results)}") print(f" Raw detections: {len(results)}")
# ── 6. Filter detections ────────────────────────────────────────────────── # ── 6. Filter ─────────────────────────────────────────────────────────────
filtered = [] filtered = []
skipped = 0 skipped = 0
for bbox, text, confidence in results: for bbox, text, confidence in results:
cleaned = text.strip() cleaned = text.strip()
if confidence < confidence_threshold: keep, reason = should_keep_token(
skipped += 1 cleaned, confidence,
continue confidence_threshold, min_text_length,
if len(cleaned) < min_text_length: filter_sound_effects
skipped += 1 )
continue if keep:
if re.fullmatch(r"[\d\W]+", cleaned): filtered.append((bbox, cleaned, confidence))
skipped += 1 else:
continue if reason == "sound effect":
if filter_sound_effects and is_sound_effect(cleaned):
print(f" 🔇 SFX skipped: '{cleaned}'") print(f" 🔇 SFX skipped: '{cleaned}'")
skipped += 1 skipped += 1
continue
filtered.append((bbox, cleaned, confidence))
print(f"{len(filtered)} detection(s) kept, {skipped} skipped.\n") print(f"{len(filtered)} kept, {skipped} skipped.\n")
if not filtered: if not filtered:
print("⚠️ No text detected after filtering.") print("⚠️ No text detected after filtering.")
return return
# ── 7. Cluster + merge ──────────────────────────────────────────────────── # ── 7. Cluster + merge ────────────────────────────────────────────────────
print(f"Clustering detections (eps={eps:.1f}px, proximity={proximity_px}px)...") print(f"Clustering (eps={eps:.1f}px, "
bubble_dict, bbox_dict = cluster_into_bubbles( f"proximity={proximity_px}px, "
filtered, eps=eps, proximity_px=proximity_px f"bbox_padding={bbox_padding}px)...")
bubble_dict, bbox_dict, ocr_quads = cluster_into_bubbles(
filtered,
image_shape = full_image.shape,
eps = eps,
proximity_px = proximity_px,
bbox_padding = bbox_padding,
) )
print(f"{len(bubble_dict)} bubble(s) after merge.\n") print(f"{len(bubble_dict)} bubble(s) after merge.\n")
# ── 8. Debug image ──────────────────────────────────────────────────────── # ── 8. Debug ──────────────────────────────────────────────────────────────
if debug: if debug:
save_debug_clusters(image_path, filtered, bubble_dict) save_debug_clusters(image_path, filtered,
bubble_dict, bbox_dict)
# ── 9. Fix hyphens ──────────────────────────────────────────────────────── # ── 9. Fix hyphens ────────────────────────────────────────────────────────
clean_bubbles = { clean_bubbles = {
@@ -471,41 +568,39 @@ def translate_manga_text(
for i, text in clean_bubbles.items(): for i, text in clean_bubbles.items():
score = ocr_quality_score(text) score = ocr_quality_score(text)
status = "" if score >= quality_threshold else "🔁" status = "" if score >= quality_threshold else "🔁"
print(f" Bubble #{i}: score={score:.2f} {status} '{text[:60]}'") print(f" #{i}: score={score:.2f} {status} '{text[:55]}'")
if score < quality_threshold: if score < quality_threshold:
print(f" → Re-reading bubble #{i} from crop...") print(f" → Re-reading #{i} from crop...")
reread = reread_cluster_crop( reread = reread_cluster_crop(
full_image, bbox_dict[i], reader, source_lang, full_image, bbox_dict[i], reader, source_lang,
upscale_factor=upscale_factor, upscale_factor=upscale_factor,
) )
if reread: if reread:
print(f" Re-read result: '{reread}'") print(f"'{reread}'")
clean_bubbles[i] = reread clean_bubbles[i] = reread
else: else:
print(f"Re-read returned nothing, keeping original.") print(f"Nothing found, keeping original.")
# ── 11. Translate & print ───────────────────────────────────────────────── # ── 11. Translate & print ─────────────────────────────────────────────────
print() print()
header = f"{'BUBBLE':<8} {'ORIGINAL (Italian)':<50} {'TRANSLATED (Catalan)'}" header = (f"{'BUBBLE':<8} "
f"{'ORIGINAL (Italian)':<50} "
f"{'TRANSLATED (Catalan)'}")
divider = "" * 105 divider = "" * 105
output_lines = [header, divider] output_lines = [header, divider]
print(header) print(header)
print(divider) print(divider)
translated_count = 0 translated_count = 0
for i in sorted(clean_bubbles.keys()): for i in sorted(clean_bubbles.keys()):
bubble_text = clean_bubbles[i].strip() bubble_text = clean_bubbles[i].strip()
if not bubble_text: if not bubble_text:
continue continue
try: try:
translated = translator.translate(bubble_text) translated = translator.translate(bubble_text)
except Exception as e: except Exception as e:
translated = f"[Translation error: {e}]" translated = f"[Translation error: {e}]"
if translated is None: if translated is None:
translated = "[No translation returned]" translated = "[No translation returned]"
@@ -515,23 +610,22 @@ def translate_manga_text(
output_lines.append(line) output_lines.append(line)
output_lines.append(divider) output_lines.append(divider)
summary = ( summary = (f"✅ Done! {translated_count} bubble(s) translated, "
f"✅ Done! {translated_count} bubble(s) translated, " f"{skipped} detection(s) skipped.")
f"{skipped} detection(s) skipped."
)
output_lines.append(summary) output_lines.append(summary)
print(divider) print(divider)
print(summary) print(summary)
# ── 12. Export translations .txt ────────────────────────────────────────── # ── 12. Export translations ───────────────────────────────────────────────
if export_to_file: if export_to_file:
with open(export_to_file, "w", encoding="utf-8") as f: with open(export_to_file, "w", encoding="utf-8") as f:
f.write("\n".join(output_lines)) f.write("\n".join(output_lines))
print(f"📄 Translations saved → {export_to_file}") print(f"📄 Translations saved → {export_to_file}")
# ── 13. Export bubble boxes .json ───────────────────────────────────────── # ── 13. Export bubble boxes ───────────────────────────────────────────────
if export_bubbles_to: if export_bubbles_to:
export_bubble_boxes(bbox_dict, filepath=export_bubbles_to) export_bubble_boxes(bbox_dict, ocr_quads,
filepath=export_bubbles_to)
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
@@ -549,19 +643,19 @@ def list_languages():
# ENTRY POINT # ENTRY POINT
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
if __name__ == "__main__":
    # Script entry point: translate page.png from Italian to Catalan,
    # writing the text table to output.txt, the bubble boxes to
    # bubbles.json, and a debug overlay image (debug=True).
    run_options = {
        "image_path": "page.png",
        "source_lang": "it",
        "target_lang": "ca",
        "confidence_threshold": 0.10,
        "min_text_length": 2,
        "export_to_file": "output.txt",
        "export_bubbles_to": "bubbles.json",
        "cluster_eps": "auto",
        "proximity_px": 80,
        "filter_sound_effects": True,
        "quality_threshold": 0.5,
        "upscale_factor": 2.5,
        "bbox_padding": 0,
        "debug": True,
    }
    translate_manga_text(**run_options)