Inici del render

This commit is contained in:
Guillem Hernandez Sola
2026-04-10 18:14:54 +02:00
parent f92ea8410b
commit 458915278e
3 changed files with 484 additions and 57 deletions

38
bubbles.json Normal file
View File

@@ -0,0 +1,38 @@
{
"1": {
"x": 251,
"y": 149,
"w": 60,
"h": 60
},
"2": {
"x": 1202,
"y": 226,
"w": 61,
"h": 159
},
"3": {
"x": 966,
"y": 364,
"w": 62,
"h": 156
},
"4": {
"x": 265,
"y": 471,
"w": 62,
"h": 230
},
"5": {
"x": 359,
"y": 1114,
"w": 72,
"h": 134
},
"6": {
"x": 729,
"y": 1306,
"w": 60,
"h": 60
}
}

381
manga-renderer.py Normal file
View File

@@ -0,0 +1,381 @@
import re
import json
import cv2
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import os
# ─────────────────────────────────────────────
# CONFIG
# ─────────────────────────────────────────────
INPUT_IMAGE = "page.png"
OUTPUT_IMAGE = "page_translated.png"
TRANSLATIONS_FILE = "output.txt"
BUBBLES_FILE = "bubbles.json"
FONT_PATH = "font.ttf"
FONT_FALLBACK = "/System/Library/Fonts/Helvetica.ttc"
FONT_COLOR = (0, 0, 0)
BUBBLE_FILL = (255, 255, 255)
# ─────────────────────────────────────────────
# STEP 1: PARSE output.txt
# Robust parser: always takes the LAST
# whitespace-separated column as translation.
# ─────────────────────────────────────────────
def parse_translations(filepath):
    """
    Read output.txt and build {bubble_id: translated_text}.

    Each data line starts with '#N' and has columns separated by runs
    of two or more spaces; the translation is always the LAST column,
    which stays correct even when the original or translated text
    contains single internal spaces.

    Args:
        filepath : Path to output.txt

    Returns:
        Dict mapping bubble id (int) to translated string,
        e.g. {1: "LA NOIA ESTÀ IL·LESA!", ...}
    """
    result = {}
    with open(filepath, "r", encoding="utf-8") as handle:
        for raw in handle:
            stripped = raw.rstrip("\n").strip()
            # Only '#N ...' lines carry translations.
            if re.match(r"^#\d+", stripped) is None:
                continue
            columns = re.split(r" {2,}", stripped)
            if len(columns) < 3:
                continue  # malformed row: need id, original, translation
            bid = int(re.sub(r"[^0-9]", "", columns[0]))
            result[bid] = columns[-1].strip()  # always last column
    print(f" ✅ Parsed {len(result)} translation(s) from {filepath}")
    for bid, text in sorted(result.items()):
        print(f" #{bid}: {text}")
    return result
# ─────────────────────────────────────────────
# STEP 2: LOAD BUBBLE BOXES from bubbles.json
# These were saved by manga-translator.py
# and are guaranteed to match the clusters.
# ─────────────────────────────────────────────
def load_bubble_boxes(filepath):
    """
    Load bubble bounding boxes from bubbles.json.

    Expected file format:
        {
            "1": {"x": 120, "y": 45, "w": 180, "h": 210},
            "2": { ... },
            ...
        }

    Args:
        filepath : Path to bubbles.json

    Returns:
        Dict {bubble_id (int): (x, y, w, h)}
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        payload = json.load(handle)
    boxes = {
        int(key): (entry["x"], entry["y"], entry["w"], entry["h"])
        for key, entry in payload.items()
    }
    print(f" ✅ Loaded {len(boxes)} bubble box(es) from {filepath}")
    for bid, (x, y, w, h) in sorted(boxes.items()):
        print(f" #{bid}: ({x},{y}) {w}×{h}px")
    return boxes
# ─────────────────────────────────────────────
# STEP 3: ERASE BUBBLE CONTENT
# Fills a rectangular region with white.
# Uses a slightly inset rect to preserve
# the bubble border.
# ─────────────────────────────────────────────
def erase_bubble_rect(image, x, y, w, h, padding=6):
    """
    Paint the interior of a bounding box white, in place.

    A margin of `padding` pixels is kept on every side so the drawn
    bubble outline survives the erase.

    Args:
        image   : BGR numpy array (modified in place)
        x,y,w,h : Bounding box
        padding : Pixels to leave as border (default: 6)
    """
    left = max(0, x + padding)
    top = max(0, y + padding)
    right = min(image.shape[1], x + w - padding)
    bottom = min(image.shape[0], y + h - padding)
    # Degenerate boxes (padding >= half the box size) are skipped.
    if right > left and bottom > top:
        image[top:bottom, left:right] = 255
# ─────────────────────────────────────────────
# STEP 4: FIT FONT SIZE
# Finds the largest font size where the text
# fits inside (max_w × max_h) with word wrap.
# ─────────────────────────────────────────────
def _wrap_words(draw, text, max_w, font):
    """Greedy word-wrap: pack as many words per line as fit in max_w px."""
    lines = []
    current = ""
    for word in text.split():
        candidate = (current + " " + word).strip()
        bbox = draw.textbbox((0, 0), candidate, font=font)
        if (bbox[2] - bbox[0]) <= max_w:
            current = candidate
        else:
            if current:
                lines.append(current)
            current = word
    if current:
        lines.append(current)
    return lines


def _line_height(draw, font):
    """Line advance in px: height of the reference string 'Ay' plus 3px leading."""
    bbox = draw.textbbox((0, 0), "Ay", font=font)
    return (bbox[3] - bbox[1]) + 3


def fit_font_size(draw, text, max_w, max_h, font_path,
                  min_size=8, max_size=48):
    """
    Find the largest font size whose word-wrapped text fits the box.

    Scans sizes from max_size down to min_size (a linear scan, not a
    binary search: wrapping can make "fits" non-monotonic in size) and
    returns the first size whose wrapped block fits within max_h.

    Args:
        draw      : PIL ImageDraw instance
        text      : Text string to fit
        max_w     : Available width in pixels
        max_h     : Available height in pixels
        font_path : Path to .ttf font (or None for default)
        min_size  : Smallest font size to try (default: 8)
        max_size  : Largest font size to try (default: 48)

    Returns:
        (font, list_of_wrapped_lines). If no size fits the height,
        falls back to min_size WITH its wrapped lines, so the text is
        at least wrapped to the available width (the previous behavior
        returned the raw unwrapped text, which overflowed horizontally).
    """
    smallest = None  # (font, lines) from the smallest size tried so far
    for size in range(max_size, min_size - 1, -1):
        try:
            font = ImageFont.truetype(font_path, size) if font_path else ImageFont.load_default()
        except Exception:
            # Bad/missing font file: PIL default ignores the size argument.
            font = ImageFont.load_default()
        lines = _wrap_words(draw, text, max_w, font)
        smallest = (font, lines)
        if _line_height(draw, font) * len(lines) <= max_h:
            return font, lines
    if smallest is None:
        # min_size > max_size: nothing was tried at all.
        return ImageFont.load_default(), [text]
    # Nothing fit vertically: use the smallest size, still word-wrapped.
    return smallest
# ─────────────────────────────────────────────
# STEP 5: RENDER TEXT INTO BUBBLE
# Draws translated text centered inside
# the bubble bounding box.
# ─────────────────────────────────────────────
def render_text_in_bubble(pil_image, x, y, w, h, text,
                          font_path, padding=12,
                          font_color=(0, 0, 0)):
    """
    Draw translated text centered (horizontally and vertically)
    inside a bubble bounding box.

    Args:
        pil_image  : PIL Image (modified in place)
        x,y,w,h    : Bubble bounding box
        text       : Translated text to render
        font_path  : Path to .ttf font (or None)
        padding    : Inner padding in pixels (default: 12)
        font_color : RGB color tuple (default: black)
    """
    draw = ImageDraw.Draw(pil_image)
    avail_w = max(1, w - padding * 2)
    avail_h = max(1, h - padding * 2)
    font, lines = fit_font_size(draw, text, avail_w, avail_h, font_path)
    # Line advance derived from a reference string, plus 3px leading.
    ref = draw.textbbox((0, 0), "Ay", font=font)
    step = (ref[3] - ref[1]) + 3
    block_h = step * len(lines)
    cursor_y = y + padding + max(0, (avail_h - block_h) // 2)
    for line in lines:
        box = draw.textbbox((0, 0), line, font=font)
        cursor_x = x + padding + max(0, (avail_w - (box[2] - box[0])) // 2)
        draw.text((cursor_x, cursor_y), line, font=font, fill=font_color)
        cursor_y += step
# ─────────────────────────────────────────────
# RESOLVE FONT
# ─────────────────────────────────────────────
def resolve_font(font_path, fallback):
    """
    Pick the first font path that exists on disk.

    Args:
        font_path : Preferred .ttf path (may be None or empty)
        fallback  : Secondary path tried when the first is missing

    Returns:
        An existing path string, or None to signal the PIL default font.
    """
    primary_ok = bool(font_path) and os.path.exists(font_path)
    if primary_ok:
        print(f" ✅ Using font: {font_path}")
        return font_path
    fallback_ok = bool(fallback) and os.path.exists(fallback)
    if fallback_ok:
        print(f" ⚠️ '{font_path}' not found → fallback: {fallback}")
        return fallback
    print(" ⚠️ No font found. Using PIL default.")
    return None
# ─────────────────────────────────────────────
# MAIN RENDERER
# ─────────────────────────────────────────────
def render_translated_page(
    input_image = INPUT_IMAGE,
    output_image = OUTPUT_IMAGE,
    translations_file = TRANSLATIONS_FILE,
    bubbles_file = BUBBLES_FILE,
    font_path = FONT_PATH,
    font_fallback = FONT_FALLBACK,
    font_color = FONT_COLOR,
    erase_padding = 6,
    text_padding = 12,
    debug = False,
):
    """
    End-to-end rendering pipeline:
      1. Parse translations from output.txt
      2. Load bubble boxes from bubbles.json
      3. Load the original manga page
      4. Erase the original text inside each bubble
      5. Draw the translated text into each bubble
      6. Save the output image

    Args:
        input_image       : Source manga page (default: 'page.png')
        output_image      : Output path (default: 'page_translated.png')
        translations_file : Path to output.txt (default: 'output.txt')
        bubbles_file      : Path to bubbles.json (default: 'bubbles.json')
        font_path         : Primary .ttf font path
        font_fallback     : Fallback font path
        font_color        : RGB text color (default: black)
        erase_padding     : Border px when erasing (default: 6)
        text_padding      : Inner padding for text (default: 12)
        debug             : Save debug_render.png (default: False)
    """
    banner = "=" * 55
    print(banner)
    print(" MANGA TRANSLATOR — RENDERER")
    print(banner)

    # 1. Translations ---------------------------------------------------------
    print("\n📄 Parsing translations...")
    texts = parse_translations(translations_file)
    if not texts:
        print("❌ No translations found. Aborting.")
        return

    # 2. Bubble boxes ---------------------------------------------------------
    print(f"\n📦 Loading bubble boxes from {bubbles_file}...")
    boxes = load_bubble_boxes(bubbles_file)
    if not boxes:
        print("❌ No bubble boxes found. Re-run manga-translator.py first.")
        return

    # 3. Page image (OpenCV, BGR) ---------------------------------------------
    print(f"\n🖼️ Loading image: {input_image}")
    page_bgr = cv2.imread(input_image)
    if page_bgr is None:
        print(f"❌ Could not load: {input_image}")
        return
    print(f" Image size: {page_bgr.shape[1]}×{page_bgr.shape[0]}px")

    # 4. Erase the original text ----------------------------------------------
    print("\n🧹 Erasing original bubble text...")
    for bid in sorted(texts):
        if bid not in boxes:
            print(f" ⚠️ #{bid}: no box in bubbles.json, skipping")
            continue
        x, y, w, h = boxes[bid]
        erase_bubble_rect(page_bgr, x, y, w, h, padding=erase_padding)
        print(f" Erased #{bid} at ({x},{y}) {w}×{h}px")

    # 5. Hand off to PIL (needs RGB order) ------------------------------------
    page_pil = Image.fromarray(cv2.cvtColor(page_bgr, cv2.COLOR_BGR2RGB))

    # 6. Font -----------------------------------------------------------------
    print("\n🔤 Resolving font...")
    active_font = resolve_font(font_path, font_fallback)

    # 7. Draw translated text -------------------------------------------------
    print("\n✍️ Rendering translated text...")
    for bid, text in sorted(texts.items()):
        if bid not in boxes:
            continue
        x, y, w, h = boxes[bid]
        render_text_in_bubble(
            page_pil, x, y, w, h, text,
            font_path = active_font,
            padding = text_padding,
            font_color = font_color,
        )
        print(f" #{bid}: '{text}' → ({x},{y}) {w}×{h}px")

    # 8. Optional debug overlay (red boxes + ids) -----------------------------
    if debug:
        overlay = page_pil.copy()
        pen = ImageDraw.Draw(overlay)
        for bid, (x, y, w, h) in sorted(boxes.items()):
            pen.rectangle([x, y, x + w, y + h], outline=(255, 0, 0), width=2)
            pen.text((x + 4, y + 4), f"#{bid}", fill=(255, 0, 0))
        overlay.save("debug_render.png")
        print("\n 🐛 Debug render saved → debug_render.png")

    # 9. Save -----------------------------------------------------------------
    print(f"\n💾 Saving → {output_image}")
    page_pil.save(output_image, "PNG")
    print(f" ✅ Done! Open: {output_image}")
    print(banner)
# ─────────────────────────────────────────────
# ENTRY POINT
# ─────────────────────────────────────────────
if __name__ == "__main__":
    # Render the default project page with the debug overlay enabled.
    settings = {
        "input_image": "page.png",
        "output_image": "page_translated.png",
        "translations_file": "output.txt",
        "bubbles_file": "bubbles.json",
        "font_path": "font.ttf",
        "font_fallback": "/System/Library/Fonts/Helvetica.ttc",
        "font_color": (0, 0, 0),
        "erase_padding": 6,
        "text_padding": 12,
        "debug": True,
    }
    render_translated_page(**settings)

View File

@@ -1,5 +1,6 @@
import re
import os
import json
import cv2
import numpy as np
import easyocr
@@ -126,10 +127,6 @@ def merge_nearby_clusters(raw_clusters, proximity_px=80):
# ─────────────────────────────────────────────
# CROP-BASED OCR RE-READ
# For each cluster bounding box, crop the
# original image with padding and re-run OCR
# at higher quality. This fixes garbled text
# in small or low-contrast bubbles.
# ─────────────────────────────────────────────
def reread_cluster_crop(
image,
@@ -142,23 +139,10 @@ def reread_cluster_crop(
"""
Crops a cluster region from the full image, upscales it,
and re-runs OCR for higher accuracy on small text.
Args:
image : Full-page image as numpy array (BGR)
bbox : (x1, y1, x2, y2) cluster bounding box
reader : Initialized EasyOCR Reader
source_lang : Language code string
padding_px : Pixels of padding around the crop (default: 20)
upscale_factor: How much to enlarge the crop before OCR (default: 2.5)
Returns:
Single cleaned string with all OCR lines merged top-to-bottom,
or None if OCR found nothing.
"""
img_h, img_w = image.shape[:2]
x1, y1, x2, y2 = bbox
# Add padding, clamp to image bounds
x1 = max(0, int(x1) - padding_px)
y1 = max(0, int(y1) - padding_px)
x2 = min(img_w, int(x2) + padding_px)
@@ -168,16 +152,12 @@ def reread_cluster_crop(
if crop.size == 0:
return None
# Upscale for better OCR on small text
new_w = int(crop.shape[1] * upscale_factor)
new_h = int(crop.shape[0] * upscale_factor)
upscaled = cv2.resize(crop, (new_w, new_h), interpolation=cv2.INTER_CUBIC)
# Light sharpening to improve OCR on manga fonts
new_w = int(crop.shape[1] * upscale_factor)
new_h = int(crop.shape[0] * upscale_factor)
upscaled = cv2.resize(crop, (new_w, new_h), interpolation=cv2.INTER_CUBIC)
kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])
sharpened = cv2.filter2D(upscaled, -1, kernel)
# Save temp crop and OCR it
temp_path = "_temp_crop_ocr.png"
cv2.imwrite(temp_path, sharpened)
@@ -190,8 +170,7 @@ def reread_cluster_crop(
if not crop_results:
return None
# Sort detections top-to-bottom and join lines
crop_results.sort(key=lambda r: r[0][0][1]) # sort by top-left Y
crop_results.sort(key=lambda r: r[0][0][1])
lines = [text.strip() for _, text, conf in crop_results if text.strip()]
return fix_hyphens(lines) if lines else None
@@ -240,7 +219,6 @@ def cluster_into_bubbles(ocr_results, eps=80, min_samples=1, proximity_px=80):
merged_clusters = merge_nearby_clusters(raw_clusters, proximity_px=proximity_px)
print(f" After merge: {len(merged_clusters)} cluster(s)")
# Sort in reading order
row_band_px = 150
def cluster_sort_key(items):
@@ -288,19 +266,11 @@ def compute_auto_eps(image_path, base_eps=80, reference_width=750):
# ─────────────────────────────────────────────
# OCR QUALITY SCORE
# Heuristic to detect garbled OCR output.
# Low score = likely garbage, trigger re-read.
# ─────────────────────────────────────────────
def ocr_quality_score(text):
"""
Returns a quality score 0.0–1.0 for an OCR result.
Penalises:
- High ratio of non-alphabetic characters
- Very short text (< 4 chars)
- Suspicious character combos (',,', '..-' etc.)
A score below 0.5 triggers a crop re-read.
Low score triggers a crop re-read.
"""
if not text or len(text) < 2:
return 0.0
@@ -309,12 +279,46 @@ def ocr_quality_score(text):
total_chars = len(text)
alpha_ratio = alpha_chars / total_chars
# Penalise suspicious patterns
garbage_patterns = [r",,", r"\.\.-", r"[^\w\s\'\!\?\.,-]{2,}"]
penalty = sum(0.2 for p in garbage_patterns if re.search(p, text))
score = alpha_ratio - penalty
return max(0.0, min(1.0, score))
return max(0.0, min(1.0, alpha_ratio - penalty))
# ─────────────────────────────────────────────
# BUBBLE JSON EXPORT
# Saves bbox_dict to bubbles.json so the
# renderer can load exact cluster positions.
# ─────────────────────────────────────────────
def export_bubble_boxes(bbox_dict, filepath="bubbles.json"):
    """
    Serialise bubble bounding boxes to a JSON file the renderer loads.

    Format written:
        {
            "1": {"x": 120, "y": 45, "w": 180, "h": 210},
            ...
        }

    Args:
        bbox_dict : Dict {bubble_id (int): (x1, y1, x2, y2)}
        filepath  : Output path (default: 'bubbles.json')
    """
    serialised = {
        str(bid): {
            "x": int(x1),
            "y": int(y1),
            # Corner coordinates become width/height for the renderer.
            "w": int(x2 - x1),
            "h": int(y2 - y1),
        }
        for bid, (x1, y1, x2, y2) in bbox_dict.items()
    }
    with open(filepath, "w", encoding="utf-8") as handle:
        json.dump(serialised, handle, indent=2, ensure_ascii=False)
    print(f"📦 Bubble boxes saved → {filepath}")
    for bid, box in serialised.items():
        print(f" #{bid}: ({box['x']},{box['y']}) {box['w']}×{box['h']}px")
# ─────────────────────────────────────────────
@@ -360,31 +364,33 @@ def translate_manga_text(
target_lang="ca",
confidence_threshold=0.15,
export_to_file=None,
export_bubbles_to="bubbles.json", # ← NEW: path for bubble boxes JSON
min_text_length=2,
cluster_eps="auto",
proximity_px=80,
filter_sound_effects=True,
quality_threshold=0.5, # below this → trigger crop re-read
upscale_factor=2.5, # crop upscale multiplier for re-read
quality_threshold=0.5,
upscale_factor=2.5,
debug=False,
):
"""
Full pipeline:
OCR → filter → DBSCAN cluster → proximity merge
→ quality check → crop re-read if needed
→ fix hyphens → translate
→ fix hyphens → translate → export txt + json
Args:
image_path : Path to your image file
source_lang : Source language code (default: 'it')
target_lang : Target language code (default: 'ca')
confidence_threshold : Min OCR confidence (default: 0.15)
export_to_file : Save output to .txt (default: None)
export_to_file : Save translations to .txt (default: None)
export_bubbles_to : Save bubble boxes to .json (default: 'bubbles.json')
min_text_length : Min characters per detection(default: 2)
cluster_eps : DBSCAN eps or 'auto' (default: 'auto')
proximity_px : Post-merge proximity px (default: 80)
filter_sound_effects : Skip onomatopoeia/SFX (default: True)
quality_threshold : Min quality score 0–1 before re-read (default: 0.5)
quality_threshold : Min quality score 0–1 (default: 0.5)
upscale_factor : Crop upscale for re-read (default: 2.5)
debug : Save debug_clusters.png (default: False)
"""
@@ -396,7 +402,7 @@ def translate_manga_text(
else:
eps = float(cluster_eps)
# ── 2. Load full image (needed for crop re-reads) ─────────────────────────
# ── 2. Load full image ────────────────────────────────────────────────────
full_image = cv2.imread(image_path)
if full_image is None:
print(f"❌ Could not load image: {image_path}")
@@ -410,7 +416,7 @@ def translate_manga_text(
# ── 4. Initialize translator ──────────────────────────────────────────────
translator = GoogleTranslator(source=source_lang, target=target_lang)
# ── 5. Run OCR on full image ──────────────────────────────────────────────
# ── 5. Run OCR ────────────────────────────────────────────────────────────
print(f"\nRunning OCR on: {image_path}")
results = reader.readtext(image_path, paragraph=False)
print(f" Raw detections: {len(results)}")
@@ -453,27 +459,24 @@ def translate_manga_text(
if debug:
save_debug_clusters(image_path, filtered, bubble_dict)
# ── 9. Fix hyphens → first-pass text ─────────────────────────────────────
# ── 9. Fix hyphens ────────────────────────────────────────────────────────
clean_bubbles = {
i: fix_hyphens(lines)
for i, lines in bubble_dict.items()
if lines
}
# ── 10. Quality check crop re-read for low-quality bubbles ─────────────
# ── 10. Quality check + crop re-read ──────────────────────────────────────
print("Checking OCR quality per bubble...")
for i, text in clean_bubbles.items():
score = ocr_quality_score(text)
score = ocr_quality_score(text)
status = "" if score >= quality_threshold else "🔁"
print(f" Bubble #{i}: score={score:.2f} {status} '{text[:60]}'")
if score < quality_threshold:
print(f" → Re-reading bubble #{i} from crop...")
reread = reread_cluster_crop(
full_image,
bbox_dict[i],
reader,
source_lang,
full_image, bbox_dict[i], reader, source_lang,
upscale_factor=upscale_factor,
)
if reread:
@@ -520,11 +523,15 @@ def translate_manga_text(
print(divider)
print(summary)
# ── 12. Export ────────────────────────────────────────────────────────────
# ── 12. Export translations .txt ──────────────────────────────────────────
if export_to_file:
with open(export_to_file, "w", encoding="utf-8") as f:
f.write("\n".join(output_lines))
print(f"📄 Output saved to: {export_to_file}")
print(f"📄 Translations saved {export_to_file}")
# ── 13. Export bubble boxes .json ─────────────────────────────────────────
if export_bubbles_to:
export_bubble_boxes(bbox_dict, filepath=export_bubbles_to)
# ─────────────────────────────────────────────
@@ -550,10 +557,11 @@ if __name__ == "__main__":
confidence_threshold = 0.15,
min_text_length = 2,
export_to_file = "output.txt",
export_bubbles_to = "bubbles.json", # ← NEW
cluster_eps = "auto",
proximity_px = 80,
filter_sound_effects = True,
quality_threshold = 0.5, # bubbles scoring below this get re-read
upscale_factor = 2.5, # how much to enlarge the crop for re-read
quality_threshold = 0.5,
upscale_factor = 2.5,
debug = True,
)