Improving white coloring
This commit is contained in:
@@ -1,8 +1,8 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import re
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
import cv2
|
||||
import numpy as np
|
||||
@@ -10,9 +10,9 @@ import easyocr
|
||||
from deep_translator import GoogleTranslator
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────
|
||||
# ============================================================
|
||||
# CONFIG
|
||||
# ─────────────────────────────────────────────
|
||||
# ============================================================
|
||||
GLOSSARY = {
|
||||
"ANYA": "ANYA",
|
||||
"STARLIGHT ANYA": "STARLIGHT ANYA",
|
||||
@@ -34,16 +34,17 @@ TITLE_PATTERNS = [
|
||||
]
|
||||
|
||||
# Regexes tested with re.fullmatch against a stripped OCR token; a match means
# the token is treated as noise rather than dialogue.
# Only the current patterns are kept: the earlier variant of the first entry
# (without "¡¿") would still flag inverted Spanish punctuation as noise, and
# the earlier BOX pattern is a strict subset of the one below.
NOISE_PATTERNS = [
    # tokens made only of characters outside letters, digits and ?!.¡¿
    r"^[^a-zA-Z0-9\?!.¡¿]+$",
    # debug labels (note: also matches any all-caps word starting with BOX)
    r"^BOX[#\s0-9A-Z\-]*$",
    # dimension-like strings, e.g. 98x12
    r"^[0-9]{1,3}\s*[Xx]\s*[0-9]{1,3}$",
]

# Fraction of the image height compared against a token's top edge in
# translate_manga_text; tokens above that line get an extra confidence check.
TOP_BAND_RATIO = 0.08
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────
|
||||
# ============================================================
|
||||
# TEXT HELPERS
|
||||
# ─────────────────────────────────────────────
|
||||
# ============================================================
|
||||
def normalize_text(text: str) -> str:
|
||||
t = (text or "").strip().upper()
|
||||
t = t.replace("“", "\"").replace("”", "\"")
|
||||
@@ -51,18 +52,22 @@ def normalize_text(text: str) -> str:
|
||||
t = t.replace("…", "...")
|
||||
t = re.sub(r"\s+", " ", t)
|
||||
t = re.sub(r"\s+([,.;:!?])", r"\1", t)
|
||||
t = re.sub(r"([¡¿])\s+", r"\1", t)
|
||||
t = re.sub(r"\(\s+", "(", t)
|
||||
t = re.sub(r"\s+\)", ")", t)
|
||||
t = re.sub(r"\.{4,}", "...", t)
|
||||
t = re.sub(r",\?", "?", t)
|
||||
t = t.replace("IQUE", "¡QUE")
|
||||
t = t.replace("IQUIEN", "¿QUIEN")
|
||||
return t.strip()
|
||||
|
||||
|
||||
def apply_glossary(text: str) -> str:
    """Replace glossary terms in *text* with their canonical spelling.

    Each GLOSSARY key is matched case-insensitively between word boundaries
    and replaced by its mapped value. Keys are processed longest first so a
    multi-word entry is handled before any shorter entry it contains.

    Args:
        text: Text to rewrite; ``None`` or empty is treated as "".

    Returns:
        The text with every glossary match substituted.
    """
    out = text or ""
    ordered = sorted(GLOSSARY.items(), key=lambda kv: len(kv[0]), reverse=True)
    for term, canonical in ordered:
        # A callable replacement is inserted literally; a plain string would
        # have its backslashes and group references interpreted by re.sub.
        out = re.sub(
            rf"\b{re.escape(term)}\b",
            lambda _match, _value=canonical: _value,
            out,
            flags=re.IGNORECASE,
        )
    return out
|
||||
|
||||
|
||||
def postprocess_translation_general(text: str) -> str:
|
||||
t = normalize_text(text)
|
||||
t = re.sub(r"\s{2,}", " ", t).strip()
|
||||
@@ -71,34 +76,47 @@ def postprocess_translation_general(text: str) -> str:
|
||||
return t
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────
|
||||
# FILTERS
|
||||
# ─────────────────────────────────────────────
|
||||
def is_sound_effect(text: str) -> bool:
    """Tell whether *text* fully matches one of SOUND_EFFECT_PATTERNS.

    The text is stripped, lower-cased and reduced to the letters a-z before
    matching, so digits, punctuation and spaces are ignored.
    """
    letters_only = re.sub(r"[^a-z]", "", (text or "").strip().lower())
    for pattern in SOUND_EFFECT_PATTERNS:
        if re.fullmatch(pattern, letters_only, re.IGNORECASE):
            return True
    return False
|
||||
|
||||
|
||||
def is_title_text(text: str) -> bool:
    """Tell whether the stripped, lower-cased *text* fully matches a TITLE_PATTERNS entry."""
    candidate = (text or "").strip().lower()
    for pattern in TITLE_PATTERNS:
        if re.fullmatch(pattern, candidate, re.IGNORECASE):
            return True
    return False
|
||||
|
||||
|
||||
def is_noise_text(text: str) -> bool:
    """Return True when an OCR token looks like noise rather than dialogue.

    Three checks run in order on the stripped text:
      1. a full match against any NOISE_PATTERNS entry;
      2. at most two characters with no uppercase letter or digit;
      3. at most six characters of which more than 60% are symbols
         (neither alphanumeric nor whitespace).

    The former unconditional ``return any(...)`` is removed: it returned
    before checks 2 and 3 could ever run.

    Args:
        text: Token text; ``None`` is treated as "".

    Returns:
        True if any check flags the token, otherwise False.
    """
    t = (text or "").strip()

    if any(re.fullmatch(p, t) for p in NOISE_PATTERNS):
        return True

    # very short isolated junk
    # NOTE: the class is uppercase-only, so a lowercase token of one or two
    # letters is also flagged here.
    if len(t) <= 2 and not re.search(r"[A-Z0-9]", t):
        return True

    # mostly-symbol garbage
    symbol_ratio = sum(1 for c in t if not c.isalnum() and not c.isspace()) / max(1, len(t))
    if len(t) <= 6 and symbol_ratio > 0.60:
        return True

    return False
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────
|
||||
# ============================================================
|
||||
# GEOMETRY
|
||||
# ─────────────────────────────────────────────
|
||||
# ============================================================
|
||||
def quad_bbox(quad):
    """Return the axis-aligned bounds of *quad* as an int tuple (x1, y1, x2, y2).

    *quad* is a sequence of points indexable as ``p[0]`` (x) and ``p[1]`` (y).
    Each extreme is converted with ``int()``, which truncates toward zero.
    """
    x_lo = min(pt[0] for pt in quad)
    y_lo = min(pt[1] for pt in quad)
    x_hi = max(pt[0] for pt in quad)
    y_hi = max(pt[1] for pt in quad)
    return (int(x_lo), int(y_lo), int(x_hi), int(y_hi))
|
||||
|
||||
|
||||
def quad_center(quad):
    """Return the (x, y) midpoint of the bounding box that quad_bbox gives for *quad*."""
    left, top, right, bottom = quad_bbox(quad)
    center_x = (left + right) / 2.0
    center_y = (top + bottom) / 2.0
    return (center_x, center_y)
|
||||
|
||||
|
||||
def boxes_union_xyxy(boxes):
|
||||
boxes = [b for b in boxes if b is not None]
|
||||
if not boxes:
|
||||
@@ -110,17 +128,20 @@ def boxes_union_xyxy(boxes):
|
||||
int(max(b[3] for b in boxes)),
|
||||
)
|
||||
|
||||
|
||||
def bbox_area_xyxy(b):
    """Return the integer area of an (x1, y1, x2, y2) box.

    ``None`` gives 0, and a box whose width or height would be negative
    contributes 0 for that side.
    """
    if b is None:
        return 0
    width = max(0, b[2] - b[0])
    height = max(0, b[3] - b[1])
    return int(width * height)
|
||||
|
||||
|
||||
def xyxy_to_xywh(b):
    """Convert an (x1, y1, x2, y2) box into a dict with int keys x, y, w, h.

    ``None`` is passed through unchanged; width and height never go below 0.
    """
    if b is None:
        return None
    left, top, right, bottom = b
    width = max(0, right - left)
    height = max(0, bottom - top)
    return {"x": int(left), "y": int(top), "w": int(width), "h": int(height)}
|
||||
|
||||
|
||||
def overlap_or_near(a, b, gap=0):
|
||||
ax1, ay1, ax2, ay2 = a
|
||||
bx1, by1, bx2, by2 = b
|
||||
@@ -129,21 +150,9 @@ def overlap_or_near(a, b, gap=0):
|
||||
return gap_x <= gap and gap_y <= gap
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────
|
||||
# QUALITY / SCORING
|
||||
# ─────────────────────────────────────────────
|
||||
def ocr_quality_score(text: str) -> float:
    """Score OCR text in [0.0, 1.0] from its share of alphabetic characters.

    Text that is empty or shorter than two characters scores 0.0. Otherwise
    the score starts at the alphabetic ratio, loses 0.2 for a run of two or
    more characters outside word chars, whitespace and ``'!?.,-``, loses
    another 0.2 for a double comma, and gains 0.05 when the text ends in
    ``.``, ``!`` or ``?``. The result is clamped to [0.0, 1.0].
    """
    if not text or len(text) < 2:
        return 0.0

    letter_count = sum(1 for ch in text if ch.isalpha())
    alpha_ratio = letter_count / max(1, len(text))

    has_symbol_run = re.search(r"[^\w\s\'\!\?\.,\-]{2,}", text) is not None
    has_double_comma = re.search(r",,", text) is not None
    ends_sentence = re.search(r"[.!?]$", text) is not None

    penalty = 0.0
    if has_symbol_run:
        penalty += 0.2
    if has_double_comma:
        penalty += 0.2

    if ends_sentence:
        bonus = 0.05
    else:
        bonus = 0.0

    raw = alpha_ratio - penalty + bonus
    return max(0.0, min(1.0, raw))
|
||||
|
||||
# ============================================================
|
||||
# OCR QUALITY SCORING
|
||||
# ============================================================
|
||||
def ocr_candidate_score(text: str) -> float:
|
||||
if not text:
|
||||
return 0.0
|
||||
@@ -154,8 +163,8 @@ def ocr_candidate_score(text: str) -> float:
|
||||
|
||||
alpha = sum(c.isalpha() for c in t) / n
|
||||
spaces = sum(c.isspace() for c in t) / n
|
||||
punct_ok = sum(c in ".,!?'-:;()[]\"" for c in t) / n
|
||||
bad = len(re.findall(r"[^\w\s\.\,\!\?\-\'\:\;\(\)\[\]\"]", t)) / n
|
||||
punct_ok = sum(c in ".,!?'-:;()[]\"¡¿" for c in t) / n
|
||||
bad = len(re.findall(r"[^\w\s\.\,\!\?\-\'\:\;\(\)\[\]\"¡¿]", t)) / n
|
||||
|
||||
penalty = 0.0
|
||||
if re.search(r"\b[A-Z]\b", t):
|
||||
@@ -169,9 +178,9 @@ def ocr_candidate_score(text: str) -> float:
|
||||
return max(0.0, min(1.0, score))
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────
|
||||
# OCR MULTI-PASS
|
||||
# ─────────────────────────────────────────────
|
||||
# ============================================================
|
||||
# OCR MULTI-PASS REREAD
|
||||
# ============================================================
|
||||
def preprocess_variant(crop_bgr, mode):
|
||||
gray = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2GRAY)
|
||||
|
||||
@@ -198,6 +207,7 @@ def preprocess_variant(crop_bgr, mode):
|
||||
|
||||
return gray
|
||||
|
||||
|
||||
def rotate_image_keep_bounds(img, angle_deg):
|
||||
h, w = img.shape[:2]
|
||||
c = (w / 2, h / 2)
|
||||
@@ -213,6 +223,7 @@ def rotate_image_keep_bounds(img, angle_deg):
|
||||
|
||||
return cv2.warpAffine(img, M, (new_w, new_h), flags=cv2.INTER_CUBIC, borderValue=255)
|
||||
|
||||
|
||||
def run_ocr_on_array(reader, arr):
|
||||
tmp = "_tmp_ocr.png"
|
||||
cv2.imwrite(tmp, arr)
|
||||
@@ -222,6 +233,7 @@ def run_ocr_on_array(reader, arr):
|
||||
if os.path.exists(tmp):
|
||||
os.remove(tmp)
|
||||
|
||||
|
||||
def rebuild_text_from_ocr_result(res):
|
||||
if not res:
|
||||
return ""
|
||||
@@ -245,7 +257,7 @@ def rebuild_text_from_ocr_result(res):
|
||||
med_h = float(np.median([x[5] for x in norm]))
|
||||
row_tol = max(6.0, med_h * 0.75)
|
||||
|
||||
norm.sort(key=lambda z: z[4]) # y
|
||||
norm.sort(key=lambda z: z[4]) # y-center
|
||||
rows = []
|
||||
for it in norm:
|
||||
placed = False
|
||||
@@ -259,17 +271,17 @@ def rebuild_text_from_ocr_result(res):
|
||||
rows.append({"yc": it[4], "m": [it]})
|
||||
|
||||
rows.sort(key=lambda r: r["yc"])
|
||||
|
||||
lines = []
|
||||
for r in rows:
|
||||
mem = sorted(r["m"], key=lambda z: z[3]) # x
|
||||
mem = sorted(r["m"], key=lambda z: z[3]) # x-center
|
||||
line = normalize_text(" ".join(x[1] for x in mem))
|
||||
if line:
|
||||
lines.append(line)
|
||||
|
||||
return normalize_text(" ".join(lines))
|
||||
|
||||
def reread_crop_robust(image, bbox, reader, upscale=3.0, pad=22):
|
||||
|
||||
def reread_crop_robust(image, bbox, reader, upscale=3.0, pad=24):
|
||||
ih, iw = image.shape[:2]
|
||||
x1, y1, x2, y2 = bbox
|
||||
x1 = max(0, int(x1 - pad))
|
||||
@@ -318,9 +330,9 @@ def reread_crop_robust(image, bbox, reader, upscale=3.0, pad=22):
|
||||
return best_text, best_score
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────
|
||||
# LINES + YELLOW BOXES
|
||||
# ─────────────────────────────────────────────
|
||||
# ============================================================
|
||||
# LINE REBUILD + LINE BOXES (YELLOW)
|
||||
# ============================================================
|
||||
def build_lines_from_indices(indices, ocr):
|
||||
if not indices:
|
||||
return []
|
||||
@@ -336,7 +348,7 @@ def build_lines_from_indices(indices, ocr):
|
||||
med_h = float(np.median([it[4] for it in items])) if items else 10.0
|
||||
row_tol = max(6.0, med_h * 0.75)
|
||||
|
||||
items.sort(key=lambda x: x[3])
|
||||
items.sort(key=lambda x: x[3]) # y
|
||||
rows = []
|
||||
for it in items:
|
||||
i, b, xc, yc, h = it
|
||||
@@ -353,19 +365,23 @@ def build_lines_from_indices(indices, ocr):
|
||||
rows.sort(key=lambda r: r["yc"])
|
||||
lines = []
|
||||
for r in rows:
|
||||
mem = sorted(r["m"], key=lambda z: z[2])
|
||||
mem = sorted(r["m"], key=lambda z: z[2]) # x
|
||||
txt = normalize_text(" ".join(ocr[i][1] for i, _, _, _ in mem))
|
||||
lines.append(txt)
|
||||
if txt and not is_noise_text(txt):
|
||||
lines.append(txt)
|
||||
|
||||
return lines
|
||||
|
||||
def build_line_boxes_from_indices(indices, ocr):
|
||||
|
||||
def build_line_boxes_from_indices(indices, ocr, image_shape=None):
|
||||
"""
|
||||
Robust yellow-box generation with punctuation attachment:
|
||||
Improved yellow box builder:
|
||||
- row grouping
|
||||
- chunking by x gap
|
||||
- attach tiny punctuation/special tokens to nearest chunk
|
||||
- x-gap chunking
|
||||
- punctuation attachment
|
||||
- token coverage guarantee
|
||||
- larger/asymmetric padding (fix clipped chars)
|
||||
- min-size safety expansion
|
||||
"""
|
||||
if not indices:
|
||||
return []
|
||||
@@ -374,19 +390,25 @@ def build_line_boxes_from_indices(indices, ocr):
|
||||
for i in indices:
|
||||
b = quad_bbox(ocr[i][0])
|
||||
txt = normalize_text(ocr[i][1])
|
||||
if is_noise_text(txt):
|
||||
continue
|
||||
xc = (b[0] + b[2]) / 2.0
|
||||
yc = (b[1] + b[3]) / 2.0
|
||||
w = max(1.0, b[2] - b[0])
|
||||
h = max(1.0, b[3] - b[1])
|
||||
|
||||
items.append({
|
||||
"i": i, "b": b, "txt": txt,
|
||||
"xc": xc, "yc": yc, "w": w, "h": h
|
||||
})
|
||||
|
||||
med_h = float(np.median([it["h"] for it in items])) if items else 10.0
|
||||
if not items:
|
||||
return []
|
||||
|
||||
med_h = float(np.median([it["h"] for it in items]))
|
||||
row_tol = max(6.0, med_h * 0.90)
|
||||
gap_x_tol = max(8.0, med_h * 1.25)
|
||||
pad = max(1, int(round(med_h * 0.12)))
|
||||
pad = max(3, int(round(med_h * 0.22))) # was 0.12
|
||||
|
||||
def is_punct_like(t):
|
||||
raw = (t or "").strip()
|
||||
@@ -395,7 +417,7 @@ def build_line_boxes_from_indices(indices, ocr):
|
||||
punct_ratio = sum(1 for c in raw if not c.isalnum()) / max(1, len(raw))
|
||||
return punct_ratio >= 0.5 or len(raw) <= 2
|
||||
|
||||
# 1) row grouping
|
||||
# 1) group into rows
|
||||
items_sorted = sorted(items, key=lambda x: x["yc"])
|
||||
rows = []
|
||||
for it in items_sorted:
|
||||
@@ -414,6 +436,7 @@ def build_line_boxes_from_indices(indices, ocr):
|
||||
|
||||
for r in rows:
|
||||
mem = sorted(r["m"], key=lambda z: z["xc"])
|
||||
|
||||
normal = [t for t in mem if not is_punct_like(t["txt"])]
|
||||
punct = [t for t in mem if is_punct_like(t["txt"])]
|
||||
|
||||
@@ -421,7 +444,7 @@ def build_line_boxes_from_indices(indices, ocr):
|
||||
normal = mem
|
||||
punct = []
|
||||
|
||||
# 2) chunk normal by x-gap
|
||||
# 2) chunk by x-gap
|
||||
chunks = []
|
||||
cur = [normal[0]]
|
||||
for t in normal[1:]:
|
||||
@@ -435,7 +458,7 @@ def build_line_boxes_from_indices(indices, ocr):
|
||||
cur = [t]
|
||||
chunks.append(cur)
|
||||
|
||||
# 3) attach punct tokens to nearest chunk
|
||||
# 3) attach punctuation/special tokens with larger near-gap
|
||||
for p in punct:
|
||||
pb = p["b"]
|
||||
pxc, pyc = p["xc"], p["yc"]
|
||||
@@ -450,7 +473,7 @@ def build_line_boxes_from_indices(indices, ocr):
|
||||
dy = abs(pyc - cy)
|
||||
score = dx + 1.8 * dy
|
||||
|
||||
near = overlap_or_near(pb, ub, gap=int(med_h * 0.9))
|
||||
near = overlap_or_near(pb, ub, gap=int(med_h * 1.25))
|
||||
if near:
|
||||
score -= med_h * 2.0
|
||||
|
||||
@@ -463,14 +486,17 @@ def build_line_boxes_from_indices(indices, ocr):
|
||||
else:
|
||||
chunks.append([p])
|
||||
|
||||
# 4) emit chunk boxes
|
||||
# 4) emit chunk boxes with asymmetric padding
|
||||
for ch in chunks:
|
||||
ub = boxes_union_xyxy([x["b"] for x in ch])
|
||||
if ub:
|
||||
x1, y1, x2, y2 = ub
|
||||
out_boxes.append((x1 - pad, y1 - pad, x2 + pad, y2 + pad))
|
||||
pad_x = pad
|
||||
pad_top = int(round(pad * 1.35))
|
||||
pad_bot = int(round(pad * 0.95))
|
||||
out_boxes.append((x1 - pad_x, y1 - pad_top, x2 + pad_x, y2 + pad_bot))
|
||||
|
||||
# 5) guarantee every token is inside some yellow box
|
||||
# 5) guarantee every token is covered
|
||||
token_boxes = [it["b"] for it in items]
|
||||
|
||||
def inside(tb, lb):
|
||||
@@ -479,7 +505,10 @@ def build_line_boxes_from_indices(indices, ocr):
|
||||
for tb in token_boxes:
|
||||
if not any(inside(tb, lb) for lb in out_boxes):
|
||||
x1, y1, x2, y2 = tb
|
||||
out_boxes.append((x1 - pad, y1 - pad, x2 + pad, y2 + pad))
|
||||
pad_x = pad
|
||||
pad_top = int(round(pad * 1.35))
|
||||
pad_bot = int(round(pad * 0.95))
|
||||
out_boxes.append((x1 - pad_x, y1 - pad_top, x2 + pad_x, y2 + pad_bot))
|
||||
|
||||
# 6) merge heavy overlaps
|
||||
merged = []
|
||||
@@ -499,19 +528,51 @@ def build_line_boxes_from_indices(indices, ocr):
|
||||
if not merged_into:
|
||||
merged.append(b)
|
||||
|
||||
# 7) min-size safety expansion (for tiny lines like "NO.")
|
||||
safe = []
|
||||
for (x1, y1, x2, y2) in merged:
|
||||
w = x2 - x1
|
||||
h = y2 - y1
|
||||
if w < 28:
|
||||
d = (28 - w) // 2 + 2
|
||||
x1 -= d
|
||||
x2 += d
|
||||
if h < 18:
|
||||
d = (18 - h) // 2 + 2
|
||||
y1 -= d
|
||||
y2 += d
|
||||
safe.append((x1, y1, x2, y2))
|
||||
merged = safe
|
||||
|
||||
# clamp bounds
|
||||
if image_shape is not None:
|
||||
ih, iw = image_shape[:2]
|
||||
clamped = []
|
||||
for b in merged:
|
||||
x1 = max(0, int(b[0]))
|
||||
y1 = max(0, int(b[1]))
|
||||
x2 = min(iw - 1, int(b[2]))
|
||||
y2 = min(ih - 1, int(b[3]))
|
||||
if x2 > x1 and y2 > y1:
|
||||
clamped.append((x1, y1, x2, y2))
|
||||
merged = clamped
|
||||
else:
|
||||
merged = [(int(b[0]), int(b[1]), int(b[2]), int(b[3])) for b in merged]
|
||||
|
||||
merged.sort(key=lambda z: (z[1], z[0]))
|
||||
return merged
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────
|
||||
# GROUPING
|
||||
# ─────────────────────────────────────────────
|
||||
# ============================================================
|
||||
# GROUP TOKENS TO BUBBLES
|
||||
# ============================================================
|
||||
def auto_gap(image_path, base=18, ref_w=750):
    """Scale the grouping gap *base* by the image width relative to *ref_w*.

    The image is loaded with ``cv2.imread``; if that returns ``None`` the
    unscaled *base* is returned as-is.
    """
    page = cv2.imread(image_path)
    if page is not None:
        width_px = page.shape[1]
        return base * (width_px / ref_w)
    return base
|
||||
|
||||
|
||||
def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=3):
|
||||
n = len(ocr)
|
||||
if n == 0:
|
||||
@@ -575,8 +636,8 @@ def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=3):
|
||||
x1, y1, x2, y2 = ub
|
||||
x1 = max(0, x1 - bbox_padding)
|
||||
y1 = max(0, y1 - bbox_padding)
|
||||
x2 = min(iw, x2 + bbox_padding)
|
||||
y2 = min(ih, y2 + bbox_padding)
|
||||
x2 = min(iw - 1, x2 + bbox_padding)
|
||||
y2 = min(ih - 1, y2 + bbox_padding)
|
||||
|
||||
bubbles[bid] = lines
|
||||
bubble_boxes[bid] = (x1, y1, x2, y2)
|
||||
@@ -586,40 +647,40 @@ def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=3):
|
||||
return bubbles, bubble_boxes, bubble_quads, bubble_indices
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────
|
||||
# DEBUG
|
||||
# ─────────────────────────────────────────────
|
||||
# ============================================================
|
||||
# DEBUG IMAGE
|
||||
# ============================================================
|
||||
def save_debug_clusters(image_path, ocr, bubble_boxes, bubble_indices, out_path="debug_clusters.png"):
    """Write a debug image showing OCR tokens, bubble boxes and line boxes.

    Superseded statements are removed: each bubble label is drawn once, with
    its baseline kept at y >= 15, and line boxes are computed once with the
    image shape passed through.

    Args:
        image_path: Image to load with ``cv2.imread``; if it cannot be read
            the function returns without writing anything.
        ocr: Sequence of entries that unpack to (quad, text, confidence).
        bubble_boxes: Mapping of bubble id to an (x1, y1, x2, y2) box.
        bubble_indices: Mapping of bubble id to indices into *ocr*.
        out_path: Destination passed to ``cv2.imwrite``.
    """
    img = cv2.imread(image_path)
    if img is None:
        return

    # OCR token quads (gray)
    for bbox, _txt, _conf in ocr:
        pts = np.array(bbox, dtype=np.int32)
        cv2.polylines(img, [pts], True, (180, 180, 180), 1)

    # bubble boxes (green) + line boxes (yellow)
    for bid, bb in bubble_boxes.items():
        x1, y1, x2, y2 = bb
        cv2.rectangle(img, (x1, y1), (x2, y2), (0, 220, 0), 2)
        # max(15, ...) keeps the label baseline at least 15 px from the top
        cv2.putText(
            img, f"BOX#{bid}", (x1 + 2, max(15, y1 + 16)),
            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 220, 0), 2
        )

        idxs = bubble_indices.get(bid, [])
        line_boxes = build_line_boxes_from_indices(idxs, ocr, image_shape=img.shape)
        for lb in line_boxes:
            lx1, ly1, lx2, ly2 = lb
            # clamp to the image so rectangles are drawn with in-range ints
            lx1 = max(0, int(lx1))
            ly1 = max(0, int(ly1))
            lx2 = min(img.shape[1] - 1, int(lx2))
            ly2 = min(img.shape[0] - 1, int(ly2))
            cv2.rectangle(img, (lx1, ly1), (lx2, ly2), (0, 255, 255), 3)

    cv2.imwrite(out_path, img)
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────
|
||||
# ============================================================
|
||||
# EXPORT
|
||||
# ─────────────────────────────────────────────
|
||||
# ============================================================
|
||||
def estimate_reading_order(bbox_dict, mode="ltr"):
|
||||
items = []
|
||||
for bid, (x1, y1, x2, y2) in bbox_dict.items():
|
||||
@@ -627,7 +688,7 @@ def estimate_reading_order(bbox_dict, mode="ltr"):
|
||||
cy = (y1 + y2) / 2.0
|
||||
items.append((bid, cx, cy))
|
||||
|
||||
items.sort(key=lambda t: t[2]) # top to bottom
|
||||
items.sort(key=lambda t: t[2]) # top -> bottom
|
||||
|
||||
rows = []
|
||||
tol = 90
|
||||
@@ -650,9 +711,9 @@ def estimate_reading_order(bbox_dict, mode="ltr"):
|
||||
|
||||
return {bid: i + 1 for i, bid in enumerate(order)}
|
||||
|
||||
|
||||
def export_bubbles(filepath, bbox_dict, quads_dict, indices_dict, ocr, reading_map, image_shape):
|
||||
out = {}
|
||||
|
||||
for bid, bb in bbox_dict.items():
|
||||
x1, y1, x2, y2 = bb
|
||||
quads = quads_dict.get(bid, [])
|
||||
@@ -661,7 +722,7 @@ def export_bubbles(filepath, bbox_dict, quads_dict, indices_dict, ocr, reading_m
|
||||
qboxes = [quad_bbox(q) for q in quads]
|
||||
text_union = boxes_union_xyxy(qboxes)
|
||||
|
||||
line_boxes_xyxy = build_line_boxes_from_indices(idxs, ocr)
|
||||
line_boxes_xyxy = build_line_boxes_from_indices(idxs, ocr, image_shape=image_shape)
|
||||
line_union_xyxy = boxes_union_xyxy(line_boxes_xyxy)
|
||||
line_union_area = bbox_area_xyxy(line_union_xyxy)
|
||||
|
||||
@@ -676,7 +737,6 @@ def export_bubbles(filepath, bbox_dict, quads_dict, indices_dict, ocr, reading_m
|
||||
[[int(p[0]), int(p[1])] for p in q] for q in quads
|
||||
],
|
||||
"text_bbox": xyxy_to_xywh(text_union),
|
||||
|
||||
"line_bboxes": [xyxy_to_xywh(lb) for lb in line_boxes_xyxy],
|
||||
"line_union_bbox": xyxy_to_xywh(line_union_xyxy) if line_union_xyxy else None,
|
||||
"line_union_area": int(line_union_area),
|
||||
@@ -686,9 +746,9 @@ def export_bubbles(filepath, bbox_dict, quads_dict, indices_dict, ocr, reading_m
|
||||
json.dump(out, f, indent=2, ensure_ascii=False)
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────
|
||||
# MAIN
|
||||
# ─────────────────────────────────────────────
|
||||
# ============================================================
|
||||
# MAIN PIPELINE
|
||||
# ============================================================
|
||||
def translate_manga_text(
|
||||
image_path,
|
||||
source_lang="en",
|
||||
@@ -711,7 +771,14 @@ def translate_manga_text(
|
||||
resolved_gap = auto_gap(image_path) if gap_px == "auto" else float(gap_px)
|
||||
|
||||
print("Loading OCR...")
|
||||
ocr_lang_list = ["en", "es"] if source_lang == "ca" else [source_lang]
|
||||
# Catalan often OCRs better with es+en in manga pages
|
||||
if source_lang == "ca":
|
||||
ocr_lang_list = ["es", "en"]
|
||||
elif source_lang == "en":
|
||||
ocr_lang_list = ["en", "es"]
|
||||
else:
|
||||
ocr_lang_list = [source_lang]
|
||||
|
||||
reader = easyocr.Reader(ocr_lang_list)
|
||||
|
||||
print("Running OCR...")
|
||||
@@ -742,7 +809,7 @@ def translate_manga_text(
|
||||
skipped += 1
|
||||
continue
|
||||
|
||||
# reduce false positives in very top strip
|
||||
# reduce top-strip false positives
|
||||
if qb[1] < int(ih * TOP_BAND_RATIO):
|
||||
if conf < 0.70 and len(t) >= 5:
|
||||
skipped += 1
|
||||
@@ -770,20 +837,18 @@ def translate_manga_text(
|
||||
|
||||
translator = GoogleTranslator(source=source_lang, target=target_lang)
|
||||
|
||||
# robust bubble text cleanup
|
||||
clean_lines = {}
|
||||
for bid, lines in bubbles.items():
|
||||
base_txt = normalize_text(" ".join(lines))
|
||||
base_sc = ocr_candidate_score(base_txt)
|
||||
|
||||
# only robust reread on low quality
|
||||
if base_sc < quality_threshold:
|
||||
rr_txt, rr_sc = reread_crop_robust(
|
||||
image,
|
||||
bubble_boxes[bid],
|
||||
reader,
|
||||
upscale=3.0,
|
||||
pad=22
|
||||
pad=24
|
||||
)
|
||||
if rr_txt and rr_sc > base_sc + 0.06:
|
||||
txt = rr_txt
|
||||
@@ -792,7 +857,12 @@ def translate_manga_text(
|
||||
else:
|
||||
txt = base_txt
|
||||
|
||||
clean_lines[bid] = apply_glossary(txt)
|
||||
# tiny targeted corrections for common OCR confusions
|
||||
txt = txt.replace(" BOMPORTA", " IMPORTA")
|
||||
txt = txt.replace(" TESTO ", " ESTO ")
|
||||
txt = txt.replace(" MIVERDAD", " MI VERDAD")
|
||||
|
||||
clean_lines[bid] = apply_glossary(normalize_text(txt))
|
||||
|
||||
reading_map = estimate_reading_order(bubble_boxes, mode=reading_mode)
|
||||
|
||||
@@ -822,6 +892,7 @@ def translate_manga_text(
|
||||
out_lines.append(
|
||||
f"#{bid}|{reading_map.get(bid,bid)}|{src_u}|{tgt}|{','.join(flags) if flags else '-'}"
|
||||
)
|
||||
|
||||
print(
|
||||
f"#{bid:<7} {reading_map.get(bid,bid):<6} "
|
||||
f"{src_u[:50]:<50} {tgt[:50]:<50} {','.join(flags) if flags else '-'}"
|
||||
@@ -851,10 +922,13 @@ def translate_manga_text(
|
||||
print("Saved: debug_clusters.png")
|
||||
|
||||
|
||||
# ============================================================
|
||||
# ENTRYPOINT
|
||||
# ============================================================
|
||||
if __name__ == "__main__":
|
||||
translate_manga_text(
|
||||
image_path="001-page.png",
|
||||
source_lang="it",
|
||||
image_path="004-page.png",
|
||||
source_lang="es",
|
||||
target_lang="ca",
|
||||
confidence_threshold=0.12,
|
||||
min_text_length=1,
|
||||
|
||||
Reference in New Issue
Block a user