Added hybrid

This commit is contained in:
Guillem Hernandez Sola
2026-04-15 16:22:35 +02:00
parent eadc28154a
commit 5ef8c39f69
2 changed files with 212 additions and 81 deletions

2
.gitignore vendored
View File

@@ -7,6 +7,8 @@
.AppleDouble .AppleDouble
.LSOverride .LSOverride
.venv311/
# Icon must end with two \r # Icon must end with two \r
Icon Icon

View File

@@ -6,9 +6,13 @@ import re
import json import json
import cv2 import cv2
import numpy as np import numpy as np
import easyocr
from deep_translator import GoogleTranslator from deep_translator import GoogleTranslator
# OCR engines
import easyocr
from paddleocr import PaddleOCR
# ============================================================ # ============================================================
# CONFIG # CONFIG
@@ -35,8 +39,8 @@ TITLE_PATTERNS = [
NOISE_PATTERNS = [ NOISE_PATTERNS = [
r"^[^a-zA-Z0-9\?!.¡¿]+$", r"^[^a-zA-Z0-9\?!.¡¿]+$",
r"^BOX[#\s0-9A-Z\-]*$", # debug labels r"^BOX[#\s0-9A-Z\-]*$",
r"^[0-9]{1,3}\s*[Xx]\s*[0-9]{1,3}$", # e.g. 98x12 r"^[0-9]{1,3}\s*[Xx]\s*[0-9]{1,3}$",
] ]
TOP_BAND_RATIO = 0.08 TOP_BAND_RATIO = 0.08
@@ -56,8 +60,6 @@ def normalize_text(text: str) -> str:
t = re.sub(r"\(\s+", "(", t) t = re.sub(r"\(\s+", "(", t)
t = re.sub(r"\s+\)", ")", t) t = re.sub(r"\s+\)", ")", t)
t = re.sub(r"\.{4,}", "...", t) t = re.sub(r"\.{4,}", "...", t)
t = t.replace("IQUE", "¡QUE")
t = t.replace("IQUIEN", "¿QUIEN")
return t.strip() return t.strip()
@@ -91,11 +93,9 @@ def is_noise_text(text: str) -> bool:
if any(re.fullmatch(p, t) for p in NOISE_PATTERNS): if any(re.fullmatch(p, t) for p in NOISE_PATTERNS):
return True return True
# very short isolated junk
if len(t) <= 2 and not re.search(r"[A-Z0-9]", t): if len(t) <= 2 and not re.search(r"[A-Z0-9]", t):
return True return True
# mostly-symbol garbage
symbol_ratio = sum(1 for c in t if not c.isalnum() and not c.isspace()) / max(1, len(t)) symbol_ratio = sum(1 for c in t if not c.isalnum() and not c.isspace()) / max(1, len(t))
if len(t) <= 6 and symbol_ratio > 0.60: if len(t) <= 6 and symbol_ratio > 0.60:
return True return True
@@ -104,7 +104,7 @@ def is_noise_text(text: str) -> bool:
# ============================================================ # ============================================================
# GEOMETRY # GEOMETRY HELPERS
# ============================================================ # ============================================================
def quad_bbox(quad): def quad_bbox(quad):
xs = [p[0] for p in quad] xs = [p[0] for p in quad]
@@ -151,7 +151,7 @@ def overlap_or_near(a, b, gap=0):
# ============================================================ # ============================================================
# OCR QUALITY SCORING # QUALITY
# ============================================================ # ============================================================
def ocr_candidate_score(text: str) -> float: def ocr_candidate_score(text: str) -> float:
if not text: if not text:
@@ -179,7 +179,180 @@ def ocr_candidate_score(text: str) -> float:
# ============================================================ # ============================================================
# OCR MULTI-PASS REREAD # OCR ENGINE WRAPPER (PADDLE + EASYOCR HYBRID)
# ============================================================
class HybridOCR:
    """Hybrid OCR engine: PaddleOCR primary, EasyOCR merged in as fallback.

    Both engines' detections are normalized to EasyOCR-style tuples
    ``(quad, text, conf)`` where ``quad`` is a 4-point polygon.  Overlapping
    boxes (IoU > ``IOU_THRESHOLD``) are treated as duplicates: the Paddle
    entry wins unless EasyOCR's confidence beats it by ``CONF_MARGIN``.
    Non-overlapping EasyOCR boxes are appended.
    """

    # Merge tuning, shared by read_full_image and read_array_with_both.
    IOU_THRESHOLD = 0.55   # above this the two boxes are the same detection
    CONF_MARGIN = 0.20     # EasyOCR must beat Paddle by this much to replace it

    def __init__(self, source_lang="en", use_gpu=False):
        """Load both OCR engines for *source_lang*.

        Args:
            source_lang: ISO-ish language code of the page text.
            use_gpu: forwarded to both engines.
        """
        self.source_lang = source_lang

        # Paddle ships one model per script family; map the language code
        # onto it.  For manga EN/ES pages the latin model is robust.
        if source_lang in ("en", "es", "ca", "fr", "de", "it", "pt"):
            paddle_lang = "latin"
        elif source_lang == "ja":
            paddle_lang = "japan"
        elif source_lang == "ko":
            paddle_lang = "korean"
        elif source_lang in ("ch", "zh", "zh-cn", "zh-tw"):
            paddle_lang = "ch"
        else:
            paddle_lang = "latin"

        # EasyOCR takes a language list; pair Latin languages with a close
        # sibling to improve recall on mixed-language pages.
        if source_lang == "ca":
            easy_langs = ["es", "en"]
        elif source_lang == "en":
            easy_langs = ["en", "es"]
        elif source_lang == "es":
            easy_langs = ["es", "en"]
        else:
            easy_langs = [source_lang]

        self.paddle = PaddleOCR(
            use_angle_cls=True,
            lang=paddle_lang,
            use_gpu=use_gpu,
            show_log=False,
        )
        self.easy = easyocr.Reader(easy_langs, gpu=use_gpu)

    @staticmethod
    def _paddle_to_std(result):
        """Convert a Paddle result to EasyOCR-like ``[(quad, text, conf), ...]``.

        ``paddle.ocr(...)`` returns one list per image whose lines look like
        ``[[[x, y], ...4 pts], (text, conf)]``; depending on the Paddle
        version the list may be wrapped once more, or be ``[None]`` when
        nothing was detected.  Malformed lines are skipped.
        """
        out = []
        if not result:
            return out
        blocks = result if isinstance(result, list) else [result]
        for blk in blocks:
            if blk is None:
                continue
            if len(blk) == 0:
                continue
            # Detect which nesting level actually holds the detection lines.
            if isinstance(blk[0], list) and len(blk[0]) > 0 and isinstance(blk[0][0], (list, tuple)) and len(blk[0]) == 2:
                lines = blk
            elif isinstance(blk[0], (list, tuple)) and len(blk[0]) >= 2:
                lines = blk
            else:
                # Possibly wrapped one level deeper.
                if len(blk) == 1 and isinstance(blk[0], list):
                    lines = blk[0]
                else:
                    lines = []
            for ln in lines:
                try:
                    pts, rec = ln
                    txt, conf = rec[0], float(rec[1])
                    quad = [[float(p[0]), float(p[1])] for p in pts]
                    out.append((quad, txt, conf))
                except Exception:
                    # Skip lines that do not match the expected shape.
                    continue
        return out

    @staticmethod
    def _merge_detections(paddle_det, easy_det):
        """Merge EasyOCR detections into the Paddle set (see class docstring).

        Returns a new list; neither input list is mutated.
        """
        merged = list(paddle_det)
        for eq, et, ec in easy_det:
            ebox = quad_bbox(eq)
            keep = True
            for pb in paddle_det:
                pq, _pt, pc = pb
                pbox = quad_bbox(pq)
                # Axis-aligned intersection of the two bounding boxes.
                ix1 = max(ebox[0], pbox[0]); iy1 = max(ebox[1], pbox[1])
                ix2 = min(ebox[2], pbox[2]); iy2 = min(ebox[3], pbox[3])
                inter = max(0, ix2 - ix1) * max(0, iy2 - iy1)
                a1 = max(1, (ebox[2] - ebox[0]) * (ebox[3] - ebox[1]))
                a2 = max(1, (pbox[2] - pbox[0]) * (pbox[3] - pbox[1]))
                union = a1 + a2 - inter
                iou = inter / float(union) if union > 0 else 0.0
                if iou > HybridOCR.IOU_THRESHOLD:
                    # Duplicate: keep Paddle unless Easy is much more confident.
                    if float(ec) > float(pc) + HybridOCR.CONF_MARGIN:
                        try:
                            merged.remove(pb)
                        except ValueError:
                            pass  # already replaced by an earlier Easy box
                        merged.append((eq, et, float(ec)))
                    keep = False
                    break
            if keep:
                merged.append((eq, et, float(ec)))
        return merged

    def read_full_image(self, image_path):
        """OCR a full page from *image_path*.

        Primary: Paddle.  Fallback merge: EasyOCR.
        Returns merged standardized detections ``[(quad, text, conf), ...]``.
        """
        paddle_det = self._paddle_to_std(self.paddle.ocr(image_path, cls=True))
        easy_det = self.easy.readtext(image_path, paragraph=False)
        return self._merge_detections(paddle_det, easy_det)

    def read_array_with_both(self, arr_gray_or_bgr):
        """OCR an in-memory image array (used in the robust reread pass).

        Both engines are fed a file path here, so the array is written to a
        uniquely-named temporary PNG (a fixed shared name would let
        concurrent calls clobber each other's file) and removed afterwards.
        Returns merged detections in the standardized format.
        """
        import tempfile
        fd, tmp = tempfile.mkstemp(suffix=".png", prefix="_tmp_ocr_hybrid_")
        os.close(fd)
        try:
            cv2.imwrite(tmp, arr_gray_or_bgr)
            paddle_det = self._paddle_to_std(self.paddle.ocr(tmp, cls=True))
            easy_det = self.easy.readtext(tmp, paragraph=False)
            return self._merge_detections(paddle_det, easy_det)
        finally:
            if os.path.exists(tmp):
                os.remove(tmp)
# ============================================================
# PREPROCESS + ROBUST REREAD
# ============================================================ # ============================================================
def preprocess_variant(crop_bgr, mode): def preprocess_variant(crop_bgr, mode):
gray = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2GRAY) gray = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2GRAY)
@@ -212,8 +385,7 @@ def rotate_image_keep_bounds(img, angle_deg):
h, w = img.shape[:2] h, w = img.shape[:2]
c = (w / 2, h / 2) c = (w / 2, h / 2)
M = cv2.getRotationMatrix2D(c, angle_deg, 1.0) M = cv2.getRotationMatrix2D(c, angle_deg, 1.0)
cos = abs(M[0, 0]) cos = abs(M[0, 0]); sin = abs(M[0, 1])
sin = abs(M[0, 1])
new_w = int((h * sin) + (w * cos)) new_w = int((h * sin) + (w * cos))
new_h = int((h * cos) + (w * sin)) new_h = int((h * cos) + (w * sin))
@@ -224,16 +396,6 @@ def rotate_image_keep_bounds(img, angle_deg):
return cv2.warpAffine(img, M, (new_w, new_h), flags=cv2.INTER_CUBIC, borderValue=255) return cv2.warpAffine(img, M, (new_w, new_h), flags=cv2.INTER_CUBIC, borderValue=255)
def run_ocr_on_array(reader, arr):
    """Run an EasyOCR reader on an in-memory image array.

    The reader's ``readtext`` is fed a file path here, so *arr* is written
    to a temporary PNG first; ``paragraph=False`` keeps per-token
    detections ``(quad, text, conf)``.

    NOTE(review): the fixed temp filename means overlapping/concurrent
    calls would clobber each other's file — confirm single-threaded use.
    """
    tmp = "_tmp_ocr.png"
    cv2.imwrite(tmp, arr)
    try:
        return reader.readtext(tmp, paragraph=False)
    finally:
        # Best-effort cleanup even if OCR raises.
        if os.path.exists(tmp):
            os.remove(tmp)
def rebuild_text_from_ocr_result(res): def rebuild_text_from_ocr_result(res):
if not res: if not res:
return "" return ""
@@ -257,7 +419,7 @@ def rebuild_text_from_ocr_result(res):
med_h = float(np.median([x[5] for x in norm])) med_h = float(np.median([x[5] for x in norm]))
row_tol = max(6.0, med_h * 0.75) row_tol = max(6.0, med_h * 0.75)
norm.sort(key=lambda z: z[4]) # y-center norm.sort(key=lambda z: z[4]) # y
rows = [] rows = []
for it in norm: for it in norm:
placed = False placed = False
@@ -273,7 +435,7 @@ def rebuild_text_from_ocr_result(res):
rows.sort(key=lambda r: r["yc"]) rows.sort(key=lambda r: r["yc"])
lines = [] lines = []
for r in rows: for r in rows:
mem = sorted(r["m"], key=lambda z: z[3]) # x-center mem = sorted(r["m"], key=lambda z: z[3]) # x
line = normalize_text(" ".join(x[1] for x in mem)) line = normalize_text(" ".join(x[1] for x in mem))
if line: if line:
lines.append(line) lines.append(line)
@@ -281,7 +443,7 @@ def rebuild_text_from_ocr_result(res):
return normalize_text(" ".join(lines)) return normalize_text(" ".join(lines))
def reread_crop_robust(image, bbox, reader, upscale=3.0, pad=24): def reread_crop_robust(image, bbox, hybrid_ocr: HybridOCR, upscale=3.0, pad=24):
ih, iw = image.shape[:2] ih, iw = image.shape[:2]
x1, y1, x2, y2 = bbox x1, y1, x2, y2 = bbox
x1 = max(0, int(x1 - pad)) x1 = max(0, int(x1 - pad))
@@ -313,12 +475,7 @@ def reread_crop_robust(image, bbox, reader, upscale=3.0, pad=24):
for a in angles: for a in angles:
rot = rotate_image_keep_bounds(proc3, a) rot = rotate_image_keep_bounds(proc3, a)
if len(rot.shape) == 3: res = hybrid_ocr.read_array_with_both(rot)
rot_in = cv2.cvtColor(rot, cv2.COLOR_BGR2GRAY)
else:
rot_in = rot
res = run_ocr_on_array(reader, rot_in)
txt = rebuild_text_from_ocr_result(res) txt = rebuild_text_from_ocr_result(res)
sc = ocr_candidate_score(txt) sc = ocr_candidate_score(txt)
@@ -331,7 +488,7 @@ def reread_crop_robust(image, bbox, reader, upscale=3.0, pad=24):
# ============================================================ # ============================================================
# LINE REBUILD + LINE BOXES (YELLOW) # LINE REBUILD + YELLOW BOXES
# ============================================================ # ============================================================
def build_lines_from_indices(indices, ocr): def build_lines_from_indices(indices, ocr):
if not indices: if not indices:
@@ -348,7 +505,7 @@ def build_lines_from_indices(indices, ocr):
med_h = float(np.median([it[4] for it in items])) if items else 10.0 med_h = float(np.median([it[4] for it in items])) if items else 10.0
row_tol = max(6.0, med_h * 0.75) row_tol = max(6.0, med_h * 0.75)
items.sort(key=lambda x: x[3]) # y items.sort(key=lambda x: x[3])
rows = [] rows = []
for it in items: for it in items:
i, b, xc, yc, h = it i, b, xc, yc, h = it
@@ -365,7 +522,7 @@ def build_lines_from_indices(indices, ocr):
rows.sort(key=lambda r: r["yc"]) rows.sort(key=lambda r: r["yc"])
lines = [] lines = []
for r in rows: for r in rows:
mem = sorted(r["m"], key=lambda z: z[2]) # x mem = sorted(r["m"], key=lambda z: z[2])
txt = normalize_text(" ".join(ocr[i][1] for i, _, _, _ in mem)) txt = normalize_text(" ".join(ocr[i][1] for i, _, _, _ in mem))
if txt and not is_noise_text(txt): if txt and not is_noise_text(txt):
lines.append(txt) lines.append(txt)
@@ -374,15 +531,6 @@ def build_lines_from_indices(indices, ocr):
def build_line_boxes_from_indices(indices, ocr, image_shape=None): def build_line_boxes_from_indices(indices, ocr, image_shape=None):
"""
Improved yellow box builder:
- row grouping
- x-gap chunking
- punctuation attachment
- token coverage guarantee
- larger/asymmetric padding (fix clipped chars)
- min-size safety expansion
"""
if not indices: if not indices:
return [] return []
@@ -392,6 +540,7 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None):
txt = normalize_text(ocr[i][1]) txt = normalize_text(ocr[i][1])
if is_noise_text(txt): if is_noise_text(txt):
continue continue
xc = (b[0] + b[2]) / 2.0 xc = (b[0] + b[2]) / 2.0
yc = (b[1] + b[3]) / 2.0 yc = (b[1] + b[3]) / 2.0
w = max(1.0, b[2] - b[0]) w = max(1.0, b[2] - b[0])
@@ -408,7 +557,7 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None):
med_h = float(np.median([it["h"] for it in items])) med_h = float(np.median([it["h"] for it in items]))
row_tol = max(6.0, med_h * 0.90) row_tol = max(6.0, med_h * 0.90)
gap_x_tol = max(8.0, med_h * 1.25) gap_x_tol = max(8.0, med_h * 1.25)
pad = max(3, int(round(med_h * 0.22))) # was 0.12 pad = max(3, int(round(med_h * 0.22)))
def is_punct_like(t): def is_punct_like(t):
raw = (t or "").strip() raw = (t or "").strip()
@@ -417,7 +566,6 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None):
punct_ratio = sum(1 for c in raw if not c.isalnum()) / max(1, len(raw)) punct_ratio = sum(1 for c in raw if not c.isalnum()) / max(1, len(raw))
return punct_ratio >= 0.5 or len(raw) <= 2 return punct_ratio >= 0.5 or len(raw) <= 2
# 1) group into rows
items_sorted = sorted(items, key=lambda x: x["yc"]) items_sorted = sorted(items, key=lambda x: x["yc"])
rows = [] rows = []
for it in items_sorted: for it in items_sorted:
@@ -436,7 +584,6 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None):
for r in rows: for r in rows:
mem = sorted(r["m"], key=lambda z: z["xc"]) mem = sorted(r["m"], key=lambda z: z["xc"])
normal = [t for t in mem if not is_punct_like(t["txt"])] normal = [t for t in mem if not is_punct_like(t["txt"])]
punct = [t for t in mem if is_punct_like(t["txt"])] punct = [t for t in mem if is_punct_like(t["txt"])]
@@ -444,7 +591,6 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None):
normal = mem normal = mem
punct = [] punct = []
# 2) chunk by x-gap
chunks = [] chunks = []
cur = [normal[0]] cur = [normal[0]]
for t in normal[1:]: for t in normal[1:]:
@@ -458,7 +604,6 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None):
cur = [t] cur = [t]
chunks.append(cur) chunks.append(cur)
# 3) attach punctuation/special tokens with larger near-gap
for p in punct: for p in punct:
pb = p["b"] pb = p["b"]
pxc, pyc = p["xc"], p["yc"] pxc, pyc = p["xc"], p["yc"]
@@ -486,7 +631,6 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None):
else: else:
chunks.append([p]) chunks.append([p])
# 4) emit chunk boxes with asymmetric padding
for ch in chunks: for ch in chunks:
ub = boxes_union_xyxy([x["b"] for x in ch]) ub = boxes_union_xyxy([x["b"] for x in ch])
if ub: if ub:
@@ -496,7 +640,6 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None):
pad_bot = int(round(pad * 0.95)) pad_bot = int(round(pad * 0.95))
out_boxes.append((x1 - pad_x, y1 - pad_top, x2 + pad_x, y2 + pad_bot)) out_boxes.append((x1 - pad_x, y1 - pad_top, x2 + pad_x, y2 + pad_bot))
# 5) guarantee every token is covered
token_boxes = [it["b"] for it in items] token_boxes = [it["b"] for it in items]
def inside(tb, lb): def inside(tb, lb):
@@ -510,7 +653,6 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None):
pad_bot = int(round(pad * 0.95)) pad_bot = int(round(pad * 0.95))
out_boxes.append((x1 - pad_x, y1 - pad_top, x2 + pad_x, y2 + pad_bot)) out_boxes.append((x1 - pad_x, y1 - pad_top, x2 + pad_x, y2 + pad_bot))
# 6) merge heavy overlaps
merged = [] merged = []
for b in out_boxes: for b in out_boxes:
merged_into = False merged_into = False
@@ -528,23 +670,19 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None):
if not merged_into: if not merged_into:
merged.append(b) merged.append(b)
# 7) min-size safety expansion (for tiny lines like "NO.")
safe = [] safe = []
for (x1, y1, x2, y2) in merged: for (x1, y1, x2, y2) in merged:
w = x2 - x1 w = x2 - x1
h = y2 - y1 h = y2 - y1
if w < 28: if w < 28:
d = (28 - w) // 2 + 2 d = (28 - w) // 2 + 2
x1 -= d x1 -= d; x2 += d
x2 += d
if h < 18: if h < 18:
d = (18 - h) // 2 + 2 d = (18 - h) // 2 + 2
y1 -= d y1 -= d; y2 += d
y2 += d
safe.append((x1, y1, x2, y2)) safe.append((x1, y1, x2, y2))
merged = safe merged = safe
# clamp bounds
if image_shape is not None: if image_shape is not None:
ih, iw = image_shape[:2] ih, iw = image_shape[:2]
clamped = [] clamped = []
@@ -564,7 +702,7 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None):
# ============================================================ # ============================================================
# GROUP TOKENS TO BUBBLES # GROUPING
# ============================================================ # ============================================================
def auto_gap(image_path, base=18, ref_w=750): def auto_gap(image_path, base=18, ref_w=750):
img = cv2.imread(image_path) img = cv2.imread(image_path)
@@ -648,19 +786,17 @@ def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=3):
# ============================================================ # ============================================================
# DEBUG IMAGE # DEBUG
# ============================================================ # ============================================================
def save_debug_clusters(image_path, ocr, bubble_boxes, bubble_indices, out_path="debug_clusters.png"): def save_debug_clusters(image_path, ocr, bubble_boxes, bubble_indices, out_path="debug_clusters.png"):
img = cv2.imread(image_path) img = cv2.imread(image_path)
if img is None: if img is None:
return return
# OCR token quads (gray)
for bbox, txt, conf in ocr: for bbox, txt, conf in ocr:
pts = np.array(bbox, dtype=np.int32) pts = np.array(bbox, dtype=np.int32)
cv2.polylines(img, [pts], True, (180, 180, 180), 1) cv2.polylines(img, [pts], True, (180, 180, 180), 1)
# bubble boxes (green) + line boxes (yellow)
for bid, bb in bubble_boxes.items(): for bid, bb in bubble_boxes.items():
x1, y1, x2, y2 = bb x1, y1, x2, y2 = bb
cv2.rectangle(img, (x1, y1), (x2, y2), (0, 220, 0), 2) cv2.rectangle(img, (x1, y1), (x2, y2), (0, 220, 0), 2)
@@ -688,7 +824,7 @@ def estimate_reading_order(bbox_dict, mode="ltr"):
cy = (y1 + y2) / 2.0 cy = (y1 + y2) / 2.0
items.append((bid, cx, cy)) items.append((bid, cx, cy))
items.sort(key=lambda t: t[2]) # top -> bottom items.sort(key=lambda t: t[2])
rows = [] rows = []
tol = 90 tol = 90
@@ -714,6 +850,7 @@ def estimate_reading_order(bbox_dict, mode="ltr"):
def export_bubbles(filepath, bbox_dict, quads_dict, indices_dict, ocr, reading_map, image_shape): def export_bubbles(filepath, bbox_dict, quads_dict, indices_dict, ocr, reading_map, image_shape):
out = {} out = {}
for bid, bb in bbox_dict.items(): for bid, bb in bbox_dict.items():
x1, y1, x2, y2 = bb x1, y1, x2, y2 = bb
quads = quads_dict.get(bid, []) quads = quads_dict.get(bid, [])
@@ -761,7 +898,8 @@ def translate_manga_text(
export_to_file="output.txt", export_to_file="output.txt",
export_bubbles_to="bubbles.json", export_bubbles_to="bubbles.json",
reading_mode="ltr", reading_mode="ltr",
debug=True debug=True,
use_gpu=False
): ):
image = cv2.imread(image_path) image = cv2.imread(image_path)
if image is None: if image is None:
@@ -770,20 +908,12 @@ def translate_manga_text(
resolved_gap = auto_gap(image_path) if gap_px == "auto" else float(gap_px) resolved_gap = auto_gap(image_path) if gap_px == "auto" else float(gap_px)
print("Loading OCR...") print("Loading Hybrid OCR (Paddle + EasyOCR)...")
# Catalan often OCRs better with es+en in manga pages hybrid = HybridOCR(source_lang=source_lang, use_gpu=use_gpu)
if source_lang == "ca":
ocr_lang_list = ["es", "en"]
elif source_lang == "en":
ocr_lang_list = ["en", "es"]
else:
ocr_lang_list = [source_lang]
reader = easyocr.Reader(ocr_lang_list)
print("Running OCR...") print("Running OCR...")
raw = reader.readtext(image_path, paragraph=False) raw = hybrid.read_full_image(image_path)
print(f"Raw detections: {len(raw)}") print(f"Raw detections (merged): {len(raw)}")
filtered = [] filtered = []
skipped = 0 skipped = 0
@@ -809,7 +939,6 @@ def translate_manga_text(
skipped += 1 skipped += 1
continue continue
# reduce top-strip false positives
if qb[1] < int(ih * TOP_BAND_RATIO): if qb[1] < int(ih * TOP_BAND_RATIO):
if conf < 0.70 and len(t) >= 5: if conf < 0.70 and len(t) >= 5:
skipped += 1 skipped += 1
@@ -846,7 +975,7 @@ def translate_manga_text(
rr_txt, rr_sc = reread_crop_robust( rr_txt, rr_sc = reread_crop_robust(
image, image,
bubble_boxes[bid], bubble_boxes[bid],
reader, hybrid,
upscale=3.0, upscale=3.0,
pad=24 pad=24
) )
@@ -857,7 +986,6 @@ def translate_manga_text(
else: else:
txt = base_txt txt = base_txt
# tiny targeted corrections for common OCR confusions
txt = txt.replace(" BOMPORTA", " IMPORTA") txt = txt.replace(" BOMPORTA", " IMPORTA")
txt = txt.replace(" TESTO ", " ESTO ") txt = txt.replace(" TESTO ", " ESTO ")
txt = txt.replace(" MIVERDAD", " MI VERDAD") txt = txt.replace(" MIVERDAD", " MI VERDAD")
@@ -927,8 +1055,8 @@ def translate_manga_text(
# ============================================================ # ============================================================
if __name__ == "__main__": if __name__ == "__main__":
translate_manga_text( translate_manga_text(
image_path="004-page.png", image_path="001-page.png",
source_lang="es", source_lang="it",
target_lang="ca", target_lang="ca",
confidence_threshold=0.12, confidence_threshold=0.12,
min_text_length=1, min_text_length=1,
@@ -938,5 +1066,6 @@ if __name__ == "__main__":
export_to_file="output.txt", export_to_file="output.txt",
export_bubbles_to="bubbles.json", export_bubbles_to="bubbles.json",
reading_mode="ltr", reading_mode="ltr",
debug=True debug=True,
use_gpu=False
) )