Added hybrid
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -7,6 +7,8 @@
|
|||||||
.AppleDouble
|
.AppleDouble
|
||||||
.LSOverride
|
.LSOverride
|
||||||
|
|
||||||
|
.venv311/
|
||||||
|
|
||||||
# Icon must end with two \r
|
# Icon must end with two \r
|
||||||
Icon
|
Icon
|
||||||
|
|
||||||
|
|||||||
@@ -6,9 +6,13 @@ import re
|
|||||||
import json
|
import json
|
||||||
import cv2
|
import cv2
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import easyocr
|
|
||||||
from deep_translator import GoogleTranslator
|
from deep_translator import GoogleTranslator
|
||||||
|
|
||||||
|
# OCR engines
|
||||||
|
import easyocr
|
||||||
|
from paddleocr import PaddleOCR
|
||||||
|
|
||||||
|
|
||||||
# ============================================================
|
# ============================================================
|
||||||
# CONFIG
|
# CONFIG
|
||||||
@@ -35,8 +39,8 @@ TITLE_PATTERNS = [
|
|||||||
|
|
||||||
NOISE_PATTERNS = [
|
NOISE_PATTERNS = [
|
||||||
r"^[^a-zA-Z0-9\?!.¡¿]+$",
|
r"^[^a-zA-Z0-9\?!.¡¿]+$",
|
||||||
r"^BOX[#\s0-9A-Z\-]*$", # debug labels
|
r"^BOX[#\s0-9A-Z\-]*$",
|
||||||
r"^[0-9]{1,3}\s*[Xx]\s*[0-9]{1,3}$", # e.g. 98x12
|
r"^[0-9]{1,3}\s*[Xx]\s*[0-9]{1,3}$",
|
||||||
]
|
]
|
||||||
|
|
||||||
TOP_BAND_RATIO = 0.08
|
TOP_BAND_RATIO = 0.08
|
||||||
@@ -56,8 +60,6 @@ def normalize_text(text: str) -> str:
|
|||||||
t = re.sub(r"\(\s+", "(", t)
|
t = re.sub(r"\(\s+", "(", t)
|
||||||
t = re.sub(r"\s+\)", ")", t)
|
t = re.sub(r"\s+\)", ")", t)
|
||||||
t = re.sub(r"\.{4,}", "...", t)
|
t = re.sub(r"\.{4,}", "...", t)
|
||||||
t = t.replace("IQUE", "¡QUE")
|
|
||||||
t = t.replace("IQUIEN", "¿QUIEN")
|
|
||||||
return t.strip()
|
return t.strip()
|
||||||
|
|
||||||
|
|
||||||
@@ -91,11 +93,9 @@ def is_noise_text(text: str) -> bool:
|
|||||||
if any(re.fullmatch(p, t) for p in NOISE_PATTERNS):
|
if any(re.fullmatch(p, t) for p in NOISE_PATTERNS):
|
||||||
return True
|
return True
|
||||||
|
|
||||||
# very short isolated junk
|
|
||||||
if len(t) <= 2 and not re.search(r"[A-Z0-9]", t):
|
if len(t) <= 2 and not re.search(r"[A-Z0-9]", t):
|
||||||
return True
|
return True
|
||||||
|
|
||||||
# mostly-symbol garbage
|
|
||||||
symbol_ratio = sum(1 for c in t if not c.isalnum() and not c.isspace()) / max(1, len(t))
|
symbol_ratio = sum(1 for c in t if not c.isalnum() and not c.isspace()) / max(1, len(t))
|
||||||
if len(t) <= 6 and symbol_ratio > 0.60:
|
if len(t) <= 6 and symbol_ratio > 0.60:
|
||||||
return True
|
return True
|
||||||
@@ -104,7 +104,7 @@ def is_noise_text(text: str) -> bool:
|
|||||||
|
|
||||||
|
|
||||||
# ============================================================
|
# ============================================================
|
||||||
# GEOMETRY
|
# GEOMETRY HELPERS
|
||||||
# ============================================================
|
# ============================================================
|
||||||
def quad_bbox(quad):
|
def quad_bbox(quad):
|
||||||
xs = [p[0] for p in quad]
|
xs = [p[0] for p in quad]
|
||||||
@@ -151,7 +151,7 @@ def overlap_or_near(a, b, gap=0):
|
|||||||
|
|
||||||
|
|
||||||
# ============================================================
|
# ============================================================
|
||||||
# OCR QUALITY SCORING
|
# QUALITY
|
||||||
# ============================================================
|
# ============================================================
|
||||||
def ocr_candidate_score(text: str) -> float:
|
def ocr_candidate_score(text: str) -> float:
|
||||||
if not text:
|
if not text:
|
||||||
@@ -179,7 +179,180 @@ def ocr_candidate_score(text: str) -> float:
|
|||||||
|
|
||||||
|
|
||||||
# ============================================================
|
# ============================================================
|
||||||
# OCR MULTI-PASS REREAD
|
# OCR ENGINE WRAPPER (PADDLE + EASYOCR HYBRID)
|
||||||
|
# ============================================================
|
||||||
|
class HybridOCR:
    """Run PaddleOCR and EasyOCR on the same input and merge the detections.

    Every detection is a ``(quad, text, conf)`` tuple, the layout EasyOCR's
    ``readtext(..., paragraph=False)`` returns. Paddle detections are the
    primary source; an EasyOCR detection is added when it does not overlap
    any Paddle box, and it replaces the overlapping Paddle box only when its
    confidence is higher by more than ``_CONF_MARGIN``.
    """

    # Paddle model per source language; every language not listed here
    # (including en/es/ca/fr/de/it/pt) uses the "latin" model.
    _PADDLE_LANGS = {
        "ja": "japan",
        "ko": "korean",
        "ch": "ch",
        "zh": "ch",
        "zh-cn": "ch",
        "zh-tw": "ch",
    }

    # EasyOCR language lists; languages not listed are passed through as a
    # single-item list. The Chinese entries translate the codes accepted
    # above into EasyOCR's own model names (it has no "zh"/"ch" code).
    _EASY_LANGS = {
        "ca": ["es", "en"],
        "en": ["en", "es"],
        "es": ["es", "en"],
        "ch": ["ch_sim"],
        "zh": ["ch_sim"],
        "zh-cn": ["ch_sim"],
        "zh-tw": ["ch_tra"],
    }

    # An EasyOCR box "matches" a Paddle box above this IoU.
    _IOU_MATCH = 0.55
    # EasyOCR must beat Paddle's confidence by more than this to replace it.
    _CONF_MARGIN = 0.20

    def __init__(self, source_lang="en", use_gpu=False):
        """Create both OCR engines for ``source_lang``.

        Args:
            source_lang: Language code used to pick the Paddle model and the
                EasyOCR language list.
            use_gpu: Forwarded to both engines.
        """
        self.source_lang = source_lang

        paddle_lang = self._PADDLE_LANGS.get(source_lang, "latin")
        # Copy so the engine never receives the shared class-level list.
        easy_langs = list(self._EASY_LANGS.get(source_lang, [source_lang]))

        self.paddle = PaddleOCR(
            use_angle_cls=True,
            lang=paddle_lang,
            use_gpu=use_gpu,
            show_log=False
        )
        self.easy = easyocr.Reader(easy_langs, gpu=use_gpu)

    @staticmethod
    def _paddle_to_std(result):
        """Convert a Paddle result into ``[(quad, text, conf), ...]``.

        A Paddle text line is ``[points, (text, conf)]``. The result is
        walked depth-first, so lines are found whether they sit directly in
        ``result``, one level down (one list per image) or deeper; ``None``
        entries and anything that is not a list/tuple are ignored. Lines
        whose points or confidence cannot be converted to ``float`` are
        skipped.

        Args:
            result: Value returned by ``PaddleOCR.ocr`` (may be ``None``).

        Returns:
            List of ``(quad, text, conf)`` where ``quad`` is a list of
            ``[x, y]`` float pairs and ``conf`` is a float.
        """
        out = []

        def is_line(node):
            # A line is a pair whose second item is a (text, conf, ...) record.
            if not isinstance(node, (list, tuple)) or len(node) != 2:
                return False
            rec = node[1]
            return (
                isinstance(rec, (list, tuple))
                and len(rec) >= 2
                and isinstance(rec[0], str)
            )

        def walk(node):
            if is_line(node):
                pts, rec = node
                try:
                    conf = float(rec[1])
                    quad = [[float(p[0]), float(p[1])] for p in pts]
                except (TypeError, ValueError, IndexError, KeyError):
                    return  # malformed line: drop it, keep the rest
                out.append((quad, rec[0], conf))
                return
            if isinstance(node, (list, tuple)):
                for child in node:
                    walk(child)

        walk(result)
        return out

    @staticmethod
    def _bbox_iou(a, b):
        """Return the IoU of two ``(x1, y1, x2, y2)`` boxes.

        Each area is floored at 1 so degenerate boxes cannot produce a zero
        denominator.
        """
        inter_w = max(0, min(a[2], b[2]) - max(a[0], b[0]))
        inter_h = max(0, min(a[3], b[3]) - max(a[1], b[1]))
        inter = inter_w * inter_h
        area_a = max(1, (a[2] - a[0]) * (a[3] - a[1]))
        area_b = max(1, (b[2] - b[0]) * (b[3] - b[1]))
        union = area_a + area_b - inter
        return inter / float(union) if union > 0 else 0.0

    @classmethod
    def _merge_detections(cls, paddle_det, easy_det):
        """Merge EasyOCR detections into the Paddle detections.

        For each EasyOCR detection the first Paddle box (in list order) with
        IoU above ``_IOU_MATCH`` decides its fate:

        * no such box: the EasyOCR detection is appended;
        * EasyOCR confidence exceeds Paddle's by more than ``_CONF_MARGIN``:
          the Paddle detection is dropped and the EasyOCR one is appended;
        * otherwise the EasyOCR detection is discarded.

        A Paddle box is replaced by at most one EasyOCR detection (the most
        confident one), so several EasyOCR boxes over the same Paddle box do
        not end up as duplicates in the output.

        Returns:
            Surviving Paddle detections in their original order, followed by
            the accepted EasyOCR detections.
        """
        # Paddle boxes do not change during the merge; compute them once.
        paddle_boxes = [quad_bbox(pq) for pq, _pt, _pc in paddle_det]

        extras = []    # accepted EasyOCR detections, in arrival order
        replaced = {}  # paddle index -> position of its replacement in extras

        for eq, et, ec in easy_det:
            entry = (eq, et, float(ec))
            ebox = quad_bbox(eq)

            hit = None
            for idx, pbox in enumerate(paddle_boxes):
                if cls._bbox_iou(ebox, pbox) > cls._IOU_MATCH:
                    hit = idx
                    break

            if hit is None:
                extras.append(entry)
                continue

            if entry[2] <= float(paddle_det[hit][2]) + cls._CONF_MARGIN:
                continue  # Paddle wins; drop the EasyOCR duplicate

            slot = replaced.get(hit)
            if slot is None:
                replaced[hit] = len(extras)
                extras.append(entry)
            elif entry[2] > extras[slot][2]:
                extras[slot] = entry

        kept = [det for idx, det in enumerate(paddle_det) if idx not in replaced]
        return kept + extras

    def _read_path(self, path):
        """Run both engines on the image file at ``path`` and merge results."""
        paddle_det = self._paddle_to_std(self.paddle.ocr(path, cls=True))
        easy_det = self.easy.readtext(path, paragraph=False)
        return self._merge_detections(paddle_det, easy_det)

    def read_full_image(self, image_path):
        """OCR the image at ``image_path`` with both engines.

        Returns:
            Merged detections as ``[(quad, text, conf), ...]``.
        """
        return self._read_path(image_path)

    def read_array_with_both(self, arr_gray_or_bgr):
        """OCR an in-memory image (used in the robust reread pass).

        The array is written to a uniquely named temporary PNG, which is
        always removed afterwards, and both engines read that file.

        Args:
            arr_gray_or_bgr: Image array accepted by ``cv2.imwrite``.

        Returns:
            Merged detections as ``[(quad, text, conf), ...]``; an empty list
            when the temporary image could not be written.
        """
        import tempfile

        # A unique name avoids clashes between concurrent runs that a fixed
        # file name in the working directory would cause.
        fd, tmp = tempfile.mkstemp(prefix="_tmp_ocr_hybrid_", suffix=".png")
        os.close(fd)
        try:
            if not cv2.imwrite(tmp, arr_gray_or_bgr):
                return []
            return self._read_path(tmp)
        finally:
            try:
                os.remove(tmp)
            except FileNotFoundError:
                pass
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# PREPROCESS + ROBUST REREAD
|
||||||
# ============================================================
|
# ============================================================
|
||||||
def preprocess_variant(crop_bgr, mode):
|
def preprocess_variant(crop_bgr, mode):
|
||||||
gray = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2GRAY)
|
gray = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2GRAY)
|
||||||
@@ -212,8 +385,7 @@ def rotate_image_keep_bounds(img, angle_deg):
|
|||||||
h, w = img.shape[:2]
|
h, w = img.shape[:2]
|
||||||
c = (w / 2, h / 2)
|
c = (w / 2, h / 2)
|
||||||
M = cv2.getRotationMatrix2D(c, angle_deg, 1.0)
|
M = cv2.getRotationMatrix2D(c, angle_deg, 1.0)
|
||||||
cos = abs(M[0, 0])
|
cos = abs(M[0, 0]); sin = abs(M[0, 1])
|
||||||
sin = abs(M[0, 1])
|
|
||||||
|
|
||||||
new_w = int((h * sin) + (w * cos))
|
new_w = int((h * sin) + (w * cos))
|
||||||
new_h = int((h * cos) + (w * sin))
|
new_h = int((h * cos) + (w * sin))
|
||||||
@@ -224,16 +396,6 @@ def rotate_image_keep_bounds(img, angle_deg):
|
|||||||
return cv2.warpAffine(img, M, (new_w, new_h), flags=cv2.INTER_CUBIC, borderValue=255)
|
return cv2.warpAffine(img, M, (new_w, new_h), flags=cv2.INTER_CUBIC, borderValue=255)
|
||||||
|
|
||||||
|
|
||||||
def run_ocr_on_array(reader, arr):
|
|
||||||
tmp = "_tmp_ocr.png"
|
|
||||||
cv2.imwrite(tmp, arr)
|
|
||||||
try:
|
|
||||||
return reader.readtext(tmp, paragraph=False)
|
|
||||||
finally:
|
|
||||||
if os.path.exists(tmp):
|
|
||||||
os.remove(tmp)
|
|
||||||
|
|
||||||
|
|
||||||
def rebuild_text_from_ocr_result(res):
|
def rebuild_text_from_ocr_result(res):
|
||||||
if not res:
|
if not res:
|
||||||
return ""
|
return ""
|
||||||
@@ -257,7 +419,7 @@ def rebuild_text_from_ocr_result(res):
|
|||||||
med_h = float(np.median([x[5] for x in norm]))
|
med_h = float(np.median([x[5] for x in norm]))
|
||||||
row_tol = max(6.0, med_h * 0.75)
|
row_tol = max(6.0, med_h * 0.75)
|
||||||
|
|
||||||
norm.sort(key=lambda z: z[4]) # y-center
|
norm.sort(key=lambda z: z[4]) # y
|
||||||
rows = []
|
rows = []
|
||||||
for it in norm:
|
for it in norm:
|
||||||
placed = False
|
placed = False
|
||||||
@@ -273,7 +435,7 @@ def rebuild_text_from_ocr_result(res):
|
|||||||
rows.sort(key=lambda r: r["yc"])
|
rows.sort(key=lambda r: r["yc"])
|
||||||
lines = []
|
lines = []
|
||||||
for r in rows:
|
for r in rows:
|
||||||
mem = sorted(r["m"], key=lambda z: z[3]) # x-center
|
mem = sorted(r["m"], key=lambda z: z[3]) # x
|
||||||
line = normalize_text(" ".join(x[1] for x in mem))
|
line = normalize_text(" ".join(x[1] for x in mem))
|
||||||
if line:
|
if line:
|
||||||
lines.append(line)
|
lines.append(line)
|
||||||
@@ -281,7 +443,7 @@ def rebuild_text_from_ocr_result(res):
|
|||||||
return normalize_text(" ".join(lines))
|
return normalize_text(" ".join(lines))
|
||||||
|
|
||||||
|
|
||||||
def reread_crop_robust(image, bbox, reader, upscale=3.0, pad=24):
|
def reread_crop_robust(image, bbox, hybrid_ocr: HybridOCR, upscale=3.0, pad=24):
|
||||||
ih, iw = image.shape[:2]
|
ih, iw = image.shape[:2]
|
||||||
x1, y1, x2, y2 = bbox
|
x1, y1, x2, y2 = bbox
|
||||||
x1 = max(0, int(x1 - pad))
|
x1 = max(0, int(x1 - pad))
|
||||||
@@ -313,12 +475,7 @@ def reread_crop_robust(image, bbox, reader, upscale=3.0, pad=24):
|
|||||||
|
|
||||||
for a in angles:
|
for a in angles:
|
||||||
rot = rotate_image_keep_bounds(proc3, a)
|
rot = rotate_image_keep_bounds(proc3, a)
|
||||||
if len(rot.shape) == 3:
|
res = hybrid_ocr.read_array_with_both(rot)
|
||||||
rot_in = cv2.cvtColor(rot, cv2.COLOR_BGR2GRAY)
|
|
||||||
else:
|
|
||||||
rot_in = rot
|
|
||||||
|
|
||||||
res = run_ocr_on_array(reader, rot_in)
|
|
||||||
txt = rebuild_text_from_ocr_result(res)
|
txt = rebuild_text_from_ocr_result(res)
|
||||||
sc = ocr_candidate_score(txt)
|
sc = ocr_candidate_score(txt)
|
||||||
|
|
||||||
@@ -331,7 +488,7 @@ def reread_crop_robust(image, bbox, reader, upscale=3.0, pad=24):
|
|||||||
|
|
||||||
|
|
||||||
# ============================================================
|
# ============================================================
|
||||||
# LINE REBUILD + LINE BOXES (YELLOW)
|
# LINE REBUILD + YELLOW BOXES
|
||||||
# ============================================================
|
# ============================================================
|
||||||
def build_lines_from_indices(indices, ocr):
|
def build_lines_from_indices(indices, ocr):
|
||||||
if not indices:
|
if not indices:
|
||||||
@@ -348,7 +505,7 @@ def build_lines_from_indices(indices, ocr):
|
|||||||
med_h = float(np.median([it[4] for it in items])) if items else 10.0
|
med_h = float(np.median([it[4] for it in items])) if items else 10.0
|
||||||
row_tol = max(6.0, med_h * 0.75)
|
row_tol = max(6.0, med_h * 0.75)
|
||||||
|
|
||||||
items.sort(key=lambda x: x[3]) # y
|
items.sort(key=lambda x: x[3])
|
||||||
rows = []
|
rows = []
|
||||||
for it in items:
|
for it in items:
|
||||||
i, b, xc, yc, h = it
|
i, b, xc, yc, h = it
|
||||||
@@ -365,7 +522,7 @@ def build_lines_from_indices(indices, ocr):
|
|||||||
rows.sort(key=lambda r: r["yc"])
|
rows.sort(key=lambda r: r["yc"])
|
||||||
lines = []
|
lines = []
|
||||||
for r in rows:
|
for r in rows:
|
||||||
mem = sorted(r["m"], key=lambda z: z[2]) # x
|
mem = sorted(r["m"], key=lambda z: z[2])
|
||||||
txt = normalize_text(" ".join(ocr[i][1] for i, _, _, _ in mem))
|
txt = normalize_text(" ".join(ocr[i][1] for i, _, _, _ in mem))
|
||||||
if txt and not is_noise_text(txt):
|
if txt and not is_noise_text(txt):
|
||||||
lines.append(txt)
|
lines.append(txt)
|
||||||
@@ -374,15 +531,6 @@ def build_lines_from_indices(indices, ocr):
|
|||||||
|
|
||||||
|
|
||||||
def build_line_boxes_from_indices(indices, ocr, image_shape=None):
|
def build_line_boxes_from_indices(indices, ocr, image_shape=None):
|
||||||
"""
|
|
||||||
Improved yellow box builder:
|
|
||||||
- row grouping
|
|
||||||
- x-gap chunking
|
|
||||||
- punctuation attachment
|
|
||||||
- token coverage guarantee
|
|
||||||
- larger/asymmetric padding (fix clipped chars)
|
|
||||||
- min-size safety expansion
|
|
||||||
"""
|
|
||||||
if not indices:
|
if not indices:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
@@ -392,6 +540,7 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None):
|
|||||||
txt = normalize_text(ocr[i][1])
|
txt = normalize_text(ocr[i][1])
|
||||||
if is_noise_text(txt):
|
if is_noise_text(txt):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
xc = (b[0] + b[2]) / 2.0
|
xc = (b[0] + b[2]) / 2.0
|
||||||
yc = (b[1] + b[3]) / 2.0
|
yc = (b[1] + b[3]) / 2.0
|
||||||
w = max(1.0, b[2] - b[0])
|
w = max(1.0, b[2] - b[0])
|
||||||
@@ -408,7 +557,7 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None):
|
|||||||
med_h = float(np.median([it["h"] for it in items]))
|
med_h = float(np.median([it["h"] for it in items]))
|
||||||
row_tol = max(6.0, med_h * 0.90)
|
row_tol = max(6.0, med_h * 0.90)
|
||||||
gap_x_tol = max(8.0, med_h * 1.25)
|
gap_x_tol = max(8.0, med_h * 1.25)
|
||||||
pad = max(3, int(round(med_h * 0.22))) # was 0.12
|
pad = max(3, int(round(med_h * 0.22)))
|
||||||
|
|
||||||
def is_punct_like(t):
|
def is_punct_like(t):
|
||||||
raw = (t or "").strip()
|
raw = (t or "").strip()
|
||||||
@@ -417,7 +566,6 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None):
|
|||||||
punct_ratio = sum(1 for c in raw if not c.isalnum()) / max(1, len(raw))
|
punct_ratio = sum(1 for c in raw if not c.isalnum()) / max(1, len(raw))
|
||||||
return punct_ratio >= 0.5 or len(raw) <= 2
|
return punct_ratio >= 0.5 or len(raw) <= 2
|
||||||
|
|
||||||
# 1) group into rows
|
|
||||||
items_sorted = sorted(items, key=lambda x: x["yc"])
|
items_sorted = sorted(items, key=lambda x: x["yc"])
|
||||||
rows = []
|
rows = []
|
||||||
for it in items_sorted:
|
for it in items_sorted:
|
||||||
@@ -436,7 +584,6 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None):
|
|||||||
|
|
||||||
for r in rows:
|
for r in rows:
|
||||||
mem = sorted(r["m"], key=lambda z: z["xc"])
|
mem = sorted(r["m"], key=lambda z: z["xc"])
|
||||||
|
|
||||||
normal = [t for t in mem if not is_punct_like(t["txt"])]
|
normal = [t for t in mem if not is_punct_like(t["txt"])]
|
||||||
punct = [t for t in mem if is_punct_like(t["txt"])]
|
punct = [t for t in mem if is_punct_like(t["txt"])]
|
||||||
|
|
||||||
@@ -444,7 +591,6 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None):
|
|||||||
normal = mem
|
normal = mem
|
||||||
punct = []
|
punct = []
|
||||||
|
|
||||||
# 2) chunk by x-gap
|
|
||||||
chunks = []
|
chunks = []
|
||||||
cur = [normal[0]]
|
cur = [normal[0]]
|
||||||
for t in normal[1:]:
|
for t in normal[1:]:
|
||||||
@@ -458,7 +604,6 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None):
|
|||||||
cur = [t]
|
cur = [t]
|
||||||
chunks.append(cur)
|
chunks.append(cur)
|
||||||
|
|
||||||
# 3) attach punctuation/special tokens with larger near-gap
|
|
||||||
for p in punct:
|
for p in punct:
|
||||||
pb = p["b"]
|
pb = p["b"]
|
||||||
pxc, pyc = p["xc"], p["yc"]
|
pxc, pyc = p["xc"], p["yc"]
|
||||||
@@ -486,7 +631,6 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None):
|
|||||||
else:
|
else:
|
||||||
chunks.append([p])
|
chunks.append([p])
|
||||||
|
|
||||||
# 4) emit chunk boxes with asymmetric padding
|
|
||||||
for ch in chunks:
|
for ch in chunks:
|
||||||
ub = boxes_union_xyxy([x["b"] for x in ch])
|
ub = boxes_union_xyxy([x["b"] for x in ch])
|
||||||
if ub:
|
if ub:
|
||||||
@@ -496,7 +640,6 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None):
|
|||||||
pad_bot = int(round(pad * 0.95))
|
pad_bot = int(round(pad * 0.95))
|
||||||
out_boxes.append((x1 - pad_x, y1 - pad_top, x2 + pad_x, y2 + pad_bot))
|
out_boxes.append((x1 - pad_x, y1 - pad_top, x2 + pad_x, y2 + pad_bot))
|
||||||
|
|
||||||
# 5) guarantee every token is covered
|
|
||||||
token_boxes = [it["b"] for it in items]
|
token_boxes = [it["b"] for it in items]
|
||||||
|
|
||||||
def inside(tb, lb):
|
def inside(tb, lb):
|
||||||
@@ -510,7 +653,6 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None):
|
|||||||
pad_bot = int(round(pad * 0.95))
|
pad_bot = int(round(pad * 0.95))
|
||||||
out_boxes.append((x1 - pad_x, y1 - pad_top, x2 + pad_x, y2 + pad_bot))
|
out_boxes.append((x1 - pad_x, y1 - pad_top, x2 + pad_x, y2 + pad_bot))
|
||||||
|
|
||||||
# 6) merge heavy overlaps
|
|
||||||
merged = []
|
merged = []
|
||||||
for b in out_boxes:
|
for b in out_boxes:
|
||||||
merged_into = False
|
merged_into = False
|
||||||
@@ -528,23 +670,19 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None):
|
|||||||
if not merged_into:
|
if not merged_into:
|
||||||
merged.append(b)
|
merged.append(b)
|
||||||
|
|
||||||
# 7) min-size safety expansion (for tiny lines like "NO.")
|
|
||||||
safe = []
|
safe = []
|
||||||
for (x1, y1, x2, y2) in merged:
|
for (x1, y1, x2, y2) in merged:
|
||||||
w = x2 - x1
|
w = x2 - x1
|
||||||
h = y2 - y1
|
h = y2 - y1
|
||||||
if w < 28:
|
if w < 28:
|
||||||
d = (28 - w) // 2 + 2
|
d = (28 - w) // 2 + 2
|
||||||
x1 -= d
|
x1 -= d; x2 += d
|
||||||
x2 += d
|
|
||||||
if h < 18:
|
if h < 18:
|
||||||
d = (18 - h) // 2 + 2
|
d = (18 - h) // 2 + 2
|
||||||
y1 -= d
|
y1 -= d; y2 += d
|
||||||
y2 += d
|
|
||||||
safe.append((x1, y1, x2, y2))
|
safe.append((x1, y1, x2, y2))
|
||||||
merged = safe
|
merged = safe
|
||||||
|
|
||||||
# clamp bounds
|
|
||||||
if image_shape is not None:
|
if image_shape is not None:
|
||||||
ih, iw = image_shape[:2]
|
ih, iw = image_shape[:2]
|
||||||
clamped = []
|
clamped = []
|
||||||
@@ -564,7 +702,7 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None):
|
|||||||
|
|
||||||
|
|
||||||
# ============================================================
|
# ============================================================
|
||||||
# GROUP TOKENS TO BUBBLES
|
# GROUPING
|
||||||
# ============================================================
|
# ============================================================
|
||||||
def auto_gap(image_path, base=18, ref_w=750):
|
def auto_gap(image_path, base=18, ref_w=750):
|
||||||
img = cv2.imread(image_path)
|
img = cv2.imread(image_path)
|
||||||
@@ -648,19 +786,17 @@ def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=3):
|
|||||||
|
|
||||||
|
|
||||||
# ============================================================
|
# ============================================================
|
||||||
# DEBUG IMAGE
|
# DEBUG
|
||||||
# ============================================================
|
# ============================================================
|
||||||
def save_debug_clusters(image_path, ocr, bubble_boxes, bubble_indices, out_path="debug_clusters.png"):
|
def save_debug_clusters(image_path, ocr, bubble_boxes, bubble_indices, out_path="debug_clusters.png"):
|
||||||
img = cv2.imread(image_path)
|
img = cv2.imread(image_path)
|
||||||
if img is None:
|
if img is None:
|
||||||
return
|
return
|
||||||
|
|
||||||
# OCR token quads (gray)
|
|
||||||
for bbox, txt, conf in ocr:
|
for bbox, txt, conf in ocr:
|
||||||
pts = np.array(bbox, dtype=np.int32)
|
pts = np.array(bbox, dtype=np.int32)
|
||||||
cv2.polylines(img, [pts], True, (180, 180, 180), 1)
|
cv2.polylines(img, [pts], True, (180, 180, 180), 1)
|
||||||
|
|
||||||
# bubble boxes (green) + line boxes (yellow)
|
|
||||||
for bid, bb in bubble_boxes.items():
|
for bid, bb in bubble_boxes.items():
|
||||||
x1, y1, x2, y2 = bb
|
x1, y1, x2, y2 = bb
|
||||||
cv2.rectangle(img, (x1, y1), (x2, y2), (0, 220, 0), 2)
|
cv2.rectangle(img, (x1, y1), (x2, y2), (0, 220, 0), 2)
|
||||||
@@ -688,7 +824,7 @@ def estimate_reading_order(bbox_dict, mode="ltr"):
|
|||||||
cy = (y1 + y2) / 2.0
|
cy = (y1 + y2) / 2.0
|
||||||
items.append((bid, cx, cy))
|
items.append((bid, cx, cy))
|
||||||
|
|
||||||
items.sort(key=lambda t: t[2]) # top -> bottom
|
items.sort(key=lambda t: t[2])
|
||||||
|
|
||||||
rows = []
|
rows = []
|
||||||
tol = 90
|
tol = 90
|
||||||
@@ -714,6 +850,7 @@ def estimate_reading_order(bbox_dict, mode="ltr"):
|
|||||||
|
|
||||||
def export_bubbles(filepath, bbox_dict, quads_dict, indices_dict, ocr, reading_map, image_shape):
|
def export_bubbles(filepath, bbox_dict, quads_dict, indices_dict, ocr, reading_map, image_shape):
|
||||||
out = {}
|
out = {}
|
||||||
|
|
||||||
for bid, bb in bbox_dict.items():
|
for bid, bb in bbox_dict.items():
|
||||||
x1, y1, x2, y2 = bb
|
x1, y1, x2, y2 = bb
|
||||||
quads = quads_dict.get(bid, [])
|
quads = quads_dict.get(bid, [])
|
||||||
@@ -761,7 +898,8 @@ def translate_manga_text(
|
|||||||
export_to_file="output.txt",
|
export_to_file="output.txt",
|
||||||
export_bubbles_to="bubbles.json",
|
export_bubbles_to="bubbles.json",
|
||||||
reading_mode="ltr",
|
reading_mode="ltr",
|
||||||
debug=True
|
debug=True,
|
||||||
|
use_gpu=False
|
||||||
):
|
):
|
||||||
image = cv2.imread(image_path)
|
image = cv2.imread(image_path)
|
||||||
if image is None:
|
if image is None:
|
||||||
@@ -770,20 +908,12 @@ def translate_manga_text(
|
|||||||
|
|
||||||
resolved_gap = auto_gap(image_path) if gap_px == "auto" else float(gap_px)
|
resolved_gap = auto_gap(image_path) if gap_px == "auto" else float(gap_px)
|
||||||
|
|
||||||
print("Loading OCR...")
|
print("Loading Hybrid OCR (Paddle + EasyOCR)...")
|
||||||
# Catalan often OCRs better with es+en in manga pages
|
hybrid = HybridOCR(source_lang=source_lang, use_gpu=use_gpu)
|
||||||
if source_lang == "ca":
|
|
||||||
ocr_lang_list = ["es", "en"]
|
|
||||||
elif source_lang == "en":
|
|
||||||
ocr_lang_list = ["en", "es"]
|
|
||||||
else:
|
|
||||||
ocr_lang_list = [source_lang]
|
|
||||||
|
|
||||||
reader = easyocr.Reader(ocr_lang_list)
|
|
||||||
|
|
||||||
print("Running OCR...")
|
print("Running OCR...")
|
||||||
raw = reader.readtext(image_path, paragraph=False)
|
raw = hybrid.read_full_image(image_path)
|
||||||
print(f"Raw detections: {len(raw)}")
|
print(f"Raw detections (merged): {len(raw)}")
|
||||||
|
|
||||||
filtered = []
|
filtered = []
|
||||||
skipped = 0
|
skipped = 0
|
||||||
@@ -809,7 +939,6 @@ def translate_manga_text(
|
|||||||
skipped += 1
|
skipped += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# reduce top-strip false positives
|
|
||||||
if qb[1] < int(ih * TOP_BAND_RATIO):
|
if qb[1] < int(ih * TOP_BAND_RATIO):
|
||||||
if conf < 0.70 and len(t) >= 5:
|
if conf < 0.70 and len(t) >= 5:
|
||||||
skipped += 1
|
skipped += 1
|
||||||
@@ -846,7 +975,7 @@ def translate_manga_text(
|
|||||||
rr_txt, rr_sc = reread_crop_robust(
|
rr_txt, rr_sc = reread_crop_robust(
|
||||||
image,
|
image,
|
||||||
bubble_boxes[bid],
|
bubble_boxes[bid],
|
||||||
reader,
|
hybrid,
|
||||||
upscale=3.0,
|
upscale=3.0,
|
||||||
pad=24
|
pad=24
|
||||||
)
|
)
|
||||||
@@ -857,7 +986,6 @@ def translate_manga_text(
|
|||||||
else:
|
else:
|
||||||
txt = base_txt
|
txt = base_txt
|
||||||
|
|
||||||
# tiny targeted corrections for common OCR confusions
|
|
||||||
txt = txt.replace(" BOMPORTA", " IMPORTA")
|
txt = txt.replace(" BOMPORTA", " IMPORTA")
|
||||||
txt = txt.replace(" TESTO ", " ESTO ")
|
txt = txt.replace(" TESTO ", " ESTO ")
|
||||||
txt = txt.replace(" MIVERDAD", " MI VERDAD")
|
txt = txt.replace(" MIVERDAD", " MI VERDAD")
|
||||||
@@ -927,8 +1055,8 @@ def translate_manga_text(
|
|||||||
# ============================================================
|
# ============================================================
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
translate_manga_text(
|
translate_manga_text(
|
||||||
image_path="004-page.png",
|
image_path="001-page.png",
|
||||||
source_lang="es",
|
source_lang="it",
|
||||||
target_lang="ca",
|
target_lang="ca",
|
||||||
confidence_threshold=0.12,
|
confidence_threshold=0.12,
|
||||||
min_text_length=1,
|
min_text_length=1,
|
||||||
@@ -938,5 +1066,6 @@ if __name__ == "__main__":
|
|||||||
export_to_file="output.txt",
|
export_to_file="output.txt",
|
||||||
export_bubbles_to="bubbles.json",
|
export_bubbles_to="bubbles.json",
|
||||||
reading_mode="ltr",
|
reading_mode="ltr",
|
||||||
debug=True
|
debug=True,
|
||||||
|
use_gpu=False
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user