Added some changes2
This commit is contained in:
@@ -194,14 +194,104 @@ def ocr_candidate_score(text: str) -> float:
|
|||||||
|
|
||||||
|
|
||||||
# ============================================================
|
# ============================================================
|
||||||
# SPLITTERS
|
# SPLITTERS + QUAD NORMALIZATION
|
||||||
# ============================================================
|
# ============================================================
|
||||||
|
def estimate_char_capacity_width(text_len, med_h, k=0.72):
    """Estimate the pixel width needed to hold ``text_len`` characters.

    Each character is assumed to be ``k * med_h`` wide, so the raw estimate
    is ``text_len * med_h * k``. The result is never smaller than 18.0.
    """
    min_width = 18.0
    estimate = text_len * med_h * k
    return estimate if estimate > min_width else min_width
|
||||||
|
|
||||||
|
|
||||||
|
def shrink_ocr_quad_to_text(quad, text, med_h):
    """Narrow an OCR quad that is much wider than its text should need.

    The allowed width is the larger of 1.35x the estimated text width
    (``estimate_char_capacity_width`` with ``k=0.62`` over the non-space
    character count) and 1.15x the quad's bounding-box height.

    Returns:
        ``quad`` itself when its bounding box is within the allowed width;
        otherwise a new axis-aligned 4-point list with the allowed width,
        centered on the bounding box's horizontal midpoint and keeping the
        original top and bottom.
    """
    left, top, right, bottom = quad_bbox(quad)
    box_w = max(1, right - left)
    box_h = max(1, bottom - top)

    # Count only non-space characters: trim the ends, then drop inner spaces.
    glyphs = (text or "").strip().replace(" ", "")
    glyph_count = max(1, len(glyphs))

    expected_w = estimate_char_capacity_width(glyph_count, med_h, k=0.62)
    width_cap = max(expected_w * 1.35, box_h * 1.15)

    if box_w <= width_cap:
        return quad

    # Re-center a box of the capped width on the original midpoint.
    mid_x = (left + right) / 2.0
    half_w = int(round(width_cap)) / 2
    new_left = int(round(mid_x - half_w))
    new_right = int(round(mid_x + half_w))

    return [
        [new_left, top],
        [new_right, top],
        [new_right, bottom],
        [new_left, bottom],
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_ocr_quads(filtered_ocr):
    """Shrink every OCR quad to a width that is plausible for its text.

    Args:
        filtered_ocr: sequence of ``(quad, text, conf)`` items.

    Returns:
        The input unchanged when it is empty; otherwise a new list of
        ``(quad, text, conf)`` tuples where each quad has been passed through
        ``shrink_ocr_quad_to_text`` using the median bounding-box height of
        all quads as the reference line height.
    """
    if not filtered_ocr:
        return filtered_ocr

    # Compute each bounding box once (the height needs both top and bottom).
    heights = []
    for quad, _, _ in filtered_ocr:
        _, top, _, bottom = quad_bbox(quad)
        heights.append(max(1, bottom - top))
    med_h = float(np.median(heights)) if heights else 14.0

    return [
        (shrink_ocr_quad_to_text(quad, text, med_h), text, conf)
        for quad, text, conf in filtered_ocr
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def _split_bridge_quad_at_valley(image_bgr, quad, text, conf, med_h):
    """Return the two halves of an over-wide quad, or None to keep it whole."""
    label = text or ""
    x1, y1, x2, y2 = quad_bbox(quad)
    w = max(1, x2 - x1)

    # Only very wide, multi-word, reasonably long items are candidates.
    if not (w > med_h * 11.0 and " " in label and len(label) >= 14):
        return None

    img_h, img_w = image_bgr.shape[:2]
    roi_x1 = max(0, x1)
    roi = image_bgr[max(0, y1):min(img_h, y2), roi_x1:min(img_w, x2)]
    if roi.size == 0:
        return None

    gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
    # THRESH_BINARY_INV: pixels at or below the Otsu threshold become 255,
    # so a low column sum means few dark pixels in that column.
    _, inv = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    proj = np.sum(inv, axis=0)

    # Search the central 18%..82% of the quad. `proj` is indexed from the
    # ROI's left edge, which differs from x1 when the quad is clipped by the
    # image border, so translate the window into ROI coordinates and clamp it.
    shift = roi_x1 - x1
    start = max(0, int(w * 0.18) - shift)
    stop = min(int(proj.shape[0]), int(w * 0.82) - shift)
    if stop <= start:
        return None

    segment = proj[start:stop]
    valley_rel = int(np.argmin(segment))
    low = float(segment[valley_rel])
    meanv = float(np.mean(segment))
    # Require the valley to be clearly emptier than the window average.
    if not low < meanv * 0.52:
        return None

    split_x = roi_x1 + start + valley_rel

    # Map the pixel split to a character position, then snap to the nearest
    # space (at least one exists: checked above) so no word is cut in half.
    char_w = w / max(1, len(label))
    approx_idx = int((split_x - x1) / max(1e-6, char_w))
    spaces = [i for i, c in enumerate(label) if c == " "]
    split_idx = min(spaces, key=lambda i: abs(i - approx_idx))

    left_t = label[:split_idx].strip()
    right_t = label[split_idx:].strip()
    if not (left_t and right_t):
        return None

    ql = [[x1, y1], [split_x, y1], [split_x, y2], [x1, y2]]
    qr = [[split_x, y1], [x2, y1], [x2, y2], [split_x, y2]]
    return [(ql, left_t, conf), (qr, right_t, conf)]


def split_abnormal_bridge_quads(image_bgr, filtered_ocr):
    """Split unusually wide OCR quads at a low-density column near the middle.

    An item is a candidate when its bounding box is wider than 11x the median
    quad height, its text contains a space, and the text has at least 14
    characters. The candidate's image region is Otsu-thresholded (inverted)
    and summed per column; if the minimum column in the central part of the
    quad is below 52% of that window's mean, the quad and its text are split
    there (the text at the space nearest to the proportional position).

    Args:
        image_bgr: image array indexed as ``[row, col, channel]``; it is
            passed to ``cv2.cvtColor`` with ``COLOR_BGR2GRAY``.
        filtered_ocr: sequence of ``(quad, text, conf)`` items.

    Returns:
        ``(items, splits)``: the resulting item list (split items are
        replaced by two axis-aligned quads that share ``conf``) and the
        number of items that were split. Empty input is returned as-is with
        a count of 0.
    """
    if not filtered_ocr:
        return filtered_ocr, 0

    heights = []
    for quad, _, _ in filtered_ocr:
        _, top, _, bottom = quad_bbox(quad)
        heights.append(max(1, bottom - top))
    med_h = float(np.median(heights)) if heights else 14.0

    out = []
    splits = 0
    for quad, text, conf in filtered_ocr:
        halves = _split_bridge_quad_at_valley(image_bgr, quad, text, conf, med_h)
        if halves is None:
            out.append((quad, text, conf))
        else:
            out.extend(halves)
            splits += 1

    return out, splits
|
||||||
|
|
||||||
|
|
||||||
def split_wide_ocr_items(image_bgr, filtered_ocr):
|
def split_wide_ocr_items(image_bgr, filtered_ocr):
|
||||||
"""
|
|
||||||
Detects if Apple Vision incorrectly merged two columns into a single wide line.
|
|
||||||
It measures the width of the white gaps and only splits if the gap is
|
|
||||||
significantly wider than a normal space between words.
|
|
||||||
"""
|
|
||||||
new_filtered = []
|
new_filtered = []
|
||||||
splits_made = 0
|
splits_made = 0
|
||||||
|
|
||||||
@@ -211,7 +301,6 @@ def split_wide_ocr_items(image_bgr, filtered_ocr):
|
|||||||
w = x2 - x1
|
w = x2 - x1
|
||||||
h = max(1, y2 - y1)
|
h = max(1, y2 - y1)
|
||||||
|
|
||||||
# Check if it's abnormally wide
|
|
||||||
if w > h * 2.5 and len(text) > 5 and ' ' in text:
|
if w > h * 2.5 and len(text) > 5 and ' ' in text:
|
||||||
pad = 2
|
pad = 2
|
||||||
roi_y1 = max(0, y1 - pad)
|
roi_y1 = max(0, y1 - pad)
|
||||||
@@ -229,15 +318,12 @@ def split_wide_ocr_items(image_bgr, filtered_ocr):
|
|||||||
end_x = int(w * 0.80)
|
end_x = int(w * 0.80)
|
||||||
|
|
||||||
if start_x < end_x:
|
if start_x < end_x:
|
||||||
# Calculate expected character width
|
|
||||||
char_w = w / max(1, len(text))
|
char_w = w / max(1, len(text))
|
||||||
# A real column gap should be at least 2.5 chars wide or 75% of line height
|
|
||||||
min_gap_width = max(int(char_w * 2.5), int(h * 0.75))
|
min_gap_width = max(int(char_w * 2.5), int(h * 0.75))
|
||||||
|
|
||||||
gap_threshold = h * 255 * 0.15
|
gap_threshold = h * 255 * 0.15
|
||||||
gap_mask = proj < gap_threshold
|
gap_mask = proj < gap_threshold
|
||||||
|
|
||||||
# Find the widest continuous gap
|
|
||||||
best_gap_start = -1
|
best_gap_start = -1
|
||||||
best_gap_len = 0
|
best_gap_len = 0
|
||||||
current_gap_start = -1
|
current_gap_start = -1
|
||||||
@@ -258,12 +344,10 @@ def split_wide_ocr_items(image_bgr, filtered_ocr):
|
|||||||
best_gap_len = current_gap_len
|
best_gap_len = current_gap_len
|
||||||
best_gap_start = current_gap_start
|
best_gap_start = current_gap_start
|
||||||
|
|
||||||
# ONLY split if the gap is wide enough to be a gutter between bubbles
|
|
||||||
if best_gap_len >= min_gap_width:
|
if best_gap_len >= min_gap_width:
|
||||||
split_x = roi_x1 + best_gap_start + (best_gap_len // 2)
|
split_x = roi_x1 + best_gap_start + (best_gap_len // 2)
|
||||||
|
|
||||||
split_idx = int((split_x - x1) / char_w)
|
split_idx = int((split_x - x1) / max(1e-6, char_w))
|
||||||
|
|
||||||
spaces = [i for i, c in enumerate(text) if c == ' ']
|
spaces = [i for i, c in enumerate(text) if c == ' ']
|
||||||
if spaces:
|
if spaces:
|
||||||
best_space = min(spaces, key=lambda i: abs(i - split_idx))
|
best_space = min(spaces, key=lambda i: abs(i - split_idx))
|
||||||
@@ -281,7 +365,6 @@ def split_wide_ocr_items(image_bgr, filtered_ocr):
|
|||||||
splits_made += 1
|
splits_made += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# If no split was made, keep the original item
|
|
||||||
new_filtered.append(item)
|
new_filtered.append(item)
|
||||||
|
|
||||||
return new_filtered, splits_made
|
return new_filtered, splits_made
|
||||||
@@ -428,6 +511,178 @@ def split_bubble_if_multiple_rows(indices, ocr, bid=None):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def is_vertical_text_like(indices, ocr):
    """Tell whether the OCR items at ``indices`` form a tall, narrow stack.

    True only when all three hold for the items' bounding boxes:
      * the union box is more than 1.35x taller than it is wide,
      * the standard deviation of the x-centers is below
        ``max(10.0, 0.9 * median_height)``,
      * the median distance between consecutive y-centers exceeds
        ``max(6.0, 0.35 * median_height)``.

    Returns False for fewer than two indices or when no union box exists.
    """
    if len(indices) < 2:
        return False

    rects = [quad_bbox(ocr[idx][0]) for idx in indices]
    hull = boxes_union_xyxy(rects)
    if hull is None:
        return False

    left, top, right, bottom = hull
    tall_ratio = max(1, bottom - top) / max(1, right - left)

    centers_x = [(r[0] + r[2]) / 2.0 for r in rects]
    centers_y = sorted((r[1] + r[3]) / 2.0 for r in rects)
    heights = [max(1, r[3] - r[1]) for r in rects]

    x_spread = float(np.std(centers_x)) if len(centers_x) > 1 else 0.0
    med_h = float(np.median(heights))

    # Distances between vertically adjacent centers.
    steps = [nxt - cur for cur, nxt in zip(centers_y, centers_y[1:])] or [0]
    med_gap = float(np.median(steps)) if steps else 0.0

    is_tall = tall_ratio > 1.35
    is_aligned = x_spread < max(10.0, med_h * 0.9)
    is_spaced = med_gap > max(6.0, med_h * 0.35)
    return is_tall and is_aligned and is_spaced
|
||||||
|
|
||||||
|
|
||||||
|
def split_cluster_by_big_vertical_gap(indices, ocr, factor=1.9, min_gap=22):
    """Split a cluster in two at its largest vertical gap, if it is big enough.

    Items are ordered by the y-center of their bounding box. The gap between
    neighbours is the next item's top minus the current item's bottom. The
    cluster is split at the first largest gap when that gap exceeds
    ``max(min_gap, median_height * factor)``.

    Returns:
        ``(upper_indices, lower_indices)`` in y-center order, or ``None``
        when there are fewer than two indices or no gap qualifies.
    """
    if len(indices) < 2:
        return None

    rows = []
    for idx in indices:
        box = quad_bbox(ocr[idx][0])
        center_y = (box[1] + box[3]) / 2.0
        height = max(1.0, box[3] - box[1])
        rows.append((idx, box, center_y, height))
    rows.sort(key=lambda row: row[2])

    med_h = float(np.median([row[3] for row in rows])) if rows else 12.0

    # Track the first occurrence of the widest gap; gaps of -1 or less
    # (heavily overlapping neighbours) never qualify.
    widest, cut = -1, -1
    for pos, (upper, lower) in enumerate(zip(rows, rows[1:])):
        space = lower[1][1] - upper[1][3]
        if space > widest:
            widest, cut = space, pos

    if cut < 0:
        return None

    if widest > max(min_gap, med_h * factor):
        above = [row[0] for row in rows[:cut + 1]]
        below = [row[0] for row in rows[cut + 1:]]
        if above and below:
            return above, below

    return None
|
||||||
|
|
||||||
|
|
||||||
|
def split_nested_or_side_by_side(indices, ocr):
    """Partition a cluster into a left and a right group by x-center.

    Runs up to 12 rounds of a two-center assignment on the bounding-box
    x-centers, seeded with the minimum and maximum center. Ties go to the
    first center. The split is rejected when the seeds are less than 8 apart,
    when either group ends up empty, or when the two groups' union boxes
    overlap horizontally by more than 8.

    Returns:
        ``(left_indices, right_indices)`` or ``None``.
    """
    if len(indices) < 2:
        return None

    rects = [quad_bbox(ocr[i][0]) for i in indices]
    centers = np.array([(r[0] + r[2]) / 2.0 for r in rects], dtype=np.float32)

    lo_c = float(np.min(centers))
    hi_c = float(np.max(centers))
    if abs(hi_c - lo_c) < 8:
        return None

    for _ in range(12):
        near_lo, near_hi = [], []
        for pos, cx in enumerate(centers):
            if abs(cx - lo_c) <= abs(cx - hi_c):
                near_lo.append(pos)
            else:
                near_hi.append(pos)

        if not near_lo or not near_hi:
            return None

        next_lo = float(np.mean([centers[p] for p in near_lo]))
        next_hi = float(np.mean([centers[p] for p in near_hi]))
        # Stop once both centers have moved by less than half a unit.
        if abs(next_lo - lo_c) < 0.5 and abs(next_hi - hi_c) < 0.5:
            break
        lo_c, hi_c = next_lo, next_hi

    if lo_c < hi_c:
        left_pos, right_pos = near_lo, near_hi
    else:
        left_pos, right_pos = near_hi, near_lo

    left_idxs = [indices[p] for p in left_pos]
    right_idxs = [indices[p] for p in right_pos]
    if not left_idxs or not right_idxs:
        return None

    left_box = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in left_idxs])
    right_box = boxes_union_xyxy([quad_bbox(ocr[i][0]) for i in right_idxs])

    # Negative separation means the groups' boxes overlap horizontally.
    separation = right_box[0] - left_box[2]
    if separation < -8:
        return None

    return left_idxs, right_idxs
|
||||||
|
|
||||||
|
|
||||||
|
def merge_close_bubbles_by_line_height(bubbles, bubble_boxes, bubble_quads, bubble_indices, ocr):
    """Greedily merge neighbouring bubbles, scaled by the median OCR height.

    Bubble ids are visited in sorted order. Each unmerged id seeds a group
    and absorbs every later unmerged id whose box, compared against the
    group's running union box, is simultaneously:
      * near: center distance below 10x (horizontal) and 3.6x (vertical)
        the median OCR quad height,
      * touching: ``overlap_or_near`` with a gap of ``int(1.25 * median)``,
      * compact: the union area is below 1.65x the sum of both areas.

    ``bubbles`` is only used for its keys. Output ids are renumbered from 1.
    Each output box is the union of the group's OCR quad bounding boxes
    (groups without such a union are dropped), and each output line list is
    rebuilt with ``build_lines_from_indices``.

    Returns:
        ``(lines, boxes, quads, indices)`` dictionaries keyed by the new ids.
    """
    order = sorted(bubbles.keys())
    consumed = set()
    merged_lines, merged_boxes, merged_quads, merged_indices = {}, {}, {}, {}
    next_id = 1

    heights = []
    for entry in ocr:
        rect = quad_bbox(entry[0])
        heights.append(max(1, rect[3] - rect[1]))
    med_h = float(np.median(heights)) if heights else 14.0

    for pos, seed in enumerate(order):
        if seed in consumed:
            continue
        consumed.add(seed)
        members = [seed]

        # Running union box of the group; grows with every absorbed bubble.
        sx1, sy1, sx2, sy2 = bubble_boxes[seed]

        for other in order[pos + 1:]:
            if other in consumed:
                continue
            ox1, oy1, ox2, oy2 = bubble_boxes[other]

            dx = abs((sx1 + sx2) / 2.0 - (ox1 + ox2) / 2.0)
            dy = abs((sy1 + sy2) / 2.0 - (oy1 + oy2) / 2.0)
            near = dx < med_h * 10.0 and dy < med_h * 3.6

            touching = overlap_or_near(
                (sx1, sy1, sx2, sy2), (ox1, oy1, ox2, oy2), gap=int(med_h * 1.25)
            )

            hull = boxes_union_xyxy([(sx1, sy1, sx2, sy2), (ox1, oy1, ox2, oy2)])
            seed_area = max(1, (sx2 - sx1) * (sy2 - sy1))
            other_area = max(1, (ox2 - ox1) * (oy2 - oy1))
            hull_area = max(1, (hull[2] - hull[0]) * (hull[3] - hull[1]))
            compact_union = hull_area < (seed_area + other_area) * 1.65

            if not (near and touching and compact_union):
                continue

            members.append(other)
            consumed.add(other)
            sx1, sy1 = min(sx1, ox1), min(sy1, oy1)
            sx2, sy2 = max(sx2, ox2), max(sy2, oy2)

        idxs = sorted({k for member in members for k in bubble_indices[member]})
        quads = [q for member in members for q in bubble_quads[member]]

        ub = boxes_union_xyxy([quad_bbox(ocr[k][0]) for k in idxs])
        if ub is None:
            continue

        merged_lines[next_id] = build_lines_from_indices(idxs, ocr)
        merged_boxes[next_id] = ub
        merged_quads[next_id] = quads
        merged_indices[next_id] = idxs
        next_id += 1

    return merged_lines, merged_boxes, merged_quads, merged_indices
|
||||||
|
|
||||||
|
|
||||||
# ============================================================
|
# ============================================================
|
||||||
# OCR ENGINES (Apple Native Vision)
|
# OCR ENGINES (Apple Native Vision)
|
||||||
# ============================================================
|
# ============================================================
|
||||||
@@ -503,7 +758,6 @@ class MacVisionDetector:
|
|||||||
request.setRecognitionLanguages_(self.langs)
|
request.setRecognitionLanguages_(self.langs)
|
||||||
|
|
||||||
handler.performRequests_error_([request], None)
|
handler.performRequests_error_([request], None)
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
@@ -596,13 +850,7 @@ def rebuild_text_from_vision_result(res):
|
|||||||
return normalize_text(" ".join(lines))
|
return normalize_text(" ".join(lines))
|
||||||
|
|
||||||
|
|
||||||
def reread_bubble_with_vision(
|
def reread_bubble_with_vision(image_bgr, bbox_xyxy, vision_detector: MacVisionDetector, upscale=3.0, pad=24):
|
||||||
image_bgr,
|
|
||||||
bbox_xyxy,
|
|
||||||
vision_detector: MacVisionDetector,
|
|
||||||
upscale=3.0,
|
|
||||||
pad=24
|
|
||||||
):
|
|
||||||
ih, iw = image_bgr.shape[:2]
|
ih, iw = image_bgr.shape[:2]
|
||||||
x1, y1, x2, y2 = bbox_xyxy
|
x1, y1, x2, y2 = bbox_xyxy
|
||||||
x1 = max(0, int(x1 - pad)); y1 = max(0, int(y1 - pad))
|
x1 = max(0, int(x1 - pad)); y1 = max(0, int(y1 - pad))
|
||||||
@@ -697,7 +945,7 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None):
|
|||||||
med_h = float(np.median([it["h"] for it in items]))
|
med_h = float(np.median([it["h"] for it in items]))
|
||||||
row_tol = max(6.0, med_h * 0.90)
|
row_tol = max(6.0, med_h * 0.90)
|
||||||
gap_x_tol = max(8.0, med_h * 1.25)
|
gap_x_tol = max(8.0, med_h * 1.25)
|
||||||
pad = max(3, int(round(med_h * 0.22)))
|
pad = max(2, int(round(med_h * 0.14)))
|
||||||
|
|
||||||
rows = []
|
rows = []
|
||||||
for it in sorted(items, key=lambda x: x["yc"]):
|
for it in sorted(items, key=lambda x: x["yc"]):
|
||||||
@@ -736,7 +984,7 @@ def build_line_boxes_from_indices(indices, ocr, image_shape=None):
|
|||||||
ub = boxes_union_xyxy([x["b"] for x in ch])
|
ub = boxes_union_xyxy([x["b"] for x in ch])
|
||||||
if ub:
|
if ub:
|
||||||
x1, y1, x2, y2 = ub
|
x1, y1, x2, y2 = ub
|
||||||
out_boxes.append((x1 - pad, y1 - int(round(pad*1.35)), x2 + pad, y2 + int(round(pad*0.95))))
|
out_boxes.append((x1 - pad, y1 - int(round(pad * 1.2)), x2 + pad, y2 + int(round(pad * 0.9))))
|
||||||
|
|
||||||
if image_shape is not None:
|
if image_shape is not None:
|
||||||
ih, iw = image_shape[:2]
|
ih, iw = image_shape[:2]
|
||||||
@@ -759,7 +1007,7 @@ def auto_gap(image_path, base=18, ref_w=750):
|
|||||||
return base * (img.shape[1] / ref_w)
|
return base * (img.shape[1] / ref_w)
|
||||||
|
|
||||||
|
|
||||||
def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=3):
|
def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=1):
|
||||||
n = len(ocr)
|
n = len(ocr)
|
||||||
if n == 0:
|
if n == 0:
|
||||||
return {}, {}, {}, {}
|
return {}, {}, {}, {}
|
||||||
@@ -769,7 +1017,6 @@ def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=3):
|
|||||||
hs = [max(1.0, b[3] - b[1]) for b in boxes]
|
hs = [max(1.0, b[3] - b[1]) for b in boxes]
|
||||||
med_h = float(np.median(hs)) if hs else 12.0
|
med_h = float(np.median(hs)) if hs else 12.0
|
||||||
dist_thresh = max(20.0, med_h * 1.8)
|
dist_thresh = max(20.0, med_h * 1.8)
|
||||||
|
|
||||||
adaptive_gap_y = max(gap_px, med_h * 2.5)
|
adaptive_gap_y = max(gap_px, med_h * 2.5)
|
||||||
|
|
||||||
p = list(range(n))
|
p = list(range(n))
|
||||||
@@ -795,16 +1042,13 @@ def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=3):
|
|||||||
is_vertically_aligned = abs(cx1 - cx2) < (med_h * 1.5)
|
is_vertically_aligned = abs(cx1 - cx2) < (med_h * 1.5)
|
||||||
|
|
||||||
if gap_x == 0 and gap_y <= (med_h * 3.5):
|
if gap_x == 0 and gap_y <= (med_h * 3.5):
|
||||||
unite(i, j)
|
unite(i, j); continue
|
||||||
continue
|
|
||||||
|
|
||||||
if is_vertically_aligned and gap_y <= (med_h * 3.5):
|
if is_vertically_aligned and gap_y <= (med_h * 3.2):
|
||||||
unite(i, j)
|
unite(i, j); continue
|
||||||
continue
|
|
||||||
|
|
||||||
if gap_x <= gap_px and gap_y <= adaptive_gap_y:
|
if gap_x <= gap_px and gap_y <= adaptive_gap_y:
|
||||||
unite(i, j)
|
unite(i, j); continue
|
||||||
continue
|
|
||||||
|
|
||||||
d = ((cx1 - cx2) ** 2 + (cy1 - cy2) ** 2) ** 0.5
|
d = ((cx1 - cx2) ** 2 + (cy1 - cy2) ** 2) ** 0.5
|
||||||
if d <= dist_thresh and abs(cy1 - cy2) <= med_h * 1.5:
|
if d <= dist_thresh and abs(cy1 - cy2) <= med_h * 1.5:
|
||||||
@@ -831,8 +1075,7 @@ def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=3):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
x1, y1, x2, y2 = ub
|
x1, y1, x2, y2 = ub
|
||||||
|
adaptive_pad = max(1, int(round(med_h * 0.16)))
|
||||||
adaptive_pad = max(bbox_padding, int(round(med_h * 0.35)))
|
|
||||||
x1 = max(0, x1 - adaptive_pad); y1 = max(0, y1 - adaptive_pad)
|
x1 = max(0, x1 - adaptive_pad); y1 = max(0, y1 - adaptive_pad)
|
||||||
x2 = min(iw - 1, x2 + adaptive_pad); y2 = min(ih - 1, y2 + adaptive_pad)
|
x2 = min(iw - 1, x2 + adaptive_pad); y2 = min(ih - 1, y2 + adaptive_pad)
|
||||||
|
|
||||||
@@ -847,14 +1090,7 @@ def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=3):
|
|||||||
# ============================================================
|
# ============================================================
|
||||||
# DEBUG / EXPORT
|
# DEBUG / EXPORT
|
||||||
# ============================================================
|
# ============================================================
|
||||||
def save_debug_clusters(
|
def save_debug_clusters(image_path, ocr, bubble_boxes, bubble_indices, clean_lines=None, out_path="debug_clusters.png"):
|
||||||
image_path,
|
|
||||||
ocr,
|
|
||||||
bubble_boxes,
|
|
||||||
bubble_indices,
|
|
||||||
clean_lines=None,
|
|
||||||
out_path="debug_clusters.png"
|
|
||||||
):
|
|
||||||
img = cv2.imread(image_path)
|
img = cv2.imread(image_path)
|
||||||
if img is None:
|
if img is None:
|
||||||
return
|
return
|
||||||
@@ -874,23 +1110,20 @@ def save_debug_clusters(
|
|||||||
text = clean_lines[bid]
|
text = clean_lines[bid]
|
||||||
words = text.split()
|
words = text.split()
|
||||||
lines = []
|
lines = []
|
||||||
current_line = ""
|
cur = ""
|
||||||
|
for w in words:
|
||||||
for word in words:
|
if len(cur) + len(w) < 25:
|
||||||
if len(current_line) + len(word) < 25:
|
cur += w + " "
|
||||||
current_line += word + " "
|
|
||||||
else:
|
else:
|
||||||
lines.append(current_line.strip())
|
lines.append(cur.strip())
|
||||||
current_line = word + " "
|
cur = w + " "
|
||||||
if current_line:
|
if cur:
|
||||||
lines.append(current_line.strip())
|
lines.append(cur.strip())
|
||||||
|
|
||||||
y_text = y2 + 18
|
y_text = y2 + 18
|
||||||
for line in lines:
|
for line in lines:
|
||||||
cv2.putText(img, line, (x1, y_text),
|
cv2.putText(img, line, (x1, y_text), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 3)
|
||||||
cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 3)
|
cv2.putText(img, line, (x1, y_text), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1)
|
||||||
cv2.putText(img, line, (x1, y_text),
|
|
||||||
cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1)
|
|
||||||
y_text += 18
|
y_text += 18
|
||||||
|
|
||||||
cv2.imwrite(out_path, img)
|
cv2.imwrite(out_path, img)
|
||||||
@@ -1018,12 +1251,26 @@ def translate_manga_text(
|
|||||||
print("⚠️ No text after filtering.")
|
print("⚠️ No text after filtering.")
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# 1) split obvious wide OCR merges
|
||||||
filtered, splits_made = split_wide_ocr_items(image, filtered)
|
filtered, splits_made = split_wide_ocr_items(image, filtered)
|
||||||
if splits_made > 0:
|
if splits_made > 0:
|
||||||
print(f"✂️ Split {splits_made} wide OCR lines across column gaps.")
|
print(f"✂️ Split {splits_made} wide OCR lines across column gaps.")
|
||||||
|
|
||||||
|
# 2) split giant bridge quads (fixes page16 BOX19-like glue)
|
||||||
|
filtered, bridge_splits = split_abnormal_bridge_quads(image, filtered)
|
||||||
|
if bridge_splits > 0:
|
||||||
|
print(f"🧩 Split {bridge_splits} abnormal bridge OCR quad(s).")
|
||||||
|
|
||||||
|
# 3) shrink quads to tighter text footprint
|
||||||
|
filtered = normalize_ocr_quads(filtered)
|
||||||
|
|
||||||
bubbles, bubble_boxes, bubble_quads, bubble_indices = group_tokens(
|
bubbles, bubble_boxes, bubble_quads, bubble_indices = group_tokens(
|
||||||
filtered, image.shape, gap_px=resolved_gap, bbox_padding=3
|
filtered, image.shape, gap_px=resolved_gap, bbox_padding=1
|
||||||
|
)
|
||||||
|
|
||||||
|
# merge accidental sibling fragments (fixes page15 BOX11+BOX16 style)
|
||||||
|
bubbles, bubble_boxes, bubble_quads, bubble_indices = merge_close_bubbles_by_line_height(
|
||||||
|
bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered
|
||||||
)
|
)
|
||||||
|
|
||||||
new_bubbles, new_bubble_boxes, new_bubble_quads, new_bubble_indices = {}, {}, {}, {}
|
new_bubbles, new_bubble_boxes, new_bubble_quads, new_bubble_indices = {}, {}, {}, {}
|
||||||
@@ -1034,11 +1281,17 @@ def translate_manga_text(
|
|||||||
box = bubble_boxes[bid]
|
box = bubble_boxes[bid]
|
||||||
bubble_split = None
|
bubble_split = None
|
||||||
|
|
||||||
|
if is_vertical_text_like(bubble_indices[bid], filtered):
|
||||||
|
vgap_split = split_cluster_by_big_vertical_gap(bubble_indices[bid], filtered, factor=1.7, min_gap=18)
|
||||||
|
if vgap_split:
|
||||||
|
bubble_split = vgap_split
|
||||||
|
splits_performed.append(f"BOX#{bid} (vertical-stack y-gap split)")
|
||||||
|
|
||||||
|
if bubble_split is None:
|
||||||
split_result = split_panel_box(image, box, bubble_quads=bubble_quads[bid])
|
split_result = split_panel_box(image, box, bubble_quads=bubble_quads[bid])
|
||||||
if split_result:
|
if split_result:
|
||||||
box_left, box_right, split_x = split_result
|
_, _, split_x = split_result
|
||||||
left_idxs, right_idxs = [], []
|
left_idxs, right_idxs = [], []
|
||||||
|
|
||||||
for idx in bubble_indices[bid]:
|
for idx in bubble_indices[bid]:
|
||||||
cx, cy = quad_center(filtered[idx][0])
|
cx, cy = quad_center(filtered[idx][0])
|
||||||
if cx < split_x:
|
if cx < split_x:
|
||||||
@@ -1052,38 +1305,54 @@ def translate_manga_text(
|
|||||||
elif len(bubble_quads[bid]) >= 4:
|
elif len(bubble_quads[bid]) >= 4:
|
||||||
col_split = split_bubble_if_multiple_columns(bubble_indices[bid], filtered, bid=bid, use_aggressive_thresholds=True)
|
col_split = split_bubble_if_multiple_columns(bubble_indices[bid], filtered, bid=bid, use_aggressive_thresholds=True)
|
||||||
if col_split:
|
if col_split:
|
||||||
left_idxs, right_idxs = col_split
|
l, r = col_split
|
||||||
if left_idxs and right_idxs:
|
if l and r:
|
||||||
bubble_split = (left_idxs, right_idxs)
|
bubble_split = (l, r)
|
||||||
splits_performed.append(f"BOX#{bid} ({len(left_idxs)} quads | {len(right_idxs)} quads)")
|
splits_performed.append(f"BOX#{bid} ({len(l)} quads | {len(r)} quads)")
|
||||||
|
|
||||||
if bubble_split is None:
|
if bubble_split is None:
|
||||||
col_split = split_bubble_if_multiple_columns(bubble_indices[bid], filtered, bid=bid)
|
col_split = split_bubble_if_multiple_columns(bubble_indices[bid], filtered, bid=bid)
|
||||||
if col_split:
|
if col_split:
|
||||||
left_idxs, right_idxs = col_split
|
l, r = col_split
|
||||||
if left_idxs and right_idxs:
|
if l and r:
|
||||||
bubble_split = (left_idxs, right_idxs)
|
bubble_split = (l, r)
|
||||||
splits_performed.append(f"BOX#{bid} (Vertical Column Split: {len(left_idxs)} | {len(right_idxs)} quads)")
|
splits_performed.append(f"BOX#{bid} (Vertical Column Split: {len(l)} | {len(r)} quads)")
|
||||||
|
|
||||||
|
if bubble_split is None:
|
||||||
|
nested_split = split_nested_or_side_by_side(bubble_indices[bid], filtered)
|
||||||
|
if nested_split:
|
||||||
|
l, r = nested_split
|
||||||
|
if l and r:
|
||||||
|
bubble_split = (l, r)
|
||||||
|
splits_performed.append(f"BOX#{bid} (nested/side-by-side forced split)")
|
||||||
|
|
||||||
if bubble_split is None:
|
if bubble_split is None:
|
||||||
row_split = split_bubble_if_multiple_rows(bubble_indices[bid], filtered, bid=bid)
|
row_split = split_bubble_if_multiple_rows(bubble_indices[bid], filtered, bid=bid)
|
||||||
if row_split:
|
if row_split:
|
||||||
top_idxs, bottom_idxs = row_split
|
t, b = row_split
|
||||||
if top_idxs and bottom_idxs:
|
if t and b:
|
||||||
bubble_split = (top_idxs, bottom_idxs)
|
bubble_split = (t, b)
|
||||||
splits_performed.append(f"BOX#{bid} (Horizontal Row Split: {len(top_idxs)} | {len(bottom_idxs)} quads)")
|
splits_performed.append(f"BOX#{bid} (Horizontal Row Split: {len(t)} | {len(b)} quads)")
|
||||||
|
|
||||||
|
if bubble_split is None:
|
||||||
|
gy = split_cluster_by_big_vertical_gap(bubble_indices[bid], filtered, factor=1.9, min_gap=22)
|
||||||
|
if gy:
|
||||||
|
a, b = gy
|
||||||
|
bubble_split = (a, b)
|
||||||
|
splits_performed.append(f"BOX#{bid} (large vertical-gap split)")
|
||||||
|
|
||||||
if bubble_split:
|
if bubble_split:
|
||||||
part1_idxs, part2_idxs = bubble_split
|
part1_idxs, part2_idxs = bubble_split
|
||||||
|
|
||||||
new_bubbles[bid] = build_lines_from_indices(part1_idxs, filtered)
|
new_bubbles[bid] = build_lines_from_indices(part1_idxs, filtered)
|
||||||
ub_1 = boxes_union_xyxy([quad_bbox(filtered[i][0]) for i in part1_idxs])
|
ub_1 = boxes_union_xyxy([quad_bbox(filtered[i][0]) for i in part1_idxs])
|
||||||
new_bubble_boxes[bid] = (max(0, ub_1[0]-3), max(0, ub_1[1]-3), min(iw-1, ub_1[2]+3), min(ih-1, ub_1[3]+3))
|
new_bubble_boxes[bid] = (max(0, ub_1[0]-2), max(0, ub_1[1]-2), min(iw-1, ub_1[2]+2), min(ih-1, ub_1[3]+2))
|
||||||
new_bubble_quads[bid] = [filtered[i][0] for i in part1_idxs]
|
new_bubble_quads[bid] = [filtered[i][0] for i in part1_idxs]
|
||||||
new_bubble_indices[bid] = part1_idxs
|
new_bubble_indices[bid] = part1_idxs
|
||||||
|
|
||||||
new_bubbles[next_bid] = build_lines_from_indices(part2_idxs, filtered)
|
new_bubbles[next_bid] = build_lines_from_indices(part2_idxs, filtered)
|
||||||
ub_2 = boxes_union_xyxy([quad_bbox(filtered[i][0]) for i in part2_idxs])
|
ub_2 = boxes_union_xyxy([quad_bbox(filtered[i][0]) for i in part2_idxs])
|
||||||
new_bubble_boxes[next_bid] = (max(0, ub_2[0]-3), max(0, ub_2[1]-3), min(iw-1, ub_2[2]+3), min(ih-1, ub_2[3]+3))
|
new_bubble_boxes[next_bid] = (max(0, ub_2[0]-2), max(0, ub_2[1]-2), min(iw-1, ub_2[2]+2), min(ih-1, ub_2[3]+2))
|
||||||
new_bubble_quads[next_bid] = [filtered[i][0] for i in part2_idxs]
|
new_bubble_quads[next_bid] = [filtered[i][0] for i in part2_idxs]
|
||||||
new_bubble_indices[next_bid] = part2_idxs
|
new_bubble_indices[next_bid] = part2_idxs
|
||||||
next_bid += 1
|
next_bid += 1
|
||||||
@@ -1205,7 +1474,7 @@ def translate_manga_text(
|
|||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
translate_manga_text(
|
translate_manga_text(
|
||||||
image_path="004.png",
|
image_path="16.jpg",
|
||||||
source_lang="english",
|
source_lang="english",
|
||||||
target_lang="ca",
|
target_lang="ca",
|
||||||
confidence_threshold=0.05,
|
confidence_threshold=0.05,
|
||||||
|
|||||||
Reference in New Issue
Block a user