Compare commits

..

2 Commits

Author SHA1 Message Date
Guillem Hernandez Sola
f00647e668 Added new styles 2026-04-21 21:45:46 +02:00
Guillem Hernandez Sola
a5c81f4ff0 Added new styles 2026-04-21 21:27:22 +02:00

View File

@@ -104,11 +104,9 @@ def looks_like_box_tag(t: str) -> bool:
def is_noise_text(text: str) -> bool: def is_noise_text(text: str) -> bool:
t = (text or "").strip() t = (text or "").strip()
# ALLOW pure punctuation clusters like "...", "!!", "?!"
if re.fullmatch(r"[\?\!\.]+", t): if re.fullmatch(r"[\?\!\.]+", t):
return False return False
# ALLOW single alphabetical characters (crucial for vertical text)
if len(t) == 1 and t.isalpha(): if len(t) == 1 and t.isalpha():
return False return False
@@ -117,7 +115,6 @@ def is_noise_text(text: str) -> bool:
if looks_like_box_tag(t): if looks_like_box_tag(t):
return True return True
# Relaxed the length check to allow 1-2 letter words and punctuation
if len(t) <= 2 and not re.search(r"[A-Z0-9\?\!\.]", t) and not t.isalpha(): if len(t) <= 2 and not re.search(r"[A-Z0-9\?\!\.]", t) and not t.isalpha():
return True return True
@@ -199,12 +196,103 @@ def ocr_candidate_score(text: str) -> float:
# ============================================================ # ============================================================
# SPLITTERS # SPLITTERS
# ============================================================ # ============================================================
def split_wide_ocr_items(image_bgr, filtered_ocr):
    """
    Split OCR lines that were incorrectly merged across two bubble columns.

    For each OCR item that is abnormally wide relative to its height, a
    binarized vertical projection of the line's pixels is scanned for the
    widest continuous run of near-empty columns.  Only when that run is
    clearly wider than a normal inter-word space (i.e. it looks like the
    gutter between two bubbles) is the item split into a left and a right
    piece, with the text divided at the nearest space character.

    Parameters
    ----------
    image_bgr : np.ndarray
        Source page image (H x W x 3, BGR order, as loaded by cv2).
    filtered_ocr : list[tuple]
        Items of the form (quad, text, conf), as produced upstream.

    Returns
    -------
    tuple[list, int]
        (new_filtered, splits_made): the possibly expanded item list and
        the number of splits performed.
    """
    new_filtered = []
    splits_made = 0
    for item in filtered_ocr:
        quad, text, conf = item
        x1, y1, x2, y2 = quad_bbox(quad)
        w = x2 - x1
        h = max(1, y2 - y1)
        # Only consider lines that are abnormally wide AND actually splittable
        # (enough characters and at least one space to divide the text at).
        if w > h * 2.5 and len(text) > 5 and ' ' in text:
            pad = 2
            roi_y1 = max(0, y1 - pad)
            roi_y2 = min(image_bgr.shape[0], y2 + pad)
            roi_x1 = max(0, x1)
            roi_x2 = min(image_bgr.shape[1], x2)
            roi = image_bgr[roi_y1:roi_y2, roi_x1:roi_x2]
            if roi.size > 0:
                gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
                # Otsu + inverse binarization: ink -> 255, background -> 0.
                _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
                # Column-wise ink mass; near-zero columns are candidate gaps.
                proj = np.sum(thresh, axis=0)
                # Search only the middle 60% so leading/trailing whitespace
                # cannot win.  BUGFIX: clamp to the actual ROI width — the
                # bbox may have been clipped at the image border, making
                # `proj` shorter than `w`; the previous code could index
                # past the end of `gap_mask` and raise IndexError.
                start_x = min(int(w * 0.20), proj.shape[0])
                end_x = min(int(w * 0.80), proj.shape[0])
                if start_x < end_x:
                    # Expected width of one character in this line.
                    char_w = w / max(1, len(text))
                    # A real column gutter should be at least ~2.5 characters
                    # wide or 75% of the line height, whichever is larger.
                    min_gap_width = max(int(char_w * 2.5), int(h * 0.75))
                    # A column counts as "empty" if it carries <15% of the
                    # ink a fully inked column of height h would carry.
                    gap_threshold = h * 255 * 0.15
                    gap_mask = proj < gap_threshold
                    # Linear scan for the widest continuous run of gaps.
                    best_gap_start = -1
                    best_gap_len = 0
                    current_gap_start = -1
                    current_gap_len = 0
                    for x_rel in range(start_x, end_x):
                        if gap_mask[x_rel]:
                            if current_gap_len == 0:
                                current_gap_start = x_rel
                            current_gap_len += 1
                        else:
                            if current_gap_len > best_gap_len:
                                best_gap_len = current_gap_len
                                best_gap_start = current_gap_start
                            current_gap_len = 0
                    # Account for a gap run that extends to end_x.
                    if current_gap_len > best_gap_len:
                        best_gap_len = current_gap_len
                        best_gap_start = current_gap_start
                    # ONLY split if the gap is wide enough to be a gutter
                    # between bubbles (not a mere inter-word space).
                    if best_gap_len >= min_gap_width:
                        split_x = roi_x1 + best_gap_start + (best_gap_len // 2)
                        # Map the pixel split point to a character index...
                        split_idx = int((split_x - x1) / char_w)
                        # ...then snap it to the nearest space, if one is
                        # reasonably close (within 35% of the text length).
                        spaces = [i for i, c in enumerate(text) if c == ' ']
                        if spaces:
                            best_space = min(spaces, key=lambda i: abs(i - split_idx))
                            if abs(best_space - split_idx) < len(text) * 0.35:
                                split_idx = best_space
                        text_left = text[:split_idx].strip()
                        text_right = text[split_idx:].strip()
                        # Require both halves to be non-empty; otherwise
                        # fall through and keep the original item.
                        if text_left and text_right:
                            quad_left = [[x1, y1], [split_x, y1], [split_x, y2], [x1, y2]]
                            quad_right = [[split_x, y1], [x2, y1], [x2, y2], [split_x, y2]]
                            new_filtered.append((quad_left, text_left, conf))
                            new_filtered.append((quad_right, text_right, conf))
                            splits_made += 1
                            continue
        # If no split was made, keep the original item unchanged.
        new_filtered.append(item)
    return new_filtered, splits_made
def split_panel_box(image_bgr, bbox_xyxy, bubble_quads=None): def split_panel_box(image_bgr, bbox_xyxy, bubble_quads=None):
x1, y1, x2, y2 = bbox_xyxy x1, y1, x2, y2 = bbox_xyxy
w = x2 - x1 w = x2 - x1
h = y2 - y1 h = y2 - y1
if bubble_quads is not None and len(bubble_quads) < 10: if bubble_quads is not None and len(bubble_quads) < 4:
return None return None
if w < 50 or h < 50: if w < 50 or h < 50:
@@ -280,20 +368,20 @@ def split_bubble_if_multiple_columns(indices, ocr, bid=None, use_aggressive_thre
med_h = float(np.median(hs)) if hs else 15.0 med_h = float(np.median(hs)) if hs else 15.0
if use_aggressive_thresholds: if use_aggressive_thresholds:
threshold1 = 80.0 threshold1 = 60.0
threshold2 = med_h * 1.2 threshold2 = med_h * 1.0
min_gap = 40.0 min_gap = 20.0
else: else:
threshold1 = 120.0 threshold1 = 90.0
threshold2 = med_h * 3.0 threshold2 = med_h * 1.5
min_gap = 60.0 min_gap = 25.0
if max_gap_size > threshold1 or (max_gap_size > threshold2 and max_gap_size > min_gap): if max_gap_size > threshold1 or (max_gap_size > threshold2 and max_gap_size > min_gap):
split_idx = max_gap_idx split_idx = max_gap_idx
left_indices = [item[0] for item in sorted_items[:split_idx]] left_indices = [item[0] for item in sorted_items[:split_idx]]
right_indices = [item[0] for item in sorted_items[split_idx:]] right_indices = [item[0] for item in sorted_items[split_idx:]]
if len(left_indices) < 2 or len(right_indices) < 2: if len(left_indices) < 1 or len(right_indices) < 1:
return None return None
return left_indices, right_indices return left_indices, right_indices
@@ -326,8 +414,8 @@ def split_bubble_if_multiple_rows(indices, ocr, bid=None):
hs = [b[3] - b[1] for b in boxes] hs = [b[3] - b[1] for b in boxes]
med_h = float(np.median(hs)) if hs else 15.0 med_h = float(np.median(hs)) if hs else 15.0
threshold = med_h * 2.5 threshold = med_h * 1.8
min_gap = 40.0 min_gap = 20.0
if max_gap_size > threshold and max_gap_size > min_gap: if max_gap_size > threshold and max_gap_size > min_gap:
split_idx = max_gap_idx split_idx = max_gap_idx
@@ -345,10 +433,8 @@ def split_bubble_if_multiple_rows(indices, ocr, bid=None):
# ============================================================ # ============================================================
class MacVisionDetector: class MacVisionDetector:
def __init__(self, source_lang="en"): def __init__(self, source_lang="en"):
# 1. Normalize the input language string
lang_key = source_lang.lower().strip() lang_key = source_lang.lower().strip()
# 2. Comprehensive mapping to Apple Vision BCP-47 language codes
lang_map = { lang_map = {
"en": "en-US", "english": "en-US", "en": "en-US", "english": "en-US",
"es": "es-ES", "spanish": "es-ES", "es": "es-ES", "spanish": "es-ES",
@@ -358,10 +444,9 @@ class MacVisionDetector:
"it": "it-IT", "italian": "it-IT", "it": "it-IT", "italian": "it-IT",
"de": "de-DE", "german": "de-DE", "de": "de-DE", "german": "de-DE",
"ko": "ko-KR", "korean": "ko-KR", "ko": "ko-KR", "korean": "ko-KR",
"zh": "ko-KR", "chinese": "zh-Hans" # Simplified Chinese "zh": "zh-Hans", "chinese": "zh-Hans"
} }
# 3. Resolve the language code
apple_lang = lang_map.get(lang_key, "en-US") apple_lang = lang_map.get(lang_key, "en-US")
self.langs = [apple_lang] self.langs = [apple_lang]
print(f"⚡ Using Apple Vision OCR (Language: {self.langs[0]})") print(f"⚡ Using Apple Vision OCR (Language: {self.langs[0]})")
@@ -683,7 +768,7 @@ def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=3):
centers = [quad_center(r[0]) for r in ocr] centers = [quad_center(r[0]) for r in ocr]
hs = [max(1.0, b[3] - b[1]) for b in boxes] hs = [max(1.0, b[3] - b[1]) for b in boxes]
med_h = float(np.median(hs)) if hs else 12.0 med_h = float(np.median(hs)) if hs else 12.0
dist_thresh = max(20.0, med_h * 2.2) dist_thresh = max(20.0, med_h * 1.8)
adaptive_gap_y = max(gap_px, med_h * 2.5) adaptive_gap_y = max(gap_px, med_h * 2.5)
@@ -709,7 +794,11 @@ def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=3):
cx2, cy2 = centers[j] cx2, cy2 = centers[j]
is_vertically_aligned = abs(cx1 - cx2) < (med_h * 1.5) is_vertically_aligned = abs(cx1 - cx2) < (med_h * 1.5)
if is_vertically_aligned and gap_y <= (med_h * 4.0): if gap_x == 0 and gap_y <= (med_h * 3.5):
unite(i, j)
continue
if is_vertically_aligned and gap_y <= (med_h * 3.5):
unite(i, j) unite(i, j)
continue continue
@@ -718,7 +807,7 @@ def group_tokens(ocr, image_shape, gap_px=18, bbox_padding=3):
continue continue
d = ((cx1 - cx2) ** 2 + (cy1 - cy2) ** 2) ** 0.5 d = ((cx1 - cx2) ** 2 + (cy1 - cy2) ** 2) ** 0.5
if d <= dist_thresh and abs(cy1 - cy2) <= med_h * 3.0: if d <= dist_thresh and abs(cy1 - cy2) <= med_h * 1.5:
unite(i, j) unite(i, j)
groups = {} groups = {}
@@ -929,6 +1018,10 @@ def translate_manga_text(
print("⚠️ No text after filtering.") print("⚠️ No text after filtering.")
return return
filtered, splits_made = split_wide_ocr_items(image, filtered)
if splits_made > 0:
print(f"✂️ Split {splits_made} wide OCR lines across column gaps.")
bubbles, bubble_boxes, bubble_quads, bubble_indices = group_tokens( bubbles, bubble_boxes, bubble_quads, bubble_indices = group_tokens(
filtered, image.shape, gap_px=resolved_gap, bbox_padding=3 filtered, image.shape, gap_px=resolved_gap, bbox_padding=3
) )
@@ -941,7 +1034,6 @@ def translate_manga_text(
box = bubble_boxes[bid] box = bubble_boxes[bid]
bubble_split = None bubble_split = None
# 1. Panel border split
split_result = split_panel_box(image, box, bubble_quads=bubble_quads[bid]) split_result = split_panel_box(image, box, bubble_quads=bubble_quads[bid])
if split_result: if split_result:
box_left, box_right, split_x = split_result box_left, box_right, split_x = split_result
@@ -957,7 +1049,7 @@ def translate_manga_text(
if left_idxs and right_idxs: if left_idxs and right_idxs:
bubble_split = (left_idxs, right_idxs) bubble_split = (left_idxs, right_idxs)
splits_performed.append(f"BOX#{bid} (panel border at x={split_x})") splits_performed.append(f"BOX#{bid} (panel border at x={split_x})")
elif len(bubble_quads[bid]) >= 10: elif len(bubble_quads[bid]) >= 4:
col_split = split_bubble_if_multiple_columns(bubble_indices[bid], filtered, bid=bid, use_aggressive_thresholds=True) col_split = split_bubble_if_multiple_columns(bubble_indices[bid], filtered, bid=bid, use_aggressive_thresholds=True)
if col_split: if col_split:
left_idxs, right_idxs = col_split left_idxs, right_idxs = col_split
@@ -965,7 +1057,6 @@ def translate_manga_text(
bubble_split = (left_idxs, right_idxs) bubble_split = (left_idxs, right_idxs)
splits_performed.append(f"BOX#{bid} ({len(left_idxs)} quads | {len(right_idxs)} quads)") splits_performed.append(f"BOX#{bid} ({len(left_idxs)} quads | {len(right_idxs)} quads)")
# 2. Check for vertical columns (left/right split)
if bubble_split is None: if bubble_split is None:
col_split = split_bubble_if_multiple_columns(bubble_indices[bid], filtered, bid=bid) col_split = split_bubble_if_multiple_columns(bubble_indices[bid], filtered, bid=bid)
if col_split: if col_split:
@@ -974,7 +1065,6 @@ def translate_manga_text(
bubble_split = (left_idxs, right_idxs) bubble_split = (left_idxs, right_idxs)
splits_performed.append(f"BOX#{bid} (Vertical Column Split: {len(left_idxs)} | {len(right_idxs)} quads)") splits_performed.append(f"BOX#{bid} (Vertical Column Split: {len(left_idxs)} | {len(right_idxs)} quads)")
# 3. Check for horizontal rows (top/bottom split)
if bubble_split is None: if bubble_split is None:
row_split = split_bubble_if_multiple_rows(bubble_indices[bid], filtered, bid=bid) row_split = split_bubble_if_multiple_rows(bubble_indices[bid], filtered, bid=bid)
if row_split: if row_split:
@@ -1115,8 +1205,8 @@ def translate_manga_text(
if __name__ == "__main__": if __name__ == "__main__":
translate_manga_text( translate_manga_text(
image_path="003.jpg", image_path="004.png",
source_lang="es", source_lang="english",
target_lang="ca", target_lang="ca",
confidence_threshold=0.05, confidence_threshold=0.05,
min_text_length=1, min_text_length=1,
@@ -1125,6 +1215,6 @@ if __name__ == "__main__":
quality_threshold=0.62, quality_threshold=0.62,
export_to_file="output.txt", export_to_file="output.txt",
export_bubbles_to="bubbles.json", export_bubbles_to="bubbles.json",
reading_mode="rtl", # Changed to RTL for Japanese Manga reading_mode="rtl",
debug=True debug=True
) )