Add enhanced multi-pass OCR: image preprocessing variants, missed-region detection, and result merging with text voting

This commit is contained in:
Guillem Hernandez Sola
2026-04-21 23:27:56 +02:00
parent 494631c967
commit 512bb32f66

View File

@@ -32,12 +32,12 @@ GLOSSARY = {
SOUND_EFFECT_PATTERNS = [ SOUND_EFFECT_PATTERNS = [
r"^b+i+p+$", r"^sha+$", r"^ha+$", r"^ah+$", r"^b+i+p+$", r"^sha+$", r"^ha+$", r"^ah+$",
r"^ugh+$", r"^bam+$", r"^pow+$", r"^boom+$", r"^bang+$", r"^ugh+$", r"^bam+$", r"^pow+$", r"^boom+$", r"^bang+$",
r"^Grr+$", r"^grrp+$", r"^fshoo+$", r"^fwuip+$",
r"^crash+$", r"^thud+$", r"^zip+$", r"^swoosh+$", r"^chirp+$" r"^crash+$", r"^thud+$", r"^zip+$", r"^swoosh+$", r"^chirp+$"
] ]
TITLE_PATTERNS = [ TITLE_PATTERNS = [
r"^(mission|chapter|episode|vol\.?|volume)\s*\d+$", r"^(chapter|episode|vol\.?|volume)\s*\d+$",
r"^(spy|family|spy.family)$",
r"^by\s+.+$", r"^by\s+.+$",
] ]
@@ -82,6 +82,22 @@ def postprocess_translation_general(text: str) -> str:
return t return t
def fix_common_ocr_errors(text: str) -> str:
    """Fix common OCR mistakes in manga text.

    Applies context-aware character corrections:
    - Letter 'O' misread for digit '0' when adjacent to digits.
    - Pipe '|' misread for capital 'I'.
    - Backtick misread for an apostrophe.

    Args:
        text: Raw OCR output string.

    Returns:
        The corrected string.
    """
    result = text
    # Fix "O" -> "0" between digits. A lookahead is used so overlapping
    # matches like "1O2O3" are all corrected (a consuming second group
    # would swallow the digit and skip every other occurrence).
    result = re.sub(r'(\d)O(?=\d)', r'\g<1>0', result)
    # Fix "O" -> "0" after a digit when followed by a non-letter OR the
    # end of the string (the plain [^a-zA-Z] class missed a trailing "O").
    result = re.sub(r'(\d)O(?=[^a-zA-Z]|$)', r'\g<1>0', result)
    # Fix common character confusions
    result = result.replace('|', 'I')
    result = result.replace('`', "'")
    return result
def is_sound_effect(text: str) -> bool: def is_sound_effect(text: str) -> bool:
cleaned = re.sub(r"[^a-z]", "", (text or "").strip().lower()) cleaned = re.sub(r"[^a-z]", "", (text or "").strip().lower())
return any(re.fullmatch(p, cleaned, re.IGNORECASE) for p in SOUND_EFFECT_PATTERNS) return any(re.fullmatch(p, cleaned, re.IGNORECASE) for p in SOUND_EFFECT_PATTERNS)
@@ -194,7 +210,80 @@ def ocr_candidate_score(text: str) -> float:
# ============================================================ # ============================================================
# SPEECH BUBBLE DETECTION (NEW) # ENHANCED IMAGE PREPROCESSING
# ============================================================
def enhance_image_for_ocr(image_bgr, upscale_factor=2.5):
    """Preprocess a BGR image to maximize OCR legibility.

    Pipeline: cubic upscale -> grayscale -> non-local-means denoise ->
    CLAHE contrast boost -> 3x3 sharpen -> adaptive threshold ->
    morphological close. The result is converted back to 3-channel BGR
    so it can be fed straight to the Vision API.

    Args:
        image_bgr: Source image (BGR, as loaded by cv2).
        upscale_factor: Linear scale applied before processing.

    Returns:
        A binarized, sharpened BGR image at the upscaled resolution.
    """
    src_h, src_w = image_bgr.shape[:2]
    target_size = (int(src_w * upscale_factor), int(src_h * upscale_factor))
    big = cv2.resize(image_bgr, target_size, interpolation=cv2.INTER_CUBIC)

    mono = cv2.cvtColor(big, cv2.COLOR_BGR2GRAY)
    # Non-local-means denoising preserves stroke edges better than a blur.
    smooth = cv2.fastNlMeansDenoising(mono, None, h=10, templateWindowSize=7, searchWindowSize=21)

    # Local histogram equalization lifts faint text out of gray bubbles.
    contrasted = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8)).apply(smooth)

    # Classic sharpening kernel: center 9, all eight neighbors -1.
    sharp = cv2.filter2D(contrasted, -1, np.array([[-1, -1, -1],
                                                   [-1,  9, -1],
                                                   [-1, -1, -1]]))

    # Adaptive thresholding yields crisp black-on-white glyphs even with
    # uneven page lighting.
    bw = cv2.adaptiveThreshold(sharp, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                               cv2.THRESH_BINARY, 11, 2)
    # Close tiny gaps left inside letter strokes.
    closed = cv2.morphologyEx(bw, cv2.MORPH_CLOSE, np.ones((2, 2), np.uint8))

    return cv2.cvtColor(closed, cv2.COLOR_GRAY2BGR)
def detect_small_text_regions(image_bgr, existing_quads):
    """Find text-like regions outside already-detected quads.

    Masks out every quad in *existing_quads*, Otsu-thresholds the rest of
    the page, and returns the bounding boxes of contours whose area and
    aspect ratio are plausible for text.

    Args:
        image_bgr: Full page image (BGR).
        existing_quads: Iterable of 4-point quads already covered by OCR.

    Returns:
        List of (x1, y1, x2, y2) boxes for candidate missed-text regions.
    """
    mono = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)

    # Paint the known detections onto a mask so they can be excluded.
    known = np.zeros(mono.shape, dtype=np.uint8)
    for q in existing_quads:
        cv2.fillPoly(known, [np.array(q, dtype=np.int32)], 255)

    # Otsu binarization (inverted: ink becomes white), restricted to the
    # areas no existing quad covers.
    _, ink = cv2.threshold(mono, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    unexplored_ink = cv2.bitwise_and(ink, ink, mask=cv2.bitwise_not(known))

    contours, _ = cv2.findContours(unexplored_ink, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    boxes = []
    for c in contours:
        x, y, w, h = cv2.boundingRect(c)
        # Keep only blobs whose area and height/width ratio look like glyphs.
        if 50 < w * h < 5000 and 0.1 < h / max(w, 1) < 10:
            boxes.append((x, y, x + w, y + h))
    return boxes
# ============================================================
# SPEECH BUBBLE DETECTION
# ============================================================ # ============================================================
def detect_speech_bubbles(image_bgr: np.ndarray) -> List[np.ndarray]: def detect_speech_bubbles(image_bgr: np.ndarray) -> List[np.ndarray]:
"""Detect speech bubble contours for box splitting""" """Detect speech bubble contours for box splitting"""
@@ -302,7 +391,7 @@ def check_vertical_alignment_split(indices: List[int],
# ============================================================ # ============================================================
# BOX FIXING FUNCTIONS (NEW) # BOX FIXING FUNCTIONS
# ============================================================ # ============================================================
def apply_page_specific_fixes(bubbles: Dict[int, List[str]], def apply_page_specific_fixes(bubbles: Dict[int, List[str]],
bubble_boxes: Dict[int, Tuple], bubble_boxes: Dict[int, Tuple],
@@ -311,12 +400,7 @@ def apply_page_specific_fixes(bubbles: Dict[int, List[str]],
ocr: List[Tuple], ocr: List[Tuple],
image_bgr: np.ndarray, image_bgr: np.ndarray,
page_identifier: str) -> Tuple[Dict, Dict, Dict, Dict]: page_identifier: str) -> Tuple[Dict, Dict, Dict, Dict]:
""" """Apply page-specific fixes to bubble detection issues"""
Apply page-specific fixes to bubble detection issues
Args:
page_identifier: Base filename (e.g., "15", "16", "19")
"""
# Detect speech bubbles for splitting logic # Detect speech bubbles for splitting logic
bubble_contours = detect_speech_bubbles(image_bgr) bubble_contours = detect_speech_bubbles(image_bgr)
@@ -470,6 +554,302 @@ def apply_page_specific_fixes(bubbles: Dict[int, List[str]],
return bubbles, bubble_boxes, bubble_quads, bubble_indices return bubbles, bubble_boxes, bubble_quads, bubble_indices
# ============================================================
# ENHANCED OCR ENGINE
# ============================================================
class ImprovedMacVisionDetector:
    """Multi-pass OCR engine built on Apple's Vision framework (macOS only).

    Runs OCR over several preprocessing variants of the same image, then
    merges the per-variant detections by spatial clustering and
    confidence-weighted text voting. Results are (quad, text, confidence)
    tuples with quads in top-left-origin pixel coordinates of the
    ORIGINAL image.
    """

    def __init__(self, source_lang="en"):
        """Resolve *source_lang* (short code or English name) to an Apple locale."""
        lang_key = source_lang.lower().strip()
        lang_map = {
            "en": "en-US", "english": "en-US",
            "es": "es-ES", "spanish": "es-ES",
            "ca": "ca-ES", "catalan": "ca-ES",
            "fr": "fr-FR", "french": "fr-FR",
            "ja": "ja-JP", "japanese": "ja-JP",
            "it": "it-IT", "italian": "it-IT",
            "de": "de-DE", "german": "de-DE",
            "ko": "ko-KR", "korean": "ko-KR",
            "zh": "zh-Hans", "chinese": "zh-Hans"
        }
        # Unknown languages fall back to US English.
        apple_lang = lang_map.get(lang_key, "en-US")
        self.langs = [apple_lang]
        print(f"⚡ Using Enhanced Apple Vision OCR (Language: {self.langs[0]})")

    def preprocess_variants(self, image_bgr):
        """Generate multiple preprocessing variants.

        Every variant is upscaled by the same 2.5x factor so that
        merge_multi_pass_results can map coordinates back to the original
        image with a single constant.
        """
        variants = []
        # Variant 1: Enhanced standard (enhance_image_for_ocr also upscales by 2.5)
        variants.append(("enhanced", enhance_image_for_ocr(image_bgr, upscale_factor=2.5)))
        # Variant 2: High contrast (global Otsu binarization)
        gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
        _, high_contrast = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        upscaled_hc = cv2.resize(high_contrast, None, fx=2.5, fy=2.5, interpolation=cv2.INTER_CUBIC)
        variants.append(("high_contrast", cv2.cvtColor(upscaled_hc, cv2.COLOR_GRAY2BGR)))
        # Variant 3: Bilateral filter (smooths while preserving edges)
        bilateral = cv2.bilateralFilter(image_bgr, 9, 75, 75)
        upscaled_bil = cv2.resize(bilateral, None, fx=2.5, fy=2.5, interpolation=cv2.INTER_CUBIC)
        variants.append(("bilateral", upscaled_bil))
        # Variant 4: Inverted (for white text on black panels)
        inverted = cv2.bitwise_not(image_bgr)
        upscaled_inv = cv2.resize(inverted, None, fx=2.5, fy=2.5, interpolation=cv2.INTER_CUBIC)
        variants.append(("inverted", upscaled_inv))
        # Variant 5: Original upscaled (no other processing)
        upscaled_orig = cv2.resize(image_bgr, None, fx=2.5, fy=2.5, interpolation=cv2.INTER_CUBIC)
        variants.append(("original", upscaled_orig))
        return variants

    def run_vision_ocr(self, image_bgr):
        """Run Vision OCR once on *image_bgr*.

        Returns a list of (quad, text, confidence) tuples; quads are
        4-point boxes in top-left-origin pixel coordinates of the image
        passed in (NOT rescaled).
        """
        if image_bgr is None or image_bgr.size == 0:
            return []
        ih, iw = image_bgr.shape[:2]
        # Vision consumes encoded image bytes, so round-trip through PNG.
        success, buffer = cv2.imencode('.png', image_bgr)
        if not success:
            return []
        ns_data = NSData.dataWithBytes_length_(buffer.tobytes(), len(buffer.tobytes()))
        handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(ns_data, None)
        results = []

        def completion_handler(request, error):
            # Errors are silently dropped here; the multi-pass merge makes
            # a single failed variant non-fatal.
            if error:
                return
            for observation in request.results():
                candidate = observation.topCandidates_(1)[0]
                text = candidate.string()
                confidence = candidate.confidence()
                bbox = observation.boundingBox()
                # Vision reports normalized, bottom-left-origin boxes;
                # convert to top-left-origin pixel coordinates.
                x = bbox.origin.x * iw
                y_bottom_left = bbox.origin.y * ih
                w = bbox.size.width * iw
                h = bbox.size.height * ih
                y = ih - y_bottom_left - h
                quad = [
                    [int(x), int(y)],
                    [int(x + w), int(y)],
                    [int(x + w), int(y + h)],
                    [int(x), int(y + h)]
                ]
                results.append((quad, text, confidence))

        request = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(completion_handler)
        request.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate)
        request.setUsesLanguageCorrection_(False)  # Disable for manga
        request.setRecognitionLanguages_(self.langs)
        request.setAutomaticallyDetectsLanguage_(True)
        handler.performRequests_error_([request], None)
        return results

    def merge_multi_pass_results(self, all_results, original_shape):
        """Merge results from multiple preprocessing passes.

        Clusters overlapping detections (IoU > 0.5 on their bounding
        boxes), then picks the highest-confidence entry per cluster,
        overriding its text when a confidence-weighted vote across
        variants prefers a different normalized string.

        NOTE(review): *original_shape* is currently unused; coordinates
        are rescaled with the hardcoded 2.5 factor, which must match
        preprocess_variants.
        """
        if not all_results:
            return []
        # Scale factor to normalize coordinates back to the original image
        # (all variants are upscaled by exactly this factor).
        scale_factor = 2.5
        # Normalize all quads to original image coordinates
        normalized_results = []
        for variant_name, results in all_results:
            for quad, text, conf in results:
                # Scale quad back to original size
                scaled_quad = [[int(p[0] / scale_factor), int(p[1] / scale_factor)] for p in quad]
                normalized_results.append((scaled_quad, text, conf, variant_name))

        # Group similar detections (same location, similar text)
        def quads_overlap(q1, q2, threshold=0.5):
            """True if the quads' bounding boxes overlap with IoU > threshold."""
            b1 = quad_bbox(q1)
            b2 = quad_bbox(q2)
            # Calculate IoU
            x1 = max(b1[0], b2[0])
            y1 = max(b1[1], b2[1])
            x2 = min(b1[2], b2[2])
            y2 = min(b1[3], b2[3])
            if x2 < x1 or y2 < y1:
                return False
            intersection = (x2 - x1) * (y2 - y1)
            area1 = (b1[2] - b1[0]) * (b1[3] - b1[1])
            area2 = (b2[2] - b2[0]) * (b2[3] - b2[1])
            union = area1 + area2 - intersection
            # max(union, 1) guards against degenerate zero-area boxes.
            iou = intersection / max(union, 1)
            return iou > threshold

        # Cluster overlapping detections (greedy, single-link to the seed quad).
        clusters = []
        used = set()
        for i, (quad1, text1, conf1, var1) in enumerate(normalized_results):
            if i in used:
                continue
            cluster = [(quad1, text1, conf1, var1)]
            used.add(i)
            for j, (quad2, text2, conf2, var2) in enumerate(normalized_results):
                if j in used or i == j:
                    continue
                if quads_overlap(quad1, quad2, threshold=0.5):
                    cluster.append((quad2, text2, conf2, var2))
                    used.add(j)
            clusters.append(cluster)

        # Vote on the best result per cluster.
        final_results = []
        for cluster in clusters:
            # Sort by confidence so index 0 is the most confident detection.
            cluster.sort(key=lambda x: x[2], reverse=True)
            # Take highest confidence result
            best_quad, best_text, best_conf, best_var = cluster[0]
            # If multiple variants agree on the same normalized text, their
            # confidences accumulate into a vote.
            text_votes = {}
            for _, text, conf, _ in cluster:
                normalized = normalize_text(text)
                if normalized:
                    text_votes[normalized] = text_votes.get(normalized, 0) + conf
            if text_votes:
                best_voted_text = max(text_votes.items(), key=lambda x: x[1])[0]
                if best_voted_text != normalize_text(best_text):
                    # Use voted text if it has more support
                    best_text = best_voted_text
            # Apply OCR error fixes
            best_text = fix_common_ocr_errors(best_text)
            final_results.append((best_quad, best_text, best_conf))
        return final_results

    def read(self, image_path_or_array):
        """Enhanced multi-pass OCR.

        Accepts a file path or a BGR array and returns merged
        (quad, text, confidence) tuples in original-image coordinates.
        """
        if isinstance(image_path_or_array, str):
            img = cv2.imread(image_path_or_array)
        else:
            img = image_path_or_array
        if img is None or img.size == 0:
            return []
        original_shape = img.shape
        # Generate preprocessing variants
        variants = self.preprocess_variants(img)
        # Run OCR on each variant
        all_results = []
        for variant_name, variant_img in variants:
            results = self.run_vision_ocr(variant_img)
            if results:
                all_results.append((variant_name, results))
        # Merge and vote on results
        final_results = self.merge_multi_pass_results(all_results, original_shape)
        return final_results
# ============================================================
# ORIGINAL OCR ENGINE (Fallback)
# ============================================================
class MacVisionDetector:
    """Single-pass OCR engine backed by Apple's native Vision framework.

    Kept as a fallback for the enhanced multi-pass detector. ``read``
    returns (quad, text, confidence) tuples with quads in
    top-left-origin pixel coordinates.
    """

    def __init__(self, source_lang="en"):
        """Resolve *source_lang* (short code or English name) to an Apple locale."""
        aliases = {
            "en": "en-US", "english": "en-US",
            "es": "es-ES", "spanish": "es-ES",
            "ca": "ca-ES", "catalan": "ca-ES",
            "fr": "fr-FR", "french": "fr-FR",
            "ja": "ja-JP", "japanese": "ja-JP",
            "it": "it-IT", "italian": "it-IT",
            "de": "de-DE", "german": "de-DE",
            "ko": "ko-KR", "korean": "ko-KR",
            "zh": "zh-Hans", "chinese": "zh-Hans"
        }
        # Unknown languages fall back to US English.
        self.langs = [aliases.get(source_lang.lower().strip(), "en-US")]
        print(f"⚡ Using Apple Vision OCR (Language: {self.langs[0]})")

    def read(self, image_path_or_array):
        """OCR an image (file path or BGR array).

        Returns a list of (quad, text, confidence) tuples, or an empty
        list if the image cannot be loaded or encoded.
        """
        if isinstance(image_path_or_array, str):
            img = cv2.imread(image_path_or_array)
        else:
            img = image_path_or_array
        if img is None or img.size == 0:
            return []
        img_h, img_w = img.shape[:2]

        # Vision consumes encoded bytes, so round-trip through PNG.
        encoded_ok, png = cv2.imencode('.png', img)
        if not encoded_ok:
            return []
        payload = png.tobytes()
        ns_data = NSData.dataWithBytes_length_(payload, len(payload))
        handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(ns_data, None)

        detections = []

        def on_complete(request, error):
            if error:
                print(f"Vision API Error: {error}")
                return
            for obs in request.results():
                top = obs.topCandidates_(1)[0]
                box = obs.boundingBox()
                # Vision reports normalized bottom-left-origin boxes;
                # convert to top-left-origin pixel coordinates.
                left = box.origin.x * img_w
                width = box.size.width * img_w
                height = box.size.height * img_h
                top_y = img_h - box.origin.y * img_h - height
                corners = [
                    [int(left), int(top_y)],
                    [int(left + width), int(top_y)],
                    [int(left + width), int(top_y + height)],
                    [int(left), int(top_y + height)]
                ]
                detections.append((corners, top.string(), top.confidence()))

        request = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(on_complete)
        request.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate)
        request.setUsesLanguageCorrection_(True)
        request.setRecognitionLanguages_(self.langs)
        handler.performRequests_error_([request], None)
        return detections
# ============================================================ # ============================================================
# SPLITTERS + QUAD NORMALIZATION # SPLITTERS + QUAD NORMALIZATION
# ============================================================ # ============================================================
@@ -960,84 +1340,6 @@ def merge_close_bubbles_by_line_height(bubbles, bubble_boxes, bubble_quads, bubb
return out_b, out_bb, out_bq, out_bi return out_b, out_bb, out_bq, out_bi
# ============================================================
# OCR ENGINES (Apple Native Vision)
# ============================================================
class MacVisionDetector:
    """Single-pass OCR engine using Apple's native Vision framework (macOS only).

    ``read`` returns (quad, text, confidence) tuples with quads in
    top-left-origin pixel coordinates.
    """

    def __init__(self, source_lang="en"):
        """Resolve *source_lang* (short code or English name) to an Apple locale."""
        lang_key = source_lang.lower().strip()
        lang_map = {
            "en": "en-US", "english": "en-US",
            "es": "es-ES", "spanish": "es-ES",
            "ca": "ca-ES", "catalan": "ca-ES",
            "fr": "fr-FR", "french": "fr-FR",
            "ja": "ja-JP", "japanese": "ja-JP",
            "it": "it-IT", "italian": "it-IT",
            "de": "de-DE", "german": "de-DE",
            "ko": "ko-KR", "korean": "ko-KR",
            "zh": "zh-Hans", "chinese": "zh-Hans"
        }
        # Unknown languages fall back to US English.
        apple_lang = lang_map.get(lang_key, "en-US")
        self.langs = [apple_lang]
        print(f"⚡ Using Apple Vision OCR (Language: {self.langs[0]})")

    def read(self, image_path_or_array):
        """OCR an image (file path or BGR array) -> list of (quad, text, confidence)."""
        if isinstance(image_path_or_array, str):
            img = cv2.imread(image_path_or_array)
        else:
            img = image_path_or_array
        if img is None or img.size == 0:
            return []
        ih, iw = img.shape[:2]
        # Vision consumes encoded image bytes, so round-trip through PNG.
        success, buffer = cv2.imencode('.png', img)
        if not success:
            return []
        ns_data = NSData.dataWithBytes_length_(buffer.tobytes(), len(buffer.tobytes()))
        handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(ns_data, None)
        results = []

        def completion_handler(request, error):
            if error:
                print(f"Vision API Error: {error}")
                return
            for observation in request.results():
                candidate = observation.topCandidates_(1)[0]
                text = candidate.string()
                confidence = candidate.confidence()
                bbox = observation.boundingBox()
                # Vision reports normalized, bottom-left-origin boxes;
                # convert to top-left-origin pixel coordinates.
                x = bbox.origin.x * iw
                y_bottom_left = bbox.origin.y * ih
                w = bbox.size.width * iw
                h = bbox.size.height * ih
                y = ih - y_bottom_left - h
                quad = [
                    [int(x), int(y)],
                    [int(x + w), int(y)],
                    [int(x + w), int(y + h)],
                    [int(x), int(y + h)]
                ]
                results.append((quad, text, confidence))

        request = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(completion_handler)
        request.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate)
        request.setUsesLanguageCorrection_(True)
        request.setRecognitionLanguages_(self.langs)
        handler.performRequests_error_([request], None)
        return results
# ============================================================ # ============================================================
# PREPROCESS # PREPROCESS
# ============================================================ # ============================================================
@@ -1127,7 +1429,7 @@ def rebuild_text_from_vision_result(res):
return normalize_text(" ".join(lines)) return normalize_text(" ".join(lines))
def reread_bubble_with_vision(image_bgr, bbox_xyxy, vision_detector: MacVisionDetector, upscale=3.0, pad=24): def reread_bubble_with_vision(image_bgr, bbox_xyxy, vision_detector, upscale=3.0, pad=24):
ih, iw = image_bgr.shape[:2] ih, iw = image_bgr.shape[:2]
x1, y1, x2, y2 = bbox_xyxy x1, y1, x2, y2 = bbox_xyxy
x1 = max(0, int(x1 - pad)); y1 = max(0, int(y1 - pad)) x1 = max(0, int(x1 - pad)); y1 = max(0, int(y1 - pad))
@@ -1148,7 +1450,13 @@ def reread_bubble_with_vision(image_bgr, bbox_xyxy, vision_detector: MacVisionDe
proc3 = cv2.cvtColor(proc, cv2.COLOR_GRAY2BGR) if len(proc.shape) == 2 else proc proc3 = cv2.cvtColor(proc, cv2.COLOR_GRAY2BGR) if len(proc.shape) == 2 else proc
for a in angles: for a in angles:
rot = rotate_image_keep_bounds(proc3, a) rot = rotate_image_keep_bounds(proc3, a)
res = vision_detector.read(rot)
# Use run_vision_ocr if available (enhanced detector)
if hasattr(vision_detector, 'run_vision_ocr'):
res = vision_detector.run_vision_ocr(rot)
else:
res = vision_detector.read(rot)
txt = rebuild_text_from_vision_result(res) txt = rebuild_text_from_vision_result(res)
sc = ocr_candidate_score(txt) sc = ocr_candidate_score(txt)
if sc > best_v_sc: if sc > best_v_sc:
@@ -1469,13 +1777,13 @@ def export_bubbles(filepath, bbox_dict, quads_dict, indices_dict, ocr, reading_m
# ============================================================ # ============================================================
# PIPELINE # MAIN PIPELINE
# ============================================================ # ============================================================
def translate_manga_text( def translate_manga_text(
image_path="001-page.png", image_path="001-page.png",
source_lang="en", source_lang="en",
target_lang="ca", target_lang="ca",
confidence_threshold=0.05, confidence_threshold=0.03,
min_text_length=1, min_text_length=1,
gap_px="auto", gap_px="auto",
filter_sound_effects=True, filter_sound_effects=True,
@@ -1483,7 +1791,8 @@ def translate_manga_text(
export_to_file="output.txt", export_to_file="output.txt",
export_bubbles_to="bubbles.json", export_bubbles_to="bubbles.json",
reading_mode="ltr", reading_mode="ltr",
debug=True debug=True,
use_enhanced_ocr=True
): ):
image = cv2.imread(image_path) image = cv2.imread(image_path)
if image is None: if image is None:
@@ -1493,11 +1802,48 @@ def translate_manga_text(
resolved_gap = auto_gap(image_path) if gap_px == "auto" else float(gap_px) resolved_gap = auto_gap(image_path) if gap_px == "auto" else float(gap_px)
print("Loading OCR engines...") print("Loading OCR engines...")
detector = MacVisionDetector(source_lang=source_lang)
# Use enhanced detector
if use_enhanced_ocr:
detector = ImprovedMacVisionDetector(source_lang=source_lang)
print("🚀 Using Enhanced Multi-Pass OCR")
else:
detector = MacVisionDetector(source_lang=source_lang)
print("Running detection OCR (Apple Vision)...") print("Running detection OCR (Apple Vision)...")
raw = detector.read(image_path) raw = detector.read(image_path)
print(f"Raw detections: {len(raw)}") print(f"Raw detections: {len(raw)}")
# Secondary pass for missed regions
if use_enhanced_ocr:
existing_quads = [r[0] for r in raw]
missed_regions = detect_small_text_regions(image, existing_quads)
if missed_regions:
print(f"🔍 Found {len(missed_regions)} potentially missed text regions")
# Re-run OCR on missed regions with higher upscaling
for region in missed_regions:
x1, y1, x2, y2 = region
# Add padding
pad = 10
x1 = max(0, x1 - pad)
y1 = max(0, y1 - pad)
x2 = min(image.shape[1], x2 + pad)
y2 = min(image.shape[0], y2 + pad)
crop = image[y1:y2, x1:x2]
if crop.size > 0:
# Aggressive upscaling for small text
upscaled = cv2.resize(crop, None, fx=4.0, fy=4.0, interpolation=cv2.INTER_CUBIC)
region_results = detector.run_vision_ocr(upscaled)
# Scale back and offset coordinates
for quad, text, conf in region_results:
scaled_quad = [[int(p[0]/4.0 + x1), int(p[1]/4.0 + y1)] for p in quad]
raw.append((scaled_quad, text, conf))
print(f"📝 Total detections after missed region scan: {len(raw)}")
filtered = [] filtered = []
skipped = 0 skipped = 0
@@ -1533,7 +1879,7 @@ def translate_manga_text(
if splits_made > 0: if splits_made > 0:
print(f"✂️ Split {splits_made} wide OCR lines across column gaps.") print(f"✂️ Split {splits_made} wide OCR lines across column gaps.")
# 2) split giant bridge quads (fixes page16 BOX19-like glue) # 2) split giant bridge quads
filtered, bridge_splits = split_abnormal_bridge_quads(image, filtered) filtered, bridge_splits = split_abnormal_bridge_quads(image, filtered)
if bridge_splits > 0: if bridge_splits > 0:
print(f"🧩 Split {bridge_splits} abnormal bridge OCR quad(s).") print(f"🧩 Split {bridge_splits} abnormal bridge OCR quad(s).")
@@ -1545,14 +1891,12 @@ def translate_manga_text(
filtered, image.shape, gap_px=resolved_gap, bbox_padding=1 filtered, image.shape, gap_px=resolved_gap, bbox_padding=1
) )
# merge accidental sibling fragments (fixes page15 BOX11+BOX16 style) # merge accidental sibling fragments
bubbles, bubble_boxes, bubble_quads, bubble_indices = merge_close_bubbles_by_line_height( bubbles, bubble_boxes, bubble_quads, bubble_indices = merge_close_bubbles_by_line_height(
bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered
) )
# ============================================================ # Apply page-specific fixes
# APPLY PAGE-SPECIFIC FIXES (NEW)
# ============================================================
page_identifier = os.path.basename(image_path) page_identifier = os.path.basename(image_path)
bubbles, bubble_boxes, bubble_quads, bubble_indices = apply_page_specific_fixes( bubbles, bubble_boxes, bubble_quads, bubble_indices = apply_page_specific_fixes(
bubbles, bubble_boxes, bubble_quads, bubble_indices, bubbles, bubble_boxes, bubble_quads, bubble_indices,
@@ -1760,16 +2104,17 @@ def translate_manga_text(
if __name__ == "__main__": if __name__ == "__main__":
translate_manga_text( translate_manga_text(
image_path="15.png", image_path="16.jpg",
source_lang="english", source_lang="english",
target_lang="ca", target_lang="ca",
confidence_threshold=0.05, confidence_threshold=0.03, # Lower threshold for better detection
min_text_length=1, min_text_length=1,
gap_px="auto", gap_px="auto",
filter_sound_effects=True, filter_sound_effects=True,
quality_threshold=0.62, quality_threshold=0.62,
export_to_file="output.txt", export_to_file="output.txt",
export_bubbles_to="bubbles.json", export_bubbles_to="bubbles.json",
reading_mode="rtl", reading_mode="ltr", # "rtl" or "ltr"
debug=True debug=True,
use_enhanced_ocr=True # Enable enhanced multi-pass OCR
) )