Added all
This commit is contained in:
@@ -32,12 +32,12 @@ GLOSSARY = {
|
||||
# Regexes for onomatopoeia / sound-effect text.  is_sound_effect() strips the
# input down to lowercase letters only and matches these with re.IGNORECASE,
# so each pattern should describe a bare lowercase letter run.
SOUND_EFFECT_PATTERNS = [
    r"^b+i+p+$", r"^sha+$", r"^ha+$", r"^ah+$",
    r"^ugh+$", r"^bam+$", r"^pow+$", r"^boom+$", r"^bang+$",
    # NOTE(review): the capital "G" below only matches because IGNORECASE is
    # applied at match time — consider normalizing to lowercase.
    r"^Grr+$", r"^grrp+$", r"^fshoo+$", r"^fwuip+$",
    r"^crash+$", r"^thud+$", r"^zip+$", r"^swoosh+$", r"^chirp+$"
]
|
||||
|
||||
# Regexes for non-dialogue title/credit text that should be skipped.
# Matched case-insensitively against whole cleaned strings elsewhere.
TITLE_PATTERNS = [
    # Chapter/volume style headings, e.g. "Mission 11", "Chapter 3", "Vol. 2".
    # (This pattern is a superset of the old "chapter|episode|vol" pattern,
    # which was a redundant duplicate and has been removed.)
    r"^(mission|chapter|episode|vol\.?|volume)\s*\d+$",
    # Series-title fragments.  NOTE(review): the "." in "spy.family" matches
    # ANY character — escape it if a literal dot/cross is intended.
    r"^(spy|family|spy.family)$",
    # Author credit lines, e.g. "by Tatsuya Endo".
    r"^by\s+.+$",
]
|
||||
|
||||
@@ -82,6 +82,22 @@ def postprocess_translation_general(text: str) -> str:
|
||||
return t
|
||||
|
||||
|
||||
def fix_common_ocr_errors(text: str) -> str:
    """Fix common OCR mistakes in manga text.

    - Converts a letter "O" to the digit "0" when it directly follows a
      digit and is not followed by a letter ("1O2" -> "102", "5O." -> "50.",
      trailing "1O" -> "10"); words like "OK"/"NO" are left alone.
    - Replaces the pipe character with "I" and backticks with apostrophes.

    Args:
        text: Raw OCR output string.

    Returns:
        The corrected string.
    """
    result = text

    # Context-aware O -> 0: digit before, no letter after.  Using a lookahead
    # instead of consuming the following character means a digit-"O" at the
    # very end of the string is fixed too — the original two-regex version
    # (r'(\d)O(\d)' then r'(\d)O([^a-zA-Z])') missed that case.
    result = re.sub(r'(\d)O(?=[^a-zA-Z]|$)', r'\g<1>0', result)

    # Common single-character confusions.
    result = result.replace('|', 'I')
    result = result.replace('`', "'")

    return result
|
||||
|
||||
|
||||
def is_sound_effect(text: str) -> bool:
    """Return True if *text*, reduced to lowercase letters only, matches one
    of the known onomatopoeia patterns in SOUND_EFFECT_PATTERNS."""
    letters_only = re.sub(r"[^a-z]", "", (text or "").strip().lower())
    for pattern in SOUND_EFFECT_PATTERNS:
        if re.fullmatch(pattern, letters_only, re.IGNORECASE):
            return True
    return False
|
||||
@@ -194,7 +210,80 @@ def ocr_candidate_score(text: str) -> float:
|
||||
|
||||
|
||||
# ============================================================
# ENHANCED IMAGE PREPROCESSING
# ============================================================
|
||||
def enhance_image_for_ocr(image_bgr, upscale_factor=2.5):
    """Preprocess a BGR image for OCR.

    Pipeline: upscale -> grayscale -> denoise -> CLAHE contrast boost ->
    sharpen -> adaptive threshold -> morphological close.

    Args:
        image_bgr: Input image (BGR, as loaded by OpenCV).
        upscale_factor: Linear scale applied before processing.

    Returns:
        A 3-channel BGR image containing the binarized result, ready for
        the Vision API.
    """
    # Upscale with cubic interpolation so small glyphs gain pixels.
    height, width = image_bgr.shape[:2]
    target_size = (int(width * upscale_factor), int(height * upscale_factor))
    resized = cv2.resize(image_bgr, target_size, interpolation=cv2.INTER_CUBIC)

    # Grayscale + non-local-means denoising.
    grayscale = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
    smoothed = cv2.fastNlMeansDenoising(grayscale, None, h=10, templateWindowSize=7, searchWindowSize=21)

    # Local contrast boost (CLAHE), then a standard 3x3 sharpening kernel.
    boosted = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8)).apply(smoothed)
    sharpen_kernel = np.array([[-1, -1, -1],
                               [-1,  9, -1],
                               [-1, -1, -1]])
    crisp = cv2.filter2D(boosted, -1, sharpen_kernel)

    # Gaussian adaptive threshold gives clean black-on-white text.
    thresholded = cv2.adaptiveThreshold(crisp, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                        cv2.THRESH_BINARY, 11, 2)

    # Close tiny gaps in strokes, then hand back a BGR image for Vision.
    closed = cv2.morphologyEx(thresholded, cv2.MORPH_CLOSE, np.ones((2, 2), np.uint8))
    return cv2.cvtColor(closed, cv2.COLOR_GRAY2BGR)
|
||||
|
||||
|
||||
def detect_small_text_regions(image_bgr, existing_quads):
    """Find small, text-like regions that fall outside *existing_quads*.

    Args:
        image_bgr: Page image (BGR).
        existing_quads: 4-point polygons of detections already made.

    Returns:
        List of (x1, y1, x2, y2) boxes, in image pixel coordinates, for
        candidate regions that may contain missed text.
    """
    grayscale = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)

    # Paint the already-detected quads into a mask, then invert it so that
    # only the uncovered parts of the page remain.
    covered = np.zeros(grayscale.shape, dtype=np.uint8)
    for quad in existing_quads:
        cv2.fillPoly(covered, [np.array(quad, dtype=np.int32)], 255)
    uncovered = cv2.bitwise_not(covered)

    # Otsu-binarize (inverted) and keep only the uncovered portion.
    _, otsu = cv2.threshold(grayscale, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    candidates = cv2.bitwise_and(otsu, otsu, mask=uncovered)

    # Contours of whatever ink remains in the uncovered areas.
    contours, _ = cv2.findContours(candidates, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    regions = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        # Keep blobs whose size and aspect ratio resemble a short run of text.
        if 50 < w * h < 5000 and 0.1 < h / max(w, 1) < 10:
            regions.append((x, y, x + w, y + h))

    return regions
|
||||
|
||||
|
||||
# ============================================================
|
||||
# SPEECH BUBBLE DETECTION
|
||||
# ============================================================
|
||||
def detect_speech_bubbles(image_bgr: np.ndarray) -> List[np.ndarray]:
|
||||
"""Detect speech bubble contours for box splitting"""
|
||||
@@ -302,7 +391,7 @@ def check_vertical_alignment_split(indices: List[int],
|
||||
|
||||
|
||||
# ============================================================
# BOX FIXING FUNCTIONS
# ============================================================
|
||||
def apply_page_specific_fixes(bubbles: Dict[int, List[str]],
|
||||
bubble_boxes: Dict[int, Tuple],
|
||||
@@ -311,12 +400,7 @@ def apply_page_specific_fixes(bubbles: Dict[int, List[str]],
|
||||
ocr: List[Tuple],
|
||||
image_bgr: np.ndarray,
|
||||
page_identifier: str) -> Tuple[Dict, Dict, Dict, Dict]:
|
||||
"""
|
||||
Apply page-specific fixes to bubble detection issues
|
||||
|
||||
Args:
|
||||
page_identifier: Base filename (e.g., "15", "16", "19")
|
||||
"""
|
||||
"""Apply page-specific fixes to bubble detection issues"""
|
||||
|
||||
# Detect speech bubbles for splitting logic
|
||||
bubble_contours = detect_speech_bubbles(image_bgr)
|
||||
@@ -470,6 +554,302 @@ def apply_page_specific_fixes(bubbles: Dict[int, List[str]],
|
||||
return bubbles, bubble_boxes, bubble_quads, bubble_indices
|
||||
|
||||
|
||||
# ============================================================
|
||||
# ENHANCED OCR ENGINE
|
||||
# ============================================================
|
||||
class ImprovedMacVisionDetector:
    """Enhanced Apple Vision OCR engine.

    Runs OCR over several preprocessing variants of the same image and
    merges the per-variant detections with IoU-based clustering plus a
    confidence-weighted text vote.  Results are (quad, text, confidence)
    tuples with quads in original-image pixel coordinates (top-left origin).
    """

    def __init__(self, source_lang="en"):
        """Map a user-facing language code/name to an Apple Vision locale."""
        lang_key = source_lang.lower().strip()

        lang_map = {
            "en": "en-US", "english": "en-US",
            "es": "es-ES", "spanish": "es-ES",
            "ca": "ca-ES", "catalan": "ca-ES",
            "fr": "fr-FR", "french": "fr-FR",
            "ja": "ja-JP", "japanese": "ja-JP",
            "it": "it-IT", "italian": "it-IT",
            "de": "de-DE", "german": "de-DE",
            "ko": "ko-KR", "korean": "ko-KR",
            "zh": "zh-Hans", "chinese": "zh-Hans"
        }

        apple_lang = lang_map.get(lang_key, "en-US")  # default to US English
        self.langs = [apple_lang]
        print(f"⚡ Using Enhanced Apple Vision OCR (Language: {self.langs[0]})")

    def preprocess_variants(self, image_bgr):
        """Generate multiple preprocessing variants of *image_bgr*.

        Every variant is upscaled by the same 2.5x factor so that
        merge_multi_pass_results can scale all coordinates back uniformly.
        """
        variants = []

        # Variant 1: full enhancement pipeline (denoise/CLAHE/sharpen/binarize).
        variants.append(("enhanced", enhance_image_for_ocr(image_bgr, upscale_factor=2.5)))

        # Variant 2: hard Otsu threshold for high-contrast pages.
        gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
        _, high_contrast = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        upscaled_hc = cv2.resize(high_contrast, None, fx=2.5, fy=2.5, interpolation=cv2.INTER_CUBIC)
        variants.append(("high_contrast", cv2.cvtColor(upscaled_hc, cv2.COLOR_GRAY2BGR)))

        # Variant 3: bilateral filter (smooths while preserving glyph edges).
        bilateral = cv2.bilateralFilter(image_bgr, 9, 75, 75)
        upscaled_bil = cv2.resize(bilateral, None, fx=2.5, fy=2.5, interpolation=cv2.INTER_CUBIC)
        variants.append(("bilateral", upscaled_bil))

        # Variant 4: inverted colors, for white text on dark backgrounds.
        inverted = cv2.bitwise_not(image_bgr)
        upscaled_inv = cv2.resize(inverted, None, fx=2.5, fy=2.5, interpolation=cv2.INTER_CUBIC)
        variants.append(("inverted", upscaled_inv))

        # Variant 5: plain upscale of the original.
        upscaled_orig = cv2.resize(image_bgr, None, fx=2.5, fy=2.5, interpolation=cv2.INTER_CUBIC)
        variants.append(("original", upscaled_orig))

        return variants

    def run_vision_ocr(self, image_bgr):
        """Run Vision OCR once on *image_bgr*.

        Returns a list of (quad, text, confidence) with quads as 4-point
        polygons in pixel coordinates of *image_bgr*, top-left origin.
        """
        if image_bgr is None or image_bgr.size == 0:
            return []

        ih, iw = image_bgr.shape[:2]

        success, buffer = cv2.imencode('.png', image_bgr)
        if not success:
            return []

        # Encode the PNG bytes once (the original computed tobytes() twice).
        png_bytes = buffer.tobytes()
        ns_data = NSData.dataWithBytes_length_(png_bytes, len(png_bytes))
        handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(ns_data, None)
        results = []

        def completion_handler(request, error):
            if error:
                return

            for observation in request.results():
                candidate = observation.topCandidates_(1)[0]
                text = candidate.string()
                confidence = candidate.confidence()

                # Vision reports normalized coordinates with a BOTTOM-left
                # origin; convert to pixel coordinates, top-left origin.
                bbox = observation.boundingBox()
                x = bbox.origin.x * iw
                y_bottom_left = bbox.origin.y * ih
                w = bbox.size.width * iw
                h = bbox.size.height * ih

                y = ih - y_bottom_left - h

                quad = [
                    [int(x), int(y)],
                    [int(x + w), int(y)],
                    [int(x + w), int(y + h)],
                    [int(x), int(y + h)]
                ]

                results.append((quad, text, confidence))

        request = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(completion_handler)
        request.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate)
        request.setUsesLanguageCorrection_(False)  # avoid "correcting" manga SFX/names
        request.setRecognitionLanguages_(self.langs)
        # NOTE(review): automatic language detection may override the explicit
        # language list above — confirm this combination is intentional.
        request.setAutomaticallyDetectsLanguage_(True)

        handler.performRequests_error_([request], None)
        return results

    def merge_multi_pass_results(self, all_results, original_shape):
        """Merge detections from multiple preprocessing passes.

        Args:
            all_results: List of (variant_name, results) pairs as produced
                by run_vision_ocr on each preprocessing variant.
            original_shape: Shape of the original image (kept for interface
                compatibility; currently unused).

        Returns:
            List of (quad, text, confidence) in original-image coordinates.
        """
        if not all_results:
            return []

        # Every variant was upscaled by 2.5x, so one uniform factor maps all
        # quads back to original-image coordinates.
        scale_factor = 2.5

        normalized_results = []
        for variant_name, results in all_results:
            for quad, text, conf in results:
                scaled_quad = [[int(p[0] / scale_factor), int(p[1] / scale_factor)] for p in quad]
                normalized_results.append((scaled_quad, text, conf, variant_name))

        def quads_overlap(q1, q2, threshold=0.5):
            """True if the axis-aligned bboxes of q1/q2 have IoU > threshold."""
            b1 = quad_bbox(q1)
            b2 = quad_bbox(q2)

            x1 = max(b1[0], b2[0])
            y1 = max(b1[1], b2[1])
            x2 = min(b1[2], b2[2])
            y2 = min(b1[3], b2[3])

            if x2 < x1 or y2 < y1:
                return False

            intersection = (x2 - x1) * (y2 - y1)
            area1 = (b1[2] - b1[0]) * (b1[3] - b1[1])
            area2 = (b2[2] - b2[0]) * (b2[3] - b2[1])
            union = area1 + area2 - intersection

            return intersection / max(union, 1) > threshold

        # Greedy clustering of overlapping detections across variants.
        clusters = []
        used = set()

        for i, (quad1, text1, conf1, var1) in enumerate(normalized_results):
            if i in used:
                continue

            cluster = [(quad1, text1, conf1, var1)]
            used.add(i)

            for j, (quad2, text2, conf2, var2) in enumerate(normalized_results):
                if j in used or i == j:
                    continue

                if quads_overlap(quad1, quad2, threshold=0.5):
                    cluster.append((quad2, text2, conf2, var2))
                    used.add(j)

            clusters.append(cluster)

        # Pick one detection per cluster: highest confidence wins unless a
        # different normalized text has more confidence-weighted support.
        final_results = []
        for cluster in clusters:
            cluster.sort(key=lambda x: x[2], reverse=True)
            best_quad, best_text, best_conf, _ = cluster[0]

            # Confidence-weighted vote over normalized text; also remember
            # the highest-confidence ORIGINAL string per normalized key.
            # (BUGFIX: the original assigned the normalized string itself to
            # best_text, discarding the detection's casing/punctuation.)
            text_votes = {}
            representative = {}
            for _, text, conf, _ in cluster:
                key = normalize_text(text)
                if key:
                    text_votes[key] = text_votes.get(key, 0) + conf
                    if key not in representative:
                        representative[key] = text  # cluster is conf-sorted

            if text_votes:
                winning_key = max(text_votes.items(), key=lambda kv: kv[1])[0]
                if winning_key != normalize_text(best_text):
                    best_text = representative[winning_key]

            # Apply deterministic OCR-error corrections last.
            best_text = fix_common_ocr_errors(best_text)

            final_results.append((best_quad, best_text, best_conf))

        return final_results

    def read(self, image_path_or_array):
        """Enhanced multi-pass OCR over a file path or BGR numpy array."""
        if isinstance(image_path_or_array, str):
            img = cv2.imread(image_path_or_array)
        else:
            img = image_path_or_array

        if img is None or img.size == 0:
            return []

        original_shape = img.shape

        # OCR each preprocessing variant, keeping only non-empty passes.
        variants = self.preprocess_variants(img)
        all_results = []
        for variant_name, variant_img in variants:
            results = self.run_vision_ocr(variant_img)
            if results:
                all_results.append((variant_name, results))

        # Merge and vote across passes.
        return self.merge_multi_pass_results(all_results, original_shape)
|
||||
|
||||
|
||||
# ============================================================
|
||||
# ORIGINAL OCR ENGINE (Fallback)
|
||||
# ============================================================
|
||||
class MacVisionDetector:
    """Single-pass Apple Vision OCR engine (fallback path).

    Produces (quad, text, confidence) tuples with quads as 4-point
    polygons in pixel coordinates, top-left origin.
    """

    def __init__(self, source_lang="en"):
        """Map a user-facing language code/name to an Apple Vision locale."""
        lang_key = source_lang.lower().strip()

        lang_map = {
            "en": "en-US", "english": "en-US",
            "es": "es-ES", "spanish": "es-ES",
            "ca": "ca-ES", "catalan": "ca-ES",
            "fr": "fr-FR", "french": "fr-FR",
            "ja": "ja-JP", "japanese": "ja-JP",
            "it": "it-IT", "italian": "it-IT",
            "de": "de-DE", "german": "de-DE",
            "ko": "ko-KR", "korean": "ko-KR",
            "zh": "zh-Hans", "chinese": "zh-Hans"
        }

        apple_lang = lang_map.get(lang_key, "en-US")  # default to US English
        self.langs = [apple_lang]
        print(f"⚡ Using Apple Vision OCR (Language: {self.langs[0]})")

    def read(self, image_path_or_array):
        """OCR an image given as a file path or a BGR numpy array."""
        if isinstance(image_path_or_array, str):
            img = cv2.imread(image_path_or_array)
        else:
            img = image_path_or_array

        if img is None or img.size == 0:
            return []

        ih, iw = img.shape[:2]

        success, buffer = cv2.imencode('.png', img)
        if not success:
            return []

        # Encode the PNG bytes once (the original called tobytes() twice).
        png_bytes = buffer.tobytes()
        ns_data = NSData.dataWithBytes_length_(png_bytes, len(png_bytes))
        handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(ns_data, None)
        results = []

        def completion_handler(request, error):
            if error:
                print(f"Vision API Error: {error}")
                return

            for observation in request.results():
                candidate = observation.topCandidates_(1)[0]
                text = candidate.string()
                confidence = candidate.confidence()

                # Vision reports normalized coordinates with a BOTTOM-left
                # origin; convert to pixel coordinates, top-left origin.
                bbox = observation.boundingBox()
                x = bbox.origin.x * iw
                y_bottom_left = bbox.origin.y * ih
                w = bbox.size.width * iw
                h = bbox.size.height * ih

                y = ih - y_bottom_left - h

                quad = [
                    [int(x), int(y)],
                    [int(x + w), int(y)],
                    [int(x + w), int(y + h)],
                    [int(x), int(y + h)]
                ]

                results.append((quad, text, confidence))

        request = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(completion_handler)
        request.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate)
        request.setUsesLanguageCorrection_(True)
        request.setRecognitionLanguages_(self.langs)

        handler.performRequests_error_([request], None)
        return results
|
||||
|
||||
|
||||
# ============================================================
|
||||
# SPLITTERS + QUAD NORMALIZATION
|
||||
# ============================================================
|
||||
@@ -960,84 +1340,6 @@ def merge_close_bubbles_by_line_height(bubbles, bubble_boxes, bubble_quads, bubb
|
||||
return out_b, out_bb, out_bq, out_bi
|
||||
|
||||
|
||||
# ============================================================
|
||||
# OCR ENGINES (Apple Native Vision)
|
||||
# ============================================================
|
||||
class MacVisionDetector:
    """Single-pass Apple Vision OCR engine.

    NOTE(review): this class is defined twice in this file; this later
    definition shadows the earlier, identical one — confirm which to keep.
    """

    def __init__(self, source_lang="en"):
        # Normalize the language argument ("English", " en " -> "en").
        lang_key = source_lang.lower().strip()

        # User-facing codes/names -> Apple Vision locale identifiers.
        lang_map = {
            "en": "en-US", "english": "en-US",
            "es": "es-ES", "spanish": "es-ES",
            "ca": "ca-ES", "catalan": "ca-ES",
            "fr": "fr-FR", "french": "fr-FR",
            "ja": "ja-JP", "japanese": "ja-JP",
            "it": "it-IT", "italian": "it-IT",
            "de": "de-DE", "german": "de-DE",
            "ko": "ko-KR", "korean": "ko-KR",
            "zh": "zh-Hans", "chinese": "zh-Hans"
        }

        apple_lang = lang_map.get(lang_key, "en-US")  # falls back to en-US
        self.langs = [apple_lang]
        print(f"⚡ Using Apple Vision OCR (Language: {self.langs[0]})")

    def read(self, image_path_or_array):
        """OCR an image (file path or BGR array); returns a list of
        (quad, text, confidence) tuples in pixel coordinates."""
        # Accept either a path or an already-loaded BGR array.
        if isinstance(image_path_or_array, str):
            img = cv2.imread(image_path_or_array)
        else:
            img = image_path_or_array

        if img is None or img.size == 0:
            return []

        ih, iw = img.shape[:2]

        # PNG-encode in memory for the Vision request handler.
        success, buffer = cv2.imencode('.png', img)
        if not success:
            return []

        ns_data = NSData.dataWithBytes_length_(buffer.tobytes(), len(buffer.tobytes()))
        handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(ns_data, None)
        results = []

        # Called synchronously by performRequests_error_ below; appends into
        # the enclosing `results` list.
        def completion_handler(request, error):
            if error:
                print(f"Vision API Error: {error}")
                return

            for observation in request.results():
                candidate = observation.topCandidates_(1)[0]
                text = candidate.string()
                confidence = candidate.confidence()

                # Vision reports normalized coordinates with a BOTTOM-left
                # origin; convert to pixel coordinates, top-left origin.
                bbox = observation.boundingBox()
                x = bbox.origin.x * iw
                y_bottom_left = bbox.origin.y * ih
                w = bbox.size.width * iw
                h = bbox.size.height * ih

                y = ih - y_bottom_left - h

                # Axis-aligned 4-point polygon, clockwise from top-left.
                quad = [
                    [int(x), int(y)],
                    [int(x + w), int(y)],
                    [int(x + w), int(y + h)],
                    [int(x), int(y + h)]
                ]

                results.append((quad, text, confidence))

        request = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(completion_handler)
        request.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate)
        request.setUsesLanguageCorrection_(True)
        request.setRecognitionLanguages_(self.langs)

        handler.performRequests_error_([request], None)
        return results
|
||||
|
||||
|
||||
# ============================================================
|
||||
# PREPROCESS
|
||||
# ============================================================
|
||||
@@ -1127,7 +1429,7 @@ def rebuild_text_from_vision_result(res):
|
||||
return normalize_text(" ".join(lines))
|
||||
|
||||
|
||||
def reread_bubble_with_vision(image_bgr, bbox_xyxy, vision_detector: MacVisionDetector, upscale=3.0, pad=24):
|
||||
def reread_bubble_with_vision(image_bgr, bbox_xyxy, vision_detector, upscale=3.0, pad=24):
|
||||
ih, iw = image_bgr.shape[:2]
|
||||
x1, y1, x2, y2 = bbox_xyxy
|
||||
x1 = max(0, int(x1 - pad)); y1 = max(0, int(y1 - pad))
|
||||
@@ -1148,7 +1450,13 @@ def reread_bubble_with_vision(image_bgr, bbox_xyxy, vision_detector: MacVisionDe
|
||||
proc3 = cv2.cvtColor(proc, cv2.COLOR_GRAY2BGR) if len(proc.shape) == 2 else proc
|
||||
for a in angles:
|
||||
rot = rotate_image_keep_bounds(proc3, a)
|
||||
|
||||
# Use run_vision_ocr if available (enhanced detector)
|
||||
if hasattr(vision_detector, 'run_vision_ocr'):
|
||||
res = vision_detector.run_vision_ocr(rot)
|
||||
else:
|
||||
res = vision_detector.read(rot)
|
||||
|
||||
txt = rebuild_text_from_vision_result(res)
|
||||
sc = ocr_candidate_score(txt)
|
||||
if sc > best_v_sc:
|
||||
@@ -1469,13 +1777,13 @@ def export_bubbles(filepath, bbox_dict, quads_dict, indices_dict, ocr, reading_m
|
||||
|
||||
|
||||
# ============================================================
# MAIN PIPELINE
# ============================================================
|
||||
def translate_manga_text(
|
||||
image_path="001-page.png",
|
||||
source_lang="en",
|
||||
target_lang="ca",
|
||||
confidence_threshold=0.05,
|
||||
confidence_threshold=0.03,
|
||||
min_text_length=1,
|
||||
gap_px="auto",
|
||||
filter_sound_effects=True,
|
||||
@@ -1483,7 +1791,8 @@ def translate_manga_text(
|
||||
export_to_file="output.txt",
|
||||
export_bubbles_to="bubbles.json",
|
||||
reading_mode="ltr",
|
||||
debug=True
|
||||
debug=True,
|
||||
use_enhanced_ocr=True
|
||||
):
|
||||
image = cv2.imread(image_path)
|
||||
if image is None:
|
||||
@@ -1493,12 +1802,49 @@ def translate_manga_text(
|
||||
resolved_gap = auto_gap(image_path) if gap_px == "auto" else float(gap_px)
|
||||
|
||||
print("Loading OCR engines...")
|
||||
|
||||
# Use enhanced detector
|
||||
if use_enhanced_ocr:
|
||||
detector = ImprovedMacVisionDetector(source_lang=source_lang)
|
||||
print("🚀 Using Enhanced Multi-Pass OCR")
|
||||
else:
|
||||
detector = MacVisionDetector(source_lang=source_lang)
|
||||
|
||||
print("Running detection OCR (Apple Vision)...")
|
||||
raw = detector.read(image_path)
|
||||
print(f"Raw detections: {len(raw)}")
|
||||
|
||||
# Secondary pass for missed regions
|
||||
if use_enhanced_ocr:
|
||||
existing_quads = [r[0] for r in raw]
|
||||
missed_regions = detect_small_text_regions(image, existing_quads)
|
||||
|
||||
if missed_regions:
|
||||
print(f"🔍 Found {len(missed_regions)} potentially missed text regions")
|
||||
|
||||
# Re-run OCR on missed regions with higher upscaling
|
||||
for region in missed_regions:
|
||||
x1, y1, x2, y2 = region
|
||||
# Add padding
|
||||
pad = 10
|
||||
x1 = max(0, x1 - pad)
|
||||
y1 = max(0, y1 - pad)
|
||||
x2 = min(image.shape[1], x2 + pad)
|
||||
y2 = min(image.shape[0], y2 + pad)
|
||||
|
||||
crop = image[y1:y2, x1:x2]
|
||||
if crop.size > 0:
|
||||
# Aggressive upscaling for small text
|
||||
upscaled = cv2.resize(crop, None, fx=4.0, fy=4.0, interpolation=cv2.INTER_CUBIC)
|
||||
region_results = detector.run_vision_ocr(upscaled)
|
||||
|
||||
# Scale back and offset coordinates
|
||||
for quad, text, conf in region_results:
|
||||
scaled_quad = [[int(p[0]/4.0 + x1), int(p[1]/4.0 + y1)] for p in quad]
|
||||
raw.append((scaled_quad, text, conf))
|
||||
|
||||
print(f"📝 Total detections after missed region scan: {len(raw)}")
|
||||
|
||||
filtered = []
|
||||
skipped = 0
|
||||
ih, iw = image.shape[:2]
|
||||
@@ -1533,7 +1879,7 @@ def translate_manga_text(
|
||||
if splits_made > 0:
|
||||
print(f"✂️ Split {splits_made} wide OCR lines across column gaps.")
|
||||
|
||||
# 2) split giant bridge quads (fixes page16 BOX19-like glue)
|
||||
# 2) split giant bridge quads
|
||||
filtered, bridge_splits = split_abnormal_bridge_quads(image, filtered)
|
||||
if bridge_splits > 0:
|
||||
print(f"🧩 Split {bridge_splits} abnormal bridge OCR quad(s).")
|
||||
@@ -1545,14 +1891,12 @@ def translate_manga_text(
|
||||
filtered, image.shape, gap_px=resolved_gap, bbox_padding=1
|
||||
)
|
||||
|
||||
# merge accidental sibling fragments (fixes page15 BOX11+BOX16 style)
|
||||
# merge accidental sibling fragments
|
||||
bubbles, bubble_boxes, bubble_quads, bubble_indices = merge_close_bubbles_by_line_height(
|
||||
bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered
|
||||
)
|
||||
|
||||
# ============================================================
|
||||
# APPLY PAGE-SPECIFIC FIXES (NEW)
|
||||
# ============================================================
|
||||
# Apply page-specific fixes
|
||||
page_identifier = os.path.basename(image_path)
|
||||
bubbles, bubble_boxes, bubble_quads, bubble_indices = apply_page_specific_fixes(
|
||||
bubbles, bubble_boxes, bubble_quads, bubble_indices,
|
||||
@@ -1760,16 +2104,17 @@ def translate_manga_text(
|
||||
|
||||
# Script entry point.  (BUGFIX: the previous version listed several keyword
# arguments twice — image_path, confidence_threshold, reading_mode, debug —
# which is a SyntaxError in Python; the duplicates are removed, keeping the
# newer value of each.)
if __name__ == "__main__":
    translate_manga_text(
        image_path="16.jpg",
        source_lang="english",
        target_lang="ca",
        confidence_threshold=0.03,  # lower threshold for better detection
        min_text_length=1,
        gap_px="auto",
        filter_sound_effects=True,
        quality_threshold=0.62,
        export_to_file="output.txt",
        export_bubbles_to="bubbles.json",
        reading_mode="ltr",  # "ltr" or "rtl"
        debug=True,
        use_enhanced_ocr=True,  # enable enhanced multi-pass OCR
    )
|
||||
Reference in New Issue
Block a user