Added all

2026-04-21 23:27:56 +02:00
parent 494631c967
commit 512bb32f66
1 changed files with 448 additions and 103 deletions
--- a/manga-translator.py
+++ b/manga-translator.py
@@ -32,12 +32,12 @@ GLOSSARY = {
 # Regular expressions describing onomatopoeia / sound-effect words.
 # is_sound_effect() lowercases the text, strips every character outside a-z,
 # and tests the remainder against each pattern with re.fullmatch() and
 # re.IGNORECASE. A trailing "+" only repeats the character directly before it
 # (e.g. r"^boom+$" matches "boom", "boommm", but not "booom").
 SOUND_EFFECT_PATTERNS = [
    r"^b+i+p+$", r"^sha+$", r"^ha+$", r"^ah+$",
    r"^ugh+$", r"^bam+$", r"^pow+$", r"^boom+$", r"^bang+$",
    # "^Grr+$" has an uppercase G; it still matches the lowercased input only
    # because is_sound_effect() passes re.IGNORECASE.
    r"^Grr+$", r"^grrp+$", r"^fshoo+$", r"^fwuip+$",
    r"^crash+$", r"^thud+$", r"^zip+$", r"^swoosh+$", r"^chirp+$"
 ]
 TITLE_PATTERNS = [
-    r"^(mission|chapter|episode|vol\.?|volume)\s*\d+$",
+    r"^(chapter|episode|vol\.?|volume)\s*\d+$",
    r"^(spy|family|spy.family)$",
    r"^by\s+.+$",
 ]
@@ -82,6 +82,22 @@ def postprocess_translation_general(text: str) -> str:
    return t
 def fix_common_ocr_errors(text: str) -> str:
    """Fix common OCR character confusions in manga text.

    Replacements applied, in order:

    * a capital ``O`` that directly follows a digit and is followed by any
      character other than an ASCII letter becomes ``0``
      (``1O5`` -> ``105``, ``5O!`` -> ``50!``);
    * ``|`` becomes ``I``;
    * a backtick becomes an apostrophe.

    Args:
        text: Raw OCR text.

    Returns:
        The corrected text.
    """
    # Lookarounds leave the neighbouring characters unconsumed, so every "O"
    # in an alternating run such as "1O2O3O!" is fixed. With consuming groups
    # an "O" whose left-hand digit was swallowed by the previous match is
    # skipped. Digits are themselves non-letters, so this one pattern covers
    # both the digit-O-digit and the digit-O-punctuation cases.
    result = re.sub(r'(?<=\d)O(?=[^a-zA-Z])', '0', text)
    # Fix common character confusions
    result = result.replace('|', 'I')
    result = result.replace('`', "'")
    return result
 def is_sound_effect(text: str) -> bool:
    """Return True when *text* fully matches one of SOUND_EFFECT_PATTERNS.

    The text is stripped, lowercased and reduced to the letters a-z before
    matching; ``None`` or empty input is treated as an empty string.
    """
    letters_only = re.sub(r"[^a-z]", "", (text or "").strip().lower())
    for pattern in SOUND_EFFECT_PATTERNS:
        if re.fullmatch(pattern, letters_only, re.IGNORECASE):
            return True
    return False
@@ -194,7 +210,80 @@ def ocr_candidate_score(text: str) -> float:
 # ============================================================
-# SPEECH BUBBLE DETECTION (NEW)
+# ENHANCED IMAGE PREPROCESSING
 # ============================================================
 def enhance_image_for_ocr(image_bgr, upscale_factor=2.5):
    """Build a cleaned, binarised and upscaled copy of an image for OCR.

    Pipeline: bicubic upscale -> grayscale -> non-local-means denoise ->
    CLAHE contrast -> 3x3 sharpen -> Gaussian adaptive threshold ->
    2x2 morphological close -> back to 3 channels.

    Args:
        image_bgr: Input image; converted with COLOR_BGR2GRAY, so a 3-channel
            BGR array is expected.
        upscale_factor: Multiplier applied to width and height first.

    Returns:
        The processed image converted back with COLOR_GRAY2BGR.
    """
    # Upscale before any filtering
    src_h, src_w = image_bgr.shape[:2]
    target_size = (int(src_w * upscale_factor), int(src_h * upscale_factor))
    enlarged = cv2.resize(image_bgr, target_size, interpolation=cv2.INTER_CUBIC)
    # Grayscale, then remove noise
    mono = cv2.cvtColor(enlarged, cv2.COLOR_BGR2GRAY)
    smooth = cv2.fastNlMeansDenoising(
        mono, None, h=10, templateWindowSize=7, searchWindowSize=21
    )
    # Local contrast boost (CLAHE)
    contrasted = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8)).apply(smooth)
    # Sharpening kernel: -1 everywhere with 9 in the centre
    sharpen_kernel = np.full((3, 3), -1)
    sharpen_kernel[1, 1] = 9
    crisp = cv2.filter2D(contrasted, -1, sharpen_kernel)
    # Adaptive threshold yields clean black/white text
    bw = cv2.adaptiveThreshold(
        crisp, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
    )
    # Close small gaps in the strokes
    closed = cv2.morphologyEx(bw, cv2.MORPH_CLOSE, np.ones((2, 2), np.uint8))
    # The Vision pipeline is fed 3-channel images
    return cv2.cvtColor(closed, cv2.COLOR_GRAY2BGR)
 def detect_small_text_regions(image_bgr, existing_quads):
    """Find small text-like blobs outside the already detected quads.

    Args:
        image_bgr: Page image (converted with COLOR_BGR2GRAY).
        existing_quads: Iterable of point lists; each polygon is masked out
            before the search.

    Returns:
        List of ``(x1, y1, x2, y2)`` bounding boxes whose area is strictly
        between 50 and 5000 pixels and whose height/width ratio is strictly
        between 0.1 and 10.
    """
    gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
    # Paint every known detection into a coverage mask
    covered = np.zeros(gray.shape, dtype=np.uint8)
    for quad in existing_quads:
        cv2.fillPoly(covered, [np.array(quad, dtype=np.int32)], 255)
    # Everything that is NOT covered is where we look
    uncovered = cv2.bitwise_not(covered)
    # Inverted Otsu threshold: dark ink becomes foreground
    _, ink = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    ink_uncovered = cv2.bitwise_and(ink, ink, mask=uncovered)
    contours, _ = cv2.findContours(
        ink_uncovered, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )

    def _is_text_like(box_w, box_h):
        # Size and aspect-ratio gate for a candidate bounding box
        return 50 < box_w * box_h < 5000 and 0.1 < box_h / max(box_w, 1) < 10

    rects = (cv2.boundingRect(contour) for contour in contours)
    return [
        (x, y, x + w, y + h)
        for x, y, w, h in rects
        if _is_text_like(w, h)
    ]
 # ============================================================
 # SPEECH BUBBLE DETECTION
 # ============================================================
 def detect_speech_bubbles(image_bgr: np.ndarray) -> List[np.ndarray]:
    """Detect speech bubble contours for box splitting"""
@@ -302,7 +391,7 @@ def check_vertical_alignment_split(indices: List[int],
 # ============================================================
-# BOX FIXING FUNCTIONS (NEW)
+# BOX FIXING FUNCTIONS
 # ============================================================
 def apply_page_specific_fixes(bubbles: Dict[int, List[str]],
                              bubble_boxes: Dict[int, Tuple],
@@ -311,12 +400,7 @@ def apply_page_specific_fixes(bubbles: Dict[int, List[str]],
                              ocr: List[Tuple],
                              image_bgr: np.ndarray,
                              page_identifier: str) -> Tuple[Dict, Dict, Dict, Dict]:
-    """
+    """Apply page-specific fixes to bubble detection issues"""
    Apply page-specific fixes to bubble detection issues
    Args:
        page_identifier: Base filename (e.g., "15", "16", "19")
    """
    # Detect speech bubbles for splitting logic
    bubble_contours = detect_speech_bubbles(image_bgr)
@@ -470,6 +554,302 @@ def apply_page_specific_fixes(bubbles: Dict[int, List[str]],
    return bubbles, bubble_boxes, bubble_quads, bubble_indices
 # ============================================================
 # ENHANCED OCR ENGINE
 # ============================================================
 class ImprovedMacVisionDetector:
    """Apple Vision OCR detector that merges several preprocessing passes.

    ``read`` builds five preprocessed, upscaled variants of the page, runs
    Vision text recognition on each, scales the detections back to the
    original image coordinates and merges overlapping detections.

    Attributes:
        langs: Single-element list with the Vision recognition language.
        upscale_factor: Scale applied to every variant before OCR and divided
            back out when the results are merged.
    """

    # Accepted language codes / English names -> Vision language identifiers.
    _LANG_MAP = {
        "en": "en-US", "english": "en-US",
        "es": "es-ES", "spanish": "es-ES",
        "ca": "ca-ES", "catalan": "ca-ES",
        "fr": "fr-FR", "french": "fr-FR",
        "ja": "ja-JP", "japanese": "ja-JP",
        "it": "it-IT", "italian": "it-IT",
        "de": "de-DE", "german": "de-DE",
        "ko": "ko-KR", "korean": "ko-KR",
        "zh": "zh-Hans", "chinese": "zh-Hans",
    }

    # Class-level default so the attribute exists even when __init__ is bypassed.
    upscale_factor = 2.5

    def __init__(self, source_lang="en", upscale_factor=2.5):
        """Select the recognition language and the variant upscale factor.

        Args:
            source_lang: Language code or English name; matched
                case-insensitively with surrounding whitespace ignored.
                Unknown, empty or ``None`` values fall back to ``"en-US"``.
            upscale_factor: Positive scale used for every preprocessing
                variant and for mapping results back to the original image.

        Raises:
            ValueError: If ``upscale_factor`` is not greater than zero.
        """
        upscale_factor = float(upscale_factor)
        if upscale_factor <= 0:
            raise ValueError("upscale_factor must be greater than zero")
        lang_key = (source_lang or "").lower().strip()
        self.langs = [self._LANG_MAP.get(lang_key, "en-US")]
        self.upscale_factor = upscale_factor
        print(f"⚡ Using Enhanced Apple Vision OCR (Language: {self.langs[0]})")

    def _upscale(self, image):
        """Resize *image* by ``self.upscale_factor`` using bicubic interpolation."""
        factor = self.upscale_factor
        return cv2.resize(image, None, fx=factor, fy=factor, interpolation=cv2.INTER_CUBIC)

    @staticmethod
    def _quads_overlap(q1, q2, threshold=0.5):
        """Return True when the bounding boxes of two quads have IoU > threshold."""
        b1 = quad_bbox(q1)
        b2 = quad_bbox(q2)
        x1 = max(b1[0], b2[0])
        y1 = max(b1[1], b2[1])
        x2 = min(b1[2], b2[2])
        y2 = min(b1[3], b2[3])
        if x2 < x1 or y2 < y1:
            return False
        intersection = (x2 - x1) * (y2 - y1)
        area1 = (b1[2] - b1[0]) * (b1[3] - b1[1])
        area2 = (b2[2] - b2[0]) * (b2[3] - b2[1])
        union = area1 + area2 - intersection
        # max(..., 1) avoids dividing by zero for degenerate boxes
        return intersection / max(union, 1) > threshold

    def preprocess_variants(self, image_bgr):
        """Generate the preprocessing variants fed to the OCR passes.

        Every variant is enlarged by ``self.upscale_factor`` so that
        ``merge_multi_pass_results`` can map all of them back with one scale.

        Returns:
            List of ``(name, image)`` tuples in the order: enhanced,
            high_contrast, bilateral, inverted, original.
        """
        variants = []
        # Variant 1: full enhancement pipeline
        variants.append(
            ("enhanced", enhance_image_for_ocr(image_bgr, upscale_factor=self.upscale_factor))
        )
        # Variant 2: Otsu-binarised high contrast
        gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
        _, high_contrast = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        variants.append(
            ("high_contrast", cv2.cvtColor(self._upscale(high_contrast), cv2.COLOR_GRAY2BGR))
        )
        # Variant 3: bilateral filter (preserves edges)
        variants.append(("bilateral", self._upscale(cv2.bilateralFilter(image_bgr, 9, 75, 75))))
        # Variant 4: inverted (for white text on black)
        variants.append(("inverted", self._upscale(cv2.bitwise_not(image_bgr))))
        # Variant 5: plain upscale of the original
        variants.append(("original", self._upscale(image_bgr)))
        return variants

    def run_vision_ocr(self, image_bgr):
        """Run Vision text recognition on a single image.

        Args:
            image_bgr: Image array; ``None`` or an empty array yields ``[]``.

        Returns:
            List of ``(quad, text, confidence)`` tuples. ``quad`` holds the
            four corners (top-left, top-right, bottom-right, bottom-left) in
            pixel coordinates of ``image_bgr`` with a top-left origin.
        """
        if image_bgr is None or image_bgr.size == 0:
            return []
        ih, iw = image_bgr.shape[:2]
        success, buffer = cv2.imencode('.png', image_bgr)
        if not success:
            return []
        # Serialise once; the byte string is needed for both data and length.
        png_bytes = buffer.tobytes()
        ns_data = NSData.dataWithBytes_length_(png_bytes, len(png_bytes))
        handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(ns_data, None)
        results = []

        def completion_handler(request, error):
            if error:
                # Report instead of silently dropping the pass, matching the
                # fallback MacVisionDetector.
                print(f"Vision API Error: {error}")
                return
            for observation in request.results() or []:
                candidates = observation.topCandidates_(1)
                if not candidates:
                    # No recognised string for this observation
                    continue
                candidate = candidates[0]
                text = candidate.string()
                confidence = candidate.confidence()
                bbox = observation.boundingBox()
                # boundingBox is scaled by the image size here; its y is
                # treated as measured from the bottom edge, so it is flipped.
                x = bbox.origin.x * iw
                y_bottom_left = bbox.origin.y * ih
                w = bbox.size.width * iw
                h = bbox.size.height * ih
                y = ih - y_bottom_left - h
                quad = [
                    [int(x), int(y)],
                    [int(x + w), int(y)],
                    [int(x + w), int(y + h)],
                    [int(x), int(y + h)]
                ]
                results.append((quad, text, confidence))

        request = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(completion_handler)
        request.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate)
        request.setUsesLanguageCorrection_(False)  # Disable for manga
        request.setRecognitionLanguages_(self.langs)
        request.setAutomaticallyDetectsLanguage_(True)
        handler.performRequests_error_([request], None)
        return results

    def merge_multi_pass_results(self, all_results, original_shape):
        """Merge the detections of several preprocessing passes.

        Args:
            all_results: List of ``(variant_name, results)`` where ``results``
                is the output of ``run_vision_ocr`` on an image enlarged by
                ``self.upscale_factor``.
            original_shape: Shape of the original image. Currently unused;
                kept for interface compatibility.

        Returns:
            List of ``(quad, text, confidence)`` in original-image
            coordinates, one entry per cluster of overlapping detections.
            Quad and confidence come from the highest-confidence detection
            of the cluster; confidence is not modified by the vote.
        """
        if not all_results:
            return []
        scale = self.upscale_factor
        # Bring every quad back to original-image coordinates
        normalized_results = [
            ([[int(p[0] / scale), int(p[1] / scale)] for p in quad], text, conf, variant_name)
            for variant_name, results in all_results
            for quad, text, conf in results
        ]
        # Greedy clustering: each unused detection seeds a cluster and absorbs
        # every later unused detection overlapping the seed. Earlier indices
        # are always used already, so the scan starts after the seed.
        clusters = []
        used = set()
        total = len(normalized_results)
        for i, seed in enumerate(normalized_results):
            if i in used:
                continue
            used.add(i)
            cluster = [seed]
            for j in range(i + 1, total):
                if j in used:
                    continue
                other = normalized_results[j]
                if self._quads_overlap(seed[0], other[0], threshold=0.5):
                    cluster.append(other)
                    used.add(j)
            clusters.append(cluster)
        # Pick one result per cluster
        final_results = []
        for cluster in clusters:
            cluster.sort(key=lambda det: det[2], reverse=True)
            best_quad, best_text, best_conf, _best_var = cluster[0]
            # Confidence-weighted vote over the normalized texts
            text_votes = {}
            for _quad, text, conf, _var in cluster:
                normalized = normalize_text(text)
                if normalized:
                    text_votes[normalized] = text_votes.get(normalized, 0) + conf
            if text_votes:
                best_voted_text = max(text_votes, key=text_votes.get)
                if best_voted_text != normalize_text(best_text):
                    # NOTE(review): the winning text is the normalize_text()
                    # form, not a raw OCR string - confirm this is intended.
                    best_text = best_voted_text
            best_text = fix_common_ocr_errors(best_text)
            final_results.append((best_quad, best_text, best_conf))
        return final_results

    def read(self, image_path_or_array):
        """Run the multi-pass OCR on an image path or an image array.

        Returns:
            Merged ``(quad, text, confidence)`` list, or ``[]`` when the image
            cannot be loaded or is empty.
        """
        if isinstance(image_path_or_array, str):
            img = cv2.imread(image_path_or_array)
        else:
            img = image_path_or_array
        if img is None or img.size == 0:
            return []
        # One OCR pass per variant; passes without detections are dropped
        all_results = []
        for variant_name, variant_img in self.preprocess_variants(img):
            results = self.run_vision_ocr(variant_img)
            if results:
                all_results.append((variant_name, results))
        return self.merge_multi_pass_results(all_results, img.shape)
 # ============================================================
 # ORIGINAL OCR ENGINE (Fallback)
 # ============================================================
 class MacVisionDetector:
    """Single-pass Apple Vision OCR detector (fallback engine).

    Attributes:
        langs: Single-element list with the Vision recognition language.
    """

    def __init__(self, source_lang="en"):
        """Map *source_lang* to a Vision language; unknown values use en-US.

        Args:
            source_lang: Language code or English name, matched
                case-insensitively with surrounding whitespace ignored.
        """
        lang_key = source_lang.lower().strip()
        lang_map = {
            "en": "en-US", "english": "en-US",
            "es": "es-ES", "spanish": "es-ES",
            "ca": "ca-ES", "catalan": "ca-ES",
            "fr": "fr-FR", "french": "fr-FR",
            "ja": "ja-JP", "japanese": "ja-JP",
            "it": "it-IT", "italian": "it-IT",
            "de": "de-DE", "german": "de-DE",
            "ko": "ko-KR", "korean": "ko-KR",
            "zh": "zh-Hans", "chinese": "zh-Hans"
        }
        apple_lang = lang_map.get(lang_key, "en-US")
        self.langs = [apple_lang]
        print(f"⚡ Using Apple Vision OCR (Language: {self.langs[0]})")

    def read(self, image_path_or_array):
        """Run Vision text recognition on an image path or image array.

        Returns:
            List of ``(quad, text, confidence)`` tuples, where ``quad`` holds
            the four corners (top-left, top-right, bottom-right, bottom-left)
            in pixel coordinates with a top-left origin. ``[]`` is returned
            when the image cannot be loaded, is empty or cannot be encoded.
        """
        if isinstance(image_path_or_array, str):
            img = cv2.imread(image_path_or_array)
        else:
            img = image_path_or_array
        if img is None or img.size == 0:
            return []
        ih, iw = img.shape[:2]
        success, buffer = cv2.imencode('.png', img)
        if not success:
            return []
        # Serialise once; the byte string is needed for both data and length.
        png_bytes = buffer.tobytes()
        ns_data = NSData.dataWithBytes_length_(png_bytes, len(png_bytes))
        handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(ns_data, None)
        results = []

        def completion_handler(request, error):
            if error:
                print(f"Vision API Error: {error}")
                return
            for observation in request.results() or []:
                candidates = observation.topCandidates_(1)
                if not candidates:
                    # No recognised string for this observation
                    continue
                candidate = candidates[0]
                text = candidate.string()
                confidence = candidate.confidence()
                bbox = observation.boundingBox()
                # boundingBox is scaled by the image size here; its y is
                # treated as measured from the bottom edge, so it is flipped.
                x = bbox.origin.x * iw
                y_bottom_left = bbox.origin.y * ih
                w = bbox.size.width * iw
                h = bbox.size.height * ih
                y = ih - y_bottom_left - h
                quad = [
                    [int(x), int(y)],
                    [int(x + w), int(y)],
                    [int(x + w), int(y + h)],
                    [int(x), int(y + h)]
                ]
                results.append((quad, text, confidence))

        request = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(completion_handler)
        request.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate)
        request.setUsesLanguageCorrection_(True)
        request.setRecognitionLanguages_(self.langs)
        handler.performRequests_error_([request], None)
        return results
 # ============================================================
 # SPLITTERS + QUAD NORMALIZATION
 # ============================================================
@@ -960,84 +1340,6 @@ def merge_close_bubbles_by_line_height(bubbles, bubble_boxes, bubble_quads, bubb
    return out_b, out_bb, out_bq, out_bi
 # ============================================================
 # OCR ENGINES (Apple Native Vision)
 # ============================================================
 class MacVisionDetector:
    """Single-pass Apple Vision OCR detector."""
    def __init__(self, source_lang="en"):
        """Map *source_lang* (code or English name) to a Vision language.

        The lookup is case-insensitive and ignores surrounding whitespace;
        unknown values fall back to "en-US".
        """
        lang_key = source_lang.lower().strip()
        lang_map = {
            "en": "en-US", "english": "en-US",
            "es": "es-ES", "spanish": "es-ES",
            "ca": "ca-ES", "catalan": "ca-ES",
            "fr": "fr-FR", "french": "fr-FR",
            "ja": "ja-JP", "japanese": "ja-JP",
            "it": "it-IT", "italian": "it-IT",
            "de": "de-DE", "german": "de-DE",
            "ko": "ko-KR", "korean": "ko-KR",
            "zh": "zh-Hans", "chinese": "zh-Hans"
        }
        apple_lang = lang_map.get(lang_key, "en-US")
        self.langs = [apple_lang]
        print(f"⚡ Using Apple Vision OCR (Language: {self.langs[0]})")
    def read(self, image_path_or_array):
        """Run Vision text recognition on an image path or image array.

        Returns a list of (quad, text, confidence) tuples in pixel
        coordinates with a top-left origin, or [] when the image cannot be
        loaded, is empty or cannot be PNG-encoded.
        """
        if isinstance(image_path_or_array, str):
            img = cv2.imread(image_path_or_array)
        else:
            img = image_path_or_array
        if img is None or img.size == 0:
            return []
        ih, iw = img.shape[:2]
        # The image is handed to Vision as in-memory PNG data
        success, buffer = cv2.imencode('.png', img)
        if not success:
            return []
        ns_data = NSData.dataWithBytes_length_(buffer.tobytes(), len(buffer.tobytes()))
        handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(ns_data, None)
        results = []
        # Appends one (quad, text, confidence) tuple per observation to `results`
        def completion_handler(request, error):
            if error:
                print(f"Vision API Error: {error}")
                return
            for observation in request.results():
                # NOTE(review): [0] assumes at least one candidate per observation
                candidate = observation.topCandidates_(1)[0]
                text = candidate.string()
                confidence = candidate.confidence()
                bbox = observation.boundingBox()
                # Scale the bounding box by the image size; y is flipped from
                # a bottom-based value to a top-based one.
                x = bbox.origin.x * iw
                y_bottom_left = bbox.origin.y * ih
                w = bbox.size.width * iw
                h = bbox.size.height * ih
                y = ih - y_bottom_left - h
                # Corners: top-left, top-right, bottom-right, bottom-left
                quad = [
                    [int(x), int(y)],
                    [int(x + w), int(y)],
                    [int(x + w), int(y + h)],
                    [int(x), int(y + h)]
                ]
                results.append((quad, text, confidence))
        request = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(completion_handler)
        request.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate)
        request.setUsesLanguageCorrection_(True)
        request.setRecognitionLanguages_(self.langs)
        handler.performRequests_error_([request], None)
        return results
 # ============================================================
 # PREPROCESS
 # ============================================================
@@ -1127,7 +1429,7 @@ def rebuild_text_from_vision_result(res):
    return normalize_text(" ".join(lines))
-def reread_bubble_with_vision(image_bgr, bbox_xyxy, vision_detector: MacVisionDetector, upscale=3.0, pad=24):
+def reread_bubble_with_vision(image_bgr, bbox_xyxy, vision_detector, upscale=3.0, pad=24):
    ih, iw = image_bgr.shape[:2]
    x1, y1, x2, y2 = bbox_xyxy
    x1 = max(0, int(x1 - pad)); y1 = max(0, int(y1 - pad))
@@ -1148,7 +1450,13 @@ def reread_bubble_with_vision(image_bgr, bbox_xyxy, vision_detector: MacVisionDe
        proc3 = cv2.cvtColor(proc, cv2.COLOR_GRAY2BGR) if len(proc.shape) == 2 else proc
        for a in angles:
            rot = rotate_image_keep_bounds(proc3, a)
            # Use run_vision_ocr if available (enhanced detector)
            if hasattr(vision_detector, 'run_vision_ocr'):
                res = vision_detector.run_vision_ocr(rot)
            else:
                res = vision_detector.read(rot)
            txt = rebuild_text_from_vision_result(res)
            sc = ocr_candidate_score(txt)
            if sc > best_v_sc:
@@ -1469,13 +1777,13 @@ def export_bubbles(filepath, bbox_dict, quads_dict, indices_dict, ocr, reading_m
 # ============================================================
-# PIPELINE
+# MAIN PIPELINE
 # ============================================================
 def translate_manga_text(
    image_path="001-page.png",
    source_lang="en",
    target_lang="ca",
-    confidence_threshold=0.05,
+    confidence_threshold=0.03,
    min_text_length=1,
    gap_px="auto",
    filter_sound_effects=True,
@@ -1483,7 +1791,8 @@ def translate_manga_text(
    export_to_file="output.txt",
    export_bubbles_to="bubbles.json",
    reading_mode="ltr",
-    debug=True
+    debug=True,
    use_enhanced_ocr=True
 ):
    image = cv2.imread(image_path)
    if image is None:
@@ -1493,12 +1802,49 @@ def translate_manga_text(
    resolved_gap = auto_gap(image_path) if gap_px == "auto" else float(gap_px)
    print("Loading OCR engines...")
    # Use enhanced detector
    if use_enhanced_ocr:
        detector = ImprovedMacVisionDetector(source_lang=source_lang)
        print("🚀 Using Enhanced Multi-Pass OCR")
    else:
        detector = MacVisionDetector(source_lang=source_lang)
    print("Running detection OCR (Apple Vision)...")
    raw = detector.read(image_path)
    print(f"Raw detections: {len(raw)}")
    # Secondary pass for missed regions
    if use_enhanced_ocr:
        existing_quads = [r[0] for r in raw]
        missed_regions = detect_small_text_regions(image, existing_quads)
        if missed_regions:
            print(f"🔍 Found {len(missed_regions)} potentially missed text regions")
            # Re-run OCR on missed regions with higher upscaling
            for region in missed_regions:
                x1, y1, x2, y2 = region
                # Add padding
                pad = 10
                x1 = max(0, x1 - pad)
                y1 = max(0, y1 - pad)
                x2 = min(image.shape[1], x2 + pad)
                y2 = min(image.shape[0], y2 + pad)
                crop = image[y1:y2, x1:x2]
                if crop.size > 0:
                    # Aggressive upscaling for small text
                    upscaled = cv2.resize(crop, None, fx=4.0, fy=4.0, interpolation=cv2.INTER_CUBIC)
                    region_results = detector.run_vision_ocr(upscaled)
                    # Scale back and offset coordinates
                    for quad, text, conf in region_results:
                        scaled_quad = [[int(p[0]/4.0 + x1), int(p[1]/4.0 + y1)] for p in quad]
                        raw.append((scaled_quad, text, conf))
            print(f"📝 Total detections after missed region scan: {len(raw)}")
    filtered = []
    skipped = 0
    ih, iw = image.shape[:2]
@@ -1533,7 +1879,7 @@ def translate_manga_text(
    if splits_made > 0:
        print(f"✂️  Split {splits_made} wide OCR lines across column gaps.")
-    # 2) split giant bridge quads (fixes page16 BOX19-like glue)
+    # 2) split giant bridge quads
    filtered, bridge_splits = split_abnormal_bridge_quads(image, filtered)
    if bridge_splits > 0:
        print(f"🧩 Split {bridge_splits} abnormal bridge OCR quad(s).")
@@ -1545,14 +1891,12 @@ def translate_manga_text(
        filtered, image.shape, gap_px=resolved_gap, bbox_padding=1
    )
-    # merge accidental sibling fragments (fixes page15 BOX11+BOX16 style)
+    # merge accidental sibling fragments
    bubbles, bubble_boxes, bubble_quads, bubble_indices = merge_close_bubbles_by_line_height(
        bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered
    )
-    # ============================================================
+    # Apply page-specific fixes
    # APPLY PAGE-SPECIFIC FIXES (NEW)
    # ============================================================
    page_identifier = os.path.basename(image_path)
    bubbles, bubble_boxes, bubble_quads, bubble_indices = apply_page_specific_fixes(
        bubbles, bubble_boxes, bubble_quads, bubble_indices, 
@@ -1760,16 +2104,17 @@ def translate_manga_text(
 if __name__ == "__main__":
    translate_manga_text(
-        image_path="15.png",
+        image_path="16.jpg",
        source_lang="english",
        target_lang="ca",
-        confidence_threshold=0.05,
+        confidence_threshold=0.03,  # Lower threshold for better detection
        min_text_length=1,
        gap_px="auto",
        filter_sound_effects=True,
        quality_threshold=0.62,
        export_to_file="output.txt",
        export_bubbles_to="bubbles.json",
-        reading_mode="rtl",
+        reading_mode="ltr",  # "ltr" or "rtl"
-        debug=True
+        debug=True,
        use_enhanced_ocr=True  # Enable enhanced multi-pass OCR
    )