# Onomatopoeia patterns. ``is_sound_effect`` matches them with ``re.fullmatch``
# against text that has already been lower-cased and stripped to a-z, so every
# pattern is written in lower case.
SOUND_EFFECT_PATTERNS = [
    r"^b+i+p+$", r"^sha+$", r"^ha+$", r"^ah+$", r"^ugh+$",
    r"^bam+$", r"^pow+$", r"^boom+$", r"^bang+$",
    r"^grr+$", r"^grrp+$", r"^fshoo+$", r"^fwuip+$", r"^crash+$", r"^thud+$",
    r"^zip+$", r"^swoosh+$", r"^chirp+$",
]

# Patterns for title / credit lines.
TITLE_PATTERNS = [
    r"^(chapter|episode|vol\.?|volume)\s*\d+$",
    r"^by\s+.+$",
]


def fix_common_ocr_errors(text: str) -> str:
    """Fix common OCR character confusions in manga text.

    Three substitutions are applied:

    * a capital ``O`` directly after a digit becomes ``0`` unless it is
      followed by an ASCII letter (``"1O2"`` -> ``"102"``, ``"2O kg"`` ->
      ``"20 kg"``, ``"Room 1O"`` -> ``"Room 10"``; ``"3Oz"`` is left alone);
    * ``|`` becomes ``I``;
    * a backtick becomes an apostrophe.

    Args:
        text: Raw OCR text. ``None`` or an empty string is accepted.

    Returns:
        The corrected text, or ``""`` when *text* is empty or ``None``.
    """
    if not text:
        return ""

    # Lookarounds keep the neighbouring characters out of the match, so
    # adjacent fixes ("1O2O3") are all handled in one pass, and an "O" at the
    # very end of the string is fixed as well.
    fixed = re.sub(r"(?<=\d)O(?![a-zA-Z])", "0", text)

    return fixed.replace("|", "I").replace("`", "'")


def is_sound_effect(text: str) -> bool:
    """Return True when *text* looks like an onomatopoeia.

    The text is stripped, lower-cased and reduced to the letters a-z before it
    is matched in full against each entry of ``SOUND_EFFECT_PATTERNS``.
    """
    cleaned = re.sub(r"[^a-z]", "", (text or "").strip().lower())
    return any(re.fullmatch(p, cleaned, re.IGNORECASE) for p in SOUND_EFFECT_PATTERNS)


# ============================================================
# ENHANCED IMAGE PREPROCESSING
# ============================================================
def enhance_image_for_ocr(image_bgr, upscale_factor=2.5):
    """Build a cleaned-up, binarised and upscaled copy of an image for OCR.

    Pipeline: cubic upscale -> grayscale -> non-local-means denoise -> CLAHE
    contrast boost -> 3x3 sharpen -> Gaussian adaptive threshold -> 2x2
    morphological close.

    Args:
        image_bgr: Input image as a NumPy array; a 3-channel BGR image or a
            single-channel (2-D) grayscale image.
        upscale_factor: Multiplier applied to both width and height.

    Returns:
        A 3-channel BGR image of size
        ``(int(h * upscale_factor), int(w * upscale_factor))``.
    """
    # Upscale first so the later filters work on larger glyphs.
    h, w = image_bgr.shape[:2]
    new_w = int(w * upscale_factor)
    new_h = int(h * upscale_factor)
    upscaled = cv2.resize(image_bgr, (new_w, new_h), interpolation=cv2.INTER_CUBIC)

    # A 2-D array is already grayscale; cvtColor(BGR2GRAY) would reject it.
    if upscaled.ndim == 2:
        gray = upscaled
    else:
        gray = cv2.cvtColor(upscaled, cv2.COLOR_BGR2GRAY)

    denoised = cv2.fastNlMeansDenoising(
        gray, None, h=10, templateWindowSize=7, searchWindowSize=21
    )

    # Local contrast enhancement.
    clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
    enhanced = clahe.apply(denoised)

    # The kernel sums to 1, so overall brightness is preserved while edges
    # are emphasised.
    kernel_sharpen = np.array([[-1, -1, -1],
                               [-1, 9, -1],
                               [-1, -1, -1]])
    sharpened = cv2.filter2D(enhanced, -1, kernel_sharpen)

    binary = cv2.adaptiveThreshold(
        sharpened, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
    )

    # Morphological close with a small kernel to tidy the binarised result.
    kernel = np.ones((2, 2), np.uint8)
    cleaned = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)

    # Back to three channels for the OCR call.
    return cv2.cvtColor(cleaned, cv2.COLOR_GRAY2BGR)


def detect_small_text_regions(image_bgr, existing_quads):
    """Find small dark blobs outside the areas already covered by OCR quads.

    Args:
        image_bgr: Page image as a NumPy array (3-channel BGR or 2-D gray).
        existing_quads: Iterable of quads (sequences of ``[x, y]`` points)
            already detected; their interiors are excluded from the search.

    Returns:
        A list of ``(x1, y1, x2, y2)`` bounding boxes whose area is strictly
        between 50 and 5000 pixels and whose height/width ratio is strictly
        between 0.1 and 10.
    """
    if image_bgr.ndim == 2:
        gray = image_bgr
    else:
        gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)

    # Paint every known detection white, then invert to get "not yet covered".
    covered = np.zeros(gray.shape, dtype=np.uint8)
    for quad in existing_quads:
        pts = np.array(quad, dtype=np.int32)
        cv2.fillPoly(covered, [pts], 255)
    uncovered = cv2.bitwise_not(covered)

    # Inverse Otsu threshold: dark pixels (ink) become foreground.
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    binary_masked = cv2.bitwise_and(binary, binary, mask=uncovered)

    contours, _ = cv2.findContours(
        binary_masked, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )

    text_regions = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        area = w * h

        # Keep blobs within the size and aspect-ratio limits.
        if 50 < area < 5000 and 0.1 < h / max(w, 1) < 10:
            text_regions.append((x, y, x + w, y + h))

    return text_regions


# ============================================================
# SPEECH BUBBLE DETECTION
# ============================================================
# ============================================================
# ENHANCED OCR ENGINE
# ============================================================
# Maps user-facing language names/codes to the identifiers passed to
# ``VNRecognizeTextRequest.setRecognitionLanguages_``.
_APPLE_LANG_MAP = {
    "en": "en-US", "english": "en-US",
    "es": "es-ES", "spanish": "es-ES",
    "ca": "ca-ES", "catalan": "ca-ES",
    "fr": "fr-FR", "french": "fr-FR",
    "ja": "ja-JP", "japanese": "ja-JP",
    "it": "it-IT", "italian": "it-IT",
    "de": "de-DE", "german": "de-DE",
    "ko": "ko-KR", "korean": "ko-KR",
    "zh": "zh-Hans", "chinese": "zh-Hans",
}


def _resolve_apple_lang(source_lang):
    """Translate a language name/code into an Apple identifier ("en-US" fallback)."""
    key = (source_lang or "en").lower().strip()
    return _APPLE_LANG_MAP.get(key, "en-US")


def _recognize_text_with_vision(image_bgr, langs, use_language_correction,
                                auto_detect_language=False):
    """Run one Apple Vision text-recognition request on an in-memory image.

    Args:
        image_bgr: Image as a NumPy array, or ``None``.
        langs: List of recognition language identifiers.
        use_language_correction: Value for ``setUsesLanguageCorrection_``.
        auto_detect_language: When true, also enable automatic language
            detection if the installed Vision framework exposes that setter.

    Returns:
        A list of ``(quad, text, confidence)`` tuples. ``quad`` is four
        ``[x, y]`` integer points (top-left, top-right, bottom-right,
        bottom-left) in pixel coordinates of *image_bgr* with a top-left
        origin. An empty list is returned for empty input, on encoding
        failure, or when Vision reports an error.
    """
    if image_bgr is None or image_bgr.size == 0:
        return []

    ih, iw = image_bgr.shape[:2]

    success, buffer = cv2.imencode('.png', image_bgr)
    if not success:
        return []

    png_bytes = buffer.tobytes()  # serialise once; used for data and length
    ns_data = NSData.dataWithBytes_length_(png_bytes, len(png_bytes))
    handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(ns_data, None)
    results = []

    def completion_handler(request, error):
        if error:
            print(f"Vision API Error: {error}")
            return

        for observation in request.results() or []:
            candidates = observation.topCandidates_(1)
            if candidates is None or len(candidates) == 0:
                continue
            candidate = candidates[0]

            # Vision boxes are normalised with a bottom-left origin; convert
            # to pixels and flip the y axis to the top-left origin OpenCV uses.
            bbox = observation.boundingBox()
            x = bbox.origin.x * iw
            w = bbox.size.width * iw
            h = bbox.size.height * ih
            y = ih - bbox.origin.y * ih - h

            quad = [
                [int(x), int(y)],
                [int(x + w), int(y)],
                [int(x + w), int(y + h)],
                [int(x), int(y + h)],
            ]
            results.append((quad, candidate.string(), candidate.confidence()))

    request = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(completion_handler)
    request.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate)
    request.setUsesLanguageCorrection_(use_language_correction)
    request.setRecognitionLanguages_(langs)
    # The automatic-language setter only exists on newer macOS releases.
    if auto_detect_language and hasattr(request, "setAutomaticallyDetectsLanguage_"):
        request.setAutomaticallyDetectsLanguage_(True)

    handler.performRequests_error_([request], None)
    return results


class ImprovedMacVisionDetector:
    """Multi-pass Apple Vision OCR.

    The image is preprocessed into several upscaled variants, OCR is run on
    each, and overlapping detections are merged by confidence-weighted voting.
    """

    # Class-level default so instances created without __init__ still work.
    upscale_factor = 2.5

    def __init__(self, source_lang="en", upscale_factor=2.5):
        """Store the recognition language and the variant upscale factor.

        Args:
            source_lang: Language name or code (e.g. ``"en"``, ``"japanese"``);
                unknown values fall back to ``"en-US"``.
            upscale_factor: Scale applied to every preprocessing variant and
                undone again when results are merged.
        """
        self.langs = [_resolve_apple_lang(source_lang)]
        self.upscale_factor = float(upscale_factor)
        print(f"⚡ Using Enhanced Apple Vision OCR (Language: {self.langs[0]})")

    def preprocess_variants(self, image_bgr):
        """Return ``[(name, image)]`` preprocessing variants, all upscaled equally.

        Variants: ``enhanced`` (``enhance_image_for_ocr``), ``high_contrast``
        (Otsu threshold), ``bilateral`` (edge-preserving smoothing),
        ``inverted`` (for light text on dark) and ``original``.
        """
        scale = self.upscale_factor

        # The colour conversions below need three channels.
        if image_bgr.ndim == 2:
            image_bgr = cv2.cvtColor(image_bgr, cv2.COLOR_GRAY2BGR)

        def upscale(img):
            return cv2.resize(img, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)

        gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
        _, high_contrast = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

        return [
            ("enhanced", enhance_image_for_ocr(image_bgr, upscale_factor=scale)),
            ("high_contrast", cv2.cvtColor(upscale(high_contrast), cv2.COLOR_GRAY2BGR)),
            ("bilateral", upscale(cv2.bilateralFilter(image_bgr, 9, 75, 75))),
            ("inverted", upscale(cv2.bitwise_not(image_bgr))),
            ("original", upscale(image_bgr)),
        ]

    def run_vision_ocr(self, image_bgr):
        """Run a single Vision OCR pass on *image_bgr*.

        Language correction is disabled and automatic language detection is
        requested when available. Returns ``(quad, text, confidence)`` tuples
        in the coordinates of *image_bgr*.
        """
        return _recognize_text_with_vision(
            image_bgr,
            self.langs,
            use_language_correction=False,  # disabled for manga text
            auto_detect_language=True,
        )

    @staticmethod
    def _bbox_iou(b1, b2):
        """Intersection-over-union of two ``(x1, y1, x2, y2)`` boxes (0.0 if disjoint)."""
        x1 = max(b1[0], b2[0])
        y1 = max(b1[1], b2[1])
        x2 = min(b1[2], b2[2])
        y2 = min(b1[3], b2[3])
        if x2 < x1 or y2 < y1:
            return 0.0

        intersection = (x2 - x1) * (y2 - y1)
        area1 = (b1[2] - b1[0]) * (b1[3] - b1[1])
        area2 = (b2[2] - b2[0]) * (b2[3] - b2[1])
        union = area1 + area2 - intersection
        return intersection / max(union, 1)  # guard against zero-area boxes

    @staticmethod
    def _select_cluster_text(cluster, normalize):
        """Pick the text for a cluster by confidence-weighted voting.

        Args:
            cluster: Non-empty list of ``(quad, text, conf, variant)`` tuples
                sorted by confidence, highest first.
            normalize: Callable mapping raw text to the key used to decide
                whether two readings agree.

        Returns:
            The raw text of the most confident member whose normalised
            reading collected the highest summed confidence.
        """
        top_text = cluster[0][1]

        votes = {}
        for _, text, conf, _ in cluster:
            key = normalize(text)
            if key:
                votes[key] = votes.get(key, 0) + conf
        if not votes:
            return top_text

        winner = max(votes.items(), key=lambda kv: kv[1])[0]
        if winner == normalize(top_text):
            return top_text

        # Return a raw reading, not the vote key, so every cluster yields
        # text in the same (un-normalised) form.
        for _, text, _, _ in cluster:
            if normalize(text) == winner:
                return text
        return top_text

    def merge_multi_pass_results(self, all_results, original_shape):
        """Merge per-variant OCR results into one detection list.

        Args:
            all_results: List of ``(variant_name, results)`` where ``results``
                is a list of ``(quad, text, confidence)`` in upscaled
                coordinates.
            original_shape: Shape of the source image. Currently unused; kept
                so existing callers keep working.

        Returns:
            A list of ``(quad, text, confidence)`` in original-image
            coordinates, one per cluster of overlapping detections. The quad
            and confidence come from the most confident member; the text is
            chosen by ``_select_cluster_text`` and then passed through
            ``fix_common_ocr_errors``.
        """
        if not all_results:
            return []

        scale = self.upscale_factor

        # Bring every quad back to original-image coordinates.
        detections = [
            ([[int(p[0] / scale), int(p[1] / scale)] for p in quad], text, conf, variant_name)
            for variant_name, results in all_results
            for quad, text, conf in results
        ]

        # Greedy clustering: each unused detection seeds a cluster and absorbs
        # every later unused detection whose box overlaps the seed (IoU > 0.5).
        clusters = []
        used = set()
        for i, seed in enumerate(detections):
            if i in used:
                continue
            used.add(i)
            cluster = [seed]
            seed_box = quad_bbox(seed[0])

            for j in range(i + 1, len(detections)):
                if j in used:
                    continue
                if self._bbox_iou(seed_box, quad_bbox(detections[j][0])) > 0.5:
                    cluster.append(detections[j])
                    used.add(j)

            clusters.append(cluster)

        final_results = []
        for cluster in clusters:
            cluster.sort(key=lambda item: item[2], reverse=True)
            best_quad, _, best_conf, _ = cluster[0]

            best_text = self._select_cluster_text(cluster, normalize_text)
            best_text = fix_common_ocr_errors(best_text)

            final_results.append((best_quad, best_text, best_conf))

        return final_results

    def read(self, image_path_or_array):
        """Run multi-pass OCR on an image path or array.

        Returns a list of ``(quad, text, confidence)`` in original-image
        coordinates, or ``[]`` when the image cannot be loaded or is empty.
        """
        if isinstance(image_path_or_array, str):
            img = cv2.imread(image_path_or_array)
        else:
            img = image_path_or_array

        if img is None or img.size == 0:
            return []

        all_results = []
        for variant_name, variant_img in self.preprocess_variants(img):
            results = self.run_vision_ocr(variant_img)
            if results:
                all_results.append((variant_name, results))

        return self.merge_multi_pass_results(all_results, img.shape)


# ============================================================
# ORIGINAL OCR ENGINE (Fallback)
# ============================================================
class MacVisionDetector:
    """Single-pass Apple Vision OCR with language correction enabled."""

    def __init__(self, source_lang="en"):
        """Store the recognition language; unknown values fall back to ``"en-US"``."""
        self.langs = [_resolve_apple_lang(source_lang)]
        print(f"⚡ Using Apple Vision OCR (Language: {self.langs[0]})")

    def read(self, image_path_or_array):
        """Run OCR on an image path or array.

        Returns a list of ``(quad, text, confidence)`` in image pixel
        coordinates, or ``[]`` when the image cannot be loaded or is empty.
        """
        if isinstance(image_path_or_array, str):
            img = cv2.imread(image_path_or_array)
        else:
            img = image_path_or_array

        return _recognize_text_with_vision(
            img, self.langs, use_language_correction=True
        )


# ============================================================
# SPLITTERS + QUAD NORMALIZATION
# ============================================================
def merge_close_bubbles_by_line_height(bubbles, bubble_boxes, bubble_quads, bubb return out_b, out_bb, out_bq, out_bi -# ============================================================ -# OCR ENGINES (Apple Native Vision) -# ============================================================ -class MacVisionDetector: - def __init__(self, source_lang="en"): - lang_key = source_lang.lower().strip() - - lang_map = { - "en": "en-US", "english": "en-US", - "es": "es-ES", "spanish": "es-ES", - "ca": "ca-ES", "catalan": "ca-ES", - "fr": "fr-FR", "french": "fr-FR", - "ja": "ja-JP", "japanese": "ja-JP", - "it": "it-IT", "italian": "it-IT", - "de": "de-DE", "german": "de-DE", - "ko": "ko-KR", "korean": "ko-KR", - "zh": "zh-Hans", "chinese": "zh-Hans" - } - - apple_lang = lang_map.get(lang_key, "en-US") - self.langs = [apple_lang] - print(f"⚡ Using Apple Vision OCR (Language: {self.langs[0]})") - - def read(self, image_path_or_array): - if isinstance(image_path_or_array, str): - img = cv2.imread(image_path_or_array) - else: - img = image_path_or_array - - if img is None or img.size == 0: - return [] - - ih, iw = img.shape[:2] - - success, buffer = cv2.imencode('.png', img) - if not success: - return [] - - ns_data = NSData.dataWithBytes_length_(buffer.tobytes(), len(buffer.tobytes())) - handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(ns_data, None) - results = [] - - def completion_handler(request, error): - if error: - print(f"Vision API Error: {error}") - return - - for observation in request.results(): - candidate = observation.topCandidates_(1)[0] - text = candidate.string() - confidence = candidate.confidence() - - bbox = observation.boundingBox() - x = bbox.origin.x * iw - y_bottom_left = bbox.origin.y * ih - w = bbox.size.width * iw - h = bbox.size.height * ih - - y = ih - y_bottom_left - h - - quad = [ - [int(x), int(y)], - [int(x + w), int(y)], - [int(x + w), int(y + h)], - [int(x), int(y + h)] - ] - - results.append((quad, text, confidence)) - - request = 
Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(completion_handler) - request.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate) - request.setUsesLanguageCorrection_(True) - request.setRecognitionLanguages_(self.langs) - - handler.performRequests_error_([request], None) - return results - - # ============================================================ # PREPROCESS # ============================================================ @@ -1127,7 +1429,7 @@ def rebuild_text_from_vision_result(res): return normalize_text(" ".join(lines)) -def reread_bubble_with_vision(image_bgr, bbox_xyxy, vision_detector: MacVisionDetector, upscale=3.0, pad=24): +def reread_bubble_with_vision(image_bgr, bbox_xyxy, vision_detector, upscale=3.0, pad=24): ih, iw = image_bgr.shape[:2] x1, y1, x2, y2 = bbox_xyxy x1 = max(0, int(x1 - pad)); y1 = max(0, int(y1 - pad)) @@ -1148,7 +1450,13 @@ def reread_bubble_with_vision(image_bgr, bbox_xyxy, vision_detector: MacVisionDe proc3 = cv2.cvtColor(proc, cv2.COLOR_GRAY2BGR) if len(proc.shape) == 2 else proc for a in angles: rot = rotate_image_keep_bounds(proc3, a) - res = vision_detector.read(rot) + + # Use run_vision_ocr if available (enhanced detector) + if hasattr(vision_detector, 'run_vision_ocr'): + res = vision_detector.run_vision_ocr(rot) + else: + res = vision_detector.read(rot) + txt = rebuild_text_from_vision_result(res) sc = ocr_candidate_score(txt) if sc > best_v_sc: @@ -1469,13 +1777,13 @@ def export_bubbles(filepath, bbox_dict, quads_dict, indices_dict, ocr, reading_m # ============================================================ -# PIPELINE +# MAIN PIPELINE # ============================================================ def translate_manga_text( image_path="001-page.png", source_lang="en", target_lang="ca", - confidence_threshold=0.05, + confidence_threshold=0.03, min_text_length=1, gap_px="auto", filter_sound_effects=True, @@ -1483,7 +1791,8 @@ def translate_manga_text( export_to_file="output.txt", 
export_bubbles_to="bubbles.json", reading_mode="ltr", - debug=True + debug=True, + use_enhanced_ocr=True ): image = cv2.imread(image_path) if image is None: @@ -1493,11 +1802,48 @@ def translate_manga_text( resolved_gap = auto_gap(image_path) if gap_px == "auto" else float(gap_px) print("Loading OCR engines...") - detector = MacVisionDetector(source_lang=source_lang) + + # Use enhanced detector + if use_enhanced_ocr: + detector = ImprovedMacVisionDetector(source_lang=source_lang) + print("🚀 Using Enhanced Multi-Pass OCR") + else: + detector = MacVisionDetector(source_lang=source_lang) print("Running detection OCR (Apple Vision)...") raw = detector.read(image_path) print(f"Raw detections: {len(raw)}") + + # Secondary pass for missed regions + if use_enhanced_ocr: + existing_quads = [r[0] for r in raw] + missed_regions = detect_small_text_regions(image, existing_quads) + + if missed_regions: + print(f"🔍 Found {len(missed_regions)} potentially missed text regions") + + # Re-run OCR on missed regions with higher upscaling + for region in missed_regions: + x1, y1, x2, y2 = region + # Add padding + pad = 10 + x1 = max(0, x1 - pad) + y1 = max(0, y1 - pad) + x2 = min(image.shape[1], x2 + pad) + y2 = min(image.shape[0], y2 + pad) + + crop = image[y1:y2, x1:x2] + if crop.size > 0: + # Aggressive upscaling for small text + upscaled = cv2.resize(crop, None, fx=4.0, fy=4.0, interpolation=cv2.INTER_CUBIC) + region_results = detector.run_vision_ocr(upscaled) + + # Scale back and offset coordinates + for quad, text, conf in region_results: + scaled_quad = [[int(p[0]/4.0 + x1), int(p[1]/4.0 + y1)] for p in quad] + raw.append((scaled_quad, text, conf)) + + print(f"📝 Total detections after missed region scan: {len(raw)}") filtered = [] skipped = 0 @@ -1533,7 +1879,7 @@ def translate_manga_text( if splits_made > 0: print(f"✂️ Split {splits_made} wide OCR lines across column gaps.") - # 2) split giant bridge quads (fixes page16 BOX19-like glue) + # 2) split giant bridge quads filtered, 
bridge_splits = split_abnormal_bridge_quads(image, filtered) if bridge_splits > 0: print(f"🧩 Split {bridge_splits} abnormal bridge OCR quad(s).") @@ -1545,14 +1891,12 @@ def translate_manga_text( filtered, image.shape, gap_px=resolved_gap, bbox_padding=1 ) - # merge accidental sibling fragments (fixes page15 BOX11+BOX16 style) + # merge accidental sibling fragments bubbles, bubble_boxes, bubble_quads, bubble_indices = merge_close_bubbles_by_line_height( bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered ) - # ============================================================ - # APPLY PAGE-SPECIFIC FIXES (NEW) - # ============================================================ + # Apply page-specific fixes page_identifier = os.path.basename(image_path) bubbles, bubble_boxes, bubble_quads, bubble_indices = apply_page_specific_fixes( bubbles, bubble_boxes, bubble_quads, bubble_indices, @@ -1760,16 +2104,17 @@ def translate_manga_text( if __name__ == "__main__": translate_manga_text( - image_path="15.png", + image_path="16.jpg", source_lang="english", target_lang="ca", - confidence_threshold=0.05, + confidence_threshold=0.03, # Lower threshold for better detection min_text_length=1, gap_px="auto", filter_sound_effects=True, quality_threshold=0.62, export_to_file="output.txt", export_bubbles_to="bubbles.json", - reading_mode="rtl", - debug=True + reading_mode="ltr", #rtl or + debug=True, + use_enhanced_ocr=True # Enable enhanced multi-pass OCR ) \ No newline at end of file