Added all
This commit is contained in:
@@ -32,12 +32,12 @@ GLOSSARY = {
|
|||||||
# Regexes for onomatopoeia. is_sound_effect() lowercases the text and strips
# everything but a-z before matching, so every pattern is written in lowercase
# and must not rely on the caller passing re.IGNORECASE.
SOUND_EFFECT_PATTERNS = [
    r"^b+i+p+$", r"^sha+$", r"^ha+$", r"^ah+$",
    r"^ugh+$", r"^bam+$", r"^pow+$", r"^boom+$", r"^bang+$",
    r"^grr+$", r"^grrp+$", r"^fshoo+$", r"^fwuip+$",
    r"^crash+$", r"^thud+$", r"^zip+$", r"^swoosh+$", r"^chirp+$",
]
|
||||||
|
|
||||||
# Regexes for title-like lines (how they are applied is outside this view;
# presumably the caller lowercases the text first -- confirm).
TITLE_PATTERNS = [
    # "<keyword><optional spaces><number>", e.g. "chapter 12" or "vol. 3"
    r"^(chapter|episode|vol\.?|volume)\s*\d+$",
    # "by <anything>"
    r"^by\s+.+$",
]
|
||||||
|
|
||||||
@@ -82,6 +82,22 @@ def postprocess_translation_general(text: str) -> str:
|
|||||||
return t
|
return t
|
||||||
|
|
||||||
|
|
||||||
|
def fix_common_ocr_errors(text: str) -> str:
    """Fix common OCR mistakes in manga text.

    Three character-level repairs are applied, in this order:

    1. A capital ``O`` directly after a digit becomes ``0`` unless the next
       character is an ASCII letter (so "1O2" -> "102", "Room 1O" -> "Room 10",
       but "5OK" is left alone).
    2. Every ``|`` becomes ``I``.
    3. Every backtick becomes an apostrophe.

    Args:
        text: Raw OCR text. A falsy value (``None`` or ``""``) yields ``""``.

    Returns:
        The corrected text.
    """
    result = text or ""

    # Lookarounds instead of consuming groups: the neighbouring characters are
    # inspected but not eaten, so back-to-back cases ("2O2O") and an "O" at the
    # very end of the string are fixed in a single pass.
    result = re.sub(r"(?<=\d)O(?![a-zA-Z])", "0", result)

    # Fix common character confusions
    result = result.replace("|", "I")
    result = result.replace("`", "'")

    return result
|
||||||
|
|
||||||
|
|
||||||
def is_sound_effect(text: str) -> bool:
    """Return True if *text* fully matches one of SOUND_EFFECT_PATTERNS.

    The text is stripped, lowercased and reduced to the letters a-z before
    matching, so punctuation, digits and spaces never affect the result.
    A falsy *text* is treated as the empty string.
    """
    letters_only = re.sub(r"[^a-z]", "", (text or "").strip().lower())
    for pattern in SOUND_EFFECT_PATTERNS:
        if re.fullmatch(pattern, letters_only, re.IGNORECASE):
            return True
    return False
|
||||||
@@ -194,7 +210,80 @@ def ocr_candidate_score(text: str) -> float:
|
|||||||
|
|
||||||
|
|
||||||
# ============================================================
|
# ============================================================
|
||||||
# SPEECH BUBBLE DETECTION (NEW)
|
# ENHANCED IMAGE PREPROCESSING
|
||||||
|
# ============================================================
|
||||||
|
def enhance_image_for_ocr(image_bgr, upscale_factor=2.5):
    """Build a cleaned-up, binarised, upscaled copy of an image for OCR.

    Pipeline: bicubic upscale -> grayscale -> non-local-means denoise ->
    CLAHE contrast -> 3x3 sharpen -> adaptive Gaussian threshold ->
    morphological close -> back to 3 channels.

    Args:
        image_bgr: Input image; converted with COLOR_BGR2GRAY, so it is
            expected to be a 3-channel BGR array.
        upscale_factor: Multiplier applied to width and height (truncated
            to whole pixels).

    Returns:
        A 3-channel image (GRAY2BGR of the binary result) at the upscaled size.
    """
    src_h, src_w = image_bgr.shape[:2]
    target_size = (int(src_w * upscale_factor), int(src_h * upscale_factor))
    enlarged = cv2.resize(image_bgr, target_size, interpolation=cv2.INTER_CUBIC)

    mono = cv2.cvtColor(enlarged, cv2.COLOR_BGR2GRAY)
    smooth = cv2.fastNlMeansDenoising(
        mono, None, h=10, templateWindowSize=7, searchWindowSize=21
    )

    # Local contrast boost (CLAHE), then a sharpening kernel whose weights sum to 1.
    contrasted = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8)).apply(smooth)
    sharpen_kernel = np.array([[-1, -1, -1],
                               [-1, 9, -1],
                               [-1, -1, -1]])
    crisp = cv2.filter2D(contrasted, -1, sharpen_kernel)

    binarised = cv2.adaptiveThreshold(
        crisp, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
    )

    # Close with a 2x2 kernel to tidy up the thresholded result.
    closed = cv2.morphologyEx(binarised, cv2.MORPH_CLOSE, np.ones((2, 2), np.uint8))

    # The Vision path encodes 3-channel images, so convert back from gray.
    return cv2.cvtColor(closed, cv2.COLOR_GRAY2BGR)
|
||||||
|
|
||||||
|
|
||||||
|
def detect_small_text_regions(image_bgr, existing_quads):
    """Find small dark blobs outside the already-detected text quads.

    Args:
        image_bgr: Page image (converted with COLOR_BGR2GRAY).
        existing_quads: Iterable of quads (point lists) already found by OCR;
            their interiors are excluded from the search.

    Returns:
        A list of ``(x1, y1, x2, y2)`` bounding boxes for external contours
        whose bounding-box area is strictly between 50 and 5000 pixels and
        whose height/width ratio is strictly between 0.1 and 10.
    """
    gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)

    # Paint every known detection white, then invert to get "not yet covered".
    covered = np.zeros(gray.shape, dtype=np.uint8)
    for quad in existing_quads:
        cv2.fillPoly(covered, [np.array(quad, dtype=np.int32)], 255)
    uncovered = cv2.bitwise_not(covered)

    # Otsu threshold, inverted so dark pixels become foreground.
    _, ink = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    ink_outside = cv2.bitwise_and(ink, ink, mask=uncovered)

    contours, _ = cv2.findContours(
        ink_outside, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )

    rects = (cv2.boundingRect(contour) for contour in contours)
    return [
        (bx, by, bx + bw, by + bh)
        for bx, by, bw, bh in rects
        if 50 < bw * bh < 5000 and 0.1 < bh / max(bw, 1) < 10
    ]
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# SPEECH BUBBLE DETECTION
|
||||||
# ============================================================
|
# ============================================================
|
||||||
def detect_speech_bubbles(image_bgr: np.ndarray) -> List[np.ndarray]:
|
def detect_speech_bubbles(image_bgr: np.ndarray) -> List[np.ndarray]:
|
||||||
"""Detect speech bubble contours for box splitting"""
|
"""Detect speech bubble contours for box splitting"""
|
||||||
@@ -302,7 +391,7 @@ def check_vertical_alignment_split(indices: List[int],
|
|||||||
|
|
||||||
|
|
||||||
# ============================================================
|
# ============================================================
|
||||||
# BOX FIXING FUNCTIONS (NEW)
|
# BOX FIXING FUNCTIONS
|
||||||
# ============================================================
|
# ============================================================
|
||||||
def apply_page_specific_fixes(bubbles: Dict[int, List[str]],
|
def apply_page_specific_fixes(bubbles: Dict[int, List[str]],
|
||||||
bubble_boxes: Dict[int, Tuple],
|
bubble_boxes: Dict[int, Tuple],
|
||||||
@@ -311,12 +400,7 @@ def apply_page_specific_fixes(bubbles: Dict[int, List[str]],
|
|||||||
ocr: List[Tuple],
|
ocr: List[Tuple],
|
||||||
image_bgr: np.ndarray,
|
image_bgr: np.ndarray,
|
||||||
page_identifier: str) -> Tuple[Dict, Dict, Dict, Dict]:
|
page_identifier: str) -> Tuple[Dict, Dict, Dict, Dict]:
|
||||||
"""
|
"""Apply page-specific fixes to bubble detection issues"""
|
||||||
Apply page-specific fixes to bubble detection issues
|
|
||||||
|
|
||||||
Args:
|
|
||||||
page_identifier: Base filename (e.g., "15", "16", "19")
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Detect speech bubbles for splitting logic
|
# Detect speech bubbles for splitting logic
|
||||||
bubble_contours = detect_speech_bubbles(image_bgr)
|
bubble_contours = detect_speech_bubbles(image_bgr)
|
||||||
@@ -470,6 +554,302 @@ def apply_page_specific_fixes(bubbles: Dict[int, List[str]],
|
|||||||
return bubbles, bubble_boxes, bubble_quads, bubble_indices
|
return bubbles, bubble_boxes, bubble_quads, bubble_indices
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# ENHANCED OCR ENGINE
|
||||||
|
# ============================================================
|
||||||
|
class ImprovedMacVisionDetector:
    """Multi-pass Apple Vision OCR.

    ``read()`` builds several preprocessed, upscaled variants of the page,
    runs Vision text recognition on each one, maps the detections back to
    original-image coordinates, clusters overlapping detections and keeps one
    result per cluster.

    Every result is a ``(quad, text, confidence)`` tuple where ``quad`` is
    ``[[x, y], ...]`` for the four corners (top-left, top-right, bottom-right,
    bottom-left) in integer pixel coordinates.
    """

    # All variants are upscaled by this factor and merge_multi_pass_results()
    # divides by it again, so it lives in exactly one place.
    UPSCALE_FACTOR = 2.5

    # Two detections with IoU above this value are treated as the same line.
    CLUSTER_IOU_THRESHOLD = 0.5

    # User-facing language names/codes -> Vision recognition language codes.
    _LANG_MAP = {
        "en": "en-US", "english": "en-US",
        "es": "es-ES", "spanish": "es-ES",
        "ca": "ca-ES", "catalan": "ca-ES",
        "fr": "fr-FR", "french": "fr-FR",
        "ja": "ja-JP", "japanese": "ja-JP",
        "it": "it-IT", "italian": "it-IT",
        "de": "de-DE", "german": "de-DE",
        "ko": "ko-KR", "korean": "ko-KR",
        "zh": "zh-Hans", "chinese": "zh-Hans",
    }

    def __init__(self, source_lang="en"):
        """Resolve *source_lang* (case/space-insensitive); unknown -> "en-US"."""
        lang_key = source_lang.lower().strip()
        apple_lang = self._LANG_MAP.get(lang_key, "en-US")
        self.langs = [apple_lang]
        print(f"⚡ Using Enhanced Apple Vision OCR (Language: {self.langs[0]})")

    def _upscale(self, image):
        """Bicubic-resize *image* by UPSCALE_FACTOR on both axes."""
        factor = self.UPSCALE_FACTOR
        return cv2.resize(image, None, fx=factor, fy=factor,
                          interpolation=cv2.INTER_CUBIC)

    def preprocess_variants(self, image_bgr):
        """Generate multiple preprocessing variants.

        Returns:
            A list of ``(name, image)`` pairs, each image upscaled by
            UPSCALE_FACTOR and 3-channel.
        """
        variants = []

        # Variant 1: full enhancement pipeline
        variants.append((
            "enhanced",
            enhance_image_for_ocr(image_bgr, upscale_factor=self.UPSCALE_FACTOR),
        ))

        # Variant 2: global Otsu binarisation (high contrast)
        gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
        _, high_contrast = cv2.threshold(
            gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
        )
        variants.append((
            "high_contrast",
            cv2.cvtColor(self._upscale(high_contrast), cv2.COLOR_GRAY2BGR),
        ))

        # Variant 3: bilateral filter (smooths while preserving edges)
        variants.append((
            "bilateral",
            self._upscale(cv2.bilateralFilter(image_bgr, 9, 75, 75)),
        ))

        # Variant 4: inverted (for white text on black)
        variants.append(("inverted", self._upscale(cv2.bitwise_not(image_bgr))))

        # Variant 5: original, only upscaled
        variants.append(("original", self._upscale(image_bgr)))

        return variants

    def run_vision_ocr(self, image_bgr):
        """Run Vision OCR on a single image.

        Args:
            image_bgr: Image array; ``None`` or an empty array returns ``[]``.

        Returns:
            ``(quad, text, confidence)`` tuples in the pixel coordinates of
            *image_bgr*. Empty if encoding fails or Vision reports an error.
        """
        if image_bgr is None or image_bgr.size == 0:
            return []

        ih, iw = image_bgr.shape[:2]

        success, buffer = cv2.imencode('.png', image_bgr)
        if not success:
            return []

        # Serialise once; calling tobytes() twice copies the PNG twice.
        png_bytes = buffer.tobytes()
        ns_data = NSData.dataWithBytes_length_(png_bytes, len(png_bytes))
        handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(ns_data, None)
        results = []

        def completion_handler(request, error):
            if error:
                # Same reporting as MacVisionDetector instead of failing silently.
                print(f"Vision API Error: {error}")
                return

            for observation in request.results() or []:
                candidates = observation.topCandidates_(1)
                if not candidates:
                    # No recognised string for this observation; nothing to report.
                    continue
                candidate = candidates[0]
                text = candidate.string()
                confidence = candidate.confidence()

                # The box is scaled by the image size and flipped vertically:
                # it is treated as normalised with a bottom-left origin.
                bbox = observation.boundingBox()
                x = bbox.origin.x * iw
                y_bottom_left = bbox.origin.y * ih
                w = bbox.size.width * iw
                h = bbox.size.height * ih

                y = ih - y_bottom_left - h

                quad = [
                    [int(x), int(y)],
                    [int(x + w), int(y)],
                    [int(x + w), int(y + h)],
                    [int(x), int(y + h)],
                ]

                results.append((quad, text, confidence))

        request = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(completion_handler)
        request.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate)
        request.setUsesLanguageCorrection_(False)  # Disable for manga
        request.setRecognitionLanguages_(self.langs)
        # NOTE(review): this setter is believed to exist only on newer Vision
        # releases (macOS 13+ -- confirm); skip it rather than crash on older ones.
        if hasattr(request, "setAutomaticallyDetectsLanguage_"):
            request.setAutomaticallyDetectsLanguage_(True)

        handler.performRequests_error_([request], None)
        return results

    @staticmethod
    def _quads_overlap(q1, q2, threshold=0.5):
        """Return True if the bounding boxes of two quads have IoU > threshold."""
        b1 = quad_bbox(q1)
        b2 = quad_bbox(q2)

        x1 = max(b1[0], b2[0])
        y1 = max(b1[1], b2[1])
        x2 = min(b1[2], b2[2])
        y2 = min(b1[3], b2[3])

        if x2 < x1 or y2 < y1:
            return False

        intersection = (x2 - x1) * (y2 - y1)
        area1 = (b1[2] - b1[0]) * (b1[3] - b1[1])
        area2 = (b2[2] - b2[0]) * (b2[3] - b2[1])
        union = area1 + area2 - intersection

        # max(..., 1) avoids a division by zero for degenerate boxes.
        return intersection / max(union, 1) > threshold

    def merge_multi_pass_results(self, all_results, original_shape):
        """Merge results from multiple preprocessing passes.

        Args:
            all_results: List of ``(variant_name, results)`` pairs, where
                ``results`` is what ``run_vision_ocr`` returned for that variant
                (coordinates in the upscaled variant image).
            original_shape: Shape of the original image. Currently unused;
                kept so existing callers keep working.

        Returns:
            One ``(quad, text, confidence)`` per cluster, in original-image
            coordinates. Quad and confidence come from the most confident
            detection in the cluster; the text is the reading whose normalised
            form collected the highest summed confidence.
        """
        if not all_results:
            return []

        scale_factor = self.UPSCALE_FACTOR

        # Map every quad back to original-image coordinates.
        normalized_results = [
            (
                [[int(p[0] / scale_factor), int(p[1] / scale_factor)] for p in quad],
                text,
                conf,
                variant_name,
            )
            for variant_name, results in all_results
            for quad, text, conf in results
        ]

        # Greedy clustering: each unused detection seeds a cluster and absorbs
        # every later unused detection that overlaps the seed.
        clusters = []
        used = set()
        for i, seed in enumerate(normalized_results):
            if i in used:
                continue

            cluster = [seed]
            used.add(i)

            for j, other in enumerate(normalized_results):
                if j in used:
                    continue
                if self._quads_overlap(seed[0], other[0],
                                       threshold=self.CLUSTER_IOU_THRESHOLD):
                    cluster.append(other)
                    used.add(j)

            clusters.append(cluster)

        final_results = []
        for cluster in clusters:
            # Highest confidence first.
            cluster.sort(key=lambda item: item[2], reverse=True)
            best_quad, best_text, best_conf, _ = cluster[0]

            # Confidence-weighted vote on the normalised text. The first raw
            # reading seen per normalised form is the most confident one,
            # because the cluster is already sorted.
            text_votes = {}
            raw_by_normalized = {}
            for _, text, conf, _ in cluster:
                normalized = normalize_text(text)
                if normalized:
                    text_votes[normalized] = text_votes.get(normalized, 0) + conf
                    raw_by_normalized.setdefault(normalized, text)

            if text_votes:
                winner = max(text_votes, key=text_votes.get)
                if winner != normalize_text(best_text):
                    # Return an actual OCR reading, not the normalised voting
                    # key, so both paths hand raw text to the caller.
                    best_text = raw_by_normalized[winner]

            best_text = fix_common_ocr_errors(best_text)

            final_results.append((best_quad, best_text, best_conf))

        return final_results

    def read(self, image_path_or_array):
        """Enhanced multi-pass OCR.

        Args:
            image_path_or_array: A file path (loaded with ``cv2.imread``) or an
                already-loaded image array.

        Returns:
            Merged ``(quad, text, confidence)`` results in original-image
            coordinates, or ``[]`` if the image is missing or empty.
        """
        if isinstance(image_path_or_array, str):
            img = cv2.imread(image_path_or_array)
        else:
            img = image_path_or_array

        if img is None or img.size == 0:
            return []

        all_results = []
        for variant_name, variant_img in self.preprocess_variants(img):
            results = self.run_vision_ocr(variant_img)
            if results:
                all_results.append((variant_name, results))

        return self.merge_multi_pass_results(all_results, img.shape)
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# ORIGINAL OCR ENGINE (Fallback)
|
||||||
|
# ============================================================
|
||||||
|
class MacVisionDetector:
    """Single-pass Apple Vision OCR (fallback engine).

    ``read()`` returns ``(quad, text, confidence)`` tuples where ``quad`` is
    ``[[x, y], ...]`` for the four corners (top-left, top-right, bottom-right,
    bottom-left) in integer pixel coordinates of the input image.
    """

    def __init__(self, source_lang="en"):
        """Resolve *source_lang* (case/space-insensitive); unknown -> "en-US"."""
        lang_key = source_lang.lower().strip()

        lang_map = {
            "en": "en-US", "english": "en-US",
            "es": "es-ES", "spanish": "es-ES",
            "ca": "ca-ES", "catalan": "ca-ES",
            "fr": "fr-FR", "french": "fr-FR",
            "ja": "ja-JP", "japanese": "ja-JP",
            "it": "it-IT", "italian": "it-IT",
            "de": "de-DE", "german": "de-DE",
            "ko": "ko-KR", "korean": "ko-KR",
            "zh": "zh-Hans", "chinese": "zh-Hans",
        }

        apple_lang = lang_map.get(lang_key, "en-US")
        self.langs = [apple_lang]
        print(f"⚡ Using Apple Vision OCR (Language: {self.langs[0]})")

    def read(self, image_path_or_array):
        """Run Vision text recognition on an image.

        Args:
            image_path_or_array: A file path (loaded with ``cv2.imread``) or an
                already-loaded image array.

        Returns:
            ``(quad, text, confidence)`` tuples, or ``[]`` if the image is
            missing, empty, cannot be PNG-encoded, or Vision reports an error.
        """
        if isinstance(image_path_or_array, str):
            img = cv2.imread(image_path_or_array)
        else:
            img = image_path_or_array

        if img is None or img.size == 0:
            return []

        ih, iw = img.shape[:2]

        success, buffer = cv2.imencode('.png', img)
        if not success:
            return []

        # Serialise once; calling tobytes() twice copies the PNG twice.
        png_bytes = buffer.tobytes()
        ns_data = NSData.dataWithBytes_length_(png_bytes, len(png_bytes))
        handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(ns_data, None)
        results = []

        def completion_handler(request, error):
            if error:
                print(f"Vision API Error: {error}")
                return

            for observation in request.results() or []:
                candidates = observation.topCandidates_(1)
                if not candidates:
                    # No recognised string for this observation; nothing to report.
                    continue
                candidate = candidates[0]
                text = candidate.string()
                confidence = candidate.confidence()

                # The box is scaled by the image size and flipped vertically:
                # it is treated as normalised with a bottom-left origin.
                bbox = observation.boundingBox()
                x = bbox.origin.x * iw
                y_bottom_left = bbox.origin.y * ih
                w = bbox.size.width * iw
                h = bbox.size.height * ih

                y = ih - y_bottom_left - h

                quad = [
                    [int(x), int(y)],
                    [int(x + w), int(y)],
                    [int(x + w), int(y + h)],
                    [int(x), int(y + h)],
                ]

                results.append((quad, text, confidence))

        request = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(completion_handler)
        request.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate)
        request.setUsesLanguageCorrection_(True)
        request.setRecognitionLanguages_(self.langs)

        handler.performRequests_error_([request], None)
        return results
|
||||||
|
|
||||||
|
|
||||||
# ============================================================
|
# ============================================================
|
||||||
# SPLITTERS + QUAD NORMALIZATION
|
# SPLITTERS + QUAD NORMALIZATION
|
||||||
# ============================================================
|
# ============================================================
|
||||||
@@ -960,84 +1340,6 @@ def merge_close_bubbles_by_line_height(bubbles, bubble_boxes, bubble_quads, bubb
|
|||||||
return out_b, out_bb, out_bq, out_bi
|
return out_b, out_bb, out_bq, out_bi
|
||||||
|
|
||||||
|
|
||||||
# ============================================================
|
|
||||||
# OCR ENGINES (Apple Native Vision)
|
|
||||||
# ============================================================
|
|
||||||
class MacVisionDetector:
    """Apple Vision text detector returning ``(quad, text, confidence)`` tuples."""

    def __init__(self, source_lang="en"):
        """Pick the Vision language code for *source_lang* (default "en-US")."""
        requested = source_lang.lower().strip()

        # Both the short code and the English name of each language are accepted.
        lang_map = {
            "en": "en-US", "english": "en-US",
            "es": "es-ES", "spanish": "es-ES",
            "ca": "ca-ES", "catalan": "ca-ES",
            "fr": "fr-FR", "french": "fr-FR",
            "ja": "ja-JP", "japanese": "ja-JP",
            "it": "it-IT", "italian": "it-IT",
            "de": "de-DE", "german": "de-DE",
            "ko": "ko-KR", "korean": "ko-KR",
            "zh": "zh-Hans", "chinese": "zh-Hans"
        }

        self.langs = [lang_map.get(requested, "en-US")]
        print(f"⚡ Using Apple Vision OCR (Language: {self.langs[0]})")

    def read(self, image_path_or_array):
        """Recognise text in an image given as a file path or a loaded array.

        Returns an empty list when the image is missing or empty, when PNG
        encoding fails, or when Vision reports an error.
        """
        from_path = isinstance(image_path_or_array, str)
        img = cv2.imread(image_path_or_array) if from_path else image_path_or_array

        if img is None or img.size == 0:
            return []

        img_h, img_w = img.shape[:2]

        encoded_ok, png_buffer = cv2.imencode('.png', img)
        if not encoded_ok:
            return []

        payload = png_buffer.tobytes()
        ns_data = NSData.dataWithBytes_length_(payload, len(payload))
        handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(ns_data, None)
        detections = []

        def completion_handler(request, error):
            if error:
                print(f"Vision API Error: {error}")
                return

            for observation in request.results():
                top = observation.topCandidates_(1)[0]

                # Scale the box by the image size, then flip the vertical axis
                # (the origin is treated as bottom-left).
                box = observation.boundingBox()
                left = box.origin.x * img_w
                width = box.size.width * img_w
                height = box.size.height * img_h
                top_edge = img_h - box.origin.y * img_h - height

                x0, x1 = int(left), int(left + width)
                y0, y1 = int(top_edge), int(top_edge + height)
                corners = [[x0, y0], [x1, y0], [x1, y1], [x0, y1]]

                detections.append((corners, top.string(), top.confidence()))

        request = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(completion_handler)
        request.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate)
        request.setUsesLanguageCorrection_(True)
        request.setRecognitionLanguages_(self.langs)

        handler.performRequests_error_([request], None)
        return detections
|
|
||||||
|
|
||||||
|
|
||||||
# ============================================================
|
# ============================================================
|
||||||
# PREPROCESS
|
# PREPROCESS
|
||||||
# ============================================================
|
# ============================================================
|
||||||
@@ -1127,7 +1429,7 @@ def rebuild_text_from_vision_result(res):
|
|||||||
return normalize_text(" ".join(lines))
|
return normalize_text(" ".join(lines))
|
||||||
|
|
||||||
|
|
||||||
def reread_bubble_with_vision(image_bgr, bbox_xyxy, vision_detector: MacVisionDetector, upscale=3.0, pad=24):
|
def reread_bubble_with_vision(image_bgr, bbox_xyxy, vision_detector, upscale=3.0, pad=24):
|
||||||
ih, iw = image_bgr.shape[:2]
|
ih, iw = image_bgr.shape[:2]
|
||||||
x1, y1, x2, y2 = bbox_xyxy
|
x1, y1, x2, y2 = bbox_xyxy
|
||||||
x1 = max(0, int(x1 - pad)); y1 = max(0, int(y1 - pad))
|
x1 = max(0, int(x1 - pad)); y1 = max(0, int(y1 - pad))
|
||||||
@@ -1148,7 +1450,13 @@ def reread_bubble_with_vision(image_bgr, bbox_xyxy, vision_detector: MacVisionDe
|
|||||||
proc3 = cv2.cvtColor(proc, cv2.COLOR_GRAY2BGR) if len(proc.shape) == 2 else proc
|
proc3 = cv2.cvtColor(proc, cv2.COLOR_GRAY2BGR) if len(proc.shape) == 2 else proc
|
||||||
for a in angles:
|
for a in angles:
|
||||||
rot = rotate_image_keep_bounds(proc3, a)
|
rot = rotate_image_keep_bounds(proc3, a)
|
||||||
|
|
||||||
|
# Use run_vision_ocr if available (enhanced detector)
|
||||||
|
if hasattr(vision_detector, 'run_vision_ocr'):
|
||||||
|
res = vision_detector.run_vision_ocr(rot)
|
||||||
|
else:
|
||||||
res = vision_detector.read(rot)
|
res = vision_detector.read(rot)
|
||||||
|
|
||||||
txt = rebuild_text_from_vision_result(res)
|
txt = rebuild_text_from_vision_result(res)
|
||||||
sc = ocr_candidate_score(txt)
|
sc = ocr_candidate_score(txt)
|
||||||
if sc > best_v_sc:
|
if sc > best_v_sc:
|
||||||
@@ -1469,13 +1777,13 @@ def export_bubbles(filepath, bbox_dict, quads_dict, indices_dict, ocr, reading_m
|
|||||||
|
|
||||||
|
|
||||||
# ============================================================
|
# ============================================================
|
||||||
# PIPELINE
|
# MAIN PIPELINE
|
||||||
# ============================================================
|
# ============================================================
|
||||||
def translate_manga_text(
|
def translate_manga_text(
|
||||||
image_path="001-page.png",
|
image_path="001-page.png",
|
||||||
source_lang="en",
|
source_lang="en",
|
||||||
target_lang="ca",
|
target_lang="ca",
|
||||||
confidence_threshold=0.05,
|
confidence_threshold=0.03,
|
||||||
min_text_length=1,
|
min_text_length=1,
|
||||||
gap_px="auto",
|
gap_px="auto",
|
||||||
filter_sound_effects=True,
|
filter_sound_effects=True,
|
||||||
@@ -1483,7 +1791,8 @@ def translate_manga_text(
|
|||||||
export_to_file="output.txt",
|
export_to_file="output.txt",
|
||||||
export_bubbles_to="bubbles.json",
|
export_bubbles_to="bubbles.json",
|
||||||
reading_mode="ltr",
|
reading_mode="ltr",
|
||||||
debug=True
|
debug=True,
|
||||||
|
use_enhanced_ocr=True
|
||||||
):
|
):
|
||||||
image = cv2.imread(image_path)
|
image = cv2.imread(image_path)
|
||||||
if image is None:
|
if image is None:
|
||||||
@@ -1493,12 +1802,49 @@ def translate_manga_text(
|
|||||||
resolved_gap = auto_gap(image_path) if gap_px == "auto" else float(gap_px)
|
resolved_gap = auto_gap(image_path) if gap_px == "auto" else float(gap_px)
|
||||||
|
|
||||||
print("Loading OCR engines...")
|
print("Loading OCR engines...")
|
||||||
|
|
||||||
|
# Use enhanced detector
|
||||||
|
if use_enhanced_ocr:
|
||||||
|
detector = ImprovedMacVisionDetector(source_lang=source_lang)
|
||||||
|
print("🚀 Using Enhanced Multi-Pass OCR")
|
||||||
|
else:
|
||||||
detector = MacVisionDetector(source_lang=source_lang)
|
detector = MacVisionDetector(source_lang=source_lang)
|
||||||
|
|
||||||
print("Running detection OCR (Apple Vision)...")
|
print("Running detection OCR (Apple Vision)...")
|
||||||
raw = detector.read(image_path)
|
raw = detector.read(image_path)
|
||||||
print(f"Raw detections: {len(raw)}")
|
print(f"Raw detections: {len(raw)}")
|
||||||
|
|
||||||
|
# Secondary pass for missed regions
|
||||||
|
if use_enhanced_ocr:
|
||||||
|
existing_quads = [r[0] for r in raw]
|
||||||
|
missed_regions = detect_small_text_regions(image, existing_quads)
|
||||||
|
|
||||||
|
if missed_regions:
|
||||||
|
print(f"🔍 Found {len(missed_regions)} potentially missed text regions")
|
||||||
|
|
||||||
|
# Re-run OCR on missed regions with higher upscaling
|
||||||
|
for region in missed_regions:
|
||||||
|
x1, y1, x2, y2 = region
|
||||||
|
# Add padding
|
||||||
|
pad = 10
|
||||||
|
x1 = max(0, x1 - pad)
|
||||||
|
y1 = max(0, y1 - pad)
|
||||||
|
x2 = min(image.shape[1], x2 + pad)
|
||||||
|
y2 = min(image.shape[0], y2 + pad)
|
||||||
|
|
||||||
|
crop = image[y1:y2, x1:x2]
|
||||||
|
if crop.size > 0:
|
||||||
|
# Aggressive upscaling for small text
|
||||||
|
upscaled = cv2.resize(crop, None, fx=4.0, fy=4.0, interpolation=cv2.INTER_CUBIC)
|
||||||
|
region_results = detector.run_vision_ocr(upscaled)
|
||||||
|
|
||||||
|
# Scale back and offset coordinates
|
||||||
|
for quad, text, conf in region_results:
|
||||||
|
scaled_quad = [[int(p[0]/4.0 + x1), int(p[1]/4.0 + y1)] for p in quad]
|
||||||
|
raw.append((scaled_quad, text, conf))
|
||||||
|
|
||||||
|
print(f"📝 Total detections after missed region scan: {len(raw)}")
|
||||||
|
|
||||||
filtered = []
|
filtered = []
|
||||||
skipped = 0
|
skipped = 0
|
||||||
ih, iw = image.shape[:2]
|
ih, iw = image.shape[:2]
|
||||||
@@ -1533,7 +1879,7 @@ def translate_manga_text(
|
|||||||
if splits_made > 0:
|
if splits_made > 0:
|
||||||
print(f"✂️ Split {splits_made} wide OCR lines across column gaps.")
|
print(f"✂️ Split {splits_made} wide OCR lines across column gaps.")
|
||||||
|
|
||||||
# 2) split giant bridge quads (fixes page16 BOX19-like glue)
|
# 2) split giant bridge quads
|
||||||
filtered, bridge_splits = split_abnormal_bridge_quads(image, filtered)
|
filtered, bridge_splits = split_abnormal_bridge_quads(image, filtered)
|
||||||
if bridge_splits > 0:
|
if bridge_splits > 0:
|
||||||
print(f"🧩 Split {bridge_splits} abnormal bridge OCR quad(s).")
|
print(f"🧩 Split {bridge_splits} abnormal bridge OCR quad(s).")
|
||||||
@@ -1545,14 +1891,12 @@ def translate_manga_text(
|
|||||||
filtered, image.shape, gap_px=resolved_gap, bbox_padding=1
|
filtered, image.shape, gap_px=resolved_gap, bbox_padding=1
|
||||||
)
|
)
|
||||||
|
|
||||||
# merge accidental sibling fragments (fixes page15 BOX11+BOX16 style)
|
# merge accidental sibling fragments
|
||||||
bubbles, bubble_boxes, bubble_quads, bubble_indices = merge_close_bubbles_by_line_height(
|
bubbles, bubble_boxes, bubble_quads, bubble_indices = merge_close_bubbles_by_line_height(
|
||||||
bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered
|
bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered
|
||||||
)
|
)
|
||||||
|
|
||||||
# ============================================================
|
# Apply page-specific fixes
|
||||||
# APPLY PAGE-SPECIFIC FIXES (NEW)
|
|
||||||
# ============================================================
|
|
||||||
page_identifier = os.path.basename(image_path)
|
page_identifier = os.path.basename(image_path)
|
||||||
bubbles, bubble_boxes, bubble_quads, bubble_indices = apply_page_specific_fixes(
|
bubbles, bubble_boxes, bubble_quads, bubble_indices = apply_page_specific_fixes(
|
||||||
bubbles, bubble_boxes, bubble_quads, bubble_indices,
|
bubbles, bubble_boxes, bubble_quads, bubble_indices,
|
||||||
@@ -1760,16 +2104,17 @@ def translate_manga_text(
|
|||||||
|
|
||||||
if __name__ == "__main__":
    # Script entry point: run the pipeline once with a fixed set of options.
    run_options = {
        "image_path": "16.jpg",
        "source_lang": "english",
        "target_lang": "ca",
        "confidence_threshold": 0.03,  # lower threshold for better detection
        "min_text_length": 1,
        "gap_px": "auto",
        "filter_sound_effects": True,
        "quality_threshold": 0.62,
        "export_to_file": "output.txt",
        "export_bubbles_to": "bubbles.json",
        "reading_mode": "ltr",  # "ltr" or "rtl"
        "debug": True,
        "use_enhanced_ocr": True,  # enable enhanced multi-pass OCR
    }
    translate_manga_text(**run_options)
|
||||||
Reference in New Issue
Block a user