Added all
This commit is contained in:
@@ -32,12 +32,12 @@ GLOSSARY = {
|
||||
# Regexes for onomatopoeia / sound-effect text.  is_sound_effect() strips the
# input down to lowercase letters only and matches these with re.IGNORECASE,
# so each pattern should describe a bare lowercase letter run.
SOUND_EFFECT_PATTERNS = [
    r"^b+i+p+$", r"^sha+$", r"^ha+$", r"^ah+$",
    r"^ugh+$", r"^bam+$", r"^pow+$", r"^boom+$", r"^bang+$",
    # NOTE(review): the capital "G" below only matches because IGNORECASE is
    # applied at match time — consider normalizing to lowercase.
    r"^Grr+$", r"^grrp+$", r"^fshoo+$", r"^fwuip+$",
    r"^crash+$", r"^thud+$", r"^zip+$", r"^swoosh+$", r"^chirp+$"
]
|
||||
|
||||
# Regexes for non-dialogue title/credit text that should be skipped.
# Matched case-insensitively against whole cleaned strings elsewhere.
TITLE_PATTERNS = [
    # Chapter/volume style headings, e.g. "Mission 11", "Chapter 3", "Vol. 2".
    # (This pattern is a superset of the old "chapter|episode|vol" pattern,
    # which was a redundant duplicate and has been removed.)
    r"^(mission|chapter|episode|vol\.?|volume)\s*\d+$",
    # Series-title fragments.  NOTE(review): the "." in "spy.family" matches
    # ANY character — escape it if a literal dot/cross is intended.
    r"^(spy|family|spy.family)$",
    # Author credit lines, e.g. "by Tatsuya Endo".
    r"^by\s+.+$",
]
|
||||
|
||||
@@ -82,6 +82,22 @@ def postprocess_translation_general(text: str) -> str:
|
||||
return t
|
||||
|
||||
|
||||
def fix_common_ocr_errors(text: str) -> str:
    """Fix common OCR mistakes in manga text.

    - Converts a letter "O" to the digit "0" when it directly follows a
      digit and is not followed by a letter ("1O2" -> "102", "5O." -> "50.",
      trailing "1O" -> "10"); words like "OK"/"NO" are left alone.
    - Replaces the pipe character with "I" and backticks with apostrophes.

    Args:
        text: Raw OCR output string.

    Returns:
        The corrected string.
    """
    result = text

    # Context-aware O -> 0: digit before, no letter after.  Using a lookahead
    # instead of consuming the following character means a digit-"O" at the
    # very end of the string is fixed too — the original two-regex version
    # (r'(\d)O(\d)' then r'(\d)O([^a-zA-Z])') missed that case.
    result = re.sub(r'(\d)O(?=[^a-zA-Z]|$)', r'\g<1>0', result)

    # Common single-character confusions.
    result = result.replace('|', 'I')
    result = result.replace('`', "'")

    return result
|
||||
|
||||
|
||||
def is_sound_effect(text: str) -> bool:
    """Return True if *text*, reduced to lowercase letters only, matches one
    of the known onomatopoeia patterns in SOUND_EFFECT_PATTERNS."""
    letters_only = re.sub(r"[^a-z]", "", (text or "").strip().lower())
    for pattern in SOUND_EFFECT_PATTERNS:
        if re.fullmatch(pattern, letters_only, re.IGNORECASE):
            return True
    return False
|
||||
@@ -194,7 +210,80 @@ def ocr_candidate_score(text: str) -> float:
|
||||
|
||||
|
||||
# ============================================================
# ENHANCED IMAGE PREPROCESSING
# ============================================================
|
||||
def enhance_image_for_ocr(image_bgr, upscale_factor=2.5):
    """Preprocess a BGR image for OCR.

    Pipeline: upscale -> grayscale -> denoise -> CLAHE contrast boost ->
    sharpen -> adaptive threshold -> morphological close.

    Args:
        image_bgr: Input image (BGR, as loaded by OpenCV).
        upscale_factor: Linear scale applied before processing.

    Returns:
        A 3-channel BGR image containing the binarized result, ready for
        the Vision API.
    """
    # Upscale with cubic interpolation so small glyphs gain pixels.
    height, width = image_bgr.shape[:2]
    target_size = (int(width * upscale_factor), int(height * upscale_factor))
    resized = cv2.resize(image_bgr, target_size, interpolation=cv2.INTER_CUBIC)

    # Grayscale + non-local-means denoising.
    grayscale = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
    smoothed = cv2.fastNlMeansDenoising(grayscale, None, h=10, templateWindowSize=7, searchWindowSize=21)

    # Local contrast boost (CLAHE), then a standard 3x3 sharpening kernel.
    boosted = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8)).apply(smoothed)
    sharpen_kernel = np.array([[-1, -1, -1],
                               [-1,  9, -1],
                               [-1, -1, -1]])
    crisp = cv2.filter2D(boosted, -1, sharpen_kernel)

    # Gaussian adaptive threshold gives clean black-on-white text.
    thresholded = cv2.adaptiveThreshold(crisp, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                        cv2.THRESH_BINARY, 11, 2)

    # Close tiny gaps in strokes, then hand back a BGR image for Vision.
    closed = cv2.morphologyEx(thresholded, cv2.MORPH_CLOSE, np.ones((2, 2), np.uint8))
    return cv2.cvtColor(closed, cv2.COLOR_GRAY2BGR)
|
||||
|
||||
|
||||
def detect_small_text_regions(image_bgr, existing_quads):
    """Find small, text-like regions that fall outside *existing_quads*.

    Args:
        image_bgr: Page image (BGR).
        existing_quads: 4-point polygons of detections already made.

    Returns:
        List of (x1, y1, x2, y2) boxes, in image pixel coordinates, for
        candidate regions that may contain missed text.
    """
    grayscale = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)

    # Paint the already-detected quads into a mask, then invert it so that
    # only the uncovered parts of the page remain.
    covered = np.zeros(grayscale.shape, dtype=np.uint8)
    for quad in existing_quads:
        cv2.fillPoly(covered, [np.array(quad, dtype=np.int32)], 255)
    uncovered = cv2.bitwise_not(covered)

    # Otsu-binarize (inverted) and keep only the uncovered portion.
    _, otsu = cv2.threshold(grayscale, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    candidates = cv2.bitwise_and(otsu, otsu, mask=uncovered)

    # Contours of whatever ink remains in the uncovered areas.
    contours, _ = cv2.findContours(candidates, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    regions = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        # Keep blobs whose size and aspect ratio resemble a short run of text.
        if 50 < w * h < 5000 and 0.1 < h / max(w, 1) < 10:
            regions.append((x, y, x + w, y + h))

    return regions
|
||||
|
||||
|
||||
# ============================================================
|
||||
# SPEECH BUBBLE DETECTION
|
||||
# ============================================================
|
||||
def detect_speech_bubbles(image_bgr: np.ndarray) -> List[np.ndarray]:
|
||||
"""Detect speech bubble contours for box splitting"""
|
||||
@@ -302,7 +391,7 @@ def check_vertical_alignment_split(indices: List[int],
|
||||
|
||||
|
||||
# ============================================================
# BOX FIXING FUNCTIONS
# ============================================================
|
||||
def apply_page_specific_fixes(bubbles: Dict[int, List[str]],
|
||||
bubble_boxes: Dict[int, Tuple],
|
||||
@@ -311,12 +400,7 @@ def apply_page_specific_fixes(bubbles: Dict[int, List[str]],
|
||||
ocr: List[Tuple],
|
||||
image_bgr: np.ndarray,
|
||||
page_identifier: str) -> Tuple[Dict, Dict, Dict, Dict]:
|
||||
"""
|
||||
Apply page-specific fixes to bubble detection issues
|
||||
|
||||
Args:
|
||||
page_identifier: Base filename (e.g., "15", "16", "19")
|
||||
"""
|
||||
"""Apply page-specific fixes to bubble detection issues"""
|
||||
|
||||
# Detect speech bubbles for splitting logic
|
||||
bubble_contours = detect_speech_bubbles(image_bgr)
|
||||
@@ -470,6 +554,302 @@ def apply_page_specific_fixes(bubbles: Dict[int, List[str]],
|
||||
return bubbles, bubble_boxes, bubble_quads, bubble_indices
|
||||
|
||||
|
||||
# ============================================================
|
||||
# ENHANCED OCR ENGINE
|
||||
# ============================================================
|
||||
class ImprovedMacVisionDetector:
    """Enhanced Apple Vision OCR engine.

    Runs OCR over several preprocessing variants of the same image and
    merges the per-variant detections with IoU-based clustering plus a
    confidence-weighted text vote.  Results are (quad, text, confidence)
    tuples with quads in original-image pixel coordinates (top-left origin).
    """

    def __init__(self, source_lang="en"):
        """Map a user-facing language code/name to an Apple Vision locale."""
        lang_key = source_lang.lower().strip()

        lang_map = {
            "en": "en-US", "english": "en-US",
            "es": "es-ES", "spanish": "es-ES",
            "ca": "ca-ES", "catalan": "ca-ES",
            "fr": "fr-FR", "french": "fr-FR",
            "ja": "ja-JP", "japanese": "ja-JP",
            "it": "it-IT", "italian": "it-IT",
            "de": "de-DE", "german": "de-DE",
            "ko": "ko-KR", "korean": "ko-KR",
            "zh": "zh-Hans", "chinese": "zh-Hans"
        }

        apple_lang = lang_map.get(lang_key, "en-US")  # default to US English
        self.langs = [apple_lang]
        print(f"⚡ Using Enhanced Apple Vision OCR (Language: {self.langs[0]})")

    def preprocess_variants(self, image_bgr):
        """Generate multiple preprocessing variants of *image_bgr*.

        Every variant is upscaled by the same 2.5x factor so that
        merge_multi_pass_results can scale all coordinates back uniformly.
        """
        variants = []

        # Variant 1: full enhancement pipeline (denoise/CLAHE/sharpen/binarize).
        variants.append(("enhanced", enhance_image_for_ocr(image_bgr, upscale_factor=2.5)))

        # Variant 2: hard Otsu threshold for high-contrast pages.
        gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
        _, high_contrast = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        upscaled_hc = cv2.resize(high_contrast, None, fx=2.5, fy=2.5, interpolation=cv2.INTER_CUBIC)
        variants.append(("high_contrast", cv2.cvtColor(upscaled_hc, cv2.COLOR_GRAY2BGR)))

        # Variant 3: bilateral filter (smooths while preserving glyph edges).
        bilateral = cv2.bilateralFilter(image_bgr, 9, 75, 75)
        upscaled_bil = cv2.resize(bilateral, None, fx=2.5, fy=2.5, interpolation=cv2.INTER_CUBIC)
        variants.append(("bilateral", upscaled_bil))

        # Variant 4: inverted colors, for white text on dark backgrounds.
        inverted = cv2.bitwise_not(image_bgr)
        upscaled_inv = cv2.resize(inverted, None, fx=2.5, fy=2.5, interpolation=cv2.INTER_CUBIC)
        variants.append(("inverted", upscaled_inv))

        # Variant 5: plain upscale of the original.
        upscaled_orig = cv2.resize(image_bgr, None, fx=2.5, fy=2.5, interpolation=cv2.INTER_CUBIC)
        variants.append(("original", upscaled_orig))

        return variants

    def run_vision_ocr(self, image_bgr):
        """Run Vision OCR once on *image_bgr*.

        Returns a list of (quad, text, confidence) with quads as 4-point
        polygons in pixel coordinates of *image_bgr*, top-left origin.
        """
        if image_bgr is None or image_bgr.size == 0:
            return []

        ih, iw = image_bgr.shape[:2]

        success, buffer = cv2.imencode('.png', image_bgr)
        if not success:
            return []

        # Encode the PNG bytes once (the original computed tobytes() twice).
        png_bytes = buffer.tobytes()
        ns_data = NSData.dataWithBytes_length_(png_bytes, len(png_bytes))
        handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(ns_data, None)
        results = []

        def completion_handler(request, error):
            if error:
                return

            for observation in request.results():
                candidate = observation.topCandidates_(1)[0]
                text = candidate.string()
                confidence = candidate.confidence()

                # Vision reports normalized coordinates with a BOTTOM-left
                # origin; convert to pixel coordinates, top-left origin.
                bbox = observation.boundingBox()
                x = bbox.origin.x * iw
                y_bottom_left = bbox.origin.y * ih
                w = bbox.size.width * iw
                h = bbox.size.height * ih

                y = ih - y_bottom_left - h

                quad = [
                    [int(x), int(y)],
                    [int(x + w), int(y)],
                    [int(x + w), int(y + h)],
                    [int(x), int(y + h)]
                ]

                results.append((quad, text, confidence))

        request = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(completion_handler)
        request.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate)
        request.setUsesLanguageCorrection_(False)  # avoid "correcting" manga SFX/names
        request.setRecognitionLanguages_(self.langs)
        # NOTE(review): automatic language detection may override the explicit
        # language list above — confirm this combination is intentional.
        request.setAutomaticallyDetectsLanguage_(True)

        handler.performRequests_error_([request], None)
        return results

    def merge_multi_pass_results(self, all_results, original_shape):
        """Merge detections from multiple preprocessing passes.

        Args:
            all_results: List of (variant_name, results) pairs as produced
                by run_vision_ocr on each preprocessing variant.
            original_shape: Shape of the original image (kept for interface
                compatibility; currently unused).

        Returns:
            List of (quad, text, confidence) in original-image coordinates.
        """
        if not all_results:
            return []

        # Every variant was upscaled by 2.5x, so one uniform factor maps all
        # quads back to original-image coordinates.
        scale_factor = 2.5

        normalized_results = []
        for variant_name, results in all_results:
            for quad, text, conf in results:
                scaled_quad = [[int(p[0] / scale_factor), int(p[1] / scale_factor)] for p in quad]
                normalized_results.append((scaled_quad, text, conf, variant_name))

        def quads_overlap(q1, q2, threshold=0.5):
            """True if the axis-aligned bboxes of q1/q2 have IoU > threshold."""
            b1 = quad_bbox(q1)
            b2 = quad_bbox(q2)

            x1 = max(b1[0], b2[0])
            y1 = max(b1[1], b2[1])
            x2 = min(b1[2], b2[2])
            y2 = min(b1[3], b2[3])

            if x2 < x1 or y2 < y1:
                return False

            intersection = (x2 - x1) * (y2 - y1)
            area1 = (b1[2] - b1[0]) * (b1[3] - b1[1])
            area2 = (b2[2] - b2[0]) * (b2[3] - b2[1])
            union = area1 + area2 - intersection

            return intersection / max(union, 1) > threshold

        # Greedy clustering of overlapping detections across variants.
        clusters = []
        used = set()

        for i, (quad1, text1, conf1, var1) in enumerate(normalized_results):
            if i in used:
                continue

            cluster = [(quad1, text1, conf1, var1)]
            used.add(i)

            for j, (quad2, text2, conf2, var2) in enumerate(normalized_results):
                if j in used or i == j:
                    continue

                if quads_overlap(quad1, quad2, threshold=0.5):
                    cluster.append((quad2, text2, conf2, var2))
                    used.add(j)

            clusters.append(cluster)

        # Pick one detection per cluster: highest confidence wins unless a
        # different normalized text has more confidence-weighted support.
        final_results = []
        for cluster in clusters:
            cluster.sort(key=lambda x: x[2], reverse=True)
            best_quad, best_text, best_conf, _ = cluster[0]

            # Confidence-weighted vote over normalized text; also remember
            # the highest-confidence ORIGINAL string per normalized key.
            # (BUGFIX: the original assigned the normalized string itself to
            # best_text, discarding the detection's casing/punctuation.)
            text_votes = {}
            representative = {}
            for _, text, conf, _ in cluster:
                key = normalize_text(text)
                if key:
                    text_votes[key] = text_votes.get(key, 0) + conf
                    if key not in representative:
                        representative[key] = text  # cluster is conf-sorted

            if text_votes:
                winning_key = max(text_votes.items(), key=lambda kv: kv[1])[0]
                if winning_key != normalize_text(best_text):
                    best_text = representative[winning_key]

            # Apply deterministic OCR-error corrections last.
            best_text = fix_common_ocr_errors(best_text)

            final_results.append((best_quad, best_text, best_conf))

        return final_results

    def read(self, image_path_or_array):
        """Enhanced multi-pass OCR over a file path or BGR numpy array."""
        if isinstance(image_path_or_array, str):
            img = cv2.imread(image_path_or_array)
        else:
            img = image_path_or_array

        if img is None or img.size == 0:
            return []

        original_shape = img.shape

        # OCR each preprocessing variant, keeping only non-empty passes.
        variants = self.preprocess_variants(img)
        all_results = []
        for variant_name, variant_img in variants:
            results = self.run_vision_ocr(variant_img)
            if results:
                all_results.append((variant_name, results))

        # Merge and vote across passes.
        return self.merge_multi_pass_results(all_results, original_shape)
|
||||
|
||||
|
||||
# ============================================================
|
||||
# ORIGINAL OCR ENGINE (Fallback)
|
||||
# ============================================================
|
||||
class MacVisionDetector:
    """Single-pass Apple Vision OCR engine (fallback path).

    Produces (quad, text, confidence) tuples with quads as 4-point
    polygons in pixel coordinates, top-left origin.
    """

    def __init__(self, source_lang="en"):
        """Map a user-facing language code/name to an Apple Vision locale."""
        lang_key = source_lang.lower().strip()

        lang_map = {
            "en": "en-US", "english": "en-US",
            "es": "es-ES", "spanish": "es-ES",
            "ca": "ca-ES", "catalan": "ca-ES",
            "fr": "fr-FR", "french": "fr-FR",
            "ja": "ja-JP", "japanese": "ja-JP",
            "it": "it-IT", "italian": "it-IT",
            "de": "de-DE", "german": "de-DE",
            "ko": "ko-KR", "korean": "ko-KR",
            "zh": "zh-Hans", "chinese": "zh-Hans"
        }

        apple_lang = lang_map.get(lang_key, "en-US")  # default to US English
        self.langs = [apple_lang]
        print(f"⚡ Using Apple Vision OCR (Language: {self.langs[0]})")

    def read(self, image_path_or_array):
        """OCR an image given as a file path or a BGR numpy array."""
        if isinstance(image_path_or_array, str):
            img = cv2.imread(image_path_or_array)
        else:
            img = image_path_or_array

        if img is None or img.size == 0:
            return []

        ih, iw = img.shape[:2]

        success, buffer = cv2.imencode('.png', img)
        if not success:
            return []

        # Encode the PNG bytes once (the original called tobytes() twice).
        png_bytes = buffer.tobytes()
        ns_data = NSData.dataWithBytes_length_(png_bytes, len(png_bytes))
        handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(ns_data, None)
        results = []

        def completion_handler(request, error):
            if error:
                print(f"Vision API Error: {error}")
                return

            for observation in request.results():
                candidate = observation.topCandidates_(1)[0]
                text = candidate.string()
                confidence = candidate.confidence()

                # Vision reports normalized coordinates with a BOTTOM-left
                # origin; convert to pixel coordinates, top-left origin.
                bbox = observation.boundingBox()
                x = bbox.origin.x * iw
                y_bottom_left = bbox.origin.y * ih
                w = bbox.size.width * iw
                h = bbox.size.height * ih

                y = ih - y_bottom_left - h

                quad = [
                    [int(x), int(y)],
                    [int(x + w), int(y)],
                    [int(x + w), int(y + h)],
                    [int(x), int(y + h)]
                ]

                results.append((quad, text, confidence))

        request = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(completion_handler)
        request.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate)
        request.setUsesLanguageCorrection_(True)
        request.setRecognitionLanguages_(self.langs)

        handler.performRequests_error_([request], None)
        return results
|
||||
|
||||
|
||||
# ============================================================
|
||||
# SPLITTERS + QUAD NORMALIZATION
|
||||
# ============================================================
|
||||
@@ -960,84 +1340,6 @@ def merge_close_bubbles_by_line_height(bubbles, bubble_boxes, bubble_quads, bubb
|
||||
return out_b, out_bb, out_bq, out_bi
|
||||
|
||||
|
||||
# ============================================================
|
||||
# OCR ENGINES (Apple Native Vision)
|
||||
# ============================================================
|
||||
class MacVisionDetector:
    """Single-pass Apple Vision OCR engine.

    NOTE(review): this class is defined twice in this file; this later
    definition shadows the earlier, identical one — confirm which to keep.
    """

    def __init__(self, source_lang="en"):
        # Normalize the language argument ("English", " en " -> "en").
        lang_key = source_lang.lower().strip()

        # User-facing codes/names -> Apple Vision locale identifiers.
        lang_map = {
            "en": "en-US", "english": "en-US",
            "es": "es-ES", "spanish": "es-ES",
            "ca": "ca-ES", "catalan": "ca-ES",
            "fr": "fr-FR", "french": "fr-FR",
            "ja": "ja-JP", "japanese": "ja-JP",
            "it": "it-IT", "italian": "it-IT",
            "de": "de-DE", "german": "de-DE",
            "ko": "ko-KR", "korean": "ko-KR",
            "zh": "zh-Hans", "chinese": "zh-Hans"
        }

        apple_lang = lang_map.get(lang_key, "en-US")  # falls back to en-US
        self.langs = [apple_lang]
        print(f"⚡ Using Apple Vision OCR (Language: {self.langs[0]})")

    def read(self, image_path_or_array):
        """OCR an image (file path or BGR array); returns a list of
        (quad, text, confidence) tuples in pixel coordinates."""
        # Accept either a path or an already-loaded BGR array.
        if isinstance(image_path_or_array, str):
            img = cv2.imread(image_path_or_array)
        else:
            img = image_path_or_array

        if img is None or img.size == 0:
            return []

        ih, iw = img.shape[:2]

        # PNG-encode in memory for the Vision request handler.
        success, buffer = cv2.imencode('.png', img)
        if not success:
            return []

        ns_data = NSData.dataWithBytes_length_(buffer.tobytes(), len(buffer.tobytes()))
        handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(ns_data, None)
        results = []

        # Called synchronously by performRequests_error_ below; appends into
        # the enclosing `results` list.
        def completion_handler(request, error):
            if error:
                print(f"Vision API Error: {error}")
                return

            for observation in request.results():
                candidate = observation.topCandidates_(1)[0]
                text = candidate.string()
                confidence = candidate.confidence()

                # Vision reports normalized coordinates with a BOTTOM-left
                # origin; convert to pixel coordinates, top-left origin.
                bbox = observation.boundingBox()
                x = bbox.origin.x * iw
                y_bottom_left = bbox.origin.y * ih
                w = bbox.size.width * iw
                h = bbox.size.height * ih

                y = ih - y_bottom_left - h

                # Axis-aligned 4-point polygon, clockwise from top-left.
                quad = [
                    [int(x), int(y)],
                    [int(x + w), int(y)],
                    [int(x + w), int(y + h)],
                    [int(x), int(y + h)]
                ]

                results.append((quad, text, confidence))

        request = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(completion_handler)
        request.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate)
        request.setUsesLanguageCorrection_(True)
        request.setRecognitionLanguages_(self.langs)

        handler.performRequests_error_([request], None)
        return results
|
||||
|
||||
|
||||
# ============================================================
|
||||
# PREPROCESS
|
||||
# ============================================================
|
||||
@@ -1127,7 +1429,7 @@ def rebuild_text_from_vision_result(res):
|
||||
return normalize_text(" ".join(lines))
|
||||
|
||||
|
||||
def reread_bubble_with_vision(image_bgr, bbox_xyxy, vision_detector: MacVisionDetector, upscale=3.0, pad=24):
|
||||
def reread_bubble_with_vision(image_bgr, bbox_xyxy, vision_detector, upscale=3.0, pad=24):
|
||||
ih, iw = image_bgr.shape[:2]
|
||||
x1, y1, x2, y2 = bbox_xyxy
|
||||
x1 = max(0, int(x1 - pad)); y1 = max(0, int(y1 - pad))
|
||||
@@ -1148,7 +1450,13 @@ def reread_bubble_with_vision(image_bgr, bbox_xyxy, vision_detector: MacVisionDe
|
||||
proc3 = cv2.cvtColor(proc, cv2.COLOR_GRAY2BGR) if len(proc.shape) == 2 else proc
|
||||
for a in angles:
|
||||
rot = rotate_image_keep_bounds(proc3, a)
|
||||
|
||||
# Use run_vision_ocr if available (enhanced detector)
|
||||
if hasattr(vision_detector, 'run_vision_ocr'):
|
||||
res = vision_detector.run_vision_ocr(rot)
|
||||
else:
|
||||
res = vision_detector.read(rot)
|
||||
|
||||
txt = rebuild_text_from_vision_result(res)
|
||||
sc = ocr_candidate_score(txt)
|
||||
if sc > best_v_sc:
|
||||
@@ -1469,13 +1777,13 @@ def export_bubbles(filepath, bbox_dict, quads_dict, indices_dict, ocr, reading_m
|
||||
|
||||
|
||||
# ============================================================
# MAIN PIPELINE
# ============================================================
|
||||
def translate_manga_text(
|
||||
image_path="001-page.png",
|
||||
source_lang="en",
|
||||
target_lang="ca",
|
||||
confidence_threshold=0.05,
|
||||
confidence_threshold=0.03,
|
||||
min_text_length=1,
|
||||
gap_px="auto",
|
||||
filter_sound_effects=True,
|
||||
@@ -1483,7 +1791,8 @@ def translate_manga_text(
|
||||
export_to_file="output.txt",
|
||||
export_bubbles_to="bubbles.json",
|
||||
reading_mode="ltr",
|
||||
debug=True
|
||||
debug=True,
|
||||
use_enhanced_ocr=True
|
||||
):
|
||||
image = cv2.imread(image_path)
|
||||
if image is None:
|
||||
@@ -1493,12 +1802,49 @@ def translate_manga_text(
|
||||
resolved_gap = auto_gap(image_path) if gap_px == "auto" else float(gap_px)
|
||||
|
||||
print("Loading OCR engines...")
|
||||
|
||||
# Use enhanced detector
|
||||
if use_enhanced_ocr:
|
||||
detector = ImprovedMacVisionDetector(source_lang=source_lang)
|
||||
print("🚀 Using Enhanced Multi-Pass OCR")
|
||||
else:
|
||||
detector = MacVisionDetector(source_lang=source_lang)
|
||||
|
||||
print("Running detection OCR (Apple Vision)...")
|
||||
raw = detector.read(image_path)
|
||||
print(f"Raw detections: {len(raw)}")
|
||||
|
||||
# Secondary pass for missed regions
|
||||
if use_enhanced_ocr:
|
||||
existing_quads = [r[0] for r in raw]
|
||||
missed_regions = detect_small_text_regions(image, existing_quads)
|
||||
|
||||
if missed_regions:
|
||||
print(f"🔍 Found {len(missed_regions)} potentially missed text regions")
|
||||
|
||||
# Re-run OCR on missed regions with higher upscaling
|
||||
for region in missed_regions:
|
||||
x1, y1, x2, y2 = region
|
||||
# Add padding
|
||||
pad = 10
|
||||
x1 = max(0, x1 - pad)
|
||||
y1 = max(0, y1 - pad)
|
||||
x2 = min(image.shape[1], x2 + pad)
|
||||
y2 = min(image.shape[0], y2 + pad)
|
||||
|
||||
crop = image[y1:y2, x1:x2]
|
||||
if crop.size > 0:
|
||||
# Aggressive upscaling for small text
|
||||
upscaled = cv2.resize(crop, None, fx=4.0, fy=4.0, interpolation=cv2.INTER_CUBIC)
|
||||
region_results = detector.run_vision_ocr(upscaled)
|
||||
|
||||
# Scale back and offset coordinates
|
||||
for quad, text, conf in region_results:
|
||||
scaled_quad = [[int(p[0]/4.0 + x1), int(p[1]/4.0 + y1)] for p in quad]
|
||||
raw.append((scaled_quad, text, conf))
|
||||
|
||||
print(f"📝 Total detections after missed region scan: {len(raw)}")
|
||||
|
||||
filtered = []
|
||||
skipped = 0
|
||||
ih, iw = image.shape[:2]
|
||||
@@ -1533,7 +1879,7 @@ def translate_manga_text(
|
||||
if splits_made > 0:
|
||||
print(f"✂️ Split {splits_made} wide OCR lines across column gaps.")
|
||||
|
||||
# 2) split giant bridge quads (fixes page16 BOX19-like glue)
|
||||
# 2) split giant bridge quads
|
||||
filtered, bridge_splits = split_abnormal_bridge_quads(image, filtered)
|
||||
if bridge_splits > 0:
|
||||
print(f"🧩 Split {bridge_splits} abnormal bridge OCR quad(s).")
|
||||
@@ -1545,14 +1891,12 @@ def translate_manga_text(
|
||||
filtered, image.shape, gap_px=resolved_gap, bbox_padding=1
|
||||
)
|
||||
|
||||
# merge accidental sibling fragments (fixes page15 BOX11+BOX16 style)
|
||||
# merge accidental sibling fragments
|
||||
bubbles, bubble_boxes, bubble_quads, bubble_indices = merge_close_bubbles_by_line_height(
|
||||
bubbles, bubble_boxes, bubble_quads, bubble_indices, filtered
|
||||
)
|
||||
|
||||
# ============================================================
|
||||
# APPLY PAGE-SPECIFIC FIXES (NEW)
|
||||
# ============================================================
|
||||
# Apply page-specific fixes
|
||||
page_identifier = os.path.basename(image_path)
|
||||
bubbles, bubble_boxes, bubble_quads, bubble_indices = apply_page_specific_fixes(
|
||||
bubbles, bubble_boxes, bubble_quads, bubble_indices,
|
||||
@@ -1760,16 +2104,17 @@ def translate_manga_text(
|
||||
|
||||
# Script entry point.  (BUGFIX: the previous version listed several keyword
# arguments twice — image_path, confidence_threshold, reading_mode, debug —
# which is a SyntaxError in Python; the duplicates are removed, keeping the
# newer value of each.)
if __name__ == "__main__":
    translate_manga_text(
        image_path="16.jpg",
        source_lang="english",
        target_lang="ca",
        confidence_threshold=0.03,  # lower threshold for better detection
        min_text_length=1,
        gap_px="auto",
        filter_sound_effects=True,
        quality_threshold=0.62,
        export_to_file="output.txt",
        export_bubbles_to="bubbles.json",
        reading_mode="ltr",  # "ltr" or "rtl"
        debug=True,
        use_enhanced_ocr=True,  # enable enhanced multi-pass OCR
    )
|
||||
Reference in New Issue
Block a user