Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -494,6 +494,58 @@ def init_ocr_model():
|
|
494 |
if ocr_model is None:
|
495 |
ocr_model = PaddleOCR(use_angle_cls=True, lang="ch")
|
496 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
497 |
def ocr_frame_worker(args):
|
498 |
frame_idx, frame_time, frame = args
|
499 |
|
@@ -509,7 +561,8 @@ def ocr_frame_worker(args):
|
|
509 |
frame = frame.astype(np.uint8)
|
510 |
|
511 |
try:
|
512 |
-
|
|
|
513 |
texts = [line[1][0] for line in result[0]] if result[0] else []
|
514 |
combined_text = " ".join(texts).strip()
|
515 |
return {"time": frame_time, "text": combined_text}
|
|
|
494 |
if ocr_model is None:
|
495 |
ocr_model = PaddleOCR(use_angle_cls=True, lang="ch")
|
496 |
|
497 |
+
def find_best_subtitle_region(frame, ocr_model, region_height_ratio=0.35, num_strips=5, min_conf=0.5):
    """
    Automatically identify the best subtitle region in a video frame using OCR confidence.

    The bottom ``region_height_ratio`` portion of the frame is divided into
    ``num_strips`` horizontal strips; each strip is run through OCR and the
    strip with the highest average recognition confidence wins.

    Parameters:
    - frame: full video frame (BGR np.ndarray, shape (H, W, C))
    - ocr_model: a loaded PaddleOCR model (must expose ``.ocr(img, cls=True)``)
    - region_height_ratio: portion of image height to scan (from bottom up)
    - num_strips: how many horizontal strips to evaluate
    - min_conf: minimum average confidence to consider a region valid

    Returns:
    - crop_region: the cropped image region with highest OCR confidence
    - region_box: (y_start, y_end) of the region in the original frame
    """
    height, width, _ = frame.shape
    region_height = int(height * region_height_ratio)
    base_y_start = height - region_height
    # Guard against zero-height strips on very small frames (region_height < num_strips).
    strip_height = max(1, region_height // num_strips)

    best_score = -1
    best_crop = None
    best_bounds = (0, height)

    for i in range(num_strips):
        y_start = base_y_start + i * strip_height
        if y_start >= height:
            break
        # Extend the final strip to the bottom edge so integer division of
        # region_height by num_strips never leaves the lowest rows unscanned.
        y_end = height if i == num_strips - 1 else y_start + strip_height
        strip = frame[y_start:y_end, :]

        try:
            result = ocr_model.ocr(strip, cls=True)
            if not result or not result[0]:
                continue

            # Each PaddleOCR line is (box, (text, confidence)); average the confidences.
            total_score = sum(line[1][1] for line in result[0])
            avg_score = total_score / len(result[0])

            if avg_score > best_score:
                best_score = avg_score
                best_crop = strip
                best_bounds = (y_start, y_end)

        except Exception:
            # Deliberately best-effort: a strip that fails OCR simply does not compete.
            continue

    if best_score >= min_conf and best_crop is not None:
        return best_crop, best_bounds
    # Fallback: no strip was confident enough — use the bottom 20% of the frame.
    fallback_y = height - int(height * 0.2)
    return frame[fallback_y:, :], (fallback_y, height)
|
548 |
+
|
549 |
def ocr_frame_worker(args):
|
550 |
frame_idx, frame_time, frame = args
|
551 |
|
|
|
561 |
frame = frame.astype(np.uint8)
|
562 |
|
563 |
try:
|
564 |
+
subtitle_crop, _ = find_best_subtitle_region(frame, ocr_model)
|
565 |
+
result = ocr_model.ocr(subtitle_crop, cls=True)
|
566 |
texts = [line[1][0] for line in result[0]] if result[0] else []
|
567 |
combined_text = " ".join(texts).strip()
|
568 |
return {"time": frame_time, "text": combined_text}
|