import gradio as gr
import mediapipe as mp
import cv2
import numpy as np
import time
from collections import deque
from transformers import pipeline
import torch
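# Assumed environment (not pinned in the original source): gradio 4.x or
# newer (for the `sources=` keyword on gr.Image), mediapipe, opencv-python,
# numpy, torch, and transformers (NLLB also needs the sentencepiece package
# for its tokenizer).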
class OptimizedSignDetector:
    def __init__(self):
        # Initialize MediaPipe Hands with speed-oriented settings
        self.mp_hands = mp.solutions.hands
        self.hands = self.mp_hands.Hands(
            static_image_mode=False,
            max_num_hands=2,
            min_detection_confidence=0.7,
            min_tracking_confidence=0.5,
            model_complexity=0  # Lightest model, fastest inference
        )
        self.mp_drawing = mp.solutions.drawing_utils

        # Sign detection history (bounded deque keeps memory constant)
        self.detected_signs = deque(maxlen=100)
        self.current_word = ""
        self.word_buffer = []

        # Frame processing optimization
        self.frame_skip = 2  # Process every 2nd frame
        self.frame_count = 0
        self.last_detection_time = 0
        self.detection_cooldown = 0.5  # 500 ms between detections

        # Available sign languages
        self.sign_languages = {
            "ASL": "American Sign Language",
            "BSL": "British Sign Language",
            "ISL": "Indian Sign Language",
            "CSL": "Chinese Sign Language",
            "FSL": "French Sign Language",
            "GSL": "German Sign Language",
            "JSL": "Japanese Sign Language"
        }
        # Popular translation targets, mapped to the FLORES-200 codes that
        # NLLB expects (plain ISO codes such as "es" are not valid
        # src_lang/tgt_lang values for this model)
        self.translation_languages = {
            "English": "eng_Latn",
            "Spanish": "spa_Latn",
            "French": "fra_Latn",
            "German": "deu_Latn",
            "Italian": "ita_Latn",
            "Portuguese": "por_Latn",
            "Russian": "rus_Cyrl",
            "Japanese": "jpn_Jpan",
            "Korean": "kor_Hang",
            "Chinese": "zho_Hans",
            "Arabic": "arb_Arab",
            "Hindi": "hin_Deva",
            "Yoruba": "yor_Latn",
            "Igbo": "ibo_Latn",
            "Hausa": "hau_Latn"
        }
        # Initialize translator (GPU if available, otherwise CPU)
        self.translator = pipeline(
            "translation",
            model="facebook/nllb-200-distilled-600M",
            device=0 if torch.cuda.is_available() else -1
        )
        # Sign recognition patterns (simplified for demo)
        self.sign_patterns = self.load_sign_patterns()

    def load_sign_patterns(self):
        """Load sign language patterns for different languages."""
        return {
            "ASL": {
                "hello": [0.1, 0.2, 0.3],  # Simplified landmark pattern
                "thank_you": [0.4, 0.5, 0.6],
                "goodbye": [0.7, 0.8, 0.9]
            },
            "BSL": {
                "hello": [0.11, 0.21, 0.31],
                "thank_you": [0.41, 0.51, 0.61],
                "goodbye": [0.71, 0.81, 0.91]
            }
        }
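    # NOTE: the 3-value vectors above are toy placeholders. Because of the
    # length guard in match_sign_pattern they are compared against only the
    # first three of the 15 fingertip features, so a real deployment would
    # store full 15-value templates per sign, or train a classifier (see
    # the KNNSignMatcher sketch after this class).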
    def optimize_frame(self, frame):
        """Downscale the frame for faster processing."""
        height, width = frame.shape[:2]
        if width > 640:
            scale = 640 / width
            new_width = 640
            new_height = int(height * scale)
            frame = cv2.resize(frame, (new_width, new_height))
        # Gradio delivers webcam frames as RGB numpy arrays, which is the
        # channel order MediaPipe expects, so no cv2 BGR->RGB conversion is
        # needed here (converting would swap the channels incorrectly)
        return frame
    def extract_hand_features(self, landmarks):
        """Extract scale-invariant features from hand landmarks."""
        if not landmarks:
            return None

        # MediaPipe Hands yields 21 landmarks; stack them into an array
        points = np.array([[lm.x, lm.y, lm.z] for lm in landmarks.landmark])

        # Positions relative to the wrist (landmark 0) are more robust
        wrist = points[0]
        relative_points = points - wrist

        # Normalize by the farthest landmark to make the features scale-invariant
        max_dist = np.max(np.linalg.norm(relative_points, axis=1))
        if max_dist > 0:
            relative_points = relative_points / max_dist

        # Keep the five fingertips (indices 4, 8, 12, 16, 20:
        # thumb, index, middle, ring, pinky)
        key_points = relative_points[[4, 8, 12, 16, 20]]
        return key_points.flatten()  # 5 points x 3 coords = 15 values
    def detect_sign(self, frame, sign_language="ASL"):
        """Detect a sign from a frame, with frame skipping and a cooldown."""
        current_time = time.time()

        # Skip frames for speed
        self.frame_count += 1
        if self.frame_count % self.frame_skip != 0:
            return None

        # Cooldown between detections
        if current_time - self.last_detection_time < self.detection_cooldown:
            return None

        # Downscale the frame, then run MediaPipe
        frame_rgb = self.optimize_frame(frame)
        results = self.hands.process(frame_rgb)

        if results.multi_hand_landmarks:
            for hand_landmarks in results.multi_hand_landmarks:
                features = self.extract_hand_features(hand_landmarks)
                if features is not None:
                    # Simple pattern matching (replace with an ML model)
                    detected_sign = self.match_sign_pattern(features, sign_language)
                    if detected_sign:
                        self.last_detection_time = current_time
                        return detected_sign
        return None
    def match_sign_pattern(self, features, sign_language):
        """Match features to sign patterns."""
        patterns = self.sign_patterns.get(sign_language, {})

        # Simple distance-based matching (replace with a proper ML model)
        min_distance = float('inf')
        best_match = None
        for sign, pattern in patterns.items():
            if len(features) >= len(pattern):
                distance = np.mean((features[:len(pattern)] - np.array(pattern)) ** 2)
                if distance < min_distance and distance < 0.1:  # Acceptance threshold
                    min_distance = distance
                    best_match = sign
        return best_match
    def process_video_stream(self, frame, sign_language, target_language):
        """Process a video frame and return detection results."""
        detected_sign = self.detect_sign(frame, sign_language)

        if detected_sign:
            # Record the detection
            self.detected_signs.append({
                "sign": detected_sign,
                "timestamp": time.time(),
                "language": sign_language
            })
            self.current_word = detected_sign

            # Build a sentence from the last 10 detected signs
            recent_signs = list(self.detected_signs)[-10:]
            sentence = " ".join(s["sign"] for s in recent_signs)

            # Translate if a non-English target is selected
            translated_text = self.translate_text(sentence, target_language)

            return {
                "current_sign": detected_sign,
                "sentence": sentence,
                "translation": translated_text,
                "sign_array": [s["sign"] for s in self.detected_signs]
            }

        return {
            "current_sign": self.current_word,
            "sentence": " ".join(s["sign"] for s in list(self.detected_signs)[-10:]),
            "translation": "",
            "sign_array": [s["sign"] for s in self.detected_signs]
        }
    def translate_text(self, text, target_language):
        """Translate English text to the target language."""
        if not text or target_language == "English":
            return text
        try:
            target_code = self.translation_languages.get(target_language, "eng_Latn")
            result = self.translator(text, src_lang="eng_Latn", tgt_lang=target_code)
            return result[0]['translation_text']
        except Exception:
            # Fall back to the untranslated text rather than crashing the stream
            return text
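    # Example for translate_text above (hypothetical input):
    # translate_text("hello thank_you", "Spanish") calls the NLLB pipeline
    # with src_lang="eng_Latn", tgt_lang="spa_Latn" and returns
    # result[0]['translation_text'].
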
    def clear_detections(self):
        """Clear the detection history."""
        self.detected_signs.clear()
        self.current_word = ""
# Initialize detector
detector = OptimizedSignDetector()

def process_video(video_frame, sign_language, target_language):
    """Process a video frame and return values for the four output components."""
    if video_frame is None:
        return "", "", "", []
    result = detector.process_video_stream(video_frame, sign_language, target_language)
    return (
        result["current_sign"],
        result["sentence"],
        result["translation"],
        result["sign_array"]
    )
def clear_all():
    """Clear all detections and reset the four output components."""
    detector.clear_detections()
    return "", "", "", []
# Create Gradio interface
with gr.Blocks(title="Advanced Sign Language Interpreter") as demo:
    gr.Markdown("# 🤟 Advanced Sign Language Interpreter")
    gr.Markdown("Real-time sign language detection with translation to multiple languages")

    with gr.Row():
        with gr.Column(scale=2):
            # Video input (Gradio 4 renamed gr.Image's `source=` keyword
            # to `sources=`; the old keyword raises a TypeError at startup)
            video_input = gr.Image(
                sources=["webcam"],
                streaming=True,
                label="Camera Feed"
            )
            with gr.Row():
                sign_language = gr.Dropdown(
                    choices=list(detector.sign_languages.keys()),
                    value="ASL",
                    label="Sign Language"
                )
                target_language = gr.Dropdown(
                    choices=list(detector.translation_languages.keys()),
                    value="English",
                    label="Target Language"
                )
        with gr.Column(scale=1):
            # Outputs
            current_sign = gr.Textbox(
                label="Current Sign",
                interactive=False
            )
            sentence = gr.Textbox(
                label="Detected Sentence",
                lines=3,
                interactive=False
            )
            translation = gr.Textbox(
                label="Translation",
                lines=3,
                interactive=False
            )
            sign_array = gr.JSON(
                label="Sign History Array",
                visible=True
            )
            clear_btn = gr.Button("Clear All", variant="secondary")
    # Stream webcam frames through the detector
    video_input.stream(
        fn=process_video,
        inputs=[video_input, sign_language, target_language],
        outputs=[current_sign, sentence, translation, sign_array],
        time_limit=60  # Stop each streaming session after 60 s
    )

    # Clear button resets the history and all four outputs
    clear_btn.click(
        fn=clear_all,
        outputs=[current_sign, sentence, translation, sign_array]
    )
    # Instructions
    gr.Markdown("""
    ## How to Use:
    1. **Select Sign Language**: Choose from ASL, BSL, ISL, etc.
    2. **Select Target Language**: Choose the translation language
    3. **Start Signing**: Detected signs are stored in the history array
    4. **View Results**: See the current sign, sentence, and translation
    5. **Clear History**: Use the Clear All button to reset

    ## Optimizations:
    - ⚡ Frame skipping for speed
    - 🎯 Improved hand detection
    - 💾 Array-based sign storage
    - 🌍 Multiple sign languages
    - 🌐 Real-time translation
    """)
if __name__ == "__main__":
    demo.launch()