Smriti77 committed
Commit 18cb951 · verified · 1 Parent(s): b1b201f

Created app.py

Files changed (1)
  1. app.py +514 -0
app.py ADDED
@@ -0,0 +1,514 @@
+ import os
+ import json
+ import numpy as np
+ import subprocess
+ import faiss
+ import cv2
+ import re
+ import gradio as gr
+ from sentence_transformers import SentenceTransformer
+ from openai import OpenAI
+ import logging
+ from PIL import Image
+ import base64
+ import io
+
+
+ deepseek_api_key = os.environ.get("DEEPSEEK_API_KEY", "YOUR_API_KEY")
+ client = OpenAI(
+     base_url="https://openrouter.ai/api/v1",
+     api_key=deepseek_api_key,
+ )
+
+ DATASET_PATH = "data"
+ JSON_PATH = f"{DATASET_PATH}/sign_language_data.json"
+
+ if os.path.exists(JSON_PATH):
+     with open(JSON_PATH, "r") as f:
+         dataset = json.load(f)
+
+     for item in dataset:
+         category = item["category"].lower().replace(" ", "_")
+
+         video_filename = os.path.basename(item["video_clip_path"])
+         item["video_clip_path"] = f"{DATASET_PATH}/clips/{category}/{video_filename}"
+
+         frame_filename = os.path.basename(item["frame_path"])
+         item["frame_path"] = f"{DATASET_PATH}/all_signs/{frame_filename}"
+ else:
+     dataset = []
+     print(f"Warning: {JSON_PATH} does not exist. Using empty dataset.")
+
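+ # For reference, each entry in sign_language_data.json is expected to look roughly
+ # like the following (field names are taken from how they are used in this file;
+ # the concrete values are only illustrative):
+ # {
+ #     "text": "hello",
+ #     "semantic_meaning": ["hi", "greetings"],
+ #     "category": "Greetings",
+ #     "video_clip_path": "clips/greetings/hello.mp4",
+ #     "frame_path": "all_signs/hello.jpg"
+ # }
+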
+ # Configure logging
+ logging.getLogger("sentence_transformers").setLevel(logging.ERROR)
+
+ # Load embedding model
+ print("Loading sentence transformer model...")
+ embed_model = SentenceTransformer("all-MiniLM-L6-v2")
+
+ # Create FAISS index
+ dimension = 384  # embedding size of all-MiniLM-L6-v2
+ index = faiss.IndexFlatL2(dimension)
+ text_to_video = {}
+ idx_to_text = []
+
+ # Add data to index
+ for item in dataset:
+     phrases = [item["text"]] + item.get("semantic_meaning", [])
+
+     for phrase in phrases:
+         embedding = embed_model.encode(phrase).astype(np.float32)
+         index.add(np.array([embedding]))
+         text_to_video[phrase] = item["video_clip_path"]
+         idx_to_text.append(phrase)
+
+ print(f"Indexed {len(idx_to_text)} phrases")
+
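+ # Note: IndexFlatL2 scores are squared L2 distances, so *smaller* values mean
+ # closer matches; retrieve_video() below therefore only accepts a hit when the
+ # distance falls under its threshold.
+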
+ def list_available_phrases():
+     print("Available phrases in dataset:")
+     for idx, phrase in enumerate(text_to_video.keys()):
+         print(f"{idx+1}. '{phrase}'")
+     print(f"Total: {len(text_to_video)} phrases")
+
+
+ def preprocess_text(text):
+     # Remove emojis and special characters
+     emoji_pattern = re.compile("["
+         u"\U0001F600-\U0001F64F"
+         u"\U0001F300-\U0001F5FF"
+         u"\U0001F680-\U0001F6FF"
+         u"\U0001F700-\U0001F77F"
+         u"\U0001F780-\U0001F7FF"
+         u"\U0001F800-\U0001F8FF"
+         u"\U0001F900-\U0001F9FF"
+         u"\U0001FA00-\U0001FA6F"
+         u"\U0001FA70-\U0001FAFF"
+         u"\U00002702-\U000027B0"
+         u"\U000024C2-\U0001F251"
+         "]+", flags=re.UNICODE)
+
+     text = emoji_pattern.sub(r'', text)
+     text = re.sub(r'[^\w\s\?\/]', '', text)
+     text = re.sub(r'\s+', ' ', text).strip()
+
+     return text
+
+
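+ # Example of what preprocess_text() does (illustrative input):
+ #   preprocess_text("Are you ready? 🚀!!")  ->  "Are you ready?"
+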
+ def refine_sentence_with_deepseek(text):
+     # Clean the input
+     text = preprocess_text(text)
+
+     prompt = f"""
+     Convert the following sentence into a sign-language-friendly version:
+     - Remove unnecessary words like articles (a, an, the).
+     - Keep essential words like pronouns (I, you, we, they).
+     - Maintain question words (what, where, when, why, how).
+     - Ensure verbs and key actions are included.
+     - Reorder words to match sign language grammar.
+     - IMPORTANT: Format your response with "SIGN_LANGUAGE_VERSION: [your simplified phrase]" at the beginning.
+     - Sign language often places topic first, then comment (e.g., "READY YOU?" instead of "YOU READY?").
+
+     Sentence: "{text}"
+     """
+
+     try:
+         completion = client.chat.completions.create(
+             model="deepseek/deepseek-r1:free",
+             messages=[{"role": "user", "content": prompt}],
+             temperature=0.3
+         )
+
+         full_response = completion.choices[0].message.content.strip()
+
+         patterns = [
+             r"SIGN_LANGUAGE_VERSION:\s*(.+?)(?:\n|$)",
+             r"\*\*Signs?\*\*:?\s*(.+?)(?:\n|$)",
+             r"\*\*Sign-language-friendly version:\*\*\s*(.+?)(?:\n|$)",
+             r"(?:^|\n)([A-Z\s\?\!]+)(?:\n|$)"
+         ]
+
+         for pattern in patterns:
+             match = re.search(pattern, full_response, re.MULTILINE)
+             if match:
+                 refined_text = match.group(1).strip()
+                 return refined_text
+
+         first_line = full_response.split('\n')[0].strip()
+         return first_line
+
+     except Exception as e:
+         print(f"Error with DeepSeek API: {str(e)}")
+         # Fallback to basic word filtering
+         words = text.split()
+         filtered_words = [w for w in words if w.lower() not in ['a', 'an', 'the', 'is', 'are', 'am']]
+         return ' '.join(filtered_words)
+
+
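+ # Illustrative only: for an input like "Are you ready to go?", the model is asked to
+ # answer with something like "SIGN_LANGUAGE_VERSION: READY GO YOU?", from which the
+ # patterns above pull out "READY GO YOU?". The actual output depends on the model.
+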
+ def retrieve_video(text, debug=False, similarity_threshold=0.7):
+     # Check for empty input
+     if not text or text.isspace():
+         return None
+
+     text = preprocess_text(text)
+
+     if debug:
+         print(f"Creating embedding for '{text}'")
+
+     # Handle special case for "I"
+     if text.lower() == "i":
+         if "I/me" in text_to_video:
+             if debug:
+                 print(f"Direct mapping found: '{text}' → 'I/me'")
+             return text_to_video["I/me"]
+
+     if index.ntotal == 0:
+         if debug:
+             print("No items in the index")
+         return None
+
+     query_embedding = embed_model.encode(text).astype(np.float32)
+     distances, closest_idx = index.search(np.array([query_embedding]), min(3, index.ntotal))  # Get top matches
+
+     closest_texts = [idx_to_text[idx] for idx in closest_idx[0]]
+     similarity_scores = distances[0]
+
+     if debug:
+         print(f"Top matches for '{text}':")
+         for i, (phrase, score) in enumerate(zip(closest_texts, similarity_scores)):
+             print(f"  {i+1}. '{phrase}' (score: {score:.4f})")
+
+     if len(similarity_scores) > 0 and similarity_scores[0] < similarity_threshold:
+         closest_text = closest_texts[0]
+         query_word_count = len(text.split())
+         match_word_count = len(closest_text.split())
+
+         if query_word_count > 1 and match_word_count == 1:
+             if debug:
+                 print(f"Rejecting single-word match '{closest_text}' for multi-word query '{text}'")
+             return None
+
+         if debug:
+             print(f"Found match: '{closest_text}' with score {similarity_scores[0]:.4f}")
+         return text_to_video.get(closest_text, None)
+     else:
+         if debug:
+             print(f"No match found with similarity below threshold {similarity_threshold}")
+         return None
+
+
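+ # Hypothetical usage: retrieve_video("thank you", debug=True) returns the mapped
+ # clip path if "thank you" (or a close enough phrase) is indexed, otherwise None.
+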
+ def merge_videos(video_list, output_path="temp/output.mp4"):
+     # Ensure temp directory exists
+     os.makedirs("temp", exist_ok=True)
+
+     if not video_list:
+         return None
+
+     if len(video_list) == 1:
+         os.system(f"cp '{video_list[0]}' '{output_path}'")
+         return output_path
+
+     for path in video_list:
+         if not os.path.exists(path):
+             print(f"Warning: Video path does not exist: {path}")
+             return None
+
+     # Use absolute paths: the concat demuxer resolves relative entries against the
+     # directory of the list file (temp/), not the working directory.
+     with open("temp/video_list.txt", "w") as f:
+         for path in video_list:
+             f.write(f"file '{os.path.abspath(path)}'\n")
+
+     command = f"ffmpeg -y -f concat -safe 0 -i temp/video_list.txt -c copy '{output_path}'"
+     process = subprocess.run(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+     if process.returncode != 0:
+         print(f"FFmpeg error: {process.stderr.decode()}")
+         return None
+
+     return output_path
+
+
+ def save_video(video_path, output_path="temp/display_output.mp4"):
+     os.makedirs("temp", exist_ok=True)
+
+     if not video_path or not os.path.exists(video_path):
+         return None
+
+     if video_path != output_path:
+         os.system(f"cp '{video_path}' '{output_path}'")
+     return output_path
+
+
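+ # Note: merge_videos() stream-copies with ffmpeg's concat demuxer (-c copy), which
+ # assumes all clips share the same codec, resolution and frame rate; mismatched
+ # clips would need re-encoding instead.
+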
+ def text_to_sign_pipeline(user_input, debug=False):
+     user_input = preprocess_text(user_input)
+
+     if debug:
+         print(f"Processing input: '{user_input}'")
+
+     has_multiple_words = len(user_input.split()) > 1
+
+     if not has_multiple_words:
+         direct_video = retrieve_video(user_input, debug=debug)
+         if direct_video:
+             if debug:
+                 print(f"Single word match found for '{user_input}'")
+             return save_video(direct_video)
+
+     sign_friendly_sentence = refine_sentence_with_deepseek(user_input)
+     if debug:
+         print(f"DeepSeek refined input to: '{sign_friendly_sentence}'")
+
+     full_sentence_video = retrieve_video(sign_friendly_sentence, debug=debug)
+     if full_sentence_video:
+         if debug:
+             print(f"Found full sentence match for '{sign_friendly_sentence}'")
+         return save_video(full_sentence_video)
+
+     words = sign_friendly_sentence.split()
+     video_paths = []
+
+     if debug:
+         print(f"No full sentence match. Trying word-by-word approach for: {words}")
+
+     for word in words:
+         clean_word = preprocess_text(word).replace('?', '')
+         if not clean_word or clean_word.isspace():
+             continue
+
+         word_video = retrieve_video(clean_word, debug=debug)
+         if word_video:
+             print(f"Found video for word: '{clean_word}'")
+             video_paths.append(word_video)
+         else:
+             print(f"No video found for word: '{clean_word}'")
+
+     if not video_paths:
+         print("No videos found for any words in the sentence")
+         return None
+
+     if debug:
+         print(f"Found videos for {len(video_paths)} words, merging...")
+
+     merged_video = merge_videos(video_paths)
+     return save_video(merged_video)
+
+
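+ # Lookup order in text_to_sign_pipeline(): a direct single-word match first, then
+ # the DeepSeek-refined sentence as a whole, and finally a word-by-word merge of clips.
+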
+ def encode_image_to_base64(image_path):
+     with open(image_path, "rb") as image_file:
+         return base64.b64encode(image_file.read()).decode('utf-8')
+
+
+ def preprocess_image(image_path):
+     img = cv2.imread(image_path)
+     if img is None:
+         return None
+
+     height, width = img.shape[:2]
+
+     # Keep only the right half of the frame, where the text label is expected to appear
+     right_side = img[:, width//2:width]
+
+     os.makedirs("temp", exist_ok=True)
+     cropped_path = "temp/cropped_image.jpg"
+     cv2.imwrite(cropped_path, right_side)
+
+     return cropped_path
+
+
+ def detect_text_in_image(image_path, debug=False):
+     base64_image = encode_image_to_base64(image_path)
+
+     prompt = """
+     Is there any prominent text label or sign language text in this image?
+     Answer with ONLY "YES" or "NO".
+     """
+
+     try:
+         completion = client.chat.completions.create(
+             model="qwen/qwen-vl-plus:free",
+             messages=[
+                 {
+                     "role": "user",
+                     "content": [
+                         {"type": "text", "text": prompt},
+                         {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
+                     ]
+                 }
+             ],
+             temperature=0.3
+         )
+
+         response = completion.choices[0].message.content.strip().upper()
+
+         if debug:
+             print(f"Text detection response: {response}")
+
+         return "YES" in response
+
+     except Exception as e:
+         if debug:
+             print(f"Error in text detection: {str(e)}")
+         return False
+
+
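+ # image_to_text_with_qwen() below works in two stages: if the detector says the image
+ # contains a text label, it crops the right half (where the label is assumed to be)
+ # and asks Qwen-VL to read it; otherwise it falls back to a one-word caption.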
+ def image_to_text_with_qwen(image_path, debug=False):
+     base64_image = encode_image_to_base64(image_path)
+
+     has_text = detect_text_in_image(image_path, debug)
+
+     if has_text:
+         cropped_image_path = preprocess_image(image_path)
+         if cropped_image_path:
+             cropped_base64 = encode_image_to_base64(cropped_image_path)
+
+             prompt = """
+             Extract ONLY the main text label from this image. I'm looking for a single word or short phrase
+             that appears as the main text (like "AFTERNOON"). Ignore any numbers, categories, or other text.
+
+             Provide ONLY the extracted text without any other explanation or context.
+             """
+
+             try:
+                 completion = client.chat.completions.create(
+                     model="qwen/qwen-vl-plus:free",
+                     messages=[
+                         {
+                             "role": "user",
+                             "content": [
+                                 {"type": "text", "text": prompt},
+                                 {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{cropped_base64}"}}
+                             ]
+                         }
+                     ],
+                     temperature=0.3
+                 )
+
+                 response = completion.choices[0].message.content.strip()
+
+                 if debug:
+                     print(f"Qwen VL text extraction response: {response}")
+
+                 # Strip filler words, quotes and brackets from the model's answer
+                 cleaned_text = re.sub(r"^(the|main|text|label|is|:|\.|\s)+", "", response, flags=re.IGNORECASE)
+                 cleaned_text = re.sub(r'["\'\(\)]', '', cleaned_text)
+                 cleaned_text = cleaned_text.strip().upper()
+
+                 if cleaned_text:
+                     return cleaned_text, "text"
+
+             except Exception as e:
+                 if debug:
+                     print(f"Error using Qwen VL for text extraction: {str(e)}")
+
+     # Fall back to a one-word caption of the whole image
+     prompt = """
+     Describe this image in a SINGLE WORD only.
+     Focus on the main subject (like "MAN", "WOMAN", "HOUSE", "HAPPY", "SAD", etc.).
+     Provide ONLY this single word without any punctuation or explanation.
+     """
+
+     try:
+         completion = client.chat.completions.create(
+             model="qwen/qwen-vl-plus:free",
+             messages=[
+                 {
+                     "role": "user",
+                     "content": [
+                         {"type": "text", "text": prompt},
+                         {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
+                     ]
+                 }
+             ],
+             temperature=0.3
+         )
+
+         response = completion.choices[0].message.content.strip()
+
+         if debug:
+             print(f"Qwen VL caption response: {response}")
+
+         # Guard against an empty response before taking the first word
+         caption_words = re.sub(r'[^\w\s]', '', response).strip().split()
+         if not caption_words:
+             return "ERROR", "error"
+
+         return caption_words[0].upper(), "caption"
+
+     except Exception as e:
+         if debug:
+             print(f"Error using Qwen VL for captioning: {str(e)}")
+         return "ERROR", "error"
+
+
+ def process_text(input_text):
+     if not input_text or input_text.isspace():
+         raise gr.Error("Please enter some text to convert.")
+
+     final_video = text_to_sign_pipeline(input_text, debug=True)
+     if final_video:
+         return final_video
+     # gr.Video expects a file path (or None), so report failure via gr.Error
+     raise gr.Error("Sorry, no matching sign language video found.")
+
+
+ def process_image(input_image):
+     os.makedirs("temp", exist_ok=True)
+
+     # Convert to RGB so PNG uploads with an alpha channel can still be saved as JPEG
+     image_path = "temp/uploaded_image.jpg"
+     input_image.convert("RGB").save(image_path)
+
+     extracted_text, source_type = image_to_text_with_qwen(image_path, debug=True)
+
+     if extracted_text == "ERROR":
+         return "Error processing image", None
+
+     sign_video = text_to_sign_pipeline(extracted_text, debug=True)
+
+     if source_type == "text":
+         result_text = f"Extracted text: {extracted_text}"
+     else:
+         result_text = f"Generated caption: {extracted_text}"
+
+     if sign_video:
+         return result_text, sign_video
+     return f"{result_text} (no matching sign language video found)", None
+
+
+ with gr.Blocks() as app:
+     gr.Markdown("# Sign Language Conversion")
+
+     with gr.Tabs():
+         with gr.Tab("Text to Sign"):
+             text_input = gr.Textbox(label="Enter text to convert to sign language")
+             text_button = gr.Button("Convert Text to Sign")
+             text_output = gr.Video(label="Sign Language Output")
+             text_button.click(process_text, inputs=text_input, outputs=text_output)
+
+         with gr.Tab("Image to Text/Caption and Sign"):
+             image_input = gr.Image(type="pil", label="Upload image")
+             image_button = gr.Button("Process Image and Convert to Sign")
+             extracted_text_output = gr.Textbox(label="Extracted Text/Caption")
+             image_output = gr.Video(label="Sign Language Output")
+
+             image_button.click(
+                 process_image,
+                 inputs=image_input,
+                 outputs=[extracted_text_output, image_output]
+             )
+
+
+ app.launch()
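+
+ # To run this Space locally you would need (roughly): the data/ folder laid out as
+ # above, ffmpeg available on PATH for merge_videos(), and an OpenRouter key exported
+ # as DEEPSEEK_API_KEY.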