Spaces:
Build error
Build error
File size: 3,872 Bytes
5fc45d8 fc8c0c3 0151363 fc8c0c3 5fc45d8 fc8c0c3 0151363 fd4edb3 0151363 fc8c0c3 5fc45d8 fc8c0c3 0151363 fc8c0c3 5fc45d8 0151363 5fc45d8 877e9a0 42f5e4b 877e9a0 5fc45d8 0151363 5fc45d8 0151363 5fc45d8 fc8c0c3 0151363 fc8c0c3 0151363 fc8c0c3 0151363 fc8c0c3 0151363 fc8c0c3 5fc45d8 fc8c0c3 0151363 5fc45d8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 |
import os
import torch
import time
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
import soundfile as sf
from google.generativeai import GenerativeModel, configure
import gradio as gr
# ---------------------------------------------------------------------------
# Model initialisation: runs once at import time and logs progress via print.
# ---------------------------------------------------------------------------
print("⚡ Initializing models...")
start_load = time.time()

# 1. Load Gemini: the API key is read from the GEMINI_API_KEY environment
#    variable and handed to the google.generativeai client.
GEMINI_KEY = os.environ.get('GEMINI_API_KEY')
if not GEMINI_KEY:
    # Make the misconfiguration visible in the startup log. configure() is
    # still called with the same value as before, so behaviour is unchanged
    # apart from this warning.
    print(" ⚠️ GEMINI_API_KEY is not set; Gemini requests may fail.")
configure(api_key=GEMINI_KEY)
gemini = GenerativeModel('gemini-2.0-flash')
print(f" ✅ Gemini loaded (device: {'GPU' if torch.cuda.is_available() else 'CPU'})")

# 2. Load Indic-TTS: the model is moved to CUDA when available, else CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
tts_model = ParlerTTSForConditionalGeneration.from_pretrained("ai4bharat/indic-parler-tts").to(device)
# Tokenizer for the text that will be spoken.
tts_tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-parler-tts")
# Tokenizer for the voice description; loaded from the checkpoint named in the
# TTS model's text-encoder config.
desc_tokenizer = AutoTokenizer.from_pretrained(tts_model.config.text_encoder._name_or_path)
print(f" ✅ Indic-TTS loaded in {time.time() - start_load:.2f}s\n")
# Prompt template sent to Gemini; ``{text}`` is replaced with the user's input.
# Kept at module level so the prompt text does not pick up function indentation.
_CONVERSION_PROMPT = """
Analyze this mixed-language text containing English and a regional language:
"{text}"
Perform:
1. Detect the regional language (e.g. Hindi, Tamil, Bengali)
2. Convert to pure regional language script (Devanagari/Tamil/Bangla)
3. Preserve complex words (technical/medical terms) in their original form
4. Maintain natural flow and meaning
5. Remove the code if you find them in backticks ```.
Rules:
- Keep proper nouns unchanged
- Retain English words if no common regional equivalent exists
- Use colloquial spellings (e.g. "कॉलेज" not "विद्यालय" for "college")
Output ONLY the converted text in the detected script.
"""


def hinglish_to_devnagri(text):
    """Ask Gemini to rewrite mixed-language text in its regional script.

    Args:
        text: The user's romanised / mixed-language input.

    Returns:
        The text of Gemini's reply with surrounding whitespace removed.

    Raises:
        gr.Error: If ``text`` is empty or whitespace-only, or if the Gemini
            call (or reading its reply) raises any exception.
    """
    # Reject empty input up front: there is nothing to convert, and this
    # failure should not be reported as a Gemini error.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to convert.")

    try:
        print(f"🔠 Converting Hinglish to Devnagri: '{text[:30]}...'")
        start = time.time()
        response = gemini.generate_content(_CONVERSION_PROMPT.format(text=text))
        print(f" ✓ Translation done in {time.time() - start:.2f}s")
        # Strip surrounding whitespace so stray newlines in the reply are not
        # shown in the UI or passed on to the caller.
        return response.text.strip()
    except Exception as e:
        print(f"❌ Gemini error: {e}")
        # Re-raise as a Gradio error so the UI shows the message; chain the
        # original exception to keep its traceback in the logs.
        raise gr.Error(f"Gemini error: {e}") from e
# Default voice description passed to the TTS model's description tokenizer.
# NOTE(review): the description is written in Hindi; confirm against the
# model's documentation that non-English descriptions are supported.
_DEFAULT_VOICE_DESCRIPTION = "एक महिला वक्ता स्पष्ट हिंदी में बोल रही हैं"


def generate_speech(text, description=_DEFAULT_VOICE_DESCRIPTION):
    """Convert mixed-language text to regional script and synthesize speech.

    Args:
        text: The user's romanised / mixed-language input.
        description: Voice description fed to the TTS model. Defaults to the
            value that was previously hard-coded, so existing callers that
            pass only ``text`` behave as before.

    Returns:
        A ``(path, converted_text)`` tuple: the path of the written WAV file
        (``"output.wav"``) and the text returned by ``hinglish_to_devnagri``.

    Raises:
        gr.Error: If the text conversion fails or yields no text.
    """
    print("\n" + "="*50)
    print("🎤 Starting Hinglish-to-Speech pipeline")

    # 1. Text conversion
    hindi_text = hinglish_to_devnagri(text)
    if not hindi_text or not hindi_text.strip():
        # Do not run the TTS model on an empty prompt.
        raise gr.Error("Gemini returned no text to synthesize.")
    print(f" Hindi text: {hindi_text[:50]}...")

    # 2. Audio generation
    print("\n🔊 Generating audio...")
    start_audio = time.time()
    desc_inputs = desc_tokenizer(description, return_tensors="pt").to(device)
    text_inputs = tts_tokenizer(hindi_text, return_tensors="pt").to(device)
    audio = tts_model.generate(
        input_ids=desc_inputs.input_ids,
        attention_mask=desc_inputs.attention_mask,
        prompt_input_ids=text_inputs.input_ids,
        prompt_attention_mask=text_inputs.attention_mask,
    )

    # 3. Save output
    # NOTE(review): every call writes to the same "output.wav"; confirm that
    # requests are serialized, otherwise concurrent calls could overwrite
    # each other's audio.
    sf.write("output.wav", audio.cpu().numpy().squeeze(), tts_model.config.sampling_rate)
    print(f"\n💾 Audio generated in {time.time() - start_audio:.2f}s")
    print("="*50 + "\n")
    return "output.wav", hindi_text
# Gradio UI: a text input and trigger button, plus audio and text outputs.
with gr.Blocks() as app:
    gr.Markdown("## 🚀 Hinglish-to-Speech (Gemini + Indic-TTS)")
    with gr.Row():
        inp = gr.Textbox(label="Enter Hinglish Text", placeholder="Aaj mood nahi hai...")
        btn = gr.Button("Generate")
    with gr.Row():
        audio_out = gr.Audio(label="Speech Output")
        text_out = gr.Textbox(label="Devnagri Translation")
    # generate_speech returns (audio path, converted text), matching the
    # order of the two output components.
    btn.click(fn=generate_speech, inputs=inp, outputs=[audio_out, text_out])

print("\n🚀 App ready! Waiting for input...")
app.launch()