File size: 3,872 Bytes
5fc45d8
fc8c0c3
0151363
fc8c0c3
 
 
5fc45d8
fc8c0c3
 
0151363
 
 
 
 
 
 
fd4edb3
0151363
fc8c0c3
5fc45d8
 
fc8c0c3
 
 
0151363
fc8c0c3
 
5fc45d8
0151363
 
 
5fc45d8
877e9a0
 
 
 
 
 
 
 
 
42f5e4b
877e9a0
 
 
 
 
 
 
 
5fc45d8
0151363
 
5fc45d8
 
0151363
5fc45d8
fc8c0c3
 
0151363
 
 
 
fc8c0c3
0151363
 
 
 
 
fc8c0c3
 
 
 
 
 
 
 
 
 
 
 
0151363
fc8c0c3
0151363
 
 
fc8c0c3
 
 
5fc45d8
 
 
 
 
 
 
 
 
 
fc8c0c3
0151363
5fc45d8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import os
import tempfile
import time

import gradio as gr
import soundfile as sf
import torch
from google.generativeai import GenerativeModel, configure
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer

# Start-up: announce progress and time how long model loading takes.
print("⚡ Initializing models...")
start_load = time.time()

# Probe CUDA once; the result picks the torch device used by the TTS model
# and is also what the Gemini status line reports.
_cuda_available = torch.cuda.is_available()
device = "cuda" if _cuda_available else "cpu"

# 1. Gemini client, configured from the GEMINI_API_KEY environment variable.
GEMINI_KEY = os.getenv('GEMINI_API_KEY')
configure(api_key=GEMINI_KEY)
gemini = GenerativeModel('gemini-2.0-flash')
_device_label = 'GPU' if _cuda_available else 'CPU'
print(f"   ✅ Gemini loaded (device: {_device_label})")

# 2. Indic Parler-TTS: the model, the tokenizer for the text to be spoken,
#    and a second tokenizer (taken from the model's text-encoder config)
#    for the voice description.
_TTS_CHECKPOINT = "ai4bharat/indic-parler-tts"
tts_model = ParlerTTSForConditionalGeneration.from_pretrained(_TTS_CHECKPOINT).to(device)
tts_tokenizer = AutoTokenizer.from_pretrained(_TTS_CHECKPOINT)
desc_tokenizer = AutoTokenizer.from_pretrained(
    tts_model.config.text_encoder._name_or_path
)
print(f"   ✅ Indic-TTS loaded in {time.time() - start_load:.2f}s\n")

def hinglish_to_devnagri(text):
    """Convert code-mixed (e.g. Hinglish) text to native script using Gemini.

    The input is embedded in a fixed instruction prompt and sent to the
    module-level ``gemini`` model. The reply text is returned with
    surrounding whitespace removed.

    Args:
        text: The mixed-language text typed by the user.

    Returns:
        The converted text reported by Gemini, stripped of leading and
        trailing whitespace.

    Raises:
        gr.Error: If ``text`` is empty or blank, if the Gemini request (or
            reading its reply) fails, or if Gemini returns no text.
    """
    # Validate before the try-block so this message is not re-wrapped as a
    # "Gemini error", and so no API call is spent on blank input.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to convert.")

    try:
        print(f"🔠 Converting Hinglish to Devnagri: '{text[:30]}...'")
        start = time.time()

        response = gemini.generate_content(
            f"""
            Analyze this mixed-language text containing English and a regional language:
            "{text}"

            Perform:
            1. Detect the regional language (e.g. Hindi, Tamil, Bengali)
            2. Convert to pure regional language script (Devanagari/Tamil/Bangla)
            3. Preserve complex words (technical/medical terms) in their original form
            4. Maintain natural flow and meaning
            5. Remove the code if you find them in backticks ```.

            Rules:
            - Keep proper nouns unchanged
            - Retain English words if no common regional equivalent exists
            - Use colloquial spellings (e.g. "कॉलेज" not "विद्यालय" for "college")

            Output ONLY the converted text in the detected script.
            """
        )

        # Read the reply inside the try-block: accessing ``.text`` can itself
        # raise, and success should only be logged once the text is in hand.
        converted = response.text.strip()
        print(f"   ✓ Translation done in {time.time() - start:.2f}s")
    except Exception as e:
        print(f"❌ Gemini error: {str(e)}")
        # Chain the cause so the original traceback stays visible in logs.
        raise gr.Error(f"Gemini error: {str(e)}") from e

    # An empty reply would otherwise flow into speech synthesis unnoticed.
    if not converted:
        raise gr.Error("Gemini returned an empty translation.")
    return converted

def generate_speech(text):
    """Run the full pipeline: mixed-language text -> native script -> speech.

    The text is first converted with ``hinglish_to_devnagri``, then
    synthesized with the module-level Parler-TTS model and written to a WAV
    file.

    Args:
        text: The mixed-language text typed by the user.

    Returns:
        A ``(wav_path, converted_text)`` tuple: the path of the generated WAV
        file and the converted text that was spoken.

    Raises:
        gr.Error: Propagated from ``hinglish_to_devnagri``, or raised here
            when the conversion produced no text to synthesize.
    """
    print("\n" + "="*50)
    print("🎤 Starting Hinglish-to-Speech pipeline")

    # 1. Text Conversion
    hindi_text = hinglish_to_devnagri(text)
    if not hindi_text or not hindi_text.strip():
        # Nothing to speak; fail clearly instead of synthesizing from "".
        raise gr.Error("No converted text available to synthesize.")
    print(f"   Hindi text: {hindi_text[:50]}...")

    # 2. Audio Generation
    print("\n🔊 Generating audio...")
    start_audio = time.time()

    # NOTE(review): this voice description is written in Devanagari but is
    # tokenized with the text-encoder's tokenizer; the published Indic
    # Parler-TTS examples use English descriptions — confirm this prompt is
    # actually honoured by the model.
    desc = "एक महिला वक्ता स्पष्ट हिंदी में बोल रही हैं"
    desc_inputs = desc_tokenizer(desc, return_tensors="pt").to(device)
    text_inputs = tts_tokenizer(hindi_text, return_tensors="pt").to(device)

    audio = tts_model.generate(
        input_ids=desc_inputs.input_ids,
        attention_mask=desc_inputs.attention_mask,
        prompt_input_ids=text_inputs.input_ids,
        prompt_attention_mask=text_inputs.attention_mask
    )

    # 3. Save Output
    # Use a unique temp file per request: a fixed "output.wav" is shared by
    # every request, so overlapping requests could overwrite each other's
    # audio, and it also requires a writable working directory.
    # delete=False keeps the file on disk so the caller can read it by path.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
        wav_path = tmp_file.name
    sf.write(wav_path, audio.cpu().numpy().squeeze(), tts_model.config.sampling_rate)
    print(f"\n💾 Audio generated in {time.time() - start_audio:.2f}s")
    print("="*50 + "\n")

    return wav_path, hindi_text

# Gradio UI: one text input and a button on the first row, the synthesized
# audio and the converted text on the second row.
with gr.Blocks() as app:
    gr.Markdown("## 🚀 Hinglish-to-Speech (Gemini + Indic-TTS)")

    with gr.Row():
        inp = gr.Textbox(
            label="Enter Hinglish Text",
            placeholder="Aaj mood nahi hai...",
        )
        btn = gr.Button("Generate")

    with gr.Row():
        audio_out = gr.Audio(label="Speech Output")
        text_out = gr.Textbox(label="Devnagri Translation")

    # Clicking the button runs the pipeline; its two return values fill the
    # audio player and the translation box, in that order.
    btn.click(
        fn=generate_speech,
        inputs=[inp],
        outputs=[audio_out, text_out],
    )

print("\n🚀 App ready! Waiting for input...")
app.launch()