""" FastRTC + Gemma-3 minimal voice chat app Requirements: pip install fastrtc transformers torch torchaudio """ import asyncio from typing import AsyncGenerator from fastrtc import ( ReplyOnPause, Stream, get_stt_model, get_tts_model, wait_for_item, ) import torch from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline # ------------------------------------------------------------------ # 1. Load Gemma-3 (4b-it) via transformers # ------------------------------------------------------------------ MODEL_ID = "google/gemma-3-4b-it" tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) model = AutoModelForCausalLM.from_pretrained( MODEL_ID, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True, ) # ------------------------------------------------------------------ # 2. Build a simple chat pipeline # ------------------------------------------------------------------ chat_pipeline = pipeline( "text-generation", model=model, tokenizer=tokenizer, max_new_tokens=256, do_sample=True, temperature=0.7, ) # ------------------------------------------------------------------ # 3. Voice pipeline helpers # ------------------------------------------------------------------ stt = get_stt_model("tiny") tts = get_tts_model("coqui/XTTS-v2", lang="en") # ------------------------------------------------------------------ # 4. Response generator # ------------------------------------------------------------------ def response_generator(prompt: str) -> str: """Feed the user prompt to Gemma-3 and return the assistant text.""" messages = [{"role": "user", "content": prompt}] prompt_text = tokenizer.apply_chat_template( messages, tokenize=False, add_generation_prompt=True ) output = chat_pipeline(prompt_text)[0]["generated_text"] # strip the prompt from the output return output[len(prompt_text) :].strip() # ------------------------------------------------------------------ # 5. FastRTC streaming handler # ------------------------------------------------------------------ async def chat_handler( audio: AsyncGenerator, ) -> AsyncGenerator[bytes, None]: """Receive user voice, transcribe, answer via Gemma-3, stream back TTS audio.""" async for user_text in stt.transcribe(audio): if not user_text.strip(): continue # Generate response reply_text = response_generator(user_text) # Stream TTS audio back to the user async for chunk in tts.synthesize(reply_text): yield chunk # ------------------------------------------------------------------ # 6. Launch the app # ------------------------------------------------------------------ if __name__ == "__main__": stream = Stream( handler=ReplyOnPause(chat_handler), modality="audio", mode="send-receive", ) stream.ui.launch(server_name="0.0.0.0", server_port=7860)