"""
FastRTC + Gemma-3 minimal voice chat app
Requirements:
pip install fastrtc transformers torch torchaudio
"""
import numpy as np
import torch
from fastrtc import ReplyOnPause, Stream, get_stt_model, get_tts_model
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
# ------------------------------------------------------------------
# 1. Load Gemma-3 (4b-it) via transformers
# ------------------------------------------------------------------
MODEL_ID = "google/gemma-3-4b-it"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",  # requires the `accelerate` package
)
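# Note: bfloat16 assumes a GPU with bf16 support (Ampere or newer). A minimal
# CPU-only fallback sketch, at the cost of speed and extra memory:
#
#   model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.float32)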
# ------------------------------------------------------------------
# 2. Build a simple chat pipeline
# ------------------------------------------------------------------
chat_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256,
    do_sample=True,
    temperature=0.7,
    return_full_text=False,  # return only the newly generated tokens
)
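# Optional smoke test for the text path (an illustrative sketch; the output
# varies from run to run because sampling is enabled):
#
#   msgs = [{"role": "user", "content": "Say hello in one sentence."}]
#   txt = tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
#   print(chat_pipeline(txt)[0]["generated_text"])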
# ------------------------------------------------------------------
# 3. Voice pipeline helpers
# ------------------------------------------------------------------
# fastrtc bundles Moonshine for STT and Kokoro for TTS (it does not ship
# XTTS-v2); model IDs follow its get_stt_model/get_tts_model naming.
stt = get_stt_model("moonshine/tiny")
tts = get_tts_model("kokoro")
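# Both helpers return lightweight model wrappers; per fastrtc's docs,
# stt.stt((sample_rate, samples)) returns the transcript string and
# tts.stream_tts_sync(text) yields (sample_rate, np.ndarray) chunks,
# which is the pattern the handler below relies on.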
# ------------------------------------------------------------------
# 4. Response generator
# ------------------------------------------------------------------
def response_generator(prompt: str) -> str:
    """Feed the user prompt to Gemma-3 and return the assistant text."""
    messages = [{"role": "user", "content": prompt}]
    prompt_text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    # return_full_text=False means the pipeline already strips the prompt,
    # so the completion can be returned directly
    return chat_pipeline(prompt_text)[0]["generated_text"].strip()
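# Usage sketch (the reply text is illustrative only, since do_sample=True):
#
#   response_generator("What is WebRTC?")
#   # -> "WebRTC is an open standard for real-time audio, video, and data ..."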
# ------------------------------------------------------------------
# 5. FastRTC streaming handler
# ------------------------------------------------------------------
def chat_handler(audio: tuple[int, np.ndarray]):
    """Receive one user turn, transcribe it, answer via Gemma-3, stream back TTS audio."""
    user_text = stt.stt(audio)
    if not user_text.strip():
        return
    # Generate the assistant reply
    reply_text = response_generator(user_text)
    # Stream synthesized audio back as (sample_rate, np.ndarray) chunks
    for chunk in tts.stream_tts_sync(reply_text):
        yield chunk
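# ReplyOnPause buffers microphone audio until it detects that the speaker has
# paused, then calls the handler once with the complete turn; each yielded
# chunk is streamed back to the client as soon as it is produced.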
# ------------------------------------------------------------------
# 6. Launch the app
# ------------------------------------------------------------------
if __name__ == "__main__":
    stream = Stream(
        handler=ReplyOnPause(chat_handler),
        modality="audio",
        mode="send-receive",
    )
    stream.ui.launch(server_name="0.0.0.0", server_port=7860)
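# Alternative deployment sketch (uses fastrtc's Stream.mount API): instead of
# launching the bundled Gradio UI, the stream can be mounted on a FastAPI app:
#
#   from fastapi import FastAPI
#   app = FastAPI()
#   stream.mount(app)
#   # run with: uvicorn app:app --host 0.0.0.0 --port 7860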