# app.py
"""Gemini Live API web chat.

Streams microphone audio (and optionally webcam frames) to a Gemini live
session and yields the model's text/audio responses into a Gradio UI.
"""

import os
import asyncio
import base64
import io
import traceback  # NOTE(review): imported but unused in the visible code

import cv2
import pyaudio
import PIL.Image
import mss  # NOTE(review): presumably for "screen" mode, which is not implemented
import gradio as gr
from google import genai
from google.genai import types

# Audio and video capture config
FORMAT = pyaudio.paInt16
CHANNELS = 1
SEND_SAMPLE_RATE = 16000     # mic capture rate sent to the live API
RECEIVE_SAMPLE_RATE = 24000  # rate of audio returned by the model
CHUNK_SIZE = 1024

MODEL = "models/gemini-2.0-flash-live-001"

# Initialize Google GenAI client
client = genai.Client(
    http_options={"api_version": "v1beta"},
    api_key=os.environ.get("GEMINI_API_KEY"),
)

# Live connect configuration
CONFIG = types.LiveConnectConfig(
    response_modalities=["audio"],
    speech_config=types.SpeechConfig(
        voice_config=types.VoiceConfig(
            prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Puck")
        )
    ),
    system_instruction=types.Content(
        parts=[types.Part.from_text(text="You are Puck...")]
    ),
)


# AudioLoop class adapted for Gradio
class AudioLoop:
    """Drives one live session: captures mic (and optionally camera) input,
    forwards chunks to the Gemini session, and yields (text, audio) responses.
    """

    def __init__(self, mode="camera"):
        # mode: "camera" streams webcam frames alongside audio; any other
        # value streams audio only ("screen" capture is not implemented).
        self.mode = mode
        self.audio_in_queue = None  # NOTE(review): created in run() but never consumed
        self.out_queue = None       # outbound realtime chunks (audio + video)
        self.session = None

    async def _get_frame(self, cap):
        """Grab one webcam frame; return JPEG bytes, or None if capture failed."""
        # BUG FIX: cap.read() blocks — run it off the event loop, consistent
        # with the to_thread usage elsewhere in this class.
        ret, frame = await asyncio.to_thread(cap.read)
        if not ret:
            return None
        # OpenCV delivers BGR; convert to RGB before handing to PIL.
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        img = PIL.Image.fromarray(frame_rgb)
        img.thumbnail((640, 480))  # cap resolution to keep payloads small
        buf = io.BytesIO()
        img.save(buf, format="JPEG")
        return buf.getvalue()

    async def _video_stream(self):
        """Capture webcam frames at ~10 fps and enqueue them as base64 JPEG."""
        cap = await asyncio.to_thread(cv2.VideoCapture, 0)
        try:
            while True:
                frame = await self._get_frame(cap)
                if frame is None:
                    break
                await self.out_queue.put(
                    {"mime_type": "image/jpeg",
                     "data": base64.b64encode(frame).decode()}
                )
                await asyncio.sleep(0.1)
        finally:
            cap.release()

    async def _audio_stream(self):
        """Capture raw PCM from the default microphone and enqueue it."""
        # BUG FIX: the original referenced an undefined global `pya` and
        # created a second, leaked PyAudio instance. Use one instance and
        # release the stream and PyAudio on exit.
        pya = pyaudio.PyAudio()
        mic_info = pya.get_default_input_device_info()
        stream = await asyncio.to_thread(
            pya.open,
            format=FORMAT,
            channels=CHANNELS,
            rate=SEND_SAMPLE_RATE,
            input=True,
            input_device_index=mic_info["index"],
            frames_per_buffer=CHUNK_SIZE,
        )
        try:
            while True:
                # exception_on_overflow=False: drop overflowed frames instead
                # of raising when the event loop falls behind the device.
                data = await asyncio.to_thread(stream.read, CHUNK_SIZE, False)
                await self.out_queue.put({"data": data, "mime_type": "audio/pcm"})
        finally:
            stream.close()
            pya.terminate()

    async def send_realtime(self):
        """Forward queued capture chunks to the live session, one at a time."""
        while True:
            msg = await self.out_queue.get()
            await self.session.send(input=msg)

    async def receive_audio(self):
        """Yield (text, audio) tuples from the model; exactly one side is set."""
        while True:
            turn = self.session.receive()
            async for response in turn:
                if data := response.data:
                    yield (None, data)
                if text := response.text:
                    yield (text, None)

    async def run(self):
        """Open the live session, start capture tasks, and yield responses.

        Yields:
            (text, audio) tuples as produced by receive_audio().
        """
        async with client.aio.live.connect(model=MODEL, config=CONFIG) as session:
            self.session = session
            self.audio_in_queue = asyncio.Queue()
            # Bounded queue applies back-pressure to the capture tasks.
            self.out_queue = asyncio.Queue(maxsize=5)

            tasks = [asyncio.create_task(self._audio_stream())]
            if self.mode == "camera":
                tasks.append(asyncio.create_task(self._video_stream()))
            tasks.append(asyncio.create_task(self.send_realtime()))

            try:
                async for text, audio in self.receive_audio():
                    yield text, audio
            finally:
                # BUG FIX: cancel capture/sender tasks even when the consumer
                # stops iterating early; the original leaked them in that case.
                for t in tasks:
                    t.cancel()


# Gradio interface
async def chat(mode="camera"):
    """Starts a live chat session and yields (text, audio) tuples as they arrive."""
    loop = AudioLoop(mode=mode)
    async for t, a in loop.run():
        yield t, a


with gr.Blocks() as demo:
    gr.Markdown("# Gemini Live API Web Chat\nUse your microphone and camera directly from the browser.")
    mode = gr.Radio(choices=["camera", "screen", "none"], value="camera", label="Video Source")
    chatbot = gr.Chatbot()
    with gr.Row():
        start = gr.Button("Start")
        stop = gr.Button("Stop")  # NOTE(review): not wired to any handler yet

    # BUG FIX: the original wrapped `chat` in a lambda — which hands Gradio an
    # async-generator OBJECT instead of letting it stream the yields — and
    # passed the removed `_js` keyword. Registering the async generator
    # function directly is the supported streaming pattern.
    # TODO(review): `chat` yields (text, audio) tuples, which is not a valid
    # Chatbot message format — confirm the intended output component.
    start.click(chat, inputs=[mode], outputs=[chatbot])

demo.launch(server_name="0.0.0.0", share=True)

# requirements.txt
#
# google-genai
# opencv-python
# pyaudio
# pillow
# mss
# gradio