# Spaces: Running on Zero
# -*- coding: utf-8 -*-
"""Lookingsoft Radiology Assistant
Automatically generated by Colab.
This file is adapted from:
https://colab.research.google.com/#fileId=https%3A//storage.googleapis.com/kaggle-colab-exported-notebooks/rishirajacharya/i-o-25-radiology-with-medgemma-gemini-native-tts.b5cf5dca-3453-45b1-b7c0-ec7c22aedf1b.ipynb
# Lookingsoft Radiology Assistant
## Developed by the Lookingsoft Team
This demo showcases an AI-powered radiology assistant that leverages MedGemma for medical image interpretation and Gemini’s native text-to-speech (TTS) for natural voice output. The assistant transforms complex radiology reports into easy-to-understand language and delivers it through a user-friendly voice-driven experience—highlighting key areas in radiology images and making insights more accessible.
### 🔐 Securing API Keys
We use secure tokens to authenticate with Hugging Face and Google’s Gemini APIs, ensuring safe and authorized access.
"""
import spaces | |
from google import genai | |
from google.genai import types | |
import os | |
# Credentials come from the environment; each lookup yields None when the
# variable is not set.
gemini_api_key = os.environ.get("GEMINI_API_KEY")
hf_token = os.environ.get("HF_TOKEN")

# Gemini client shared by the rest of this module.
client = genai.Client(api_key=gemini_api_key)
"""### 🧠 Loading MedGemma for Radiology Insights
Here, we load the MedGemma model—an image-text model optimized for medical contexts. We apply 4-bit quantization to enhance performance and reduce memory usage on GPUs.
"""
from transformers import pipeline, BitsAndBytesConfig | |
import torch | |
# Model-loading options: bfloat16 torch dtype, placement on the first CUDA
# device, and 4-bit loading through BitsAndBytesConfig.
model_kwargs = {
    "torch_dtype": torch.bfloat16,
    "device_map": "cuda:0",
    "quantization_config": BitsAndBytesConfig(load_in_4bit=True),
}

# Image+text -> text pipeline around MedGemma; the Hugging Face token is
# forwarded to the pipeline factory.
pipe = pipeline(
    "image-text-to-text",
    model="google/medgemma-4b-it",
    model_kwargs=model_kwargs,
    token=hf_token,
)

# Turn sampling off in the model's generation config.
pipe.model.generation_config.do_sample = False
"""### 🩻 Radiology Image Interpretation Logic
This function uses MedGemma to generate a plain-language report based on a given prompt and medical image. It formats the input and passes it to the model for inference.
"""
from PIL import Image | |
def infer(prompt: str, image: Image.Image, system: str = None) -> str:
    """Run the MedGemma pipeline on one image and return the generated text.

    Args:
        prompt: Instruction text sent as the user turn.
        image: PIL image attached to the user turn.
        system: Optional system message; omitted when empty or ``None``.

    Returns:
        The ``content`` of the last message in ``generated_text`` of the
        first pipeline output.

    Raises:
        ValueError: If ``image`` is ``None``.
    """
    if image is None:
        raise ValueError("infer() requires an image, got None")

    # The image is handed to the pipeline in memory. No copy is written to a
    # fixed path on disk, so concurrent requests cannot clobber each other
    # and image modes that PNG cannot encode are not rejected up front.
    messages = []
    if system:
        messages.append({
            "role": "system",
            "content": [{"type": "text", "text": system}],
        })
    messages.append({
        "role": "user",
        "content": [
            {"type": "text", "text": prompt},
            {"type": "image", "image": image},
        ],
    })

    output = pipe(text=messages, max_new_tokens=2048)
    # The pipeline returns the whole conversation; the reply is the last turn.
    return output[0]["generated_text"][-1]["content"]
"""### 🔊 Prepare for Gemini's Native TTS
This helper function converts Gemini’s audio output into a `.wav` file—enabling the assistant to speak its reports in a natural-sounding voice.
"""
import wave | |
def wave_file(filename, pcm, channels=1, rate=24000, sample_width=2):
    """Write raw PCM bytes to ``filename`` as an uncompressed WAV file.

    Args:
        filename: Destination path, opened for binary writing.
        pcm: Raw audio frames to store.
        channels: Number of audio channels.
        rate: Frame rate passed to the WAV header.
        sample_width: Sample width in bytes.
    """
    # Tuple layout expected by ``setparams``:
    # (nchannels, sampwidth, framerate, nframes, comptype, compname).
    # A frame count of 0 and "NONE" compression are what a fresh writer
    # starts with; the header is corrected to the real count on close.
    header = (channels, sample_width, rate, 0, "NONE", "not compressed")
    with wave.open(filename, "wb") as writer:
        writer.setparams(header)
        writer.writeframes(pcm)
"""### 🤖 Integrating Image Analysis and Voice Output
This function combines the MedGemma analysis with Gemini’s TTS to produce both text and audio responses.
"""
import gradio as gr | |
import requests | |
def _load_image_from_url(image_url, timeout=30):
    """Download ``image_url`` and return it as a PIL image."""
    import io

    try:
        response = requests.get(
            image_url,
            headers={"User-Agent": "example"},
            timeout=timeout,  # never hang a queue worker on a dead host
        )
        response.raise_for_status()
    except requests.RequestException as err:
        raise gr.Error(f"Could not download the image: {err}") from err

    # Decode from the fully downloaded body instead of the raw socket stream:
    # ``response.content`` has any Content-Encoding already removed and the
    # connection is released, which ``response.raw`` does not guarantee.
    try:
        return Image.open(io.BytesIO(response.content))
    except OSError as err:
        raise gr.Error("The URL did not return a readable image.") from err


def _do_predictions(text, image_file, image_url, source_type):
    """Generate a report for an image and synthesize it as speech.

    Args:
        text: Prompt forwarded to ``infer``.
        image_file: PIL image used when ``source_type`` is not ``"url"``.
        image_url: Image address used when ``source_type`` is ``"url"``.
        source_type: ``"url"`` to download the image, anything else to use
            ``image_file``.

    Returns:
        A ``(report, wav_path)`` tuple: the generated report text and the
        path of the WAV file holding the spoken report.

    Raises:
        gr.Error: If no image was supplied, the download fails, or the TTS
            response carries no audio.
    """
    import tempfile

    if source_type == "url":
        if not image_url:
            raise gr.Error("Please enter an image URL.")
        image = _load_image_from_url(image_url)
    else:
        image = image_file
    if image is None:
        raise gr.Error("Please provide an image.")

    report = infer(text, image)

    response = client.models.generate_content(
        model="gemini-2.5-flash-preview-tts",
        contents=report,
        config=types.GenerateContentConfig(
            response_modalities=["AUDIO"],
            speech_config=types.SpeechConfig(
                voice_config=types.VoiceConfig(
                    prebuilt_voice_config=types.PrebuiltVoiceConfig(
                        voice_name='Kore',
                    )
                )
            ),
        )
    )
    try:
        data = response.candidates[0].content.parts[0].inline_data.data
    except (AttributeError, IndexError, TypeError) as err:
        raise gr.Error("The speech service returned no audio.") from err

    # One file per request: a shared fixed name would let concurrent requests
    # overwrite each other's audio before it is served. The file is created
    # and closed here so ``wave_file`` can reopen it by name on any platform.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        file_name = tmp.name
    wave_file(file_name, data)
    return report, file_name
"""### 🖼️ Interactive Web UI with Gradio
A user-friendly interface built with Gradio. Users can upload an image or provide a URL, enter a prompt, and receive both a text report and an audio explanation—powered by **MedGemma + Gemini TTS**.
"""
def toggle_image_src(choice):
    """Return visibility updates for the file and URL image inputs.

    Args:
        choice: Selected image source; ``"url"`` shows the URL input, any
            other value shows the file input.

    Returns:
        Two ``gr.update`` objects, for the file input and the URL input in
        that order, with exactly one of them visible.
    """
    url_selected = choice == "url"
    return gr.update(visible=not url_selected), gr.update(visible=url_selected)
# Markdown rendered at the top of the page.
_INTRO_MARKDOWN = """
# Lookingsoft Radiology Assistant
## Developed by the Lookingsoft Team
This assistant demonstrates the integration of MedGemma for medical image interpretation with Gemini’s native text-to-speech (TTS). It simplifies complex radiology reports into clear, spoken language, making insights more accessible and understandable for both professionals and patients.
"""

# Markdown rendered at the bottom of the page.
_DISCLAIMER_MARKDOWN = """
### Disclaimer
This demonstration is for educational purposes only. It is not intended to diagnose or treat any disease or condition and should not be considered medical advice.
"""

with gr.Blocks() as demo:
    gr.Markdown(_INTRO_MARKDOWN)

    with gr.Row():
        # Left column: prompt, image source selection, and the submit button.
        with gr.Column():
            with gr.Row():
                text = gr.Text(label="Instructions", lines=2, interactive=True)
            with gr.Column():
                radio = gr.Radio(["file", "url"], value="file", label="Input Image Source")
                image_file = gr.Image(label="File", type="pil", visible=True)
                image_url = gr.Textbox(label="URL", visible=False)
            with gr.Row():
                submit = gr.Button("Generate")
        # Right column: the generated report as text and as audio.
        with gr.Column():
            output = gr.Textbox(label="Generated Report")
            audio_output = gr.Audio(label="Generated Report (wav)")

    # Both the button and the examples feed the same four inputs to
    # _do_predictions and receive the same two outputs.
    _prediction_inputs = [text, image_file, image_url, radio]
    _prediction_outputs = [output, audio_output]

    submit.click(
        fn=_do_predictions,
        inputs=_prediction_inputs,
        outputs=_prediction_outputs,
    )

    # Switching the radio swaps which of the two image inputs is visible.
    radio.change(
        fn=toggle_image_src,
        inputs=radio,
        outputs=[image_file, image_url],
        queue=False,
        show_progress=False,
    )

    # The X-ray example is downloaded while the UI is being built and passed
    # as a PIL image; the CT example is passed as a URL.
    _xray_example_image = Image.open(
        requests.get(
            "https://google-rad-explain.hf.space/static/images/Effusion2.jpg",
            headers={"User-Agent": "example"},
            stream=True,
        ).raw
    )
    gr.Examples(
        fn=_do_predictions,
        examples=[
            ["Describe this X-ray", _xray_example_image, None, "file"],
            ["Describe this CT", None, "https://google-rad-explain.hf.space/static/images/CT-Tumor.jpg", "url"],
        ],
        inputs=[text, image_file, image_url, radio],
        outputs=[output, audio_output],
    )

    gr.Markdown(_DISCLAIMER_MARKDOWN)

# Enable the request queue (capacity 32), then start the app with a share link.
demo.queue(max_size=8 * 4)
demo.launch(share=True)