Spaces:
Running
on
Zero
Running
on
Zero
File size: 6,721 Bytes
c42faed 4392ec0 c42faed 4392ec0 363a697 c42faed 363a697 c42faed 48aca08 c42faed 310c0b7 c42faed 48aca08 c42faed 4392ec0 310c0b7 c42faed 310c0b7 c42faed |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 |
# -*- coding: utf-8 -*-
"""Lookingsoft Radiology Assistant
Automatically generated by Colab.
This file is adapted from:
https://colab.research.google.com/#fileId=https%3A//storage.googleapis.com/kaggle-colab-exported-notebooks/rishirajacharya/i-o-25-radiology-with-medgemma-gemini-native-tts.b5cf5dca-3453-45b1-b7c0-ec7c22aedf1b.ipynb
# Lookingsoft Radiology Assistant
## Developed by Lookingsoft Team
This demo showcases an AI-powered radiology assistant that leverages MedGemma for medical image interpretation and Gemini’s native text-to-speech (TTS) for natural voice output. The assistant transforms complex radiology reports into easy-to-understand language and delivers them through a user-friendly, voice-driven experience—highlighting key areas in radiology images and making insights more accessible.
### 🔐 Securing API Keys
We use secure tokens to authenticate with Hugging Face and Google’s Gemini APIs, ensuring safe and authorized access.
"""
import spaces
from google import genai
from google.genai import types
import os
# Credentials come from the environment so no secrets live in source control.
gemini_api_key = os.getenv('GEMINI_API_KEY')  # Gemini API key (None if unset)
hf_token = os.environ.get("HF_TOKEN")  # Hugging Face token for gated model access
# Gemini client, used later for native text-to-speech generation.
client = genai.Client(api_key=gemini_api_key)
"""### 🧠 Loading MedGemma for Radiology Insights
Here, we load the MedGemma model—an image-text model optimized for medical contexts. We apply 4-bit quantization to enhance performance and reduce memory usage on GPUs.
"""
from transformers import pipeline, BitsAndBytesConfig
import torch
# Load MedGemma with 4-bit quantization on the first CUDA device to reduce
# GPU memory usage; bfloat16 is the compute dtype.
model_kwargs = dict(torch_dtype=torch.bfloat16, device_map="cuda:0", quantization_config=BitsAndBytesConfig(load_in_4bit=True))
pipe = pipeline("image-text-to-text", model="google/medgemma-4b-it", model_kwargs=model_kwargs, token=hf_token)
# Greedy decoding so the generated report is deterministic for a given input.
pipe.model.generation_config.do_sample = False
"""### 🩻 Radiology Image Interpretation Logic
This function uses MedGemma to generate a plain-language report based on a given prompt and medical image. It formats the input and passes it to the model for inference.
"""
from PIL import Image
@spaces.GPU
def infer(prompt: str, image: Image.Image, system: str = None) -> str:
    """Run MedGemma on *image* with *prompt* and return the generated text.

    Args:
        prompt: User instruction describing what to report on the image.
        image: PIL image to interpret.
        system: Optional system instruction prepended to the conversation.

    Returns:
        The model's reply (the last turn's content) as plain text.
    """
    # Fix: the original saved the image to "image.png" but never used that
    # file — the PIL object is passed to the pipeline directly — so the
    # redundant disk write has been removed.
    messages = []
    if system:
        messages.append({
            "role": "system",
            "content": [{"type": "text", "text": system}]
        })
    messages.append({
        "role": "user",
        "content": [
            {"type": "text", "text": prompt},
            {"type": "image", "image": image}
        ]
    })
    output = pipe(text=messages, max_new_tokens=2048)
    # The pipeline returns the whole conversation; the final entry is the reply.
    return output[0]["generated_text"][-1]["content"]
"""### 🔊 Prepare for Gemini's Native TTS
This helper function converts Gemini’s audio output into a `.wav` file—enabling the assistant to speak its reports in a natural-sounding voice.
"""
import wave
def wave_file(filename, pcm, channels=1, rate=24000, sample_width=2):
    """Write raw PCM bytes out as a RIFF/WAVE file at *filename*."""
    out = wave.open(filename, "wb")
    try:
        # nframes=0 is a placeholder; writeframes patches the header on close.
        out.setparams((channels, sample_width, rate, 0, "NONE", "not compressed"))
        out.writeframes(pcm)
    finally:
        out.close()
"""### 🤖 Integrating Image Analysis and Voice Output
This function combines the MedGemma analysis with Gemini’s TTS to produce both text and audio responses.
"""
import gradio as gr
import requests
def _do_predictions(text, image_file, image_url, source_type):
    """Generate a radiology report plus a spoken rendition of it.

    Args:
        text: Prompt/instructions passed to MedGemma.
        image_file: PIL image from the upload widget (used unless source_type == "url").
        image_url: Image URL string (used when source_type == "url").
        source_type: Either "url" or "file", selecting which image input to use.

    Returns:
        Tuple of (report text, path to the generated .wav file).
    """
    if source_type == "url":
        # Fix: added a timeout so a stalled download cannot hang the request
        # thread indefinitely (requests has no default timeout).
        image = Image.open(requests.get(image_url, headers={"User-Agent": "example"}, stream=True, timeout=30).raw)
    else:
        image = image_file
    report = infer(text, image)
    # Convert the text report to speech with Gemini's native TTS voice "Kore".
    response = client.models.generate_content(
        model="gemini-2.5-flash-preview-tts",
        contents=report,
        config=types.GenerateContentConfig(
            response_modalities=["AUDIO"],
            speech_config=types.SpeechConfig(
                voice_config=types.VoiceConfig(
                    prebuilt_voice_config=types.PrebuiltVoiceConfig(
                        voice_name='Kore',
                    )
                )
            ),
        )
    )
    # Gemini returns raw PCM in inline_data; wrap it in a WAV container so
    # the Gradio audio widget can play it.
    data = response.candidates[0].content.parts[0].inline_data.data
    file_name = 'out.wav'
    wave_file(file_name, data)
    return report, file_name
"""### 🖼️ Interactive Web UI with Gradio
A user-friendly interface built with Gradio. Users can upload an image or provide a URL, enter a prompt, and receive both a text report and an audio explanation—powered by **MedGemma + Gemini TTS**.
"""
def toggle_image_src(choice):
    """Toggle visibility of the (file uploader, URL textbox) pair.

    Returns Gradio updates showing the URL textbox when *choice* is "url",
    otherwise showing the file uploader.
    """
    show_file = choice != "url"
    return gr.update(visible=show_file), gr.update(visible=not show_file)
# Gradio UI: prompt + image (file upload or URL) in; report text + audio out.
with gr.Blocks() as demo:
    gr.Markdown(
        """
# Lookingsoft Radiology Assistant
## Developed by the Lookingsoft Team
This assistant demonstrates the integration of MedGemma for medical image interpretation with Gemini’s native text-to-speech (TTS). It simplifies complex radiology reports into clear, spoken language, making insights more accessible and understandable for both professionals and patients.
"""
    )
    with gr.Row():
        # Left column: prompt text plus the two mutually-exclusive image inputs.
        with gr.Column():
            with gr.Row():
                text = gr.Text(label="Instructions", lines=2, interactive=True)
            with gr.Column():
                radio = gr.Radio(["file", "url"], value="file", label="Input Image Source")
                # Only one of these two is visible at a time; see toggle_image_src.
                image_file = gr.Image(label="File", type="pil", visible=True)
                image_url = gr.Textbox(label="URL", visible=False)
            with gr.Row():
                submit = gr.Button("Generate")
        # Right column: generated report text and its spoken rendition.
        with gr.Column():
            output = gr.Textbox(label="Generated Report")
            audio_output = gr.Audio(label="Generated Report (wav)")
    submit.click(_do_predictions, inputs=[text, image_file, image_url, radio],
                 outputs=[output, audio_output])
    # Swap file/URL widget visibility immediately, without queueing.
    radio.change(toggle_image_src, radio, [image_file, image_url], queue=False, show_progress=False)
    gr.Examples(
        fn=_do_predictions,
        examples=[
            # NOTE(review): this example image is fetched over the network at
            # import time with no timeout — a slow host delays app startup;
            # consider lazy-loading or bundling the image. TODO confirm.
            ["Describe this X-ray", Image.open(requests.get("https://google-rad-explain.hf.space/static/images/Effusion2.jpg", headers={"User-Agent": "example"}, stream=True).raw), None, "file"],
            ["Describe this CT", None, "https://google-rad-explain.hf.space/static/images/CT-Tumor.jpg", "url"],
        ],
        inputs=[text, image_file, image_url, radio],
        outputs=[output, audio_output]
    )
    gr.Markdown("""
### Disclaimer
This demonstration is for educational purposes only. It is not intended to diagnose or treat any disease or condition and should not be considered medical advice.
""")
# Queue caps concurrent/pending requests at 32; share=True exposes a public link.
demo.queue(max_size=8 * 4).launch(share=True)