# -*- coding: utf-8 -*-
"""Lookingsoft Radiology Assistant

Automatically generated by Colab.

This file is adapted from:
    https://colab.research.google.com/#fileId=https%3A//storage.googleapis.com/kaggle-colab-exported-notebooks/rishirajacharya/i-o-25-radiology-with-medgemma-gemini-native-tts.b5cf5dca-3453-45b1-b7c0-ec7c22aedf1b.ipynb

# Lookingsoft Radiology Assistant
## Developed by Lookingsoft Team

This demo showcases an AI-powered radiology assistant that combines MedGemma for medical image interpretation with Gemini’s native text-to-speech (TTS) for natural voice output. The assistant turns complex radiology findings into plain, easy-to-understand language and reads the report aloud, making insights from radiology images more accessible through a simple voice-driven interface.

### 🔐 Securing API Keys

We authenticate with the Gemini API and Hugging Face using API keys read from environment variables (`GEMINI_API_KEY` and `HF_TOKEN`), so no secrets are hard-coded in the script.
"""

import spaces
from google import genai
from google.genai import types
import os

gemini_api_key = os.getenv('GEMINI_API_KEY')
hf_token = os.environ.get("HF_TOKEN")
client = genai.Client(api_key=gemini_api_key)
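
# Note (not part of the original notebook): both credentials must be present in
# the environment before this script runs. On a Hugging Face Space they are
# typically added as repository secrets; when running locally they can be
# exported in the shell first, for example:
#
#   export GEMINI_API_KEY="..."   # Gemini API key from Google AI Studio
#   export HF_TOKEN="..."         # Hugging Face token used to download MedGemma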

"""### 🧠 Loading MedGemma for Radiology Insights

Here, we load MedGemma, an instruction-tuned image-text-to-text model built for medical applications. We apply 4-bit quantization to reduce GPU memory usage during inference.
"""

from transformers import pipeline, BitsAndBytesConfig
import torch

# Load MedGemma 4B (instruction-tuned) in 4-bit on the first GPU.
model_kwargs = dict(
    torch_dtype=torch.bfloat16,
    device_map="cuda:0",
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
)
pipe = pipeline("image-text-to-text", model="google/medgemma-4b-it", model_kwargs=model_kwargs, token=hf_token)
pipe.model.generation_config.do_sample = False  # greedy decoding for reproducible reports

"""### 🩻 Radiology Image Interpretation Logic

This function uses MedGemma to generate a plain-language report based on a given prompt and medical image. It formats the input and passes it to the model for inference.
"""

from PIL import Image

@spaces.GPU
def infer(prompt: str, image: Image.Image, system: str | None = None) -> str:
    messages = []
    if system:
        messages.append({
            "role": "system",
            "content": [{"type": "text", "text": system}]
        })
    messages.append({
        "role": "user",
        "content": [
            {"type": "text", "text": prompt},
            {"type": "image", "image": image}
        ]
    })

    output = pipe(text=messages, max_new_tokens=2048)
    response = output[0]["generated_text"][-1]["content"]

    return response
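
# Illustrative only; this function is not called by the app. A minimal sketch of
# how infer() could be exercised directly, assuming a local chest X-ray at
# "sample_xray.png" (a hypothetical path, not part of the original demo).
def _example_infer_call() -> str:
    sample = Image.open("sample_xray.png")
    return infer(
        "Describe this X-ray in plain language for a patient.",
        sample,
        system="You are a radiology assistant who explains findings simply.",
    )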

"""### 🔊 Prepare for Gemini's Native TTS

This helper wraps the raw PCM audio returned by Gemini in a `.wav` container, so the assistant can speak its reports in a natural-sounding voice.
"""

import wave

def wave_file(filename, pcm, channels=1, rate=24000, sample_width=2):
    # Wrap raw PCM bytes in a WAV container (defaults match Gemini TTS output).
    with wave.open(filename, "wb") as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(sample_width)
        wf.setframerate(rate)
        wf.writeframes(pcm)
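
# Illustrative only; not called by the app. Gemini's TTS returns raw 16-bit PCM
# at 24 kHz, which wave_file() wraps in a WAV container using the defaults
# above. As a self-contained example, write one second of silence:
def _example_wave_file_usage():
    silence = b"\x00\x00" * 24000  # 24,000 frames of 16-bit mono silence
    wave_file("example_silence.wav", silence)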

"""### 🤖 Integrating Image Analysis and Voice Output

This function combines the MedGemma analysis with Gemini’s TTS to produce both text and audio responses.
"""

import gradio as gr
import requests

def _do_predictions(text, image_file, image_url, source_type):
    # Resolve the input image either from the uploaded file or from a URL.
    if source_type == "url":
        image = Image.open(requests.get(image_url, headers={"User-Agent": "example"}, stream=True).raw)
    else:
        image = image_file
    report = infer(text, image)

    # Convert the plain-language report to speech with Gemini's native TTS.
    response = client.models.generate_content(
        model="gemini-2.5-flash-preview-tts",
        contents=report,
        config=types.GenerateContentConfig(
            response_modalities=["AUDIO"],
            speech_config=types.SpeechConfig(
                voice_config=types.VoiceConfig(
                    prebuilt_voice_config=types.PrebuiltVoiceConfig(
                        voice_name="Kore",
                    )
                )
            ),
        ),
    )

    data = response.candidates[0].content.parts[0].inline_data.data
    file_name = "out.wav"
    wave_file(file_name, data)

    return report, file_name
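
# Illustrative only; not called by the app. A sketch of running the full
# text + audio pipeline outside Gradio, reusing the CT example URL from the
# gr.Examples block below.
def _example_prediction():
    report, wav_path = _do_predictions(
        "Describe this CT",
        None,
        "https://google-rad-explain.hf.space/static/images/CT-Tumor.jpg",
        "url",
    )
    return report, wav_path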

"""### 🖼️ Interactive Web UI with Gradio

A user-friendly interface built with Gradio. Users can upload an image or provide a URL, enter a prompt, and receive both a text report and an audio explanation—powered by **MedGemma + Gemini TTS**.
"""

def toggle_image_src(choice):
    if choice == "url":
        return gr.update(visible=False), gr.update(visible=True)
    else:
        return gr.update(visible=True), gr.update(visible=False)

with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Lookingsoft Radiology Assistant
        ## Developed by the Lookingsoft Team

        This assistant demonstrates the integration of MedGemma for medical image interpretation with Gemini’s native text-to-speech (TTS). It simplifies complex radiology reports into clear, spoken language, making insights more accessible and understandable for both professionals and patients.
        """
    )
    with gr.Row():
        with gr.Column():
            with gr.Row():
                text = gr.Text(label="Instructions", lines=2, interactive=True)
                with gr.Column():
                    radio = gr.Radio(["file", "url"], value="file", label="Input Image Source")
                    image_file = gr.Image(label="File", type="pil", visible=True)
                    image_url = gr.Textbox(label="URL", visible=False)
            with gr.Row():
                submit = gr.Button("Generate")
        with gr.Column():
            output = gr.Textbox(label="Generated Report")
            audio_output = gr.Audio(label="Generated Report (wav)")
    submit.click(_do_predictions, inputs=[text, image_file, image_url, radio],
                    outputs=[output, audio_output])
    radio.change(toggle_image_src, radio, [image_file, image_url], queue=False, show_progress=False)
    gr.Examples(
        fn=_do_predictions,
        examples=[
                ["Describe this X-ray", Image.open(requests.get("https://google-rad-explain.hf.space/static/images/Effusion2.jpg", headers={"User-Agent": "example"}, stream=True).raw), None, "file"],
                ["Describe this CT",  None, "https://google-rad-explain.hf.space/static/images/CT-Tumor.jpg", "url"],
            ],
        inputs=[text, image_file, image_url, radio],
        outputs=[output, audio_output]
    )
    gr.Markdown("""
    ### Disclaimer
    This demonstration is for educational purposes only. It is not intended to diagnose or treat any disease or condition and should not be considered medical advice.
    """)

    demo.queue(max_size=8 * 4).launch(share=True)