austinroy committed
Commit c42faed · 0 Parent(s)

Initial commit

Files changed (4)
  1. .gitattributes +36 -0
  2. README.md +0 -0
  3. app.py +170 -0
  4. requirements.txt +5 -0
.gitattributes ADDED
@@ -0,0 +1,36 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ screenshot.png filter=lfs diff=lfs merge=lfs -text
README.md ADDED
File without changes
app.py ADDED
@@ -0,0 +1,170 @@
+ # -*- coding: utf-8 -*-
+ """LookingSoft Radiology Assistant
+
+ Automatically generated by Colab.
+
+ This file is adapted from:
+ https://colab.research.google.com/#fileId=https%3A//storage.googleapis.com/kaggle-colab-exported-notebooks/rishirajacharya/i-o-25-radiology-with-medgemma-gemini-native-tts.b5cf5dca-3453-45b1-b7c0-ec7c22aedf1b.ipynb
+
+ # LookingSoft Radiology Assistant: MedGemma + Gemini TTS
+ ## Developed by the LookingSoft Team
+
+ This demo showcases an AI-powered radiology assistant that leverages **MedGemma** for medical image interpretation and **Gemini’s native text-to-speech (TTS)** for natural voice output. The assistant transforms complex radiology reports into easy-to-understand language and delivers them through a user-friendly, voice-driven experience, highlighting key areas in radiology images and making insights more accessible.
+
+ ### 🔐 Securing API Keys
+
+ We use secure tokens to authenticate with Hugging Face and Google’s Gemini APIs, ensuring safe and authorized access.
+ """
+
+ import spaces
+ from google import genai
+ from google.genai import types
+ import os
+
+ # Read the Gemini API key from the environment (e.g. a Space secret).
+ gemini_api_key = os.getenv('GEMINI_API_KEY')
+ client = genai.Client(api_key=gemini_api_key)
+
+ """### 🧠 Loading MedGemma for Radiology Insights
+
+ Here, we load **MedGemma**, an image-text model optimized for medical contexts, and apply 4-bit quantization to reduce GPU memory usage.
+ """
+
+ from transformers import pipeline, BitsAndBytesConfig
+ import torch
+
+ # Load the instruction-tuned MedGemma 4B model in 4-bit via bitsandbytes on the
+ # first GPU, and disable sampling so reports are generated deterministically.
+ model_kwargs = dict(torch_dtype=torch.bfloat16, device_map="cuda:0", quantization_config=BitsAndBytesConfig(load_in_4bit=True))
+ pipe = pipeline("image-text-to-text", model="google/medgemma-4b-it", model_kwargs=model_kwargs)
+ pipe.model.generation_config.do_sample = False
+
+ """### 🩻 Radiology Image Interpretation Logic
+
+ This function uses MedGemma to generate a plain-language report from a given prompt and medical image. It formats the input as chat-style messages and passes them to the model for inference.
+ """
+
+ from PIL import Image
+
+ @spaces.GPU
+ def infer(prompt: str, image: Image.Image, system: str = None) -> str:
+     # Keep a local copy of the uploaded image (not used further by the pipeline).
+     image_filename = "image.png"
+     image.save(image_filename)
+
+     # Build the chat-style messages expected by the image-text-to-text pipeline.
+     messages = []
+     if system:
+         messages.append({
+             "role": "system",
+             "content": [{"type": "text", "text": system}]
+         })
+     messages.append({
+         "role": "user",
+         "content": [
+             {"type": "text", "text": prompt},
+             {"type": "image", "image": image}
+         ]
+     })
+
+     output = pipe(text=messages, max_new_tokens=2048)
+     response = output[0]["generated_text"][-1]["content"]
+
+     return response
+
+ """### 🔊 Prepare for Gemini's Native TTS
+
+ This helper function writes Gemini’s raw audio output to a `.wav` file, enabling the assistant to speak its reports in a natural-sounding voice.
+ """
+
+ import wave
+
+ def wave_file(filename, pcm, channels=1, rate=24000, sample_width=2):
+     # Wrap raw 16-bit PCM (24 kHz mono by default) in a WAV container.
+     with wave.open(filename, "wb") as wf:
+         wf.setnchannels(channels)
+         wf.setsampwidth(sample_width)
+         wf.setframerate(rate)
+         wf.writeframes(pcm)
+
+ """### 🤖 Integrating Image Analysis and Voice Output
+
+ This function combines the MedGemma analysis with Gemini’s TTS to produce both text and audio responses.
+ """
+
+ import gradio as gr
+ import requests
+
+ def _do_predictions(text, image_file, image_url, source_type):
+     # Load the image either from the provided URL or from the uploaded file.
+     if source_type == "url":
+         image = Image.open(requests.get(image_url, headers={"User-Agent": "example"}, stream=True).raw)
+     else:
+         image = image_file
+     report = infer(text, image)
+
+     # Synthesize the report with Gemini's native TTS using the prebuilt "Kore" voice.
+     response = client.models.generate_content(
+         model="gemini-2.5-flash-preview-tts",
+         contents=report,
+         config=types.GenerateContentConfig(
+             response_modalities=["AUDIO"],
+             speech_config=types.SpeechConfig(
+                 voice_config=types.VoiceConfig(
+                     prebuilt_voice_config=types.PrebuiltVoiceConfig(
+                         voice_name='Kore',
+                     )
+                 )
+             ),
+         )
+     )
+
+     data = response.candidates[0].content.parts[0].inline_data.data
+     file_name = 'out.wav'
+     wave_file(file_name, data)
+
+     return report, file_name
+
+ """### 🖼️ Interactive Web UI with Gradio
+
+ A user-friendly interface built with Gradio. Users can upload an image or provide a URL, enter a prompt, and receive both a text report and an audio explanation, powered by **MedGemma + Gemini TTS**.
+ """
+
+ def toggle_image_src(choice):
+     # Show either the file uploader or the URL textbox depending on the selected source.
+     if choice == "url":
+         return gr.update(visible=False), gr.update(visible=True)
+     else:
+         return gr.update(visible=True), gr.update(visible=False)
+
+ with gr.Blocks() as demo:
+     gr.Markdown(
+         """
+         # LookingSoft Radiology Assistant: MedGemma + Gemini TTS
+         ## Developed by the LookingSoft Team
+
+         This assistant demonstrates the integration of **MedGemma** for medical image interpretation with **Gemini’s native text-to-speech (TTS)**. It simplifies complex radiology reports into clear, spoken language, making insights more accessible and understandable for both professionals and patients.
+         """
+     )
+     with gr.Row():
+         with gr.Column():
+             with gr.Row():
+                 text = gr.Text(label="Instructions", lines=2, interactive=True)
+                 with gr.Column():
+                     radio = gr.Radio(["file", "url"], value="file", label="Input Image Source")
+                     image_file = gr.Image(label="File", type="pil", visible=True)
+                     image_url = gr.Textbox(label="URL", visible=False)
+             with gr.Row():
+                 submit = gr.Button("Generate")
+         with gr.Column():
+             output = gr.Textbox(label="Generated Report")
+             audio_output = gr.Audio(label="Generated Report (wav)")
+     submit.click(_do_predictions, inputs=[text, image_file, image_url, radio],
+                  outputs=[output, audio_output])
+     radio.change(toggle_image_src, radio, [image_file, image_url], queue=False, show_progress=False)
+     gr.Examples(
+         fn=_do_predictions,
+         examples=[
+             ["Describe this X-ray", Image.open(requests.get("https://google-rad-explain.hf.space/static/images/Effusion2.jpg", headers={"User-Agent": "example"}, stream=True).raw), None, "file"],
+             ["Describe this CT", None, "https://google-rad-explain.hf.space/static/images/CT-Tumor.jpg", "url"],
+         ],
+         inputs=[text, image_file, image_url, radio],
+         outputs=[output, audio_output]
+     )
+     gr.Markdown("""
+     ### Disclaimer
+     This demonstration is for educational purposes only. It is not intended to diagnose or treat any disease or condition and should not be considered medical advice.
+     """)
+
+ demo.queue(max_size=8 * 4).launch(share=True)
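For readers who want to exercise the model outside the Gradio UI, here is a minimal usage sketch, not part of the committed files. It assumes the app.py above is importable from the working directory, that a CUDA GPU and the GEMINI_API_KEY environment variable are available (the module loads MedGemma and creates the Gemini client at import time), and that `chest_xray.png` is a placeholder for a local radiology image.

```python
# Minimal sketch: call the infer() helper from app.py directly.
# Assumptions (see above): app.py importable, CUDA GPU present, GEMINI_API_KEY set,
# and "chest_xray.png" standing in for a local radiology image.
from PIL import Image

from app import infer  # importing app.py loads MedGemma onto the GPU

image = Image.open("chest_xray.png")
report = infer(
    "Describe this X-ray in plain language for a patient.",
    image,
    system="You are a radiologist explaining findings to a non-expert.",
)
print(report)
```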
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ accelerate
+ bitsandbytes
+ transformers
+ gradio
+ google-genai
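Before launching the app locally, a quick pre-flight check can catch a missing dependency or API key early. This is a hedged sketch rather than part of the Space: the module list mirrors requirements.txt (the `google-genai` package is imported as `google.genai`), the `spaces` module is assumed to be provided by the Hugging Face Spaces runtime, and GEMINI_API_KEY is expected as an environment variable or Space secret.

```python
# Optional pre-flight check (illustrative only, not part of the committed files).
import importlib
import os

# Import names for the packages listed in requirements.txt.
for module in ["accelerate", "bitsandbytes", "transformers", "gradio", "google.genai"]:
    try:
        importlib.import_module(module)
    except ImportError as err:
        raise SystemExit(f"Missing dependency '{module}': {err}")

if not os.getenv("GEMINI_API_KEY"):
    raise SystemExit("GEMINI_API_KEY is not set; the Gemini TTS client cannot authenticate.")

print("Environment looks ready; run `python app.py` to launch the demo.")
```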