File size: 9,164 Bytes
4206d5e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e2f2403
4206d5e
e2f2403
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4206d5e
e2f2403
 
 
4206d5e
e2f2403
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4206d5e
 
e2f2403
4206d5e
 
e2f2403
 
 
 
 
 
4206d5e
 
 
 
 
 
 
 
e2f2403
4206d5e
 
 
 
 
e2f2403
 
 
 
 
 
 
 
4206d5e
 
 
 
 
 
 
 
 
 
 
 
e2f2403
 
 
 
 
 
 
4206d5e
 
 
 
 
 
e2f2403
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4206d5e
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
# Install required libraries
#!pip install gradio torch transformers datasets soundfile librosa numpy

import gradio as gr
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import numpy as np
import soundfile as sf
import logging
import time

# Set up logging
# INFO level so the per-chunk diagnostics emitted by process_audio are visible.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Device configuration
# Use the first CUDA device with half precision when available; otherwise
# fall back to CPU with full precision.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
logger.info(f"Using device: {device}")

# Model configuration
# Hugging Face model ids for speech-to-text and for summarization.
stt_model_id = "openai/whisper-tiny"
summarizer_model_id = "sshleifer/distilbart-cnn-6-6"

# Load models
# NOTE: both models are loaded here at module level, i.e. when the module is
# imported, before the UI is built.
logger.info("Loading STT model...")
stt_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    stt_model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
stt_model.to(device)
# The processor supplies both the tokenizer and the feature extractor used by
# the speech-recognition pipeline below.
processor = AutoProcessor.from_pretrained(stt_model_id)
stt_pipeline = pipeline(
    "automatic-speech-recognition",
    model=stt_model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    torch_dtype=torch_dtype,
    device=device,
)

logger.info("Loading Summarization pipeline...")
# Summarizer is built directly from the model id and placed on the same device.
summarizer = pipeline(
    "summarization",
    model=summarizer_model_id,
    device=device
)

# Meeting data storage
# In-memory state keyed by meeting id and shared by process_audio and
# clear_session. Each entry holds the accumulated transcript, the formatted
# summary, and the time.time() value recorded at the last successful summary
# (0 means "never summarized").
meetings_data = {"google_meet_session": {"transcript": "", "summary": "", "last_summary_time": 0}}

# Helper functions
def format_summary_as_bullets(summary_text):
    """Format a plain-text summary as one ``- `` bullet per sentence.

    Sentences are split on ``". "`` and on existing line breaks, so text that
    already arrives one sentence per line is bulleted consistently instead of
    only receiving a bullet on its first line.

    Args:
        summary_text: Raw summary string; may be ``None`` or empty.

    Returns:
        Newline-joined bullet lines, or ``"No summary generated yet."`` when
        the input is missing or contains only whitespace.
    """
    placeholder = "No summary generated yet."
    if not summary_text or not summary_text.strip():
        return placeholder

    # Turn every sentence boundary into a line break, then treat each
    # non-blank line as one bullet.
    pieces = summary_text.replace(". ", ".\n").splitlines()
    bullets = [f"- {piece.strip()}" for piece in pieces if piece.strip()]
    return "\n".join(bullets) if bullets else placeholder

def _prepare_audio_chunk(audio_data):
    """Convert raw samples into a mono float32 waveform.

    Args:
        audio_data: Array-like of samples, either 1-D (mono) or 2-D
            (frames x channels), with an integer or floating-point dtype.

    Returns:
        A 1-D ``np.float32`` array. Integer PCM input is divided by the
        dtype's full-scale value so samples land in [-1.0, 1.0); floating
        point input keeps its original scale.
    """
    samples = np.asarray(audio_data)
    integer_pcm = np.issubdtype(samples.dtype, np.integer)
    # Capture the scale before the cast below discards the integer dtype.
    full_scale = float(np.iinfo(samples.dtype).max) + 1.0 if integer_pcm else 1.0

    samples = samples.astype(np.float32)
    if samples.ndim > 1:
        # Average the channels; this also flattens (frames, 1) input to 1-D.
        samples = samples.mean(axis=1)
    if integer_pcm:
        samples = samples / np.float32(full_scale)
    return samples


def process_audio(audio, meeting_id="google_meet_session"):
    """Transcribe one audio chunk and refresh the running meeting notes.

    Uses the module-level ``stt_pipeline``, ``summarizer``, ``meetings_data``
    and ``logger``.

    Args:
        audio: ``None``, a ``(sample_rate, samples)`` tuple as delivered by the
            streaming microphone component, or anything ``soundfile.read``
            accepts (for example a file path).
        meeting_id: Key into ``meetings_data``. State for an unknown id is
            created on first use.

    Returns:
        A ``(transcript, summary)`` tuple of strings for the two output
        textboxes. On failure the first element is either an error message or
        the unchanged transcript, and the last known summary is kept.
    """
    if audio is None:
        return "No audio input.", "No summary generated."

    # Create the state up front so that no later lookup (including the ones in
    # the error handlers) can raise KeyError for an unseen meeting_id.
    meeting = meetings_data.setdefault(
        meeting_id, {"transcript": "", "summary": "", "last_summary_time": 0}
    )

    try:
        # Streaming input arrives as (sample_rate, samples); anything else is
        # treated as a file-like input and decoded with soundfile.
        if isinstance(audio, tuple) and len(audio) == 2:
            sample_rate, audio_data = audio
        else:
            try:
                audio_data, sample_rate = sf.read(audio)
            except Exception as e:
                logger.error(f"Error reading audio file: {e}")
                return "Error processing audio input.", meeting.get("summary", "No summary available.")

        # Mono float32 in [-1, 1]: integer PCM must be rescaled, otherwise the
        # silence threshold below is meaningless and the model is fed
        # out-of-range samples.
        audio_chunk = _prepare_audio_chunk(audio_data)

        # np.min/np.max raise on empty arrays, so bail out before using them.
        if audio_chunk.size == 0:
            logger.info("Audio chunk is empty, skipping transcription")
            return meeting.get("transcript", ""), meeting.get("summary", "No summary generated yet.")

        logger.info(f"Audio chunk shape: {audio_chunk.shape}, Sample rate: {sample_rate}")
        logger.info(f"Audio min/max values: {np.min(audio_chunk)}/{np.max(audio_chunk)}")

        # Skip chunks whose peak amplitude is below 1% of full scale.
        if np.max(np.abs(audio_chunk)) < 0.01:
            logger.info("Audio chunk contains mostly silence, skipping transcription")
            return meeting.get("transcript", ""), meeting.get("summary", "No summary generated yet.")

        # Transcribe
        try:
            result = stt_pipeline({"sampling_rate": sample_rate, "raw": audio_chunk})
            new_text = result["text"].strip() if result["text"] else ""
            logger.info(f"Transcription: '{new_text}'")
        except Exception as e:
            logger.error(f"Error during transcription: {e}")
            return meeting.get("transcript", ""), meeting.get("summary", "Transcription failed.")

        # Append only when something was recognized; an empty result must not
        # overwrite the transcript accumulated so far.
        if new_text:
            if meeting["transcript"]:
                meeting["transcript"] += " " + new_text
            else:
                meeting["transcript"] = new_text

        # Summarize at most once every 30 seconds, and only once there is a
        # minimum amount of text to work with.
        current_time = time.time()
        if (len(meeting["transcript"]) > 50 and
                (current_time - meeting["last_summary_time"] > 30)):
            try:
                # truncation=True keeps long transcripts within the model's
                # input limit instead of failing on every later attempt.
                summary_result = summarizer(
                    meeting["transcript"],
                    max_length=150,
                    min_length=30,
                    do_sample=False,
                    truncation=True,
                )
                if summary_result and isinstance(summary_result, list):
                    raw_summary = summary_result[0]['summary_text']
                    meeting["summary"] = format_summary_as_bullets(raw_summary)
                    meeting["last_summary_time"] = current_time
                else:
                    logger.warning("Summary generation returned unexpected format")
            except Exception as e:
                # Keep the previous summary when summarization fails.
                logger.error(f"Error during summarization: {e}")

        return meeting["transcript"], meeting["summary"]

    except Exception as general_error:
        # Last-resort boundary: the streaming callback must always return a
        # pair of strings, so log with traceback and report a generic error.
        logger.exception(f"Unexpected error in process_audio: {general_error}")
        return "Error processing audio. Please check logs.", meeting.get("summary", "No summary available.")

def open_google_meet(meet_link):
    """Build the HTML snippet that links to a Google Meet room.

    The link is parsed rather than substring-matched, so only http(s) URLs
    whose host is exactly ``meet.google.com`` are accepted. It is HTML-escaped
    before being placed in the ``href`` attribute, because the value comes
    straight from a user-editable textbox.

    Args:
        meet_link: Text entered by the user; may be ``None`` or empty. A link
            without a scheme (``meet.google.com/...``) is treated as https.

    Returns:
        An HTML string with the join link and usage instructions, or a plain
        validation message when the link is not a Google Meet URL.
    """
    from html import escape
    from urllib.parse import urlsplit

    invalid_message = "Please enter a valid Google Meet link (e.g., https://meet.google.com/xyz-abcd-123)."

    link = (meet_link or "").strip()
    if not link:
        return invalid_message

    # A bare "meet.google.com/..." would otherwise render as a relative href.
    if "://" not in link:
        link = "https://" + link

    try:
        parts = urlsplit(link)
    except ValueError:
        return invalid_message

    host = (parts.hostname or "").lower()
    if parts.scheme not in ("http", "https") or host != "meet.google.com":
        return invalid_message

    safe_link = escape(link, quote=True)
    return (
        f'<a href="{safe_link}" target="_blank">Click here to join your Google Meet</a>'
        '<br><br><b>Instructions:</b> After clicking the link, open it in a new tab, join the meeting, '
        'and use the microphone below to capture audio for real-time notes.'
        '<br><br><b>Note:</b> This app captures audio from your microphone, not directly from Google Meet. '
        'Position your microphone close to your speakers for best results.'
    )

def clear_session(meeting_id="google_meet_session"):
    """Replace the stored state for *meeting_id* with a blank record.

    Returns:
        A ``(transcript_text, summary_text)`` pair for the two output
        textboxes: a confirmation message and an empty summary.
    """
    blank_state = {
        "transcript": "",
        "summary": "",
        "last_summary_time": 0,
    }
    meetings_data[meeting_id] = blank_state

    confirmation = "Session cleared. Ready for a new meeting."
    return confirmation, ""

# Gradio interface
def create_gradio_interface():
    """Assemble the Blocks UI and connect its three event handlers.

    Layout, top to bottom: title and intro, a row with the Meet link box and
    join button, the link status area, a row with the streaming microphone and
    the clear button, a row with the transcript and notes boxes, and finally
    the usage instructions.

    Returns:
        The constructed ``gr.Blocks`` app; launching it is left to the caller.
    """
    with gr.Blocks() as app:
        gr.Markdown("# Real-Time Google Meet Notes Generator")
        gr.Markdown("Enter your Google Meet link, join the meeting, and get real-time notes using your microphone.")

        # Meet link entry
        with gr.Row():
            link_box = gr.Textbox(
                label="Google Meet Link",
                placeholder="https://meet.google.com/xyz-abcd-123",
            )
            join_btn = gr.Button("Join Meeting")

        link_status = gr.HTML(label="Meeting Link Status")

        # Live audio capture and session reset
        with gr.Row():
            mic = gr.Audio(
                sources=["microphone"],
                type="numpy",
                label="Live Microphone Input (Start speaking in your meeting)",
                streaming=True,
                autoplay=True,
            )
            reset_btn = gr.Button("Clear Session")

        # Outputs refreshed by both the stream handler and the reset handler
        with gr.Row():
            transcript_box = gr.Textbox(label="Real-Time Transcription", lines=10)
            notes_box = gr.Textbox(label="Bullet Point Notes (Updates every ~30s)", lines=10)

        # Event wiring
        join_btn.click(fn=open_google_meet, inputs=[link_box], outputs=[link_status])
        reset_btn.click(fn=clear_session, inputs=[], outputs=[transcript_box, notes_box])
        mic.stream(fn=process_audio, inputs=[mic], outputs=[transcript_box, notes_box])

        # Usage notes shown under the controls
        gr.Markdown("""
        ## Instructions:
        1. Enter your Google Meet link and click "Join Meeting"
        2. In the new tab, join your meeting
        3. Allow microphone access for this app (important!)
        4. Position your microphone to clearly capture the meeting audio
        5. The app will transcribe what it hears and generate notes automatically
        
        ## Troubleshooting:
        - If no transcription appears, make sure your microphone is capturing the meeting audio
        - Try positioning your device's microphone closer to your speakers
        - If needed, click "Clear Session" to reset the transcript and summary
        - For best results, use headphones for the meeting and keep the microphone close to your speakers
        """)

    return app

# Launch
# Build the UI and start the Gradio server as soon as this module runs.
# NOTE(review): there is no `if __name__ == "__main__":` guard, so importing
# this module also starts the app — confirm that is intended.
demo = create_gradio_interface()
# NOTE(review): share=True presumably requests a public share URL — confirm
# this exposure is desired, since meeting state lives in one shared dict.
demo.launch(share=True)