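"""Streamlit front-end for speech summarization with Moonshine ASR and a local LLM.

Launch with `streamlit run <this_file>.py`; the sibling modules asr.py,
summarization.py, podcast.py, and utils.py are assumed to sit next to this
file (inferred from the imports below).
"""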
import streamlit as st
import streamlit.components.v1 as components
from asr import transcribe_file
from summarization import summarize_transcript
from podcast import search_podcast_series, fetch_episodes, download_podcast_audio, fetch_audio
from utils import model_names, available_gguf_llms
import base64
import html as html_lib  # Escapes transcript text before it is embedded in HTML
import json  # Serializes utterance data for the embedded JS component
import os  # Used to preserve the uploaded file's extension

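# Streamlit reruns this script from top to bottom on every interaction, so any
# state that must survive a rerun (transcript, audio, flags) lives in
# st.session_state rather than in ordinary local variables.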
# Session state init
if "transcript" not in st.session_state:
    st.session_state.transcript = ""
if "summary" not in st.session_state:
    st.session_state.summary = ""
if "status" not in st.session_state:
    st.session_state.status = "Ready"
if "audio_path" not in st.session_state:
    st.session_state.audio_path = None
if "utterances" not in st.session_state:
    st.session_state.utterances = []
if "audio_base64" not in st.session_state:
    st.session_state.audio_base64 = None
if "prev_audio_path" not in st.session_state:
    st.session_state.prev_audio_path = None
if "transcribing" not in st.session_state:
    st.session_state.transcribing = False

st.set_page_config(page_title="🎙️ Moonshine ASR + LLM", layout="wide")
st.title("🎙️ Speech Summarization with Moonshine ASR & LLM")

with st.sidebar:
    st.header("βš™οΈ Settings")
    vad_threshold = st.slider("VAD Threshold", 0.1, 0.9, 0.5)
    model_name = st.selectbox("Moonshine Model", model_names.keys())
    llm_model = st.selectbox("LLM for Summarization", list(available_gguf_llms.keys()))
    prompt_input = st.text_area("Custom Prompt", value="Summarize the transcript below.")

tab1, tab2, tab3 = st.tabs(["📻 Podcast", "🎵 Audio Input", "📄 Results"])

with tab1:
    st.subheader("Search Podcast")
    query = st.text_input("Enter podcast name")
    if st.button("Search Series"):
        series_list = search_podcast_series(query)
        st.session_state.series_list = series_list

    if "series_list" in st.session_state:
        series_titles = [f"{s['title']} by {s['artist']}" for s in st.session_state.series_list]
        selected_title = st.selectbox("Select Series", series_titles)
        series = next((s for s in st.session_state.series_list if f"{s['title']} by {s['artist']}" == selected_title), None)
        if series:
            st.image(series["thumbnail"], width=150)
            st.text_area("Series Info", value=f"Title: {series['title']}\nArtist: {series['artist']}\nEpisodes: {series['episode_count']}", disabled=True)
            if st.button("Load Episodes"):
                episodes = fetch_episodes(series["feed_url"])
                st.session_state.episodes = episodes
            if "episodes" in st.session_state:
                episode_titles = [e["title"] for e in st.session_state.episodes]
                selected_episode = st.selectbox("Select Episode", episode_titles)
                episode = next((e for e in st.session_state.episodes if e["title"] == selected_episode), None)
                if episode:
                    st.text_area("Episode Info", value=f"Title: {episode['title']}\nPublished: {episode['published']}\nDuration: {episode['duration']}", disabled=True)
                    if st.button("Download Episode"):
                        audio_path, status = download_podcast_audio(episode["audio_url"], episode["title"], st.session_state.status)
                        st.session_state.audio_path = audio_path
                        st.session_state.status = status

with tab2:
    st.subheader("Upload or Fetch Audio")
    youtube_url = st.text_input("YouTube URL")
    if st.button("Fetch from YouTube"):
        audio_path, status = fetch_audio(youtube_url, st.session_state.status)
        st.session_state.audio_path = audio_path
        st.session_state.audio_base64 = None  # Clear stale base64 from any previous audio
        st.session_state.status = status

    uploaded_file = st.file_uploader("Upload Audio", type=["mp3", "wav"])
    if uploaded_file:
        # Save the upload to /tmp (the app directory may be read-only),
        # keeping the original extension so downstream code sees the right format
        suffix = os.path.splitext(uploaded_file.name)[1] or ".mp3"
        temp_audio_path = f"/tmp/temp_audio{suffix}"
        with open(temp_audio_path, "wb") as f:
            f.write(uploaded_file.getbuffer())
        st.session_state.audio_path = temp_audio_path
        st.session_state.audio_base64 = None  # Clear stale base64 from any previous audio

with tab3:

    if st.session_state.audio_path and st.session_state.get("prev_audio_path") != st.session_state.audio_path:
        st.session_state.audio_base64 = None
        st.session_state.prev_audio_path = st.session_state.audio_path

    st.subheader("🎤 Transcription & Summary")
    st.markdown("---")
    
    status_placeholder = st.empty()
    summary_container = st.container()

    # ===== Audio Player and Transcript Logic =====
    # If we have an audio path, prepare the base64 encoding
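    # A data-URI embed adds ~33% base64 overhead and is re-sent to the browser
    # on every render -- fine for short clips, heavier for hour-long episodes;
    # caching the encoded string in session_state at least avoids re-encoding.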
    if st.session_state.audio_path and not st.session_state.audio_base64:
        try:
            with open(st.session_state.audio_path, "rb") as f:
                audio_bytes = f.read()
            st.session_state.audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
        except Exception as e:
            st.error(f"Audio loading error: {str(e)}")

    def create_interactive_player(audio_base64, utterances, mime_type="audio/mpeg"):
        """
        Generates a single, self-contained HTML component for the audio player
        and the interactive transcript.
        
        Why this works:
        - All HTML (player, transcript) and JavaScript logic live in the SAME context.
        - No more complex, failing postMessage communication between different iframes.
        - Highlighting is handled instantly in the browser, not by slow Python reruns.
        - Clicking to seek is also instant, as the JS has direct access to the player.
        """
        
        # Escape the transcript text so stray markup cannot break the HTML,
        # then serialize the utterance data for JavaScript
        safe_utterances = [
            (start, end, html_lib.escape(text)) for start, end, text in utterances
        ]
        utterances_json = json.dumps(safe_utterances)

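        # The component below is one Python f-string, so literal CSS/JS braces
        # are doubled ({{ }}) while single braces interpolate Python values.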
        html_content = f"""
        <!DOCTYPE html>
        <html>
        <head>
        <meta charset="UTF-8">
        <title>Interactive Player</title>
        <style>
            body {{ font-family: sans-serif; }}
            .utterance {{
                padding: 10px; margin: 5px 0; border-radius: 8px;
                cursor: pointer; transition: all 0.2s ease-in-out;
                border: 1px solid #e0e0e0; line-height: 1.6;
            }}
            .utterance:hover {{
                background-color: #f5f5f5;
                transform: translateX(4px);
            }}
            .current-utterance {{
                background-color: #fff3e0 !important;
                border-left: 5px solid #ff9800;
                font-weight: 600;
            }}
            #transcript-container {{
                max-height: 500px;
                overflow-y: auto;
                padding-right: 10px;
            }}
            audio {{
                width: 100%;
                margin-bottom: 20px;
            }}
        </style>
        </head>
        <body>
            <audio id="audioPlayer" controls>
                <source src="data:audio/mp3;base64,{audio_base64}" type="audio/mp3">
                Your browser does not support the audio element.
            </audio>

            <div id="transcript-container"></div>

            <script>
                const player = document.getElementById('audioPlayer');
                const transcriptContainer = document.getElementById('transcript-container');
                const utterances = {utterances_json};
                let currentHighlight = null;

                // 1. Function to build the transcript from data
                function buildTranscript() {{
                    utterances.forEach((utt, index) => {{
                        if (utt.length !== 3) return; // Skip malformed utterances
                        const [start, end, text] = utt;
                        const utteranceDiv = document.createElement('div');
                        utteranceDiv.className = 'utterance';
                        utteranceDiv.dataset.start = start;
                        utteranceDiv.dataset.end = end;
                        utteranceDiv.dataset.index = index;
                        
                        // substring(11, 19) of the ISO string gives HH:MM:SS,
                        // so hour-plus audio still formats correctly
                        const startTime = new Date(start * 1000).toISOString().substring(11, 19);
                        
                        utteranceDiv.innerHTML = `<b>[${{startTime}}]</b> ${{text}}`;
                        
                        // Click-to-seek: jump the player to this utterance's start time
                        utteranceDiv.addEventListener('click', () => {{
                            player.currentTime = start;
                            player.play();
                        }});
                        
                        transcriptContainer.appendChild(utteranceDiv);
                    }});
                }}

                // 2. Highlight the utterance containing the current playback position
                function updateHighlight() {{
                    const currentTime = player.currentTime;
                    let activeUtterance = null;
                    
                    for (const utt of utterances) {{
                        const [start, end] = utt;
                        if (currentTime >= start && currentTime < end) {{
                            activeUtterance = utt;
                            break;
                        }}
                    }}

                    // Find the div corresponding to the active utterance
                    let activeDiv = null;
                    if (activeUtterance) {{
                        activeDiv = transcriptContainer.querySelector(`[data-start="${{activeUtterance[0]}}"]`);
                    }}

                    if (activeDiv !== currentHighlight) {{
                        // Remove highlight from the previous element
                        if (currentHighlight) {{
                            currentHighlight.classList.remove('current-utterance');
                        }}
                        
                        // Add highlight to the new element
                        if (activeDiv) {{
                            activeDiv.classList.add('current-utterance');
                            
                            // Auto-scroll into view
                            activeDiv.scrollIntoView({{ behavior: 'smooth', block: 'center' }});
                        }}
                        currentHighlight = activeDiv;
                    }}
                }}

                // 3. Attach listeners
                buildTranscript();
                player.addEventListener('timeupdate', updateHighlight);

            </script>
        </body>
        </html>
        """
        return html_content

    # Placeholder for transcript display (either streaming text or interactive player)
    transcript_display = st.empty()

    # ===== Transcription Process =====
    if st.button("πŸŽ™οΈ Transcribe Audio", key="transcribe_button_tab3"):
        if st.session_state.audio_path:
            status_placeholder.info("🔊 Transcribing audio... Please wait.")
            # Reset previous results
            st.session_state.utterances = []
            st.session_state.transcript = ""
            st.session_state.transcribing = True
            
            # Set up live streaming display
            with transcript_display.container():
                st.markdown("### πŸ“ Live Transcript (Streaming)")
                live_placeholder = st.empty()
            
            try:
                transcription_gen = transcribe_file(
                    st.session_state.audio_path, 
                    vad_threshold, 
                    model_names[model_name]
                )
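                # Assumed contract, inferred from how the values are used below:
                # transcribe_file yields (new_text, all_utterances_so_far), where
                # each utterance is a (start_sec, end_sec, text) triple -- the same
                # shape the interactive player's JavaScript destructures.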
                for _, all_utts in transcription_gen:
                    st.session_state.utterances = list(all_utts) if all_utts else []
                    st.session_state.transcript = "\n".join(
                        text for _, _, text in st.session_state.utterances
                    )
                    live_placeholder.markdown(st.session_state.transcript)
                
                st.session_state.transcribing = False
                status_placeholder.success("✅ Transcription completed! The interactive player is now active.")
                # Rerun so the display block below swaps the live text for the interactive player
                st.rerun()
            except Exception as e:
                status_placeholder.error(f"Transcription error: {str(e)}")
                st.session_state.transcribing = False
        else:
            status_placeholder.warning("⚠️ No audio file available")
    
    # ===== Summarization Process =====
    if st.button("πŸ“ Generate Summary", key="summarize_button_tab3"):
        if st.session_state.transcript:
            status_placeholder.info("🧠 Generating summary...")
            st.session_state.summary = ""
            summary_container.empty() # Clear old summary

            live_summary_area = st.empty()
            with live_summary_area.container():
                st.markdown("### πŸ“ Live Summary (In Progress)")
                progress_placeholder = st.empty()

            try:
                summary_gen = summarize_transcript(st.session_state.transcript, llm_model, prompt_input)
                # Keep the latest accumulated summary in session_state as it streams
                for accumulated_summary in summary_gen:
                    st.session_state.summary = accumulated_summary
                    progress_placeholder.markdown(accumulated_summary)
            except Exception as e:
                status_placeholder.error(f"Summarization error: {str(e)}")
            finally:
                # Clear the live placeholder; the final summary is rendered below
                live_summary_area.empty()
            
        else:
            status_placeholder.warning("⚠️ No transcript available")
            
    # Display the interactive player if transcription is complete
    if st.session_state.get("audio_base64") and st.session_state.get("utterances") and not st.session_state.transcribing:
        # Choose the MIME type from the file extension (mp3 and wav are the two supported uploads)
        mime_type = "audio/wav" if str(st.session_state.audio_path).endswith(".wav") else "audio/mpeg"
        component_html = create_interactive_player(st.session_state.audio_base64, st.session_state.utterances, mime_type)
        # Height heuristic: ~50px per utterance plus ~100px for the player, clamped to 200-600px
        estimated_height = min(600, max(200, len(st.session_state.utterances) * 50 + 100))
        with transcript_display.container():
            components.html(component_html, height=estimated_height, scrolling=True)
    elif not st.session_state.utterances and not st.session_state.transcribing:
        with transcript_display.container():
            st.info("No transcript available. Click 'Transcribe Audio' to generate one.")

    # Display the final summary whenever one exists in session state
    if st.session_state.summary:
        with summary_container:
            st.markdown("### 📝 Final Summary")
            st.markdown(st.session_state.summary)