import gradio as gr
import os
import tempfile
import speech_recognition as sr
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from moviepy.editor import VideoFileClip
from pytesseract import image_to_string
from PIL import Image
import cv2
from transformers import pipeline
import concurrent.futures

# NLTK data downloads (tokenizer and POS tagger)
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Use a faster, distilled summarization model
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")


def extract_audio(video_path):
    """Extract a video's audio track to a temporary WAV file and return its path."""
    video = VideoFileClip(video_path)
    audio_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
    video.audio.write_audiofile(audio_path, verbose=False, logger=None)
    video.close()
    return audio_path


def transcribe_audio(audio_path):
    """Transcribe up to the first 30 seconds of audio with Google's free recognizer."""
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_path) as source:
        audio = recognizer.record(source, duration=30)  # limit to 30 s
    return recognizer.recognize_google(audio)


def extract_keywords(text):
    """Return de-duplicated stemmed nouns and verbs, each tagged with its POS."""
    tokens = word_tokenize(text)
    pos_tags = nltk.pos_tag(tokens)
    stemmer = PorterStemmer()
    return list(set(
        f"{stemmer.stem(word.lower())} ({tag})"
        for word, tag in pos_tags
        if tag.startswith("NN") or tag.startswith("VB")
    ))


def summarize_text(text, ratio="short"):
    """Summarize the transcript in ~1000-character chunks and join the results."""
    lengths = {"short": (100, 30), "medium": (150, 50), "long": (250, 80)}
    max_len, min_len = lengths.get(ratio, lengths["short"])
    if len(text.split()) < min_len:
        return "Transcript is too short to summarize."
    chunks = [text[i:i + 1000] for i in range(0, len(text), 1000)]
    summary = ""
    for chunk in chunks:
        sum_out = summarizer(chunk, max_length=max_len, min_length=min_len, do_sample=False)
        summary += sum_out[0]['summary_text'] + " "
    return summary.strip()


def extract_slide_text(video_path):
    """OCR every 30th frame (up to 20 samples) to capture slide/whiteboard text."""
    cap = cv2.VideoCapture(video_path)
    frame_count = 0
    sampled = 0
    ocr_texts = set()
    while cap.isOpened() and sampled < 20:
        ret, frame = cap.read()
        if not ret:
            break
        if frame_count % 30 == 0:
            image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            text = image_to_string(image)
            if text.strip():
                ocr_texts.add(text.strip())
            sampled += 1
        frame_count += 1
    cap.release()
    return "\n\n".join(ocr_texts)


def process_uploaded_file(file_path):
    """Extract audio from videos, then run OCR and transcription in parallel."""
    audio_path = file_path
    is_video = file_path.lower().endswith((".mp4", ".mov", ".avi", ".mkv"))
    try:
        if is_video:
            audio_path = extract_audio(file_path)
        # Run OCR and transcription concurrently to cut wall-clock time.
        with concurrent.futures.ThreadPoolExecutor() as executor:
            ocr_future = executor.submit(extract_slide_text, file_path) if is_video else None
            trans_future = executor.submit(transcribe_audio, audio_path)
            transcript = trans_future.result()
            slide_text = ocr_future.result() if ocr_future else ""
        return transcript, slide_text
    except Exception as e:
        return f"🚫 Error: {e}", ""
    finally:
        # Remove only the extracted audio file; Gradio manages the uploaded file itself.
        if audio_path != file_path and os.path.exists(audio_path):
            os.remove(audio_path)
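
# Note: extract_keywords() above is defined but never wired into the UI. An
# illustrative call (hypothetical input, not from the original script) and its
# rough expected output, for anyone who wants to surface keywords as an extra
# textbox in generate_notes() below:
#
#   extract_keywords("students often review lecture notes")
#   # -> roughly ['student (NNS)', 'review (VBP)', 'lectur (NN)', 'note (NNS)']
#   #    (set order and exact POS tags may vary; 'lectur' is the Porter stem)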
def generate_notes(file_path):
    """Build the two displayed outputs: slide text, and transcript plus summary."""
    transcript, slide_text = process_uploaded_file(file_path)

    if slide_text:
        slide_text_display = f"šŸ–¼ļø Slide/Whiteboard Extracted Text:\n{slide_text}"
    else:
        slide_text_display = "No slide/whiteboard text extracted."

    if len(transcript.split()) < 30:
        # Short transcripts (and error messages) are shown as-is, unsummarized.
        transcript_display = (
            f"šŸ“œ Full Transcription:\n{transcript}\n\n"
            "(Transcript too short for a meaningful summary.)"
        )
    else:
        summary = summarize_text(transcript, ratio="short")  # default summary mode
        transcript_display = (
            f"šŸ“œ Full Transcription:\n{transcript}\n\n"
            f"šŸ“‹ Lecture Summary:\n{summary}"
        )

    return slide_text_display, transcript_display


# Gradio interface
inputs = gr.File(label="Upload Audio/Video File (Any Format)", type="filepath")
outputs = [
    gr.Textbox(label="Slide Text"),
    gr.Textbox(label="Lecture Transcript and Summary"),
]
gr.Interface(fn=generate_notes, inputs=inputs, outputs=outputs).launch()
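
# Smoke test (sketch): to exercise the pipeline without the web UI, comment out
# the .launch() call above and run the lines below instead. "sample_lecture.mp4"
# is a hypothetical local path, not a file shipped with this script.
#
#   slide_text, transcript = generate_notes("sample_lecture.mp4")
#   print(slide_text)
#   print(transcript)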