# Lecture-notes generator: transcribe, OCR, and summarize an uploaded lecture file.
import gradio as gr
import os
import tempfile
import speech_recognition as sr
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from moviepy.editor import VideoFileClip
from pytesseract import image_to_string
from PIL import Image
import cv2
from transformers import pipeline
import concurrent.futures
# Downloads
# NLTK data required at runtime: 'punkt' backs word_tokenize, and the
# perceptron tagger backs nltk.pos_tag (both used in extract_keywords).
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
# Use faster summarization model
# Loaded once at import time. distilbart-cnn-12-6 is a distilled
# bart-large-cnn — smaller/faster at some cost in summary quality.
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
# --- Media-processing helpers ---
def extract_audio(video_path):
    """Extract the audio track of *video_path* to a WAV file.

    Returns the path of the written .wav file.  A unique temporary file is
    used instead of the previous fixed "extracted_audio.wav" so concurrent
    requests cannot clobber each other's output; the caller is responsible
    for deleting it (process_uploaded_file does so in its finally block).
    """
    video = VideoFileClip(video_path)
    # delete=False: we only want a unique path; moviepy writes the content.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        audio_path = tmp.name
    # NOTE(review): video.audio is None for silent videos — this would raise
    # AttributeError, which the caller's broad except turns into an error
    # message; confirm that is the desired UX.
    video.audio.write_audiofile(audio_path, verbose=False, logger=None)
    return audio_path
def transcribe_audio(audio_path):
    """Transcribe up to the first 30 seconds of *audio_path*.

    Uses the free Google Web Speech API via the SpeechRecognition package
    and returns the recognized text.
    """
    rec = sr.Recognizer()
    with sr.AudioFile(audio_path) as source:
        # Cap the recording at 30 s to keep request latency bounded.
        clip = rec.record(source, duration=30)
    return rec.recognize_google(clip)
def extract_keywords(text):
    """Return the unique stemmed nouns and verbs found in *text*.

    Each entry is formatted as "stem (POS-tag)"; only NN* (noun) and
    VB* (verb) tags are kept.
    """
    stemmer = PorterStemmer()
    found = set()
    for word, tag in nltk.pos_tag(word_tokenize(text)):
        if tag.startswith(("NN", "VB")):
            found.add(f"{stemmer.stem(word.lower())} ({tag})")
    return list(found)
def summarize_text(text, ratio="short"):
    """Summarize *text* with the module-level `summarizer` pipeline.

    *ratio* selects the target summary length: "short" (default),
    "medium", or anything else for the long preset.  Text shorter than
    the preset's minimum word count is rejected with a message.
    """
    presets = {"short": (100, 30), "medium": (150, 50)}
    max_len, min_len = presets.get(ratio, (250, 80))
    if len(text.split()) < min_len:
        return "Transcript is too short to summarize."
    # The model has a bounded input window, so feed it 1000-char slices
    # and stitch the partial summaries back together.
    pieces = []
    for start in range(0, len(text), 1000):
        out = summarizer(
            text[start:start + 1000],
            max_length=max_len,
            min_length=min_len,
            do_sample=False,
        )
        pieces.append(out[0]['summary_text'] + " ")
    return "".join(pieces).strip()
def extract_slide_text(video_path):
    """OCR slide/whiteboard text from sampled frames of *video_path*.

    Samples every 30th frame (about one per second at 30 fps) until 20
    samples have been OCR'd or the video ends, and returns the distinct
    non-empty texts joined by blank lines (sorted for deterministic output).

    Bug fix: the original loop stopped after the first 20 *frames* while
    only OCR-ing frames where index % 30 == 0 — so only frame 0 was ever
    processed.  The cap now counts OCR'd samples, not raw frames.
    """
    cap = cv2.VideoCapture(video_path)
    ocr_texts = set()
    frame_idx = 0
    sampled = 0
    while cap.isOpened() and sampled < 20:
        ret, frame = cap.read()
        if not ret:
            break
        if frame_idx % 30 == 0:
            sampled += 1
            # OpenCV yields BGR; PIL/tesseract expect RGB.
            image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            text = image_to_string(image)
            if text.strip():
                ocr_texts.add(text.strip())
        frame_idx += 1
    cap.release()
    return "\n\n".join(sorted(ocr_texts))
def process_uploaded_file(uploaded_file):
    """Persist an uploaded file, then transcribe it (and OCR it if video).

    *uploaded_file* is a file-like object exposing ``.name`` and ``.read()``
    (the legacy Gradio File value).  Returns ``(transcript, slide_text)``;
    on any failure the transcript slot carries an error message and
    slide_text is "".  All temporary files are removed before returning.

    Bug fixes vs. the original:
    - removed a stray ``st.info(...)`` call (Streamlit leftover; ``st`` is
      undefined here, so every upload raised NameError),
    - the video-extension check is computed once with ``.lower()`` (the
      second check previously missed it),
    - the temp file keeps only the upload's extension as suffix, not the
      whole filename.
    """
    # Preserve the extension so the format check below still works.
    suffix = os.path.splitext(uploaded_file.name)[1] or uploaded_file.name
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
        temp_file.write(uploaded_file.read())
        file_path = temp_file.name
    audio_path = file_path
    slide_text = ""
    is_video = file_path.lower().endswith((".mp4", ".mov", ".avi", ".mkv"))
    try:
        if is_video:
            audio_path = extract_audio(file_path)
        # Run OCR (video only) and transcription concurrently — both are
        # I/O / native-library bound, so threads overlap the waits.
        with concurrent.futures.ThreadPoolExecutor() as executor:
            ocr_future = executor.submit(extract_slide_text, file_path) if is_video else None
            trans_future = executor.submit(transcribe_audio, audio_path)
            transcript = trans_future.result()
            slide_text = ocr_future.result() if ocr_future else ""
        return transcript, slide_text
    except Exception as e:
        # Surface the failure to the UI instead of crashing the request.
        return f"π« Error: {e}", ""
    finally:
        os.remove(file_path)
        if audio_path != file_path and os.path.exists(audio_path):
            os.remove(audio_path)
# Gradio Interface for input and output
def generate_notes(uploaded_file):
    """Gradio callback: turn an upload into (slide text, transcript+summary).

    Returns two display strings — one for OCR'd slide text and one for the
    full transcript plus its short summary (or a too-short notice).
    """
    transcript, slide_text = process_uploaded_file(uploaded_file)
    slide_text_display = (
        f"πΌοΈ Slide/Whiteboard Extracted Text: \n{slide_text}"
        if slide_text
        else "No slide/whiteboard text extracted."
    )
    if len(transcript.split()) < 30:
        transcript_display = "Transcript too short for a meaningful summary."
    else:
        # Default to the "short" summary preset.
        summary = summarize_text(transcript, ratio="short")
        transcript_display = (
            f"π Full Transcription: \n{transcript}"
            f"\n\nπ Lecture Summary: \n{summary}"
        )
    return slide_text_display, transcript_display
# Gradio Interface
# NOTE(review): type="file" is the legacy Gradio 3 API (the callback gets a
# tempfile-like object with .name/.read, which process_uploaded_file relies
# on); Gradio 4 removed it in favor of type="filepath" — confirm the pinned
# gradio version before upgrading.
inputs = gr.File(label="Upload Audio/Video File (Any Format)", type="file")
outputs = [gr.Textbox(label="Slide Text"), gr.Textbox(label="Lecture Transcript and Summary")]
# live=True re-runs the callback on every input change; launch() blocks and
# serves the app.
gr.Interface(fn=generate_notes, inputs=inputs, outputs=outputs, live=True).launch()