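"""Lecture notes generator.

A Gradio app: upload an audio/video file, get back OCR'd slide/whiteboard text
plus a speech transcript with an automatic summary.
"""
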
import gradio as gr
import os
import tempfile
import speech_recognition as sr
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from moviepy.editor import VideoFileClip
from pytesseract import image_to_string
from PIL import Image
import cv2
from transformers import pipeline
import concurrent.futures
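
# Assumed dependencies (not pinned in this file): gradio, SpeechRecognition,
# nltk, moviepy<2.0 (the moviepy.editor import path was removed in 2.0),
# pytesseract plus a Tesseract binary on the PATH, Pillow, opencv-python,
# and transformers with a torch backend.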

# Download the NLTK resources used by word_tokenize and pos_tag. Newer NLTK
# releases (3.9+) look for the *_tab / *_eng variants, so both spellings are
# requested; downloads are skipped if the data is already cached.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

# Use a distilled (faster) summarization model
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
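# Creating the pipeline downloads the model weights on first run and keeps the
# model in memory so every request reuses it.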

# Extract the audio track from a video file to a WAV for transcription
def extract_audio(video_path):
    video = VideoFileClip(video_path)
    # Unique temp path so concurrent requests don't clobber each other's audio
    fd, audio_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    # verbose=/logger= silence moviepy's console output (moviepy 1.x API)
    video.audio.write_audiofile(audio_path, verbose=False, logger=None)
    video.close()  # release file handles so cleanup can delete the source
    return audio_path

def transcribe_audio(audio_path):
    recognizer = sr.Recognizer()
    # sr.AudioFile reads WAV/AIFF/FLAC; video uploads are converted to WAV first
    with sr.AudioFile(audio_path) as source:
        audio = recognizer.record(source, duration=30)  # limit to the first 30s
    # Google's free Web Speech API: needs an internet connection; raises
    # sr.UnknownValueError / sr.RequestError on failure
    return recognizer.recognize_google(audio)

# Extract stemmed noun/verb keywords with their POS tags (not currently wired
# into the Gradio UI below)
def extract_keywords(text):
    tokens = word_tokenize(text)
    pos_tags = nltk.pos_tag(tokens)
    stemmer = PorterStemmer()
    return list(set(f"{stemmer.stem(w.lower())} ({t})" for w, t in pos_tags if t.startswith("NN") or t.startswith("VB")))
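# Rough example (exact POS tags depend on the tagger version, and ordering is
# arbitrary because the results pass through a set):
#   extract_keywords("Dogs run fast") -> ["dog (NNS)", "run (VBP)"]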

def summarize_text(text, ratio="short"):
    # Map the requested summary length to generation bounds (in tokens)
    max_len, min_len = (100, 30) if ratio == "short" else (150, 50) if ratio == "medium" else (250, 80)
    if len(text.split()) < min_len:
        return "Transcript is too short to summarize."
    # Split into ~1000-character chunks so each fits the model's input window,
    # then summarize each chunk and concatenate the results
    chunks = [text[i:i+1000] for i in range(0, len(text), 1000)]
    summary = ""
    for chunk in chunks:
        sum_out = summarizer(chunk, max_length=max_len, min_length=min_len, do_sample=False)
        summary += sum_out[0]['summary_text'] + " "
    return summary.strip()
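# Design note: character-based chunking can split sentences mid-way; a
# sentence-aware splitter such as nltk.sent_tokenize would give cleaner chunk
# boundaries at a small extra cost.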

# OCR slide/whiteboard text by sampling one frame out of every `sample_every`,
# scanning at most `max_frames` frames of the video
def extract_slide_text(video_path, max_frames=600, sample_every=30):
    cap = cv2.VideoCapture(video_path)
    frame_count = 0
    ocr_texts = set()  # de-duplicates text from repeated slides
    while cap.isOpened() and frame_count < max_frames:
        ret, frame = cap.read()
        if not ret:
            break
        if frame_count % sample_every == 0:
            # OpenCV frames are BGR; convert to RGB before handing to Tesseract
            image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            text = image_to_string(image)
            if text.strip():
                ocr_texts.add(text.strip())
        frame_count += 1
    cap.release()
    return "\n\n".join(ocr_texts)

def process_uploaded_file(file_path):
    # gr.File(type="filepath") hands us the path of Gradio's temp copy directly
    audio_path = file_path
    slide_text = ""

    try:
        is_video = file_path.lower().endswith((".mp4", ".mov", ".avi", ".mkv"))
        if is_video:
            audio_path = extract_audio(file_path)

        # Run OCR (video only) and transcription in parallel
        with concurrent.futures.ThreadPoolExecutor() as executor:
            ocr_future = executor.submit(extract_slide_text, file_path) if is_video else None
            trans_future = executor.submit(transcribe_audio, audio_path)

            transcript = trans_future.result()
            slide_text = ocr_future.result() if ocr_future else ""

        return transcript, slide_text

    except Exception as e:
        return f"🚫 Error: {e}", ""

    finally:
        # Only remove the audio we extracted; Gradio manages its own temp upload
        if audio_path != file_path and os.path.exists(audio_path):
            os.remove(audio_path)

# Top-level handler wired into the Gradio interface below
def generate_notes(file_path):
    if not file_path:
        return "No file uploaded.", "No file uploaded."

    transcript, slide_text = process_uploaded_file(file_path)

    if slide_text:
        slide_text_display = f"🖼️ Slide/Whiteboard Extracted Text: \n{slide_text}"
    else:
        slide_text_display = "No slide/whiteboard text extracted."

    if transcript.startswith("🚫"):
        # Surface processing errors instead of misreporting them as a short transcript
        transcript_display = transcript
    elif len(transcript.split()) < 30:
        transcript_display = f"📜 Full Transcription: \n{transcript}\n\n(Transcript too short for a meaningful summary.)"
    else:
        summary_mode = "short"  # default summary length
        summary = summarize_text(transcript, ratio=summary_mode)
        transcript_display = f"📜 Full Transcription: \n{transcript}\n\n📋 Lecture Summary: \n{summary}"

    return slide_text_display, transcript_display


# Gradio interface: type="filepath" passes the upload's temp path (a str) to
# the handler
inputs = gr.File(label="Upload Audio/Video File (Any Format)", type="filepath")
outputs = [gr.Textbox(label="Slide Text"), gr.Textbox(label="Lecture Transcript and Summary")]

# Processing is slow (transcription + OCR + summarization), so run on explicit
# submit rather than live on every input change
gr.Interface(fn=generate_notes, inputs=inputs, outputs=outputs).launch()
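
# To run locally (a sketch; "app.py" is a placeholder for this file's name):
#   python app.py
# then open the printed local URL. Pass share=True to launch() for a public link.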