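"""Lecture notes generator.

A Gradio app: upload an audio/video file, get back OCR'd slide/whiteboard text
plus a speech transcript with an automatic summary.
"""
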
import gradio as gr
import os
import tempfile
import speech_recognition as sr
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from moviepy.editor import VideoFileClip
from pytesseract import image_to_string
from PIL import Image
import cv2
from transformers import pipeline
import concurrent.futures
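
# Assumed dependencies (not pinned in this file): gradio, SpeechRecognition,
# nltk, moviepy<2.0 (the moviepy.editor import path was removed in 2.0),
# pytesseract plus a Tesseract binary on the PATH, Pillow, opencv-python,
# and transformers with a torch backend.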

# Download the NLTK resources used by word_tokenize and pos_tag. Newer NLTK
# releases (3.9+) look for the *_tab / *_eng variants, so both spellings are
# requested; downloads are skipped if the data is already cached.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

# Use a distilled (faster) summarization model
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
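# Creating the pipeline downloads the model weights on first run and keeps the
# model in memory so every request reuses it.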

# Extract the audio track from a video file to a WAV for transcription
def extract_audio(video_path):
    video = VideoFileClip(video_path)
    # Unique temp path so concurrent requests don't clobber each other's audio
    fd, audio_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    # verbose=/logger= silence moviepy's console output (moviepy 1.x API)
    video.audio.write_audiofile(audio_path, verbose=False, logger=None)
    video.close()  # release file handles so cleanup can delete the source
    return audio_path

def transcribe_audio(audio_path):
    recognizer = sr.Recognizer()
    # sr.AudioFile reads WAV/AIFF/FLAC; video uploads are converted to WAV first
    with sr.AudioFile(audio_path) as source:
        audio = recognizer.record(source, duration=30)  # limit to the first 30s
    # Google's free Web Speech API: needs an internet connection; raises
    # sr.UnknownValueError / sr.RequestError on failure
    return recognizer.recognize_google(audio)

# Extract stemmed noun/verb keywords with their POS tags (not currently wired
# into the Gradio UI below)
def extract_keywords(text):
    tokens = word_tokenize(text)
    pos_tags = nltk.pos_tag(tokens)
    stemmer = PorterStemmer()
    return list(set(f"{stemmer.stem(w.lower())} ({t})" for w, t in pos_tags if t.startswith("NN") or t.startswith("VB")))
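# Rough example (exact POS tags depend on the tagger version, and ordering is
# arbitrary because the results pass through a set):
#   extract_keywords("Dogs run fast") -> ["dog (NNS)", "run (VBP)"]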

def summarize_text(text, ratio="short"):
    # Map the requested summary length to generation bounds (in tokens)
    max_len, min_len = (100, 30) if ratio == "short" else (150, 50) if ratio == "medium" else (250, 80)
    if len(text.split()) < min_len:
        return "Transcript is too short to summarize."
    # Split into ~1000-character chunks so each fits the model's input window,
    # then summarize each chunk and concatenate the results
    chunks = [text[i:i+1000] for i in range(0, len(text), 1000)]
    summary = ""
    for chunk in chunks:
        sum_out = summarizer(chunk, max_length=max_len, min_length=min_len, do_sample=False)
        summary += sum_out[0]['summary_text'] + " "
    return summary.strip()
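# Design note: character-based chunking can split sentences mid-way; a
# sentence-aware splitter such as nltk.sent_tokenize would give cleaner chunk
# boundaries at a small extra cost.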

# OCR slide/whiteboard text by sampling one frame out of every `sample_every`,
# scanning at most `max_frames` frames of the video
def extract_slide_text(video_path, max_frames=600, sample_every=30):
    cap = cv2.VideoCapture(video_path)
    frame_count = 0
    ocr_texts = set()  # de-duplicates text from repeated slides
    while cap.isOpened() and frame_count < max_frames:
        ret, frame = cap.read()
        if not ret:
            break
        if frame_count % sample_every == 0:
            # OpenCV frames are BGR; convert to RGB before handing to Tesseract
            image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            text = image_to_string(image)
            if text.strip():
                ocr_texts.add(text.strip())
        frame_count += 1
    cap.release()
    return "\n\n".join(ocr_texts)

def process_uploaded_file(file_path):
    # gr.File(type="filepath") hands us the path of Gradio's temp copy directly
    audio_path = file_path
    slide_text = ""

    try:
        is_video = file_path.lower().endswith((".mp4", ".mov", ".avi", ".mkv"))
        if is_video:
            audio_path = extract_audio(file_path)

        # Run OCR (video only) and transcription in parallel
        with concurrent.futures.ThreadPoolExecutor() as executor:
            ocr_future = executor.submit(extract_slide_text, file_path) if is_video else None
            trans_future = executor.submit(transcribe_audio, audio_path)

            transcript = trans_future.result()
            slide_text = ocr_future.result() if ocr_future else ""

        return transcript, slide_text

    except Exception as e:
        return f"🚫 Error: {e}", ""

    finally:
        # Only remove the audio we extracted; Gradio manages its own temp upload
        if audio_path != file_path and os.path.exists(audio_path):
            os.remove(audio_path)

# Top-level handler wired into the Gradio interface below
def generate_notes(file_path):
    if not file_path:
        return "No file uploaded.", "No file uploaded."

    transcript, slide_text = process_uploaded_file(file_path)

    if slide_text:
        slide_text_display = f"🖼️ Slide/Whiteboard Extracted Text: \n{slide_text}"
    else:
        slide_text_display = "No slide/whiteboard text extracted."

    if transcript.startswith("🚫"):
        # Surface processing errors instead of misreporting them as a short transcript
        transcript_display = transcript
    elif len(transcript.split()) < 30:
        transcript_display = f"📜 Full Transcription: \n{transcript}\n\n(Transcript too short for a meaningful summary.)"
    else:
        summary_mode = "short"  # default summary length
        summary = summarize_text(transcript, ratio=summary_mode)
        transcript_display = f"📜 Full Transcription: \n{transcript}\n\n📋 Lecture Summary: \n{summary}"

    return slide_text_display, transcript_display


# Gradio interface: type="filepath" passes the upload's temp path (a str) to
# the handler
inputs = gr.File(label="Upload Audio/Video File (Any Format)", type="filepath")
outputs = [gr.Textbox(label="Slide Text"), gr.Textbox(label="Lecture Transcript and Summary")]

# Processing is slow (transcription + OCR + summarization), so run on explicit
# submit rather than live on every input change
gr.Interface(fn=generate_notes, inputs=inputs, outputs=outputs).launch()
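
# To run locally (a sketch; "app.py" is a placeholder for this file's name):
#   python app.py
# then open the printed local URL. Pass share=True to launch() for a public link.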