# Hugging Face Space application (page status when captured: Sleeping).
import os | |
import faiss | |
import numpy as np | |
import re | |
import nltk | |
import google.generativeai as genai | |
from sentence_transformers import SentenceTransformer | |
from nltk.tokenize import sent_tokenize | |
import gradio as gr | |
# Download only the tokenizer data that sent_tokenize relies on.
# The previous nltk.download('all') pulled the entire NLTK data collection on
# every start even though this script only tokenizes sentences.
# "punkt_tab" is the resource name newer NLTK releases look up for the Punkt
# tokenizer; "punkt" is kept for older releases.
for _nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(_nltk_resource)
# Path of the transcript to index. Defaults to "transcript.txt" (upload this
# file manually or use an existing one); set the TRANSCRIPT_FILE environment
# variable to point at a different file without editing the source.
TRANSCRIPT_FILE = os.getenv("TRANSCRIPT_FILE") or "transcript.txt"
def clean_text(text):
    """Collapse every run of whitespace in *text* into a single space.

    Newlines, tabs and repeated spaces all match ``\\s+``, so one substitution
    is enough; a separate newline replacement afterwards would never find
    anything to replace.

    Args:
        text: Raw text to normalise.

    Returns:
        The text with whitespace runs collapsed and leading/trailing
        whitespace removed.
    """
    return re.sub(r"\s+", " ", text).strip()
# Read the whole transcript as UTF-8 text, then normalise its whitespace.
with open(TRANSCRIPT_FILE, mode="r", encoding="utf-8") as transcript_handle:
    transcript_text = transcript_handle.read()

cleaned_text = clean_text(transcript_text)
# Tokenize into sentences
sentences = sent_tokenize(cleaned_text)

# Split into chunks: sentences are packed greedily, in order, and a sentence
# joins the current chunk only while the combined length stays below
# chunk_size characters. Sentences are never split, so a single sentence that
# is longer than chunk_size ends up as its own oversized chunk.
chunk_size = 500
chunks = []
current_chunk = ""
for sentence in sentences:
    if len(current_chunk) + len(sentence) < chunk_size:
        current_chunk += " " + sentence
    else:
        # Flush only when there is something to flush. Without this guard a
        # first sentence of chunk_size characters or more would push an empty
        # string into chunks, which would then be embedded and indexed.
        if current_chunk.strip():
            chunks.append(current_chunk.strip())
        current_chunk = sentence

# Keep whatever is left over after the last sentence.
if current_chunk.strip():
    chunks.append(current_chunk.strip())
# Load embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Fail with an explicit message instead of an IndexError on shape[1] below
# when the transcript produced no chunks at all.
if not chunks:
    raise ValueError(
        f"No text chunks were produced from {TRANSCRIPT_FILE!r}; "
        "is the transcript empty?"
    )

# Encode chunks in one batched call (one row per chunk) rather than one model
# call per chunk, and hand FAISS an explicit float32 matrix.
# NOTE: despite its name, query_embeddings holds the *chunk* embeddings; the
# name is kept so existing references to it keep working.
query_embeddings = np.asarray(embedding_model.encode(chunks), dtype="float32")

# Row position in the index -> chunk text, used to turn search hits into text.
chunk_map = dict(enumerate(chunks))

# Save to FAISS index (exact L2 search over all chunk vectors)
dimension = query_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(query_embeddings)
# Configure Google Generative AI.
# The key is read from the GOOGLE_API_KEY environment variable so that it
# never has to be written into the source.
_google_api_key = os.getenv("GOOGLE_API_KEY")
genai.configure(api_key=_google_api_key)

# Model handle used by generate_response.
model = genai.GenerativeModel("gemini-1.5-pro-latest")
def search_transcript(query, top_k=3):
    """Return the transcript chunks closest to *query*, joined by spaces.

    The query is embedded with the same model as the chunks and looked up in
    the module-level FAISS index.

    Args:
        query: Question text to search for.
        top_k: Maximum number of chunks to retrieve.

    Returns:
        The matching chunk texts joined with single spaces, in the order the
        index returned them. Contains fewer than ``top_k`` chunks when the
        index holds fewer vectors than requested.
    """
    query_embedding = embedding_model.encode(query).astype("float32").reshape(1, -1)
    _, indices = index.search(query_embedding, top_k)
    # FAISS fills missing result slots with -1 when it has fewer than top_k
    # vectors; those are not keys of chunk_map and must be skipped.
    return " ".join(chunk_map[int(i)] for i in indices[0] if i >= 0)
def generate_response(query):
    """Answer *query* with the generative model, grounded in the transcript.

    The most relevant transcript chunks are retrieved with
    ``search_transcript`` and placed in the prompt as lecture context.

    Args:
        query: The user's question.

    Returns:
        The model's answer text, or a short apology when the model returned
        no usable text.
    """
    relevant_text = search_transcript(query)
    prompt = (
        "\n"
        "You are an AI tutor. Answer the following question based on the given lecture transcript:\n"
        f"Lecture Context: {relevant_text}\n"
        f"Question: {query}\n"
    )
    response = model.generate_content(prompt)
    try:
        return response.text
    except ValueError:
        # The .text accessor raises ValueError when the response carries no
        # text part (for example when the reply was blocked); show the user a
        # readable message instead of a raw error.
        return (
            "Sorry, I couldn't generate an answer for that question. "
            "Please try rephrasing it."
        )
# Gradio Interface
def chatbot(query):
    """Gradio callback: answer a question typed into the UI.

    Args:
        query: Text from the input box; may be empty or ``None``.

    Returns:
        ``"Goodbye!"`` when the user typed "exit" (any letter case, surrounding
        whitespace ignored), a prompt to enter a question when the input is
        blank, otherwise the answer from ``generate_response``.
    """
    question = (query or "").strip()
    if not question:
        # Nothing to ask: avoid a pointless model call.
        return "Please enter a question."
    if question.lower() == "exit":
        return "Goodbye!"
    return generate_response(question)
# Single-textbox UI: the typed question goes to chatbot() and the returned
# string is shown as plain text.
_interface_settings = {
    "fn": chatbot,
    "inputs": gr.Textbox(placeholder="Ask anything about the lecture..."),
    "outputs": "text",
    "title": "Dhamm AI Chatbot",
    "description": "Ask questions about any topic and get AI-generated answers!",
}
iface = gr.Interface(**_interface_settings)

# Start serving the app.
iface.launch()