File size: 2,646 Bytes
1d16be5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import os
import faiss
import numpy as np
import re
import nltk
import google.generativeai as genai
from sentence_transformers import SentenceTransformer
from nltk.tokenize import sent_tokenize
import gradio as gr

# Download the sentence-tokenizer models required by sent_tokenize below.
# (The original also ran nltk.download('all'), which fetches the entire
# NLTK data distribution — gigabytes of corpora this script never uses —
# and made the punkt download redundant.)
nltk.download("punkt")
# NOTE(review): newer NLTK versions load "punkt_tab" instead of "punkt";
# downloading both keeps sent_tokenize working either way. On old NLTK the
# extra download just reports failure and is harmless.
nltk.download("punkt_tab", quiet=True)

# Load transcript file
TRANSCRIPT_FILE = "transcript.txt"  # Upload this file manually or use an existing one

# Read and clean transcript
def clean_text(text):
    """Normalize transcript text to a single line.

    Collapses every run of whitespace (spaces, tabs, newlines) into one
    space and strips leading/trailing whitespace.

    Args:
        text: Raw transcript text.

    Returns:
        The cleaned, single-line string.
    """
    # \s already matches newlines, so one substitution both collapses
    # whitespace runs and removes line breaks — the original follow-up
    # .replace("\n", " ") was dead code (no newline survives the sub).
    return re.sub(r"\s+", " ", text).strip()

# Read the raw lecture transcript from disk. UTF-8 keeps any non-ASCII
# characters in the lecture text intact.
with open(TRANSCRIPT_FILE, "r", encoding="utf-8") as f:
    transcript_text = f.read()

# Collapse all whitespace so sentence tokenization sees one clean line.
cleaned_text = clean_text(transcript_text)

# Tokenize into sentences (requires the NLTK punkt models downloaded above).
sentences = sent_tokenize(cleaned_text)

# Greedily pack consecutive sentences into chunks of roughly chunk_size
# characters, so each embedding covers a coherent stretch of the lecture.
chunk_size = 500
chunks = []
current_chunk = ""

for sentence in sentences:
    if len(current_chunk) + len(sentence) < chunk_size:
        # Sentence still fits: append it to the chunk under construction.
        current_chunk += " " + sentence
    else:
        # Flush the finished chunk, but never emit an empty one — the
        # original appended "" when the very first sentence alone
        # exceeded chunk_size, which later produced a garbage embedding.
        if current_chunk:
            chunks.append(current_chunk.strip())
        current_chunk = sentence

# Flush the trailing partial chunk.
if current_chunk:
    chunks.append(current_chunk.strip())

# Load embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode all chunks in one batched call — much faster than calling
# encode() once per chunk in a Python loop — and force float32, the
# dtype FAISS requires (sentence-transformers usually returns float32
# already, but being explicit costs nothing).
query_embeddings = np.asarray(embedding_model.encode(chunks), dtype="float32")
# Map FAISS row index -> chunk text for lookup after a search.
chunk_map = dict(enumerate(chunks))

# Build an exact L2 nearest-neighbour index over the chunk embeddings.
dimension = query_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(query_embeddings)

# Configure Google Generative AI
# NOTE(review): os.getenv returns None when GOOGLE_API_KEY is unset, and
# the failure will only surface on the first generate_content call —
# consider failing fast here with a clear error message.
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))  # Use environment variable for security
model = genai.GenerativeModel("gemini-1.5-pro-latest")

# Function to search transcript
def search_transcript(query, top_k=3):
    """Return the transcript chunks most relevant to *query*.

    Embeds the query with the same sentence-transformer used for the
    chunks, runs a nearest-neighbour search on the FAISS index, and
    joins the matching chunks into one context string.

    Args:
        query: Natural-language question to search for.
        top_k: Maximum number of chunks to retrieve (default 3).

    Returns:
        The matching chunk texts concatenated with single spaces.
    """
    query_embedding = embedding_model.encode(query).astype("float32").reshape(1, -1)
    # Never ask FAISS for more neighbours than vectors in the index.
    k = min(top_k, len(chunk_map))
    distances, indices = index.search(query_embedding, k)
    # FAISS pads missing results with -1; skip those instead of letting
    # chunk_map[-1] raise KeyError.
    return " ".join(chunk_map[i] for i in indices[0] if i >= 0)

# Function to generate AI response
def generate_response(query):
    """Answer *query* with Gemini, grounded in retrieved transcript context.

    Looks up the transcript chunks most relevant to the question, embeds
    them in a tutoring prompt, and returns the model's text reply.
    """
    relevant_text = search_transcript(query)
    # Triple-quoted template keeps the prompt layout readable in-source.
    prompt = f"""
    You are an AI tutor. Answer the following question based on the given lecture transcript:

    Lecture Context: {relevant_text}

    Question: {query}
    """
    return model.generate_content(prompt).text

# Gradio Interface
def chatbot(query):
    """Gradio handler: answer *query*, or say goodbye when the user types 'exit'."""
    # Guard clause: the exit keyword (any casing) short-circuits before
    # any model call is made.
    if query.lower() == "exit":
        return "Goodbye!"
    return generate_response(query)

# Wire the chatbot function into a simple single-textbox Gradio UI.
iface = gr.Interface(
    fn=chatbot,
    inputs=gr.Textbox(placeholder="Ask anything about the lecture..."),
    outputs="text",
    title="Dhamm AI Chatbot",
    description="Ask questions about any topic and get AI-generated answers!"
)

# Start the Gradio web server (blocks until the app is stopped).
iface.launch()