File size: 2,329 Bytes
a047c4f
 
 
abeaa28
93f1ff0
a047c4f
abeaa28
a047c4f
9b4e4aa
abeaa28
5cba5a1
31386f7
1445742
abeaa28
e7e375f
1445742
abeaa28
 
1445742
5cba5a1
abeaa28
9b4e4aa
0fc1e2c
abeaa28
 
 
 
 
0fc1e2c
abeaa28
0fc1e2c
93f1ff0
abeaa28
 
 
 
 
a047c4f
 
abeaa28
a047c4f
 
 
abeaa28
 
 
1445742
5cba5a1
1e5db4e
1445742
abeaa28
93f1ff0
 
0fc1e2c
abeaa28
 
 
5cba5a1
 
a047c4f
5cba5a1
0576674
 
abeaa28
0576674
abeaa28
 
0576674
1445742
93f1ff0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import os
import torch
import pandas as pd
import chromadb
import gradio as gr
from sentence_transformers import SentenceTransformer
from chromadb.config import Settings
from transformers import pipeline

# Inference device index; -1 selects the CPU
device = -1
print("Device set to: CPU")

# Read the CSV, discard rows that have no "content" value, then renumber
# the remaining rows from 0 so positional ids stay contiguous
df = pd.read_csv("iec_college_data.csv")
df = df.dropna(subset=["content"])
df = df.reset_index(drop=True)

# Sentence-embedding model, pinned to the CPU
embed_model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")

# ChromaDB setup: on-disk client rooted at ./chroma_db
chroma_client = chromadb.PersistentClient(path="./chroma_db")
collection_name = "iec_data"

# Get or create collection.
# get_or_create_collection does the lookup-or-create in a single call, so the
# code no longer depends on what list_collections() yields (Collection objects
# with a .name attribute vs. plain name strings differs between chromadb
# releases, and the previous `col.name` check breaks on the latter).
collection = chroma_client.get_or_create_collection(name=collection_name)

# Only index if collection is empty
if collection.count() == 0:
    print("Indexing documents...")
    texts = df["content"].tolist()
    embeddings = embed_model.encode(texts, batch_size=32, show_progress_bar=True)

    titles = df["title"].tolist()
    urls = df["url"].tolist()
    total = len(texts)
    add_batch_size = 50  # documents sent to Chroma per add() call

    # Add documents in batches instead of one add() call per row; ids are the
    # row positions as strings, matching the reset DataFrame index.
    for start in range(0, total, add_batch_size):
        stop = min(start + add_batch_size, total)
        collection.add(
            # Plain float lists are accepted by every chromadb release,
            # whereas raw numpy vectors are not.
            embeddings=[vector.tolist() for vector in embeddings[start:stop]],
            documents=texts[start:stop],
            metadatas=[
                {"title": title, "url": url}
                for title, url in zip(titles[start:stop], urls[start:stop])
            ],
            ids=[str(doc_id) for doc_id in range(start, stop)],
        )
        # Report how many documents have actually been written so far
        print(f"Indexed {stop}/{total}")
    print("Indexing complete.")

# Use lightweight extractive QA model.
# The device comes from the module-level `device` setting rather than a second
# hard-coded literal, so changing the setting applies to this model too.
qa_pipeline = pipeline(
    "question-answering",
    model="deepset/roberta-base-squad2",
    device=device,
)

# QA function
def answer_question(user_question):
    """Answer a question using the documents stored in the Chroma collection.

    Embeds the question, retrieves the 3 nearest documents from the
    collection, joins them with newlines (capped at 400 whitespace-separated
    words) and runs the extractive QA pipeline over that context.

    Args:
        user_question: Question text entered by the user.

    Returns:
        The answer string extracted by the QA pipeline, or a short
        explanatory message when the question is blank or no document text
        could be retrieved.
    """
    max_context_words = 400

    # The QA pipeline raises on an empty question, so handle blank input here
    # instead of surfacing an error in the UI.
    if not user_question or not user_question.strip():
        return "Please enter a question."

    question_embedding = embed_model.encode(user_question)
    results = collection.query(
        # Plain float list: accepted by every chromadb release
        query_embeddings=[question_embedding.tolist()],
        n_results=3,
    )

    # results["documents"] holds one list of documents per query embedding;
    # tolerate a missing/empty result and skip entries without text.
    document_lists = results.get("documents") or [[]]
    documents = [doc for doc in document_lists[0] if doc]
    context = "\n".join(documents)

    # An empty context would also make the QA pipeline raise.
    if not context.strip():
        return "No relevant information found."

    # Cap the context length; split once and reuse the word list.
    words = context.split()
    if len(words) > max_context_words:
        context = " ".join(words[:max_context_words])

    result = qa_pipeline(question=user_question, context=context)
    return result["answer"]

# Gradio UI: a two-line question box wired to answer_question, with the
# extracted answer shown in a labelled text box
question_box = gr.Textbox(lines=2, placeholder="Ask about IEC College...")
answer_box = gr.Textbox(label="Answer")

iface = gr.Interface(
    title="IEC College Assistant",
    description="Ask questions about IEC College based on structured data.",
    fn=answer_question,
    inputs=question_box,
    outputs=answer_box,
)

# Start the app and request a public share link
iface.launch(share=True)