import pandas as pd
import chromadb
import gradio as gr
from sentence_transformers import SentenceTransformer
from transformers import pipeline

# Device setup: -1 tells the transformers pipeline to run on CPU
device = -1
print("Device set to: CPU")

# Load CSV data, dropping rows with no content
df = pd.read_csv("iec_college_data.csv").dropna(subset=["content"]).reset_index(drop=True)

# Load embedding model on CPU
embed_model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")

# ChromaDB setup (persisted to ./chroma_db so indexing survives restarts)
chroma_client = chromadb.PersistentClient(path="./chroma_db")
collection_name = "iec_data"

# Get or create collection
collection = chroma_client.get_or_create_collection(name=collection_name)

# Only index if the collection is empty (i.e. on first run)
if collection.count() == 0:
    print("Indexing documents...")
    texts = df["content"].tolist()
    embeddings = embed_model.encode(texts, batch_size=32, show_progress_bar=True)
    for idx, (embedding, row) in enumerate(zip(embeddings, df.itertuples())):
        metadata = {"title": row.title, "url": row.url}
        collection.add(
            embeddings=[embedding.tolist()],
            documents=[row.content],
            metadatas=[metadata],
            ids=[str(idx)],
        )
        if idx % 50 == 0:
            print(f"Indexed {idx}/{len(df)}")
    print("Indexing complete.")

# Lightweight extractive QA model (runs on CPU)
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2", device=device)

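# deepset/roberta-base-squad2 is an extractive model: it returns a span copied
# from the supplied context rather than generating free text. answer_question()
# below trims the retrieved context to keep CPU inference fast and well within
# the model's 512-token input window.
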
# QA function: embed the question, retrieve the top matching documents from
# ChromaDB, and extract an answer from that combined context
def answer_question(user_question):
    question_embedding = embed_model.encode(user_question)
    results = collection.query(query_embeddings=[question_embedding.tolist()], n_results=3)
    context = "\n".join(results["documents"][0])
    # Rough word-based truncation of the combined context
    if len(context.split()) > 400:
        context = " ".join(context.split()[:400])
    result = qa_pipeline(question=user_question, context=context)
    return result["answer"]

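# The query results also carry the stored metadata, so the answer could cite its
# sources. Hypothetical variant of the return statement above, not used here:
#
#     sources = {m["url"] for m in results["metadatas"][0]}
#     return result["answer"] + "\n\nSources: " + ", ".join(sources)
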
# Gradio UI
iface = gr.Interface(
    fn=answer_question,
    inputs=gr.Textbox(lines=2, placeholder="Ask about IEC College..."),
    outputs=gr.Textbox(label="Answer"),
    title="IEC College Assistant",
    description="Ask questions about IEC College based on structured data.",
)

iface.launch(share=True)
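# When run locally, share=True asks Gradio to open a temporary public
# *.gradio.live link; on a hosted Space the app is already served publicly and
# the flag is not needed.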