import pandas as pd
import chromadb
import gradio as gr
from sentence_transformers import SentenceTransformer
from transformers import pipeline
# Device setup
device = -1 # Use CPU
print("Device set to: CPU")
# Load CSV data
df = pd.read_csv("iec_college_data.csv").dropna(subset=["content"]).reset_index(drop=True)
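# The code below assumes the CSV provides "title", "url", and "content"
# columns (these names come from how the rows are used during indexing).
# A minimal sanity check, as a sketch:
expected_cols = {"title", "url", "content"}
missing = expected_cols - set(df.columns)
if missing:
    raise ValueError(f"iec_college_data.csv is missing columns: {missing}")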
# Load embedding model on CPU
embed_model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")
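# Note: all-MiniLM-L6-v2 produces 384-dimensional sentence embeddings and is
# small enough to run comfortably on CPU.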
# ChromaDB setup
chroma_client = chromadb.PersistentClient(path="./chroma_db")
collection_name = "iec_data"
# Get the collection if it exists, otherwise create it
collection = chroma_client.get_or_create_collection(name=collection_name)
# Only index if the collection is empty
if collection.count() == 0:
    print("Indexing documents...")
    texts = df["content"].tolist()
    embeddings = embed_model.encode(texts, batch_size=32, show_progress_bar=True)
    for idx, (embedding, row) in enumerate(zip(embeddings, df.itertuples())):
        metadata = {"title": row.title, "url": row.url}
        collection.add(
            embeddings=[embedding.tolist()],  # Chroma expects plain lists of floats
            documents=[row.content],
            metadatas=[metadata],
            ids=[str(idx)]
        )
        if idx % 50 == 0:
            print(f"Indexed {idx}/{len(df)}")
    print("Indexing complete.")
# Lightweight extractive QA model (answers are spans copied from the context)
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2", device=device)
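# The pipeline returns a dict of the form
# {"score": float, "start": int, "end": int, "answer": str};
# only the extracted answer span is used below.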
# QA function: embed the question, retrieve the top-3 matching documents,
# and extract an answer span from their combined text
def answer_question(user_question):
    question_embedding = embed_model.encode(user_question)
    results = collection.query(query_embeddings=[question_embedding.tolist()], n_results=3)
    context = "\n".join(results["documents"][0])
    # Truncate the context so it stays within the extractive model's input budget
    if len(context.split()) > 400:
        context = " ".join(context.split()[:400])
    result = qa_pipeline(question=user_question, context=context)
    return result["answer"]
# Gradio UI
iface = gr.Interface(
    fn=answer_question,
    inputs=gr.Textbox(lines=2, placeholder="Ask about IEC College..."),
    outputs=gr.Textbox(label="Answer"),
    title="IEC College Assistant",
    description="Ask questions about IEC College based on structured data."
)
iface.launch(share=True)