"""Retrieval-augmented question answering demo for IEC College data.

At start-up the script:

1. reads ``iec_college_data.csv`` and drops rows without ``content``;
2. embeds every row with a SentenceTransformer model and stores the
   embeddings, documents and ``title``/``url`` metadata in a persistent
   ChromaDB collection (only when that collection is still empty);
3. loads an extractive question-answering pipeline;
4. launches a Gradio text interface backed by :func:`answer_question`.
"""

import os

import torch
import pandas as pd
import chromadb
import gradio as gr
from sentence_transformers import SentenceTransformer
from chromadb.config import Settings
from transformers import pipeline

# Number of documents sent to ChromaDB per ``add`` call while indexing.
INDEX_BATCH_SIZE = 50
# Number of nearest documents retrieved for each question.
N_RESULTS = 3
# Upper bound on the number of whitespace-separated words passed as context.
MAX_CONTEXT_WORDS = 400

# Device setup
device = -1  # Use CPU (-1 is the CPU index for transformers pipelines)
print("Device set to: CPU")

# Load CSV data; rows without "content" cannot be embedded, so drop them.
df = pd.read_csv("iec_college_data.csv").dropna(subset=["content"]).reset_index(drop=True)

# Load embedding model on CPU
embed_model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")

# ChromaDB setup
chroma_client = chromadb.PersistentClient(path="./chroma_db")
collection_name = "iec_data"

# Get or create collection. Using the client's own get-or-create call avoids
# inspecting the return value of list_collections(), whose element type is
# not the same across ChromaDB releases.
collection = chroma_client.get_or_create_collection(name=collection_name)

# Only index if collection is empty
# NOTE(review): an interrupted indexing run leaves a non-empty, partially
# filled collection that will not be completed on the next start.
if collection.count() == 0:
    print("Indexing documents...")
    texts = df["content"].tolist()
    embeddings = embed_model.encode(texts, batch_size=32, show_progress_bar=True)
    metadatas = [{"title": row.title, "url": row.url} for row in df.itertuples()]
    total = len(df)

    # Add in batches rather than one document per call; ids are the row
    # positions as strings, exactly as a per-row enumerate() would produce.
    for start in range(0, total, INDEX_BATCH_SIZE):
        stop = min(start + INDEX_BATCH_SIZE, total)
        collection.add(
            # Plain Python floats are accepted by every ChromaDB version.
            embeddings=embeddings[start:stop].tolist(),
            documents=texts[start:stop],
            metadatas=metadatas[start:stop],
            ids=[str(i) for i in range(start, stop)],
        )
        print(f"Indexed {start}/{len(df)}")
    print("Indexing complete.")

# Use lightweight extractive QA model
qa_pipeline = pipeline(
    "question-answering",
    model="deepset/roberta-base-squad2",
    device=device,
)


# QA function
def answer_question(user_question: str) -> str:
    """Answer a question from the indexed documents.

    The question is embedded with ``embed_model``, the ``N_RESULTS`` closest
    documents are fetched from ``collection`` and joined into one context
    (cut to ``MAX_CONTEXT_WORDS`` words when longer), and ``qa_pipeline``
    extracts the answer span from that context.

    Args:
        user_question: The question text typed by the user.

    Returns:
        The ``"answer"`` field of the QA pipeline result, or a short notice
        when the question is blank or no documents were retrieved.
    """
    # The QA pipeline rejects empty questions, so answer those directly.
    if not user_question or not user_question.strip():
        return "Please enter a question."

    question_embedding = embed_model.encode(user_question)
    results = collection.query(
        query_embeddings=[question_embedding.tolist()],
        n_results=N_RESULTS,
    )

    documents = results["documents"][0] if results["documents"] else []
    context = "\n".join(documents)

    # The QA pipeline also rejects an empty context (e.g. empty collection).
    if not context.strip():
        return "No relevant information found."

    words = context.split()
    if len(words) > MAX_CONTEXT_WORDS:
        context = " ".join(words[:MAX_CONTEXT_WORDS])

    result = qa_pipeline(question=user_question, context=context)
    return result["answer"]


# Gradio UI
iface = gr.Interface(
    fn=answer_question,
    inputs=gr.Textbox(lines=2, placeholder="Ask about IEC College..."),
    outputs=gr.Textbox(label="Answer"),
    title="IEC College Assistant",
    description="Ask questions about IEC College based on structured data.",
)

iface.launch(share=True)