IEC-LLM / app.py
prakhardoneria's picture
Update app.py
1e5db4e verified
import os
import torch
import pandas as pd
import chromadb
import gradio as gr
from sentence_transformers import SentenceTransformer
from chromadb.config import Settings
from transformers import pipeline
# Device setup: -1 selects the CPU.
device = -1
print("Device set to: CPU")

# Load the CSV and keep only rows that have a "content" value,
# renumbering the index so it is contiguous from 0.
df = (
    pd.read_csv("iec_college_data.csv")
    .dropna(subset=["content"])
    .reset_index(drop=True)
)

# Sentence embedding model, loaded on CPU.
embed_model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")
# ChromaDB setup: persistent client rooted at ./chroma_db.
chroma_client = chromadb.PersistentClient(path="./chroma_db")
collection_name = "iec_data"

# Get or create the collection in a single call. This avoids inspecting
# list_collections(), whose element type (Collection objects vs. plain
# names) has changed between chromadb releases and would break `col.name`.
collection = chroma_client.get_or_create_collection(name=collection_name)
# Only index if the collection is empty, so restarts reuse the stored index.
if collection.count() == 0:
    print("Indexing documents...")
    texts = df["content"].tolist()
    embeddings = embed_model.encode(texts, batch_size=32, show_progress_bar=True)

    # NOTE(review): only "content" is NaN-filtered when the CSV is loaded;
    # "title"/"url" may still be missing for some rows - confirm the data.
    metadatas = [{"title": row.title, "url": row.url} for row in df.itertuples()]
    # Ids are the row positions as strings, matching the re-indexed frame.
    ids = [str(position) for position in range(len(texts))]

    # Add in batches rather than one collection.add() call per row; each
    # call is a separate write against the persistent store.
    add_batch_size = 50
    for start in range(0, len(texts), add_batch_size):
        end = min(start + add_batch_size, len(texts))
        collection.add(
            # Plain lists of floats are the canonical embedding format.
            embeddings=embeddings[start:end].tolist(),
            documents=texts[start:end],
            metadatas=metadatas[start:end],
            ids=ids[start:end],
        )
        print(f"Indexed {end}/{len(df)}")
    print("Indexing complete.")
# Use a lightweight extractive QA model. The device comes from the
# module-level `device` setting (-1 = CPU) instead of a duplicated literal,
# so the device is configured in exactly one place.
qa_pipeline = pipeline(
    "question-answering",
    model="deepset/roberta-base-squad2",
    device=device,
)
# QA function
def answer_question(user_question):
    """Answer a question using the indexed documents.

    Embeds the question, retrieves the three nearest documents from the
    Chroma collection, caps the combined context at 400 whitespace-separated
    words, and runs the extractive QA pipeline over that context.

    Args:
        user_question: Question text entered by the user.

    Returns:
        The answer string produced by the QA pipeline, or a short notice
        when the question is blank or no documents could be retrieved.
    """
    # The QA pipeline rejects an empty question, so stop blank input here
    # instead of surfacing an exception in the UI.
    if not user_question or not user_question.strip():
        return "Please enter a question."

    question_embedding = embed_model.encode(user_question)
    results = collection.query(query_embeddings=[question_embedding], n_results=3)

    # results["documents"] holds one list of documents per query embedding.
    retrieved = results.get("documents") or [[]]
    documents = [doc for doc in retrieved[0] if doc]
    if not documents:
        # An empty context would also make the QA pipeline raise.
        return "No relevant information found."

    context = "\n".join(documents)
    words = context.split()
    if len(words) > 400:
        # Keep only the first 400 words to bound the QA model's input.
        context = " ".join(words[:400])

    result = qa_pipeline(question=user_question, context=context)
    return result["answer"]
# Gradio UI: one question box in, one answer box out.
question_box = gr.Textbox(lines=2, placeholder="Ask about IEC College...")
answer_box = gr.Textbox(label="Answer")

iface = gr.Interface(
    fn=answer_question,
    inputs=question_box,
    outputs=answer_box,
    title="IEC College Assistant",
    description="Ask questions about IEC College based on structured data.",
)

# Start the web app, requesting a public share link.
iface.launch(share=True)