# app.py -- uploaded by gholap310 (commit message: "Upload app.py"; revision 5198010, verified)
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline
import gradio as gr
# Load the natural-language corpus used as the retrieval knowledge base.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so that
# decoding does not depend on the platform's locale default.
# NOTE(review): the code below feeds `corpus` to TfidfVectorizer and joins
# its entries with " ", so it is presumably a JSON array of strings -- confirm
# against the data file.
with open("electricity_corpus.json", "r", encoding="utf-8") as f:
    corpus = json.load(f)

# Build the TF-IDF index over every corpus entry; the fitted vectorizer is
# reused later to project incoming questions into the same feature space.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)

# Load the extractive question-answering model.
qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
# Function to retrieve top matching rows
def get_top_contexts(question, top_k=3):
    """Return the corpus entries most similar to *question*.

    The question is vectorized with the module-level TF-IDF ``vectorizer`` and
    compared against every row of ``tfidf_matrix`` by cosine similarity.

    Args:
        question: The user's question text.
        top_k: Maximum number of entries to return. Values less than or
            equal to zero yield an empty list.

    Returns:
        A list of at most ``top_k`` entries from ``corpus``, ordered from
        most to least similar.
    """
    # Guard first: slicing with [-0:] would select the *entire* ranking, and
    # a negative top_k would select all but the lowest-scored few, so a
    # non-positive request must short-circuit to "nothing".
    if top_k <= 0:
        return []

    question_vec = vectorizer.transform([question])
    similarities = cosine_similarity(question_vec, tfidf_matrix).flatten()
    # argsort() is ascending: take the last top_k indices and reverse them so
    # the best match comes first.
    top_indices = similarities.argsort()[-top_k:][::-1]
    return [corpus[i] for i in top_indices]
# Main logic to get answer
def answer_question(question, top_k=3):
    """Answer *question* from the best-matching corpus entries.

    Retrieves up to ``top_k`` contexts with ``get_top_contexts``, joins them
    into a single passage and runs the extractive QA pipeline over it.

    Args:
        question: The user's question; ``None``, empty or whitespace-only
            input is rejected with a prompt message.
        top_k: Number of corpus entries to retrieve as context.

    Returns:
        The answer span extracted by the QA model, or a short human-readable
        message when the question is blank or no context is available.
    """
    # A cleared input may arrive as None rather than "", so check for it
    # before calling .strip().
    if question is None or not question.strip():
        return "Please enter a valid question."

    contexts = get_top_contexts(question, top_k)
    # Cap the passage at 4096 *characters*. This is not the model's token
    # limit -- the question-answering pipeline windows long contexts on its
    # own -- the cap only bounds how much text is processed per request.
    combined_context = " ".join(contexts)[:4096]
    # The QA pipeline rejects an empty context, so report that case explicitly
    # instead of letting it surface as an exception in the UI.
    if not combined_context.strip():
        return "No relevant context was found for this question."

    result = qa_pipeline(question=question, context=combined_context)
    return result["answer"]
# Gradio interface: a single text box in, a single text box out, backed by
# answer_question (its top_k parameter keeps its default).
iface = gr.Interface(
    fn=answer_question,
    inputs=gr.Textbox(label="Ask your question about electricity usage..."),
    outputs=gr.Textbox(label="Answer"),
    # \U0001F50C is the electric-plug emoji; written as an escape so the title
    # cannot be corrupted again by a wrong file-encoding round trip.
    title="\U0001F50C Electricity Data Q&A",
    description="Ask questions like 'What was the price for residential in Texas in Jan 2001?' or 'Which state had highest revenue in Jan 2001?'",
)

# Run the app only when executed as a script, not when imported.
if __name__ == "__main__":
    iface.launch()