"""Gradio question-answering app over a natural-language electricity corpus.

At import time this module loads ``electricity_corpus.json``, builds a TF-IDF
index over its entries, and loads an extractive question-answering model.
Questions are answered by retrieving the most similar corpus entries and
running the QA model over their concatenation.
"""

import json
from typing import List

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline
import gradio as gr

# Upper bound, in characters, on the context handed to the QA model.
# This is a character count, not a token count.
# NOTE(review): the transformers QA pipeline appears to window long contexts
# on its own (max_seq_len / doc_stride) — confirm whether this cap is still
# wanted, since slicing can cut the last retrieved row mid-sentence.
MAX_CONTEXT_CHARS = 4096

# Load the natural-language corpus.
# Expected to be a JSON array of strings: the entries are fed to the TF-IDF
# vectorizer and later joined with spaces — confirm against the data file.
with open("electricity_corpus.json", "r", encoding="utf-8") as f:
    corpus = json.load(f)

# Build the TF-IDF index once at start-up; every query reuses it.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)

# Load the QA model.
qa_pipeline = pipeline(
    "question-answering",
    model="distilbert-base-cased-distilled-squad",
)


def get_top_contexts(question: str, top_k: int = 3) -> List[str]:
    """Return the corpus entries most similar to *question*.

    Similarity is the cosine similarity between the TF-IDF vector of the
    question and each row of the pre-built ``tfidf_matrix``.

    Args:
        question: The user's question text.
        top_k: Maximum number of entries to return.

    Returns:
        Up to ``top_k`` corpus entries, most similar first. An empty list
        when ``top_k`` is zero or negative.
    """
    # Guard: with top_k == 0 the slice below would be ``[-0:]``, which selects
    # the *entire* array rather than nothing; negative values slice wrongly too.
    if top_k <= 0:
        return []

    question_vec = vectorizer.transform([question])
    similarities = cosine_similarity(question_vec, tfidf_matrix).flatten()
    # argsort is ascending: take the last top_k indices, then reverse so the
    # best match comes first.
    top_indices = similarities.argsort()[-top_k:][::-1]
    return [corpus[i] for i in top_indices]


def answer_question(question: str, top_k: int = 3) -> str:
    """Answer *question* using the retrieved corpus entries as context.

    Args:
        question: The user's question text.
        top_k: Number of corpus entries to retrieve as context.

    Returns:
        The answer span extracted by the QA model, or a short message when
        the question is blank or no context could be retrieved.
    """
    # Treat None the same as an empty/whitespace-only question.
    if not question or not question.strip():
        return "Please enter a valid question."

    contexts = get_top_contexts(question, top_k)
    if not contexts:
        # Nothing to read from (e.g. top_k <= 0); do not call the model with
        # an empty context.
        return "No relevant context found for that question."

    combined_context = " ".join(contexts)[:MAX_CONTEXT_CHARS]
    result = qa_pipeline(question=question, context=combined_context)
    return result["answer"]


# Gradio interface
iface = gr.Interface(
    fn=answer_question,
    inputs=gr.Textbox(label="Ask your question about electricity usage..."),
    outputs=gr.Textbox(label="Answer"),
    title="🔌 Electricity Data Q&A",
    description="Ask questions like 'What was the price for residential in Texas in Jan 2001?' or 'Which state had highest revenue in Jan 2001?'",
)

# Run the app
if __name__ == "__main__":
    iface.launch()