# Enron / app.py
# gigisan81 - "Update app.py" (commit c198e10, verified)
# https://huggingface.co/spaces/gigisan81/Enron
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from scipy.spatial.distance import cosine
import torch
import gradio as gr
# Load the dataset.
# NOTE(review): this looks like a Google Colab Drive mount path - confirm it
# exists in the environment where the app is actually deployed.
dataset_path = '/content/drive/MyDrive/emails_folder/emails.csv'
df = pd.read_csv(dataset_path)
# The tokenizer only accepts strings; pandas reads empty cells as NaN (float),
# so normalise the column to plain strings before tokenizing.
df['text'] = df['text'].fillna('').astype(str)
# Load the pre-trained model and tokenizer.
# transformers resolves Hub repositories by their full "<org>/<name>" id; the
# bare "paraphrase-MiniLM-L6-v2" shorthand is only expanded by the
# sentence-transformers package, not by AutoTokenizer/AutoModel.
model_name = "sentence-transformers/paraphrase-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
# Tokenize every email once at start-up; emails are padded to a common length
# and truncated to the model's maximum so they form a single batch of tensors.
email_tokens = tokenizer(
    df['text'].tolist(),
    padding=True,
    truncation=True,
    return_tensors="pt",
)
# Function to find the most relevant email based on the question
# Embeddings of all emails; filled lazily on the first question and then reused,
# because they do not depend on the question being asked.
_email_embeddings = None


def _get_email_embeddings():
    """Encode every email once and return the cached embedding matrix."""
    global _email_embeddings
    if _email_embeddings is None:
        with torch.no_grad():
            _email_embeddings = model(**email_tokens).pooler_output
    return _email_embeddings


def find_most_relevant_email(question):
    """Return the text of the email whose embedding is closest to *question*.

    The question is encoded with the module-level ``tokenizer`` and ``model``
    and compared, by cosine similarity, against the embeddings of the emails
    held in ``email_tokens``.

    Args:
        question: Free-text query entered by the user.

    Returns:
        The ``text`` field of the row in ``df`` with the highest cosine
        similarity to the question.
    """
    # Tokenize the question
    question_tokens = tokenizer(
        question, padding=True, truncation=True, return_tensors="pt"
    )
    # Encode the question; the email side comes from the cache.
    # NOTE(review): sentence-transformers checkpoints are usually used with
    # mean pooling over token embeddings rather than pooler_output - confirm
    # which pooling is intended here.
    with torch.no_grad():
        question_embedding = model(**question_tokens).pooler_output
    email_embeddings = _get_email_embeddings()
    # scipy's cosine() compares exactly two 1-D vectors and has no ``axis``
    # argument; torch's cosine_similarity broadcasts the single question row
    # against every email row and yields one score per email.
    similarities = torch.nn.functional.cosine_similarity(
        question_embedding, email_embeddings, dim=1
    )
    # Get the position of the most similar email
    most_similar_index = similarities.argmax().item()
    # argmax is a row position, so index positionally rather than by label.
    return df['text'].iloc[most_similar_index]
# Create Gradio interface
# Collect the UI settings first: a single text box in, a single text box out,
# wired to the email lookup function.
_interface_options = {
    "fn": find_most_relevant_email,
    "inputs": "text",
    "outputs": "text",
    "title": "Email Question Answering System",
    "description": "Enter a question and get the most relevant email from the dataset.",
}
iface = gr.Interface(**_interface_options)
# Start serving the web UI.
iface.launch()