File size: 1,666 Bytes
5413556 990e8dd 71226c9 332b958 990e8dd c198e10 990e8dd 332b958 71226c9 990e8dd 332b958 990e8dd 71226c9 332b958 71226c9 332b958 990e8dd 332b958 990e8dd 71226c9 990e8dd 71226c9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 |
# https://huggingface.co/spaces/gigisan81/Enron
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from scipy.spatial.distance import cosine
import torch
import gradio as gr
# Load the dataset
# NOTE(review): this looks like a Google Colab Drive mount path; confirm the
# file exists at this location in the environment where the app is deployed.
dataset_path = '/content/drive/MyDrive/emails_folder/emails.csv'
df = pd.read_csv(dataset_path)
# Load the pre-trained model and tokenizer
# The checkpoint is published on the Hugging Face Hub under the
# "sentence-transformers" namespace, so the fully qualified id is used here;
# the bare name is not expected to resolve through AutoTokenizer/AutoModel.
model_name = "sentence-transformers/paraphrase-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
# Tokenize the emails
# Done once at import time so the same token tensors are reused for every
# query. The whole 'text' column is tokenized as a single padded batch, with
# over-long emails truncated by the tokenizer.
email_tokens = tokenizer(df['text'].tolist(), padding=True, truncation=True, return_tensors="pt")
# Function to find the most relevant email based on the question
def find_most_relevant_email(question):
    """Return the text of the email most similar to *question*.

    The question and the pre-tokenized emails are both encoded with the
    module-level ``model``; the email whose pooled embedding has the highest
    cosine similarity to the question's embedding is returned.

    Args:
        question: The query string entered by the user.

    Returns:
        The value of the ``'text'`` column of ``df`` for the best match.
    """
    # Tokenize the question
    question_tokens = tokenizer(question, padding=True, truncation=True, return_tensors="pt")
    # Encode the question and emails
    # NOTE(review): the email embeddings are recomputed on every call even
    # though email_tokens never changes; consider computing them once.
    with torch.no_grad():
        question_embedding = model(**question_tokens).pooler_output
        email_embeddings = model(**email_tokens).pooler_output
    # Calculate cosine similarity between question and email embeddings.
    # scipy's cosine() only accepts two 1-D vectors and has no ``axis``
    # argument, so the torch implementation is used instead: it broadcasts
    # the single question row against every email row along dim=1.
    similarities = torch.nn.functional.cosine_similarity(
        question_embedding, email_embeddings, dim=1
    )
    # Get the index of the most similar email
    most_similar_index = similarities.argmax().item()
    # Return the text of the most similar email; the index is a row
    # position in the embedding matrix, so look it up positionally.
    return df['text'].iloc[most_similar_index]
# Build the Gradio UI: one text box in, one text box out, backed by the
# email lookup function.
iface = gr.Interface(
    find_most_relevant_email,
    "text",
    "text",
    title="Email Question Answering System",
    description="Enter a question and get the most relevant email from the dataset.",
)
# Start serving the interface
iface.launch()
|