# https://huggingface.co/spaces/gigisan81/Enron

import pandas as pd
from transformers import AutoTokenizer, AutoModel
from scipy.spatial.distance import cosine
import torch
import gradio as gr

# Load the dataset
# NOTE(review): this looks like a Google Colab Drive mount path; confirm the
# file exists at this location wherever the app is actually deployed.
dataset_path = '/content/drive/MyDrive/emails_folder/emails.csv'
df = pd.read_csv(dataset_path)

# Load the pre-trained model and tokenizer.
# transformers' from_pretrained needs the full Hub repository id, including the
# "sentence-transformers/" namespace; the bare model name is only resolved
# automatically by the sentence-transformers library, not by AutoModel.
model_name = "sentence-transformers/paraphrase-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Tokenize the emails once at start-up; padding/truncation give every email
# the same sequence length so they can be fed to the model as one batch.
email_tokens = tokenizer(df['text'].tolist(), padding=True, truncation=True, return_tensors="pt")

# Email embeddings, filled in on first use and reused afterwards: neither the
# tokenized emails nor the model change between questions, so re-encoding the
# whole corpus for every request would only repeat the same work.
_email_embeddings = None


def _get_email_embeddings():
    """Return the pooled embeddings of all emails, computing them only once."""
    global _email_embeddings
    if _email_embeddings is None:
        with torch.no_grad():
            _email_embeddings = model(**email_tokens).pooler_output
    return _email_embeddings


# Function to find the most relevant email based on the question
def find_most_relevant_email(question):
    """Return the text of the email whose embedding is closest to ``question``.

    The question is tokenized and encoded with the module-level ``tokenizer``
    and ``model``; its pooled embedding is compared by cosine similarity with
    the pooled embedding of every email, and the best-scoring email is
    returned.

    Args:
        question: The question text entered by the user.

    Returns:
        The ``text`` column value of the most similar email in ``df``.
    """
    # Tokenize the question
    question_tokens = tokenizer(question, padding=True, truncation=True, return_tensors="pt")
    # Encode the question; gradients are not needed for inference
    with torch.no_grad():
        question_embedding = model(**question_tokens).pooler_output
    email_embeddings = _get_email_embeddings()
    # NOTE(review): pooler_output is kept from the original code; the model
    # card for this checkpoint may recommend mean pooling instead - confirm.

    # scipy.spatial.distance.cosine only accepts a pair of 1-D vectors and has
    # no ``axis`` argument, so it cannot score one question against a batch.
    # torch's cosine_similarity broadcasts the question row against every
    # email row and yields one similarity score per email.
    similarities = torch.nn.functional.cosine_similarity(
        question_embedding, email_embeddings, dim=1
    )
    # Get the index of the most similar email
    most_similar_index = similarities.argmax().item()
    # The argmax is a row position, so look the text up positionally
    return df['text'].iloc[most_similar_index]

# Build the web UI: a single text box in, a single text box out, with
# find_most_relevant_email doing the work for each submitted question.
iface = gr.Interface(
    title="Email Question Answering System",
    description="Enter a question and get the most relevant email from the dataset.",
    fn=find_most_relevant_email,
    inputs="text",
    outputs="text",
)

# Start the Gradio app
iface.launch()