# https://huggingface.co/spaces/gigisan81/Enron
"""Gradio app that returns the dataset email most similar to a question.

At start-up the script loads a CSV of emails, embeds every email once with a
pre-trained transformer encoder, and then serves a Gradio text interface.
Each query is embedded the same way and compared to the cached email
embeddings with cosine similarity; the best-scoring email text is returned.
"""

import gradio as gr
import pandas as pd
import torch
from scipy.spatial.distance import cosine  # noqa: F401  (kept; see note in find_most_relevant_email)
from transformers import AutoModel, AutoTokenizer

# Load the dataset
# NOTE(review): the code reads a 'text' column from this CSV; confirm the
# file actually has a column with that name.
dataset_path = '/content/drive/MyDrive/emails_folder/emails.csv'
df = pd.read_csv(dataset_path)

# Load the pre-trained model and tokenizer
# NOTE(review): on the Hugging Face Hub this checkpoint is presumably
# published as "sentence-transformers/paraphrase-MiniLM-L6-v2"; if loading
# fails with "not a valid model identifier", use the namespaced id.
model_name = "paraphrase-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Number of emails encoded per forward pass. Encoding the whole corpus as a
# single padded batch would allocate one (num_emails x max_len) tensor.
_EMBED_BATCH_SIZE = 64


def _embed(texts):
    """Return one mean-pooled embedding per string in ``texts``.

    Args:
        texts: A non-empty list of strings.

    Returns:
        A float tensor with one row per input text.
    """
    tokens = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        hidden = model(**tokens).last_hidden_state

    # Average token vectors, ignoring padding positions. The model card for
    # this checkpoint documents mean pooling (not ``pooler_output``) as the
    # way to obtain sentence embeddings with plain Transformers.
    mask = tokens["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    summed = (hidden * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)  # guard against division by zero
    return summed / counts


def _embed_in_batches(texts, batch_size=_EMBED_BATCH_SIZE):
    """Embed ``texts`` in chunks of ``batch_size`` and stack the results.

    Returns an empty ``(0, hidden_size)`` tensor when ``texts`` is empty.
    """
    if not texts:
        return torch.empty((0, model.config.hidden_size))
    chunks = [
        _embed(texts[start:start + batch_size])
        for start in range(0, len(texts), batch_size)
    ]
    return torch.cat(chunks, dim=0)


# Coerce to plain strings: missing cells are read as NaN (a float), which the
# tokenizer cannot handle.
email_texts = df['text'].fillna("").astype(str).tolist()

# Embed the emails once at start-up; they do not depend on the question, so
# recomputing them on every request is wasted work.
email_embeddings = _embed_in_batches(email_texts)


# Function to find the most relevant email based on the question
def find_most_relevant_email(question):
    """Return the text of the email most similar to ``question``.

    Args:
        question: Free-text query entered by the user.

    Returns:
        The text of the highest-scoring email, or a short notice when the
        question is blank or the dataset contains no emails.
    """
    if question is None or not question.strip():
        return "Please enter a question."
    if not email_texts:
        return "No emails are available in the dataset."

    # Shape (1, dim); broadcasts against the (num_emails, dim) matrix below.
    question_embedding = _embed([question])

    # scipy's ``cosine`` only accepts two 1-D vectors and has no ``axis``
    # argument, so the row-wise comparison is done with torch instead.
    similarities = torch.nn.functional.cosine_similarity(
        question_embedding, email_embeddings, dim=1
    )

    # Positional index into the list built from the same rows as the
    # embeddings, so it is independent of the DataFrame's index labels.
    most_similar_index = int(similarities.argmax())
    return email_texts[most_similar_index]


# Create Gradio interface
iface = gr.Interface(
    fn=find_most_relevant_email,
    inputs="text",
    outputs="text",
    title="Email Question Answering System",
    description="Enter a question and get the most relevant email from the dataset."
)

# Launch the interface
iface.launch()