import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
import gradio as gr

# Load the email dataset; each row's 'text' column holds one email.
dataset_path = '/content/drive/MyDrive/emails_folder/emails.csv'
df = pd.read_csv(dataset_path)

# The Hugging Face repo id needs the "sentence-transformers/" prefix.
model_name = "sentence-transformers/paraphrase-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)


def mean_pool(model_output, attention_mask):
    # Mean-pool the token embeddings (ignoring padding), the recommended way
    # to get sentence embeddings from sentence-transformers checkpoints.
    token_embeddings = model_output.last_hidden_state
    mask = attention_mask.unsqueeze(-1).float()
    return (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)


# Tokenize and embed every email once, up front, rather than on each query.
email_tokens = tokenizer(df['text'].tolist(), padding=True, truncation=True, return_tensors="pt")
with torch.no_grad():
    email_embeddings = mean_pool(model(**email_tokens), email_tokens['attention_mask'])


def find_most_relevant_email(question):
    question_tokens = tokenizer(question, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        question_embedding = mean_pool(model(**question_tokens), question_tokens['attention_mask'])

    # Cosine similarity between the question and every email embedding.
    similarities = torch.nn.functional.cosine_similarity(question_embedding, email_embeddings, dim=1)
    most_similar_index = similarities.argmax().item()
    return df['text'].iloc[most_similar_index]


iface = gr.Interface(
    fn=find_most_relevant_email,
    inputs="text",
    outputs="text",
    title="Email Question Answering System",
    description="Enter a question and get the most relevant email from the dataset."
)

iface.launch()
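
# A lighter-weight alternative (a sketch, not part of the original script): the
# sentence-transformers package bundles the tokenization, mean pooling, and
# cosine-similarity steps above. It assumes `pip install sentence-transformers`;
# the helper name find_most_relevant_email_st is hypothetical.
#
# from sentence_transformers import SentenceTransformer, util
#
# st_model = SentenceTransformer("sentence-transformers/paraphrase-MiniLM-L6-v2")
# email_embeddings_st = st_model.encode(df['text'].tolist(), convert_to_tensor=True)
#
# def find_most_relevant_email_st(question):
#     question_embedding = st_model.encode(question, convert_to_tensor=True)
#     scores = util.cos_sim(question_embedding, email_embeddings_st)
#     return df['text'].iloc[scores.argmax().item()]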