Spaces:
Running
Running
import gradio as gr | |
from transformers import pipeline, AutoTokenizer | |
# Hub checkpoint shared by the classification pipeline and the tokenizer.
model_name = "ealvaradob/bert-finetuned-phishing"

# Token threshold above which an email is split into chunks before
# classification (see detect_phishing / chunk_text).
MAX_TOKENS = 512

# A standalone tokenizer is kept alongside the pipeline so token counts can
# be computed without running the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)
classifier = pipeline(task="text-classification", model=model_name)
def count_tokens(text):
    """Return the number of token ids the module tokenizer yields for ``text``.

    The text is encoded with ``truncation=False`` so the count covers the
    whole input; every other encode option is left at the tokenizer default.
    """
    token_ids = tokenizer.encode(text, truncation=False)
    return len(token_ids)
def chunk_text(text, max_tokens=MAX_TOKENS):
    """Split ``text`` into whitespace-delimited chunks that fit the model.

    Words are packed greedily, in order, into chunks. The per-chunk budget
    is ``max_tokens`` minus the number of special tokens the tokenizer adds
    around a single sequence, so a chunk still fits within ``max_tokens``
    once it is encoded on its own for classification.

    Args:
        text: The raw text to split.
        max_tokens: Maximum tokens per chunk, special tokens included.

    Returns:
        A list of non-empty chunk strings in their original order. The list
        is empty when ``text`` contains no words. A single word whose token
        count alone exceeds the budget cannot be split on whitespace and is
        returned as its own (oversized) chunk, so callers should still
        enable truncation when encoding chunks.
    """
    # Leave room for the special tokens added when a chunk is encoded alone.
    budget = max(1, max_tokens - tokenizer.num_special_tokens_to_add(pair=False))

    chunks = []
    current_words = []
    current_length = 0
    for word in text.split():
        # Per-word counts are summed as an estimate of the joined chunk's
        # length; special tokens are excluded here and covered by `budget`.
        word_length = len(tokenizer.encode(word, add_special_tokens=False))

        # Flush only when something has been collected; otherwise an
        # oversized first word would emit an empty chunk.
        if current_words and current_length + word_length > budget:
            chunks.append(" ".join(current_words))
            current_words = []
            current_length = 0

        current_words.append(word)
        current_length += word_length

    if current_words:
        chunks.append(" ".join(current_words))
    return chunks
def process_chunks(chunks):
    """Classify each chunk and combine the results by majority vote.

    Every chunk is classified independently. The final label is "Phishing"
    only when strictly more chunks are labelled phishing than not; ties
    resolve to "Legitimate". The reported confidence is the mean score of
    the chunks that voted for the final label, so it describes the
    prediction actually shown rather than mixing in scores for the
    opposite label.

    Args:
        chunks: List of text chunks to classify.

    Returns:
        A two-line display string with the prediction and its average
        confidence, or a short notice when ``chunks`` is empty.
    """
    # Nothing to classify: avoid dividing by zero further down.
    if not chunks:
        return "Prediction: Unknown\nNo text to analyze."

    phishing_scores = []
    legitimate_scores = []
    for chunk in chunks:
        # truncation=True guards against a chunk that still exceeds the
        # model's maximum length (e.g. one very long unbroken word).
        result = classifier(chunk, truncation=True)[0]
        if result['label'].lower() == "phishing":
            phishing_scores.append(result['score'])
        else:
            legitimate_scores.append(result['score'])

    if len(phishing_scores) > len(legitimate_scores):
        final_label, winning_scores = "Phishing", phishing_scores
    else:
        # With at least one chunk and no phishing majority, there is
        # always at least one legitimate vote, so this list is non-empty.
        final_label, winning_scores = "Legitimate", legitimate_scores

    average_confidence = sum(winning_scores) / len(winning_scores)
    return f"Prediction: {final_label}\nAverage Confidence: {average_confidence:.2%}"
def detect_phishing(input_text):
    """Classify an email body as phishing or legitimate.

    Inputs whose token count is within ``MAX_TOKENS`` are classified in a
    single call. Longer inputs are split with ``chunk_text`` and the
    per-chunk results are combined by ``process_chunks``.

    Args:
        input_text: The email text entered by the user. May be empty or
            ``None`` when nothing was entered.

    Returns:
        A display string with the prediction and its confidence, or a
        short notice when no text was provided.
    """
    # Skip the model entirely when there is nothing meaningful to classify.
    if not input_text or not input_text.strip():
        return "Prediction: Unknown\nNo email text provided."

    if count_tokens(input_text) > MAX_TOKENS:
        return process_chunks(chunk_text(input_text))

    result = classifier(input_text)[0]
    label = "Phishing" if result['label'].lower() == "phishing" else "Legitimate"
    return f"Prediction: {label}\nConfidence: {result['score']:.2%}"
# Gradio interface: a multi-line text box in, plain text out.
_email_box = gr.Textbox(lines=8, placeholder="Paste email content here...")

demo = gr.Interface(
    fn=detect_phishing,
    inputs=_email_box,
    outputs="text",
    title="Phishing Email Detector",
    description="Uses a fine-tuned BERT model to classify whether the email is phishing or legitimate. Handles long emails by chunking.",
)

# Start the web app as soon as the script runs.
demo.launch()