File size: 2,315 Bytes
27d4105
 
 
 
 
 
 
5286d26
27d4105
 
 
 
 
 
 
5286d26
 
 
 
27d4105
 
 
 
 
 
 
 
 
5286d26
27d4105
 
5286d26
27d4105
 
 
5286d26
 
 
 
27d4105
 
5286d26
 
27d4105
5286d26
27d4105
 
 
 
5286d26
27d4105
 
5286d26
 
27d4105
 
5286d26
 
 
27d4105
 
5286d26
27d4105
 
 
 
5286d26
27d4105
5286d26
 
27d4105
 
5286d26
27d4105
 
5286d26
 
ecf2409
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import gradio as gr
from transformers import pipeline, AutoTokenizer

# Load model and tokenizer
# Fine-tuned BERT checkpoint for binary phishing/legitimate classification.
model_name = "ealvaradob/bert-finetuned-phishing"
# Downloads weights on first run; loaded once at import time so each
# request only pays inference cost.
classifier = pipeline("text-classification", model=model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# BERT's maximum sequence length; inputs longer than this are chunked.
MAX_TOKENS = 512

def count_tokens(text):
    """Return the token count of *text*, special tokens included, with no truncation."""
    token_ids = tokenizer.encode(text, truncation=False)
    return len(token_ids)

def chunk_text(text, max_tokens=MAX_TOKENS):
    """Split *text* into whitespace-word chunks that each fit the model.

    Words are accumulated until adding the next one would push the chunk
    past the token budget, then a new chunk is started.

    Args:
        text: Raw input text.
        max_tokens: Model sequence limit (defaults to MAX_TOKENS).

    Returns:
        List of chunk strings. May be empty if *text* contains no words.
    """
    # Reserve room for the special tokens ([CLS]/[SEP]) the tokenizer adds
    # at classification time; without this a "full" chunk can exceed the
    # model's 512-token limit.
    budget = max_tokens - tokenizer.num_special_tokens_to_add()

    chunks = []
    current_chunk = []
    current_length = 0

    for word in text.split():
        word_length = len(tokenizer.encode(word, add_special_tokens=False))
        # Only flush a non-empty chunk: a single word longer than the budget
        # must not produce an empty-string chunk (original bug). Such a word
        # becomes its own chunk and relies on downstream truncation.
        if current_chunk and current_length + word_length > budget:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
            current_length = word_length
        else:
            current_chunk.append(word)
            current_length += word_length

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

def process_chunks(chunks):
    """Classify each chunk and aggregate via majority vote.

    The reported confidence is averaged only over chunks that agree with
    the final verdict — averaging across both labels (the original
    behavior) produced a number that described neither prediction.

    Args:
        chunks: List of text chunks, each within the model's token limit.

    Returns:
        A display string with the majority-vote label and its average
        confidence. Ties break toward "Legitimate", matching the original
        strict-majority rule.
    """
    # Guard: an empty chunk list would otherwise divide by zero.
    if not chunks:
        return "Prediction: Legitimate\nAverage Confidence: 0.00%"

    phishing_scores = []
    legitimate_scores = []

    for chunk in chunks:
        result = classifier(chunk)[0]
        if result['label'].lower() == "phishing":
            phishing_scores.append(result['score'])
        else:
            legitimate_scores.append(result['score'])

    if len(phishing_scores) > len(legitimate_scores):
        final_label, winning_scores = "Phishing", phishing_scores
    else:
        final_label, winning_scores = "Legitimate", legitimate_scores

    average_confidence = sum(winning_scores) / len(winning_scores)

    return f"Prediction: {final_label}\nAverage Confidence: {average_confidence:.2%}"

def detect_phishing(input_text):
    """Classify *input_text* as phishing or legitimate.

    Short inputs (<= MAX_TOKENS tokens) are classified in a single pass;
    longer inputs are split into chunks and aggregated by majority vote.

    Returns:
        A display string with the prediction and confidence.
    """
    # Long input: split into model-sized pieces and vote over them.
    if count_tokens(input_text) > MAX_TOKENS:
        return process_chunks(chunk_text(input_text))

    # Short input: one direct classification.
    result = classifier(input_text)[0]
    verdict = "Phishing" if result['label'].lower() == "phishing" else "Legitimate"
    return f"Prediction: {verdict}\nConfidence: {result['score']:.2%}"

# Gradio interface: a single textbox in, a plain-text verdict out.
email_input = gr.Textbox(lines=8, placeholder="Paste email content here...")

demo = gr.Interface(
    fn=detect_phishing,
    inputs=email_input,
    outputs="text",
    title="Phishing Email Detector",
    description="Uses a fine-tuned BERT model to classify whether the email is phishing or legitimate. Handles long emails by chunking.",
)

demo.launch()