Spaces:
Sleeping
Sleeping
import os, re | |
import gradio as gr | |
# Default TOKENIZERS_PARALLELISM to "false" without overriding a value that
# the environment already provides.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

# Model id handed to the transformers loaders inside analyze().
URL_MODEL_ID = "CrabInHoney/urlbert-tiny-v4-malicious-url-classifier"

# Fallback names for generic "LABEL_<n>" class labels.
URL_LABEL_MAP = {
    "LABEL_0": "benign",
    "LABEL_1": "defacement",
    "LABEL_2": "malware",
    "LABEL_3": "phishing",
}

# Matches http(s):// URLs and bare "www." links inside free text.
# The userinfo, port and query/fragment parts are matched explicitly so that a
# link such as "http://brand.com@evil.example:8080/login" is captured whole
# instead of being cut off after the first host-looking token.
URL_RE = re.compile(
    r"""(?xi)
    \b(?:https?://|www\.)        # scheme, or a bare www. prefix
    (?:[a-z0-9\-._~%:]+@)?       # optional userinfo (user:pass@)
    [a-z0-9\-._~%]+              # host
    (?::\d+)?                    # optional numeric port
    (?:[/?\#][^\s<>"']*)?        # optional path, query and/or fragment
    """
)

_pipe = None  # created on first analyze()
def _trim_trailing_punct(url: str) -> str:
    """Strip sentence punctuation and unbalanced closing brackets from the end of *url*."""
    closers = {")": "(", "]": "[", "}": "{"}
    while url:
        trimmed = url.rstrip(".,;:!?")
        opener = closers.get(trimmed[-1:])
        # A closing bracket stays only when the URL itself contains its opener,
        # e.g. ".../wiki/Foo_(bar)"; otherwise it belongs to the surrounding prose.
        if opener is not None and trimmed.count(trimmed[-1]) > trimmed.count(opener):
            trimmed = trimmed[:-1]
        if trimmed == url:
            break
        url = trimmed
    return url


def _extract_urls(t: str):
    """Return the unique URLs found in *t*, sorted lexicographically.

    Every ``URL_RE`` match is collected, trailing punctuation that belongs to
    the surrounding sentence ("see http://x.com/a).") is trimmed, duplicates
    are dropped and the result is sorted so the output order is deterministic.

    Args:
        t: Free text to scan; ``None`` or an empty string yields ``[]``.

    Returns:
        A sorted list of distinct URL strings.
    """
    if not t:
        return []
    found = set()
    for match in URL_RE.finditer(t):
        url = _trim_trailing_punct(match.group(0))
        if url:
            found.add(url)
    return sorted(found)
def _pretty(raw, id2label):
    """Translate a raw classifier label into a human-readable class name.

    Lookup order:
      1. ``id2label`` keyed by the raw label, by its numeric suffix as a
         string, or by that suffix as an int. Hugging Face configs normally
         key ``id2label`` by integer class index, so the int form is the one
         that matches a real model config.
      2. ``URL_LABEL_MAP`` keyed by the raw label.
      3. The raw label itself, unchanged.

    Entries in ``id2label`` that are themselves generic ``LABEL_<n>``
    placeholders are skipped so the fallback map can still supply a real name.

    Args:
        raw: Label string as emitted by the pipeline, e.g. ``"LABEL_3"``.
        id2label: Mapping taken from the model config, or ``None``.

    Returns:
        The resolved class name as a string.
    """
    if id2label:
        suffix = raw.replace("LABEL_", "")
        candidates = [raw, suffix]
        if suffix.isdecimal():
            candidates.append(int(suffix))
        for key in candidates:
            name = id2label.get(key)
            if isinstance(name, str) and name and not name.startswith("LABEL_"):
                return name
    return URL_LABEL_MAP.get(raw, raw)
def _md_code_cell(text) -> str:
    """Render *text* as an inline code span that is safe inside a Markdown table cell."""
    # A table row cannot span lines, so flatten any line breaks.
    text = " ".join(str(text).splitlines())
    # An unescaped pipe would start a new column, even inside a code span.
    text = text.replace("|", "\\|")
    # Use a fence one backtick longer than any run in the content so that the
    # content cannot terminate the code span early.
    longest = max((len(run) for run in re.findall(r"`+", text)), default=0)
    fence = "`" * (longest + 1)
    pad = " " if longest else ""
    return f"{fence}{pad}{text}{pad}{fence}"


def _markdown_table(rows):
    """Format classification results as a three-column Markdown table.

    URLs come from user-supplied text, so each one is escaped before being
    placed in its cell; ordinary URLs render exactly as `` `url` ``.

    Args:
        rows: Iterable of ``(url, label, confidence)`` triples, where
            confidence is a number formatted with two decimals.

    Returns:
        The table as a single newline-joined string (header rows only when
        *rows* is empty).
    """
    lines = ["| URL | Prediction | Confidence (%) |", "|---|---|---|"]
    lines.extend(
        f"| {_md_code_cell(u)} | **{lbl}** | {conf:.2f} |" for u, lbl, conf in rows
    )
    return "\n".join(lines)
def analyze(text: str) -> str:
    """Classify every URL found in *text* and return a Markdown report.

    The input is either a single URL or a larger body of text (for example an
    email) from which URLs are extracted. Each URL is scored by the
    text-classification pipeline, which is built on the first call that needs
    it and cached in the module-level ``_pipe``.

    Args:
        text: A URL or free text containing URLs; ``None`` is treated as empty.

    Returns:
        A hint message when the input is empty or contains no URL; otherwise
        an overall verdict line followed by a per-URL results table.
    """
    global _pipe

    text = (text or "").strip()
    if not text:
        return "Paste an email body or a URL."

    # Use the input verbatim only when it is one whitespace-free token.
    # Checking for any whitespace (not just spaces) keeps newline- or
    # tab-separated URL lists from being scored as a single string.
    is_single_url = (
        text.lower().startswith(("http://", "https://", "www."))
        and len(text.split()) == 1
    )
    urls = [text] if is_single_url else _extract_urls(text)
    if not urls:
        return "No URLs detected in the text."

    if _pipe is None:
        # Imported lazily so the heavy dependency is only loaded when needed.
        from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

        tok = AutoTokenizer.from_pretrained(URL_MODEL_ID)
        mdl = AutoModelForSequenceClassification.from_pretrained(URL_MODEL_ID)
        # top_k=None requests the scores of all classes for each input.
        _pipe = pipeline("text-classification", model=mdl, tokenizer=tok, device=-1, top_k=None)
    id2label = getattr(_pipe.model.config, "id2label", None)

    unsafe_labels = {"phishing", "malware", "defacement"}
    rows, unsafe = [], False
    for u in urls:
        # truncation=True keeps very long URLs within the model's input limit
        # instead of failing inside the model.
        result = _pipe(u, truncation=True)
        # Accept both a nested result ([[{...}, ...]]) and a flat one ([{...}, ...]).
        scores = result[0] if result and isinstance(result[0], list) else result
        top = max(scores, key=lambda s: s["score"])
        lbl = _pretty(top["label"], id2label)
        conf = 100 * float(top["score"])
        rows.append([u, lbl, conf])
        if lbl.lower() in unsafe_labels:
            unsafe = True

    verdict = "🔴 **UNSAFE (links flagged)**" if unsafe else "🟢 **SAFE (all links benign)**"
    return verdict + "\n\n" + _markdown_table(rows)
def _build_demo():
    """Assemble the Gradio interface: one text input wired to analyze(), Markdown output."""
    text_input = gr.Textbox(
        lines=6,
        label="Email or URL",
        placeholder="Paste a URL or a full email…",
    )
    report_view = gr.Markdown(label="Results")
    return gr.Interface(
        fn=analyze,
        inputs=text_input,
        outputs=report_view,
        title="🛡️ PhishingMail (Link Analysis)",
        description="Extracts links from your text and classifies each with a compact malicious-URL model.",
    )


# Module-level handle to the interface; launched below when run as a script.
demo = _build_demo()

if __name__ == "__main__":
    demo.launch()