File size: 2,082 Bytes
87d2fc1
15a78c6
87d2fc1
15a78c6
 
b7ba252
15a78c6
 
 
87d2fc1
 
15a78c6
 
 
 
87d2fc1
 
15a78c6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87d2fc1
15a78c6
 
87d2fc1
 
15a78c6
b7ba252
15a78c6
 
 
 
b7ba252
15a78c6
 
 
 
 
 
 
b7ba252
87d2fc1
15a78c6
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import gradio as gr
from transformers import pipeline

# Hub repository that provides both the model weights and the tokenizer.
MODEL_ID = "HagalazAI/RedSecureBERT"
# Minimum "LABEL_1" probability at which predict_offensive() reports is_red=True.
THRESHOLD = 0.515

# Build the classifier once at import time so every request reuses it.
# Softmax is requested explicitly so the two scores are probabilities that
# sum to 1, regardless of what the model config would select by default.
clf = pipeline(
    "text-classification",
    model=MODEL_ID,
    tokenizer=MODEL_ID,
    top_k=None,  # return the score of every label, not just the top one
    function_to_apply="softmax",
    # Clip over-long inputs to the tokenizer's maximum length; without this,
    # text longer than the model's position limit can fail in the forward pass.
    truncation=True,
)


def predict_offensive(text, classifier=None, threshold=None):
    """
    Score `text` and flag it when the "LABEL_1" probability reaches the threshold.

    Args:
      * text: the string to classify
      * classifier: optional callable used instead of the module-level `clf`;
        it takes the text and returns {"label", "score"} dicts, either as a
        flat list or nested one level (one inner list per input)
      * threshold: optional cut-off used instead of the module-level THRESHOLD

    Returns:
      A dict with
      * "P(offensive)": the "LABEL_1" score formatted to 3 decimals (a string)
      * "is_red": True when that score is >= the threshold

    Raises:
      ValueError: if the classifier output contains no "LABEL_1" entry.
    """
    # Fall back to the module-level objects only when nothing was injected, so
    # existing single-argument callers (e.g. the Gradio interface) are unaffected.
    if classifier is None:
        classifier = clf
    if threshold is None:
        threshold = THRESHOLD

    preds = classifier(text)

    # Normalise the result to a flat list of {"label", "score"} dicts.
    # For one string the pipeline may hand back [[d1, d2]] (a batch of one)
    # or just [d1, d2]; accept both rather than blindly indexing [0].
    if isinstance(preds, dict):
        preds = [preds]
    elif preds and not isinstance(preds[0], dict):
        preds = preds[0]

    # Entries are ordered by score, not by label, so look the label up by name.
    label_1_entry = next(
        (entry for entry in preds if entry["label"] == "LABEL_1"),
        None,
    )
    if label_1_entry is None:
        seen = [entry["label"] for entry in preds]
        raise ValueError(
            f"Classifier output has no 'LABEL_1' entry; labels returned: {seen}"
        )

    prob_offensive = float(label_1_entry["score"])
    is_red = prob_offensive >= threshold

    return {
        "P(offensive)": f"{prob_offensive:.3f}",
        "is_red": is_red,
    }


def _build_demo():
    """Assemble the Gradio interface that exposes predict_offensive."""
    # Two-line text box; the placeholder hints at the kind of prompt to try.
    prompt_box = gr.Textbox(
        placeholder="Try an exploit-like prompt: e.g. 'Bypass an antivirus...'",
        lines=2,
    )

    # Description pieces are kept separate so the model id and threshold
    # shown to the user always reflect the module constants.
    blurb_parts = [
        f"This Space uses **{MODEL_ID}**.\n\n",
        f"**Threshold** for 'is_red' = {THRESHOLD}\n\n",
        "The model is a 2-class classifier: LABEL_0=Not offensive, LABEL_1=Offensive.\n",
    ]

    return gr.Interface(
        fn=predict_offensive,
        inputs=prompt_box,
        outputs="json",
        title="RedSecureBERT Demo",
        description="".join(blurb_parts),
        allow_flagging="never",
    )


demo = _build_demo()

# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()