File size: 3,007 Bytes
83874aa
 
 
 
 
 
 
 
a3bf388
 
 
83874aa
 
 
 
 
 
e1177f7
 
83874aa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# app.py
import io
import os

# HF_HOME has to be exported BEFORE `transformers` is imported: the Hugging
# Face libraries resolve their cache directory from the environment while they
# are being imported, so assigning it afterwards leaves the default cache
# location in effect.
os.environ["HF_HOME"] = "/home/user/huggingface"

import pandas as pd  # noqa: E402
import torch  # noqa: E402
from transformers import pipeline  # noqa: E402
import streamlit as st  # noqa: E402
from utils import apply_model  # noqa: E402


# show_spinner=False keeps the cached call from rendering anything ahead of
# the st.set_page_config call that follows further down the script.
@st.cache_resource(show_spinner=False)
def _load_ner_pipeline(device_index):
    """Build the SecureBERT token-classification pipeline once per process.

    Streamlit re-executes this script from top to bottom on every widget
    interaction; caching the pipeline as a resource avoids reloading the
    model on each of those reruns.

    Args:
        device_index: Device argument forwarded to ``transformers.pipeline``
            (``0`` when CUDA is available, ``-1`` otherwise).

    Returns:
        The object returned by ``transformers.pipeline``.
    """
    return pipeline(
        "token-classification",
        model="CyberPeace-Institute/SecureBERT-NER",
        device=device_index,
    )


# Use GPU index 0 when CUDA is available, otherwise -1 (CPU).
device = 0 if torch.cuda.is_available() else -1
ner_pipeline = _load_ner_pipeline(device)

# Browser-tab title, icon and page layout.
st.set_page_config(
    layout="centered",
    page_icon="🧠",
    page_title="NER on Text Files",
)

st.title("🧠 Named Entity Recognition (NER)")
# Diagnostic line: shows the current value of the XSRF-protection server option.
st.write("XSRF:", st.get_option("server.enableXsrfProtection"))

st.write("Upload a single **.txt** file and extract entities by class.")

# --- Sidebar: settings header and the model identifier ---
st.sidebar.header("Settings")
model_name = "CyberPeace-Institute/SecureBERT-NER"

# Uploader limited to one .txt file per submission.
uploaded = st.file_uploader(
    label="Upload a .txt file",
    type=["txt"],
    help="Plain text only.",
    accept_multiple_files=False,
)

def _group_entities(preds, source_text):
    """Group unique entity strings by class label, keeping first-seen order.

    Args:
        preds: Iterable of prediction dicts. The keys read are
            ``entity_group``/``entity`` (class label), ``word`` (entity text)
            and ``start``/``end`` (character offsets into ``source_text``).
        source_text: The text the predictions were produced from; used to
            recover the entity string when ``word`` is missing or blank.

    Returns:
        A dict mapping each label to a list of distinct entity strings.
        Labels with no usable entity text are omitted.
    """
    grouped = {}
    for pred in preds:
        label = pred.get("entity_group") or pred.get("entity") or "UNKNOWN"

        # Only `word` carries the surface text. `entity` is read as the class
        # label above, so it must not be used as a fallback for the text.
        ent_text = (pred.get("word") or "").strip()
        if not ent_text:
            start, end = pred.get("start"), pred.get("end")
            if start is not None and end is not None:
                ent_text = source_text[start:end].strip()
        if not ent_text:
            continue

        # A dict works as an insertion-ordered set: constant-time duplicate
        # check without losing the order in which entities were first seen.
        grouped.setdefault(label, {}).setdefault(ent_text, None)

    return {label: list(ents) for label, ents in grouped.items()}


if uploaded is not None:
    # Decode the upload. "utf-8-sig" behaves like UTF-8 but also drops a
    # leading byte-order mark so it does not end up in the analysed text.
    raw_bytes = uploaded.read()
    try:
        text = raw_bytes.decode("utf-8-sig")
    except UnicodeDecodeError:
        # Fallback if not UTF-8
        text = raw_bytes.decode("latin-1", errors="ignore")

    st.subheader("Preview")
    st.text_area("File contents", text, height=220)

    if st.button("Process with NER", type="primary"):
        with st.spinner("Loading model and extracting entities…"):
            # apply_model is given a nested one-document batch; [0][0] picks
            # the prediction list for that single text.
            preds = apply_model([[text]], ner_pipeline)[0][0]

        by_label = _group_entities(preds, text)

        if by_label:
            # One row per class, entities joined into a comma-separated cell.
            rows = [
                {"Class": label, "Entities": ", ".join(ents)}
                for label, ents in by_label.items()
            ]
            df = pd.DataFrame(rows).sort_values("Class").reset_index(drop=True)

            st.subheader("Results")
            st.dataframe(df, use_container_width=True)

            # Offer CSV download; to_csv returns the CSV text when no target
            # path or buffer is given.
            st.download_button(
                label="Download results as CSV",
                data=df.to_csv(index=False),
                file_name="ner_results.csv",
                mime="text/csv",
            )
        else:
            st.info("No entities found by the selected model.")

else:
    st.info("Upload a single .txt file to begin.")