import streamlit as st
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import pandas as pd

MODEL_ID = "dejanseo/substance"
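

# Load the tokenizer and model once per session; st.cache_resource keeps
# them in memory so Streamlit reruns don't re-download or re-initialize them.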
@st.cache_resource
def load_model():
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
    model.eval()
    return tokenizer, model
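

# Classify a single passage and return (predicted label index, softmax
# confidence of that label). Input is truncated to the model's 512-token limit.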
def classify(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        logits = model(**inputs).logits
    probs = torch.softmax(logits, dim=1).squeeze()
    pred = torch.argmax(probs).item()
    confidence = probs[pred].item()
    return pred, confidence
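

# Page configuration and header copy.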
st.set_page_config(layout="wide")
st.title("Content Substance Classifier")
st.markdown(
    "This tool estimates the likelihood that content is thin or lacking in substance, "
    "using our [deep learning model](https://dejan.ai/blog/content-substance-classification/)."
)
tokenizer, model = load_model()

pasted_text = st.text_area("Enter full text for granular page analysis:", height=100)
run = st.button("Run Analysis", use_container_width=True)
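
# Show a hint until the user provides text.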
if not pasted_text.strip():
    st.info("Enter some text above, then click **Run Analysis**.")
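
# Split the pasted text into non-empty lines and classify each one, so long
# documents get a per-passage verdict rather than a single overall score.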
if run and pasted_text.strip():
    st.markdown("### Results: Text Classification")
    lines = [line.strip() for line in pasted_text.strip().split("\n") if line.strip()]
    results = []
    for line in lines:
        label, conf = classify(line, tokenizer, model)
        results.append({
            "Text": line,
            "Contains Thin Content": "Yes" if label == 0 else "No",
            "Confidence": round(conf, 4),
        })
    df = pd.DataFrame(results)
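    # Render the results table with a progress-bar column for confidence.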
    st.data_editor(
        df,
        column_config={
            "Confidence": st.column_config.ProgressColumn(
                label="Confidence",
                min_value=0.0,
                max_value=1.0,
                format="%.4f",
            )
        },
        hide_index=True,
        use_container_width=True,
    )