import streamlit as st
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import pandas as pd
# Constants
MODEL_ID = "dejanseo/substance"
@st.cache_resource
def load_model():
    # Load the tokenizer and classification model once and cache them across Streamlit reruns.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
    model.eval()
    return tokenizer, model

def classify(text, tokenizer, model):
    # Tokenize the input, run a forward pass without gradients, and return the
    # predicted class index together with its softmax probability.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        logits = model(**inputs).logits
    probs = torch.softmax(logits, dim=1).squeeze()
    pred = torch.argmax(probs).item()
    confidence = probs[pred].item()
    return pred, confidence
# --- UI ---
st.set_page_config(layout="wide")
st.title("Content Substance Classifier")
st.markdown("This tool estimates the likelihood that content is thin or lacking in substance, using our [deep learning model](https://dejan.ai/blog/content-substance-classification/).")
tokenizer, model = load_model()
pasted_text = st.text_area("Enter full text for granular page analysis:", height=100)
run = st.button("🚀 Run Analysis", use_container_width=True)
if not pasted_text.strip():
    st.info("Enter some text above, then click **Run Analysis**.")
# --- Classify Text ---
if run and pasted_text.strip():
    st.markdown("### Results: Text Classification")

    # Classify each non-empty line of the pasted text individually.
    lines = [line.strip() for line in pasted_text.strip().split("\n") if line.strip()]
    results = []
    for line in lines:
        label, conf = classify(line, tokenizer, model)
        results.append({
            "Text": line,
            "Contains Thin Content": "Yes" if label == 0 else "No",
            "Confidence": round(conf, 4)
        })

    # Display per-line predictions, rendering confidence as a progress bar.
    df = pd.DataFrame(results)
    st.data_editor(
        df,
        column_config={
            "Confidence": st.column_config.ProgressColumn(
                label="Confidence",
                min_value=0.0,
                max_value=1.0,
                format="%.4f"
            )
        },
        hide_index=True,
        use_container_width=True
    )