"""Streamlit app that classifies each line of pasted text with a sequence-classification model."""

import streamlit as st
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import pandas as pd

# Constants
MODEL_ID = "dejanseo/substance"


@st.cache_resource
def load_model():
    """Load the tokenizer and model for ``MODEL_ID`` and return them as a pair.

    Decorated with ``st.cache_resource`` so the objects are created once and
    reused across Streamlit reruns instead of being reloaded on every
    interaction. The model is switched to eval mode before being returned.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
    model.eval()
    return tokenizer, model


def classify(text, tokenizer, model):
    """Classify a single piece of text.

    Args:
        text: The string to classify. Input is truncated to 512 tokens.
        tokenizer: Tokenizer callable that returns PyTorch tensors.
        model: Model whose output exposes a ``.logits`` tensor.

    Returns:
        A ``(pred, confidence)`` tuple: the index of the highest-probability
        class and the softmax probability of that class as a Python float.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        logits = model(**inputs).logits

    # A single text yields one row of logits; index the batch axis explicitly
    # so only that axis is dropped (assumes logits are (1, num_labels)).
    probs = torch.softmax(logits, dim=1)[0]
    pred = torch.argmax(probs).item()
    confidence = probs[pred].item()
    return pred, confidence


# --- UI ---
st.set_page_config(layout="wide")
st.title("Content Substance Classifier")
st.markdown("This tool estimates the likelihood that content is thin or lacking in substance, using our [deep learning model](https://dejan.ai/blog/content-substance-classification/).")

tokenizer, model = load_model()

pasted_text = st.text_area("Enter full text for granular page analysis:", height=100)
run = st.button("🚀 Run Analysis", use_container_width=True)

# Strip once; the result drives both the hint and the analysis below.
cleaned_text = pasted_text.strip()

if not cleaned_text:
    st.info("Enter some text above, then click **Run Analysis**.")

# --- Classify Text ---
if run and cleaned_text:
    st.markdown("### Results: Text Classification")

    # Each non-blank line of the input is classified independently.
    lines = [line.strip() for line in cleaned_text.split("\n") if line.strip()]

    results = []
    for line in lines:
        label, conf = classify(line, tokenizer, model)
        results.append({
            "Text": line,
            # Class index 0 is reported as thin content.
            "Contains Thin Content": "Yes" if label == 0 else "No",
            "Confidence": round(conf, 4)
        })

    df = pd.DataFrame(results)
    st.data_editor(
        df,
        column_config={
            # Render the confidence as a 0..1 progress bar.
            "Confidence": st.column_config.ProgressColumn(
                label="Confidence",
                min_value=0.0,
                max_value=1.0,
                format="%.4f"
            )
        },
        hide_index=True,
        use_container_width=True
    )