Spaces:

dejanseo
/

ai-detection-small

Running

App Files Community

dejanseo committed on Apr 17

Commit

68d1553

verified ·

1 Parent(s): 4f473c9

Update app.py

Browse files

Files changed (1) hide show

app.py +75 -150

app.py CHANGED Viewed

@@ -3,201 +3,126 @@ import torch
 import torch.nn.functional as F
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import re
-import logging # Optional: Add logging for better debugging
-# Set up logging (optional but helpful)
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-# Set the page configuration
 st.set_page_config(
     page_title="AI Article Detection by DEJAN",
     page_icon="🧠",
     layout="wide"
 )
-# Logo as provided
-st.logo(
-    image="https://dejan.ai/wp-content/uploads/2024/02/dejan-300x103.png",
-    link="https://dejan.ai/",
-    # size="large" # 'size' is not a valid argument for st.logo as of Streamlit 1.34 - remove or adjust if needed
 )
-# Font styling
 st.markdown("""
 <link href="https://fonts.googleapis.com/css2?family=Roboto&display=swap" rel="stylesheet">
 <style>
-    html, body, [class*="css"] {
-        font-family: 'Roboto', sans-serif;
-    }
 </style>
 """, unsafe_allow_html=True)
-@st.cache_resource # Cache the model and tokenizer to avoid reloading on every interaction
 def load_model_and_tokenizer(model_name):
-    """Loads the model and tokenizer."""
-    logger.info(f"Loading tokenizer: {model_name}")
     tokenizer = AutoTokenizer.from_pretrained(model_name)
-    # Determine device
-    device_type = "cuda" if torch.cuda.is_available() else "cpu"
-    # Use bfloat16 if available on CUDA for potential speedup/memory saving, else float32
-    dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float32
-    logger.info(f"Using device: {device_type} with dtype: {dtype}")
-    logger.info(f"Loading model: {model_name}")
-    # Load model onto CPU first, then move to target device
-    model = AutoModelForSequenceClassification.from_pretrained(
-        model_name,
-        torch_dtype=dtype # Use the determined dtype
-        # Removed device_map="auto"
-    )
-    logger.info("Moving model to target device...")
-    model.to(torch.device(device_type)) # Move the entire model to the target device
-    model.eval() # Set model to evaluation mode
-    logger.info("Model loaded successfully.")
-    return tokenizer, model, torch.device(device_type)
-# Load model and tokenizer using the cached function
 MODEL_NAME = "dejanseo/ai-detection-small"
 try:
     tokenizer, model, device = load_model_and_tokenizer(MODEL_NAME)
 except Exception as e:
     st.error(f"Error loading model: {e}")
-    logger.error(f"Failed to load model or tokenizer: {e}", exc_info=True)
-    st.stop() # Stop execution if model loading fails
-# Static settings
 LABELS = ["AI Content", "Human Content"]
-COLORS = ["#ffe5e5", "#e6ffe6"]  # light red, light green
-# Regex-based sentence splitter (improved slightly for robustness)
 def sent_tokenize(text):
-    # Split by '.', '!', '?' followed by space(s) or end of string
-    sentences = re.split(r'(?<=[.!?])\s+', text.strip())
-    # Filter out empty strings that might result from splitting
     return [s for s in sentences if s]
-def split_into_chunks(text, tokenizer, max_length=512):
-    sentences = sent_tokenize(text)
-    if not sentences:
-        return [] # Handle empty input after tokenization
-    chunks, current_chunk_sentences, current_len = [], [], 0
-    max_tokens = max_length - 2 # Account for [CLS] and [SEP] tokens
-    for sent in sentences:
-        # Use tokenizer.encode to get accurate token count (more reliable than tokenize)
-        token_ids = tokenizer.encode(sent, add_special_tokens=False)
-        token_len = len(token_ids)
-        if token_len > max_tokens:
-            # Sentence is too long even by itself, handle appropriately
-            # Option 1: Truncate the sentence (simplest)
-            logger.warning(f"Sentence truncated as it exceeds max_length: '{sent[:100]}...'")
-            truncated_sent = tokenizer.decode(token_ids[:max_tokens])
-            # If there was a previous chunk, add it first
-            if current_chunk_sentences:
-                 chunks.append(" ".join(current_chunk_sentences))
-            chunks.append(truncated_sent) # Add the single truncated sentence as its own chunk
-            current_chunk_sentences, current_len = [], 0 # Reset chunk
-            continue # Move to the next sentence
-        if current_len + token_len <= max_tokens:
-            current_chunk_sentences.append(sent)
-            current_len += token_len
-        else:
-            # Current chunk is full, finalize it
-            if current_chunk_sentences:
-                chunks.append(" ".join(current_chunk_sentences))
-            # Start a new chunk with the current sentence
-            current_chunk_sentences = [sent]
-            current_len = token_len
-    # Add the last remaining chunk
-    if current_chunk_sentences:
-        chunks.append(" ".join(current_chunk_sentences))
-    return chunks
-# --- UI ---
 st.title("AI Article Detection")
-text = st.text_area("Enter text to classify", height=150, placeholder="Paste your text here...")
 if st.button("Classify", type="primary"):
-    if not text or not text.strip():
         st.warning("Please enter some text.")
     else:
-        with st.spinner("Analyzing... Please wait."):
             try:
-                # Split text using the tokenizer reference
-                chunks = split_into_chunks(text, tokenizer, max_length=model.config.max_position_embeddings)
-                logger.info(f"Split text into {len(chunks)} chunks.")
-                if not chunks:
-                     st.warning("Could not process the input text (perhaps it's too short or contains only delimiters?).")
-                     st.stop()
-                # Tokenize chunks and move tensors to the correct device
                 inputs = tokenizer(
-                    chunks,
                     return_tensors="pt",
-                    padding=True,         # Pad sequences to the longest in the batch
-                    truncation=True,      # Truncate sequences longer than max_length
-                    max_length=model.config.max_position_embeddings # Use model's max length
-                ).to(device) # Move inputs to the same device as the model
-                # Perform inference
                 with torch.no_grad():
                     outputs = model(**inputs)
                     logits = outputs.logits
-                    # Ensure probabilities are calculated on CPU if needed for aggregation later
-                    probs = F.softmax(logits, dim=-1).cpu() # Move probs to CPU
-                    preds = torch.argmax(probs, dim=-1) # Argmax on CPU probabilities
-                # Process results
-                chunk_results = []
-                for i, chunk in enumerate(chunks):
-                    pred_index = preds[i].item() # Get prediction index for this chunk
-                    chunk_results.append({
-                        "text": chunk,
-                        "label": LABELS[pred_index],
-                        "color": COLORS[pred_index],
-                        "conf": probs[i, pred_index].item() * 100, # Get confidence for the predicted class
-                    })
-                # Calculate overall prediction based on average probability across chunks
-                if probs.numel() > 0: # Check if probs tensor is not empty
-                    avg_probs = torch.mean(probs, dim=0) # Average probabilities across the batch dimension
-                    final_class_index = torch.argmax(avg_probs).item()
-                    final_label = LABELS[final_class_index]
-                    final_conf = avg_probs[final_class_index].item() * 100
-                    # Display final prediction
-                    st.subheader("📊 Final Prediction")
-                    st.markdown(
-                        f"<div style='background-color:{COLORS[final_class_index]}; padding:1rem; border-radius:0.5rem; border: 1px solid #ccc;'>"
-                        f"Based on the analysis, the text is most likely: <b>{final_label}</b> (Confidence: {final_conf:.1f}%)</div>",
-                        unsafe_allow_html=True
-                    )
-                else:
-                    st.warning("Could not generate predictions for the provided text.")
-                # Display per-chunk predictions in an expander
-                with st.expander("See per-chunk predictions and confidence"):
-                    if chunk_results:
-                        for result in chunk_results:
-                            st.markdown(
-                                f"<div title='Confidence: {result['conf']:.1f}%' "
-                                f"style='background-color:{result['color']}; padding:0.75rem; margin-bottom:0.5rem; border-radius:0.5rem; border: 1px solid #ddd;'>"
-                                f"<i>({result['label']} - {result['conf']:.1f}%)</i><br>{result['text']}</div>",
-                                unsafe_allow_html=True
-                            )
                     else:
-                         st.write("No chunk predictions were generated.")
             except Exception as e:
-                st.error(f"An error occurred during analysis: {e}")
-                logger.error(f"Analysis failed: {e}", exc_info=True)

 import torch.nn.functional as F
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import re
+import logging
+# Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+# Streamlit page config
 st.set_page_config(
     page_title="AI Article Detection by DEJAN",
     page_icon="🧠",
     layout="wide"
 )
+# Logo
+st.markdown(
+    """
+    <a href="https://dejan.ai/" target="_blank">
+      <img src="https://dejan.ai/wp-content/uploads/2024/02/dejan-300x103.png" alt="DEJAN logo">
+    </a>
+    """,
+    unsafe_allow_html=True
 )
+# Custom font
 st.markdown("""
 <link href="https://fonts.googleapis.com/css2?family=Roboto&display=swap" rel="stylesheet">
 <style>
+  html, body, [class*="css"] {
+    font-family: 'Roboto', sans-serif;
+  }
 </style>
 """, unsafe_allow_html=True)
+@st.cache_resource
 def load_model_and_tokenizer(model_name):
     tokenizer = AutoTokenizer.from_pretrained(model_name)
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    dtype = torch.bfloat16 if (device.type == "cuda" and torch.cuda.is_bf16_supported()) else torch.float32
+    model = AutoModelForSequenceClassification.from_pretrained(model_name, torch_dtype=dtype)
+    model.to(device)
+    model.eval()
+    return tokenizer, model, device
 MODEL_NAME = "dejanseo/ai-detection-small"
 try:
     tokenizer, model, device = load_model_and_tokenizer(MODEL_NAME)
 except Exception as e:
     st.error(f"Error loading model: {e}")
+    logger.error("Failed to load model or tokenizer", exc_info=True)
+    st.stop()
+# Labels
 LABELS = ["AI Content", "Human Content"]
+# Sentence splitter
 def sent_tokenize(text):
+    sentences = re.split(r'(?<=[\.!?])\s+', text.strip())
     return [s for s in sentences if s]
+# UI
 st.title("AI Article Detection")
+text = st.text_area("Enter text to classify", height=200)
 if st.button("Classify", type="primary"):
+    if not text.strip():
         st.warning("Please enter some text.")
     else:
+        with st.spinner("Analyzing..."):
             try:
+                sentences = sent_tokenize(text)
+                if not sentences:
+                    st.warning("No sentences detected.")
+                    st.stop()
+                # Tokenize each sentence
                 inputs = tokenizer(
+                    sentences,
                     return_tensors="pt",
+                    padding=True,
+                    truncation=True,
+                    max_length=model.config.max_position_embeddings
+                ).to(device)
+                # Inference
                 with torch.no_grad():
                     outputs = model(**inputs)
                     logits = outputs.logits
+                    probs = F.softmax(logits, dim=-1).cpu()  # shape [n_sentences, 2]
+                    preds = torch.argmax(probs, dim=-1).cpu()
+                # Build inline styled text
+                styled_chunks = []
+                for i, sent in enumerate(sentences):
+                    pred = preds[i].item()
+                    # select color channel
+                    if pred == 0:
+                        r, g = 255, 0   # red for AI
                     else:
+                        r, g = 0, 255   # green for Human
+                    confidence = probs[i, pred].item()  # between 0 and 1
+                    alpha = confidence  # drive opacity directly
+                    # wrap sentence in span
+                    span = (
+                        f"<span "
+                        f"style='background-color: rgba({r},{g},0,{alpha:.2f}); "
+                        f"padding:2px; margin:0 2px; border-radius:3px;'>"
+                        f"{sent}"
+                        f"</span>"
+                    )
+                    styled_chunks.append(span)
+                # join all sentences inline
+                full_text_html = "".join(styled_chunks)
+                st.markdown(full_text_html, unsafe_allow_html=True)
+                # Overall AI likelihood
+                avg_probs = torch.mean(probs, dim=0)
+                ai_likelihood = avg_probs[0].item() * 100  # class 0 is AI
+                st.subheader(f"🤖 AI Likelihood: {ai_likelihood:.1f}%")
             except Exception as e:
+                st.error(f"Analysis error: {e}")
+                logger.error("Classification failed", exc_info=True)