"""Streamlit app that predicts StackOverflow tags for a programming question excerpt."""

import re
import string

import contractions
import emoji
import joblib
import numpy as np
import pandas as pd
import streamlit as st
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from unidecode import unidecode

# Custom CSS Styling
st.markdown("""
""", unsafe_allow_html=True)

# Header
st.markdown("""
<h1>📌 StackOverflow Tag Predictor</h1>
<p>Enter a programming question to see predicted tags</p>
""", unsafe_allow_html=True)

# Initialize components
stemmer = PorterStemmer()
stop_words = set(ENGLISH_STOP_WORDS)
chat_words = {
    "brb": "be right back",
    "btw": "by the way",
    "lol": "laugh out loud",
    "afaik": "as far as i know",
    "imo": "in my opinion",
    "tbh": "to be honest",
    "idk": "i don't know",
    "asap": "as soon as possible",
    "np": "no problem",
    "thx": "thanks",
    "pls": "please",
    "fyi": "for your information",
}


def preprocess_text(text):
    """Clean a question excerpt: strip markup, normalize, drop stop words, stem."""
    if not isinstance(text, str) or not text.strip():
        return ""
    try:
        text = re.sub(r'<[^>]+>', '', text)                 # remove HTML tags
        text = re.sub(r'https?://\S+|www\.\S+', '', text)   # remove URLs
        text = emoji.demojize(text, delimiters=(" ", " "))  # emoji -> words
        text = unidecode(text)                              # transliterate to ASCII
        text = contractions.fix(text)                       # expand contractions
        text = text.lower()
        words = text.split()
        text = " ".join([chat_words.get(word.lower(), word) for word in words])
        text = text.translate(str.maketrans('', '', string.punctuation))
        tokens = re.findall(r'\b\w+\b', text)
        tokens = [word for word in tokens if word not in stop_words]
        tokens = [stemmer.stem(word) for word in tokens]
        return " ".join(tokens)
    except Exception as e:
        st.error(f"Preprocessing error: {e}")
        return ""


@st.cache_resource
def load_models():
    """Load the trained classifier and the fitted tag binarizer from disk."""
    try:
        model = joblib.load("tag_model.joblib")
        mlb = joblib.load("tag_binarizer.joblib")
        return model, mlb
    except Exception as e:
        st.error(f"Error loading model: {e}")
        return None, None


model, mlb = load_models()

# Input
user_input = st.text_area(
    "✍️ Paste your programming question below:",
    height=200,
    placeholder="e.g., How to reverse a list in Python?",
)
# Prediction
if st.button("🚀 Predict Tags"):
    if not user_input.strip():
        st.warning("Please enter your question to get predictions.")
    elif model is None or mlb is None:
        st.error("Model loading failed.")
    else:
        with st.spinner("Processing..."):
            processed = preprocess_text(user_input)
            if processed:
                try:
                    input_df = pd.DataFrame({'processed_excerpt': [processed]})
                    if hasattr(model, "predict_proba"):
                        # Rank all tags by probability and keep the top five.
                        probs = model.predict_proba(input_df)[0]
                        top_idx = np.argsort(probs)[-5:][::-1]
                        tags = [mlb.classes_[i] for i in top_idx]
                        confs = [int(probs[i] * 100) for i in top_idx]
                    elif hasattr(model, "decision_function"):
                        # No probabilities available; rank by decision scores.
                        scores = model.decision_function(input_df)[0]
                        top_idx = np.argsort(scores)[-5:][::-1]
                        tags = [mlb.classes_[i] for i in top_idx]
                        confs = [None] * 5
                    else:
                        # Fall back to the model's binary predictions.
                        preds = model.predict(input_df)
                        tags = mlb.inverse_transform(preds)[0]
                        confs = [None] * len(tags)

                    # Output
                    st.markdown('<h4>🏷️ Predicted Tags:</h4>', unsafe_allow_html=True)
                    for tag, conf in zip(tags, confs):
                        confidence = f" ({conf}%)" if conf is not None else ""
                        st.markdown(f'{tag}{confidence}', unsafe_allow_html=True)
                except Exception as e:
                    st.error(f"Prediction error: {e}")

# Footer
st.markdown("""
""", unsafe_allow_html=True)
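
# ---------------------------------------------------------------------------
# Not part of the app above: a minimal sketch of how the two artifacts it
# loads ("tag_model.joblib" and "tag_binarizer.joblib") could be produced.
# The real training code is not included in this file, so the estimator
# choice, the helper name, and the `train_df` / `tag_lists` arguments below
# are illustrative assumptions. The only constraints carried over from the
# app are that the model must accept a DataFrame with a 'processed_excerpt'
# column and that the binarizer maps indicator columns back to tag names.
# ---------------------------------------------------------------------------
def _train_and_export_sketch(train_df, tag_lists):
    """Fit a TF-IDF + one-vs-rest classifier and dump the joblib artifacts.

    train_df: DataFrame with a 'processed_excerpt' column (text already run
    through preprocess_text). tag_lists: one list of tag strings per row.
    """
    from sklearn.compose import ColumnTransformer
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import MultiLabelBinarizer

    binarizer = MultiLabelBinarizer()
    y = binarizer.fit_transform(tag_lists)  # tags -> binary indicator matrix

    # Pipeline form matters: the app calls predict_proba on a one-column
    # DataFrame, which a ColumnTransformer-based pipeline accepts directly.
    pipeline = Pipeline([
        ("features", ColumnTransformer(
            [("text", TfidfVectorizer(), "processed_excerpt")])),
        ("clf", OneVsRestClassifier(LogisticRegression(max_iter=1000))),
    ])
    pipeline.fit(train_df[["processed_excerpt"]], y)

    joblib.dump(pipeline, "tag_model.joblib")
    joblib.dump(binarizer, "tag_binarizer.joblib")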