"""Streamlit app: classify an uploaded resume into a job role.

Loads a pre-trained classifier and matching text vectorizer (joblib files)
at startup, predicts a role for each uploaded PDF/TXT resume, and logs user
corrections to a CSV so predictions can be audited later.
"""

import os
import re
import uuid
from datetime import datetime

import joblib
import pandas as pd
import PyPDF2
import streamlit as st

# Load the trained classifier and its matching vectorizer once at startup.
classifier_model = joblib.load('resume_classifier')
resume_vectorizer = joblib.load('resume_vectorizer')


def transfer_tmp_logs():
    """Merge corrections logged under /tmp into the persistent corrections log.

    Hosted environments often only allow runtime writes to /tmp, so corrections
    are written there first and merged into the repo-local ``corrections_log.csv``
    here.  On ``serial_id`` collisions the newer (/tmp) row wins.

    NOTE(review): this helper is never called anywhere in this script — wire it
    up (e.g. at startup or behind an admin button) or remove it.
    """
    tmp_log_path = "/tmp/corrections_log.csv"
    main_log_path = "corrections_log.csv"

    if not os.path.exists(tmp_log_path):
        return  # No new logs to transfer

    tmp_df = pd.read_csv(tmp_log_path)
    if os.path.exists(main_log_path):
        main_df = pd.read_csv(main_log_path)
        # Merge without duplicates; keep="last" lets /tmp rows override old ones.
        combined_df = pd.concat([main_df, tmp_df]).drop_duplicates(
            subset=["serial_id"], keep="last"
        )
    else:
        combined_df = tmp_df

    combined_df.to_csv(main_log_path, index=False)
    # Remove the /tmp file so the same rows are not re-merged on the next call.
    os.remove(tmp_log_path)


def read_uploaded_file(uploaded_file):
    """Extract plain text from an uploaded PDF or TXT file.

    Returns the extracted text, or ``None`` when the file type is unsupported
    or extraction fails.  Callers must check for ``None`` — previously errors
    were signalled via magic strings, which let the literal text
    "Unsupported file type." be classified as a resume.
    """
    ext = os.path.splitext(uploaded_file.name)[1].lower()
    try:
        if ext == ".pdf":
            reader = PyPDF2.PdfReader(uploaded_file)
            # extract_text() may return None for image-only pages; skip those.
            page_texts = (page.extract_text() for page in reader.pages)
            return "\n".join(t for t in page_texts if t).strip()
        if ext == ".txt":
            return uploaded_file.read().decode("utf-8").strip()
        # .doc/.docx are accepted by the uploader widget but no parser exists.
        return None
    except Exception:
        # Corrupt/encrypted PDFs, bad text encodings, etc. — treat as unreadable.
        return None


def clean_resume(text):
    """Normalize resume text for the vectorizer: letters only, lowercased."""
    return re.sub(r'[^a-zA-Z]', ' ', text).lower()


def log_or_update(serial_id, timestamp, resume_text, model_prediction, corrected_prediction):
    """Append a prediction record to the /tmp corrections log, or update one.

    If ``serial_id`` already exists (same upload session re-run), only the
    user's corrected label is updated; otherwise a new row is appended.
    """
    log_file = "/tmp/corrections_log.csv"
    resume_text_short = resume_text[:500]  # Truncate for privacy/log size

    new_row = {
        "serial_id": serial_id,
        "timestamp": timestamp,
        "resume_text": resume_text_short,
        "model_prediction": model_prediction,
        "corrected_prediction": corrected_prediction,
    }

    if os.path.exists(log_file):
        df = pd.read_csv(log_file)
        if serial_id in df["serial_id"].values:
            df.loc[df["serial_id"] == serial_id, "corrected_prediction"] = corrected_prediction
        else:
            df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
    else:
        df = pd.DataFrame([new_row])

    df.to_csv(log_file, index=False)


# ---------------------------------------------------------------------------
# Streamlit UI
# ---------------------------------------------------------------------------
st.title("📄 Resume Role Classifier")

uploaded_file = st.file_uploader(
    "Upload your resume (PDF, TXT format)",
    type=["pdf", "txt", "doc", "docx"],
)

if uploaded_file:
    # Reset the read pointer in case the file object was consumed on a rerun.
    uploaded_file.seek(0)

    # Start a fresh logging session (new serial_id) when a new file arrives.
    if (
        "uploaded_file_name" not in st.session_state
        or st.session_state.uploaded_file_name != uploaded_file.name
    ):
        st.session_state.uploaded_file_name = uploaded_file.name
        st.session_state.serial_id = str(uuid.uuid4())
        st.session_state.corrected_prediction = None

    extracted_text = read_uploaded_file(uploaded_file)

    # None => unsupported type or extraction error; "" => e.g. image-only PDF.
    if not extracted_text:
        st.warning("⚠️ Could not extract text from the uploaded file.")
    else:
        cleaned_text = clean_resume(extracted_text)
        new_input = resume_vectorizer.transform([cleaned_text])
        prediction = classifier_model.predict(new_input)[0]

        st.write(f"**Predicted Role:** `{prediction}`")

        feedback = st.radio(
            "Is this prediction correct?", ("Yes", "No"), key="feedback_radio"
        )

        corrected_prediction = prediction
        if feedback == "No":
            corrected_prediction = st.text_input(
                "Please provide the correct role:",
                value=st.session_state.get("corrected_prediction", ""),
                key="correction_input",
            )
            st.session_state.corrected_prediction = corrected_prediction
        else:
            st.session_state.corrected_prediction = prediction

        # Log only once a final answer exists: confirmed, or a non-empty correction.
        if (feedback == "Yes") or (feedback == "No" and corrected_prediction):
            now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            log_or_update(
                serial_id=st.session_state.serial_id,
                timestamp=now,
                resume_text=extracted_text,
                model_prediction=prediction,
                corrected_prediction=corrected_prediction,
            )
            st.success(f"✅ Final role recorded: `{corrected_prediction}`")
else:
    st.info("📤 Please upload a supported file (PDF, TXT, DOC, DOCX).")