import streamlit as st import pandas as pd import numpy as np import plotly.express as px import plotly.graph_objects as go from io import BytesIO import base64 import os import re import warnings warnings.filterwarnings("ignore") # ML/NLP imports try: from sentence_transformers import SentenceTransformer, CrossEncoder from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig import torch import faiss from rank_bm25 import BM25Okapi import nltk from nltk.tokenize import word_tokenize import pdfplumber import PyPDF2 from docx import Document from datasets import load_dataset ML_IMPORTS_AVAILABLE = True except ImportError as e: st.error(f"Missing required ML libraries: {e}") ML_IMPORTS_AVAILABLE = False # Download NLTK data try: nltk.download('punkt', quiet=True) nltk.download('stopwords', quiet=True) except: pass # Page configuration st.set_page_config( page_title="๐Ÿค– AI Resume Screener", page_icon="๐Ÿค–", layout="wide", initial_sidebar_state="expanded" ) # Initialize session state if 'models_loaded' not in st.session_state: st.session_state.models_loaded = False if 'embedding_model' not in st.session_state: st.session_state.embedding_model = None if 'cross_encoder' not in st.session_state: st.session_state.cross_encoder = None if 'llm_tokenizer' not in st.session_state: st.session_state.llm_tokenizer = None if 'llm_model' not in st.session_state: st.session_state.llm_model = None if 'model_errors' not in st.session_state: st.session_state.model_errors = {} if 'resume_texts' not in st.session_state: st.session_state.resume_texts = [] if 'resume_filenames' not in st.session_state: st.session_state.resume_filenames = [] if 'results' not in st.session_state: st.session_state.results = None def load_models(): """Load all ML models at startup""" if st.session_state.models_loaded: return st.info("๐Ÿ”„ Loading AI models... This may take a few minutes on first run.") # Load embedding model try: print("Loading embedding model: BAAI/bge-large-en-v1.5") st.text("Loading embedding model...") try: st.session_state.embedding_model = SentenceTransformer( 'BAAI/bge-large-en-v1.5', device_map="auto" ) except Exception as e: print(f"Device map failed, falling back to default: {e}") st.session_state.embedding_model = SentenceTransformer('BAAI/bge-large-en-v1.5') print("โœ… Embedding model loaded successfully") except Exception as e: print(f"โŒ Error loading embedding model: {e}") st.session_state.model_errors['embedding'] = str(e) # Load cross-encoder try: print("Loading cross-encoder: cross-encoder/ms-marco-MiniLM-L6-v2") st.text("Loading cross-encoder...") st.session_state.cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L6-v2') print("โœ… Cross-encoder loaded successfully") except Exception as e: print(f"โŒ Error loading cross-encoder: {e}") st.session_state.model_errors['cross_encoder'] = str(e) # Load LLM for intent analysis try: print("Loading LLM: Qwen/Qwen2-1.5B") # Using smaller model for better compatibility st.text("Loading LLM for intent analysis...") # Quantization config bnb_config = BitsAndBytesConfig( load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16 ) st.session_state.llm_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-1.5B") st.session_state.llm_model = AutoModelForCausalLM.from_pretrained( "Qwen/Qwen2-1.5B", quantization_config=bnb_config, device_map="auto", trust_remote_code=True ) print("โœ… LLM loaded successfully") except Exception as e: print(f"โŒ Error loading LLM: {e}") st.session_state.model_errors['llm'] = str(e) st.session_state.models_loaded = True st.success("โœ… All models loaded successfully!") class ResumeScreener: def __init__(self): self.embedding_model = st.session_state.embedding_model self.cross_encoder = st.session_state.cross_encoder self.llm_tokenizer = st.session_state.llm_tokenizer self.llm_model = st.session_state.llm_model # Predefined skills list self.skills_list = [ 'python', 'java', 'javascript', 'react', 'angular', 'vue', 'node.js', 'sql', 'mongodb', 'postgresql', 'mysql', 'aws', 'azure', 'gcp', 'docker', 'kubernetes', 'git', 'machine learning', 'deep learning', 'tensorflow', 'pytorch', 'scikit-learn', 'pandas', 'numpy', 'html', 'css', 'bootstrap', 'tailwind', 'api', 'rest', 'graphql', 'microservices', 'agile', 'scrum', 'devops', 'ci/cd', 'jenkins', 'linux', 'bash', 'shell scripting', 'data analysis', 'statistics', 'excel', 'powerbi', 'tableau', 'spark', 'hadoop', 'kafka', 'redis', 'elasticsearch', 'nginx', 'apache', 'django', 'flask', 'spring', 'express', 'fastapi', 'laravel', 'php', 'c++', 'c#', 'go', 'rust', 'scala', 'r', 'matlab', 'sas', 'spss' ] def extract_text_from_file(self, file): """Extract text from uploaded files""" try: if file.type == "application/pdf": # Try pdfplumber first try: with pdfplumber.open(file) as pdf: text = "" for page in pdf.pages: text += page.extract_text() or "" return text except: # Fallback to PyPDF2 file.seek(0) reader = PyPDF2.PdfReader(file) text = "" for page in reader.pages: text += page.extract_text() return text elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document": doc = Document(file) text = "" for paragraph in doc.paragraphs: text += paragraph.text + "\n" return text elif file.type == "text/plain": return str(file.read(), "utf-8") elif file.type == "text/csv": df = pd.read_csv(file) return df.to_string() else: return "Unsupported file type" except Exception as e: st.warning(f"Error extracting text from {file.name}: {str(e)}") return "" def get_embedding(self, text): """Get embedding for text""" if not self.embedding_model: return None if not text or len(text.strip()) == 0: return np.zeros(1024) # Default embedding size for BGE # Truncate if too long if len(text) > 8000: text = text[:8000] try: embedding = self.embedding_model.encode(text, normalize_embeddings=True) return embedding except Exception as e: st.warning(f"Error getting embedding: {e}") return np.zeros(1024) def calculate_bm25_scores(self, resume_texts, job_description): """Calculate BM25 scores""" try: # Tokenize documents tokenized_resumes = [word_tokenize(text.lower()) for text in resume_texts] tokenized_job = word_tokenize(job_description.lower()) # Create BM25 object bm25 = BM25Okapi(tokenized_resumes) # Get scores scores = bm25.get_scores(tokenized_job) return scores except Exception as e: st.warning(f"Error calculating BM25 scores: {e}") return np.zeros(len(resume_texts)) def faiss_recall(self, resume_texts, job_description, top_k=50): """FAISS-based recall for top candidates""" try: if not self.embedding_model: return list(range(min(top_k, len(resume_texts)))) # Get embeddings resume_embeddings = np.array([self.get_embedding(text) for text in resume_texts]) job_embedding = self.get_embedding(job_description).reshape(1, -1) # Build FAISS index dimension = resume_embeddings.shape[1] index = faiss.IndexFlatIP(dimension) # Inner product for cosine similarity index.add(resume_embeddings.astype('float32')) # Search scores, indices = index.search(job_embedding.astype('float32'), min(top_k, len(resume_texts))) return indices[0].tolist() except Exception as e: st.warning(f"Error in FAISS recall: {e}") return list(range(min(top_k, len(resume_texts)))) def cross_encoder_rerank(self, resume_texts, job_description, candidate_indices, top_k=20): """Re-rank candidates using cross-encoder""" try: if not self.cross_encoder: return candidate_indices[:top_k] # Prepare pairs for cross-encoder pairs = [(job_description, resume_texts[i]) for i in candidate_indices] # Get scores scores = self.cross_encoder.predict(pairs) # Sort by scores and return top_k scored_indices = list(zip(candidate_indices, scores)) scored_indices.sort(key=lambda x: x[1], reverse=True) return [idx for idx, _ in scored_indices[:top_k]] except Exception as e: st.warning(f"Error in cross-encoder reranking: {e}") return candidate_indices[:top_k] def analyze_intent(self, resume_text, job_description): """Analyze candidate intent using LLM""" try: if not self.llm_model or not self.llm_tokenizer: return "Maybe", 0.5 prompt = f"""Analyze if this candidate is genuinely interested in this job based on their resume. Job Description: {job_description[:500]}... Resume: {resume_text[:1000]}... Based on the alignment between the candidate's experience and the job requirements, classify their intent as: - Yes: Strong alignment and genuine interest - Maybe: Some alignment but unclear intent - No: Poor alignment or likely not interested Intent:""" inputs = self.llm_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024) with torch.no_grad(): outputs = self.llm_model.generate( **inputs, max_new_tokens=10, temperature=0.1, do_sample=True, pad_token_id=self.llm_tokenizer.eos_token_id ) response = self.llm_tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True) # Parse response if "yes" in response.lower(): return "Yes", 0.9 elif "no" in response.lower(): return "No", 0.1 else: return "Maybe", 0.5 except Exception as e: st.warning(f"Error in intent analysis: {e}") return "Maybe", 0.5 def extract_skills(self, text, job_description): """Extract matching skills from resume""" text_lower = text.lower() job_lower = job_description.lower() # Find skills from predefined list found_skills = [] for skill in self.skills_list: if skill in text_lower: found_skills.append(skill) # Extract job-specific keywords (simple approach) job_words = set(re.findall(r'\b[a-zA-Z]{3,}\b', job_lower)) text_words = set(re.findall(r'\b[a-zA-Z]{3,}\b', text_lower)) job_specific = list(job_words.intersection(text_words))[:10] # Top 10 return { 'technical_skills': found_skills, 'job_specific_keywords': job_specific, 'total_skills': len(found_skills) + len(job_specific) } def add_bm25_scores(self, results_df, resume_texts, job_description): """Add BM25 scores to results""" bm25_scores = self.calculate_bm25_scores(resume_texts, job_description) results_df['bm25_score'] = bm25_scores return results_df def add_intent_scores(self, results_df, resume_texts, job_description): """Add intent analysis scores""" intent_labels = [] intent_scores = [] progress_bar = st.progress(0) for i, text in enumerate(resume_texts): label, score = self.analyze_intent(text, job_description) intent_labels.append(label) intent_scores.append(score) progress_bar.progress((i + 1) / len(resume_texts)) results_df['intent_label'] = intent_labels results_df['intent_score'] = intent_scores return results_df def calculate_final_scores(self, results_df): """Calculate final weighted scores""" # Normalize scores to 0-1 range if 'cross_encoder_score' in results_df.columns: ce_scores = (results_df['cross_encoder_score'] - results_df['cross_encoder_score'].min()) / \ (results_df['cross_encoder_score'].max() - results_df['cross_encoder_score'].min() + 1e-8) else: ce_scores = np.zeros(len(results_df)) if 'bm25_score' in results_df.columns: bm25_scores = (results_df['bm25_score'] - results_df['bm25_score'].min()) / \ (results_df['bm25_score'].max() - results_df['bm25_score'].min() + 1e-8) else: bm25_scores = np.zeros(len(results_df)) intent_scores = results_df.get('intent_score', np.ones(len(results_df)) * 0.5) # Weighted combination final_scores = 0.5 * ce_scores + 0.3 * bm25_scores + 0.2 * intent_scores results_df['final_score'] = final_scores return results_df.sort_values('final_score', ascending=False) def advanced_pipeline_ranking(self, resume_texts, resume_filenames, job_description): """Run the complete advanced pipeline""" st.info("๐Ÿš€ Starting advanced pipeline ranking...") # Stage 1: FAISS Recall st.text("Stage 1: FAISS-based recall (top 50 candidates)") top_50_indices = self.faiss_recall(resume_texts, job_description, top_k=50) # Stage 2: Cross-encoder reranking st.text("Stage 2: Cross-encoder reranking (top 20 candidates)") top_20_indices = self.cross_encoder_rerank(resume_texts, job_description, top_50_indices, top_k=20) # Create results dataframe results_df = pd.DataFrame({ 'rank': range(1, len(top_20_indices) + 1), 'filename': [resume_filenames[i] for i in top_20_indices], 'resume_index': top_20_indices }) # Stage 3: Add cross-encoder scores st.text("Stage 3: Adding detailed cross-encoder scores") if self.cross_encoder: pairs = [(job_description, resume_texts[i]) for i in top_20_indices] ce_scores = self.cross_encoder.predict(pairs) results_df['cross_encoder_score'] = ce_scores # Stage 4: Add BM25 scores st.text("Stage 4: Adding BM25 scores") top_20_texts = [resume_texts[i] for i in top_20_indices] results_df = self.add_bm25_scores(results_df, top_20_texts, job_description) # Stage 5: Add intent analysis st.text("Stage 5: Analyzing candidate intent") results_df = self.add_intent_scores(results_df, top_20_texts, job_description) # Calculate final scores st.text("Calculating final weighted scores...") results_df = self.calculate_final_scores(results_df) # Add skills analysis st.text("Extracting skills and keywords...") skills_data = [] for i in top_20_indices: skills = self.extract_skills(resume_texts[i], job_description) skills_data.append({ 'top_skills': ', '.join(skills['technical_skills'][:5]), 'job_keywords': ', '.join(skills['job_specific_keywords'][:5]), 'total_skills_count': skills['total_skills'] }) skills_df = pd.DataFrame(skills_data) results_df = pd.concat([results_df, skills_df], axis=1) st.success("โœ… Pipeline completed successfully!") return results_df # Load models on startup if ML_IMPORTS_AVAILABLE and not st.session_state.models_loaded: load_models() # Initialize screener if ML_IMPORTS_AVAILABLE and st.session_state.models_loaded: screener = ResumeScreener() # Sidebar with st.sidebar: st.title("๐Ÿค– AI Resume Screener") st.markdown("---") st.subheader("๐Ÿ“‹ Pipeline Stages") st.markdown(""" 1. **FAISS Recall**: Semantic similarity search (top 50) 2. **Cross-Encoder**: Deep reranking (top 20) 3. **BM25 Scoring**: Keyword-based relevance 4. **Intent Analysis**: AI-powered candidate intent 5. **Final Ranking**: Weighted score combination """) st.subheader("๐Ÿง  AI Models") if st.session_state.models_loaded: st.success("โœ… Embedding: BGE-Large-EN") st.success("โœ… Cross-Encoder: MS-Marco-MiniLM") st.success("โœ… LLM: Qwen2-1.5B") else: st.warning("โณ Models loading...") if st.session_state.model_errors: st.error("โŒ Model Errors:") for model, error in st.session_state.model_errors.items(): st.text(f"{model}: {error[:100]}...") st.subheader("๐Ÿ“Š Scoring Formula") st.markdown(""" **Final Score = 0.5 ร— Cross-Encoder + 0.3 ร— BM25 + 0.2 ร— Intent** - Cross-Encoder: Deep semantic matching - BM25: Keyword relevance - Intent: Candidate interest level """) # Main content st.title("๐Ÿค– AI Resume Screener") st.markdown("Automatically rank candidate resumes against job descriptions using advanced AI") # Step 1: Job Description Input st.header("๐Ÿ“ Step 1: Job Description") job_description = st.text_area( "Enter the job description:", height=200, placeholder="Paste the complete job description here..." ) # Step 2: Resume Upload st.header("๐Ÿ“„ Step 2: Load Resumes") upload_option = st.radio( "Choose how to load resumes:", ["Upload Files", "Upload CSV", "Load from Hugging Face Dataset"] ) if upload_option == "Upload Files": uploaded_files = st.file_uploader( "Upload resume files", type=['pdf', 'docx', 'txt'], accept_multiple_files=True ) if uploaded_files and st.button("Process Uploaded Files"): with st.spinner("Processing files..."): texts = [] filenames = [] for file in uploaded_files: if ML_IMPORTS_AVAILABLE and st.session_state.models_loaded: text = screener.extract_text_from_file(file) if text: texts.append(text) filenames.append(file.name) else: st.error("Models not loaded. Cannot process files.") break st.session_state.resume_texts = texts st.session_state.resume_filenames = filenames st.success(f"โœ… Processed {len(texts)} resumes") elif upload_option == "Upload CSV": csv_file = st.file_uploader("Upload CSV with resume texts", type=['csv']) if csv_file: df = pd.read_csv(csv_file) st.write("CSV Preview:", df.head()) text_column = st.selectbox("Select text column:", df.columns) name_column = st.selectbox("Select name/ID column:", df.columns) if st.button("Load from CSV"): st.session_state.resume_texts = df[text_column].fillna("").tolist() st.session_state.resume_filenames = df[name_column].fillna("Unknown").tolist() st.success(f"โœ… Loaded {len(st.session_state.resume_texts)} resumes from CSV") elif upload_option == "Load from Hugging Face Dataset": dataset_name = st.text_input("Dataset name:", "resume-dataset/resume-screening") if st.button("Load Dataset"): try: with st.spinner("Loading dataset..."): dataset = load_dataset(dataset_name, split="train") # Try to identify text and name columns columns = dataset.column_names text_col = st.selectbox("Select text column:", columns) name_col = st.selectbox("Select name/ID column:", columns) if text_col and name_col: st.session_state.resume_texts = dataset[text_col][:100] # Limit to 100 st.session_state.resume_filenames = [f"Resume_{i}" for i in range(len(st.session_state.resume_texts))] st.success(f"โœ… Loaded {len(st.session_state.resume_texts)} resumes from dataset") except Exception as e: st.error(f"Error loading dataset: {e}") # Display current resume count if st.session_state.resume_texts: st.info(f"๐Ÿ“Š Currently loaded: {len(st.session_state.resume_texts)} resumes") # Step 3: Run Pipeline st.header("๐Ÿš€ Step 3: Run Advanced Pipeline") can_run = ( ML_IMPORTS_AVAILABLE and st.session_state.models_loaded and job_description.strip() and st.session_state.resume_texts ) if st.button("๐ŸŽฏ Run Advanced Ranking Pipeline", disabled=not can_run): if not can_run: if not ML_IMPORTS_AVAILABLE: st.error("โŒ ML libraries not available") elif not st.session_state.models_loaded: st.error("โŒ Models not loaded") elif not job_description.strip(): st.error("โŒ Please enter a job description") elif not st.session_state.resume_texts: st.error("โŒ Please load some resumes") else: with st.spinner("Running advanced pipeline..."): results = screener.advanced_pipeline_ranking( st.session_state.resume_texts, st.session_state.resume_filenames, job_description ) st.session_state.results = results # Display Results if st.session_state.results is not None: st.header("๐Ÿ“Š Results") # Create tabs for different views tab1, tab2, tab3 = st.tabs(["๐Ÿ“‹ Summary", "๐Ÿ” Detailed Analysis", "๐Ÿ“ˆ Visualizations"]) with tab1: st.subheader("Top Ranked Candidates") # Style the dataframe display_df = st.session_state.results[['rank', 'filename', 'final_score', 'cross_encoder_score', 'bm25_score', 'intent_score', 'intent_label', 'top_skills']].copy() display_df['final_score'] = display_df['final_score'].round(3) display_df['cross_encoder_score'] = display_df['cross_encoder_score'].round(3) display_df['bm25_score'] = display_df['bm25_score'].round(3) display_df['intent_score'] = display_df['intent_score'].round(3) st.dataframe(display_df, use_container_width=True) # Download link csv = display_df.to_csv(index=False) b64 = base64.b64encode(csv.encode()).decode() href = f'๐Ÿ“ฅ Download Results as CSV' st.markdown(href, unsafe_allow_html=True) with tab2: st.subheader("Detailed Candidate Analysis") for idx, row in st.session_state.results.iterrows(): with st.expander(f"#{row['rank']} - {row['filename']} (Score: {row['final_score']:.3f})"): col1, col2 = st.columns(2) with col1: st.metric("Final Score", f"{row['final_score']:.3f}") st.metric("Cross-Encoder", f"{row['cross_encoder_score']:.3f}") st.metric("BM25 Score", f"{row['bm25_score']:.3f}") with col2: st.metric("Intent Score", f"{row['intent_score']:.3f}") st.metric("Intent Label", row['intent_label']) st.metric("Skills Count", row['total_skills_count']) st.write("**Top Skills:**", row['top_skills']) st.write("**Job Keywords:**", row['job_keywords']) # Show resume excerpt resume_text = st.session_state.resume_texts[row['resume_index']] st.text_area("Resume Excerpt:", resume_text[:500] + "...", height=100, key=f"excerpt_{idx}") with tab3: st.subheader("Score Visualizations") # Score distribution fig1 = px.bar( st.session_state.results.head(10), x='filename', y='final_score', title="Top 10 Candidates - Final Scores", color='final_score', color_continuous_scale='viridis' ) fig1.update_xaxis(tickangle=45) st.plotly_chart(fig1, use_container_width=True) # Score breakdown score_cols = ['cross_encoder_score', 'bm25_score', 'intent_score'] fig2 = go.Figure() for i, col in enumerate(score_cols): fig2.add_trace(go.Bar( name=col.replace('_', ' ').title(), x=st.session_state.results['filename'].head(10), y=st.session_state.results[col].head(10) )) fig2.update_layout( title="Score Breakdown - Top 10 Candidates", barmode='group', xaxis_tickangle=45 ) st.plotly_chart(fig2, use_container_width=True) # Intent distribution intent_counts = st.session_state.results['intent_label'].value_counts() fig3 = px.pie( values=intent_counts.values, names=intent_counts.index, title="Candidate Intent Distribution" ) st.plotly_chart(fig3, use_container_width=True) # Average metrics col1, col2, col3, col4 = st.columns(4) with col1: st.metric("Avg Final Score", f"{st.session_state.results['final_score'].mean():.3f}") with col2: st.metric("Avg Cross-Encoder", f"{st.session_state.results['cross_encoder_score'].mean():.3f}") with col3: st.metric("Avg BM25", f"{st.session_state.results['bm25_score'].mean():.3f}") with col4: st.metric("Avg Intent", f"{st.session_state.results['intent_score'].mean():.3f}") # Cleanup Controls st.header("๐Ÿงน Cleanup") col1, col2 = st.columns(2) with col1: if st.button("Clear Resumes Only"): st.session_state.resume_texts = [] st.session_state.resume_filenames = [] st.session_state.results = None st.success("โœ… Resumes cleared") with col2: if st.button("Reset Entire App"): # Clear all session state for key in list(st.session_state.keys()): del st.session_state[key] # Free GPU memory if torch.cuda.is_available(): torch.cuda.empty_cache() st.success("โœ… App reset complete") st.experimental_rerun() # Footer st.markdown("---") st.markdown( """
๐Ÿค– Powered by BGE-Large-EN, MS-Marco-MiniLM, Qwen2-1.5B | Built with Streamlit
""", unsafe_allow_html=True )