import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from io import BytesIO
import base64
import os
import re
import warnings
warnings.filterwarnings("ignore")
# ML/NLP imports
try:
from sentence_transformers import SentenceTransformer, CrossEncoder
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
import faiss
from rank_bm25 import BM25Okapi
import nltk
from nltk.tokenize import word_tokenize
import pdfplumber
import PyPDF2
from docx import Document
from datasets import load_dataset
ML_IMPORTS_AVAILABLE = True
except ImportError as e:
    # Defer the on-page error until after st.set_page_config below, which should be the first Streamlit call
    ML_IMPORT_ERROR = str(e)
    ML_IMPORTS_AVAILABLE = False
# Download NLTK tokenizer data (used by word_tokenize for BM25 scoring)
if ML_IMPORTS_AVAILABLE:
    try:
        nltk.download('punkt', quiet=True)
        nltk.download('punkt_tab', quiet=True)  # newer NLTK releases look up 'punkt_tab'
        nltk.download('stopwords', quiet=True)
    except Exception:
        pass
# Page configuration
st.set_page_config(
page_title="πŸ€– AI Resume Screener",
page_icon="πŸ€–",
layout="wide",
initial_sidebar_state="expanded"
)
if not ML_IMPORTS_AVAILABLE:
    st.error(f"Missing required ML libraries: {ML_IMPORT_ERROR}")

# Initialize session state
if 'models_loaded' not in st.session_state:
    st.session_state.models_loaded = False
if 'embedding_model' not in st.session_state:
st.session_state.embedding_model = None
if 'cross_encoder' not in st.session_state:
st.session_state.cross_encoder = None
if 'llm_tokenizer' not in st.session_state:
st.session_state.llm_tokenizer = None
if 'llm_model' not in st.session_state:
st.session_state.llm_model = None
if 'model_errors' not in st.session_state:
st.session_state.model_errors = {}
if 'resume_texts' not in st.session_state:
st.session_state.resume_texts = []
if 'resume_filenames' not in st.session_state:
st.session_state.resume_filenames = []
if 'results' not in st.session_state:
st.session_state.results = None
def load_models():
"""Load all ML models at startup"""
if st.session_state.models_loaded:
return
st.info("πŸ”„ Loading AI models... This may take a few minutes on first run.")
# Load embedding model
try:
print("Loading embedding model: BAAI/bge-large-en-v1.5")
st.text("Loading embedding model...")
        try:
            # SentenceTransformer takes a `device` argument (it does not accept `device_map`)
            st.session_state.embedding_model = SentenceTransformer(
                'BAAI/bge-large-en-v1.5',
                device="cuda" if torch.cuda.is_available() else "cpu"
            )
        except Exception as e:
            print(f"GPU load failed, falling back to the default device: {e}")
            st.session_state.embedding_model = SentenceTransformer('BAAI/bge-large-en-v1.5')
print("βœ… Embedding model loaded successfully")
except Exception as e:
print(f"❌ Error loading embedding model: {e}")
st.session_state.model_errors['embedding'] = str(e)
# Load cross-encoder
try:
print("Loading cross-encoder: cross-encoder/ms-marco-MiniLM-L6-v2")
st.text("Loading cross-encoder...")
st.session_state.cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L6-v2')
print("βœ… Cross-encoder loaded successfully")
except Exception as e:
print(f"❌ Error loading cross-encoder: {e}")
st.session_state.model_errors['cross_encoder'] = str(e)
# Load LLM for intent analysis
try:
print("Loading LLM: Qwen/Qwen2-1.5B") # Using smaller model for better compatibility
st.text("Loading LLM for intent analysis...")
# Quantization config
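        # Assumption: a CUDA GPU with bitsandbytes support is available. NF4 4-bit weights with
        # double quantization cut the LLM's memory footprint to roughly a quarter of fp16 at a
        # small quality cost; bfloat16 is used as the compute dtype during generation.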
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16
)
st.session_state.llm_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-1.5B")
st.session_state.llm_model = AutoModelForCausalLM.from_pretrained(
"Qwen/Qwen2-1.5B",
quantization_config=bnb_config,
device_map="auto",
trust_remote_code=True
)
print("βœ… LLM loaded successfully")
except Exception as e:
print(f"❌ Error loading LLM: {e}")
        st.session_state.model_errors['llm'] = str(e)
    st.session_state.models_loaded = True
    if st.session_state.model_errors:
        st.warning("⚠️ Some models failed to load; see the sidebar for details.")
    else:
        st.success("βœ… All models loaded successfully!")
class ResumeScreener:
def __init__(self):
self.embedding_model = st.session_state.embedding_model
self.cross_encoder = st.session_state.cross_encoder
self.llm_tokenizer = st.session_state.llm_tokenizer
self.llm_model = st.session_state.llm_model
# Predefined skills list
self.skills_list = [
'python', 'java', 'javascript', 'react', 'angular', 'vue', 'node.js',
'sql', 'mongodb', 'postgresql', 'mysql', 'aws', 'azure', 'gcp',
'docker', 'kubernetes', 'git', 'machine learning', 'deep learning',
'tensorflow', 'pytorch', 'scikit-learn', 'pandas', 'numpy',
'html', 'css', 'bootstrap', 'tailwind', 'api', 'rest', 'graphql',
'microservices', 'agile', 'scrum', 'devops', 'ci/cd', 'jenkins',
'linux', 'bash', 'shell scripting', 'data analysis', 'statistics',
'excel', 'powerbi', 'tableau', 'spark', 'hadoop', 'kafka',
'redis', 'elasticsearch', 'nginx', 'apache', 'django', 'flask',
'spring', 'express', 'fastapi', 'laravel', 'php', 'c++', 'c#',
'go', 'rust', 'scala', 'r', 'matlab', 'sas', 'spss'
]
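        # Heuristic, hard-coded skill vocabulary; extract_skills() matches these terms against each
        # resume and supplements them with keywords shared between the resume and the job posting.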
def extract_text_from_file(self, file):
"""Extract text from uploaded files"""
try:
if file.type == "application/pdf":
# Try pdfplumber first
try:
with pdfplumber.open(file) as pdf:
text = ""
for page in pdf.pages:
text += page.extract_text() or ""
return text
                except Exception:
                    # Fall back to PyPDF2 if pdfplumber cannot parse the file
                    file.seek(0)
                    reader = PyPDF2.PdfReader(file)
                    text = ""
                    for page in reader.pages:
                        text += page.extract_text() or ""
                    return text
elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
doc = Document(file)
text = ""
for paragraph in doc.paragraphs:
text += paragraph.text + "\n"
return text
elif file.type == "text/plain":
return str(file.read(), "utf-8")
elif file.type == "text/csv":
df = pd.read_csv(file)
return df.to_string()
else:
return "Unsupported file type"
except Exception as e:
st.warning(f"Error extracting text from {file.name}: {str(e)}")
return ""
def get_embedding(self, text):
"""Get embedding for text"""
if not self.embedding_model:
return None
if not text or len(text.strip()) == 0:
return np.zeros(1024) # Default embedding size for BGE
# Truncate if too long
if len(text) > 8000:
text = text[:8000]
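        # This is only a character-level cap; the SentenceTransformer additionally truncates
        # inputs to the encoder's max sequence length (512 tokens for BGE-large).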
try:
embedding = self.embedding_model.encode(text, normalize_embeddings=True)
return embedding
except Exception as e:
st.warning(f"Error getting embedding: {e}")
return np.zeros(1024)
def calculate_bm25_scores(self, resume_texts, job_description):
"""Calculate BM25 scores"""
try:
# Tokenize documents
tokenized_resumes = [word_tokenize(text.lower()) for text in resume_texts]
tokenized_job = word_tokenize(job_description.lower())
# Create BM25 object
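            # BM25Okapi uses rank_bm25's default parameters (k1=1.5, b=0.75) for term-frequency
            # saturation and document-length normalization; raw scores are unbounded and are
            # min-max normalized later in calculate_final_scores().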
bm25 = BM25Okapi(tokenized_resumes)
# Get scores
scores = bm25.get_scores(tokenized_job)
return scores
except Exception as e:
st.warning(f"Error calculating BM25 scores: {e}")
return np.zeros(len(resume_texts))
def faiss_recall(self, resume_texts, job_description, top_k=50):
"""FAISS-based recall for top candidates"""
try:
if not self.embedding_model:
return list(range(min(top_k, len(resume_texts))))
# Get embeddings
resume_embeddings = np.array([self.get_embedding(text) for text in resume_texts])
job_embedding = self.get_embedding(job_description).reshape(1, -1)
# Build FAISS index
dimension = resume_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension) # Inner product for cosine similarity
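            # get_embedding() returns L2-normalized vectors, so inner product here equals cosine
            # similarity; IndexFlatIP performs exact (brute-force) search, which is fine at the
            # scale of a typical resume batch.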
index.add(resume_embeddings.astype('float32'))
# Search
scores, indices = index.search(job_embedding.astype('float32'), min(top_k, len(resume_texts)))
return indices[0].tolist()
except Exception as e:
st.warning(f"Error in FAISS recall: {e}")
return list(range(min(top_k, len(resume_texts))))
def cross_encoder_rerank(self, resume_texts, job_description, candidate_indices, top_k=20):
"""Re-rank candidates using cross-encoder"""
try:
if not self.cross_encoder:
return candidate_indices[:top_k]
# Prepare pairs for cross-encoder
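            # Unlike the bi-encoder used for recall, the cross-encoder reads each (job, resume)
            # pair jointly, which is more accurate but much slower, so it only rescores the
            # candidates that survived FAISS recall.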
pairs = [(job_description, resume_texts[i]) for i in candidate_indices]
# Get scores
scores = self.cross_encoder.predict(pairs)
# Sort by scores and return top_k
scored_indices = list(zip(candidate_indices, scores))
scored_indices.sort(key=lambda x: x[1], reverse=True)
return [idx for idx, _ in scored_indices[:top_k]]
except Exception as e:
st.warning(f"Error in cross-encoder reranking: {e}")
return candidate_indices[:top_k]
def analyze_intent(self, resume_text, job_description):
"""Analyze candidate intent using LLM"""
try:
if not self.llm_model or not self.llm_tokenizer:
return "Maybe", 0.5
prompt = f"""Analyze if this candidate is genuinely interested in this job based on their resume.
Job Description: {job_description[:500]}...
Resume: {resume_text[:1000]}...
Based on the alignment between the candidate's experience and the job requirements, classify their intent as:
- Yes: Strong alignment and genuine interest
- Maybe: Some alignment but unclear intent
- No: Poor alignment or likely not interested
Intent:"""
            inputs = self.llm_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
            # Move inputs onto the model's device (the model may be offloaded to GPU via device_map)
            inputs = {k: v.to(self.llm_model.device) for k, v in inputs.items()}
            with torch.no_grad():
outputs = self.llm_model.generate(
**inputs,
max_new_tokens=10,
temperature=0.1,
do_sample=True,
pad_token_id=self.llm_tokenizer.eos_token_id
)
response = self.llm_tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
# Parse response
if "yes" in response.lower():
return "Yes", 0.9
elif "no" in response.lower():
return "No", 0.1
else:
return "Maybe", 0.5
except Exception as e:
st.warning(f"Error in intent analysis: {e}")
return "Maybe", 0.5
def extract_skills(self, text, job_description):
"""Extract matching skills from resume"""
text_lower = text.lower()
job_lower = job_description.lower()
# Find skills from predefined list
found_skills = []
        for skill in self.skills_list:
            # Match on word boundaries so short skills such as 'r' or 'go' are not found inside other words
            if re.search(r'(?<![a-z0-9])' + re.escape(skill) + r'(?![a-z0-9])', text_lower):
                found_skills.append(skill)
# Extract job-specific keywords (simple approach)
job_words = set(re.findall(r'\b[a-zA-Z]{3,}\b', job_lower))
text_words = set(re.findall(r'\b[a-zA-Z]{3,}\b', text_lower))
        job_specific = sorted(job_words.intersection(text_words))[:10]  # first 10 shared keywords (alphabetical, for determinism)
return {
'technical_skills': found_skills,
'job_specific_keywords': job_specific,
'total_skills': len(found_skills) + len(job_specific)
}
def add_bm25_scores(self, results_df, resume_texts, job_description):
"""Add BM25 scores to results"""
bm25_scores = self.calculate_bm25_scores(resume_texts, job_description)
results_df['bm25_score'] = bm25_scores
return results_df
def add_intent_scores(self, results_df, resume_texts, job_description):
"""Add intent analysis scores"""
intent_labels = []
intent_scores = []
progress_bar = st.progress(0)
for i, text in enumerate(resume_texts):
label, score = self.analyze_intent(text, job_description)
intent_labels.append(label)
intent_scores.append(score)
progress_bar.progress((i + 1) / len(resume_texts))
results_df['intent_label'] = intent_labels
results_df['intent_score'] = intent_scores
return results_df
def calculate_final_scores(self, results_df):
"""Calculate final weighted scores"""
# Normalize scores to 0-1 range
if 'cross_encoder_score' in results_df.columns:
ce_scores = (results_df['cross_encoder_score'] - results_df['cross_encoder_score'].min()) / \
(results_df['cross_encoder_score'].max() - results_df['cross_encoder_score'].min() + 1e-8)
else:
ce_scores = np.zeros(len(results_df))
if 'bm25_score' in results_df.columns:
bm25_scores = (results_df['bm25_score'] - results_df['bm25_score'].min()) / \
(results_df['bm25_score'].max() - results_df['bm25_score'].min() + 1e-8)
else:
bm25_scores = np.zeros(len(results_df))
intent_scores = results_df.get('intent_score', np.ones(len(results_df)) * 0.5)
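        # Worked example: a candidate with normalized scores CE=0.8, BM25=0.6 and intent=0.9
        # gets 0.5*0.8 + 0.3*0.6 + 0.2*0.9 = 0.40 + 0.18 + 0.18 = 0.76.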
        # Weighted combination: 0.5*cross-encoder + 0.3*BM25 + 0.2*intent
        final_scores = 0.5 * ce_scores + 0.3 * bm25_scores + 0.2 * intent_scores
        results_df['final_score'] = final_scores
        # Re-sort by the final score and refresh the rank column so it reflects the final ordering
        results_df = results_df.sort_values('final_score', ascending=False).reset_index(drop=True)
        results_df['rank'] = range(1, len(results_df) + 1)
        return results_df
def advanced_pipeline_ranking(self, resume_texts, resume_filenames, job_description):
"""Run the complete advanced pipeline"""
st.info("πŸš€ Starting advanced pipeline ranking...")
# Stage 1: FAISS Recall
st.text("Stage 1: FAISS-based recall (top 50 candidates)")
top_50_indices = self.faiss_recall(resume_texts, job_description, top_k=50)
# Stage 2: Cross-encoder reranking
st.text("Stage 2: Cross-encoder reranking (top 20 candidates)")
top_20_indices = self.cross_encoder_rerank(resume_texts, job_description, top_50_indices, top_k=20)
# Create results dataframe
results_df = pd.DataFrame({
'rank': range(1, len(top_20_indices) + 1),
'filename': [resume_filenames[i] for i in top_20_indices],
'resume_index': top_20_indices
})
# Stage 3: Add cross-encoder scores
st.text("Stage 3: Adding detailed cross-encoder scores")
if self.cross_encoder:
pairs = [(job_description, resume_texts[i]) for i in top_20_indices]
ce_scores = self.cross_encoder.predict(pairs)
results_df['cross_encoder_score'] = ce_scores
# Stage 4: Add BM25 scores
st.text("Stage 4: Adding BM25 scores")
top_20_texts = [resume_texts[i] for i in top_20_indices]
results_df = self.add_bm25_scores(results_df, top_20_texts, job_description)
# Stage 5: Add intent analysis
st.text("Stage 5: Analyzing candidate intent")
results_df = self.add_intent_scores(results_df, top_20_texts, job_description)
# Calculate final scores
st.text("Calculating final weighted scores...")
results_df = self.calculate_final_scores(results_df)
# Add skills analysis
st.text("Extracting skills and keywords...")
        skills_data = []
        # Iterate in final-ranking order so each skills row lines up with its candidate
        for i in results_df['resume_index']:
            skills = self.extract_skills(resume_texts[i], job_description)
            skills_data.append({
                'top_skills': ', '.join(skills['technical_skills'][:5]),
                'job_keywords': ', '.join(skills['job_specific_keywords'][:5]),
                'total_skills_count': skills['total_skills']
            })
        skills_df = pd.DataFrame(skills_data, index=results_df.index)
        results_df = pd.concat([results_df, skills_df], axis=1)
st.success("βœ… Pipeline completed successfully!")
return results_df
# Load models on startup
if ML_IMPORTS_AVAILABLE and not st.session_state.models_loaded:
load_models()
# Initialize screener
if ML_IMPORTS_AVAILABLE and st.session_state.models_loaded:
screener = ResumeScreener()
# Sidebar
with st.sidebar:
st.title("πŸ€– AI Resume Screener")
st.markdown("---")
st.subheader("πŸ“‹ Pipeline Stages")
st.markdown("""
1. **FAISS Recall**: Semantic similarity search (top 50)
2. **Cross-Encoder**: Deep reranking (top 20)
3. **BM25 Scoring**: Keyword-based relevance
4. **Intent Analysis**: AI-powered candidate intent
5. **Final Ranking**: Weighted score combination
""")
st.subheader("🧠 AI Models")
if st.session_state.models_loaded:
st.success("βœ… Embedding: BGE-Large-EN")
st.success("βœ… Cross-Encoder: MS-Marco-MiniLM")
st.success("βœ… LLM: Qwen2-1.5B")
else:
st.warning("⏳ Models loading...")
if st.session_state.model_errors:
st.error("❌ Model Errors:")
for model, error in st.session_state.model_errors.items():
st.text(f"{model}: {error[:100]}...")
st.subheader("πŸ“Š Scoring Formula")
st.markdown("""
**Final Score = 0.5 Γ— Cross-Encoder + 0.3 Γ— BM25 + 0.2 Γ— Intent**
- Cross-Encoder: Deep semantic matching
- BM25: Keyword relevance
- Intent: Candidate interest level
""")
# Main content
st.title("πŸ€– AI Resume Screener")
st.markdown("Automatically rank candidate resumes against job descriptions using advanced AI")
# Step 1: Job Description Input
st.header("πŸ“ Step 1: Job Description")
job_description = st.text_area(
"Enter the job description:",
height=200,
placeholder="Paste the complete job description here..."
)
# Step 2: Resume Upload
st.header("πŸ“„ Step 2: Load Resumes")
upload_option = st.radio(
"Choose how to load resumes:",
["Upload Files", "Upload CSV", "Load from Hugging Face Dataset"]
)
if upload_option == "Upload Files":
uploaded_files = st.file_uploader(
"Upload resume files",
type=['pdf', 'docx', 'txt'],
accept_multiple_files=True
)
if uploaded_files and st.button("Process Uploaded Files"):
with st.spinner("Processing files..."):
texts = []
filenames = []
for file in uploaded_files:
if ML_IMPORTS_AVAILABLE and st.session_state.models_loaded:
text = screener.extract_text_from_file(file)
if text:
texts.append(text)
filenames.append(file.name)
else:
st.error("Models not loaded. Cannot process files.")
break
st.session_state.resume_texts = texts
st.session_state.resume_filenames = filenames
st.success(f"βœ… Processed {len(texts)} resumes")
elif upload_option == "Upload CSV":
csv_file = st.file_uploader("Upload CSV with resume texts", type=['csv'])
if csv_file:
df = pd.read_csv(csv_file)
st.write("CSV Preview:", df.head())
text_column = st.selectbox("Select text column:", df.columns)
name_column = st.selectbox("Select name/ID column:", df.columns)
if st.button("Load from CSV"):
st.session_state.resume_texts = df[text_column].fillna("").tolist()
st.session_state.resume_filenames = df[name_column].fillna("Unknown").tolist()
st.success(f"βœ… Loaded {len(st.session_state.resume_texts)} resumes from CSV")
elif upload_option == "Load from Hugging Face Dataset":
dataset_name = st.text_input("Dataset name:", "resume-dataset/resume-screening")
if st.button("Load Dataset"):
try:
with st.spinner("Loading dataset..."):
dataset = load_dataset(dataset_name, split="train")
# Try to identify text and name columns
columns = dataset.column_names
text_col = st.selectbox("Select text column:", columns)
name_col = st.selectbox("Select name/ID column:", columns)
if text_col and name_col:
st.session_state.resume_texts = dataset[text_col][:100] # Limit to 100
st.session_state.resume_filenames = [f"Resume_{i}" for i in range(len(st.session_state.resume_texts))]
st.success(f"βœ… Loaded {len(st.session_state.resume_texts)} resumes from dataset")
except Exception as e:
st.error(f"Error loading dataset: {e}")
# Display current resume count
if st.session_state.resume_texts:
st.info(f"πŸ“Š Currently loaded: {len(st.session_state.resume_texts)} resumes")
# Step 3: Run Pipeline
st.header("πŸš€ Step 3: Run Advanced Pipeline")
can_run = (
ML_IMPORTS_AVAILABLE and
st.session_state.models_loaded and
job_description.strip() and
st.session_state.resume_texts
)
if st.button("🎯 Run Advanced Ranking Pipeline", disabled=not can_run):
if not can_run:
if not ML_IMPORTS_AVAILABLE:
st.error("❌ ML libraries not available")
elif not st.session_state.models_loaded:
st.error("❌ Models not loaded")
elif not job_description.strip():
st.error("❌ Please enter a job description")
elif not st.session_state.resume_texts:
st.error("❌ Please load some resumes")
else:
with st.spinner("Running advanced pipeline..."):
results = screener.advanced_pipeline_ranking(
st.session_state.resume_texts,
st.session_state.resume_filenames,
job_description
)
st.session_state.results = results
# Display Results
if st.session_state.results is not None:
st.header("πŸ“Š Results")
# Create tabs for different views
tab1, tab2, tab3 = st.tabs(["πŸ“‹ Summary", "πŸ” Detailed Analysis", "πŸ“ˆ Visualizations"])
with tab1:
st.subheader("Top Ranked Candidates")
# Style the dataframe
display_df = st.session_state.results[['rank', 'filename', 'final_score', 'cross_encoder_score',
'bm25_score', 'intent_score', 'intent_label', 'top_skills']].copy()
display_df['final_score'] = display_df['final_score'].round(3)
display_df['cross_encoder_score'] = display_df['cross_encoder_score'].round(3)
display_df['bm25_score'] = display_df['bm25_score'].round(3)
display_df['intent_score'] = display_df['intent_score'].round(3)
st.dataframe(display_df, use_container_width=True)
# Download link
csv = display_df.to_csv(index=False)
b64 = base64.b64encode(csv.encode()).decode()
href = f'<a href="data:file/csv;base64,{b64}" download="resume_rankings.csv">πŸ“₯ Download Results as CSV</a>'
st.markdown(href, unsafe_allow_html=True)
with tab2:
st.subheader("Detailed Candidate Analysis")
for idx, row in st.session_state.results.iterrows():
with st.expander(f"#{row['rank']} - {row['filename']} (Score: {row['final_score']:.3f})"):
col1, col2 = st.columns(2)
with col1:
st.metric("Final Score", f"{row['final_score']:.3f}")
st.metric("Cross-Encoder", f"{row['cross_encoder_score']:.3f}")
st.metric("BM25 Score", f"{row['bm25_score']:.3f}")
with col2:
st.metric("Intent Score", f"{row['intent_score']:.3f}")
st.metric("Intent Label", row['intent_label'])
st.metric("Skills Count", row['total_skills_count'])
st.write("**Top Skills:**", row['top_skills'])
st.write("**Job Keywords:**", row['job_keywords'])
# Show resume excerpt
resume_text = st.session_state.resume_texts[row['resume_index']]
st.text_area("Resume Excerpt:", resume_text[:500] + "...", height=100, key=f"excerpt_{idx}")
with tab3:
st.subheader("Score Visualizations")
# Score distribution
fig1 = px.bar(
st.session_state.results.head(10),
x='filename',
y='final_score',
title="Top 10 Candidates - Final Scores",
color='final_score',
color_continuous_scale='viridis'
)
        fig1.update_xaxes(tickangle=45)
st.plotly_chart(fig1, use_container_width=True)
# Score breakdown
score_cols = ['cross_encoder_score', 'bm25_score', 'intent_score']
fig2 = go.Figure()
        for col in score_cols:
fig2.add_trace(go.Bar(
name=col.replace('_', ' ').title(),
x=st.session_state.results['filename'].head(10),
y=st.session_state.results[col].head(10)
))
fig2.update_layout(
title="Score Breakdown - Top 10 Candidates",
barmode='group',
xaxis_tickangle=45
)
st.plotly_chart(fig2, use_container_width=True)
# Intent distribution
intent_counts = st.session_state.results['intent_label'].value_counts()
fig3 = px.pie(
values=intent_counts.values,
names=intent_counts.index,
title="Candidate Intent Distribution"
)
st.plotly_chart(fig3, use_container_width=True)
# Average metrics
col1, col2, col3, col4 = st.columns(4)
with col1:
st.metric("Avg Final Score", f"{st.session_state.results['final_score'].mean():.3f}")
with col2:
st.metric("Avg Cross-Encoder", f"{st.session_state.results['cross_encoder_score'].mean():.3f}")
with col3:
st.metric("Avg BM25", f"{st.session_state.results['bm25_score'].mean():.3f}")
with col4:
st.metric("Avg Intent", f"{st.session_state.results['intent_score'].mean():.3f}")
# Cleanup Controls
st.header("🧹 Cleanup")
col1, col2 = st.columns(2)
with col1:
if st.button("Clear Resumes Only"):
st.session_state.resume_texts = []
st.session_state.resume_filenames = []
st.session_state.results = None
st.success("βœ… Resumes cleared")
with col2:
if st.button("Reset Entire App"):
# Clear all session state
for key in list(st.session_state.keys()):
del st.session_state[key]
        # Free GPU memory (guarded: torch is only defined when the ML imports succeeded)
        if ML_IMPORTS_AVAILABLE and torch.cuda.is_available():
            torch.cuda.empty_cache()
        st.success("βœ… App reset complete")
        st.rerun()
# Footer
st.markdown("---")
st.markdown(
"""
<div style='text-align: center; color: #666; font-size: 0.8em;'>
πŸ€– Powered by BGE-Large-EN, MS-Marco-MiniLM, Qwen2-1.5B | Built with Streamlit
</div>
""",
unsafe_allow_html=True
)