from flask import Flask, render_template, request, session, redirect, url_for, flash
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from collections import defaultdict, Counter
import json
import os
import google.generativeai as genai
import tempfile
from pathlib import Path
import time
from dotenv import load_dotenv
from tenacity import retry, stop_after_attempt, wait_exponential
import traceback
app = Flask(__name__)
app.secret_key = os.environ.get('FLASK_SECRET_KEY', '0fdd675e2c6f513deb04c79bd7ddb7e0')  # IMPORTANT: set FLASK_SECRET_KEY to a strong, random value in production
app.config['UPLOAD_FOLDER'] = 'uploads'
app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024 # 16MB max file size
app.config['TEMP_FOLDER'] = Path(tempfile.gettempdir()) / 'placement_analyzer'
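# Analysis results are cached as JSON under the system temp directory, keyed by a
# per-session analysis ID (see home()), so a page refresh can re-render prior results.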
# Configure Gemini using an environment variable (loaded from a .env file, if present)
load_dotenv()
api_key = os.getenv("GEMINI_API_KEY")
if not api_key:
print("WARNING: GEMINI_API_KEY not found in environment variables. AI insights will not be available.")
model = None
else:
try:
genai.configure(api_key=api_key)
model = genai.GenerativeModel('gemini-2.0-flash')
print("Gemini model configured successfully.")
except Exception as e:
print(f"Error configuring Gemini model: {e}. AI insights will not be available.")
model = None
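# Downstream code treats model=None as "AI insights unavailable" (see generate_insights()).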
def validate_data_columns(data, required_columns):
"""
Validate that the DataFrame contains all required columns.
"""
# Create a mapping of lowercase column names to actual column names
column_mapping = {col.lower(): col for col in data.columns}
    # Check that all required columns exist (case-insensitive)
    missing_columns = [req for req in required_columns if req.lower() not in column_mapping]
    if missing_columns:
        raise ValueError(
            f"Missing required columns: {', '.join(missing_columns)}. Please ensure your CSV has the correct column names.")
    # Validate data types and non-empty values, looking up actual column names case-insensitively
    package_col = column_mapping.get('package')
    if package_col:
        data[package_col] = pd.to_numeric(data[package_col], errors='coerce')
        if data[package_col].isna().all():
            raise ValueError("Package column contains no valid numeric values or is entirely empty.")
    # Use the actual CSV column name; this runs before the internal rename to 'Year'
    year_col_name = column_mapping.get('year of placement')
    if year_col_name:
        if data[year_col_name].isna().all():
            raise ValueError("Year of Placement column is entirely empty.")
        data[year_col_name] = pd.to_numeric(data[year_col_name], errors='coerce')
        if data[year_col_name].isna().all():
            raise ValueError("Year of Placement column contains no valid numeric values or is entirely empty.")
return True
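# Example of the expected CSV layout (illustrative values only):
#   Name,Department,Company,Post,Package,Year of Placement,Graduation Year
#   A. Student,CSE,ExampleCorp,Software Engineer,12.5,2023,2023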
def generate_graphs(data):
"""
Generate comprehensive graphs based on the provided placement data.
"""
graphs = []
try:
print(f"Generating graphs for {len(data)} records...")
        # Standardize column names for graph generation; an exact-match rename avoids
        # accidental substring replacements in unrelated column names
        data = data.rename(columns={'Year of Placement': 'Year', 'Post': 'Role'})
# 1. Department-wise Placement Distribution
if 'Department' in data.columns and not data['Department'].isna().all():
print("Generating Department-wise Placement Distribution...")
department_counts = data['Department'].value_counts().reset_index()
department_counts.columns = ['Department', 'Count']
fig1 = px.bar(department_counts,
x='Department', y='Count',
title="Department-wise Placement Distribution",
color_discrete_sequence=['#2563eb'])
fig1.update_layout(height=500, xaxis_title="Department", yaxis_title="Number of Placements")
graphs.append({"graph": fig1.to_html(full_html=False), "title": "Department-wise Placement Distribution"})
# 2. Package Distribution
if 'Package' in data.columns and not data['Package'].isna().all():
print("Generating Package Distribution...")
package_data = data.dropna(subset=['Package'])
if not package_data.empty:
fig2 = px.histogram(package_data,
x='Package',
title="Distribution of Package Values",
color_discrete_sequence=['#10b981'])
fig2.update_layout(height=500, xaxis_title="Package (e.g., in LPA)", yaxis_title="Number of Students")
graphs.append({"graph": fig2.to_html(full_html=False), "title": "Distribution of Package Values"})
# 3. Average Package by Department
if all(col in data.columns for col in ['Department', 'Package']):
print("Generating Average Package by Department...")
clean_data = data.dropna(subset=['Department', 'Package'])
if not clean_data.empty:
avg_package = clean_data.groupby('Department')['Package'].mean().reset_index()
fig3 = px.bar(avg_package,
x='Department', y='Package',
title="Average Package by Department",
color_discrete_sequence=['#3b82f6'])
fig3.update_layout(height=500, xaxis_title="Department", yaxis_title="Average Package (e.g., in LPA)")
graphs.append({"graph": fig3.to_html(full_html=False), "title": "Average Package by Department"})
# 4. Year-wise Placement Trends (Using 'Year' column after rename)
if 'Year' in data.columns and not data['Year'].isna().all():
print("Generating Year-wise Placement Trends...")
year_counts = data['Year'].value_counts().sort_index().reset_index()
year_counts.columns = ['Year', 'Count']
fig4 = px.line(year_counts,
x='Year', y='Count',
title="Placement Trends Over Years",
markers=True,
color_discrete_sequence=['#f59e0b'])
fig4.update_layout(height=500, xaxis_title="Year", yaxis_title="Number of Placements")
graphs.append({"graph": fig4.to_html(full_html=False), "title": "Placement Trends Over Years"})
# 5. Company-wise Placements
if 'Company' in data.columns and not data['Company'].isna().all():
print("Generating Company-wise Placements...")
top_companies = data['Company'].value_counts().head(10).reset_index()
top_companies.columns = ['Company', 'Count']
fig5 = px.pie(top_companies,
values='Count', names='Company',
title="Top 10 Recruiting Companies",
hole=0.4)
fig5.update_layout(height=500, margin=dict(t=50, b=50, l=50, r=50))
graphs.append({"graph": fig5.to_html(full_html=False), "title": "Top 10 Recruiting Companies"})
# 6. Top 10 Job Roles
if 'Role' in data.columns and not data['Role'].isna().all():
print("Generating Top 10 Job Roles...")
top_roles = data['Role'].value_counts().head(10).reset_index()
top_roles.columns = ['Role', 'Count']
fig6 = px.bar(top_roles, x='Count', y='Role', orientation='h',
title='Top 10 Job Roles Placed',
color_discrete_sequence=px.colors.qualitative.Pastel)
fig6.update_layout(height=500, yaxis={'categoryorder': 'total ascending'},
xaxis_title="Number of Placements", yaxis_title="Job Role")
graphs.append({"graph": fig6.to_html(full_html=False), "title": "Top 10 Job Roles Placed"})
print(f"Generated {len(graphs)} graphs successfully")
return graphs
except Exception as e:
print(f"Error generating graphs: {str(e)}")
traceback.print_exc()
return []
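# Retries up to three times on failure, waiting with exponential backoff
# (roughly 4-10 seconds between attempts) before giving up and raising.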
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
def generate_single_insight(prompt, model):
    try:
        response = model.generate_content(
            prompt + "\n\nProvide a brief analysis in 2-3 concise bullet points, "
                     "formatted as HTML <ul><li> tags. Ensure the output is only the HTML."
        )
        if not response or not response.text:
            print("Empty response received from Gemini")
            return "<ul><li>No insight generated - empty response from AI.</li></ul>"
        # Clean the response so it is a bare <ul>...</ul> fragment
        clean_text = response.text.replace('```html', '').replace('```', '').strip()
        if not clean_text.startswith('<ul>') or not clean_text.endswith('</ul>'):
            # If the AI didn't format it, wrap each non-empty line as a list item
            lines = [line.strip() for line in clean_text.split('\n') if line.strip()]
            if lines:
                return "<ul>" + "".join(
                    f"<li>{line.lstrip('- ').lstrip('* ')}</li>" for line in lines
                ) + "</ul>"
            return "<ul><li>AI insight could not be properly formatted.</li></ul>"
        return clean_text
except Exception as e:
print(f"Error in generate_single_insight: {type(e).__name__}: {str(e)}")
raise
def generate_insights(data, graph_titles):
"""
Generate insights for each graph using Gemini AI with retry logic
"""
if not model:
print("No AI model available, returning default insights")
return ["- AI insights not available - missing API key or configuration error.
"] * len(
graph_titles)
insights = []
    # Standardize column names for data access (exact-match rename)
    data = data.rename(columns={'Year of Placement': 'Year', 'Post': 'Role'})
try:
        # Summary statistics for the AI prompts; has_col guards against missing or all-null columns
        def has_col(col):
            return col in data.columns and not data[col].isna().all()

        if has_col('Year'):
            year_values = data['Year'].dropna().unique()
            years_covered = (f"{data['Year'].min()}-{data['Year'].max()}"
                             if len(year_values) > 1 else str(data['Year'].min()))
        else:
            years_covered = 'N/A'

        stats = {
            'total_placements': len(data),
            'avg_package': round(data['Package'].mean(), 2) if has_col('Package') else 'N/A',
            'max_package': round(data['Package'].max(), 2) if has_col('Package') else 'N/A',
            'min_package': round(data['Package'].min(), 2) if has_col('Package') else 'N/A',
            'median_package': round(data['Package'].median(), 2) if has_col('Package') else 'N/A',
            'departments': data['Department'].nunique() if has_col('Department') else 'N/A',
            'companies': data['Company'].nunique() if has_col('Company') else 'N/A',
            'years_covered': years_covered,
            'top_department': data['Department'].value_counts().idxmax() if has_col('Department') else 'N/A',
            'top_company': data['Company'].value_counts().idxmax() if has_col('Company') else 'N/A',
            'top_role': data['Role'].value_counts().idxmax() if has_col('Role') else 'N/A'
        }
# Context for AI model
overall_context = f"""
Here is a summary of the placement data:
- Total Placements: {stats['total_placements']}
- Departments involved: {stats['departments']}
- Unique Companies: {stats['companies']}
- Average Package: {stats['avg_package']}
- Maximum Package: {stats['max_package']}
- Minimum Package: {stats['min_package']}
- Median Package: {stats['median_package']}
- Years Covered: {stats['years_covered']}
- Most Placements by Department: {stats['top_department']}
- Most Placements by Company: {stats['top_company']}
- Most Placements by Role: {stats['top_role']}
"""
# Map graph titles to specific prompts for tailored insights
prompt_map = {
"Department-wise Placement Distribution": f"""{overall_context}
The graph shows the distribution of placements across different departments. What are the key observations regarding which departments have the most/least placements, and any significant disparities?""",
"Distribution of Package Values": f"""{overall_context}
The graph displays the frequency distribution of package values. What does this reveal about typical salary ranges, outliers, and the overall earning potential?""",
"Average Package by Department": f"""{overall_context}
This graph presents the average package offered per department. What insights can be drawn about the earning potential differences between departments?""",
"Placement Trends Over Years": f"""{overall_context}
This graph illustrates the number of placements over the years. What trends (growth, decline, stability) can be identified in placement activity over time?""",
"Top 10 Recruiting Companies": f"""{overall_context}
This graph shows the top 10 companies by the number of placements. What does this indicate about the primary recruiters and their impact on placements?""",
"Top 10 Job Roles Placed": f"""{overall_context}
This graph displays the top 10 job roles students were placed in. What are the predominant job types or career paths for these students?"""
}
for title in graph_titles:
prompt = prompt_map.get(title, f"{overall_context}\n\nProvide key insights for a graph titled '{title}'.")
try:
print(f"Generating insight for graph: '{title}'...")
insight = generate_single_insight(prompt, model)
insights.append(insight)
except Exception as e:
print(f"Failed to generate insight for '{title}' after retries: {type(e).__name__}: {str(e)}")
insights.append("- Unable to generate insight for this graph at this time.
")
return insights
except Exception as e:
print(f"Error in generate_insights overall: {type(e).__name__}: {str(e)}")
traceback.print_exc()
return ["- Error generating insights. Please try again.
"] * len(graph_titles)
@app.route('/', methods=['GET', 'POST'])
def home():
"""
Handle placement data upload and analysis
"""
print(f"Request method: {request.method}")
print(f"Request form keys: {list(request.form.keys())}")
print(f"Request files keys: {list(request.files.keys())}")
if request.method == 'POST':
print("POST request received")
# Check if upload_csv button was clicked - THIS IS THE KEY CHECK
if 'upload_csv' not in request.form:
print("upload_csv not in form (this means the button's name/value wasn't sent)")
flash("Invalid form submission or button not recognized. Please try again.", "error")
return redirect(url_for('home'))
# Check if file was uploaded
if 'file' not in request.files:
print("No file part in request")
flash("No file selected", "error")
return redirect(url_for('home'))
file = request.files['file']
print(f"File received: {file.filename}")
# Check if file was actually selected
if file.filename == '':
print("No file selected (empty filename)")
flash("No file selected", "error")
return redirect(url_for('home'))
# Check file extension
if not file.filename.lower().endswith('.csv'):
print("Invalid file type (not .csv)")
flash("Please upload a CSV file", "error")
return redirect(url_for('home'))
try:
print("Processing CSV file...")
# Read CSV data directly from the file stream
data = pd.read_csv(file.stream)
print(f"CSV loaded successfully with {len(data)} rows and {len(data.columns)} columns")
print(f"Columns before cleaning: {list(data.columns)}")
if data.empty:
flash("Uploaded file is empty or invalid. Please upload a valid CSV.", "error")
return redirect(url_for('home'))
# Clean column names (strip whitespace, make consistent case for internal use)
data.columns = data.columns.str.strip()
print(f"Cleaned columns: {list(data.columns)}")
# Required columns (exact expected names in the CSV)
# The validation function will check these case-insensitively
required_csv_columns = ['Name', 'Department', 'Company', 'Post', 'Package', 'Year of Placement',
'Graduation Year']
# Validate data columns
try:
validate_data_columns(data.copy(),
required_csv_columns) # Pass a copy to avoid modifying original during validation
except ValueError as ve:
print(f"Validation error: {str(ve)}")
flash(f"Invalid data or missing columns: {str(ve)}", "error")
return redirect(url_for('home'))
# Rename columns for consistency *after* validation check, for internal use
# Use a dictionary comprehension to ensure we only rename if the column exists
rename_map = {
col: new_name for col_check, new_name in [('Year of Placement', 'Year'), ('Post', 'Role')]
for col in data.columns if col.lower() == col_check.lower()
}
data = data.rename(columns=rename_map)
print("Columns potentially renamed for internal processing.")
print(f"Columns after renaming for processing: {list(data.columns)}")
# Generate graphs and insights
print("Generating graphs...")
graphs_info = generate_graphs(data.copy()) # Pass a copy
graph_html_list = [item["graph"] for item in graphs_info]
graph_titles_list = [item["title"] for item in graphs_info]
print(f"Generated {len(graph_html_list)} graphs")
print("Generating insights...")
insights_list = generate_insights(data.copy(), graph_titles_list) # Pass a copy
print(f"Generated {len(insights_list)} insights")
# Ensure we have matching pairs
min_length = min(len(graph_html_list), len(insights_list))
if min_length == 0:
flash("No graphs or insights could be generated from the data. Please check file format and content.",
"error")
return redirect(url_for('home'))
final_graphs_and_insights = [{"graph": g, "insight": i}
for g, i in zip(graph_html_list[:min_length], insights_list[:min_length])]
# Store analysis results
# Generate a unique ID for this analysis session
session['analysis_id'] = f"analysis_{int(time.time())}"
# Create temp directory and save results
try:
analysis_path_dir = app.config['TEMP_FOLDER'] / session['analysis_id']
os.makedirs(analysis_path_dir, exist_ok=True)
analysis_file_path = analysis_path_dir / 'data.json'
with open(analysis_file_path, 'w') as f:
json.dump({
'graphs': graph_html_list[:min_length],
'insights': insights_list[:min_length]
}, f)
print(f"Analysis results saved successfully to {analysis_file_path}")
except Exception as e:
print(f"Error saving analysis to temporary file: {str(e)}")
traceback.print_exc()
flash("Analysis completed, but there was an issue saving the results temporarily.", "warning")
# Continue displaying results even if saving fails
flash("Analysis completed successfully! Scroll down to see the results.", "success")
return render_template('index.html', graphs_and_insights=final_graphs_and_insights)
except pd.errors.EmptyDataError:
flash("The uploaded CSV file is empty. Please upload a file with data.", "error")
return redirect(url_for('home'))
except pd.errors.ParserError:
flash("Could not parse the CSV file. Please ensure it's a valid CSV format.", "error")
return redirect(url_for('home'))
except Exception as e:
error_msg = f"An unexpected error occurred while processing your file: {str(e)}. Please check the file's content and try again."
print(error_msg)
traceback.print_exc()
flash(error_msg, "error")
return redirect(url_for('home'))
# Handle GET requests
print("GET request - checking for saved analysis...")
analysis_id = session.get('analysis_id')
graphs_and_insights = []
if analysis_id:
analysis_file_path = app.config['TEMP_FOLDER'] / analysis_id / 'data.json'
if analysis_file_path.exists():
try:
with open(analysis_file_path) as f:
data = json.load(f)
print(f"Loaded saved analysis from {analysis_file_path}")
graphs_and_insights = [{"graph": g, "insight": i}
for g, i in zip(data['graphs'], data['insights'])]
except Exception as e:
print(f"Error loading saved analysis: {str(e)}")
traceback.print_exc()
# If loading fails, clear session to prevent re-attempting with a corrupt ID
session.pop('analysis_id', None)
flash("Could not load previous analysis. Please upload your file again.", "warning")
print("Rendering template.")
return render_template('index.html', graphs_and_insights=graphs_and_insights)
@app.errorhandler(413)
def too_large(e):
flash("File is too large. Maximum file size is 16MB.", "error")
return redirect(url_for('home'))
@app.errorhandler(400)
def bad_request(e):
flash("Bad request. Please check your input and try again.", "error")
return redirect(url_for('home'))
if __name__ == '__main__':
# Create necessary directories
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
os.makedirs(app.config['TEMP_FOLDER'], exist_ok=True)
print("Flask app starting...")
print(f"Upload folder: {app.config['UPLOAD_FOLDER']}")
print(f"Temp folder: {app.config['TEMP_FOLDER']}")
# Run in debug mode, set debug=False for production
app.run(debug=True, port=2541)
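# Example .env file placed alongside this script (placeholder value):
#   GEMINI_API_KEY=your-api-key-here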