|
from flask import Flask, render_template, request, session, redirect, url_for, flash
|
|
import pandas as pd
|
|
import plotly.express as px
|
|
import plotly.graph_objects as go
|
|
from collections import defaultdict, Counter
|
|
import json
|
|
import os
|
|
import google.generativeai as genai
|
|
import tempfile
|
|
from pathlib import Path
|
|
import time
|
|
from dotenv import load_dotenv
|
|
from tenacity import retry, stop_after_attempt, wait_exponential
|
|
import traceback
|
|
|
|
app = Flask(__name__)

# Load variables from a local .env file BEFORE any os.environ reads below
# (secret key, Gemini API key). The original code called this after reading
# the environment, which made .env values invisible to the config.
load_dotenv()

# SECURITY: secrets must come from the environment, never from source control.
# The secret-key fallback keeps local development working when no env is set.
app.secret_key = os.environ.get('SECRET_KEY', '0fdd675e2c6f513deb04c79bd7ddb7e0')
app.config['UPLOAD_FOLDER'] = 'uploads'
app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024  # 16 MB cap; Flask answers 413 beyond this
app.config['TEMP_FOLDER'] = Path(tempfile.gettempdir()) / 'placement_analyzer'

# Configure the Gemini client if an API key is available. `model` stays None
# when AI insights cannot be generated; generate_insights() checks for that.
# (Previously a literal API key was committed here — it must be revoked and
# supplied via the GEMINI_API_KEY environment variable instead.)
api_key = os.environ.get("GEMINI_API_KEY")

if not api_key:
    print("WARNING: GEMINI_API_KEY not found in environment variables. AI insights will not be available.")
    model = None
else:
    try:
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel('gemini-2.0-flash')
        print("Gemini model configured successfully.")
    except Exception as e:
        # Any configuration failure degrades gracefully to "no AI insights".
        print(f"Error configuring Gemini model: {e}. AI insights will not be available.")
        model = None
|
|
|
|
|
|
def validate_data_columns(data, required_columns):
    """
    Validate that the DataFrame contains all required columns (case-insensitive)
    and that the numeric columns hold at least one usable value.

    Args:
        data: pandas DataFrame parsed from the uploaded CSV. 'Package' and
            'Year of Placement' columns are coerced to numeric in place.
        required_columns: iterable of expected column names (matched
            case-insensitively against data.columns).

    Returns:
        True when validation passes.

    Raises:
        ValueError: when a required column is missing, or when 'Package' /
            'Year of Placement' contain no valid numeric values.
    """
    # Map lower-cased names to their actual spelling so every check below is
    # case-insensitive. (The original code built this mapping but then tested
    # 'Package' against the *actual* spellings, so a CSV with a lowercase
    # 'package' header silently skipped numeric validation.)
    column_mapping = {col.lower(): col for col in data.columns}

    missing_columns = [req for req in required_columns
                       if req.lower() not in column_mapping]
    if missing_columns:
        raise ValueError(
            f"Missing required columns: {', '.join(missing_columns)}. Please ensure your CSV has the correct column names.")

    required_lower = {req.lower() for req in required_columns}

    # Coerce the package column to numeric; reject it if nothing survives.
    package_col = column_mapping.get('package')
    if package_col and 'package' in required_lower:
        data[package_col] = pd.to_numeric(data[package_col], errors='coerce')
        if data[package_col].isna().all():
            raise ValueError("Package column contains no valid numeric values or is entirely empty.")

    # Same treatment for the placement year, with a dedicated message for a
    # column that is empty before coercion.
    year_col = column_mapping.get('year of placement')
    if year_col and 'year of placement' in required_lower:
        if data[year_col].isna().all():
            raise ValueError("Year of Placement column is entirely empty.")
        data[year_col] = pd.to_numeric(data[year_col], errors='coerce')
        if data[year_col].isna().all():
            raise ValueError("Year of Placement column contains no valid numeric values or is entirely empty.")

    return True
|
|
|
|
|
|
def generate_graphs(data):
    """
    Generate comprehensive graphs based on the provided placement data.

    Each chart is produced only when its column(s) exist and contain at least
    one non-NaN value, so partial CSVs still yield whatever charts are possible.

    Args:
        data: pandas DataFrame; may contain Department, Package,
            'Year of Placement' (or 'Year'), Company, 'Post' (or 'Role').

    Returns:
        list of dicts {"graph": <plotly HTML fragment>, "title": str};
        an empty list on any unexpected error (logged with a traceback).
    """
    graphs = []

    try:
        print(f"Generating graphs for {len(data)} records...")

        # Normalise column names with an exact-match rename. The previous
        # substring str.replace() corrupted unrelated columns (e.g. a
        # 'Postal Code' column became 'Roleal Code').
        data = data.rename(columns={'Year of Placement': 'Year', 'Post': 'Role'})

        # 1. Placements per department (bar chart).
        if 'Department' in data.columns and not data['Department'].isna().all():
            print("Generating Department-wise Placement Distribution...")
            department_counts = data['Department'].value_counts().reset_index()
            department_counts.columns = ['Department', 'Count']
            fig1 = px.bar(department_counts,
                          x='Department', y='Count',
                          title="Department-wise Placement Distribution",
                          color_discrete_sequence=['#2563eb'])
            fig1.update_layout(height=500, xaxis_title="Department", yaxis_title="Number of Placements")
            graphs.append({"graph": fig1.to_html(full_html=False), "title": "Department-wise Placement Distribution"})

        # 2. Histogram of package values.
        if 'Package' in data.columns and not data['Package'].isna().all():
            print("Generating Package Distribution...")
            package_data = data.dropna(subset=['Package'])
            if not package_data.empty:
                fig2 = px.histogram(package_data,
                                    x='Package',
                                    title="Distribution of Package Values",
                                    color_discrete_sequence=['#10b981'])
                fig2.update_layout(height=500, xaxis_title="Package (e.g., in LPA)", yaxis_title="Number of Students")
                graphs.append({"graph": fig2.to_html(full_html=False), "title": "Distribution of Package Values"})

        # 3. Mean package per department (bar chart).
        if all(col in data.columns for col in ['Department', 'Package']):
            print("Generating Average Package by Department...")
            clean_data = data.dropna(subset=['Department', 'Package'])
            if not clean_data.empty:
                avg_package = clean_data.groupby('Department')['Package'].mean().reset_index()
                fig3 = px.bar(avg_package,
                              x='Department', y='Package',
                              title="Average Package by Department",
                              color_discrete_sequence=['#3b82f6'])
                fig3.update_layout(height=500, xaxis_title="Department", yaxis_title="Average Package (e.g., in LPA)")
                graphs.append({"graph": fig3.to_html(full_html=False), "title": "Average Package by Department"})

        # 4. Placements over time (line chart, sorted chronologically).
        if 'Year' in data.columns and not data['Year'].isna().all():
            print("Generating Year-wise Placement Trends...")
            year_counts = data['Year'].value_counts().sort_index().reset_index()
            year_counts.columns = ['Year', 'Count']
            fig4 = px.line(year_counts,
                           x='Year', y='Count',
                           title="Placement Trends Over Years",
                           markers=True,
                           color_discrete_sequence=['#f59e0b'])
            fig4.update_layout(height=500, xaxis_title="Year", yaxis_title="Number of Placements")
            graphs.append({"graph": fig4.to_html(full_html=False), "title": "Placement Trends Over Years"})

        # 5. Top 10 recruiters (donut chart).
        if 'Company' in data.columns and not data['Company'].isna().all():
            print("Generating Company-wise Placements...")
            top_companies = data['Company'].value_counts().head(10).reset_index()
            top_companies.columns = ['Company', 'Count']
            fig5 = px.pie(top_companies,
                          values='Count', names='Company',
                          title="Top 10 Recruiting Companies",
                          hole=0.4)
            fig5.update_layout(height=500, margin=dict(t=50, b=50, l=50, r=50))
            graphs.append({"graph": fig5.to_html(full_html=False), "title": "Top 10 Recruiting Companies"})

        # 6. Top 10 job roles (horizontal bar, largest at top).
        if 'Role' in data.columns and not data['Role'].isna().all():
            print("Generating Top 10 Job Roles...")
            top_roles = data['Role'].value_counts().head(10).reset_index()
            top_roles.columns = ['Role', 'Count']
            fig6 = px.bar(top_roles, x='Count', y='Role', orientation='h',
                          title='Top 10 Job Roles Placed',
                          color_discrete_sequence=px.colors.qualitative.Pastel)
            fig6.update_layout(height=500, yaxis={'categoryorder': 'total ascending'},
                               xaxis_title="Number of Placements", yaxis_title="Job Role")
            graphs.append({"graph": fig6.to_html(full_html=False), "title": "Top 10 Job Roles Placed"})

        print(f"Generated {len(graphs)} graphs successfully")
        return graphs

    except Exception as e:
        # Chart generation must never crash the request handler; callers treat
        # an empty list as "nothing could be generated".
        print(f"Error generating graphs: {str(e)}")
        traceback.print_exc()
        return []
|
|
|
|
|
|
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
def generate_single_insight(prompt, model):
    """Ask the Gemini model for one insight and return it as an HTML <ul> list.

    The call is retried up to 3 times with exponential backoff (tenacity);
    any exception is logged and re-raised so the decorator can retry it.
    """
    try:
        full_prompt = (
            prompt
            + "\n\nProvide a brief analysis in 2-3 concise bullet points, formatted as HTML <ul><li> tags. Ensure the output is only the HTML."
        )
        response = model.generate_content(full_prompt)

        if not response or not response.text:
            print("Empty response received from Gemini")
            return "<ul><li>No insight generated - empty response from AI.</li></ul>"

        # Strip markdown code fences that the model sometimes wraps around HTML.
        text = response.text.replace('```html', '').replace('```', '').strip()

        # Already a well-formed list? Return it untouched.
        if text.startswith('<ul>') and text.endswith('</ul>'):
            return text

        # Otherwise wrap loose bullet lines into <ul><li> ourselves.
        bullet_lines = [ln.strip() for ln in text.split('\n') if ln.strip()]
        if not bullet_lines:
            return "<ul><li>AI insight could not be properly formatted.</li></ul>"
        items = "".join(f"<li>{ln.lstrip('- ').lstrip('* ')}</li>" for ln in bullet_lines)
        return "<ul>" + items + "</ul>"
    except Exception as e:
        print(f"Error in generate_single_insight: {type(e).__name__}: {str(e)}")
        raise
|
|
|
|
|
|
def generate_insights(data, graph_titles):
    """
    Generate one AI insight (HTML <ul> fragment) per graph title using Gemini.

    Args:
        data: pandas DataFrame of placement records (columns may include
            Department, Package, Company, 'Year of Placement'/'Year',
            'Post'/'Role').
        graph_titles: list of graph titles, in display order.

    Returns:
        list of HTML strings, one per title, in the same order. When the AI
        model is unavailable or a call fails, placeholder HTML is returned so
        the list length always matches graph_titles.
    """
    if not model:
        print("No AI model available, returning default insights")
        return ["<ul><li>AI insights not available - missing API key or configuration error.</li></ul>"] * len(
            graph_titles)

    insights = []

    # Normalise column names with an exact-match rename; the previous
    # substring str.replace() corrupted unrelated columns containing
    # 'Post' or 'Year of Placement' as a substring.
    data = data.rename(columns={'Year of Placement': 'Year', 'Post': 'Role'})

    try:
        def _usable(col):
            # A column is usable when present and not entirely NaN.
            return col in data.columns and not data[col].isna().all()

        # Summary statistics shared by every prompt; 'N/A' where a column is
        # missing or empty so the prompt text stays well-formed.
        stats = {
            'total_placements': len(data),
            'avg_package': round(data['Package'].mean(), 2) if _usable('Package') else 'N/A',
            'departments': data['Department'].nunique() if _usable('Department') else 'N/A',
            'companies': data['Company'].nunique() if _usable('Company') else 'N/A',
            'max_package': round(data['Package'].max(), 2) if _usable('Package') else 'N/A',
            'min_package': round(data['Package'].min(), 2) if _usable('Package') else 'N/A',
            'median_package': round(data['Package'].median(), 2) if _usable('Package') else 'N/A',
            # Range "min-max" when multiple years, single year otherwise.
            'years_covered': (
                f"{data['Year'].min()}-{data['Year'].max()}"
                if _usable('Year') and len(data['Year'].dropna().unique()) > 1
                else str(data['Year'].min()) if _usable('Year') else 'N/A'
            ),
            'top_department': data['Department'].value_counts().idxmax() if _usable('Department') else 'N/A',
            'top_company': data['Company'].value_counts().idxmax() if _usable('Company') else 'N/A',
            'top_role': data['Role'].value_counts().idxmax() if _usable('Role') else 'N/A'
        }

        overall_context = f"""
Here is a summary of the placement data:
- Total Placements: {stats['total_placements']}
- Departments involved: {stats['departments']}
- Unique Companies: {stats['companies']}
- Average Package: {stats['avg_package']}
- Maximum Package: {stats['max_package']}
- Minimum Package: {stats['min_package']}
- Median Package: {stats['median_package']}
- Years Covered: {stats['years_covered']}
- Most Placements by Department: {stats['top_department']}
- Most Placements by Company: {stats['top_company']}
- Most Placements by Role: {stats['top_role']}
"""

        # Per-graph prompt templates; unknown titles fall back to a generic
        # prompt in the loop below.
        prompt_map = {
            "Department-wise Placement Distribution": f"""{overall_context}
The graph shows the distribution of placements across different departments. What are the key observations regarding which departments have the most/least placements, and any significant disparities?""",

            "Distribution of Package Values": f"""{overall_context}
The graph displays the frequency distribution of package values. What does this reveal about typical salary ranges, outliers, and the overall earning potential?""",

            "Average Package by Department": f"""{overall_context}
This graph presents the average package offered per department. What insights can be drawn about the earning potential differences between departments?""",

            "Placement Trends Over Years": f"""{overall_context}
This graph illustrates the number of placements over the years. What trends (growth, decline, stability) can be identified in placement activity over time?""",

            "Top 10 Recruiting Companies": f"""{overall_context}
This graph shows the top 10 companies by the number of placements. What does this indicate about the primary recruiters and their impact on placements?""",

            "Top 10 Job Roles Placed": f"""{overall_context}
This graph displays the top 10 job roles students were placed in. What are the predominant job types or career paths for these students?"""
        }

        for title in graph_titles:
            prompt = prompt_map.get(title, f"{overall_context}\n\nProvide key insights for a graph titled '{title}'.")
            try:
                print(f"Generating insight for graph: '{title}'...")
                insight = generate_single_insight(prompt, model)
                insights.append(insight)
            except Exception as e:
                # A single failed insight must not abort the rest; keep the
                # list aligned with graph_titles.
                print(f"Failed to generate insight for '{title}' after retries: {type(e).__name__}: {str(e)}")
                insights.append("<ul><li>Unable to generate insight for this graph at this time.</li></ul>")

        return insights

    except Exception as e:
        print(f"Error in generate_insights overall: {type(e).__name__}: {str(e)}")
        traceback.print_exc()
        return ["<ul><li>Error generating insights. Please try again.</li></ul>"] * len(graph_titles)
|
|
|
|
|
|
@app.route('/', methods=['GET', 'POST'])
def home():
    """
    Handle placement data upload and analysis.

    POST: validates the uploaded CSV, generates graphs + AI insights, caches
    the results as JSON in TEMP_FOLDER keyed by a session 'analysis_id', and
    renders them. On any validation/processing error, flashes a message and
    redirects back here.

    GET: re-renders the cached analysis for this session if one exists,
    otherwise shows the empty upload page.
    """
    # Debug tracing of every request; useful while diagnosing form issues.
    print(f"Request method: {request.method}")
    print(f"Request form keys: {list(request.form.keys())}")
    print(f"Request files keys: {list(request.files.keys())}")

    if request.method == 'POST':
        print("POST request received")

        # The submit button must be named 'upload_csv'; its absence means the
        # form was not submitted through the expected template.
        if 'upload_csv' not in request.form:
            print("upload_csv not in form (this means the button's name/value wasn't sent)")
            flash("Invalid form submission or button not recognized. Please try again.", "error")
            return redirect(url_for('home'))

        # Guard clauses: file part present, non-empty filename, .csv extension.
        if 'file' not in request.files:
            print("No file part in request")
            flash("No file selected", "error")
            return redirect(url_for('home'))

        file = request.files['file']
        print(f"File received: {file.filename}")

        if file.filename == '':
            print("No file selected (empty filename)")
            flash("No file selected", "error")
            return redirect(url_for('home'))

        if not file.filename.lower().endswith('.csv'):
            print("Invalid file type (not .csv)")
            flash("Please upload a CSV file", "error")
            return redirect(url_for('home'))

        try:
            print("Processing CSV file...")

            # Parse directly from the upload stream; no file is written to disk.
            data = pd.read_csv(file.stream)
            print(f"CSV loaded successfully with {len(data)} rows and {len(data.columns)} columns")
            print(f"Columns before cleaning: {list(data.columns)}")

            if data.empty:
                flash("Uploaded file is empty or invalid. Please upload a valid CSV.", "error")
                return redirect(url_for('home'))

            # Strip stray whitespace from headers before any column lookups.
            data.columns = data.columns.str.strip()
            print(f"Cleaned columns: {list(data.columns)}")

            required_csv_columns = ['Name', 'Department', 'Company', 'Post', 'Package', 'Year of Placement',
                                    'Graduation Year']

            # Validate on a copy: the numeric coercion inside the validator is
            # deliberately discarded; only the pass/fail result matters here.
            try:
                validate_data_columns(data.copy(),
                                      required_csv_columns)
            except ValueError as ve:
                print(f"Validation error: {str(ve)}")
                flash(f"Invalid data or missing columns: {str(ve)}", "error")
                return redirect(url_for('home'))

            # Case-insensitively rename 'Year of Placement' -> 'Year' and
            # 'Post' -> 'Role' to the internal names the generators expect.
            rename_map = {
                col: new_name for col_check, new_name in [('Year of Placement', 'Year'), ('Post', 'Role')]
                for col in data.columns if col.lower() == col_check.lower()
            }
            data = data.rename(columns=rename_map)
            print("Columns potentially renamed for internal processing.")
            print(f"Columns after renaming for processing: {list(data.columns)}")

            # Generators receive copies so neither can mutate the other's view.
            print("Generating graphs...")
            graphs_info = generate_graphs(data.copy())
            graph_html_list = [item["graph"] for item in graphs_info]
            graph_titles_list = [item["title"] for item in graphs_info]
            print(f"Generated {len(graph_html_list)} graphs")

            print("Generating insights...")
            insights_list = generate_insights(data.copy(), graph_titles_list)
            print(f"Generated {len(insights_list)} insights")

            # Pair graphs with insights; truncate to the shorter list so the
            # template always receives aligned pairs.
            min_length = min(len(graph_html_list), len(insights_list))
            if min_length == 0:
                flash("No graphs or insights could be generated from the data. Please check file format and content.",
                      "error")
                return redirect(url_for('home'))

            final_graphs_and_insights = [{"graph": g, "insight": i}
                                         for g, i in zip(graph_html_list[:min_length], insights_list[:min_length])]

            # Timestamp-based id links this session to its cached results.
            session['analysis_id'] = f"analysis_{int(time.time())}"

            # Best-effort cache to a temp file so a later GET can re-render the
            # analysis; failure here only downgrades to a warning.
            try:
                analysis_path_dir = app.config['TEMP_FOLDER'] / session['analysis_id']
                os.makedirs(analysis_path_dir, exist_ok=True)
                analysis_file_path = analysis_path_dir / 'data.json'

                with open(analysis_file_path, 'w') as f:
                    json.dump({
                        'graphs': graph_html_list[:min_length],
                        'insights': insights_list[:min_length]
                    }, f)
                print(f"Analysis results saved successfully to {analysis_file_path}")
            except Exception as e:
                print(f"Error saving analysis to temporary file: {str(e)}")
                traceback.print_exc()
                flash("Analysis completed, but there was an issue saving the results temporarily.", "warning")

            flash("Analysis completed successfully! Scroll down to see the results.", "success")
            return render_template('index.html', graphs_and_insights=final_graphs_and_insights)

        # Specific pandas parse failures get friendlier messages than the
        # generic catch-all below.
        except pd.errors.EmptyDataError:
            flash("The uploaded CSV file is empty. Please upload a file with data.", "error")
            return redirect(url_for('home'))
        except pd.errors.ParserError:
            flash("Could not parse the CSV file. Please ensure it's a valid CSV format.", "error")
            return redirect(url_for('home'))
        except Exception as e:
            error_msg = f"An unexpected error occurred while processing your file: {str(e)}. Please check the file's content and try again."
            print(error_msg)
            traceback.print_exc()
            flash(error_msg, "error")
            return redirect(url_for('home'))

    # --- GET: try to restore a previously cached analysis for this session ---
    print("GET request - checking for saved analysis...")
    analysis_id = session.get('analysis_id')
    graphs_and_insights = []
    if analysis_id:
        analysis_file_path = app.config['TEMP_FOLDER'] / analysis_id / 'data.json'
        if analysis_file_path.exists():
            try:
                with open(analysis_file_path) as f:
                    data = json.load(f)
                print(f"Loaded saved analysis from {analysis_file_path}")
                graphs_and_insights = [{"graph": g, "insight": i}
                                       for g, i in zip(data['graphs'], data['insights'])]
            except Exception as e:
                # Corrupt/unreadable cache: drop the stale session id and ask
                # the user to re-upload rather than erroring out.
                print(f"Error loading saved analysis: {str(e)}")
                traceback.print_exc()

                session.pop('analysis_id', None)
                flash("Could not load previous analysis. Please upload your file again.", "warning")

    print("Rendering template.")
    return render_template('index.html', graphs_and_insights=graphs_and_insights)
|
|
|
|
|
|
@app.errorhandler(413)
def too_large(e):
    """Flash a friendly message and return home when an upload exceeds the 16MB cap (HTTP 413)."""
    message = "File is too large. Maximum file size is 16MB."
    flash(message, "error")
    return redirect(url_for('home'))
|
|
|
|
|
|
@app.errorhandler(400)
def bad_request(e):
    """Flash a generic message and return home on a malformed request (HTTP 400)."""
    message = "Bad request. Please check your input and try again."
    flash(message, "error")
    return redirect(url_for('home'))
|
|
|
|
|
|
if __name__ == '__main__':

    # Ensure the working directories exist before the first request arrives.
    os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)

    os.makedirs(app.config['TEMP_FOLDER'], exist_ok=True)

    print("Flask app starting...")

    print(f"Upload folder: {app.config['UPLOAD_FOLDER']}")

    print(f"Temp folder: {app.config['TEMP_FOLDER']}")

    # NOTE(review): debug=True enables the Werkzeug debugger (arbitrary code
    # execution via the console) — development only; disable before deploying.
    app.run(debug=True, port=2541)