# Source: Hugging Face Spaces - pranit144 / Institute_placement_anaylsis
# Space status: Sleeping
# File size: 24,352 Bytes
# Revision: feabcc4

from flask import Flask, render_template, request, session, redirect, url_for, flash
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from collections import defaultdict, Counter
import json
import os
import google.generativeai as genai
import tempfile
from pathlib import Path
import time
from dotenv import load_dotenv
from tenacity import retry, stop_after_attempt, wait_exponential
import traceback

# Load variables from a local .env file (if present) before any configuration
# is read, so secrets are supplied by the environment rather than the source.
load_dotenv()

app = Flask(__name__)

# The session-signing key must never live in the repository. It is read from
# FLASK_SECRET_KEY; when that is unset a random per-process key is generated,
# which keeps local development working but invalidates sessions on restart
# (and is unsuitable for multi-worker deployments).
app.secret_key = os.environ.get('FLASK_SECRET_KEY')
if not app.secret_key:
    print("WARNING: FLASK_SECRET_KEY not set. Using a temporary random secret key; sessions will not survive a restart.")
    app.secret_key = os.urandom(32).hex()

app.config['UPLOAD_FOLDER'] = 'uploads'
app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024  # 16MB max file size
app.config['TEMP_FOLDER'] = Path(tempfile.gettempdir()) / 'placement_analyzer'

# Gemini credentials come from the GEMINI_API_KEY environment variable.
# `model` is left as None whenever the key is missing or configuration fails,
# which callers treat as "AI insights unavailable".
api_key = os.environ.get('GEMINI_API_KEY')
if not api_key:
    print("WARNING: GEMINI_API_KEY not found in environment variables. AI insights will not be available.")
    model = None
else:
    try:
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel('gemini-2.0-flash')
        print("Gemini model configured successfully.")
    except Exception as e:
        print(f"Error configuring Gemini model: {e}. AI insights will not be available.")
        model = None


def validate_data_columns(data, required_columns):
    """
    Validate that the DataFrame contains all required columns with usable values.

    Column names are matched case-insensitively. When 'Package' and/or
    'Year of Placement' are among the required columns, the matching column
    of ``data`` is coerced to numeric **in place** (unparseable values become
    NaN) and must retain at least one valid number.

    Args:
        data: pandas DataFrame to validate. Pass a copy if the numeric
            coercion must not affect the original frame.
        required_columns: iterable of column names that must be present.

    Returns:
        True when validation succeeds.

    Raises:
        ValueError: if a required column is missing, or if the package / year
            column is empty or holds no numeric value.
    """
    # Lower-cased header -> header exactly as it appears in the DataFrame.
    column_mapping = {str(col).lower(): col for col in data.columns}

    # Lower-cased required name -> actual column name, for the ones found.
    resolved_columns = {}
    missing_columns = []
    for req_col in required_columns:
        key = req_col.lower()
        if key in column_mapping:
            resolved_columns[key] = column_mapping[key]
        else:
            missing_columns.append(req_col)

    if missing_columns:
        raise ValueError(
            f"Missing required columns: {', '.join(missing_columns)}. Please ensure your CSV has the correct column names.")

    # Look the columns up through the case-insensitive mapping so that a header
    # such as 'package' is validated exactly like 'Package'.
    if 'package' in resolved_columns:
        package_col = resolved_columns['package']
        data[package_col] = pd.to_numeric(data[package_col], errors='coerce')
        if data[package_col].isna().all():
            raise ValueError("Package column contains no valid numeric values or is entirely empty.")

    if 'year of placement' in resolved_columns:
        year_col = resolved_columns['year of placement']
        # Distinguish "nothing was provided" from "values were not numeric".
        if data[year_col].isna().all():
            raise ValueError("Year of Placement column is entirely empty.")
        data[year_col] = pd.to_numeric(data[year_col], errors='coerce')
        if data[year_col].isna().all():
            raise ValueError("Year of Placement column contains no valid numeric values or is entirely empty.")

    return True


def generate_graphs(data):
    """
    Build the Plotly charts that can be derived from the placement data.

    Each chart is produced only when the columns it needs are present and hold
    at least one non-null value, so a partial dataset yields a partial set of
    charts.

    Args:
        data: pandas DataFrame of placement records. Columns used are
            'Department', 'Package', 'Year' (or 'Year of Placement'),
            'Company' and 'Role' (or 'Post'). The caller's frame is left
            untouched.

    Returns:
        A list of ``{"graph": <html fragment>, "title": <chart title>}`` dicts
        in a fixed chart order. An empty list is returned if anything raises;
        the error and its traceback are printed.
    """
    graphs = []

    def add_graph(fig, title):
        # Embed-ready fragment (no <html>/<body> wrapper) paired with its title.
        graphs.append({"graph": fig.to_html(full_html=False), "title": title})

    try:
        print(f"Generating graphs for {len(data)} records...")

        # Standardize names with an exact-match rename on a new frame: a
        # substring replace would corrupt headers such as 'Postal Code', and
        # assigning to data.columns would mutate the caller's DataFrame.
        data = data.rename(columns={'Year of Placement': 'Year', 'Post': 'Role'})

        # 1. Department-wise Placement Distribution
        if 'Department' in data.columns and not data['Department'].isna().all():
            print("Generating Department-wise Placement Distribution...")
            title = "Department-wise Placement Distribution"
            department_counts = data['Department'].value_counts().reset_index()
            department_counts.columns = ['Department', 'Count']
            fig1 = px.bar(department_counts,
                          x='Department', y='Count',
                          title=title,
                          color_discrete_sequence=['#2563eb'])
            fig1.update_layout(height=500, xaxis_title="Department", yaxis_title="Number of Placements")
            add_graph(fig1, title)

        # 2. Package Distribution
        if 'Package' in data.columns and not data['Package'].isna().all():
            print("Generating Package Distribution...")
            title = "Distribution of Package Values"
            package_data = data.dropna(subset=['Package'])
            if not package_data.empty:
                fig2 = px.histogram(package_data,
                                    x='Package',
                                    title=title,
                                    color_discrete_sequence=['#10b981'])
                fig2.update_layout(height=500, xaxis_title="Package (e.g., in LPA)", yaxis_title="Number of Students")
                add_graph(fig2, title)

        # 3. Average Package by Department
        if all(col in data.columns for col in ['Department', 'Package']):
            print("Generating Average Package by Department...")
            title = "Average Package by Department"
            # Rows missing either field cannot contribute to a per-department mean.
            clean_data = data.dropna(subset=['Department', 'Package'])
            if not clean_data.empty:
                avg_package = clean_data.groupby('Department')['Package'].mean().reset_index()
                fig3 = px.bar(avg_package,
                              x='Department', y='Package',
                              title=title,
                              color_discrete_sequence=['#3b82f6'])
                fig3.update_layout(height=500, xaxis_title="Department", yaxis_title="Average Package (e.g., in LPA)")
                add_graph(fig3, title)

        # 4. Year-wise Placement Trends (uses the standardized 'Year' column)
        if 'Year' in data.columns and not data['Year'].isna().all():
            print("Generating Year-wise Placement Trends...")
            title = "Placement Trends Over Years"
            year_counts = data['Year'].value_counts().sort_index().reset_index()
            year_counts.columns = ['Year', 'Count']
            fig4 = px.line(year_counts,
                           x='Year', y='Count',
                           title=title,
                           markers=True,
                           color_discrete_sequence=['#f59e0b'])
            fig4.update_layout(height=500, xaxis_title="Year", yaxis_title="Number of Placements")
            add_graph(fig4, title)

        # 5. Company-wise Placements
        if 'Company' in data.columns and not data['Company'].isna().all():
            print("Generating Company-wise Placements...")
            title = "Top 10 Recruiting Companies"
            top_companies = data['Company'].value_counts().head(10).reset_index()
            top_companies.columns = ['Company', 'Count']
            fig5 = px.pie(top_companies,
                          values='Count', names='Company',
                          title=title,
                          hole=0.4)
            fig5.update_layout(height=500, margin=dict(t=50, b=50, l=50, r=50))
            add_graph(fig5, title)

        # 6. Top 10 Job Roles
        if 'Role' in data.columns and not data['Role'].isna().all():
            print("Generating Top 10 Job Roles...")
            top_roles = data['Role'].value_counts().head(10).reset_index()
            top_roles.columns = ['Role', 'Count']
            fig6 = px.bar(top_roles, x='Count', y='Role', orientation='h',
                          title='Top 10 Job Roles Placed',
                          color_discrete_sequence=px.colors.qualitative.Pastel)
            fig6.update_layout(height=500, yaxis={'categoryorder': 'total ascending'},
                               xaxis_title="Number of Placements", yaxis_title="Job Role")
            add_graph(fig6, "Top 10 Job Roles Placed")

        print(f"Generated {len(graphs)} graphs successfully")
        return graphs

    except Exception as e:
        # Best effort: a charting failure must not take the whole request down.
        print(f"Error generating graphs: {str(e)}")
        traceback.print_exc()
        return []


def _normalize_insight_html(raw_text):
    """Return the model output as a single ``<ul>...</ul>`` HTML fragment."""
    # Strip Markdown code fences the model sometimes wraps around the HTML.
    cleaned = raw_text.replace('```html', '').replace('```', '').strip()
    if cleaned.startswith('<ul>') and cleaned.endswith('</ul>'):
        return cleaned

    # The model may add prose before/after a valid list; keep just the list
    # rather than wrapping its tags line-by-line in <li>, which breaks markup.
    list_start = cleaned.find('<ul>')
    list_end = cleaned.rfind('</ul>')
    if list_start != -1 and list_end > list_start:
        return cleaned[list_start:list_end + len('</ul>')]

    # No list at all: turn each non-empty line into a bullet, dropping any
    # leading Markdown bullet characters.
    lines = [line.strip() for line in cleaned.split('\n') if line.strip()]
    if lines:
        return "<ul>" + "".join([f"<li>{line.lstrip('- ').lstrip('* ')}</li>" for line in lines]) + "</ul>"
    return "<ul><li>AI insight could not be properly formatted.</li></ul>"


@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
def generate_single_insight(prompt, model):
    """
    Ask the model for a short bullet-point insight and return it as HTML.

    The call is retried (up to 3 attempts with exponential backoff) whenever
    an exception is raised; an empty response is not retried.

    Args:
        prompt: text describing the data and the graph to analyse.
        model: object exposing ``generate_content(prompt)`` whose result has a
            ``text`` attribute.

    Returns:
        An HTML ``<ul>`` fragment containing the insight, or a placeholder
        list when the response is empty or cannot be formatted.

    Raises:
        Exception: any error from the model call is logged and re-raised so
            the retry decorator can act on it.
    """
    try:
        response = model.generate_content(
            prompt + "\n\nProvide a brief analysis in 2-3 concise bullet points, formatted as HTML <ul><li> tags. Ensure the output is only the HTML."
        )
        if not response or not response.text:
            print("Empty response received from Gemini")
            return "<ul><li>No insight generated - empty response from AI.</li></ul>"

        return _normalize_insight_html(response.text)
    except Exception as e:
        print(f"Error in generate_single_insight: {type(e).__name__}: {str(e)}")
        raise


def generate_insights(data, graph_titles):
    """
    Generate one AI insight per graph title using the module-level Gemini model.

    A statistical summary of ``data`` is embedded in every prompt. Each title
    is handled independently, so one failing request yields a placeholder for
    that graph only.

    Args:
        data: pandas DataFrame of placement records. Columns used are
            'Package', 'Department', 'Company', 'Year' (or 'Year of
            Placement') and 'Role' (or 'Post'). The caller's frame is left
            untouched.
        graph_titles: list of graph titles, in display order.

    Returns:
        A list of HTML ``<ul>`` fragments, one per entry of ``graph_titles``.
        Placeholder fragments are returned when no model is configured or
        when summarising the data fails.
    """
    if not model:
        print("No AI model available, returning default insights")
        return ["<ul><li>AI insights not available - missing API key or configuration error.</li></ul>"] * len(
            graph_titles)

    insights = []

    # Standardize names with an exact-match rename on a new frame: a substring
    # replace would corrupt headers such as 'Postal Code', and assigning to
    # data.columns would mutate the caller's DataFrame.
    data = data.rename(columns={'Year of Placement': 'Year', 'Post': 'Role'})

    try:
        def has_values(column):
            # A statistic is only meaningful if the column exists and is not all-null.
            return column in data.columns and not data[column].isna().all()

        has_package = has_values('Package')
        has_department = has_values('Department')
        has_company = has_values('Company')

        if has_values('Year'):
            years = data['Year']
            # Show a range only when more than one distinct year is present.
            if len(years.dropna().unique()) > 1:
                years_covered = f"{years.min()}-{years.max()}"
            else:
                years_covered = str(years.min())
        else:
            years_covered = 'N/A'

        # Summary statistics; 'N/A' marks anything that cannot be computed.
        stats = {
            'total_placements': len(data),
            'avg_package': round(data['Package'].mean(), 2) if has_package else 'N/A',
            'departments': data['Department'].nunique() if has_department else 'N/A',
            'companies': data['Company'].nunique() if has_company else 'N/A',
            'max_package': round(data['Package'].max(), 2) if has_package else 'N/A',
            'min_package': round(data['Package'].min(), 2) if has_package else 'N/A',
            'median_package': round(data['Package'].median(), 2) if has_package else 'N/A',
            'years_covered': years_covered,
            'top_department': data['Department'].value_counts().idxmax() if has_department else 'N/A',
            'top_company': data['Company'].value_counts().idxmax() if has_company else 'N/A',
            'top_role': data['Role'].value_counts().idxmax() if has_values('Role') else 'N/A'
        }

        # Context for AI model
        overall_context = f"""

        Here is a summary of the placement data:

        - Total Placements: {stats['total_placements']}

        - Departments involved: {stats['departments']}

        - Unique Companies: {stats['companies']}

        - Average Package: {stats['avg_package']}

        - Maximum Package: {stats['max_package']}

        - Minimum Package: {stats['min_package']}

        - Median Package: {stats['median_package']}

        - Years Covered: {stats['years_covered']}

        - Most Placements by Department: {stats['top_department']}

        - Most Placements by Company: {stats['top_company']}

        - Most Placements by Role: {stats['top_role']}

        """

        # Map graph titles to specific prompts for tailored insights
        prompt_map = {
            "Department-wise Placement Distribution": f"""{overall_context}

            The graph shows the distribution of placements across different departments. What are the key observations regarding which departments have the most/least placements, and any significant disparities?""",

            "Distribution of Package Values": f"""{overall_context}

            The graph displays the frequency distribution of package values. What does this reveal about typical salary ranges, outliers, and the overall earning potential?""",

            "Average Package by Department": f"""{overall_context}

            This graph presents the average package offered per department. What insights can be drawn about the earning potential differences between departments?""",

            "Placement Trends Over Years": f"""{overall_context}

            This graph illustrates the number of placements over the years. What trends (growth, decline, stability) can be identified in placement activity over time?""",

            "Top 10 Recruiting Companies": f"""{overall_context}

            This graph shows the top 10 companies by the number of placements. What does this indicate about the primary recruiters and their impact on placements?""",

            "Top 10 Job Roles Placed": f"""{overall_context}

            This graph displays the top 10 job roles students were placed in. What are the predominant job types or career paths for these students?"""
        }

        for title in graph_titles:
            # Unknown titles fall back to a generic prompt built from the summary.
            prompt = prompt_map.get(title, f"{overall_context}\n\nProvide key insights for a graph titled '{title}'.")
            try:
                print(f"Generating insight for graph: '{title}'...")
                insight = generate_single_insight(prompt, model)
                insights.append(insight)
            except Exception as e:
                # Keep the list aligned with graph_titles even when one call fails.
                print(f"Failed to generate insight for '{title}' after retries: {type(e).__name__}: {str(e)}")
                insights.append("<ul><li>Unable to generate insight for this graph at this time.</li></ul>")

        return insights

    except Exception as e:
        print(f"Error in generate_insights overall: {type(e).__name__}: {str(e)}")
        traceback.print_exc()
        return ["<ul><li>Error generating insights. Please try again.</li></ul>"] * len(graph_titles)


@app.route('/', methods=['GET', 'POST'])
def home():
    """
    Handle placement data upload (POST) and display of results (GET).

    POST: validates the submitted form and CSV file, builds graphs and AI
    insights, stores them as JSON under the configured temp folder keyed by a
    per-upload id kept in the session, and renders ``index.html`` with the
    results. Any validation or processing problem is reported with a flash
    message followed by a redirect back to this view.

    GET: renders ``index.html``, re-loading the analysis referenced by the
    session if its JSON file can still be read.

    Returns:
        A rendered template response or a redirect response.
    """
    print(f"Request method: {request.method}")
    print(f"Request form keys: {list(request.form.keys())}")
    print(f"Request files keys: {list(request.files.keys())}")

    if request.method == 'POST':
        print("POST request received")

        # Check if upload_csv button was clicked - THIS IS THE KEY CHECK
        if 'upload_csv' not in request.form:
            print("upload_csv not in form (this means the button's name/value wasn't sent)")
            flash("Invalid form submission or button not recognized. Please try again.", "error")
            return redirect(url_for('home'))

        # Check if file was uploaded
        if 'file' not in request.files:
            print("No file part in request")
            flash("No file selected", "error")
            return redirect(url_for('home'))

        file = request.files['file']
        print(f"File received: {file.filename}")

        # Check if file was actually selected (filename may be '' or None)
        if not file.filename:
            print("No file selected (empty filename)")
            flash("No file selected", "error")
            return redirect(url_for('home'))

        # Check file extension
        if not file.filename.lower().endswith('.csv'):
            print("Invalid file type (not .csv)")
            flash("Please upload a CSV file", "error")
            return redirect(url_for('home'))

        try:
            print("Processing CSV file...")

            # Read CSV data directly from the file stream
            data = pd.read_csv(file.stream)
            print(f"CSV loaded successfully with {len(data)} rows and {len(data.columns)} columns")
            print(f"Columns before cleaning: {list(data.columns)}")

            if data.empty:
                flash("Uploaded file is empty or invalid. Please upload a valid CSV.", "error")
                return redirect(url_for('home'))

            # Clean column names (strip surrounding whitespace)
            data.columns = data.columns.str.strip()
            print(f"Cleaned columns: {list(data.columns)}")

            # Required columns (expected names in the CSV); validation matches
            # them case-insensitively.
            required_csv_columns = ['Name', 'Department', 'Company', 'Post', 'Package', 'Year of Placement',
                                    'Graduation Year']

            # Validate on a copy so a rejected upload never leaves `data` half-converted.
            try:
                validate_data_columns(data.copy(), required_csv_columns)
            except ValueError as ve:
                print(f"Validation error: {str(ve)}")
                flash(f"Invalid data or missing columns: {str(ve)}", "error")
                return redirect(url_for('home'))

            # Validation accepts any header casing, but graph/insight code looks
            # columns up by exact name, so map every required column onto its
            # canonical internal name ('Year of Placement' -> 'Year',
            # 'Post' -> 'Role', the rest onto their canonical casing).
            canonical_names = {name.lower(): name for name in required_csv_columns}
            canonical_names['year of placement'] = 'Year'
            canonical_names['post'] = 'Role'

            rename_map = {}
            taken_names = set(data.columns)
            for col in data.columns:
                target = canonical_names.get(col.lower())
                # Skip no-op renames and renames that would create a duplicate column.
                if target is None or target == col or target in taken_names:
                    continue
                rename_map[col] = target
                taken_names.add(target)
            data = data.rename(columns=rename_map)
            print("Columns potentially renamed for internal processing.")
            print(f"Columns after renaming for processing: {list(data.columns)}")

            # Validation coerced only its own copy; convert the real columns as
            # well so means/histograms operate on numbers (bad cells become NaN).
            for numeric_col in ('Package', 'Year'):
                if numeric_col in data.columns:
                    data[numeric_col] = pd.to_numeric(data[numeric_col], errors='coerce')

            # Generate graphs and insights
            print("Generating graphs...")
            graphs_info = generate_graphs(data.copy())  # Pass a copy
            graph_html_list = [item["graph"] for item in graphs_info]
            graph_titles_list = [item["title"] for item in graphs_info]
            print(f"Generated {len(graph_html_list)} graphs")

            print("Generating insights...")
            insights_list = generate_insights(data.copy(), graph_titles_list)  # Pass a copy
            print(f"Generated {len(insights_list)} insights")

            # Ensure we have matching pairs
            min_length = min(len(graph_html_list), len(insights_list))
            if min_length == 0:
                flash("No graphs or insights could be generated from the data. Please check file format and content.",
                      "error")
                return redirect(url_for('home'))

            final_graphs_and_insights = [{"graph": g, "insight": i}
                                         for g, i in zip(graph_html_list[:min_length], insights_list[:min_length])]

            # Store analysis results under an id that is unique per upload. A
            # timestamp alone collides when two uploads land in the same
            # second, which would make users overwrite each other's results.
            session['analysis_id'] = f"analysis_{int(time.time())}_{os.urandom(8).hex()}"

            # Create temp directory and save results
            try:
                analysis_path_dir = app.config['TEMP_FOLDER'] / session['analysis_id']
                os.makedirs(analysis_path_dir, exist_ok=True)
                analysis_file_path = analysis_path_dir / 'data.json'

                with open(analysis_file_path, 'w') as f:
                    json.dump({
                        'graphs': graph_html_list[:min_length],
                        'insights': insights_list[:min_length]
                    }, f)
                print(f"Analysis results saved successfully to {analysis_file_path}")
            except Exception as e:
                print(f"Error saving analysis to temporary file: {str(e)}")
                traceback.print_exc()
                flash("Analysis completed, but there was an issue saving the results temporarily.", "warning")
                # Continue displaying results even if saving fails

            flash("Analysis completed successfully! Scroll down to see the results.", "success")
            return render_template('index.html', graphs_and_insights=final_graphs_and_insights)

        except pd.errors.EmptyDataError:
            flash("The uploaded CSV file is empty. Please upload a file with data.", "error")
            return redirect(url_for('home'))
        except pd.errors.ParserError:
            flash("Could not parse the CSV file. Please ensure it's a valid CSV format.", "error")
            return redirect(url_for('home'))
        except Exception as e:
            error_msg = f"An unexpected error occurred while processing your file: {str(e)}. Please check the file's content and try again."
            print(error_msg)
            traceback.print_exc()
            flash(error_msg, "error")
            return redirect(url_for('home'))

    # Handle GET requests
    print("GET request - checking for saved analysis...")
    analysis_id = session.get('analysis_id')
    graphs_and_insights = []
    if analysis_id:
        analysis_file_path = app.config['TEMP_FOLDER'] / analysis_id / 'data.json'
        if analysis_file_path.exists():
            try:
                with open(analysis_file_path) as f:
                    data = json.load(f)
                    print(f"Loaded saved analysis from {analysis_file_path}")
                    graphs_and_insights = [{"graph": g, "insight": i}
                                           for g, i in zip(data['graphs'], data['insights'])]
            except Exception as e:
                print(f"Error loading saved analysis: {str(e)}")
                traceback.print_exc()
                # If loading fails, clear session to prevent re-attempting with a corrupt ID
                session.pop('analysis_id', None)
                flash("Could not load previous analysis. Please upload your file again.", "warning")

    print("Rendering template.")
    return render_template('index.html', graphs_and_insights=graphs_and_insights)


@app.errorhandler(413)
def too_large(e):
    """Report an over-sized upload (HTTP 413) and send the user back to the home page."""
    message = "File is too large. Maximum file size is 16MB."
    flash(message, "error")
    home_url = url_for('home')
    return redirect(home_url)


@app.errorhandler(400)
def bad_request(e):
    """Report a malformed request (HTTP 400) and send the user back to the home page."""
    message = "Bad request. Please check your input and try again."
    flash(message, "error")
    home_url = url_for('home')
    return redirect(home_url)


if __name__ == '__main__':
    # Create necessary directories
    os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
    os.makedirs(app.config['TEMP_FOLDER'], exist_ok=True)

    print("Flask app starting...")
    print(f"Upload folder: {app.config['UPLOAD_FOLDER']}")
    print(f"Temp folder: {app.config['TEMP_FOLDER']}")

    # Debug mode enables the interactive Werkzeug debugger, which allows
    # arbitrary code execution from the browser. Keep it off unless it is
    # explicitly requested through FLASK_DEBUG (1/true/yes/on).
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes', 'on')
    app.run(debug=debug_enabled, port=2541)