# Source: Hugging Face Space kerols77/UniRecommend (Space status: Sleeping) - file size: 11,140 bytes - revision 1f8582e

import pandas as pd
import numpy as np
import json
import os
import logging
import pickle
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import f1_score, mean_absolute_error, accuracy_score, precision_score, recall_score

# Configure the root logger at import time: INFO level and above, each record
# rendered as "<timestamp> [<LEVEL>]: <message>".
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s]: %(message)s")

class StudentRecommendationSystem:
    """Recommend university programs from a student's subject grades.

    Three JSON files are read from ``json_dir`` at construction time:

    * ``subjects.json`` -- key ``core_subjects``: program name -> list of subjects.
    * ``universities.json`` -- key ``top_universities``: looked up by program name.
    * ``departments.json`` -- key ``program_departments``: looked up by program name.

    A program's raw score is the sum of the student's grades over the program's
    core subjects, expressed as a percentage of ``100 * len(core_subjects)``.
    The raw score is then passed through ``ai_model`` (by default a pipeline
    fitted on three points of the identity line) to get the "AI refined" score.
    """

    # Column layout of the frame returned by ``calculate_program_match``.
    _MATCH_COLUMNS = ["Program", "Raw Score", "AI Refined Score", "Success Probability"]

    def __init__(self, json_dir: str = "./data"):
        """Load the reference data from ``json_dir`` and fit the default model.

        Args:
            json_dir: Directory containing ``subjects.json``,
                ``universities.json`` and ``departments.json``.

        Raises:
            Exception: Whatever opening, parsing or indexing a data file raises
                (missing file, invalid JSON, missing key); logged, then re-raised.
        """
        self.json_dir = json_dir
        self.core_subjects = self._load_json_section("subjects.json", "core_subjects", "subjects")
        self.top_universities = self._load_json_section("universities.json", "top_universities", "universities")
        self.program_departments = self._load_json_section("departments.json", "program_departments", "departments")
        self.university_programs = {
            program: {"core_subjects": subjects}
            for program, subjects in self.core_subjects.items()
        }
        logging.info("University programs mapping created.")
        self.ai_model = self._train_dummy_model()

    def _load_json_section(self, filename: str, key: str, label: str):
        """Return ``json.load(<json_dir>/<filename>)[key]``, logging the outcome.

        Args:
            filename: File name inside ``self.json_dir``.
            key: Top-level key whose value is returned.
            label: Human-readable name used in the log messages.

        Raises:
            Exception: Any error from opening, parsing or indexing the file is
                logged and re-raised unchanged.
        """
        path = os.path.join(self.json_dir, filename)
        try:
            # JSON is UTF-8 by specification; do not depend on the locale default.
            with open(path, "r", encoding="utf-8") as f:
                section = json.load(f)[key]
        except Exception as e:
            logging.error("Error loading %s data: %s", label, e)
            raise
        logging.info("Loaded %s data successfully.", label)
        return section

    def _train_dummy_model(self) -> "Pipeline":
        """Fit and return a scaler + linear-regression pipeline on y = x.

        The training set is the three points 0, 50 and 100 mapped to themselves.
        """
        X = np.array([[0], [50], [100]])
        y = np.array([0, 50, 100])
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('regressor', LinearRegression())
        ])
        pipeline.fit(X, y)
        logging.info("Dummy AI model pipeline trained successfully.")
        return pipeline

    def _refine_match_score(self, score: float) -> float:
        """Return ``ai_model``'s prediction for a single raw score."""
        refined = float(self.ai_model.predict(np.array([[score]]))[0])
        logging.debug(f"Refined score for raw score {score} is {refined}.")
        return refined

    def predict_success_probability(self, refined_score: float) -> float:
        """Convert a refined score (percent scale) into a probability.

        The result is ``refined_score / 100`` clamped to ``[0.0, 1.0]``: the
        regressor can extrapolate outside 0-100, and a probability outside the
        unit interval is meaningless. NaN input stays NaN.
        """
        probability = float(np.clip(refined_score / 100.0, 0.0, 1.0))
        logging.debug(f"Predicted success probability from refined score {refined_score} is {probability}.")
        return probability

    def load_student_grades(self, grades_data: dict = None, grades_file: str = None) -> pd.DataFrame:
        """Load grades into ``self.student_data`` and return that frame.

        Args:
            grades_data: Mapping of subject -> grade. Ignored when
                ``grades_file`` is given.
            grades_file: Path to a JSON file holding either the mapping itself
                or an object whose ``"sample_grades"`` key holds it.

        Returns:
            DataFrame with columns ``Subject`` and ``Grade``.

        Raises:
            ValueError: If no (or empty) grade data is available.
            Exception: File/JSON errors are logged and re-raised.
        """
        if grades_file:
            try:
                with open(grades_file, "r", encoding="utf-8") as f:
                    grades_data = json.load(f)
                if "sample_grades" in grades_data:
                    grades_data = grades_data["sample_grades"]
                logging.info(f"Student grades loaded from file: {grades_file}")
            except Exception as e:
                logging.error("Error loading student grades file: %s", e)
                raise
        if not grades_data:
            raise ValueError("Either grades_data or grades_file must be provided")
        self.student_data = pd.DataFrame(list(grades_data.items()), columns=['Subject', 'Grade'])
        return self.student_data

    def identify_strengths(self, threshold: float = 85) -> pd.DataFrame:
        """Return rows of ``self.student_data`` with ``Grade >= threshold``, best first.

        Requires ``load_student_grades`` to have been called.
        """
        strengths = self.student_data[self.student_data['Grade'] >= threshold]
        return strengths.sort_values(by='Grade', ascending=False)

    def calculate_program_match(self, strengths: pd.DataFrame) -> pd.DataFrame:
        """Score every program against the loaded student grades.

        Args:
            strengths: Accepted for interface compatibility; not used. Scores
                are computed from all grades in ``self.student_data``.

        Returns:
            DataFrame with columns ``Program``, ``Raw Score``,
            ``AI Refined Score`` and ``Success Probability``, sorted by refined
            score descending. Empty (but with those columns) when no programs
            are configured.
        """
        # Build the subject -> grade lookup once instead of filtering the frame
        # per subject; the first occurrence of a duplicated subject wins.
        grade_by_subject = {}
        for subject, grade in zip(self.student_data['Subject'], self.student_data['Grade']):
            grade_by_subject.setdefault(subject, grade)

        rows = []
        for program, details in self.university_programs.items():
            core_subjects = details["core_subjects"]
            total_possible_score = len(core_subjects) * 100
            # Subjects the student has no grade for contribute nothing.
            score = sum(grade_by_subject[s] for s in core_subjects if s in grade_by_subject)
            raw_score = (score / total_possible_score) * 100 if total_possible_score > 0 else 0
            refined_score = self._refine_match_score(raw_score)
            success_probability = self.predict_success_probability(refined_score)
            logging.debug(f"Program {program}: raw_score {raw_score}, refined_score {refined_score}, success_probability {success_probability}")
            rows.append({
                "Program": program,
                "Raw Score": raw_score,
                "AI Refined Score": refined_score,
                "Success Probability": success_probability,
            })

        # Passing explicit columns keeps the frame sortable even with zero rows.
        program_df = pd.DataFrame(rows, columns=self._MATCH_COLUMNS)
        return program_df.sort_values(by='AI Refined Score', ascending=False)

    def get_top_recommendations(self, program_matches: pd.DataFrame, top_n: int = 3, min_score: float = 50) -> list:
        """Build recommendation records for the leading rows of ``program_matches``.

        Args:
            program_matches: Frame as returned by ``calculate_program_match``
                (already sorted best-first).
            top_n: Number of leading rows to consider; values <= 0 yield ``[]``.
            min_score: Minimum ``AI Refined Score`` a row needs to be included.

        Returns:
            List of dicts with keys ``program``, ``raw_score``,
            ``refined_score``, ``success_probability``,
            ``recommended_universities`` and ``recommended_departments``
            (the latter two truncated to three entries).
        """
        recommendations = []
        # max(..., 0): a negative count passed to head() would mean "all but the last n".
        for _, row in program_matches.head(max(top_n, 0)).iterrows():
            refined_score = row['AI Refined Score']
            if refined_score >= min_score:
                program = row['Program']
                universities = self.top_universities.get(program, ["No specific recommendations"])
                departments = self.program_departments.get(program, ["No specific departments"])
                recommendations.append({
                    "program": program,
                    "raw_score": row['Raw Score'],
                    "refined_score": refined_score,
                    "success_probability": row['Success Probability'],
                    "recommended_universities": universities[:3],
                    "recommended_departments": departments[:3]
                })
        return recommendations

    def evaluate_recommendations(self, program_matches_df: pd.DataFrame, ground_truth: dict, threshold: float = 60) -> dict:
        """Compare thresholded refined scores against binary ground-truth labels.

        Args:
            program_matches_df: Frame with ``Program`` and ``AI Refined Score``.
            ground_truth: Program -> 0/1 label; missing programs count as 0.
            threshold: Refined score at or above which a program is predicted 1.

        Returns:
            Dict with ``f1_score``, ``accuracy``, ``precision``, ``recall`` and ``mae``.
        """
        predictions = (program_matches_df["AI Refined Score"] >= threshold).astype(int).tolist()
        actuals = [ground_truth.get(program, 0) for program in program_matches_df["Program"]]
        metrics = {
            # zero_division=0 on all three so a run with no positives behaves uniformly.
            "f1_score": f1_score(actuals, predictions, zero_division=0),
            "accuracy": accuracy_score(actuals, predictions),
            "precision": precision_score(actuals, predictions, zero_division=0),
            "recall": recall_score(actuals, predictions, zero_division=0),
            "mae": mean_absolute_error(actuals, predictions)
        }
        logging.info("Evaluation metrics computed.")
        return metrics

    def save_ai_model(self, file_path: str) -> None:
        """Pickle ``self.ai_model`` to ``file_path``; errors are logged and re-raised."""
        try:
            with open(file_path, "wb") as f:
                pickle.dump(self.ai_model, f)
            logging.info(f"AI model saved to {file_path}")
        except Exception as e:
            logging.error("Error saving AI model: %s", e)
            raise

    def load_ai_model(self, file_path: str) -> None:
        """Replace ``self.ai_model`` with the object unpickled from ``file_path``.

        SECURITY: unpickling executes arbitrary code embedded in the file. Only
        load model files from a trusted source.
        """
        try:
            with open(file_path, "rb") as f:
                self.ai_model = pickle.load(f)
            logging.info(f"AI model loaded from {file_path}")
        except Exception as e:
            logging.error("Error loading AI model: %s", e)
            raise

    def process_student_data(self, grades_data: dict = None, grades_file: str = None, strength_threshold: float = 85) -> dict:
        """Run the full pipeline: load grades, find strengths, score, recommend.

        Returns:
            Dict with ``strengths`` and ``program_matches`` (lists of row
            dicts) and ``top_recommendations`` (see ``get_top_recommendations``).
        """
        self.load_student_grades(grades_data, grades_file)
        strengths = self.identify_strengths(strength_threshold)
        program_matches = self.calculate_program_match(strengths)
        recommendations = self.get_top_recommendations(program_matches)
        report = {
            "strengths": strengths.to_dict('records'),
            "program_matches": program_matches.to_dict('records'),
            "top_recommendations": recommendations
        }
        return report

def setup_json_directory(json_dir: str = "./data") -> str:
    """Ensure ``json_dir`` exists as a directory and return its path.

    Intermediate directories are created as needed. Using ``exist_ok=True``
    avoids the check-then-create race of testing ``os.path.exists`` first.

    Args:
        json_dir: Directory path to create if missing.

    Returns:
        ``json_dir`` unchanged.

    Raises:
        FileExistsError: If ``json_dir`` exists but is not a directory.
    """
    os.makedirs(json_dir, exist_ok=True)
    return json_dir

def main():
    """Run the demo: load sample grades, print the report, evaluate, save the model.

    Reads ``sample-data.json`` from the data directory, prints the student's
    grades, strengths, program matches and recommendations, then scores the
    matches against a synthetic ground truth (raw score >= 65) and pickles the
    model to ``ai_model.pkl`` in the same directory.
    """
    json_dir = setup_json_directory()
    sample_data_file = os.path.join(json_dir, "sample-data.json")
    recommendation_system = StudentRecommendationSystem(json_dir)
    results = recommendation_system.process_student_data(grades_file=sample_data_file)

    with open(sample_data_file, "r", encoding="utf-8") as f:
        sample_data = json.load(f)
    # load_student_grades accepts files with or without the "sample_grades"
    # wrapper; accept both here too so the printout cannot fail with KeyError
    # after processing has already succeeded.
    sample_grades = sample_data["sample_grades"] if "sample_grades" in sample_data else sample_data

    print("\n===== STUDENT ACADEMIC PROFILE =====")
    print("\nSubjects and Grades:")
    for subject, grade in sample_grades.items():
        print(f"- {subject}: {grade}")

    print("\n===== ACADEMIC STRENGTHS =====")
    for strength in results["strengths"]:
        print(f"- {strength['Subject']}: {strength['Grade']}")

    print("\n===== PROGRAM MATCHES (Including AI Details) =====")
    for match in results["program_matches"]:
        print(f"- {match['Program']}: Raw Score = {match['Raw Score']:.1f}%, AI Refined Score = {match['AI Refined Score']:.1f}%, Success Probability = {match['Success Probability']:.2f}")

    print("\n===== PROGRAM RECOMMENDATIONS =====")
    for i, rec in enumerate(results["top_recommendations"], 1):
        print(f"\n{i}. {rec['program']} (Raw Score: {rec['raw_score']:.1f}%, AI Refined Score: {rec['refined_score']:.1f}%, Success Probability: {rec['success_probability']:.2f})")
        print("   Recommended Universities:")
        for uni in rec['recommended_universities']:
            print(f"   - {uni}")
        print("   Recommended Departments:")
        for dept in rec['recommended_departments']:
            print(f"   - {dept}")

    program_matches_df = pd.DataFrame(results["program_matches"])
    if program_matches_df.empty:
        # With no programs there is nothing to score, and the frame has no columns.
        logging.warning("No program matches available; skipping evaluation.")
    else:
        # Synthetic labels for the demo: a program counts as a true match when
        # its raw score is at least 65.
        dummy_ground_truth = {
            program: (1 if raw_score >= 65 else 0)
            for program, raw_score in zip(program_matches_df["Program"], program_matches_df["Raw Score"])
        }
        evaluation_metrics = recommendation_system.evaluate_recommendations(program_matches_df, dummy_ground_truth)
        print("\n===== EVALUATION METRICS =====")
        for metric, value in evaluation_metrics.items():
            print(f"{metric.capitalize()}: {value:.2f}")

    recommendation_system.save_ai_model(os.path.join(json_dir, "ai_model.pkl"))

if __name__ == "__main__":
    main()