# fir-gen/training/train_pipeline.py
import sys
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
import joblib
# Add parent directory to path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

# Data ingestion class to load data
class DataIngestion:
    @staticmethod
    def load_data(file_path):
        # Read the raw training data from a CSV file into a DataFrame.
        return pd.read_csv(file_path)

# Data transformation class to handle preprocessing
class DataTransformation:
    def __init__(self):
        self.label_encoders = {}
        self.scaler = StandardScaler()

    def fit_transform(self, data):
        # Encode categorical columns; adjust this list to match your data.
        categorical_columns = ['statute', 'offense_category', 'penalty']
        for col in categorical_columns:
            if col in data.columns:
                encoder = LabelEncoder()
                data[col] = encoder.fit_transform(data[col])
                self.label_encoders[col] = encoder
        # Scale numerical features, skipping any column the data lacks so a
        # missing column does not raise a KeyError.
        numerical_columns = ['imprisonment_duration_served', 'risk_score', 'penalty_severity']
        present = [col for col in numerical_columns if col in data.columns]
        if present:
            data[present] = self.scaler.fit_transform(data[present])
        return data
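
    # Hedged sketch (not in the original script): an inference-time counterpart
    # to fit_transform that reuses the fitted encoders and scaler instead of
    # refitting them. Assumes the incoming data carries the same columns that
    # were seen during training.
    def transform(self, data):
        for col, encoder in self.label_encoders.items():
            if col in data.columns:
                data[col] = encoder.transform(data[col])
        numerical_columns = ['imprisonment_duration_served', 'risk_score', 'penalty_severity']
        present = [col for col in numerical_columns if col in data.columns]
        if present:
            data[present] = self.scaler.transform(data[present])
        return data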

# Model trainer class for training, evaluating, and saving the model
class ModelTrainer:
    def __init__(self):
        self.model = RandomForestClassifier(n_estimators=100, random_state=42)

    def train(self, X_train, y_train):
        self.model.fit(X_train, y_train)

    def evaluate(self, X_test, y_test):
        # Report held-out accuracy as a quick summary metric.
        accuracy = self.model.score(X_test, y_test)
        print(f"Model Accuracy: {accuracy * 100:.2f}%")

    def save_model(self, model_path):
        joblib.dump(self.model, model_path)
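
    # Hedged sketch (not in the original script): a loader matching save_model
    # so callers do not need to call joblib directly.
    @staticmethod
    def load_model(model_path):
        return joblib.load(model_path)
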
# Ensure the ipc_vector_db folder exists
models_dir = "ipc_vector_db"  # Directory for the model and preprocessing objects
os.makedirs(models_dir, exist_ok=True)
# Load and preprocess data
data = DataIngestion.load_data('data/a.csv') # Replace with your data path
transformer = DataTransformation()
data = transformer.fit_transform(data)
# Split data into features and target
X = data.drop(columns=['case_id', 'bail_eligibility'])
y = data['bail_eligibility']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train the model
trainer = ModelTrainer()
trainer.train(X_train, y_train)
# Evaluate the model
trainer.evaluate(X_test, y_test)
# Save model and preprocessing objects
model_path = os.path.join(models_dir, 'bail_reckoner_model.pkl')
preprocessing_path = os.path.join(models_dir, 'preprocessing_objects.pkl')
trainer.save_model(model_path)
joblib.dump({'label_encoders': transformer.label_encoders, 'scaler': transformer.scaler}, preprocessing_path)
print(f"Model and preprocessing objects saved to {models_dir}")