# NOTE: "Spaces: Build error" -- page-header residue captured along with the script; not Python code.
import warnings
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# Silence every warning category for the remainder of the script.
warnings.filterwarnings("ignore")

# Load and preprocess data
# Read the claims table and discard its "_c39" column.
data = pd.read_csv("dataset/insurance_claims.csv").drop(columns="_c39")
# Treat literal '?' cells as missing values so the null handling below sees them.
data = data.replace('?', np.nan)
# Function to check data
def check_data(data: pd.DataFrame) -> pd.DataFrame:
    """Return a per-column summary of *data*.

    The result has one row per column of ``data`` and these columns:

    * ``type`` -- the column's dtype.
    * ``amount_unique`` -- number of distinct values (``nunique``, which
      leaves out missing values by default).
    * ``unique_values`` -- the array returned by ``Series.unique()``.
    * ``null_values`` -- count of missing entries.
    * ``percentage_null_values(%)`` -- missing entries as a percentage of
      the row count, rounded to two decimals.
    """
    null_counts = data.isna().sum()
    null_percentage = (null_counts / data.shape[0] * 100).round(2)
    return pd.DataFrame({
        'type': data.dtypes,
        'amount_unique': data.nunique(),
        # A plain list is matched to the rows by position, i.e. by column order.
        'unique_values': [data[name].unique() for name in data.columns],
        'null_values': null_counts,
        'percentage_null_values(%)': null_percentage,
    })
# Print the per-column summary, columns with the most missing values first.
print(check_data(data).sort_values("null_values", ascending=False))

# Fill missing values with mode
for column in data.columns:
    modes = data[column].mode()
    if modes.empty:
        # No non-missing value exists in this column, so there is nothing
        # to fill with; leave it untouched instead of raising IndexError.
        continue
    # mode() can return several ties; the first one is used.
    data[column] = data[column].fillna(modes.iloc[0])
# Encode categorical variables
# Every object-dtype column is replaced by integer codes. One encoder
# instance is refit for each column, so after the loop `le` only holds the
# classes of the last column it was fitted on.
le = LabelEncoder()
for col in data.columns:
    if data[col].dtype == 'O':
        data[col] = le.fit_transform(data[col])
# Drop less important columns
to_drop = [
    'policy_number',
    'policy_bind_date',
    'insured_zip',
    'incident_location',
    'auto_year',
    'auto_make',
    'auto_model',
]
data = data.drop(columns=to_drop)
# Correlation heatmap
plt.figure(figsize=(23, 23))
corr_matrix = data.corr()
# Mask the upper triangle, diagonal included, so only the lower triangle
# of the correlation matrix is drawn.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
sns.heatmap(
    corr_matrix.round(2),
    mask=mask,
    vmin=-1,
    vmax=1,
    annot=True,
    cmap='magma',
)
plt.title('Triangle Correlation Heatmap', fontsize=18, pad=16)
plt.show()
# Drop less correlated features
to_drop = [
    'injury_claim',
    'property_claim',
    'vehicle_claim',
    'incident_type',
    'age',
    'incident_hour_of_the_day',
    'insured_occupation',
]
data = data.drop(columns=to_drop)
# Feature importance
# Rank the remaining features with a random forest and keep the ten with
# the highest importance, plus the target column.
# The target is removed by name rather than by position, so the selection
# stays correct even if 'fraud_reported' is not the last column.
X = data.drop(columns='fraud_reported')
Y = data['fraud_reported']
# A fixed seed makes the ranking, and therefore the selected feature set,
# reproducible from run to run.
# NOTE(review): the ranking is fitted on all rows, before the train/test
# split further down -- confirm that this is acceptable.
model = RandomForestClassifier(n_estimators=1000, random_state=42)
model.fit(X, Y)
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
final_feat = feat_importances.nlargest(10).index.tolist()
final_feat.append('fraud_reported')
data_new = data[final_feat]
# Prepare data for modeling
# Work on a copy so the selected-feature frame itself is left unchanged.
df_model = data_new.copy()
X = df_model.drop(columns='fraud_reported')
y = df_model['fraud_reported']
# Hold out 30% of the rows for evaluation; the seed keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=43
)
# Train the final model
final_model = RandomForestClassifier(
    criterion='gini',
    max_depth=5,
    min_samples_leaf=4,
    min_samples_split=10,
    n_estimators=100,
    random_state=42,
    class_weight='balanced',
)
final_model.fit(X_train, y_train)
# Evaluate the model
# Score the held-out rows and report the fraction predicted correctly.
y_pred = final_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy}")
# Save the model
# Make sure the target directory exists before writing into it, so a fresh
# checkout without a "model" folder does not fail at the very last step.
Path('model').mkdir(parents=True, exist_ok=True)
joblib.dump(final_model, 'model/only_model.joblib')
print("Model saved successfully.")