# File size: 15,944 Bytes
# a47b6e9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('Agg')  # Use non-interactive backend
import matplotlib.pyplot as plt
import seaborn as sns
import os
import joblib
from src.models.loan_recovery_model import LoanRecoveryModel
from src.utils.data_generator import generate_loan_data
from src.preprocessing.data_processor import LoanDataProcessor

# Set page configuration: browser-tab title and icon, wide layout, and a
# sidebar that starts expanded.
# NOTE(review): Streamlit has traditionally required set_page_config to be
# the first Streamlit command executed — keep this above the rest of the app
# unless the installed version is confirmed to allow otherwise.
st.set_page_config(
    page_title="Smart Loan Recovery System",
    page_icon="💰",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Define functions
@st.cache_data
def load_sample_data():
    """Return the sample loan dataset, generating it on first use.

    Reads ``data/loan_data.csv`` when that path exists. Otherwise 1000 rows
    are produced with ``generate_loan_data``, written to that path (creating
    the ``data`` directory if needed) and returned.

    Returns:
        The dataset read from disk, or the freshly generated one.
    """
    csv_file = "data/loan_data.csv"

    # Fast path: a dataset is already on disk.
    if os.path.exists(csv_file):
        return pd.read_csv(csv_file)

    # Nothing on disk yet: generate, persist for later runs, then return.
    generated = generate_loan_data(n_samples=1000)
    os.makedirs("data", exist_ok=True)
    generated.to_csv(csv_file, index=False)
    return generated

@st.cache_resource
def load_model(model_type="random_forest"):
    """Load the persisted recovery model, training one first if it is absent.

    Args:
        model_type: Model flavour; it selects the file
            ``models/loan_recovery_{model_type}.pkl``.

    Returns:
        Whatever ``LoanRecoveryModel.load_model`` returns for that path.
    """
    pickle_path = f"models/loan_recovery_{model_type}.pkl"

    model_is_missing = not os.path.exists(pickle_path)
    if model_is_missing:
        st.info(f"Model not found. Training a new {model_type} model...")
        # Imported lazily so the training module is only loaded when needed.
        # Presumably this writes the model to ``pickle_path`` — verify
        # against src.train_model.
        from src.train_model import train_and_save_model
        train_and_save_model(model_type=model_type)

    loaded_model = LoanRecoveryModel.load_model(pickle_path)
    return loaded_model

def predict_recovery(model, data):
    """Run the model on ``data`` and return its predictions unchanged.

    Args:
        model: Any object exposing ``predict(data)``.
        data: Input passed straight through to ``model.predict``.

    Returns:
        The value returned by ``model.predict(data)``; callers in this file
        treat it as per-row recovery probabilities.
    """
    return model.predict(data)

def plot_recovery_distribution(data):
    """Plot how many loans fall into each recovery status.

    Args:
        data: DataFrame with a ``recovery_status`` column. Values 0 and 1 are
            labelled "Not Recovered" and "Recovered"; any other value is
            labelled with its string form.

    Returns:
        The matplotlib figure containing the bar chart, annotated with the
        count above each bar and its share of all rows inside the bar.
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    # value_counts() orders by frequency, not by status value, so sort by
    # status and derive each label from its own key. This keeps every bar
    # paired with the right name and also copes with a single-class dataset.
    recovery_counts = data['recovery_status'].value_counts().sort_index()
    status_names = {0: 'Not Recovered', 1: 'Recovered'}
    labels = [status_names.get(status, str(status)) for status in recovery_counts.index]

    ax.bar(labels, recovery_counts.values)
    ax.set_ylabel('Count')
    ax.set_title('Distribution of Loan Recovery Status')

    # total is only used inside the loop, which does not run for empty data,
    # so there is no division by zero.
    total = len(data)
    for i, v in enumerate(recovery_counts.values):
        # Absolute count just above the bar.
        ax.text(i, v + 5, str(v), ha='center')
        # Percentage of all rows, centred inside the bar.
        percentage = v / total * 100
        ax.text(i, v/2, f"{percentage:.1f}%", ha='center', color='white', fontweight='bold')

    return fig

def plot_feature_importance(model):
    """Return the model's own feature-importance plot for its top 10 features.

    Args:
        model: Object exposing ``plot_feature_importance(top_n=...)``.

    Returns:
        Whatever ``model.plot_feature_importance`` returns.
    """
    features_shown = 10
    return model.plot_feature_importance(top_n=features_shown)

def plot_recovery_by_feature(data, feature, is_categorical=False):
    """Plot the recovery rate (in percent) for each value or bin of a feature.

    Args:
        data: DataFrame containing ``feature`` and ``recovery_status``. The
            per-group mean of ``recovery_status`` is shown as a rate, so the
            column is presumably 0/1 — verify against the data generator.
            The frame is not modified.
        feature: Name of the column to group by.
        is_categorical: When True, one bar per distinct value, ordered by
            ascending recovery rate. When False, the feature is grouped by
            its raw values (for a few low-cardinality columns) or by up to
            five quantile bins.

    Returns:
        The matplotlib figure containing the bar chart; every bar carries an
        ``n=<rows>`` annotation.
    """
    # Numeric columns whose raw values are used as groups instead of bins.
    discrete_features = ('age', 'loan_term', 'previous_defaults', 'days_past_due')
    pretty_name = feature.replace("_", " ").title()

    fig, ax = plt.subplots(figsize=(10, 6))

    if is_categorical:
        # For categorical features
        recovery_by_feature = data.groupby(feature)['recovery_status'].mean().sort_values()
        # Bars are ordered by rate, so the counts must follow that same order
        # or the "n=" annotations end up on the wrong bars.
        counts = data.groupby(feature).size().reindex(recovery_by_feature.index)

        # String labels force a categorical axis, so bars sit at 0..k-1 and
        # line up with the enumerate() positions used for the annotations.
        category_labels = [str(category) for category in recovery_by_feature.index]
        ax.bar(category_labels, recovery_by_feature.values * 100)
        ax.set_ylabel('Recovery Rate (%)')
        ax.set_title(f'Recovery Rate by {pretty_name}')
        ax.set_ylim(0, 100)

        # Add count labels
        for i, count in enumerate(counts.values):
            ax.text(i, 5, f"n={count}", ha='center', color='white', fontweight='bold')

        # Rotate x-axis labels if needed
        if len(recovery_by_feature) > 5:
            for tick_label in ax.get_xticklabels():
                tick_label.set_rotation(45)
                tick_label.set_ha('right')
    else:
        # For numerical features, create bins
        if feature in discrete_features:
            # These features have a small range, so we can use them directly
            bins = data[feature]
        else:
            # Create bins for continuous features
            bins = pd.qcut(data[feature], 5, duplicates='drop')

        # Group on a small private frame so the caller's DataFrame does not
        # gain a stray 'feature_bin' column.
        binned = data[['recovery_status']].copy()
        binned['feature_bin'] = bins

        # observed=True keeps behaviour the same across pandas versions for
        # the categorical bins produced by qcut.
        grouped = binned.groupby('feature_bin', observed=True)['recovery_status']

        # Calculate recovery rate by bin
        recovery_by_bin = grouped.mean().sort_index()
        counts = grouped.size().sort_index()

        # Create a bar plot
        positions = range(len(recovery_by_bin))
        ax.bar(positions, recovery_by_bin.values * 100)
        ax.set_ylabel('Recovery Rate (%)')
        ax.set_title(f'Recovery Rate by {pretty_name}')
        ax.set_ylim(0, 100)

        # Set x-axis labels
        ax.set_xticks(positions)
        if feature in discrete_features:
            ax.set_xticklabels(recovery_by_bin.index)
        else:
            # Format bin labels: intervals become "left-right"; anything else
            # falls back to its string form.
            bin_labels = [
                f"{bin_range.left:.1f}-{bin_range.right:.1f}"
                if hasattr(bin_range, 'left') and hasattr(bin_range, 'right')
                else str(bin_range)
                for bin_range in recovery_by_bin.index
            ]
            ax.set_xticklabels(bin_labels)
            for tick_label in ax.get_xticklabels():
                tick_label.set_rotation(45)
                tick_label.set_ha('right')

        # Add count labels
        for i, count in enumerate(counts.values):
            ax.text(i, 5, f"n={count}", ha='center', color='white', fontweight='bold')

        # Add feature name to x-axis
        ax.set_xlabel(pretty_name)

    # Lay out this figure explicitly rather than pyplot's "current" one.
    fig.tight_layout()
    return fig

# Main application
def main():
    """Render the Streamlit loan-recovery prediction page.

    Loads the sample dataset and the random-forest model, then shows one of
    two workflows chosen with a radio button:

    * Single Loan - a form whose values (plus the derived monthly payment and
      debt-to-income ratio) are scored on demand, followed by a gauge chart,
      a recommendation and a list of rule-based risk factors.
    * Batch Prediction - a CSV upload that is validated, completed with the
      derived columns when they are missing, scored, summarised and offered
      for download.

    Every matplotlib figure is closed after it has been rendered so repeated
    script reruns do not accumulate open figures.
    """
    # Header
    st.title("Smart Loan Recovery System")
    st.image("https://img.icons8.com/color/96/000000/loan.png", width=100)

    # Load data and model
    data = load_sample_data()

    # Load Random Forest model only
    model = load_model("random_forest")

    # Prediction page
    st.title("Predict Loan Recovery")

    st.write("""
    Use this tool to predict the probability of recovering a loan based on customer and loan information.
    You can either:
    1. Enter information for a single loan
    2. Upload a CSV file with multiple loans
    """)

    prediction_type = st.radio("Prediction Type", ["Single Loan", "Batch Prediction"])

    if prediction_type == "Single Loan":
        st.subheader("Enter Loan Information")

        col1, col2, col3 = st.columns(3)

        with col1:
            age = st.number_input("Age", min_value=18, max_value=100, value=35)
            gender = st.selectbox("Gender", ["Male", "Female"])
            employment_status = st.selectbox(
                "Employment Status",
                ["Employed", "Self-employed", "Unemployed", "Retired"]
            )
            annual_income = st.number_input("Annual Income ($)", min_value=0, value=60000)

        with col2:
            credit_score = st.slider("Credit Score", 300, 850, 650)
            loan_amount = st.number_input("Loan Amount ($)", min_value=1000, value=20000)
            interest_rate = st.slider("Interest Rate (%)", 1.0, 25.0, 8.0, 0.1)
            loan_term = st.selectbox("Loan Term (months)", [12, 24, 36, 48, 60])

        with col3:
            payment_history = st.selectbox(
                "Payment History",
                ["Excellent", "Good", "Fair", "Poor", "Very Poor"]
            )
            days_past_due = st.number_input("Days Past Due", min_value=0, value=0)
            previous_defaults = st.number_input("Previous Defaults", min_value=0, max_value=10, value=0)

        # Calculate derived features with the fixed-payment amortisation
        # formula. The slider's minimum of 1.0% keeps the denominator
        # non-zero here.
        monthly_payment = (loan_amount * (interest_rate/100/12) *
                          (1 + interest_rate/100/12)**(loan_term)) / \
                          ((1 + interest_rate/100/12)**(loan_term) - 1)

        # max(1, ...) avoids dividing by a zero income.
        debt_to_income = (monthly_payment * 12) / max(1, annual_income)

        # Display calculated values
        st.subheader("Calculated Values")
        col1, col2 = st.columns(2)
        with col1:
            st.metric("Monthly Payment", f"${monthly_payment:.2f}")
        with col2:
            st.metric("Debt-to-Income Ratio", f"{debt_to_income*100:.2f}%")

        # Create input dataframe
        input_data = pd.DataFrame({
            'age': [age],
            'gender': [gender],
            'employment_status': [employment_status],
            'annual_income': [annual_income],
            'credit_score': [credit_score],
            'loan_amount': [loan_amount],
            'interest_rate': [interest_rate],
            'loan_term': [loan_term],
            'payment_history': [payment_history],
            'days_past_due': [days_past_due],
            'previous_defaults': [previous_defaults],
            'monthly_payment': [monthly_payment],
            'debt_to_income': [debt_to_income]
        })

        # Make prediction
        if st.button("Predict Recovery Probability"):
            with st.spinner("Calculating recovery probability..."):
                recovery_prob = predict_recovery(model, input_data)[0]

                # Display result
                st.subheader("Prediction Result")

                # Create gauge chart for probability: a grey 0-100% track
                # with a bar that is green at >= 50% and red below it.
                fig, ax = plt.subplots(figsize=(10, 2))
                ax.barh([0], [100], color='lightgray', height=0.5)
                ax.barh([0], [recovery_prob * 100], color='green' if recovery_prob >= 0.5 else 'red', height=0.5)
                ax.set_xlim(0, 100)
                ax.set_yticks([])
                ax.set_xticks([0, 25, 50, 75, 100])
                ax.set_xticklabels(['0%', '25%', '50%', '75%', '100%'])
                ax.axvline(50, color='gray', linestyle='--', alpha=0.5)
                ax.text(recovery_prob * 100, 0, f"{recovery_prob*100:.1f}%",
                        ha='center', va='center', fontweight='bold', color='black')

                st.pyplot(fig)
                # pyplot keeps every figure alive until it is closed; release
                # it so reruns do not pile up open figures.
                plt.close(fig)

                # Recommendation, by probability band:
                # >= 0.8, >= 0.5, >= 0.3, otherwise.
                st.subheader("Recovery Assessment")
                if recovery_prob >= 0.8:
                    st.success("High probability of recovery. Standard collection procedures recommended.")
                elif recovery_prob >= 0.5:
                    st.info("Moderate probability of recovery. Consider offering a payment plan.")
                elif recovery_prob >= 0.3:
                    st.warning("Low probability of recovery. Consider debt restructuring or settlement offers.")
                else:
                    st.error("Very low probability of recovery. Consider debt write-off or third-party collection.")

                # Risk factors: fixed rules on the entered values,
                # independent of the model output.
                st.subheader("Key Risk Factors")
                risk_factors = []

                if credit_score < 600:
                    risk_factors.append("Low credit score")
                if days_past_due > 30:
                    risk_factors.append("Significant payment delay")
                if previous_defaults > 0:
                    risk_factors.append("History of defaults")
                if debt_to_income > 0.4:
                    risk_factors.append("High debt-to-income ratio")
                if payment_history in ["Poor", "Very Poor"]:
                    risk_factors.append("Poor payment history")

                if risk_factors:
                    for factor in risk_factors:
                        st.write(f"• {factor}")
                else:
                    st.write("No significant risk factors identified.")

    else:  # Batch prediction
        st.subheader("Upload CSV File")
        st.write("""
        Upload a CSV file with loan information. The file should contain the following columns:
        age, gender, employment_status, annual_income, credit_score, loan_amount, interest_rate,
        loan_term, payment_history, days_past_due, previous_defaults
        """)

        # Sample file download. Cap the sample size so a dataset with fewer
        # than five rows does not make DataFrame.sample raise.
        sample_data = data.sample(min(5, len(data))).drop(['customer_id', 'recovery_status'], axis=1, errors='ignore')

        @st.cache_data
        def convert_df_to_csv(df):
            return df.to_csv(index=False).encode('utf-8')

        csv = convert_df_to_csv(sample_data)
        st.download_button(
            "Download Sample CSV",
            csv,
            "sample_loans.csv",
            "text/csv",
            key='download-csv'
        )

        # File upload
        uploaded_file = st.file_uploader("Choose a CSV file", type="csv")

        if uploaded_file is not None:
            # Load and display the data. A malformed or empty upload is
            # reported to the user instead of crashing the page.
            try:
                batch_data = pd.read_csv(uploaded_file)
            except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
                st.error(f"Could not read the uploaded file as CSV: {exc}")
                return

            st.write("Preview of uploaded data:")
            st.dataframe(batch_data.head())

            # Check for required columns
            required_cols = ['age', 'gender', 'employment_status', 'annual_income',
                            'credit_score', 'loan_amount', 'interest_rate',
                            'loan_term', 'payment_history', 'days_past_due',
                            'previous_defaults']

            missing_cols = [col for col in required_cols if col not in batch_data.columns]

            if missing_cols:
                st.error(f"Missing required columns: {', '.join(missing_cols)}")
            elif batch_data.empty:
                # With no rows there is nothing to score, and the summary
                # below would divide by zero.
                st.warning("The uploaded file contains no loan rows to score.")
            else:
                # Calculate derived features if not present
                if 'monthly_payment' not in batch_data.columns:
                    monthly_rate = batch_data['interest_rate']/100/12
                    growth = (1 + monthly_rate)**(batch_data['loan_term'])
                    payment = (
                        batch_data['loan_amount'] * monthly_rate * growth
                    ) / (
                        growth - 1
                    )
                    # For a 0% loan the formula is 0/0 (NaN); such a loan is
                    # repaid as principal spread evenly over the term.
                    zero_rate = monthly_rate == 0
                    payment.loc[zero_rate] = (
                        batch_data['loan_amount'] / batch_data['loan_term']
                    ).loc[zero_rate]
                    batch_data['monthly_payment'] = payment

                if 'debt_to_income' not in batch_data.columns:
                    # replace(0, 1) avoids dividing by a zero income.
                    batch_data['debt_to_income'] = (batch_data['monthly_payment'] * 12) / batch_data['annual_income'].replace(0, 1)

                # Make predictions
                # NOTE(review): the results, including the download button,
                # live inside this st.button branch, so the rerun triggered by
                # a later widget click will presumably hide them — confirm,
                # and consider keeping results in st.session_state.
                if st.button("Run Batch Prediction"):
                    with st.spinner("Processing batch predictions..."):
                        # Make predictions. asarray tolerates a list-like
                        # return value in the threshold comparison below.
                        recovery_probs = np.asarray(predict_recovery(model, batch_data))

                        # Add predictions to the dataframe
                        batch_data['recovery_probability'] = recovery_probs
                        batch_data['recovery_prediction'] = (recovery_probs >= 0.5).astype(int)

                        # Display results
                        st.subheader("Prediction Results")
                        st.dataframe(batch_data)

                        # Summary statistics
                        st.subheader("Summary")
                        avg_prob = batch_data['recovery_probability'].mean() * 100
                        predicted_recoveries = batch_data['recovery_prediction'].sum()
                        recovery_rate = predicted_recoveries / len(batch_data) * 100

                        col1, col2 = st.columns(2)
                        with col1:
                            st.metric("Average Recovery Probability", f"{avg_prob:.2f}%")
                        with col2:
                            st.metric("Predicted Recovery Rate", f"{recovery_rate:.2f}% ({predicted_recoveries}/{len(batch_data)})")

                        # Distribution of probabilities
                        st.subheader("Distribution of Recovery Probabilities")
                        fig, ax = plt.subplots(figsize=(10, 6))
                        sns.histplot(batch_data['recovery_probability'], bins=20, kde=True, ax=ax)
                        ax.set_xlabel("Recovery Probability")
                        ax.set_ylabel("Count")
                        ax.axvline(0.5, color='red', linestyle='--')
                        ax.text(0.5, ax.get_ylim()[1]*0.9, "Decision Threshold",
                                rotation=90, va='top', ha='right', color='red')
                        st.pyplot(fig)
                        # Release the figure once it has been rendered.
                        plt.close(fig)

                        # Download results
                        csv = convert_df_to_csv(batch_data)
                        st.download_button(
                            "Download Results CSV",
                            csv,
                            "loan_recovery_predictions.csv",
                            "text/csv",
                            key='download-results'
                        )



# Build the page only when this file is executed as the main module, so that
# importing it (for example to reuse the plotting helpers) has no UI effects
# beyond the module-level page configuration.
if __name__ == "__main__":
    main()