File size: 5,459 Bytes
7e976f4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import gradio as gr
import pandas as pd
import numpy as np
import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from fuzzywuzzy import process

# Enhanced data generation with realistic fraud patterns
def load_data(n=1000, seed=42):
    """Generate a synthetic transaction dataset with injected fraud patterns.

    Args:
        n: Number of transactions to generate (default 1000, matching the
           original hard-coded size).
        seed: NumPy RNG seed for reproducibility (default 42).

    Returns:
        pandas.DataFrame with columns TransactionID, Amount, Type, City,
        Age, Income, and a binary Fraud label (1 = fraudulent).
    """
    np.random.seed(seed)
    cities = ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix']
    incomes = ['Low', 'Medium', 'High']

    data = pd.DataFrame({
        'TransactionID': range(1, n + 1),
        'Amount': np.random.uniform(10, 15000, n).round(2),
        'Type': np.random.choice(['Credit', 'Debit'], n),
        'City': np.random.choice(cities, n),
        'Age': np.random.randint(18, 70, n),
        'Income': np.random.choice(incomes, n, p=[0.4, 0.4, 0.2])
    })

    # Inject deterministic fraud rules so the classifier has learnable
    # structure: large amounts on low incomes, oversized credit charges,
    # and big transactions by young adults in New York.
    data['Fraud'] = 0
    data.loc[
        ((data['Amount'] > 5000) & (data['Income'] == 'Low')) |
        ((data['Type'] == 'Credit') & (data['Amount'] > 8000)) |
        ((data['City'] == 'New York') & (data['Age'].between(20, 35)) & (data['Amount'] > 6000)),
        'Fraud'
    ] = 1

    return data

# ---- Module-level setup: build dataset, fit encoders, train the model ----
data = load_data()

# One LabelEncoder per categorical column, fitted on the full dataset
# (in a real system these would be fitted on the training split only).
le_type = LabelEncoder()
le_city = LabelEncoder()
le_income = LabelEncoder()

for _column, _encoder in (('Type', le_type), ('City', le_city), ('Income', le_income)):
    data[f'{_column}_encoded'] = _encoder.fit_transform(data[_column])

# Feature matrix and target vector for the random forest.
features = ['Amount', 'Type_encoded', 'City_encoded', 'Age', 'Income_encoded']
X = data[features]
y = data['Fraud']

model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X, y)

def process_nl_query(query):
    """Parse a free-text transaction description and return a fraud analysis.

    Extracts amount, transaction type, city, age and income level from the
    query, encodes them with the module-level encoders, and scores the
    transaction with the trained RandomForest model.

    Args:
        query: Natural-language description, e.g.
            "Check a $6000 credit in New York for a 26-year-old with low income".

    Returns:
        A formatted multi-line analysis string, or an "Error: ..." message
        when the amount cannot be extracted or processing fails.
    """
    try:
        # Prefer a $-prefixed number for the amount so a bare age such as
        # "26-year-old" appearing first is not mistaken for the amount;
        # fall back to the first plain number.
        amount_match = (re.search(r'\$\s*(\d+(?:,\d{3})*(?:\.\d{2})?)', query)
                        or re.search(r'(\d+(?:,\d{3})*(?:\.\d{2})?)', query))
        if not amount_match:
            return "Error: Could not extract transaction amount."
        amount = float(amount_match.group(1).replace(',', ''))

        # Transaction type defaults to Debit unless "credit" appears.
        trans_type = 'Credit' if 'credit' in query.lower() else 'Debit'

        # Fuzzy-match a known city anywhere in the query; score <= 70 maps
        # to 'Unknown', which is later encoded with the -1 sentinel.
        cities = ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix']
        city_match = process.extractOne(query, cities)
        city = city_match[0] if city_match and city_match[1] > 70 else 'Unknown'

        # Require a "year(s)/yr(s) [old]" suffix (hyphens allowed) so the
        # age regex cannot grab the transaction amount. The original made
        # the suffix optional, so re.search returned the FIRST number in
        # the query (e.g. 6000 in "$6000 ... 26-year-old").
        age_match = re.search(r'(\d+)[\s-]*(?:years?|yrs?)[\s-]*(?:old)?',
                              query, re.IGNORECASE)
        age = int(age_match.group(1)) if age_match else None

        # Income level defaults to Medium.
        q_lower = query.lower()
        income = 'Low' if 'low' in q_lower else \
                 'High' if 'high' in q_lower else 'Medium'

        # Encode categoricals; labels unseen during fit map to -1.
        city_encoded = le_city.transform([city])[0] if city in le_city.classes_ else -1
        income_encoded = le_income.transform([income])[0] if income in le_income.classes_ else -1

        # Fall back to the dataset's median age when none was found.
        # (Checked against None explicitly: `if age` also discarded age 0.)
        age_value = age if age is not None else float(data['Age'].median())

        input_df = pd.DataFrame({
            'Amount': [amount],
            'Type_encoded': [le_type.transform([trans_type])[0]],
            'City_encoded': [city_encoded],
            'Age': [age_value],
            'Income_encoded': [income_encoded]
        })

        proba = model.predict_proba(input_df)[0][1]
        prediction = model.predict(input_df)[0]

        # Rule-based explanations mirroring the fraud-injection patterns
        # used when the dataset was generated.
        explanation = []
        if amount > 5000 and income == 'Low':
            explanation.append("High amount for low income")
        if amount > 8000 and trans_type == 'Credit':
            explanation.append("Unusually large credit transaction")
        # Guard against a missing age: the original compared None with ints,
        # raising TypeError that the broad except turned into a generic error.
        if city == 'New York' and age is not None and 20 <= age <= 35 and amount > 6000:
            explanation.append("Suspicious pattern for young adults in NYC")

        return (
            f"Transaction Details:\n"
            f"- Amount: ${amount:,.2f}\n"
            f"- Type: {trans_type}\n"
            f"- City: {city}\n"
            f"- Age: {age}\n"
            f"- Income Level: {income}\n\n"
            f"Fraud Analysis:\n"
            f"- Prediction: {'Potentially Fraudulent' if prediction else 'Likely Legitimate'}\n"
            f"- Confidence: {proba*100:.1f}%\n"
            f"- Risk Factors: {', '.join(explanation) if explanation else 'No specific risk factors identified'}"
        )

    except Exception as e:
        return f"Error processing query: {str(e)}"

# ---- Gradio interface ----
with gr.Blocks() as demo:
    gr.Markdown("## Enhanced Fraud Detection System")

    with gr.Tab("Natural Language Query"):
        gr.Markdown("**Example:** 'Check a $6000 credit in New York for a 26-year-old with low income'")
        query_box = gr.Textbox(label="Enter your transaction query:")
        result_box = gr.Textbox(label="Fraud Analysis", lines=10)
        gr.Examples(
            examples=[
                "Is a $8000 credit in Chicago for a 45-year-old medium income safe?",
                "Verify a $300 debit in Phoenix for a 60-year-old high income client",
            ],
            inputs=query_box,
        )
        # Run the analysis when the user presses Enter in the query box.
        query_box.submit(fn=process_nl_query, inputs=query_box, outputs=result_box)

    with gr.Tab("Data Insights"):
        gr.Markdown("### Fraud Pattern Analysis")
        # Summary statistics over the fraudulent transactions only.
        gr.DataFrame(data[data['Fraud'] == 1].describe())

demo.launch()