Spaces:
Paused
Paused
File size: 5,459 Bytes
7e976f4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 |
import gradio as gr
import pandas as pd
import numpy as np
import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from fuzzywuzzy import process
# Enhanced data generation with realistic fraud patterns
def load_data():
    """Build a synthetic 1000-row transaction dataset with injected fraud labels.

    The random seed is fixed so the dataset is reproducible. A row is marked
    fraudulent when it matches any of three hand-crafted risk patterns.

    Returns:
        pd.DataFrame with columns TransactionID, Amount, Type, City, Age,
        Income, and the binary label Fraud.
    """
    np.random.seed(42)
    city_pool = ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix']
    age_groups = ['18-25', '26-35', '36-45', '46-55', '56+']  # NOTE(review): currently unused
    income_pool = ['Low', 'Medium', 'High']
    n_rows = 1000
    frame = pd.DataFrame({
        'TransactionID': range(1, n_rows + 1),
        'Amount': np.random.uniform(10, 15000, n_rows).round(2),
        'Type': np.random.choice(['Credit', 'Debit'], n_rows),
        'City': np.random.choice(city_pool, n_rows),
        'Age': np.random.randint(18, 70, n_rows),
        'Income': np.random.choice(income_pool, n_rows, p=[0.4, 0.4, 0.2]),
    })
    # A transaction is fraudulent if it trips ANY of the risk patterns below.
    risky = (
        ((frame['Amount'] > 5000) & (frame['Income'] == 'Low'))
        | ((frame['Type'] == 'Credit') & (frame['Amount'] > 8000))
        | ((frame['City'] == 'New York')
           & frame['Age'].between(20, 35)
           & (frame['Amount'] > 6000))
    )
    frame['Fraud'] = risky.astype(int)
    return frame
# Materialize the dataset once at import time; all globals below derive from it.
data = load_data()

# One LabelEncoder per categorical column so the class vocabularies never collide.
le_type = LabelEncoder()
le_city = LabelEncoder()
le_income = LabelEncoder()

# Fit each encoder on its full column and store the numeric codes alongside
# the originals (in a real pipeline, fit on training data only).
for encoder, column in ((le_type, 'Type'), (le_city, 'City'), (le_income, 'Income')):
    data[column + '_encoded'] = encoder.fit_transform(data[column])

# Train a random forest on the numeric feature matrix; the column order in
# `features` is the contract every prediction input must follow.
features = ['Amount', 'Type_encoded', 'City_encoded', 'Age', 'Income_encoded']
X = data[features]
y = data['Fraud']
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X, y)
def process_nl_query(query):
    """Parse a free-text transaction description and return a fraud analysis.

    Extracts amount, transaction type, city, age, and income level from
    *query*, encodes them with the module-level encoders, and scores the
    transaction with the trained random-forest model.

    Args:
        query: Natural-language transaction description, e.g.
            "Check a $6000 credit in New York for a 26-year-old with low income".

    Returns:
        A formatted multi-line report string, or an "Error: ..." message if
        the query could not be parsed or scored.
    """
    try:
        # --- Amount: prefer an explicit $-prefixed figure; otherwise take the
        # first standalone number that is NOT part of an age phrase.
        # (BUG FIX: the old pattern grabbed whichever number came first, so an
        # age mentioned before the amount was misread as the amount.)
        amount_match = re.search(r'\$\s*(\d+(?:,\d{3})*(?:\.\d{2})?)', query)
        if not amount_match:
            amount_match = re.search(
                r'(\d+(?:,\d{3})*(?:\.\d{2})?)\b(?![-\s]*(?:years?|yrs?)\b)',
                query)
        if not amount_match:
            return "Error: Could not extract transaction amount."
        amount = float(amount_match.group(1).replace(',', ''))

        # --- Transaction type: default to Debit unless "credit" appears.
        trans_type = 'Credit' if 'credit' in query.lower() else 'Debit'

        # --- City: fuzzy match against the known cities; below a score of 70
        # we report 'Unknown' rather than guessing (encodes to -1 below).
        cities = ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix']
        city_match = process.extractOne(query, cities)
        city = city_match[0] if city_match and city_match[1] > 70 else 'Unknown'

        # --- Age: require a "year(s)/yr(s)" context such as "26-year-old" so
        # the dollar amount is never mistaken for an age.
        # (BUG FIX: the old pattern made the unit words optional, so it simply
        # matched the first number in the query — usually the amount.)
        age_match = re.search(r'(\d+)[-\s]*(?:years?|yrs?)\b[-\s]*(?:old)?',
                              query, re.IGNORECASE)
        age = int(age_match.group(1)) if age_match else None

        # --- Income level: 'low'/'high' keywords, defaulting to Medium.
        income = 'Low' if 'low' in query.lower() else \
                 'High' if 'high' in query.lower() else 'Medium'

        # Encode categoricals; -1 flags a label the encoders never saw.
        city_encoded = le_city.transform([city])[0] if city in le_city.classes_ else -1
        income_encoded = le_income.transform([income])[0] if income in le_income.classes_ else -1

        # Fall back to the training median when no age was found.
        # (BUG FIX: the old code put a scalar Series into the DataFrame dict
        # and later compared None against ints, raising TypeError.)
        age_value = age if age is not None else int(data['Age'].median())

        # Column order must match the `features` list used at training time;
        # every value is wrapped in a one-element list for a single-row frame.
        input_df = pd.DataFrame({
            'Amount': [amount],
            'Type_encoded': [le_type.transform([trans_type])[0]],
            'City_encoded': [city_encoded],
            'Age': [age_value],
            'Income_encoded': [income_encoded]
        })

        # Score the transaction: class-1 probability and the hard prediction.
        proba = model.predict_proba(input_df)[0][1]
        prediction = model.predict(input_df)[0]

        # Mirror the hand-crafted fraud rules for a human-readable explanation.
        explanation = []
        if amount > 5000 and income == 'Low':
            explanation.append("High amount for low income")
        if amount > 8000 and trans_type == 'Credit':
            explanation.append("Unusually large credit transaction")
        if city == 'New York' and age is not None and 20 <= age <= 35 and amount > 6000:
            explanation.append("Suspicious pattern for young adults in NYC")

        age_display = age if age is not None else f"{age_value} (median, not found in query)"
        return (
            f"Transaction Details:\n"
            f"- Amount: ${amount:,.2f}\n"
            f"- Type: {trans_type}\n"
            f"- City: {city}\n"
            f"- Age: {age_display}\n"
            f"- Income Level: {income}\n\n"
            f"Fraud Analysis:\n"
            f"- Prediction: {'Potentially Fraudulent' if prediction else 'Likely Legitimate'}\n"
            f"- Confidence: {proba*100:.1f}%\n"
            f"- Risk Factors: {', '.join(explanation) if explanation else 'No specific risk factors identified'}"
        )
    except Exception as e:
        # Surface parse/model errors to the UI instead of crashing Gradio.
        return f"Error processing query: {str(e)}"
# Gradio Interface: two tabs — interactive NL queries and static fraud stats.
with gr.Blocks() as demo:
    gr.Markdown("## Enhanced Fraud Detection System")

    # Tab 1: free-text transaction queries routed through process_nl_query.
    with gr.Tab("Natural Language Query"):
        gr.Markdown("**Example:** 'Check a $6000 credit in New York for a 26-year-old with low income'")
        nl_input = gr.Textbox(label="Enter your transaction query:")
        nl_output = gr.Textbox(label="Fraud Analysis", lines=10)
        # Clickable example queries that populate the input textbox.
        gr.Examples(
            examples=[
                "Is a $8000 credit in Chicago for a 45-year-old medium income safe?",
                "Verify a $300 debit in Phoenix for a 60-year-old high income client"
            ],
            inputs=nl_input
        )
        # Pressing Enter in the textbox triggers the fraud analysis.
        nl_input.submit(fn=process_nl_query, inputs=nl_input, outputs=nl_output)

    # Tab 2: summary statistics of the rows labelled fraudulent in the
    # synthetic dataset (static snapshot computed at startup).
    with gr.Tab("Data Insights"):
        gr.Markdown("### Fraud Pattern Analysis")
        gr.DataFrame(data[data['Fraud'] == 1].describe())

demo.launch()
|