File size: 5,459 Bytes
7e976f4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import gradio as gr
import pandas as pd
import numpy as np
import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from fuzzywuzzy import process

# Enhanced data generation with realistic fraud patterns
def load_data(n=1000, seed=42):
    """Generate a synthetic transaction dataset with injected fraud patterns.

    Args:
        n: Number of transactions to generate (default 1000, matching the
           original hard-coded size).
        seed: NumPy RNG seed for reproducibility (default 42).

    Returns:
        pandas.DataFrame with columns TransactionID, Amount, Type, City,
        Age, Income, and a binary Fraud label (1 = fraudulent).
    """
    np.random.seed(seed)
    cities = ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix']
    incomes = ['Low', 'Medium', 'High']

    data = pd.DataFrame({
        'TransactionID': range(1, n + 1),
        'Amount': np.random.uniform(10, 15000, n).round(2),
        'Type': np.random.choice(['Credit', 'Debit'], n),
        'City': np.random.choice(cities, n),
        'Age': np.random.randint(18, 70, n),
        'Income': np.random.choice(incomes, n, p=[0.4, 0.4, 0.2])
    })

    # Inject deterministic fraud rules so the classifier has learnable
    # structure: large amounts on low incomes, oversized credit charges,
    # and big transactions by young adults in New York.
    data['Fraud'] = 0
    data.loc[
        ((data['Amount'] > 5000) & (data['Income'] == 'Low')) |
        ((data['Type'] == 'Credit') & (data['Amount'] > 8000)) |
        ((data['City'] == 'New York') & (data['Age'].between(20, 35)) & (data['Amount'] > 6000)),
        'Fraud'
    ] = 1

    return data

# ---- Module-level setup: build dataset, fit encoders, train the model ----
data = load_data()

# One LabelEncoder per categorical column, fitted on the full dataset
# (in a real system these would be fitted on the training split only).
le_type = LabelEncoder()
le_city = LabelEncoder()
le_income = LabelEncoder()

for _column, _encoder in (('Type', le_type), ('City', le_city), ('Income', le_income)):
    data[f'{_column}_encoded'] = _encoder.fit_transform(data[_column])

# Feature matrix and target vector for the random forest.
features = ['Amount', 'Type_encoded', 'City_encoded', 'Age', 'Income_encoded']
X = data[features]
y = data['Fraud']

model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X, y)

def process_nl_query(query):
    """Parse a free-text transaction description and return a fraud analysis.

    Extracts amount, transaction type, city, age and income level from the
    query, encodes them with the module-level encoders, and scores the
    transaction with the trained RandomForest model.

    Args:
        query: Natural-language description, e.g.
            "Check a $6000 credit in New York for a 26-year-old with low income".

    Returns:
        A formatted multi-line analysis string, or an "Error: ..." message
        when the amount cannot be extracted or processing fails.
    """
    try:
        # Prefer a $-prefixed number for the amount so a bare age such as
        # "26-year-old" appearing first is not mistaken for the amount;
        # fall back to the first plain number.
        amount_match = (re.search(r'\$\s*(\d+(?:,\d{3})*(?:\.\d{2})?)', query)
                        or re.search(r'(\d+(?:,\d{3})*(?:\.\d{2})?)', query))
        if not amount_match:
            return "Error: Could not extract transaction amount."
        amount = float(amount_match.group(1).replace(',', ''))

        # Transaction type defaults to Debit unless "credit" appears.
        trans_type = 'Credit' if 'credit' in query.lower() else 'Debit'

        # Fuzzy-match a known city anywhere in the query; score <= 70 maps
        # to 'Unknown', which is later encoded with the -1 sentinel.
        cities = ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix']
        city_match = process.extractOne(query, cities)
        city = city_match[0] if city_match and city_match[1] > 70 else 'Unknown'

        # Require a "year(s)/yr(s) [old]" suffix (hyphens allowed) so the
        # age regex cannot grab the transaction amount. The original made
        # the suffix optional, so re.search returned the FIRST number in
        # the query (e.g. 6000 in "$6000 ... 26-year-old").
        age_match = re.search(r'(\d+)[\s-]*(?:years?|yrs?)[\s-]*(?:old)?',
                              query, re.IGNORECASE)
        age = int(age_match.group(1)) if age_match else None

        # Income level defaults to Medium.
        q_lower = query.lower()
        income = 'Low' if 'low' in q_lower else \
                 'High' if 'high' in q_lower else 'Medium'

        # Encode categoricals; labels unseen during fit map to -1.
        city_encoded = le_city.transform([city])[0] if city in le_city.classes_ else -1
        income_encoded = le_income.transform([income])[0] if income in le_income.classes_ else -1

        # Fall back to the dataset's median age when none was found.
        # (Checked against None explicitly: `if age` also discarded age 0.)
        age_value = age if age is not None else float(data['Age'].median())

        input_df = pd.DataFrame({
            'Amount': [amount],
            'Type_encoded': [le_type.transform([trans_type])[0]],
            'City_encoded': [city_encoded],
            'Age': [age_value],
            'Income_encoded': [income_encoded]
        })

        proba = model.predict_proba(input_df)[0][1]
        prediction = model.predict(input_df)[0]

        # Rule-based explanations mirroring the fraud-injection patterns
        # used when the dataset was generated.
        explanation = []
        if amount > 5000 and income == 'Low':
            explanation.append("High amount for low income")
        if amount > 8000 and trans_type == 'Credit':
            explanation.append("Unusually large credit transaction")
        # Guard against a missing age: the original compared None with ints,
        # raising TypeError that the broad except turned into a generic error.
        if city == 'New York' and age is not None and 20 <= age <= 35 and amount > 6000:
            explanation.append("Suspicious pattern for young adults in NYC")

        return (
            f"Transaction Details:\n"
            f"- Amount: ${amount:,.2f}\n"
            f"- Type: {trans_type}\n"
            f"- City: {city}\n"
            f"- Age: {age}\n"
            f"- Income Level: {income}\n\n"
            f"Fraud Analysis:\n"
            f"- Prediction: {'Potentially Fraudulent' if prediction else 'Likely Legitimate'}\n"
            f"- Confidence: {proba*100:.1f}%\n"
            f"- Risk Factors: {', '.join(explanation) if explanation else 'No specific risk factors identified'}"
        )

    except Exception as e:
        return f"Error processing query: {str(e)}"

# ---- Gradio interface ----
with gr.Blocks() as demo:
    gr.Markdown("## Enhanced Fraud Detection System")

    with gr.Tab("Natural Language Query"):
        gr.Markdown("**Example:** 'Check a $6000 credit in New York for a 26-year-old with low income'")
        query_box = gr.Textbox(label="Enter your transaction query:")
        result_box = gr.Textbox(label="Fraud Analysis", lines=10)
        gr.Examples(
            examples=[
                "Is a $8000 credit in Chicago for a 45-year-old medium income safe?",
                "Verify a $300 debit in Phoenix for a 60-year-old high income client",
            ],
            inputs=query_box,
        )
        # Run the analysis when the user presses Enter in the query box.
        query_box.submit(fn=process_nl_query, inputs=query_box, outputs=result_box)

    with gr.Tab("Data Insights"):
        gr.Markdown("### Fraud Pattern Analysis")
        # Summary statistics over the fraudulent transactions only.
        gr.DataFrame(data[data['Fraud'] == 1].describe())

demo.launch()