Spaces:

sanabanu31
/

email_classifier

Running

File size: 3,334 Bytes

6b21b32
bf70aa2
 
 
9bd35d7
bf70aa2
11184ec
cea309f
 
 
6b21b32
11184ec
 
cea309f
 
e431852
bf70aa2
 
e431852
9bd35d7
 
e431852
9bd35d7
 
 
 
 
 
 
 
e431852
 
 
 
 
11184ec
e431852
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9bd35d7
 
 
 
 
 
 
 
 
 
 
 
e431852
9bd35d7
 
 
 
 
 
 
 
 
e431852
9bd35d7
e431852
9bd35d7
 
e431852
 
 
 
9bd35d7
e431852
bf70aa2
e431852
 
 
bf70aa2
e431852
bf70aa2
 
11184ec
e431852
 
 
 
 
 
 
 
9b81b0a
e431852
 
bf70aa2
11184ec
 
 
 
bf70aa2
6b21b32
e431852
6b21b32

from fastapi import FastAPI
from pydantic import BaseModel
import joblib
import re
from transformers import pipeline

# Application object for the email classification service. The interactive
# API documentation pages are mounted at /docs and /redoc.
app = FastAPI(
    docs_url="/docs",
    redoc_url="/redoc",
    title="Email Classification API",
    description=(
        "Classifies support emails into categories and masks personal information."
    ),
    version="1.0.0",
)

# Load the pre-trained email classifier once, at import time.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only deploy a model.joblib that comes from a trusted source.
model = joblib.load("model.joblib")

# Initialize the multilingual NER pipeline. `aggregation_strategy='simple'`
# is the supported replacement for the deprecated `grouped_entities=True`
# flag: the tokens of one entity are merged into a single result that
# carries an 'entity_group' key.
ner = pipeline(
    'ner',
    model='Davlan/xlm-roberta-base-ner-hrl',
    aggregation_strategy='simple',
)

# --- Regular expressions used to spot PII that the NER model does not cover ---

# Contact details.
# Email: local part and domain made of word characters, dots and hyphens,
# ending in a dot followed by at least two word characters.
EMAIL_REGEX = r'\b[\w\.-]+@[\w\.-]+\.\w{2,}\b'
# Phone: optional leading '+', then 9 to 16 characters that start and end with
# a digit and contain only digits, whitespace and hyphens in between.
PHONE_REGEX = r'\+?\d[\d\s\-]{7,14}\d'

# Identity data.
# Aadhaar: twelve digits in three groups of four, optionally space separated.
AADHAAR_REGEX = r'\b\d{4}\s?\d{4}\s?\d{4}\b'
# Date of birth: day/month/year style triple separated by '/', '-', '.' or
# whitespace, with a 2 to 4 digit last group.
DOB_REGEX = r'\b\d{1,2}[\/\-\.\s]\d{1,2}[\/\-\.\s]\d{2,4}\b'

# Payment card data.
# Card number: 13 to 19 digits, each optionally followed by spaces or hyphens.
CARD_REGEX = r'\b(?:\d[ -]*?){13,19}\b'
# CVV: a 3 or 4 digit number, optionally introduced by a case-insensitive
# "cvv" label. The label is optional, so any standalone 3-4 digit number
# matches as well.
CVV_REGEX = r'(?i)\b(?:cvv[:\s\-]*)?(\d{3,4})\b'
# Expiry: month 01-12, then '/' or '-', then a 2 to 4 digit year.
EXPIRY_REGEX = r'\b(0[1-9]|1[0-2])[\/\-]\d{2,4}\b'

# Maps an NER entity group label to the placeholder family used for masking.
NER_TO_TOKEN = dict(
    PER='full_name',
    EMAIL='email',
    DATE='dob',
)

def _next_token(token_name, mapping, counter, original):
    """Allocate the next placeholder for *token_name* and record it in *mapping*."""
    # .get() keeps caller-supplied counters that lack a key from raising.
    index = counter.get(token_name, 0)
    token = f"[{token_name}_{index:03d}]"
    counter[token_name] = index + 1
    mapping[token] = original
    return token


def _mask_ner_entities(text, mapping, counter):
    """Replace NER results whose label is in NER_TO_TOKEN with placeholders."""
    located = []    # (start, end, token_name): character offsets into `text`
    unlocated = []  # (token_name, surface form) for results without offsets
    for ent in ner(text):
        token_name = NER_TO_TOKEN.get(ent['entity_group'])
        if token_name is None:
            continue
        start, end = ent.get('start'), ent.get('end')
        if start is None or end is None:
            unlocated.append((token_name, ent['word'].replace('##', '')))
        else:
            located.append((start, end, token_name))

    # Rebuild the text left to right from the reported offsets, so the span
    # the model actually flagged is masked rather than the first place the
    # same characters happen to appear.
    pieces = []
    cursor = 0
    for start, end, token_name in sorted(located):
        span = text[start:end]
        # Drop any whitespace that ended up inside the reported span.
        start += len(span) - len(span.lstrip())
        original = span.strip()
        if not original or start < cursor:
            # Empty span, or one overlapping text that is already masked.
            continue
        pieces.append(text[cursor:start])
        pieces.append(_next_token(token_name, mapping, counter, original))
        cursor = start + len(original)
    pieces.append(text[cursor:])
    text = ''.join(pieces)

    # Fallback when no offsets are available: replace the first occurrence of
    # the reported surface form, skipping empty forms.
    for token_name, original in unlocated:
        if original and original in text:
            token = _next_token(token_name, mapping, counter, original)
            text = text.replace(original, token, 1)

    return text


def mask_pii(text, mapping=None, counter=None):
    """Mask personal information in *text* with numbered placeholders.

    NER-detected entities are masked first, then the regex patterns are
    applied one after another. Placeholders look like ``[full_name_000]``.

    Args:
        text: The raw text to mask.
        mapping: Optional dict that receives ``placeholder -> original``
            entries. It is updated in place; a new dict is created when
            omitted.
        counter: Optional dict holding the next index per placeholder
            family. It is updated in place; missing keys start at 0.

    Returns:
        A ``(masked_text, mapping)`` tuple.
    """
    if mapping is None:
        mapping = {}
    if counter is None:
        counter = {
            'full_name': 0,
            'email': 0,
            'phone_number': 0,
            'dob': 0,
            'aadhar_num': 0,
            'credit_debit_no': 0,
            'cvv_no': 0,
            'expiry_no': 0,
        }

    # Mask NER entities first.
    text = _mask_ner_entities(text, mapping, counter)

    # Each pass rewrites the text the next pass sees, so the order matters:
    #   * emails go first, otherwise digit runs inside an address are taken
    #     by the phone/CVV patterns and the address no longer matches;
    #   * long digit sequences (card, Aadhaar, phone) come before short ones;
    #   * full dates come before expiry and CVV, otherwise "12/05/1990" is
    #     split into an expiry ("12/05") and a CVV ("1990");
    #   * CVV is last because it matches any standalone 3-4 digit number.
    regex_map = [
        (EMAIL_REGEX, 'email'),
        (CARD_REGEX, 'credit_debit_no'),
        (AADHAAR_REGEX, 'aadhar_num'),
        (PHONE_REGEX, 'phone_number'),
        (DOB_REGEX, 'dob'),
        (EXPIRY_REGEX, 'expiry_no'),
        (CVV_REGEX, 'cvv_no'),
    ]

    for regex, token_name in regex_map:
        # Bind token_name as a default so the callback does not rely on the
        # loop variable's late binding.
        def replacer(match, token_name=token_name):
            return _next_token(token_name, mapping, counter, match.group(0))
        text = re.sub(regex, replacer, text)

    return text, mapping

# Input schema
# Request body accepted by POST /classify: a JSON object with one required
# string field. Documented with `#` comments only, so the schema that the
# model class exposes stays unchanged.
class EmailInput(BaseModel):
    input_email_body: str  # raw, unmasked email text sent by the client

# Classification Endpoint
@app.post("/classify")
def classify_email(data: EmailInput):
    """Mask personal information in an email and predict its category."""
    raw_text = data.input_email_body

    # Replace personal data with placeholders before the classifier sees it.
    masked_text, pii_map = mask_pii(raw_text)

    # Flatten the placeholder -> original mapping into a list of records,
    # which is easier for a frontend to iterate over.
    entity_list = [
        {"placeholder": placeholder, "original": original}
        for placeholder, original in pii_map.items()
    ]

    # Prediction
    predicted_category = model.predict([masked_text])[0]
    # The estimator may hand back a NumPy scalar (presumably it is a
    # scikit-learn style model -- confirm). Unwrap zero-dimensional values to
    # a native Python object so the response is always JSON-serialisable.
    if getattr(predicted_category, "ndim", None) == 0:
        predicted_category = predicted_category.item()

    # Response format
    return {
        "input_email_body": raw_text,
        "list_of_masked_entities": entity_list,
        "masked_email": masked_text,
        "category_of_the_email": predicted_category,
    }

# Liveness probe: GET / answers with a fixed status message.
@app.get("/")
def root():
    status = {"message": "Email Classification API is running."}
    return status