sanabanu31 committed on
Commit
0082dc5
·
verified ·
1 Parent(s): 46c162b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -10
app.py CHANGED
@@ -19,7 +19,7 @@ model = joblib.load("model.joblib")
19
  class EmailInput(BaseModel):
20
  input_email_body: str
21
 
22
- # PII Masking Function
23
  def mask_and_store_all_pii(text):
24
  text = str(text)
25
  pii_map = {}
@@ -27,20 +27,22 @@ def mask_and_store_all_pii(text):
27
 
28
  patterns = {
29
  "email": r"\b[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+\b",
30
- "phone_number": r"\b\d{10}\b",
31
- "dob": r"\b\d{2}[/-]\d{2}[/-]\d{4}\b",
32
- "aadhar_num": r"\b\d{4}[- ]?\d{4}[- ]?\d{4}\b",
33
- "credit_debit_no": r"\b(?:\d[ -]*?){13,16}\b",
34
- "cvv_no": r"\b\d{3}\b",
35
- "expiry_no": r"\b(0[1-9]|1[0-2])\/\d{2,4}\b",
36
- "full_name": r"\b([A-Z][a-z]+(?:\s[A-Z][a-z]+)+)\b"
37
  }
38
 
39
  for label, pattern in patterns.items():
40
  for match in re.finditer(pattern, text):
41
  original = match.group()
42
  start, end = match.start(), match.end()
43
- placeholder = f"[{label}_{len(pii_map)}]"
 
 
44
  pii_map[placeholder] = original
45
  entity_list.append({
46
  "position": [start, end],
@@ -81,4 +83,3 @@ def classify_email(data: EmailInput):
81
  @app.get("/")
82
  def root():
83
  return {"message": "Email Classification API is running."}
84
-
 
19
  class EmailInput(BaseModel):
20
  input_email_body: str
21
 
22
+ # Updated PII Masking Function (fixes Aadhaar vs Card and name misclassifications)
23
  def mask_and_store_all_pii(text):
24
  text = str(text)
25
  pii_map = {}
 
27
 
28
  patterns = {
29
  "email": r"\b[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+\b",
30
+ "phone_number": r"\+?\d[\d\s\-]{7,14}\d",
31
+ "dob": r"\b\d{1,2}[\/\-\.\s]\d{1,2}[\/\-\.\s]\d{2,4}\b",
32
+ "aadhar_num": r"\b\d{4}[ -]?\d{4}[ -]?\d{4}\b(?!\d)",
33
+ "credit_debit_no": r"\b(?:\d[ -]*?){13,19}\b",
34
+ "cvv_no": r"(?i)\b(?:cvv[:\s\-]*)?(\d{3,4})\b",
35
+ "expiry_no": r"\b(0[1-9]|1[0-2])[\/\-]\d{2,4}\b",
36
+ # Avoid naive full name pattern to prevent false positives like 'Dear Sir'
37
  }
38
 
39
  for label, pattern in patterns.items():
40
  for match in re.finditer(pattern, text):
41
  original = match.group()
42
  start, end = match.start(), match.end()
43
+ placeholder = f"[{label}_{len(pii_map):03d}]"
44
+ if original not in text:
45
+ continue
46
  pii_map[placeholder] = original
47
  entity_list.append({
48
  "position": [start, end],
 
83
  @app.get("/")
84
  def root():
85
  return {"message": "Email Classification API is running."}