sanabanu31 committed on
Commit
fbb3cc8
·
verified ·
1 Parent(s): 0082dc5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -8
app.py CHANGED
@@ -19,7 +19,7 @@ model = joblib.load("model.joblib")
19
  class EmailInput(BaseModel):
20
  input_email_body: str
21
 
22
- # Updated PII Masking Function (fixes Aadhaar vs Card and name misclassifications)
23
  def mask_and_store_all_pii(text):
24
  text = str(text)
25
  pii_map = {}
@@ -27,29 +27,35 @@ def mask_and_store_all_pii(text):
27
 
28
  patterns = {
29
  "email": r"\b[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+\b",
30
- "phone_number": r"\+?\d[\d\s\-]{7,14}\d",
31
  "dob": r"\b\d{1,2}[\/\-\.\s]\d{1,2}[\/\-\.\s]\d{2,4}\b",
32
- "aadhar_num": r"\b\d{4}[ -]?\d{4}[ -]?\d{4}\b(?!\d)",
33
  "credit_debit_no": r"\b(?:\d[ -]*?){13,19}\b",
34
- "cvv_no": r"(?i)\b(?:cvv[:\s\-]*)?(\d{3,4})\b",
35
  "expiry_no": r"\b(0[1-9]|1[0-2])[\/\-]\d{2,4}\b",
36
- # Avoid naive full name pattern to prevent false positives like 'Dear Sir'
37
  }
38
 
 
 
 
 
 
 
39
  for label, pattern in patterns.items():
40
  for match in re.finditer(pattern, text):
41
  original = match.group()
42
  start, end = match.start(), match.end()
43
- placeholder = f"[{label}_{len(pii_map):03d}]"
44
- if original not in text:
45
  continue
 
46
  pii_map[placeholder] = original
47
  entity_list.append({
48
  "position": [start, end],
49
  "classification": label,
50
  "entity": original
51
  })
52
- text = text.replace(original, placeholder, 1)
 
53
 
54
  return text, pii_map, entity_list
55
 
 
19
  class EmailInput(BaseModel):
20
  input_email_body: str
21
 
22
+ # PII Masking Function
23
  def mask_and_store_all_pii(text):
24
  text = str(text)
25
  pii_map = {}
 
27
 
28
  patterns = {
29
  "email": r"\b[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+\b",
30
+ "phone_number": r"(?<!\d)(\+?\d[\d\s\-]{7,14}\d)(?!\d)",
31
  "dob": r"\b\d{1,2}[\/\-\.\s]\d{1,2}[\/\-\.\s]\d{2,4}\b",
32
+ "aadhar_num": r"\b\d{4}[ -]?\d{4}[ -]?\d{4}\b(?![\d])",
33
  "credit_debit_no": r"\b(?:\d[ -]*?){13,19}\b",
34
+ "cvv_no": r"(?i)\b(?:CVV[:\s]*)?(\d{3,4})\b",
35
  "expiry_no": r"\b(0[1-9]|1[0-2])[\/\-]\d{2,4}\b",
 
36
  }
37
 
38
+ # Track masked spans to prevent overlapping matches
39
+ masked_spans = []
40
+
41
+ def is_overlapping(start, end):
42
+ return any(s <= start < e or s < end <= e for s, e in masked_spans)
43
+
44
  for label, pattern in patterns.items():
45
  for match in re.finditer(pattern, text):
46
  original = match.group()
47
  start, end = match.start(), match.end()
48
+ if is_overlapping(start, end):
 
49
  continue
50
+ placeholder = f"[{label}_{len(pii_map):03d}]"
51
  pii_map[placeholder] = original
52
  entity_list.append({
53
  "position": [start, end],
54
  "classification": label,
55
  "entity": original
56
  })
57
+ text = text[:start] + placeholder + text[end:]
58
+ masked_spans.append((start, start + len(placeholder)))
59
 
60
  return text, pii_map, entity_list
61