Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -19,7 +19,7 @@ model = joblib.load("model.joblib")
|
|
19 |
class EmailInput(BaseModel):
|
20 |
input_email_body: str
|
21 |
|
22 |
-
#
|
23 |
def mask_and_store_all_pii(text):
|
24 |
text = str(text)
|
25 |
pii_map = {}
|
@@ -27,29 +27,35 @@ def mask_and_store_all_pii(text):
|
|
27 |
|
28 |
patterns = {
|
29 |
"email": r"\b[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+\b",
|
30 |
-
"phone_number": r"\+?\d[\d\s\-]{7,14}\d",
|
31 |
"dob": r"\b\d{1,2}[\/\-\.\s]\d{1,2}[\/\-\.\s]\d{2,4}\b",
|
32 |
-
"aadhar_num": r"\b\d{4}[ -]?\d{4}[ -]?\d{4}\b(
|
33 |
"credit_debit_no": r"\b(?:\d[ -]*?){13,19}\b",
|
34 |
-
"cvv_no": r"(?i)\b(?:
|
35 |
"expiry_no": r"\b(0[1-9]|1[0-2])[\/\-]\d{2,4}\b",
|
36 |
-
# Avoid naive full name pattern to prevent false positives like 'Dear Sir'
|
37 |
}
|
38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
for label, pattern in patterns.items():
|
40 |
for match in re.finditer(pattern, text):
|
41 |
original = match.group()
|
42 |
start, end = match.start(), match.end()
|
43 |
-
|
44 |
-
if original not in text:
|
45 |
continue
|
|
|
46 |
pii_map[placeholder] = original
|
47 |
entity_list.append({
|
48 |
"position": [start, end],
|
49 |
"classification": label,
|
50 |
"entity": original
|
51 |
})
|
52 |
-
text = text
|
|
|
53 |
|
54 |
return text, pii_map, entity_list
|
55 |
|
|
|
19 |
class EmailInput(BaseModel):
    """Input schema with a single string field holding an email body."""

    # Email body text, typed as a plain string.
    input_email_body: str
|
21 |
|
22 |
+
# PII Masking Function
|
23 |
def mask_and_store_all_pii(text):
    """Mask PII found in *text* and report what was masked.

    Every pattern is matched against the unmodified input, so all reported
    offsets refer to the original string. The masked string is assembled in
    a single pass afterwards; splicing placeholders in while still iterating
    over matches would invalidate the offsets of later matches whenever a
    placeholder and the value it replaces differ in length.

    When two patterns match overlapping text, the pattern listed first in
    ``patterns`` wins and the later match is skipped.

    Args:
        text: Value to scan. It is coerced with ``str()`` before matching.

    Returns:
        A 3-tuple ``(masked_text, pii_map, entity_list)``:

        * ``masked_text`` - the input with each accepted match replaced by a
          placeholder of the form ``[<label>_<NNN>]``.
        * ``pii_map`` - dict mapping each placeholder to the original text.
        * ``entity_list`` - list of dicts with keys ``"position"``
          (``[start, end]`` offsets into the original input),
          ``"classification"`` (pattern label) and ``"entity"`` (matched
          text), in order of discovery (pattern order, then position).
    """
    text = str(text)
    pii_map = {}
    entity_list = []

    # NOTE(review): the optional "CVV" prefix means ``cvv_no`` also matches
    # any standalone 3-4 digit number (e.g. a year), and ``phone_number`` is
    # tried before ``credit_debit_no`` so it can claim the head of a spaced
    # card number. Patterns are kept unchanged here; confirm the intended
    # priority before tightening them.
    patterns = {
        "email": r"\b[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+\b",
        "phone_number": r"(?<!\d)(\+?\d[\d\s\-]{7,14}\d)(?!\d)",
        "dob": r"\b\d{1,2}[\/\-\.\s]\d{1,2}[\/\-\.\s]\d{2,4}\b",
        "aadhar_num": r"\b\d{4}[ -]?\d{4}[ -]?\d{4}\b(?![\d])",
        "credit_debit_no": r"\b(?:\d[ -]*?){13,19}\b",
        "cvv_no": r"(?i)\b(?:CVV[:\s]*)?(\d{3,4})\b",
        "expiry_no": r"\b(0[1-9]|1[0-2])[\/\-]\d{2,4}\b",
    }

    # Accepted replacements as (start, end, placeholder), with offsets in
    # original-text coordinates. Also used to reject overlapping matches.
    replacements = []

    def is_overlapping(start, end):
        # Half-open interval intersection; also catches a candidate that
        # fully contains an already-claimed span.
        return any(start < e and s < end for s, e, _ in replacements)

    for label, pattern in patterns.items():
        for match in re.finditer(pattern, text):
            start, end = match.span()
            if is_overlapping(start, end):
                continue
            original = match.group()
            # The running count keeps placeholders unique across labels.
            placeholder = f"[{label}_{len(pii_map):03d}]"
            pii_map[placeholder] = original
            entity_list.append({
                "position": [start, end],
                "classification": label,
                "entity": original,
            })
            replacements.append((start, end, placeholder))

    # Build the masked text left to right from the untouched input.
    pieces = []
    cursor = 0
    for start, end, placeholder in sorted(replacements):
        pieces.append(text[cursor:start])
        pieces.append(placeholder)
        cursor = end
    pieces.append(text[cursor:])

    return "".join(pieces), pii_map, entity_list
|
61 |
|