sanabanu31 committed on
Commit
9bd35d7
·
verified ·
1 Parent(s): fbb3cc8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +82 -38
app.py CHANGED
@@ -2,6 +2,7 @@ from fastapi import FastAPI
2
  from pydantic import BaseModel
3
  import joblib
4
  import re
 
5
 
6
  # Initialize FastAPI app
7
  app = FastAPI(
@@ -15,56 +16,100 @@ app = FastAPI(
15
  # Load pre-trained model
16
  model = joblib.load("model.joblib")
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  # Input schema
19
  class EmailInput(BaseModel):
20
  input_email_body: str
21
 
22
- # PII Masking Function
23
  def mask_and_store_all_pii(text):
24
  text = str(text)
25
- pii_map = {}
 
 
 
 
 
 
 
 
 
 
26
  entity_list = []
27
 
28
- patterns = {
29
- "email": r"\b[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+\b",
30
- "phone_number": r"(?<!\d)(\+?\d[\d\s\-]{7,14}\d)(?!\d)",
31
- "dob": r"\b\d{1,2}[\/\-\.\s]\d{1,2}[\/\-\.\s]\d{2,4}\b",
32
- "aadhar_num": r"\b\d{4}[ -]?\d{4}[ -]?\d{4}\b(?![\d])",
33
- "credit_debit_no": r"\b(?:\d[ -]*?){13,19}\b",
34
- "cvv_no": r"(?i)\b(?:CVV[:\s]*)?(\d{3,4})\b",
35
- "expiry_no": r"\b(0[1-9]|1[0-2])[\/\-]\d{2,4}\b",
36
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
- # Track masked spans to prevent overlapping matches
39
- masked_spans = []
40
-
41
- def is_overlapping(start, end):
42
- return any(s <= start < e or s < end <= e for s, e in masked_spans)
43
-
44
- for label, pattern in patterns.items():
45
- for match in re.finditer(pattern, text):
46
- original = match.group()
47
- start, end = match.start(), match.end()
48
- if is_overlapping(start, end):
49
- continue
50
- placeholder = f"[{label}_{len(pii_map):03d}]"
51
- pii_map[placeholder] = original
52
- entity_list.append({
53
- "position": [start, end],
54
- "classification": label,
55
- "entity": original
56
- })
57
- text = text[:start] + placeholder + text[end:]
58
- masked_spans.append((start, start + len(placeholder)))
59
-
60
- return text, pii_map, entity_list
61
 
62
  # Restore PII
 
63
  def restore_pii(masked_text, pii_map):
64
- restored = masked_text
65
  for placeholder, original in pii_map.items():
66
- restored = restored.replace(placeholder, original)
67
- return restored
68
 
69
  # Classification Endpoint
70
  @app.post("/classify")
@@ -77,7 +122,6 @@ def classify_email(data: EmailInput):
77
  # Prediction
78
  predicted_category = model.predict([masked_text])[0]
79
 
80
- # Response format
81
  return {
82
  "input_email_body": raw_text,
83
  "list_of_masked_entities": entity_list,
 
2
  from pydantic import BaseModel
3
  import joblib
4
  import re
5
+ from transformers import pipeline
6
 
7
  # Initialize FastAPI app
8
  app = FastAPI(
 
16
  # Load pre-trained model
17
  model = joblib.load("model.joblib")
18
 
19
# Initialize the NER pipeline once at import time so every request reuses it.
# `aggregation_strategy="simple"` is the supported spelling of the deprecated
# `grouped_entities=True`: sub-word tokens are merged into whole entities that
# carry an `entity_group` label.
ner = pipeline(
    'ner',
    model='Davlan/xlm-roberta-base-ner-hrl',
    aggregation_strategy='simple',
)

# Map NER entity-group labels to the placeholder/token names used for masking.
# NOTE(review): confirm which of these labels the configured model actually
# emits; labels it never produces are simply never matched.
NER_TO_TOKEN = {
    'PER': 'full_name',
    'EMAIL': 'email',
    'DATE': 'dob',
}

# Regex patterns for PII detection (used by mask_and_store_all_pii).
EMAIL_REGEX = r'\b[\w\.-]+@[\w\.-]+\.\w{2,}\b'
# Three groups of four digits, optionally separated by single whitespace.
AADHAAR_REGEX = r'\b\d{4}\s?\d{4}\s?\d{4}\b'
# 13 to 19 digits, each optionally followed by spaces or hyphens.
CARD_REGEX = r'\b(?:\d[ -]*?){13,19}\b'
# The "cvv" prefix is optional, so this also matches any standalone 3-4 digit
# number; consumers should give it the lowest priority.
CVV_REGEX = r'(?i)\b(?:cvv[:\s\-]*)?(\d{3,4})\b'
# Month 01-12, then "/" or "-", then a 2-4 digit year.
EXPIRY_REGEX = r'\b(0[1-9]|1[0-2])[\/\-]\d{2,4}\b'
# Optional "+", then 9 to 16 characters made of digits, whitespace and hyphens,
# starting and ending with a digit. No word boundaries are enforced.
PHONE_REGEX = r'\+?\d[\d\s\-]{7,14}\d'
# day/month/year-like triple separated by "/", "-", "." or whitespace.
DOB_REGEX = r'\b\d{1,2}[\/\-\.\s]\d{1,2}[\/\-\.\s]\d{2,4}\b'
37
+
38
  # Input schema
39
  class EmailInput(BaseModel):
40
  input_email_body: str
41
 
42
+ # Updated PII Masking Function with NER and regex
43
  def mask_and_store_all_pii(text):
44
  text = str(text)
45
+ mapping = {}
46
+ counter = {
47
+ 'full_name': 0,
48
+ 'email': 0,
49
+ 'phone_number': 0,
50
+ 'dob': 0,
51
+ 'aadhar_num': 0,
52
+ 'credit_debit_no': 0,
53
+ 'cvv_no': 0,
54
+ 'expiry_no': 0
55
+ }
56
  entity_list = []
57
 
58
+ # NER masking
59
+ entities = ner(text)
60
+ for ent in entities:
61
+ label = ent['entity_group']
62
+ if label in NER_TO_TOKEN:
63
+ token_name = NER_TO_TOKEN[label]
64
+ original = ent['word'].replace('##', '')
65
+ token = f"[{token_name}_{counter[token_name]:03d}]"
66
+ if original in text:
67
+ start = text.index(original)
68
+ end = start + len(original)
69
+ text = text.replace(original, token, 1)
70
+ mapping[token] = original
71
+ counter[token_name] += 1
72
+ entity_list.append({
73
+ "position": [start, start + len(token)],
74
+ "classification": token_name,
75
+ "entity": original
76
+ })
77
+
78
+ # Regex masking
79
+ regex_map = [
80
+ (CARD_REGEX, 'credit_debit_no'),
81
+ (AADHAAR_REGEX, 'aadhar_num'),
82
+ (PHONE_REGEX, 'phone_number'),
83
+ (CVV_REGEX, 'cvv_no'),
84
+ (EXPIRY_REGEX, 'expiry_no'),
85
+ (EMAIL_REGEX, 'email'),
86
+ (DOB_REGEX, 'dob')
87
+ ]
88
 
89
+ for regex, token_name in regex_map:
90
+ for match in re.finditer(regex, text):
91
+ original = match.group(0)
92
+ token = f"[{token_name}_{counter[token_name]:03d}]"
93
+ start = match.start()
94
+ end = match.end()
95
+ if original in text:
96
+ text = text.replace(original, token, 1)
97
+ mapping[token] = original
98
+ counter[token_name] += 1
99
+ entity_list.append({
100
+ "position": [start, start + len(token)],
101
+ "classification": token_name,
102
+ "entity": original
103
+ })
104
+
105
+ return text, mapping, entity_list
 
 
 
 
 
 
106
 
107
  # Restore PII
108
+
109
  def restore_pii(masked_text, pii_map):
 
110
  for placeholder, original in pii_map.items():
111
+ masked_text = masked_text.replace(placeholder, original)
112
+ return masked_text
113
 
114
  # Classification Endpoint
115
  @app.post("/classify")
 
122
  # Prediction
123
  predicted_category = model.predict([masked_text])[0]
124
 
 
125
  return {
126
  "input_email_body": raw_text,
127
  "list_of_masked_entities": entity_list,