sanabanu31 committed on
Commit
e431852
·
verified ·
1 Parent(s): cc5ce19

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -56
app.py CHANGED
@@ -1,7 +1,6 @@
1
  from fastapi import FastAPI
2
  from pydantic import BaseModel
3
  import joblib
4
- import pandas as pd
5
  import re
6
  from transformers import pipeline
7
 
@@ -14,28 +13,13 @@ app = FastAPI(
14
  redoc_url="/redoc"
15
  )
16
 
17
- # Load the combined model pipeline (includes vectorizer)
18
  model = joblib.load("model.joblib")
19
 
20
- # Initialize NER pipeline
21
  ner = pipeline('ner', model='Davlan/xlm-roberta-base-ner-hrl', grouped_entities=True)
22
 
23
- # Input schemas
24
- class EmailInput(BaseModel):
25
- input_email_body: str
26
-
27
- class TrainingExample(BaseModel):
28
- email_body: str
29
- label: str
30
-
31
- # Map NER labels to types
32
- NER_TO_TOKEN = {
33
- 'PER': 'full_name',
34
- 'EMAIL': 'email',
35
- 'DATE': 'dob'
36
- }
37
-
38
- # Regex patterns for PII
39
  EMAIL_REGEX = r'\b[\w\.-]+@[\w\.-]+\.\w{2,}\b'
40
  AADHAAR_REGEX = r'\b\d{4}\s?\d{4}\s?\d{4}\b'
41
  CARD_REGEX = r'\b(?:\d[ -]*?){13,19}\b'
@@ -44,17 +28,28 @@ EXPIRY_REGEX = r'\b(0[1-9]|1[0-2])[\/\-]\d{2,4}\b'
44
  PHONE_REGEX = r'\+?\d[\d\s\-]{7,14}\d'
45
  DOB_REGEX = r'\b\d{1,2}[\/\-\.\s]\d{1,2}[\/\-\.\s]\d{2,4}\b'
46
 
47
- # Masking function
48
- def mask_and_store_all_pii(text):
49
- text = str(text)
50
- mapping = {}
51
- counter = {
52
- 'full_name': 0, 'email': 0, 'phone_number': 0, 'dob': 0,
53
- 'aadhar_num': 0, 'credit_debit_no': 0, 'cvv_no': 0, 'expiry_no': 0
54
- }
55
- entity_list = []
56
 
57
- # NER-based masking
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  entities = ner(text)
59
  for ent in entities:
60
  label = ent['entity_group']
@@ -63,17 +58,11 @@ def mask_and_store_all_pii(text):
63
  original = ent['word'].replace('##', '')
64
  token = f"[{token_name}_{counter[token_name]:03d}]"
65
  if original in text:
66
- start = text.index(original)
67
  text = text.replace(original, token, 1)
68
  mapping[token] = original
69
  counter[token_name] += 1
70
- entity_list.append({
71
- "position": [start, start + len(token)],
72
- "classification": token_name,
73
- "entity": original
74
- })
75
 
76
- # Regex-based masking
77
  regex_map = [
78
  (CARD_REGEX, 'credit_debit_no'),
79
  (AADHAAR_REGEX, 'aadhar_num'),
@@ -83,35 +72,37 @@ def mask_and_store_all_pii(text):
83
  (EMAIL_REGEX, 'email'),
84
  (DOB_REGEX, 'dob')
85
  ]
 
86
  for regex, token_name in regex_map:
87
- for match in re.finditer(regex, text):
88
  original = match.group(0)
89
  token = f"[{token_name}_{counter[token_name]:03d}]"
90
- if original in text:
91
- start = text.index(original)
92
- text = text.replace(original, token, 1)
93
- mapping[token] = original
94
- counter[token_name] += 1
95
- entity_list.append({
96
- "position": [start, start + len(token)],
97
- "classification": token_name,
98
- "entity": original
99
- })
100
 
101
- return text, mapping, entity_list
102
 
103
- # Restore PII (optional use)
104
- def restore_pii(masked_text, pii_map):
105
- for placeholder, original in pii_map.items():
106
- masked_text = masked_text.replace(placeholder, original)
107
- return masked_text
108
 
109
- # Prediction endpoint
110
  @app.post("/classify")
111
  def classify_email(data: EmailInput):
112
  raw_text = data.input_email_body
113
- masked_text, pii_map, entity_list = mask_and_store_all_pii(raw_text)
 
 
 
 
 
 
 
114
  predicted_category = model.predict([masked_text])[0]
 
 
115
  return {
116
  "input_email_body": raw_text,
117
  "list_of_masked_entities": entity_list,
@@ -119,7 +110,7 @@ def classify_email(data: EmailInput):
119
  "category_of_the_email": predicted_category
120
  }
121
 
122
- # Health check
123
  @app.get("/")
124
  def root():
125
  return {"message": "Email Classification API is running."}
 
1
  from fastapi import FastAPI
2
  from pydantic import BaseModel
3
  import joblib
 
4
  import re
5
  from transformers import pipeline
6
 
 
13
  redoc_url="/redoc"
14
  )
15
 
16
+ # Load pre-trained model
17
  model = joblib.load("model.joblib")
18
 
19
+ # Initialize NER pipeline (multilingual)
20
  ner = pipeline('ner', model='Davlan/xlm-roberta-base-ner-hrl', grouped_entities=True)
21
 
22
+ # Regex patterns for PII detection
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  EMAIL_REGEX = r'\b[\w\.-]+@[\w\.-]+\.\w{2,}\b'
24
  AADHAAR_REGEX = r'\b\d{4}\s?\d{4}\s?\d{4}\b'
25
  CARD_REGEX = r'\b(?:\d[ -]*?){13,19}\b'
 
28
  PHONE_REGEX = r'\+?\d[\d\s\-]{7,14}\d'
29
  DOB_REGEX = r'\b\d{1,2}[\/\-\.\s]\d{1,2}[\/\-\.\s]\d{2,4}\b'
30
 
31
+ NER_TO_TOKEN = {
32
+ 'PER': 'full_name',
33
+ 'EMAIL': 'email',
34
+ 'DATE': 'dob'
35
+ }
 
 
 
 
36
 
37
+ def mask_pii(text, mapping=None, counter=None):
38
+ if mapping is None:
39
+ mapping = {}
40
+ if counter is None:
41
+ counter = {
42
+ 'full_name': 0,
43
+ 'email': 0,
44
+ 'phone_number': 0,
45
+ 'dob': 0,
46
+ 'aadhar_num': 0,
47
+ 'credit_debit_no': 0,
48
+ 'cvv_no': 0,
49
+ 'expiry_no': 0
50
+ }
51
+
52
+ # Mask NER entities first
53
  entities = ner(text)
54
  for ent in entities:
55
  label = ent['entity_group']
 
58
  original = ent['word'].replace('##', '')
59
  token = f"[{token_name}_{counter[token_name]:03d}]"
60
  if original in text:
 
61
  text = text.replace(original, token, 1)
62
  mapping[token] = original
63
  counter[token_name] += 1
 
 
 
 
 
64
 
65
+ # Mask regex patterns
66
  regex_map = [
67
  (CARD_REGEX, 'credit_debit_no'),
68
  (AADHAAR_REGEX, 'aadhar_num'),
 
72
  (EMAIL_REGEX, 'email'),
73
  (DOB_REGEX, 'dob')
74
  ]
75
+
76
  for regex, token_name in regex_map:
77
+ def replacer(match):
78
  original = match.group(0)
79
  token = f"[{token_name}_{counter[token_name]:03d}]"
80
+ counter[token_name] += 1
81
+ mapping[token] = original
82
+ return token
83
+ text = re.sub(regex, replacer, text)
 
 
 
 
 
 
84
 
85
+ return text, mapping
86
 
87
+ # Input schema
88
+ class EmailInput(BaseModel):
89
+ input_email_body: str
 
 
90
 
91
+ # Classification Endpoint
92
  @app.post("/classify")
93
  def classify_email(data: EmailInput):
94
  raw_text = data.input_email_body
95
+
96
+ # Masking using your advanced function
97
+ masked_text, pii_map = mask_pii(raw_text)
98
+
99
+ # Convert pii_map to a list for easier frontend use (optional)
100
+ entity_list = [{"placeholder": k, "original": v} for k, v in pii_map.items()]
101
+
102
+ # Prediction
103
  predicted_category = model.predict([masked_text])[0]
104
+
105
+ # Response format
106
  return {
107
  "input_email_body": raw_text,
108
  "list_of_masked_entities": entity_list,
 
110
  "category_of_the_email": predicted_category
111
  }
112
 
113
+ # Health check endpoint
114
  @app.get("/")
115
  def root():
116
  return {"message": "Email Classification API is running."}