sanabanu31 committed on
Commit
d0995a7
·
verified ·
1 Parent(s): 1d1f6e1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -32
app.py CHANGED
@@ -1,7 +1,10 @@
1
  from fastapi import FastAPI
2
  from pydantic import BaseModel
3
  import joblib
 
4
  import re
 
 
5
  from transformers import pipeline
6
 
7
  # Initialize FastAPI app
@@ -13,20 +16,29 @@ app = FastAPI(
13
  redoc_url="/redoc"
14
  )
15
 
16
- # Load pre-trained model
17
  model = joblib.load("model.joblib")
 
18
 
19
  # Initialize NER pipeline
20
  ner = pipeline('ner', model='Davlan/xlm-roberta-base-ner-hrl', grouped_entities=True)
21
 
22
- # Map NER entity labels to token names
 
 
 
 
 
 
 
 
23
  NER_TO_TOKEN = {
24
  'PER': 'full_name',
25
  'EMAIL': 'email',
26
  'DATE': 'dob'
27
  }
28
 
29
- # Regex patterns for PII detection
30
  EMAIL_REGEX = r'\b[\w\.-]+@[\w\.-]+\.\w{2,}\b'
31
  AADHAAR_REGEX = r'\b\d{4}\s?\d{4}\s?\d{4}\b'
32
  CARD_REGEX = r'\b(?:\d[ -]*?){13,19}\b'
@@ -35,27 +47,17 @@ EXPIRY_REGEX = r'\b(0[1-9]|1[0-2])[\/\-]\d{2,4}\b'
35
  PHONE_REGEX = r'\+?\d[\d\s\-]{7,14}\d'
36
  DOB_REGEX = r'\b\d{1,2}[\/\-\.\s]\d{1,2}[\/\-\.\s]\d{2,4}\b'
37
 
38
- # Input schema
39
- class EmailInput(BaseModel):
40
- input_email_body: str
41
-
42
- # Updated PII Masking Function with NER and regex
43
  def mask_and_store_all_pii(text):
44
  text = str(text)
45
  mapping = {}
46
  counter = {
47
- 'full_name': 0,
48
- 'email': 0,
49
- 'phone_number': 0,
50
- 'dob': 0,
51
- 'aadhar_num': 0,
52
- 'credit_debit_no': 0,
53
- 'cvv_no': 0,
54
- 'expiry_no': 0
55
  }
56
  entity_list = []
57
 
58
- # NER masking
59
  entities = ner(text)
60
  for ent in entities:
61
  label = ent['entity_group']
@@ -65,7 +67,6 @@ def mask_and_store_all_pii(text):
65
  token = f"[{token_name}_{counter[token_name]:03d}]"
66
  if original in text:
67
  start = text.index(original)
68
- end = start + len(original)
69
  text = text.replace(original, token, 1)
70
  mapping[token] = original
71
  counter[token_name] += 1
@@ -75,7 +76,7 @@ def mask_and_store_all_pii(text):
75
  "entity": original
76
  })
77
 
78
- # Regex masking
79
  regex_map = [
80
  (CARD_REGEX, 'credit_debit_no'),
81
  (AADHAAR_REGEX, 'aadhar_num'),
@@ -85,14 +86,12 @@ def mask_and_store_all_pii(text):
85
  (EMAIL_REGEX, 'email'),
86
  (DOB_REGEX, 'dob')
87
  ]
88
-
89
  for regex, token_name in regex_map:
90
  for match in re.finditer(regex, text):
91
  original = match.group(0)
92
  token = f"[{token_name}_{counter[token_name]:03d}]"
93
- start = match.start()
94
- end = match.end()
95
  if original in text:
 
96
  text = text.replace(original, token, 1)
97
  mapping[token] = original
98
  counter[token_name] += 1
@@ -104,24 +103,19 @@ def mask_and_store_all_pii(text):
104
 
105
  return text, mapping, entity_list
106
 
107
- # Restore PII
108
-
109
  def restore_pii(masked_text, pii_map):
110
  for placeholder, original in pii_map.items():
111
  masked_text = masked_text.replace(placeholder, original)
112
  return masked_text
113
 
114
- # Classification Endpoint
115
  @app.post("/classify")
116
  def classify_email(data: EmailInput):
117
  raw_text = data.input_email_body
118
-
119
- # Masking
120
  masked_text, pii_map, entity_list = mask_and_store_all_pii(raw_text)
121
-
122
- # Prediction
123
- predicted_category = model.predict([masked_text])[0]
124
-
125
  return {
126
  "input_email_body": raw_text,
127
  "list_of_masked_entities": entity_list,
@@ -129,7 +123,33 @@ def classify_email(data: EmailInput):
129
  "category_of_the_email": predicted_category
130
  }
131
 
132
- # Health check endpoint
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
  @app.get("/")
134
  def root():
135
  return {"message": "Email Classification API is running."}
 
1
  from fastapi import FastAPI
2
  from pydantic import BaseModel
3
  import joblib
4
+ import pandas as pd
5
  import re
6
+ from sklearn.feature_extraction.text import TfidfVectorizer
7
+ from sklearn.svm import LinearSVC
8
  from transformers import pipeline
9
 
10
  # Initialize FastAPI app
 
16
  redoc_url="/redoc"
17
  )
18
 
19
+ # Load model and vectorizer
20
  model = joblib.load("model.joblib")
21
+ vectorizer = joblib.load("vectorizer.joblib")
22
 
23
  # Initialize NER pipeline
24
  ner = pipeline('ner', model='Davlan/xlm-roberta-base-ner-hrl', grouped_entities=True)
25
 
26
# Input schemas
class EmailInput(BaseModel):
    """Request body for POST /classify: the raw email text to mask and categorize."""
    input_email_body: str

class TrainingExample(BaseModel):
    """Request body for POST /train: one labeled email appended to the training set."""
    email_body: str
    label: str
33
+
34
+ # Map NER labels to types
35
  NER_TO_TOKEN = {
36
  'PER': 'full_name',
37
  'EMAIL': 'email',
38
  'DATE': 'dob'
39
  }
40
 
41
+ # Regex patterns for PII
42
  EMAIL_REGEX = r'\b[\w\.-]+@[\w\.-]+\.\w{2,}\b'
43
  AADHAAR_REGEX = r'\b\d{4}\s?\d{4}\s?\d{4}\b'
44
  CARD_REGEX = r'\b(?:\d[ -]*?){13,19}\b'
 
47
  PHONE_REGEX = r'\+?\d[\d\s\-]{7,14}\d'
48
  DOB_REGEX = r'\b\d{1,2}[\/\-\.\s]\d{1,2}[\/\-\.\s]\d{2,4}\b'
49
 
50
+ # Masking function
 
 
 
 
51
  def mask_and_store_all_pii(text):
52
  text = str(text)
53
  mapping = {}
54
  counter = {
55
+ 'full_name': 0, 'email': 0, 'phone_number': 0, 'dob': 0,
56
+ 'aadhar_num': 0, 'credit_debit_no': 0, 'cvv_no': 0, 'expiry_no': 0
 
 
 
 
 
 
57
  }
58
  entity_list = []
59
 
60
+ # NER-based masking
61
  entities = ner(text)
62
  for ent in entities:
63
  label = ent['entity_group']
 
67
  token = f"[{token_name}_{counter[token_name]:03d}]"
68
  if original in text:
69
  start = text.index(original)
 
70
  text = text.replace(original, token, 1)
71
  mapping[token] = original
72
  counter[token_name] += 1
 
76
  "entity": original
77
  })
78
 
79
+ # Regex-based masking
80
  regex_map = [
81
  (CARD_REGEX, 'credit_debit_no'),
82
  (AADHAAR_REGEX, 'aadhar_num'),
 
86
  (EMAIL_REGEX, 'email'),
87
  (DOB_REGEX, 'dob')
88
  ]
 
89
  for regex, token_name in regex_map:
90
  for match in re.finditer(regex, text):
91
  original = match.group(0)
92
  token = f"[{token_name}_{counter[token_name]:03d}]"
 
 
93
  if original in text:
94
+ start = text.index(original)
95
  text = text.replace(original, token, 1)
96
  mapping[token] = original
97
  counter[token_name] += 1
 
103
 
104
  return text, mapping, entity_list
105
 
106
# Restore PII (optional use)
def restore_pii(masked_text, pii_map):
    """Substitute each placeholder token in *masked_text* with its original PII value.

    `pii_map` maps placeholder tokens (e.g. "[email_000]") to the text they replaced.
    Returns the de-masked string; the input is not mutated.
    """
    restored = masked_text
    for token, value in pii_map.items():
        restored = restored.replace(token, value)
    return restored
111
 
112
+ # Prediction endpoint
113
  @app.post("/classify")
114
  def classify_email(data: EmailInput):
115
  raw_text = data.input_email_body
 
 
116
  masked_text, pii_map, entity_list = mask_and_store_all_pii(raw_text)
117
+ features = vectorizer.transform([masked_text])
118
+ predicted_category = model.predict(features)[0]
 
 
119
  return {
120
  "input_email_body": raw_text,
121
  "list_of_masked_entities": entity_list,
 
123
  "category_of_the_email": predicted_category
124
  }
125
 
126
# Retraining endpoint
@app.post("/train")
def train_model(new_example: TrainingExample):
    """Append one labeled example to training_data.csv, retrain the TF-IDF +
    LinearSVC pipeline on the full dataset, persist the artifacts, and hot-swap
    the in-memory model so /classify serves the new version immediately.

    Returns a success message, or an error dict if the dataset append fails.
    """
    import os  # local stdlib import: replaces private pandas internal pd.io.common.file_exists

    global model, vectorizer

    df = pd.DataFrame([{"email_body": new_example.email_body, "label": new_example.label}])
    try:
        # Write the CSV header only on first creation of the file.
        df.to_csv(
            "training_data.csv",
            mode='a',
            header=not os.path.exists("training_data.csv"),
            index=False,
        )
    except Exception as e:
        return {"error": f"Failed to append to dataset: {str(e)}"}

    # Re-read the full dataset and mask PII so training input matches what
    # /classify feeds the model at inference time.
    full_df = pd.read_csv("training_data.csv")
    full_df['masked_text'] = full_df['email_body'].apply(lambda x: mask_and_store_all_pii(x)[0])

    # Vectorize and fit a fresh model on every accumulated example.
    new_vectorizer = TfidfVectorizer()
    X = new_vectorizer.fit_transform(full_df['masked_text'])
    y = full_df['label']
    new_model = LinearSVC()
    new_model.fit(X, y)

    # Persist the artifacts, then swap them into the serving globals.
    # (Previously only saved to disk, leaving /classify on the stale
    # in-memory model until the process restarted.)
    joblib.dump(new_model, "model.joblib")
    joblib.dump(new_vectorizer, "vectorizer.joblib")
    model = new_model
    vectorizer = new_vectorizer

    return {"message": "Model retrained successfully with new example."}
151
+
152
# Health check
@app.get("/")
def root():
    """Liveness probe: confirms the API process is up and responding."""
    status_message = "Email Classification API is running."
    return {"message": status_message}