Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -3,8 +3,6 @@ from pydantic import BaseModel
|
|
3 |
import joblib
|
4 |
import pandas as pd
|
5 |
import re
|
6 |
-
from sklearn.feature_extraction.text import TfidfVectorizer
|
7 |
-
from sklearn.svm import LinearSVC
|
8 |
from transformers import pipeline
|
9 |
|
10 |
# Initialize FastAPI app
|
@@ -16,9 +14,8 @@ app = FastAPI(
|
|
16 |
redoc_url="/redoc"
|
17 |
)
|
18 |
|
19 |
-
# Load model
|
20 |
model = joblib.load("model.joblib")
|
21 |
-
vectorizer = joblib.load("vectorizer.joblib")
|
22 |
|
23 |
# Initialize NER pipeline
|
24 |
ner = pipeline('ner', model='Davlan/xlm-roberta-base-ner-hrl', grouped_entities=True)
|
@@ -114,8 +111,7 @@ def restore_pii(masked_text, pii_map):
|
|
114 |
def classify_email(data: EmailInput):
|
115 |
raw_text = data.input_email_body
|
116 |
masked_text, pii_map, entity_list = mask_and_store_all_pii(raw_text)
|
117 |
-
|
118 |
-
predicted_category = model.predict(features)[0]
|
119 |
return {
|
120 |
"input_email_body": raw_text,
|
121 |
"list_of_masked_entities": entity_list,
|
@@ -123,32 +119,6 @@ def classify_email(data: EmailInput):
|
|
123 |
"category_of_the_email": predicted_category
|
124 |
}
|
125 |
|
126 |
-
# Retraining endpoint
|
127 |
-
@app.post("/train")
|
128 |
-
def train_model(new_example: TrainingExample):
|
129 |
-
df = pd.DataFrame([{"email_body": new_example.email_body, "label": new_example.label}])
|
130 |
-
try:
|
131 |
-
df.to_csv("training_data.csv", mode='a', header=not pd.io.common.file_exists("training_data.csv"), index=False)
|
132 |
-
except Exception as e:
|
133 |
-
return {"error": f"Failed to append to dataset: {str(e)}"}
|
134 |
-
|
135 |
-
# Load dataset
|
136 |
-
full_df = pd.read_csv("training_data.csv")
|
137 |
-
full_df['masked_text'] = full_df['email_body'].apply(lambda x: mask_and_store_all_pii(x)[0])
|
138 |
-
|
139 |
-
# Vectorize and train
|
140 |
-
new_vectorizer = TfidfVectorizer()
|
141 |
-
X = new_vectorizer.fit_transform(full_df['masked_text'])
|
142 |
-
y = full_df['label']
|
143 |
-
new_model = LinearSVC()
|
144 |
-
new_model.fit(X, y)
|
145 |
-
|
146 |
-
# Save updated model and vectorizer
|
147 |
-
joblib.dump(new_model, "model.joblib")
|
148 |
-
joblib.dump(new_vectorizer, "vectorizer.joblib")
|
149 |
-
|
150 |
-
return {"message": "Model retrained successfully with new example."}
|
151 |
-
|
152 |
# Health check
|
153 |
@app.get("/")
|
154 |
def root():
|
|
|
3 |
import joblib
|
4 |
import pandas as pd
|
5 |
import re
|
|
|
|
|
6 |
from transformers import pipeline
|
7 |
|
8 |
# Initialize FastAPI app
|
|
|
14 |
redoc_url="/redoc"
|
15 |
)
|
16 |
|
17 |
+
# Load the combined model pipeline (includes vectorizer)
|
18 |
model = joblib.load("model.joblib")
|
|
|
19 |
|
20 |
# Initialize NER pipeline
|
21 |
ner = pipeline('ner', model='Davlan/xlm-roberta-base-ner-hrl', grouped_entities=True)
|
|
|
111 |
def classify_email(data: EmailInput):
|
112 |
raw_text = data.input_email_body
|
113 |
masked_text, pii_map, entity_list = mask_and_store_all_pii(raw_text)
|
114 |
+
predicted_category = model.predict([masked_text])[0]
|
|
|
115 |
return {
|
116 |
"input_email_body": raw_text,
|
117 |
"list_of_masked_entities": entity_list,
|
|
|
119 |
"category_of_the_email": predicted_category
|
120 |
}
|
121 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
122 |
# Health check
|
123 |
@app.get("/")
|
124 |
def root():
|