Speccco committed on
Commit
0539585
·
1 Parent(s): c63c773

initial commit

Browse files
Files changed (3) hide show
  1. Dockerfile +34 -0
  2. app.py +235 -0
  3. requirements.txt +12 -0
Dockerfile ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.10-slim

# Unbuffered output so logs show up immediately; do not write .pyc files.
ENV PYTHONUNBUFFERED=1
ENV PYTHONDONTWRITEBYTECODE=1

# Install system dependencies. --no-install-recommends keeps optional
# packages out of the image; the apt lists are removed in the same layer
# so they never persist in the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Set the working directory
WORKDIR /app

# Copy requirements first so the dependency layer is cached independently
# of application source changes.
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt

# Create the non-root user before copying the sources so the files can be
# handed to that user at copy time instead of being left root-owned.
RUN useradd -m -u 1000 user

# Copy the application code, owned by the runtime user
COPY --chown=user:user . .

USER user

# Expose the port
EXPOSE 7860

# Command to run the application
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
3
+ import re
4
+ import pandas as pd
5
+ import warnings
6
+ import os
7
+ from fastapi import FastAPI
8
+ from pydantic import BaseModel
9
+ import uvicorn
10
+
11
+ warnings.filterwarnings('ignore')
12
+
13
class ArabicProfanityTester:
    """Classify Arabic text as offensive or not.

    A sequence-classification model loaded through ``transformers`` is
    combined with a fixed list of explicit words: any word-list hit forces
    the final verdict to "Bad", whatever the model predicted.
    """

    # Explicit Arabic/Egyptian words and phrases that force a "Bad" verdict.
    BAD_WORDS = (
        'شرموطة', 'خرا', 'زفت', 'أمك', 'يلعن دينك', 'متناك',
        'منيك', 'نايك', 'طيز', 'عرص', 'قواد', 'وسخة', 'كسك',
        'يا دين أمي', 'ابن وسخة',
    )

    # Base checkpoint loaded when the requested model cannot be loaded.
    FALLBACK_MODEL = "aubmindlab/bert-base-arabertv02"

    # Compiled once instead of on every call.
    _URL_RE = re.compile(r'http\S+|www\S+|https\S+', flags=re.MULTILINE)
    _MENTION_HASHTAG_RE = re.compile(r'@\w+|#\w+')
    _WHITESPACE_RE = re.compile(r'\s+')

    def __init__(self, model_name='Speccco/arabic_profanity_filter'):
        """Load the tokenizer and model, falling back to the base checkpoint.

        Args:
            model_name: Hugging Face Hub identifier of the model to load.

        If loading ``model_name`` raises, ``FALLBACK_MODEL`` is loaded with a
        two-label classification head instead and ``using_fallback`` is set
        to True. That head is not fine-tuned, so its predictions should not
        be trusted. A failure while loading the fallback propagates.
        """
        print(f"🔄 Loading model from Hugging Face Hub: {model_name}...")
        self.using_fallback = False

        try:
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
            self.model.eval()

            print("✅ Model loaded successfully from Hugging Face Hub!")
            print("📊 Model configuration:")
            print(f" - Model type: {type(self.model).__name__}")
            print(f" - Number of labels: {self.model.config.num_labels}")
            print(f" - Max position embeddings: {self.model.config.max_position_embeddings}")

        except Exception as e:
            # Deliberate best-effort: keep the service usable, but record
            # that the fine-tuned weights are not the ones in use.
            print(f"❌ Failed to load model from Hub: {e}")
            print("🔄 Falling back to base AraBERT model...")

            self.tokenizer = AutoTokenizer.from_pretrained(self.FALLBACK_MODEL)
            self.model = AutoModelForSequenceClassification.from_pretrained(
                self.FALLBACK_MODEL,
                num_labels=2,
            )
            self.model.eval()
            self.using_fallback = True
            print("⚠️ Using base AraBERT model (not fine-tuned)")

    @staticmethod
    def _is_missing(value):
        """Return True for None and scalar NaN/NA values, False otherwise."""
        if isinstance(value, str):
            return False
        if value is None:
            return True
        # pd.isna returns an array for list-likes; only trust it on scalars.
        return pd.api.types.is_scalar(value) and bool(pd.isna(value))

    def preprocess_text(self, text):
        """Strip URLs, @mentions and #hashtags, then collapse whitespace.

        Args:
            text: Any value; it is converted with ``str``. None/NaN yields "".

        Returns:
            The cleaned string.
        """
        if self._is_missing(text):
            return ""

        text = str(text)
        text = self._URL_RE.sub('', text)
        text = self._MENTION_HASHTAG_RE.sub('', text)
        return self._WHITESPACE_RE.sub(' ', text).strip()

    def check_bad_words(self, text):
        """Look for entries of ``BAD_WORDS`` in ``text`` by substring match.

        Args:
            text: Text to scan. None/NaN is treated as empty text.

        Returns:
            A ``(has_bad_words, found_words)`` tuple, where ``found_words``
            lists the matching entries in ``BAD_WORDS`` order.
        """
        if self._is_missing(text):
            return False, []

        text_lower = str(text).lower()
        found_words = [word for word in self.BAD_WORDS if word.lower() in text_lower]
        return bool(found_words), found_words

    def predict(self, text, show_details=True):
        """Classify ``text``, letting a word-list hit override the model.

        Args:
            text: The text to classify.
            show_details: Unused; kept so existing callers keep working.

        Returns:
            A dict with the original and preprocessed text, the model's own
            prediction and confidence, the final "Good"/"Bad" verdict and
            class index, the word-list findings, the override reason (or
            None) and the probabilities of class 0 and class 1.
        """
        processed_text = self.preprocess_text(text)

        # The word list is checked against the raw text, so words inside
        # hashtags or mentions (removed by preprocessing) are still caught.
        has_bad_words, found_bad_words = self.check_bad_words(text)

        inputs = self.tokenizer(
            processed_text,
            return_tensors='pt',
            truncation=True,
            max_length=256,
            padding=True,
        )

        with torch.no_grad():
            logits = self.model(**inputs).logits
            probabilities = torch.softmax(logits, dim=-1)
            model_predicted_class = torch.argmax(probabilities, dim=-1).item()
            model_confidence = probabilities[0][model_predicted_class].item()

        # Final decision: bad words override the model prediction.
        if has_bad_words:
            final_prediction = "Bad"
            final_class = 1  # Offensive
            override_reason = f"Contains explicit bad words: {', '.join(found_bad_words)}"
        else:
            final_prediction = "Good" if model_predicted_class == 0 else "Bad"
            final_class = model_predicted_class
            override_reason = None

        return {
            'original_text': text,
            'processed_text': processed_text,
            'model_prediction': 'Offensive' if model_predicted_class == 1 else 'Non-Offensive',
            'model_confidence': model_confidence,
            'final_prediction': final_prediction,
            'final_class': final_class,
            'has_bad_words': has_bad_words,
            'found_bad_words': found_bad_words,
            'override_reason': override_reason,
            'probabilities': {
                'non_offensive': probabilities[0][0].item(),
                'offensive': probabilities[0][1].item(),
            },
        }
127
+
128
class ProfanityRequest(BaseModel):
    # Request body of the single-text endpoint.
    # (Plain comments on purpose: a docstring would be published as the
    # schema description.)

    # The text to analyze.
    text: str
130
+
131
class BatchProfanityRequest(BaseModel):
    # Request body of the batch endpoint.
    # (Plain comments on purpose: a docstring would be published as the
    # schema description.)

    # The texts to analyze, one prediction is produced per entry.
    texts: list[str]
133
+
134
# The ASGI application object; interactive docs are served at /docs and /redoc.
app = FastAPI(
    title="Arabic Profanity Filter API",
    description="An API to detect profanity in Arabic text using a fine-tuned AraBERT model with rule-based override.",
    version="1.0.0",
    docs_url="/docs",
    redoc_url="/redoc",
)

# Shared classifier instance. It is None until the startup hook assigns it,
# and the endpoints check for None before using it.
tester = None
144
+
145
@app.on_event("startup")
async def startup_event():
    """Build the shared classifier when the application starts.

    A loading failure is printed and re-raised rather than swallowed.
    """
    global tester
    try:
        tester = ArabicProfanityTester()
        print("🚀 Arabic Profanity Filter API is ready!")
    except Exception as exc:
        print(f"❌ Failed to load model: {exc}")
        raise exc
155
+
156
@app.get("/", tags=["General"])
def read_root():
    # Landing payload: a welcome message plus a short map of the routes.
    endpoints = {
        "predict": "/predict - Single text prediction",
        "batch": "/batch - Batch text prediction",
        "health": "/health - Health check",
        "docs": "/docs - API documentation",
    }
    return {
        "message": "Welcome to the Arabic Profanity Filter API",
        "description": "Detects profanity in Arabic text using AraBERT model with rule-based override",
        "endpoints": endpoints,
    }
168
+
169
@app.get("/health", tags=["General"])
def health_check():
    """Health check endpoint"""
    # `tester` is only set once the startup hook has built the classifier.
    if tester is not None:
        return {"status": "healthy", "message": "API is running"}
    return {"status": "unhealthy", "message": "Model not loaded"}
175
+
176
@app.post("/predict", tags=["Prediction"])
async def predict_profanity(request: ProfanityRequest):
    """
    Predicts if the given Arabic text contains profanity.

    - **text**: The Arabic text to analyze.

    Returns:
    - original_text: The input text
    - processed_text: Text after preprocessing
    - model_prediction: Model's prediction (Offensive/Non-Offensive)
    - model_confidence: Model's confidence score
    - final_prediction: Final result (Good/Bad) after rule-based override
    - has_bad_words: Whether explicit bad words were found
    - found_bad_words: List of bad words found
    - probabilities: Detailed probability scores
    """
    # Errors are reported as an {"error": ...} body, not as an HTTP error status.
    if tester is None:
        return {"error": "Model not loaded"}

    # NOTE(review): tester.predict is a synchronous call made directly inside
    # this coroutine, so other requests presumably wait while it runs.
    try:
        return tester.predict(request.text, show_details=False)
    except Exception as exc:
        return {"error": f"Prediction failed: {exc}"}
201
+
202
@app.post("/batch", tags=["Prediction"])
async def predict_batch_profanity(request: BatchProfanityRequest):
    """
    Predicts profanity for multiple Arabic texts.

    - **texts**: List of Arabic texts to analyze.

    Returns list of prediction results for each text.
    """
    # Errors are reported as an {"error": ...} body, not as an HTTP error status.
    if tester is None:
        return {"error": "Model not loaded"}

    try:
        # One prediction per input text, in input order; a failure on any
        # single text fails the whole batch.
        predictions = [
            tester.predict(item, show_details=False) for item in request.texts
        ]

        bad_total = sum(1 for p in predictions if p['final_prediction'] == 'Bad')
        good_total = sum(1 for p in predictions if p['final_prediction'] == 'Good')
        explicit_total = sum(1 for p in predictions if p['has_bad_words'])

        return {
            "predictions": predictions,
            "summary": {
                "total": len(predictions),
                "bad_count": bad_total,
                "good_count": good_total,
                "explicit_bad_words_count": explicit_total,
            },
        }
    except Exception as exc:
        return {"error": f"Batch prediction failed: {exc}"}
231
+
232
if __name__ == "__main__":
    # Direct-execution entry point: serve on all interfaces, taking the port
    # from the PORT environment variable and defaulting to 7860.
    # `os` is already imported at the top of the module, so it is not
    # re-imported here.
    port = int(os.environ.get("PORT", "7860"))
    uvicorn.run(app, host="0.0.0.0", port=port)
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ torch>=2.0.0
2
+ transformers>=4.21.0
3
+ fastapi>=0.104.0
4
+ uvicorn[standard]>=0.24.0
5
+ pydantic>=2.0.0
6
+ pandas>=1.5.0
7
+ numpy>=1.24.0
8
+ scikit-learn>=1.3.0
9
+ python-multipart
10
+ accelerate
11
+ sentencepiece
12
+ protobuf