Spaces:

Speccco
/

Arabic_Profanity_Model

Running

App Files Files Community

Speccco commited on Sep 1

Commit

0539585

1 Parent(s): c63c773

intial commit

Browse files

Files changed (3) hide show

Dockerfile +34 -0
app.py +235 -0
requirements.txt +12 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,34 @@

+FROM python:3.10-slim
+# Set environment variables
+ENV PYTHONUNBUFFERED=1
+ENV PYTHONDONTWRITEBYTECODE=1
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    && rm -rf /var/lib/apt/lists/*
+# Set the working directory
+WORKDIR /app
+# Copy requirements first for better caching
+COPY requirements.txt .
+# Install Python dependencies
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir -r requirements.txt
+# Copy the application code
+COPY . .
+# Create a non-root user
+RUN useradd -m -u 1000 user
+USER user
+# Expose the port
+EXPOSE 7860
+# Command to run the application
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

app.py ADDED Viewed

	@@ -0,0 +1,235 @@

+import torch
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+import re
+import pandas as pd
+import warnings
+import os
+from fastapi import FastAPI
+from pydantic import BaseModel
+import uvicorn
+warnings.filterwarnings('ignore')
+class ArabicProfanityTester:
+    def __init__(self, model_name='Speccco/arabic_profanity_filter'):
+        """Initialize the tester with model from Hugging Face Hub"""
+        print(f"🔄 Loading model from Hugging Face Hub: {model_name}...")
+        try:
+            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+            self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
+            self.model.eval()
+            print("✅ Model loaded successfully from Hugging Face Hub!")
+            print(f"📊 Model configuration:")
+            print(f"   - Model type: {type(self.model).__name__}")
+            print(f"   - Number of labels: {self.model.config.num_labels}")
+            print(f"   - Max position embeddings: {self.model.config.max_position_embeddings}")
+        except Exception as e:
+            print(f"❌ Failed to load model from Hub: {e}")
+            print("🔄 Falling back to base AraBERT model...")
+            # Fallback to base model
+            base_model = "aubmindlab/bert-base-arabertv02"
+            self.tokenizer = AutoTokenizer.from_pretrained(base_model)
+            self.model = AutoModelForSequenceClassification.from_pretrained(
+                base_model,
+                num_labels=2
+            )
+            self.model.eval()
+            print("⚠️  Using base AraBERT model (not fine-tuned)")
+    def preprocess_text(self, text):
+        """Simple text preprocessing"""
+        if pd.isna(text):
+            return ""
+        text = str(text)
+        # Remove URLs, mentions, hashtags
+        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
+        text = re.sub(r'@\w+|#\w+', '', text)
+        # Remove extra whitespace
+        text = re.sub(r'\s+', ' ', text).strip()
+        return text
+    def check_bad_words(self, text):
+        """Check if text contains explicit bad Arabic/Egyptian words"""
+        bad_words = [
+            'شرموطة', 'خرا', 'زفت', 'أمك', 'يلعن دينك', 'متناك',
+            'منيك', 'نايك', 'طيز', 'عرص', 'قواد', 'وسخة', 'كسك',
+            'يا دين أمي', 'ابن وسخة'
+        ]
+        text_lower = text.lower()
+        found_words = []
+        for bad_word in bad_words:
+            if bad_word.lower() in text_lower:
+                found_words.append(bad_word)
+        return len(found_words) > 0, found_words
+    def predict(self, text, show_details=True):
+        """Predict if text is offensive or not with bad words override"""
+        # Preprocess text
+        processed_text = self.preprocess_text(text)
+        # Check for explicit bad words first
+        has_bad_words, found_bad_words = self.check_bad_words(text)
+        # Tokenize
+        inputs = self.tokenizer(
+            processed_text,
+            return_tensors='pt',
+            truncation=True,
+            max_length=256,
+            padding=True
+        )
+        # Get model prediction
+        with torch.no_grad():
+            outputs = self.model(**inputs)
+            logits = outputs.logits
+            probabilities = torch.softmax(logits, dim=-1)
+            model_predicted_class = torch.argmax(probabilities, dim=-1).item()
+            model_confidence = probabilities[0][model_predicted_class].item()
+        # Final decision: bad words override model prediction
+        if has_bad_words:
+            final_prediction = "Bad"
+            final_class = 1  # Offensive
+            override_reason = f"Contains explicit bad words: {', '.join(found_bad_words)}"
+        else:
+            final_prediction = "Good" if model_predicted_class == 0 else "Bad"
+            final_class = model_predicted_class
+            override_reason = None
+        # Prepare result
+        result = {
+            'original_text': text,
+            'processed_text': processed_text,
+            'model_prediction': 'Offensive' if model_predicted_class == 1 else 'Non-Offensive',
+            'model_confidence': model_confidence,
+            'final_prediction': final_prediction,
+            'final_class': final_class,
+            'has_bad_words': has_bad_words,
+            'found_bad_words': found_bad_words,
+            'override_reason': override_reason,
+            'probabilities': {
+                'non_offensive': probabilities[0][0].item(),
+                'offensive': probabilities[0][1].item()
+            }
+        }
+        return result
+class ProfanityRequest(BaseModel):
+    text: str
+class BatchProfanityRequest(BaseModel):
+    texts: list[str]
+app = FastAPI(
+    title="Arabic Profanity Filter API",
+    description="An API to detect profanity in Arabic text using a fine-tuned AraBERT model with rule-based override.",
+    version="1.0.0",
+    docs_url="/docs",
+    redoc_url="/redoc"
+)
+# Initialize the tester globally
+tester = None
+@app.on_event("startup")
+async def startup_event():
+    """Initialize the model on startup"""
+    global tester
+    try:
+        tester = ArabicProfanityTester()
+        print("🚀 Arabic Profanity Filter API is ready!")
+    except Exception as e:
+        print(f"❌ Failed to load model: {e}")
+        raise e
+@app.get("/", tags=["General"])
+def read_root():
+    return {
+        "message": "Welcome to the Arabic Profanity Filter API",
+        "description": "Detects profanity in Arabic text using AraBERT model with rule-based override",
+        "endpoints": {
+            "predict": "/predict - Single text prediction",
+            "batch": "/batch - Batch text prediction",
+            "health": "/health - Health check",
+            "docs": "/docs - API documentation"
+        }
+    }
+@app.get("/health", tags=["General"])
+def health_check():
+    """Health check endpoint"""
+    if tester is None:
+        return {"status": "unhealthy", "message": "Model not loaded"}
+    return {"status": "healthy", "message": "API is running"}
+@app.post("/predict", tags=["Prediction"])
+async def predict_profanity(request: ProfanityRequest):
+    """
+    Predicts if the given Arabic text contains profanity.
+    - **text**: The Arabic text to analyze.
+    Returns:
+    - original_text: The input text
+    - processed_text: Text after preprocessing
+    - model_prediction: Model's prediction (Offensive/Non-Offensive)
+    - model_confidence: Model's confidence score
+    - final_prediction: Final result (Good/Bad) after rule-based override
+    - has_bad_words: Whether explicit bad words were found
+    - found_bad_words: List of bad words found
+    - probabilities: Detailed probability scores
+    """
+    if tester is None:
+        return {"error": "Model not loaded"}
+    try:
+        result = tester.predict(request.text, show_details=False)
+        return result
+    except Exception as e:
+        return {"error": f"Prediction failed: {str(e)}"}
+@app.post("/batch", tags=["Prediction"])
+async def predict_batch_profanity(request: BatchProfanityRequest):
+    """
+    Predicts profanity for multiple Arabic texts.
+    - **texts**: List of Arabic texts to analyze.
+    Returns list of prediction results for each text.
+    """
+    if tester is None:
+        return {"error": "Model not loaded"}
+    try:
+        results = []
+        for text in request.texts:
+            result = tester.predict(text, show_details=False)
+            results.append(result)
+        return {
+            "predictions": results,
+            "summary": {
+                "total": len(results),
+                "bad_count": sum(1 for r in results if r['final_prediction'] == 'Bad'),
+                "good_count": sum(1 for r in results if r['final_prediction'] == 'Good'),
+                "explicit_bad_words_count": sum(1 for r in results if r['has_bad_words'])
+            }
+        }
+    except Exception as e:
+        return {"error": f"Batch prediction failed: {str(e)}"}
+if __name__ == "__main__":
+    import os
+    port = int(os.environ.get("PORT", 7860))
+    uvicorn.run(app, host="0.0.0.0", port=port)

requirements.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+torch>=2.0.0
+transformers>=4.21.0
+fastapi>=0.104.0
+uvicorn[standard]>=0.24.0
+pydantic>=2.0.0
+pandas>=1.5.0
+numpy>=1.24.0
+scikit-learn>=1.3.0
+python-multipart
+accelerate
+sentencepiece
+protobuf