import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import re
import pandas as pd
import warnings
import os
from fastapi import FastAPI
from pydantic import BaseModel
import uvicorn

warnings.filterwarnings('ignore')

class ArabicProfanityTester:
    def __init__(self, model_name='Speccco/arabic_profanity_filter'):
        """Initialize the tester with model from Hugging Face Hub"""
        print(f"🔄 Loading model from Hugging Face Hub: {model_name}...")
        
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
            self.model.eval()
            
            print("✅ Model loaded successfully from Hugging Face Hub!")
            print(f"📊 Model configuration:")
            print(f"   - Model type: {type(self.model).__name__}")
            print(f"   - Number of labels: {self.model.config.num_labels}")
            print(f"   - Max position embeddings: {self.model.config.max_position_embeddings}")
            
        except Exception as e:
            print(f"❌ Failed to load model from Hub: {e}")
            print("🔄 Falling back to base AraBERT model...")
            
            # Fallback to base model
            base_model = "aubmindlab/bert-base-arabertv02"
            self.tokenizer = AutoTokenizer.from_pretrained(base_model)
            self.model = AutoModelForSequenceClassification.from_pretrained(
                base_model, 
                num_labels=2
            )
            self.model.eval()
            print("⚠️  Using base AraBERT model (not fine-tuned)")
        
    def preprocess_text(self, text):
        """Simple text preprocessing"""
        if pd.isna(text):
            return ""
        
        text = str(text)
        # Remove URLs, mentions, hashtags
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        text = re.sub(r'@\w+|#\w+', '', text)
        # Remove emojis and other unicode symbols
        emoji_pattern = re.compile("["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
            u"\U00002702-\U000027B0"  # dingbats
            u"\U000024C2-\U0001F251"  # enclosed characters
            u"\U0001F900-\U0001F9FF"  # supplemental symbols
            u"\U0001FA00-\U0001FAFF"  # extended symbols
            u"\u2600-\u26FF"          # miscellaneous symbols
            u"\u2700-\u27BF"          # dingbats
            u"\uFE00-\uFE0F"          # variation selectors
            u"\u200D"                 # zero width joiner
            "]+", flags=re.UNICODE)
        text = emoji_pattern.sub(r'', text)
        # Remove English alphabets
        text = re.sub(r'[a-zA-Z]', '', text)
        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()
        
        return text
    
    def check_bad_words(self, text):
        """Check if text contains explicit bad Arabic/Egyptian words"""
        bad_words = [
            'شرموطة', 'خرا', 'زفت', 'أمك', 'يلعن دينك', 'متناك', 
            'منيك', 'نايك', 'طيز', 'عرص', 'قواد', 'وسخة', 'كسك', 
            'يا دين أمي', 'ابن وسخة'
        ]
        
        text_lower = text.lower()
        found_words = []
        
        for bad_word in bad_words:
            if bad_word.lower() in text_lower:
                found_words.append(bad_word)
        
        return len(found_words) > 0, found_words
    
    def predict(self, text, show_details=True):
        """Predict if text is offensive or not with bad words override"""
        # Preprocess text
        processed_text = self.preprocess_text(text)
        
        # Check for explicit bad words first
        has_bad_words, found_bad_words = self.check_bad_words(text)
        
        # Tokenize
        inputs = self.tokenizer(
            processed_text, 
            return_tensors='pt', 
            truncation=True, 
            max_length=256,
            padding=True
        )
        
        # Get model prediction
        with torch.no_grad():
            outputs = self.model(**inputs)
            logits = outputs.logits
            probabilities = torch.softmax(logits, dim=-1)
            model_predicted_class = torch.argmax(probabilities, dim=-1).item()
            model_confidence = probabilities[0][model_predicted_class].item()
        
        # Final decision: bad words override model prediction
        if has_bad_words:
            final_prediction = "Bad"
            final_class = 1  # Offensive
            override_reason = f"Contains explicit bad words: {', '.join(found_bad_words)}"
        else:
            final_prediction = "Good" if model_predicted_class == 0 else "Bad"
            final_class = model_predicted_class
            override_reason = None
        
        # Prepare result
        result = {
            'original_text': text,
            'processed_text': processed_text,
            'model_prediction': 'Offensive' if model_predicted_class == 1 else 'Non-Offensive',
            'model_confidence': model_confidence,
            'final_prediction': final_prediction,
            'final_class': final_class,
            'has_bad_words': has_bad_words,
            'found_bad_words': found_bad_words,
            'override_reason': override_reason,
            'probabilities': {
                'non_offensive': probabilities[0][0].item(),
                'offensive': probabilities[0][1].item()
            }
        }
        
        return result

class ProfanityRequest(BaseModel):
    text: str

class BatchProfanityRequest(BaseModel):
    texts: list[str]

app = FastAPI(
    title="Arabic Profanity Filter API",
    description="An API to detect profanity in Arabic text using a fine-tuned AraBERT model with rule-based override.",
    version="1.0.0",
    docs_url="/docs",
    redoc_url="/redoc"
)

# Initialize the tester globally
tester = None

@app.on_event("startup")
async def startup_event():
    """Initialize the model on startup"""
    global tester
    try:
        tester = ArabicProfanityTester()
        print("🚀 Arabic Profanity Filter API is ready!")
    except Exception as e:
        print(f"❌ Failed to load model: {e}")
        raise e

@app.get("/", tags=["General"])
def read_root():
    return {
        "message": "Welcome to the Arabic Profanity Filter API",
        "description": "Detects profanity in Arabic text using AraBERT model with rule-based override",
        "endpoints": {
            "predict": "/predict - Single text prediction",
            "batch": "/batch - Batch text prediction",
            "health": "/health - Health check",
            "docs": "/docs - API documentation"
        }
    }

@app.get("/health", tags=["General"])
def health_check():
    """Health check endpoint"""
    if tester is None:
        return {"status": "unhealthy", "message": "Model not loaded"}
    return {"status": "healthy", "message": "API is running"}

@app.post("/predict", tags=["Prediction"])
async def predict_profanity(request: ProfanityRequest):
    """
    Predicts if the given Arabic text contains profanity.
    
    - **text**: The Arabic text to analyze.
    
    Returns:
    - original_text: The input text
    - processed_text: Text after preprocessing
    - model_prediction: Model's prediction (Offensive/Non-Offensive)
    - model_confidence: Model's confidence score
    - final_prediction: Final result (Good/Bad) after rule-based override
    - has_bad_words: Whether explicit bad words were found
    - found_bad_words: List of bad words found
    - probabilities: Detailed probability scores
    """
    if tester is None:
        return {"error": "Model not loaded"}
    
    try:
        result = tester.predict(request.text, show_details=False)
        return result
    except Exception as e:
        return {"error": f"Prediction failed: {str(e)}"}

@app.post("/batch", tags=["Prediction"])
async def predict_batch_profanity(request: BatchProfanityRequest):
    """
    Predicts profanity for multiple Arabic texts.
    
    - **texts**: List of Arabic texts to analyze.
    
    Returns list of prediction results for each text.
    """
    if tester is None:
        return {"error": "Model not loaded"}
    
    try:
        results = []
        for text in request.texts:
            result = tester.predict(text, show_details=False)
            results.append(result)
        
        return {
            "predictions": results,
            "summary": {
                "total": len(results),
                "bad_count": sum(1 for r in results if r['final_prediction'] == 'Bad'),
                "good_count": sum(1 for r in results if r['final_prediction'] == 'Good'),
                "explicit_bad_words_count": sum(1 for r in results if r['has_bad_words'])
            }
        }
    except Exception as e:
        return {"error": f"Batch prediction failed: {str(e)}"}

if __name__ == "__main__":
    import os
    port = int(os.environ.get("PORT", 7860))
    uvicorn.run(app, host="0.0.0.0", port=port)