import re

import gradio as gr
import joblib
import nltk
import pandas as pd
import torch
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from torch import nn
from transformers import AutoTokenizer, AutoModel

nltk.download("stopwords")

# Setup
model_path = "."  # All files are in the root directory
tokenizer = AutoTokenizer.from_pretrained(model_path)
product_encoder = joblib.load("category_encoder.pkl")
base_model_name = "DataScienceWFSR/bert-food-product-category-cw"
stop_words = set(stopwords.words("english"))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Clean text: lowercase, strip HTML, remove URLs, punctuation and stopwords
def clean_text(text):
    text = text.lower()
    text = BeautifulSoup(text, "html.parser").get_text()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^\w\s]", "", text)
    tokens = text.split()
    tokens = [w for w in tokens if w not in stop_words]
    return " ".join(tokens)


# Template: combine date, country, title and text into a single model input string
def template_(day, month, year, country, title, text):
    return (
        f"Date: day {day}, month {month}, year {year}. "
        f"Country: {country}. Title: {title}. Text: {text}"
    )


# Model definition: BERT encoder with dropout and a linear head on the [CLS] token
class ProductCategoryClassifier(nn.Module):
    def __init__(self, model_name, num_categories):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.4)
        hidden_size = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_categories)

    def forward(self, input_ids, attention_mask=None):
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_token = self.dropout(output.last_hidden_state[:, 0, :])
        logits = self.classifier(cls_token)
        return logits


# Load model weights
num_categories = len(product_encoder.classes_)
model = ProductCategoryClassifier(model_name=base_model_name, num_categories=num_categories).to(device)
model.load_state_dict(torch.load("pytorch_model.bin", map_location=device))
model.eval()


# Inference function
def predict_category(day, month, year, title, text, country="Unknown"):
    title_clean = clean_text(title)
    text_clean = clean_text(text)
    input_text = template_(day, month, year, country, title_clean, text_clean)
    inputs = tokenizer([input_text], padding=True, truncation=True, max_length=512, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)
    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
    pred = torch.argmax(logits, dim=1).cpu().numpy()[0]
    category = product_encoder.inverse_transform([pred])[0]
    return category


# Gradio interface
iface = gr.Interface(
    fn=predict_category,
    inputs=[
        gr.Number(label="Day"),
        gr.Number(label="Month"),
        gr.Number(label="Year"),
        gr.Textbox(label="Title"),
        gr.Textbox(label="Text", lines=5),
    ],
    outputs="text",
    title="Product Category Predictor",
    description="Enter date and text details to predict the product category.",
)

# Run the app
if __name__ == "__main__":
    iface.launch()
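
# ---------------------------------------------------------------------------
# Illustrative usage (not part of the app): a minimal sketch of calling the
# inference function directly, e.g. for a quick smoke test without the UI.
# The sample values below are hypothetical and only show the expected
# argument types; uncomment to run after the model files are in place.
#
#   category = predict_category(
#       day=12, month=3, year=2021,
#       title="Recall of chocolate bars",
#       text="Undeclared milk found in dark chocolate bars.",
#       country="United Kingdom",
#   )
#   print(category)
# ---------------------------------------------------------------------------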