from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.applications.xception import Xception, preprocess_input
import pickle
import re
from tqdm import tqdm
import os
from PIL import Image
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Flatten, concatenate
import numpy as np
import gradio as gr
import tensorflow as tf

# Xception as the image encoder: drop the classification head and keep the
# feature vector from the second-to-last layer
model = Xception()
model = Model(inputs=model.inputs, outputs=model.layers[-2].output)

# Older .h5 saves serialize Embedding with an input_length argument that newer
# Keras versions no longer accept; this shim strips it during deserialization.
class MyEmbedding(tf.keras.layers.Embedding):
    def __init__(self, *args, input_length=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.input_length = input_length

    @classmethod
    def from_config(cls, config):
        input_length = config.pop('input_length', None)
        instance = cls(**config)
        instance.input_length = input_length
        return instance

# Load the trained caption model with the custom Embedding shim
caption_model = tf.keras.models.load_model('model.h5', custom_objects={'MyEmbedding': MyEmbedding})

# read the captions file, skipping the header line
with open('captions.txt', 'r') as f:
    next(f)
    captions_doc = f.read()

# create mapping of image ID to its captions
mapping = {}
for line in tqdm(captions_doc.split('\n')):
    # split the line by comma: "<image>.jpg,<caption...>"
    tokens = line.split(',')
    if len(tokens) < 2:
        continue
    image_id, caption = tokens[0], tokens[1:]
    # remove extension from image ID
    image_id = image_id.split('.')[0]
    # re-join the caption text (it may itself contain commas)
    caption = " ".join(caption)
    # create list if needed, then store the caption
    if image_id not in mapping:
        mapping[image_id] = []
    mapping[image_id].append(caption)

def clean(mapping):
    for key, captions in mapping.items():
        for i in range(len(captions)):
            # take one caption at a time
            caption = captions[i]
            # lowercase, delete digits and special chars, collapse extra spaces
            caption = caption.lower()
            caption = re.sub(r'[^a-z ]', '', caption)
            caption = re.sub(r'\s+', ' ', caption)
            # drop single-character tokens and add start/end tags
            caption = 'startseq ' + " ".join([word for word in caption.split() if len(word) > 1]) + ' endseq'
            captions[i] = caption

# clean the captions in place so the tokenizer sees the same text
# (including the startseq/endseq tags) that is used during generation
clean(mapping)

all_captions = []
for key in mapping:
    for caption in mapping[key]:
        all_captions.append(caption)

# tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
vocab_size = len(tokenizer.word_index) + 1

# get maximum length of the captions available
max_length = max(len(caption.split()) for caption in all_captions)

def extract_features(image):
    # load and resize to Xception's expected 299x299 input
    image = load_img(image, target_size=(299, 299))
    # convert image pixels to a numpy array and add a batch dimension
    image = img_to_array(image)
    image = np.expand_dims(image, axis=0)
    image = preprocess_input(image)
    feature = model.predict(image, verbose=0)
    return feature

def idx_to_word(integer, tokenizer):
    # reverse lookup: word index -> word
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

def save_image(img, save_dir="saved_images"):
    # create the directory if it doesn't exist
    os.makedirs(save_dir, exist_ok=True)
    # save the uploaded image to disk so extract_features can load it by path
    img_name = os.path.join(save_dir, "uploaded_image.png")
    img.save(img_name)
    return img_name
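# The layer imports above (Embedding, LSTM, Dense, Dropout, concatenate) are
# otherwise unused here and hint at how model.h5 was likely built. The function
# below is only a hypothetical sketch of such a merge-style encoder-decoder,
# assuming a 2048-d Xception feature input; it is never called, and the actual
# saved model may differ.
def build_caption_model(vocab_size, max_length):
    # image branch: Xception feature vector -> dense embedding
    inputs1 = tf.keras.Input(shape=(2048,))
    fe1 = Dropout(0.4)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)
    # text branch: padded word-index sequence -> LSTM encoding
    inputs2 = tf.keras.Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.4)(se1)
    se3 = LSTM(256)(se2)
    # merge both branches and predict the next word over the vocabulary
    merged = concatenate([fe2, se3])
    decoder = Dense(256, activation='relu')(merged)
    outputs = Dense(vocab_size, activation='softmax')(decoder)
    m = Model(inputs=[inputs1, inputs2], outputs=outputs)
    m.compile(loss='categorical_crossentropy', optimizer='adam')
    return m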
# generate a caption for an image, one word at a time
def predict_caption(model, image, tokenizer, max_length=35):
    # start tag seeds the generation process
    in_text = 'startseq'
    # iterate up to the maximum sequence length
    for i in range(max_length):
        # encode and pad the input sequence
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        # predict the next word and take the highest-probability index
        yhat = model.predict([image, sequence], verbose=0)
        yhat = np.argmax(yhat)
        # convert index to word; stop if the word is not found
        word = idx_to_word(yhat, tokenizer)
        if word is None:
            break
        # append word to the input for generating the next word
        in_text += " " + word
        # stop if we reach the end tag
        if word == 'endseq':
            break
    return in_text

def caption_prediction(img):
    # Gradio hands in a numpy array; save it so extract_features can reload it
    image = Image.fromarray(img)
    img_path = save_image(image)
    features = extract_features(img_path)
    # generate, then strip the startseq/endseq tags from the output
    y_pred = predict_caption(caption_model, features, tokenizer, max_length)
    return y_pred.replace('startseq', '').replace('endseq', '').strip()

demo = gr.Interface(fn=caption_prediction, inputs='image', outputs='text', title='Caption Generator')
demo.launch(debug=True, share=True)
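# pickle is imported above but never used. In a deployment script like this, a
# typical use would be persisting the fitted tokenizer and max_length so the
# app can start without re-reading captions.txt. A minimal sketch follows (the
# 'tokenizer.pkl' path is our own choice, not part of the original script); it
# would have to run before demo.launch(), so it is left commented out here:
#
# with open('tokenizer.pkl', 'wb') as f:
#     pickle.dump({'tokenizer': tokenizer, 'max_length': max_length}, f)
#
# with open('tokenizer.pkl', 'rb') as f:
#     saved = pickle.load(f)
# tokenizer, max_length = saved['tokenizer'], saved['max_length']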