Spaces:

suriya7
/

Image-Captioning

Sleeping

App Files Files Community

Image-Captioning / app.py

suriya7

Update app.py

0eb573f verified over 1 year ago

raw

history blame contribute delete

4.93 kB


	from tensorflow.keras.preprocessing.image import load_img, img_to_array
	from tensorflow.keras.preprocessing.text import Tokenizer
	from tensorflow.keras.preprocessing.sequence import pad_sequences
	from tensorflow.keras.models import Model
	from tensorflow.keras.applications.xception import Xception, preprocess_input
	import pickle
	from tqdm import tqdm
	import os
	from PIL import Image
	from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Flatten, concatenate
	import numpy as np
	import gradio as gr
	import tensorflow as tf

	# Build the image feature extractor: instantiate Xception with its default
	# arguments, then expose the output of its second-to-last layer so the
	# network's final layer is dropped.
	_xception = Xception()
	model = Model(inputs=_xception.inputs, outputs=_xception.layers[-2].output)

	# import tensorflow as tf

	# class MyEmbedding(tf.keras.layers.Embedding):
	# def __init__(self, args, input_length=None, *kwargs):
	# super().__init__(args, *kwargs)
	# self.input_length = input_length

	# def from_config(cls, config):
	# input_length = config.pop('input_length', None)
	# instance = cls(**config)
	# instance.input_length = input_length
	# return instance

	# # Load the model with custom objects
	# caption_model = tf.keras.models.load_model('model.h5', custom_objects={'MyEmbedding': MyEmbedding})


	# Read the caption file, discarding its first line (presumably a header
	# row - confirm against captions.txt).
	with open('captions.txt', 'r') as f:
	    next(f)
	    captions_doc = f.read()

	# Group the captions by image id: {image_id: [caption, ...]}.
	mapping = {}
	for line in tqdm(captions_doc.split('\n')):
	    # Ignore empty and one-character lines.
	    if len(line) < 2:
	        continue
	    # First comma-separated field is the image file name, the rest is the
	    # caption text.
	    image_name, *caption_parts = line.split(',')
	    # The image id is the file name up to its first '.'.
	    image_id = image_name.split('.')[0]
	    # Commas inside the caption were split apart above; the pieces are
	    # rejoined with single spaces.
	    mapping.setdefault(image_id, []).append(" ".join(caption_parts))


	def clean(mapping):
	    """Normalise every caption stored in ``mapping`` in place.

	    Each caption is lower-cased, tokens of a single character are dropped,
	    the remaining tokens are joined with single spaces, and the result is
	    wrapped in the ``startseq`` / ``endseq`` markers.

	    Args:
	        mapping: dict whose values are lists of caption strings; the lists
	            are modified in place.

	    Returns:
	        None.
	    """
	    for caption_list in mapping.values():
	        for position, raw_caption in enumerate(caption_list):
	            text = raw_caption.lower()
	            # NOTE(review): str.replace matches literal substrings, so the
	            # two regex-looking patterns below are not applied as regular
	            # expressions. Kept as-is; presumably the tokenizer vocabulary
	            # must match what the caption model was trained on - confirm
	            # before changing.
	            text = text.replace('[^A-Za-z]', '')
	            text = text.replace(r'\s+', ' ')
	            # split() also collapses runs of whitespace.
	            kept_words = [token for token in text.split() if len(token) > 1]
	            caption_list[position] = 'startseq ' + " ".join(kept_words) + ' endseq'

	# Normalise the captions before they are collected. clean() is defined in
	# this file but was never invoked, so the captions never received the
	# 'startseq' / 'endseq' markers that predict_caption() seeds generation
	# with and stops on, and the tokenizer fitted below never learned them.
	clean(mapping)

	# Flatten the per-image caption lists into one list, in mapping order.
	all_captions = [
	    caption
	    for image_captions in mapping.values()
	    for caption in image_captions
	]


	# Fit a word-level tokenizer on every collected caption.
	tokenizer = Tokenizer()
	tokenizer.fit_on_texts(all_captions)
	# One more than the number of distinct words the tokenizer indexed.
	vocab_size = 1 + len(tokenizer.word_index)

	# Length, in whitespace-separated words, of the longest caption.
	max_length = max(map(len, (text.split() for text in all_captions)))


	def extract_features(image):
	    """Run the module-level feature extractor ``model`` on one image.

	    Args:
	        image: value accepted by ``load_img`` (the caller in this file
	            passes a file path).

	    Returns:
	        Whatever ``model.predict`` returns for a batch holding this image.
	    """
	    # Load the image resized to 299x299 and turn it into a pixel array.
	    pixels = img_to_array(load_img(image, target_size=(299, 299)))
	    # Add a leading batch axis, then apply the Xception preprocessing.
	    batch = preprocess_input(np.expand_dims(pixels, axis=0))
	    return model.predict(batch, verbose=0)

	def idx_to_word(integer, tokenizer):
	    """Return the word whose index in ``tokenizer.word_index`` is ``integer``.

	    Args:
	        integer: index to look up.
	        tokenizer: object exposing a ``word_index`` mapping of word -> index.

	    Returns:
	        The first matching word in iteration order, or ``None`` if no word
	        has that index.
	    """
	    matches = (
	        candidate
	        for candidate, position in tokenizer.word_index.items()
	        if position == integer
	    )
	    return next(matches, None)

	def save_image(img, save_dir="saved_images"):
	    """Write ``img`` to ``<save_dir>/uploaded_image.png`` and return that path.

	    The file name is fixed, so each call overwrites the previous upload.

	    Args:
	        img: object with a ``save(path)`` method (the caller in this file
	            passes a PIL image).
	        save_dir: target directory; created if it does not exist.

	    Returns:
	        The path the image was saved to, as a string.
	    """
	    os.makedirs(save_dir, exist_ok=True)
	    destination = os.path.join(save_dir, "uploaded_image.png")
	    img.save(destination)
	    return destination

	# generate caption for an image
	def predict_caption(model, image, tokenizer, max_length=35):
	    """Generate a caption one word at a time, always taking the top-scoring word.

	    Args:
	        model: object whose ``predict([image, sequence], verbose=0)`` returns
	            scores for the next word.
	        image: image features passed to ``model.predict`` unchanged.
	        tokenizer: tokenizer providing ``texts_to_sequences`` and the
	            ``word_index`` used by ``idx_to_word``.
	        max_length: maximum number of words to generate; also the length
	            the input sequence is padded to.

	    Returns:
	        The generated text, beginning with ``startseq`` and ending with
	        ``endseq`` when generation reached that marker.
	    """
	    # Generation is seeded with the start marker.
	    words = ['startseq']
	    for _ in range(max_length):
	        # Encode the text produced so far and pad it to max_length.
	        encoded = tokenizer.texts_to_sequences([" ".join(words)])[0]
	        padded = pad_sequences([encoded], max_length)
	        # Score the next word and keep the index with the highest score.
	        scores = model.predict([image, padded], verbose=0)
	        next_word = idx_to_word(np.argmax(scores), tokenizer)
	        # An index with no word in the vocabulary ends generation.
	        if next_word is None:
	            break
	        words.append(next_word)
	        # The end marker is kept in the output and ends generation.
	        if next_word == 'endseq':
	            break

	    return " ".join(words)



	def caption_prediction(img):
	    """Gradio callback: return a caption for the uploaded image.

	    Args:
	        img: image as an array accepted by ``Image.fromarray``, or ``None``
	            when no image was supplied.

	    Returns:
	        The generated caption without the ``startseq`` / ``endseq`` markers
	        and without surrounding whitespace; an empty string if ``img`` is
	        ``None``.
	    """
	    # Nothing to caption; avoid passing None to Image.fromarray.
	    if img is None:
	        return ""

	    image = Image.fromarray(img)
	    img_path = save_image(image)
	    features = extract_features(img_path)
	    # NOTE(review): `caption_model` is not assigned anywhere in the visible
	    # file (the only load call is commented out), so this line raises
	    # NameError unless it is defined elsewhere - confirm.
	    generated = predict_caption(caption_model, features, tokenizer)

	    # Strip the markers by token, not by fixed character offsets: the old
	    # [8:][:-6] slice left a leading and trailing space and cut six real
	    # characters whenever generation stopped without reaching 'endseq'.
	    words = generated.split()
	    if words and words[0] == 'startseq':
	        words = words[1:]
	    if words and words[-1] == 'endseq':
	        words = words[:-1]
	    return " ".join(words)

	# Gradio UI: one image input, one text output, served by caption_prediction.
	demo = gr.Interface(
	    fn=caption_prediction,
	    inputs='image',
	    outputs='text',
	    title='caption generator',
	)
	# Start the app with debug mode and a public share link enabled.
	demo.launch(share=True, debug=True)