Anandhju-jayan chats-bug committed
Commit 1673a2d · 0 Parent(s)

Duplicate from chats-bug/ai-image-captioning

Co-authored-by: Sukrit Chatterjee <chats-bug@users.noreply.huggingface.co>

Files changed (8)
  1. .gitattributes +35 -0
  2. Image1.png +3 -0
  3. Image2.png +3 -0
  4. Image3.png +3 -0
  5. README.md +14 -0
  6. app.py +102 -0
  7. model.py +149 -0
  8. requirements.txt +5 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.png filter=lfs diff=lfs merge=lfs -text
Image1.png ADDED

Git LFS Details

  • SHA256: 6509058d30a3047f22d8ce478c2099caa25d3f989e3288541a9c22a4266deeea
  • Pointer size: 132 Bytes
  • Size of remote file: 2.41 MB
Image2.png ADDED

Git LFS Details

  • SHA256: ea2153871d79f0a8f91b4c390167218b19cd3de563220ea4464525ab962672e7
  • Pointer size: 132 Bytes
  • Size of remote file: 2.13 MB
Image3.png ADDED

Git LFS Details

  • SHA256: 4a2046a944a7c4be9f6ee3e6e2a26c06cea862985f415a4660a0a365273321a5
  • Pointer size: 132 Bytes
  • Size of remote file: 1.86 MB
README.md ADDED
@@ -0,0 +1,14 @@
+ ---
+ title: Ai Image Captioning
+ emoji: 📈
+ colorFrom: blue
+ colorTo: yellow
+ sdk: gradio
+ sdk_version: 3.28.2
+ app_file: app.py
+ pinned: false
+ license: mit
+ duplicated_from: chats-bug/ai-image-captioning
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,102 @@
+ import gradio as gr
+ import torch
+ from PIL import Image
+
+ from model import BlipBaseModel, GitBaseCocoModel
+
+ MODELS = {
+     "Git-Base-COCO": GitBaseCocoModel,
+     "Blip Base": BlipBaseModel,
+ }
+
+ # examples = [["Image1.png"], ["Image2.png"], ["Image3.png"]]
+
+ def generate_captions(
+     image,
+     num_captions,
+     model_name,
+     max_length,
+     temperature,
+     top_k,
+     top_p,
+     repetition_penalty,
+     diversity_penalty,
+ ):
+     """
+     Generates captions for the given image.
+
+     -----
+     Parameters:
+     image: PIL.Image
+         The image to generate captions for.
+     num_captions: int
+         The number of captions to generate.
+     ** The rest of the parameters are the same as in the model.generate method. **
+     -----
+     Returns:
+     list[str]
+     """
+     # Convert the numerical values to their corresponding types.
+     # Gradio Sliders return floats, except when the value is a whole number,
+     # in which case they return an int; only the float-valued parameters are affected.
+     temperature = float(temperature)
+     top_p = float(top_p)
+     repetition_penalty = float(repetition_penalty)
+     diversity_penalty = float(diversity_penalty)
+
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+
+     model = MODELS[model_name](device)
+
+     captions = model.generate(
+         image=image,
+         max_length=max_length,
+         num_captions=num_captions,
+         temperature=temperature,
+         top_k=top_k,
+         top_p=top_p,
+         repetition_penalty=repetition_penalty,
+         diversity_penalty=diversity_penalty,
+     )
+
+     # Convert the list to a single string separated by newlines.
+     captions = "\n".join(captions)
+     return captions
+
+ title = "AI tool for generating captions for images"
+ description = "This tool uses pretrained models to generate captions for images."
+
+ interface = gr.Interface(
+     fn=generate_captions,
+     inputs=[
+         gr.components.Image(type="pil", label="Image"),
+         gr.components.Slider(minimum=1, maximum=10, step=1, value=1, label="Number of Captions to Generate"),
+         gr.components.Dropdown(MODELS.keys(), label="Model", value=list(MODELS.keys())[1]),  # Default to Blip Base
+         gr.components.Slider(minimum=20, maximum=100, step=5, value=50, label="Maximum Caption Length"),
+         gr.components.Slider(minimum=0.1, maximum=10.0, step=0.1, value=1.0, label="Temperature"),
+         gr.components.Slider(minimum=1, maximum=100, step=1, value=50, label="Top K"),
+         gr.components.Slider(minimum=0.1, maximum=5.0, step=0.1, value=1.0, label="Top P"),
+         gr.components.Slider(minimum=1.0, maximum=10.0, step=0.1, value=2.0, label="Repetition Penalty"),
+         gr.components.Slider(minimum=0.0, maximum=10.0, step=0.1, value=2.0, label="Diversity Penalty"),
+     ],
+     outputs=[
+         gr.components.Textbox(label="Caption"),
+     ],
+     # Image examples to be displayed in the interface.
+     examples=[
+         ["Image1.png", 1, list(MODELS.keys())[1], 50, 1.0, 50, 1.0, 2.0, 2.0],
+         ["Image2.png", 1, list(MODELS.keys())[1], 50, 1.0, 50, 1.0, 2.0, 2.0],
+         ["Image3.png", 1, list(MODELS.keys())[1], 50, 1.0, 50, 1.0, 2.0, 2.0],
+     ],
+     title=title,
+     description=description,
+     allow_flagging="never",
+ )
+
+
+ if __name__ == "__main__":
+     # Launch the interface.
+     interface.launch(
+         enable_queue=True,
+         debug=True,
+     )
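
The Gradio UI above is a thin wrapper around generate_captions, so the function can also be called directly for quick testing. A minimal sketch, not part of the commit, assuming the repository files (app.py, model.py and the bundled Image1.png) are in the working directory; the BLIP checkpoint is downloaded from the Hub on first use:

# Minimal sketch: call generate_captions without launching the Gradio UI.
from PIL import Image
from app import generate_captions

caption = generate_captions(
    image=Image.open("Image1.png"),
    num_captions=1,
    model_name="Blip Base",  # key from the MODELS dict
    max_length=50,
    temperature=1.0,
    top_k=50,
    top_p=1.0,
    repetition_penalty=2.0,
    diversity_penalty=2.0,
)
print(caption)

Note that generate_captions returns the captions joined into a single newline-separated string, matching what the Textbox output expects.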
model.py ADDED
@@ -0,0 +1,149 @@
+ from transformers import AutoProcessor, AutoModelForCausalLM, BlipForConditionalGeneration
+
+ class ImageCaptionModel:
+     def __init__(
+         self,
+         device,
+         processor,
+         model,
+     ) -> None:
+         """
+         Initializes the model for generating captions for images.
+
+         -----
+         Parameters:
+         device: str
+             The device to use for the model. Must be either "cpu" or "cuda".
+         processor: transformers.AutoProcessor
+             The preprocessor to use for the model.
+         model: transformers.AutoModelForCausalLM or transformers.BlipForConditionalGeneration
+             The model to use for generating captions.
+
+         -----
+         Returns:
+         None
+         """
+         self.device = device
+         self.processor = processor
+         self.model = model
+         self.model.to(self.device)
+
+     def generate(
+         self,
+         image,
+         num_captions: int = 1,
+         max_length: int = 50,
+         temperature: float = 1.0,
+         top_k: int = 50,
+         top_p: float = 1.0,
+         repetition_penalty: float = 1.0,
+         diversity_penalty: float = 0.0,
+     ):
+         """
+         Generates captions for the given image.
+
+         -----
+         Parameters:
+         image: PIL.Image
+             The image to generate captions for.
+         num_captions: int
+             The number of captions to generate. Defaults to 1.
+         max_length: int
+             The maximum length of the generated captions. Defaults to 50.
+         temperature: float
+             The value used to modulate the next-token probabilities in the model's generate method. Must be strictly positive. Defaults to 1.0.
+         top_k: int
+             The number of highest-probability vocabulary tokens to keep for top-k filtering. A larger top_k keeps more candidates per step, which can improve quality at the cost of speed. Defaults to 50.
+         top_p: float
+             The value used for nucleus filtering in the model's generate method. If set to a float < 1, only the most probable tokens whose probabilities add up to top_p or higher are kept for generation. Defaults to 1.0.
+         repetition_penalty: float
+             The parameter for repetition penalty. 1.0 means no penalty. Defaults to 1.0.
+         diversity_penalty: float
+             The parameter for diversity penalty. 0.0 means no penalty. Defaults to 0.0.
+
+         """
+         # Type checking and making sure the values are valid.
+         assert type(num_captions) == int and num_captions > 0, "num_captions must be a positive integer."
+         assert type(max_length) == int and max_length > 0, "max_length must be a positive integer."
+         assert type(temperature) == float and temperature > 0.0, "temperature must be a positive float."
+         assert type(top_k) == int and top_k > 0, "top_k must be a positive integer."
+         assert type(top_p) == float and top_p > 0.0, "top_p must be a positive float."
+         assert type(repetition_penalty) == float and repetition_penalty >= 1.0, "repetition_penalty must be a float greater than or equal to 1."
+         assert type(diversity_penalty) == float and diversity_penalty >= 0.0, "diversity_penalty must be a non-negative float."
+
+         pixel_values = self.processor(images=image, return_tensors="pt").pixel_values.to(self.device)  # Convert the image to pixel values.
+
+         # Generate caption ids.
+         if num_captions == 1:
+             generated_ids = self.model.generate(
+                 pixel_values=pixel_values,
+                 max_length=max_length,
+                 num_return_sequences=1,
+                 temperature=temperature,
+                 top_k=top_k,
+                 top_p=top_p,
+             )
+         else:
+             generated_ids = self.model.generate(
+                 pixel_values=pixel_values,
+                 max_length=max_length,
+                 num_beams=num_captions,  # num_beams must be >= num_captions and divisible by num_beam_groups.
+                 num_beam_groups=num_captions,  # num_beam_groups is set equal to num_captions so that the captions are diverse;
+                 num_return_sequences=num_captions,  # plain beam search would otherwise return captions that are very similar to each other.
+                 temperature=temperature,
+                 top_k=top_k,
+                 top_p=top_p,
+                 repetition_penalty=repetition_penalty,
+                 diversity_penalty=diversity_penalty,
+             )
+
+         # Decode the generated ids to get the captions.
+         generated_caption = self.processor.batch_decode(generated_ids, skip_special_tokens=True)
+
+         return generated_caption
+
+
+ class GitBaseCocoModel(ImageCaptionModel):
+     def __init__(self, device):
+         """
+         A wrapper class for the Git-Base-COCO model. It is a pretrained model for image captioning.
+
+         -----
+         Parameters:
+         device: str
+             The device to run the model on, either "cpu" or "cuda".
+
+         -----
+         Returns:
+         None
+         """
+         checkpoint = "microsoft/git-base-coco"
+         processor = AutoProcessor.from_pretrained(checkpoint)
+         model = AutoModelForCausalLM.from_pretrained(checkpoint)
+         super().__init__(device, processor, model)
+
+
+ class BlipBaseModel(ImageCaptionModel):
+     def __init__(self, device):
+         """
+         A wrapper class for the Blip-Base model. It is a pretrained model for image captioning.
+
+         -----
+         Parameters:
+         device: str
+             The device to run the model on, either "cpu" or "cuda".
+
+         -----
+         Returns:
+         None
+         """
+         self.checkpoint = "Salesforce/blip-image-captioning-base"
+         processor = AutoProcessor.from_pretrained(self.checkpoint)
+         model = BlipForConditionalGeneration.from_pretrained(self.checkpoint)
+         super().__init__(device, processor, model)
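
The wrapper classes can also be used on their own, without app.py. A minimal sketch, not part of the commit; "photo.jpg" is a placeholder for any local image, and num_captions > 1 exercises the diverse beam-search branch of generate, exactly as the Gradio app does:

# Minimal sketch: use the BlipBaseModel wrapper from model.py directly.
import torch
from PIL import Image
from model import BlipBaseModel

device = "cuda" if torch.cuda.is_available() else "cpu"
captioner = BlipBaseModel(device)

captions = captioner.generate(
    image=Image.open("photo.jpg"),  # placeholder path, any RGB image works
    num_captions=3,                 # > 1 triggers num_beams / num_beam_groups = 3
    max_length=50,
    diversity_penalty=2.0,
)
for caption in captions:
    print(caption)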
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ torch
+ open_clip_torch
+ accelerate
+ bitsandbytes
+ git+https://github.com/huggingface/transformers.git@main
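
For running the Space locally, note that gradio itself is not listed in requirements.txt; it is normally provided by the Spaces runtime (sdk: gradio, sdk_version: 3.28.2 in README.md), so it has to be installed separately for a local run. A minimal sketch, assuming the commit's files are in the working directory and the dependencies plus gradio ~= 3.28 are installed:

# Minimal sketch: launch the app locally (equivalent to `python app.py`).
from app import interface  # the gr.Interface is built at import time in app.py

interface.launch()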