import gradio as gr
from autodistill_clip import CLIP
from autodistill_metaclip import MetaCLIP
from autodistill.detection import CaptionOntology
from PIL import Image
import tempfile
clip_model = CLIP(None)
metaclip_model = MetaCLIP(None)
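# Note: both models are loaded once at startup without an ontology; each request
# below builds a CaptionOntology from the user's comma-separated labels and
# attaches it before predicting.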
# create the side-by-side interface
def clip_model_interface(image, text):
    # add a catch-all class so images that match none of the user's prompts are
    # not forced onto one of their labels
    text = text + ", something else"

    # strip whitespace so "cat, dog" and "cat,dog" yield the same prompts
    prompts = [t.strip() for t in text.split(",")]

    with tempfile.NamedTemporaryFile(suffix=".jpg") as temp:
        # Gradio passes the image as a numpy array; autodistill predicts from a
        # file path, so save it to a temporary JPEG first
        image = Image.fromarray(image.astype("uint8"), "RGB")
        image.save(temp.name)

        # map each prompt to itself as the class label
        ontology = CaptionOntology({p: p for p in prompts})
        clip_model.ontology = ontology

        predictions = clip_model.predict(temp.name)

        labels = [prompts[i] for i in predictions.class_id.tolist()]
        confidences = predictions.confidence.tolist()

        return dict(zip(labels, confidences))
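# Aside: CaptionOntology maps a prompt (dict key) to the class label returned
# for it (dict value). In this app prompt and label are identical, but they can
# differ; a purely illustrative sketch (the captions below are example values,
# not part of this app):
#
#     CaptionOntology({
#         "a photo of a cat": "cat",
#         "a photo of a dog": "dog",
#     })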
def metaclip_model_interface(image, text):
    # same preprocessing as the CLIP handler above
    text = text + ", something else"
    prompts = [t.strip() for t in text.split(",")]

    with tempfile.NamedTemporaryFile(suffix=".jpg") as temp:
        image = Image.fromarray(image.astype("uint8"), "RGB")
        image.save(temp.name)

        ontology = CaptionOntology({p: p for p in prompts})
        metaclip_model.ontology = ontology

        # confidence=0 keeps a score for every prompt rather than only the top match
        predictions = metaclip_model.predict(temp.name, confidence=0)

        labels = [prompts[i] for i in predictions.class_id.tolist()]
        confidences = predictions.confidence.tolist()

        return dict(zip(labels, confidences))
def combined_model_interface(input_image, input_text):
    # call each model's handler on the same inputs
    clip_output = clip_model_interface(input_image, input_text)
    metaclip_output = metaclip_model_interface(input_image, input_text)

    # return the results from both models as a tuple, one per output component
    return clip_output, metaclip_output
inputs = [
    gr.Image(label="Image"),
    gr.Textbox(label="Comma-separated labels"),
]

# use Gradio's current component API; the legacy gr.outputs module has been removed
outputs = [
    gr.Label(label="CLIP"),
    gr.Label(label="MetaCLIP"),
]
title = "CLIP vs MetaCLIP" | |
description = """ | |
CLIP is a zero-shot classification and embedding model developed by OpenAI. | |
MetaCLIP is a model that uses a CLIP architecture with an open dataset, developed by Meta AI. | |
Use this space to try out the models and see how they perform on your own images and text. | |
Note: Due to the way this space was implemented, CLIP will only return the top class. A fix is coming soon. | |
This project uses the following dependencies: | |
- [autodistill-clip](https://github.com/autodistill/autodistill-clip) | |
- [autodistill-metaclip](https://github.com/autodistill/autodistill-metaclip) | |
""" | |
gr.Interface(
    fn=combined_model_interface,
    inputs=inputs,
    outputs=outputs,
    title=title,
    description=description,
    allow_flagging="never",
).launch()
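# To run this app outside of Spaces you need, at minimum, gradio, autodistill,
# autodistill-clip, and autodistill-metaclip installed; exact versions are not
# pinned here and are left as an assumption, e.g.:
#
#     pip install gradio autodistill autodistill-clip autodistill-metaclip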