"""
HF Space: CLIP Latent Conformity & Likelihood
-------------------------------------------------
This Gradio app computes (1) conformity-to-mean and (2) relative log-likelihood
for CLIP image/text embeddings. It also supports pairwise comparison in terms of
both metrics.
IMPORTANT (data provenance): The matrices below that drive the likelihood are
loaded from MS-COCO–based statistics. We use the precomputed means and W matrices
provided in the repo: https://github.com/rbetser/W_CLIP/tree/main/w_mats.
Definitions used here (following the papers cited below):
- Conformity (per modality) = cosine similarity between a unit-normalized sample
  feature and the corresponding modality mean feature (also unit-normalized).
- Log-likelihood (per modality) assumes a Gaussian model in whitened space, using
  an MS-COCO-based whitening matrix W applied to the centered feature (row-vector
  convention, matching the code below):
      w = (x - mu) W
      loglike_rel(x) = -0.5 * (D * log(2*pi) + ||w||^2)
  i.e., the Gaussian log-density up to the constant log-determinant term
  (hence "relative").
Notes:
- Conformity measure is based on the paper: "The Double-Ellipsoid Geometry of CLIP" (https://arxiv.org/abs/2411.14517)
- Likelihood measure is based on the paper: "Whitened CLIP as a Likelihood Surrogate of Images and Captions" (https://arxiv.org/abs/2505.06934)
- CLIP embedding dim is 768 for ViT-L/14.
- We keep modality-specific means (mu_img, mu_txt) and whitening matrices
  (W_img, W_txt). These are loaded at runtime from local `.pt` files shipped
  with the Space.
"""
from __future__ import annotations
import gradio as gr
import torch
import numpy as np
from PIL import Image
from transformers import CLIPModel, AutoProcessor
# ---------------------------
# Load internal statistics (from w_mats)
# ---------------------------
_device = "cuda" if torch.cuda.is_available() else "cpu"
# Paths (must be uploaded to the Space inside a folder named w_mats)
_mean_image_path = "w_mats/mean_image_L14.pt"
_mean_text_path = "w_mats/mean_text_L14.pt"
_w_image_path = "w_mats/w_mat_image_L14.pt"
_w_text_path = "w_mats/w_mat_text_L14.pt"
# Load tensors
_modality_mean_image = torch.load(_mean_image_path, map_location=_device, weights_only=False).cpu().numpy()
_modality_mean_text = torch.load(_mean_text_path, map_location=_device, weights_only=False).cpu().numpy()
_W_image = torch.load(_w_image_path, map_location=_device, weights_only=False).cpu().numpy()
_W_text = torch.load(_w_text_path, map_location=_device, weights_only=False).cpu().numpy()
# Sanity checks
EMB_DIM = 768 # ViT-L/14 feature dimension
assert _modality_mean_image.shape == (EMB_DIM,), f"mu_image must be {EMB_DIM}-D"
assert _modality_mean_text.shape == (EMB_DIM,), f"mu_text must be {EMB_DIM}-D"
assert _W_image.shape == (EMB_DIM, EMB_DIM), f"W_image must be {EMB_DIM}x{EMB_DIM}"
assert _W_text.shape == (EMB_DIM, EMB_DIM), f"W_text must be {EMB_DIM}x{EMB_DIM}"
# ---------------------------
# Model / Processor
# ---------------------------
MODEL_ID = "openai/clip-vit-large-patch14"
_model: CLIPModel | None = None
_processor: AutoProcessor | None = None
def _load_model():
global _model, _processor
if _model is None:
_model = CLIPModel.from_pretrained(MODEL_ID).to(_device).eval()
if _processor is None:
_processor = AutoProcessor.from_pretrained(MODEL_ID)
def _l2_normalize(x: torch.Tensor, eps: float = 1e-12) -> torch.Tensor:
    # Convenience helper; currently unused by the metrics below
    # (conformity normalizes inside _cosine, the likelihood uses raw features).
    return x / (x.norm(dim=-1, keepdim=True) + eps)
# ---------------------------
# Embedding helpers
# ---------------------------
@torch.no_grad()
def embed_image(img: Image.Image) -> np.ndarray:
_load_model()
inputs = _processor(images=img, return_tensors="pt").to(_device)
feats = _model.get_image_features(**inputs) # [1, D]
    # Normalization is skipped here: _cosine normalizes for the conformity
    # metric, and the likelihood operates on raw (un-normalized) features.
return feats.squeeze(0).detach().cpu().numpy()
@torch.no_grad()
def embed_text(text: str) -> np.ndarray:
_load_model()
inputs = _processor(text=[text], return_tensors="pt", padding=True).to(_device)
feats = _model.get_text_features(**inputs) # [1, D]
    # Normalization is skipped here as well (see embed_image).
return feats.squeeze(0).detach().cpu().numpy()
# ---------------------------
# Conformity & Likelihood
# ---------------------------
def _cosine(a: np.ndarray, b: np.ndarray, eps: float = 1e-12) -> float:
a = a / (np.linalg.norm(a) + eps)
b = b / (np.linalg.norm(b) + eps)
return float(np.dot(a, b))
def conformity_image(z: np.ndarray) -> float:
return _cosine(z, _modality_mean_image)
def conformity_text(z: np.ndarray) -> float:
return _cosine(z, _modality_mean_text)
def loglike_image_relative(z_in_i: np.ndarray) -> float:
# Convert to torch tensor on the correct device
z_i = torch.tensor(z_in_i, dtype=torch.float32, device=_device).reshape(1,-1)
mu_i = torch.tensor(_modality_mean_image, dtype=torch.float32, device=_device).reshape(1,-1)
W = torch.tensor(_W_image, dtype=torch.float32, device=_device)
    # Center the feature and map it into whitened space
    cntr_features = z_i - mu_i
    w_features = torch.matmul(cntr_features, W)  # [1, D] whitened residual
    # Gaussian log-density in whitened space (log-determinant term omitted)
N = z_i.shape[-1]
log_like = -0.5 * (N * torch.log(torch.tensor(2 * torch.pi, device=_device)) + torch.sum(w_features**2))
# Return as NumPy float
return log_like.cpu().numpy().item()
def loglike_text_relative(z_in_t: np.ndarray) -> float:
# Convert to torch tensor on the correct device
z_t = torch.tensor(z_in_t, dtype=torch.float32, device=_device).reshape(1,-1)
mu_t = torch.tensor(_modality_mean_text, dtype=torch.float32, device=_device).reshape(1,-1)
W = torch.tensor(_W_text, dtype=torch.float32, device=_device)
    # Center the feature and map it into whitened space
    cntr_features = z_t - mu_t
    w_features = torch.matmul(cntr_features, W)  # [1, D] whitened residual
    # Gaussian log-density in whitened space (log-determinant term omitted)
N = z_t.shape[-1]
log_like = -0.5 * (N * torch.log(torch.tensor(2 * torch.pi, device=_device)) + torch.sum(w_features**2))
# Return as NumPy float
return log_like.cpu().numpy().item()
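# Optional sanity check (a minimal sketch; run manually rather than at import time).
# The modality mean itself should score conformity ~1.0 and reach the likelihood
# upper bound -0.5 * D * log(2*pi) (about -705.8 for D = 768), since its whitened
# residual is exactly zero:
#
#   assert abs(conformity_image(_modality_mean_image) - 1.0) < 1e-5
#   assert abs(loglike_image_relative(_modality_mean_image) + 0.5 * EMB_DIM * np.log(2 * np.pi)) < 1e-3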
# ---------------------------
# Gradio logic
# ---------------------------
DESC = """
This Space operates in the **CLIP ViT-L/14** latent space and computes two metrics per modality:
1. **Conformity** — measures how common the sample is (based on [The Double-Ellipsoid Geometry of CLIP](https://arxiv.org/abs/2411.14517))
2. **Log-Likelihood** — measures how likely the sample is (based on [Whitened CLIP as a Likelihood Surrogate of Images and Captions](https://arxiv.org/abs/2505.06934))
All required modality means and W matrices are stored *internally* and loaded from `w_mats/*.pt`.
"""
PROVENANCE = """
**Data provenance**
Modality means and whitening matrices (W) are computed from **MS-COCO** features.
They are loaded from precomputed `.pt` files in the Space repo.
"""
def analyze_single(modality: str, text: str, image: Image.Image):
if modality == "Image":
if image is None:
return {"Error": "Please upload an image."}, None
z = embed_image(image)
conf = conformity_image(z)
ll = loglike_image_relative(z)
else:
if not text:
return {"Error": "Please enter text."}, None
z = embed_text(text)
conf = conformity_text(z)
ll = loglike_text_relative(z)
report = {
"Modality": modality,
"Conformity (cosine to mu)": round(conf, 6),
"Rel. Log-Likelihood (MS-COCO W)": round(ll, 6),
}
summary = f"Conformity: {conf:.6f} | Log-likelihood: {ll:.6f}"
return report, summary
with gr.Blocks(
title="CLIP Latent: Conformity & Likelihood (ViT-L/14)",
css="""
#result-box, #result-cmp {
min-height: 200px;
padding: 10px;
border: 1px solid #eee;
border-radius: 8px;
}
"""
) as demo:
gr.Markdown(f"# CLIP Latent Space — Conformity & Likelihood (ViT-L/14)\n\n{DESC}\n\n{PROVENANCE}")
with gr.Tab("Single Input"):
modality = gr.Radio(["Image", "Text"], value="Image", label="Modality")
img_in = gr.Image(type="pil", label="Image", visible=True)
txt_in = gr.Textbox(label="Text", visible=False)
btn = gr.Button("Analyze")
result_out = gr.HTML("<p>Result will appear here</p>", elem_id="result-box")
# Update function must be inside the Blocks context
def update_inputs(mod):
return gr.update(visible=(mod=="Image")), gr.update(visible=(mod=="Text"))
modality.change(fn=update_inputs, inputs=[modality], outputs=[img_in, txt_in])
# Analysis function inside the same context
def analyze_single_gui(modality: str, text: str, image: Image.Image):
from io import BytesIO
import base64
            # Prepare image HTML if modality is Image
            img_html = ""
            if modality == "Image":
                if image is None:
                    return "<p style='color:red'>Please upload an image.</p>"
buffered = BytesIO()
image.save(buffered, format="PNG")
img_b64 = base64.b64encode(buffered.getvalue()).decode()
img_html = f"<img src='data:image/png;base64,{img_b64}' width='200px' style='border:1px solid #ccc; border-radius:8px;'/>"
z = embed_image(image)
conf = conformity_image(z)
ll = loglike_image_relative(z)
else:
if not text:
return "<p style='color:red'>Please enter text.</p>"
z = embed_text(text)
conf = conformity_text(z)
ll = loglike_text_relative(z)
# <-- Set the HTML with min-height and padding here -->
html = f"""
<div style='display:flex; align-items:center; gap:20px; min-height:150px; padding:10px; border:1px solid #eee; border-radius:8px;'>
{img_html}
<div>
<p><b>Modality:</b> {modality}</p>
<p><b>Conformity:</b> {conf:.6f}</p>
<p><b>Log-Likelihood:</b> {ll:.6f}</p>
</div>
</div>
"""
# Return the HTML to the gr.HTML component
return html
btn.click(analyze_single_gui, inputs=[modality, txt_in, img_in], outputs=[result_out])
with gr.Tab("Compare Two"):
modality_c = gr.Radio(["Image", "Text"], value="Image", label="Modality")
img1 = gr.Image(type="pil", label="#1 Image", visible=True)
txt1 = gr.Textbox(label="#1 Text", visible=False)
img2 = gr.Image(type="pil", label="#2 Image", visible=True)
txt2 = gr.Textbox(label="#2 Text", visible=False)
result_cmp = gr.HTML("<p>Comparison result will appear here</p>", elem_id="result-cmp")
def update_compare_inputs(mod):
return (gr.update(visible=(mod=="Image")), # img1
gr.update(visible=(mod=="Text")), # txt1
gr.update(visible=(mod=="Image")), # img2
gr.update(visible=(mod=="Text"))) # txt2
modality_c.change(fn=update_compare_inputs,
inputs=[modality_c],
outputs=[img1, txt1, img2, txt2])
def compare_pair_gui(modality: str, text1: str, image1: Image.Image, text2: str, image2: Image.Image):
from io import BytesIO
import base64
if modality == "Image":
if image1 is None or image2 is None:
return "<p style='color:red'>Please upload both images.</p>"
def img_to_html(img):
buf = BytesIO()
img.save(buf, format="PNG")
img_b64 = base64.b64encode(buf.getvalue()).decode()
return f"<img src='data:image/png;base64,{img_b64}' width='150px' style='border:1px solid #ccc; border-radius:8px;'/>"
img1_html = img_to_html(image1)
img2_html = img_to_html(image2)
z1 = embed_image(image1)
z2 = embed_image(image2)
c1, c2 = conformity_image(z1), conformity_image(z2)
l1, l2 = loglike_image_relative(z1), loglike_image_relative(z2)
else:
if not text1 or not text2:
return "<p style='color:red'>Please enter both texts.</p>"
z1 = embed_text(text1)
z2 = embed_text(text2)
c1, c2 = conformity_text(z1), conformity_text(z2)
l1, l2 = loglike_text_relative(z1), loglike_text_relative(z2)
img1_html = img2_html = "<div style='min-height:50px'></div>"
html = f"""
<div style='display:flex; gap:20px; min-height:150px; padding:10px; border:1px solid #eee; border-radius:8px;'>
<div style='text-align:center;'>{img1_html}<p><b>#1 {modality}</b></p>
<p>Conformity: {c1:.6f}</p><p>Log-Likelihood: {l1:.6f}</p>
</div>
<div style='text-align:center;'>{img2_html}<p><b>#2 {modality}</b></p>
<p>Conformity: {c2:.6f}</p><p>Log-Likelihood: {l2:.6f}</p>
</div>
<div style='text-align:center;'>
<p><b>Δ (2-1)</b></p>
<p>Δ Conformity: {c2-c1:.6f}</p>
<p>Δ Log-Likelihood: {l2-l1:.6f}</p>
</div>
</div>
"""
return html
btn_c = gr.Button("Compare")
btn_c.click(compare_pair_gui, inputs=[modality_c, txt1, img1, txt2, img2], outputs=[result_cmp])
gr.Markdown(
"""
**Implementation details:**
- Embeddings: `openai/clip-vit-large-patch14` via 🤗 Transformers; features are L2-normalized only for the conformity (cosine) computation.
- Conformity: cosine similarity to the stored modality means `mu_image`, `mu_text`.
- Log-likelihood: `-0.5 * (D*log(2*pi) + ||(x - mu) W||^2)` using the MS-COCO-based whitening matrix `W`.
"""
)
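# Programmatic (non-UI) usage sketch, assuming the model weights are downloadable
# and "example.jpg" is a placeholder path on disk:
#
#   from PIL import Image
#   z_img = embed_image(Image.open("example.jpg"))
#   z_txt = embed_text("a photo of a cat")
#   print(conformity_image(z_img), loglike_image_relative(z_img))
#   print(conformity_text(z_txt), loglike_text_relative(z_txt))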
if __name__ == "__main__":
demo.launch(share=True)