""" HF Space: CLIP Latent Conformity & Likelihood ------------------------------------------------- This Gradio app computes (1) conformity-to-mean and (2) relative log-likelihood for CLIP image/text embeddings. It also supports pairwise comparison in terms of both metrics. IMPORTANT (data provenance): The matrices below that drive the likelihood are loaded from MS-COCO–based statistics. We use the precomputed means and W matrices provided in the repo: https://github.com/rbetser/W_CLIP/tree/main/w_mats. Definitions used here (aligned with the user's papers): - Conformity (per modality) = cosine similarity between a unit-normalized sample feature and the corresponding modality mean feature (also unit-normalized). - Log-likelihood (per modality) is modeled by a quadratic form using a positive semi-definite precision matrix W (MS-COCO-based): d^2(x) = (x - mu)^T W (x - mu) loglike_rel(x) = -0.5 * d^2(x) (constant terms omitted) Notes: - Conformity measure is based on the paper: "The Double-Ellipsoid Geometry of CLIP" (https://arxiv.org/abs/2411.14517) - Likelihood measure is based on the paper: "Whitened CLIP as a Likelihood Surrogate of Images and Captions" (https://arxiv.org/abs/2505.06934) - CLIP embedding dim is 768 for ViT-L/14. - We keep modality-specific means (mu_img, mu_txt) and precision matrices (W_img, W_txt). These are loaded at runtime from local `.pt` files shipped with the Space. """ from __future__ import annotations import gradio as gr import torch import numpy as np from PIL import Image from transformers import CLIPModel, AutoProcessor # --------------------------- # Load internal statistics (from w_mats) # --------------------------- _device = "cuda" if torch.cuda.is_available() else "cpu" # Paths (must be uploaded to the Space inside a folder named w_mats) _mean_image_path = "w_mats/mean_image_L14.pt" _mean_text_path = "w_mats/mean_text_L14.pt" _w_image_path = "w_mats/w_mat_image_L14.pt" _w_text_path = "w_mats/w_mat_text_L14.pt" # Load tensors _modality_mean_image = torch.load(_mean_image_path, map_location=_device, weights_only=False).cpu().numpy() _modality_mean_text = torch.load(_mean_text_path, map_location=_device, weights_only=False).cpu().numpy() _W_image = torch.load(_w_image_path, map_location=_device, weights_only=False).cpu().numpy() _W_text = torch.load(_w_text_path, map_location=_device, weights_only=False).cpu().numpy() # Sanity checks EMB_DIM = 768 # ViT-L/14 feature dimension assert _modality_mean_image.shape == (EMB_DIM,), f"mu_image must be {EMB_DIM}-D" assert _modality_mean_text.shape == (EMB_DIM,), f"mu_text must be {EMB_DIM}-D" assert _W_image.shape == (EMB_DIM, EMB_DIM), f"W_image must be {EMB_DIM}x{EMB_DIM}" assert _W_text.shape == (EMB_DIM, EMB_DIM), f"W_text must be {EMB_DIM}x{EMB_DIM}" # --------------------------- # Model / Processor # --------------------------- MODEL_ID = "openai/clip-vit-large-patch14" _model: CLIPModel | None = None _processor: AutoProcessor | None = None def _load_model(): global _model, _processor if _model is None: _model = CLIPModel.from_pretrained(MODEL_ID).to(_device).eval() if _processor is None: _processor = AutoProcessor.from_pretrained(MODEL_ID) def _l2_normalize(x: torch.Tensor, eps: float = 1e-12) -> torch.Tensor: return x / (x.norm(dim=-1, keepdim=True) + eps) # --------------------------- # Embedding helpers # --------------------------- @torch.no_grad() def embed_image(img: Image.Image) -> np.ndarray: _load_model() inputs = _processor(images=img, return_tensors="pt").to(_device) feats = _model.get_image_features(**inputs) # [1, D] # feats = _l2_normalize(feats) return feats.squeeze(0).detach().cpu().numpy() @torch.no_grad() def embed_text(text: str) -> np.ndarray: _load_model() inputs = _processor(text=[text], return_tensors="pt", padding=True).to(_device) feats = _model.get_text_features(**inputs) # [1, D] # feats = _l2_normalize(feats) return feats.squeeze(0).detach().cpu().numpy() # --------------------------- # Conformity & Likelihood # --------------------------- def _cosine(a: np.ndarray, b: np.ndarray, eps: float = 1e-12) -> float: a = a / (np.linalg.norm(a) + eps) b = b / (np.linalg.norm(b) + eps) return float(np.dot(a, b)) def conformity_image(z: np.ndarray) -> float: return _cosine(z, _modality_mean_image) def conformity_text(z: np.ndarray) -> float: return _cosine(z, _modality_mean_text) def loglike_image_relative(z_in_i: np.ndarray) -> float: # Convert to torch tensor on the correct device z_i = torch.tensor(z_in_i, dtype=torch.float32, device=_device).reshape(1,-1) mu_i = torch.tensor(_modality_mean_image, dtype=torch.float32, device=_device).reshape(1,-1) W = torch.tensor(_W_image, dtype=torch.float32, device=_device) # Center and transform features using the whitening matrix cntr_features = z_i - mu_i w_features = torch.matmul(cntr_features, W) # quad = (cntr_features @ W @ cntr_features.T).squeeze() # Compute log-likelihood using Gaussian distribution assumption N = z_i.shape[-1] log_like = -0.5 * (N * torch.log(torch.tensor(2 * torch.pi, device=_device)) + torch.sum(w_features**2)) # Return as NumPy float return log_like.cpu().numpy().item() def loglike_text_relative(z_in_t: np.ndarray) -> float: # Convert to torch tensor on the correct device z_t = torch.tensor(z_in_t, dtype=torch.float32, device=_device).reshape(1,-1) mu_t = torch.tensor(_modality_mean_text, dtype=torch.float32, device=_device).reshape(1,-1) W = torch.tensor(_W_text, dtype=torch.float32, device=_device) # Center and transform features using the whitening matrix cntr_features = z_t - mu_t w_features = torch.matmul(cntr_features, W) #quad = (cntr_features @ W @ cntr_features.T).squeeze() # Compute log-likelihood using Gaussian distribution assumption N = z_t.shape[-1] log_like = -0.5 * (N * torch.log(torch.tensor(2 * torch.pi, device=_device)) + torch.sum(w_features**2)) # Return as NumPy float return log_like.cpu().numpy().item() # --------------------------- # Gradio logic # --------------------------- DESC = """ This Space operates on **CLIP ViT-L/14** latent space to compute two metrics per modality: 1. **Conformity** — measure how common the samle is (based on [The Double-Ellipsoid Geometry of CLIP](https://arxiv.org/abs/2411.14517)) 2. **Log-Likelihood** — measure how like the common is (based on [Whitened CLIP as a Likelihood Surrogate of Images and Captions](https://arxiv.org/abs/2505.06934)) All required modality means and W matrices are stored *internally* and loaded from `w_mats/*.pt`. """ PROVENANCE = """ **Data provenance** Modality means and precision matrices (W) are computed from **MS-COCO** features. They are loaded from precomputed `.pt` files in the Space repo. """ def analyze_single(modality: str, text: str, image: Image.Image): if modality == "Image": if image is None: return {"Error": "Please upload an image."}, None z = embed_image(image) conf = conformity_image(z) ll = loglike_image_relative(z) else: if not text: return {"Error": "Please enter text."}, None z = embed_text(text) conf = conformity_text(z) ll = loglike_text_relative(z) report = { "Modality": modality, "Conformity (cosine to mu)": round(conf, 6), "Rel. Log-Likelihood (MS-COCO W)": round(ll, 6), } summary = f"Conformity: {conf:.6f} | Log-likelihood: {ll:.6f}" return report, summary def compare_pair_gui(modality: str, text1: str, image1: Image.Image, text2: str, image2: Image.Image): from io import BytesIO import base64 # Prepare images if modality is Image img1_html = "" img2_html = "" if modality == "Image": if image1 is None or image2 is None: return "

Please upload both images.

" # Convert first image to base64 buf1 = BytesIO() image1.save(buf1, format="PNG") img1_b64 = base64.b64encode(buf1.getvalue()).decode() img1_html = f"" # Convert second image to base64 buf2 = BytesIO() image2.save(buf2, format="PNG") img2_b64 = base64.b64encode(buf2.getvalue()).decode() img2_html = f"" z1 = embed_image(image1) z2 = embed_image(image2) c1, c2 = conformity_image(z1), conformity_image(z2) l1, l2 = loglike_image_relative(z1), loglike_image_relative(z2) else: if not text1 or not text2: return "

Please enter both texts.

" z1 = embed_text(text1) z2 = embed_text(text2) c1, c2 = conformity_text(z1), conformity_text(z2) l1, l2 = loglike_text_relative(z1), loglike_text_relative(z2) # Build HTML output html = f"""
{img1_html if modality=="Image" else "
"}

#1 {modality}:

Conformity: {c1:.6f}

Log-Likelihood: {l1:.6f}

{img2_html if modality=="Image" else "
"}

#2 {modality}:

Conformity: {c2:.6f}

Log-Likelihood: {l2:.6f}

Δ (2-1)

Δ Conformity: {c2-c1:.6f}

Δ Log-Likelihood: {l2-l1:.6f}

""" return html with gr.Blocks( title="CLIP Latent: Conformity & Likelihood (ViT-L/14)", css=""" #result-box, #result-cmp { min-height: 200px; padding: 10px; border: 1px solid #eee; border-radius: 8px; } """ ) as demo: gr.Markdown(f"# CLIP Latent Space — Conformity & Likelihood (ViT-L/14)\n\n{DESC}\n\n{PROVENANCE}") with gr.Tab("Single Input"): modality = gr.Radio(["Image", "Text"], value="Image", label="Modality") img_in = gr.Image(type="pil", label="Image", visible=True) txt_in = gr.Textbox(label="Text", visible=False) btn = gr.Button("Analyze") result_out = gr.HTML("

Result will appear here

", elem_id="result-box") # Update function must be inside the Blocks context def update_inputs(mod): return gr.update(visible=(mod=="Image")), gr.update(visible=(mod=="Text")) modality.change(fn=update_inputs, inputs=[modality], outputs=[img_in, txt_in]) # Analysis function inside the same context def analyze_single_gui(modality: str, text: str, image: Image.Image): from io import BytesIO import base64 # Prepare image HTML if modality is Image img_html = "" if modality == "Image" and image is not None: buffered = BytesIO() image.save(buffered, format="PNG") img_b64 = base64.b64encode(buffered.getvalue()).decode() img_html = f"" z = embed_image(image) conf = conformity_image(z) ll = loglike_image_relative(z) else: if not text: return "

Please enter text.

" z = embed_text(text) conf = conformity_text(z) ll = loglike_text_relative(z) # <-- Set the HTML with min-height and padding here --> html = f"""
{img_html}

Modality: {modality}

Conformity: {conf:.6f}

Log-Likelihood: {ll:.6f}

""" # Return the HTML to the gr.HTML component return html btn.click(analyze_single_gui, inputs=[modality, txt_in, img_in], outputs=[result_out]) with gr.Tab("Compare Two"): modality_c = gr.Radio(["Image", "Text"], value="Image", label="Modality") img1 = gr.Image(type="pil", label="#1 Image", visible=True) txt1 = gr.Textbox(label="#1 Text", visible=False) img2 = gr.Image(type="pil", label="#2 Image", visible=True) txt2 = gr.Textbox(label="#2 Text", visible=False) result_cmp = gr.HTML("

Comparison result will appear here

", elem_id="result-cmp") def update_compare_inputs(mod): return (gr.update(visible=(mod=="Image")), # img1 gr.update(visible=(mod=="Text")), # txt1 gr.update(visible=(mod=="Image")), # img2 gr.update(visible=(mod=="Text"))) # txt2 modality_c.change(fn=update_compare_inputs, inputs=[modality_c], outputs=[img1, txt1, img2, txt2]) def compare_pair_gui(modality: str, text1: str, image1: Image.Image, text2: str, image2: Image.Image): from io import BytesIO import base64 if modality == "Image": if image1 is None or image2 is None: return "

Please upload both images.

" def img_to_html(img): buf = BytesIO() img.save(buf, format="PNG") img_b64 = base64.b64encode(buf.getvalue()).decode() return f"" img1_html = img_to_html(image1) img2_html = img_to_html(image2) z1 = embed_image(image1) z2 = embed_image(image2) c1, c2 = conformity_image(z1), conformity_image(z2) l1, l2 = loglike_image_relative(z1), loglike_image_relative(z2) else: if not text1 or not text2: return "

Please enter both texts.

" z1 = embed_text(text1) z2 = embed_text(text2) c1, c2 = conformity_text(z1), conformity_text(z2) l1, l2 = loglike_text_relative(z1), loglike_text_relative(z2) img1_html = img2_html = "
" html = f"""
{img1_html}

#1 {modality}

Conformity: {c1:.6f}

Log-Likelihood: {l1:.6f}

{img2_html}

#2 {modality}

Conformity: {c2:.6f}

Log-Likelihood: {l2:.6f}

Δ (2-1)

Δ Conformity: {c2-c1:.6f}

Δ Log-Likelihood: {l2-l1:.6f}

""" return html btn_c = gr.Button("Compare") btn_c.click(compare_pair_gui, inputs=[modality_c, txt1, img1, txt2, img2], outputs=[result_cmp]) gr.Markdown( """ **Implementation details:** - Embeddings: `openai/clip-vit-large-patch14` via 🤗 Transformers; features are L2-normalized. - Conformity: cosine similarity to stored modality means `mu_image`, `mu_text`. - Log-likelihood: `-0.5 * (x-mu)^T W (x-mu)` using MS-COCO-based precision `W`. """ ) if __name__ == "__main__": demo.launch(share=True)