""" | |
HF Space: CLIP Latent Conformity & Likelihood | |
------------------------------------------------- | |
This Gradio app computes (1) conformity-to-mean and (2) relative log-likelihood | |
for CLIP image/text embeddings. It also supports pairwise comparison in terms of | |
both metrics. | |
IMPORTANT (data provenance): The matrices below that drive the likelihood are | |
loaded from MS-COCO–based statistics. We use the precomputed means and W matrices | |
provided in the repo: https://github.com/rbetser/W_CLIP/tree/main/w_mats. | |
Definitions used here (aligned with the user's papers): | |
- Conformity (per modality) = cosine similarity between a unit-normalized sample | |
feature and the corresponding modality mean feature (also unit-normalized). | |
- Log-likelihood (per modality) is modeled under a Gaussian assumption using a
  whitening matrix W (MS-COCO-based):
      d^2(x) = || (x - mu)^T W ||^2 = (x - mu)^T W W^T (x - mu)
      loglike_rel(x) = -0.5 * (N * log(2*pi) + d^2(x)),  with N = embedding dim

Notes:
- The conformity measure is based on the paper "The Double-Ellipsoid Geometry of CLIP" (https://arxiv.org/abs/2411.14517).
- The likelihood measure is based on the paper "Whitened CLIP as a Likelihood Surrogate of Images and Captions" (https://arxiv.org/abs/2505.06934).
- The CLIP embedding dim is 768 for ViT-L/14.
- We keep modality-specific means (mu_img, mu_txt) and whitening matrices
  (W_img, W_txt). These are loaded at runtime from local `.pt` files shipped
  with the Space.
"""
from __future__ import annotations

import gradio as gr
import torch
import numpy as np
from PIL import Image
from transformers import CLIPModel, AutoProcessor
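
# Assumed runtime dependencies (illustrative, not an authoritative list; pin exact
# versions in the Space's requirements.txt): gradio, torch, transformers, numpy, pillow.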

# ---------------------------
# Load internal statistics (from w_mats)
# ---------------------------
_device = "cuda" if torch.cuda.is_available() else "cpu"

# Paths (must be uploaded to the Space inside a folder named w_mats)
_mean_image_path = "w_mats/mean_image_L14.pt"
_mean_text_path = "w_mats/mean_text_L14.pt"
_w_image_path = "w_mats/w_mat_image_L14.pt"
_w_text_path = "w_mats/w_mat_text_L14.pt"

# Load tensors
_modality_mean_image = torch.load(_mean_image_path, map_location=_device, weights_only=False).cpu().numpy()
_modality_mean_text = torch.load(_mean_text_path, map_location=_device, weights_only=False).cpu().numpy()
_W_image = torch.load(_w_image_path, map_location=_device, weights_only=False).cpu().numpy()
_W_text = torch.load(_w_text_path, map_location=_device, weights_only=False).cpu().numpy()

# Sanity checks
EMB_DIM = 768  # ViT-L/14 feature dimension
assert _modality_mean_image.shape == (EMB_DIM,), f"mu_image must be {EMB_DIM}-D"
assert _modality_mean_text.shape == (EMB_DIM,), f"mu_text must be {EMB_DIM}-D"
assert _W_image.shape == (EMB_DIM, EMB_DIM), f"W_image must be {EMB_DIM}x{EMB_DIM}"
assert _W_text.shape == (EMB_DIM, EMB_DIM), f"W_text must be {EMB_DIM}x{EMB_DIM}"
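
# Optional extra check (a conservative addition, not part of the original w_mats repo):
# verify the loaded statistics contain no NaNs/Infs, which would silently corrupt
# every metric computed below.
for _name, _arr in [("mu_image", _modality_mean_image), ("mu_text", _modality_mean_text),
                    ("W_image", _W_image), ("W_text", _W_text)]:
    assert np.isfinite(_arr).all(), f"{_name} contains non-finite values"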

# ---------------------------
# Model / Processor
# ---------------------------
MODEL_ID = "openai/clip-vit-large-patch14"
_model: CLIPModel | None = None
_processor: AutoProcessor | None = None

def _load_model():
    global _model, _processor
    if _model is None:
        _model = CLIPModel.from_pretrained(MODEL_ID).to(_device).eval()
    if _processor is None:
        _processor = AutoProcessor.from_pretrained(MODEL_ID)

def _l2_normalize(x: torch.Tensor, eps: float = 1e-12) -> torch.Tensor:
    return x / (x.norm(dim=-1, keepdim=True) + eps)

# ---------------------------
# Embedding helpers
# ---------------------------
def embed_image(img: Image.Image) -> np.ndarray:
    _load_model()
    inputs = _processor(images=img, return_tensors="pt").to(_device)
    with torch.no_grad():  # inference only; no gradients needed
        feats = _model.get_image_features(**inputs)  # [1, D]
    # feats = _l2_normalize(feats)  # left unnormalized; metrics normalize where needed
    return feats.squeeze(0).detach().cpu().numpy()

def embed_text(text: str) -> np.ndarray:
    _load_model()
    inputs = _processor(text=[text], return_tensors="pt", padding=True).to(_device)
    with torch.no_grad():  # inference only; no gradients needed
        feats = _model.get_text_features(**inputs)  # [1, D]
    # feats = _l2_normalize(feats)  # left unnormalized; metrics normalize where needed
    return feats.squeeze(0).detach().cpu().numpy()
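
# Minimal usage sketch (a hypothetical helper, not called by the app): embed a caption
# and report the raw feature norm. The features are deliberately *not* unit length here;
# the metric functions below apply normalization only where it is needed.
def _demo_embed_text() -> None:
    feats = embed_text("a photo of a dog on the beach")
    print("text feature shape:", feats.shape, "| L2 norm:", float(np.linalg.norm(feats)))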

# ---------------------------
# Conformity & Likelihood
# ---------------------------
def _cosine(a: np.ndarray, b: np.ndarray, eps: float = 1e-12) -> float:
    a = a / (np.linalg.norm(a) + eps)
    b = b / (np.linalg.norm(b) + eps)
    return float(np.dot(a, b))

def conformity_image(z: np.ndarray) -> float:
    return _cosine(z, _modality_mean_image)

def conformity_text(z: np.ndarray) -> float:
    return _cosine(z, _modality_mean_text)
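
# Quick sanity sketch (hypothetical, not part of the app flow): the mean has conformity
# ~1.0 with itself, while an arbitrary random vector typically scores much lower.
def _demo_conformity_sanity() -> None:
    rng = np.random.default_rng(0)
    random_vec = rng.standard_normal(EMB_DIM).astype(np.float32)
    print("conformity(mu_img, mu_img):", conformity_image(_modality_mean_image))  # ~1.0
    print("conformity(random, mu_img):", conformity_image(random_vec))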

def loglike_image_relative(z_in_i: np.ndarray) -> float:
    # Convert to torch tensors on the correct device
    z_i = torch.tensor(z_in_i, dtype=torch.float32, device=_device).reshape(1, -1)
    mu_i = torch.tensor(_modality_mean_image, dtype=torch.float32, device=_device).reshape(1, -1)
    W = torch.tensor(_W_image, dtype=torch.float32, device=_device)
    # Center the features and map them through the whitening matrix
    cntr_features = z_i - mu_i
    w_features = torch.matmul(cntr_features, W)
    # Gaussian log-likelihood in whitened space: -0.5 * (N*log(2*pi) + ||(x - mu)^T W||^2)
    N = z_i.shape[-1]
    log_like = -0.5 * (N * torch.log(torch.tensor(2 * torch.pi, device=_device)) + torch.sum(w_features ** 2))
    # Return as a Python float
    return log_like.cpu().numpy().item()

def loglike_text_relative(z_in_t: np.ndarray) -> float:
    # Convert to torch tensors on the correct device
    z_t = torch.tensor(z_in_t, dtype=torch.float32, device=_device).reshape(1, -1)
    mu_t = torch.tensor(_modality_mean_text, dtype=torch.float32, device=_device).reshape(1, -1)
    W = torch.tensor(_W_text, dtype=torch.float32, device=_device)
    # Center the features and map them through the whitening matrix
    cntr_features = z_t - mu_t
    w_features = torch.matmul(cntr_features, W)
    # Gaussian log-likelihood in whitened space: -0.5 * (N*log(2*pi) + ||(x - mu)^T W||^2)
    N = z_t.shape[-1]
    log_like = -0.5 * (N * torch.log(torch.tensor(2 * torch.pi, device=_device)) + torch.sum(w_features ** 2))
    # Return as a Python float
    return log_like.cpu().numpy().item()
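
# Illustrative check (a sketch under the Gaussian model above, not wired into the UI):
# the relative log-likelihood is maximal at x = mu, where the whitened quadratic term
# vanishes, so any other sample should score at or below that value.
def _demo_loglike_sanity() -> None:
    max_ll = loglike_image_relative(_modality_mean_image)  # = -0.5 * EMB_DIM * log(2*pi)
    rng = np.random.default_rng(0)
    sample_ll = loglike_image_relative(rng.standard_normal(EMB_DIM).astype(np.float32))
    print(f"at mu: {max_ll:.2f} | random vector: {sample_ll:.2f}")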

# ---------------------------
# Gradio logic
# ---------------------------
DESC = """
This Space operates on the **CLIP ViT-L/14** latent space to compute two metrics per modality:
1. **Conformity** — measures how typical (common) the sample is (based on [The Double-Ellipsoid Geometry of CLIP](https://arxiv.org/abs/2411.14517)).
2. **Log-Likelihood** — measures how likely the sample is under MS-COCO statistics (based on [Whitened CLIP as a Likelihood Surrogate of Images and Captions](https://arxiv.org/abs/2505.06934)).

All required modality means and W matrices are stored *internally* and loaded from `w_mats/*.pt`.
"""

PROVENANCE = """
**Data provenance**

Modality means and whitening matrices (W) are computed from **MS-COCO** features.
They are loaded from precomputed `.pt` files in the Space repo.
"""
def analyze_single(modality: str, text: str, image: Image.Image):
    if modality == "Image":
        if image is None:
            return {"Error": "Please upload an image."}, None
        z = embed_image(image)
        conf = conformity_image(z)
        ll = loglike_image_relative(z)
    else:
        if not text:
            return {"Error": "Please enter text."}, None
        z = embed_text(text)
        conf = conformity_text(z)
        ll = loglike_text_relative(z)
    report = {
        "Modality": modality,
        "Conformity (cosine to mu)": round(conf, 6),
        "Rel. Log-Likelihood (MS-COCO W)": round(ll, 6),
    }
    summary = f"Conformity: {conf:.6f} | Log-likelihood: {ll:.6f}"
    return report, summary

def compare_pair_gui(modality: str, text1: str, image1: Image.Image, text2: str, image2: Image.Image):
    from io import BytesIO
    import base64

    # Prepare images if modality is Image
    img1_html = ""
    img2_html = ""
    if modality == "Image":
        if image1 is None or image2 is None:
            return "<p style='color:red'>Please upload both images.</p>"
        # Convert first image to base64
        buf1 = BytesIO()
        image1.save(buf1, format="PNG")
        img1_b64 = base64.b64encode(buf1.getvalue()).decode()
        img1_html = f"<img src='data:image/png;base64,{img1_b64}' width='150px' style='border:1px solid #ccc; border-radius:8px;'/>"
        # Convert second image to base64
        buf2 = BytesIO()
        image2.save(buf2, format="PNG")
        img2_b64 = base64.b64encode(buf2.getvalue()).decode()
        img2_html = f"<img src='data:image/png;base64,{img2_b64}' width='150px' style='border:1px solid #ccc; border-radius:8px;'/>"
        z1 = embed_image(image1)
        z2 = embed_image(image2)
        c1, c2 = conformity_image(z1), conformity_image(z2)
        l1, l2 = loglike_image_relative(z1), loglike_image_relative(z2)
    else:
        if not text1 or not text2:
            return "<p style='color:red'>Please enter both texts.</p>"
        z1 = embed_text(text1)
        z2 = embed_text(text2)
        c1, c2 = conformity_text(z1), conformity_text(z2)
        l1, l2 = loglike_text_relative(z1), loglike_text_relative(z2)
    # Build HTML output
    html = f"""
    <div style='display:flex; gap:20px; min-height:150px; padding:10px; border:1px solid #eee; border-radius:8px;'>
      <div style='text-align:center;'>
        {img1_html if modality=="Image" else "<div style='min-height:50px'></div>"}
        <p><b>#1 {modality}:</b></p>
        <p>Conformity: {c1:.6f}</p>
        <p>Log-Likelihood: {l1:.6f}</p>
      </div>
      <div style='text-align:center;'>
        {img2_html if modality=="Image" else "<div style='min-height:50px'></div>"}
        <p><b>#2 {modality}:</b></p>
        <p>Conformity: {c2:.6f}</p>
        <p>Log-Likelihood: {l2:.6f}</p>
      </div>
      <div style='text-align:center;'>
        <p><b>Δ (2-1)</b></p>
        <p>Δ Conformity: {c2-c1:.6f}</p>
        <p>Δ Log-Likelihood: {l2-l1:.6f}</p>
      </div>
    </div>
    """
    return html

with gr.Blocks(
    title="CLIP Latent: Conformity & Likelihood (ViT-L/14)",
    css="""
    #result-box, #result-cmp {
        min-height: 200px;
        padding: 10px;
        border: 1px solid #eee;
        border-radius: 8px;
    }
    """,
) as demo:
    gr.Markdown(f"# CLIP Latent Space — Conformity & Likelihood (ViT-L/14)\n\n{DESC}\n\n{PROVENANCE}")

    with gr.Tab("Single Input"):
        modality = gr.Radio(["Image", "Text"], value="Image", label="Modality")
        img_in = gr.Image(type="pil", label="Image", visible=True)
        txt_in = gr.Textbox(label="Text", visible=False)
        btn = gr.Button("Analyze")
        result_out = gr.HTML("<p>Result will appear here</p>", elem_id="result-box")

        # Update function must be inside the Blocks context
        def update_inputs(mod):
            return gr.update(visible=(mod == "Image")), gr.update(visible=(mod == "Text"))

        modality.change(fn=update_inputs, inputs=[modality], outputs=[img_in, txt_in])

        # Analysis function inside the same context
        def analyze_single_gui(modality: str, text: str, image: Image.Image):
            from io import BytesIO
            import base64

            img_html = ""
            if modality == "Image":
                if image is None:
                    return "<p style='color:red'>Please upload an image.</p>"
                # Render the uploaded image inline as a base64 PNG
                buffered = BytesIO()
                image.save(buffered, format="PNG")
                img_b64 = base64.b64encode(buffered.getvalue()).decode()
                img_html = f"<img src='data:image/png;base64,{img_b64}' width='200px' style='border:1px solid #ccc; border-radius:8px;'/>"
                z = embed_image(image)
                conf = conformity_image(z)
                ll = loglike_image_relative(z)
            else:
                if not text:
                    return "<p style='color:red'>Please enter text.</p>"
                z = embed_text(text)
                conf = conformity_text(z)
                ll = loglike_text_relative(z)
            # Result card HTML with min-height and padding
            html = f"""
            <div style='display:flex; align-items:center; gap:20px; min-height:150px; padding:10px; border:1px solid #eee; border-radius:8px;'>
              {img_html}
              <div>
                <p><b>Modality:</b> {modality}</p>
                <p><b>Conformity:</b> {conf:.6f}</p>
                <p><b>Log-Likelihood:</b> {ll:.6f}</p>
              </div>
            </div>
            """
            # Return the HTML to the gr.HTML component
            return html

        btn.click(analyze_single_gui, inputs=[modality, txt_in, img_in], outputs=[result_out])

    with gr.Tab("Compare Two"):
        modality_c = gr.Radio(["Image", "Text"], value="Image", label="Modality")
        img1 = gr.Image(type="pil", label="#1 Image", visible=True)
        txt1 = gr.Textbox(label="#1 Text", visible=False)
        img2 = gr.Image(type="pil", label="#2 Image", visible=True)
        txt2 = gr.Textbox(label="#2 Text", visible=False)
        result_cmp = gr.HTML("<p>Comparison result will appear here</p>", elem_id="result-cmp")

        def update_compare_inputs(mod):
            return (gr.update(visible=(mod == "Image")),  # img1
                    gr.update(visible=(mod == "Text")),   # txt1
                    gr.update(visible=(mod == "Image")),  # img2
                    gr.update(visible=(mod == "Text")))   # txt2

        modality_c.change(fn=update_compare_inputs,
                          inputs=[modality_c],
                          outputs=[img1, txt1, img2, txt2])

        def compare_pair_gui(modality: str, text1: str, image1: Image.Image, text2: str, image2: Image.Image):
            from io import BytesIO
            import base64

            if modality == "Image":
                if image1 is None or image2 is None:
                    return "<p style='color:red'>Please upload both images.</p>"

                def img_to_html(img):
                    buf = BytesIO()
                    img.save(buf, format="PNG")
                    img_b64 = base64.b64encode(buf.getvalue()).decode()
                    return f"<img src='data:image/png;base64,{img_b64}' width='150px' style='border:1px solid #ccc; border-radius:8px;'/>"

                img1_html = img_to_html(image1)
                img2_html = img_to_html(image2)
                z1 = embed_image(image1)
                z2 = embed_image(image2)
                c1, c2 = conformity_image(z1), conformity_image(z2)
                l1, l2 = loglike_image_relative(z1), loglike_image_relative(z2)
            else:
                if not text1 or not text2:
                    return "<p style='color:red'>Please enter both texts.</p>"
                z1 = embed_text(text1)
                z2 = embed_text(text2)
                c1, c2 = conformity_text(z1), conformity_text(z2)
                l1, l2 = loglike_text_relative(z1), loglike_text_relative(z2)
                img1_html = img2_html = "<div style='min-height:50px'></div>"

            html = f"""
            <div style='display:flex; gap:20px; min-height:150px; padding:10px; border:1px solid #eee; border-radius:8px;'>
              <div style='text-align:center;'>{img1_html}<p><b>#1 {modality}</b></p>
                <p>Conformity: {c1:.6f}</p><p>Log-Likelihood: {l1:.6f}</p>
              </div>
              <div style='text-align:center;'>{img2_html}<p><b>#2 {modality}</b></p>
                <p>Conformity: {c2:.6f}</p><p>Log-Likelihood: {l2:.6f}</p>
              </div>
              <div style='text-align:center;'>
                <p><b>Δ (2-1)</b></p>
                <p>Δ Conformity: {c2-c1:.6f}</p>
                <p>Δ Log-Likelihood: {l2-l1:.6f}</p>
              </div>
            </div>
            """
            return html

        btn_c = gr.Button("Compare")
        btn_c.click(compare_pair_gui, inputs=[modality_c, txt1, img1, txt2, img2], outputs=[result_cmp])

    gr.Markdown(
        """
        **Implementation details:**
        - Embeddings: `openai/clip-vit-large-patch14` via 🤗 Transformers; raw (unnormalized) features are used,
          with L2-normalization applied only inside the cosine-based conformity computation.
        - Conformity: cosine similarity to the stored modality means `mu_image`, `mu_text`.
        - Log-likelihood: `-0.5 * (N*log(2*pi) + ||(x-mu)^T W||^2)` using the MS-COCO-based whitening matrix `W`.
        """
    )

if __name__ == "__main__":
    demo.launch()