"""
HF Space: CLIP Latent Conformity & Likelihood
-------------------------------------------------
This Gradio app computes (1) conformity-to-mean and (2) relative log-likelihood
for CLIP image/text embeddings, and supports side-by-side comparison of two
inputs on both metrics.
IMPORTANT (data provenance): The matrices below that drive the likelihood are
loaded from MS-COCO–based statistics. We use the precomputed means and W matrices
provided in the repo: https://github.com/rbetser/W_CLIP/tree/main/w_mats.
Definitions used here (aligned with the referenced papers):
- Conformity (per modality) = cosine similarity between a unit-normalized sample
feature and the corresponding modality mean feature (also unit-normalized).
- Log-likelihood (per modality) is modeled under a Gaussian assumption using a
  whitening matrix W (MS-COCO-based). With row-vector features,
      d^2(x) = || (x - mu) W ||^2
      loglike_rel(x) = -0.5 * (D * log(2*pi) + d^2(x)),   with D = 768.
  The log-determinant of the whitening transform is omitted, so values are
  relative and only comparable within a modality.
Notes:
- Conformity measure is based on the paper: "The Double-Ellipsoid Geometry of CLIP" (https://arxiv.org/abs/2411.14517)
- Likelihood measure is based on the paper: "Whitened CLIP as a Likelihood Surrogate of Images and Captions" (https://arxiv.org/abs/2505.06934)
- CLIP embedding dim is 768 for ViT-L/14.
- We keep modality-specific means (mu_img, mu_txt) and whitening matrices
  (W_img, W_txt). These are loaded at runtime from local `.pt` files shipped
  with the Space.
"""
from __future__ import annotations
import gradio as gr
import torch
import numpy as np
from PIL import Image
from transformers import CLIPModel, AutoProcessor
# ---------------------------
# Load internal statistics (from w_mats)
# ---------------------------
_device = "cuda" if torch.cuda.is_available() else "cpu"
# Paths (must be uploaded to the Space inside a folder named w_mats)
_mean_image_path = "w_mats/mean_image_L14.pt"
_mean_text_path = "w_mats/mean_text_L14.pt"
_w_image_path = "w_mats/w_mat_image_L14.pt"
_w_text_path = "w_mats/w_mat_text_L14.pt"
# Load tensors
_modality_mean_image = torch.load(_mean_image_path, map_location=_device, weights_only=False).cpu().numpy()
_modality_mean_text = torch.load(_mean_text_path, map_location=_device, weights_only=False).cpu().numpy()
_W_image = torch.load(_w_image_path, map_location=_device, weights_only=False).cpu().numpy()
_W_text = torch.load(_w_text_path, map_location=_device, weights_only=False).cpu().numpy()
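# mu_* are the modality mean features; W_* are whitening matrices from the
# W_CLIP repo (W @ W.T effectively plays the role of the precision in the
# Gaussian log-likelihood computed below).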
# Sanity checks
EMB_DIM = 768 # ViT-L/14 feature dimension
assert _modality_mean_image.shape == (EMB_DIM,), f"mu_image must be {EMB_DIM}-D"
assert _modality_mean_text.shape == (EMB_DIM,), f"mu_text must be {EMB_DIM}-D"
assert _W_image.shape == (EMB_DIM, EMB_DIM), f"W_image must be {EMB_DIM}x{EMB_DIM}"
assert _W_text.shape == (EMB_DIM, EMB_DIM), f"W_text must be {EMB_DIM}x{EMB_DIM}"
# ---------------------------
# Model / Processor
# ---------------------------
MODEL_ID = "openai/clip-vit-large-patch14"
_model: CLIPModel | None = None
_processor: AutoProcessor | None = None
def _load_model():
global _model, _processor
if _model is None:
_model = CLIPModel.from_pretrained(MODEL_ID).to(_device).eval()
if _processor is None:
_processor = AutoProcessor.from_pretrained(MODEL_ID)
def _l2_normalize(x: torch.Tensor, eps: float = 1e-12) -> torch.Tensor:
return x / (x.norm(dim=-1, keepdim=True) + eps)
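# NOTE: _l2_normalize is currently unused at its call sites (the calls in
# embed_image/embed_text are commented out); conformity normalizes inside
# _cosine instead, and the likelihood uses the raw CLIP features.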
# ---------------------------
# Embedding helpers
# ---------------------------
@torch.no_grad()
def embed_image(img: Image.Image) -> np.ndarray:
_load_model()
inputs = _processor(images=img, return_tensors="pt").to(_device)
feats = _model.get_image_features(**inputs) # [1, D]
# feats = _l2_normalize(feats)
return feats.squeeze(0).detach().cpu().numpy()
@torch.no_grad()
def embed_text(text: str) -> np.ndarray:
_load_model()
inputs = _processor(text=[text], return_tensors="pt", padding=True).to(_device)
feats = _model.get_text_features(**inputs) # [1, D]
# feats = _l2_normalize(feats)
return feats.squeeze(0).detach().cpu().numpy()
# ---------------------------
# Conformity & Likelihood
# ---------------------------
def _cosine(a: np.ndarray, b: np.ndarray, eps: float = 1e-12) -> float:
a = a / (np.linalg.norm(a) + eps)
b = b / (np.linalg.norm(b) + eps)
return float(np.dot(a, b))
def conformity_image(z: np.ndarray) -> float:
return _cosine(z, _modality_mean_image)
def conformity_text(z: np.ndarray) -> float:
return _cosine(z, _modality_mean_text)
def loglike_image_relative(z_in_i: np.ndarray) -> float:
# Convert to torch tensor on the correct device
z_i = torch.tensor(z_in_i, dtype=torch.float32, device=_device).reshape(1,-1)
mu_i = torch.tensor(_modality_mean_image, dtype=torch.float32, device=_device).reshape(1,-1)
W = torch.tensor(_W_image, dtype=torch.float32, device=_device)
# Center and transform features using the whitening matrix
cntr_features = z_i - mu_i
w_features = torch.matmul(cntr_features, W)
# quad = (cntr_features @ W @ cntr_features.T).squeeze()
# Compute log-likelihood using Gaussian distribution assumption
N = z_i.shape[-1]
log_like = -0.5 * (N * torch.log(torch.tensor(2 * torch.pi, device=_device)) + torch.sum(w_features**2))
# Return as NumPy float
return log_like.cpu().numpy().item()
def loglike_text_relative(z_in_t: np.ndarray) -> float:
# Convert to torch tensor on the correct device
z_t = torch.tensor(z_in_t, dtype=torch.float32, device=_device).reshape(1,-1)
mu_t = torch.tensor(_modality_mean_text, dtype=torch.float32, device=_device).reshape(1,-1)
W = torch.tensor(_W_text, dtype=torch.float32, device=_device)
# Center and transform features using the whitening matrix
cntr_features = z_t - mu_t
w_features = torch.matmul(cntr_features, W)
#quad = (cntr_features @ W @ cntr_features.T).squeeze()
# Compute log-likelihood using Gaussian distribution assumption
N = z_t.shape[-1]
log_like = -0.5 * (N * torch.log(torch.tensor(2 * torch.pi, device=_device)) + torch.sum(w_features**2))
# Return as NumPy float
return log_like.cpu().numpy().item()
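# Note: torch.sum(w_features ** 2) above equals the quadratic form
# (x - mu) @ W @ W.T @ (x - mu).T, so only the log-determinant of the whitening
# transform is dropped; the resulting values are relative and should only be
# compared within the same modality.
#
# Illustrative usage (hypothetical caption; actual values depend on the loaded
# MS-COCO statistics):
#   z = embed_text("a photo of a dog")
#   conformity_text(z)        # cosine similarity to mu_text, in [-1, 1]
#   loglike_text_relative(z)  # relative (unnormalized) log-likelihood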
# ---------------------------
# Gradio logic
# ---------------------------
DESC = """
This Space operates on **CLIP ViT-L/14** latent space to compute two metrics per modality:
1. **Conformity** — measures how typical (common) the sample is (based on [The Double-Ellipsoid Geometry of CLIP](https://arxiv.org/abs/2411.14517))
2. **Log-Likelihood** — measures how likely the sample is under MS-COCO statistics (based on [Whitened CLIP as a Likelihood Surrogate of Images and Captions](https://arxiv.org/abs/2505.06934))
All required modality means and W matrices are stored *internally* and loaded from `w_mats/*.pt`.
"""
PROVENANCE = """
**Data provenance**
Modality means and whitening matrices (W) are computed from **MS-COCO** features.
They are loaded from precomputed `.pt` files in the Space repo.
"""
def analyze_single(modality: str, text: str, image: Image.Image):
if modality == "Image":
if image is None:
return {"Error": "Please upload an image."}, None
z = embed_image(image)
conf = conformity_image(z)
ll = loglike_image_relative(z)
else:
if not text:
return {"Error": "Please enter text."}, None
z = embed_text(text)
conf = conformity_text(z)
ll = loglike_text_relative(z)
report = {
"Modality": modality,
"Conformity (cosine to mu)": round(conf, 6),
"Rel. Log-Likelihood (MS-COCO W)": round(ll, 6),
}
summary = f"Conformity: {conf:.6f} | Log-likelihood: {ll:.6f}"
return report, summary
def compare_pair_gui(modality: str, text1: str, image1: Image.Image, text2: str, image2: Image.Image):
from io import BytesIO
import base64
# Prepare images if modality is Image
img1_html = ""
img2_html = ""
if modality == "Image":
if image1 is None or image2 is None:
return "<p style='color:red'>Please upload both images.</p>"
# Convert first image to base64
buf1 = BytesIO()
image1.save(buf1, format="PNG")
img1_b64 = base64.b64encode(buf1.getvalue()).decode()
img1_html = f"<img src='data:image/png;base64,{img1_b64}' width='150px' style='border:1px solid #ccc; border-radius:8px;'/>"
# Convert second image to base64
buf2 = BytesIO()
image2.save(buf2, format="PNG")
img2_b64 = base64.b64encode(buf2.getvalue()).decode()
img2_html = f"<img src='data:image/png;base64,{img2_b64}' width='150px' style='border:1px solid #ccc; border-radius:8px;'/>"
z1 = embed_image(image1)
z2 = embed_image(image2)
c1, c2 = conformity_image(z1), conformity_image(z2)
l1, l2 = loglike_image_relative(z1), loglike_image_relative(z2)
else:
if not text1 or not text2:
return "<p style='color:red'>Please enter both texts.</p>"
z1 = embed_text(text1)
z2 = embed_text(text2)
c1, c2 = conformity_text(z1), conformity_text(z2)
l1, l2 = loglike_text_relative(z1), loglike_text_relative(z2)
# Build HTML output
html = f"""
<div style='display:flex; gap:20px; min-height:150px; padding:10px; border:1px solid #eee; border-radius:8px;'>
<div style='text-align:center;'>
{img1_html if modality=="Image" else "<div style='min-height:50px'></div>"}
<p><b>#1 {modality}:</b></p>
<p>Conformity: {c1:.6f}</p>
<p>Log-Likelihood: {l1:.6f}</p>
</div>
<div style='text-align:center;'>
{img2_html if modality=="Image" else "<div style='min-height:50px'></div>"}
<p><b>#2 {modality}:</b></p>
<p>Conformity: {c2:.6f}</p>
<p>Log-Likelihood: {l2:.6f}</p>
</div>
<div style='text-align:center;'>
<p><b>Δ (2-1)</b></p>
<p>Δ Conformity: {c2-c1:.6f}</p>
<p>Δ Log-Likelihood: {l2-l1:.6f}</p>
</div>
</div>
"""
return html
with gr.Blocks(
title="CLIP Latent: Conformity & Likelihood (ViT-L/14)",
css="""
#result-box, #result-cmp {
min-height: 200px;
padding: 10px;
border: 1px solid #eee;
border-radius: 8px;
}
"""
) as demo:
gr.Markdown(f"# CLIP Latent Space — Conformity & Likelihood (ViT-L/14)\n\n{DESC}\n\n{PROVENANCE}")
with gr.Tab("Single Input"):
modality = gr.Radio(["Image", "Text"], value="Image", label="Modality")
img_in = gr.Image(type="pil", label="Image", visible=True)
txt_in = gr.Textbox(label="Text", visible=False)
btn = gr.Button("Analyze")
result_out = gr.HTML("<p>Result will appear here</p>", elem_id="result-box")
    # Visibility toggle; the .change() wiring below must be done inside the Blocks context
def update_inputs(mod):
return gr.update(visible=(mod=="Image")), gr.update(visible=(mod=="Text"))
modality.change(fn=update_inputs, inputs=[modality], outputs=[img_in, txt_in])
# Analysis function inside the same context
def analyze_single_gui(modality: str, text: str, image: Image.Image):
from io import BytesIO
import base64
# Prepare image HTML if modality is Image
img_html = ""
if modality == "Image" and image is not None:
buffered = BytesIO()
image.save(buffered, format="PNG")
img_b64 = base64.b64encode(buffered.getvalue()).decode()
img_html = f"<img src='data:image/png;base64,{img_b64}' width='200px' style='border:1px solid #ccc; border-radius:8px;'/>"
z = embed_image(image)
conf = conformity_image(z)
ll = loglike_image_relative(z)
else:
if not text:
return "<p style='color:red'>Please enter text.</p>"
z = embed_text(text)
conf = conformity_text(z)
ll = loglike_text_relative(z)
        # Build the result HTML (min-height and padding keep the output box stable)
html = f"""
<div style='display:flex; align-items:center; gap:20px; min-height:150px; padding:10px; border:1px solid #eee; border-radius:8px;'>
{img_html}
<div>
<p><b>Modality:</b> {modality}</p>
<p><b>Conformity:</b> {conf:.6f}</p>
<p><b>Log-Likelihood:</b> {ll:.6f}</p>
</div>
</div>
"""
# Return the HTML to the gr.HTML component
return html
btn.click(analyze_single_gui, inputs=[modality, txt_in, img_in], outputs=[result_out])
with gr.Tab("Compare Two"):
modality_c = gr.Radio(["Image", "Text"], value="Image", label="Modality")
img1 = gr.Image(type="pil", label="#1 Image", visible=True)
txt1 = gr.Textbox(label="#1 Text", visible=False)
img2 = gr.Image(type="pil", label="#2 Image", visible=True)
txt2 = gr.Textbox(label="#2 Text", visible=False)
result_cmp = gr.HTML("<p>Comparison result will appear here</p>", elem_id="result-cmp")
def update_compare_inputs(mod):
return (gr.update(visible=(mod=="Image")), # img1
gr.update(visible=(mod=="Text")), # txt1
gr.update(visible=(mod=="Image")), # img2
gr.update(visible=(mod=="Text"))) # txt2
modality_c.change(fn=update_compare_inputs,
inputs=[modality_c],
outputs=[img1, txt1, img2, txt2])
def compare_pair_gui(modality: str, text1: str, image1: Image.Image, text2: str, image2: Image.Image):
from io import BytesIO
import base64
if modality == "Image":
if image1 is None or image2 is None:
return "<p style='color:red'>Please upload both images.</p>"
def img_to_html(img):
buf = BytesIO()
img.save(buf, format="PNG")
img_b64 = base64.b64encode(buf.getvalue()).decode()
return f"<img src='data:image/png;base64,{img_b64}' width='150px' style='border:1px solid #ccc; border-radius:8px;'/>"
img1_html = img_to_html(image1)
img2_html = img_to_html(image2)
z1 = embed_image(image1)
z2 = embed_image(image2)
c1, c2 = conformity_image(z1), conformity_image(z2)
l1, l2 = loglike_image_relative(z1), loglike_image_relative(z2)
else:
if not text1 or not text2:
return "<p style='color:red'>Please enter both texts.</p>"
z1 = embed_text(text1)
z2 = embed_text(text2)
c1, c2 = conformity_text(z1), conformity_text(z2)
l1, l2 = loglike_text_relative(z1), loglike_text_relative(z2)
img1_html = img2_html = "<div style='min-height:50px'></div>"
html = f"""
<div style='display:flex; gap:20px; min-height:150px; padding:10px; border:1px solid #eee; border-radius:8px;'>
<div style='text-align:center;'>{img1_html}<p><b>#1 {modality}</b></p>
<p>Conformity: {c1:.6f}</p><p>Log-Likelihood: {l1:.6f}</p>
</div>
<div style='text-align:center;'>{img2_html}<p><b>#2 {modality}</b></p>
<p>Conformity: {c2:.6f}</p><p>Log-Likelihood: {l2:.6f}</p>
</div>
<div style='text-align:center;'>
<p><b>Δ (2-1)</b></p>
<p>Δ Conformity: {c2-c1:.6f}</p>
<p>Δ Log-Likelihood: {l2-l1:.6f}</p>
</div>
</div>
"""
return html
btn_c = gr.Button("Compare")
btn_c.click(compare_pair_gui, inputs=[modality_c, txt1, img1, txt2, img2], outputs=[result_cmp])
gr.Markdown(
"""
**Implementation details:**
- Embeddings: `openai/clip-vit-large-patch14` via 🤗 Transformers; raw (unnormalized) features are used.
- Conformity: cosine similarity to the stored modality means `mu_image`, `mu_text` (vectors are unit-normalized inside the cosine).
- Log-likelihood: `-0.5 * (D * log(2π) + ||(x - mu) W||²)` using MS-COCO-based whitening matrices `W`.
"""
)
if __name__ == "__main__":
demo.launch(share=True)