astronolan Claude committed on
Commit c89f65f · 1 Parent(s): dda65a0

Add AION-Search Dash app for Hugging Face Spaces


- Add complete application code (app.py, src/, clip/)
- Add Dockerfile configured for HF Spaces deployment
- Add requirements.txt with torch-cpu and all dependencies
- Add cleaned model checkpoint (46MB, inference-only)
- Configure for port 7860 with gunicorn

Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

.gitignore ADDED
@@ -0,0 +1,16 @@
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ env/
+ venv/
+ .env
+ .venv
+ *.log
+ .DS_Store
+ tmp/data/processed/*
+ .python-version
+ pyproject.toml
+ uv.lock
+ .claude
Dockerfile ADDED
@@ -0,0 +1,28 @@
+ FROM python:3.9-slim
+
+ WORKDIR /app
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     build-essential \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Copy requirements and install Python dependencies
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy application code
+ COPY app.py .
+ COPY src/ ./src/
+ COPY clip/ ./clip/
+ COPY aionsearchmodel.pt .
+
+ # Create necessary directories
+ RUN mkdir -p data/processed logs
+
+ # Expose port for Hugging Face Spaces
+ EXPOSE 7860
+
+ # Run the application with gunicorn for production
+ # Increased timeout and workers for HF Spaces
+ CMD ["gunicorn", "--bind", "0.0.0.0:7860", "--workers", "1", "--threads", "2", "--timeout", "600", "app:server"]
README.md CHANGED
@@ -1,10 +1,10 @@
  ---
  title: AION Search
- emoji: 🦀
- colorFrom: red
- colorTo: pink
+ emoji: 🌌
+ colorFrom: blue
+ colorTo: purple
  sdk: docker
  pinned: false
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ AION-Search
aionsearchmodel.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e91a0b8e1f632165d62aff10dc598674a35a92e28a4312af220a606bd44664f6
+ size 48614488
app.py ADDED
@@ -0,0 +1,137 @@
+ #!/usr/bin/env python3
+ """AION Search - Galaxy Semantic Search Application.
+
+ A Dash web application for semantic search over galaxy images using CLIP embeddings.
+ """
+
+ import os
+ import logging
+ import argparse
+
+ # Fix OpenMP conflict - MUST be set before importing torch/numpy
+ os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
+
+ import dash
+ import dash_bootstrap_components as dbc
+
+ import src.config as config
+ from src.config import FEATURE_VECTOR_ADDITION
+ from src.components import get_app_theme, create_layout
+ from src.services import CLIPModelService, EmbeddingService, ZillizService, SearchService, ImageProcessingService
+ from src.callbacks import register_callbacks
+
+ # Set up logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+ )
+ logger = logging.getLogger(__name__)
+
+
+ def create_app(checkpoint_path: str) -> dash.Dash:
+     """Create and configure the Dash application.
+
+     Args:
+         checkpoint_path: Path to the CLIP model checkpoint
+
+     Returns:
+         Configured Dash app instance
+     """
+     # Initialize Dash app
+     app = dash.Dash(
+         __name__,
+         external_stylesheets=[dbc.themes.BOOTSTRAP, dbc.icons.FONT_AWESOME],
+         suppress_callback_exceptions=True
+     )
+
+     # Set custom theme
+     app.index_string = get_app_theme()
+
+     # Set app title
+     app.title = "AION Galaxy Search"
+
+     # Initialize services
+     logger.info("Initializing services...")
+
+     # Load CLIP model
+     clip_service = CLIPModelService()
+     clip_service.load_model(checkpoint_path)
+
+     # Create service instances
+     embedding_service = EmbeddingService(clip_service)
+     zilliz_service = ZillizService()
+
+     # Initialize image processing service for advanced search
+     # (now uses pre-existing embeddings from Zilliz, no model loading needed)
+     image_service = ImageProcessingService()
+     logger.info("Image processing service initialized successfully")
+
+     search_service = SearchService(embedding_service, zilliz_service, image_service)
+
+     # Get actual count from Zilliz and update config
+     actual_count = zilliz_service.get_collection_count()
+     if actual_count > 0:
+         config.TOTAL_GALAXIES = actual_count
+         logger.info(f"Services initialized. Total galaxies: {config.TOTAL_GALAXIES:,}")
+     else:
+         logger.warning(f"Failed to get collection count from Zilliz, using default: {config.TOTAL_GALAXIES:,}")
+
+     # Create app layout
+     app.layout = create_layout()
+
+     # Register callbacks
+     register_callbacks(app, search_service)
+
+     logger.info("App initialization complete!")
+
+     return app
+
+
+ def main():
+     """Main entry point for the application."""
+     parser = argparse.ArgumentParser(description='AION Galaxy Search App')
+     parser.add_argument(
+         '--checkpoint',
+         type=str,
+         default='aionsearchmodel.pt',
+         help='Path to CLIP model checkpoint'
+     )
+     parser.add_argument(
+         '--port',
+         type=int,
+         default=7860,
+         help='Port to run the app on'
+     )
+     parser.add_argument(
+         '--debug',
+         action='store_true',
+         help='Run in debug mode'
+     )
+     parser.add_argument(
+         '--host',
+         type=str,
+         default='0.0.0.0',
+         help='Host to run the app on'
+     )
+
+     args = parser.parse_args()
+
+     # Create and run app
+     logger.info("Starting AION Galaxy Search...")
+     app = create_app(args.checkpoint)
+
+     logger.info(f"Server starting on {args.host}:{args.port}")
+     app.run_server(
+         debug=args.debug,
+         host=args.host,
+         port=args.port
+     )
+
+
+ if __name__ == "__main__":
+     main()
+ else:
+     # When imported by a WSGI server, expose module-level `app`/`server` so the
+     # Dockerfile's gunicorn CMD ("app:server") resolves; uses the bundled checkpoint.
+     app = create_app("aionsearchmodel.pt")
+     server = app.server
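
With the port and entry point above, a minimal smoke test against a running instance looks like this (a sketch; it assumes the app is already serving on localhost:7860 and uses the `requests` package pinned in requirements.txt):

```python
# Smoke-test sketch: confirm the Dash app answers on the HF Spaces port.
import requests

resp = requests.get("http://localhost:7860/", timeout=10)
print(resp.status_code)  # expect 200 once model loading has finished
```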
clip/__init__.py ADDED
@@ -0,0 +1,8 @@
+ """
+ CLIP alignment for galaxy images and text descriptions.
+
+ This package provides tools for training and using CLIP-style alignment
+ between AION galaxy embeddings and text descriptions.
+ """
+
+ __version__ = "0.1.0"
clip/evaluation/__init__.py ADDED
@@ -0,0 +1,5 @@
+ """Evaluation utilities for CLIP model."""
+
+ from .inference import ClipInferenceModel
+
+ __all__ = ["ClipInferenceModel"]
clip/evaluation/inference.py ADDED
@@ -0,0 +1,82 @@
+ """
+ Inference utilities for trained CLIP model.
+ """
+
+ import torch
+ import torch.nn.functional as F
+ import numpy as np
+ from pathlib import Path
+ from typing import Union, List, Dict, Tuple
+ import logging
+
+ from ..models import GalaxyClipModel
+
+ logger = logging.getLogger(__name__)
+
+
+ class ClipInferenceModel:
+     """Wrapper for using trained CLIP model for inference and search."""
+
+     def __init__(self, model_path: str, device: str = "cpu"):
+         """
+         Initialize inference model.
+
+         Args:
+             model_path: Path to saved model (.pt file)
+             device: Device to use for inference
+         """
+         self.device = torch.device(device)
+
+         # Load model
+         checkpoint = torch.load(model_path, map_location=self.device)
+         model_config = checkpoint['model_config']
+
+         # Create model with same config
+         self.model = GalaxyClipModel(
+             image_input_dim=model_config['image_input_dim'],
+             text_input_dim=model_config['text_input_dim'],
+             embedding_dim=model_config['embedding_dim']
+         )
+
+         # Load weights
+         self.model.load_state_dict(checkpoint['model_state_dict'])
+         self.model.to(self.device)
+         self.model.eval()
+
+         self.config = model_config
+         logger.info(f"Loaded CLIP model on {device}")
+         logger.info(f"Model config: {model_config}")
+
+     def encode_images(self, image_embeddings):
+         """Encode image embeddings to shared space."""
+
+         tensor = torch.as_tensor(image_embeddings, dtype=torch.float, device=self.device)
+
+         if tensor.ndim == 1:
+             tensor = tensor.unsqueeze(0)
+             squeeze = True
+         else:
+             squeeze = False
+
+         with torch.no_grad():
+             # Use image_projector (which normalizes its output internally)
+             out = self.model.image_projector(tensor)
+
+         return out.squeeze(0).cpu() if squeeze else out.cpu()
+
+     def encode_texts(self, text_embeddings):
+         """Encode text embeddings to shared space."""
+
+         tensor = torch.as_tensor(text_embeddings, dtype=torch.float, device=self.device)
+
+         if tensor.ndim == 1:
+             tensor = tensor.unsqueeze(0)
+             squeeze = True
+         else:
+             squeeze = False
+
+         with torch.no_grad():
+             # Use text_projector (which normalizes its output internally)
+             out = self.model.text_projector(tensor)
+
+         return out.squeeze(0).cpu() if squeeze else out.cpu()
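
For reference, a minimal usage sketch for `ClipInferenceModel` (the random arrays are placeholders for real AION and text embeddings; 768/3072 match the defaults below, but the effective dimensions come from the checkpoint's `model_config`):

```python
# Sketch: rank image embeddings against a text query in the shared CLIP space.
import numpy as np
import torch
from clip.evaluation.inference import ClipInferenceModel

model = ClipInferenceModel("aionsearchmodel.pt", device="cpu")

image_embs = np.random.randn(100, 768).astype(np.float32)  # placeholder AION embeddings
text_emb = np.random.randn(3072).astype(np.float32)        # placeholder text embedding

img_proj = model.encode_images(image_embs)  # (100, embedding_dim), unit-norm rows
txt_proj = model.encode_texts(text_emb)     # (embedding_dim,), unit-norm
scores = img_proj @ txt_proj                # cosine similarity per galaxy
print(torch.topk(scores, k=5).indices)      # indices of the 5 closest matches
```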
clip/models/__init__.py ADDED
@@ -0,0 +1,6 @@
+ """CLIP model architecture for galaxy embeddings."""
+
+ from .clip_model import GalaxyClipModel
+ from .projections import CrossAttentionImageProjector, TextProjector
+
+ __all__ = ["GalaxyClipModel", "CrossAttentionImageProjector", "TextProjector"]
clip/models/clip_model.py ADDED
@@ -0,0 +1,118 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from typing import Dict
+
+ from .projections import TextProjector, CrossAttentionImageProjector, SimpleImageProjector
+
+ class GalaxyClipModel(nn.Module):
+     """CLIP model for aligning galaxy images and text descriptions."""
+
+     def __init__(
+         self,
+         image_input_dim: int = 768,
+         text_input_dim: int = 3072,
+         embedding_dim: int = 1024,
+         image_hidden_dim: int = 768,
+         text_hidden_dim: int = 1024,
+         dropout: float = 0.1,
+         use_mean_embeddings: bool = True
+     ):
+         """
+         Initialize CLIP model.
+
+         Args:
+             image_input_dim: AION embedding dimension
+             text_input_dim: Text embedding dimension
+             embedding_dim: Shared embedding space dimension
+             image_hidden_dim: Hidden dimension for image projector
+             text_hidden_dim: Hidden dimension for text projector
+             dropout: Dropout rate
+             use_mean_embeddings: Whether using mean embeddings (True) or full embeddings (False)
+         """
+         super().__init__()
+
+         self.embedding_dim = embedding_dim
+         self.use_mean_embeddings = use_mean_embeddings
+
+         # Choose appropriate image projector based on embedding type
+         if use_mean_embeddings:
+             # Simple projector for mean embeddings (1D vectors)
+             self.image_projector = SimpleImageProjector(
+                 input_dim=image_input_dim,
+                 output_dim=embedding_dim,
+                 hidden_dim=image_hidden_dim,
+                 dropout=dropout
+             )
+         else:
+             # Cross-attention projector for full embeddings (2D sequences)
+             self.image_projector = CrossAttentionImageProjector(
+                 input_dim=image_input_dim,
+                 output_dim=embedding_dim,
+                 hidden_dim=image_hidden_dim,
+                 dropout=dropout
+             )
+
+         self.text_projector = TextProjector(
+             input_dim=text_input_dim,
+             output_dim=embedding_dim,
+             hidden_dim=text_hidden_dim,
+             dropout=dropout
+         )
+
+         # Learnable logit scale parameter initialized to standard CLIP temperature 1/0.07
+         # Using log parameterization for numerical stability
+         self.logit_scale = nn.Parameter(torch.log(torch.tensor(1/0.07, dtype=torch.float32)))
+
+     def forward(self, batch: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
+         """
+         Forward pass for CLIP training.
+
+         Args:
+             batch: Dictionary containing 'image_embedding' and 'text_embedding'
+
+         Returns:
+             Dictionary with projected embeddings and logits
+         """
+         image_features = batch['image_embedding']
+         text_features = batch['text_embedding']
+
+         # Project to shared space and normalize
+         image_features = self.image_projector(image_features)
+         text_features = self.text_projector(text_features)
+
+         # Compute similarity matrix with learnable logit scale
+         # Clamp after exp to preserve gradients
+         logit_scale = self.logit_scale.exp().clamp(max=100)
+         logits_per_image = logit_scale * image_features @ text_features.T
+         logits_per_text = logits_per_image.T
+
+         return {
+             'image_features': image_features,
+             'text_features': text_features,
+             'logits_per_image': logits_per_image,
+             'logits_per_text': logits_per_text,
+             'logit_scale': logit_scale
+         }
+
+     def compute_contrastive_loss(self, outputs: Dict[str, torch.Tensor]) -> torch.Tensor:
+         """
+         Compute contrastive loss (InfoNCE).
+
+         Args:
+             outputs: Model outputs from forward pass
+
+         Returns:
+             Contrastive loss
+         """
+         logits_per_image = outputs['logits_per_image']
+         logits_per_text = outputs['logits_per_text']
+
+         batch_size = logits_per_image.shape[0]
+         labels = torch.arange(batch_size, device=logits_per_image.device)
+
+         # Cross-entropy loss for both directions
+         loss_i2t = F.cross_entropy(logits_per_image, labels)
+         loss_t2i = F.cross_entropy(logits_per_text, labels)
+
+         return (loss_i2t + loss_t2i) / 2
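
A single contrastive training step with this model looks roughly like the following (a sketch; random tensors stand in for a real batch of mean AION and text embeddings):

```python
# Sketch: one symmetric InfoNCE step with GalaxyClipModel.
import torch
from clip.models import GalaxyClipModel

model = GalaxyClipModel(image_input_dim=768, text_input_dim=3072, embedding_dim=1024)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

batch = {
    'image_embedding': torch.randn(64, 768),   # placeholder mean AION embeddings
    'text_embedding': torch.randn(64, 3072),   # placeholder text embeddings
}
outputs = model(batch)                          # projected features + scaled logits
loss = model.compute_contrastive_loss(outputs)  # (image->text + text->image) / 2
loss.backward()
optimizer.step()
optimizer.zero_grad()
```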
clip/models/projections.py ADDED
@@ -0,0 +1,270 @@
+ import torch
+ import torch.nn as nn
+ from typing import Optional
+ import torch.nn.functional as F
+
+ class TextProjector(nn.Module):
+     """Projects text embeddings to shared space."""
+
+     def __init__(
+         self,
+         input_dim: int = 3072,
+         output_dim: int = 1024,
+         hidden_dim: Optional[int] = None,
+         dropout: float = 0.1,
+         num_layers: int = 4,
+     ):
+         """
+         Initialize text projector.
+
+         Args:
+             input_dim: Dimension of text embeddings (3072)
+             output_dim: Dimension of shared embedding space
+             hidden_dim: Hidden layer dimension (default: 1024)
+             dropout: Dropout rate
+             num_layers: Number of residual layers (default: 4)
+         """
+         super().__init__()
+
+         if hidden_dim is None:
+             hidden_dim = 1024
+
+         self.fc_in = nn.Linear(input_dim, hidden_dim)
+         self.blocks = nn.ModuleList([
+             nn.Sequential(
+                 nn.LayerNorm(hidden_dim),
+                 nn.GELU(),
+                 nn.Dropout(dropout),
+                 nn.Linear(hidden_dim, hidden_dim),
+             ) for _ in range(num_layers)
+         ])
+         self.fc_out = nn.Linear(hidden_dim, output_dim)
+
+         # Initialize weights
+         self._init_weights()
+
+     def _init_weights(self):
+         """Initialize projection weights."""
+         for module in self.modules():
+             if isinstance(module, nn.Linear):
+                 nn.init.xavier_uniform_(module.weight)
+                 if module.bias is not None:
+                     nn.init.zeros_(module.bias)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         """
+         Project text embeddings to shared space.
+
+         Args:
+             x: Text embeddings (batch_size, input_dim)
+
+         Returns:
+             Projected embeddings (batch_size, output_dim)
+         """
+         h = self.fc_in(x)
+         for blk in self.blocks:  # residual MLP stack
+             h = h + blk(h)
+         h = self.fc_out(h)
+         return F.normalize(h, dim=-1, eps=1e-3)
+
+
+ class CrossAttentionImageProjector(nn.Module):
+     """Simplified projector with self-attention + cross-attention."""
+
+     def __init__(
+         self,
+         input_dim: int = 768,
+         output_dim: int = 1024,
+         hidden_dim: Optional[int] = None,
+         dropout: float = 0.1,
+         num_layers: int = 2,  # Kept for compatibility, not used
+         num_heads: int = 4,  # Reduced from 8
+     ):
+         """
+         Initialize simplified cross-attention image projector.
+
+         Args:
+             input_dim: Dimension of AION embeddings (768)
+             output_dim: Dimension of shared embedding space (default: 1024)
+             hidden_dim: Hidden dimension for attention (default: output_dim)
+             dropout: Dropout rate
+             num_layers: Kept for compatibility but not used
+             num_heads: Number of attention heads (reduced to 4)
+         """
+         super().__init__()
+
+         if hidden_dim is None:
+             hidden_dim = output_dim
+
+         self.input_dim = input_dim
+         self.hidden_dim = hidden_dim
+         self.output_dim = output_dim
+
+         # Project input to hidden dim
+         self.input_proj = nn.Linear(input_dim, hidden_dim)
+
+         # Token pooling to reduce sequence length
+         # 576 tokens -> 64 tokens (9x reduction)
+         self.token_pool = nn.Conv1d(hidden_dim, hidden_dim, kernel_size=9, stride=9, padding=0)
+
+         # Single self-attention layer
+         self.self_attn_norm = nn.LayerNorm(hidden_dim)
+         self.self_attn = nn.MultiheadAttention(
+             embed_dim=hidden_dim,
+             num_heads=num_heads,
+             dropout=dropout,
+             batch_first=True
+         )
+
+         # MLP after self-attention
+         self.mlp1_norm = nn.LayerNorm(hidden_dim)
+         self.mlp1 = nn.Sequential(
+             nn.Linear(hidden_dim, hidden_dim * 2),  # Reduced from 4x
+             nn.GELU(),
+             nn.Dropout(dropout),
+             nn.Linear(hidden_dim * 2, hidden_dim),
+             nn.Dropout(dropout)
+         )
+
+         # Learned query vector
+         self.query = nn.Parameter(torch.randn(1, 1, hidden_dim))
+
+         # Single cross-attention layer
+         self.cross_attn_norm = nn.LayerNorm(hidden_dim)
+         self.cross_attn = nn.MultiheadAttention(
+             embed_dim=hidden_dim,
+             num_heads=num_heads,
+             dropout=dropout,
+             batch_first=True
+         )
+
+         # Final MLP
+         self.final_norm = nn.LayerNorm(hidden_dim)
+         self.final_mlp = nn.Sequential(
+             nn.Linear(hidden_dim, hidden_dim * 2),  # Reduced from 4x
+             nn.GELU(),
+             nn.Dropout(dropout),
+             nn.Linear(hidden_dim * 2, output_dim)
+         )
+
+         # Initialize weights
+         self._init_weights()
+
+     def _init_weights(self):
+         """Initialize weights."""
+         # Initialize query vector
+         nn.init.normal_(self.query, std=0.02)
+
+         # Initialize other weights
+         for module in self.modules():
+             if isinstance(module, nn.Linear):
+                 nn.init.xavier_uniform_(module.weight)
+                 if module.bias is not None:
+                     nn.init.zeros_(module.bias)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         """
+         Project image embeddings to shared space using self-attention + cross-attention.
+
+         Args:
+             x: Image embeddings (batch_size, n_tokens, input_dim)
+
+         Returns:
+             Projected embeddings (batch_size, output_dim)
+         """
+         batch_size = x.shape[0]
+         x = F.normalize(x, dim=-1, eps=1e-6)  # Normalize AION embeddings input (handles [B, N, D])
+
+         # Project input
+         x = self.input_proj(x)  # (B, N, hidden_dim)
+
+         # Pool tokens to reduce sequence length
+         x = x.transpose(1, 2)  # (B, hidden_dim, N)
+         x = self.token_pool(x)  # (B, hidden_dim, N//9)
+         x = x.transpose(1, 2)  # (B, N//9, hidden_dim)
+
+         # Self-attention with residual on pooled tokens
+         x_norm = self.self_attn_norm(x)
+         x_attn, _ = self.self_attn(x_norm, x_norm, x_norm, need_weights=False)
+         x = x + x_attn
+
+         # MLP with residual
+         x = x + self.mlp1(self.mlp1_norm(x))
+
+         # Cross-attention with learned query
+         query = self.query.expand(batch_size, -1, -1)  # (B, 1, hidden_dim)
+         q_norm = self.cross_attn_norm(query)
+         attended, _ = self.cross_attn(q_norm, x, x, need_weights=False)
+         query = query + attended
+
+         # Final processing
+         output = self.final_norm(query).squeeze(1)  # (B, hidden_dim)
+         output = self.final_mlp(output)  # (B, output_dim)
+
+         return F.normalize(output, dim=-1, eps=1e-3)
+
+
+ class SimpleImageProjector(nn.Module):
+     """Simple projector for mean AION embeddings."""
+
+     def __init__(
+         self,
+         input_dim: int = 768,
+         output_dim: int = 1024,
+         hidden_dim: Optional[int] = None,
+         dropout: float = 0.1,
+         num_layers: int = 4,
+     ):
+         """
+         Initialize simple image projector.
+
+         Args:
+             input_dim: Dimension of AION embeddings (768)
+             output_dim: Dimension of shared embedding space
+             hidden_dim: Hidden layer dimension (default: 1024)
+             dropout: Dropout rate
+             num_layers: Number of residual layers (default: 4)
+         """
+         super().__init__()
+
+         if hidden_dim is None:
+             hidden_dim = 1024
+
+         self.fc_in = nn.Linear(input_dim, hidden_dim)
+         self.blocks = nn.ModuleList([
+             nn.Sequential(
+                 nn.LayerNorm(hidden_dim),
+                 nn.GELU(),
+                 nn.Dropout(dropout),
+                 nn.Linear(hidden_dim, hidden_dim),
+             ) for _ in range(num_layers)
+         ])
+         self.fc_out = nn.Linear(hidden_dim, output_dim)
+
+         # Initialize weights
+         self._init_weights()
+
+     def _init_weights(self):
+         """Initialize projection weights."""
+         for module in self.modules():
+             if isinstance(module, nn.Linear):
+                 nn.init.xavier_uniform_(module.weight)
+                 if module.bias is not None:
+                     nn.init.zeros_(module.bias)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         """
+         Project image embeddings to shared space.
+
+         Args:
+             x: Image embeddings (batch_size, input_dim)
+
+         Returns:
+             Projected embeddings (batch_size, output_dim)
+         """
+         x = F.normalize(x, dim=-1, eps=1e-6)  # Normalize AION embeddings input
+         h = self.fc_in(x)
+         for blk in self.blocks:  # residual MLP stack
+             h = h + blk(h)
+         h = self.fc_out(h)
+         return F.normalize(h, dim=-1, eps=1e-3)
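
A quick shape check for the two image projectors (a sketch; the 576-token sequence length matches the Conv1d pooling comment above, kernel 9 / stride 9 -> 64 tokens):

```python
# Sketch: verify both projectors map to (batch, output_dim) unit-norm vectors.
import torch
from clip.models.projections import CrossAttentionImageProjector, SimpleImageProjector

cross = CrossAttentionImageProjector(input_dim=768, output_dim=1024)
simple = SimpleImageProjector(input_dim=768, output_dim=1024)

tokens = torch.randn(2, 576, 768)  # full AION token sequences
means = torch.randn(2, 768)        # mean AION embeddings

print(cross(tokens).shape)  # torch.Size([2, 1024])
print(simple(means).shape)  # torch.Size([2, 1024])
```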
clip/utils/__init__.py ADDED
@@ -0,0 +1,10 @@
+ """Utility functions for CLIP training and evaluation."""
+
+ from .logging_utils import setup_logging
+ from .io_utils import save_clip_embeddings_hdf5, inspect_generated_files
+
+ __all__ = [
+     "setup_logging",
+     "save_clip_embeddings_hdf5",
+     "inspect_generated_files"
+ ]
clip/utils/data_loader.py ADDED
@@ -0,0 +1,250 @@
+ """
+ Data loader for multi-text training using unified parquet file with nested text embeddings.
+ This loader handles the new unified format from 05_generate_unified_embeddings.py.
+ """
+
+ import numpy as np
+ import pandas as pd
+ import torch
+ from torch.utils.data import Dataset, DataLoader
+ import logging
+ from pathlib import Path
+ import random
+
+ logger = logging.getLogger(__name__)
+
+
+ class UnifiedMultiTextDataset(Dataset):
+     """Dataset for unified parquet file with multiple text embeddings per galaxy."""
+
+     def __init__(self, parquet_path, split="train", train_ratio=0.8,
+                  text_sampling_strategy="random", epoch=0, max_train_samples=None,
+                  num_embedding=None):
+         self.parquet_path = Path(parquet_path)
+         self.split = split
+         self.train_ratio = train_ratio
+         self.text_sampling_strategy = text_sampling_strategy
+         self.epoch = epoch
+         self.max_train_samples = max_train_samples
+         self.num_embedding = num_embedding
+
+         # Load the parquet file
+         logger.info(f"Loading unified embeddings from {self.parquet_path}")
+         self.df = pd.read_parquet(self.parquet_path)
+
+         # Create train/val split based on galaxy_index
+         n_samples = len(self.df)
+         indices = np.arange(n_samples)
+         self.seed = 42
+
+         # Deterministic split based on galaxy_index
+         split_mask = []
+         for idx in range(n_samples):
+             galaxy_idx = self.df.iloc[idx]['galaxy_index']
+             # Hash the galaxy index for deterministic assignment
+             sample_hash = hash((galaxy_idx, self.seed)) % 10000 / 10000.0
+             is_train = sample_hash < self.train_ratio
+             split_mask.append(is_train)
+
+         split_mask = np.array(split_mask)
+
+         if split == "train":
+             self.indices = indices[split_mask]
+             # Limit training samples if specified
+             if self.max_train_samples is not None and len(self.indices) > self.max_train_samples:
+                 rng = np.random.RandomState(self.seed)
+                 selected_indices = rng.choice(self.indices, size=self.max_train_samples, replace=False)
+                 self.indices = np.sort(selected_indices)  # Sort for reproducibility
+                 logger.info(f"Limited training set to {self.max_train_samples} samples")
+         else:
+             self.indices = indices[~split_mask]
+
+         logger.info(f"Dataset initialized: {len(self.indices)} samples for {split} split")
+         logger.info(f"Text sampling strategy: {text_sampling_strategy}")
+
+         # Validate num_embedding parameter for specific_summary strategy
+         if text_sampling_strategy == "specific_summary" and num_embedding is None:
+             raise ValueError("num_embedding parameter is required when using 'specific_summary' strategy")
+
+         # Check data structure
+         sample_row = self.df.iloc[0]
+         n_augmented = len(sample_row['augmented_embeddings'])
+         logger.info(f"Each galaxy has 1 original + {n_augmented} augmented embeddings = {1 + n_augmented} total")
+
+         # Validate num_embedding is within valid range
+         if text_sampling_strategy == "specific_summary":
+             total_embeddings = 1 + n_augmented
+             if num_embedding < 0 or num_embedding >= total_embeddings:
+                 raise ValueError(f"num_embedding must be between 0 and {total_embeddings-1}, got {num_embedding}")
+             logger.info(f"Using specific embedding at index {num_embedding}")
+
+     def __len__(self):
+         return len(self.indices)
+
+     def set_epoch(self, epoch):
+         """Set current epoch for round-robin sampling."""
+         self.epoch = epoch
+
+     def _get_all_embeddings_and_sources(self, row):
+         """Combine original and augmented embeddings into single lists."""
+         # Start with original embedding
+         all_embeddings = [np.array(row['text_embedding'], dtype=np.float32)]
+         all_sources = [row['description_sources'][0]]  # 'original'
+
+         # Add augmented embeddings
+         for aug_emb, aug_source in zip(row['augmented_embeddings'], row['description_sources'][1:]):
+             all_embeddings.append(np.array(aug_emb, dtype=np.float32))
+             all_sources.append(aug_source)
+
+         return all_embeddings, all_sources
+
+     def _sample_text_embedding(self, text_embeddings, text_sources, galaxy_idx):
+         """Sample one text embedding from multiple options."""
+         n_texts = len(text_embeddings)
+
+         if self.text_sampling_strategy == "original":
+             # Always use original text (index 0)
+             idx = 0
+         elif self.text_sampling_strategy == "summaries-only":
+             # Only use summaries (exclude original at index 0)
+             if n_texts > 1:
+                 rng = random.Random(galaxy_idx + self.epoch * 1000000)
+                 idx = rng.randint(1, n_texts - 1)  # Start from 1 to exclude original
+             else:
+                 # Fallback to original if no summaries available
+                 idx = 0
+         elif self.text_sampling_strategy == "specific_summary":
+             # Use the specific embedding index provided
+             if self.num_embedding < n_texts:
+                 idx = self.num_embedding
+             else:
+                 # Fallback to original if index out of range
+                 logger.warning(f"Requested embedding index {self.num_embedding} out of range for {n_texts} embeddings, using original")
+                 idx = 0
+         elif self.text_sampling_strategy == "random":
+             # Random sampling with seed based on galaxy_idx and epoch
+             rng = random.Random(galaxy_idx + self.epoch * 1000000)
+             idx = rng.randint(0, n_texts - 1)
+         elif self.text_sampling_strategy == "round-robin":
+             # Cycle through texts based on epoch
+             idx = (self.epoch + galaxy_idx) % n_texts
+         elif self.text_sampling_strategy == "weighted":
+             # Weight towards original (50%) and summaries (50% / n_summaries each)
+             rng = random.Random(galaxy_idx + self.epoch * 1000000)
+             n_summaries = n_texts - 1
+             if n_summaries > 0:
+                 summary_weight = 0.5 / n_summaries
+                 weights = [0.5] + [summary_weight] * n_summaries
+             else:
+                 weights = [1.0]
+             idx = rng.choices(range(n_texts), weights=weights)[0]
+         else:
+             idx = 0  # Default to original
+
+         return text_embeddings[idx], text_sources[idx], idx
+
+     def __getitem__(self, idx):
+         """Get a single sample with randomly selected text embedding."""
+         actual_idx = self.indices[idx]
+         row = self.df.iloc[actual_idx]
+
+         # Get AION embedding
+         aion_embedding = np.array(row['aion_embedding'], dtype=np.float32)
+
+         # Get all text embeddings and sources
+         text_embeddings, text_sources = self._get_all_embeddings_and_sources(row)
+
+         # Sample one text embedding
+         galaxy_idx = row['galaxy_index']
+         selected_text, selected_source, text_idx = self._sample_text_embedding(
+             text_embeddings, text_sources, galaxy_idx
+         )
+
+         # Log selection details periodically (every 100th sample)
+         if idx % 100 == 0:
+             logger.debug(f"Galaxy {galaxy_idx}: Selected {selected_source} (index {text_idx}) from {len(text_sources)} options")
+
+         return {
+             'aion_embedding': torch.from_numpy(aion_embedding),
+             'text_embedding': torch.from_numpy(selected_text),
+             'galaxy_index': galaxy_idx,
+             'text_source': selected_source,
+             'text_index': text_idx,
+             'object_id': row['object_id']
+         }
+
+
+ def create_unified_multi_text_loaders(
+     unified_embeddings_path,
+     batch_size=64,
+     train_ratio=0.8,
+     pin_memory=True,
+     text_sampling_strategy="random",
+     num_workers=4,
+     max_train_samples=None,
+     num_embedding=None,
+     **kwargs
+ ):
+     """
+     Create train and validation data loaders for multi-text training from unified parquet.
+
+     Args:
+         unified_embeddings_path: Path to unified parquet file
+         batch_size: Batch size for training
+         train_ratio: Fraction of samples for training
+         pin_memory: Whether to pin memory for GPU transfer
+         text_sampling_strategy: How to sample text embeddings ("original", "summaries-only", "specific_summary", "random", "round-robin", "weighted")
+         num_workers: Number of data loading workers
+         max_train_samples: Maximum number of training samples (for data scaling experiments)
+         num_embedding: When using "specific_summary" strategy, the index of the embedding to use
+         **kwargs: Additional arguments
+     """
+
+     # Convert to Path
+     parquet_path = Path(unified_embeddings_path)
+
+     if not parquet_path.exists():
+         raise ValueError(f"Unified embeddings file not found: {parquet_path}")
+
+     logger.info(f"Creating unified multi-text data loaders from {parquet_path}")
+     logger.info(f"Batch size: {batch_size}, Workers: {num_workers}")
+     logger.info(f"Text sampling strategy: {text_sampling_strategy}")
+
+     # Create datasets
+     train_dataset = UnifiedMultiTextDataset(
+         parquet_path=parquet_path,
+         split="train",
+         train_ratio=train_ratio,
+         text_sampling_strategy=text_sampling_strategy,
+         max_train_samples=max_train_samples,
+         num_embedding=num_embedding
+     )
+
+     val_dataset = UnifiedMultiTextDataset(
+         parquet_path=parquet_path,
+         split="val",
+         train_ratio=train_ratio,
+         text_sampling_strategy=text_sampling_strategy,
+         num_embedding=num_embedding
+     )
+
+     # Create loaders
+     train_loader = DataLoader(
+         train_dataset,
+         batch_size=batch_size,
+         shuffle=True,  # Shuffle within the train split
+         num_workers=num_workers,
+         pin_memory=pin_memory,
+         drop_last=True  # Drop incomplete batches for stable training
+     )
+
+     val_loader = DataLoader(
+         val_dataset,
+         batch_size=batch_size,
+         shuffle=False,  # No shuffle for validation
+         num_workers=num_workers,
+         pin_memory=pin_memory,
+         drop_last=False
+     )
+
+     return train_loader, val_loader
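
Typical use of the loader factory (a sketch; the parquet path is illustrative — the real file is produced by 05_generate_unified_embeddings.py):

```python
# Sketch: build train/val loaders and peek at one batch.
from clip.utils.data_loader import create_unified_multi_text_loaders

train_loader, val_loader = create_unified_multi_text_loaders(
    "tmp/data/processed/unified_embeddings.parquet",  # hypothetical location
    batch_size=64,
    train_ratio=0.8,
    text_sampling_strategy="random",
    num_workers=4,
)
batch = next(iter(train_loader))
print(batch['aion_embedding'].shape, batch['text_embedding'].shape)
```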
clip/utils/io_utils.py ADDED
@@ -0,0 +1,103 @@
+ """
+ I/O utilities for saving and loading CLIP embeddings.
+ """
+
+ import h5py
+ import numpy as np
+ from pathlib import Path
+ from datetime import datetime
+ import logging
+
+ logger = logging.getLogger(__name__)
+
+
+ def save_clip_embeddings_hdf5(
+     object_ids,
+     galaxy_data,
+     text_data,
+     aion_clip_embeddings,
+     text_clip_embeddings,
+     output_dir="data/processed"
+ ):
+     """Save CLIP embeddings to separate HDF5 files."""
+     output_dir = Path(output_dir)
+     output_dir.mkdir(parents=True, exist_ok=True)
+
+     # File paths (standardized names)
+     aion_clip_path = output_dir / "galaxy_aion_clip_embeddings.hdf5"
+     text_clip_path = output_dir / "galaxy_text_clip_embeddings.hdf5"
+
+     logger.info(f"Saving AION CLIP embeddings to: {aion_clip_path}")
+
+     # Save AION CLIP embeddings
+     with h5py.File(aion_clip_path, 'w') as f:
+         # Object IDs
+         dt = h5py.special_dtype(vlen=str)
+         f.create_dataset('object_id', data=[str(oid) for oid in object_ids], dtype=dt)
+
+         # Coordinates and metadata
+         ra_values = np.array([galaxy_data[oid]['ra'] for oid in object_ids])
+         dec_values = np.array([galaxy_data[oid]['dec'] for oid in object_ids])
+         healpix_values = np.array([galaxy_data[oid]['healpix'] for oid in object_ids])
+
+         f.create_dataset('ra', data=ra_values, dtype=np.float64)
+         f.create_dataset('dec', data=dec_values, dtype=np.float64)
+         f.create_dataset('healpix', data=healpix_values, dtype=np.int64)
+
+         # AION CLIP embeddings
+         f.create_dataset('AION_clip_embedding', data=aion_clip_embeddings, dtype=np.float32)
+
+         # Metadata
+         f.attrs['description'] = 'AION embeddings encoded through trained CLIP model'
+         f.attrs['embedding_dim'] = aion_clip_embeddings.shape[1]
+         f.attrs['num_objects'] = len(object_ids)
+         f.attrs['created'] = datetime.now().isoformat()
+
+     logger.info(f"Saving text CLIP embeddings to: {text_clip_path}")
+
+     # Save text CLIP embeddings
+     with h5py.File(text_clip_path, 'w') as f:
+         # Object IDs
+         dt = h5py.special_dtype(vlen=str)
+         f.create_dataset('object_id', data=[str(oid) for oid in object_ids], dtype=dt)
+
+         # Coordinates and metadata (use text data for consistency)
+         ra_values = np.array([text_data[oid]['ra'] for oid in object_ids])
+         dec_values = np.array([text_data[oid]['dec'] for oid in object_ids])
+         healpix_values = np.array([text_data[oid]['healpix'] for oid in object_ids])
+
+         f.create_dataset('ra', data=ra_values, dtype=np.float64)
+         f.create_dataset('dec', data=dec_values, dtype=np.float64)
+         f.create_dataset('healpix', data=healpix_values, dtype=np.int64)
+
+         # Text CLIP embeddings
+         f.create_dataset('text_clip_embedding', data=text_clip_embeddings, dtype=np.float32)
+
+         # Metadata
+         f.attrs['description'] = 'Text embeddings encoded through trained CLIP model'
+         f.attrs['embedding_dim'] = text_clip_embeddings.shape[1]
+         f.attrs['num_objects'] = len(object_ids)
+         f.attrs['created'] = datetime.now().isoformat()
+
+     return aion_clip_path, text_clip_path
+
+
+ def inspect_generated_files(aion_clip_path, text_clip_path):
+     """Inspect the generated HDF5 files."""
+     logger.info("Inspecting generated AION CLIP embeddings file...")
+
+     with h5py.File(aion_clip_path, 'r') as f:
+         logger.info(f"AION file datasets: {list(f.keys())}")
+         for key in f.keys():
+             dataset = f[key]
+             logger.info(f"  {key}: shape={dataset.shape}, dtype={dataset.dtype}")
+         logger.info(f"  Attributes: {dict(f.attrs)}")
+
+     logger.info("Inspecting generated text CLIP embeddings file...")
+
+     with h5py.File(text_clip_path, 'r') as f:
+         logger.info(f"Text file datasets: {list(f.keys())}")
+         for key in f.keys():
+             dataset = f[key]
+             logger.info(f"  {key}: shape={dataset.shape}, dtype={dataset.dtype}")
+         logger.info(f"  Attributes: {dict(f.attrs)}")
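
A round-trip sketch for these helpers (toy data; the metadata dicts must expose 'ra', 'dec', and 'healpix' per object ID, as the code above reads):

```python
# Sketch: save two toy CLIP embeddings and inspect the resulting HDF5 files.
import numpy as np
from clip.utils.io_utils import save_clip_embeddings_hdf5, inspect_generated_files

object_ids = ["galaxy_0", "galaxy_1"]
meta = {oid: {"ra": 150.1, "dec": 2.2, "healpix": 0} for oid in object_ids}
aion_clip = np.random.randn(2, 1024).astype(np.float32)  # placeholder embeddings
text_clip = np.random.randn(2, 1024).astype(np.float32)

aion_path, text_path = save_clip_embeddings_hdf5(
    object_ids, meta, meta, aion_clip, text_clip, output_dir="tmp/data/processed"
)
inspect_generated_files(aion_path, text_path)
```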
clip/utils/logging_utils.py ADDED
@@ -0,0 +1,42 @@
+ """Logging utilities."""
+
+ import logging
+ import sys
+ from pathlib import Path
+
+
+ def setup_logging(log_level: str = "INFO", log_file: str = None):
+     """
+     Setup logging configuration.
+
+     Args:
+         log_level: Logging level (DEBUG, INFO, WARNING, ERROR)
+         log_file: Optional path to log file
+     """
+     # Clear any existing handlers
+     logging.getLogger().handlers.clear()
+
+     # Create formatter
+     formatter = logging.Formatter(
+         '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+     )
+
+     # Console handler
+     console_handler = logging.StreamHandler(sys.stdout)
+     console_handler.setFormatter(formatter)
+
+     # Setup root logger
+     logger = logging.getLogger()
+     logger.setLevel(getattr(logging, log_level.upper()))
+     logger.addHandler(console_handler)
+
+     # File handler if specified
+     if log_file:
+         log_path = Path(log_file)
+         log_path.parent.mkdir(parents=True, exist_ok=True)
+
+         file_handler = logging.FileHandler(log_path)
+         file_handler.setFormatter(formatter)
+         logger.addHandler(file_handler)
+
+     return logger
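
Usage is a single call at program start (a sketch; the log path is illustrative, and its parent directory is created automatically):

```python
# Sketch: log to both stdout and a file.
from clip.utils.logging_utils import setup_logging

logger = setup_logging(log_level="DEBUG", log_file="logs/clip_training.log")
logger.info("Logging configured")
```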
main.py ADDED
@@ -0,0 +1,6 @@
+ def main():
+     print("Hello from aion-search!")
+
+
+ if __name__ == "__main__":
+     main()
requirements.txt ADDED
@@ -0,0 +1,15 @@
+ dash==2.14.1
+ dash-bootstrap-components==1.5.0
+ h5py==3.10.0
+ numpy==1.24.3
+ openai==1.10.0
+ httpx==0.26.0
+ gunicorn==21.2.0
+ huggingface-hub==0.20.1
+ pandas==2.0.3
+ faiss-cpu==1.7.4
+ python-dotenv==1.1.1
+ # CPU-only torch wheels (pip does not accept --index-url inline on a requirement line)
+ --extra-index-url https://download.pytorch.org/whl/cpu
+ torch
+ requests
src/__init__.py ADDED
@@ -0,0 +1,3 @@
+ """AION Search - Galaxy Semantic Search Application."""
+
+ __version__ = "0.2.0"
src/callbacks.py ADDED
@@ -0,0 +1,775 @@
+ """Dash callbacks for AION Search."""
+
+ import json
+ import time
+ import logging
+ import traceback
+ import pandas as pd
+ import dash
+ from dash import Input, Output, State, callback_context, html
+ import dash_bootstrap_components as dbc
+
+ import src.config as config
+ from src.config import (
+     DEFAULT_DISPLAY_COUNT,
+     LOAD_MORE_COUNT,
+     IMAGE_HEIGHT,
+     IMAGE_WIDTH,
+     ZILLIZ_PRIMARY_KEY,
+ )
+ from src.components import create_vector_input_row
+ from src.services import SearchService
+
+ logger = logging.getLogger(__name__)
+
+
+ def register_callbacks(app, search_service: SearchService):
+     """Register all Dash callbacks with the app.
+
+     Args:
+         app: Dash app instance
+         search_service: SearchService instance for performing searches
+     """
+
+     @app.callback(
+         Output("galaxy-count", "children"),
+         Input("galaxy-count", "id")
+     )
+     def update_galaxy_count(_):
+         """Update the galaxy count display."""
+         if search_service and config.TOTAL_GALAXIES > 0:
+             return f"{config.TOTAL_GALAXIES:,} galaxies"
+         else:
+             return "loading..."
+
+     @app.callback(
+         [Output("vector-collapse", "is_open"),
+          Output("vector-arrow", "className")],
+         Input("vector-toggle", "n_clicks"),
+         State("vector-collapse", "is_open"),
+         prevent_initial_call=True
+     )
+     def toggle_vector_section(n_clicks, is_open):
+         """Toggle the vector addition section."""
+         new_state = not is_open
+         arrow_class = "fas fa-chevron-up" if new_state else "fas fa-chevron-down"
+         return new_state, arrow_class
+
+     @app.callback(
+         Output("vector-inputs", "children", allow_duplicate=True),
+         Input({"type": "vector-delete", "index": dash.dependencies.ALL}, "n_clicks"),
+         State("vector-inputs", "children"),
+         prevent_initial_call=True
+     )
+     def delete_vector_input(n_clicks_list, current_children):
+         """Handle deletion of vector input rows."""
+         if not n_clicks_list or not any(n_clicks_list):
+             return dash.no_update
+
+         ctx = callback_context
+         if not ctx.triggered:
+             return dash.no_update
+
+         if ctx.triggered[0]["value"] is None or ctx.triggered[0]["value"] == 0:
+             return dash.no_update
+
+         button_id = ctx.triggered[0]["prop_id"]
+         index_to_delete = json.loads(button_id.split(".")[0])["index"]
+
+         logger.info(f"Delete button clicked for index: {index_to_delete}")
+
+         # Filter out the row with the matching index
+         new_children = []
+         for child in current_children:
+             should_keep = True
+
+             if isinstance(child, dict):
+                 if 'props' in child and 'id' in child['props']:
+                     child_id = child['props']['id']
+                     if isinstance(child_id, dict) and child_id.get("type") == "vector-row" and child_id.get("index") == index_to_delete:
+                         should_keep = False
+             elif hasattr(child, 'id') and isinstance(child.id, dict):
+                 if child.id.get("type") == "vector-row" and child.id.get("index") == index_to_delete:
+                     should_keep = False
+
+             if should_keep:
+                 new_children.append(child)
+
+         # Ensure at least one input remains
+         if len(new_children) == 0:
+             new_children = [create_vector_input_row(0)]
+
+         return new_children
+
+     @app.callback(
+         [Output("vector-inputs", "children"),
+          Output("vector-inputs-count", "data")],
+         Input("add-vector-input", "n_clicks"),
+         [State("vector-inputs", "children"),
+          State("vector-inputs-count", "data")],
+         prevent_initial_call=True
+     )
+     def add_vector_input(n_clicks, current_children, count):
+         """Add a new vector input row."""
+         if n_clicks:
+             new_input = create_vector_input_row(count)
+             current_children.append(new_input)
+             return current_children, count + 1
+
+         return dash.no_update, dash.no_update
+
+     @app.callback(
+         [Output({"type": "text-input-container", "index": dash.dependencies.ALL}, "style"),
+          Output({"type": "image-input-container", "index": dash.dependencies.ALL}, "style")],
+         Input({"type": "vector-query-type", "index": dash.dependencies.ALL}, "value"),
+         prevent_initial_call=False
+     )
+     def toggle_query_type_inputs(query_types):
+         """Toggle visibility of text vs image inputs based on query type selection."""
+         text_styles = []
+         image_styles = []
+
+         for query_type in query_types:
+             if query_type == "text":
+                 text_styles.append({"display": "block"})
+                 image_styles.append({"display": "none"})
+             else:  # image
+                 text_styles.append({"display": "none"})
+                 image_styles.append({"display": "block"})
+
+         return text_styles, image_styles
+
+     @app.callback(
+         [Output("search-button", "n_clicks"),
+          Output("search-input", "value")],
+         [Input("example-1", "n_clicks"),
+          Input("example-2", "n_clicks"),
+          Input("example-3", "n_clicks"),
+          Input("example-4", "n_clicks"),
+          Input("example-5", "n_clicks"),
+          Input("example-6", "n_clicks"),
+          Input("example-7", "n_clicks")],
+         [State("search-button", "n_clicks")],
+         prevent_initial_call=True
+     )
+     def trigger_search_from_examples(click1, click2, click3, click4, click5, click6, click7, current_clicks):
+         """Trigger search when example buttons are clicked."""
+         ctx = callback_context
+         if not ctx.triggered:
+             return dash.no_update, dash.no_update
+
+         button_id = ctx.triggered[0]["prop_id"].split(".")[0]
+
+         example_queries = {
+             "example-1": "Merging edge-on galaxy",
+             "example-2": "A peculiar interacting galaxy system featuring plenty of tidal tails and a disturbed morphology",
+             "example-3": "a faint tidal stream wrapping around",
+             "example-4": "Strong gravitational lens",
+             "example-5": "A violent merger in progress with visible tidal features",
+             "example-6": "Low surface brightness",
+             "example-7": "Ring galaxy"
+         }
+
+         search_query = example_queries.get(button_id, "")
+
+         if search_query:
+             return (current_clicks or 0) + 1, search_query
+
+         return dash.no_update, dash.no_update
+
+     @app.callback(
+         [Output("search-time", "children"),
+          Output("search-results", "children"),
+          Output("search-data", "data"),
+          Output("download-button", "disabled")],
+         [Input("search-button", "n_clicks"),
+          Input("search-input", "n_submit")],
+         [State("search-input", "value"),
+          State("rmag-slider", "value")],
+         prevent_initial_call=True
+     )
+     def perform_search(n_clicks, n_submit, query, rmag_range):
+         """Perform text search."""
+         if not query or not query.strip():
+             return "", dbc.Alert("Please enter a search query", color="warning"), None, True
+
+         try:
+             # Extract min and max from slider range
+             rmag_min, rmag_max = rmag_range if rmag_range else (None, None)
+
+             start_time = time.time()
+             df = search_service.search_text(query, rmag_min=rmag_min, rmag_max=rmag_max)
+             search_time = time.time() - start_time
+
+             # Log query to XML/CSV
+             from src.utils import build_query_xml, log_query_to_csv
+             query_xml = build_query_xml(
+                 text_queries=[query],
+                 text_weights=[1.0],
+                 rmag_min=rmag_min,
+                 rmag_max=rmag_max
+             )
+             log_query_to_csv(query_xml)
+
+             # Build results grid - only load first 60 images
+             grid_items = build_galaxy_grid(df.head(DEFAULT_DISPLAY_COUNT))
+
+             # Prepare data for store
+             search_data = prepare_search_data(df, query)
+
+             # Create load more button
+             load_more_button = create_load_more_button(len(df), DEFAULT_DISPLAY_COUNT) if len(df) > DEFAULT_DISPLAY_COUNT else None
+
+             # Build filter description
+             filter_desc = ""
+             if rmag_min is not None and rmag_max is not None and (rmag_min != 13.0 or rmag_max != 20.0):
+                 filter_desc = f" + r-mag: [{rmag_min:.1f}, {rmag_max:.1f}]"
+
+             # Build complete results container
+             results_container = html.Div([
+                 html.P(f"Top {len(df)} matching galaxies (showing {min(DEFAULT_DISPLAY_COUNT, len(df))})",
+                        className="results-header mb-2 text-center"),
+                 html.P(f"'{query}'{filter_desc}",
+                        className="text-center mb-3",
+                        style={"color": "rgba(245, 245, 247, 0.6)", "font-size": "0.9rem"}),
+                 dbc.Row(grid_items, justify="center", id="search-results-grid"),
+                 load_more_button
+             ])
+
+             return "", results_container, search_data, False
+
+         except Exception as e:
+             error_msg = dbc.Alert(f"Search failed: {str(e)}", color="danger")
+             logger.error(f"Search error: {e}")
+             logger.error(f"Full traceback:\n{traceback.format_exc()}")
+             return "", error_msg, None, True
+
+     @app.callback(
+         [Output("galaxy-modal", "is_open"),
+          Output("modal-title", "children"),
+          Output("modal-image", "children"),
+          Output("modal-description", "children"),
+          Output("current-galaxy-data", "data")],
+         [Input({"type": "galaxy-image", "index": dash.dependencies.ALL}, "n_clicks"),
+          Input("close-modal", "n_clicks")],
+         [State("galaxy-modal", "is_open"),
+          State("search-data", "data")],
+         prevent_initial_call=True
+     )
+     def toggle_modal(image_clicks, close_click, is_open, search_data):
+         """Toggle galaxy detail modal."""
+         ctx = callback_context
+
+         if not ctx.triggered:
+             return False, "", "", "", None
+
+         if ctx.triggered[0]["prop_id"] == "close-modal.n_clicks":
+             return False, "", "", "", None
+
+         if search_data:
+             triggered_prop = ctx.triggered[0]["prop_id"]
+             triggered_value = ctx.triggered[0]["value"]
+
+             if triggered_value is None or triggered_value == 0:
+                 return False, "", "", "", None
+
+             if "galaxy-image" in triggered_prop:
+                 try:
+                     prop_dict = json.loads(triggered_prop.split(".n_clicks")[0])
+                     clicked_idx = prop_dict["index"]
+
+                     if clicked_idx < len(search_data["ra"]):
+                         galaxy_info = extract_galaxy_info(search_data, clicked_idx)
+                         image_element, description_element = build_modal_content(galaxy_info)
+
+                         galaxy_data = {
+                             ZILLIZ_PRIMARY_KEY: galaxy_info[ZILLIZ_PRIMARY_KEY],
+                             "ra": galaxy_info["ra"],
+                             "dec": galaxy_info["dec"],
+                             "distance": galaxy_info["distance"],
+                             "r_mag": galaxy_info["r_mag"]
+                         }
+
+                         return (
+                             True,
+                             f"Galaxy at RA={galaxy_info['ra']:.6f}, Dec={galaxy_info['dec']:.6f}",
+                             image_element,
+                             description_element,
+                             galaxy_data
+                         )
+                 except Exception:
+                     pass
+
+         return False, "", "", "", None
+
+     @app.callback(
+         Output("info-modal", "is_open"),
+         [Input("info-button", "n_clicks"),
+          Input("close-info-modal", "n_clicks")],
+         State("info-modal", "is_open"),
+         prevent_initial_call=True
+     )
+     def toggle_info_modal(info_click, close_click, is_open):
+         """Toggle info modal."""
+         ctx = callback_context
+         if ctx.triggered:
+             button_id = ctx.triggered[0]["prop_id"].split(".")[0]
+             if button_id == "info-button":
+                 return True
+             elif button_id == "close-info-modal":
+                 return False
+         return is_open
+
+     @app.callback(
+         [Output("search-results", "children", allow_duplicate=True),
+          Output("search-data", "data", allow_duplicate=True)],
+         Input("load-more-button", "n_clicks"),
+         State("search-data", "data"),
+         prevent_initial_call=True
+     )
+     def load_more_galaxies(n_clicks, search_data):
+         """Load more galaxies when the load more button is clicked."""
+         if n_clicks and search_data and "loaded_count" in search_data:
+             current_count = search_data["loaded_count"]
+             total_count = len(search_data["ra"])
+             next_count = min(current_count + LOAD_MORE_COUNT, total_count)
+
+             # Build ALL grid items (existing + new)
+             all_grid_items = []
+             for i in range(next_count):
+                 galaxy_info = extract_galaxy_info(search_data, i)
+                 grid_item = build_galaxy_card(galaxy_info, i)
+                 all_grid_items.append(grid_item)
+
+             search_data["loaded_count"] = next_count
+
+             load_more_button = create_load_more_button(total_count, next_count) if next_count < total_count else None
+
+             results_container = html.Div([
+                 html.P(f"Top {total_count} matching galaxies (showing {next_count})",
+                        className="results-header mb-2 text-center"),
+                 html.P(f"'{search_data['query']}'",
+                        className="text-center mb-3",
+                        style={"color": "rgba(245, 245, 247, 0.6)", "font-size": "0.9rem"}),
+                 dbc.Row(all_grid_items, justify="center", id="search-results-grid"),
+                 load_more_button
+             ])
+
+             return results_container, search_data
+
+         return dash.no_update, dash.no_update
+
+     @app.callback(
+         [Output("vector-inputs", "children", allow_duplicate=True),
+          Output("vector-inputs-count", "data", allow_duplicate=True),
+          Output("vector-collapse", "is_open", allow_duplicate=True),
+          Output("galaxy-modal", "is_open", allow_duplicate=True)],
+         Input("add-to-advanced-search", "n_clicks"),
+         [State("current-galaxy-data", "data"),
+          State("vector-inputs", "children"),
+          State("vector-inputs-count", "data")],
+         prevent_initial_call=True
+     )
+     def add_galaxy_to_advanced_search(n_clicks, galaxy_data, current_children, count):
+         """Add the current galaxy's RA/Dec to advanced search."""
+         if not n_clicks or not galaxy_data:
+             return dash.no_update, dash.no_update, dash.no_update, dash.no_update
+
+         # Extract galaxy coordinates
+         ra = galaxy_data.get('ra')
+         dec = galaxy_data.get('dec')
+
+         if ra is None or dec is None:
+             return dash.no_update, dash.no_update, dash.no_update, dash.no_update
+
+         # Create a new image input row with the galaxy's RA/Dec pre-filled
+         new_row = create_vector_input_row(
+             index=count,
+             query_type="image",
+             ra=ra,
+             dec=dec,
+             fov=0.025
+         )
+
+         current_children.append(new_row)
+
+         # Return updated children, incremented count, open vector panel, close modal
+         return current_children, count + 1, True, False
+
+     @app.callback(
+         [Output("search-time", "children", allow_duplicate=True),
+          Output("search-results", "children", allow_duplicate=True),
+          Output("search-data", "data", allow_duplicate=True),
+          Output("download-button", "disabled", allow_duplicate=True)],
+         Input("vector-search-button", "n_clicks"),
+         [State({"type": "vector-query-type", "index": dash.dependencies.ALL}, "value"),
+          State({"type": "vector-text", "index": dash.dependencies.ALL}, "value"),
+          State({"type": "vector-ra", "index": dash.dependencies.ALL}, "value"),
+          State({"type": "vector-dec", "index": dash.dependencies.ALL}, "value"),
+          State({"type": "vector-fov", "index": dash.dependencies.ALL}, "value"),
+          State({"type": "vector-operation", "index": dash.dependencies.ALL}, "value"),
+          State("rmag-slider", "value")],
+         prevent_initial_call=True
+     )
+     def perform_vector_search(n_clicks, query_types, text_values, ra_values, dec_values, fov_values, operations, rmag_range):
+         """Perform advanced vector search with multiple text and/or image queries."""
+         if not n_clicks:
+             return dash.no_update, dash.no_update, dash.no_update, dash.no_update
+
+         def operation_to_weight(op_str):
+             """Convert operation string to float weight."""
+             if op_str == "+":
+                 return 1.0
+             elif op_str == "-":
+                 return -1.0
+             else:
+                 # For magnitude values like "+2", "-5", etc.
+                 return float(op_str)
+
+         def weight_to_display(weight):
+             """Convert weight back to display string."""
+             if weight == 1.0:
+                 return "+"
+             elif weight == -1.0:
+                 return "-"
+             elif weight > 0:
+                 return f"+{int(weight)}"
+             else:
+                 return str(int(weight))
+
+         # Parse inputs to separate text and image queries
+         text_queries = []
+         text_weights = []
+         image_queries = []
+         image_weights = []
+
+         for i, query_type in enumerate(query_types):
+             operation = operations[i]
+             weight = operation_to_weight(operation)
+
+             if query_type == "text":
+                 text_value = text_values[i]
+                 if text_value and text_value.strip():
+                     text_queries.append(text_value.strip())
+                     text_weights.append(weight)
+             else:  # image
+                 ra = ra_values[i]
+                 dec = dec_values[i]
+                 fov = fov_values[i] if fov_values[i] else 0.025
+
+                 if ra is not None and dec is not None:
+                     image_queries.append({
+                         'ra': float(ra),
+                         'dec': float(dec),
+                         'fov': float(fov)
+                     })
+                     image_weights.append(weight)
+
+         # Validate that we have at least one query
+         if not text_queries and not image_queries:
+             return "", dbc.Alert("Please enter at least one text or image query", color="warning"), None, True
+
+         try:
+             # Extract min and max from slider range
+             rmag_min, rmag_max = rmag_range if rmag_range else (None, None)
+
+             # Perform advanced search
+             start_time = time.time()
+             df = search_service.search_advanced(
+                 text_queries=text_queries if text_queries else None,
+                 text_weights=text_weights if text_weights else None,
+                 image_queries=image_queries if image_queries else None,
+                 image_weights=image_weights if image_weights else None,
+                 rmag_min=rmag_min,
+                 rmag_max=rmag_max
+             )
+             search_time = time.time() - start_time
+
+             # Log query to XML/CSV
+             from src.utils import build_query_xml, log_query_to_csv
+             query_xml = build_query_xml(
+                 text_queries=text_queries if text_queries else None,
+                 text_weights=text_weights if text_weights else None,
+                 image_queries=image_queries if image_queries else None,
+                 image_weights=image_weights if image_weights else None,
+                 rmag_min=rmag_min,
+                 rmag_max=rmag_max
+             )
+             log_query_to_csv(query_xml)
+
+             # Build results grid
+             grid_items = build_galaxy_grid(df.head(DEFAULT_DISPLAY_COUNT))
+
+             # Build query description for storage (simple text)
+             query_desc_parts = []
+             for query, weight in zip(text_queries, text_weights):
+                 op_display = weight_to_display(weight)
+                 query_desc_parts.append(f"{op_display} text:'{query}'")
+             for img_query, weight in zip(image_queries, image_weights):
+                 op_display = weight_to_display(weight)
+                 query_desc_parts.append(f"{op_display} image:(RA={img_query['ra']:.2f}, Dec={img_query['dec']:.2f})")
+             query_description = " ".join(query_desc_parts)
+
+             # Build query display with thumbnails for images
+             query_display_parts = []
+             for query, weight in zip(text_queries, text_weights):
+                 op_display = weight_to_display(weight)
+                 query_display_parts.append(html.Span(f"{op_display} text:'{query}' ", style={"margin-right": "8px"}))
+
+             for img_query, weight in zip(image_queries, image_weights):
+                 op_display = weight_to_display(weight)
521
+ # Generate thumbnail URL
522
+ from src.utils import cutout_url
523
+ thumbnail_url = cutout_url(
524
+ img_query['ra'],
525
+ img_query['dec'],
526
+ fov=img_query.get('fov', 0.025),
527
+ size=64
528
+ )
529
+ query_display_parts.append(html.Span([
530
+ f"{op_display} ",
531
+ html.Img(
532
+ src=thumbnail_url,
533
+ style={
534
+ "width": "128px",
535
+ "height": "128px",
536
+ "vertical-align": "middle",
537
+ "margin": "0 4px",
538
+ "border-radius": "4px",
539
+ "border": "1px solid rgba(255, 255, 255, 0.2)"
540
+ }
541
+ )
542
+ ], style={"margin-right": "8px", "display": "inline-block"}))
543
+
544
+ # Build filter description
545
+ filter_desc = ""
546
+ if rmag_min is not None and rmag_max is not None and (rmag_min != 13.0 or rmag_max != 20.0):
547
+ filter_desc = f" + r-mag: [{rmag_min:.1f}, {rmag_max:.1f}]"
548
+
549
+ # Prepare data for store
550
+ search_data = prepare_search_data(df, query_description, is_vector_search=True)
551
+ search_data["text_queries"] = text_queries
552
+ search_data["text_weights"] = text_weights
553
+ search_data["image_queries"] = image_queries
554
+ search_data["image_weights"] = image_weights
555
+
556
+ # Create load more button
557
+ load_more_button = create_load_more_button(len(df), DEFAULT_DISPLAY_COUNT) if len(df) > DEFAULT_DISPLAY_COUNT else None
558
+
559
+ # Build results container
560
+ results_container = html.Div([
561
+ html.P(f"Top {len(df)} matching galaxies (showing {min(DEFAULT_DISPLAY_COUNT, len(df))})",
562
+ className="results-header mb-2 text-center"),
563
+ html.P(
564
+ query_display_parts + ([f"{filter_desc}"] if filter_desc else []),
565
+ className="text-center mb-3",
566
+ style={"color": "rgba(245, 245, 247, 0.6)", "font-size": "0.9rem"}
567
+ ),
568
+ dbc.Row(grid_items, justify="center", id="search-results-grid"),
569
+ load_more_button
570
+ ])
571
+
572
+ return "", results_container, search_data, False
573
+
574
+ except Exception as e:
575
+ error_msg = dbc.Alert(f"Advanced search failed: {str(e)}", color="danger")
576
+ logger.error(f"Advanced search error: {e}")
577
+ logger.error(f"Full traceback:\n{traceback.format_exc()}")
578
+ return "", error_msg, None, True
579
+
580
+ @app.callback(
581
+ Output("download-csv", "data"),
582
+ Input("download-button", "n_clicks"),
583
+ State("search-data", "data"),
584
+ prevent_initial_call=True
585
+ )
586
+ def download_csv(n_clicks, search_data):
587
+ """Download search results as CSV."""
588
+ if n_clicks and search_data:
589
+ # Create DataFrame with the search results
590
+ df = pd.DataFrame({
591
+ ZILLIZ_PRIMARY_KEY: search_data[ZILLIZ_PRIMARY_KEY],
592
+ 'ra': search_data['ra'],
593
+ 'dec': search_data['dec'],
594
+ 'r_mag': search_data['r_mag'],
595
+ 'distance': search_data['distance'],
596
+ 'cutout_url': search_data['cutout_url']
597
+ })
598
+
599
+ # Create CSV string
600
+ csv_string = df.to_csv(index=False)
601
+
602
+ # Return download data
603
+ return dict(content=csv_string, filename="galaxy_search_results.csv")
604
+
605
+ return dash.no_update
606
+
607
+
608
+ # Helper functions for callbacks
609
+
610
+ def build_galaxy_grid(df: pd.DataFrame) -> list:
611
+ """Build galaxy grid items from DataFrame.
612
+
613
+ Args:
614
+ df: DataFrame with galaxy data
615
+
616
+ Returns:
617
+ List of Dash components
618
+ """
619
+ grid_items = []
620
+ for i, row in df.iterrows():
621
+ galaxy_info = {
622
+ ZILLIZ_PRIMARY_KEY: row[ZILLIZ_PRIMARY_KEY],
623
+ "ra": row['ra'],
624
+ "dec": row['dec'],
625
+ "distance": row['distance'],
626
+ "r_mag": row['r_mag'],
627
+ "cutout_url": row['cutout_url']
628
+ }
629
+ grid_item = build_galaxy_card(galaxy_info, i)
630
+ grid_items.append(grid_item)
631
+ return grid_items
632
+
633
+
634
+ def build_galaxy_card(galaxy_info: dict, index: int):
635
+ """Build a single galaxy card component.
636
+
637
+ Args:
638
+ galaxy_info: Dictionary with galaxy information
639
+ index: Index of the galaxy in the results
640
+
641
+ Returns:
642
+ Dash Bootstrap Col component
643
+ """
644
+ return dbc.Col([
645
+ html.Div([
646
+ html.Div([
647
+ html.Img(
648
+ src=galaxy_info["cutout_url"],
649
+ style={
650
+ "width": IMAGE_WIDTH,
651
+ "height": IMAGE_HEIGHT,
652
+ "object-fit": "cover",
653
+ "cursor": "pointer",
654
+ "border-radius": "8px"
655
+ },
656
+ id={"type": "galaxy-image", "index": index},
657
+ className="hover-shadow"
658
+ ),
659
+ html.Div([
660
+ html.Small(f"r = {galaxy_info['r_mag']:.2f} mag", className="score-badge")
661
+ ], style={
662
+ "position": "absolute",
663
+ "bottom": "8px",
664
+ "right": "8px"
665
+ })
666
+ ], style={"position": "relative"})
667
+ ])
668
+ ], width=6, md=4, lg=2, className="mb-2 px-1")
669
+
670
+
671
+ def prepare_search_data(df: pd.DataFrame, query: str, is_vector_search: bool = False) -> dict:
672
+ """Prepare search data for storage.
673
+
674
+ Args:
675
+ df: DataFrame with search results
676
+ query: Search query string
677
+ is_vector_search: Whether this is a vector search
678
+
679
+ Returns:
680
+ Dictionary with search data
681
+ """
682
+ return {
683
+ ZILLIZ_PRIMARY_KEY: df[ZILLIZ_PRIMARY_KEY].tolist(),
684
+ "ra": df['ra'].tolist(),
685
+ "dec": df['dec'].tolist(),
686
+ "distance": df['distance'].tolist(),
687
+ "r_mag": df['r_mag'].tolist(),
688
+ "cutout_url": df['cutout_url'].tolist(),
689
+ "loaded_count": DEFAULT_DISPLAY_COUNT,
690
+ "query": query,
691
+ "is_vector_search": is_vector_search
692
+ }
693
+
694
+
695
+ def extract_galaxy_info(search_data: dict, index: int) -> dict:
696
+ """Extract galaxy info from search data at given index.
697
+
698
+ Args:
699
+ search_data: Dictionary with search data
700
+ index: Index of the galaxy
701
+
702
+ Returns:
703
+ Dictionary with galaxy information
704
+ """
705
+ return {
706
+ ZILLIZ_PRIMARY_KEY: search_data[ZILLIZ_PRIMARY_KEY][index],
707
+ "ra": search_data["ra"][index],
708
+ "dec": search_data["dec"][index],
709
+ "distance": search_data["distance"][index],
710
+ "r_mag": search_data["r_mag"][index],
711
+ "cutout_url": search_data["cutout_url"][index]
712
+ }
713
+
714
+
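prepare_search_data and extract_galaxy_info are inverse views of the same column-oriented store: the former flattens a results DataFrame into per-column lists for the dcc.Store, the latter reassembles one row. A minimal sketch of that invariant (the one-row frame and its values are made up for illustration; "ra_dec" is the primary key of the default collection per src/config.py):

    import pandas as pd

    # Hypothetical one-row results frame with the columns the helpers expect
    df = pd.DataFrame([{"ra_dec": "150.1_2.2", "ra": 150.1, "dec": 2.2,
                        "distance": 0.42, "r_mag": 17.3,
                        "cutout_url": "https://example.org/cut.jpg"}])

    data = prepare_search_data(df, query="ring galaxy")
    info = extract_galaxy_info(data, 0)
    assert info["ra"] == 150.1
    assert info["cutout_url"] == df["cutout_url"][0]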
+ def build_modal_content(galaxy_info: dict) -> tuple:
+     """Build modal image and description content.
+
+     Args:
+         galaxy_info: Dictionary with galaxy information
+
+     Returns:
+         Tuple of (image_element, description_element)
+     """
+     image_element = html.Img(
+         src=galaxy_info["cutout_url"],
+         style={"width": "100%", "max-width": "500px", "height": "auto"}
+     )
+
+     # Format primary key label (convert snake_case to Title Case)
+     pk_label = ZILLIZ_PRIMARY_KEY.replace("_", " ").title()
+
+     description_element = html.Div([
+         html.Div([
+             html.Span(f"{pk_label}: {galaxy_info[ZILLIZ_PRIMARY_KEY]}", className="d-inline-block mb-0",
+                       style={"color": "rgba(245, 245, 247, 0.7)", "font-size": "0.9rem"}),
+         ], className="mb-2"),
+         html.Div([
+             html.Span(f"RA: {galaxy_info['ra']:.6f}", className="d-inline-block mb-0",
+                       style={"color": "rgba(245, 245, 247, 0.7)", "font-size": "0.9rem"}),
+             html.Span(" • ", className="mx-2", style={"color": "rgba(245, 245, 247, 0.5)"}),
+             html.Span(f"Dec: {galaxy_info['dec']:.6f}", className="d-inline-block mb-0",
+                       style={"color": "rgba(245, 245, 247, 0.7)", "font-size": "0.9rem"}),
+         ], className="mb-2"),
+         html.Div([
+             html.Span(f"r_mag: {galaxy_info['r_mag']:.2f}", className="d-inline-block mb-0",
+                       style={"color": "rgba(245, 245, 247, 0.7)", "font-size": "0.9rem"}),
+             html.Span(" • ", className="mx-2", style={"color": "rgba(245, 245, 247, 0.5)"}),
+             html.Span(f"Distance: {galaxy_info['distance']:.4f}", className="d-inline-block mb-0",
+                       style={"color": "rgba(245, 245, 247, 0.7)", "font-size": "0.9rem"}),
+         ], className="mb-3"),
+     ])
+
+     return image_element, description_element
+
+
+ def create_load_more_button(total_count: int, current_count: int):
+     """Create a load more button.
+
+     Args:
+         total_count: Total number of results
+         current_count: Number of currently loaded results
+
+     Returns:
+         Dash Bootstrap Button component
+     """
+     remaining = total_count - current_count
+     button_text = f"Load next {min(LOAD_MORE_COUNT, remaining)} galaxies"
+
+     return dbc.Button(
+         button_text,
+         id="load-more-button",
+         color="secondary",
+         className="mt-3",
+         style={"width": "100%"}
+     )
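For reference, the operation strings offered by the dropdown in src/components.py round-trip cleanly through the operation_to_weight and weight_to_display helpers defined inside perform_vector_search above. A standalone sketch of the same mapping:

    # Round-trip check for the operation/weight encoding (illustrative only)
    for op in ["+", "-", "+2", "-5", "+10"]:
        weight = 1.0 if op == "+" else -1.0 if op == "-" else float(op)
        display = ("+" if weight == 1.0 else "-" if weight == -1.0
                   else f"+{int(weight)}" if weight > 0 else str(int(weight)))
        assert display == op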
src/components.py ADDED
@@ -0,0 +1,821 @@
+ """UI components for AION Search."""
2
+
3
+ from dash import dcc, html
4
+ import dash_bootstrap_components as dbc
5
+ from src.config import TOTAL_GALAXIES
6
+
7
+
8
+ def get_app_theme() -> str:
9
+ """Get the custom CSS theme for the app.
10
+
11
+ Returns:
12
+ HTML string with embedded CSS
13
+ """
14
+ return '''
15
+ <!DOCTYPE html>
16
+ <html>
17
+ <head>
18
+ {%metas%}
19
+ <title>galaxy semantic search</title>
20
+ {%favicon%}
21
+ {%css%}
22
+ <style>
23
+ @import url('https://fonts.googleapis.com/css2?family=SF+Pro+Display:wght@200;300;400;500;600&display=swap');
24
+
25
+ * {
26
+ -webkit-font-smoothing: antialiased;
27
+ -moz-osx-font-smoothing: grayscale;
28
+ }
29
+
30
+ body {
31
+ font-family: -apple-system, BlinkMacSystemFont, 'SF Pro Display', 'Inter', sans-serif;
32
+ background: #000000;
33
+ color: #F5F5F7;
34
+ min-height: 100vh;
35
+ margin: 0;
36
+ overflow-x: hidden;
37
+ }
38
+
39
+ body::before {
40
+ content: '';
41
+ position: fixed;
42
+ top: -50%;
43
+ left: -50%;
44
+ width: 200%;
45
+ height: 200%;
46
+ background: radial-gradient(circle at 20% 80%, #1C1C1E 0%, transparent 50%),
47
+ radial-gradient(circle at 80% 20%, #161618 0%, transparent 50%),
48
+ radial-gradient(circle at 40% 40%, #0A0A0B 0%, transparent 50%);
49
+ z-index: -1;
50
+ }
51
+
52
+ .container-fluid {
53
+ background-color: transparent !important;
54
+ padding-top: 2rem !important;
55
+ }
56
+
57
+ .hover-shadow {
58
+ transition: all 0.4s cubic-bezier(0.25, 0.46, 0.45, 0.94);
59
+ border: 0.5px solid rgba(255, 255, 255, 0.1);
60
+ background: #0A0A0B;
61
+ overflow: hidden;
62
+ position: relative;
63
+ }
64
+
65
+ .hover-shadow::before {
66
+ content: '';
67
+ position: absolute;
68
+ top: 0;
69
+ left: 0;
70
+ right: 0;
71
+ bottom: 0;
72
+ background: linear-gradient(135deg, rgba(255,255,255,0.05) 0%, transparent 100%);
73
+ opacity: 0;
74
+ transition: opacity 0.4s ease;
75
+ }
76
+
77
+ .hover-shadow:hover {
78
+ transform: translateY(-4px) scale(1.02);
79
+ box-shadow: 0 20px 40px rgba(0, 0, 0, 0.8),
80
+ 0 0 60px rgba(255, 255, 255, 0.05) !important;
81
+ border-color: rgba(255, 255, 255, 0.2);
82
+ }
83
+
84
+ .hover-shadow:hover::before {
85
+ opacity: 1;
86
+ }
87
+
88
+ .search-container {
89
+ background: rgba(255, 255, 255, 0.05);
90
+ backdrop-filter: blur(40px) saturate(180%);
91
+ -webkit-backdrop-filter: blur(40px) saturate(180%);
92
+ border-radius: 16px;
93
+ padding: 1.25rem;
94
+ border: 0.5px solid rgba(255, 255, 255, 0.1);
95
+ box-shadow: 0 8px 32px rgba(0, 0, 0, 0.4),
96
+ inset 0 1px 0 rgba(255, 255, 255, 0.1);
97
+ }
98
+
99
+ .example-button {
100
+ background: #E5E5E7 !important;
101
+ background-color: #E5E5E7 !important;
102
+ border: 0.5px solid #D1D1D3 !important;
103
+ color: #1A1A1A !important;
104
+ font-weight: 500;
105
+ font-size: 0.75rem !important;
106
+ padding: 0.4rem 0.9rem !important;
107
+ transition: all 0.3s cubic-bezier(0.25, 0.46, 0.45, 0.94);
108
+ letter-spacing: 0.01em;
109
+ }
110
+
111
+ .example-button:hover {
112
+ background: #F0F0F2 !important;
113
+ background-color: #F0F0F2 !important;
114
+ border-color: #C0C0C2 !important;
115
+ color: #000000 !important;
116
+ transform: translateY(-1px);
117
+ box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1);
118
+ }
119
+
120
+ .example-button i {
121
+ color: #2A2A2A !important;
122
+ }
123
+
124
+ .galaxy-title {
125
+ color: #F5F5F7;
126
+ font-weight: 200;
127
+ font-size: 1.75rem;
128
+ letter-spacing: -0.03em;
129
+ background: linear-gradient(180deg, #F5F5F7 0%, rgba(245, 245, 247, 0.6) 100%);
130
+ -webkit-background-clip: text;
131
+ -webkit-text-fill-color: transparent;
132
+ animation: float 6s ease-in-out infinite;
133
+ }
134
+
135
+ .modal-content {
136
+ background: #1C1C1E;
137
+ border: 0.5px solid rgba(255, 255, 255, 0.1);
138
+ border-radius: 16px;
139
+ backdrop-filter: blur(20px);
140
+ }
141
+
142
+ .modal-header, .modal-footer {
143
+ border-color: rgba(255, 255, 255, 0.05);
144
+ }
145
+
146
+ .form-control:focus, .form-control:active {
147
+ background-color: rgba(255, 255, 255, 0.05) !important;
148
+ border-color: rgba(255, 255, 255, 0.3) !important;
149
+ box-shadow: 0 0 0 3px rgba(255, 255, 255, 0.05) !important;
150
+ color: #F5F5F7 !important;
151
+ }
152
+
153
+ .form-control {
154
+ background-color: rgba(255, 255, 255, 0.03) !important;
155
+ border: 0.5px solid rgba(255, 255, 255, 0.1) !important;
156
+ color: #F5F5F7 !important;
157
+ font-size: 0.95rem !important;
158
+ font-weight: 300;
159
+ letter-spacing: 0.01em;
160
+ }
161
+
162
+ .form-control::placeholder {
163
+ color: rgba(245, 245, 247, 0.4) !important;
164
+ }
165
+
166
+ .btn-primary {
167
+ background: rgba(255, 255, 255, 0.8);
168
+ color: #000;
169
+ border: none;
170
+ font-weight: 600;
171
+ font-size: 0.9rem;
172
+ padding: 0.6rem 1.8rem;
173
+ transition: all 0.3s cubic-bezier(0.25, 0.46, 0.45, 0.94);
174
+ letter-spacing: 0.02em;
175
+ }
176
+
177
+ .btn-primary:hover {
178
+ background: rgba(255, 255, 255, 0.95);
179
+ transform: translateY(-1px);
180
+ box-shadow: 0 8px 24px rgba(255, 255, 255, 0.15);
181
+ }
182
+
183
+ .results-header {
184
+ color: rgba(245, 245, 247, 0.6);
185
+ font-weight: 300;
186
+ font-size: 0.85rem !important;
187
+ letter-spacing: 0.05em;
188
+ text-transform: uppercase;
189
+ }
190
+
191
+ .time-breakdown {
192
+ color: rgba(245, 245, 247, 0.4);
193
+ font-size: 0.7rem;
194
+ font-weight: 300;
195
+ letter-spacing: 0.02em;
196
+ }
197
+
198
+ .galaxy-count {
199
+ color: rgba(245, 245, 247, 0.5);
200
+ font-weight: 300;
201
+ font-size: 0.85rem;
202
+ letter-spacing: 0.05em;
203
+ text-transform: uppercase;
204
+ }
205
+
206
+ .score-badge {
207
+ background: rgba(255, 255, 255, 0.1);
208
+ backdrop-filter: blur(10px);
209
+ color: rgba(245, 245, 247, 0.9);
210
+ font-size: 0.65rem !important;
211
+ padding: 3px 8px !important;
212
+ border-radius: 6px;
213
+ font-weight: 500;
214
+ letter-spacing: 0.02em;
215
+ border: 0.5px solid rgba(255, 255, 255, 0.1);
216
+ }
217
+
218
+ .info-button {
219
+ color: rgba(245, 245, 247, 0.5) !important;
220
+ font-size: 0.75rem !important;
221
+ opacity: 0.8;
222
+ transition: all 0.3s ease;
223
+ letter-spacing: 0.02em;
224
+ }
225
+
226
+ .info-button:hover {
227
+ opacity: 1;
228
+ color: #F5F5F7 !important;
229
+ }
230
+
231
+ ::-webkit-scrollbar {
232
+ width: 8px;
233
+ }
234
+
235
+ ::-webkit-scrollbar-track {
236
+ background: rgba(255, 255, 255, 0.02);
237
+ }
238
+
239
+ ::-webkit-scrollbar-thumb {
240
+ background: rgba(255, 255, 255, 0.1);
241
+ border-radius: 4px;
242
+ }
243
+
244
+ ::-webkit-scrollbar-thumb:hover {
245
+ background: rgba(255, 255, 255, 0.2);
246
+ }
247
+
248
+ .btn-link {
249
+ text-decoration: none !important;
250
+ }
251
+
252
+ .input-group-text {
253
+ background: rgba(255, 255, 255, 0.03) !important;
254
+ border: 0.5px solid rgba(255, 255, 255, 0.1) !important;
255
+ color: rgba(245, 245, 247, 0.5) !important;
256
+ }
257
+
258
+ .spinner-border {
259
+ color: rgba(245, 245, 247, 0.5) !important;
260
+ }
261
+
262
+ @supports (backdrop-filter: blur(40px)) {
263
+ .search-container {
264
+ background: rgba(255, 255, 255, 0.03);
265
+ }
266
+ }
267
+
268
+ @keyframes float {
269
+ 0%, 100% { transform: translateY(0px); }
270
+ 50% { transform: translateY(-3px); }
271
+ }
272
+
273
+ .download-button {
274
+ background: rgba(255, 255, 255, 0.05);
275
+ border: 0.5px solid rgba(255, 255, 255, 0.1) !important;
276
+ color: rgba(245, 245, 247, 0.6) !important;
277
+ font-size: 0.75rem !important;
278
+ padding: 0.4rem 0.8rem !important;
279
+ transition: all 0.3s cubic-bezier(0.25, 0.46, 0.45, 0.94);
280
+ letter-spacing: 0.01em;
281
+ margin-left: 0.5rem;
282
+ }
283
+
284
+ .download-button:hover {
285
+ background: rgba(255, 255, 255, 0.08) !important;
286
+ border-color: rgba(255, 255, 255, 0.15) !important;
287
+ color: rgba(245, 245, 247, 0.8) !important;
288
+ transform: translateY(-1px);
289
+ }
290
+
291
+ .download-button i {
292
+ color: rgba(245, 245, 247, 0.6) !important;
293
+ }
294
+
295
+ .download-button:hover i {
296
+ color: rgba(245, 245, 247, 0.8) !important;
297
+ }
298
+
299
+ .refinement-toggle {
300
+ background: rgba(255, 255, 255, 0.03);
301
+ border: 0.5px solid rgba(255, 255, 255, 0.1);
302
+ color: rgba(245, 245, 247, 0.5);
303
+ font-size: 0.7rem;
304
+ padding: 0.5rem 0.75rem;
305
+ transition: all 0.3s ease;
306
+ cursor: pointer;
307
+ display: flex;
308
+ align-items: center;
309
+ justify-content: center;
310
+ gap: 0.4rem;
311
+ margin: 0;
312
+ letter-spacing: 0.05em;
313
+ text-transform: uppercase;
314
+ border-radius: 8px;
315
+ height: 100%;
316
+ }
317
+
318
+ .refinement-toggle:hover {
319
+ background: rgba(255, 255, 255, 0.05);
320
+ border-color: rgba(255, 255, 255, 0.15);
321
+ color: rgba(245, 245, 247, 0.8);
322
+ transform: translateY(-1px);
323
+ }
324
+
325
+ .refinement-toggle i {
326
+ transition: transform 0.3s ease;
327
+ font-size: 0.65rem;
328
+ }
329
+
330
+ .refinement-toggle.expanded i {
331
+ transform: rotate(180deg);
332
+ }
333
+
334
+ .refinement-container {
335
+ background: rgba(255, 255, 255, 0.03);
336
+ border: 0.5px solid rgba(255, 255, 255, 0.1);
337
+ border-radius: 12px;
338
+ padding: 1.5rem;
339
+ margin-top: 1rem;
340
+ backdrop-filter: blur(20px);
341
+ }
342
+
343
+ .refinement-label {
344
+ color: rgba(245, 245, 247, 0.6);
345
+ font-size: 0.85rem;
346
+ font-weight: 300;
347
+ margin-bottom: 0.5rem;
348
+ }
349
+
350
+ .vector-operation-select {
351
+ background-color: rgba(255, 255, 255, 0.03) !important;
352
+ border: 0.5px solid rgba(255, 255, 255, 0.1) !important;
353
+ color: #F5F5F7 !important;
354
+ font-size: 0.9rem !important;
355
+ font-weight: 500;
356
+ text-align: center;
357
+ }
358
+
359
+ .vector-operation-select option {
360
+ background-color: #1C1C1E;
361
+ color: #F5F5F7;
362
+ }
363
+
364
+ .vector-query-type-select {
365
+ background-color: rgba(255, 255, 255, 0.03) !important;
366
+ border: 0.5px solid rgba(255, 255, 255, 0.1) !important;
367
+ color: #F5F5F7 !important;
368
+ font-size: 0.9rem !important;
369
+ font-weight: 500;
370
+ }
371
+
372
+ .vector-query-type-select option {
373
+ background-color: #1C1C1E;
374
+ color: #F5F5F7;
375
+ }
376
+
377
+ .btn-add-vector {
378
+ background: rgba(255, 255, 255, 0.05);
379
+ border: 0.5px solid rgba(255, 255, 255, 0.1) !important;
380
+ color: rgba(245, 245, 247, 0.6) !important;
381
+ font-size: 0.8rem !important;
382
+ padding: 0.4rem 0.8rem !important;
383
+ transition: all 0.3s cubic-bezier(0.25, 0.46, 0.45, 0.94);
384
+ letter-spacing: 0.01em;
385
+ }
386
+
387
+ .btn-add-vector:hover {
388
+ background: rgba(255, 255, 255, 0.08) !important;
389
+ border-color: rgba(255, 255, 255, 0.15) !important;
390
+ color: rgba(245, 245, 247, 0.8) !important;
391
+ transform: translateY(-1px);
392
+ }
393
+
394
+ .btn-add-vector i {
395
+ color: rgba(245, 245, 247, 0.6) !important;
396
+ }
397
+
398
+ .btn-add-vector:hover i {
399
+ color: rgba(245, 245, 247, 0.8) !important;
400
+ }
401
+
402
+ .vector-delete-btn {
403
+ opacity: 0.5;
404
+ transition: opacity 0.2s ease;
405
+ padding: 0.25rem 0.5rem !important;
406
+ border: none !important;
407
+ background: none !important;
408
+ }
409
+
410
+ .vector-delete-btn:hover {
411
+ opacity: 1;
412
+ background: rgba(220, 53, 69, 0.1) !important;
413
+ border-radius: 4px;
414
+ }
415
+
416
+ .vector-delete-btn i {
417
+ font-size: 0.9rem;
418
+ }
419
+
420
+ /* Range slider styling */
421
+ .rmag-slider .rc-slider-rail {
422
+ background-color: rgba(255, 255, 255, 0.1);
423
+ height: 4px;
424
+ }
425
+
426
+ .rmag-slider .rc-slider-track {
427
+ background: linear-gradient(90deg, rgba(255, 255, 255, 0.6) 0%, rgba(255, 255, 255, 0.8) 100%);
428
+ height: 4px;
429
+ }
430
+
431
+ .rmag-slider .rc-slider-handle {
432
+ border: 2px solid rgba(255, 255, 255, 0.8);
433
+ background-color: #F5F5F7;
434
+ opacity: 1;
435
+ width: 16px;
436
+ height: 16px;
437
+ margin-top: -6px;
438
+ }
439
+
440
+ .rmag-slider .rc-slider-handle:hover,
441
+ .rmag-slider .rc-slider-handle:active,
442
+ .rmag-slider .rc-slider-handle:focus {
443
+ border-color: rgba(255, 255, 255, 0.95);
444
+ box-shadow: 0 0 0 5px rgba(255, 255, 255, 0.1);
445
+ }
446
+
447
+ .rmag-slider .rc-slider-mark-text {
448
+ color: rgba(245, 245, 247, 0.5);
449
+ font-size: 0.75rem;
450
+ font-weight: 300;
451
+ }
452
+
453
+ .rmag-slider .rc-slider-tooltip-inner {
454
+ background-color: rgba(255, 255, 255, 0.9);
455
+ color: #000;
456
+ font-size: 0.75rem;
457
+ font-weight: 500;
458
+ padding: 4px 8px;
459
+ border-radius: 4px;
460
+ box-shadow: 0 2px 8px rgba(0, 0, 0, 0.3);
461
+ }
462
+
463
+ .rmag-slider .rc-slider-tooltip-arrow {
464
+ border-top-color: rgba(255, 255, 255, 0.9);
465
+ }
466
+ </style>
467
+ </head>
468
+ <body>
469
+ {%app_entry%}
470
+ <footer>
471
+ {%config%}
472
+ {%scripts%}
473
+ {%renderer%}
474
+ </footer>
475
+ </body>
476
+ </html>
477
+ '''
478
+
479
+
480
+ def create_header():
481
+ """Create the app header with title and galaxy count."""
482
+ return dbc.Row([
483
+ dbc.Col([
484
+ html.Div([
485
+ html.H1("galaxy semantic search", className="galaxy-title text-center mb-1"),
486
+ html.Div(id="galaxy-count", className="galaxy-count text-center")
487
+ ], className="text-center mb-3")
488
+ ])
489
+ ])
490
+
491
+
492
+ def create_rmag_filter_panel():
493
+ """Create the r_mag filter panel."""
494
+ return dbc.Row([
495
+ dbc.Col([
496
+ html.Div([
497
+ dbc.Row([
498
+ dbc.Col([
499
+ html.Div("r-mag",
500
+ style={"color": "rgba(245, 245, 247, 0.6)",
501
+ "font-size": "0.85rem",
502
+ "font-weight": "300",
503
+ "text-align": "center"})
504
+ ], width=1, className="d-flex align-items-center justify-content-center", style={"padding": "0"}),
505
+ dbc.Col([
506
+ dcc.RangeSlider(
507
+ id="rmag-slider",
508
+ min=13.0,
509
+ max=20.0,
510
+ step=0.1,
511
+ value=[13.0, 20.0],
512
+ marks={13: '13', 15: '15', 17: '17', 19: '19', 20: '20'},
513
+ tooltip={"placement": "bottom", "always_visible": True},
514
+ className="rmag-slider"
515
+ )
516
+ ], width=11)
517
+ ], className="align-items-center", style={"margin": "0"})
518
+ ], style={"padding": "0.5rem 0"})
519
+ ], width=12)
520
+ ], className="mt-2")
521
+
522
+
523
+ def create_search_container():
524
+ """Create the main search input container with examples and input."""
525
+ return dbc.Row([
526
+ dbc.Col([
527
+ html.Div([
528
+ # Info button in top right
529
+ html.Div([
530
+ dbc.Button([
531
+ html.I(className="fas fa-info-circle")
532
+ ], id="info-button", color="link", size="sm",
533
+ className="info-button")
534
+ ], style={"position": "absolute", "top": "8px", "right": "8px", "z-index": "1000"}),
535
+
536
+ # Example search buttons
537
+ html.Div([
538
+ html.P("Try these examples:", className="text-center mb-2",
539
+ style={"color": "rgba(245, 245, 247, 0.5)", "font-weight": "300",
540
+ "font-size": "0.75rem", "letter-spacing": "0.02em"}),
541
+ html.Div([
542
+ dbc.Button([html.I(className="fas fa-satellite me-2"), "Merging edge-on galaxy"],
543
+ id="example-1", className="example-button me-2 mb-2", size="sm", color="light"),
544
+ dbc.Button([html.I(className="fas fa-water me-2"), "Tidal"],
545
+ id="example-2", className="example-button me-2 mb-2", size="sm", color="light"),
546
+ dbc.Button([html.I(className="fas fa-stream me-2"), "Stream"],
547
+ id="example-3", className="example-button me-2 mb-2", size="sm", color="light"),
548
+ dbc.Button([html.I(className="fas fa-glasses me-2"), "Gravitational lens"],
549
+ id="example-4", className="example-button me-2 mb-2", size="sm", color="light"),
550
+ dbc.Button([html.I(className="fas fa-explosion me-2"), "A violent merger"],
551
+ id="example-5", className="example-button me-2 mb-2", size="sm", color="light"),
552
+ dbc.Button([html.I(className="fas fa-moon me-2"), "Low surface brightness"],
553
+ id="example-6", className="example-button me-2 mb-2", size="sm", color="light"),
554
+ dbc.Button([html.I(className="fas fa-ring me-2"), "Ring galaxy"],
555
+ id="example-7", className="example-button mb-2", size="sm", color="light")
556
+ ], className="text-center")
557
+ ], className="mb-3"),
558
+
559
+ # Search input
560
+ dbc.InputGroup([
561
+ dbc.InputGroupText(html.I(className="fas fa-search")),
562
+ dbc.Input(
563
+ id="search-input",
564
+ placeholder="Describe the galaxy you're looking for...",
565
+ type="text",
566
+ n_submit=0
567
+ ),
568
+ dbc.Button("Search",
569
+ id="search-button", color="primary", n_clicks=0),
570
+ dbc.Button([
571
+ html.I(className="fas fa-download")
572
+ ], id="download-button", color="secondary", n_clicks=0,
573
+ className="download-button", size="sm",
574
+ disabled=True)
575
+ ])
576
+ ], className="search-container", style={"position": "relative"}),
577
+
578
+ # r_mag filter
579
+ create_rmag_filter_panel(),
580
+
581
+ # Vector Addition toggle button
582
+ dbc.Row([
583
+ dbc.Col([
584
+ html.Button([
585
+ html.I(className="fas fa-chevron-down", id="vector-arrow"),
586
+ "Advanced Search (Vector Addition / Images)"
587
+ ], id="vector-toggle", className="refinement-toggle w-100")
588
+ ], width=12)
589
+ ], className="mt-3"),
590
+
591
+ # Vector Addition UI - Collapsible section
592
+ create_vector_addition_panel()
593
+ ], width=12, lg=11, className="mx-auto")
594
+ ], className="mb-3")
595
+
596
+
597
+ def create_vector_addition_panel():
598
+ """Create the advanced search (vector addition) collapsible panel."""
599
+ return dbc.Collapse([
600
+ html.Div([
601
+ html.P("Advanced Search: Combine multiple text and/or image queries using vector addition/subtraction:", className="refinement-label"),
602
+ html.Div(id="vector-inputs", children=[
603
+ # Initial input
604
+ create_vector_input_row(0)
605
+ ]),
606
+ dbc.Row([
607
+ dbc.Col([
608
+ dbc.Button(
609
+ [html.I(className="fas fa-plus me-2"), "Add Query"],
610
+ id="add-vector-input",
611
+ color="secondary",
612
+ size="sm",
613
+ className="me-2 btn-add-vector"
614
+ )
615
+ ], width=6),
616
+ dbc.Col([
617
+ dbc.Button(
618
+ "Advanced Search",
619
+ id="vector-search-button",
620
+ className="btn-primary w-100",
621
+ n_clicks=0
622
+ )
623
+ ], width=6)
624
+ ], className="mt-3")
625
+ ], className="refinement-container")
626
+ ], id="vector-collapse", is_open=False)
627
+
628
+
629
+ def create_vector_input_row(index: int, query_type: str = "text", ra: float = None, dec: float = None, fov: float = 0.025):
630
+ """Create a single vector input row with operation selector, query type toggle, and conditional inputs.
631
+
632
+ Args:
633
+ index: Index of the vector input row
634
+ query_type: Type of query - "text" or "image" (default: "text")
635
+ ra: Initial RA value for image queries (default: None)
636
+ dec: Initial Dec value for image queries (default: None)
637
+ fov: Initial FoV value for image queries (default: 0.025)
638
+
639
+ Returns:
640
+ Dash Bootstrap Row component with text/image mode toggle
641
+ """
642
+ # Determine display styles based on query type
643
+ text_display = {"display": "block"} if query_type == "text" else {"display": "none"}
644
+ image_display = {"display": "none"} if query_type == "text" else {"display": "block"}
645
+
646
+ return dbc.Row([
647
+ # Operation column with magnitude support
648
+ dbc.Col([
649
+ dbc.Select(
650
+ id={"type": "vector-operation", "index": index},
651
+ options=[
652
+ {"label": "+10", "value": "+10"},
653
+ {"label": "+5", "value": "+5"},
654
+ {"label": "+2", "value": "+2"},
655
+ {"label": "+", "value": "+"},
656
+ {"label": "-", "value": "-"},
657
+ {"label": "-2", "value": "-2"},
658
+ {"label": "-5", "value": "-5"},
659
+ {"label": "-10", "value": "-10"}
660
+ ],
661
+ value="+",
662
+ style={"width": "70px"},
663
+ className="d-inline-block vector-operation-select"
664
+ )
665
+ ], width=1),
666
+ # Query type toggle (Text/Image)
667
+ dbc.Col([
668
+ dbc.Select(
669
+ id={"type": "vector-query-type", "index": index},
670
+ options=[
671
+ {"label": "Text", "value": "text"},
672
+ {"label": "Image", "value": "image"}
673
+ ],
674
+ value=query_type,
675
+ style={"width": "100px"},
676
+ className="d-inline-block vector-query-type-select"
677
+ )
678
+ ], width=2),
679
+ # Input area (text or image fields)
680
+ dbc.Col([
681
+ # Text input (shown when type is "text")
682
+ html.Div([
683
+ dbc.Input(
684
+ id={"type": "vector-text", "index": index},
685
+ placeholder="Enter text query...",
686
+ type="text"
687
+ )
688
+ ], id={"type": "text-input-container", "index": index}, style=text_display),
689
+ # Image inputs (shown when type is "image")
690
+ html.Div([
691
+ dbc.Row([
692
+ dbc.Col([
693
+ dbc.Input(
694
+ id={"type": "vector-ra", "index": index},
695
+ placeholder="ra:",
696
+ type="number",
697
+ step="any",
698
+ value=ra
699
+ )
700
+ ], width=4),
701
+ dbc.Col([
702
+ dbc.Input(
703
+ id={"type": "vector-dec", "index": index},
704
+ placeholder="dec:",
705
+ type="number",
706
+ step="any",
707
+ value=dec
708
+ )
709
+ ], width=4),
710
+ dbc.Col([
711
+ dbc.Input(
712
+ id={"type": "vector-fov", "index": index},
713
+ placeholder="fov:",
714
+ type="number",
715
+ value=fov,
716
+ step="any"
717
+ )
718
+ ], width=4)
719
+ ])
720
+ ], id={"type": "image-input-container", "index": index}, style=image_display)
721
+ ], width=8),
722
+ # Delete button
723
+ dbc.Col([
724
+ dbc.Button(
725
+ html.I(className="fas fa-times"),
726
+ id={"type": "vector-delete", "index": index},
727
+ color="link",
728
+ size="sm",
729
+ className="text-danger vector-delete-btn",
730
+ style={"padding": "0.25rem 0.5rem"}
731
+ )
732
+ ], width=1, className="d-flex align-items-center justify-content-end")
733
+ ], className="mb-2", id={"type": "vector-row", "index": index})
734
+
735
+
736
+ def create_results_container():
737
+ """Create the search results display container."""
738
+ return dbc.Row([
739
+ dbc.Col([
740
+ html.Div(id="search-time", className="time-breakdown text-center mb-2"),
741
+ html.Div(id="search-results")
742
+ ])
743
+ ])
744
+
745
+
746
+ def create_stores():
747
+ """Create Dash Store components for data persistence."""
748
+ return [
749
+ dcc.Store(id="search-data"),
750
+ dcc.Store(id="current-galaxy-data"),
751
+ dcc.Store(id="vector-inputs-count", data=1),
752
+ dcc.Download(id="download-csv")
753
+ ]
754
+
755
+
756
+ def create_galaxy_modal():
757
+ """Create the modal for displaying galaxy details."""
758
+ return dbc.Modal([
759
+ dbc.ModalHeader(dbc.ModalTitle(id="modal-title")),
760
+ dbc.ModalBody([
761
+ html.Div(id="modal-image", className="text-center mb-3"),
762
+ html.Div(id="modal-description")
763
+ ]),
764
+ dbc.ModalFooter([
765
+ dbc.Button(
766
+ [html.I(className="fas fa-plus-circle me-2"), "Add to Advanced Search"],
767
+ id="add-to-advanced-search",
768
+ color="primary",
769
+ className="me-2"
770
+ ),
771
+ dbc.Button("Close", id="close-modal", className="ms-auto")
772
+ ])
773
+ ], id="galaxy-modal", size="lg", is_open=False)
774
+
775
+
776
+ def create_info_modal():
777
+ """Create the info modal explaining the app."""
778
+ return dbc.Modal([
779
+ dbc.ModalHeader(dbc.ModalTitle([html.I(className="fas fa-info-circle me-2"), "About Galaxy Search"])),
780
+ dbc.ModalBody([
781
+ html.P("This app performs semantic search over galaxy images using CLIP embeddings and BigQuery.",
782
+ style={"color": "rgba(245, 245, 247, 0.8)", "margin-bottom": "1rem", "font-size": "0.9rem"}),
783
+ html.Div([
784
+ html.P("The search uses contrastive language-image pre-training (CLIP) to match text descriptions with galaxy images. "
785
+ "The model was trained on galaxy descriptions and can understand various astronomical features and characteristics.",
786
+ style={"margin-bottom": "1rem", "color": "rgba(245, 245, 247, 0.7)"}),
787
+
788
+ html.H6("Search Tips:", style={"color": "#F5F5F7", "font-weight": "500", "margin-bottom": "0.5rem"}),
789
+ html.Ul([
790
+ html.Li("Describe morphological features (spiral, elliptical, irregular, merging)",
791
+ style={"color": "rgba(245, 245, 247, 0.6)", "margin-bottom": "0.3rem"}),
792
+ html.Li("Mention specific features (tidal tails, dust lanes, star-forming regions)",
793
+ style={"color": "rgba(245, 245, 247, 0.6)", "margin-bottom": "0.3rem"}),
794
+ html.Li("Use color descriptions or brightness characteristics",
795
+ style={"color": "rgba(245, 245, 247, 0.6)", "margin-bottom": "0.3rem"}),
796
+ html.Li("Combine multiple features for more specific results",
797
+ style={"color": "rgba(245, 245, 247, 0.6)"}),
798
+ ], style={"margin-left": "1rem"}),
799
+ ], style={"background": "rgba(255, 255, 255, 0.05)", "padding": "1.5rem", "border-radius": "12px",
800
+ "border": "0.5px solid rgba(255, 255, 255, 0.1)", "color": "rgba(245, 245, 247, 0.7)", "font-size": "0.9rem"})
801
+ ]),
802
+ dbc.ModalFooter(
803
+ dbc.Button("Close", id="close-info-modal", className="ms-auto")
804
+ )
805
+ ], id="info-modal", size="lg", is_open=False)
806
+
807
+
808
+ def create_layout():
809
+ """Create the complete app layout.
810
+
811
+ Returns:
812
+ Dash Container with the full app layout
813
+ """
814
+ return dbc.Container([
815
+ create_header(),
816
+ create_search_container(),
817
+ create_results_container(),
818
+ *create_stores(),
819
+ create_galaxy_modal(),
820
+ create_info_modal()
821
+ ], fluid=True, className="py-2")
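Every per-row control built by create_vector_input_row carries a dictionary id such as {"type": "vector-text", "index": index}; that is what lets perform_vector_search in the callbacks module gather all rows at once with dash.dependencies.ALL. A minimal sketch of the pattern (the component ids are the real ones above; the callback body and its output are illustrative only, and an existing Dash app object is assumed):

    from dash import Input, Output, State
    from dash.dependencies import ALL

    @app.callback(
        Output("search-results", "children", allow_duplicate=True),
        Input("vector-search-button", "n_clicks"),
        State({"type": "vector-text", "index": ALL}, "value"),
        prevent_initial_call=True
    )
    def show_row_count(n_clicks, text_values):
        # text_values arrives as a list, one entry per matching row id
        return f"{len(text_values)} query rows"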
src/config.py ADDED
@@ -0,0 +1,68 @@
+ """Configuration settings, environment variables, and constants."""
2
+
3
+ import os
4
+ from dotenv import load_dotenv
5
+
6
+ load_dotenv()
7
+
8
+ # Environment Variables
9
+ ZILLIZ_BEARER = os.getenv("ZILLIZ_BEARER")
10
+ ZILLIZ_ENDPOINT = os.getenv("ZILLIZ_ENDPOINT")
11
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
12
+
13
+ # App Constants
14
+ # Note: TOTAL_GALAXIES is dynamically updated from Zilliz at startup (see app.py)
15
+ # This is just a fallback default value
16
+ TOTAL_GALAXIES = 0
17
+ DEFAULT_TOP_K = 300
18
+ DEFAULT_DISPLAY_COUNT = 60
19
+ LOAD_MORE_COUNT = 120
20
+
21
+ # Zilliz Configuration
22
+ ZILLIZ_COLLECTION_NAME = "aionsearch"
23
+ # Image search always uses legacy collection which has pre-existing embeddings
24
+ ZILLIZ_IMAGE_SEARCH_COLLECTION_NAME = ZILLIZ_COLLECTION_NAME
25
+
26
+ # Collection-specific configurations
27
+ COLLECTION_CONFIGS = {
28
+ "legacy_5": {
29
+ "anns_field": "aion_search_embedding",
30
+ "primary_key": "object_id",
31
+ "output_fields": ["object_id", "ra", "dec", "r_mag"]
32
+ },
33
+ "aionsearch": {
34
+ "anns_field": "clip_embedding",
35
+ "primary_key": "ra_dec",
36
+ "output_fields": ["ra_dec", "ra", "dec", "r_mag"]
37
+ }
38
+ }
39
+
40
+ # Get configuration for the selected collection
41
+ _collection_config = COLLECTION_CONFIGS.get(ZILLIZ_COLLECTION_NAME, COLLECTION_CONFIGS[ZILLIZ_COLLECTION_NAME])
42
+ ZILLIZ_ANNS_FIELD = _collection_config["anns_field"]
43
+ ZILLIZ_PRIMARY_KEY = _collection_config["primary_key"]
44
+ ZILLIZ_OUTPUT_FIELDS = _collection_config["output_fields"]
45
+
46
+ # OpenAI Configuration
47
+ OPENAI_EMBEDDING_MODEL = "text-embedding-3-large"
48
+
49
+ # CLIP Model Configuration
50
+ CLIP_EMBEDDING_DIM = 1024
51
+ CLIP_NORMALIZE_EPS = 1e-3
52
+
53
+ # UI Configuration
54
+ IMAGE_HEIGHT = "160px"
55
+ IMAGE_WIDTH = "100%"
56
+ CUTOUT_FOV = 0.025
57
+ CUTOUT_SIZE = 256
58
+
59
+ # Logging Configuration
60
+ VCU_COST_PER_MILLION = 4.0 # $4 per 1 million vCU
61
+
62
+ # Feature Flags (for future features)
63
+ FEATURE_IMAGE_SEARCH = False
64
+ FEATURE_AUTH = False
65
+ FEATURE_CACHE = False
66
+ FEATURE_RERANKING = False
67
+ FEATURE_TRACKING = True
68
+ FEATURE_VECTOR_ADDITION = True
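The three values read from the environment above must be supplied at deploy time (on Hugging Face Spaces, as repository secrets; locally, via a .env file picked up by load_dotenv). Note that the services in src/services.py derive the Zilliz query route from ZILLIZ_ENDPOINT by string replacement, so the endpoint is assumed to end in /search. A hypothetical startup sanity check (the variable names are the real ones above; everything else is illustrative):

    import os

    # Fail fast if required configuration is missing or malformed (sketch only)
    for var in ("ZILLIZ_BEARER", "ZILLIZ_ENDPOINT", "OPENAI_API_KEY"):
        assert os.getenv(var), f"{var} is not set"
    # The /query route is derived via str.replace("/search", "/query")
    assert os.environ["ZILLIZ_ENDPOINT"].endswith("/search")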
src/services.py ADDED
@@ -0,0 +1,538 @@
+ """Backend services for AION Search."""
2
+
3
+ import time
4
+ import logging
5
+ import torch
6
+ import torch.nn.functional as F
7
+ import numpy as np
8
+ import pandas as pd
9
+ import requests
10
+ from typing import List
11
+ from openai import OpenAI
12
+
13
+ from src.config import (
14
+ ZILLIZ_BEARER,
15
+ ZILLIZ_ENDPOINT,
16
+ ZILLIZ_COLLECTION_NAME,
17
+ ZILLIZ_IMAGE_SEARCH_COLLECTION_NAME,
18
+ ZILLIZ_ANNS_FIELD,
19
+ ZILLIZ_PRIMARY_KEY,
20
+ ZILLIZ_OUTPUT_FIELDS,
21
+ COLLECTION_CONFIGS,
22
+ OPENAI_API_KEY,
23
+ OPENAI_EMBEDDING_MODEL,
24
+ CLIP_NORMALIZE_EPS,
25
+ DEFAULT_TOP_K,
26
+ )
27
+ from src.utils import cutout_url, log_zilliz_query
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
+
32
+ class CLIPModelService:
33
+ """Service for managing CLIP model loading and inference."""
34
+
35
+ def __init__(self):
36
+ self.model = None
37
+ self.device = None
38
+ self.loaded = False
39
+
40
+ def load_model(self, checkpoint_path: str) -> None:
41
+ """Load the CLIP model from checkpoint.
42
+
43
+ Args:
44
+ checkpoint_path: Path to the CLIP model checkpoint file
45
+ """
46
+ logger.info(f"Loading CLIP model from {checkpoint_path}...")
47
+
48
+ from clip.models.clip_model import GalaxyClipModel
49
+
50
+ # Set device
51
+ self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
52
+
53
+ # Load checkpoint
54
+ checkpoint = torch.load(checkpoint_path, map_location=self.device, weights_only=False)
55
+ model_config = checkpoint['model_config']
56
+
57
+ # Initialize model with saved configuration
58
+ self.model = GalaxyClipModel(
59
+ image_input_dim=model_config['image_input_dim'],
60
+ text_input_dim=model_config['text_input_dim'],
61
+ embedding_dim=model_config['embedding_dim'],
62
+ use_mean_embeddings=model_config.get('use_mean_embeddings', True)
63
+ )
64
+
65
+ self.model.load_state_dict(checkpoint['model_state_dict'])
66
+ self.model.to(self.device)
67
+ self.model.eval()
68
+ self.loaded = True
69
+
70
+ logger.info("CLIP model loaded successfully")
71
+
72
+ def encode_text(self, text_embedding: np.ndarray) -> np.ndarray:
73
+ """Project text embedding through CLIP text projector.
74
+
75
+ Args:
76
+ text_embedding: OpenAI text embedding (1536-dim)
77
+
78
+ Returns:
79
+ CLIP-projected embedding (1024-dim)
80
+ """
81
+ if not self.loaded:
82
+ raise RuntimeError("CLIP model not loaded. Call load_model() first.")
83
+
84
+ with torch.no_grad():
85
+ text_tensor = torch.from_numpy(text_embedding).float().unsqueeze(0).to(self.device)
86
+ clip_features = self.model.text_projector(text_tensor)
87
+ # Normalize as per CLIP
88
+ clip_features = F.normalize(clip_features, dim=-1, eps=CLIP_NORMALIZE_EPS)
89
+ query_embedding = clip_features.cpu().numpy().squeeze(0)
90
+
91
+ return query_embedding
92
+
93
+
94
+ class ImageProcessingService:
95
+ """Service for retrieving pre-existing image embeddings from Zilliz."""
96
+
97
+ def __init__(self):
98
+ pass
99
+
100
+ def encode_image(self, ra: float, dec: float, fov: float = 0.025, size: int = 256) -> np.ndarray:
101
+ """Query Zilliz for pre-existing embedding at the given coordinates.
102
+
103
+ Args:
104
+ ra: Right ascension in degrees
105
+ dec: Declination in degrees
106
+ fov: Field of view in degrees (used to define search box)
107
+ size: Image size in pixels (unused, kept for API compatibility)
108
+
109
+ Returns:
110
+ Pre-existing AION-Search embedding vector (1024-dim) from Zilliz
111
+ """
112
+ logger.info(f"Querying Zilliz for pre-existing embedding at RA={ra}, Dec={dec}, FoV={fov}")
113
+
114
+ # Calculate bounding box based on field of view
115
+ ra_min = ra - fov/2
116
+ ra_max = ra + fov/2
117
+ dec_min = dec - fov/2
118
+ dec_max = dec + fov/2
119
+
120
+ # Build filter expression for coordinate range
121
+ filter_expr = f"ra > {ra_min} AND ra < {ra_max} AND dec > {dec_min} AND dec < {dec_max}"
122
+
123
+ # Get the ANNS field for the image search collection
124
+ image_search_config = COLLECTION_CONFIGS.get(ZILLIZ_IMAGE_SEARCH_COLLECTION_NAME)
125
+ image_anns_field = image_search_config["anns_field"]
126
+
127
+ # Prepare query payload - always use the image search collection (legacy)
128
+ payload = {
129
+ "collectionName": ZILLIZ_IMAGE_SEARCH_COLLECTION_NAME,
130
+ "filter": filter_expr,
131
+ "outputFields": [image_anns_field],
132
+ "limit": 1
133
+ }
134
+
135
+ headers = {
136
+ "Authorization": f"Bearer {ZILLIZ_BEARER}",
137
+ "Accept": "application/json",
138
+ "Content-Type": "application/json"
139
+ }
140
+
141
+ try:
142
+ # Use query endpoint (replace /search with /query)
143
+ query_endpoint = ZILLIZ_ENDPOINT.replace("/search", "/query")
144
+ response = requests.post(query_endpoint, json=payload, headers=headers)
145
+ response.raise_for_status()
146
+
147
+ result = response.json()
148
+
149
+ if result.get("code") == 0 and "data" in result:
150
+ data = result["data"]
151
+ if data and len(data) > 0:
152
+ # Extract the embedding from the first result using the image search ANNS field
153
+ embedding = data[0].get(image_anns_field)
154
+ if embedding:
155
+ embedding_array = np.array(embedding, dtype=np.float32)
156
+ logger.info(f"Retrieved pre-existing embedding with shape: {embedding_array.shape}")
157
+ return embedding_array
158
+ else:
159
+ logger.error(f"No embedding field found in result: {data[0].keys()}")
160
+ raise RuntimeError(f"No embedding found at coordinates RA={ra}, Dec={dec}")
161
+ else:
162
+ logger.error(f"No galaxies found at coordinates RA={ra}, Dec={dec} with FoV={fov}")
163
+ raise RuntimeError(f"No galaxies found at coordinates RA={ra}, Dec={dec}")
164
+ else:
165
+ logger.error(f"Zilliz query failed: {result}")
166
+ raise RuntimeError(f"Failed to query Zilliz: {result}")
167
+
168
+ except Exception as e:
169
+ logger.error(f"Error querying Zilliz for embedding: {e}")
170
+ raise
171
+
172
+
173
+ class EmbeddingService:
174
+ """Service for encoding text queries into embeddings."""
175
+
176
+ def __init__(self, clip_service: CLIPModelService):
177
+ self.clip_service = clip_service
178
+ self.openai_client = None
179
+
180
+ def _get_openai_client(self) -> OpenAI:
181
+ """Get or create OpenAI client."""
182
+ if self.openai_client is None:
183
+ if not OPENAI_API_KEY:
184
+ raise ValueError("OPENAI_API_KEY environment variable not set")
185
+ self.openai_client = OpenAI(api_key=OPENAI_API_KEY)
186
+ return self.openai_client
187
+
188
+ def encode_text_query(self, query: str) -> np.ndarray:
189
+ """Encode text query using OpenAI embeddings + CLIP text projector.
190
+
191
+ Args:
192
+ query: Text search query
193
+
194
+ Returns:
195
+ CLIP embedding vector
196
+ """
197
+ client = self._get_openai_client()
198
+
199
+ # Get OpenAI text embedding
200
+ response = client.embeddings.create(
201
+ input=query,
202
+ model=OPENAI_EMBEDDING_MODEL
203
+ )
204
+ text_embedding = np.array(response.data[0].embedding)
205
+
206
+ # Project through CLIP text projector
207
+ return self.clip_service.encode_text(text_embedding)
208
+
209
+ def encode_vector_queries(
210
+ self,
211
+ queries: List[str],
212
+ operations: List[str]
213
+ ) -> np.ndarray:
214
+ """Encode multiple text queries and combine them using vector addition/subtraction.
215
+
216
+ Args:
217
+ queries: List of text queries
218
+ operations: List of operations ('+' or '-') for each query
219
+
220
+ Returns:
221
+ Combined normalized embedding vector
222
+ """
223
+ client = self._get_openai_client()
224
+
225
+ # Get all embeddings at once for efficiency
226
+ response = client.embeddings.create(
227
+ input=queries,
228
+ model=OPENAI_EMBEDDING_MODEL
229
+ )
230
+
231
+ # Initialize combined embedding
232
+ combined_embedding = None
233
+
234
+ # Process each embedding with its operation
235
+ for embedding_data, operation in zip(response.data, operations):
236
+ text_embedding = np.array(embedding_data.embedding)
237
+
238
+ # Project through CLIP text projector
239
+ query_embedding = self.clip_service.encode_text(text_embedding)
240
+
241
+ # Apply operation
242
+ if combined_embedding is None:
243
+ combined_embedding = query_embedding if operation == "+" else -query_embedding
244
+ else:
245
+ if operation == "+":
246
+ combined_embedding += query_embedding
247
+ else:
248
+ combined_embedding -= query_embedding
249
+
250
+ # Normalize the final combined embedding
251
+ norm = np.linalg.norm(combined_embedding)
252
+ if norm > 0:
253
+ combined_embedding = combined_embedding / norm
254
+
255
+ return combined_embedding
256
+
257
+
258
+ class ZillizService:
259
+ """Service for interacting with Zilliz vector database."""
260
+
261
+ def get_collection_count(self) -> int:
262
+ """Get the total number of entities in the collection.
263
+
264
+ Returns:
265
+ Total count of entities in the collection
266
+ """
267
+ logger.info("Getting collection count from Zilliz...")
268
+
269
+ # Use query endpoint with count to get total entities
270
+ payload = {
271
+ "collectionName": ZILLIZ_COLLECTION_NAME,
272
+ "filter": "", # Empty filter to count all entities
273
+ "outputFields": ["count(*)"]
274
+ }
275
+
276
+ headers = {
277
+ "Authorization": f"Bearer {ZILLIZ_BEARER}",
278
+ "Accept": "application/json",
279
+ "Content-Type": "application/json"
280
+ }
281
+
282
+ try:
283
+ # Use the query endpoint (replace /search with /query in the endpoint)
284
+ query_endpoint = ZILLIZ_ENDPOINT.replace("/search", "/query")
285
+ response = requests.post(query_endpoint, json=payload, headers=headers)
286
+ response.raise_for_status()
287
+
288
+ result = response.json()
289
+
290
+ if result.get("code") == 0 and "data" in result:
291
+ # The count should be in the response data
292
+ data = result["data"]
293
+ if data and len(data) > 0:
294
+ count = data[0].get("count(*)", 0)
295
+ logger.info(f"Collection count: {count:,}")
296
+ return count
297
+ else:
298
+ logger.error(f"Failed to get collection count: {result}")
299
+ return 0
300
+
301
+ except Exception as e:
302
+ logger.error(f"Error getting collection count: {e}")
303
+ return 0
304
+
305
+ def search(self, query_embedding: np.ndarray, top_k: int = DEFAULT_TOP_K, filter_expr: str = None) -> pd.DataFrame:
306
+ """Search Zilliz for top-k most similar galaxies.
307
+
308
+ Args:
309
+ query_embedding: Query embedding vector
310
+ top_k: Number of results to return
311
+ filter_expr: Optional filter expression for filtering results
312
+
313
+ Returns:
314
+ DataFrame with search results
315
+ """
316
+ logger.info("Querying Zilliz...")
317
+ start_time = time.time()
318
+
319
+ # Prepare the search payload
320
+ payload = {
321
+ "collectionName": ZILLIZ_COLLECTION_NAME,
322
+ "data": [query_embedding.tolist()],
323
+ "annsField": ZILLIZ_ANNS_FIELD,
324
+ "limit": top_k,
325
+ "outputFields": ZILLIZ_OUTPUT_FIELDS
326
+ }
327
+
328
+ # Add filter if provided
329
+ if filter_expr:
330
+ payload["filter"] = filter_expr
331
+ logger.info(f"Applying filter: {filter_expr}")
332
+
333
+ headers = {
334
+ "Authorization": f"Bearer {ZILLIZ_BEARER}",
335
+ "Accept": "application/json",
336
+ "Content-Type": "application/json"
337
+ }
338
+
339
+ try:
340
+ response = requests.post(ZILLIZ_ENDPOINT, json=payload, headers=headers)
341
+ response.raise_for_status()
342
+
343
+ result = response.json()
344
+
345
+ if result.get("code") == 0 and "data" in result:
346
+ # Extract cost from response
347
+ cost_vcu = result.get("cost", 0)
348
+
349
+ # Convert to DataFrame
350
+ data_list = result["data"]
351
+ df = pd.DataFrame(data_list)
352
+
353
+ # Add cutout URLs
354
+ if not df.empty:
355
+ df["cutout_url"] = [cutout_url(ra, dec) for ra, dec in zip(df["ra"], df["dec"])]
356
+
357
+ query_time = time.time() - start_time
358
+
359
+ # Log the query
360
+ log_zilliz_query(
361
+ query_type="vector_search",
362
+ query_info={
363
+ "top_k": top_k,
364
+ "embedding_dim": len(query_embedding)
365
+ },
366
+ result_count=len(df),
367
+ query_time=query_time,
368
+ cost_vcu=cost_vcu
369
+ )
370
+
371
+ return df
372
+ else:
373
+ logger.error(f"Zilliz search failed: {result}")
374
+ return pd.DataFrame()
375
+
376
+ except Exception as e:
377
+ logger.error(f"Zilliz search error: {e}")
378
+ return pd.DataFrame()
379
+
380
+
381
+ class SearchService:
382
+ """High-level search orchestration service."""
383
+
384
+ def __init__(
385
+ self,
386
+ embedding_service: EmbeddingService,
387
+ zilliz_service: ZillizService,
388
+ image_service: 'ImageProcessingService' = None
389
+ ):
390
+ self.embedding_service = embedding_service
391
+ self.zilliz_service = zilliz_service
392
+ self.image_service = image_service
393
+
394
+ def _build_rmag_filter(self, rmag_min=None, rmag_max=None) -> str:
395
+ """Build r_mag filter expression.
396
+
397
+ Args:
398
+ rmag_min: Minimum r_mag value (inclusive)
399
+ rmag_max: Maximum r_mag value (inclusive)
400
+
401
+ Returns:
402
+ Filter expression string, or None if no filter
403
+ """
404
+ filter_parts = []
405
+
406
+ if rmag_min is not None:
407
+ filter_parts.append(f"r_mag >= {rmag_min}")
408
+
409
+ if rmag_max is not None:
410
+ filter_parts.append(f"r_mag <= {rmag_max}")
411
+
412
+ if filter_parts:
413
+ return " AND ".join(filter_parts)
414
+
415
+ return None
416
+
417
+ def search_text(self, query: str, top_k: int = DEFAULT_TOP_K, rmag_min=None, rmag_max=None) -> pd.DataFrame:
418
+ """Search galaxies using text query.
419
+
420
+ Args:
421
+ query: Text search query
422
+ top_k: Number of results to return
423
+ rmag_min: Minimum r_mag value (inclusive)
424
+ rmag_max: Maximum r_mag value (inclusive)
425
+
426
+ Returns:
427
+ DataFrame with search results
428
+ """
429
+ # Encode query
430
+ query_embedding = self.embedding_service.encode_text_query(query)
431
+
432
+ # Build filter
433
+ filter_expr = self._build_rmag_filter(rmag_min, rmag_max)
434
+
435
+ # Search Zilliz
436
+ return self.zilliz_service.search(query_embedding, top_k, filter_expr)
437
+
438
+ def search_vector(
439
+ self,
440
+ queries: List[str],
441
+ operations: List[str],
442
+ top_k: int = DEFAULT_TOP_K,
443
+ rmag_min=None,
444
+ rmag_max=None
445
+ ) -> pd.DataFrame:
446
+ """Search galaxies using vector addition/subtraction.
447
+
448
+ Args:
449
+ queries: List of text queries
450
+ operations: List of operations ('+' or '-') for each query
451
+ top_k: Number of results to return
452
+ rmag_min: Minimum r_mag value (inclusive)
453
+ rmag_max: Maximum r_mag value (inclusive)
454
+
455
+ Returns:
456
+ DataFrame with search results
457
+ """
458
+ # Encode and combine vectors
459
+ combined_embedding = self.embedding_service.encode_vector_queries(queries, operations)
460
+
461
+ # Build filter
462
+ filter_expr = self._build_rmag_filter(rmag_min, rmag_max)
463
+
464
+ # Search Zilliz
465
+ return self.zilliz_service.search(combined_embedding, top_k, filter_expr)
466
+
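search_vector performs CLIP-style vector arithmetic: each query is encoded, then added or subtracted according to its operation before the nearest-neighbour lookup. A sketch, assuming the same search_service instance:

results = search_service.search_vector(
    queries=["spiral galaxy", "spiral arms"],
    operations=["+", "-"],   # one '+' or '-' per query
    top_k=20,
)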
467
+ def search_advanced(
468
+ self,
469
+ text_queries: List[str] = None,
470
+ text_weights: List[float] = None,
471
+ image_queries: List[dict] = None,
472
+ image_weights: List[float] = None,
473
+ top_k: int = DEFAULT_TOP_K,
474
+ rmag_min=None,
475
+ rmag_max=None
476
+ ) -> pd.DataFrame:
477
+ """Search galaxies using advanced vector addition/subtraction with text and/or images.
478
+
479
+ Args:
480
+ text_queries: List of text query strings
481
+ text_weights: List of signed weights for text queries (e.g., 1.0, -1.0, 2.0, -5.0)
482
+ image_queries: List of dicts with 'ra', 'dec', 'fov' keys
483
+ image_weights: List of signed weights for image queries (e.g., 1.0, -1.0, 2.0, -5.0)
484
+ top_k: Number of results to return
485
+ rmag_min: Minimum r_mag value (inclusive)
486
+ rmag_max: Maximum r_mag value (inclusive)
487
+
488
+ Returns:
489
+ DataFrame with search results
490
+ """
491
+ combined_embedding = None
492
+
493
+ # Process text queries
494
+ if text_queries:
495
+ for query, weight in zip(text_queries, text_weights):
496
+ query_embedding = self.embedding_service.encode_text_query(query)
497
+
498
+ # Apply weight
499
+ weighted_embedding = query_embedding * weight
500
+
501
+ if combined_embedding is None:
502
+ combined_embedding = weighted_embedding
503
+ else:
504
+ combined_embedding += weighted_embedding
505
+
506
+ # Process image queries
507
+ if image_queries:
508
+ if self.image_service is None:
509
+ raise RuntimeError("Image service not initialized")
510
+
511
+ for img_query, weight in zip(image_queries, image_weights):
512
+ # Encode image
513
+ image_embedding = self.image_service.encode_image(
514
+ ra=img_query['ra'],
515
+ dec=img_query['dec'],
516
+ fov=img_query.get('fov', 0.025),
517
+ size=256
518
+ )
519
+
520
+ # Apply weight
521
+ weighted_embedding = image_embedding * weight
522
+
523
+ if combined_embedding is None:
524
+ combined_embedding = weighted_embedding
525
+ else:
526
+ combined_embedding += weighted_embedding
527
+
528
+ # Normalize the final combined embedding
529
+ if combined_embedding is not None:
530
+ norm = np.linalg.norm(combined_embedding)
531
+ if norm > 0:
532
+ combined_embedding = combined_embedding / norm
533
+
534
+ # Build filter
535
+ filter_expr = self._build_rmag_filter(rmag_min, rmag_max)
536
+
537
+ # Search Zilliz
538
+ return self.zilliz_service.search(combined_embedding, top_k, filter_expr)
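A usage sketch for the advanced path, combining a positively weighted text prompt with a negatively weighted reference image (the coordinates are placeholders). Two caveats that follow from the code above: weights must be supplied alongside their queries, since the lists are zipped directly, and if no queries are given at all, combined_embedding stays None.

results = search_service.search_advanced(
    text_queries=["galaxy merger"],
    text_weights=[1.0],
    image_queries=[{"ra": 150.1192, "dec": 2.2058, "fov": 0.025}],
    image_weights=[-0.5],
    top_k=30,
    rmag_min=16.0,
)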
src/utils.py ADDED
@@ -0,0 +1,195 @@
1
+ """Utility functions for AION Search."""
2
+
3
+ import json
4
+ import logging
5
+ from pathlib import Path
6
+ from datetime import datetime
7
+ from typing import Dict, Any
8
+ from src.config import CUTOUT_FOV, CUTOUT_SIZE, VCU_COST_PER_MILLION
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
+ def cutout_url(ra: float, dec: float, fov: float = CUTOUT_FOV, size: int = CUTOUT_SIZE) -> str:
14
+ """Generate Legacy Survey cutout URL from RA/Dec coordinates.
15
+
16
+ Args:
17
+ ra: Right Ascension in degrees
18
+ dec: Declination in degrees
19
+ fov: Field of view in degrees
20
+ size: Image size in pixels
21
+
22
+ Returns:
23
+ URL string for the cutout image
24
+ """
25
+ return (
26
+ f"https://alasky.cds.unistra.fr/hips-image-services/hips2fits"
27
+ f"?hips=CDS/P/DESI-Legacy-Surveys/DR10/color"
28
+ f"&ra={ra}&dec={dec}&fov={fov}&width={size}&height={size}&format=jpg"
29
+ )
30
+
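For example, cutout_url(150.1192, 2.2058, fov=0.025, size=256) evaluates to:

https://alasky.cds.unistra.fr/hips-image-services/hips2fits?hips=CDS/P/DESI-Legacy-Surveys/DR10/color&ra=150.1192&dec=2.2058&fov=0.025&width=256&height=256&format=jpg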
31
+
32
+ def log_zilliz_query(
33
+ query_type: str,
34
+ query_info: Dict[str, Any],
35
+ result_count: int,
36
+ query_time: float,
37
+ cost_vcu: int = 0
38
+ ) -> None:
39
+ """Log Zilliz queries to a file in logs/ directory.
40
+
41
+ Args:
42
+ query_type: Type of query (e.g., "vector_search", "text_search")
43
+ query_info: Dictionary containing query details
44
+ result_count: Number of results returned
45
+ query_time: Query execution time in seconds
46
+ cost_vcu: Cost in vCU units
47
+ """
48
+ logs_dir = Path("logs")
49
+ logs_dir.mkdir(parents=True, exist_ok=True)
50
+
51
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
52
+ log_file = logs_dir / f"zilliz_query_{timestamp}.json"
53
+
54
+ # Convert vCU cost to dollars
55
+ cost_usd = (cost_vcu / 1e6) * VCU_COST_PER_MILLION
56
+
57
+ log_data = {
58
+ "timestamp": datetime.now().isoformat(),
59
+ "query_type": query_type,
60
+ "query_info": query_info,
61
+ "result_count": result_count,
62
+ "query_time_seconds": query_time,
63
+ "cost_vCU": cost_vcu,
64
+ "cost_usd": cost_usd
65
+ }
66
+
67
+ with open(log_file, 'w') as f:
68
+ json.dump(log_data, f, indent=2)
69
+
70
+ logger.info(
71
+ f"Query logged to {log_file} | {result_count} results in {query_time:.3f}s | "
72
+ f"{cost_vcu} vCU (${cost_usd:.6f})"
73
+ )
74
+
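Each call writes one JSON file. With an illustrative VCU_COST_PER_MILLION of 4.0 (the real value lives in src/config.py), a 120 vCU query costs (120 / 1e6) * 4.0 = $0.00048, and the record would look like this (all values made up for illustration):

{
  "timestamp": "2025-01-01T12:00:00.000000",
  "query_type": "vector_search",
  "query_info": {"top_k": 20, "embedding_dim": 512},
  "result_count": 20,
  "query_time_seconds": 0.412,
  "cost_vCU": 120,
  "cost_usd": 0.00048
}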
75
+
76
+ def format_galaxy_count(count: int) -> str:
77
+ """Format galaxy count with thousands separator.
78
+
79
+ Args:
80
+ count: Number of galaxies
81
+
82
+ Returns:
83
+ Formatted string (e.g., "259,636 galaxies")
84
+ """
85
+ return f"{count:,} galaxies"
86
+
87
+
88
+ def build_query_xml(
89
+ text_queries: list = None,
90
+ text_weights: list = None,
91
+ image_queries: list = None,
92
+ image_weights: list = None,
93
+ rmag_min: float = None,
94
+ rmag_max: float = None
95
+ ) -> str:
96
+ """Build XML representation of a query according to aql.md specification.
97
+
98
+ Args:
99
+ text_queries: List of text query strings
100
+ text_weights: List of signed weights for text queries (e.g., 1.0, -1.0, 2.0, -5.0)
101
+ image_queries: List of dicts with 'ra', 'dec', 'fov' keys
102
+ image_weights: List of signed weights for image queries (e.g., 1.0, -1.0, 2.0, -5.0)
103
+ rmag_min: Minimum r_mag filter value
104
+ rmag_max: Maximum r_mag filter value
105
+
106
+ Returns:
107
+ XML string representation of the query
108
+ """
109
+ xml_parts = ['<query>']
110
+
111
+ # Add text queries
112
+ if text_queries:
113
+ xml_parts.append(' <text>')
114
+ for query, weight in zip(text_queries, text_weights):
115
+ xml_parts.append(' <term>')
116
+ xml_parts.append(f' <weight>{weight}</weight>')
117
+ xml_parts.append(f' <content>{query}</content>')
118
+ xml_parts.append(' </term>')
119
+ xml_parts.append(' </text>')
120
+
121
+ # Add image queries
122
+ if image_queries:
123
+ xml_parts.append(' <image>')
124
+ for img_query, weight in zip(image_queries, image_weights):
125
+ xml_parts.append(' <reference>')
126
+ xml_parts.append(f' <ra>{img_query["ra"]}</ra>')
127
+ xml_parts.append(f' <dec>{img_query["dec"]}</dec>')
128
+ xml_parts.append(f' <fov>{img_query["fov"]}</fov>')
129
+ xml_parts.append(f' <weight>{weight}</weight>')
130
+ xml_parts.append(' </reference>')
131
+ xml_parts.append(' </image>')
132
+
133
+ # Add filters
134
+ if rmag_min is not None or rmag_max is not None:
135
+ xml_parts.append(' <filters>')
136
+ if rmag_min is not None and rmag_max is not None:
137
+ xml_parts.append(' <filter>')
138
+ xml_parts.append(' <column>r_mag</column>')
139
+ xml_parts.append(' <operator>between</operator>')
140
+ xml_parts.append(f' <value_min>{rmag_min}</value_min>')
141
+ xml_parts.append(f' <value_max>{rmag_max}</value_max>')
142
+ xml_parts.append(' </filter>')
143
+ elif rmag_min is not None:
144
+ xml_parts.append(' <filter>')
145
+ xml_parts.append(' <column>r_mag</column>')
146
+ xml_parts.append(' <operator>gte</operator>')
147
+ xml_parts.append(f' <value>{rmag_min}</value>')
148
+ xml_parts.append(' </filter>')
149
+ elif rmag_max is not None:
150
+ xml_parts.append(' <filter>')
151
+ xml_parts.append(' <column>r_mag</column>')
152
+ xml_parts.append(' <operator>lte</operator>')
153
+ xml_parts.append(f' <value>{rmag_max}</value>')
154
+ xml_parts.append(' </filter>')
155
+ xml_parts.append(' </filters>')
156
+
157
+ xml_parts.append('</query>')
158
+ return '\n'.join(xml_parts)
159
+
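For example, build_query_xml(text_queries=["spiral galaxy"], text_weights=[1.0], rmag_max=19.0) returns (indentation as emitted by the string literals above):

<query>
  <text>
    <term>
      <weight>1.0</weight>
      <content>spiral galaxy</content>
    </term>
  </text>
  <filters>
    <filter>
      <column>r_mag</column>
      <operator>lte</operator>
      <value>19.0</value>
    </filter>
  </filters>
</query>

Note that the query content is interpolated verbatim, so text containing XML-special characters such as '<' or '&' would produce malformed XML.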
160
+
161
+ def log_query_to_csv(
162
+ query_xml: str,
163
+ csv_path: str = "logs/query_log.csv"
164
+ ) -> None:
165
+ """Log a query to CSV file with datetime and XML string.
166
+
167
+ Args:
168
+ query_xml: XML string representation of the query
169
+ csv_path: Path to the CSV log file
170
+ """
171
+ import csv
172
+ import os
173
+
174
+ # Create logs directory if it doesn't exist
175
+ log_dir = Path(csv_path).parent
176
+ log_dir.mkdir(parents=True, exist_ok=True)
177
+
178
+ # Prepare log entry
179
+ timestamp = datetime.now().isoformat()
180
+
181
+ # Check if file exists to determine if we need to write header
182
+ file_exists = Path(csv_path).exists()
183
+
184
+ # Append to CSV
185
+ with open(csv_path, 'a', newline='', encoding='utf-8') as f:
186
+ writer = csv.writer(f)
187
+
188
+ # Write header if file is new
189
+ if not file_exists:
190
+ writer.writerow(['datetime', 'query'])
191
+
192
+ # Write the query log
193
+ writer.writerow([timestamp, query_xml])
194
+
195
+ logger.info(f"Query logged to {csv_path}")