import numpy as np
import torch
from typing import List

from sentence_transformers import SentenceTransformer
# Module-level caches: embeddings are keyed by "{model_name}:{text}",
# loaded models are keyed by model name
_embedding_cache = {}
_model_cache = {}
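
# NOTE (added sketch, not part of the original module): both caches above grow
# without bound for the lifetime of the process. If memory pressure matters, a
# minimal mitigation is a manual eviction helper like the hypothetical one
# below; the name `clear_embedding_cache` and its FIFO policy are assumptions,
# not part of the original API.
def clear_embedding_cache(max_entries: int = 0) -> None:
    """Drop cached embeddings, keeping at most `max_entries` of them."""
    if max_entries <= 0:
        _embedding_cache.clear()
    else:
        while len(_embedding_cache) > max_entries:
            # dicts preserve insertion order, so this evicts oldest-first
            _embedding_cache.pop(next(iter(_embedding_cache)))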
def generate_embeddings(
    texts: List[str],
    model_name: str = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1",
    batch_size: int = 32,
    use_mps: bool = True,
) -> np.ndarray:
    """
    Generate embeddings for text chunks with caching.

    Args:
        texts: List of text chunks to embed
        model_name: SentenceTransformer model identifier
        batch_size: Processing batch size
        use_mps: Use Apple Silicon (MPS) acceleration if available

    Returns:
        numpy array of shape (len(texts), embedding_dim)

    Performance targets:
        - 100 texts/second on M4 Pro
        - 384-dimensional embeddings (default model)
        - Memory usage <500 MB
    """
    # Partition texts into cache hits and misses, remembering original positions
    cache_keys = [f"{model_name}:{text}" for text in texts]
    cached_embeddings = []
    texts_to_compute = []
    compute_indices = []
    for i, key in enumerate(cache_keys):
        if key in _embedding_cache:
            cached_embeddings.append((i, _embedding_cache[key]))
        else:
            texts_to_compute.append(texts[i])
            compute_indices.append(i)
    # Load and cache the model on first use
    if model_name not in _model_cache:
        try:
            model = SentenceTransformer(model_name)
        except Exception:
            # If the default cache location fails (e.g. read-only home dir),
            # retry with an explicit cache directory
            import os
            cache_dir = os.environ.get(
                'SENTENCE_TRANSFORMERS_HOME',
                '/app/.cache/sentence-transformers',
            )
            os.makedirs(cache_dir, exist_ok=True)
            model = SentenceTransformer(model_name, cache_folder=cache_dir)
        device = 'mps' if use_mps and torch.backends.mps.is_available() else 'cpu'
        model = model.to(device)
        model.eval()
        _model_cache[model_name] = model
    else:
        model = _model_cache[model_name]
    # Encode cache misses in batches
    if texts_to_compute:
        with torch.no_grad():
            new_embeddings = model.encode(
                texts_to_compute,
                batch_size=batch_size,
                convert_to_numpy=True,
                normalize_embeddings=False,
            ).astype(np.float32)
        # Store the new embeddings in the cache
        for i, text in enumerate(texts_to_compute):
            key = f"{model_name}:{text}"
            _embedding_cache[key] = new_embeddings[i]
    # Reassemble results in the original order. The embedding width is taken
    # from the model rather than hardcoded to 384, so other models also work.
    embedding_dim = model.get_sentence_embedding_dimension()
    result = np.zeros((len(texts), embedding_dim), dtype=np.float32)
    # Fill cached embeddings
    for idx, embedding in cached_embeddings:
        result[idx] = embedding
    # Fill newly computed embeddings
    if texts_to_compute:
        for i, original_idx in enumerate(compute_indices):
            result[original_idx] = new_embeddings[i]
    return result
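

# --- Usage sketch (illustrative addition, not part of the original module) ---
# Shows the round trip and the cache hit on a repeated call. The sample
# strings are invented for the example.
if __name__ == "__main__":
    chunks = [
        "Retrieval-augmented generation pairs a search index with an LLM.",
        "Sentence embeddings map text to dense vectors for similarity search.",
    ]
    first = generate_embeddings(chunks)
    print(first.shape)  # (2, 384) for the default MiniLM model

    # Second call is served entirely from _embedding_cache (no re-encoding)
    second = generate_embeddings(chunks)
    assert np.array_equal(first, second)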