Sarthak
chore: moved model2vec as an internal package
473c3a0
from __future__ import annotations
import json
import logging
from pathlib import Path
from typing import TYPE_CHECKING, Any, cast
import huggingface_hub
import safetensors
from huggingface_hub import ModelCard, ModelCardData
from safetensors.numpy import save_file
from tokenizers import Tokenizer
if TYPE_CHECKING:
import numpy as np
from distiller.model2vec.utils import SafeOpenProtocol
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
def save_pretrained(
    folder_path: Path,
    embeddings: np.ndarray,
    tokenizer: Tokenizer,
    config: dict[str, Any],
    create_model_card: bool = True,
    subfolder: str | None = None,
    **kwargs: Any,
) -> None:
    """
    Save a model to a folder.

    Writes `model.safetensors`, `tokenizer.json`, `config.json` and `modules.json` into the
    target folder, creating the folder (and any missing parents) first.

    :param folder_path: The path to the folder.
    :param embeddings: The embeddings, stored under the `embeddings` key of the safetensors file.
    :param tokenizer: The tokenizer.
    :param config: A metadata config. A truthy `normalize` entry adds a Normalize module to `modules.json`.
    :param create_model_card: Whether to create a model card.
    :param subfolder: The subfolder to save the model in.
    :param **kwargs: Any additional arguments; forwarded to the model card creation.
    """
    folder_path = folder_path / subfolder if subfolder else folder_path
    folder_path.mkdir(exist_ok=True, parents=True)

    save_file({"embeddings": embeddings}, folder_path / "model.safetensors")
    tokenizer.save(str(folder_path / "tokenizer.json"), pretty=False)

    # Use context managers so the files are flushed and closed deterministically,
    # rather than whenever the anonymous file object happens to be collected.
    with open(folder_path / "config.json", "w", encoding="utf-8") as config_file:
        json.dump(config, config_file, indent=4)

    # Create modules.json
    modules = [{"idx": 0, "name": "0", "path": ".", "type": "sentence_transformers.models.StaticEmbedding"}]
    if config.get("normalize"):
        # If normalize=True, add sentence_transformers.models.Normalize
        modules.append({"idx": 1, "name": "1", "path": "1_Normalize", "type": "sentence_transformers.models.Normalize"})
    with open(folder_path / "modules.json", "w", encoding="utf-8") as modules_file:
        json.dump(modules, modules_file, indent=4)

    logger.info(f"Saved model to {folder_path}")

    # Optionally create the model card
    if create_model_card:
        _create_model_card(folder_path, **kwargs)
def _create_model_card(
    folder_path: Path,
    base_model_name: str = "unknown",
    license: str = "mit",
    language: list[str] | None = None,
    model_name: str | None = None,
    template_path: str = "modelcards/model_card_template.md",
    **kwargs: Any,
) -> None:
    """
    Render a model card from a template and write it as `README.md` into a folder.

    :param folder_path: The folder that receives the `README.md`.
    :param base_model_name: The name of the base model.
    :param license: The license to put on the card.
    :param language: The language(s) of the model.
    :param model_name: The model name shown on the card; falls back to the folder name.
    :param template_path: The template location, relative to the directory of this module.
    :param **kwargs: Extra metadata passed through to the model card data.
    """
    target_dir = Path(folder_path)
    # The template is resolved next to this source file, not the working directory.
    template_file = Path(__file__).parent / template_path
    card_tags = ["embeddings", "static-embeddings", "sentence-transformers"]

    card_data = ModelCardData(
        model_name=model_name if model_name else target_dir.name,
        base_model=base_model_name,
        license=license,
        language=language,
        tags=card_tags,
        library_name="model2vec",
        **kwargs,
    )

    readme_file = target_dir / "README.md"
    rendered_card = ModelCard.from_template(card_data, template_path=str(template_file))
    rendered_card.save(readme_file)
def load_pretrained(
    folder_or_repo_path: str | Path,
    subfolder: str | None = None,
    token: str | None = None,
    from_sentence_transformers: bool = False,
) -> tuple[np.ndarray, Tokenizer, dict[str, Any], dict[str, Any]]:
    """
    Loads a pretrained model from a folder.

    :param folder_or_repo_path: The folder or repo path to load from.
        - If this is a local path, we will load from the local path.
        - If the local path is not found, we will attempt to load from the huggingface hub.
    :param subfolder: The subfolder to load from.
    :param token: The huggingface token to use.
    :param from_sentence_transformers: Whether to load the model from a sentence transformers model.
        This switches the file names and the tensor key that are looked up.
    :raises FileNotFoundError: If the folder exists locally, but a required file does not.
    :return: The embeddings, tokenizer, config, and metadata.
    """
    if from_sentence_transformers:
        model_file = "0_StaticEmbedding/model.safetensors"
        tokenizer_file = "0_StaticEmbedding/tokenizer.json"
        config_name = "config_sentence_transformers.json"
        tensor_name = "embedding.weight"
    else:
        model_file = "model.safetensors"
        tokenizer_file = "tokenizer.json"
        config_name = "config.json"
        tensor_name = "embeddings"

    folder_or_repo_path = Path(folder_or_repo_path)
    local_folder = folder_or_repo_path / subfolder if subfolder else folder_or_repo_path

    if local_folder.exists():
        embeddings_path = local_folder / model_file
        if not embeddings_path.exists():
            msg = f"Embeddings file does not exist in {local_folder}"
            raise FileNotFoundError(msg)

        config_path = local_folder / config_name
        if not config_path.exists():
            msg = f"Config file does not exist in {local_folder}"
            raise FileNotFoundError(msg)

        tokenizer_path = local_folder / tokenizer_file
        if not tokenizer_path.exists():
            msg = f"Tokenizer file does not exist in {local_folder}"
            raise FileNotFoundError(msg)

        # README is optional; the helper returns an empty dict when it is missing.
        readme_path = local_folder / "README.md"
        metadata = _get_metadata_from_readme(readme_path)
    else:
        logger.info("Folder does not exist locally, attempting to use huggingface hub.")
        # as_posix() keeps the repo id in `org/name` form regardless of the OS path separator.
        repo_id = folder_or_repo_path.as_posix()
        embeddings_path = Path(huggingface_hub.hf_hub_download(repo_id, model_file, token=token, subfolder=subfolder))

        try:
            readme_path = Path(huggingface_hub.hf_hub_download(repo_id, "README.md", token=token, subfolder=subfolder))
            metadata = _get_metadata_from_readme(Path(readme_path))
        except Exception as e:
            # NOTE: we don't want to raise an error here, since the README is optional.
            logger.info(f"No README found in the model folder: {e} No model card loaded.")
            metadata = {}

        config_path = Path(huggingface_hub.hf_hub_download(repo_id, config_name, token=token, subfolder=subfolder))
        tokenizer_path = Path(
            huggingface_hub.hf_hub_download(repo_id, tokenizer_file, token=token, subfolder=subfolder)
        )

    # Open the tensor file as a context manager so the underlying handle is released
    # as soon as the embeddings have been read.
    with safetensors.safe_open(embeddings_path, framework="numpy") as tensor_file:
        opened_tensor_file = cast("SafeOpenProtocol", tensor_file)
        embeddings = opened_tensor_file.get_tensor(tensor_name)

    tokenizer: Tokenizer = Tokenizer.from_file(str(tokenizer_path))

    # Close the config file deterministically; JSON is read as UTF-8 regardless of the platform default.
    with open(config_path, encoding="utf-8") as config_file:
        config = json.load(config_file)

    # A mismatch is only reported, not treated as an error.
    if len(tokenizer.get_vocab()) != len(embeddings):
        logger.warning(
            f"Number of tokens does not match number of embeddings: `{len(tokenizer.get_vocab())}` vs `{len(embeddings)}`"
        )

    return embeddings, tokenizer, config, metadata
def _get_metadata_from_readme(readme_path: Path) -> dict[str, Any]:
    """
    Read the model card metadata stored in a README file.

    :param readme_path: The path of the README file.
    :return: The model card data as a dict; an empty dict if the file does not exist.
    """
    if readme_path.exists():
        card = ModelCard.load(readme_path)
        metadata: dict[str, Any] = card.data.to_dict()
        if not metadata:
            logger.info("File README.md exists, but was empty. No model card loaded.")
        return metadata

    logger.info(f"README file not found in {readme_path}. No model card loaded.")
    return {}
def push_folder_to_hub(
    folder_path: Path, subfolder: str | None, repo_id: str, private: bool, token: str | None
) -> None:
    """
    Upload a local model folder, model card included, to the huggingface hub.

    The repo is created first when it does not exist yet.

    :param folder_path: The local folder to upload.
    :param subfolder: The location inside the repo to upload to.
        If None, the folder will be pushed to the root of the repo.
    :param repo_id: The repo name.
    :param private: Whether a newly created repo is private.
    :param token: The huggingface token.
    """
    repo_is_missing = not huggingface_hub.repo_exists(repo_id=repo_id, token=token)
    if repo_is_missing:
        huggingface_hub.create_repo(repo_id, token=token, private=private)

    # Upload the model card and every model file in one call.
    huggingface_hub.upload_folder(
        repo_id=repo_id,
        folder_path=folder_path,
        token=token,
        path_in_repo=subfolder,
    )
    logger.info(f"Pushed model to {repo_id}")