from __future__ import annotations import json import logging from pathlib import Path from typing import TYPE_CHECKING, Any, cast import huggingface_hub import safetensors from huggingface_hub import ModelCard, ModelCardData from safetensors.numpy import save_file from tokenizers import Tokenizer if TYPE_CHECKING: import numpy as np from distiller.model2vec.utils import SafeOpenProtocol logger = logging.getLogger(__name__) def save_pretrained( folder_path: Path, embeddings: np.ndarray, tokenizer: Tokenizer, config: dict[str, Any], create_model_card: bool = True, subfolder: str | None = None, **kwargs: Any, ) -> None: """ Save a model to a folder. :param folder_path: The path to the folder. :param embeddings: The embeddings. :param tokenizer: The tokenizer. :param config: A metadata config. :param create_model_card: Whether to create a model card. :param subfolder: The subfolder to save the model in. :param **kwargs: Any additional arguments. """ folder_path = folder_path / subfolder if subfolder else folder_path folder_path.mkdir(exist_ok=True, parents=True) save_file({"embeddings": embeddings}, folder_path / "model.safetensors") tokenizer.save(str(folder_path / "tokenizer.json"), pretty=False) json.dump(config, open(folder_path / "config.json", "w"), indent=4) # Create modules.json modules = [{"idx": 0, "name": "0", "path": ".", "type": "sentence_transformers.models.StaticEmbedding"}] if config.get("normalize"): # If normalize=True, add sentence_transformers.models.Normalize modules.append({"idx": 1, "name": "1", "path": "1_Normalize", "type": "sentence_transformers.models.Normalize"}) json.dump(modules, open(folder_path / "modules.json", "w"), indent=4) logger.info(f"Saved model to {folder_path}") # Optionally create the model card if create_model_card: _create_model_card(folder_path, **kwargs) def _create_model_card( folder_path: Path, base_model_name: str = "unknown", license: str = "mit", language: list[str] | None = None, model_name: str | None = None, template_path: str = "modelcards/model_card_template.md", **kwargs: Any, ) -> None: """ Create a model card and store it in the specified path. :param folder_path: The path where the model card will be stored. :param base_model_name: The name of the base model. :param license: The license to use. :param language: The language of the model. :param model_name: The name of the model to use in the Model Card. :param template_path: The path to the template. :param **kwargs: Additional metadata for the model card (e.g., model_name, base_model, etc.). """ folder_path = Path(folder_path) model_name = model_name or folder_path.name full_path = Path(__file__).parent / template_path model_card_data = ModelCardData( model_name=model_name, base_model=base_model_name, license=license, language=language, tags=["embeddings", "static-embeddings", "sentence-transformers"], library_name="model2vec", **kwargs, ) model_card = ModelCard.from_template(model_card_data, template_path=str(full_path)) model_card.save(folder_path / "README.md") def load_pretrained( folder_or_repo_path: str | Path, subfolder: str | None = None, token: str | None = None, from_sentence_transformers: bool = False, ) -> tuple[np.ndarray, Tokenizer, dict[str, Any], dict[str, Any]]: """ Loads a pretrained model from a folder. :param folder_or_repo_path: The folder or repo path to load from. - If this is a local path, we will load from the local path. - If the local path is not found, we will attempt to load from the huggingface hub. :param subfolder: The subfolder to load from. :param token: The huggingface token to use. :param from_sentence_transformers: Whether to load the model from a sentence transformers model. :raises: FileNotFoundError if the folder exists, but the file does not exist locally. :return: The embeddings, tokenizer, config, and metadata. """ if from_sentence_transformers: model_file = "0_StaticEmbedding/model.safetensors" tokenizer_file = "0_StaticEmbedding/tokenizer.json" config_name = "config_sentence_transformers.json" else: model_file = "model.safetensors" tokenizer_file = "tokenizer.json" config_name = "config.json" folder_or_repo_path = Path(folder_or_repo_path) local_folder = folder_or_repo_path / subfolder if subfolder else folder_or_repo_path if local_folder.exists(): embeddings_path = local_folder / model_file if not embeddings_path.exists(): msg = f"Embeddings file does not exist in {local_folder}" raise FileNotFoundError(msg) config_path = local_folder / config_name if not config_path.exists(): msg = f"Config file does not exist in {local_folder}" raise FileNotFoundError(msg) tokenizer_path = local_folder / tokenizer_file if not tokenizer_path.exists(): msg = f"Tokenizer file does not exist in {local_folder}" raise FileNotFoundError(msg) # README is optional, so this is a bit finicky. readme_path = local_folder / "README.md" metadata = _get_metadata_from_readme(readme_path) else: logger.info("Folder does not exist locally, attempting to use huggingface hub.") embeddings_path = Path( huggingface_hub.hf_hub_download( folder_or_repo_path.as_posix(), model_file, token=token, subfolder=subfolder ) ) try: readme_path = Path( huggingface_hub.hf_hub_download( folder_or_repo_path.as_posix(), "README.md", token=token, subfolder=subfolder ) ) metadata = _get_metadata_from_readme(Path(readme_path)) except Exception as e: # NOTE: we don't want to raise an error here, since the README is optional. logger.info(f"No README found in the model folder: {e} No model card loaded.") metadata = {} config_path = Path( huggingface_hub.hf_hub_download( folder_or_repo_path.as_posix(), config_name, token=token, subfolder=subfolder ) ) tokenizer_path = Path( huggingface_hub.hf_hub_download( folder_or_repo_path.as_posix(), tokenizer_file, token=token, subfolder=subfolder ) ) opened_tensor_file = cast("SafeOpenProtocol", safetensors.safe_open(embeddings_path, framework="numpy")) if from_sentence_transformers: embeddings = opened_tensor_file.get_tensor("embedding.weight") else: embeddings = opened_tensor_file.get_tensor("embeddings") tokenizer: Tokenizer = Tokenizer.from_file(str(tokenizer_path)) config = json.load(open(config_path)) if len(tokenizer.get_vocab()) != len(embeddings): logger.warning( f"Number of tokens does not match number of embeddings: `{len(tokenizer.get_vocab())}` vs `{len(embeddings)}`" ) return embeddings, tokenizer, config, metadata def _get_metadata_from_readme(readme_path: Path) -> dict[str, Any]: """Get metadata from a README file.""" if not readme_path.exists(): logger.info(f"README file not found in {readme_path}. No model card loaded.") return {} model_card = ModelCard.load(readme_path) data: dict[str, Any] = model_card.data.to_dict() if not data: logger.info("File README.md exists, but was empty. No model card loaded.") return data def push_folder_to_hub( folder_path: Path, subfolder: str | None, repo_id: str, private: bool, token: str | None ) -> None: """ Push a model folder to the huggingface hub, including model card. :param folder_path: The path to the folder. :param subfolder: The subfolder to push to. If None, the folder will be pushed to the root of the repo. :param repo_id: The repo name. :param private: Whether the repo is private. :param token: The huggingface token. """ if not huggingface_hub.repo_exists(repo_id=repo_id, token=token): huggingface_hub.create_repo(repo_id, token=token, private=private) # Push model card and all model files to the Hugging Face hub huggingface_hub.upload_folder(repo_id=repo_id, folder_path=folder_path, token=token, path_in_repo=subfolder) logger.info(f"Pushed model to {repo_id}")