from __future__ import annotations

from typing import TYPE_CHECKING, cast
from pathlib import Path

from anyio import Path as AsyncPath

# tokenizers is untyped, https://github.com/huggingface/tokenizers/issues/811
# note: this comment affects the entire file
# pyright: reportUnknownMemberType=false, reportUnknownVariableType=false, reportUnknownArgumentType=false

if TYPE_CHECKING:
    # we only import this at the type-level as deferring the import
    # avoids issues like this: https://github.com/anthropics/anthropic-sdk-python/issues/280
    from tokenizers import Tokenizer as TokenizerType  # type: ignore[import]
else:
    TokenizerType = None

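# the tokenizer definition is a JSON file shipped alongside this module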
def _get_tokenizer_cache_path() -> Path:
    return Path(__file__).parent / "tokenizer.json"


_tokenizer: TokenizerType | None = None

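# parse the raw JSON definition and memoize it in the module-level
# `_tokenizer`; `tokenizers` is imported lazily here so that importing
# this module never pays the import cost up front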
def _load_tokenizer(raw: str) -> TokenizerType:
    global _tokenizer

    from tokenizers import Tokenizer

    _tokenizer = cast(TokenizerType, Tokenizer.from_str(raw))
    return _tokenizer

def sync_get_tokenizer() -> TokenizerType:
    if _tokenizer is not None:
        return _tokenizer

    tokenizer_path = _get_tokenizer_cache_path()
    text = tokenizer_path.read_text(encoding="utf-8")
    return _load_tokenizer(text)

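# async variant: anyio's Path wrapper performs the blocking file read in a
# worker thread, so the caller's event loop is never blocked on disk I/O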
async def async_get_tokenizer() -> TokenizerType:
    if _tokenizer is not None:
        return _tokenizer

    tokenizer_path = AsyncPath(_get_tokenizer_cache_path())
    text = await tokenizer_path.read_text(encoding="utf-8")
    return _load_tokenizer(text)
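

# A minimal usage sketch (added for illustration; not part of the original
# module). `Tokenizer.encode` from the `tokenizers` package returns an
# `Encoding` whose `.ids` attribute holds the token ids, so `len(encoded.ids)`
# gives the token count for the input text.
if __name__ == "__main__":
    tokenizer = sync_get_tokenizer()
    encoded = tokenizer.encode("hello world")
    print(f"token count: {len(encoded.ids)}")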