from __future__ import annotations

from typing import TYPE_CHECKING, cast
from pathlib import Path

from anyio import Path as AsyncPath

# tokenizers is untyped, https://github.com/huggingface/tokenizers/issues/811
# note: the pyright directive below applies to the entire file
# pyright: reportUnknownMemberType=false, reportUnknownVariableType=false, reportUnknownArgumentType=false
if TYPE_CHECKING:
    # we only import this at the type-level as deferring the import
    # avoids issues like this: https://github.com/anthropics/anthropic-sdk-python/issues/280
    from tokenizers import Tokenizer as TokenizerType  # type: ignore[import]
else:
    # runtime placeholder so that `cast(TokenizerType, ...)` below can
    # evaluate its first argument without importing `tokenizers`
    TokenizerType = None


def _get_tokenizer_cache_path() -> Path:
    return Path(__file__).parent / "tokenizer.json"


_tokenizer: TokenizerType | None = None


def _load_tokenizer(raw: str) -> TokenizerType:
    """Parse a serialized tokenizer definition, cache it module-wide, and return it."""
    global _tokenizer

    # deferred runtime import, see the TYPE_CHECKING note above
    from tokenizers import Tokenizer

    _tokenizer = cast(TokenizerType, Tokenizer.from_str(raw))
    return _tokenizer


def sync_get_tokenizer() -> TokenizerType:
    """Return the cached tokenizer, loading it from disk on first use."""
    if _tokenizer is not None:
        return _tokenizer

    tokenizer_path = _get_tokenizer_cache_path()
    text = tokenizer_path.read_text(encoding="utf-8")
    return _load_tokenizer(text)


async def async_get_tokenizer() -> TokenizerType:
    """Return the cached tokenizer, reading it from disk on first use without blocking the event loop."""
    if _tokenizer is not None:
        return _tokenizer

    tokenizer_path = AsyncPath(_get_tokenizer_cache_path())
    text = await tokenizer_path.read_text(encoding="utf-8")
    return _load_tokenizer(text)