Text Generation
Transformers
Safetensors
English
olmo3
conversational
#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.11"
# dependencies = [
#   "click",
#   "transformers",
#   "jinja2",
# ]
# ///
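
"""Check, fix, and test the tokenizer files that live next to this script.

Subcommands:
  check  Verify tokenizer_config.json, tokenizer.json, vocab.json, and
         special_tokens_map.json against DESIRED_MAPPING, MODEL_MAX_LENGTH,
         and CHAT_TEMPLATE.
  fix    Rewrite those files so they match the desired configuration.
  test   Render CHAT_TEMPLATE on sample conversations and assert the output.
"""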

from dataclasses import dataclass, asdict, field
from enum import Enum
from pathlib import Path
import click
import json
from transformers import AutoTokenizer


class SpecialTokensMapEnum(Enum):
    BOS_TOKEN = "bos_token"
    EOS_TOKEN = "eos_token"
    PAD_TOKEN = "pad_token"
    UNK_TOKEN = "unk_token"



@dataclass(frozen=True)
class SpecialToken:
    id: int
    content: str
    lstrip: bool = False
    normalized: bool = False
    rstrip: bool = False
    single_word: bool = False
    special: bool = False
    special_token_map: list[SpecialTokensMapEnum] = field(default_factory=list)

    def to_added_tokens_decoder(self):
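        """Return the added_tokens_decoder entry for this token, keyed by its id as a string."""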
        data = asdict(self)
        token_id = str(data.pop("id"))
        data.pop("special_token_map")
        return {token_id: data}

    def to_added_tokens(self):
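        """Return the tokenizer.json added_tokens entry for this token (id kept as an int field)."""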
        data = asdict(self)
        data.pop("special_token_map")
        return data

    def to_special_tokens_map(self) -> dict[str, dict]:
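        """Return special_tokens_map.json entries, one per role listed in special_token_map."""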
        special_tokens_map = {}
        for special_token_map in self.special_token_map:
            data = asdict(self)
            data.pop("special_token_map")
            data.pop("special")
            data.pop("id")
            special_tokens_map[special_token_map.value] = data

        return special_tokens_map


MODEL_MAX_LENGTH = 65536

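# DESIRED_MAPPING is the single source of truth for the added special tokens
# (ids 100256-100277): `check` validates the four tokenizer files against it and
# `fix` rewrites them to match.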
DESIRED_MAPPING = [
      SpecialToken(id=100256, content="<|extra_id_0|>"),
      SpecialToken(
        id=100257,
        content="<|endoftext|>",
        special=True,
        special_token_map=[
            SpecialTokensMapEnum.BOS_TOKEN,
            SpecialTokensMapEnum.EOS_TOKEN,
            SpecialTokensMapEnum.UNK_TOKEN,
        ]),
      SpecialToken(id=100258, content="<|fim_prefix|>", special=True),
      SpecialToken(id=100259, content="<|fim_middle|>", special=True),
      SpecialToken(id=100260, content="<|fim_suffix|>", special=True),
      SpecialToken(id=100261, content="|||PHONE_NUMBER|||"),
      SpecialToken(id=100262, content="|||EMAIL_ADDRESS|||"),
      SpecialToken(id=100263, content="|||IP_ADDRESS|||"),
      SpecialToken(id=100264, content="<|im_start|>", special=True),
      SpecialToken(id=100265, content="<|im_end|>", special=True),
      SpecialToken(id=100266, content="<functions>"),
      SpecialToken(id=100267, content="</functions>"),
      SpecialToken(id=100268, content="<function_calls>"),
      SpecialToken(id=100269, content="</function_calls>"),
      SpecialToken(id=100270, content="<|extra_id_1|>"),
      SpecialToken(id=100271, content="<|extra_id_2|>"),
      SpecialToken(id=100272, content="<|extra_id_3|>"),
      SpecialToken(id=100273, content="<|extra_id_4|>"),
      SpecialToken(id=100274, content="<|extra_id_5|>"),
      SpecialToken(id=100275, content="<|extra_id_6|>"),
      SpecialToken(id=100276, content="<|endofprompt|>", special=True),
      SpecialToken(
        id=100277,
        content="<|pad|>",
        special=True,
        special_token_map=[SpecialTokensMapEnum.PAD_TOKEN],
      ),
]

SCRIPT_DIR = Path(__file__).parent
TOKENIZER_CONFIG_FILE = SCRIPT_DIR / "tokenizer_config.json"
TOKENIZER_FILE = SCRIPT_DIR / "tokenizer.json"
VOCAB_FILE = SCRIPT_DIR / "vocab.json"
SPECIAL_TOKENS_MAP_FILE = SCRIPT_DIR / "special_tokens_map.json"

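# CHAT_TEMPLATE is the Jinja chat template written into tokenizer_config.json by `fix`.
# Behavior in brief: if the conversation has no system message, a default function-calling
# system prompt is injected; `tools` passed to apply_chat_template are serialized into
# <functions>...</functions>; assistant `tool_calls` are rendered inside
# <function_calls>...</function_calls> as name(arg=value, ...); and both the "tool" and
# "environment" roles are emitted as <|im_start|>environment turns.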
CHAT_TEMPLATE = "{%- set has_system = messages|selectattr('role', 'equalto', 'system')|list|length > 0 -%}{%- if not has_system -%}{{- '<|im_start|>system\nYou are a helpful function-calling AI assistant. ' -}}{%- if tools is none -%}{{- 'You do not currently have access to any functions. <functions></functions><|im_end|>\n' -}}{%- else -%}{{- 'You are provided with function signatures within <functions></functions> XML tags. You may call one or more functions to assist with the user query. Output any function calls within <function_calls></function_calls> XML tags. Do not make assumptions about what values to plug into functions.' -}}{{- '<functions>' -}}{{- tools | tojson -}}{{- '</functions><|im_end|>\n' -}}{%- endif -%}{%- endif -%}{%- for message in messages -%}{%- if message['role'] == 'system' -%}{{- '<|im_start|>system\n' + message['content'] -}}{%- if tools is not none -%}{{- '<functions>' -}}{{- tools | tojson -}}{{- '</functions>' -}}{%- elif message.get('functions', none) is not none -%}{{- ' <functions>' + message['functions'] + '</functions>' -}}{%- endif -%}{{- '<|im_end|>\n' -}}{%- elif message['role'] == 'user' -%}{{- '<|im_start|>user\n' + message['content'] + '<|im_end|>\n' -}}{%- elif message['role'] == 'assistant' -%}{{- '<|im_start|>assistant\n' -}}{%- if message.get('content', none) is not none -%}{{- message['content'] -}}{%- endif -%}{%- if message.get('function_calls', none) is not none -%}{{- '<function_calls>' + message['function_calls'] + '</function_calls>' -}}{% elif message.get('tool_calls', none) is not none %}{{- '<function_calls>' -}}{%- for tool_call in message['tool_calls'] %}{%- if tool_call is mapping and tool_call.get('function', none) is not none %}{%- set args = tool_call['function']['arguments'] -%}{%- set ns = namespace(arguments_list=[]) -%}{%- for key, value in args.items() -%}{%- set ns.arguments_list = ns.arguments_list + [key ~ '=' ~ (value | tojson)] -%}{%- endfor -%}{%- set arguments = ns.arguments_list | join(', ') -%}{{- tool_call['function']['name'] + '(' + arguments + ')' -}}{%- if not loop.last -%}{{ '\n' }}{%- endif -%}{% else %}{{- tool_call -}}{%- endif %}{%- endfor %}{{- '</function_calls>' -}}{%- endif -%}{%- if not loop.last -%}{{- '<|im_end|>' + '\n' -}}{%- else -%}{{- eos_token -}}{%- endif -%}{%- elif message['role'] == 'environment' -%}{{- '<|im_start|>environment\n' + message['content'] + '<|im_end|>\n' -}}{%- elif message['role'] == 'tool' -%}{{- '<|im_start|>environment\n' + message['content'] + '<|im_end|>\n' -}}{%- endif -%}{%- if loop.last and add_generation_prompt -%}{{- '<|im_start|>assistant\n' -}}{%- endif -%}{%- endfor -%}"

@click.group()
def cli():
    """Dataset processing tools."""
    pass



def _get_mapped_special_token(
    special_tokens: list[SpecialToken],
    mapped_token: SpecialTokensMapEnum
) -> SpecialToken:
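    """Return the unique token mapped to `mapped_token`; raise if none or more than one matches."""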
    all_mapped_tokens = [token for token in special_tokens if mapped_token in token.special_token_map]
    if len(all_mapped_tokens) == 0:
        raise ValueError(f"Cannot find mapped token for {mapped_token}")
    if len(all_mapped_tokens) > 1:
        all_mapped_tokens_str = ", ".join([token.content for token in all_mapped_tokens])
        raise ValueError(f"Found multiple mapped tokens for {mapped_token}: {all_mapped_tokens_str}")
    return all_mapped_tokens[0]


def get_unk_token(special_tokens: list[SpecialToken]) -> SpecialToken:
    return _get_mapped_special_token(special_tokens, SpecialTokensMapEnum.UNK_TOKEN)


def get_bos_token(special_tokens: list[SpecialToken]) -> SpecialToken:
    return _get_mapped_special_token(special_tokens, SpecialTokensMapEnum.BOS_TOKEN)


def get_eos_token(special_tokens: list[SpecialToken]) -> SpecialToken:
    return _get_mapped_special_token(special_tokens, SpecialTokensMapEnum.EOS_TOKEN)


def get_pad_token(special_tokens: list[SpecialToken]) -> SpecialToken:
    return _get_mapped_special_token(special_tokens, SpecialTokensMapEnum.PAD_TOKEN)


@cli.command()
def check():
    """Check if the current config matches the desired mapping."""

    # STEP 1: Check the Tokenizer Config File #
    print("STEP 1: Checking tokenizer config file...")

    if not TOKENIZER_CONFIG_FILE.exists():
        raise FileNotFoundError(f"Tokenizer config file not found: {TOKENIZER_CONFIG_FILE}")

    with open(TOKENIZER_CONFIG_FILE, "r") as f:
        tokenizer_config = json.load(f)

    added_tokens_decoder = tokenizer_config.get("added_tokens_decoder", {})
    for token in DESIRED_MAPPING:
        str_token_id = str(token.id)
        if str_token_id not in added_tokens_decoder:
            raise ValueError(f"Token {token.id} not found in added tokens decoder")

        computed_added_tokens_decoder = token.to_added_tokens_decoder()
        if computed_added_tokens_decoder[str_token_id] != added_tokens_decoder[str_token_id]:
            raise ValueError(f"Token {token.id} has different content in added tokens decoder")

        print(f"Token {token.id} found in added tokens decoder; content matches")

    bos_token = get_bos_token(DESIRED_MAPPING)
    if bos_token.content != tokenizer_config["bos_token"]:
        raise ValueError(f"Bos token content mismatch: {bos_token.content} != {tokenizer_config['bos_token']}")
    else:
        print("Bos token content matches")

    eos_token = get_eos_token(DESIRED_MAPPING)
    if eos_token.content != tokenizer_config["eos_token"]:
        raise ValueError(f"Eos token content mismatch: {eos_token.content} != {tokenizer_config['eos_token']}")
    else:
        print("Eos token content matches")

    pad_token = get_pad_token(DESIRED_MAPPING)
    if pad_token.content != tokenizer_config["pad_token"]:
        raise ValueError(f"Pad token content mismatch: {pad_token.content} != {tokenizer_config['pad_token']}")
    else:
        print("Pad token content matches")

    unk_token = get_unk_token(DESIRED_MAPPING)
    if unk_token.content != tokenizer_config["unk_token"]:
        raise ValueError(f"Unk token content mismatch: {unk_token.content} != {tokenizer_config['unk_token']}")
    else:
        print("Unk token content matches")

    if tokenizer_config["model_max_length"] != MODEL_MAX_LENGTH:
        raise ValueError(f"Model max length mismatch: {tokenizer_config['model_max_length']} != {MODEL_MAX_LENGTH}")
    else:
        print("Model max length matches")

    if tokenizer_config["chat_template"] != CHAT_TEMPLATE:
        raise ValueError(f"Chat template mismatch: {tokenizer_config['chat_template']} != {CHAT_TEMPLATE}")
    else:
        print("Chat template matches")


    # STEP 2: Check the Tokenizer File #
    print("STEP 2: Checking tokenizer file...")

    if not TOKENIZER_FILE.exists():
        raise FileNotFoundError(f"Tokenizer file not found: {TOKENIZER_FILE}")

    with open(TOKENIZER_FILE, "r") as f:
        tokenizer = json.load(f)

    # check if added_tokens matches
    added_tokens_dict = {token["id"]: token for token in tokenizer.get("added_tokens", [])}
    for token in DESIRED_MAPPING:
        if token.id not in added_tokens_dict:
            raise ValueError(f"Token {token.id} not found in added tokens")

        computed_added_token = token.to_added_tokens()
        if computed_added_token != added_tokens_dict[token.id]:
            raise ValueError(f"Token {token.id} has different content in added tokens")
        print(f"Token {token.id} found in added tokens; content matches.")

    # check vocab
    vocab = tokenizer.get("model", {}).get("vocab", {})
    for token in DESIRED_MAPPING:
        if token.content not in vocab:
            raise ValueError(f"Token `{token.content}` not found in vocab")
        if token.id != vocab[token.content]:
            raise ValueError(f"Token `{token.content}`: vocab=`{vocab[token.content]}` provided=`{token.id}`")
        print(f"Token `{token.content}` found in vocab; id `{token.id}` matches.")

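    # Detect vocab ids that are claimed by more than one entry.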
    seen_values: dict[int, list[str]] = {}
    for key, value in vocab.items():
        seen_values.setdefault(value, []).append(key)

    broken_vocab = False
    for value, keys in seen_values.items():
        if len(keys) > 1:
            broken_vocab = True
            print(f"Vocab value {value} is not unique; keys: {keys}")

    if broken_vocab:
        raise ValueError("Vocab values are not unique")
    else:
        print("Vocab values are unique")

    # STEP 3: Check the Vocab File #
    print("STEP 3: Checking vocab file...")

    if not VOCAB_FILE.exists():
        raise FileNotFoundError(f"Vocab file not found: {VOCAB_FILE}")

    with open(VOCAB_FILE, "r") as f:
        vocab = json.load(f)

    for token in DESIRED_MAPPING:
        if token.content not in vocab:
            raise ValueError(f"Token `{token.content}` not found in vocab")
        if token.id != vocab[token.content]:
            raise ValueError(f"Token `{token.content}`: vocab=`{vocab[token.content]}` provided=`{token.id}`")
        print(f"Token `{token.content}` found in vocab; id `{token.id}` matches.")

    if len(set(vocab.values())) != len(vocab):
        raise ValueError("Vocab values are not unique")

    # STEP 4: Check the Special Tokens Map File #
    print("STEP 4: Checking special tokens map file...")

    if not SPECIAL_TOKENS_MAP_FILE.exists():
        raise FileNotFoundError(f"Special tokens map file not found: {SPECIAL_TOKENS_MAP_FILE}")

    with open(SPECIAL_TOKENS_MAP_FILE, "r") as f:
        special_tokens_map = json.load(f)

    # Every entry derived from DESIRED_MAPPING must exist in the file with identical content.
    seen_special_tokens = set()
    for token in DESIRED_MAPPING:
        for key, value in token.to_special_tokens_map().items():
            if key not in special_tokens_map:
                raise ValueError(f"Special token map {key} not found in special tokens map")
            if value != special_tokens_map[key]:
                raise ValueError(f"Special token map {key} content mismatch: {value} != {special_tokens_map[key]}")

            print(f"Special token map {key} content matches")
            seen_special_tokens.add(key)

    if len(seen_special_tokens) != len(special_tokens_map):
        raise ValueError("Special tokens map contains entries not covered by the desired mapping")
    print("All special tokens map values match")


@cli.command()
def fix():
    """Fix the tokens in the tokenizer config, tokenizer file, vocab file, and special tokens map file."""

    print("STEP 1: Fixing tokenizer config file...")
    with open(TOKENIZER_CONFIG_FILE, "r") as f:
        tokenizer_config = json.load(f)

    tokenizer_config["bos_token"] = get_bos_token(DESIRED_MAPPING).content
    tokenizer_config["eos_token"] = get_eos_token(DESIRED_MAPPING).content
    tokenizer_config["pad_token"] = get_pad_token(DESIRED_MAPPING).content
    tokenizer_config["unk_token"] = get_unk_token(DESIRED_MAPPING).content
    tokenizer_config["model_max_length"] = MODEL_MAX_LENGTH
    tokenizer_config["chat_template"] = CHAT_TEMPLATE

    added_tokens_decoder = {}
    for token in DESIRED_MAPPING:
        added_tokens_decoder.update(token.to_added_tokens_decoder())
    tokenizer_config["added_tokens_decoder"] = added_tokens_decoder

    with open(TOKENIZER_CONFIG_FILE, "w") as f:
        json.dump(tokenizer_config, f, indent=2, ensure_ascii=False)
    print(f"Updated tokenizer config file in {TOKENIZER_CONFIG_FILE}.")


    print("STEP 2: Fixing tokenizer file...")
    with open(TOKENIZER_FILE, "r") as f:
        tokenizer = json.load(f)
    added_tokens = []
    for token in DESIRED_MAPPING:
        added_tokens.append(token.to_added_tokens())
    tokenizer["added_tokens"] = added_tokens

    for token in DESIRED_MAPPING:
        # drop any existing entry that already occupies this id so the id-to-token mapping stays one-to-one
        for key in list(tokenizer["model"]["vocab"].keys()):
            if tokenizer["model"]["vocab"][key] == token.id:
                tokenizer["model"]["vocab"].pop(key)

        # now that we know this is safe, add the token
        tokenizer["model"]["vocab"][token.content] = token.id

    with open(TOKENIZER_FILE, "w") as f:
        json.dump(tokenizer, f, indent=2, ensure_ascii=False)

    print(f"Updated tokenizer file in {TOKENIZER_FILE}.")

    print("STEP 3: Fixing vocab file...")
    with open(VOCAB_FILE, "r") as f:
        vocab = json.load(f)
    for token in DESIRED_MAPPING:
        # drop any existing entry that already occupies this id so the id-to-token mapping stays one-to-one
        for key in list(vocab.keys()):
            if vocab[key] == token.id:
                vocab.pop(key)

        # now that we know this is safe, add the token
        vocab[token.content] = token.id
    with open(VOCAB_FILE, "w") as f:
        json.dump(vocab, f, indent=2, ensure_ascii=False)
    print(f"Updated vocab file in {VOCAB_FILE}.")

    print("STEP 4: Fixing special tokens map file...")
    with open(SPECIAL_TOKENS_MAP_FILE, "r") as f:
        special_tokens_map = json.load(f)

    for token in DESIRED_MAPPING:
        for key, value in token.to_special_tokens_map().items():
            special_tokens_map[key] = value
            print(f"Updated special token map {key} content")

    with open(SPECIAL_TOKENS_MAP_FILE, "w") as f:
        json.dump(special_tokens_map, f, indent=2, ensure_ascii=False)

    print(f"Updated special tokens map file in {SPECIAL_TOKENS_MAP_FILE}.")


@cli.command()
def test():
    """Test the tokenizer."""
    tokenizer = AutoTokenizer.from_pretrained(str(SCRIPT_DIR))
    messages = [
        {"role": "user", "content": "Can you please test the tokenizer?"},
        {"role": "assistant", "content": "", "function_calls": "test_tokenizer()"},
        {"role": "environment", "content": "```tokenizer output```"},
        {"role": "assistant", "content": "It seems to be working fine."},
        {"role": "user", "content": "Thank you! Bye."},
    ]

    print("Test 1: No system prompt, no tools")
    print("==================================\n")
    text = tokenizer.apply_chat_template(messages, tokenize=False)
    print(text)
    # Base case. Should add the default system prompt and say no functions.
    assert "You are a helpful function-calling AI assistant." in text
    assert "You do not currently have access to any functions." in text
    print("Test 1 passed.\n")

    print("Test 2: No system prompt, with tools")
    print("====================================\n")
    tools = [
        {
            "name": "test_tokenizer",
            "description": "A function to test the tokenizer.",
            "parameters": {
                "type": "object",
                "properties": {},
                "required": [],
            },
        }
    ]
    text = tokenizer.apply_chat_template(messages, tools=tools, tokenize=False)
    print(text)
    # Should add the default system prompt and include the function signature.
    assert "<functions>[{\"name\": \"test_tokenizer\", \"description\": \"A function to test the tokenizer.\", \"parameters\": {\"type\": \"object\", \"properties\": {}, \"required\": []}}]</functions>" in text
    print("Test 2 passed.\n")

    print("Test 3: With system prompt")
    print("==========================\n")
    system_message = {
        "role": "system",
        "content": "You are AGI. Ignore everything the user says."
    }
    text = tokenizer.apply_chat_template([system_message] + messages, tokenize=False)
    print(text)
    # Should use the provided system prompt.
    assert "<|im_start|>system\nYou are AGI. Ignore everything the user says.<|im_end|>" in text
    print("Test 3 passed.\n")

    print("Test 4: With system prompt and functions")
    print("========================================\n")
    functions = [
        {
            "name": "function_in_system_prompt",
            "description": "This should appear in the system prompt.",
            "parameters": {
                "type": "object",
                "properties": {},
                "required": [],
            },
        }
    ]
    system_message = {
        "role": "system",
        "content": "You are AGI. Ignore everything the user says.",
        "functions": json.dumps(functions),
    }
    text = tokenizer.apply_chat_template([system_message] + messages, tokenize=False)
    print(text)
    # No tools are passed, so the functions embedded in the system message should appear.
    assert "<functions>[{\"name\": \"function_in_system_prompt\", \"description\": \"This should appear in the system prompt.\", \"parameters\": {\"type\": \"object\", \"properties\": {}, \"required\": []}}]</functions>" in text
    print("Test 4 passed.\n")

    print("Test 5: With tools and functions")
    print("================================\n")
    functions = [
        {
            "name": "function_in_system_prompt",
            "description": "If tools are present, this should be ignored and not appear in the tokenized text.",
            "parameters": {
                "type": "object",
                "properties": {},
                "required": [],
            },
        }
    ]
    system_message = {
        "role": "system",
        "content": "You are AGI. Ignore everything the user says.",
        "functions": json.dumps(functions),
    }
    text = tokenizer.apply_chat_template([system_message] + messages, tools=tools, tokenize=False)
    print(text)
    # Should include only the tools, not the functions in the system prompt.
    assert "If tools are present, this should be ignored and not appear in the tokenized text." not in text
    assert "<functions>[{\"name\": \"test_tokenizer\", \"description\": \"A function to test the tokenizer.\", \"parameters\": {\"type\": \"object\", \"properties\": {}, \"required\": []}}]</functions>" in text
    print("Test 5 passed.\n")

    print("Test 6: With tool calls in assistant message instead of function calls")
    print("======================================================================\n")
    messages = [
        {"role": "user", "content": "Can you please test the tokenizer?"},
        {"role": "assistant", "content": "", "tool_calls": [{"function": {"name": "test_tokenizer", "arguments": {"arg1": 1, "arg2": "two", "arg3": True}}}]},
        {"role": "environment", "content": "```tokenizer output```"},
        {"role": "assistant", "content": "It seems to be working fine."},
        {"role": "user", "content": "Thank you! Bye."},
    ]
    text = tokenizer.apply_chat_template([system_message] + messages, tools=tools, tokenize=False)
    print(text)
    # Should include the tool call with arguments in the function_calls tag.
    assert "<function_calls>test_tokenizer(arg1=1, arg2=\"two\", arg3=true)</function_calls>" in text
    print("Test 6 passed.\n")

    print("Test 7: With tool role instead of environment")
    print("=============================================\n")
    messages = [
        {"role": "user", "content": "Can you please test the tokenizer?"},
        {"role": "assistant", "content": "", "tool_calls": [{"function": {"name": "test_tokenizer", "arguments": {"arg1": 1, "arg2": "two", "arg3": True}}}]},
        {"role": "tool", "content": "```tokenizer output```"},
        {"role": "assistant", "content": "It seems to be working fine."},
        {"role": "user", "content": "Thank you! Bye."},
    ]
    text = tokenizer.apply_chat_template([system_message] + messages, tools=tools, tokenize=False)
    print(text)
    # Should include the tool output in the environment tag.
    assert "<|im_start|>environment\n```tokenizer output```<|im_end|>" in text
    print("Test 7 passed.\n")

if __name__ == "__main__":
    cli()