.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ framework-crop.png filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
1_Pooling/config.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "word_embedding_dimension": 1536,
+   "pooling_mode_cls_token": false,
+   "pooling_mode_mean_tokens": false,
+   "pooling_mode_max_tokens": false,
+   "pooling_mode_mean_sqrt_len_tokens": false,
+   "pooling_mode_weightedmean_tokens": false,
+   "pooling_mode_lasttoken": true,
+   "include_prompt": true
+ }
README.md CHANGED
@@ -1,3 +1,231 @@
- ---
- license: apache-2.0
- ---
+ ---
+ license: apache-2.0
+ pipeline_tag: sentence-similarity
+ base_model:
+ - Qwen/Qwen2.5-1.5B
+ tags:
+ - transformers
+ - sentence-transformers
+ - sentence-similarity
+ - feature-extraction
+ ---
+
+ <a href="https://github.com/vec-ai/lychee-embed">
+ <img src="https://img.shields.io/badge/GitHub-%23121011.svg?logo=github&logoColor=white">
+ </a>
+ <a href="https://openreview.net/pdf?id=NC6G1KCxlt">
+ <img src="https://img.shields.io/badge/Paper-Openreview-red">
+ </a>
+
+ # Lychee Embed
+
+ `Lychee-embed` is the latest generalist text embedding model built on `Qwen2.5`. It is suitable for text retrieval (semantic relevance), text similarity, and other downstream tasks, and supports the same multilingual coverage as `Qwen2.5`.
+ `Lychee-embed` is jointly developed by the NLP Team of Harbin Institute of Technology, Shenzhen, and is trained with an innovative multi-stage framework (warm-up, task learning, model merging, annealing).
+ The first open-source release is the 1.5B-parameter version.
+
+ ![The multi-stage training framework](framework-crop.png)
+
+
+ **Lychee-embed**:
+
+ - Model Type: Text Embedding
+ - Language Support: 100+ Languages
+ - Param Size: 1.5B
+ - Context Length: 8k
+ - Embedding Dim: 1536, with MRL support for custom dimensions from 32 to 1536 in steps of 32
+ - Model Precision: BF16
+
+ For more details, please refer to our [Paper](https://openreview.net/pdf?id=NC6G1KCxlt).
+
+
+ ### Model List
+
+ | Model Type | Models | Size | Layers | Sequence Length | Embedding Dimension | MRL Support | Instruction Aware |
+ |------------------|----------------------|------|--------|-----------------|---------------------|-------------|----------------|
+ | Text Embedding | [lychee-embed](https://huggingface.co/vec-ai/lychee-embed) | 1.5B | 28 | 8K | 1536 | Yes | Yes |
+ | Text Reranking | [lychee-rerank](https://huggingface.co/vec-ai/lychee-rerank) | 1.5B | 28 | 8K | - | - | Yes |
+
+
+ > **Note**:
+ > - `MRL Support` indicates whether the embedding model supports custom dimensions for the final embedding (see the truncation sketch below).
+ > - `Instruction Aware` indicates whether the embedding or reranking model supports customizing the input instruction for different tasks.
+ > - Like most embedding models, using instructions (instruct) typically yields a 1% to 5% improvement on most downstream tasks compared to not using them. We therefore recommend that developers write tailored instructions for their specific tasks and scenarios. In multilingual contexts, we also advise writing instructions in English, since most instructions used during model training were written in English.
+
+
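+ Because the model supports MRL, embeddings can be shortened before indexing. Below is a minimal sketch of the usual recipe, assuming you keep the leading dimensions and re-normalize; the target dimension of 256 and the helper name are illustrative, not part of the released API.
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def truncate_embeddings(embeddings: torch.Tensor, dim: int = 256) -> torch.Tensor:
+     """Keep the first `dim` components and L2-normalize again (illustrative helper)."""
+     truncated = embeddings[..., :dim]
+     return F.normalize(truncated, p=2, dim=-1)
+
+ # Example (assumes query/document embeddings were produced as torch tensors,
+ # e.g. via `model.encode(..., convert_to_tensor=True)` in the usage sections below):
+ # q_small = truncate_embeddings(query_embeddings, dim=256)
+ # d_small = truncate_embeddings(document_embeddings, dim=256)
+ # scores = q_small @ d_small.T
+ ```
+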
+ ## Model Usage
+
+ 📌 **Tips**: We recommend that developers customize the instruction (`instruct`) for their specific scenarios, tasks, and languages. Our tests show that in most retrieval scenarios, not using an instruction on the `query` side leads to a drop in retrieval performance of roughly 1% to 5%. A sketch of passing a custom instruction follows the Sentence Transformers example below.
+
+
+ ### Sentence Transformers Usage
+
+ ```python
+ # Requires transformers>=4.51.0
+ # Requires sentence-transformers>=2.7.0
+
+ from sentence_transformers import SentenceTransformer
+
+ # Load the model
+ model = SentenceTransformer("vec-ai/lychee-embed")
+
+ # We recommend enabling flash_attention_2 for better acceleration and memory saving,
+ # together with setting `padding_side` to "left":
+ # model = SentenceTransformer(
+ #     "vec-ai/lychee-embed",
+ #     model_kwargs={"attn_implementation": "flash_attention_2", "device_map": "auto"},
+ #     tokenizer_kwargs={"padding_side": "left"},
+ # )
+
+ # The queries and documents to embed
+ queries = [
+     "What is the capital of China?",
+     "Explain gravity",
+ ]
+ documents = [
+     "The capital of China is Beijing.",
+     "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.",
+ ]
+
+ # Encode the queries and documents. Note that queries benefit from using a prompt.
+ # Here we use the prompt called "query" stored under `model.prompts`, but you can
+ # also pass your own prompt via the `prompt` argument.
+ query_embeddings = model.encode(queries, prompt_name="query")
+ document_embeddings = model.encode(documents)
+
+ # Compute the (cosine) similarity between the query and document embeddings
+ similarity = model.similarity(query_embeddings, document_embeddings)
+ print(similarity)
+ # tensor([[0.8952, 0.4001],
+ #         [0.4668, 0.8334]])
+ ```
+
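+ A minimal sketch of a task-specific instruction passed via the `prompt` argument (the task wording below is an illustrative example rather than a prescribed prompt; it follows the `Instruct: ...\nQuery:` format of the stored "query" prompt):
+
+ ```python
+ from sentence_transformers import SentenceTransformer
+
+ model = SentenceTransformer("vec-ai/lychee-embed")
+
+ # Hypothetical task description for a customer-support FAQ retriever
+ custom_prompt = (
+     "Instruct: Given a customer support question, retrieve FAQ entries that answer it\n"
+     "Query:"
+ )
+
+ # The custom instruction is applied to the query side only
+ query_embeddings = model.encode(["How do I reset my password?"], prompt=custom_prompt)
+ document_embeddings = model.encode(
+     ["To reset your password, open Settings and choose 'Reset password'."]
+ )
+ print(model.similarity(query_embeddings, document_embeddings))
+ ```
+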
+ ### Transformers Usage
+
+ ```python
+ # Requires transformers>=4.51.0
+
+ import torch
+ from transformers import AutoTokenizer, AutoModel
+
+
+ def last_token_pool(last_hidden_states: torch.Tensor,
+                     attention_mask: torch.Tensor) -> torch.Tensor:
+     # With left padding the final position always holds the last real token;
+     # otherwise gather the hidden state at each sequence's last non-padding token.
+     left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
+     if left_padding:
+         return last_hidden_states[:, -1]
+     else:
+         sequence_lengths = attention_mask.sum(dim=1) - 1
+         batch_size = last_hidden_states.shape[0]
+         return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]
+
+
+ def get_detailed_instruct(task_description: str, query: str) -> str:
+     return f'Instruct: {task_description}\nQuery:{query}'
+
+ # Each query must come with a one-sentence instruction that describes the task
+ task = 'Given a web search query, retrieve relevant passages that answer the query'
+
+ queries = [
+     get_detailed_instruct(task, 'What is the capital of China?'),
+     get_detailed_instruct(task, 'Explain gravity')
+ ]
+ # No need to add instructions to the retrieval documents
+ documents = [
+     "The capital of China is Beijing.",
+     "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun."
+ ]
+ input_texts = queries + documents
+
+ tokenizer = AutoTokenizer.from_pretrained('vec-ai/lychee-embed', padding_side='left')
+ model = AutoModel.from_pretrained('vec-ai/lychee-embed')
+
+ # We recommend enabling flash_attention_2 for better acceleration and memory saving.
+ # model = AutoModel.from_pretrained('vec-ai/lychee-embed', attn_implementation="flash_attention_2", torch_dtype=torch.float16).cuda()
+
+ max_length = 8192
+
+ # Tokenize the input texts
+ batch_dict = tokenizer(
+     input_texts,
+     padding=True,
+     truncation=True,
+     max_length=max_length,
+     return_tensors="pt",
+ )
+ batch_dict.to(model.device)
+ outputs = model(**batch_dict)
+ embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
+
+ # Normalize embeddings so the dot product below equals cosine similarity
+ embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
+ scores = (embeddings[:2] @ embeddings[2:].T)
+ print(scores.tolist())
+ # [[0.8952088952064514, 0.40010833740234375], [0.4668009877204895, 0.8333653807640076]]
+ ```
+
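+ To turn the score matrix into ranked results, here is a small follow-up sketch reusing `scores`, `queries`, and `documents` from the block above (the top-k value is arbitrary):
+
+ ```python
+ # Rank documents for each query by cosine similarity
+ top_k = min(2, scores.shape[1])
+ top_scores, top_indices = torch.topk(scores, k=top_k, dim=1)
+ for qi, query in enumerate(queries):
+     print(query)
+     for score, di in zip(top_scores[qi].tolist(), top_indices[qi].tolist()):
+         print(f"  {score:.4f}  {documents[di]}")
+ ```
+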
+ ### vLLM Usage
+
+ ```python
+ # Requires vllm>=0.8.5
+ import torch
+ from vllm import LLM
+
+ def get_detailed_instruct(task_description: str, query: str) -> str:
+     return f'Instruct: {task_description}\nQuery:{query}'
+
+ # Each query must come with a one-sentence instruction that describes the task
+ task = 'Given a web search query, retrieve relevant passages that answer the query'
+
+ queries = [
+     get_detailed_instruct(task, 'What is the capital of China?'),
+     get_detailed_instruct(task, 'Explain gravity')
+ ]
+ # No need to add instructions to the retrieval documents
+ documents = [
+     "The capital of China is Beijing.",
+     "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun."
+ ]
+ input_texts = queries + documents
+
+ model = LLM(model="vec-ai/lychee-embed", task="embed")
+
+ outputs = model.embed(input_texts)
+ embeddings = torch.tensor([o.outputs.embedding for o in outputs])
+ scores = (embeddings[:2] @ embeddings[2:].T)
+ print(scores.tolist())
+ # [[0.9007290601730347, 0.4043760895729065], [0.469818651676178, 0.8317853212356567]]
+ ```
+
+
+ ## Evaluation
+
+ | Model | Param | MTEB | CMTEB | MMTEB | MLDR | MTEB-Code | ToolBench | FollowIR | BRIGHT |
+ |---|---|---|---|---|---|---|---|---|---|
+ | BGE-multilingual | 9.24B | 69.88 | 68.44 | 61.25 | 49.10 | 62.04 | 63.65 | -2.13 | 17.68 |
+ | NV-Embed-v2 | 7.85B | 72.31 | - | 56.25 | - | 63.74 | 50.54 | 1.04 | 19.28 |
+ | GritLM-7B | 7.24B | 66.8 | - | 60.93 | - | 73.6 | 35.42 | 3.45 | 20.63 |
+ | E5-mistral | 7.11B | 66.6 | 59.92 | 60.28 | - | 69.2 | 31.79 | -0.62 | 17.54 |
+ | GTE-Qwen2-7B | 7.62B | 69.88 | 71.62 | 62.51 | 56.53 | 62.17 | 59.48 | 4.94 | 22.89 |
+ | GTE-Qwen2-1.5B | 1.54B | 67.19 | 67.12 | 59.47 | 52.11 | 61.98 | 62.57 | 0.74 | 18.47 |
+ | BGE-M3 (Dense) | 0.56B | 59.84 | 61.79 | 59.54 | 52.50 | 58.22 | 58.45 | -3.11 | 11.94 |
+ | Jina-v3 | 0.57B | 65.52 | 63.07 | 58.37 | 40.71 | 58.85 | 59.64 | -1.34 | 11.34 |
+ | Qwen3-Embedding-8B | 7.57B | - | 73.84 | 70.58 | - | 80.68 | - | - | - |
+ | Qwen3-Embedding-4B | 4.02B | - | 72.27 | 69.45 | - | 80.06 | - | - | - |
+ | Qwen3-Embedding-0.6B | 0.60B | - | 66.33 | 64.33 | - | 75.41 | - | - | - |
+ | **Lychee-embed** | 1.54B | 68.39 | 69.77 | 58.43 | 53.85 | 72.54 | 86.35 | 5.74 | 19.47 |
+
+ For more details, please refer to our [Paper](https://openreview.net/pdf?id=NC6G1KCxlt).
+
+ ## Citation
+
+ If you find our work helpful, please consider citing:
+
+ ```
+ @inproceedings{zhang2025phased,
+   title={Phased Training for LLM-powered Text Retrieval Models Beyond Data Scaling},
+   author={Xin Zhang and Yanzhao Zhang and Wen Xie and Dingkun Long and Mingxin Li and Pengjun Xie and Meishan Zhang and Wenjie Li and Min Zhang},
+   booktitle={Second Conference on Language Modeling},
+   year={2025},
+   url={https://openreview.net/forum?id=NC6G1KCxlt}
+ }
+ ```
added_tokens.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "</tool_call>": 151658,
+   "<tool_call>": 151657,
+   "<|box_end|>": 151649,
+   "<|box_start|>": 151648,
+   "<|endoftext|>": 151643,
+   "<|file_sep|>": 151664,
+   "<|fim_middle|>": 151660,
+   "<|fim_pad|>": 151662,
+   "<|fim_prefix|>": 151659,
+   "<|fim_suffix|>": 151661,
+   "<|im_end|>": 151645,
+   "<|im_start|>": 151644,
+   "<|image_pad|>": 151655,
+   "<|object_ref_end|>": 151647,
+   "<|object_ref_start|>": 151646,
+   "<|quad_end|>": 151651,
+   "<|quad_start|>": 151650,
+   "<|repo_name|>": 151663,
+   "<|video_pad|>": 151656,
+   "<|vision_end|>": 151653,
+   "<|vision_pad|>": 151654,
+   "<|vision_start|>": 151652
+ }
chat_template.jinja ADDED
@@ -0,0 +1,54 @@
+ {%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0]['role'] == 'system' %}
+ {{- messages[0]['content'] }}
+ {%- else %}
+ {{- 'You are a helpful assistant.' }}
+ {%- endif %}
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+ {%- else %}
+ {%- if messages[0]['role'] == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+ {%- else %}
+ {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+ {%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {{- '<|im_start|>' + message.role }}
+ {%- if message.content %}
+ {{- '\n' + message.content }}
+ {%- endif %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if tool_call.function is defined %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {{- tool_call.arguments | tojson }}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+ {%- endfor %}
+ {%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- endif %}
config.json ADDED
@@ -0,0 +1,59 @@
+ {
+   "architectures": [
+     "Qwen2Model"
+   ],
+   "attention_dropout": 0.0,
+   "bos_token_id": 151643,
+   "eos_token_id": 151643,
+   "hidden_act": "silu",
+   "hidden_size": 1536,
+   "initializer_range": 0.02,
+   "intermediate_size": 8960,
+   "layer_types": [
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention"
+   ],
+   "max_position_embeddings": 32768,
+   "max_window_layers": 28,
+   "model_type": "qwen2",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 28,
+   "num_key_value_heads": 2,
+   "rms_norm_eps": 1e-06,
+   "rope_scaling": null,
+   "rope_theta": 1000000.0,
+   "sliding_window": null,
+   "tie_word_embeddings": true,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.53.0",
+   "use_cache": true,
+   "use_mrope": false,
+   "use_sliding_window": false,
+   "vocab_size": 151665
+ }
config_sentence_transformers.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "prompts": {
+     "query": "Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery:",
+     "document": ""
+   },
+   "default_prompt_name": null,
+   "similarity_fn_name": "cosine"
+ }
framework-crop.png ADDED

Git LFS Details

  • SHA256: eade35b0f8eca610087da421fef85075547a7cd0e5636a4b364d61fa3e092341
  • Pointer size: 131 Bytes
  • Size of remote file: 225 kB
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a38c9a345e2427303d0f49aeccbbb7c088fa25b540888a90866b3f1fa69ecaf6
+ size 3086632608
modules.json ADDED
@@ -0,0 +1,20 @@
+ [
+   {
+     "idx": 0,
+     "name": "0",
+     "path": "",
+     "type": "sentence_transformers.models.Transformer"
+   },
+   {
+     "idx": 1,
+     "name": "1",
+     "path": "1_Pooling",
+     "type": "sentence_transformers.models.Pooling"
+   },
+   {
+     "idx": 2,
+     "name": "2",
+     "path": "2_Normalize",
+     "type": "sentence_transformers.models.Normalize"
+   }
+ ]
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
+ {
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>",
+     "<|object_ref_start|>",
+     "<|object_ref_end|>",
+     "<|box_start|>",
+     "<|box_end|>",
+     "<|quad_start|>",
+     "<|quad_end|>",
+     "<|vision_start|>",
+     "<|vision_end|>",
+     "<|vision_pad|>",
+     "<|image_pad|>",
+     "<|video_pad|>"
+   ],
+   "eos_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:83cdf8c3a34f68862319cb1810ee7b1e2c0a44e0864ae930194ddb76bb7feb8d
+ size 11422947
tokenizer_config.json ADDED
@@ -0,0 +1,207 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|endoftext|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "split_special_tokens": false,
205
+ "tokenizer_class": "Qwen2Tokenizer",
206
+ "unk_token": null
207
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff