Mengyao00 committed
Commit ba5e03e · verified
1 Parent(s): 04cb50e

Upload folder using huggingface_hub
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
LICENSE CHANGED
@@ -0,0 +1,41 @@
+ # NVIDIA License
+
+ ## 1. Definitions
+
+ - “Licensor” means any person or entity that distributes its Work.
+ - “Work” means (a) the original work of authorship made available under this license, which may include software, documentation, or other files, and (b) any additions to or derivative works thereof that are made available under this license.
+ - The terms “reproduce,” “reproduction,” “derivative works,” and “distribution” have the meaning as provided under U.S. copyright law; provided, however, that for the purposes of this license, derivative works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work.
+ - Works are “made available” under this license by including in or with the Work either (a) a copyright notice referencing the applicability of this license to the Work, or (b) a copy of this license.
+
+ ## 2. License Grant
+
+ ### 2.1 Copyright Grant.
+ Subject to the terms and conditions of this license, each Licensor grants to you a perpetual, worldwide, non-exclusive, royalty-free, copyright license to use, reproduce, prepare derivative works of, publicly display, publicly perform, sublicense and distribute its Work and any resulting derivative works in any form.
+
+ ## 3. Limitations
+
+ ### 3.1 Redistribution.
+ You may reproduce or distribute the Work only if (a) you do so under this license, (b) you include a complete copy of this license with your distribution, and (c) you retain without modification any copyright, patent, trademark, or attribution notices that are present in the Work.
+
+ ### 3.2 Derivative Works.
+ You may specify that additional or different terms apply to the use, reproduction, and distribution of your derivative works of the Work (“Your Terms”) only if (a) Your Terms provide that the use limitation in Section 3.3 applies to your derivative works, and (b) you identify the specific derivative works that are subject to Your Terms. Notwithstanding Your Terms, this license (including the redistribution requirements in Section 3.1) will continue to apply to the Work itself.
+
+ ### 3.3 Use Limitation.
+ The Work and any derivative works thereof only may be used or intended for use non-commercially. Notwithstanding the foregoing, NVIDIA Corporation and its affiliates may use the Work and any derivative works commercially. As used herein, “non-commercially” means for research or evaluation purposes only.
+
+ ### 3.4 Patent Claims.
+ If you bring or threaten to bring a patent claim against any Licensor (including any claim, cross-claim or counterclaim in a lawsuit) to enforce any patents that you allege are infringed by any Work, then your rights under this license from such Licensor (including the grant in Section 2.1) will terminate immediately.
+
+ ### 3.5 Trademarks.
+ This license does not grant any rights to use any Licensor’s or its affiliates’ names, logos, or trademarks, except as necessary to reproduce the notices described in this license.
+
+ ### 3.6 Termination.
+ If you violate any term of this license, then your rights under this license (including the grant in Section 2.1) will terminate immediately.
+
+ ## 4. Disclaimer of Warranty.
+
+ THE WORK IS PROVIDED “AS IS” WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER THIS LICENSE.
+
+ ## 5. Limitation of Liability.
+
+ EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK (INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION, LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
README.md CHANGED
@@ -1,5 +1,339 @@
- ---
- license: other
- license_name: nvidia-open-model-license
- license_link: LICENSE
- ---
+ ---
+ license: other
+ license_name: nvidia-open-model-license
+ license_link: LICENSE
+ tags:
+ - text
+ - image
+ - video
+ - audio
+ - vidore
+ - multimodal-embedding
+ - Text-to-Video retrieval
+ - Text-to-Audio retrieval
+ - Visual Document Retrieval
+ - feature-extraction
+ language:
+ - en
+ library_name: transformers
+ ---
+
+ # Omni-Embed-Nemotron-3B
+
+ ## Description
+
+ NV-QwenOmni-Embed-3B-v1 is a versatile multimodal embedding model that can encode content across multiple modalities, including text, image, audio, and video, either individually or in combination, and it supports retrieval with queries that may themselves be multimodal. It is designed to serve as a foundational component in multimodal Retrieval-Augmented Generation (RAG) systems.
+
+ The foundational Qwen Omni model ([Qwen/Qwen2.5-Omni-3B](https://huggingface.co/Qwen/Qwen2.5-Omni-3B)) is based on the Thinker-Talker architecture. We leverage only the Thinker component to encode and understand the diverse modalities; the Talker component is not included, as the model focuses on multimodal understanding rather than response generation.
+
+ This model is for research and development only.
+
+ ### License/Terms of Use
+
+ Governing Terms for the nvidia/omni-embed-nemotron-3b model: [NVIDIA OneWay Noncommercial License](https://huggingface.co/datasets/nvidia/PhysicalAI-Robotics-Manipulation-Objects/resolve/main/NVIDIA%20OneWay%20Noncommercial%20License.pdf?download=true).
+
+ ADDITIONAL INFORMATION: [Qwen RESEARCH LICENSE AGREEMENT](https://huggingface.co/Qwen/Qwen2.5-Omni-3B/blob/main/LICENSE)
+
+ This project will download and install additional third-party open source software projects. Review the license terms of these open source projects before use.
+
+ ### Team
+ - Mengyao Xu
+ - Gabriel Moreira
+ - Radek Osmulski
+ - Ronay Ak
+ - Yauhen Babakhin
+ - Bo Liu
+ - Even Oldridge
+ - Benedikt Schifferer
+
+ Correspondence to Mengyao Xu (mengyaox@nvidia.com) and Benedikt Schifferer (bschifferer@nvidia.com)
+
+ ### Citation
+
+ ```
+ @misc{moreira2025nvretrieverimprovingtextembedding,
+       title={NV-Retriever: Improving text embedding models with effective hard-negative mining},
+       author={Gabriel de Souza P. Moreira and Radek Osmulski and Mengyao Xu and Ronay Ak and Benedikt Schifferer and Even Oldridge},
+       year={2025},
+       eprint={2407.15831},
+       archivePrefix={arXiv},
+       primaryClass={cs.IR},
+       url={https://arxiv.org/abs/2407.15831},
+ }
+ ```
+
+ ### Deployment Geography
+ Global
+
+ ### Use Case
+ NV-Omni-Embed is intended for researchers and developers building retrieval-based applications that require understanding and retrieving information across multiple modalities. It is particularly useful in multimodal RAG systems, where queries and documents may include combinations of text, images, audio, and video. Potential applications include multimedia search engines, cross-modal retrieval systems, and conversational AI with rich input understanding.
+
+ ### Release Date
+ Hugging Face, 10/1/2025, via [https://huggingface.co/nvidia/omni-embed-nemotron-3b](https://huggingface.co/nvidia/omni-embed-nemotron-3b)
+
+ ## Model Architecture
+
+ - **Architecture Type:** Transformer
+ - **Network Architecture:** Qwen/Qwen2.5-Omni-3B
+
+ NV-QwenOmni-Embed-3B-v1 is a transformer-based multimodal embedding model built on top of the Thinker component from [Qwen/Qwen2.5-Omni-3B](https://huggingface.co/Qwen/Qwen2.5-Omni-3B). Unlike the original Thinker-Talker architecture, this model does not include the Talker module, as it is designed specifically for multimodal understanding and retrieval rather than response generation. The model has 4.7B parameters.
+
+ The model incorporates a vision encoder, an audio encoder, and a large language model (LLM) from the Qwen architecture to process diverse modalities. Unlike the Omni model, which interleaves audio and video tokens with TMRoPE, our retrieval encoder keeps the two streams separate: audio and video are encoded independently, preserving their full temporal structure without interleaving. Our experiments show this design improves retrieval performance.
+
+ NV-QwenOmni-Embed-3B-v1 is trained using a bi-encoder architecture in which queries and candidate inputs are embedded independently. A contrastive learning objective is employed to align relevant query-content pairs while pushing apart unrelated ones in the shared embedding space, as sketched below.
+
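+ The following is a minimal, illustrative sketch of such a contrastive objective (InfoNCE with in-batch negatives); the tensor names and temperature value are assumptions for illustration, not the exact training recipe:
+
+ ```
+ import torch
+ import torch.nn.functional as F
+
+ def info_nce_loss(query_emb, doc_emb, temperature=0.05):
+     # query_emb, doc_emb: [batch, dim], produced independently by the bi-encoder.
+     # L2-normalize so the dot product equals cosine similarity.
+     q = F.normalize(query_emb, dim=-1)
+     d = F.normalize(doc_emb, dim=-1)
+     # Similarity of every query against every document in the batch: [batch, batch].
+     logits = q @ d.T / temperature
+     # The matching document for query i sits on the diagonal (index i);
+     # all other documents in the batch act as negatives.
+     labels = torch.arange(q.size(0), device=q.device)
+     return F.cross_entropy(logits, labels)
+ ```
+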
+ ## Input
+
+ | Property | Query | Document |
+ |----------|-------|----------|
+ | **Input Type** | Text \| Image \| Audio \| Video \| Any combination | Text \| Image \| Audio \| Video \| Any combination |
+ | **Input Format** | List of strings, image tensors, audio arrays, or video clips | List of text strings, images, audio, or video clips |
+
+ | | Text | Image | Video | Audio |
+ |---|---|---|---|---|
+ | **Input Parameter** | str, list[str], or pre-tokenized list[list[str]]; encoded to token IDs; per-sample 1D; batched 2D [batch, seq_len] | PIL.Image, np.ndarray, or torch.Tensor; per-sample 3D; batched 4D | np.ndarray, torch.Tensor, or list of frames; per-sample 4D; batched 5D; or file (e.g. .mp4) | 1D waveform (np.ndarray or torch.Tensor); per-sample 1D; batched 2D [batch, num_samples]; or file |
+
+ **Other Properties**: The model's maximum context length is 32768 tokens.
+
+ ## Output
+
+ - **Output Type:** Floats
+ - **Output Format:** List of float arrays
+ - **Output Parameters:** A float tensor of shape [batch_size x 2048]
+ - **Other Properties Related to Output:** The model outputs an embedding vector of dimension 2048 for each input.
+
+ Our AI models are designed and/or optimized to run on NVIDIA GPU-accelerated systems. By leveraging NVIDIA’s hardware (e.g., GPU cores) and software frameworks (e.g., CUDA libraries), the model achieves faster training and inference times compared to CPU-only solutions.
+
+ ### Usage
+ The model requires transformers version 4.51.3 (the Qwen2.5-Omni preview build):
+ ```
+ pip install git+https://github.com/huggingface/transformers.git@v4.51.3-Qwen2.5-Omni-preview
+ ```
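+
+ The example below also imports the `qwen_omni_utils` helper package and loads the model with FlashAttention 2. If these are not already installed, something along the following lines should work (adjust to your environment):
+
+ ```
+ pip install qwen-omni-utils
+ pip install flash-attn --no-build-isolation
+ ```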
+
+ ```
+ import torch
+ import torch.nn.functional as F
+ from qwen_omni_utils import process_mm_info
+ from transformers import AutoModel, AutoProcessor
+
+ model_name_or_path = "nvidia/omni-embed-nemotron-3b"
+ model = AutoModel.from_pretrained(
+     model_name_or_path,
+     torch_dtype=torch.bfloat16,
+     attn_implementation="flash_attention_2",
+     trust_remote_code=True,
+ )
+
+ model = model.to("cuda:0")
+ model.eval()
+
+ documents = [
+     {
+         "role": "user",
+         "content": [
+             {
+                 "type": "text",
+                 "text": "passage: This is a passage to be embedded"
+             },
+             {
+                 "type": "video",
+                 "video": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/draw.mp4"
+             },
+             {
+                 "type": "audio",
+                 "audio": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/draw.mp4"
+             }
+         ]
+     },
+ ]
+
+ processor = AutoProcessor.from_pretrained(model_name_or_path, trust_remote_code=True)
+ documents_texts = processor.apply_chat_template(documents, add_generation_prompt=False, tokenize=False)
+ audio, images, videos = process_mm_info(documents, use_audio_in_video=False)
+
+ videos_kwargs = {
+     "min_pixels": 32*14*14,
+     "max_pixels": 64*28*28,
+     "use_audio_in_video": False,
+ }
+ text_kwargs = {
+     "truncation": True,
+     "padding": True,
+     "max_length": 204800,
+ }
+ batch_dict = processor(
+     text=documents_texts,
+     images=images,
+     videos=videos,
+     audio=audio,
+     return_tensors="pt",
+     text_kwargs=text_kwargs,
+     videos_kwargs=videos_kwargs,
+     audio_kwargs={"max_length": 2048000},
+ )
+
+ batch_dict = {k: v.to(model.device) for k, v in batch_dict.items()}
+ last_hidden_states = model(**batch_dict, output_hidden_states=True).hidden_states[-1]
+ # Average pooling over the sequence dimension, ignoring padding tokens
+ attention_mask = batch_dict["attention_mask"]
+ last_hidden_states_masked = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
+ embedding = last_hidden_states_masked.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
+ embedding = F.normalize(embedding, dim=-1)
+ print(embedding)
+ print(embedding.shape)
+ ```
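+
+ Queries are embedded the same way. Below is a minimal, illustrative continuation of the snippet above: the `"query: "` prefix mirrors the `"passage: "` prefix used for documents and is an assumption based on that convention, and `embedding` is the document embedding computed above.
+
+ ```
+ queries = [
+     {
+         "role": "user",
+         "content": [
+             {"type": "text", "text": "query: What is shown in the video?"}
+         ]
+     },
+ ]
+ query_texts = processor.apply_chat_template(queries, add_generation_prompt=False, tokenize=False)
+ query_batch = processor(text=query_texts, return_tensors="pt", text_kwargs=text_kwargs)
+ query_batch = {k: v.to(model.device) for k, v in query_batch.items()}
+
+ query_hidden = model(**query_batch, output_hidden_states=True).hidden_states[-1]
+ # Same mask-aware average pooling and normalization as for documents
+ query_mask = query_batch["attention_mask"]
+ query_hidden = query_hidden.masked_fill(~query_mask[..., None].bool(), 0.0)
+ query_embedding = query_hidden.sum(dim=1) / query_mask.sum(dim=1)[..., None]
+ query_embedding = F.normalize(query_embedding, dim=-1)
+
+ # Cosine similarity between each query and each document embedding
+ scores = query_embedding @ embedding.T
+ print(scores)
+ ```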
+
+ ## Software Integration
+
+ - **Runtime Engine(s):** TensorRT, Triton
+ - **Supported Hardware Microarchitecture Compatibility:** A100 40GB, A100 80GB, H100 80GB
+ - **Supported Operating System(s):** Linux
+
+ The integration of foundation and fine-tuned models into AI systems requires additional testing using use-case-specific data to ensure safe and effective deployment. Following the V-model methodology, iterative testing and validation at both unit and system levels are essential to mitigate risks, meet technical and functional requirements, and ensure compliance with safety and ethical standards before deployment.
+
+
+ ## Model Version(s)
+ - **Nvidia Omni Embed Nemotron 3B**
+ - **Short name:** omni-embed-nemotron-3b-v1
+
+ # Training and Evaluation Datasets
+
+ ## Training Dataset
+
+ **Data Modality**: Image, Text
+
+ **Image Training Data Size**: 1 Million to 1 Billion Images
+
+ **Text Training Data Size**: Less than a Billion Tokens
+
+ The model was trained on publicly available datasets, including [HotpotQA](https://huggingface.co/datasets/hotpotqa/hotpot_qa), [MIRACL](https://huggingface.co/datasets/SEACrowd/miracl), [Natural Questions (NQ)](https://huggingface.co/datasets/irds/natural-questions), [Stack Exchange](https://huggingface.co/datasets/HuggingFaceH4/stack-exchange-preferences), [SQuAD](https://huggingface.co/datasets/squad), [Tiger Math/Stack](https://huggingface.co/datasets/TIGER-Lab/WebInstructSub), [DocMatix-IR](https://huggingface.co/datasets/Tevatron/docmatix-ir), [Vidore-ColPali-Training](https://huggingface.co/datasets/vidore/colpali_train_set), and [Wiki-SS-NQ](https://huggingface.co/datasets/Tevatron/wiki-ss-nq).
+
+ - **Data Collection Method by dataset:** Hybrid: Automated, Human, Synthetic
+ - **Labeling Method by dataset:** Hybrid: Automated, Human, Synthetic
+ - **Properties:** 1M samples from public datasets.
+
+ ## Evaluation Dataset
+ We evaluate our model on multiple benchmarks covering different modalities. For text retrieval, we select a subset of text retrieval datasets from [MTEB](https://huggingface.co/spaces/mteb/leaderboard). For image retrieval, evaluation is conducted on the public ViDoRe V1 dataset. Since no established video retrieval benchmarks exist, we construct two custom evaluation sets based on the LPM dataset and FineVideo. To provide a fair comparison with state-of-the-art text-only baselines, we use the speech-to-text transcripts released with FineVideo and the transcripts from LPM as the input corpus for the standard text retrieval models.
+
+ - **Data Collection Method by dataset:** Hybrid: Automated, Human, Synthetic
+ - **Labeling Method by dataset:** Hybrid: Automated, Human, Synthetic
+ - **Properties:** More details on ViDoRe V1 can be found on its leaderboard: [Visual Document Retrieval Benchmark](https://huggingface.co/vidore).
+
+ ### Model performance comparison on video retrieval datasets (LPM and FineVideo) using NDCG@10 and NDCG@5
+
+ | Model | NDCG@10 LPM | NDCG@10 FineVideo | NDCG@10 Avg | NDCG@5 LPM | NDCG@5 FineVideo | NDCG@5 Avg |
+ |---|---|---|---|---|---|---|
+ | Qwen/Qwen3-Embedding-4B | 0.8634 | 0.5405 | 0.7020 | 0.8518 | 0.5264 | 0.6891 |
+ | intfloat/multilingual-e5-large-instruct | 0.7952 | 0.4456 | 0.6204 | 0.7759 | 0.4300 | 0.6030 |
+ | stella\_en\_1.5B\_v5 | 0.8522 | 0.5359 | 0.6941 | 0.8404 | 0.5206 | 0.6805 |
+ | nvidia/omni-embed-nemotron-3b | 0.8465 | 0.5662 | **0.7064** | 0.8355 | 0.5486 | **0.6921** |
+
+ The following tables report multimodal retrieval performance across input modalities on LPM and FineVideo using NDCG@10. Baselines support text only; the multimodal settings apply only to the Omni model.
+
+ ### LPM performance (NDCG@10) by modality
+
+ | Model | Text (Transcript+OCR) | Audio-Only | Video-Only | Audio+Video Fusion | Audio+Video Separately |
+ |---|---|---|---|---|---|
+ | Qwen/Qwen3-Embedding-4B | 0.8634 | N/A | N/A | N/A | N/A |
+ | intfloat/multilingual-e5-large-instruct | 0.7952 | N/A | N/A | N/A | N/A |
+ | stella\_en\_1.5B\_v5 | 0.8522 | N/A | N/A | N/A | N/A |
+ | nvidia/omni-embed-nemotron-3b | 0.8636 | 0.8238 | 0.7365 | 0.8373 | 0.8465 |
+
+ ### FineVideo performance (NDCG@10) by modality
+
+ | Model | Text (Transcript) | Audio-Only | Video-Only | Audio+Video Fusion | Audio+Video Separately |
+ |---|---|---|---|---|---|
+ | Qwen/Qwen3-Embedding-4B | 0.5405 | N/A | N/A | N/A | N/A |
+ | intfloat/multilingual-e5-large-instruct | 0.4456 | N/A | N/A | N/A | N/A |
+ | stella\_en\_1.5B\_v5 | 0.5359 | N/A | N/A | N/A | N/A |
+ | nvidia/omni-embed-nemotron-3b | 0.6082 | 0.5407 | 0.4488 | 0.4700 | 0.5662 |
+
+ ### Evaluation of embedding models across text retrieval benchmarks (nDCG@10)
+ | Model | Avg. | NQ | FiQA-2018 | SciFact | SCIDOCS | ArguAna | NFCorpus | Quora | LegalBench-CorpLobby | CQAdupGaming | CQAdupUnix |
+ |---|---|---|---|---|---|---|---|---|---|---|---|
+ | Qwen/Qwen3-Embedding-4B | 0.6654 | 0.6313 | 0.6122 | 0.7833 | 0.3144 | 0.7564 | 0.4110 | 0.8806 | 0.9542 | 0.7151 | 0.5960 |
+ | intfloat/multilingual-e5-large-instruct | 0.5900 | 0.6350 | 0.4865 | 0.7162 | 0.1924 | 0.5848 | 0.3634 | 0.8926 | 0.9425 | 0.6396 | 0.4473 |
+ | stella\_en\_1.5B\_v5 | 0.6050 | 0.7180 | 0.5996 | 0.8009 | 0.2677 | 0.5706 | 0.4200 | 0.9003 | 0.9468 | 0.5359 | 0.2903 |
+ | nvidia/omni-embed-nemotron-3b | 0.6059 | 0.6808 | 0.5382 | 0.7405 | 0.2163 | 0.5891 | 0.3644 | 0.8347 | 0.9413 | 0.6432 | 0.5102 |
+
+ ### Evaluation of baseline models and our models on [ViDoRe V1](https://huggingface.co/spaces/vidore/vidore-leaderboard) (as of September 30th), using nDCG@5
+
+ | Model | Size (M) | Avg. | ArxivQA | DocVQA | InfoVQA | Shift Project | AI | Energy | Gov. Reports | Healthcare | TabFQuad | TAT-DQA |
+ |---|---|---|---|---|---|---|---|---|---|---|---|---|
+ | nvidia/llama-nemoretriever-colembed-1b-v1 | 2418 | 90.5 | 87.6 | 64.5 | 93.6 | 92.3 | 100 | 96.6 | 96.7 | 99.6 | 94.3 | 79.8 |
+ | nvidia/llama-nemoretriever-colembed-3b-v1 | 4407 | 91.0 | 88.4 | 66.2 | 94.9 | 90.7 | 99.6 | 96.6 | 97.8 | 99.3 | 95.9 | 80.6 |
+ | nomic-ai/colnomic-embed-multimodal-3b | 3000 | 89.9 | 88.2 | 61.3 | 92.8 | 90.2 | 96.3 | 97.3 | 96.6 | 98.3 | 94.5 | 83.1 |
+ | vidore/colqwen2.5-v0.2 | 3000 | 89.6 | 89.1 | 63.5 | 92.6 | 88.0 | 99.6 | 95.8 | 96.6 | 98.0 | 90.8 | 82.1 |
+ | vidore/colqwen2-v1.0 | 2210 | 89.2 | 88.0 | 61.5 | 92.5 | 89.9 | 99.0 | 95.9 | 95.5 | 98.8 | 89.0 | 82.2 |
+ | vidore/colpali-v1.3 | 2920 | 84.7 | 83.7 | 58.7 | 85.7 | 76.5 | 96.6 | 94.6 | 95.9 | 97.4 | 86.7 | 70.7 |
+ | vidore/colpali-v1.2 | 2920 | 83.4 | 77.9 | 56.5 | 82.4 | 78.3 | 97.5 | 94.4 | 94.9 | 95.4 | 88.4 | 68.1 |
+ | nvidia/omni-embed-nemotron-3b | 4703 | 85.7 | 85.3 | 59.2 | 89.2 | 78.6 | 98.1 | 93.5 | 95.4 | 95.8 | 91.0 | 69.7 |
+
+ ## Inference
+ - **Acceleration Engine:** Not Applicable
+ - **Test Hardware:** A100 40GB, A100 80GB, H100 80GB
+
+ ## Ethical Considerations
+ NVIDIA believes Trustworthy AI is a shared responsibility, and we have established policies and practices to enable development for a wide array of AI applications. When downloaded or used in accordance with our terms of service, developers should work with their internal model team to ensure this model meets requirements for the relevant industry and use case and addresses unforeseen product misuse.
+
+ Please report model quality, risk, security vulnerabilities or NVIDIA AI Concerns [here](https://app.intigriti.com/programs/nvidia/nvidiavdp/detail).
+
+ ## Bias
+
+ | Field | Response |
+ | ----- | ----- |
+ | Participation considerations from adversely impacted groups ([protected classes](https://www.senate.ca.gov/content/protected-classes)) in model design and testing | None |
+ | Measures taken to mitigate against unwanted bias | None |
+
+ ## Explainability
+
+ | Field | Response |
+ | ----- | ----- |
+ | Intended Application & Domain: | Multi-modality corpus and query embedding for question-and-answer retrieval. |
+ | Model Type: | Transformer encoder. |
+ | Intended User: | Creators of generative AI focused on conversational models, as well as users aiming to develop question-and-answer applications, can benefit from leveraging dense retrieval technologies. These applications can efficiently handle large, multi-modal corpora, including images, text, videos, and audio. |
+ | Output: | Array of float numbers (dense vector for the input content, which may include multi-modal corpora). |
+ | Describe how the model works: | The model transforms the input into a dense vector representation. |
+ | Performance Metrics: | Accuracy |
+ | Potential Known Risks: | The model is not guaranteed to always retrieve the correct corpus for a given query. |
+ | Licensing & Terms of Use: | **Governing Terms:**<br>Your use of the software container and model is governed by the [NVIDIA Software and Model Evaluation License](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-software-and-model-evaluation-license/)<br><br>**Additional Information:**<br>[Qwen RESEARCH LICENSE AGREEMENT](https://huggingface.co/Qwen/Qwen2.5-Omni-3B/blob/main/LICENSE) |
+ | Technical Limitations: | The model's max sequence length is 32768 tokens. Longer sequence inputs should be truncated. |
+ | Name the adversely impacted groups this has been tested to deliver comparable outcomes regardless of: | N/A |
+ | Verified to have met prescribed NVIDIA quality standards: | Yes |
+
+ ## Privacy
+
+ | Field | Response |
+ | ----- | ----- |
+ | Generatable or reverse engineerable personal data? | None |
+ | Personal data used to create this model? | None |
+ | How often is dataset reviewed? | The dataset is initially reviewed upon addition, and subsequent reviews are conducted as needed or upon request for changes. |
+ | Is there provenance for all datasets used in training? | Yes |
+ | Does data labeling (annotation, metadata) comply with privacy laws? | Yes |
+ | Is data compliant with data subject requests for data correction or removal, if such a request was made? | No, not possible with externally-sourced data. |
+ | Applicable Privacy Policy | https://www.nvidia.com/en-us/about-nvidia/privacy-policy/ |
+
+ ## Safety And Security
+
+ | Field | Response |
+ | ----- | ----- |
+ | Model Application(s): | Multi-modal corpus embedding for retrieval. The model processes input from various modalities (text, image, audio, and video), either independently or in combination. |
+ | Use Case Restrictions: | Governing Terms: Your use of the model is governed by the [NVIDIA Open Model License Agreement](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license/). Additional Information: [Qwen RESEARCH LICENSE AGREEMENT](https://huggingface.co/Qwen/Qwen2.5-Omni-3B/blob/main/LICENSE). |
+ | Model and dataset restrictions: | The Principle of Least Privilege (PoLP) is applied, limiting access for dataset generation and model development. Restrictions are enforced on dataset access during training, and dataset license constraints are adhered to. |
+ | Describe the life critical impact (if present) | Not applicable |
added_tokens.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "</tool_call>": 151658,
+   "<tool_call>": 151657,
+   "<|AUDIO|>": 151646,
+   "<|IMAGE|>": 151655,
+   "<|VIDEO|>": 151656,
+   "<|audio_bos|>": 151647,
+   "<|audio_eos|>": 151648,
+   "<|box_end|>": 151649,
+   "<|endoftext|>": 151643,
+   "<|file_sep|>": 151664,
+   "<|fim_middle|>": 151660,
+   "<|fim_pad|>": 151662,
+   "<|fim_prefix|>": 151659,
+   "<|fim_suffix|>": 151661,
+   "<|im_end|>": 151645,
+   "<|im_start|>": 151644,
+   "<|quad_end|>": 151651,
+   "<|quad_start|>": 151650,
+   "<|repo_name|>": 151663,
+   "<|vision_bos|>": 151652,
+   "<|vision_eos|>": 151653,
+   "<|vision_pad|>": 151654
+ }
chat_template.jinja ADDED
@@ -0,0 +1,7 @@
+ {% set audio_count = namespace(value=0) %}{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
+ You are a helpful assistant.<|im_end|>
+ {% endif %}<|im_start|>{{ message['role'] }}
+ {% if message['content'] is string %}{{ message['content'] }}<|im_end|>
+ {% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_bos|><|IMAGE|><|vision_eos|>{% elif content['type'] == 'audio' or 'audio' in content or 'audio_url' in content %}{% set audio_count.value = audio_count.value + 1 %}{% if add_audio_id %}Audio {{ audio_count.value }}: {% endif %}<|audio_bos|><|AUDIO|><|audio_eos|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_bos|><|VIDEO|><|vision_eos|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
+ {% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
+ {% endif %}
config.json ADDED
@@ -0,0 +1,112 @@
+ {
+   "architectures": [
+     "NVOmniEmbedModel"
+   ],
+   "audio_config": {
+     "_attn_implementation_autoset": true,
+     "activation_dropout": 0.0,
+     "activation_function": "gelu",
+     "attention_dropout": 0.0,
+     "d_model": 1280,
+     "dropout": 0.0,
+     "encoder_attention_heads": 20,
+     "encoder_ffn_dim": 5120,
+     "encoder_layerdrop": 0.0,
+     "encoder_layers": 32,
+     "init_std": 0.02,
+     "initializer_range": 0.02,
+     "max_source_positions": 1500,
+     "model_type": "qwen2_5_omni_audio_encoder",
+     "n_window": 100,
+     "num_hidden_layers": 32,
+     "num_mel_bins": 128,
+     "output_dim": 2048,
+     "scale_embedding": false,
+     "torch_dtype": "bfloat16"
+   },
+   "audio_end_token_id": 151648,
+   "audio_max_length": 2048000,
+   "audio_start_token_id": 151647,
+   "audio_token_index": 151646,
+   "auto_map": {
+     "AutoModel": "modeling_nv_omni_embed.NVOmniEmbedModel",
+     "AutoConfig": "modeling_nv_omni_embed.NVOmniEmbedConfig"
+   },
+   "bos_token_id": 151644,
+   "eos_token_id": 151645,
+   "ignore_index": -100,
+   "image_token_index": 151655,
+   "init_std": 0.02,
+   "initializer_range": 0.02,
+   "model_type": "nvomniembed",
+   "pad_token_id": 151643,
+   "position_id_per_seconds": 25,
+   "resized_height": 680,
+   "resized_width": 680,
+   "seconds_per_chunk": 2,
+   "text_config": {
+     "attention_dropout": 0.0,
+     "hidden_act": "silu",
+     "hidden_size": 2048,
+     "init_std": 0.02,
+     "initializer_range": 0.02,
+     "intermediate_size": 11008,
+     "max_position_embeddings": 32768,
+     "max_window_layers": 70,
+     "model_type": "qwen2_5_omni_text",
+     "num_attention_heads": 16,
+     "num_hidden_layers": 36,
+     "num_key_value_heads": 2,
+     "rms_norm_eps": 1e-06,
+     "rope_scaling": {
+       "mrope_section": [
+         16,
+         24,
+         24
+       ],
+       "rope_type": "default",
+       "type": "default"
+     },
+     "rope_theta": 1000000.0,
+     "sliding_window": 32768,
+     "torch_dtype": "bfloat16",
+     "use_cache": true,
+     "use_sliding_window": false,
+     "vocab_size": 151936
+   },
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.52.0.dev0",
+   "user_token_id": 872,
+   "video_token_index": 151656,
+   "vision_config": {
+     "_attn_implementation_autoset": true,
+     "depth": 32,
+     "embed_dim": 1280,
+     "fullatt_block_indexes": [
+       7,
+       15,
+       23,
+       31
+     ],
+     "hidden_act": "silu",
+     "hidden_size": 1280,
+     "in_channels": 3,
+     "in_chans": 3,
+     "init_std": 0.02,
+     "initializer_range": 0.02,
+     "intermediate_size": 3420,
+     "model_type": "qwen2_5_omni_vision_encoder",
+     "num_heads": 16,
+     "out_hidden_size": 2048,
+     "patch_size": 14,
+     "spatial_merge_size": 2,
+     "spatial_patch_size": 14,
+     "temporal_patch_size": 2,
+     "tokens_per_second": 25,
+     "torch_dtype": "bfloat16",
+     "window_size": 112
+   },
+   "vision_end_token_id": 151653,
+   "vision_start_token_id": 151652,
+   "vision_token_id": 151654
+ }
generation_config.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "_from_model_config": true,
+   "transformers_version": "4.52.0.dev0"
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ce5d2ff1be32177ae0bc173931f0335a21c5298837257ea1f6d853b222bdf48c
+ size 4994841696
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dd03af73e3f2843978de3c867fd515e900fa0e9fd331d3f92529380f4e27a27c
+ size 4412249056
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
modeling_nv_omni_embed.py ADDED
@@ -0,0 +1,50 @@
+ import torch
+ from transformers import Qwen2_5OmniThinkerTextModel, Qwen2_5OmniThinkerForConditionalGeneration
+ from transformers.cache_utils import Cache
+ from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
+ from transformers.models.qwen2_5_omni.configuration_qwen2_5_omni import Qwen2_5OmniThinkerConfig
+
+ class BidirectQwen2_5OmniThinkerTextModel(Qwen2_5OmniThinkerTextModel):
+
+     def __init__(self, config):
+         super().__init__(config)
+         for layer in self.layers:
+             layer.self_attn.is_causal = False  # disable causal masking in every attention layer
+
+     # override the _update_causal_mask method to generate bi-directional attention
+     def _update_causal_mask(
+         self,
+         attention_mask: torch.Tensor,
+         input_tensor: torch.Tensor,
+         cache_position: torch.Tensor,
+         past_key_values: Cache,
+         output_attentions: bool = False,
+     ):
+         calculated_attention_mask = super()._update_causal_mask(
+             attention_mask,
+             input_tensor,
+             cache_position,
+             past_key_values,
+             output_attentions)
+         if calculated_attention_mask is None:
+             return None
+         if self.config._attn_implementation == "flash_attention_2":
+             if attention_mask is not None and 0.0 in attention_mask:
+                 return attention_mask
+         causal_mask = _prepare_4d_attention_mask(  # plain padding mask, no causal component
+             attention_mask,
+             dtype=input_tensor.dtype,
+         )
+         return causal_mask
+
+ class NVOmniEmbedConfig(Qwen2_5OmniThinkerConfig):
+     model_type = "nvomniembed"
+
+ class NVOmniEmbedModel(Qwen2_5OmniThinkerForConditionalGeneration):
+     config_class = NVOmniEmbedConfig
+     def __init__(self, config):
+         super().__init__(config)
+         self.model = BidirectQwen2_5OmniThinkerTextModel._from_config(  # swap in the bidirectional Thinker text model
+             config.text_config, attn_implementation=config._attn_implementation
+         )
+
preprocessor_config.json ADDED
@@ -0,0 +1,31 @@
+ {
+   "chunk_length": 300,
+   "dither": 0.0,
+   "feature_extractor_type": "WhisperFeatureExtractor",
+   "feature_size": 128,
+   "hop_length": 160,
+   "image_mean": [
+     0.48145466,
+     0.4578275,
+     0.40821073
+   ],
+   "image_processor_type": "Qwen2VLImageProcessor",
+   "image_std": [
+     0.26862954,
+     0.26130258,
+     0.27577711
+   ],
+   "max_pixels": 12845056,
+   "merge_size": 2,
+   "min_pixels": 3136,
+   "n_fft": 400,
+   "n_samples": 4800000,
+   "nb_max_frames": 30000,
+   "padding_side": "right",
+   "padding_value": 0.0,
+   "patch_size": 14,
+   "processor_class": "Qwen2_5OmniProcessor",
+   "return_attention_mask": true,
+   "sampling_rate": 16000,
+   "temporal_patch_size": 2
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,38 @@
+ {
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>",
+     "<|AUDIO|>",
+     "<|audio_bos|>",
+     "<|audio_eos|>",
+     "<|box_end|>",
+     "<|quad_start|>",
+     "<|quad_end|>",
+     "<|vision_bos|>",
+     "<|vision_eos|>",
+     "<|vision_pad|>",
+     "<|IMAGE|>",
+     "<|VIDEO|>"
+   ],
+   "audio_bos_token": "<|audio_bos|>",
+   "audio_eos_token": "<|audio_eos|>",
+   "audio_token": "<|AUDIO|>",
+   "eos_token": {
+     "content": "<|im_end|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "image_token": "<|IMAGE|>",
+   "pad_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "video_token": "<|VIDEO|>",
+   "vision_bos_token": "<|vision_bos|>",
+   "vision_eos_token": "<|vision_eos|>"
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e37ccc49ca787578f0a528ff59a0e1dc7605031c1e0481c32b37fcd1eb03f5e2
+ size 11422135
tokenizer_config.json ADDED
@@ -0,0 +1,229 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "151643": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151644": {
+       "content": "<|im_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151645": {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151646": {
+       "content": "<|AUDIO|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151647": {
+       "content": "<|audio_bos|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151648": {
+       "content": "<|audio_eos|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151649": {
+       "content": "<|box_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151650": {
+       "content": "<|quad_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151651": {
+       "content": "<|quad_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151652": {
+       "content": "<|vision_bos|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151653": {
+       "content": "<|vision_eos|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151654": {
+       "content": "<|vision_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151655": {
+       "content": "<|IMAGE|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151656": {
+       "content": "<|VIDEO|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151657": {
+       "content": "<tool_call>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151658": {
+       "content": "</tool_call>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151659": {
+       "content": "<|fim_prefix|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151660": {
+       "content": "<|fim_middle|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151661": {
+       "content": "<|fim_suffix|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151662": {
+       "content": "<|fim_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151663": {
+       "content": "<|repo_name|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151664": {
+       "content": "<|file_sep|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     }
+   },
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>",
+     "<|AUDIO|>",
+     "<|audio_bos|>",
+     "<|audio_eos|>",
+     "<|box_end|>",
+     "<|quad_start|>",
+     "<|quad_end|>",
+     "<|vision_bos|>",
+     "<|vision_eos|>",
+     "<|vision_pad|>",
+     "<|IMAGE|>",
+     "<|VIDEO|>"
+   ],
+   "audio_bos_token": "<|audio_bos|>",
+   "audio_eos_token": "<|audio_eos|>",
+   "audio_token": "<|AUDIO|>",
+   "bos_token": null,
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|im_end|>",
+   "errors": "replace",
+   "extra_special_tokens": {
+     "audio_bos_token": "<|audio_bos|>",
+     "audio_eos_token": "<|audio_eos|>",
+     "audio_token": "<|AUDIO|>",
+     "image_token": "<|IMAGE|>",
+     "video_token": "<|VIDEO|>",
+     "vision_bos_token": "<|vision_bos|>",
+     "vision_eos_token": "<|vision_eos|>"
+   },
+   "image_token": "<|IMAGE|>",
+   "max_length": 900,
+   "model_max_length": 32768,
+   "pad_to_multiple_of": null,
+   "pad_token": "<|endoftext|>",
+   "pad_token_type_id": 0,
+   "padding_side": "left",
+   "processor_class": "Qwen2_5OmniProcessor",
+   "split_special_tokens": false,
+   "stride": 0,
+   "tokenizer_class": "Qwen2Tokenizer",
+   "truncation_side": "right",
+   "truncation_strategy": "longest_first",
+   "unk_token": null,
+   "video_token": "<|VIDEO|>",
+   "vision_bos_token": "<|vision_bos|>",
+   "vision_eos_token": "<|vision_eos|>"
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff