littlebird13 committed on
Commit
9b6a3cd
·
verified ·
1 Parent(s): 58c2a16

Add files using upload-large-folder tool

Browse files
Files changed (48) hide show
  1. LICENSE +51 -0
  2. config.json +32 -0
  3. configuration.json +1 -0
  4. configuration_qwen2_rm.py +140 -0
  5. generation_config.json +14 -0
  6. merges.txt +0 -0
  7. model-00001-of-00037.safetensors +3 -0
  8. model-00002-of-00037.safetensors +3 -0
  9. model-00003-of-00037.safetensors +3 -0
  10. model-00004-of-00037.safetensors +3 -0
  11. model-00005-of-00037.safetensors +3 -0
  12. model-00006-of-00037.safetensors +3 -0
  13. model-00007-of-00037.safetensors +3 -0
  14. model-00008-of-00037.safetensors +3 -0
  15. model-00009-of-00037.safetensors +3 -0
  16. model-00010-of-00037.safetensors +3 -0
  17. model-00011-of-00037.safetensors +3 -0
  18. model-00012-of-00037.safetensors +3 -0
  19. model-00013-of-00037.safetensors +3 -0
  20. model-00014-of-00037.safetensors +3 -0
  21. model-00015-of-00037.safetensors +3 -0
  22. model-00016-of-00037.safetensors +3 -0
  23. model-00017-of-00037.safetensors +3 -0
  24. model-00018-of-00037.safetensors +3 -0
  25. model-00019-of-00037.safetensors +3 -0
  26. model-00020-of-00037.safetensors +3 -0
  27. model-00021-of-00037.safetensors +3 -0
  28. model-00022-of-00037.safetensors +3 -0
  29. model-00023-of-00037.safetensors +3 -0
  30. model-00024-of-00037.safetensors +3 -0
  31. model-00025-of-00037.safetensors +3 -0
  32. model-00026-of-00037.safetensors +3 -0
  33. model-00027-of-00037.safetensors +3 -0
  34. model-00028-of-00037.safetensors +3 -0
  35. model-00029-of-00037.safetensors +3 -0
  36. model-00030-of-00037.safetensors +3 -0
  37. model-00031-of-00037.safetensors +3 -0
  38. model-00032-of-00037.safetensors +3 -0
  39. model-00033-of-00037.safetensors +3 -0
  40. model-00034-of-00037.safetensors +3 -0
  41. model-00035-of-00037.safetensors +3 -0
  42. model-00036-of-00037.safetensors +3 -0
  43. model-00037-of-00037.safetensors +3 -0
  44. model.safetensors.index.json +974 -0
  45. modeling_qwen2_rm.py +1549 -0
  46. tokenizer.json +0 -0
  47. tokenizer_config.json +40 -0
  48. vocab.json +0 -0
LICENSE ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Qwen LICENSE AGREEMENT
2
+
3
+ Qwen LICENSE AGREEMENT Release Date: September 19, 2024
4
+
5
+ By clicking to agree or by using or distributing any portion or element of the Qwen Materials, you will be deemed to have recognized and accepted the content of this Agreement, which is effective immediately.
6
+
7
+ 1. Definitions
8
+ a. This Qwen LICENSE AGREEMENT (this "Agreement") shall mean the terms and conditions for use, reproduction, distribution and modification of the Materials as defined by this Agreement.
9
+ b. "We" (or "Us") shall mean Alibaba Cloud.
10
+ c. "You" (or "Your") shall mean a natural person or legal entity exercising the rights granted by this Agreement and/or using the Materials for any purpose and in any field of use.
11
+ d. "Third Parties" shall mean individuals or legal entities that are not under common control with us or you.
12
+ e. "Qwen" shall mean the large language models, and software and algorithms, consisting of trained model weights, parameters (including optimizer states), machine-learning model code, inference-enabling code, training-enabling code, fine-tuning enabling code and other elements of the foregoing distributed by us.
13
+ f. "Materials" shall mean, collectively, Alibaba Cloud's proprietary Qwen and Documentation (and any portion thereof) made available under this Agreement.
14
+ g. "Source" form shall mean the preferred form for making modifications, including but not limited to model source code, documentation source, and configuration files.
15
+ h. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.
16
+
17
+ 2. Grant of Rights
18
+ You are granted a non-exclusive, worldwide, non-transferable and royalty-free limited license under Alibaba Cloud's intellectual property or other rights owned by us embodied in the Materials to use, reproduce, distribute, copy, create derivative works of, and make modifications to the Materials.
19
+
20
+ 3. Redistribution
21
+ You may distribute copies or make the Materials, or derivative works thereof, available as part of a product or service that contains any of them, with or without modifications, and in Source or Object form, provided that you meet the following conditions:
22
+ a. You shall give any other recipients of the Materials or derivative works a copy of this Agreement;
23
+ b. You shall cause any modified files to carry prominent notices stating that you changed the files;
24
+ c. You shall retain in all copies of the Materials that you distribute the following attribution notices within a "Notice" text file distributed as a part of such copies: "Qwen is licensed under the Qwen LICENSE AGREEMENT, Copyright (c) Alibaba Cloud. All Rights Reserved."; and
25
+ d. You may add your own copyright statement to your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of your modifications, or for any such derivative works as a whole, provided your use, reproduction, and distribution of the work otherwise complies with the terms and conditions of this Agreement.
26
+
27
+ 4. Restrictions
28
+ If you are commercially using the Materials, and your product or service has more than 100 million monthly active users, you shall request a license from us. You cannot exercise your rights under this Agreement without our express authorization.
29
+
30
+ 5. Rules of use
31
+ a. The Materials may be subject to export controls or restrictions in China, the United States or other countries or regions. You shall comply with applicable laws and regulations in your use of the Materials.
32
+ b. If you use the Materials or any outputs or results therefrom to create, train, fine-tune, or improve an AI model that is distributed or made available, you shall prominently display “Built with Qwen” or “Improved using Qwen” in the related product documentation.
33
+
34
+ 6. Intellectual Property
35
+ a. We retain ownership of all intellectual property rights in and to the Materials and derivatives made by or for us. Conditioned upon compliance with the terms and conditions of this Agreement, with respect to any derivative works and modifications of the Materials that are made by you, you are and will be the owner of such derivative works and modifications.
36
+ b. No trademark license is granted to use the trade names, trademarks, service marks, or product names of us, except as required to fulfill notice requirements under this Agreement or as required for reasonable and customary use in describing and redistributing the Materials.
37
+ c. If you commence a lawsuit or other proceedings (including a cross-claim or counterclaim in a lawsuit) against us or any entity alleging that the Materials or any output therefrom, or any part of the foregoing, infringe any intellectual property or other right owned or licensable by you, then all licenses granted to you under this Agreement shall terminate as of the date such lawsuit or other proceeding is commenced or brought.
38
+
39
+ 7. Disclaimer of Warranty and Limitation of Liability
40
+ a. We are not obligated to support, update, provide training for, or develop any further version of the Qwen Materials or to grant any license thereto.
41
+ b. THE MATERIALS ARE PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. WE MAKE NO WARRANTY AND ASSUME NO RESPONSIBILITY FOR THE SAFETY OR STABILITY OF THE MATERIALS AND ANY OUTPUT THEREFROM.
42
+ c. IN NO EVENT SHALL WE BE LIABLE TO YOU FOR ANY DAMAGES, INCLUDING, BUT NOT LIMITED TO ANY DIRECT, OR INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING FROM YOUR USE OR INABILITY TO USE THE MATERIALS OR ANY OUTPUT OF IT, NO MATTER HOW IT’S CAUSED.
43
+ d. You will defend, indemnify and hold harmless us from and against any claim by any third party arising out of or related to your use or distribution of the Materials.
44
+
45
+ 8. Survival and Termination.
46
+ a. The term of this Agreement shall commence upon your acceptance of this Agreement or access to the Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein.
47
+ b. We may terminate this Agreement if you breach any of the terms or conditions of this Agreement. Upon termination of this Agreement, you must delete and cease use of the Materials. Sections 7 and 9 shall survive the termination of this Agreement.
48
+
49
+ 9. Governing Law and Jurisdiction.
50
+ a. This Agreement and any dispute arising out of or relating to it will be governed by the laws of China, without regard to conflict of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement.
51
+ b. The People's Courts in Hangzhou City shall have exclusive jurisdiction over any dispute arising out of this Agreement.
config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen2ForRewardModel"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "auto_map": {
7
+ "AutoConfig": "configuration_qwen2_rm.Qwen2RMConfig",
8
+ "AutoModel": "modeling_qwen2_rm.Qwen2ForRewardModel"
9
+ },
10
+ "bos_token_id": 151643,
11
+ "eos_token_id": 151645,
12
+ "hidden_act": "silu",
13
+ "hidden_size": 8192,
14
+ "initializer_range": 0.02,
15
+ "intermediate_size": 29568,
16
+ "max_position_embeddings": 2048,
17
+ "max_window_layers": 70,
18
+ "model_type": "qwen2",
19
+ "num_attention_heads": 64,
20
+ "num_hidden_layers": 80,
21
+ "num_key_value_heads": 8,
22
+ "rms_norm_eps": 1e-06,
23
+ "rope_theta": 1000000.0,
24
+ "sliding_window": 131072,
25
+ "tie_word_embeddings": false,
26
+ "torch_dtype": "bfloat16",
27
+ "transformers_version": "4.37.0",
28
+ "use_cache": true,
29
+ "use_mrope": false,
30
+ "use_sliding_window": false,
31
+ "vocab_size": 152064
32
+ }
configuration.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"framework":"Pytorch","task":"text-generation"}
configuration_qwen2_rm.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Qwen2 model configuration"""
16
+
17
+ from transformers.configuration_utils import PretrainedConfig
18
+ from transformers.utils import logging
19
+
20
+
21
+ logger = logging.get_logger(__name__)
22
+
23
+
24
+ class Qwen2RMConfig(PretrainedConfig):
25
+ r"""
26
+ This is the configuration class to store the configuration of a [`Qwen2Model`]. It is used to instantiate a
27
+ Qwen2 model according to the specified arguments, defining the model architecture. Instantiating a configuration
28
+ with the defaults will yield a similar configuration to that of
29
+ Qwen2-7B-beta [Qwen/Qwen2-7B-beta](https://huggingface.co/Qwen/Qwen2-7B-beta).
30
+
31
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
32
+ documentation from [`PretrainedConfig`] for more information.
33
+
34
+
35
+ Args:
36
+ vocab_size (`int`, *optional*, defaults to 151936):
37
+ Vocabulary size of the Qwen2 model. Defines the number of different tokens that can be represented by the
38
+ `input_ids` passed when calling [`Qwen2Model`]
39
+ hidden_size (`int`, *optional*, defaults to 4096):
40
+ Dimension of the hidden representations.
41
+ intermediate_size (`int`, *optional*, defaults to 22016):
42
+ Dimension of the MLP representations.
43
+ num_hidden_layers (`int`, *optional*, defaults to 32):
44
+ Number of hidden layers in the Transformer encoder.
45
+ num_attention_heads (`int`, *optional*, defaults to 32):
46
+ Number of attention heads for each attention layer in the Transformer encoder.
47
+ num_key_value_heads (`int`, *optional*, defaults to 32):
48
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
49
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
50
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
51
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
52
+ by mean-pooling all the original heads within that group. For more details, check out [this
53
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
54
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
55
+ The non-linear activation function (function or string) in the decoder.
56
+ max_position_embeddings (`int`, *optional*, defaults to 32768):
57
+ The maximum sequence length that this model might ever be used with.
58
+ initializer_range (`float`, *optional*, defaults to 0.02):
59
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
60
+ rms_norm_eps (`float`, *optional*, defaults to 1e-06):
61
+ The epsilon used by the rms normalization layers.
62
+ use_cache (`bool`, *optional*, defaults to `True`):
63
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
64
+ relevant if `config.is_decoder=True`.
65
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
66
+ Whether the model's input and output word embeddings should be tied.
67
+ rope_theta (`float`, *optional*, defaults to 10000.0):
68
+ The base period of the RoPE embeddings.
69
+ use_sliding_window (`bool`, *optional*, defaults to `False`):
70
+ Whether to use sliding window attention.
71
+ sliding_window (`int`, *optional*, defaults to 4096):
72
+ Sliding window attention (SWA) window size. If not specified, will default to `4096`. Only stored when `use_sliding_window` is `True`; otherwise `sliding_window` is set to `None`.
73
+ max_window_layers (`int`, *optional*, defaults to 28):
74
+ The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
75
+ attention_dropout (`float`, *optional*, defaults to 0.0):
76
+ The dropout ratio for the attention probabilities.
77
+
78
+ ```python
79
+ >>> from transformers import Qwen2Model, Qwen2Config
80
+
81
+ >>> # Initializing a Qwen2 style configuration
82
+ >>> configuration = Qwen2Config()
83
+
84
+ >>> # Initializing a model from the Qwen2-7B style configuration
85
+ >>> model = Qwen2Model(configuration)
86
+
87
+ >>> # Accessing the model configuration
88
+ >>> configuration = model.config
89
+ ```"""
90
+
91
+ model_type = "qwen2"
92
+ keys_to_ignore_at_inference = ["past_key_values"]
93
+
94
+ def __init__(
95
+ self,
96
+ vocab_size=151936,
97
+ hidden_size=4096,
98
+ intermediate_size=22016,
99
+ num_hidden_layers=32,
100
+ num_attention_heads=32,
101
+ num_key_value_heads=32,
102
+ hidden_act="silu",
103
+ max_position_embeddings=32768,
104
+ initializer_range=0.02,
105
+ rms_norm_eps=1e-6,
106
+ use_cache=True,
107
+ tie_word_embeddings=False,
108
+ rope_theta=10000.0,
109
+ use_sliding_window=False,
110
+ sliding_window=4096,
111
+ max_window_layers=28,
112
+ attention_dropout=0.0,
113
+ **kwargs,
114
+ ):
115
+ self.vocab_size = vocab_size
116
+ self.max_position_embeddings = max_position_embeddings
117
+ self.hidden_size = hidden_size
118
+ self.intermediate_size = intermediate_size
119
+ self.num_hidden_layers = num_hidden_layers
120
+ self.num_attention_heads = num_attention_heads
121
+ self.use_sliding_window = use_sliding_window
122
+ self.sliding_window = sliding_window if use_sliding_window else None
123
+ self.max_window_layers = max_window_layers
124
+
125
+ # for backward compatibility
126
+ if num_key_value_heads is None:
127
+ num_key_value_heads = num_attention_heads
128
+
129
+ self.num_key_value_heads = num_key_value_heads
130
+ self.hidden_act = hidden_act
131
+ self.initializer_range = initializer_range
132
+ self.rms_norm_eps = rms_norm_eps
133
+ self.use_cache = use_cache
134
+ self.rope_theta = rope_theta
135
+ self.attention_dropout = attention_dropout
136
+
137
+ super().__init__(
138
+ tie_word_embeddings=tie_word_embeddings,
139
+ **kwargs,
140
+ )
generation_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "pad_token_id": 151643,
4
+ "do_sample": true,
5
+ "eos_token_id": [
6
+ 151645,
7
+ 151643
8
+ ],
9
+ "repetition_penalty": 1.05,
10
+ "temperature": 0.7,
11
+ "top_p": 0.8,
12
+ "top_k": 20,
13
+ "transformers_version": "4.37.0"
14
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00037.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a51e9240c91e877cad64d8988195d9e8c66abe88a98e095a51e303f08f175a1
3
+ size 3896596186
model-00002-of-00037.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78f643b5b89a35c939d95dd43b980c3d9f37a9ea93ca456cb60d92a93f58ef4a
3
+ size 3995200440
model-00003-of-00037.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4cb04ff53772c29b499893415d0e8cc67d61eb04115a7885d518bf72a8e6788
3
+ size 3812769392
model-00004-of-00037.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cae0a89fa74a51b4d76021b57eaea8ddb67c1a3f4e7938f73ccf59d12fd78fbf
3
+ size 3995183944
model-00005-of-00037.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e4e05235061b8f23a4d74a4732ef1049792316dc8c9a64481f9abe5c9f253946
3
+ size 3995183944
model-00006-of-00037.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81f43c93a47ae5069bd2e3ba4bd670aa36bec23eaf2651230d8282c0428705f9
3
+ size 3995200456
model-00007-of-00037.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8400c01a2a1056edb32a377e66ab0b1bc3b0a06a03cf09cd22b6c43cc42b4f8f
3
+ size 3812769424
model-00008-of-00037.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d2d9535681399fec92f53123518500ffd010f7bafe6eb2e4994c5a9b3e72e8ef
3
+ size 3995183968
model-00009-of-00037.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3defdc2a2184ff007ba2083e9e4f65714478db8ac6a1242817257b72b5ec513
3
+ size 3995183968
model-00010-of-00037.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf79f0328d48a76cd0bd98f163706285c7091f9c3cb3e6596217425deb803af9
3
+ size 3995200464
model-00011-of-00037.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:469b0f9c514935e0aeed006eabd479b454b88f97f45b486630047d8b7b336b8a
3
+ size 3812769424
model-00012-of-00037.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4c362e7d0bb8b122a6d084be68e0d00410fe7ffeb636ea7ed98ce66fbbb5b4d
3
+ size 3995183968
model-00013-of-00037.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e2bebd29bb1559360892c6493be9a2b0a9f2769fccd2194ad0a94358c488896
3
+ size 3995183968
model-00014-of-00037.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a2360c019a1d9d91f9cb299bb413f7f1f5b923e35ca5568402f76e9eaa89c11
3
+ size 3995200464
model-00015-of-00037.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d136b7a1eb7455949667877625f35a6f72576472d61f995271ee600377482e7
3
+ size 3812769424
model-00016-of-00037.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2170c194b75963f2ffbe440ba832774c5c12e2d5bcb68518b83341bc68c8bef9
3
+ size 3995183968
model-00017-of-00037.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c68a761c5cac68f822931f4b70a35fab0d6d5e1eb59760a3d61e7a642a62d272
3
+ size 3995183968
model-00018-of-00037.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2434d3d50078275b2d88b6eb8456816e4e34b0fd8ddc402db2ddbe5d6bba4dce
3
+ size 3995200464
model-00019-of-00037.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d6a6f2df2e36938cb6d752935b038c543596ef464875a8d27cefadfce23b310
3
+ size 3812769424
model-00020-of-00037.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ce7deef30c47cc1e965c7790c2f923f0b7ee2606436155e2edcc89e69c9afe3
3
+ size 3995183968
model-00021-of-00037.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e2e0684a0b826621c72d60b232d0d6eece2767058635a06b7b9374374e7c3c6
3
+ size 3995183968
model-00022-of-00037.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:510af817b70d11de63df9e1c90aeaaf1d6656aa5621101aaf3962d0df2aa43eb
3
+ size 3995200464
model-00023-of-00037.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41c44a4729399d7f8e161616ad2dd5ce39627176c3faa480709d8521b9b8e71b
3
+ size 3812769424
model-00024-of-00037.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ac7860bea68b6181ec1d0350ab728268593f087d50ed3155150db7dac33858f
3
+ size 3995183968
model-00025-of-00037.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:219576a24a15338fa7fff19b673cc75f3964512be19200935068f66e2605e9a9
3
+ size 3995183968
model-00026-of-00037.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b58f0c4484e7ab8c8a85c860f85043adafd87495b030952e75d3169831f18b6
3
+ size 3995200464
model-00027-of-00037.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bdc606f27e9d12b6f81c21568e40f304e56bd5a3ff94aa81abeefc46313b2014
3
+ size 3812769424
model-00028-of-00037.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d1a3c15641be3e98a417213a95d8a63df434aa970ff7f2e48ec4b31ae73fa77
3
+ size 3995183968
model-00029-of-00037.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f444259ff180103090fcc1fbee00f38332c133352c458be6b0d0f648af2de39a
3
+ size 3995183968
model-00030-of-00037.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:05f115361acef294ad1986f2d156681306ad39a4589c42003203528976be1c82
3
+ size 3995200464
model-00031-of-00037.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fbd380e7eb1b1acd2033b080c6a18c4f30cccd1fcb579e1bf9fc9537cc7648f2
3
+ size 3812769424
model-00032-of-00037.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e00cfb5c7c1f9ed366d194c7a33f92b5bc2b923c4525374f5d0699ef8c791abb
3
+ size 3995183968
model-00033-of-00037.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42ba0d5b29224cfdaeb1e0a1c887a114063f28ccbfc048d8cdf59c5aeb69a66f
3
+ size 3995183968
model-00034-of-00037.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ddcec0c3dfadadd316329ecba7e21c08b1b9d0b10ef061dd883da65ec4b2fce1
3
+ size 3995200464
model-00035-of-00037.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85281b5599dade6709abcabe9ee2391cf851dc489e48586ad742656c28c55dec
3
+ size 3812769424
model-00036-of-00037.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:266984ffcc4eec34c468203d9a53a6c1064bd2e0e2407951785f68860a743684
3
+ size 3995183968
model-00037-of-00037.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa571248670fe1b3d498f3dda740f96422677de020914792ddc68aa6165c77b7
3
+ size 3460317640
model.safetensors.index.json ADDED
@@ -0,0 +1,974 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 145546657794
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "model-00037-of-00037.safetensors",
7
+ "model.embed_tokens.weight": "model-00001-of-00037.safetensors",
8
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00037.safetensors",
9
+ "model.layers.0.mlp.down_proj.weight": "model-00002-of-00037.safetensors",
10
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00037.safetensors",
11
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00037.safetensors",
12
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00037.safetensors",
13
+ "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00037.safetensors",
14
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00037.safetensors",
15
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00037.safetensors",
16
+ "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00037.safetensors",
17
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00037.safetensors",
18
+ "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00037.safetensors",
19
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00037.safetensors",
20
+ "model.layers.1.input_layernorm.weight": "model-00002-of-00037.safetensors",
21
+ "model.layers.1.mlp.down_proj.weight": "model-00002-of-00037.safetensors",
22
+ "model.layers.1.mlp.gate_proj.weight": "model-00002-of-00037.safetensors",
23
+ "model.layers.1.mlp.up_proj.weight": "model-00002-of-00037.safetensors",
24
+ "model.layers.1.post_attention_layernorm.weight": "model-00002-of-00037.safetensors",
25
+ "model.layers.1.self_attn.k_proj.bias": "model-00002-of-00037.safetensors",
26
+ "model.layers.1.self_attn.k_proj.weight": "model-00002-of-00037.safetensors",
27
+ "model.layers.1.self_attn.o_proj.weight": "model-00002-of-00037.safetensors",
28
+ "model.layers.1.self_attn.q_proj.bias": "model-00002-of-00037.safetensors",
29
+ "model.layers.1.self_attn.q_proj.weight": "model-00002-of-00037.safetensors",
30
+ "model.layers.1.self_attn.v_proj.bias": "model-00002-of-00037.safetensors",
31
+ "model.layers.1.self_attn.v_proj.weight": "model-00002-of-00037.safetensors",
32
+ "model.layers.10.input_layernorm.weight": "model-00006-of-00037.safetensors",
33
+ "model.layers.10.mlp.down_proj.weight": "model-00006-of-00037.safetensors",
34
+ "model.layers.10.mlp.gate_proj.weight": "model-00006-of-00037.safetensors",
35
+ "model.layers.10.mlp.up_proj.weight": "model-00006-of-00037.safetensors",
36
+ "model.layers.10.post_attention_layernorm.weight": "model-00006-of-00037.safetensors",
37
+ "model.layers.10.self_attn.k_proj.bias": "model-00006-of-00037.safetensors",
38
+ "model.layers.10.self_attn.k_proj.weight": "model-00006-of-00037.safetensors",
39
+ "model.layers.10.self_attn.o_proj.weight": "model-00006-of-00037.safetensors",
40
+ "model.layers.10.self_attn.q_proj.bias": "model-00006-of-00037.safetensors",
41
+ "model.layers.10.self_attn.q_proj.weight": "model-00006-of-00037.safetensors",
42
+ "model.layers.10.self_attn.v_proj.bias": "model-00006-of-00037.safetensors",
43
+ "model.layers.10.self_attn.v_proj.weight": "model-00006-of-00037.safetensors",
44
+ "model.layers.11.input_layernorm.weight": "model-00006-of-00037.safetensors",
45
+ "model.layers.11.mlp.down_proj.weight": "model-00006-of-00037.safetensors",
46
+ "model.layers.11.mlp.gate_proj.weight": "model-00006-of-00037.safetensors",
47
+ "model.layers.11.mlp.up_proj.weight": "model-00006-of-00037.safetensors",
48
+ "model.layers.11.post_attention_layernorm.weight": "model-00006-of-00037.safetensors",
49
+ "model.layers.11.self_attn.k_proj.bias": "model-00006-of-00037.safetensors",
50
+ "model.layers.11.self_attn.k_proj.weight": "model-00006-of-00037.safetensors",
51
+ "model.layers.11.self_attn.o_proj.weight": "model-00006-of-00037.safetensors",
52
+ "model.layers.11.self_attn.q_proj.bias": "model-00006-of-00037.safetensors",
53
+ "model.layers.11.self_attn.q_proj.weight": "model-00006-of-00037.safetensors",
54
+ "model.layers.11.self_attn.v_proj.bias": "model-00006-of-00037.safetensors",
55
+ "model.layers.11.self_attn.v_proj.weight": "model-00006-of-00037.safetensors",
56
+ "model.layers.12.input_layernorm.weight": "model-00006-of-00037.safetensors",
57
+ "model.layers.12.mlp.down_proj.weight": "model-00007-of-00037.safetensors",
58
+ "model.layers.12.mlp.gate_proj.weight": "model-00007-of-00037.safetensors",
59
+ "model.layers.12.mlp.up_proj.weight": "model-00007-of-00037.safetensors",
60
+ "model.layers.12.post_attention_layernorm.weight": "model-00007-of-00037.safetensors",
61
+ "model.layers.12.self_attn.k_proj.bias": "model-00007-of-00037.safetensors",
62
+ "model.layers.12.self_attn.k_proj.weight": "model-00007-of-00037.safetensors",
63
+ "model.layers.12.self_attn.o_proj.weight": "model-00007-of-00037.safetensors",
64
+ "model.layers.12.self_attn.q_proj.bias": "model-00007-of-00037.safetensors",
65
+ "model.layers.12.self_attn.q_proj.weight": "model-00007-of-00037.safetensors",
66
+ "model.layers.12.self_attn.v_proj.bias": "model-00007-of-00037.safetensors",
67
+ "model.layers.12.self_attn.v_proj.weight": "model-00007-of-00037.safetensors",
68
+ "model.layers.13.input_layernorm.weight": "model-00007-of-00037.safetensors",
69
+ "model.layers.13.mlp.down_proj.weight": "model-00007-of-00037.safetensors",
70
+ "model.layers.13.mlp.gate_proj.weight": "model-00007-of-00037.safetensors",
71
+ "model.layers.13.mlp.up_proj.weight": "model-00007-of-00037.safetensors",
72
+ "model.layers.13.post_attention_layernorm.weight": "model-00007-of-00037.safetensors",
73
+ "model.layers.13.self_attn.k_proj.bias": "model-00007-of-00037.safetensors",
74
+ "model.layers.13.self_attn.k_proj.weight": "model-00007-of-00037.safetensors",
75
+ "model.layers.13.self_attn.o_proj.weight": "model-00007-of-00037.safetensors",
76
+ "model.layers.13.self_attn.q_proj.bias": "model-00007-of-00037.safetensors",
77
+ "model.layers.13.self_attn.q_proj.weight": "model-00007-of-00037.safetensors",
78
+ "model.layers.13.self_attn.v_proj.bias": "model-00007-of-00037.safetensors",
79
+ "model.layers.13.self_attn.v_proj.weight": "model-00007-of-00037.safetensors",
80
+ "model.layers.14.input_layernorm.weight": "model-00007-of-00037.safetensors",
81
+ "model.layers.14.mlp.down_proj.weight": "model-00008-of-00037.safetensors",
82
+ "model.layers.14.mlp.gate_proj.weight": "model-00008-of-00037.safetensors",
83
+ "model.layers.14.mlp.up_proj.weight": "model-00008-of-00037.safetensors",
84
+ "model.layers.14.post_attention_layernorm.weight": "model-00007-of-00037.safetensors",
85
+ "model.layers.14.self_attn.k_proj.bias": "model-00007-of-00037.safetensors",
86
+ "model.layers.14.self_attn.k_proj.weight": "model-00007-of-00037.safetensors",
87
+ "model.layers.14.self_attn.o_proj.weight": "model-00007-of-00037.safetensors",
88
+ "model.layers.14.self_attn.q_proj.bias": "model-00007-of-00037.safetensors",
89
+ "model.layers.14.self_attn.q_proj.weight": "model-00007-of-00037.safetensors",
90
+ "model.layers.14.self_attn.v_proj.bias": "model-00007-of-00037.safetensors",
91
+ "model.layers.14.self_attn.v_proj.weight": "model-00007-of-00037.safetensors",
92
+ "model.layers.15.input_layernorm.weight": "model-00008-of-00037.safetensors",
93
+ "model.layers.15.mlp.down_proj.weight": "model-00008-of-00037.safetensors",
94
+ "model.layers.15.mlp.gate_proj.weight": "model-00008-of-00037.safetensors",
95
+ "model.layers.15.mlp.up_proj.weight": "model-00008-of-00037.safetensors",
96
+ "model.layers.15.post_attention_layernorm.weight": "model-00008-of-00037.safetensors",
97
+ "model.layers.15.self_attn.k_proj.bias": "model-00008-of-00037.safetensors",
98
+ "model.layers.15.self_attn.k_proj.weight": "model-00008-of-00037.safetensors",
99
+ "model.layers.15.self_attn.o_proj.weight": "model-00008-of-00037.safetensors",
100
+ "model.layers.15.self_attn.q_proj.bias": "model-00008-of-00037.safetensors",
101
+ "model.layers.15.self_attn.q_proj.weight": "model-00008-of-00037.safetensors",
102
+ "model.layers.15.self_attn.v_proj.bias": "model-00008-of-00037.safetensors",
103
+ "model.layers.15.self_attn.v_proj.weight": "model-00008-of-00037.safetensors",
104
+ "model.layers.16.input_layernorm.weight": "model-00008-of-00037.safetensors",
105
+ "model.layers.16.mlp.down_proj.weight": "model-00009-of-00037.safetensors",
106
+ "model.layers.16.mlp.gate_proj.weight": "model-00009-of-00037.safetensors",
107
+ "model.layers.16.mlp.up_proj.weight": "model-00008-of-00037.safetensors",
108
+ "model.layers.16.post_attention_layernorm.weight": "model-00008-of-00037.safetensors",
109
+ "model.layers.16.self_attn.k_proj.bias": "model-00008-of-00037.safetensors",
110
+ "model.layers.16.self_attn.k_proj.weight": "model-00008-of-00037.safetensors",
111
+ "model.layers.16.self_attn.o_proj.weight": "model-00008-of-00037.safetensors",
112
+ "model.layers.16.self_attn.q_proj.bias": "model-00008-of-00037.safetensors",
113
+ "model.layers.16.self_attn.q_proj.weight": "model-00008-of-00037.safetensors",
114
+ "model.layers.16.self_attn.v_proj.bias": "model-00008-of-00037.safetensors",
115
+ "model.layers.16.self_attn.v_proj.weight": "model-00008-of-00037.safetensors",
116
+ "model.layers.17.input_layernorm.weight": "model-00009-of-00037.safetensors",
117
+ "model.layers.17.mlp.down_proj.weight": "model-00009-of-00037.safetensors",
118
+ "model.layers.17.mlp.gate_proj.weight": "model-00009-of-00037.safetensors",
119
+ "model.layers.17.mlp.up_proj.weight": "model-00009-of-00037.safetensors",
120
+ "model.layers.17.post_attention_layernorm.weight": "model-00009-of-00037.safetensors",
121
+ "model.layers.17.self_attn.k_proj.bias": "model-00009-of-00037.safetensors",
122
+ "model.layers.17.self_attn.k_proj.weight": "model-00009-of-00037.safetensors",
123
+ "model.layers.17.self_attn.o_proj.weight": "model-00009-of-00037.safetensors",
124
+ "model.layers.17.self_attn.q_proj.bias": "model-00009-of-00037.safetensors",
125
+ "model.layers.17.self_attn.q_proj.weight": "model-00009-of-00037.safetensors",
126
+ "model.layers.17.self_attn.v_proj.bias": "model-00009-of-00037.safetensors",
127
+ "model.layers.17.self_attn.v_proj.weight": "model-00009-of-00037.safetensors",
128
+ "model.layers.18.input_layernorm.weight": "model-00009-of-00037.safetensors",
129
+ "model.layers.18.mlp.down_proj.weight": "model-00010-of-00037.safetensors",
130
+ "model.layers.18.mlp.gate_proj.weight": "model-00009-of-00037.safetensors",
131
+ "model.layers.18.mlp.up_proj.weight": "model-00009-of-00037.safetensors",
132
+ "model.layers.18.post_attention_layernorm.weight": "model-00009-of-00037.safetensors",
133
+ "model.layers.18.self_attn.k_proj.bias": "model-00009-of-00037.safetensors",
134
+ "model.layers.18.self_attn.k_proj.weight": "model-00009-of-00037.safetensors",
135
+ "model.layers.18.self_attn.o_proj.weight": "model-00009-of-00037.safetensors",
136
+ "model.layers.18.self_attn.q_proj.bias": "model-00009-of-00037.safetensors",
137
+ "model.layers.18.self_attn.q_proj.weight": "model-00009-of-00037.safetensors",
138
+ "model.layers.18.self_attn.v_proj.bias": "model-00009-of-00037.safetensors",
139
+ "model.layers.18.self_attn.v_proj.weight": "model-00009-of-00037.safetensors",
140
+ "model.layers.19.input_layernorm.weight": "model-00010-of-00037.safetensors",
141
+ "model.layers.19.mlp.down_proj.weight": "model-00010-of-00037.safetensors",
142
+ "model.layers.19.mlp.gate_proj.weight": "model-00010-of-00037.safetensors",
143
+ "model.layers.19.mlp.up_proj.weight": "model-00010-of-00037.safetensors",
144
+ "model.layers.19.post_attention_layernorm.weight": "model-00010-of-00037.safetensors",
145
+ "model.layers.19.self_attn.k_proj.bias": "model-00010-of-00037.safetensors",
146
+ "model.layers.19.self_attn.k_proj.weight": "model-00010-of-00037.safetensors",
147
+ "model.layers.19.self_attn.o_proj.weight": "model-00010-of-00037.safetensors",
148
+ "model.layers.19.self_attn.q_proj.bias": "model-00010-of-00037.safetensors",
149
+ "model.layers.19.self_attn.q_proj.weight": "model-00010-of-00037.safetensors",
150
+ "model.layers.19.self_attn.v_proj.bias": "model-00010-of-00037.safetensors",
151
+ "model.layers.19.self_attn.v_proj.weight": "model-00010-of-00037.safetensors",
152
+ "model.layers.2.input_layernorm.weight": "model-00002-of-00037.safetensors",
153
+ "model.layers.2.mlp.down_proj.weight": "model-00002-of-00037.safetensors",
154
+ "model.layers.2.mlp.gate_proj.weight": "model-00002-of-00037.safetensors",
155
+ "model.layers.2.mlp.up_proj.weight": "model-00002-of-00037.safetensors",
156
+ "model.layers.2.post_attention_layernorm.weight": "model-00002-of-00037.safetensors",
157
+ "model.layers.2.self_attn.k_proj.bias": "model-00002-of-00037.safetensors",
158
+ "model.layers.2.self_attn.k_proj.weight": "model-00002-of-00037.safetensors",
159
+ "model.layers.2.self_attn.o_proj.weight": "model-00002-of-00037.safetensors",
160
+ "model.layers.2.self_attn.q_proj.bias": "model-00002-of-00037.safetensors",
161
+ "model.layers.2.self_attn.q_proj.weight": "model-00002-of-00037.safetensors",
162
+ "model.layers.2.self_attn.v_proj.bias": "model-00002-of-00037.safetensors",
163
+ "model.layers.2.self_attn.v_proj.weight": "model-00002-of-00037.safetensors",
164
+ "model.layers.20.input_layernorm.weight": "model-00010-of-00037.safetensors",
165
+ "model.layers.20.mlp.down_proj.weight": "model-00010-of-00037.safetensors",
166
+ "model.layers.20.mlp.gate_proj.weight": "model-00010-of-00037.safetensors",
167
+ "model.layers.20.mlp.up_proj.weight": "model-00010-of-00037.safetensors",
168
+ "model.layers.20.post_attention_layernorm.weight": "model-00010-of-00037.safetensors",
169
+ "model.layers.20.self_attn.k_proj.bias": "model-00010-of-00037.safetensors",
170
+ "model.layers.20.self_attn.k_proj.weight": "model-00010-of-00037.safetensors",
171
+ "model.layers.20.self_attn.o_proj.weight": "model-00010-of-00037.safetensors",
172
+ "model.layers.20.self_attn.q_proj.bias": "model-00010-of-00037.safetensors",
173
+ "model.layers.20.self_attn.q_proj.weight": "model-00010-of-00037.safetensors",
174
+ "model.layers.20.self_attn.v_proj.bias": "model-00010-of-00037.safetensors",
175
+ "model.layers.20.self_attn.v_proj.weight": "model-00010-of-00037.safetensors",
176
+ "model.layers.21.input_layernorm.weight": "model-00010-of-00037.safetensors",
177
+ "model.layers.21.mlp.down_proj.weight": "model-00011-of-00037.safetensors",
178
+ "model.layers.21.mlp.gate_proj.weight": "model-00011-of-00037.safetensors",
179
+ "model.layers.21.mlp.up_proj.weight": "model-00011-of-00037.safetensors",
180
+ "model.layers.21.post_attention_layernorm.weight": "model-00011-of-00037.safetensors",
181
+ "model.layers.21.self_attn.k_proj.bias": "model-00011-of-00037.safetensors",
182
+ "model.layers.21.self_attn.k_proj.weight": "model-00011-of-00037.safetensors",
183
+ "model.layers.21.self_attn.o_proj.weight": "model-00011-of-00037.safetensors",
184
+ "model.layers.21.self_attn.q_proj.bias": "model-00011-of-00037.safetensors",
185
+ "model.layers.21.self_attn.q_proj.weight": "model-00011-of-00037.safetensors",
186
+ "model.layers.21.self_attn.v_proj.bias": "model-00011-of-00037.safetensors",
187
+ "model.layers.21.self_attn.v_proj.weight": "model-00011-of-00037.safetensors",
188
+ "model.layers.22.input_layernorm.weight": "model-00011-of-00037.safetensors",
189
+ "model.layers.22.mlp.down_proj.weight": "model-00011-of-00037.safetensors",
190
+ "model.layers.22.mlp.gate_proj.weight": "model-00011-of-00037.safetensors",
191
+ "model.layers.22.mlp.up_proj.weight": "model-00011-of-00037.safetensors",
192
+ "model.layers.22.post_attention_layernorm.weight": "model-00011-of-00037.safetensors",
193
+ "model.layers.22.self_attn.k_proj.bias": "model-00011-of-00037.safetensors",
194
+ "model.layers.22.self_attn.k_proj.weight": "model-00011-of-00037.safetensors",
195
+ "model.layers.22.self_attn.o_proj.weight": "model-00011-of-00037.safetensors",
196
+ "model.layers.22.self_attn.q_proj.bias": "model-00011-of-00037.safetensors",
197
+ "model.layers.22.self_attn.q_proj.weight": "model-00011-of-00037.safetensors",
198
+ "model.layers.22.self_attn.v_proj.bias": "model-00011-of-00037.safetensors",
199
+ "model.layers.22.self_attn.v_proj.weight": "model-00011-of-00037.safetensors",
200
+ "model.layers.23.input_layernorm.weight": "model-00011-of-00037.safetensors",
201
+ "model.layers.23.mlp.down_proj.weight": "model-00012-of-00037.safetensors",
202
+ "model.layers.23.mlp.gate_proj.weight": "model-00012-of-00037.safetensors",
203
+ "model.layers.23.mlp.up_proj.weight": "model-00012-of-00037.safetensors",
204
+ "model.layers.23.post_attention_layernorm.weight": "model-00011-of-00037.safetensors",
205
+ "model.layers.23.self_attn.k_proj.bias": "model-00011-of-00037.safetensors",
206
+ "model.layers.23.self_attn.k_proj.weight": "model-00011-of-00037.safetensors",
207
+ "model.layers.23.self_attn.o_proj.weight": "model-00011-of-00037.safetensors",
208
+ "model.layers.23.self_attn.q_proj.bias": "model-00011-of-00037.safetensors",
209
+ "model.layers.23.self_attn.q_proj.weight": "model-00011-of-00037.safetensors",
210
+ "model.layers.23.self_attn.v_proj.bias": "model-00011-of-00037.safetensors",
211
+ "model.layers.23.self_attn.v_proj.weight": "model-00011-of-00037.safetensors",
212
+ "model.layers.24.input_layernorm.weight": "model-00012-of-00037.safetensors",
213
+ "model.layers.24.mlp.down_proj.weight": "model-00012-of-00037.safetensors",
214
+ "model.layers.24.mlp.gate_proj.weight": "model-00012-of-00037.safetensors",
215
+ "model.layers.24.mlp.up_proj.weight": "model-00012-of-00037.safetensors",
216
+ "model.layers.24.post_attention_layernorm.weight": "model-00012-of-00037.safetensors",
217
+ "model.layers.24.self_attn.k_proj.bias": "model-00012-of-00037.safetensors",
218
+ "model.layers.24.self_attn.k_proj.weight": "model-00012-of-00037.safetensors",
219
+ "model.layers.24.self_attn.o_proj.weight": "model-00012-of-00037.safetensors",
220
+ "model.layers.24.self_attn.q_proj.bias": "model-00012-of-00037.safetensors",
221
+ "model.layers.24.self_attn.q_proj.weight": "model-00012-of-00037.safetensors",
222
+ "model.layers.24.self_attn.v_proj.bias": "model-00012-of-00037.safetensors",
223
+ "model.layers.24.self_attn.v_proj.weight": "model-00012-of-00037.safetensors",
224
+ "model.layers.25.input_layernorm.weight": "model-00012-of-00037.safetensors",
225
+ "model.layers.25.mlp.down_proj.weight": "model-00013-of-00037.safetensors",
226
+ "model.layers.25.mlp.gate_proj.weight": "model-00013-of-00037.safetensors",
227
+ "model.layers.25.mlp.up_proj.weight": "model-00012-of-00037.safetensors",
228
+ "model.layers.25.post_attention_layernorm.weight": "model-00012-of-00037.safetensors",
229
+ "model.layers.25.self_attn.k_proj.bias": "model-00012-of-00037.safetensors",
230
+ "model.layers.25.self_attn.k_proj.weight": "model-00012-of-00037.safetensors",
231
+ "model.layers.25.self_attn.o_proj.weight": "model-00012-of-00037.safetensors",
232
+ "model.layers.25.self_attn.q_proj.bias": "model-00012-of-00037.safetensors",
233
+ "model.layers.25.self_attn.q_proj.weight": "model-00012-of-00037.safetensors",
234
+ "model.layers.25.self_attn.v_proj.bias": "model-00012-of-00037.safetensors",
235
+ "model.layers.25.self_attn.v_proj.weight": "model-00012-of-00037.safetensors",
236
+ "model.layers.26.input_layernorm.weight": "model-00013-of-00037.safetensors",
237
+ "model.layers.26.mlp.down_proj.weight": "model-00013-of-00037.safetensors",
238
+ "model.layers.26.mlp.gate_proj.weight": "model-00013-of-00037.safetensors",
239
+ "model.layers.26.mlp.up_proj.weight": "model-00013-of-00037.safetensors",
240
+ "model.layers.26.post_attention_layernorm.weight": "model-00013-of-00037.safetensors",
241
+ "model.layers.26.self_attn.k_proj.bias": "model-00013-of-00037.safetensors",
242
+ "model.layers.26.self_attn.k_proj.weight": "model-00013-of-00037.safetensors",
243
+ "model.layers.26.self_attn.o_proj.weight": "model-00013-of-00037.safetensors",
244
+ "model.layers.26.self_attn.q_proj.bias": "model-00013-of-00037.safetensors",
245
+ "model.layers.26.self_attn.q_proj.weight": "model-00013-of-00037.safetensors",
246
+ "model.layers.26.self_attn.v_proj.bias": "model-00013-of-00037.safetensors",
247
+ "model.layers.26.self_attn.v_proj.weight": "model-00013-of-00037.safetensors",
248
+ "model.layers.27.input_layernorm.weight": "model-00013-of-00037.safetensors",
249
+ "model.layers.27.mlp.down_proj.weight": "model-00014-of-00037.safetensors",
250
+ "model.layers.27.mlp.gate_proj.weight": "model-00013-of-00037.safetensors",
251
+ "model.layers.27.mlp.up_proj.weight": "model-00013-of-00037.safetensors",
252
+ "model.layers.27.post_attention_layernorm.weight": "model-00013-of-00037.safetensors",
253
+ "model.layers.27.self_attn.k_proj.bias": "model-00013-of-00037.safetensors",
254
+ "model.layers.27.self_attn.k_proj.weight": "model-00013-of-00037.safetensors",
255
+ "model.layers.27.self_attn.o_proj.weight": "model-00013-of-00037.safetensors",
256
+ "model.layers.27.self_attn.q_proj.bias": "model-00013-of-00037.safetensors",
257
+ "model.layers.27.self_attn.q_proj.weight": "model-00013-of-00037.safetensors",
258
+ "model.layers.27.self_attn.v_proj.bias": "model-00013-of-00037.safetensors",
259
+ "model.layers.27.self_attn.v_proj.weight": "model-00013-of-00037.safetensors",
260
+ "model.layers.28.input_layernorm.weight": "model-00014-of-00037.safetensors",
261
+ "model.layers.28.mlp.down_proj.weight": "model-00014-of-00037.safetensors",
262
+ "model.layers.28.mlp.gate_proj.weight": "model-00014-of-00037.safetensors",
263
+ "model.layers.28.mlp.up_proj.weight": "model-00014-of-00037.safetensors",
264
+ "model.layers.28.post_attention_layernorm.weight": "model-00014-of-00037.safetensors",
265
+ "model.layers.28.self_attn.k_proj.bias": "model-00014-of-00037.safetensors",
266
+ "model.layers.28.self_attn.k_proj.weight": "model-00014-of-00037.safetensors",
267
+ "model.layers.28.self_attn.o_proj.weight": "model-00014-of-00037.safetensors",
268
+ "model.layers.28.self_attn.q_proj.bias": "model-00014-of-00037.safetensors",
269
+ "model.layers.28.self_attn.q_proj.weight": "model-00014-of-00037.safetensors",
270
+ "model.layers.28.self_attn.v_proj.bias": "model-00014-of-00037.safetensors",
271
+ "model.layers.28.self_attn.v_proj.weight": "model-00014-of-00037.safetensors",
272
+ "model.layers.29.input_layernorm.weight": "model-00014-of-00037.safetensors",
273
+ "model.layers.29.mlp.down_proj.weight": "model-00014-of-00037.safetensors",
274
+ "model.layers.29.mlp.gate_proj.weight": "model-00014-of-00037.safetensors",
275
+ "model.layers.29.mlp.up_proj.weight": "model-00014-of-00037.safetensors",
276
+ "model.layers.29.post_attention_layernorm.weight": "model-00014-of-00037.safetensors",
277
+ "model.layers.29.self_attn.k_proj.bias": "model-00014-of-00037.safetensors",
278
+ "model.layers.29.self_attn.k_proj.weight": "model-00014-of-00037.safetensors",
279
+ "model.layers.29.self_attn.o_proj.weight": "model-00014-of-00037.safetensors",
280
+ "model.layers.29.self_attn.q_proj.bias": "model-00014-of-00037.safetensors",
281
+ "model.layers.29.self_attn.q_proj.weight": "model-00014-of-00037.safetensors",
282
+ "model.layers.29.self_attn.v_proj.bias": "model-00014-of-00037.safetensors",
283
+ "model.layers.29.self_attn.v_proj.weight": "model-00014-of-00037.safetensors",
284
+ "model.layers.3.input_layernorm.weight": "model-00002-of-00037.safetensors",
285
+ "model.layers.3.mlp.down_proj.weight": "model-00003-of-00037.safetensors",
286
+ "model.layers.3.mlp.gate_proj.weight": "model-00003-of-00037.safetensors",
287
+ "model.layers.3.mlp.up_proj.weight": "model-00003-of-00037.safetensors",
288
+ "model.layers.3.post_attention_layernorm.weight": "model-00003-of-00037.safetensors",
289
+ "model.layers.3.self_attn.k_proj.bias": "model-00003-of-00037.safetensors",
290
+ "model.layers.3.self_attn.k_proj.weight": "model-00003-of-00037.safetensors",
291
+ "model.layers.3.self_attn.o_proj.weight": "model-00003-of-00037.safetensors",
292
+ "model.layers.3.self_attn.q_proj.bias": "model-00003-of-00037.safetensors",
293
+ "model.layers.3.self_attn.q_proj.weight": "model-00003-of-00037.safetensors",
294
+ "model.layers.3.self_attn.v_proj.bias": "model-00003-of-00037.safetensors",
295
+ "model.layers.3.self_attn.v_proj.weight": "model-00003-of-00037.safetensors",
296
+ "model.layers.30.input_layernorm.weight": "model-00014-of-00037.safetensors",
297
+ "model.layers.30.mlp.down_proj.weight": "model-00015-of-00037.safetensors",
298
+ "model.layers.30.mlp.gate_proj.weight": "model-00015-of-00037.safetensors",
299
+ "model.layers.30.mlp.up_proj.weight": "model-00015-of-00037.safetensors",
300
+ "model.layers.30.post_attention_layernorm.weight": "model-00015-of-00037.safetensors",
301
+ "model.layers.30.self_attn.k_proj.bias": "model-00015-of-00037.safetensors",
302
+ "model.layers.30.self_attn.k_proj.weight": "model-00015-of-00037.safetensors",
303
+ "model.layers.30.self_attn.o_proj.weight": "model-00015-of-00037.safetensors",
304
+ "model.layers.30.self_attn.q_proj.bias": "model-00015-of-00037.safetensors",
305
+ "model.layers.30.self_attn.q_proj.weight": "model-00015-of-00037.safetensors",
306
+ "model.layers.30.self_attn.v_proj.bias": "model-00015-of-00037.safetensors",
307
+ "model.layers.30.self_attn.v_proj.weight": "model-00015-of-00037.safetensors",
308
+ "model.layers.31.input_layernorm.weight": "model-00015-of-00037.safetensors",
309
+ "model.layers.31.mlp.down_proj.weight": "model-00015-of-00037.safetensors",
310
+ "model.layers.31.mlp.gate_proj.weight": "model-00015-of-00037.safetensors",
311
+ "model.layers.31.mlp.up_proj.weight": "model-00015-of-00037.safetensors",
312
+ "model.layers.31.post_attention_layernorm.weight": "model-00015-of-00037.safetensors",
313
+ "model.layers.31.self_attn.k_proj.bias": "model-00015-of-00037.safetensors",
314
+ "model.layers.31.self_attn.k_proj.weight": "model-00015-of-00037.safetensors",
315
+ "model.layers.31.self_attn.o_proj.weight": "model-00015-of-00037.safetensors",
316
+ "model.layers.31.self_attn.q_proj.bias": "model-00015-of-00037.safetensors",
317
+ "model.layers.31.self_attn.q_proj.weight": "model-00015-of-00037.safetensors",
318
+ "model.layers.31.self_attn.v_proj.bias": "model-00015-of-00037.safetensors",
319
+ "model.layers.31.self_attn.v_proj.weight": "model-00015-of-00037.safetensors",
320
+ "model.layers.32.input_layernorm.weight": "model-00015-of-00037.safetensors",
321
+ "model.layers.32.mlp.down_proj.weight": "model-00016-of-00037.safetensors",
322
+ "model.layers.32.mlp.gate_proj.weight": "model-00016-of-00037.safetensors",
323
+ "model.layers.32.mlp.up_proj.weight": "model-00016-of-00037.safetensors",
324
+ "model.layers.32.post_attention_layernorm.weight": "model-00015-of-00037.safetensors",
325
+ "model.layers.32.self_attn.k_proj.bias": "model-00015-of-00037.safetensors",
326
+ "model.layers.32.self_attn.k_proj.weight": "model-00015-of-00037.safetensors",
327
+ "model.layers.32.self_attn.o_proj.weight": "model-00015-of-00037.safetensors",
328
+ "model.layers.32.self_attn.q_proj.bias": "model-00015-of-00037.safetensors",
329
+ "model.layers.32.self_attn.q_proj.weight": "model-00015-of-00037.safetensors",
330
+ "model.layers.32.self_attn.v_proj.bias": "model-00015-of-00037.safetensors",
331
+ "model.layers.32.self_attn.v_proj.weight": "model-00015-of-00037.safetensors",
332
+ "model.layers.33.input_layernorm.weight": "model-00016-of-00037.safetensors",
333
+ "model.layers.33.mlp.down_proj.weight": "model-00016-of-00037.safetensors",
334
+ "model.layers.33.mlp.gate_proj.weight": "model-00016-of-00037.safetensors",
335
+ "model.layers.33.mlp.up_proj.weight": "model-00016-of-00037.safetensors",
336
+ "model.layers.33.post_attention_layernorm.weight": "model-00016-of-00037.safetensors",
337
+ "model.layers.33.self_attn.k_proj.bias": "model-00016-of-00037.safetensors",
338
+ "model.layers.33.self_attn.k_proj.weight": "model-00016-of-00037.safetensors",
339
+ "model.layers.33.self_attn.o_proj.weight": "model-00016-of-00037.safetensors",
340
+ "model.layers.33.self_attn.q_proj.bias": "model-00016-of-00037.safetensors",
341
+ "model.layers.33.self_attn.q_proj.weight": "model-00016-of-00037.safetensors",
342
+ "model.layers.33.self_attn.v_proj.bias": "model-00016-of-00037.safetensors",
343
+ "model.layers.33.self_attn.v_proj.weight": "model-00016-of-00037.safetensors",
344
+ "model.layers.34.input_layernorm.weight": "model-00016-of-00037.safetensors",
345
+ "model.layers.34.mlp.down_proj.weight": "model-00017-of-00037.safetensors",
346
+ "model.layers.34.mlp.gate_proj.weight": "model-00017-of-00037.safetensors",
347
+ "model.layers.34.mlp.up_proj.weight": "model-00016-of-00037.safetensors",
348
+ "model.layers.34.post_attention_layernorm.weight": "model-00016-of-00037.safetensors",
349
+ "model.layers.34.self_attn.k_proj.bias": "model-00016-of-00037.safetensors",
350
+ "model.layers.34.self_attn.k_proj.weight": "model-00016-of-00037.safetensors",
351
+ "model.layers.34.self_attn.o_proj.weight": "model-00016-of-00037.safetensors",
352
+ "model.layers.34.self_attn.q_proj.bias": "model-00016-of-00037.safetensors",
353
+ "model.layers.34.self_attn.q_proj.weight": "model-00016-of-00037.safetensors",
354
+ "model.layers.34.self_attn.v_proj.bias": "model-00016-of-00037.safetensors",
355
+ "model.layers.34.self_attn.v_proj.weight": "model-00016-of-00037.safetensors",
356
+ "model.layers.35.input_layernorm.weight": "model-00017-of-00037.safetensors",
357
+ "model.layers.35.mlp.down_proj.weight": "model-00017-of-00037.safetensors",
358
+ "model.layers.35.mlp.gate_proj.weight": "model-00017-of-00037.safetensors",
359
+ "model.layers.35.mlp.up_proj.weight": "model-00017-of-00037.safetensors",
360
+ "model.layers.35.post_attention_layernorm.weight": "model-00017-of-00037.safetensors",
361
+ "model.layers.35.self_attn.k_proj.bias": "model-00017-of-00037.safetensors",
362
+ "model.layers.35.self_attn.k_proj.weight": "model-00017-of-00037.safetensors",
363
+ "model.layers.35.self_attn.o_proj.weight": "model-00017-of-00037.safetensors",
364
+ "model.layers.35.self_attn.q_proj.bias": "model-00017-of-00037.safetensors",
365
+ "model.layers.35.self_attn.q_proj.weight": "model-00017-of-00037.safetensors",
366
+ "model.layers.35.self_attn.v_proj.bias": "model-00017-of-00037.safetensors",
367
+ "model.layers.35.self_attn.v_proj.weight": "model-00017-of-00037.safetensors",
368
+ "model.layers.36.input_layernorm.weight": "model-00017-of-00037.safetensors",
369
+ "model.layers.36.mlp.down_proj.weight": "model-00018-of-00037.safetensors",
370
+ "model.layers.36.mlp.gate_proj.weight": "model-00017-of-00037.safetensors",
371
+ "model.layers.36.mlp.up_proj.weight": "model-00017-of-00037.safetensors",
372
+ "model.layers.36.post_attention_layernorm.weight": "model-00017-of-00037.safetensors",
373
+ "model.layers.36.self_attn.k_proj.bias": "model-00017-of-00037.safetensors",
374
+ "model.layers.36.self_attn.k_proj.weight": "model-00017-of-00037.safetensors",
375
+ "model.layers.36.self_attn.o_proj.weight": "model-00017-of-00037.safetensors",
376
+ "model.layers.36.self_attn.q_proj.bias": "model-00017-of-00037.safetensors",
377
+ "model.layers.36.self_attn.q_proj.weight": "model-00017-of-00037.safetensors",
378
+ "model.layers.36.self_attn.v_proj.bias": "model-00017-of-00037.safetensors",
379
+ "model.layers.36.self_attn.v_proj.weight": "model-00017-of-00037.safetensors",
380
+ "model.layers.37.input_layernorm.weight": "model-00018-of-00037.safetensors",
381
+ "model.layers.37.mlp.down_proj.weight": "model-00018-of-00037.safetensors",
382
+ "model.layers.37.mlp.gate_proj.weight": "model-00018-of-00037.safetensors",
383
+ "model.layers.37.mlp.up_proj.weight": "model-00018-of-00037.safetensors",
384
+ "model.layers.37.post_attention_layernorm.weight": "model-00018-of-00037.safetensors",
385
+ "model.layers.37.self_attn.k_proj.bias": "model-00018-of-00037.safetensors",
386
+ "model.layers.37.self_attn.k_proj.weight": "model-00018-of-00037.safetensors",
387
+ "model.layers.37.self_attn.o_proj.weight": "model-00018-of-00037.safetensors",
388
+ "model.layers.37.self_attn.q_proj.bias": "model-00018-of-00037.safetensors",
389
+ "model.layers.37.self_attn.q_proj.weight": "model-00018-of-00037.safetensors",
390
+ "model.layers.37.self_attn.v_proj.bias": "model-00018-of-00037.safetensors",
391
+ "model.layers.37.self_attn.v_proj.weight": "model-00018-of-00037.safetensors",
392
+ "model.layers.38.input_layernorm.weight": "model-00018-of-00037.safetensors",
393
+ "model.layers.38.mlp.down_proj.weight": "model-00018-of-00037.safetensors",
394
+ "model.layers.38.mlp.gate_proj.weight": "model-00018-of-00037.safetensors",
395
+ "model.layers.38.mlp.up_proj.weight": "model-00018-of-00037.safetensors",
396
+ "model.layers.38.post_attention_layernorm.weight": "model-00018-of-00037.safetensors",
397
+ "model.layers.38.self_attn.k_proj.bias": "model-00018-of-00037.safetensors",
398
+ "model.layers.38.self_attn.k_proj.weight": "model-00018-of-00037.safetensors",
399
+ "model.layers.38.self_attn.o_proj.weight": "model-00018-of-00037.safetensors",
400
+ "model.layers.38.self_attn.q_proj.bias": "model-00018-of-00037.safetensors",
401
+ "model.layers.38.self_attn.q_proj.weight": "model-00018-of-00037.safetensors",
402
+ "model.layers.38.self_attn.v_proj.bias": "model-00018-of-00037.safetensors",
403
+ "model.layers.38.self_attn.v_proj.weight": "model-00018-of-00037.safetensors",
404
+ "model.layers.39.input_layernorm.weight": "model-00018-of-00037.safetensors",
405
+ "model.layers.39.mlp.down_proj.weight": "model-00019-of-00037.safetensors",
406
+ "model.layers.39.mlp.gate_proj.weight": "model-00019-of-00037.safetensors",
407
+ "model.layers.39.mlp.up_proj.weight": "model-00019-of-00037.safetensors",
408
+ "model.layers.39.post_attention_layernorm.weight": "model-00019-of-00037.safetensors",
409
+ "model.layers.39.self_attn.k_proj.bias": "model-00019-of-00037.safetensors",
410
+ "model.layers.39.self_attn.k_proj.weight": "model-00019-of-00037.safetensors",
411
+ "model.layers.39.self_attn.o_proj.weight": "model-00019-of-00037.safetensors",
412
+ "model.layers.39.self_attn.q_proj.bias": "model-00019-of-00037.safetensors",
413
+ "model.layers.39.self_attn.q_proj.weight": "model-00019-of-00037.safetensors",
414
+ "model.layers.39.self_attn.v_proj.bias": "model-00019-of-00037.safetensors",
415
+ "model.layers.39.self_attn.v_proj.weight": "model-00019-of-00037.safetensors",
416
+ "model.layers.4.input_layernorm.weight": "model-00003-of-00037.safetensors",
417
+ "model.layers.4.mlp.down_proj.weight": "model-00003-of-00037.safetensors",
418
+ "model.layers.4.mlp.gate_proj.weight": "model-00003-of-00037.safetensors",
419
+ "model.layers.4.mlp.up_proj.weight": "model-00003-of-00037.safetensors",
420
+ "model.layers.4.post_attention_layernorm.weight": "model-00003-of-00037.safetensors",
421
+ "model.layers.4.self_attn.k_proj.bias": "model-00003-of-00037.safetensors",
422
+ "model.layers.4.self_attn.k_proj.weight": "model-00003-of-00037.safetensors",
423
+ "model.layers.4.self_attn.o_proj.weight": "model-00003-of-00037.safetensors",
424
+ "model.layers.4.self_attn.q_proj.bias": "model-00003-of-00037.safetensors",
425
+ "model.layers.4.self_attn.q_proj.weight": "model-00003-of-00037.safetensors",
426
+ "model.layers.4.self_attn.v_proj.bias": "model-00003-of-00037.safetensors",
427
+ "model.layers.4.self_attn.v_proj.weight": "model-00003-of-00037.safetensors",
428
+ "model.layers.40.input_layernorm.weight": "model-00019-of-00037.safetensors",
429
+ "model.layers.40.mlp.down_proj.weight": "model-00019-of-00037.safetensors",
430
+ "model.layers.40.mlp.gate_proj.weight": "model-00019-of-00037.safetensors",
431
+ "model.layers.40.mlp.up_proj.weight": "model-00019-of-00037.safetensors",
432
+ "model.layers.40.post_attention_layernorm.weight": "model-00019-of-00037.safetensors",
433
+ "model.layers.40.self_attn.k_proj.bias": "model-00019-of-00037.safetensors",
434
+ "model.layers.40.self_attn.k_proj.weight": "model-00019-of-00037.safetensors",
435
+ "model.layers.40.self_attn.o_proj.weight": "model-00019-of-00037.safetensors",
436
+ "model.layers.40.self_attn.q_proj.bias": "model-00019-of-00037.safetensors",
437
+ "model.layers.40.self_attn.q_proj.weight": "model-00019-of-00037.safetensors",
438
+ "model.layers.40.self_attn.v_proj.bias": "model-00019-of-00037.safetensors",
439
+ "model.layers.40.self_attn.v_proj.weight": "model-00019-of-00037.safetensors",
440
+ "model.layers.41.input_layernorm.weight": "model-00019-of-00037.safetensors",
441
+ "model.layers.41.mlp.down_proj.weight": "model-00020-of-00037.safetensors",
442
+ "model.layers.41.mlp.gate_proj.weight": "model-00020-of-00037.safetensors",
443
+ "model.layers.41.mlp.up_proj.weight": "model-00020-of-00037.safetensors",
444
+ "model.layers.41.post_attention_layernorm.weight": "model-00019-of-00037.safetensors",
445
+ "model.layers.41.self_attn.k_proj.bias": "model-00019-of-00037.safetensors",
446
+ "model.layers.41.self_attn.k_proj.weight": "model-00019-of-00037.safetensors",
447
+ "model.layers.41.self_attn.o_proj.weight": "model-00019-of-00037.safetensors",
448
+ "model.layers.41.self_attn.q_proj.bias": "model-00019-of-00037.safetensors",
449
+ "model.layers.41.self_attn.q_proj.weight": "model-00019-of-00037.safetensors",
450
+ "model.layers.41.self_attn.v_proj.bias": "model-00019-of-00037.safetensors",
451
+ "model.layers.41.self_attn.v_proj.weight": "model-00019-of-00037.safetensors",
452
+ "model.layers.42.input_layernorm.weight": "model-00020-of-00037.safetensors",
453
+ "model.layers.42.mlp.down_proj.weight": "model-00020-of-00037.safetensors",
454
+ "model.layers.42.mlp.gate_proj.weight": "model-00020-of-00037.safetensors",
455
+ "model.layers.42.mlp.up_proj.weight": "model-00020-of-00037.safetensors",
456
+ "model.layers.42.post_attention_layernorm.weight": "model-00020-of-00037.safetensors",
457
+ "model.layers.42.self_attn.k_proj.bias": "model-00020-of-00037.safetensors",
458
+ "model.layers.42.self_attn.k_proj.weight": "model-00020-of-00037.safetensors",
459
+ "model.layers.42.self_attn.o_proj.weight": "model-00020-of-00037.safetensors",
460
+ "model.layers.42.self_attn.q_proj.bias": "model-00020-of-00037.safetensors",
461
+ "model.layers.42.self_attn.q_proj.weight": "model-00020-of-00037.safetensors",
462
+ "model.layers.42.self_attn.v_proj.bias": "model-00020-of-00037.safetensors",
463
+ "model.layers.42.self_attn.v_proj.weight": "model-00020-of-00037.safetensors",
464
+ "model.layers.43.input_layernorm.weight": "model-00020-of-00037.safetensors",
465
+ "model.layers.43.mlp.down_proj.weight": "model-00021-of-00037.safetensors",
466
+ "model.layers.43.mlp.gate_proj.weight": "model-00021-of-00037.safetensors",
467
+ "model.layers.43.mlp.up_proj.weight": "model-00020-of-00037.safetensors",
468
+ "model.layers.43.post_attention_layernorm.weight": "model-00020-of-00037.safetensors",
469
+ "model.layers.43.self_attn.k_proj.bias": "model-00020-of-00037.safetensors",
470
+ "model.layers.43.self_attn.k_proj.weight": "model-00020-of-00037.safetensors",
471
+ "model.layers.43.self_attn.o_proj.weight": "model-00020-of-00037.safetensors",
472
+ "model.layers.43.self_attn.q_proj.bias": "model-00020-of-00037.safetensors",
473
+ "model.layers.43.self_attn.q_proj.weight": "model-00020-of-00037.safetensors",
474
+ "model.layers.43.self_attn.v_proj.bias": "model-00020-of-00037.safetensors",
475
+ "model.layers.43.self_attn.v_proj.weight": "model-00020-of-00037.safetensors",
476
+ "model.layers.44.input_layernorm.weight": "model-00021-of-00037.safetensors",
477
+ "model.layers.44.mlp.down_proj.weight": "model-00021-of-00037.safetensors",
478
+ "model.layers.44.mlp.gate_proj.weight": "model-00021-of-00037.safetensors",
479
+ "model.layers.44.mlp.up_proj.weight": "model-00021-of-00037.safetensors",
480
+ "model.layers.44.post_attention_layernorm.weight": "model-00021-of-00037.safetensors",
481
+ "model.layers.44.self_attn.k_proj.bias": "model-00021-of-00037.safetensors",
482
+ "model.layers.44.self_attn.k_proj.weight": "model-00021-of-00037.safetensors",
483
+ "model.layers.44.self_attn.o_proj.weight": "model-00021-of-00037.safetensors",
484
+ "model.layers.44.self_attn.q_proj.bias": "model-00021-of-00037.safetensors",
485
+ "model.layers.44.self_attn.q_proj.weight": "model-00021-of-00037.safetensors",
486
+ "model.layers.44.self_attn.v_proj.bias": "model-00021-of-00037.safetensors",
487
+ "model.layers.44.self_attn.v_proj.weight": "model-00021-of-00037.safetensors",
488
+ "model.layers.45.input_layernorm.weight": "model-00021-of-00037.safetensors",
489
+ "model.layers.45.mlp.down_proj.weight": "model-00022-of-00037.safetensors",
490
+ "model.layers.45.mlp.gate_proj.weight": "model-00021-of-00037.safetensors",
491
+ "model.layers.45.mlp.up_proj.weight": "model-00021-of-00037.safetensors",
492
+ "model.layers.45.post_attention_layernorm.weight": "model-00021-of-00037.safetensors",
493
+ "model.layers.45.self_attn.k_proj.bias": "model-00021-of-00037.safetensors",
494
+ "model.layers.45.self_attn.k_proj.weight": "model-00021-of-00037.safetensors",
495
+ "model.layers.45.self_attn.o_proj.weight": "model-00021-of-00037.safetensors",
496
+ "model.layers.45.self_attn.q_proj.bias": "model-00021-of-00037.safetensors",
497
+ "model.layers.45.self_attn.q_proj.weight": "model-00021-of-00037.safetensors",
498
+ "model.layers.45.self_attn.v_proj.bias": "model-00021-of-00037.safetensors",
499
+ "model.layers.45.self_attn.v_proj.weight": "model-00021-of-00037.safetensors",
500
+ "model.layers.46.input_layernorm.weight": "model-00022-of-00037.safetensors",
501
+ "model.layers.46.mlp.down_proj.weight": "model-00022-of-00037.safetensors",
502
+ "model.layers.46.mlp.gate_proj.weight": "model-00022-of-00037.safetensors",
503
+ "model.layers.46.mlp.up_proj.weight": "model-00022-of-00037.safetensors",
504
+ "model.layers.46.post_attention_layernorm.weight": "model-00022-of-00037.safetensors",
505
+ "model.layers.46.self_attn.k_proj.bias": "model-00022-of-00037.safetensors",
506
+ "model.layers.46.self_attn.k_proj.weight": "model-00022-of-00037.safetensors",
507
+ "model.layers.46.self_attn.o_proj.weight": "model-00022-of-00037.safetensors",
508
+ "model.layers.46.self_attn.q_proj.bias": "model-00022-of-00037.safetensors",
509
+ "model.layers.46.self_attn.q_proj.weight": "model-00022-of-00037.safetensors",
510
+ "model.layers.46.self_attn.v_proj.bias": "model-00022-of-00037.safetensors",
511
+ "model.layers.46.self_attn.v_proj.weight": "model-00022-of-00037.safetensors",
512
+ "model.layers.47.input_layernorm.weight": "model-00022-of-00037.safetensors",
513
+ "model.layers.47.mlp.down_proj.weight": "model-00022-of-00037.safetensors",
514
+ "model.layers.47.mlp.gate_proj.weight": "model-00022-of-00037.safetensors",
515
+ "model.layers.47.mlp.up_proj.weight": "model-00022-of-00037.safetensors",
516
+ "model.layers.47.post_attention_layernorm.weight": "model-00022-of-00037.safetensors",
517
+ "model.layers.47.self_attn.k_proj.bias": "model-00022-of-00037.safetensors",
518
+ "model.layers.47.self_attn.k_proj.weight": "model-00022-of-00037.safetensors",
519
+ "model.layers.47.self_attn.o_proj.weight": "model-00022-of-00037.safetensors",
520
+ "model.layers.47.self_attn.q_proj.bias": "model-00022-of-00037.safetensors",
521
+ "model.layers.47.self_attn.q_proj.weight": "model-00022-of-00037.safetensors",
522
+ "model.layers.47.self_attn.v_proj.bias": "model-00022-of-00037.safetensors",
523
+ "model.layers.47.self_attn.v_proj.weight": "model-00022-of-00037.safetensors",
524
+ "model.layers.48.input_layernorm.weight": "model-00022-of-00037.safetensors",
525
+ "model.layers.48.mlp.down_proj.weight": "model-00023-of-00037.safetensors",
526
+ "model.layers.48.mlp.gate_proj.weight": "model-00023-of-00037.safetensors",
527
+ "model.layers.48.mlp.up_proj.weight": "model-00023-of-00037.safetensors",
528
+ "model.layers.48.post_attention_layernorm.weight": "model-00023-of-00037.safetensors",
529
+ "model.layers.48.self_attn.k_proj.bias": "model-00023-of-00037.safetensors",
530
+ "model.layers.48.self_attn.k_proj.weight": "model-00023-of-00037.safetensors",
531
+ "model.layers.48.self_attn.o_proj.weight": "model-00023-of-00037.safetensors",
532
+ "model.layers.48.self_attn.q_proj.bias": "model-00023-of-00037.safetensors",
533
+ "model.layers.48.self_attn.q_proj.weight": "model-00023-of-00037.safetensors",
534
+ "model.layers.48.self_attn.v_proj.bias": "model-00023-of-00037.safetensors",
535
+ "model.layers.48.self_attn.v_proj.weight": "model-00023-of-00037.safetensors",
536
+ "model.layers.49.input_layernorm.weight": "model-00023-of-00037.safetensors",
537
+ "model.layers.49.mlp.down_proj.weight": "model-00023-of-00037.safetensors",
538
+ "model.layers.49.mlp.gate_proj.weight": "model-00023-of-00037.safetensors",
539
+ "model.layers.49.mlp.up_proj.weight": "model-00023-of-00037.safetensors",
540
+ "model.layers.49.post_attention_layernorm.weight": "model-00023-of-00037.safetensors",
541
+ "model.layers.49.self_attn.k_proj.bias": "model-00023-of-00037.safetensors",
542
+ "model.layers.49.self_attn.k_proj.weight": "model-00023-of-00037.safetensors",
543
+ "model.layers.49.self_attn.o_proj.weight": "model-00023-of-00037.safetensors",
544
+ "model.layers.49.self_attn.q_proj.bias": "model-00023-of-00037.safetensors",
545
+ "model.layers.49.self_attn.q_proj.weight": "model-00023-of-00037.safetensors",
546
+ "model.layers.49.self_attn.v_proj.bias": "model-00023-of-00037.safetensors",
547
+ "model.layers.49.self_attn.v_proj.weight": "model-00023-of-00037.safetensors",
548
+ "model.layers.5.input_layernorm.weight": "model-00003-of-00037.safetensors",
549
+ "model.layers.5.mlp.down_proj.weight": "model-00004-of-00037.safetensors",
550
+ "model.layers.5.mlp.gate_proj.weight": "model-00004-of-00037.safetensors",
551
+ "model.layers.5.mlp.up_proj.weight": "model-00004-of-00037.safetensors",
552
+ "model.layers.5.post_attention_layernorm.weight": "model-00003-of-00037.safetensors",
553
+ "model.layers.5.self_attn.k_proj.bias": "model-00003-of-00037.safetensors",
554
+ "model.layers.5.self_attn.k_proj.weight": "model-00003-of-00037.safetensors",
555
+ "model.layers.5.self_attn.o_proj.weight": "model-00003-of-00037.safetensors",
556
+ "model.layers.5.self_attn.q_proj.bias": "model-00003-of-00037.safetensors",
557
+ "model.layers.5.self_attn.q_proj.weight": "model-00003-of-00037.safetensors",
558
+ "model.layers.5.self_attn.v_proj.bias": "model-00003-of-00037.safetensors",
559
+ "model.layers.5.self_attn.v_proj.weight": "model-00003-of-00037.safetensors",
560
+ "model.layers.50.input_layernorm.weight": "model-00023-of-00037.safetensors",
561
+ "model.layers.50.mlp.down_proj.weight": "model-00024-of-00037.safetensors",
562
+ "model.layers.50.mlp.gate_proj.weight": "model-00024-of-00037.safetensors",
563
+ "model.layers.50.mlp.up_proj.weight": "model-00024-of-00037.safetensors",
564
+ "model.layers.50.post_attention_layernorm.weight": "model-00023-of-00037.safetensors",
565
+ "model.layers.50.self_attn.k_proj.bias": "model-00023-of-00037.safetensors",
566
+ "model.layers.50.self_attn.k_proj.weight": "model-00023-of-00037.safetensors",
567
+ "model.layers.50.self_attn.o_proj.weight": "model-00023-of-00037.safetensors",
568
+ "model.layers.50.self_attn.q_proj.bias": "model-00023-of-00037.safetensors",
569
+ "model.layers.50.self_attn.q_proj.weight": "model-00023-of-00037.safetensors",
570
+ "model.layers.50.self_attn.v_proj.bias": "model-00023-of-00037.safetensors",
571
+ "model.layers.50.self_attn.v_proj.weight": "model-00023-of-00037.safetensors",
572
+ "model.layers.51.input_layernorm.weight": "model-00024-of-00037.safetensors",
573
+ "model.layers.51.mlp.down_proj.weight": "model-00024-of-00037.safetensors",
574
+ "model.layers.51.mlp.gate_proj.weight": "model-00024-of-00037.safetensors",
575
+ "model.layers.51.mlp.up_proj.weight": "model-00024-of-00037.safetensors",
576
+ "model.layers.51.post_attention_layernorm.weight": "model-00024-of-00037.safetensors",
577
+ "model.layers.51.self_attn.k_proj.bias": "model-00024-of-00037.safetensors",
578
+ "model.layers.51.self_attn.k_proj.weight": "model-00024-of-00037.safetensors",
579
+ "model.layers.51.self_attn.o_proj.weight": "model-00024-of-00037.safetensors",
580
+ "model.layers.51.self_attn.q_proj.bias": "model-00024-of-00037.safetensors",
581
+ "model.layers.51.self_attn.q_proj.weight": "model-00024-of-00037.safetensors",
582
+ "model.layers.51.self_attn.v_proj.bias": "model-00024-of-00037.safetensors",
583
+ "model.layers.51.self_attn.v_proj.weight": "model-00024-of-00037.safetensors",
584
+ "model.layers.52.input_layernorm.weight": "model-00024-of-00037.safetensors",
585
+ "model.layers.52.mlp.down_proj.weight": "model-00025-of-00037.safetensors",
586
+ "model.layers.52.mlp.gate_proj.weight": "model-00025-of-00037.safetensors",
587
+ "model.layers.52.mlp.up_proj.weight": "model-00024-of-00037.safetensors",
588
+ "model.layers.52.post_attention_layernorm.weight": "model-00024-of-00037.safetensors",
589
+ "model.layers.52.self_attn.k_proj.bias": "model-00024-of-00037.safetensors",
590
+ "model.layers.52.self_attn.k_proj.weight": "model-00024-of-00037.safetensors",
591
+ "model.layers.52.self_attn.o_proj.weight": "model-00024-of-00037.safetensors",
592
+ "model.layers.52.self_attn.q_proj.bias": "model-00024-of-00037.safetensors",
593
+ "model.layers.52.self_attn.q_proj.weight": "model-00024-of-00037.safetensors",
594
+ "model.layers.52.self_attn.v_proj.bias": "model-00024-of-00037.safetensors",
595
+ "model.layers.52.self_attn.v_proj.weight": "model-00024-of-00037.safetensors",
596
+ "model.layers.53.input_layernorm.weight": "model-00025-of-00037.safetensors",
597
+ "model.layers.53.mlp.down_proj.weight": "model-00025-of-00037.safetensors",
598
+ "model.layers.53.mlp.gate_proj.weight": "model-00025-of-00037.safetensors",
599
+ "model.layers.53.mlp.up_proj.weight": "model-00025-of-00037.safetensors",
600
+ "model.layers.53.post_attention_layernorm.weight": "model-00025-of-00037.safetensors",
601
+ "model.layers.53.self_attn.k_proj.bias": "model-00025-of-00037.safetensors",
602
+ "model.layers.53.self_attn.k_proj.weight": "model-00025-of-00037.safetensors",
603
+ "model.layers.53.self_attn.o_proj.weight": "model-00025-of-00037.safetensors",
604
+ "model.layers.53.self_attn.q_proj.bias": "model-00025-of-00037.safetensors",
605
+ "model.layers.53.self_attn.q_proj.weight": "model-00025-of-00037.safetensors",
606
+ "model.layers.53.self_attn.v_proj.bias": "model-00025-of-00037.safetensors",
607
+ "model.layers.53.self_attn.v_proj.weight": "model-00025-of-00037.safetensors",
608
+ "model.layers.54.input_layernorm.weight": "model-00025-of-00037.safetensors",
609
+ "model.layers.54.mlp.down_proj.weight": "model-00026-of-00037.safetensors",
610
+ "model.layers.54.mlp.gate_proj.weight": "model-00025-of-00037.safetensors",
611
+ "model.layers.54.mlp.up_proj.weight": "model-00025-of-00037.safetensors",
612
+ "model.layers.54.post_attention_layernorm.weight": "model-00025-of-00037.safetensors",
613
+ "model.layers.54.self_attn.k_proj.bias": "model-00025-of-00037.safetensors",
614
+ "model.layers.54.self_attn.k_proj.weight": "model-00025-of-00037.safetensors",
615
+ "model.layers.54.self_attn.o_proj.weight": "model-00025-of-00037.safetensors",
616
+ "model.layers.54.self_attn.q_proj.bias": "model-00025-of-00037.safetensors",
617
+ "model.layers.54.self_attn.q_proj.weight": "model-00025-of-00037.safetensors",
618
+ "model.layers.54.self_attn.v_proj.bias": "model-00025-of-00037.safetensors",
619
+ "model.layers.54.self_attn.v_proj.weight": "model-00025-of-00037.safetensors",
620
+ "model.layers.55.input_layernorm.weight": "model-00026-of-00037.safetensors",
621
+ "model.layers.55.mlp.down_proj.weight": "model-00026-of-00037.safetensors",
622
+ "model.layers.55.mlp.gate_proj.weight": "model-00026-of-00037.safetensors",
623
+ "model.layers.55.mlp.up_proj.weight": "model-00026-of-00037.safetensors",
624
+ "model.layers.55.post_attention_layernorm.weight": "model-00026-of-00037.safetensors",
625
+ "model.layers.55.self_attn.k_proj.bias": "model-00026-of-00037.safetensors",
626
+ "model.layers.55.self_attn.k_proj.weight": "model-00026-of-00037.safetensors",
627
+ "model.layers.55.self_attn.o_proj.weight": "model-00026-of-00037.safetensors",
628
+ "model.layers.55.self_attn.q_proj.bias": "model-00026-of-00037.safetensors",
629
+ "model.layers.55.self_attn.q_proj.weight": "model-00026-of-00037.safetensors",
630
+ "model.layers.55.self_attn.v_proj.bias": "model-00026-of-00037.safetensors",
631
+ "model.layers.55.self_attn.v_proj.weight": "model-00026-of-00037.safetensors",
632
+ "model.layers.56.input_layernorm.weight": "model-00026-of-00037.safetensors",
633
+ "model.layers.56.mlp.down_proj.weight": "model-00026-of-00037.safetensors",
634
+ "model.layers.56.mlp.gate_proj.weight": "model-00026-of-00037.safetensors",
635
+ "model.layers.56.mlp.up_proj.weight": "model-00026-of-00037.safetensors",
636
+ "model.layers.56.post_attention_layernorm.weight": "model-00026-of-00037.safetensors",
637
+ "model.layers.56.self_attn.k_proj.bias": "model-00026-of-00037.safetensors",
638
+ "model.layers.56.self_attn.k_proj.weight": "model-00026-of-00037.safetensors",
639
+ "model.layers.56.self_attn.o_proj.weight": "model-00026-of-00037.safetensors",
640
+ "model.layers.56.self_attn.q_proj.bias": "model-00026-of-00037.safetensors",
641
+ "model.layers.56.self_attn.q_proj.weight": "model-00026-of-00037.safetensors",
642
+ "model.layers.56.self_attn.v_proj.bias": "model-00026-of-00037.safetensors",
643
+ "model.layers.56.self_attn.v_proj.weight": "model-00026-of-00037.safetensors",
644
+ "model.layers.57.input_layernorm.weight": "model-00026-of-00037.safetensors",
645
+ "model.layers.57.mlp.down_proj.weight": "model-00027-of-00037.safetensors",
646
+ "model.layers.57.mlp.gate_proj.weight": "model-00027-of-00037.safetensors",
647
+ "model.layers.57.mlp.up_proj.weight": "model-00027-of-00037.safetensors",
648
+ "model.layers.57.post_attention_layernorm.weight": "model-00027-of-00037.safetensors",
649
+ "model.layers.57.self_attn.k_proj.bias": "model-00027-of-00037.safetensors",
650
+ "model.layers.57.self_attn.k_proj.weight": "model-00027-of-00037.safetensors",
651
+ "model.layers.57.self_attn.o_proj.weight": "model-00027-of-00037.safetensors",
652
+ "model.layers.57.self_attn.q_proj.bias": "model-00027-of-00037.safetensors",
653
+ "model.layers.57.self_attn.q_proj.weight": "model-00027-of-00037.safetensors",
654
+ "model.layers.57.self_attn.v_proj.bias": "model-00027-of-00037.safetensors",
655
+ "model.layers.57.self_attn.v_proj.weight": "model-00027-of-00037.safetensors",
656
+ "model.layers.58.input_layernorm.weight": "model-00027-of-00037.safetensors",
657
+ "model.layers.58.mlp.down_proj.weight": "model-00027-of-00037.safetensors",
658
+ "model.layers.58.mlp.gate_proj.weight": "model-00027-of-00037.safetensors",
659
+ "model.layers.58.mlp.up_proj.weight": "model-00027-of-00037.safetensors",
660
+ "model.layers.58.post_attention_layernorm.weight": "model-00027-of-00037.safetensors",
661
+ "model.layers.58.self_attn.k_proj.bias": "model-00027-of-00037.safetensors",
662
+ "model.layers.58.self_attn.k_proj.weight": "model-00027-of-00037.safetensors",
663
+ "model.layers.58.self_attn.o_proj.weight": "model-00027-of-00037.safetensors",
664
+ "model.layers.58.self_attn.q_proj.bias": "model-00027-of-00037.safetensors",
665
+ "model.layers.58.self_attn.q_proj.weight": "model-00027-of-00037.safetensors",
666
+ "model.layers.58.self_attn.v_proj.bias": "model-00027-of-00037.safetensors",
667
+ "model.layers.58.self_attn.v_proj.weight": "model-00027-of-00037.safetensors",
668
+ "model.layers.59.input_layernorm.weight": "model-00027-of-00037.safetensors",
669
+ "model.layers.59.mlp.down_proj.weight": "model-00028-of-00037.safetensors",
670
+ "model.layers.59.mlp.gate_proj.weight": "model-00028-of-00037.safetensors",
671
+ "model.layers.59.mlp.up_proj.weight": "model-00028-of-00037.safetensors",
672
+ "model.layers.59.post_attention_layernorm.weight": "model-00027-of-00037.safetensors",
673
+ "model.layers.59.self_attn.k_proj.bias": "model-00027-of-00037.safetensors",
674
+ "model.layers.59.self_attn.k_proj.weight": "model-00027-of-00037.safetensors",
675
+ "model.layers.59.self_attn.o_proj.weight": "model-00027-of-00037.safetensors",
676
+ "model.layers.59.self_attn.q_proj.bias": "model-00027-of-00037.safetensors",
677
+ "model.layers.59.self_attn.q_proj.weight": "model-00027-of-00037.safetensors",
678
+ "model.layers.59.self_attn.v_proj.bias": "model-00027-of-00037.safetensors",
679
+ "model.layers.59.self_attn.v_proj.weight": "model-00027-of-00037.safetensors",
680
+ "model.layers.6.input_layernorm.weight": "model-00004-of-00037.safetensors",
681
+ "model.layers.6.mlp.down_proj.weight": "model-00004-of-00037.safetensors",
682
+ "model.layers.6.mlp.gate_proj.weight": "model-00004-of-00037.safetensors",
683
+ "model.layers.6.mlp.up_proj.weight": "model-00004-of-00037.safetensors",
684
+ "model.layers.6.post_attention_layernorm.weight": "model-00004-of-00037.safetensors",
685
+ "model.layers.6.self_attn.k_proj.bias": "model-00004-of-00037.safetensors",
686
+ "model.layers.6.self_attn.k_proj.weight": "model-00004-of-00037.safetensors",
687
+ "model.layers.6.self_attn.o_proj.weight": "model-00004-of-00037.safetensors",
688
+ "model.layers.6.self_attn.q_proj.bias": "model-00004-of-00037.safetensors",
689
+ "model.layers.6.self_attn.q_proj.weight": "model-00004-of-00037.safetensors",
690
+ "model.layers.6.self_attn.v_proj.bias": "model-00004-of-00037.safetensors",
691
+ "model.layers.6.self_attn.v_proj.weight": "model-00004-of-00037.safetensors",
692
+ "model.layers.60.input_layernorm.weight": "model-00028-of-00037.safetensors",
693
+ "model.layers.60.mlp.down_proj.weight": "model-00028-of-00037.safetensors",
694
+ "model.layers.60.mlp.gate_proj.weight": "model-00028-of-00037.safetensors",
695
+ "model.layers.60.mlp.up_proj.weight": "model-00028-of-00037.safetensors",
696
+ "model.layers.60.post_attention_layernorm.weight": "model-00028-of-00037.safetensors",
697
+ "model.layers.60.self_attn.k_proj.bias": "model-00028-of-00037.safetensors",
698
+ "model.layers.60.self_attn.k_proj.weight": "model-00028-of-00037.safetensors",
699
+ "model.layers.60.self_attn.o_proj.weight": "model-00028-of-00037.safetensors",
700
+ "model.layers.60.self_attn.q_proj.bias": "model-00028-of-00037.safetensors",
701
+ "model.layers.60.self_attn.q_proj.weight": "model-00028-of-00037.safetensors",
702
+ "model.layers.60.self_attn.v_proj.bias": "model-00028-of-00037.safetensors",
703
+ "model.layers.60.self_attn.v_proj.weight": "model-00028-of-00037.safetensors",
704
+ "model.layers.61.input_layernorm.weight": "model-00028-of-00037.safetensors",
705
+ "model.layers.61.mlp.down_proj.weight": "model-00029-of-00037.safetensors",
706
+ "model.layers.61.mlp.gate_proj.weight": "model-00029-of-00037.safetensors",
707
+ "model.layers.61.mlp.up_proj.weight": "model-00028-of-00037.safetensors",
708
+ "model.layers.61.post_attention_layernorm.weight": "model-00028-of-00037.safetensors",
709
+ "model.layers.61.self_attn.k_proj.bias": "model-00028-of-00037.safetensors",
710
+ "model.layers.61.self_attn.k_proj.weight": "model-00028-of-00037.safetensors",
711
+ "model.layers.61.self_attn.o_proj.weight": "model-00028-of-00037.safetensors",
712
+ "model.layers.61.self_attn.q_proj.bias": "model-00028-of-00037.safetensors",
713
+ "model.layers.61.self_attn.q_proj.weight": "model-00028-of-00037.safetensors",
714
+ "model.layers.61.self_attn.v_proj.bias": "model-00028-of-00037.safetensors",
715
+ "model.layers.61.self_attn.v_proj.weight": "model-00028-of-00037.safetensors",
716
+ "model.layers.62.input_layernorm.weight": "model-00029-of-00037.safetensors",
717
+ "model.layers.62.mlp.down_proj.weight": "model-00029-of-00037.safetensors",
718
+ "model.layers.62.mlp.gate_proj.weight": "model-00029-of-00037.safetensors",
719
+ "model.layers.62.mlp.up_proj.weight": "model-00029-of-00037.safetensors",
720
+ "model.layers.62.post_attention_layernorm.weight": "model-00029-of-00037.safetensors",
721
+ "model.layers.62.self_attn.k_proj.bias": "model-00029-of-00037.safetensors",
722
+ "model.layers.62.self_attn.k_proj.weight": "model-00029-of-00037.safetensors",
723
+ "model.layers.62.self_attn.o_proj.weight": "model-00029-of-00037.safetensors",
724
+ "model.layers.62.self_attn.q_proj.bias": "model-00029-of-00037.safetensors",
725
+ "model.layers.62.self_attn.q_proj.weight": "model-00029-of-00037.safetensors",
726
+ "model.layers.62.self_attn.v_proj.bias": "model-00029-of-00037.safetensors",
727
+ "model.layers.62.self_attn.v_proj.weight": "model-00029-of-00037.safetensors",
728
+ "model.layers.63.input_layernorm.weight": "model-00029-of-00037.safetensors",
729
+ "model.layers.63.mlp.down_proj.weight": "model-00030-of-00037.safetensors",
730
+ "model.layers.63.mlp.gate_proj.weight": "model-00029-of-00037.safetensors",
731
+ "model.layers.63.mlp.up_proj.weight": "model-00029-of-00037.safetensors",
732
+ "model.layers.63.post_attention_layernorm.weight": "model-00029-of-00037.safetensors",
733
+ "model.layers.63.self_attn.k_proj.bias": "model-00029-of-00037.safetensors",
734
+ "model.layers.63.self_attn.k_proj.weight": "model-00029-of-00037.safetensors",
735
+ "model.layers.63.self_attn.o_proj.weight": "model-00029-of-00037.safetensors",
736
+ "model.layers.63.self_attn.q_proj.bias": "model-00029-of-00037.safetensors",
737
+ "model.layers.63.self_attn.q_proj.weight": "model-00029-of-00037.safetensors",
738
+ "model.layers.63.self_attn.v_proj.bias": "model-00029-of-00037.safetensors",
739
+ "model.layers.63.self_attn.v_proj.weight": "model-00029-of-00037.safetensors",
740
+ "model.layers.64.input_layernorm.weight": "model-00030-of-00037.safetensors",
741
+ "model.layers.64.mlp.down_proj.weight": "model-00030-of-00037.safetensors",
742
+ "model.layers.64.mlp.gate_proj.weight": "model-00030-of-00037.safetensors",
743
+ "model.layers.64.mlp.up_proj.weight": "model-00030-of-00037.safetensors",
744
+ "model.layers.64.post_attention_layernorm.weight": "model-00030-of-00037.safetensors",
745
+ "model.layers.64.self_attn.k_proj.bias": "model-00030-of-00037.safetensors",
746
+ "model.layers.64.self_attn.k_proj.weight": "model-00030-of-00037.safetensors",
747
+ "model.layers.64.self_attn.o_proj.weight": "model-00030-of-00037.safetensors",
748
+ "model.layers.64.self_attn.q_proj.bias": "model-00030-of-00037.safetensors",
749
+ "model.layers.64.self_attn.q_proj.weight": "model-00030-of-00037.safetensors",
750
+ "model.layers.64.self_attn.v_proj.bias": "model-00030-of-00037.safetensors",
751
+ "model.layers.64.self_attn.v_proj.weight": "model-00030-of-00037.safetensors",
752
+ "model.layers.65.input_layernorm.weight": "model-00030-of-00037.safetensors",
753
+ "model.layers.65.mlp.down_proj.weight": "model-00030-of-00037.safetensors",
754
+ "model.layers.65.mlp.gate_proj.weight": "model-00030-of-00037.safetensors",
755
+ "model.layers.65.mlp.up_proj.weight": "model-00030-of-00037.safetensors",
756
+ "model.layers.65.post_attention_layernorm.weight": "model-00030-of-00037.safetensors",
757
+ "model.layers.65.self_attn.k_proj.bias": "model-00030-of-00037.safetensors",
758
+ "model.layers.65.self_attn.k_proj.weight": "model-00030-of-00037.safetensors",
759
+ "model.layers.65.self_attn.o_proj.weight": "model-00030-of-00037.safetensors",
760
+ "model.layers.65.self_attn.q_proj.bias": "model-00030-of-00037.safetensors",
761
+ "model.layers.65.self_attn.q_proj.weight": "model-00030-of-00037.safetensors",
762
+ "model.layers.65.self_attn.v_proj.bias": "model-00030-of-00037.safetensors",
763
+ "model.layers.65.self_attn.v_proj.weight": "model-00030-of-00037.safetensors",
764
+ "model.layers.66.input_layernorm.weight": "model-00030-of-00037.safetensors",
765
+ "model.layers.66.mlp.down_proj.weight": "model-00031-of-00037.safetensors",
766
+ "model.layers.66.mlp.gate_proj.weight": "model-00031-of-00037.safetensors",
767
+ "model.layers.66.mlp.up_proj.weight": "model-00031-of-00037.safetensors",
768
+ "model.layers.66.post_attention_layernorm.weight": "model-00031-of-00037.safetensors",
769
+ "model.layers.66.self_attn.k_proj.bias": "model-00031-of-00037.safetensors",
770
+ "model.layers.66.self_attn.k_proj.weight": "model-00031-of-00037.safetensors",
771
+ "model.layers.66.self_attn.o_proj.weight": "model-00031-of-00037.safetensors",
772
+ "model.layers.66.self_attn.q_proj.bias": "model-00031-of-00037.safetensors",
773
+ "model.layers.66.self_attn.q_proj.weight": "model-00031-of-00037.safetensors",
774
+ "model.layers.66.self_attn.v_proj.bias": "model-00031-of-00037.safetensors",
775
+ "model.layers.66.self_attn.v_proj.weight": "model-00031-of-00037.safetensors",
776
+ "model.layers.67.input_layernorm.weight": "model-00031-of-00037.safetensors",
777
+ "model.layers.67.mlp.down_proj.weight": "model-00031-of-00037.safetensors",
778
+ "model.layers.67.mlp.gate_proj.weight": "model-00031-of-00037.safetensors",
779
+ "model.layers.67.mlp.up_proj.weight": "model-00031-of-00037.safetensors",
780
+ "model.layers.67.post_attention_layernorm.weight": "model-00031-of-00037.safetensors",
781
+ "model.layers.67.self_attn.k_proj.bias": "model-00031-of-00037.safetensors",
782
+ "model.layers.67.self_attn.k_proj.weight": "model-00031-of-00037.safetensors",
783
+ "model.layers.67.self_attn.o_proj.weight": "model-00031-of-00037.safetensors",
784
+ "model.layers.67.self_attn.q_proj.bias": "model-00031-of-00037.safetensors",
785
+ "model.layers.67.self_attn.q_proj.weight": "model-00031-of-00037.safetensors",
786
+ "model.layers.67.self_attn.v_proj.bias": "model-00031-of-00037.safetensors",
787
+ "model.layers.67.self_attn.v_proj.weight": "model-00031-of-00037.safetensors",
788
+ "model.layers.68.input_layernorm.weight": "model-00031-of-00037.safetensors",
789
+ "model.layers.68.mlp.down_proj.weight": "model-00032-of-00037.safetensors",
790
+ "model.layers.68.mlp.gate_proj.weight": "model-00032-of-00037.safetensors",
791
+ "model.layers.68.mlp.up_proj.weight": "model-00032-of-00037.safetensors",
792
+ "model.layers.68.post_attention_layernorm.weight": "model-00031-of-00037.safetensors",
793
+ "model.layers.68.self_attn.k_proj.bias": "model-00031-of-00037.safetensors",
794
+ "model.layers.68.self_attn.k_proj.weight": "model-00031-of-00037.safetensors",
795
+ "model.layers.68.self_attn.o_proj.weight": "model-00031-of-00037.safetensors",
796
+ "model.layers.68.self_attn.q_proj.bias": "model-00031-of-00037.safetensors",
797
+ "model.layers.68.self_attn.q_proj.weight": "model-00031-of-00037.safetensors",
798
+ "model.layers.68.self_attn.v_proj.bias": "model-00031-of-00037.safetensors",
799
+ "model.layers.68.self_attn.v_proj.weight": "model-00031-of-00037.safetensors",
800
+ "model.layers.69.input_layernorm.weight": "model-00032-of-00037.safetensors",
801
+ "model.layers.69.mlp.down_proj.weight": "model-00032-of-00037.safetensors",
802
+ "model.layers.69.mlp.gate_proj.weight": "model-00032-of-00037.safetensors",
803
+ "model.layers.69.mlp.up_proj.weight": "model-00032-of-00037.safetensors",
804
+ "model.layers.69.post_attention_layernorm.weight": "model-00032-of-00037.safetensors",
805
+ "model.layers.69.self_attn.k_proj.bias": "model-00032-of-00037.safetensors",
806
+ "model.layers.69.self_attn.k_proj.weight": "model-00032-of-00037.safetensors",
807
+ "model.layers.69.self_attn.o_proj.weight": "model-00032-of-00037.safetensors",
808
+ "model.layers.69.self_attn.q_proj.bias": "model-00032-of-00037.safetensors",
809
+ "model.layers.69.self_attn.q_proj.weight": "model-00032-of-00037.safetensors",
810
+ "model.layers.69.self_attn.v_proj.bias": "model-00032-of-00037.safetensors",
811
+ "model.layers.69.self_attn.v_proj.weight": "model-00032-of-00037.safetensors",
812
+ "model.layers.7.input_layernorm.weight": "model-00004-of-00037.safetensors",
813
+ "model.layers.7.mlp.down_proj.weight": "model-00005-of-00037.safetensors",
814
+ "model.layers.7.mlp.gate_proj.weight": "model-00005-of-00037.safetensors",
815
+ "model.layers.7.mlp.up_proj.weight": "model-00004-of-00037.safetensors",
816
+ "model.layers.7.post_attention_layernorm.weight": "model-00004-of-00037.safetensors",
817
+ "model.layers.7.self_attn.k_proj.bias": "model-00004-of-00037.safetensors",
818
+ "model.layers.7.self_attn.k_proj.weight": "model-00004-of-00037.safetensors",
819
+ "model.layers.7.self_attn.o_proj.weight": "model-00004-of-00037.safetensors",
820
+ "model.layers.7.self_attn.q_proj.bias": "model-00004-of-00037.safetensors",
821
+ "model.layers.7.self_attn.q_proj.weight": "model-00004-of-00037.safetensors",
822
+ "model.layers.7.self_attn.v_proj.bias": "model-00004-of-00037.safetensors",
823
+ "model.layers.7.self_attn.v_proj.weight": "model-00004-of-00037.safetensors",
824
+ "model.layers.70.input_layernorm.weight": "model-00032-of-00037.safetensors",
825
+ "model.layers.70.mlp.down_proj.weight": "model-00033-of-00037.safetensors",
826
+ "model.layers.70.mlp.gate_proj.weight": "model-00033-of-00037.safetensors",
827
+ "model.layers.70.mlp.up_proj.weight": "model-00032-of-00037.safetensors",
828
+ "model.layers.70.post_attention_layernorm.weight": "model-00032-of-00037.safetensors",
829
+ "model.layers.70.self_attn.k_proj.bias": "model-00032-of-00037.safetensors",
830
+ "model.layers.70.self_attn.k_proj.weight": "model-00032-of-00037.safetensors",
831
+ "model.layers.70.self_attn.o_proj.weight": "model-00032-of-00037.safetensors",
832
+ "model.layers.70.self_attn.q_proj.bias": "model-00032-of-00037.safetensors",
833
+ "model.layers.70.self_attn.q_proj.weight": "model-00032-of-00037.safetensors",
834
+ "model.layers.70.self_attn.v_proj.bias": "model-00032-of-00037.safetensors",
835
+ "model.layers.70.self_attn.v_proj.weight": "model-00032-of-00037.safetensors",
836
+ "model.layers.71.input_layernorm.weight": "model-00033-of-00037.safetensors",
837
+ "model.layers.71.mlp.down_proj.weight": "model-00033-of-00037.safetensors",
838
+ "model.layers.71.mlp.gate_proj.weight": "model-00033-of-00037.safetensors",
839
+ "model.layers.71.mlp.up_proj.weight": "model-00033-of-00037.safetensors",
840
+ "model.layers.71.post_attention_layernorm.weight": "model-00033-of-00037.safetensors",
841
+ "model.layers.71.self_attn.k_proj.bias": "model-00033-of-00037.safetensors",
842
+ "model.layers.71.self_attn.k_proj.weight": "model-00033-of-00037.safetensors",
843
+ "model.layers.71.self_attn.o_proj.weight": "model-00033-of-00037.safetensors",
844
+ "model.layers.71.self_attn.q_proj.bias": "model-00033-of-00037.safetensors",
845
+ "model.layers.71.self_attn.q_proj.weight": "model-00033-of-00037.safetensors",
846
+ "model.layers.71.self_attn.v_proj.bias": "model-00033-of-00037.safetensors",
847
+ "model.layers.71.self_attn.v_proj.weight": "model-00033-of-00037.safetensors",
848
+ "model.layers.72.input_layernorm.weight": "model-00033-of-00037.safetensors",
849
+ "model.layers.72.mlp.down_proj.weight": "model-00034-of-00037.safetensors",
850
+ "model.layers.72.mlp.gate_proj.weight": "model-00033-of-00037.safetensors",
851
+ "model.layers.72.mlp.up_proj.weight": "model-00033-of-00037.safetensors",
852
+ "model.layers.72.post_attention_layernorm.weight": "model-00033-of-00037.safetensors",
853
+ "model.layers.72.self_attn.k_proj.bias": "model-00033-of-00037.safetensors",
854
+ "model.layers.72.self_attn.k_proj.weight": "model-00033-of-00037.safetensors",
855
+ "model.layers.72.self_attn.o_proj.weight": "model-00033-of-00037.safetensors",
856
+ "model.layers.72.self_attn.q_proj.bias": "model-00033-of-00037.safetensors",
857
+ "model.layers.72.self_attn.q_proj.weight": "model-00033-of-00037.safetensors",
858
+ "model.layers.72.self_attn.v_proj.bias": "model-00033-of-00037.safetensors",
859
+ "model.layers.72.self_attn.v_proj.weight": "model-00033-of-00037.safetensors",
860
+ "model.layers.73.input_layernorm.weight": "model-00034-of-00037.safetensors",
861
+ "model.layers.73.mlp.down_proj.weight": "model-00034-of-00037.safetensors",
862
+ "model.layers.73.mlp.gate_proj.weight": "model-00034-of-00037.safetensors",
863
+ "model.layers.73.mlp.up_proj.weight": "model-00034-of-00037.safetensors",
864
+ "model.layers.73.post_attention_layernorm.weight": "model-00034-of-00037.safetensors",
865
+ "model.layers.73.self_attn.k_proj.bias": "model-00034-of-00037.safetensors",
866
+ "model.layers.73.self_attn.k_proj.weight": "model-00034-of-00037.safetensors",
867
+ "model.layers.73.self_attn.o_proj.weight": "model-00034-of-00037.safetensors",
868
+ "model.layers.73.self_attn.q_proj.bias": "model-00034-of-00037.safetensors",
869
+ "model.layers.73.self_attn.q_proj.weight": "model-00034-of-00037.safetensors",
870
+ "model.layers.73.self_attn.v_proj.bias": "model-00034-of-00037.safetensors",
871
+ "model.layers.73.self_attn.v_proj.weight": "model-00034-of-00037.safetensors",
872
+ "model.layers.74.input_layernorm.weight": "model-00034-of-00037.safetensors",
873
+ "model.layers.74.mlp.down_proj.weight": "model-00034-of-00037.safetensors",
874
+ "model.layers.74.mlp.gate_proj.weight": "model-00034-of-00037.safetensors",
875
+ "model.layers.74.mlp.up_proj.weight": "model-00034-of-00037.safetensors",
876
+ "model.layers.74.post_attention_layernorm.weight": "model-00034-of-00037.safetensors",
877
+ "model.layers.74.self_attn.k_proj.bias": "model-00034-of-00037.safetensors",
878
+ "model.layers.74.self_attn.k_proj.weight": "model-00034-of-00037.safetensors",
879
+ "model.layers.74.self_attn.o_proj.weight": "model-00034-of-00037.safetensors",
880
+ "model.layers.74.self_attn.q_proj.bias": "model-00034-of-00037.safetensors",
881
+ "model.layers.74.self_attn.q_proj.weight": "model-00034-of-00037.safetensors",
882
+ "model.layers.74.self_attn.v_proj.bias": "model-00034-of-00037.safetensors",
883
+ "model.layers.74.self_attn.v_proj.weight": "model-00034-of-00037.safetensors",
884
+ "model.layers.75.input_layernorm.weight": "model-00034-of-00037.safetensors",
885
+ "model.layers.75.mlp.down_proj.weight": "model-00035-of-00037.safetensors",
886
+ "model.layers.75.mlp.gate_proj.weight": "model-00035-of-00037.safetensors",
887
+ "model.layers.75.mlp.up_proj.weight": "model-00035-of-00037.safetensors",
888
+ "model.layers.75.post_attention_layernorm.weight": "model-00035-of-00037.safetensors",
889
+ "model.layers.75.self_attn.k_proj.bias": "model-00035-of-00037.safetensors",
890
+ "model.layers.75.self_attn.k_proj.weight": "model-00035-of-00037.safetensors",
891
+ "model.layers.75.self_attn.o_proj.weight": "model-00035-of-00037.safetensors",
892
+ "model.layers.75.self_attn.q_proj.bias": "model-00035-of-00037.safetensors",
893
+ "model.layers.75.self_attn.q_proj.weight": "model-00035-of-00037.safetensors",
894
+ "model.layers.75.self_attn.v_proj.bias": "model-00035-of-00037.safetensors",
895
+ "model.layers.75.self_attn.v_proj.weight": "model-00035-of-00037.safetensors",
896
+ "model.layers.76.input_layernorm.weight": "model-00035-of-00037.safetensors",
897
+ "model.layers.76.mlp.down_proj.weight": "model-00035-of-00037.safetensors",
898
+ "model.layers.76.mlp.gate_proj.weight": "model-00035-of-00037.safetensors",
899
+ "model.layers.76.mlp.up_proj.weight": "model-00035-of-00037.safetensors",
900
+ "model.layers.76.post_attention_layernorm.weight": "model-00035-of-00037.safetensors",
901
+ "model.layers.76.self_attn.k_proj.bias": "model-00035-of-00037.safetensors",
902
+ "model.layers.76.self_attn.k_proj.weight": "model-00035-of-00037.safetensors",
903
+ "model.layers.76.self_attn.o_proj.weight": "model-00035-of-00037.safetensors",
904
+ "model.layers.76.self_attn.q_proj.bias": "model-00035-of-00037.safetensors",
905
+ "model.layers.76.self_attn.q_proj.weight": "model-00035-of-00037.safetensors",
906
+ "model.layers.76.self_attn.v_proj.bias": "model-00035-of-00037.safetensors",
907
+ "model.layers.76.self_attn.v_proj.weight": "model-00035-of-00037.safetensors",
908
+ "model.layers.77.input_layernorm.weight": "model-00035-of-00037.safetensors",
909
+ "model.layers.77.mlp.down_proj.weight": "model-00036-of-00037.safetensors",
910
+ "model.layers.77.mlp.gate_proj.weight": "model-00036-of-00037.safetensors",
911
+ "model.layers.77.mlp.up_proj.weight": "model-00036-of-00037.safetensors",
912
+ "model.layers.77.post_attention_layernorm.weight": "model-00035-of-00037.safetensors",
913
+ "model.layers.77.self_attn.k_proj.bias": "model-00035-of-00037.safetensors",
914
+ "model.layers.77.self_attn.k_proj.weight": "model-00035-of-00037.safetensors",
915
+ "model.layers.77.self_attn.o_proj.weight": "model-00035-of-00037.safetensors",
916
+ "model.layers.77.self_attn.q_proj.bias": "model-00035-of-00037.safetensors",
917
+ "model.layers.77.self_attn.q_proj.weight": "model-00035-of-00037.safetensors",
918
+ "model.layers.77.self_attn.v_proj.bias": "model-00035-of-00037.safetensors",
919
+ "model.layers.77.self_attn.v_proj.weight": "model-00035-of-00037.safetensors",
920
+ "model.layers.78.input_layernorm.weight": "model-00036-of-00037.safetensors",
921
+ "model.layers.78.mlp.down_proj.weight": "model-00036-of-00037.safetensors",
922
+ "model.layers.78.mlp.gate_proj.weight": "model-00036-of-00037.safetensors",
923
+ "model.layers.78.mlp.up_proj.weight": "model-00036-of-00037.safetensors",
924
+ "model.layers.78.post_attention_layernorm.weight": "model-00036-of-00037.safetensors",
925
+ "model.layers.78.self_attn.k_proj.bias": "model-00036-of-00037.safetensors",
926
+ "model.layers.78.self_attn.k_proj.weight": "model-00036-of-00037.safetensors",
927
+ "model.layers.78.self_attn.o_proj.weight": "model-00036-of-00037.safetensors",
928
+ "model.layers.78.self_attn.q_proj.bias": "model-00036-of-00037.safetensors",
929
+ "model.layers.78.self_attn.q_proj.weight": "model-00036-of-00037.safetensors",
930
+ "model.layers.78.self_attn.v_proj.bias": "model-00036-of-00037.safetensors",
931
+ "model.layers.78.self_attn.v_proj.weight": "model-00036-of-00037.safetensors",
932
+ "model.layers.79.input_layernorm.weight": "model-00036-of-00037.safetensors",
933
+ "model.layers.79.mlp.down_proj.weight": "model-00037-of-00037.safetensors",
934
+ "model.layers.79.mlp.gate_proj.weight": "model-00037-of-00037.safetensors",
935
+ "model.layers.79.mlp.up_proj.weight": "model-00036-of-00037.safetensors",
936
+ "model.layers.79.post_attention_layernorm.weight": "model-00036-of-00037.safetensors",
937
+ "model.layers.79.self_attn.k_proj.bias": "model-00036-of-00037.safetensors",
938
+ "model.layers.79.self_attn.k_proj.weight": "model-00036-of-00037.safetensors",
939
+ "model.layers.79.self_attn.o_proj.weight": "model-00036-of-00037.safetensors",
940
+ "model.layers.79.self_attn.q_proj.bias": "model-00036-of-00037.safetensors",
941
+ "model.layers.79.self_attn.q_proj.weight": "model-00036-of-00037.safetensors",
942
+ "model.layers.79.self_attn.v_proj.bias": "model-00036-of-00037.safetensors",
943
+ "model.layers.79.self_attn.v_proj.weight": "model-00036-of-00037.safetensors",
944
+ "model.layers.8.input_layernorm.weight": "model-00005-of-00037.safetensors",
945
+ "model.layers.8.mlp.down_proj.weight": "model-00005-of-00037.safetensors",
946
+ "model.layers.8.mlp.gate_proj.weight": "model-00005-of-00037.safetensors",
947
+ "model.layers.8.mlp.up_proj.weight": "model-00005-of-00037.safetensors",
948
+ "model.layers.8.post_attention_layernorm.weight": "model-00005-of-00037.safetensors",
949
+ "model.layers.8.self_attn.k_proj.bias": "model-00005-of-00037.safetensors",
950
+ "model.layers.8.self_attn.k_proj.weight": "model-00005-of-00037.safetensors",
951
+ "model.layers.8.self_attn.o_proj.weight": "model-00005-of-00037.safetensors",
952
+ "model.layers.8.self_attn.q_proj.bias": "model-00005-of-00037.safetensors",
953
+ "model.layers.8.self_attn.q_proj.weight": "model-00005-of-00037.safetensors",
954
+ "model.layers.8.self_attn.v_proj.bias": "model-00005-of-00037.safetensors",
955
+ "model.layers.8.self_attn.v_proj.weight": "model-00005-of-00037.safetensors",
956
+ "model.layers.9.input_layernorm.weight": "model-00005-of-00037.safetensors",
957
+ "model.layers.9.mlp.down_proj.weight": "model-00006-of-00037.safetensors",
958
+ "model.layers.9.mlp.gate_proj.weight": "model-00005-of-00037.safetensors",
959
+ "model.layers.9.mlp.up_proj.weight": "model-00005-of-00037.safetensors",
960
+ "model.layers.9.post_attention_layernorm.weight": "model-00005-of-00037.safetensors",
961
+ "model.layers.9.self_attn.k_proj.bias": "model-00005-of-00037.safetensors",
962
+ "model.layers.9.self_attn.k_proj.weight": "model-00005-of-00037.safetensors",
963
+ "model.layers.9.self_attn.o_proj.weight": "model-00005-of-00037.safetensors",
964
+ "model.layers.9.self_attn.q_proj.bias": "model-00005-of-00037.safetensors",
965
+ "model.layers.9.self_attn.q_proj.weight": "model-00005-of-00037.safetensors",
966
+ "model.layers.9.self_attn.v_proj.bias": "model-00005-of-00037.safetensors",
967
+ "model.layers.9.self_attn.v_proj.weight": "model-00005-of-00037.safetensors",
968
+ "model.norm.weight": "model-00037-of-00037.safetensors",
969
+ "score.0.bias": "model-00001-of-00037.safetensors",
970
+ "score.0.weight": "model-00001-of-00037.safetensors",
971
+ "score.2.bias": "model-00001-of-00037.safetensors",
972
+ "score.2.weight": "model-00001-of-00037.safetensors"
973
+ }
974
+ }
modeling_qwen2_rm.py ADDED
@@ -0,0 +1,1549 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ """PyTorch Qwen2 model."""
21
+
22
+ import math
23
+ from typing import List, Optional, Tuple, Union
24
+
25
+ import torch
26
+ import torch.utils.checkpoint
27
+ from torch import nn
28
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
29
+
30
+ from transformers.activations import ACT2FN
31
+ from transformers.cache_utils import Cache, DynamicCache#, StaticCache
32
+ from transformers.modeling_attn_mask_utils import AttentionMaskConverter
33
+ from transformers.modeling_outputs import (
34
+ BaseModelOutputWithPast,
35
+ CausalLMOutputWithPast,
36
+ SequenceClassifierOutputWithPast,
37
+ TokenClassifierOutput,
38
+ )
39
+ from transformers.modeling_utils import PreTrainedModel
40
+ from transformers.utils import (
41
+ add_start_docstrings,
42
+ add_start_docstrings_to_model_forward,
43
+ is_flash_attn_2_available,
44
+ is_flash_attn_greater_or_equal_2_10,
45
+ logging,
46
+ replace_return_docstrings,
47
+ )
48
+ from configuration_qwen2_rm import Qwen2RMConfig as Qwen2Config
49
+
50
+
51
+ # if is_flash_attn_2_available():
52
+ # from transformers.modeling_flash_attention_utils import _flash_attention_forward
53
+
54
+
55
# Module-level logger, obtained from the `transformers` logging utilities and
# keyed by this module's import name.
logger = logging.get_logger(__name__)


# Identifiers kept for documentation purposes. Presumably consumed by the
# docstring decorators imported above (e.g. `replace_return_docstrings`);
# their use sites are outside this excerpt -- TODO confirm.
_CHECKPOINT_FOR_DOC = "Qwen/Qwen2-7B-beta"
_CONFIG_FOR_DOC = "Qwen2Config"
60
+
61
+
62
+ # Copied from transformers.models.llama.modeling_llama._prepare_4d_causal_attention_mask_with_cache_position
63
+ def _prepare_4d_causal_attention_mask_with_cache_position(
64
+ attention_mask: torch.Tensor,
65
+ sequence_length: int,
66
+ target_length: int,
67
+ dtype: torch.dtype,
68
+ device: torch.device,
69
+ min_dtype: float,
70
+ cache_position: torch.Tensor,
71
+ batch_size: int,
72
+ ):
73
+ """
74
+ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
75
+ `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
76
+
77
+ Args:
78
+ attention_mask (`torch.Tensor`):
79
+ A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
80
+ sequence_length (`int`):
81
+ The sequence length being processed.
82
+ target_length (`int`):
83
+ The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
84
+ dtype (`torch.dtype`):
85
+ The dtype to use for the 4D attention mask.
86
+ device (`torch.device`):
87
+ The device to plcae the 4D attention mask on.
88
+ min_dtype (`float`):
89
+ The minimum value representable with the dtype `dtype`.
90
+ cache_position (`torch.Tensor`):
91
+ Indices depicting the position of the input sequence tokens in the sequence.
92
+ batch_size (`torch.Tensor`):
93
+ Batch size.
94
+ """
95
+ if attention_mask is not None and attention_mask.dim() == 4:
96
+ # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
97
+ causal_mask = attention_mask
98
+ else:
99
+ causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
100
+ if sequence_length != 1:
101
+ causal_mask = torch.triu(causal_mask, diagonal=1)
102
+ causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
103
+ causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
104
+ if attention_mask is not None:
105
+ causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
106
+ mask_length = attention_mask.shape[-1]
107
+ padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
108
+ padding_mask = padding_mask == 0
109
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
110
+ padding_mask, min_dtype
111
+ )
112
+
113
+ return causal_mask
114
+
115
+
116
+ # Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Qwen2
117
class Qwen2RMSNorm(nn.Module):
    """Root-mean-square normalization over the last dimension with a learned per-feature scale."""

    def __init__(self, hidden_size, eps=1e-6):
        """
        Qwen2RMSNorm is equivalent to T5LayerNorm

        Args:
            hidden_size: size of the last (normalized) dimension; the scale `weight` starts as all ones.
            eps: constant added to the mean square before taking the reciprocal square root.
        """
        super().__init__()
        self.variance_epsilon = eps
        self.weight = nn.Parameter(torch.ones(hidden_size))

    def forward(self, hidden_states):
        """Normalize `hidden_states` by its RMS (computed in float32), cast back, then apply `weight`."""
        original_dtype = hidden_states.dtype
        upcast = hidden_states.to(torch.float32)
        mean_square = upcast.pow(2).mean(-1, keepdim=True)
        normalized = upcast * torch.rsqrt(mean_square + self.variance_epsilon)
        return self.weight * normalized.to(original_dtype)

    def extra_repr(self):
        """Return the weight shape and epsilon for the module's printed representation."""
        weight_shape = tuple(self.weight.shape)
        return f"{weight_shape}, eps={self.variance_epsilon}"
135
+
136
+
137
+ # Copied from transformers.models.mixtral.modeling_mixtral.MixtralRotaryEmbedding with Mixtral->Qwen2
138
class Qwen2RotaryEmbedding(nn.Module):
    """
    Precomputes and serves the cosine / sine tables used for rotary position embeddings.

    The tables are stored as non-persistent buffers (`cos_cached`, `sin_cached`) of shape `(seq_len, dim)` and are
    rebuilt whenever a longer sequence than the cached one is requested.
    """

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        """
        Args:
            dim: size of the rotary dimension (the tables' last dimension).
            max_position_embeddings: number of positions to precompute at construction time.
            base: base of the geometric progression used for the inverse frequencies.
            device: device on which the inverse frequencies are created.
        """
        super().__init__()

        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        # Build here to make `torch.jit.trace` work.
        self._set_cos_sin_cache(
            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
        )

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """(Re)build the cos/sin tables for positions `0 .. seq_len - 1` and record the cached length."""
        self.max_seq_len_cached = seq_len
        t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)

        freqs = torch.outer(t, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)

    def forward(self, x, seq_len=None):
        """
        Return the `(cos, sin)` tables truncated to `seq_len` rows and cast to `x.dtype`.

        Args:
            x: tensor laid out as `[bs, num_attention_heads, seq_len, head_size]`; only its dtype, device and (when
                `seq_len` is omitted) its second-to-last dimension are used.
            seq_len: number of positions to return. Defaults to `x.shape[-2]` when `None`.
        """
        # x: [bs, num_attention_heads, seq_len, head_size]
        if seq_len is None:
            # The declared default used to crash on the comparison below; fall back to the length carried by `x`.
            seq_len = x.shape[-2]
        if seq_len > self.max_seq_len_cached:
            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)

        return (
            self.cos_cached[:seq_len].to(dtype=x.dtype),
            self.sin_cached[:seq_len].to(dtype=x.dtype),
        )
172
+
173
+
174
+ # Copied from transformers.models.llama.modeling_llama.rotate_half
175
def rotate_half(x):
    """Split the last dimension into two halves `(a, b)` and return `(-b, a)` concatenated along that dimension."""
    midpoint = x.shape[-1] // 2
    front, back = x[..., :midpoint], x[..., midpoint:]
    return torch.cat((-back, front), dim=-1)
180
+
181
+
182
+ # Copied from transformers.models.mixtral.modeling_mixtral.apply_rotary_pos_emb
183
def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
    """Rotate the query and key tensors with the given rotary cos/sin tables.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine table of the rotary embedding, indexed by position.
        sin (`torch.Tensor`): The sine table of the rotary embedding, indexed by position.
        position_ids (`torch.Tensor`):
            Position indices of the tokens behind `q` and `k`; used to gather rows from `cos` and `sin`. Offset
            position ids can be passed here when working with a KV-cache.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            Dimension inserted into the gathered `cos[position_ids]` / `sin[position_ids]` so that they broadcast
            against `q` and `k`. With gathered tables of shape [batch_size, seq_len, head_dim], use 1 when `q` and
            `k` are [batch_size, heads, seq_len, head_dim] and 2 when they are
            [batch_size, seq_len, heads, head_dim].
    Returns:
        `tuple(torch.Tensor)`: the rotated query and key tensors, in that order.
    """
    cos_at_pos = cos[position_ids].unsqueeze(unsqueeze_dim)
    sin_at_pos = sin[position_ids].unsqueeze(unsqueeze_dim)

    def _rotate(states):
        # Same formula for queries and keys: x * cos + rotate_half(x) * sin.
        return (states * cos_at_pos) + (rotate_half(states) * sin_at_pos)

    return _rotate(q), _rotate(k)
209
+
210
+
211
+ # Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Qwen2
212
class Qwen2MLP(nn.Module):
    """Gated feed-forward block: `down_proj(act_fn(gate_proj(x)) * up_proj(x))`, all projections bias-free."""

    def __init__(self, config):
        """Read `hidden_size`, `intermediate_size` and `hidden_act` from `config` and build the three projections."""
        super().__init__()
        model_width, inner_width = config.hidden_size, config.intermediate_size
        self.hidden_size = model_width
        self.intermediate_size = inner_width
        self.gate_proj = nn.Linear(model_width, inner_width, bias=False)
        self.up_proj = nn.Linear(model_width, inner_width, bias=False)
        self.down_proj = nn.Linear(inner_width, model_width, bias=False)
        # Activation looked up by name in the `transformers` activation registry.
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, hidden_state):
        """Apply the gated projection to `hidden_state` and map the result back to `hidden_size`."""
        gate = self.act_fn(self.gate_proj(hidden_state))
        candidate = self.up_proj(hidden_state)
        return self.down_proj(gate * candidate)
224
+
225
+
226
+ # Copied from transformers.models.llama.modeling_llama.repeat_kv
227
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    Repeat every key/value head `n_rep` times, matching torch.repeat_interleave(x, dim=1, repeats=n_rep).

    Input is (batch, num_key_value_heads, seqlen, head_dim); output is
    (batch, num_key_value_heads * n_rep, seqlen, head_dim). With `n_rep == 1` the input tensor itself is returned.
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep != 1:
        # Insert a repeat axis right after the head axis, broadcast it, then fold it into the head axis.
        tiled = hidden_states.unsqueeze(2).expand(batch, num_key_value_heads, n_rep, slen, head_dim)
        return tiled.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
    return hidden_states
237
+
238
+
239
class Qwen2Attention(nn.Module):
    """
    Multi-headed attention from 'Attention Is All You Need' paper, in its explicit matmul/softmax form.

    Queries use `num_attention_heads` heads while keys/values use `num_key_value_heads` heads that are repeated to
    match; rotary position embeddings are applied to queries and keys. This class contains no sliding-window logic
    of its own: it simply adds whatever additive `attention_mask` it is given to the attention scores.
    """

    def __init__(self, config: Qwen2Config, layer_idx: Optional[int] = None):
        """
        Args:
            config: model configuration; `hidden_size`, `num_attention_heads`, `num_key_value_heads`,
                `max_position_embeddings`, `rope_theta` and `attention_dropout` are read from it.
            layer_idx: index of this layer, required to address the key/value cache during the forward pass.

        Raises:
            ValueError: if `hidden_size` is not divisible by the number of attention heads.
        """
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
                "lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.num_key_value_heads = config.num_key_value_heads
        # How many query heads share each key/value head.
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.is_causal = True
        self.attention_dropout = config.attention_dropout

        if (self.head_dim * self.num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )
        # q/k/v projections carry a bias; the output projection does not.
        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True)
        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)

        self.rotary_emb = Qwen2RotaryEmbedding(
            self.head_dim,
            max_position_embeddings=self.max_position_embeddings,
            base=self.rope_theta,
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """
        Run attention over `hidden_states`.

        Args:
            hidden_states: input of shape `(batch, q_len, hidden_size)`.
            attention_mask: optional 4D additive mask; it is sliced on its last dimension to the key length and
                added to the attention scores.
            position_ids: position indices used to gather the rotary cos/sin rows.
            past_key_value: optional cache; when given, the new keys/values are stored in it under `layer_idx` and
                the full cached keys/values are used for attention.
            output_attentions: when `False`, the returned attention weights are `None`.
            use_cache: accepted for interface compatibility; not read in this method.
            cache_position: forwarded to the cache update.

        Returns:
            `(attn_output, attn_weights, past_key_value)` where `attn_output` has shape
            `(batch, q_len, hidden_size)`.

        Raises:
            ValueError: if a cache is passed but `layer_idx` was not set, or if intermediate tensors have
                unexpected shapes.
        """
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        # (batch, seq, heads * head_dim) -> (batch, heads, seq, head_dim)
        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            if self.layer_idx is None:
                raise ValueError(
                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                    "with a layer index."
                )
            # Account for the positions already stored in the cache for this layer.
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # repeat k/v heads if n_kv_heads < n_heads
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        # Scaled dot-product scores: (batch, heads, q_len, kv_len)
        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)

        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:  # no matter the length, we just slice it
            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
            attn_weights = attn_weights + causal_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        # (batch, heads, q_len, head_dim) -> (batch, q_len, hidden_size)
        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value
354
+
355
+
356
class Qwen2FlashAttention2(Qwen2Attention):
    """
    Qwen2 flash attention module, following Qwen2 attention module. This module inherits from `Qwen2Attention`
    as the weights of the module stays untouched. The only required change would be on the forward pass
    where it needs to correctly call the public API of flash attention and deal with padding tokens
    in case the input contains any of them.

    Sliding window attention is forwarded to the kernel only when `config.use_sliding_window` is set,
    `config.sliding_window` is not None, and `layer_idx >= config.max_window_layers`.

    NOTE(review): the previous wording of this docstring said SWA applies to the "bottom
    config.max_window_layers layers", which does not match the `>=` comparison in `forward` — confirm
    which of the two is intended.
    """

    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
    ):
        """
        Run self-attention through `_flash_attention_forward`.

        Args:
            hidden_states: input whose `.size()` unpacks to `(bsz, q_len, _)`.
            attention_mask: optional mask handed to `_flash_attention_forward`; it is indexed as
                `attention_mask[:, slicing_tokens:]` when the sliding-window cache path is taken.
            position_ids: optional position indices used for the rotary embedding.
            past_key_value: optional `Cache` that is updated in place with the new key/value states.
            output_attentions: accepted for interface compatibility; attention weights are never
                available on this path (see Returns).
            use_cache: accepted for interface compatibility; not read in this method.
            cache_position: forwarded to `past_key_value.update` through `cache_kwargs`.

        Returns:
            `(attn_output, None, past_key_value)`. The second element is always `None` because
            `_flash_attention_forward` only returns the attention output.

        Raises:
            ValueError: if a cache is given but the module has no `layer_idx`, or if the sliced past key
                does not have `config.sliding_window - 1` positions.
        """
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            if self.layer_idx is None:
                raise ValueError(
                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                    "with a layer index."
                )
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)

        # Because the input can be padded, the absolute sequence length depends on the max position id.
        rotary_seq_len = (
            max(kv_seq_len, position_ids[:, -1].max().item() + 1) if position_ids is not None else kv_seq_len
        )

        cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len)

        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

        if past_key_value is not None:
            # Activate slicing cache only if the config has a value `sliding_windows` attribute
            cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0
            if (
                getattr(self.config, "sliding_window", None) is not None
                and kv_seq_len > self.config.sliding_window
                and cache_has_contents
            ):
                slicing_tokens = 1 - self.config.sliding_window

                past_key = past_key_value[self.layer_idx][0]
                past_value = past_key_value[self.layer_idx][1]

                past_key = past_key[:, :, slicing_tokens:, :].contiguous()
                past_value = past_value[:, :, slicing_tokens:, :].contiguous()

                if past_key.shape[-2] != self.config.sliding_window - 1:
                    raise ValueError(
                        f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got"
                        f" {past_key.shape}"
                    )

                if attention_mask is not None:
                    attention_mask = attention_mask[:, slicing_tokens:]
                    attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1)

            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # repeat k/v heads if n_kv_heads < n_heads
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)
        dropout_rate = 0.0 if not self.training else self.attention_dropout

        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
        # therefore the input hidden states gets silently casted in float32. Hence, we need
        # cast them back in float16 just to be sure everything works as expected.
        input_dtype = query_states.dtype
        if input_dtype == torch.float32:
            if torch.is_autocast_enabled():
                target_dtype = torch.get_autocast_gpu_dtype()
            # Handle the case where the model is quantized
            elif hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            else:
                target_dtype = self.q_proj.weight.dtype

            logger.warning_once(
                f"The input hidden states seems to be silently casted in float32, this might be related to"
                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
                f" {target_dtype}."
            )

            query_states = query_states.to(target_dtype)
            key_states = key_states.to(target_dtype)
            value_states = value_states.to(target_dtype)

        # Reshape to the layout expected by Flash Attention (sequence dimension before heads).
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        if (
            self.config.use_sliding_window
            and getattr(self.config, "sliding_window", None) is not None
            and self.layer_idx >= self.config.max_window_layers
        ):
            sliding_window = self.config.sliding_window
        else:
            sliding_window = None

        attn_output = _flash_attention_forward(
            query_states,
            key_states,
            value_states,
            attention_mask,
            q_len,
            position_ids=position_ids,
            dropout=dropout_rate,
            sliding_window=sliding_window,
            is_causal=self.is_causal,
            use_top_left_mask=self._flash_attn_uses_top_left_mask,
        )

        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
        attn_output = self.o_proj(attn_output)

        # `_flash_attention_forward` only hands back the attention output, so there are no weights to
        # return. Previously `attn_weights` was left unassigned when `output_attentions=True`, which made
        # the return statement raise UnboundLocalError.
        if output_attentions:
            logger.warning_once(
                "Qwen2FlashAttention2 does not return attention weights; `output_attentions=True` is ignored "
                "and `None` is returned instead."
            )
        attn_weights = None

        return attn_output, attn_weights, past_key_value
504
+
505
+
506
# Adapted from transformers.models.mixtral.modeling_mixtral.MixtralSdpaAttention with Mixtral->Qwen2
class Qwen2SdpaAttention(Qwen2Attention):
    """
    Qwen2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
    `Qwen2Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
    SDPA API.
    """

    # Adapted from Qwen2Attention.forward
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """
        Run self-attention through `torch.nn.functional.scaled_dot_product_attention`.

        When `output_attentions` is true the call is delegated to the parent (eager) `forward`, because
        SDPA does not expose the attention weights. Otherwise returns `(attn_output, None, past_key_value)`.

        Args:
            hidden_states: input whose `.size()` unpacks to `(bsz, q_len, _)`.
            attention_mask: optional mask; it is sliced on its last dimension to the key length and passed
                to SDPA as `attn_mask`.
            position_ids: optional position indices used for the rotary embedding.
            past_key_value: optional `Cache` updated in place with the new key/value states.
            output_attentions: if true, fall back to the eager implementation.
            use_cache: only forwarded to the eager fallback; not read on the SDPA path.
            cache_position: forwarded to `past_key_value.update` through `cache_kwargs`.
        """
        if output_attentions:
            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
            logger.warning_once(
                "Qwen2Model is using Qwen2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
            )
            # Forward `cache_position` as well: the eager path puts it into `cache_kwargs` for the cache
            # update, and dropping it here would silently replace it with None.
            return super().forward(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
            )

        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)

        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # repeat k/v heads if n_kv_heads < n_heads
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        causal_mask = attention_mask
        if attention_mask is not None:  # no matter the length, we just slice it
            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]

        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
        # Reference: https://github.com/pytorch/pytorch/issues/112577.
        if query_states.device.type == "cuda" and attention_mask is not None:
            query_states = query_states.contiguous()
            key_states = key_states.contiguous()
            value_states = value_states.contiguous()

        # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
        # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
        # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
        is_causal = True if causal_mask is None and q_len > 1 else False

        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=causal_mask,
            dropout_p=self.attention_dropout if self.training else 0.0,
            is_causal=is_causal,
        )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.view(bsz, q_len, self.hidden_size)

        attn_output = self.o_proj(attn_output)

        return attn_output, None, past_key_value
595
+
596
+
597
# Lookup table from `config._attn_implementation` to the attention module class that
# `Qwen2DecoderLayer` instantiates.
QWEN2_ATTENTION_CLASSES = {
    "eager": Qwen2Attention,
    "flash_attention_2": Qwen2FlashAttention2,
    "sdpa": Qwen2SdpaAttention,
}
602
+
603
+
604
class Qwen2DecoderLayer(nn.Module):
    """
    One decoder block: pre-norm self-attention with a residual connection, followed by a pre-norm MLP
    with a residual connection.
    """

    def __init__(self, config: Qwen2Config, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        # Only warn when sliding-window attention is actually switched on. `sliding_window` alone may be
        # set while `use_sliding_window` is false, in which case no backend applies a window and the
        # warning would be misleading. Configs without the flag keep the previous behaviour.
        swa_requested = bool(config.sliding_window) and getattr(config, "use_sliding_window", True)
        if swa_requested and config._attn_implementation != "flash_attention_2":
            logger.warning_once(
                f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
                "unexpected results may be encountered."
            )
        self.self_attn = QWEN2_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)

        self.mlp = Qwen2MLP(config)
        self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            position_ids (`torch.LongTensor`, *optional*): position indices forwarded to the attention module.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
                into the model

        Returns:
            A tuple starting with the layer output, followed by the attention weights when
            `output_attentions` is true and by the present key/value cache when `use_cache` is true.
        """

        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
        )
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        if use_cache:
            outputs += (present_key_value,)

        return outputs
681
+
682
+
683
# Shared class-level documentation; it is passed to `add_start_docstrings` on the model classes below.
QWEN2_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`Qwen2Config`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
698
+
699
+
700
@add_start_docstrings(
    "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.",
    QWEN2_START_DOCSTRING,
)
class Qwen2PreTrainedModel(PreTrainedModel):
    """Common base for the Qwen2 model classes: declares framework capability flags and weight init."""

    config_class = Qwen2Config
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["Qwen2DecoderLayer"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_cache_class = True

    def _init_weights(self, module):
        """
        Initialize `module` in place.

        `nn.Linear` and `nn.Embedding` weights are drawn from a normal distribution with mean 0 and
        standard deviation `config.initializer_range`; linear biases and the embedding row at
        `padding_idx` (when set) are zeroed. Other module types are left untouched.
        """
        init_std = self.config.initializer_range

        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=init_std)
            if module.bias is not None:
                module.bias.data.zero_()
            return

        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=init_std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
724
+
725
+
726
# Shared documentation of the `forward` arguments; it is passed to
# `add_start_docstrings_to_model_forward` on the model classes below.
# The `attention_mask` entry previously referred to `decoder_input_ids` and carried two head-mask
# bullets copied from another model's docs; this model takes neither argument, so both were corrected.
QWEN2_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
            `past_key_values`).

            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.

            [What are position IDs?](../glossary#position-ids)
        past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.

            Two formats are allowed:
            - a [`~cache_utils.Cache`] instance;
            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
            shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
            cache format.

            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
            legacy cache format will be returned.

            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
            of shape `(batch_size, sequence_length)`.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
            Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
            this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
            the complete sequence length.
"""
798
+
799
+
800
@add_start_docstrings(
    "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.",
    QWEN2_START_DOCSTRING,
)
class Qwen2Model(Qwen2PreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2DecoderLayer`]

    Args:
        config: Qwen2Config
    """

    def __init__(self, config: Qwen2Config):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
            [Qwen2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self._attn_implementation = config._attn_implementation
        self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the token embedding module."""
        return self.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the token embedding module with `value`."""
        self.embed_tokens = value

    @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        # Flags left as None fall back to the values stored on the config.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Exactly one of `input_ids` / `inputs_embeds` must be provided.
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
            )

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        use_legacy_cache = False
        if use_cache and not isinstance(past_key_values, Cache) and not self.training:
            # The caller gets the legacy tuple format back in both cases (tuple passed, or nothing passed).
            use_legacy_cache = True
            # Only warn about the deprecated format when a legacy cache was actually supplied;
            # warning on `None` would tell the user they passed a tuple when they did not.
            if past_key_values is not None:
                logger.warning_once(
                    "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
                    "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
                )
            past_key_values = DynamicCache.from_legacy_cache(past_key_values)

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )
        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )

        hidden_states = inputs_embeds

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = None

        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                # Positional order must match `Qwen2DecoderLayer.forward`.
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    causal_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    use_cache,
                    cache_position,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=causal_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    cache_position=cache_position,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                # The cache sits after the attention weights when those are returned.
                next_decoder_cache = layer_outputs[2 if output_attentions else 1]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = None
        if use_cache:
            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache

        if not return_dict:
            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )

    # Copied from transformers.models.llama.modeling_llama.LlamaModel._update_causal_mask
    def _update_causal_mask(
        self,
        attention_mask: torch.Tensor,
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool,
    ):
        """
        Build the mask handed to the decoder layers.

        Returns the incoming `attention_mask` (or None) unchanged for flash attention, None when the SDPA
        path can rely on its own `is_causal` handling, and otherwise the result of
        `_prepare_4d_causal_attention_mask_with_cache_position`.
        """
        # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
        # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes.
        # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
        # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114

        if self.config._attn_implementation == "flash_attention_2":
            # Flash attention only needs the mask when it actually contains padding (zeros).
            if attention_mask is not None and 0.0 in attention_mask:
                return attention_mask
            return None

        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
        # to infer the attention mask.
        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        # Static-cache detection is hard-wired off in this file (the original check was
        # `isinstance(past_key_values, StaticCache)`), so the static-cache branch below is never taken.
        using_static_cache = False  # isinstance(past_key_values, StaticCache)

        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
        if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype, device = input_tensor.dtype, input_tensor.device
        min_dtype = torch.finfo(dtype).min
        sequence_length = input_tensor.shape[1]
        if using_static_cache:
            target_length = past_key_values.get_max_length()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
        causal_mask = _prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            device=device,
            min_dtype=min_dtype,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
        )

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type == "cuda"
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
            # Details: https://github.com/pytorch/pytorch/issues/110213
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask
1023
+
1024
+
1025
class Qwen2ForCausalLM(Qwen2PreTrainedModel):
    """Qwen2 backbone (`Qwen2Model`) with a bias-free linear head mapping hidden states to vocabulary logits."""

    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model = Qwen2Model(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the backbone's token embedding module."""
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the backbone's token embedding module with `value`."""
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        """Return the language-modeling head."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace the language-modeling head with `new_embeddings`."""
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        """Replace the wrapped backbone with `decoder`."""
        self.model = decoder

    def get_decoder(self):
        """Return the wrapped backbone."""
        return self.model

    @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the language modeling (next-token prediction) loss. Indices should either be in
                `[0, ..., config.vocab_size - 1]` or -100 (see `input_ids` docstring). Tokens with indices set to
                `-100` are ignored (masked), the loss is only computed for the tokens with labels in
                `[0, ..., config.vocab_size - 1]`.

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Qwen2ForCausalLM

        >>> model = Qwen2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""

        # Fall back to the configuration for any flag the caller left unset.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        hidden_states = outputs[0]
        logits = self.lm_head(hidden_states)
        # Cast to float32 before the loss computation / before returning.
        logits = logits.float()

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    # Adapted from transformers.models.llama.modeling_llama.LlamaForCausalLM.prepare_inputs_for_generation
    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        cache_position=None,
        position_ids=None,
        use_cache=True,
        **kwargs,
    ):
        """
        Assemble the keyword arguments passed to `forward` for one generation step.

        When a cache is supplied, `input_ids` is trimmed to the tokens selected by `cache_position`. When
        `position_ids` is not given, it is derived from `attention_mask`. `inputs_embeds` is forwarded only when
        `cache_position[0] == 0`; otherwise `input_ids` is forwarded. Extra `**kwargs` are accepted and ignored.

        Returns:
            `dict` containing either `inputs_embeds` or `input_ids`, plus `position_ids`, `cache_position`,
            `past_key_values`, `use_cache` and `attention_mask`.
        """
        # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
        # Exception 1: when passing input_embeds, input_ids may be missing entries
        # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
        if past_key_values is not None:
            if inputs_embeds is not None:  # Exception 1
                input_ids = input_ids[:, -cache_position.shape[0] :]
            elif input_ids.shape[1] != cache_position.shape[0]:  # Default case (the "else", a no op, is Exception 2)
                input_ids = input_ids[:, cache_position]

        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -input_ids.shape[1] :]

                # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s
                # `mode="reduce-overhead"`, as otherwise the input `position_ids` would have varying strides during
                # decoding. Simply using `.contiguous()` is not sufficient: in the batch size = 1 case,
                # `position_ids` is already contiguous but with a varying stride, which retriggers a capture.
                position_ids = position_ids.clone(memory_format=torch.contiguous_format)

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and cache_position[0] == 0:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        # NOTE: the 2D -> 4D attention-mask expansion for static caches that used to sit here was guarded by a
        # constant-false condition and therefore never executed; the mask is forwarded unchanged.

        model_inputs.update(
            {
                "position_ids": position_ids,
                "cache_position": cache_position,
                "past_key_values": past_key_values,
                "use_cache": use_cache,
                "attention_mask": attention_mask,
            }
        )
        return model_inputs
1215
+
1216
+
1217
@add_start_docstrings(
    """
    The Qwen2 Model transformer with a sequence classification head on top (linear layer).

    [`Qwen2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """,
    QWEN2_START_DOCSTRING,
)
class Qwen2ForSequenceClassification(Qwen2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = Qwen2Model(config)
        # Bias-free projection from the hidden size to one score per label.
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the backbone's token embedding module."""
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the backbone's token embedding module with `value`."""
        self.model.embed_tokens = value

    @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Targets used to compute the sequence classification/regression loss. Class indices are expected in
            `[0, ..., config.num_labels - 1]`. With `config.num_labels == 1` a regression (mean-squared error) loss
            is computed; with `config.num_labels > 1` a classification (cross-entropy) loss is computed.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        backbone_out = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # One score vector per position; a single position per row is selected below.
        logits = self.score(backbone_out[0])

        batch_size = (input_ids if input_ids is not None else inputs_embeds).shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")

        if self.config.pad_token_id is not None and input_ids is not None:
            # Index just before the first pad token in each row. When a row has no pad token the argmax is 0, so
            # the modulo wraps -1 to the last position (modulo instead of reverse indexing for ONNX compatibility).
            first_pad = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1)
            sequence_lengths = ((first_pad - 1) % input_ids.shape[-1]).to(logits.device)
        else:
            # No pad token configured, or only embeddings were given: take the last position of every row.
            sequence_lengths = -1

        row_index = torch.arange(batch_size, device=logits.device)
        pooled_logits = logits[row_index, sequence_lengths]

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)

            # Infer the problem type once and store it on the config when it was not set explicitly.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    inferred_type = "regression"
                elif self.num_labels > 1 and labels.dtype in (torch.long, torch.int):
                    inferred_type = "single_label_classification"
                else:
                    inferred_type = "multi_label_classification"
                self.config.problem_type = inferred_type

            problem_type = self.config.problem_type
            if problem_type == "regression":
                criterion = MSELoss()
                if self.num_labels == 1:
                    loss = criterion(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = criterion(pooled_logits, labels)
            elif problem_type == "single_label_classification":
                criterion = CrossEntropyLoss()
                loss = criterion(pooled_logits.view(-1, self.num_labels), labels.view(-1))
            elif problem_type == "multi_label_classification":
                criterion = BCEWithLogitsLoss()
                loss = criterion(pooled_logits, labels)

        if not return_dict:
            tuple_out = (pooled_logits,) + backbone_out[1:]
            if loss is None:
                return tuple_out
            return (loss,) + tuple_out

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=backbone_out.past_key_values,
            hidden_states=backbone_out.hidden_states,
            attentions=backbone_out.attentions,
        )
1338
+
1339
+
1340
@add_start_docstrings(
    """
    The Qwen2 Model transformer with a token classification head on top (a linear layer on top of the hidden-states
    output) e.g. for Named-Entity-Recognition (NER) tasks.
    """,
    QWEN2_START_DOCSTRING,
)
# Adapted from transformers.models.llama.modeling_llama.LlamaForTokenClassification with Llama->Qwen2, LLAMA->QWEN2
class Qwen2ForTokenClassification(Qwen2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = Qwen2Model(config)
        # Dropout rate precedence: `classifier_dropout`, then `hidden_dropout`, then a fixed 0.1 fallback.
        if getattr(config, "classifier_dropout", None) is not None:
            classifier_dropout = config.classifier_dropout
        elif getattr(config, "hidden_dropout", None) is not None:
            classifier_dropout = config.hidden_dropout
        else:
            classifier_dropout = 0.1
        self.dropout = nn.Dropout(classifier_dropout)
        self.score = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the backbone's token embedding module."""
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the backbone's token embedding module with `value`."""
        self.model.embed_tokens = value

    @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss (cross-entropy over every position). Indices should
            be in `[0, ..., config.num_labels - 1]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output)
        logits = self.score(sequence_output)

        loss = None
        if labels is not None:
            # Enable model parallelism: keep the labels on the same device as the logits, as the other heads do.
            labels = labels.to(logits.device)
            loss_fct = CrossEntropyLoss()
            # Flatten to (batch * sequence, num_labels) vs. (batch * sequence,) so every token is one sample.
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            # Skips index 1 of the backbone tuple as well as the hidden states at index 0.
            # NOTE(review): presumably index 1 is the key/value cache - verify against Qwen2Model's tuple layout
            # when `use_cache` is disabled.
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
1423
+
1424
+
1425
@add_start_docstrings(
    """
    The Qwen2 Model transformer with a scalar reward head on top (a two-layer MLP: linear, ReLU, linear with a single
    output).

    [`Qwen2ForRewardModel`] uses the last token in order to compute the reward, in the same way that causal
    sequence classification models (e.g. GPT-2) use the last token.

    Since the reward is read from the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it takes the token just before the first padding token in each row
    (or the last token of the row when the row contains no padding token). If no `pad_token_id` is defined, it simply
    takes the last value in each row of the batch. Since it cannot guess the padding tokens when `inputs_embeds` are
    passed instead of `input_ids`, it does the same (take the last value in each row of the batch).
    """,
    QWEN2_START_DOCSTRING,
)
class Qwen2ForRewardModel(Qwen2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        # The reward head always produces a single value; `config.num_labels` is not consulted.
        self.num_labels = 1
        self.model = Qwen2Model(config)
        self.score = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size),
            nn.ReLU(),
            nn.Linear(config.hidden_size, self.num_labels),
        )

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the backbone's token embedding module."""
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the backbone's token embedding module with `value`."""
        self.model.embed_tokens = value

    @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.Tensor` of shape `(batch_size,)`, *optional*):
            Targets for the reward loss. Because this head always has a single output, a regression (mean-squared
            error) loss is computed unless `config.problem_type` has been set explicitly to another value.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]
        # The head is applied at every position; one position per row is selected below.
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
        if self.config.pad_token_id is None:
            sequence_lengths = -1
        else:
            if input_ids is not None:
                # Index just before the first pad token of each row. If no pad token is found the argmax is 0, so
                # the modulo wraps -1 to the last position (modulo instead of reverse indexing for ONNX
                # compatibility).
                sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
                sequence_lengths = sequence_lengths % input_ids.shape[-1]
                sequence_lengths = sequence_lengths.to(logits.device)
            else:
                # Padding cannot be located from embeddings alone: use the last position of every row.
                sequence_lengths = -1

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            # Infer the problem type once and store it on the config when it was not set explicitly.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)
        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "151643": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "151644": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "151645": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ }
28
+ },
29
+ "additional_special_tokens": ["<|im_start|>", "<|im_end|>"],
30
+ "bos_token": null,
31
+ "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{% if loop.last %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>'}}{% else %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endif %}{% endfor %}{{ '<|endoftext|>' }}",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "<|im_end|>",
34
+ "errors": "replace",
35
+ "model_max_length": 131072,
36
+ "pad_token": "<|endoftext|>",
37
+ "split_special_tokens": false,
38
+ "tokenizer_class": "Qwen2Tokenizer",
39
+ "unk_token": null
40
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff