from dataclasses import asdict, dataclass, field

from transformers import PretrainedConfig


@dataclass
class OrcaleSeekConfig:
    model_type: str = "orcaleseek"
    vocab_size: int = 50257
    n_embd: int = 768
    n_layer: int = 12
    n_head: int = 12
    n_inner: int = 3072
    activation_function: str = "gelu_new"
    resid_pdrop: float = 0.1
    embd_pdrop: float = 0.1
    attn_pdrop: float = 0.1
    layer_norm_epsilon: float = 1e-5
    initializer_range: float = 0.02
    scale_attn_weights: bool = True
    use_cache: bool = True
    bos_token_id: int = 50256
    eos_token_id: int = 50256
    # A mutable default needs a default_factory so each instance gets its own list.
    architectures: list = field(default_factory=lambda: ["OrcaleSeekForCausalLM"])

    def to_hf_config(self) -> PretrainedConfig:
        # asdict() captures every dataclass field, including architectures,
        # which self.__dict__ would miss if it were left as a bare class attribute.
        return PretrainedConfig(**asdict(self))
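

# Minimal usage sketch (illustrative; assumes transformers is installed).
# It builds the dataclass config and converts it to a Hugging Face
# PretrainedConfig, whose leftover keyword arguments become attributes.
if __name__ == "__main__":
    config = OrcaleSeekConfig()
    hf_config = config.to_hf_config()
    print(hf_config.model_type)   # "orcaleseek"
    print(hf_config.vocab_size)   # 50257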