from transformers import PretrainedConfig


class NanoGPTConfig(PretrainedConfig):
    """Architecture hyperparameters for a nanoGPT-style model, exposed as a
    HuggingFace PretrainedConfig so it serializes to/from config.json."""

    model_type = "nanogpt"

    def __init__(
        self,
        sequence_len: int = 1024,  # maximum context length
        vocab_size: int = 50304,   # GPT-2's 50257, rounded up to a multiple of 64 for GPU throughput
        n_layer: int = 12,         # number of transformer blocks
        n_head: int = 6,           # number of attention (query) heads
        n_kv_head: int = 6,        # key/value heads; fewer than n_head implies grouped-query attention
        n_embd: int = 768,         # embedding / hidden dimension
        bos_token_id: int = 0,
        eos_token_id: int = 1,
        pad_token_id: int = 1,
        **kwargs,
    ):
        self.sequence_len = sequence_len
        self.vocab_size = vocab_size
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_kv_head = n_kv_head
        self.n_embd = n_embd
        # Token ids and any extra kwargs are handled by the base class.
        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            pad_token_id=pad_token_id,
            **kwargs,
        )
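

# A minimal usage sketch, not part of the config class itself: it relies only
# on the standard PretrainedConfig API from transformers (save_pretrained /
# from_pretrained for config.json round-trips, AutoConfig.register for
# auto-class lookup). The "nanogpt-config" directory name is illustrative.
if __name__ == "__main__":
    from transformers import AutoConfig

    # Register the custom config so AutoConfig can resolve model_type="nanogpt".
    AutoConfig.register("nanogpt", NanoGPTConfig)

    config = NanoGPTConfig(n_layer=24, n_embd=1024)
    config.save_pretrained("nanogpt-config")  # writes nanogpt-config/config.json
    reloaded = AutoConfig.from_pretrained("nanogpt-config")
    assert isinstance(reloaded, NanoGPTConfig) and reloaded.n_layer == 24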