from transformers import PretrainedConfig


class NanoGPTConfig(PretrainedConfig):
    # model_type lets Hugging Face map a serialized config.json back to this
    # class (e.g. after AutoConfig.register("nanogpt", NanoGPTConfig)).
    model_type = "nanogpt"

    def __init__(
        self,
        sequence_len: int = 1024,  # maximum context length
        vocab_size: int = 50304,   # GPT-2's 50257 padded up to a multiple of 64
        n_layer: int = 12,         # number of transformer blocks
        n_head: int = 6,           # number of attention (query) heads
        n_kv_head: int = 6,        # key/value heads; fewer than n_head enables grouped-query attention
        n_embd: int = 768,         # embedding / hidden dimension
        **kwargs,
    ):
        self.sequence_len = sequence_len
        self.vocab_size = vocab_size
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_kv_head = n_kv_head
        self.n_embd = n_embd
        super().__init__(**kwargs)
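
# A minimal usage sketch, not part of the original snippet: round-trip the
# config through PretrainedConfig's standard save_pretrained/from_pretrained
# helpers. The overridden values and the "nanogpt-config" directory name are
# hypothetical.
if __name__ == "__main__":
    config = NanoGPTConfig(n_layer=24, n_head=12, n_kv_head=12, n_embd=1536)
    config.save_pretrained("nanogpt-config")  # writes nanogpt-config/config.json
    reloaded = NanoGPTConfig.from_pretrained("nanogpt-config")
    print(reloaded.n_layer, reloaded.n_head, reloaded.n_embd)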