{
  "activation": "gelu",
  "bias": false,
  "d_model": 2048,
  "dff": null,
  "dropout_rate": 0.0,
  "max_block_size": 1024,
  "n_heads_ra": 16,
  "n_heads_sa": 16,
  "n_layers": 24,
  "norm_first": true,
  "pos_enc_type": "RoPE",
  "ra_kwargs": {
    "n_kv_heads": 8,
    "n_relations": 64,
    "rel_activation": "identity",
    "rel_proj_dim": 16,
    "symmetric_rels": false
  },
  "ra_type": "relational_attention",
  "sa_kwargs": {
    "n_kv_heads": 8
  },
  "share_attn_params": false,
  "symbol_retrieval": "symbolic_attention",
  "symbol_retrieval_kwargs": {
    "d_model": 2048,
    "n_heads": 8,
    "n_symbols": 2048,
    "trainable_symbols": false
  },
  "symbol_retriever_config": {
    "shared_symbol_retriever": true,
    "weight_tie_symbol_library": false
  },
  "vocab_size": 50304
}