from transformers.models.roberta.modeling_roberta import RobertaConfig
class JargonConfig(RobertaConfig):
    """Configuration for the Jargon model, a RoBERTa variant.

    Extends :class:`RobertaConfig` with extra hyper-parameters. The
    ``compress_layer`` / ``shared_*_compressed`` / ``compressed`` /
    ``freeze_compress`` group presumably controls Linformer-style key/value
    compression (names match fairseq's Linformer implementation — confirm
    against the modeling code), while the ``encoder_*`` / ``quant_noise_*``
    group mirrors fairseq encoder options.

    Note: standard fields such as ``vocab_size``, ``max_position_embeddings``,
    ``layer_norm_eps``, ``hidden_act``, ``position_embedding_type`` and
    ``output_hidden_states`` are declared here with their own defaults and
    assigned *after* ``super().__init__(**kwargs)``, so they override whatever
    the parent would set from ``kwargs`` defaults. All remaining keyword
    arguments are forwarded to :class:`RobertaConfig` unchanged.
    """

    model_type = "jargon"

    def __init__(
        self,
        compress_layer=1,
        shared_layer_kv_compressed=1,
        shared_kv_compressed=0,
        max_positions=512,
        max_position_embeddings=512,
        compressed=4,
        vocab_size=30522,
        freeze_compress=0,
        embed_dim=768,
        num_heads=16,
        dim_feedforward=4096,
        dropout=0.1,
        activation="relu",
        layer_norm_eps=1e-05,
        self_attention=True,
        encoder_decoder_attention=False,
        bias=True,
        q_noise=0,
        qn_block_size=8,
        add_bias_kv=False,
        add_zero_attn=False,
        num_layers=12,
        untie_weights_roberta=False,
        layernorm_embedding=False,
        encoder_normalize_before=False,
        encoder_embed_dim=768,
        encoder_attention_heads=12,
        quant_noise_pq=0.0,
        quant_noise_pq_block_size=8,
        quant_noise_scalar=0,
        encoder_ffn_embed_dim=4096,
        add_pooling_layer=False,
        intermediate_size=4096,
        intermediate_act_fn="relu",
        hidden_act="relu",
        output_hidden_states=False,
        position_embedding_type="learned",
        **kwargs,
    ):
        # Let the parent consume any remaining RoBERTa/PretrainedConfig kwargs
        # first; explicit parameters below then take precedence.
        super().__init__(**kwargs)

        # Pooling / compression options (Linformer-style — TODO confirm).
        self.add_pooling_layer = add_pooling_layer
        self.compress_layer = compress_layer
        self.shared_layer_kv_compressed = shared_layer_kv_compressed
        self.shared_kv_compressed = shared_kv_compressed
        self.compressed = compressed
        self.freeze_compress = freeze_compress

        # Position handling.
        self.max_positions = max_positions
        self.max_position_embeddings = max_position_embeddings
        self.position_embedding_type = position_embedding_type

        # Core transformer dimensions.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dim_feedforward = dim_feedforward
        self.num_layers = num_layers
        self.vocab_size = vocab_size
        self.intermediate_size = intermediate_size

        # Activations, regularization, normalization.
        self.dropout = dropout
        self.activation = activation
        self.intermediate_act_fn = intermediate_act_fn
        self.hidden_act = hidden_act
        self.layer_norm_eps = layer_norm_eps
        self.layernorm_embedding = layernorm_embedding
        self.encoder_normalize_before = encoder_normalize_before

        # Attention behavior flags.
        self.self_attention = self_attention
        self.encoder_decoder_attention = encoder_decoder_attention
        self.bias = bias
        self.add_bias_kv = add_bias_kv
        self.add_zero_attn = add_zero_attn

        # Quant-noise options (fairseq-style naming — TODO confirm usage).
        self.q_noise = q_noise
        self.qn_block_size = qn_block_size
        self.quant_noise_pq = quant_noise_pq
        self.quant_noise_pq_block_size = quant_noise_pq_block_size
        self.quant_noise_scalar = quant_noise_scalar

        # fairseq-style encoder dimensions.
        self.encoder_embed_dim = encoder_embed_dim
        self.encoder_attention_heads = encoder_attention_heads
        self.encoder_ffn_embed_dim = encoder_ffn_embed_dim

        # Misc.
        self.untie_weights_roberta = untie_weights_roberta
        self.output_hidden_states = output_hidden_states