shmelev committed on
Commit
8d30b42
·
1 Parent(s): 976c4b0

commit from alexey

Browse files
Files changed (1) hide show
  1. config.json +35 -54
config.json CHANGED
@@ -1,55 +1,36 @@
1
  {
2
- "model_path": "/mnt/10tb/home/shmelev/dnalm/downstream_tasks/APARENT/bert_base_sparse_rope_4096_bs256_lr_5e-05_wd0.01_fp16_from_425k/",
3
- "log_interval": 250,
4
- "valid_interval": 1000,
5
- "save_interval": null,
6
- "save_best": true,
7
- "use_generate_on_valid": false,
8
- "init_checkpoint": "/mnt/10tb/home/shmelev/dnalm/downstream_tasks/APARENT/bert_base_sparse_rope_4096_bs256_lr_5e-05_wd0.01_fp16_from_425k/model_best_from_s3.pth",
9
- "skip_used_data": false,
10
- "reset_lr": true,
11
- "reset_iteration": true,
12
- "reset_optimizer": true,
13
- "lr": 5e-05,
14
- "batch_size": 32,
15
- "iters": 500000,
16
- "gradient_accumulation_steps": 1,
17
- "fp16": true,
18
- "fp16_allreduce": false,
19
- "apex_opt_lvl": "O2",
20
- "min_loss_scale": null,
21
- "max_loss_scale": 16777216,
22
- "clip_grad_norm": null,
23
- "clip_grad_value": null,
24
- "early_stopping_patience": null,
25
- "lr_scheduler": "constant_with_warmup",
26
- "num_warmup_steps": 1500,
27
- "num_training_steps": null,
28
- "use_lr_drop": false,
29
- "lr_drop_factor": 0.1,
30
- "lr_drop_patience": 10,
31
- "lr_drop_threshold": 0.0001,
32
- "lr_drop_threshold_mode": "rel",
33
- "lr_drop_cooldown": 0,
34
- "lr_drop_min_lr": 0.0,
35
- "lr_drop_eps": 1e-08,
36
- "optimize_metric": "pearsonr2",
37
- "optimize_mode": "max",
38
- "train_csv": "/mnt/10tb/home/shmelev/dnalm/downstream_tasks/APARENT/dataset_itself/APARENT_train.csv",
39
- "test_csv": "/mnt/10tb/home/shmelev/dnalm/downstream_tasks/APARENT/dataset_itself/APARENT_test.csv",
40
- "seed": 42,
41
- "input_seq_len": 256,
42
- "data_n_workers": 2,
43
- "model_cfg": "/mnt/10tb/home/shmelev/dnalm/data/configs/L12-H768-A12-V32k-L4096-preln-sparse-rope.json",
44
- "model_cls": "src.gena_lm.modeling_bert:BertForAPARENTSequenceRegression",
45
- "tokenizer": "/mnt/10tb/home/shmelev/dnalm/data/tokenizers/human/BPE_32k/",
46
- "optimizer": "AdamW",
47
- "weight_decay": 0.0001,
48
- "ENV": {
49
- "CUDA_VISIBLE_DEVICES": "0"
50
- },
51
- "HVD_INIT": true,
52
- "HVD_SIZE": 1,
53
- "MACHINE": "bio-protein",
54
- "COMMIT": "2513d0c4ee891e7bcd224b9c65e88e27e0fd1892"
55
- }
 
1
  {
2
+ "architectures": [
3
+ "BertForMaskedLM"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "gradient_checkpointing": false,
7
+ "hidden_act": "gelu",
8
+ "hidden_dropout_prob": 0.1,
9
+ "hidden_size": 768,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 3072,
12
+ "layer_norm_eps": 1e-12,
13
+ "max_position_embeddings": 4096,
14
+ "model_type": "bert",
15
+ "num_attention_heads": 12,
16
+ "num_hidden_layers": 12,
17
+ "pad_token_id": 3,
18
+ "pre_layer_norm": true,
19
+ "last_layer_norm": false,
20
+ "position_embedding_type": "rotary",
21
+ "rotary_base": 10000,
22
+ "rotary_dim": 32,
23
+ "transformers_version": "4.6.0.dev0",
24
+ "type_vocab_size": 2,
25
+ "use_cache": true,
26
+ "vocab_size": 32000,
27
+ "sparse_config_cls": "deepspeed.ops.sparse_attention:BigBirdSparsityConfig",
28
+ "sparse_attention": {
29
+ "num_heads": 12,
30
+ "block": 64,
31
+ "different_layout_per_head": true,
32
+ "num_sliding_window_blocks": 3,
33
+ "num_global_blocks": 2,
34
+ "num_random_blocks": 3
35
+ }
36
+ }