Graph Machine Learning
AnemoI
English
anaprietonem committed on
Commit
598f35c
·
verified ·
1 Parent(s): 7976552

Upload 2 files

Browse files
Files changed (2) hide show
  1. config_finetuning.yaml +350 -277
  2. config_pretraining.yaml +369 -304
config_finetuning.yaml CHANGED
@@ -4,8 +4,66 @@ data:
4
  frequency: 6h
5
  timestep: 6h
6
  forcing:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  - cos_latitude
8
- - cos_longitude
9
  - sin_latitude
10
  - sin_longitude
11
  - cos_julian_day
@@ -14,136 +72,41 @@ data:
14
  - sin_local_time
15
  - insolation
16
  - lsm
17
- - sdor
18
- - slor
19
- - z
20
- diagnostic:
21
- - tp
22
- - cp
23
- - sf
24
  - tcc
 
25
  - hcc
26
  - lcc
27
- - mcc
28
- - ro
29
- - ssrd
30
- - strd
31
- - 100u
32
- - 100v
33
- remapped: null
34
- normalizer:
35
- default: mean-std
36
- remap:
37
- cp: tp
38
- sf: tp
39
- std:
40
- - tp
41
- - cp
42
- - sf
43
- - ro
44
- - tcw
45
- - ssrd
46
- - q_50
47
- - q_100
48
- - q_150
49
- - q_200
50
- - q_250
51
- - q_300
52
- - q_400
53
- - q_500
54
- - q_600
55
- - q_700
56
- - q_850
57
- - q_925
58
- - q_1000
59
- min-max: null
60
- max:
61
- - sdor
62
- - slor
63
- - z
64
- none:
65
- - cos_latitude
66
- - cos_longitude
67
- - sin_latitude
68
- - sin_longitude
69
- - cos_julian_day
70
- - cos_local_time
71
- - sin_julian_day
72
- - sin_local_time
73
- - insolation
74
- - lsm
75
- - tcc
76
- - mcc
77
- - hcc
78
- - lcc
79
- - swvl1
80
- - swvl2
81
  imputer:
82
  default: none
 
 
 
 
 
 
 
83
  remapper:
84
  default: none
85
  processors:
 
 
 
 
86
  normalizer:
87
  _target_: anemoi.models.preprocessing.normalizer.InputNormalizer
88
- _convert_: all
89
- config:
90
- default: mean-std
91
- remap:
92
- cp: tp
93
- sf: tp
94
- std:
95
- - tp
96
- - cp
97
- - sf
98
- - ro
99
- - tcw
100
- - ssrd
101
- - q_50
102
- - q_100
103
- - q_150
104
- - q_200
105
- - q_250
106
- - q_300
107
- - q_400
108
- - q_500
109
- - q_600
110
- - q_700
111
- - q_850
112
- - q_925
113
- - q_1000
114
- min-max: null
115
- max:
116
- - sdor
117
- - slor
118
- - z
119
- none:
120
- - cos_latitude
121
- - cos_longitude
122
- - sin_latitude
123
- - sin_longitude
124
- - cos_julian_day
125
- - cos_local_time
126
- - sin_julian_day
127
- - sin_local_time
128
- - insolation
129
- - lsm
130
- - tcc
131
- - mcc
132
- - hcc
133
- - lcc
134
- - swvl1
135
- - swvl2
136
- num_features: 115
137
-
138
  dataloader:
139
  prefetch_factor: 2
140
- pin_memory: True
141
- read_group_size: 4
142
  num_workers:
143
  training: 8
144
  validation: 8
145
- test: 8
146
- predict: 8
147
  batch_size:
148
  training: 1
149
  validation: 1
@@ -154,118 +117,165 @@ dataloader:
154
  validation: 10
155
  test: 20
156
  predict: 20
 
 
 
157
  dataset: ${hardware.paths.data}/${hardware.files.dataset}
158
- land_dataset: ${hardware.paths.data}/${hardware.files.dataset_land}
159
- land_variables: [100u, 100v, swvl1, swvl2, stl1, stl2, tcc, lcc, mcc, hcc, sf, ro, strd, ssrd]
160
  training:
161
  dataset:
162
- - dataset: ${dataloader.dataset}
163
- start: null
164
- end: 2022
165
- frequency: ${data.frequency}
166
- drop: []
167
- - dataset: ${dataloader.land_dataset}
168
- start: null
169
- end: 2022
170
- frequency: ${data.frequency}
171
- select: ${dataloader.land_variables}
172
  start: null
173
  end: 2022
174
  drop: []
175
  validation:
176
  dataset:
177
- - dataset: ${dataloader.dataset}
178
- start: 2022
179
- end: 2022
180
- frequency: ${data.frequency}
181
- drop: []
182
- - dataset: ${dataloader.land_dataset}
183
- start: 2022
184
- end: 2022
185
- frequency: ${data.frequency}
186
- select: ${dataloader.land_variables}
187
  start: 2022
188
- end: 2022
 
 
 
 
 
 
 
 
 
189
  drop: []
190
- validation_rollout: 1
191
-
192
  diagnostics:
193
  plot:
194
- asynchronous: False
195
- datashader: True
196
  frequency:
197
  batch: 750
198
- epoch: 10
199
- parameters: [tp]
 
 
 
 
 
 
 
 
 
 
200
  sample_idx: 0
201
- precip_and_related_fields: [tp, cp]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
  callbacks: []
203
- enabled: True
204
- scatter: False
205
- mode: asyncio
206
- callbacks: {}
207
  benchmark_profiler:
208
  memory:
209
- enabled: True
210
  steps: 5
211
  warmup: 2
212
- extra_plots: False
213
- trace_rank0_only: False
214
  time:
215
- enabled: True
216
- verbose: False
217
  speed:
218
- enabled: True
219
  system:
220
- enabled: True
221
  model_summary:
222
- enabled: True
223
  snapshot:
224
- enabled: True
225
  steps: 4
226
  warmup: 0
227
  debug:
228
- anomaly_detection: False
229
- profiler: False
230
- enable_checkpointing: True
231
  checkpoint:
232
  every_n_minutes:
233
  save_frequency: 30
234
  num_models_saved: 3
235
  every_n_epochs:
236
  save_frequency: 1
237
- num_models_saved: 3
238
  every_n_train_steps:
239
  save_frequency: null
240
  num_models_saved: 0
241
  log:
242
  wandb:
243
- enabled: False
 
 
 
 
 
 
244
  tensorboard:
245
- enabled: False
246
  mlflow:
247
- enabled: False
 
 
 
 
 
 
 
 
 
 
 
 
 
248
  interval: 100
249
- enable_progress_bar: True
250
- print_memory_summary: False
251
-
252
  hardware:
253
  paths:
254
  data: ${oc.decode:${oc.env:DATASETS_PATH}}
255
- output: ${oc.decode:${oc.env:OUTPUT_DIR}}
256
  logs:
257
- base: ${hardware.paths.output}/logs
258
- wandb: ${hardware.paths.output}/logs/wandb
259
- mlflow: ${hardware.paths.output}/logs/mlflow
260
- tensorboard: ${hardware.paths.output}/logs/tensorboard
261
- checkpoints: ${hardware.paths.output}/checkpoint/
262
- plots: ${hardware.paths.output}/plots/
263
- profiler: ${hardware.paths.output}/profiler/
264
- graph: ${hardware.paths.output}/graphs/
265
  files:
266
- dataset: aifs-od-an-oper-0001-mars-n320-2016-2023-6h-v6.zarr
267
- dataset_land: aifs-od-an-oper-0001-mars-n320-2016-2023-6h-v1-land.zarr
268
- graph: graph_enc_proc_dec_n320.pt
269
  checkpoint:
270
  every_n_epochs: aifs-by_epoch-epoch_{epoch:03d}-val_wmse_{val_wmse:.3e}
271
  every_n_train_steps: aifs-by_step-epoch_{epoch:03d}-step_{step:06d}
@@ -275,9 +285,8 @@ hardware:
275
  num_gpus_per_node: 4
276
  num_nodes: 16
277
  num_gpus_per_model: 4
278
-
279
  graph:
280
- overwrite: True
281
  data: data
282
  hidden: hidden
283
  nodes:
@@ -285,44 +294,34 @@ graph:
285
  node_builder:
286
  _target_: anemoi.graphs.nodes.ZarrDatasetNodes
287
  dataset: ${dataloader.dataset}
288
- attributes:
289
- area_weight:
290
- _target_: anemoi.graphs.nodes.attributes.AreaWeights
291
- norm: unit-max
292
  hidden:
293
  node_builder:
294
  _target_: anemoi.graphs.nodes.ReducedGaussianGridNodes
295
  grid: o96
296
  edges:
297
- - source_name: data
298
- target_name: hidden
299
- edge_builder:
300
- _target_: anemoi.graphs.edges.CutOffEdges
301
- cutoff_factor: 0.6
302
- attributes:
303
- edge_length:
304
- _target_: anemoi.graphs.edges.attributes.EdgeLength
305
- norm: unit-std
306
- edge_dirs:
307
- _target_: anemoi.graphs.edges.attributes.EdgeDirection
308
- norm: unit-std
309
- - source_name: hidden
310
- target_name: data
311
- edge_builder:
312
- _target_: anemoi.graphs.edges.KNNEdges
313
- num_nearest_neighbours: 3
314
- attributes:
315
- edge_length:
316
- _target_: anemoi.graphs.edges.attributes.EdgeLength
317
- norm: unit-std
318
- edge_dirs:
319
- _target_: anemoi.graphs.edges.attributes.EdgeDirection
320
- norm: unit-std
321
  attributes:
322
  nodes:
323
  area_weight:
324
- _target_: anemoi.graphs.nodes.attributes.AreaWeights
325
  norm: unit-max
 
326
  edges:
327
  edge_length:
328
  _target_: anemoi.graphs.edges.attributes.EdgeLength
@@ -330,89 +329,138 @@ graph:
330
  edge_dirs:
331
  _target_: anemoi.graphs.edges.attributes.EdgeDirection
332
  norm: unit-std
333
-
334
  model:
335
  activation: GELU
336
  num_channels: 1024
 
 
337
  model:
338
  _target_: anemoi.models.models.encoder_processor_decoder.AnemoiModelEncProcDec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
339
  processor:
340
  _target_: anemoi.models.layers.processor.TransformerProcessor
341
- _convert_: all
342
- activation: GELU
343
  num_layers: 16
344
  num_chunks: 2
345
  mlp_hidden_ratio: 4
346
  num_heads: 16
347
  window_size: 1120
348
  dropout_p: 0.0
 
 
 
 
 
349
  encoder:
350
  _target_: anemoi.models.layers.mapper.GraphTransformerForwardMapper
351
- _convert_: all
352
- trainable_size: 8
353
- sub_graph_edge_attributes: [edge_length, edge_dirs]
354
- activation: GELU
355
  num_chunks: 1
356
  mlp_hidden_ratio: 4
357
  num_heads: 16
 
 
358
  decoder:
359
  _target_: anemoi.models.layers.mapper.GraphTransformerBackwardMapper
360
- _convert_: all
361
- trainable_size: 8
362
- sub_graph_edge_attributes: [edge_length, edge_dirs]
363
- activation: GELU
364
  num_chunks: 1
365
  mlp_hidden_ratio: 4
366
  num_heads: 16
 
 
 
367
  trainable_parameters:
368
  data: 8
369
  hidden: 8
370
  data2hidden: 8
371
  hidden2data: 8
372
  attributes:
373
- edges: [edge_length, edge_dirs]
 
 
374
  nodes: []
375
- node_loss_weight: area_weight
376
  bounding:
377
- - _target_: anemoi.models.layers.bounding.ReluBounding
378
- variables:
379
- - tp
380
- - ro
381
- - tcw
382
- - ssrd
383
- - q_50
384
- - q_100
385
- - q_150
386
- - q_200
387
- - q_250
388
- - q_300
389
- - q_400
390
- - q_500
391
- - q_600
392
- - q_700
393
- - q_850
394
- - q_925
395
- - q_1000
396
- - _target_: anemoi.models.layers.bounding.HardtanhBounding
397
- variables: [tcc, swvl1, swvl2]
398
- min_val: 0
399
- max_val: 1
400
- - _target_: anemoi.models.layers.bounding.FractionBounding
401
- variables: [cp, sf]
402
- min_val: 0
403
- max_val: 1
404
- total_var: tp
405
- - _target_: anemoi.models.layers.bounding.FractionBounding
406
- variables: [lcc, mcc, hcc]
407
- min_val: 0
408
- max_val: 1
409
- total_var: tcc
410
-
 
 
 
 
 
 
 
 
411
  training:
412
  run_id: null
413
  fork_run_id: ${oc.decode:${oc.env:PRETRAINING_RUN_ID}}
414
- load_weights_only: True
415
- deterministic: False
 
416
  precision: 16-mixed
417
  multistep_input: 2
418
  accum_grad_batches: 1
@@ -421,20 +469,35 @@ training:
421
  val: 32.0
422
  algorithm: value
423
  swa:
424
- enabled: False
425
  lr: 0.0001
426
- zero_optimizer: False
 
 
 
 
 
 
 
 
 
 
 
427
  training_loss:
428
  _target_: anemoi.training.losses.mse.WeightedMSELoss
429
  scalars:
430
- - variable
431
- - loss_weights_mask
432
- ignore_nans: False
433
- loss_gradient_scaling: False
434
  validation_metrics:
435
- - _target_: anemoi.training.losses.mse.WeightedMSELoss
436
- scalars: []
437
- ignore_nans: True
 
 
 
 
 
438
  rollout:
439
  start: 1
440
  epoch_increment: 1
@@ -442,9 +505,10 @@ training:
442
  max_epochs: 13
443
  max_steps: 150000
444
  lr:
445
- rate: 8.0e-7
 
446
  iterations: 7900
447
- min: 3.0e-7
448
  warmup_t: 100
449
  variable_loss_scaling:
450
  default: 1
@@ -464,20 +528,29 @@ training:
464
  2d: 0.5
465
  tp: 0.025
466
  cp: 0.0025
467
- ro: 0.005
468
  sf: 0.025
469
  tcc: 0.1
470
  mcc: 0.1
471
  lcc: 0.1
472
  hcc: 0.1
473
- swvl2: 2.0
474
- swvl1: 1.0
475
  stl2: 10
476
  stl1: 1
477
  ssrd: 0.05
478
  strd: 0.1
479
- metrics: [z_500, t_850, u_850, v_850]
 
 
 
 
480
  pressure_level_scaler:
481
  _target_: anemoi.training.data.scaling.ReluPressureLevelScaler
482
  minimum: 0.2
483
- slope: 0.001
 
 
 
 
 
 
4
  frequency: 6h
5
  timestep: 6h
6
  forcing:
7
+ - cos_latitude
8
+ - cos_longitude
9
+ - sin_latitude
10
+ - sin_longitude
11
+ - cos_julian_day
12
+ - cos_local_time
13
+ - sin_julian_day
14
+ - sin_local_time
15
+ - insolation
16
+ - lsm
17
+ - sdor
18
+ - slor
19
+ - z
20
+ diagnostic:
21
+ - tp
22
+ - cp
23
+ - sf
24
+ - tcc
25
+ - hcc
26
+ - lcc
27
+ - mcc
28
+ - ro
29
+ - ssrd
30
+ - strd
31
+ - 100u
32
+ - 100v
33
+ remapped: null
34
+ normalizer:
35
+ default: mean-std
36
+ remap:
37
+ cp: tp
38
+ sf: tp
39
+ std:
40
+ - tp
41
+ - cp
42
+ - sf
43
+ - ro
44
+ - tcw
45
+ - ssrd
46
+ - q_50
47
+ - q_100
48
+ - q_150
49
+ - q_200
50
+ - q_250
51
+ - q_300
52
+ - q_400
53
+ - q_500
54
+ - q_600
55
+ - q_700
56
+ - q_850
57
+ - q_925
58
+ - q_1000
59
+ min-max: null
60
+ max:
61
+ - sdor
62
+ - slor
63
+ - z
64
+ none:
65
  - cos_latitude
66
+ - cos_longitude
67
  - sin_latitude
68
  - sin_longitude
69
  - cos_julian_day
 
72
  - sin_local_time
73
  - insolation
74
  - lsm
 
 
 
 
 
 
 
75
  - tcc
76
+ - mcc
77
  - hcc
78
  - lcc
79
+ - swvl1
80
+ - swvl2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  imputer:
82
  default: none
83
+ minimum:
84
+ - swvl1
85
+ - swvl2
86
+ - ro
87
+ mean:
88
+ - stl1
89
+ - stl2
90
  remapper:
91
  default: none
92
  processors:
93
+ imputer:
94
+ _target_: anemoi.models.preprocessing.imputer.InputImputer
95
+ _convert_: all
96
+ config: ${data.imputer}
97
  normalizer:
98
  _target_: anemoi.models.preprocessing.normalizer.InputNormalizer
99
+ config: ${data.normalizer}
100
+ num_features: null
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  dataloader:
102
  prefetch_factor: 2
103
+ pin_memory: true
104
+ read_group_size: ${hardware.num_gpus_per_model}
105
  num_workers:
106
  training: 8
107
  validation: 8
108
+ test: 1
109
+ predict: 1
110
  batch_size:
111
  training: 1
112
  validation: 1
 
117
  validation: 10
118
  test: 20
119
  predict: 20
120
+ grid_indices:
121
+ _target_: anemoi.training.data.grid_indices.FullGrid
122
+ nodes_name: ${graph.data}
123
  dataset: ${hardware.paths.data}/${hardware.files.dataset}
 
 
124
  training:
125
  dataset:
126
+ - dataset: ${hardware.paths.data}/${hardware.files.dataset}
127
+ start: null
128
+ end: 2022
129
+ frequency: ${data.frequency}
 
 
 
 
 
 
130
  start: null
131
  end: 2022
132
  drop: []
133
  validation:
134
  dataset:
135
+ - dataset: ${hardware.paths.data}/${hardware.files.dataset}
136
+ start: 2022
137
+ end: 2024
138
+ frequency: ${data.frequency}
 
 
 
 
 
 
139
  start: 2022
140
+ end: 2024
141
+ drop: []
142
+ test:
143
+ dataset:
144
+ - dataset: ${hardware.paths.data}/${hardware.files.dataset}
145
+ start: 2022
146
+ end: null
147
+ frequency: ${data.frequency}
148
+ start: 2022
149
+ end: null
150
  drop: []
 
 
151
  diagnostics:
152
  plot:
153
+ asynchronous: true
154
+ datashader: true
155
  frequency:
156
  batch: 750
157
+ epoch: 5
158
+ parameters:
159
+ - z_500
160
+ - t_850
161
+ - u_850
162
+ - v_850
163
+ - 2t
164
+ - 10u
165
+ - 10v
166
+ - sp
167
+ - tp
168
+ - cp
169
  sample_idx: 0
170
+ precip_and_related_fields:
171
+ - tp
172
+ - cp
173
+ colormaps:
174
+ default:
175
+ _target_: anemoi.training.utils.custom_colormaps.MatplotlibColormap
176
+ name: viridis
177
+ error:
178
+ _target_: anemoi.training.utils.custom_colormaps.MatplotlibColormap
179
+ name: bwr
180
+ precip:
181
+ _target_: anemoi.training.utils.custom_colormaps.MatplotlibColormapClevels
182
+ clevels:
183
+ - '#ffffff'
184
+ - '#04e9e7'
185
+ - '#019ff4'
186
+ - '#0300f4'
187
+ - '#02fd02'
188
+ - '#01c501'
189
+ - '#008e00'
190
+ - '#fdf802'
191
+ - '#e5bc00'
192
+ - '#fd9500'
193
+ - '#fd0000'
194
+ - '#d40000'
195
+ - '#bc0000'
196
+ - '#f800fd'
197
+ variables: ${diagnostics.plot.precip_and_related_fields}
198
  callbacks: []
199
+ callbacks: []
 
 
 
200
  benchmark_profiler:
201
  memory:
202
+ enabled: true
203
  steps: 5
204
  warmup: 2
205
+ extra_plots: false
206
+ trace_rank0_only: false
207
  time:
208
+ enabled: true
209
+ verbose: false
210
  speed:
211
+ enabled: true
212
  system:
213
+ enabled: true
214
  model_summary:
215
+ enabled: true
216
  snapshot:
217
+ enabled: true
218
  steps: 4
219
  warmup: 0
220
  debug:
221
+ anomaly_detection: false
222
+ profiler: false
223
+ enable_checkpointing: true
224
  checkpoint:
225
  every_n_minutes:
226
  save_frequency: 30
227
  num_models_saved: 3
228
  every_n_epochs:
229
  save_frequency: 1
230
+ num_models_saved: -1
231
  every_n_train_steps:
232
  save_frequency: null
233
  num_models_saved: 0
234
  log:
235
  wandb:
236
+ enabled: false
237
+ offline: false
238
+ log_model: false
239
+ project: Anemoi
240
+ entity: ???
241
+ gradients: false
242
+ parameters: false
243
  tensorboard:
244
+ enabled: false
245
  mlflow:
246
+ enabled: false
247
+ offline: false
248
+ authentication: false
249
+ log_model: false
250
+ tracking_uri: ???
251
+ experiment_name: ???
252
+ project_name: ???
253
+ system: true
254
+ terminal: true
255
+ run_name: null
256
+ on_resume_create_child: true
257
+ expand_hyperparams:
258
+ - config
259
+ http_max_retries: 35
260
  interval: 100
261
+ enable_progress_bar: true
262
+ print_memory_summary: false
 
263
  hardware:
264
  paths:
265
  data: ${oc.decode:${oc.env:DATASETS_PATH}}
266
+ output: ${oc.decode:${oc.env:OUTPUT_PATH}}
267
  logs:
268
+ base: ${hardware.paths.output}logs/
269
+ wandb: ${hardware.paths.logs.base}
270
+ mlflow: ${hardware.paths.logs.base}mlflow/
271
+ tensorboard: ${hardware.paths.logs.base}tensorboard/
272
+ checkpoints: ${hardware.paths.output}checkpoint/
273
+ plots: ${hardware.paths.output}plots/
274
+ profiler: ${hardware.paths.output}profiler/
275
+ graph: ${hardware.paths.output}graphs/
276
  files:
277
+ dataset: aifs-ea-an-oper-0001-mars-${data.resolution}-1979-2024-6h-v1-aifs-single-v1.zarr
278
+ graph: graph_enc_proc_dec_${data.resolution}.pt
 
279
  checkpoint:
280
  every_n_epochs: aifs-by_epoch-epoch_{epoch:03d}-val_wmse_{val_wmse:.3e}
281
  every_n_train_steps: aifs-by_step-epoch_{epoch:03d}-step_{step:06d}
 
285
  num_gpus_per_node: 4
286
  num_nodes: 16
287
  num_gpus_per_model: 4
 
288
  graph:
289
+ overwrite: true
290
  data: data
291
  hidden: hidden
292
  nodes:
 
294
  node_builder:
295
  _target_: anemoi.graphs.nodes.ZarrDatasetNodes
296
  dataset: ${dataloader.dataset}
297
+ attributes: ${graph.attributes.nodes}
 
 
 
298
  hidden:
299
  node_builder:
300
  _target_: anemoi.graphs.nodes.ReducedGaussianGridNodes
301
  grid: o96
302
  edges:
303
+ - source_name: ${graph.data}
304
+ target_name: ${graph.hidden}
305
+ edge_builders:
306
+ - _target_: anemoi.graphs.edges.CutOffEdges
307
+ cutoff_factor: 0.6
308
+ source_mask_attr_name: null
309
+ target_mask_attr_name: null
310
+ attributes: ${graph.attributes.edges}
311
+ - source_name: ${graph.hidden}
312
+ target_name: ${graph.data}
313
+ edge_builders:
314
+ - _target_: anemoi.graphs.edges.KNNEdges
315
+ num_nearest_neighbours: 3
316
+ source_mask_attr_name: null
317
+ target_mask_attr_name: null
318
+ attributes: ${graph.attributes.edges}
 
 
 
 
 
 
 
 
319
  attributes:
320
  nodes:
321
  area_weight:
322
+ _target_: anemoi.graphs.nodes.attributes.SphericalAreaWeights
323
  norm: unit-max
324
+ fill_value: 0
325
  edges:
326
  edge_length:
327
  _target_: anemoi.graphs.edges.attributes.EdgeLength
 
329
  edge_dirs:
330
  _target_: anemoi.graphs.edges.attributes.EdgeDirection
331
  norm: unit-std
332
+ post_processors: []
333
  model:
334
  activation: GELU
335
  num_channels: 1024
336
+ cpu_offload: false
337
+ output_mask: null
338
  model:
339
  _target_: anemoi.models.models.encoder_processor_decoder.AnemoiModelEncProcDec
340
+ layer_kernels:
341
+ processor:
342
+ LayerNorm:
343
+ _target_: torch.nn.LayerNorm
344
+ _partial_: true
345
+ Linear:
346
+ _target_: torch.nn.Linear
347
+ _partial_: true
348
+ QueryNorm:
349
+ _target_: anemoi.models.layers.normalization.AutocastLayerNorm
350
+ _partial_: true
351
+ bias: false
352
+ KeyNorm:
353
+ _target_: anemoi.models.layers.normalization.AutocastLayerNorm
354
+ _partial_: true
355
+ bias: false
356
+ encoder:
357
+ LayerNorm:
358
+ _target_: torch.nn.LayerNorm
359
+ _partial_: true
360
+ Linear:
361
+ _target_: torch.nn.Linear
362
+ _partial_: true
363
+ decoder:
364
+ LayerNorm:
365
+ _target_: torch.nn.LayerNorm
366
+ _partial_: true
367
+ Linear:
368
+ _target_: torch.nn.Linear
369
+ _partial_: true
370
  processor:
371
  _target_: anemoi.models.layers.processor.TransformerProcessor
372
+ activation: ${model.activation}
 
373
  num_layers: 16
374
  num_chunks: 2
375
  mlp_hidden_ratio: 4
376
  num_heads: 16
377
  window_size: 1120
378
  dropout_p: 0.0
379
+ attention_implementation: flash_attention
380
+ qk_norm: false
381
+ softcap: 0.0
382
+ use_alibi_slopes: false
383
+ cpu_offload: ${model.cpu_offload}
384
  encoder:
385
  _target_: anemoi.models.layers.mapper.GraphTransformerForwardMapper
386
+ trainable_size: ${model.trainable_parameters.data2hidden}
387
+ sub_graph_edge_attributes: ${model.attributes.edges}
388
+ activation: ${model.activation}
 
389
  num_chunks: 1
390
  mlp_hidden_ratio: 4
391
  num_heads: 16
392
+ qk_norm: false
393
+ cpu_offload: ${model.cpu_offload}
394
  decoder:
395
  _target_: anemoi.models.layers.mapper.GraphTransformerBackwardMapper
396
+ trainable_size: ${model.trainable_parameters.hidden2data}
397
+ sub_graph_edge_attributes: ${model.attributes.edges}
398
+ activation: ${model.activation}
 
399
  num_chunks: 1
400
  mlp_hidden_ratio: 4
401
  num_heads: 16
402
+ initialise_data_extractor_zero: false
403
+ qk_norm: false
404
+ cpu_offload: ${model.cpu_offload}
405
  trainable_parameters:
406
  data: 8
407
  hidden: 8
408
  data2hidden: 8
409
  hidden2data: 8
410
  attributes:
411
+ edges:
412
+ - edge_length
413
+ - edge_dirs
414
  nodes: []
 
415
  bounding:
416
+ - _target_: anemoi.models.layers.bounding.ReluBounding
417
+ variables:
418
+ - tp
419
+ - ro
420
+ - tcw
421
+ - ssrd
422
+ - ro
423
+ - q_50
424
+ - q_100
425
+ - q_150
426
+ - q_200
427
+ - q_250
428
+ - q_300
429
+ - q_400
430
+ - q_500
431
+ - q_600
432
+ - q_700
433
+ - q_850
434
+ - q_925
435
+ - q_1000
436
+ - _target_: anemoi.models.layers.bounding.HardtanhBounding
437
+ variables:
438
+ - tcc
439
+ - swvl1
440
+ - swvl2
441
+ min_val: 0
442
+ max_val: 1
443
+ - _target_: anemoi.models.layers.bounding.FractionBounding
444
+ variables:
445
+ - cp
446
+ - sf
447
+ min_val: 0
448
+ max_val: 1
449
+ total_var: tp
450
+ - _target_: anemoi.models.layers.bounding.FractionBounding
451
+ variables:
452
+ - lcc
453
+ - mcc
454
+ - hcc
455
+ min_val: 0
456
+ max_val: 1
457
+ total_var: tcc
458
  training:
459
  run_id: null
460
  fork_run_id: ${oc.decode:${oc.env:PRETRAINING_RUN_ID}}
461
+ transfer_learning: false
462
+ load_weights_only: true
463
+ deterministic: false
464
  precision: 16-mixed
465
  multistep_input: 2
466
  accum_grad_batches: 1
 
469
  val: 32.0
470
  algorithm: value
471
  swa:
472
+ enabled: false
473
  lr: 0.0001
474
+ optimizer:
475
+ zero: false
476
+ kwargs:
477
+ betas:
478
+ - 0.9
479
+ - 0.95
480
+ model_task: anemoi.training.train.forecaster.GraphForecaster
481
+ strategy:
482
+ _target_: anemoi.training.distributed.strategy.DDPGroupStrategy
483
+ num_gpus_per_model: ${hardware.num_gpus_per_model}
484
+ read_group_size: ${dataloader.read_group_size}
485
+ loss_gradient_scaling: false
486
  training_loss:
487
  _target_: anemoi.training.losses.mse.WeightedMSELoss
488
  scalars:
489
+ - variable
490
+ - loss_weights_mask
491
+ ignore_nans: false
 
492
  validation_metrics:
493
+ - _target_: anemoi.training.losses.mse.WeightedMSELoss
494
+ scalars: []
495
+ ignore_nans: true
496
+ scale_validation_metrics:
497
+ scalars_to_apply:
498
+ - variable
499
+ metrics:
500
+ - all
501
  rollout:
502
  start: 1
503
  epoch_increment: 1
 
505
  max_epochs: 13
506
  max_steps: 150000
507
  lr:
508
+ warmup: 1000
509
+ rate: 8.0e-07
510
  iterations: 7900
511
+ min: 3.0e-07
512
  warmup_t: 100
513
  variable_loss_scaling:
514
  default: 1
 
528
  2d: 0.5
529
  tp: 0.025
530
  cp: 0.0025
531
+ ro: 0.0025
532
  sf: 0.025
533
  tcc: 0.1
534
  mcc: 0.1
535
  lcc: 0.1
536
  hcc: 0.1
537
+ swvl2: 2
538
+ swvl1: 1
539
  stl2: 10
540
  stl1: 1
541
  ssrd: 0.05
542
  strd: 0.1
543
+ metrics:
544
+ - z_500
545
+ - t_850
546
+ - u_850
547
+ - v_850
548
  pressure_level_scaler:
549
  _target_: anemoi.training.data.scaling.ReluPressureLevelScaler
550
  minimum: 0.2
551
+ slope: 0.001
552
+ node_loss_weights:
553
+ _target_: anemoi.training.losses.nodeweights.GraphNodeAttribute
554
+ target_nodes: ${graph.data}
555
+ node_attribute: area_weight
556
+ submodules_to_freeze: []
config_pretraining.yaml CHANGED
@@ -4,6 +4,64 @@ data:
4
  frequency: 6h
5
  timestep: 6h
6
  forcing:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  - cos_latitude
8
  - cos_longitude
9
  - sin_latitude
@@ -14,136 +72,41 @@ data:
14
  - sin_local_time
15
  - insolation
16
  - lsm
17
- - sdor
18
- - slor
19
- - z
20
- diagnostic:
21
- - tp
22
- - cp
23
- - sf
24
  - tcc
 
25
  - hcc
26
  - lcc
27
- - mcc
28
- - ro
29
- - ssrd
30
- - strd
31
- - 100u
32
- - 100v
33
- remapped: null
34
- normalizer:
35
- default: mean-std
36
- remap:
37
- cp: tp
38
- sf: tp
39
- std:
40
- - tp
41
- - cp
42
- - sf
43
- - ro
44
- - tcw
45
- - ssrd
46
- - q_50
47
- - q_100
48
- - q_150
49
- - q_200
50
- - q_250
51
- - q_300
52
- - q_400
53
- - q_500
54
- - q_600
55
- - q_700
56
- - q_850
57
- - q_925
58
- - q_1000
59
- min-max: null
60
- max:
61
- - sdor
62
- - slor
63
- - z
64
- none:
65
- - cos_latitude
66
- - cos_longitude
67
- - sin_latitude
68
- - sin_longitude
69
- - cos_julian_day
70
- - cos_local_time
71
- - sin_julian_day
72
- - sin_local_time
73
- - insolation
74
- - lsm
75
- - tcc
76
- - mcc
77
- - hcc
78
- - lcc
79
- - swvl1
80
- - swvl2
81
  imputer:
82
  default: none
 
 
 
 
 
 
 
83
  remapper:
84
  default: none
85
  processors:
 
 
 
 
86
  normalizer:
87
  _target_: anemoi.models.preprocessing.normalizer.InputNormalizer
88
- _convert_: all
89
- config:
90
- default: mean-std
91
- remap:
92
- cp: tp
93
- sf: tp
94
- std:
95
- - tp
96
- - cp
97
- - sf
98
- - ro
99
- - tcw
100
- - ssrd
101
- - q_50
102
- - q_100
103
- - q_150
104
- - q_200
105
- - q_250
106
- - q_300
107
- - q_400
108
- - q_500
109
- - q_600
110
- - q_700
111
- - q_850
112
- - q_925
113
- - q_1000
114
- min-max: null
115
- max:
116
- - sdor
117
- - slor
118
- - z
119
- none:
120
- - cos_latitude
121
- - cos_longitude
122
- - sin_latitude
123
- - sin_longitude
124
- - cos_julian_day
125
- - cos_local_time
126
- - sin_julian_day
127
- - sin_local_time
128
- - insolation
129
- - lsm
130
- - tcc
131
- - mcc
132
- - hcc
133
- - lcc
134
- - swvl1
135
- - swvl2
136
- num_features: 115
137
-
138
  dataloader:
139
  prefetch_factor: 2
140
- pin_memory: True
141
- read_group_size: 4
142
  num_workers:
143
- training: 4
144
- validation: 4
145
- test: 8
146
- predict: 8
147
  batch_size:
148
  training: 1
149
  validation: 1
@@ -151,145 +114,170 @@ dataloader:
151
  predict: 4
152
  limit_batches:
153
  training: null
154
- validation: 10
155
  test: 20
156
  predict: 20
 
 
 
157
  dataset: ${hardware.paths.data}/${hardware.files.dataset}
158
- land_dataset: ${hardware.paths.data}/${hardware.files.dataset_land}
159
- land_variables: [100u, 100v, swvl1, swvl2, stl1, stl2, tcc, lcc, mcc, hcc, sf, ro, strd, ssrd]
160
  training:
161
  dataset:
162
- - dataset: ${dataloader.dataset}
163
- start: null
164
- end: 2022
165
- frequency: ${data.frequency}
166
- drop: []
167
- - dataset: ${dataloader.land_dataset}
168
- start: null
169
- end: 2022
170
- frequency: ${data.frequency}
171
- select: ${dataloader.land_variables}
172
  start: null
173
  end: 2022
174
  drop: []
175
  validation:
176
  dataset:
177
- - dataset: ${dataloader.dataset}
178
- start: 2022
179
- end: 2022
180
- frequency: ${data.frequency}
181
- drop: []
182
- - dataset: ${dataloader.land_dataset}
183
- start: 2022
184
- end: 2022
185
- frequency: ${data.frequency}
186
- select: ${dataloader.land_variables}
187
  start: 2022
188
- end: 2022
 
 
 
 
 
 
 
 
 
189
  drop: []
190
- validation_rollout: 1
191
-
192
  diagnostics:
193
  plot:
194
- asynchronous: False
195
- datashader: True
196
  frequency:
197
  batch: 750
198
- epoch: 10
199
- parameters: [tp]
 
 
 
 
 
 
 
 
 
 
200
  sample_idx: 0
201
- callbacks:
202
- - _target_: anemoi.training.diagnostics.callbacks.plot.PlotLoss
203
- parameter_groups:
204
- moisture: [tp, cp, tcw]
205
- sfc_wind: [10u, 10v]
206
- - _target_: anemoi.training.diagnostics.callbacks.plot.PlotSample
207
- sample_idx: 0
208
- per_sample: 6
209
- parameters: [tp]
210
- accumulation_levels_plot: [0, 0.05, 0.1, 0.25, 0.5, 1, 1.5, 2, 3, 4, 5, 6, 7, 100]
211
- cmap_accumulation:
212
- - "#ffffff"
213
- - "#04e9e7"
214
- - "#019ff4"
215
- - "#0300f4"
216
- - "#02fd02"
217
- - "#01c501"
218
- - "#008e00"
219
- - "#fdf802"
220
- - "#e5bc00"
221
- - "#fd9500"
222
- - "#fd0000"
223
- - "#d40000"
224
- - "#bc0000"
225
- - "#f800fd"
226
- precip_and_related_fields: [tp, cp]
227
- enabled: True
228
- scatter: False
229
- mode: asyncio
230
- callbacks: {}
231
  benchmark_profiler:
232
  memory:
233
- enabled: True
234
  steps: 5
235
  warmup: 2
236
- extra_plots: False
237
- trace_rank0_only: False
238
  time:
239
- enabled: True
240
- verbose: False
241
  speed:
242
- enabled: True
243
  system:
244
- enabled: True
245
  model_summary:
246
- enabled: True
247
  snapshot:
248
- enabled: True
249
  steps: 4
250
  warmup: 0
251
  debug:
252
- anomaly_detection: False
253
- profiler: False
254
- enable_checkpointing: True
255
  checkpoint:
256
  every_n_minutes:
257
  save_frequency: 30
258
  num_models_saved: 3
259
  every_n_epochs:
260
  save_frequency: 1
261
- num_models_saved: 3
262
  every_n_train_steps:
263
  save_frequency: null
264
  num_models_saved: 0
265
  log:
266
  wandb:
267
- enabled: False
 
 
 
 
 
 
268
  tensorboard:
269
- enabled: False
270
  mlflow:
271
- enabled: False
 
 
 
 
 
 
 
 
 
 
 
 
 
272
  interval: 100
273
- enable_progress_bar: True
274
- print_memory_summary: False
275
-
276
  hardware:
277
  paths:
278
  data: ${oc.decode:${oc.env:DATASETS_PATH}}
279
- output: ${oc.decode:${oc.env:OUTPUT_DIR}}
280
  logs:
281
- base: ${hardware.paths.output}/logs
282
- wandb: ${hardware.paths.output}/logs/wandb
283
- mlflow: ${hardware.paths.output}/logs/mlflow
284
- tensorboard: ${hardware.paths.output}/logs/tensorboard
285
- checkpoints: ${hardware.paths.output}/checkpoint
286
- plots: ${hardware.paths.output}/plots
287
- profiler: ${hardware.paths.output}/profiler
288
- graph: ${hardware.paths.output}/graphs
289
  files:
290
- dataset: aifs-ea-an-oper-0001-mars-n320-1979-2022-6h-v6.zarr
291
- dataset_land: aifs-ea-an-oper-0001-mars-n320-1979-2023-6h-v1-land.zarr
292
- graph: graph_enc_proc_dec_n320.pt
 
293
  checkpoint:
294
  every_n_epochs: aifs-by_epoch-epoch_{epoch:03d}-val_wmse_{val_wmse:.3e}
295
  every_n_train_steps: aifs-by_step-epoch_{epoch:03d}-step_{step:06d}
@@ -299,9 +287,8 @@ hardware:
299
  num_gpus_per_node: 4
300
  num_nodes: 16
301
  num_gpus_per_model: 4
302
-
303
  graph:
304
- overwrite: True
305
  data: data
306
  hidden: hidden
307
  nodes:
@@ -309,142 +296,210 @@ graph:
309
  node_builder:
310
  _target_: anemoi.graphs.nodes.ZarrDatasetNodes
311
  dataset: ${dataloader.dataset}
312
- attributes:
313
- area_weight:
314
- _target_: anemoi.graphs.nodes.attributes.AreaWeights
315
- norm: unit-max
316
  hidden:
317
  node_builder:
318
  _target_: anemoi.graphs.nodes.ReducedGaussianGridNodes
319
  grid: o96
320
  edges:
321
- - source_name: data
322
- target_name: hidden
323
- edge_builder:
324
- _target_: anemoi.graphs.edges.CutOffEdges
325
- cutoff_factor: 0.6
326
- attributes:
327
- edge_length:
328
- _target_: anemoi.graphs.edges.attributes.EdgeLength
329
- norm: unit-std
330
- edge_dirs:
331
- _target_: anemoi.graphs.edges.attributes.EdgeDirection
332
- norm: unit-std
333
- - source_name: hidden
334
- target_name: data
335
- edge_builder:
336
- _target_: anemoi.graphs.edges.KNNEdges
337
- num_nearest_neighbours: 3
338
- attributes:
339
- edge_length:
340
- _target_: anemoi.graphs.edges.attributes.EdgeLength
341
- norm: unit-std
342
- edge_dirs:
343
- _target_: anemoi.graphs.edges.attributes.EdgeDirection
344
- norm: unit-std
345
-
 
 
 
 
 
346
  model:
347
  activation: GELU
348
  num_channels: 1024
 
 
349
  model:
350
  _target_: anemoi.models.models.encoder_processor_decoder.AnemoiModelEncProcDec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
351
  processor:
352
  _target_: anemoi.models.layers.processor.TransformerProcessor
353
- _convert_: all
354
- activation: GELU
355
  num_layers: 16
356
  num_chunks: 2
357
  mlp_hidden_ratio: 4
358
  num_heads: 16
359
  window_size: 1120
360
- dropout_p: 0
 
 
 
 
 
361
  encoder:
362
  _target_: anemoi.models.layers.mapper.GraphTransformerForwardMapper
363
- _convert_: all
364
- trainable_size: 8
365
- sub_graph_edge_attributes: [edge_length, edge_dirs]
366
- activation: GELU
367
  num_chunks: 1
368
  mlp_hidden_ratio: 4
369
  num_heads: 16
 
 
370
  decoder:
371
  _target_: anemoi.models.layers.mapper.GraphTransformerBackwardMapper
372
- _convert_: all
373
- trainable_size: 8
374
- sub_graph_edge_attributes: [edge_length, edge_dirs]
375
- activation: GELU
376
  num_chunks: 1
377
  mlp_hidden_ratio: 4
378
  num_heads: 16
 
 
 
379
  trainable_parameters:
380
  data: 8
381
  hidden: 8
382
  data2hidden: 8
383
  hidden2data: 8
384
  attributes:
385
- edges: [edge_length, edge_dirs]
 
 
386
  nodes: []
387
- node_loss_weight: area_weight
388
  bounding:
389
- - _target_: anemoi.models.layers.bounding.ReluBounding
390
- variables:
391
- - tp
392
- - ro
393
- - tcw
394
- - ssrd
395
- - q_50
396
- - q_100
397
- - q_150
398
- - q_200
399
- - q_250
400
- - q_300
401
- - q_400
402
- - q_500
403
- - q_600
404
- - q_700
405
- - q_850
406
- - q_925
407
- - q_1000
408
- - _target_: anemoi.models.layers.bounding.HardtanhBounding
409
- variables: [tcc, swvl1, swvl2]
410
- min_val: 0
411
- max_val: 1
412
- - _target_: anemoi.models.layers.bounding.FractionBounding
413
- variables: [cp, sf]
414
- min_val: 0
415
- max_val: 1
416
- total_var: tp
417
- - _target_: anemoi.models.layers.bounding.FractionBounding
418
- variables: [lcc, mcc, hcc]
419
- min_val: 0
420
- max_val: 1
421
- total_var: tcc
422
-
 
 
 
 
 
 
 
 
423
  training:
424
  run_id: null
425
  fork_run_id: null
426
- load_weights_only: null
427
- deterministic: False
 
428
  precision: 16-mixed
429
  multistep_input: 2
430
  accum_grad_batches: 1
431
  num_sanity_val_steps: 6
432
  gradient_clip:
433
- val: 32
434
  algorithm: value
435
  swa:
436
- enabled: False
437
  lr: 0.0001
438
- zero_optimizer: False
 
 
 
 
 
 
 
 
 
 
 
439
  training_loss:
440
  _target_: anemoi.training.losses.mse.WeightedMSELoss
441
- scalars: [variable, loss_weights_mask]
442
- ignore_nans: False
443
- loss_gradient_scaling: False
 
444
  validation_metrics:
445
- - _target_: anemoi.training.losses.mse.WeightedMSELoss
446
- scalars: []
447
- ignore_nans: True
 
 
 
 
 
448
  rollout:
449
  start: 1
450
  epoch_increment: 0
@@ -452,9 +507,10 @@ training:
452
  max_epochs: null
453
  max_steps: 260000
454
  lr:
455
- rate: 0.00003125
 
456
  iterations: 260000
457
- min: 3.0e-7
458
  variable_loss_scaling:
459
  default: 1
460
  pl:
@@ -473,20 +529,29 @@ training:
473
  2d: 0.5
474
  tp: 0.025
475
  cp: 0.0025
476
- ro: 0.005
477
  sf: 0.025
478
  tcc: 0.1
479
  mcc: 0.1
480
  lcc: 0.1
481
  hcc: 0.1
482
- swvl2: 2.0
483
- swvl1: 1.0
484
  stl2: 10
485
  stl1: 1
486
  ssrd: 0.05
487
  strd: 0.1
488
- metrics: [z_500, t_850, u_850, v_850]
 
 
 
 
489
  pressure_level_scaler:
490
  _target_: anemoi.training.data.scaling.ReluPressureLevelScaler
491
  minimum: 0.2
492
- slope: 0.001
 
 
 
 
 
 
4
  frequency: 6h
5
  timestep: 6h
6
  forcing:
7
+ - cos_latitude
8
+ - cos_longitude
9
+ - sin_latitude
10
+ - sin_longitude
11
+ - cos_julian_day
12
+ - cos_local_time
13
+ - sin_julian_day
14
+ - sin_local_time
15
+ - insolation
16
+ - lsm
17
+ - sdor
18
+ - slor
19
+ - z
20
+ diagnostic:
21
+ - tp
22
+ - cp
23
+ - sf
24
+ - tcc
25
+ - hcc
26
+ - lcc
27
+ - mcc
28
+ - ro
29
+ - ssrd
30
+ - strd
31
+ - 100u
32
+ - 100v
33
+ remapped: null
34
+ normalizer:
35
+ default: mean-std
36
+ remap:
37
+ cp: tp
38
+ sf: tp
39
+ std:
40
+ - tp
41
+ - cp
42
+ - sf
43
+ - ro
44
+ - tcw
45
+ - ssrd
46
+ - q_50
47
+ - q_100
48
+ - q_150
49
+ - q_200
50
+ - q_250
51
+ - q_300
52
+ - q_400
53
+ - q_500
54
+ - q_600
55
+ - q_700
56
+ - q_850
57
+ - q_925
58
+ - q_1000
59
+ min-max: null
60
+ max:
61
+ - sdor
62
+ - slor
63
+ - z
64
+ none:
65
  - cos_latitude
66
  - cos_longitude
67
  - sin_latitude
 
72
  - sin_local_time
73
  - insolation
74
  - lsm
 
 
 
 
 
 
 
75
  - tcc
76
+ - mcc
77
  - hcc
78
  - lcc
79
+ - swvl1
80
+ - swvl2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  imputer:
82
  default: none
83
+ minimum:
84
+ - swvl1
85
+ - swvl2
86
+ - ro
87
+ mean:
88
+ - stl1
89
+ - stl2
90
  remapper:
91
  default: none
92
  processors:
93
+ imputer:
94
+ _target_: anemoi.models.preprocessing.imputer.InputImputer
95
+ _convert_: all
96
+ config: ${data.imputer}
97
  normalizer:
98
  _target_: anemoi.models.preprocessing.normalizer.InputNormalizer
99
+ config: ${data.normalizer}
100
+ num_features: null
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  dataloader:
102
  prefetch_factor: 2
103
+ pin_memory: true
104
+ read_group_size: ${hardware.num_gpus_per_model}
105
  num_workers:
106
+ training: 8
107
+ validation: 8
108
+ test: 1
109
+ predict: 1
110
  batch_size:
111
  training: 1
112
  validation: 1
 
114
  predict: 4
115
  limit_batches:
116
  training: null
117
+ validation: null
118
  test: 20
119
  predict: 20
120
+ grid_indices:
121
+ _target_: anemoi.training.data.grid_indices.FullGrid
122
+ nodes_name: ${graph.data}
123
  dataset: ${hardware.paths.data}/${hardware.files.dataset}
 
 
124
  training:
125
  dataset:
126
+ - dataset: ${hardware.paths.data}/${hardware.files.dataset}
127
+ start: null
128
+ end: 2022
129
+ frequency: ${data.frequency}
 
 
 
 
 
 
130
  start: null
131
  end: 2022
132
  drop: []
133
  validation:
134
  dataset:
135
+ - dataset: ${hardware.paths.data}/${hardware.files.dataset}
136
+ start: 2022
137
+ end: 2024
138
+ frequency: ${data.frequency}
 
 
 
 
 
 
139
  start: 2022
140
+ end: 2024
141
+ drop: []
142
+ test:
143
+ dataset:
144
+ - dataset: ${hardware.paths.data}/${hardware.files.dataset}
145
+ start: 2022
146
+ end: null
147
+ frequency: ${data.frequency}
148
+ start: 2022
149
+ end: null
150
  drop: []
 
 
151
  diagnostics:
152
  plot:
153
+ asynchronous: true
154
+ datashader: true
155
  frequency:
156
  batch: 750
157
+ epoch: 5
158
+ parameters:
159
+ - z_500
160
+ - t_850
161
+ - u_850
162
+ - v_850
163
+ - 2t
164
+ - 10u
165
+ - 10v
166
+ - sp
167
+ - tp
168
+ - cp
169
  sample_idx: 0
170
+ precip_and_related_fields:
171
+ - tp
172
+ - cp
173
+ colormaps:
174
+ default:
175
+ _target_: anemoi.training.utils.custom_colormaps.MatplotlibColormap
176
+ name: viridis
177
+ error:
178
+ _target_: anemoi.training.utils.custom_colormaps.MatplotlibColormap
179
+ name: bwr
180
+ precip:
181
+ _target_: anemoi.training.utils.custom_colormaps.MatplotlibColormapClevels
182
+ clevels:
183
+ - '#ffffff'
184
+ - '#04e9e7'
185
+ - '#019ff4'
186
+ - '#0300f4'
187
+ - '#02fd02'
188
+ - '#01c501'
189
+ - '#008e00'
190
+ - '#fdf802'
191
+ - '#e5bc00'
192
+ - '#fd9500'
193
+ - '#fd0000'
194
+ - '#d40000'
195
+ - '#bc0000'
196
+ - '#f800fd'
197
+ variables: ${diagnostics.plot.precip_and_related_fields}
198
+ callbacks: []
199
+ callbacks: []
200
  benchmark_profiler:
201
  memory:
202
+ enabled: true
203
  steps: 5
204
  warmup: 2
205
+ extra_plots: false
206
+ trace_rank0_only: false
207
  time:
208
+ enabled: true
209
+ verbose: false
210
  speed:
211
+ enabled: true
212
  system:
213
+ enabled: true
214
  model_summary:
215
+ enabled: true
216
  snapshot:
217
+ enabled: true
218
  steps: 4
219
  warmup: 0
220
  debug:
221
+ anomaly_detection: false
222
+ profiler: false
223
+ enable_checkpointing: true
224
  checkpoint:
225
  every_n_minutes:
226
  save_frequency: 30
227
  num_models_saved: 3
228
  every_n_epochs:
229
  save_frequency: 1
230
+ num_models_saved: -1
231
  every_n_train_steps:
232
  save_frequency: null
233
  num_models_saved: 0
234
  log:
235
  wandb:
236
+ enabled: false
237
+ offline: false
238
+ log_model: false
239
+ project: Anemoi
240
+ entity: ???
241
+ gradients: false
242
+ parameters: false
243
  tensorboard:
244
+ enabled: false
245
  mlflow:
246
+ enabled: false
247
+ offline: false
248
+ authentication: false
249
+ log_model: false
250
+ tracking_uri: ???
251
+ experiment_name: ???
252
+ project_name: ???
253
+ system: true
254
+ terminal: true
255
+ run_name: null
256
+ on_resume_create_child: true
257
+ expand_hyperparams:
258
+ - config
259
+ http_max_retries: 35
260
  interval: 100
261
+ enable_progress_bar: true
262
+ print_memory_summary: false
 
263
  hardware:
264
  paths:
265
  data: ${oc.decode:${oc.env:DATASETS_PATH}}
266
+ output: ${oc.decode:${oc.env:OUTPUT_PATH}}
267
  logs:
268
+ base: ${hardware.paths.output}logs/
269
+ wandb: ${hardware.paths.logs.base}
270
+ mlflow: ${hardware.paths.logs.base}mlflow/
271
+ tensorboard: ${hardware.paths.logs.base}tensorboard/
272
+ checkpoints: ${hardware.paths.output}checkpoint/
273
+ plots: ${hardware.paths.output}plots/
274
+ profiler: ${hardware.paths.output}profiler/
275
+ graph: ${hardware.paths.output}graphs/
276
  files:
277
+ dataset: aifs-ea-an-oper-0001-mars-${data.resolution}-1979-2024-6h-v1-aifs-single-v1.zarr
278
+ graph: graph_enc_proc_dec_${data.resolution}.pt
279
+ truncation: null
280
+ truncation_inv: null
281
  checkpoint:
282
  every_n_epochs: aifs-by_epoch-epoch_{epoch:03d}-val_wmse_{val_wmse:.3e}
283
  every_n_train_steps: aifs-by_step-epoch_{epoch:03d}-step_{step:06d}
 
287
  num_gpus_per_node: 4
288
  num_nodes: 16
289
  num_gpus_per_model: 4
 
290
  graph:
291
+ overwrite: true
292
  data: data
293
  hidden: hidden
294
  nodes:
 
296
  node_builder:
297
  _target_: anemoi.graphs.nodes.ZarrDatasetNodes
298
  dataset: ${dataloader.dataset}
299
+ attributes: ${graph.attributes.nodes}
 
 
 
300
  hidden:
301
  node_builder:
302
  _target_: anemoi.graphs.nodes.ReducedGaussianGridNodes
303
  grid: o96
304
  edges:
305
+ - source_name: ${graph.data}
306
+ target_name: ${graph.hidden}
307
+ edge_builders:
308
+ - _target_: anemoi.graphs.edges.CutOffEdges
309
+ cutoff_factor: 0.6
310
+ source_mask_attr_name: null
311
+ target_mask_attr_name: null
312
+ attributes: ${graph.attributes.edges}
313
+ - source_name: ${graph.hidden}
314
+ target_name: ${graph.data}
315
+ edge_builders:
316
+ - _target_: anemoi.graphs.edges.KNNEdges
317
+ num_nearest_neighbours: 3
318
+ source_mask_attr_name: null
319
+ target_mask_attr_name: null
320
+ attributes: ${graph.attributes.edges}
321
+ attributes:
322
+ nodes:
323
+ area_weight:
324
+ _target_: anemoi.graphs.nodes.attributes.SphericalAreaWeights
325
+ norm: unit-max
326
+ fill_value: 0
327
+ edges:
328
+ edge_length:
329
+ _target_: anemoi.graphs.edges.attributes.EdgeLength
330
+ norm: unit-std
331
+ edge_dirs:
332
+ _target_: anemoi.graphs.edges.attributes.EdgeDirection
333
+ norm: unit-std
334
+ post_processors: []
335
  model:
336
  activation: GELU
337
  num_channels: 1024
338
+ cpu_offload: false
339
+ output_mask: null
340
  model:
341
  _target_: anemoi.models.models.encoder_processor_decoder.AnemoiModelEncProcDec
342
+ layer_kernels:
343
+ processor:
344
+ LayerNorm:
345
+ _target_: torch.nn.LayerNorm
346
+ _partial_: true
347
+ Linear:
348
+ _target_: torch.nn.Linear
349
+ _partial_: true
350
+ QueryNorm:
351
+ _target_: anemoi.models.layers.normalization.AutocastLayerNorm
352
+ _partial_: true
353
+ bias: false
354
+ KeyNorm:
355
+ _target_: anemoi.models.layers.normalization.AutocastLayerNorm
356
+ _partial_: true
357
+ bias: false
358
+ encoder:
359
+ LayerNorm:
360
+ _target_: torch.nn.LayerNorm
361
+ _partial_: true
362
+ Linear:
363
+ _target_: torch.nn.Linear
364
+ _partial_: true
365
+ decoder:
366
+ LayerNorm:
367
+ _target_: torch.nn.LayerNorm
368
+ _partial_: true
369
+ Linear:
370
+ _target_: torch.nn.Linear
371
+ _partial_: true
372
  processor:
373
  _target_: anemoi.models.layers.processor.TransformerProcessor
374
+ activation: ${model.activation}
 
375
  num_layers: 16
376
  num_chunks: 2
377
  mlp_hidden_ratio: 4
378
  num_heads: 16
379
  window_size: 1120
380
+ dropout_p: 0.0
381
+ attention_implementation: flash_attention
382
+ qk_norm: false
383
+ softcap: 0.0
384
+ use_alibi_slopes: false
385
+ cpu_offload: ${model.cpu_offload}
386
  encoder:
387
  _target_: anemoi.models.layers.mapper.GraphTransformerForwardMapper
388
+ trainable_size: ${model.trainable_parameters.data2hidden}
389
+ sub_graph_edge_attributes: ${model.attributes.edges}
390
+ activation: ${model.activation}
 
391
  num_chunks: 1
392
  mlp_hidden_ratio: 4
393
  num_heads: 16
394
+ qk_norm: false
395
+ cpu_offload: ${model.cpu_offload}
396
  decoder:
397
  _target_: anemoi.models.layers.mapper.GraphTransformerBackwardMapper
398
+ trainable_size: ${model.trainable_parameters.hidden2data}
399
+ sub_graph_edge_attributes: ${model.attributes.edges}
400
+ activation: ${model.activation}
 
401
  num_chunks: 1
402
  mlp_hidden_ratio: 4
403
  num_heads: 16
404
+ initialise_data_extractor_zero: false
405
+ qk_norm: false
406
+ cpu_offload: ${model.cpu_offload}
407
  trainable_parameters:
408
  data: 8
409
  hidden: 8
410
  data2hidden: 8
411
  hidden2data: 8
412
  attributes:
413
+ edges:
414
+ - edge_length
415
+ - edge_dirs
416
  nodes: []
 
417
  bounding:
418
+ - _target_: anemoi.models.layers.bounding.ReluBounding
419
+ variables:
420
+ - tp
421
+ - ro
422
+ - tcw
423
+ - ssrd
424
+ - ro
425
+ - q_50
426
+ - q_100
427
+ - q_150
428
+ - q_200
429
+ - q_250
430
+ - q_300
431
+ - q_400
432
+ - q_500
433
+ - q_600
434
+ - q_700
435
+ - q_850
436
+ - q_925
437
+ - q_1000
438
+ - _target_: anemoi.models.layers.bounding.HardtanhBounding
439
+ variables:
440
+ - tcc
441
+ - swvl1
442
+ - swvl2
443
+ min_val: 0
444
+ max_val: 1
445
+ - _target_: anemoi.models.layers.bounding.FractionBounding
446
+ variables:
447
+ - cp
448
+ - sf
449
+ min_val: 0
450
+ max_val: 1
451
+ total_var: tp
452
+ - _target_: anemoi.models.layers.bounding.FractionBounding
453
+ variables:
454
+ - lcc
455
+ - mcc
456
+ - hcc
457
+ min_val: 0
458
+ max_val: 1
459
+ total_var: tcc
460
  training:
461
  run_id: null
462
  fork_run_id: null
463
+ transfer_learning: false
464
+ load_weights_only: false
465
+ deterministic: false
466
  precision: 16-mixed
467
  multistep_input: 2
468
  accum_grad_batches: 1
469
  num_sanity_val_steps: 6
470
  gradient_clip:
471
+ val: 32.0
472
  algorithm: value
473
  swa:
474
+ enabled: false
475
  lr: 0.0001
476
+ optimizer:
477
+ zero: false
478
+ kwargs:
479
+ betas:
480
+ - 0.9
481
+ - 0.95
482
+ model_task: anemoi.training.train.forecaster.GraphForecaster
483
+ strategy:
484
+ _target_: anemoi.training.distributed.strategy.DDPGroupStrategy
485
+ num_gpus_per_model: ${hardware.num_gpus_per_model}
486
+ read_group_size: ${dataloader.read_group_size}
487
+ loss_gradient_scaling: false
488
  training_loss:
489
  _target_: anemoi.training.losses.mse.WeightedMSELoss
490
+ scalars:
491
+ - variable
492
+ - loss_weights_mask
493
+ ignore_nans: false
494
  validation_metrics:
495
+ - _target_: anemoi.training.losses.mse.WeightedMSELoss
496
+ scalars: []
497
+ ignore_nans: true
498
+ scale_validation_metrics:
499
+ scalars_to_apply:
500
+ - variable
501
+ metrics:
502
+ - all
503
  rollout:
504
  start: 1
505
  epoch_increment: 0
 
507
  max_epochs: null
508
  max_steps: 260000
509
  lr:
510
+ warmup: 1000
511
+ rate: 3.125e-05
512
  iterations: 260000
513
+ min: 3.0e-07
514
  variable_loss_scaling:
515
  default: 1
516
  pl:
 
529
  2d: 0.5
530
  tp: 0.025
531
  cp: 0.0025
532
+ ro: 0.0025
533
  sf: 0.025
534
  tcc: 0.1
535
  mcc: 0.1
536
  lcc: 0.1
537
  hcc: 0.1
538
+ swvl2: 2
539
+ swvl1: 1
540
  stl2: 10
541
  stl1: 1
542
  ssrd: 0.05
543
  strd: 0.1
544
+ metrics:
545
+ - z_500
546
+ - t_850
547
+ - u_850
548
+ - v_850
549
  pressure_level_scaler:
550
  _target_: anemoi.training.data.scaling.ReluPressureLevelScaler
551
  minimum: 0.2
552
+ slope: 0.001
553
+ node_loss_weights:
554
+ _target_: anemoi.training.losses.nodeweights.GraphNodeAttribute
555
+ target_nodes: ${graph.data}
556
+ node_attribute: area_weight
557
+ submodules_to_freeze: []