Graph Machine Learning
AnemoI
English
File size: 12,175 Bytes
ffdd6fa
 
 
 
 
 
598f35c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ffdd6fa
 
 
 
 
 
 
 
 
 
 
598f35c
ffdd6fa
 
598f35c
 
ffdd6fa
 
598f35c
 
 
 
 
 
 
ffdd6fa
 
 
598f35c
 
 
 
ffdd6fa
 
598f35c
 
ffdd6fa
 
598f35c
 
ffdd6fa
598f35c
 
 
 
ffdd6fa
 
 
 
 
 
 
598f35c
ffdd6fa
 
598f35c
 
 
ffdd6fa
 
 
598f35c
 
 
 
ffdd6fa
 
 
 
 
598f35c
 
 
 
ffdd6fa
598f35c
 
 
 
 
 
 
 
 
 
ffdd6fa
 
 
598f35c
 
ffdd6fa
 
598f35c
 
 
 
 
 
 
 
 
 
 
 
ffdd6fa
598f35c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ffdd6fa
 
598f35c
ffdd6fa
 
598f35c
 
ffdd6fa
598f35c
 
ffdd6fa
598f35c
ffdd6fa
598f35c
ffdd6fa
598f35c
ffdd6fa
598f35c
ffdd6fa
 
 
598f35c
 
 
ffdd6fa
 
 
 
 
 
598f35c
ffdd6fa
 
 
 
 
598f35c
 
 
 
 
 
 
ffdd6fa
598f35c
ffdd6fa
598f35c
 
 
 
 
 
 
 
 
 
 
 
 
 
ffdd6fa
598f35c
 
ffdd6fa
 
 
598f35c
ffdd6fa
598f35c
 
 
 
 
 
 
 
ffdd6fa
598f35c
 
 
 
ffdd6fa
 
 
 
 
 
 
 
 
 
598f35c
ffdd6fa
 
 
 
 
 
 
598f35c
ffdd6fa
 
 
 
 
598f35c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ffdd6fa
 
 
598f35c
 
ffdd6fa
 
598f35c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ffdd6fa
 
598f35c
ffdd6fa
 
 
 
 
598f35c
 
 
 
 
 
ffdd6fa
 
598f35c
 
 
ffdd6fa
 
 
598f35c
 
ffdd6fa
 
598f35c
 
 
ffdd6fa
 
 
598f35c
 
 
ffdd6fa
 
 
 
 
 
598f35c
 
 
ffdd6fa
 
598f35c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ffdd6fa
 
 
598f35c
 
 
ffdd6fa
 
 
 
 
598f35c
ffdd6fa
 
598f35c
ffdd6fa
598f35c
 
 
 
 
 
 
 
 
 
 
 
ffdd6fa
 
598f35c
 
 
 
ffdd6fa
598f35c
 
 
 
 
 
 
 
ffdd6fa
 
 
 
 
 
 
598f35c
 
ffdd6fa
598f35c
ffdd6fa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
598f35c
ffdd6fa
 
 
 
 
598f35c
 
ffdd6fa
 
 
 
598f35c
 
 
 
 
ffdd6fa
 
 
598f35c
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
# Dataset description and preprocessing settings. Other sections read these
# values through ${data.*} interpolations (for example ${data.resolution} and
# ${data.frequency}).
data:
  format: zarr
  resolution: n320
  frequency: 6h
  timestep: 6h
  # Variables in the "forcing" category.
  # NOTE(review): presumably model inputs that are never predicted - confirm
  # against the anemoi documentation.
  forcing:
  - cos_latitude
  - cos_longitude
  - sin_latitude
  - sin_longitude
  - cos_julian_day
  - cos_local_time
  - sin_julian_day
  - sin_local_time
  - insolation
  - lsm
  - sdor
  - slor
  - z
  # Variables in the "diagnostic" category.
  # NOTE(review): presumably predicted but not fed back as inputs - confirm.
  diagnostic:
  - tp
  - cp
  - sf
  - tcc
  - hcc
  - lcc
  - mcc
  - ro
  - ssrd
  - strd
  - 100u
  - 100v
  remapped: null
  # Normalisation settings. The whole mapping is handed to the normalizer
  # processor below via ${data.normalizer}.
  normalizer:
    # Method for variables not listed under any other key (presumably - confirm).
    default: mean-std
    # NOTE(review): presumably cp and sf reuse the statistics of tp - confirm.
    remap:
      cp: tp
      sf: tp
    # Variables assigned to the "std" method.
    std:
    - tp
    - cp
    - sf
    - ro
    - tcw
    - ssrd
    - q_50
    - q_100
    - q_150
    - q_200
    - q_250
    - q_300
    - q_400
    - q_500
    - q_600
    - q_700
    - q_850
    - q_925
    - q_1000
    min-max: null
    # Variables assigned to the "max" method.
    max:
    - sdor
    - slor
    - z
    # Variables assigned to the "none" method.
    none:
    - cos_latitude
    - cos_longitude
    - sin_latitude
    - sin_longitude
    - cos_julian_day
    - cos_local_time
    - sin_julian_day
    - sin_local_time
    - insolation
    - lsm
    - tcc
    - mcc
    - hcc
    - lcc
    - swvl1
    - swvl2
  # Imputation settings. The whole mapping is handed to the imputer processor
  # below via ${data.imputer}.
  imputer:
    default: none
    # Variables assigned to the "minimum" imputation method.
    minimum:
    - swvl1
    - swvl2
    - ro
    # Variables assigned to the "mean" imputation method.
    mean:
    - stl1
    - stl2
  remapper:
    default: none
  # Preprocessor objects; each `_target_` is a dotted class path and each
  # `config` is interpolated from the sibling mappings above.
  # NOTE(review): the imputer is listed before the normalizer; the order may be
  # significant, so keep it unless confirmed otherwise.
  processors:
    imputer:
      _target_: anemoi.models.preprocessing.imputer.InputImputer
      _convert_: all
      config: ${data.imputer}
    normalizer:
      _target_: anemoi.models.preprocessing.normalizer.InputNormalizer
      config: ${data.normalizer}
  num_features: null
# Data loading settings: worker counts, batch sizes and the dataset / date
# range used for each of the training, validation and test splits.
dataloader:
  prefetch_factor: 2
  pin_memory: true
  # Follows the number of GPUs per model defined in the hardware section.
  read_group_size: ${hardware.num_gpus_per_model}
  # Per-stage worker counts.
  num_workers:
    training: 8
    validation: 8
    test: 1
    predict: 1
  # Per-stage batch sizes.
  batch_size:
    training: 1
    validation: 1
    test: 4
    predict: 4
  # Per-stage batch limits; null is used for training and validation.
  limit_batches:
    training: null
    validation: null
    test: 20
    predict: 20
  grid_indices:
    _target_: anemoi.training.data.grid_indices.FullGrid
    nodes_name: ${graph.data}
  # Dataset path built from the hardware section; the graph section also reads
  # it as ${dataloader.dataset}.
  dataset: ${hardware.paths.data}/${hardware.files.dataset}
  # Each split repeats start/end both inside the dataset entry and at split
  # level, with identical values.
  # NOTE(review): training ends at 2022 while validation and test both start at
  # 2022, and test (open-ended) spans the validation range 2022-2024. Confirm
  # how the loader treats these bounds and whether the overlap is intended.
  training:
    dataset:
    - dataset: ${hardware.paths.data}/${hardware.files.dataset}
      start: null
      end: 2022
      frequency: ${data.frequency}
    start: null
    end: 2022
    drop: []
  validation:
    dataset:
    - dataset: ${hardware.paths.data}/${hardware.files.dataset}
      start: 2022
      end: 2024
      frequency: ${data.frequency}
    start: 2022
    end: 2024
    drop: []
  test:
    dataset:
    - dataset: ${hardware.paths.data}/${hardware.files.dataset}
      start: 2022
      end: null
      frequency: ${data.frequency}
    start: 2022
    end: null
    drop: []
# Diagnostics: plotting, profiling, checkpointing and experiment logging.
diagnostics:
  plot:
    asynchronous: true
    datashader: true
    frequency:
      batch: 750
      epoch: 5
    # Variables selected for plotting.
    parameters:
    - z_500
    - t_850
    - u_850
    - v_850
    - 2t
    - 10u
    - 10v
    - sp
    - tp
    - cp
    sample_idx: 0
    # Referenced by the `precip` colormap below.
    precip_and_related_fields:
    - tp
    - cp
    colormaps:
      default:
        _target_: anemoi.training.utils.custom_colormaps.MatplotlibColormap
        name: viridis
      error:
        _target_: anemoi.training.utils.custom_colormaps.MatplotlibColormap
        name: bwr
      precip:
        _target_: anemoi.training.utils.custom_colormaps.MatplotlibColormapClevels
        # Hex colours are quoted because an unquoted leading `#` would start a
        # YAML comment.
        clevels:
        - '#ffffff'
        - '#04e9e7'
        - '#019ff4'
        - '#0300f4'
        - '#02fd02'
        - '#01c501'
        - '#008e00'
        - '#fdf802'
        - '#e5bc00'
        - '#fd9500'
        - '#fd0000'
        - '#d40000'
        - '#bc0000'
        - '#f800fd'
        variables: ${diagnostics.plot.precip_and_related_fields}
    # No plot callbacks are configured.
    callbacks: []
  # No additional diagnostics callbacks are configured.
  callbacks: []
  # Settings for the individual benchmark profiler components.
  benchmark_profiler:
    memory:
      enabled: true
      steps: 5
      warmup: 2
      extra_plots: false
      trace_rank0_only: false
    time:
      enabled: true
      verbose: false
    speed:
      enabled: true
    system:
      enabled: true
    model_summary:
      enabled: true
    snapshot:
      enabled: true
      steps: 4
      warmup: 0
  debug:
    anomaly_detection: false
  profiler: false
  enable_checkpointing: true
  # Three checkpoint triggers, each with a save frequency and a count of
  # models to keep.
  # NOTE(review): the meaning of -1 and 0 for num_models_saved is not visible
  # here - confirm against the anemoi-training documentation.
  checkpoint:
    every_n_minutes:
      save_frequency: 30
      num_models_saved: 3
    every_n_epochs:
      save_frequency: 1
      num_models_saved: -1
    every_n_train_steps:
      save_frequency: null
      num_models_saved: 0
  # Experiment loggers; all three are disabled in this configuration.
  # `???` is OmegaConf's marker for a mandatory value that has not been set.
  log:
    wandb:
      enabled: false
      offline: false
      log_model: false
      project: Anemoi
      entity: ???
      gradients: false
      parameters: false
    tensorboard:
      enabled: false
    mlflow:
      enabled: false
      offline: false
      authentication: false
      log_model: false
      tracking_uri: ???
      experiment_name: ???
      project_name: ???
      system: true
      terminal: true
      run_name: null
      on_resume_create_child: true
      expand_hyperparams:
      - config
      http_max_retries: 35
    interval: 100
  enable_progress_bar: true
  print_memory_summary: false
# Filesystem locations, file names and compute layout.
hardware:
  paths:
    # Both roots are read from environment variables through OmegaConf's
    # oc.env resolver and passed through oc.decode.
    data: ${oc.decode:${oc.env:DATASETS_PATH}}
    output: ${oc.decode:${oc.env:OUTPUT_PATH}}
    # The sub-paths below append directly to ${hardware.paths.output} with no
    # separator, so OUTPUT_PATH must end with `/` for them to resolve to
    # sub-directories.
    logs:
      base: ${hardware.paths.output}logs/
      wandb: ${hardware.paths.logs.base}
      mlflow: ${hardware.paths.logs.base}mlflow/
      tensorboard: ${hardware.paths.logs.base}tensorboard/
    checkpoints: ${hardware.paths.output}checkpoint/
    plots: ${hardware.paths.output}plots/
    profiler: ${hardware.paths.output}profiler/
    graph: ${hardware.paths.output}graphs/
  files:
    # Dataset and graph file names embed ${data.resolution}.
    dataset: aifs-ea-an-oper-0001-mars-${data.resolution}-1979-2024-6h-v1-aifs-single-v1.zarr
    graph: graph_enc_proc_dec_${data.resolution}.pt
    truncation: null
    truncation_inv: null
    # Checkpoint file-name templates with {epoch}, {step} and {val_wmse}
    # placeholders.
    # NOTE(review): presumably filled in by the checkpoint callbacks - confirm.
    checkpoint:
      every_n_epochs: aifs-by_epoch-epoch_{epoch:03d}-val_wmse_{val_wmse:.3e}
      every_n_train_steps: aifs-by_step-epoch_{epoch:03d}-step_{step:06d}
      every_n_minutes: aifs-by_time-epoch_{epoch:03d}-step_{step:06d}
    warm_start: null
  accelerator: auto
  num_gpus_per_node: 4
  num_nodes: 16
  # Also referenced by dataloader.read_group_size and training.strategy.
  num_gpus_per_model: 4
# Graph definition: two node sets (data and hidden) and the edges between them.
graph:
  overwrite: true
  # Names of the two node sets; referenced elsewhere as ${graph.data} and
  # ${graph.hidden}.
  data: data
  hidden: hidden
  nodes:
    # Data nodes are built from the dataset configured in the dataloader
    # section and carry the node attributes defined under `attributes` below.
    data:
      node_builder:
        _target_: anemoi.graphs.nodes.ZarrDatasetNodes
        dataset: ${dataloader.dataset}
      attributes: ${graph.attributes.nodes}
    # Hidden nodes are built on an o96 grid; no node attributes are attached.
    hidden:
      node_builder:
        _target_: anemoi.graphs.nodes.ReducedGaussianGridNodes
        grid: o96
  edges:
  # data -> hidden edges, built with CutOffEdges.
  - source_name: ${graph.data}
    target_name: ${graph.hidden}
    edge_builders:
    - _target_: anemoi.graphs.edges.CutOffEdges
      cutoff_factor: 0.6
      source_mask_attr_name: null
      target_mask_attr_name: null
    attributes: ${graph.attributes.edges}
  # hidden -> data edges, built with KNNEdges using 3 nearest neighbours.
  - source_name: ${graph.hidden}
    target_name: ${graph.data}
    edge_builders:
    - _target_: anemoi.graphs.edges.KNNEdges
      num_nearest_neighbours: 3
      source_mask_attr_name: null
      target_mask_attr_name: null
    attributes: ${graph.attributes.edges}
  # Attribute definitions shared by the node and edge entries above.
  attributes:
    nodes:
      # `area_weight` is the attribute name that training.node_loss_weights
      # selects via `node_attribute`.
      area_weight:
        _target_: anemoi.graphs.nodes.attributes.SphericalAreaWeights
        norm: unit-max
        fill_value: 0
    edges:
      # These two names are the ones listed under model.attributes.edges.
      edge_length:
        _target_: anemoi.graphs.edges.attributes.EdgeLength
        norm: unit-std
      edge_dirs:
        _target_: anemoi.graphs.edges.attributes.EdgeDirection
        norm: unit-std
  post_processors: []
# Model architecture: encoder / processor / decoder, the layer kernels they
# use, trainable parameter sizes and output bounding.
model:
  # Shared by encoder, processor and decoder through ${model.activation}.
  activation: GELU
  num_channels: 1024
  # Shared by encoder, processor and decoder through ${model.cpu_offload}.
  cpu_offload: false
  output_mask: null
  model:
    _target_: anemoi.models.models.encoder_processor_decoder.AnemoiModelEncProcDec
  # Layer classes per component. `_partial_: true` is Hydra's flag for
  # building a partially-applied constructor rather than an instance.
  layer_kernels:
    processor:
      LayerNorm:
        _target_: torch.nn.LayerNorm
        _partial_: true
      Linear:
        _target_: torch.nn.Linear
        _partial_: true
      QueryNorm:
        _target_: anemoi.models.layers.normalization.AutocastLayerNorm
        _partial_: true
        bias: false
      KeyNorm:
        _target_: anemoi.models.layers.normalization.AutocastLayerNorm
        _partial_: true
        bias: false
    encoder:
      LayerNorm:
        _target_: torch.nn.LayerNorm
        _partial_: true
      Linear:
        _target_: torch.nn.Linear
        _partial_: true
    decoder:
      LayerNorm:
        _target_: torch.nn.LayerNorm
        _partial_: true
      Linear:
        _target_: torch.nn.Linear
        _partial_: true
  # Transformer processor settings.
  processor:
    _target_: anemoi.models.layers.processor.TransformerProcessor
    activation: ${model.activation}
    num_layers: 16
    num_chunks: 2
    mlp_hidden_ratio: 4
    num_heads: 16
    window_size: 1120
    dropout_p: 0.0
    attention_implementation: flash_attention
    qk_norm: false
    softcap: 0.0
    use_alibi_slopes: false
    cpu_offload: ${model.cpu_offload}
  # Encoder mapper; its trainable size comes from
  # trainable_parameters.data2hidden below.
  encoder:
    _target_: anemoi.models.layers.mapper.GraphTransformerForwardMapper
    trainable_size: ${model.trainable_parameters.data2hidden}
    sub_graph_edge_attributes: ${model.attributes.edges}
    activation: ${model.activation}
    num_chunks: 1
    mlp_hidden_ratio: 4
    num_heads: 16
    qk_norm: false
    cpu_offload: ${model.cpu_offload}
  # Decoder mapper; its trainable size comes from
  # trainable_parameters.hidden2data below.
  decoder:
    _target_: anemoi.models.layers.mapper.GraphTransformerBackwardMapper
    trainable_size: ${model.trainable_parameters.hidden2data}
    sub_graph_edge_attributes: ${model.attributes.edges}
    activation: ${model.activation}
    num_chunks: 1
    mlp_hidden_ratio: 4
    num_heads: 16
    initialise_data_extractor_zero: false
    qk_norm: false
    cpu_offload: ${model.cpu_offload}
  # Trainable parameter sizes for the node sets and the two mappings.
  trainable_parameters:
    data: 8
    hidden: 8
    data2hidden: 8
    hidden2data: 8
  # Graph attribute names passed to the encoder and decoder mappers.
  attributes:
    edges:
    - edge_length
    - edge_dirs
    nodes: []
  # Output bounding layers, listed in order.
  # NOTE(review): the order is presumably significant (tcc and tp are bounded
  # before they are used as `total_var` further down) - confirm before
  # reordering.
  bounding:
  # Each variable is listed once; a redundant second `ro` entry was removed.
  - _target_: anemoi.models.layers.bounding.ReluBounding
    variables:
    - tp
    - ro
    - tcw
    - ssrd
    - q_50
    - q_100
    - q_150
    - q_200
    - q_250
    - q_300
    - q_400
    - q_500
    - q_600
    - q_700
    - q_850
    - q_925
    - q_1000
  # Bounded with min_val 0 and max_val 1.
  - _target_: anemoi.models.layers.bounding.HardtanhBounding
    variables:
    - tcc
    - swvl1
    - swvl2
    min_val: 0
    max_val: 1
  # cp and sf are bounded relative to `total_var: tp`.
  # NOTE(review): presumably constrains them to a 0..1 fraction of tp - confirm.
  - _target_: anemoi.models.layers.bounding.FractionBounding
    variables:
    - cp
    - sf
    min_val: 0
    max_val: 1
    total_var: tp
  # lcc, mcc and hcc are bounded relative to `total_var: tcc`.
  - _target_: anemoi.models.layers.bounding.FractionBounding
    variables:
    - lcc
    - mcc
    - hcc
    min_val: 0
    max_val: 1
    total_var: tcc
# Training run settings: precision, optimisation, loss, schedule and
# per-variable loss scaling.
training:
  run_id: null
  fork_run_id: null
  transfer_learning: false
  load_weights_only: false
  deterministic: false
  precision: 16-mixed
  multistep_input: 2
  accum_grad_batches: 1
  num_sanity_val_steps: 6
  gradient_clip:
    val: 32.0
    algorithm: value
  swa:
    enabled: false
    lr: 0.0001
  optimizer:
    zero: false
    kwargs:
      betas:
      - 0.9
      - 0.95
  model_task: anemoi.training.train.forecaster.GraphForecaster
  # Distributed strategy; both values are interpolated from other sections.
  strategy:
    _target_: anemoi.training.distributed.strategy.DDPGroupStrategy
    num_gpus_per_model: ${hardware.num_gpus_per_model}
    read_group_size: ${dataloader.read_group_size}
  loss_gradient_scaling: false
  # Training loss: weighted MSE with the `variable` and `loss_weights_mask`
  # scalars.
  training_loss:
    _target_: anemoi.training.losses.mse.WeightedMSELoss
    scalars:
    - variable
    - loss_weights_mask
    ignore_nans: false
  # Validation metric: the same loss class with no scalars listed and NaNs
  # ignored.
  validation_metrics:
  - _target_: anemoi.training.losses.mse.WeightedMSELoss
    scalars: []
    ignore_nans: true
  scale_validation_metrics:
    scalars_to_apply:
    - variable
    metrics:
    - all
  # start and max are both 1 and the increment is 0.
  # NOTE(review): presumably this fixes the rollout length at 1 - confirm.
  rollout:
    start: 1
    epoch_increment: 0
    max: 1
  # No epoch limit is set; the run is bounded by max_steps.
  max_epochs: null
  max_steps: 260000
  lr:
    warmup: 1000
    rate: 3.125e-05
    # Same literal value as max_steps above.
    # NOTE(review): presumably these should be changed together - confirm.
    iterations: 260000
    min: 3.0e-07
  # Per-variable loss weights; `default` is 1.
  # NOTE(review): presumably `default` applies to variables not listed - confirm.
  variable_loss_scaling:
    default: 1
    # Weights keyed by pressure-level variable name.
    pl:
      q: 0.6
      t: 6
      u: 0.8
      v: 0.5
      w: 0.001
      z: 12
    # Weights keyed by surface variable name.
    sfc:
      sp: 10
      10u: 0.5
      10v: 0.5
      100u: 0.1
      100v: 0.1
      2d: 0.5
      tp: 0.025
      cp: 0.0025
      ro: 0.0025
      sf: 0.025
      tcc: 0.1
      mcc: 0.1
      lcc: 0.1
      hcc: 0.1
      swvl2: 2
      swvl1: 1
      stl2: 10
      stl1: 1
      ssrd: 0.05
      strd: 0.1
  # Variables for which metrics are listed.
  metrics:
  - z_500
  - t_850
  - u_850
  - v_850
  pressure_level_scaler:
    _target_: anemoi.training.data.scaling.ReluPressureLevelScaler
    minimum: 0.2
    slope: 0.001
  # Node loss weights read the `area_weight` attribute of the ${graph.data}
  # nodes.
  node_loss_weights:
    _target_: anemoi.training.losses.nodeweights.GraphNodeAttribute
    target_nodes: ${graph.data}
    node_attribute: area_weight
  submodules_to_freeze: []