# File size: 12,175 Bytes
# Revisions shown against this file by the file viewer it was copied from
# (appears to be a per-line commit gutter): ffdd6fa, 598f35c

# Dataset description and the variable lists that other sections refer to.
data:
  format: zarr
  # Interpolated into hardware.files.dataset and hardware.files.graph
  # through ${data.resolution}.
  resolution: n320
  # Reused by every dataloader split through ${data.frequency}.
  frequency: 6h
  timestep: 6h
  # Variables classed as forcings.
  # NOTE(review): presumably input-only fields that are not predicted - confirm.
  forcing:
  - cos_latitude
  - cos_longitude
  - sin_latitude
  - sin_longitude
  - cos_julian_day
  - cos_local_time
  - sin_julian_day
  - sin_local_time
  - insolation
  - lsm
  - sdor
  - slor
  - z
  # Variables classed as diagnostics.
  # NOTE(review): presumably output-only fields - confirm.
  diagnostic:
  - tp
  - cp
  - sf
  - tcc
  - hcc
  - lcc
  - mcc
  - ro
  - ssrd
  - strd
  - 100u
  - 100v
  remapped: null
  # Per-variable normalisation settings. The whole mapping is handed to the
  # normalizer processor through ${data.normalizer}.
  normalizer:
    # NOTE(review): presumably the method used for any variable that is not
    # listed under one of the method keys below - confirm.
    default: mean-std
    # NOTE(review): looks like cp and sf reuse the statistics of tp - confirm.
    remap:
      cp: tp
      sf: tp
    # Variables assigned to the "std" method.
    std:
    - tp
    - cp
    - sf
    - ro
    - tcw
    - ssrd
    - q_50
    - q_100
    - q_150
    - q_200
    - q_250
    - q_300
    - q_400
    - q_500
    - q_600
    - q_700
    - q_850
    - q_925
    - q_1000
    # No variable is assigned to the "min-max" method.
    min-max: null
    # Variables assigned to the "max" method.
    max:
    - sdor
    - slor
    - z
    # Variables assigned to the "none" method.
    # NOTE(review): presumably left un-normalised - confirm.
    none:
    - cos_latitude
    - cos_longitude
    - sin_latitude
    - sin_longitude
    - cos_julian_day
    - cos_local_time
    - sin_julian_day
    - sin_local_time
    - insolation
    - lsm
    - tcc
    - mcc
    - hcc
    - lcc
    - swvl1
    - swvl2
  # Per-variable imputation settings. The whole mapping is handed to the
  # imputer processor through ${data.imputer}.
  imputer:
    default: none
    # Variables assigned to the "minimum" method.
    # NOTE(review): presumably missing values are filled with the variable's
    # minimum - confirm.
    minimum:
    - swvl1
    - swvl2
    - ro
    # Variables assigned to the "mean" method.
    mean:
    - stl1
    - stl2
  # No remapping is configured beyond the default.
  remapper:
    default: none
  # Pre-processing objects. Each entry names a class in `_target_` and receives
  # one of the mappings defined above as its `config`.
  # NOTE(review): the entries are listed imputer first, then normalizer;
  # whether this order is the order of application is not visible here.
  processors:
    imputer:
      _target_: anemoi.models.preprocessing.imputer.InputImputer
      # NOTE(review): only the imputer sets `_convert_`; the normalizer entry
      # below does not - confirm this difference is intended.
      _convert_: all
      config: ${data.imputer}
    normalizer:
      _target_: anemoi.models.preprocessing.normalizer.InputNormalizer
      config: ${data.normalizer}
  num_features: null
# Data loading settings, with separate values per stage
# (training / validation / test / predict).
dataloader:
  prefetch_factor: 2
  pin_memory: true
  # Resolves to hardware.num_gpus_per_model (4 in this file); also consumed by
  # training.strategy.read_group_size.
  read_group_size: ${hardware.num_gpus_per_model}
  num_workers:
    training: 8
    validation: 8
    test: 1
    predict: 1
  batch_size:
    training: 1
    validation: 1
    test: 4
    predict: 4
  # null for training and validation; 20 for test and predict.
  # NOTE(review): presumably null means "no limit" - confirm.
  limit_batches:
    training: null
    validation: null
    test: 20
    predict: 20
  grid_indices:
    _target_: anemoi.training.data.grid_indices.FullGrid
    # Resolves to graph.data, i.e. the node set named "data".
    nodes_name: ${graph.data}
  # Full dataset path; also referenced by graph.nodes.data.node_builder.dataset.
  dataset: ${hardware.paths.data}/${hardware.files.dataset}
  # Dataset splits. Each split repeats its start/end both inside the dataset
  # entry and at split level, and all three read the same dataset path.
  # NOTE(review): training ends at 2022 while validation and test both start at
  # 2022. Whether `end` is inclusive is not visible here; if it is, the splits
  # share the year 2022 - confirm this is intended.
  training:
    dataset:
    - dataset: ${hardware.paths.data}/${hardware.files.dataset}
      start: null
      end: 2022
      frequency: ${data.frequency}
    start: null
    end: 2022
    drop: []
  validation:
    dataset:
    - dataset: ${hardware.paths.data}/${hardware.files.dataset}
      start: 2022
      end: 2024
      frequency: ${data.frequency}
    start: 2022
    end: 2024
    drop: []
  # The test split has no end bound, so its range also covers the validation
  # range (2022 to 2024).
  test:
    dataset:
    - dataset: ${hardware.paths.data}/${hardware.files.dataset}
      start: 2022
      end: null
      frequency: ${data.frequency}
    start: 2022
    end: null
    drop: []
# Plotting, profiling, checkpointing and logging settings.
diagnostics:
  plot:
    asynchronous: true
    datashader: true
    # NOTE(review): presumably plot every 750 batches / every 5 epochs - confirm.
    frequency:
      batch: 750
      epoch: 5
    # Variables selected for plotting.
    parameters:
    - z_500
    - t_850
    - u_850
    - v_850
    - 2t
    - 10u
    - 10v
    - sp
    - tp
    - cp
    sample_idx: 0
    # Referenced by colormaps.precip.variables below.
    precip_and_related_fields:
    - tp
    - cp
    colormaps:
      default:
        _target_: anemoi.training.utils.custom_colormaps.MatplotlibColormap
        name: viridis
      error:
        _target_: anemoi.training.utils.custom_colormaps.MatplotlibColormap
        name: bwr
      precip:
        _target_: anemoi.training.utils.custom_colormaps.MatplotlibColormapClevels
        # 14 hex colours. They are quoted because an unquoted leading '#'
        # would start a YAML comment.
        clevels:
        - '#ffffff'
        - '#04e9e7'
        - '#019ff4'
        - '#0300f4'
        - '#02fd02'
        - '#01c501'
        - '#008e00'
        - '#fdf802'
        - '#e5bc00'
        - '#fd9500'
        - '#fd0000'
        - '#d40000'
        - '#bc0000'
        - '#f800fd'
        variables: ${diagnostics.plot.precip_and_related_fields}
    # No plot callbacks are configured.
    callbacks: []
  # No additional callbacks are configured.
  callbacks: []
  # Settings for the individual benchmark profiler components. Every component
  # is marked enabled here, while the top-level `profiler` switch below is false.
  # NOTE(review): presumably these only take effect when `profiler` is true -
  # confirm.
  benchmark_profiler:
    memory:
      enabled: true
      steps: 5
      warmup: 2
      extra_plots: false
      trace_rank0_only: false
    time:
      enabled: true
      verbose: false
    speed:
      enabled: true
    system:
      enabled: true
    model_summary:
      enabled: true
    snapshot:
      enabled: true
      steps: 4
      warmup: 0
  debug:
    anomaly_detection: false
  profiler: false
  enable_checkpointing: true
  # Three checkpoint triggers, each with a save frequency and a retention count.
  # File name templates for the same three triggers are under
  # hardware.files.checkpoint.
  checkpoint:
    every_n_minutes:
      save_frequency: 30
      num_models_saved: 3
    every_n_epochs:
      save_frequency: 1
      # NOTE(review): presumably -1 means "keep all" - confirm.
      num_models_saved: -1
    # Step-based trigger has no frequency set and keeps 0 models.
    every_n_train_steps:
      save_frequency: null
      num_models_saved: 0
  # Experiment loggers. All three (wandb, tensorboard, mlflow) are disabled in
  # this file. Values written as ??? are OmegaConf "mandatory missing" markers:
  # they must be supplied before the value is accessed.
  log:
    wandb:
      enabled: false
      offline: false
      log_model: false
      project: Anemoi
      entity: ???
      gradients: false
      parameters: false
    tensorboard:
      enabled: false
    mlflow:
      enabled: false
      offline: false
      authentication: false
      log_model: false
      tracking_uri: ???
      experiment_name: ???
      project_name: ???
      system: true
      terminal: true
      run_name: null
      on_resume_create_child: true
      expand_hyperparams:
      - config
      http_max_retries: 35
    # NOTE(review): presumably the logging interval in steps - confirm.
    interval: 100
  enable_progress_bar: true
  print_memory_summary: false
# Filesystem locations, file names and compute layout.
hardware:
  paths:
    # Both roots are read from environment variables (DATASETS_PATH and
    # OUTPUT_PATH) through OmegaConf's oc.env resolver and then passed through
    # oc.decode.
    data: ${oc.decode:${oc.env:DATASETS_PATH}}
    output: ${oc.decode:${oc.env:OUTPUT_PATH}}
    # Every path derived from `output` is built by plain concatenation with no
    # separator, so OUTPUT_PATH has to end with "/" for these to be well formed.
    # (dataloader.dataset, by contrast, inserts "/" after the data root.)
    logs:
      base: ${hardware.paths.output}logs/
      wandb: ${hardware.paths.logs.base}
      mlflow: ${hardware.paths.logs.base}mlflow/
      tensorboard: ${hardware.paths.logs.base}tensorboard/
    checkpoints: ${hardware.paths.output}checkpoint/
    plots: ${hardware.paths.output}plots/
    profiler: ${hardware.paths.output}profiler/
    graph: ${hardware.paths.output}graphs/
  files:
    # Both names embed ${data.resolution} (n320 in this file).
    dataset: aifs-ea-an-oper-0001-mars-${data.resolution}-1979-2024-6h-v1-aifs-single-v1.zarr
    graph: graph_enc_proc_dec_${data.resolution}.pt
    truncation: null
    truncation_inv: null
    # File name templates, one per trigger in diagnostics.checkpoint. The
    # {name:format} fields are not OmegaConf interpolations (no leading "$").
    # NOTE(review): presumably filled in by the checkpoint callback - confirm.
    checkpoint:
      every_n_epochs: aifs-by_epoch-epoch_{epoch:03d}-val_wmse_{val_wmse:.3e}
      every_n_train_steps: aifs-by_step-epoch_{epoch:03d}-step_{step:06d}
      every_n_minutes: aifs-by_time-epoch_{epoch:03d}-step_{step:06d}
    warm_start: null
  accelerator: auto
  num_gpus_per_node: 4
  num_nodes: 16
  # Referenced by dataloader.read_group_size and training.strategy.
  num_gpus_per_model: 4
# Graph definition: two node sets ("data" and "hidden") and the edges between
# them.
graph:
  overwrite: true
  # Node set names; referenced elsewhere as ${graph.data} and ${graph.hidden}.
  data: data
  hidden: hidden
  nodes:
    # Data nodes are built from the same dataset the dataloader reads.
    data:
      node_builder:
        _target_: anemoi.graphs.nodes.ZarrDatasetNodes
        dataset: ${dataloader.dataset}
      attributes: ${graph.attributes.nodes}
    # Hidden nodes are built on grid "o96"; no node attributes are attached.
    hidden:
      node_builder:
        _target_: anemoi.graphs.nodes.ReducedGaussianGridNodes
        grid: o96
  edges:
  # data -> hidden edges, built with CutOffEdges.
  - source_name: ${graph.data}
    target_name: ${graph.hidden}
    edge_builders:
    - _target_: anemoi.graphs.edges.CutOffEdges
      cutoff_factor: 0.6
      source_mask_attr_name: null
      target_mask_attr_name: null
    attributes: ${graph.attributes.edges}
  # hidden -> data edges, built with KNNEdges using 3 nearest neighbours.
  - source_name: ${graph.hidden}
    target_name: ${graph.data}
    edge_builders:
    - _target_: anemoi.graphs.edges.KNNEdges
      num_nearest_neighbours: 3
      source_mask_attr_name: null
      target_mask_attr_name: null
    attributes: ${graph.attributes.edges}
  # Attribute definitions shared through the interpolations above.
  attributes:
    nodes:
      # Referenced by name in training.node_loss_weights.node_attribute.
      area_weight:
        _target_: anemoi.graphs.nodes.attributes.SphericalAreaWeights
        norm: unit-max
        fill_value: 0
    # The two edge attribute names match the list in model.attributes.edges.
    edges:
      edge_length:
        _target_: anemoi.graphs.edges.attributes.EdgeLength
        norm: unit-std
      edge_dirs:
        _target_: anemoi.graphs.edges.attributes.EdgeDirection
        norm: unit-std
  post_processors: []
# Model architecture settings.
model:
  # Reused by the processor, encoder and decoder through ${model.activation}.
  activation: GELU
  num_channels: 1024
  # Reused by the processor, encoder and decoder through ${model.cpu_offload}.
  cpu_offload: false
  output_mask: null
  model:
    _target_: anemoi.models.models.encoder_processor_decoder.AnemoiModelEncProcDec
  # Layer classes per component. Every entry sets `_partial_: true`, Hydra's
  # flag for returning a partially-applied constructor rather than an instance.
  layer_kernels:
    # Only the processor defines QueryNorm and KeyNorm entries.
    processor:
      LayerNorm:
        _target_: torch.nn.LayerNorm
        _partial_: true
      Linear:
        _target_: torch.nn.Linear
        _partial_: true
      QueryNorm:
        _target_: anemoi.models.layers.normalization.AutocastLayerNorm
        _partial_: true
        bias: false
      KeyNorm:
        _target_: anemoi.models.layers.normalization.AutocastLayerNorm
        _partial_: true
        bias: false
    encoder:
      LayerNorm:
        _target_: torch.nn.LayerNorm
        _partial_: true
      Linear:
        _target_: torch.nn.Linear
        _partial_: true
    decoder:
      LayerNorm:
        _target_: torch.nn.LayerNorm
        _partial_: true
      Linear:
        _target_: torch.nn.Linear
        _partial_: true
  # Processor component (TransformerProcessor).
  processor:
    _target_: anemoi.models.layers.processor.TransformerProcessor
    activation: ${model.activation}
    num_layers: 16
    num_chunks: 2
    mlp_hidden_ratio: 4
    num_heads: 16
    window_size: 1120
    dropout_p: 0.0
    attention_implementation: flash_attention
    # qk_norm is false here although layer_kernels.processor defines
    # QueryNorm/KeyNorm.
    # NOTE(review): presumably those kernels are unused while this is false -
    # confirm.
    qk_norm: false
    softcap: 0.0
    use_alibi_slopes: false
    cpu_offload: ${model.cpu_offload}
  # Encoder component (GraphTransformerForwardMapper).
  encoder:
    _target_: anemoi.models.layers.mapper.GraphTransformerForwardMapper
    trainable_size: ${model.trainable_parameters.data2hidden}
    sub_graph_edge_attributes: ${model.attributes.edges}
    activation: ${model.activation}
    num_chunks: 1
    mlp_hidden_ratio: 4
    num_heads: 16
    qk_norm: false
    cpu_offload: ${model.cpu_offload}
  # Decoder component (GraphTransformerBackwardMapper). Same settings as the
  # encoder apart from the trainable size key and
  # initialise_data_extractor_zero.
  decoder:
    _target_: anemoi.models.layers.mapper.GraphTransformerBackwardMapper
    trainable_size: ${model.trainable_parameters.hidden2data}
    sub_graph_edge_attributes: ${model.attributes.edges}
    activation: ${model.activation}
    num_chunks: 1
    mlp_hidden_ratio: 4
    num_heads: 16
    initialise_data_extractor_zero: false
    qk_norm: false
    cpu_offload: ${model.cpu_offload}
  # Trainable sizes; data2hidden and hidden2data feed the encoder and decoder
  # above.
  trainable_parameters:
    data: 8
    hidden: 8
    data2hidden: 8
    hidden2data: 8
  attributes:
    # Names match the keys defined under graph.attributes.edges.
    edges:
    - edge_length
    - edge_dirs
    nodes: []
  # Output bounding layers: one list entry per bounding class, each applied to
  # the variables it lists. Each variable appears at most once per entry.
  bounding:
  # NOTE(review): presumably clamps these variables to be non-negative -
  # confirm against the ReluBounding implementation.
  - _target_: anemoi.models.layers.bounding.ReluBounding
    variables:
    - tp
    - ro
    - tcw
    - ssrd
    - q_50
    - q_100
    - q_150
    - q_200
    - q_250
    - q_300
    - q_400
    - q_500
    - q_600
    - q_700
    - q_850
    - q_925
    - q_1000
  # Bounds given as min_val / max_val (0 and 1).
  - _target_: anemoi.models.layers.bounding.HardtanhBounding
    variables:
    - tcc
    - swvl1
    - swvl2
    min_val: 0
    max_val: 1
  # cp and sf are bounded relative to total_var tp.
  # NOTE(review): presumably as a fraction in [min_val, max_val] of tp - confirm.
  - _target_: anemoi.models.layers.bounding.FractionBounding
    variables:
    - cp
    - sf
    min_val: 0
    max_val: 1
    total_var: tp
  # lcc, mcc and hcc are bounded relative to total_var tcc, which is itself
  # bounded by the HardtanhBounding entry above.
  - _target_: anemoi.models.layers.bounding.FractionBounding
    variables:
    - lcc
    - mcc
    - hcc
    min_val: 0
    max_val: 1
    total_var: tcc
# Training run settings: precision, optimiser, losses, rollout and learning
# rate.
training:
  run_id: null
  fork_run_id: null
  transfer_learning: false
  load_weights_only: false
  deterministic: false
  precision: 16-mixed
  multistep_input: 2
  accum_grad_batches: 1
  num_sanity_val_steps: 6
  gradient_clip:
    val: 32.0
    algorithm: value
  swa:
    enabled: false
    lr: 0.0001
  optimizer:
    zero: false
    kwargs:
      betas:
      - 0.9
      - 0.95
  model_task: anemoi.training.train.forecaster.GraphForecaster
  # Both values resolve to hardware.num_gpus_per_model (4 in this file),
  # read_group_size doing so by way of dataloader.read_group_size.
  strategy:
    _target_: anemoi.training.distributed.strategy.DDPGroupStrategy
    num_gpus_per_model: ${hardware.num_gpus_per_model}
    read_group_size: ${dataloader.read_group_size}
  loss_gradient_scaling: false
  # Training loss: WeightedMSELoss with two scalars and NaNs not ignored.
  training_loss:
    _target_: anemoi.training.losses.mse.WeightedMSELoss
    scalars:
    - variable
    - loss_weights_mask
    ignore_nans: false
  # Validation uses the same loss class, with no scalars and NaNs ignored.
  validation_metrics:
  - _target_: anemoi.training.losses.mse.WeightedMSELoss
    scalars: []
    ignore_nans: true
  scale_validation_metrics:
    scalars_to_apply:
    - variable
    metrics:
    - all
  # start and max are both 1 and epoch_increment is 0.
  # NOTE(review): presumably this keeps rollout fixed at a single step - confirm.
  rollout:
    start: 1
    epoch_increment: 0
    max: 1
  # No epoch limit is set; the run is bounded by max_steps.
  max_epochs: null
  max_steps: 260000
  lr:
    warmup: 1000
    rate: 3.125e-05
    # Same value as max_steps above; written as a literal, so the two must be
    # kept in sync by hand.
    iterations: 260000
    min: 3.0e-07
  # Per-variable loss weights. Variables without an entry take `default`.
  # NOTE(review): for example 2t has no entry under sfc, so presumably gets the
  # default of 1 - confirm.
  variable_loss_scaling:
    default: 1
    # Weights keyed by pressure-level variable name.
    pl:
      q: 0.6
      t: 6
      u: 0.8
      v: 0.5
      w: 0.001
      z: 12
    # Weights keyed by surface variable name. Keys such as 10u, 100u and 2d
    # are not valid YAML numbers, so they load as strings.
    sfc:
      sp: 10
      10u: 0.5
      10v: 0.5
      100u: 0.1
      100v: 0.1
      2d: 0.5
      tp: 0.025
      cp: 0.0025
      ro: 0.0025
      sf: 0.025
      tcc: 0.1
      mcc: 0.1
      lcc: 0.1
      hcc: 0.1
      swvl2: 2
      swvl1: 1
      stl2: 10
      stl1: 1
      ssrd: 0.05
      strd: 0.1
  # Variables tracked as metrics.
  metrics:
  - z_500
  - t_850
  - u_850
  - v_850
  pressure_level_scaler:
    _target_: anemoi.training.data.scaling.ReluPressureLevelScaler
    minimum: 0.2
    slope: 0.001
  # Node loss weights are read from the "area_weight" attribute (defined under
  # graph.attributes.nodes) on the node set named by ${graph.data}.
  node_loss_weights:
    _target_: anemoi.training.losses.nodeweights.GraphNodeAttribute
    target_nodes: ${graph.data}
    node_attribute: area_weight
  # No submodules are frozen.
  submodules_to_freeze: []