# Graph Machine Learning
# Anemoi
# English
# aifs-single-1.1 / config_finetuning.yaml
# anaprietonem's picture
# Upload 2 files
# 598f35c verified
# Dataset description and preprocessing settings.
# Other sections read these values through ${data.*} interpolations
# (e.g. ${data.resolution}, ${data.frequency}, ${data.normalizer}).
data:
  format: zarr
  resolution: n320
  frequency: 6h
  timestep: 6h

  # Variables listed as forcings.
  forcing:
    - cos_latitude
    - cos_longitude
    - sin_latitude
    - sin_longitude
    - cos_julian_day
    - cos_local_time
    - sin_julian_day
    - sin_local_time
    - insolation
    - lsm
    - sdor
    - slor
    - z

  # Variables listed as diagnostics.
  diagnostic:
    - tp
    - cp
    - sf
    - tcc
    - hcc
    - lcc
    - mcc
    - ro
    - ssrd
    - strd
    - 100u
    - 100v

  remapped: null

  # Per-variable normalisation method; variables not listed under a method
  # fall back to `default`.
  normalizer:
    default: mean-std
    # NOTE(review): presumably cp and sf reuse the statistics of tp — confirm
    # against the InputNormalizer implementation.
    remap:
      cp: tp
      sf: tp
    std:
      - tp
      - cp
      - sf
      - ro
      - tcw
      - ssrd
      - q_50
      - q_100
      - q_150
      - q_200
      - q_250
      - q_300
      - q_400
      - q_500
      - q_600
      - q_700
      - q_850
      - q_925
      - q_1000
    min-max: null
    max:
      - sdor
      - slor
      - z
    # `none` is a plain string key here, not a YAML null.
    none:
      - cos_latitude
      - cos_longitude
      - sin_latitude
      - sin_longitude
      - cos_julian_day
      - cos_local_time
      - sin_julian_day
      - sin_local_time
      - insolation
      - lsm
      - tcc
      - mcc
      - hcc
      - lcc
      - swvl1
      - swvl2

  # Per-variable imputation method for the InputImputer processor below.
  imputer:
    default: none
    minimum:
      - swvl1
      - swvl2
      - ro
    mean:
      - stl1
      - stl2

  remapper:
    default: none

  # Preprocessor objects, instantiated from their `_target_` class paths.
  # Each one receives its settings from the matching mapping above.
  processors:
    imputer:
      _target_: anemoi.models.preprocessing.imputer.InputImputer
      _convert_: all
      config: ${data.imputer}
    normalizer:
      _target_: anemoi.models.preprocessing.normalizer.InputNormalizer
      config: ${data.normalizer}

  num_features: null
# Data-loading settings: worker/batch sizes per stage and the dataset
# selection used for the training, validation and test splits.
dataloader:
  prefetch_factor: 2
  pin_memory: true
  read_group_size: ${hardware.num_gpus_per_model}

  num_workers:
    training: 8
    validation: 8
    test: 1
    predict: 1

  batch_size:
    training: 1
    validation: 1
    test: 4
    predict: 4

  limit_batches:
    training: 1000
    validation: 10
    test: 20
    predict: 20

  grid_indices:
    _target_: anemoi.training.data.grid_indices.FullGrid
    nodes_name: ${graph.data}

  # Full dataset path; also referenced by the graph section as
  # ${dataloader.dataset}.
  dataset: ${hardware.paths.data}/${hardware.files.dataset}

  # NOTE(review): validation (2022-2024) and test (2022 onward, open-ended)
  # select overlapping years, and training ends on the same year (2022) that
  # validation/test start on. If the dataset loader treats `start`/`end` as
  # inclusive, training also overlaps them — confirm this is intended.
  training:
    dataset:
      - dataset: ${hardware.paths.data}/${hardware.files.dataset}
        start: null
        end: 2022
        frequency: ${data.frequency}
    start: null
    end: 2022
    drop: []

  validation:
    dataset:
      - dataset: ${hardware.paths.data}/${hardware.files.dataset}
        start: 2022
        end: 2024
        frequency: ${data.frequency}
    start: 2022
    end: 2024
    drop: []

  test:
    dataset:
      - dataset: ${hardware.paths.data}/${hardware.files.dataset}
        start: 2022
        end: null
        frequency: ${data.frequency}
    start: 2022
    end: null
    drop: []
# Diagnostics: plotting, profiling, checkpointing and experiment logging.
diagnostics:
  plot:
    asynchronous: true
    datashader: true
    frequency:
      batch: 750
      epoch: 5
    parameters:
      - z_500
      - t_850
      - u_850
      - v_850
      - 2t
      - 10u
      - 10v
      - sp
      - tp
      - cp
    sample_idx: 0
    precip_and_related_fields:
      - tp
      - cp
    colormaps:
      default:
        _target_: anemoi.training.utils.custom_colormaps.MatplotlibColormap
        name: viridis
      error:
        _target_: anemoi.training.utils.custom_colormaps.MatplotlibColormap
        name: bwr
      precip:
        _target_: anemoi.training.utils.custom_colormaps.MatplotlibColormapClevels
        # Colour values are quoted because a leading '#' would otherwise
        # start a YAML comment.
        clevels:
          - '#ffffff'
          - '#04e9e7'
          - '#019ff4'
          - '#0300f4'
          - '#02fd02'
          - '#01c501'
          - '#008e00'
          - '#fdf802'
          - '#e5bc00'
          - '#fd9500'
          - '#fd0000'
          - '#d40000'
          - '#bc0000'
          - '#f800fd'
        variables: ${diagnostics.plot.precip_and_related_fields}
    # Plot callbacks (none configured).
    callbacks: []

  # Non-plot callbacks (none configured). This is a separate key from
  # `plot.callbacks` above.
  callbacks: []

  benchmark_profiler:
    memory:
      enabled: true
      steps: 5
      warmup: 2
      extra_plots: false
      trace_rank0_only: false
    time:
      enabled: true
      verbose: false
    speed:
      enabled: true
    system:
      enabled: true
    model_summary:
      enabled: true
    snapshot:
      enabled: true
      steps: 4
      warmup: 0

  debug:
    anomaly_detection: false

  profiler: false
  enable_checkpointing: true

  checkpoint:
    every_n_minutes:
      save_frequency: 30
      num_models_saved: 3
    every_n_epochs:
      save_frequency: 1
      num_models_saved: -1
    every_n_train_steps:
      save_frequency: null
      num_models_saved: 0

  # `???` is OmegaConf's marker for a mandatory value that must be supplied
  # before the key is accessed.
  log:
    wandb:
      enabled: false
      offline: false
      log_model: false
      project: Anemoi
      entity: ???
      gradients: false
      parameters: false
    tensorboard:
      enabled: false
    mlflow:
      enabled: false
      offline: false
      authentication: false
      log_model: false
      tracking_uri: ???
      experiment_name: ???
      project_name: ???
      system: true
      terminal: true
      run_name: null
      on_resume_create_child: true
      expand_hyperparams:
        - config
      http_max_retries: 35
    interval: 100

  enable_progress_bar: true
  print_memory_summary: false
# Hardware layout and filesystem locations.
hardware:
  # `data` and `output` are read from the DATASETS_PATH and OUTPUT_PATH
  # environment variables via the oc.env / oc.decode resolvers.
  # NOTE(review): sub-paths are appended directly after
  # ${hardware.paths.output} with no separator, so OUTPUT_PATH has to end
  # with '/' for the resulting paths to be well formed.
  paths:
    data: ${oc.decode:${oc.env:DATASETS_PATH}}
    output: ${oc.decode:${oc.env:OUTPUT_PATH}}
    logs:
      base: ${hardware.paths.output}logs/
      wandb: ${hardware.paths.logs.base}
      mlflow: ${hardware.paths.logs.base}mlflow/
      tensorboard: ${hardware.paths.logs.base}tensorboard/
    checkpoints: ${hardware.paths.output}checkpoint/
    plots: ${hardware.paths.output}plots/
    profiler: ${hardware.paths.output}profiler/
    graph: ${hardware.paths.output}graphs/

  files:
    dataset: aifs-ea-an-oper-0001-mars-${data.resolution}-1979-2024-6h-v1-aifs-single-v1.zarr
    graph: graph_enc_proc_dec_${data.resolution}.pt
    # Checkpoint file-name templates; the `{...}` placeholders are left for
    # the consumer to fill in and are not OmegaConf interpolations.
    checkpoint:
      every_n_epochs: aifs-by_epoch-epoch_{epoch:03d}-val_wmse_{val_wmse:.3e}
      every_n_train_steps: aifs-by_step-epoch_{epoch:03d}-step_{step:06d}
      every_n_minutes: aifs-by_time-epoch_{epoch:03d}-step_{step:06d}
    warm_start: null

  accelerator: auto
  num_gpus_per_node: 4
  num_nodes: 16
  # Also drives dataloader.read_group_size and training.strategy through
  # ${hardware.num_gpus_per_model}.
  num_gpus_per_model: 4
# Graph definition: two node sets (`data`, `hidden`) and the edges that
# connect them in each direction.
graph:
  overwrite: true
  # Node-set names; referenced elsewhere as ${graph.data} / ${graph.hidden}.
  data: data
  hidden: hidden

  nodes:
    data:
      node_builder:
        _target_: anemoi.graphs.nodes.ZarrDatasetNodes
        dataset: ${dataloader.dataset}
      attributes: ${graph.attributes.nodes}
    hidden:
      node_builder:
        _target_: anemoi.graphs.nodes.ReducedGaussianGridNodes
        grid: o96

  edges:
    # data -> hidden
    - source_name: ${graph.data}
      target_name: ${graph.hidden}
      edge_builders:
        - _target_: anemoi.graphs.edges.CutOffEdges
          cutoff_factor: 0.6
          source_mask_attr_name: null
          target_mask_attr_name: null
      attributes: ${graph.attributes.edges}
    # hidden -> data
    - source_name: ${graph.hidden}
      target_name: ${graph.data}
      edge_builders:
        - _target_: anemoi.graphs.edges.KNNEdges
          num_nearest_neighbours: 3
          source_mask_attr_name: null
          target_mask_attr_name: null
      attributes: ${graph.attributes.edges}

  # Shared attribute definitions, pulled in above via
  # ${graph.attributes.nodes} and ${graph.attributes.edges}.
  attributes:
    nodes:
      area_weight:
        _target_: anemoi.graphs.nodes.attributes.SphericalAreaWeights
        norm: unit-max
        fill_value: 0
    edges:
      edge_length:
        _target_: anemoi.graphs.edges.attributes.EdgeLength
        norm: unit-std
      edge_dirs:
        _target_: anemoi.graphs.edges.attributes.EdgeDirection
        norm: unit-std

  post_processors: []
# Model architecture: encoder / processor / decoder components plus output
# bounding layers. Shared values (activation, cpu_offload) are reused via
# ${model.*} interpolations.
model:
  activation: GELU
  num_channels: 1024
  cpu_offload: false
  output_mask: null

  model:
    _target_: anemoi.models.models.encoder_processor_decoder.AnemoiModelEncProcDec

  # Layer classes per component. `_partial_: true` makes Hydra return a
  # partially-applied constructor instead of an instance.
  layer_kernels:
    processor:
      LayerNorm:
        _target_: torch.nn.LayerNorm
        _partial_: true
      Linear:
        _target_: torch.nn.Linear
        _partial_: true
      QueryNorm:
        _target_: anemoi.models.layers.normalization.AutocastLayerNorm
        _partial_: true
        bias: false
      KeyNorm:
        _target_: anemoi.models.layers.normalization.AutocastLayerNorm
        _partial_: true
        bias: false
    encoder:
      LayerNorm:
        _target_: torch.nn.LayerNorm
        _partial_: true
      Linear:
        _target_: torch.nn.Linear
        _partial_: true
    decoder:
      LayerNorm:
        _target_: torch.nn.LayerNorm
        _partial_: true
      Linear:
        _target_: torch.nn.Linear
        _partial_: true

  processor:
    _target_: anemoi.models.layers.processor.TransformerProcessor
    activation: ${model.activation}
    num_layers: 16
    num_chunks: 2
    mlp_hidden_ratio: 4
    num_heads: 16
    window_size: 1120
    dropout_p: 0.0
    attention_implementation: flash_attention
    qk_norm: false
    softcap: 0.0
    use_alibi_slopes: false
    cpu_offload: ${model.cpu_offload}

  encoder:
    _target_: anemoi.models.layers.mapper.GraphTransformerForwardMapper
    trainable_size: ${model.trainable_parameters.data2hidden}
    sub_graph_edge_attributes: ${model.attributes.edges}
    activation: ${model.activation}
    num_chunks: 1
    mlp_hidden_ratio: 4
    num_heads: 16
    qk_norm: false
    cpu_offload: ${model.cpu_offload}

  decoder:
    _target_: anemoi.models.layers.mapper.GraphTransformerBackwardMapper
    trainable_size: ${model.trainable_parameters.hidden2data}
    sub_graph_edge_attributes: ${model.attributes.edges}
    activation: ${model.activation}
    num_chunks: 1
    mlp_hidden_ratio: 4
    num_heads: 16
    initialise_data_extractor_zero: false
    qk_norm: false
    cpu_offload: ${model.cpu_offload}

  trainable_parameters:
    data: 8
    hidden: 8
    data2hidden: 8
    hidden2data: 8

  attributes:
    edges:
      - edge_length
      - edge_dirs
    nodes: []

  # Output bounding layers, applied to the listed variables.
  bounding:
    # Each variable appears once in this list.
    - _target_: anemoi.models.layers.bounding.ReluBounding
      variables:
        - tp
        - ro
        - tcw
        - ssrd
        - q_50
        - q_100
        - q_150
        - q_200
        - q_250
        - q_300
        - q_400
        - q_500
        - q_600
        - q_700
        - q_850
        - q_925
        - q_1000
    - _target_: anemoi.models.layers.bounding.HardtanhBounding
      variables:
        - tcc
        - swvl1
        - swvl2
      min_val: 0
      max_val: 1
    # NOTE(review): presumably bounds cp and sf as fractions of `total_var`
    # (tp) — confirm against FractionBounding.
    - _target_: anemoi.models.layers.bounding.FractionBounding
      variables:
        - cp
        - sf
      min_val: 0
      max_val: 1
      total_var: tp
    - _target_: anemoi.models.layers.bounding.FractionBounding
      variables:
        - lcc
        - mcc
        - hcc
      min_val: 0
      max_val: 1
      total_var: tcc
# Training run settings: run identity, precision, optimiser, losses,
# rollout schedule, learning rate and per-variable loss weights.
training:
  run_id: null
  # The run to fork from is read from the PRETRAINING_RUN_ID environment
  # variable.
  fork_run_id: ${oc.decode:${oc.env:PRETRAINING_RUN_ID}}
  transfer_learning: false
  load_weights_only: true
  deterministic: false
  precision: 16-mixed
  multistep_input: 2
  accum_grad_batches: 1
  num_sanity_val_steps: 6

  gradient_clip:
    val: 32.0
    algorithm: value

  swa:
    enabled: false
    lr: 0.0001

  optimizer:
    zero: false
    kwargs:
      betas:
        - 0.9
        - 0.95

  model_task: anemoi.training.train.forecaster.GraphForecaster

  strategy:
    _target_: anemoi.training.distributed.strategy.DDPGroupStrategy
    num_gpus_per_model: ${hardware.num_gpus_per_model}
    read_group_size: ${dataloader.read_group_size}

  loss_gradient_scaling: false

  training_loss:
    _target_: anemoi.training.losses.mse.WeightedMSELoss
    scalars:
      - variable
      - loss_weights_mask
    ignore_nans: false

  validation_metrics:
    - _target_: anemoi.training.losses.mse.WeightedMSELoss
      scalars: []
      ignore_nans: true

  scale_validation_metrics:
    scalars_to_apply:
      - variable
    metrics:
      - all

  # NOTE(review): with start 1 and increment 1 per epoch, the value 12 is
  # presumably reached within the 13 epochs set by max_epochs — confirm the
  # schedule semantics.
  rollout:
    start: 1
    epoch_increment: 1
    max: 12

  max_epochs: 13
  max_steps: 150000

  # NOTE(review): both `warmup` and `warmup_t` are set; confirm which one the
  # scheduler actually reads.
  lr:
    warmup: 1000
    rate: 8.0e-07
    iterations: 7900
    min: 3.0e-07
    warmup_t: 100

  # Loss weight per variable; variables not listed use `default`.
  variable_loss_scaling:
    default: 1
    pl:
      q: 0.6
      t: 6
      u: 0.8
      v: 0.5
      w: 0.001
      z: 12
    sfc:
      sp: 10
      10u: 0.5
      10v: 0.5
      100u: 0.1
      100v: 0.1
      2d: 0.5
      tp: 0.025
      cp: 0.0025
      ro: 0.0025
      sf: 0.025
      tcc: 0.1
      mcc: 0.1
      lcc: 0.1
      hcc: 0.1
      swvl2: 2
      swvl1: 1
      stl2: 10
      stl1: 1
      ssrd: 0.05
      strd: 0.1

  metrics:
    - z_500
    - t_850
    - u_850
    - v_850

  pressure_level_scaler:
    _target_: anemoi.training.data.scaling.ReluPressureLevelScaler
    minimum: 0.2
    slope: 0.001

  # Uses the `area_weight` node attribute defined in the graph section.
  node_loss_weights:
    _target_: anemoi.training.losses.nodeweights.GraphNodeAttribute
    target_nodes: ${graph.data}
    node_attribute: area_weight

  submodules_to_freeze: []