# aifs-single-1.1 / config_finetuning.yaml
#
# Upload 2 files
#
# 598f35c verified about 1 month ago
#
# 12.2 kB

	data:
	format: zarr
	resolution: n320
	frequency: 6h
	timestep: 6h
	forcing:
	- cos_latitude
	- cos_longitude
	- sin_latitude
	- sin_longitude
	- cos_julian_day
	- cos_local_time
	- sin_julian_day
	- sin_local_time
	- insolation
	- lsm
	- sdor
	- slor
	- z
	diagnostic:
	- tp
	- cp
	- sf
	- tcc
	- hcc
	- lcc
	- mcc
	- ro
	- ssrd
	- strd
	- 100u
	- 100v
	remapped: null
	normalizer:
	default: mean-std
	remap:
	cp: tp
	sf: tp
	std:
	- tp
	- cp
	- sf
	- ro
	- tcw
	- ssrd
	- q_50
	- q_100
	- q_150
	- q_200
	- q_250
	- q_300
	- q_400
	- q_500
	- q_600
	- q_700
	- q_850
	- q_925
	- q_1000
	min-max: null
	max:
	- sdor
	- slor
	- z
	none:
	- cos_latitude
	- cos_longitude
	- sin_latitude
	- sin_longitude
	- cos_julian_day
	- cos_local_time
	- sin_julian_day
	- sin_local_time
	- insolation
	- lsm
	- tcc
	- mcc
	- hcc
	- lcc
	- swvl1
	- swvl2
	imputer:
	default: none
	minimum:
	- swvl1
	- swvl2
	- ro
	mean:
	- stl1
	- stl2
	remapper:
	default: none
	processors:
	imputer:
	_target_: anemoi.models.preprocessing.imputer.InputImputer
	_convert_: all
	config: ${data.imputer}
	normalizer:
	_target_: anemoi.models.preprocessing.normalizer.InputNormalizer
	config: ${data.normalizer}
	num_features: null
	dataloader:
	prefetch_factor: 2
	pin_memory: true
	read_group_size: ${hardware.num_gpus_per_model}
	num_workers:
	training: 8
	validation: 8
	test: 1
	predict: 1
	batch_size:
	training: 1
	validation: 1
	test: 4
	predict: 4
	limit_batches:
	training: 1000
	validation: 10
	test: 20
	predict: 20
	grid_indices:
	_target_: anemoi.training.data.grid_indices.FullGrid
	nodes_name: ${graph.data}
	dataset: ${hardware.paths.data}/${hardware.files.dataset}
	training:
	dataset:
	- dataset: ${hardware.paths.data}/${hardware.files.dataset}
	start: null
	end: 2022
	frequency: ${data.frequency}
	start: null
	end: 2022
	drop: []
	validation:
	dataset:
	- dataset: ${hardware.paths.data}/${hardware.files.dataset}
	start: 2022
	end: 2024
	frequency: ${data.frequency}
	start: 2022
	end: 2024
	drop: []
	test:
	dataset:
	- dataset: ${hardware.paths.data}/${hardware.files.dataset}
	start: 2022
	end: null
	frequency: ${data.frequency}
	start: 2022
	end: null
	drop: []
	diagnostics:
	plot:
	asynchronous: true
	datashader: true
	frequency:
	batch: 750
	epoch: 5
	parameters:
	- z_500
	- t_850
	- u_850
	- v_850
	- 2t
	- 10u
	- 10v
	- sp
	- tp
	- cp
	sample_idx: 0
	precip_and_related_fields:
	- tp
	- cp
	colormaps:
	default:
	_target_: anemoi.training.utils.custom_colormaps.MatplotlibColormap
	name: viridis
	error:
	_target_: anemoi.training.utils.custom_colormaps.MatplotlibColormap
	name: bwr
	precip:
	_target_: anemoi.training.utils.custom_colormaps.MatplotlibColormapClevels
	clevels:
	- '#ffffff'
	- '#04e9e7'
	- '#019ff4'
	- '#0300f4'
	- '#02fd02'
	- '#01c501'
	- '#008e00'
	- '#fdf802'
	- '#e5bc00'
	- '#fd9500'
	- '#fd0000'
	- '#d40000'
	- '#bc0000'
	- '#f800fd'
	variables: ${diagnostics.plot.precip_and_related_fields}
	callbacks: []
	callbacks: []
	benchmark_profiler:
	memory:
	enabled: true
	steps: 5
	warmup: 2
	extra_plots: false
	trace_rank0_only: false
	time:
	enabled: true
	verbose: false
	speed:
	enabled: true
	system:
	enabled: true
	model_summary:
	enabled: true
	snapshot:
	enabled: true
	steps: 4
	warmup: 0
	debug:
	anomaly_detection: false
	profiler: false
	enable_checkpointing: true
	checkpoint:
	every_n_minutes:
	save_frequency: 30
	num_models_saved: 3
	every_n_epochs:
	save_frequency: 1
	num_models_saved: -1
	every_n_train_steps:
	save_frequency: null
	num_models_saved: 0
	log:
	wandb:
	enabled: false
	offline: false
	log_model: false
	project: Anemoi
	entity: ???
	gradients: false
	parameters: false
	tensorboard:
	enabled: false
	mlflow:
	enabled: false
	offline: false
	authentication: false
	log_model: false
	tracking_uri: ???
	experiment_name: ???
	project_name: ???
	system: true
	terminal: true
	run_name: null
	on_resume_create_child: true
	expand_hyperparams:
	- config
	http_max_retries: 35
	interval: 100
	enable_progress_bar: true
	print_memory_summary: false
	hardware:
	paths:
	data: ${oc.decode:${oc.env:DATASETS_PATH}}
	output: ${oc.decode:${oc.env:OUTPUT_PATH}}
	logs:
	base: ${hardware.paths.output}logs/
	wandb: ${hardware.paths.logs.base}
	mlflow: ${hardware.paths.logs.base}mlflow/
	tensorboard: ${hardware.paths.logs.base}tensorboard/
	checkpoints: ${hardware.paths.output}checkpoint/
	plots: ${hardware.paths.output}plots/
	profiler: ${hardware.paths.output}profiler/
	graph: ${hardware.paths.output}graphs/
	files:
	dataset: aifs-ea-an-oper-0001-mars-${data.resolution}-1979-2024-6h-v1-aifs-single-v1.zarr
	graph: graph_enc_proc_dec_${data.resolution}.pt
	checkpoint:
	every_n_epochs: aifs-by_epoch-epoch_{epoch:03d}-val_wmse_{val_wmse:.3e}
	every_n_train_steps: aifs-by_step-epoch_{epoch:03d}-step_{step:06d}
	every_n_minutes: aifs-by_time-epoch_{epoch:03d}-step_{step:06d}
	warm_start: null
	accelerator: auto
	num_gpus_per_node: 4
	num_nodes: 16
	num_gpus_per_model: 4
	graph:
	overwrite: true
	data: data
	hidden: hidden
	nodes:
	data:
	node_builder:
	_target_: anemoi.graphs.nodes.ZarrDatasetNodes
	dataset: ${dataloader.dataset}
	attributes: ${graph.attributes.nodes}
	hidden:
	node_builder:
	_target_: anemoi.graphs.nodes.ReducedGaussianGridNodes
	grid: o96
	edges:
	- source_name: ${graph.data}
	target_name: ${graph.hidden}
	edge_builders:
	- _target_: anemoi.graphs.edges.CutOffEdges
	cutoff_factor: 0.6
	source_mask_attr_name: null
	target_mask_attr_name: null
	attributes: ${graph.attributes.edges}
	- source_name: ${graph.hidden}
	target_name: ${graph.data}
	edge_builders:
	- _target_: anemoi.graphs.edges.KNNEdges
	num_nearest_neighbours: 3
	source_mask_attr_name: null
	target_mask_attr_name: null
	attributes: ${graph.attributes.edges}
	attributes:
	nodes:
	area_weight:
	_target_: anemoi.graphs.nodes.attributes.SphericalAreaWeights
	norm: unit-max
	fill_value: 0
	edges:
	edge_length:
	_target_: anemoi.graphs.edges.attributes.EdgeLength
	norm: unit-std
	edge_dirs:
	_target_: anemoi.graphs.edges.attributes.EdgeDirection
	norm: unit-std
	post_processors: []
	model:
	activation: GELU
	num_channels: 1024
	cpu_offload: false
	output_mask: null
	model:
	_target_: anemoi.models.models.encoder_processor_decoder.AnemoiModelEncProcDec
	layer_kernels:
	processor:
	LayerNorm:
	_target_: torch.nn.LayerNorm
	_partial_: true
	Linear:
	_target_: torch.nn.Linear
	_partial_: true
	QueryNorm:
	_target_: anemoi.models.layers.normalization.AutocastLayerNorm
	_partial_: true
	bias: false
	KeyNorm:
	_target_: anemoi.models.layers.normalization.AutocastLayerNorm
	_partial_: true
	bias: false
	encoder:
	LayerNorm:
	_target_: torch.nn.LayerNorm
	_partial_: true
	Linear:
	_target_: torch.nn.Linear
	_partial_: true
	decoder:
	LayerNorm:
	_target_: torch.nn.LayerNorm
	_partial_: true
	Linear:
	_target_: torch.nn.Linear
	_partial_: true
	processor:
	_target_: anemoi.models.layers.processor.TransformerProcessor
	activation: ${model.activation}
	num_layers: 16
	num_chunks: 2
	mlp_hidden_ratio: 4
	num_heads: 16
	window_size: 1120
	dropout_p: 0.0
	attention_implementation: flash_attention
	qk_norm: false
	softcap: 0.0
	use_alibi_slopes: false
	cpu_offload: ${model.cpu_offload}
	encoder:
	_target_: anemoi.models.layers.mapper.GraphTransformerForwardMapper
	trainable_size: ${model.trainable_parameters.data2hidden}
	sub_graph_edge_attributes: ${model.attributes.edges}
	activation: ${model.activation}
	num_chunks: 1
	mlp_hidden_ratio: 4
	num_heads: 16
	qk_norm: false
	cpu_offload: ${model.cpu_offload}
	decoder:
	_target_: anemoi.models.layers.mapper.GraphTransformerBackwardMapper
	trainable_size: ${model.trainable_parameters.hidden2data}
	sub_graph_edge_attributes: ${model.attributes.edges}
	activation: ${model.activation}
	num_chunks: 1
	mlp_hidden_ratio: 4
	num_heads: 16
	initialise_data_extractor_zero: false
	qk_norm: false
	cpu_offload: ${model.cpu_offload}
	trainable_parameters:
	data: 8
	hidden: 8
	data2hidden: 8
	hidden2data: 8
	attributes:
	edges:
	- edge_length
	- edge_dirs
	nodes: []
	bounding:
	- _target_: anemoi.models.layers.bounding.ReluBounding
	variables:
	- tp
	- ro
	- tcw
	- ssrd
	- ro
	- q_50
	- q_100
	- q_150
	- q_200
	- q_250
	- q_300
	- q_400
	- q_500
	- q_600
	- q_700
	- q_850
	- q_925
	- q_1000
	- _target_: anemoi.models.layers.bounding.HardtanhBounding
	variables:
	- tcc
	- swvl1
	- swvl2
	min_val: 0
	max_val: 1
	- _target_: anemoi.models.layers.bounding.FractionBounding
	variables:
	- cp
	- sf
	min_val: 0
	max_val: 1
	total_var: tp
	- _target_: anemoi.models.layers.bounding.FractionBounding
	variables:
	- lcc
	- mcc
	- hcc
	min_val: 0
	max_val: 1
	total_var: tcc
	training:
	run_id: null
	fork_run_id: ${oc.decode:${oc.env:PRETRAINING_RUN_ID}}
	transfer_learning: false
	load_weights_only: true
	deterministic: false
	precision: 16-mixed
	multistep_input: 2
	accum_grad_batches: 1
	num_sanity_val_steps: 6
	gradient_clip:
	val: 32.0
	algorithm: value
	swa:
	enabled: false
	lr: 0.0001
	optimizer:
	zero: false
	kwargs:
	betas:
	- 0.9
	- 0.95
	model_task: anemoi.training.train.forecaster.GraphForecaster
	strategy:
	_target_: anemoi.training.distributed.strategy.DDPGroupStrategy
	num_gpus_per_model: ${hardware.num_gpus_per_model}
	read_group_size: ${dataloader.read_group_size}
	loss_gradient_scaling: false
	training_loss:
	_target_: anemoi.training.losses.mse.WeightedMSELoss
	scalars:
	- variable
	- loss_weights_mask
	ignore_nans: false
	validation_metrics:
	- _target_: anemoi.training.losses.mse.WeightedMSELoss
	scalars: []
	ignore_nans: true
	scale_validation_metrics:
	scalars_to_apply:
	- variable
	metrics:
	- all
	rollout:
	start: 1
	epoch_increment: 1
	max: 12
	max_epochs: 13
	max_steps: 150000
	lr:
	warmup: 1000
	rate: 8.0e-07
	iterations: 7900
	min: 3.0e-07
	warmup_t: 100
	variable_loss_scaling:
	default: 1
	pl:
	q: 0.6
	t: 6
	u: 0.8
	v: 0.5
	w: 0.001
	z: 12
	sfc:
	sp: 10
	10u: 0.5
	10v: 0.5
	100u: 0.1
	100v: 0.1
	2d: 0.5
	tp: 0.025
	cp: 0.0025
	ro: 0.0025
	sf: 0.025
	tcc: 0.1
	mcc: 0.1
	lcc: 0.1
	hcc: 0.1
	swvl2: 2
	swvl1: 1
	stl2: 10
	stl1: 1
	ssrd: 0.05
	strd: 0.1
	metrics:
	- z_500
	- t_850
	- u_850
	- v_850
	pressure_level_scaler:
	_target_: anemoi.training.data.scaling.ReluPressureLevelScaler
	minimum: 0.2
	slope: 0.001
	node_loss_weights:
	_target_: anemoi.training.losses.nodeweights.GraphNodeAttribute
	target_nodes: ${graph.data}
	node_attribute: area_weight
	submodules_to_freeze: []