Draft
Changes from all commits
52 commits
a0039ec
Log gradient norms
sophie-xhonneux Aug 6, 2025
e83903b
Prototype for recording grad norms
sophie-xhonneux Aug 6, 2025
d2995b4
Address review changes + hide behind feature flag
sophie-xhonneux Aug 7, 2025
26c6869
Final fixes including backward compatibility
sophie-xhonneux Aug 7, 2025
66da0d7
Merge branch 'develop' into sophiex/dev/log-grad-norms
sophie-xhonneux Aug 7, 2025
9a66f72
Ruff
sophie-xhonneux Aug 7, 2025
22a6fd7
More ruff stuff
sophie-xhonneux Aug 7, 2025
a1d7a27
Update to develop, prepare for new experiment series
MatKbauer Aug 12, 2025
128aeb1
Merge branch 'develop' of github.com:ecmwf/WeatherGenerator into mk/d…
MatKbauer Aug 12, 2025
d4be568
Merge branch 'develop' of github.com:ecmwf/WeatherGenerator into mk/d…
MatKbauer Aug 15, 2025
6504fc7
Rebase to latest develop
MatKbauer Sep 3, 2025
4f62e1a
Merge branch 'develop' of github.com:ecmwf/WeatherGenerator into mk/d…
MatKbauer Sep 8, 2025
87e7d3b
Merge branch 'develop' into jk/log-grad-norms/log-grad-norms
Jubeku Oct 7, 2025
754d31c
forecast config with small decoder
Jubeku Oct 8, 2025
cd7948f
Merge branch 'develop' into jk/log-grad-norms/log-grad-norms
Jubeku Oct 9, 2025
7c756a3
fixed uv.lock
Jubeku Oct 9, 2025
41716a6
test gradient logging on multi gpus
Jubeku Oct 9, 2025
b5ce171
Update branch to latest develop with configured o48 settings
Oct 10, 2025
c12e190
Setting o48 as default in era5 config
Oct 10, 2025
d95277e
Updated default config to 256 dim latent size
MatKbauer Oct 10, 2025
a734471
Update branch to latest develop
MatKbauer Oct 13, 2025
3ae99dd
Merge branch 'develop' of github.com:ecmwf/WeatherGenerator into mk/d…
MatKbauer Oct 13, 2025
eba89a6
Change epochs from 64 to 32
MatKbauer Oct 13, 2025
5615634
LayerNorm replication and analysis tools
MatKbauer Nov 10, 2025
9ccc95e
Rename fe_layer_norm_at_layers to fe_layer_norm_after_blocks
MatKbauer Nov 10, 2025
240031d
Increase epochs from 32 to 64 and resolve minor bug
MatKbauer Nov 13, 2025
75b81fe
Merged gradient logging
MatKbauer Nov 15, 2025
f65ac37
Update to develop
MatKbauer Nov 17, 2025
20ae505
Update default_config back to d2048 on the O96 grid
MatKbauer Nov 17, 2025
2731d29
Update ERA5 stream to O96 grid
MatKbauer Nov 17, 2025
912c406
Update to latest develop having mini-epoch notation
MatKbauer Nov 17, 2025
028bb98
Resolving bug after merging with develop and updating default_config
MatKbauer Nov 18, 2025
ba84066
Enable loading old model checkpoints after recent merges
MatKbauer Nov 19, 2025
4f00cc6
Update WeatherGenReader with mini-epoch notation
MatKbauer Nov 19, 2025
e44e139
Minor modifications to latent histogram plotting
MatKbauer Nov 20, 2025
c979ab4
Resolve bug in histogram plotting
MatKbauer Nov 21, 2025
d24c4b6
Replace getattr by cf.get
MatKbauer Nov 21, 2025
89670bf
Change target read-out engine from 1 to 2 layers
MatKbauer Nov 24, 2025
58474b2
Set aux-info for fe-blocks to none
MatKbauer Nov 28, 2025
184dcd9
fix a plotting bug (#1453)
SavvasMel Dec 12, 2025
d3b63d2
Update train/val dates, HL=5, fsteps=2, lat-weighting
MatKbauer Dec 12, 2025
a584e41
Defined base config for parameter search
MatKbauer Dec 19, 2025
d7e75eb
Increase encoder/decoder size and add mlflow tags
MatKbauer Dec 20, 2025
9bc45c5
Added plot_train content
MatKbauer Dec 22, 2025
40b070e
Adam betas per cli and aifs channel weighting option
MatKbauer Dec 23, 2025
8c6dafb
Updated launch script
MatKbauer Jan 6, 2026
b5c98cb
Updated multiple job launch script with pre-training dropout and nois…
MatKbauer Jan 9, 2026
76cb62c
Updated experiment launch script
MatKbauer Jan 12, 2026
d14291c
added config and scripts for launching multiple runs
ankitpatnala Jan 19, 2026
42b4318
yml and multi scripts
ankitpatnala Jan 27, 2026
892f509
changed eval_config and inference script
ankitpatnala Jan 27, 2026
a44d5a4
added spike function to the fstep weighting
ankitpatnala Jan 29, 2026
config/default_config.yml: 70 changes (51 additions, 19 deletions)
@@ -9,8 +9,8 @@ embed_dropout_rate: 0.1

target_cell_local_prediction: True

-ae_local_dim_embed: 1024
-ae_local_num_blocks: 2
+ae_local_dim_embed: 2048
+ae_local_num_blocks: 0
ae_local_num_heads: 16
ae_local_dropout_rate: 0.1
ae_local_with_qk_lnorm: True
@@ -24,7 +24,7 @@ ae_adapter_with_residual: True
ae_adapter_dropout_rate: 0.1

ae_global_dim_embed: 2048
-ae_global_num_blocks: 8
+ae_global_num_blocks: 4
ae_global_num_heads: 32
ae_global_dropout_rate: 0.1
ae_global_with_qk_lnorm: True
@@ -33,6 +33,7 @@ ae_global_with_qk_lnorm: True
ae_global_att_dense_rate: 1.0
ae_global_block_factor: 64
ae_global_mlp_hidden_factor: 2
+ae_global_trailing_layer_norm: False

decoder_type: PerceiverIOCoordConditioning # CrossAttentionAdaNormConditioning
pred_adapter_kv: False
@@ -42,16 +43,18 @@ pred_mlp_adaln: True

# number of steps offset applied to first target window; if set to zero and forecast_steps=0 then
# one is training an auto-encoder
-forecast_offset : 0
+forecast_offset : 1
forecast_delta_hrs: 0
-forecast_steps: 0
-forecast_policy: null
+forecast_steps: 2
+forecast_policy: "fixed"
forecast_freeze_model: False
forecast_att_dense_rate: 1.0
-fe_num_blocks: 0
+fe_num_blocks: 16
fe_num_heads: 16
fe_dropout_rate: 0.1
fe_with_qk_lnorm: True
-impute_latent_noise_std: 0.0 # 1e-4
+fe_layer_norm_after_blocks: [7] # Index starts at 0. Thus, [3] adds a LayerNorm after the fourth layer
+impute_latent_noise_std: 1e-4

healpix_level: 5
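
The fe_layer_norm_after_blocks flag added above (commits 5615634 and 9ccc95e) places extra LayerNorms inside the forecast engine. Below is a minimal sketch of how such a flag could be consumed, assuming a plain stack of transformer blocks; ForecastEngine and its constructor arguments are hypothetical stand-ins, not WeatherGenerator's actual modules.

import torch
import torch.nn as nn

class ForecastEngine(nn.Module):
    """Hypothetical sketch: with fe_layer_norm_after_blocks: [7], one
    LayerNorm is applied after the eighth of fe_num_blocks=16 blocks
    (indices are 0-based, as the config comment notes)."""

    def __init__(self, dim: int, num_blocks: int, norm_after: list[int]):
        super().__init__()
        self.blocks = nn.ModuleList(
            nn.TransformerEncoderLayer(d_model=dim, nhead=16, batch_first=True)
            for _ in range(num_blocks)
        )
        # One LayerNorm per configured block index.
        self.norms = nn.ModuleDict({str(i): nn.LayerNorm(dim) for i in norm_after})

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        for i, block in enumerate(self.blocks):
            x = block(x)
            if str(i) in self.norms:  # the extra LayerNorm slots in here
                x = self.norms[str(i)](x)
        return x

fe = ForecastEngine(dim=2048, num_blocks=16, norm_after=[7])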

@@ -77,7 +80,12 @@ loss_fcts_val:
  -
    - "mse"
    - 1.0

+timestep_weight: [spike_function,
+                  {"type": "probability",
+                   "values": {4: 0.6,
+                              6: 0.2,
+                              8: 0.1,
+                              10: 0.1}}]
batch_size_per_gpu: 1
batch_size_validation_per_gpu: 1
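
The timestep_weight entry above pairs a spike_function with per-step values that sum to 1.0 (commit a44d5a4 added the spike function to the fstep weighting). Whether these weights sample a rollout length or scale the per-step loss is not visible from this diff; the sketch below assumes sampling, and sample_forecast_step is a hypothetical helper.

import random

# Step -> weight pairs copied from the config above.
SPIKE_VALUES = {4: 0.6, 6: 0.2, 8: 0.1, 10: 0.1}

def sample_forecast_step(values: dict[int, float]) -> int:
    """Draw one forecast step with the configured probabilities."""
    steps, weights = zip(*values.items())
    return random.choices(steps, weights=weights, k=1)[0]

# Roughly 60% of draws land on step 4.
print(sample_forecast_step(SPIKE_VALUES))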

@@ -93,15 +101,15 @@ ema_halflife_in_thousands: 1e-3

# training mode: "forecast" or "masking" (masked token modeling)
# for "masking" to train with auto-encoder mode, forecast_offset should be 0
-training_mode: "masking"
+training_mode: "forecast"
# masking rate when training mode is "masking"; ignored in forecast mode
masking_rate: 0.6
# sample the masking rate (with normal distribution centered at masking_rate)
# note that a sampled masking rate leads to varying requirements
masking_rate_sampling: True
# sample a subset of all target points, useful e.g. to reduce memory requirements (also can specify per-stream)
sampling_rate_target: 1.0
-# include a masking strategy here, currently only supporting "random", "block", "healpix", "channel", "causal" and "combination"
+# include a masking strategy here, currently only supporting "random", "block", "healpix", "channel", "combination"
masking_strategy: "random"
# masking_strategy_config is a dictionary of additional parameters for the masking strategy
# required for "healpix" and "channel" masking strategies
@@ -113,21 +121,23 @@ masking_strategy_config: {"strategies": ["random", "healpix", "channel"],
                          "same_strategy_per_batch": false
                         }
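
With same_strategy_per_batch set to false and masking_rate_sampling enabled, each sample can draw its own strategy and its own masking rate. A minimal sketch of that per-sample draw follows; the width of the normal (0.1) and the clamping bounds are assumptions, since the diff does not show them.

import random

STRATEGIES = ["random", "healpix", "channel"]  # from masking_strategy_config
MASKING_RATE = 0.6

def sample_masking() -> tuple[str, float]:
    """Pick a strategy and a masking rate for one training sample."""
    strategy = random.choice(STRATEGIES)
    # Normal centered at masking_rate (std assumed), clamped to (0, 1).
    rate = min(max(random.gauss(MASKING_RATE, 0.1), 0.05), 0.95)
    return strategy, rate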

-num_mini_epochs: 32
-samples_per_mini_epoch: 4096
+num_epochs: 128
+samples_per_epoch: 4096
samples_per_validation: 512
shuffle: True

lr_scaling_policy: "sqrt"
lr_start: 1e-6
-lr_max: 5e-5
-lr_final_decay: 1e-6
+lr_max: 0.0001
+lr_final_decay: 2e-6
lr_final: 0.0
-lr_steps_warmup: 512
+lr_steps_warmup: 256
lr_steps_cooldown: 512
lr_policy_warmup: "cosine"
lr_policy_decay: "constant"
lr_policy_cooldown: "linear"
+adam_beta1: null # Becomes 0.8 with 2 nodes
+adam_beta2: null # Becomes 0.9 with 2 nodes

grad_clip: 1.0
weight_decay: 0.1
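
The lr_* fields above describe a three-phase schedule: cosine warmup from lr_start to lr_max, a constant phase, and a linear cooldown. The sketch below is one plausible composition of those fields, assuming the constant phase holds lr_max and the cooldown runs from lr_final_decay down to lr_final; the actual schedule lives in WeatherGenerator's trainer and may differ.

import math

def lr_at_step(step: int, total_steps: int) -> float:
    """Hypothetical schedule assembled from the config fields above."""
    lr_start, lr_max = 1e-6, 1e-4
    lr_final_decay, lr_final = 2e-6, 0.0
    warmup, cooldown = 256, 512

    if step < warmup:  # lr_policy_warmup: "cosine"
        t = step / warmup
        return lr_start + (lr_max - lr_start) * 0.5 * (1.0 - math.cos(math.pi * t))
    if step >= total_steps - cooldown:  # lr_policy_cooldown: "linear"
        t = (step - (total_steps - cooldown)) / cooldown
        return lr_final_decay + (lr_final - lr_final_decay) * t
    return lr_max  # lr_policy_decay: "constant"
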
@@ -136,9 +146,9 @@ nn_module: "te"
log_grad_norms: False

start_date: 197901010000
-end_date: 202012310000
-start_date_val: 202101010000
-end_date_val: 202201010000
+end_date: 202212310000
+start_date_val: 202310010000
+end_date_val: 202312310000
len_hrs: 6
step_hrs: 6
input_window_steps: 1
@@ -161,3 +171,25 @@ train_log_freq:
  terminal: 10
  metrics: 20
  checkpoint: 250

+# Tags for experiment tracking
+# These tags will be logged in MLflow along with completed runs for train, eval, val
+# The tags are free-form, with the following rules:
+# - tags should be primitive types (strings, numbers, booleans); NO lists or dictionaries
+# - tags should not duplicate existing config entries
+# - try to reuse existing tags where possible; MLflow does not like having too many unique tags
+# - do not use long strings in values (less than 20 characters is a good rule of thumb; we may enforce this in the future)
+wgtags:
+  # The name of the organization of the person running the experiment.
+  # This may be autofilled in the future. Expected values are lowercase strings of
+  # the organizations' codenames in https://confluence.ecmwf.int/display/MAEL/Staff+Contact+List
+  # e.g. "ecmwf", "cmcc", "metnor", "jsc", "escience"
+  org: None
+  issue: 1495
+  # The name of the experiment. This is a distinctive codename for the experiment campaign being run.
+  # This is expected to be the primary tag for comparing experiments in MLflow.
+  # Expected values are lowercase strings with no spaces, just underscores,
+  # e.g. "rollout_ablation_grid"
+  exp: "rollout_params"
+  # *** Experiment-specific tags ***
+  grid_search: "dropout"
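
Below is a sketch of how a wgtags block like the one above could be forwarded to MLflow with the standard client API; the run name and the resolved org value ("ecmwf") are illustrative placeholders, not values taken from this PR.

import mlflow

# Values from the wgtags block above; "ecmwf" stands in for the unset org.
wgtags = {"org": "ecmwf", "issue": 1495, "exp": "rollout_params",
          "grid_search": "dropout"}

with mlflow.start_run(run_name="rollout_params"):
    # MLflow stores tag values as strings, so keep them primitive and
    # short, as the rules in the comment block require.
    mlflow.set_tags({key: str(value) for key, value in wgtags.items()})
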
config/eval_config.yml: 219 changes (219 additions, 0 deletions)
@@ -0,0 +1,219 @@
global_plotting_options:
  image_format: "png"  # options: "png", "pdf", "svg", "eps", "jpg", ...
  dpi_val: 300
  ERA5:
    marker_size: 4

evaluation:
  metrics: ["froct", "rmse"]
  regions: ["global"]
  summary_plots: true
  summary_dir: "./plots/"
  print_summary: false  # print score values on screen; can be verbose
  log_scale: false
  add_grid: true

run_ids:

  # lr=5e-4
  #xs5l8zmj:
  #  label: "cosine scheduler lr_max=5e-4 v1"
  #  epoch: 0
  #  rank: 0
  #  streams:
  #    ERA5:
  #      channels: ["2t", "10u", "10v", "q_850", "t_850", "u_850", "v_850", "z_500"]
  #      evaluation:
  #        sample: "all"
  #        forecast_step: "all"

  #x9zvml1k:
  #  label: "cosine scheduler lr_max=5e-4 v2"
  #  epoch: 0
  #  rank: 0
  #  streams:
  #    ERA5:
  #      channels: ["2t", "10u", "10v", "q_850", "t_850", "u_850", "v_850", "z_500"]
  #      evaluation:
  #        sample: "all"
  #        forecast_step: "all"

  #an8rap5h:
  #  label: "cosine scheduler lr_max=5e-4 v3"
  #  epoch: 0
  #  rank: 0
  #  streams:
  #    ERA5:
  #      channels: ["2t", "10u", "10v", "q_850", "t_850", "u_850", "v_850", "z_500"]
  #      evaluation:
  #        sample: "all"
  #        forecast_step: "all"

  # lr=1e-4
  #u2qk39pi:
  #  label: "cosine scheduler lr_max=1e-4 v1 epoch=32"
  #  epoch: 0
  #  rank: 0
  #  streams:
  #    ERA5:
  #      channels: ["2t", "10u", "10v", "q_850", "t_850", "u_850", "v_850", "z_500"]
  #      evaluation:
  #        sample: "all"
  #        forecast_step: "all"

  #zswipf53:
  #  label: "cosine scheduler lr_max=1e-4 v2 epoch=32"
  #  epoch: 0
  #  rank: 0
  #  streams:
  #    ERA5:
  #      channels: ["2t", "10u", "10v", "q_850", "t_850", "u_850", "v_850", "z_500"]
  #      evaluation:
  #        sample: "all"
  #        forecast_step: "all"

  #dsdvzg59:
  #  label: "cosine scheduler lr_max=1e-4 v3 epoch=32"
  #  epoch: 0
  #  rank: 0
  #  streams:
  #    ERA5:
  #      channels: ["2t", "10u", "10v", "q_850", "t_850", "u_850", "v_850", "z_500"]
  #      evaluation:
  #        sample: "all"
  #        forecast_step: "all"

  qc5dw7ki:
    label: "cosine scheduler lr_max=1e-4 v1 epoch=48"
    epoch: 0
    rank: 0
    streams:
      ERA5:
        channels: ["2t", "10u", "10v", "q_850", "t_850", "u_850", "v_850", "z_500"]
        evaluation:
          sample: "all"
          forecast_step: "all"

  oqe79vpk:
    label: "cosine scheduler lr_max=1e-4 v2 epoch=48"
    epoch: 0
    rank: 0
    streams:
      ERA5:
        channels: ["2t", "10u", "10v", "q_850", "t_850", "u_850", "v_850", "z_500"]
        evaluation:
          sample: "all"
          forecast_step: "all"

  hhblaokc:
    label: "cosine scheduler lr_max=1e-4 v3 epoch=48"
    epoch: 0
    rank: 0
    streams:
      ERA5:
        channels: ["2t", "10u", "10v", "q_850", "t_850", "u_850", "v_850", "z_500"]
        evaluation:
          sample: "all"
          forecast_step: "all"

  ## lr=5e-5
  #r812ji96:
  #  label: "cosine scheduler lr_max=5e-5 v1"
  #  epoch: 0
  #  rank: 0
  #  streams:
  #    ERA5:
  #      channels: ["2t", "10u", "10v", "q_850", "t_850", "u_850", "v_850", "z_500"]
  #      evaluation:
  #        sample: "all"
  #        forecast_step: "all"

  #gj6eq2dx:
  #  label: "cosine scheduler lr_max=5e-5 v2"
  #  epoch: 0
  #  rank: 0
  #  streams:
  #    ERA5:
  #      channels: ["2t", "10u", "10v", "q_850", "t_850", "u_850", "v_850", "z_500"]
  #      evaluation:
  #        sample: "all"
  #        forecast_step: "all"

  #ff80snum:
  #  label: "cosine scheduler lr_max=5e-5 v3"
  #  epoch: 0
  #  rank: 0
  #  streams:
  #    ERA5:
  #      channels: ["2t", "10u", "10v", "q_850", "t_850", "u_850", "v_850", "z_500"]
  #      evaluation:
  #        sample: "all"
  #        forecast_step: "all"

  ## lr=1e-5
  #v0yha29i:
  #  label: "cosine scheduler lr_max=1e-5 v1"
  #  epoch: 0
  #  rank: 0
  #  streams:
  #    ERA5:
  #      channels: ["2t", "10u", "10v", "q_850", "t_850", "u_850", "v_850", "z_500"]
  #      evaluation:
  #        sample: "all"
  #        forecast_step: "all"

  #cbmk73y0:
  #  label: "cosine scheduler lr_max=1e-5 v2"
  #  epoch: 0
  #  rank: 0
  #  streams:
  #    ERA5:
  #      channels: ["2t", "10u", "10v", "q_850", "t_850", "u_850", "v_850", "z_500"]
  #      evaluation:
  #        sample: "all"
  #        forecast_step: "all"

  #ngdrjcbt:
  #  label: "cosine scheduler lr_max=1e-5 v3"
  #  epoch: 0
  #  rank: 0
  #  streams:
  #    ERA5:
  #      channels: ["2t", "10u", "10v", "q_850", "t_850", "u_850", "v_850", "z_500"]
  #      evaluation:
  #        sample: "all"
  #        forecast_step: "all"

  ## lr=5e-6
  #voulcvsi:
  #  label: "cosine scheduler lr_max=5e-6 v1"
  #  epoch: 0
  #  rank: 0
  #  streams:
  #    ERA5:
  #      channels: ["2t", "10u", "10v", "q_850", "t_850", "u_850", "v_850", "z_500"]
  #      evaluation:
  #        sample: "all"
  #        forecast_step: "all"

  #urlp39xq:
  #  label: "cosine scheduler lr_max=5e-6 v2"
  #  epoch: 0
  #  rank: 0
  #  streams:
  #    ERA5:
  #      channels: ["2t", "10u", "10v", "q_850", "t_850", "u_850", "v_850", "z_500"]
  #      evaluation:
  #        sample: "all"
  #        forecast_step: "all"

  #ch1n05gd:
  #  label: "cosine scheduler lr_max=5e-6 v3"
  #  epoch: 0
  #  rank: 0
  #  streams:
  #    ERA5:
  #      channels: ["2t", "10u", "10v", "q_850", "t_850", "u_850", "v_850", "z_500"]
  #      evaluation:
  #        sample: "all"
  #        forecast_step: "all"

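A sketch of walking this file's run_ids section with PyYAML; the field access mirrors the structure above, while the evaluation driver that actually consumes this config is not part of the diff.

import yaml

with open("config/eval_config.yml") as f:
    cfg = yaml.safe_load(f)

# Only the uncommented runs (qc5dw7ki, oqe79vpk, hhblaokc) survive parsing.
for run_id, run in (cfg.get("run_ids") or {}).items():
    era5 = run["streams"]["ERA5"]
    print(run_id, run["label"], era5["channels"])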