Changes from all commits (41 commits):
505efde  Push to test (Feb 2, 2026)
a0e5b38  Fix merge issue (sophie-xhonneux, Feb 2, 2026)
7e95bef  Claude fixing things (Feb 2, 2026)
2e1bd76  Fixing Betas expected everywhere (Feb 2, 2026)
84fa7d1  First commit (Feb 3, 2026)
c98c746  use existing implementation (Feb 3, 2026)
c2495b5  Add Layerscale etc to default config (sophie-xhonneux, Feb 3, 2026)
68aa2f4  Make JEPA default config for testing (sophie-xhonneux, Feb 3, 2026)
c0ce9dd  Add assert to prevent silent errors (Feb 4, 2026)
aebe434  Merge branch 'develop' into sophiex/dev/test-layerscale-etc (Feb 4, 2026)
71d2cce  Add collapse monitoring (Feb 4, 2026)
1d29611  Fix bug (Feb 4, 2026)
bc92ae7  Fix SVD computation failing (Feb 4, 2026)
7693c19  Reduce variables logged (Feb 4, 2026)
7f8de00  Fix EMA beta value computation (Feb 4, 2026)
c3eb019  Refactor get_current_beta to ema.py (Feb 4, 2026)
59a0a89  Sensible default for ema in jepa (sophie-xhonneux, Feb 4, 2026)
505331c  Merge branch 'sophiex/dev/monitor-collapse' into sophiex/dev/test-lay… (sophie-xhonneux, Feb 4, 2026)
4a091c8  New defaults (sophie-xhonneux, Feb 5, 2026)
32d951b  Implement Frozenteacher (Feb 6, 2026)
3298252  Test config (sophie-xhonneux, Feb 6, 2026)
b4c46b1  Refactor frozen teacher creation (Feb 6, 2026)
590d366  Fix stuff (Feb 6, 2026)
64ae9f1  Fix (Feb 6, 2026)
4444b04  Debug more (Feb 6, 2026)
c3e52d0  Enable frozen models not trained with SSL (Feb 6, 2026)
211f477  Improve code quality (Feb 6, 2026)
491a69d  Test config (sophie-xhonneux, Feb 6, 2026)
08dbf6f  Update jepa config (sophie-xhonneux, Feb 6, 2026)
1133018  Try SALT training (sophie-xhonneux, Feb 7, 2026)
11b2e60  Fix model path loading (Feb 16, 2026)
e08b7f8  Fix model_path (sophie-xhonneux, Feb 16, 2026)
b42a778  Fix inference corner case (#1818) (clessig, Feb 6, 2026)
8521b8c  fix latent_loss check in mode handling (#1784) (TillHae, Feb 6, 2026)
57c8518  Streamline run_train.py so it is suitable to be run both as a script … (grassesi, Feb 6, 2026)
424e188  Sgrasse/develop/435 unify dataset access (#1757) (grassesi, Feb 6, 2026)
141315b  Fix plot_train (#1831) (clessig, Feb 13, 2026)
b5b4a44  nse_metric (#1833) (jesicapinon, Feb 16, 2026)
f0d4a06  Random encoder fientuning on NPPATMS (sophie-xhonneux, Feb 17, 2026)
66394b5  Update configs to fix leak (sophie-xhonneux, Feb 17, 2026)
0884ae6  Plotting config (sophie-xhonneux, Feb 18, 2026)
70 changes: 54 additions & 16 deletions config/config_jepa.yml
@@ -26,7 +26,7 @@ ae_adapter_with_residual: True
 ae_adapter_dropout_rate: 0.1

 ae_global_dim_embed: 2048
-ae_global_num_blocks: 2
+ae_global_num_blocks: 0
 ae_global_num_heads: 32
 ae_global_dropout_rate: 0.1
 ae_global_with_qk_lnorm: True
@@ -37,7 +37,7 @@ ae_global_block_factor: 64
 ae_global_mlp_hidden_factor: 2
 ae_global_trailing_layer_norm: False

-ae_aggregation_num_blocks: 8
+ae_aggregation_num_blocks: 12
 ae_aggregation_num_heads: 32
 ae_aggregation_dropout_rate: 0.1
 ae_aggregation_with_qk_lnorm: True
@@ -130,10 +130,33 @@ data_loading :

 # config for training
 training_config:

   # training_mode: "masking", "student_teacher", "latent_loss"
   training_mode: ["student_teacher"]

+  # Collapse monitoring for SSL training (JEPA/DINO/iBOT)
+  # Detects representation collapse via various metrics
+  collapse_monitoring:
+    enabled: true
+    compute_frequency: 100 # batches between metric computations
+    log_frequency: 100 # batches between metric logging
+    metrics:
+      effective_rank:
+        enabled: true
+        tensor_source: "both" # "student", "teacher", or "both"
+        sample_size: 2048 # max samples for SVD (0 = no sampling)
+      singular_values:
+        enabled: true
+        tensor_source: "both"
+        sample_size: 2048
+      dimension_variance:
+        enabled: true
+        tensor_source: "both" # cheap to compute, good early indicator
+      prototype_entropy:
+        enabled: true # only applies to DINO
+      ema_beta:
+        enabled: true
+
   num_mini_epochs: 32
   samples_per_mini_epoch: 4096
   shuffle: True
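For context on the two cheapest-to-read metrics above: effective rank is the exponential of the entropy of the normalized singular-value spectrum of a batch of embeddings, and dimension variance flags embedding dimensions whose variance has collapsed toward zero. A minimal PyTorch sketch, with hypothetical helper names (the PR's actual monitoring code may differ):

```python
import torch

def effective_rank(z: torch.Tensor, sample_size: int = 2048) -> torch.Tensor:
    # z: (num_samples, dim) latent embeddings from student or teacher.
    # Subsample rows to bound the SVD cost, as sample_size does in the config.
    if sample_size and z.shape[0] > sample_size:
        z = z[torch.randperm(z.shape[0])[:sample_size]]
    z = z - z.mean(dim=0, keepdim=True)        # center each dimension
    # SVD in float32: half-precision inputs are a common cause of
    # torch.linalg.svdvals failures.
    s = torch.linalg.svdvals(z.float())
    p = s / s.sum()                            # normalized singular-value spectrum
    p = p[p > 0]
    return torch.exp(-(p * p.log()).sum())     # exp(spectral entropy)

def dimension_variance(z: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    # Fraction of embedding dimensions with effectively zero variance;
    # cheap to compute and rises early when representations collapse.
    var = z.float().var(dim=0)
    return (var < eps).float().mean()
```

A fully collapsed representation drives effective rank toward 1 and the zero-variance fraction toward 1, while a healthy JEPA run keeps effective rank at a sizable fraction of the embedding dimension.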
@@ -148,25 +171,36 @@ training_config:

   learning_rate_scheduling :
     lr_start: 1e-6
-    lr_max: 5e-5
+    lr_max: 1e-4
     lr_final_decay: 1e-6
     lr_final: 0.0
-    num_steps_warmup: 512
+    num_steps_warmup: 4096
     num_steps_cooldown: 512
     policy_warmup: "cosine"
     policy_decay: "constant"
     policy_cooldown: "linear"
     parallel_scaling_policy: "sqrt"

   optimizer:
-    grad_clip: 1.0
-    weight_decay: 0.1
+    # Optimizer type: "adamw" (default) or "muon_adamw" (Muon for hidden weights, AdamW for embeddings/heads)
+    type: "muon_adamw"
+    grad_clip: 0.1
+    weight_decay: 0.05
     log_grad_norms: False
     adamw :
       # parameters are scaled by number of DDP workers
       beta1 : 0.975
       beta2 : 0.9875
       eps : 2e-08
+    muon:
+      # Learning rate multiplier for Muon relative to base LR (muon_lr = base_lr * lr_multiplier)
+      lr_multiplier: 30.0
+      # Momentum factor for Muon SGD
+      momentum: 0.95
+      # Use Nesterov momentum
+      nesterov: true
+      # Weight decay for Muon parameters (uses optimizer.weight_decay if not specified)
+      weight_decay: 0.05

   losses : {
     "student-teacher": {
@@ -179,16 +213,20 @@ training_config:
       "num_blocks": 6, "num_heads": 12, "with_qk_lnorm": True, "intermediate_dim": 768,
       "dropout_rate": 0.1,
       target_source_correspondence: {0 : {0 : "subset"} },
      },
    },
  },
-  target_and_aux_calc: { "EMATeacher" :
-    { ema_ramp_up_ratio : 0.09,
-      ema_halflife_in_thousands: 1e-3,
-      model_param_overrides : {
-        training_config: { losses: { student-teacher:{ loss_fcts :{JEPA: {head: identity} }}}}
-      },
-    }
-  }
+  target_and_aux_calc: {FrozenTeacher: {
+    teacher_run_id: "yoqxf234", # "zosrc8ti", # Required
+    teacher_mini_epoch: -1}},
+  # },
+  # target_and_aux_calc: { "EMATeacher" :
+  #   { ema_ramp_up_ratio : null,
+  #     ema_halflife_in_thousands: 1e-1,
+  #     model_param_overrides : {
+  #       training_config: { losses: { student-teacher:{ loss_fcts :{JEPA: {head: identity} }}}}
+  #     },
+  #   }
+  # }
   }
 }
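The commented-out EMATeacher block, together with the commits "Fix EMA beta value computation" and "Refactor get_current_beta to ema.py", indicates the teacher's EMA decay is derived from a halflife. A minimal sketch consistent with the config keys above; the ramp-up rule is an assumption modeled on common EMA schedules, and the real get_current_beta in ema.py may differ:

```python
def get_current_beta(step: int,
                     ema_halflife_in_thousands: float,
                     ema_ramp_up_ratio: float | None = None) -> float:
    # beta = 0.5 ** (1 / halflife_steps): after halflife_steps student
    # updates, an old teacher weight's contribution has decayed by half.
    halflife_steps = ema_halflife_in_thousands * 1000.0
    if ema_ramp_up_ratio is not None:
        # Assumed ramp-up: cap the halflife early in training so the teacher
        # initially tracks the student closely, then slows down.
        halflife_steps = min(halflife_steps, step * ema_ramp_up_ratio)
    return 0.5 ** (1.0 / max(halflife_steps, 1e-8))
```

The switch to FrozenTeacher removes this schedule entirely: the teacher is a fixed checkpoint identified by teacher_run_id (teacher_mini_epoch: -1 presumably selecting the last saved mini-epoch), so no EMA update runs at all.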
9 changes: 5 additions & 4 deletions config/config_jepa_finetuning.yml
@@ -92,7 +92,8 @@ zarr_store: "zip" # "zarr" for LocalStore, "zip" for ZipStore
 #####################################

 # streams_directory: "./config/streams/era5_1deg/"
-streams_directory: "./config/streams/era5_synop_finetuning/"
+# streams_directory: "./config/streams/era5_synop_finetuning/"
+streams_directory: "./config/streams/era5_nppatms_finetuning/"
 streams: ???

 general:
@@ -139,8 +140,8 @@ training_config:
   samples_per_mini_epoch: 4096
   shuffle: True

-  start_date: 1979-01-01T00:00
-  end_date: 2022-12-31T00:00
+  start_date: 2012-01-01T00:00
+  end_date: 2021-12-31T00:00

   time_window_step: 06:00:00
   time_window_len: 06:00:00
@@ -271,7 +272,7 @@ validation_config:
     # write samples in normalized model space
     normalized_samples: False,
     # output streams to write; default all
-    streams: ["SurfaceCombined"],
+    streams: ["NPPATMS"],
   }

   # run validation before training starts (mainly for model development)