From edc6f7460c3202370fc29112be2e8a591c3b8161 Mon Sep 17 00:00:00 2001
From: drlyamzin
Date: Wed, 29 Jan 2025 06:54:43 +0000
Subject: [PATCH 1/2] enabling precipitation inference: initial code corrections

---
 README.md                               | 22 ++++++++++++++
 config/AFNO.yaml                        | 39 +++++++++++++++----------
 data_process/parallel_copy_small_set.py | 27 +++++++++++++++++
 docker/Dockerfile                       |  9 +++---
 utils/data_loader_multifiles.py         |  1 +
 5 files changed, 79 insertions(+), 19 deletions(-)

diff --git a/README.md b/README.md
index 705f03a..12ed099 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,25 @@
+# Recursive Forked Version
+
+0. Clone the repository:
+
+```
+git clone https://github.com/recursiveai/FourCastNet
+cd FourCastNet
+```
+
+1. Build and run the container.
+
+NOTE: Assumes the nvidia runtime is available for docker.
+
+```docker build -f docker/Dockerfile -t fourcastnet .```
+```docker run --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 --runtime nvidia -v ${PWD}:/workspace -it fourcastnet:latest```
+
+2. Copy data and checkpoints from the bucket.
+
+NOTE: Checkpoints are 1 GB each and take time to download.
+
+```bash download_data.sh```
+
 # FourCastNet
 ![nvidia](assets/nvidia.png) ![nersc](assets/nersc.png)
diff --git a/config/AFNO.yaml b/config/AFNO.yaml
index ef42448..9039768 100644
--- a/config/AFNO.yaml
+++ b/config/AFNO.yaml
@@ -7,8 +7,8 @@ full_field: &FULL_FIELD
   dt: 1 # how many timesteps ahead the model will predict
   n_history: 0 #how many previous timesteps to consider
   prediction_type: 'iterative'
-  prediction_length: 41 #applicable only if prediction_type == 'iterative'
-  n_initial_conditions: 5 #applicable only if prediction_type == 'iterative'
+  prediction_length: 4 #applicable only if prediction_type == 'iterative'
+  n_initial_conditions: 1 #applicable only if prediction_type == 'iterative'
   ics_type: "default"
   save_raw_forecasts: !!bool True
   save_channel: !!bool False
@@ -35,11 +35,11 @@
   normalization: 'zscore' #options zscore (minmax not supported)
   train_data_path: '/pscratch/sd/j/jpathak/wind/train'
   valid_data_path: '/pscratch/sd/j/jpathak/wind/test'
-  inf_data_path: '/pscratch/sd/j/jpathak/wind/out_of_sample' # test set path for inference
+  inf_data_path: "/workspace/data" # test set path for inference; the repo root (FourCastNet/) is mounted at /workspace/
   exp_dir: '/pscratch/sd/j/jpathak/ERA5_expts_gtc/wind'
-  time_means_path: '/pscratch/sd/j/jpathak/wind/time_means.npy'
-  global_means_path: '/pscratch/sd/j/jpathak/wind/global_means.npy'
-  global_stds_path: '/pscratch/sd/j/jpathak/wind/global_stds.npy'
+  time_means_path: "/workspace/data/stats/time_means.npy"
+  global_means_path: "/workspace/data/stats/global_means.npy"
+  global_stds_path: "/workspace/data/stats/global_stds.npy"

   orography: !!bool False
   orography_path: None
@@ -73,10 +73,11 @@ afno_backbone: &backbone
   exp_dir: '/pscratch/sd/s/shas1693/results/era5_wind'
   train_data_path: '/pscratch/sd/s/shas1693/data/era5/train'
   valid_data_path: '/pscratch/sd/s/shas1693/data/era5/test'
-  inf_data_path: '/pscratch/sd/s/shas1693/data/era5/out_of_sample'
-  time_means_path: '/pscratch/sd/s/shas1693/data/era5/time_means.npy'
-  global_means_path: '/pscratch/sd/s/shas1693/data/era5/global_means.npy'
-  global_stds_path: '/pscratch/sd/s/shas1693/data/era5/global_stds.npy'
+  inf_data_path: "/workspace/data"
+  time_means_path: "/workspace/data/stats/time_means.npy" # backbone time means
+  global_means_path: "/workspace/data/stats/global_means.npy"
+  global_stds_path: "/workspace/data/stats/global_stds.npy"
+
 afno_backbone_orography: &backbone_orography
   <<: *backbone
@@ -120,12 +121,20 @@ precip: &precip
   out_channels: [0]
   nettype: 'afno'
   nettype_wind: 'afno'
-  log_to_wandb: !!bool True
+  log_to_wandb: !!bool False
   lr: 2.5E-4
   batch_size: 64
   max_epochs: 25
-  precip: '/pscratch/sd/p/pharring/ERA5/precip/total_precipitation'
-  time_means_path_tp: '/pscratch/sd/p/pharring/ERA5/precip/total_precipitation/time_means.npy'
-  model_wind_path: '/pscratch/sd/s/shas1693/results/era5_wind/afno_backbone_finetune/0/training_checkpoints/best_ckpt.tar'
+  precip: "/workspace" # parent directory of the "out_of_sample" test data directory
+  time_means_path_tp: "/workspace/data/stats/time_means_tp.npy"
+  model_wind_path: "/workspace/checkpoints/precip.ckpt" # checkpoint loaded as the wind (backbone) model
   precip_eps: !!float 1e-5
-
+  # consider moving the following to constants
+  era5_source: "gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3/"
+  era5_vars: ['geopotential', 'temperature', 'u_component_of_wind',
+              'v_component_of_wind', '2m_dewpoint_temperature',
+              '10m_u_component_of_wind', '10m_v_component_of_wind', '2m_temperature',
+              'mean_sea_level_pressure', 'surface_pressure', 'total_column_water_vapour']
+  time_sel: ["2023-01-01T00:00:00", "2023-01-01T06:00:00", "2023-01-01T12:00:00", "2023-01-01T18:00:00"]
+  pressure_level: [50, 500, 850, 1000]
diff --git a/data_process/parallel_copy_small_set.py b/data_process/parallel_copy_small_set.py
index 20c9e03..f81603e 100644
--- a/data_process/parallel_copy_small_set.py
+++ b/data_process/parallel_copy_small_set.py
@@ -106,6 +106,33 @@ def writetofile(src, dest, channel_idx, varslist, src_idx=0, frmt='nc'):
         mins = (ttot - 3600*hrs)//60
         secs = (ttot - 3600*hrs - 60*mins)
     channel_idx += 1
+
+
+def writetofile_simplest(src, dest, channel_idx, variable_name):
+    """Write ERA5 data from an nc file to hdf5, with channels ordered as FourCastNet expects.
+
+    Args:
+        src - str: path to the source nc file
+        dest - str: path to the destination hdf5 file
+        channel_idx - int: index of the channel in the target hdf5 file
+        variable_name - str: variable to copy from the source file
+    """
+    batch = 4
+    nfeatures = 20
+    latlon = (721, 1440)
+
+    with h5py.File(dest, 'a') as fdest:
+        # create the target dataset on first use
+        if "fields" not in fdest:
+            shape = (batch, nfeatures, *latlon)
+            dtype = "float32"
+            fdest.create_dataset("fields", shape, dtype=dtype)
+
+        # copy one variable into its channel slot; the source file is closed on exit
+        with DS(src, 'r', format="NETCDF4") as fsrc:
+            fdest['fields'][:, channel_idx, :, :] = fsrc.variables[variable_name][:]
+
+
 filestr = 'oct_2021_19_31'
 dest = '/global/cscratch1/sd/jpathak/21var/oct_2021_19_21.h5'

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 90985d2..300d249 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -14,7 +14,11 @@ RUN pip install wandb && \
     pip install ruamel.yaml && \
     pip install --upgrade tqdm && \
     pip install timm && \
-    pip install einops
+    pip install einops && \
+    pip install zarr && \
+    pip install netCDF4 && \
+    pip install xarray && \
+    pip install gcsfs

 # benchy
 RUN pip install git+https://github.com/romerojosh/benchy.git
@@ -29,11 +33,8 @@ COPY copernicus /opt/ERA5_wind/copernicus
 COPY docker /opt/ERA5_wind/docker
 COPY networks /opt/ERA5_wind/networks
 COPY utils /opt/ERA5_wind/utils
-COPY plotting /opt/ERA5_wind/plotting
-COPY mpu /opt/ERA5_wind/mpu
 COPY *.py /opt/ERA5_wind/
 COPY *.sh /opt/ERA5_wind/
-COPY perf_tests /opt/perf_tests

 # create dummy git image
 RUN cd /opt/ERA5_wind && git init
diff --git a/utils/data_loader_multifiles.py b/utils/data_loader_multifiles.py
index eae359f..c1bbefa 100644
--- a/utils/data_loader_multifiles.py
+++ b/utils/data_loader_multifiles.py
@@ -113,6 +113,7 @@ def _get_files_stats(self):
     self.files_paths = glob.glob(self.location + "/*.h5")
     self.files_paths.sort()
     self.n_years = len(self.files_paths)
+    logging.info(f"Inference files paths: {self.files_paths}")
     with h5py.File(self.files_paths[0], 'r') as _f:
         logging.info("Getting file stats from {}".format(self.files_paths[0]))
         self.n_samples_per_year = _f['fields'].shape[0]

From 6ea0c87c4fcdc3dc581c24b65a8656a321b6ed62 Mon Sep 17 00:00:00 2001
From: drlyamzin
Date: Wed, 29 Jan 2025 07:10:42 +0000
Subject: [PATCH 2/2] enabling precipitation inference: added data download script

---
 README.md                          |  8 ++++++--
 data_process/data_for_inference.py | 31 ++++++++++++++++++++++++++++++
 download_data.sh                   |  8 ++++++++
 3 files changed, 45 insertions(+), 2 deletions(-)
 create mode 100644 data_process/data_for_inference.py
 create mode 100644 download_data.sh

diff --git a/README.md b/README.md
index 12ed099..4513736 100644
--- a/README.md
+++ b/README.md
@@ -11,8 +11,12 @@ cd FourCastNet

 NOTE: Assumes the nvidia runtime is available for docker.

-```docker build -f docker/Dockerfile -t fourcastnet .```
-```docker run --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 --runtime nvidia -v ${PWD}:/workspace -it fourcastnet:latest```
+```
+docker build -f docker/Dockerfile -t fourcastnet .
+```
+```
+docker run --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 --runtime nvidia -v ${PWD}:/workspace -it fourcastnet:latest
+```

 2. Copy data and checkpoints from the bucket.
diff --git a/data_process/data_for_inference.py b/data_process/data_for_inference.py
new file mode 100644
index 0000000..19cf8aa
--- /dev/null
+++ b/data_process/data_for_inference.py
@@ -0,0 +1,31 @@
+import xarray as xr
+import argparse
+import os
+from utils.YParams import YParams
+
+def get_era5_data(params):
+    """Load ERA5 data from the zarr store, subselect variables and time range, and save as an nc file.
+    """
+
+    era5_zarr = xr.open_zarr(params.era5_source)
+    all_vars = params.era5_vars
+    era5_data = era5_zarr.sel(time=params.time_sel, level=params.pressure_level)[all_vars]
+    era5_data = era5_data.compute()
+    era5_data.to_netcdf("data/era5_data.nc")
+    return
+
+
+def main(params):
+    # download and subset the data
+    get_era5_data(params)
+    # TODO: format for inference (FourCastNet channel order)
+    # TODO: save as h5
+    return
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--yaml_config", default='./config/AFNO.yaml', type=str)
+    args = parser.parse_args()
+    params = YParams(os.path.abspath(args.yaml_config), 'precip')  # the era5_* keys live under the precip config
+    main(params)
\ No newline at end of file
diff --git a/download_data.sh b/download_data.sh
new file mode 100644
index 0000000..9754228
--- /dev/null
+++ b/download_data.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+# download variable stats and checkpoints from our bucket, see README for original source
+
+set -e
+mkdir -p data/stats
+mkdir -p checkpoints
+gsutil -m cp -r gs://borealis-models/fourcastnet/stats/* data/stats/
+gsutil -m cp -r gs://borealis-models/fourcastnet/checkpoints/* checkpoints/
\ No newline at end of file
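
For reference, a minimal usage sketch of the `writetofile_simplest` helper added in the first patch. It is a sketch only: the paths and the channel index are illustrative assumptions, the call is assumed to run in a scope where the function is defined (importing `parallel_copy_small_set.py` directly would also execute its module-level script code), and the channel index must match the 20-channel ordering the model was trained with.

```
# Illustrative sketch, not part of the patches: copy one single-level field
# from the downloaded nc file into its channel slot of the inference hdf5
# file. Both paths and the channel index are assumptions.
src = "data/era5_data.nc"        # produced by data_process/data_for_inference.py
dest = "data/era5_inference.h5"  # hypothetical output location under inf_data_path
writetofile_simplest(src, dest, channel_idx=2, variable_name="2m_temperature")
```

Note that this works as-is only for single-level fields: pressure-level variables in the downloaded file carry an extra `level` dimension and would not fit the `(batch, lat, lon)` slot without selecting a level first. The sketch below handles that case.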
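`main` in `data_for_inference.py` still stubs out the "format for inference" and "save as h5" steps. Below is a minimal sketch of what they might look like, under loud assumptions: `CHANNELS` (the variable names, levels, and their order) is hypothetical and must be replaced by the exact 20-channel ordering the model was trained with, and the output path is illustrative.

```
import h5py
import numpy as np
import xarray as xr

# HYPOTHETICAL channel spec: (variable_name, pressure_level or None) in model
# channel order. The true ordering is not part of these patches and must match
# the layout used to build the training data (see parallel_copy_small_set.py).
CHANNELS = [
    ("10m_u_component_of_wind", None),
    ("10m_v_component_of_wind", None),
    ("2m_temperature", None),
    ("surface_pressure", None),
    ("mean_sea_level_pressure", None),
    ("temperature", 850),
    ("u_component_of_wind", 1000),
    # ... remaining channels, in training order
]

def format_for_inference(src="data/era5_data.nc", dest="data/era5_inference.h5"):
    """Sketch: pack the downloaded variables into the (time, channel, lat, lon)
    hdf5 layout read by utils/data_loader_multifiles.py."""
    ds = xr.open_dataset(src)
    with h5py.File(dest, "a") as fdest:
        if "fields" not in fdest:
            fdest.create_dataset("fields", (4, 20, 721, 1440), dtype="float32")
        for channel_idx, (name, level) in enumerate(CHANNELS):
            da = ds[name]
            if level is not None:
                # pressure-level variables carry a `level` dimension
                da = da.sel(level=level)
            fdest["fields"][:, channel_idx, :, :] = da.values.astype(np.float32)
```

With all 20 channels filled in, the file lands under the configured `inf_data_path` ("/workspace/data", since the repo root is mounted at /workspace/), the loader logs its path via the line added in the first patch, and `fields` comes out as (4, 20, 721, 1440): the four timestamps in `time_sel` by the model's 20 input channels.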