From e19ce3e2e2c553d22bda722e03a5b7571491d8cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Doruk=20Arda=C4=9F?= <38666458+Dooruk@users.noreply.github.com> Date: Mon, 9 Feb 2026 17:08:42 -0500 Subject: [PATCH 1/4] Update CI workflow for swell tier 1 applications and add hofx_cf Refactor workflow to use environment variable for CI base path and streamline job dependencies. --- .../swell-tier1_application_discover.yml | 432 +++--------------- 1 file changed, 55 insertions(+), 377 deletions(-) diff --git a/.github/workflows/swell-tier1_application_discover.yml b/.github/workflows/swell-tier1_application_discover.yml index d7d1172..2cd7997 100644 --- a/.github/workflows/swell-tier1_application_discover.yml +++ b/.github/workflows/swell-tier1_application_discover.yml @@ -6,426 +6,104 @@ defaults: run: shell: bash +env: + CI_BASE: /discover/nobackup/gmao_ci/swell/tier1 + jobs: + # Define the suite list once + define-matrix: + runs-on: nccs-discover + outputs: + suites: ${{ steps.set-suites.outputs.suites }} + suites_space: ${{ steps.set-suites.outputs.suites_space }} + steps: + - name: Set suite list + id: set-suites + run: | + SUITES='["3dvar_cycle", "3dfgat_cycle", "hofx", "3dvar", "3dvar_atmos", "3dfgat_atmos", "localensembleda", "hofx_cf"]' + echo "suites=$SUITES" >> $GITHUB_OUTPUT + echo "suites_space=3dvar_cycle 3dfgat_cycle hofx 3dvar 3dvar_atmos 3dfgat_atmos localensembleda hofx_cf" >> $GITHUB_OUTPUT # Initialization needed by all the workflows - # ------------------------------------------ swell-tier_1-setup: - runs-on: nccs-discover timeout-minutes: 30 - + needs: define-matrix steps: - name: validate-workflow - run: | - /home/jardizzo/bin/nams_check.py ${{ github.triggering_actor }} swell + run: /home/jardizzo/bin/nams_check.py ${{ github.triggering_actor }} swell - name: acquire-swell uses: actions/checkout@v3 - name: install-swell run: | - # Make experiment directory - mkdir /discover/nobackup/gmao_ci/swell/tier1/${GITHUB_RUN_ID} - # Copy and source modules - cp ${GITHUB_WORKSPACE}/src/swell/deployment/platforms/nccs_discover_sles15/modules /discover/nobackup/gmao_ci/swell/tier1/${GITHUB_RUN_ID}/ - source /discover/nobackup/gmao_ci/swell/tier1/${GITHUB_RUN_ID}/modules - pip install --prefix=/discover/nobackup/gmao_ci/swell/tier1/${GITHUB_RUN_ID}/swell -r ${GITHUB_WORKSPACE}/requirements.txt --no-cache-dir ${GITHUB_WORKSPACE} - # Remove source code (needed to ensure nothing relies on the source) - - # Run 3dvar_cycle workflow - # ------------------------ - swell-tier_1-3dvar_cycle: + mkdir ${{ env.CI_BASE }}/${GITHUB_RUN_ID} + cp ${GITHUB_WORKSPACE}/src/swell/deployment/platforms/nccs_discover_sles15/modules ${{ env.CI_BASE }}/${GITHUB_RUN_ID}/ + source ${{ env.CI_BASE }}/${GITHUB_RUN_ID}/modules + pip install --prefix=${{ env.CI_BASE }}/${GITHUB_RUN_ID}/swell -r ${GITHUB_WORKSPACE}/requirements.txt --no-cache-dir ${GITHUB_WORKSPACE} + # Run all test workflows using matrix + swell-tier_1-test: runs-on: nccs-discover timeout-minutes: 600 - needs: swell-tier_1-setup + needs: [define-matrix, swell-tier_1-setup] + strategy: + fail-fast: false + max-parallel: 5 + matrix: + suite: ${{ fromJson(needs.define-matrix.outputs.suites) }} steps: - - - name: run-swell-3dvar_cycle + - name: run-swell-${{ matrix.suite }} run: | - CI_WORKSPACE=/discover/nobackup/gmao_ci/swell/tier1/${GITHUB_RUN_ID} - SUITE_NAME=3dvar_cycle - CI_WORKSPACE_JOB=/discover/nobackup/gmao_ci/swell/tier1/${GITHUB_RUN_ID}/${SUITE_NAME} + CI_WORKSPACE=${{ env.CI_BASE }}/${GITHUB_RUN_ID} + SUITE_NAME=${{ matrix.suite }} + CI_WORKSPACE_JOB=${CI_WORKSPACE}/${SUITE_NAME} EXPERIMENT_ID=swell-${SUITE_NAME}-${GITHUB_RUN_ID} mkdir -p $CI_WORKSPACE_JOB + source ${CI_WORKSPACE}/modules - source /discover/nobackup/gmao_ci/swell/tier1/${GITHUB_RUN_ID}/modules - - # Get python version - PYVER=`python --version | awk '{print $2}' | awk -F. '{print $1"."$2}'` - + PYVER=$(python --version | awk '{print $2}' | awk -F. '{print $1"."$2}') export PATH=$CI_WORKSPACE/swell/bin:$PATH export PYTHONPATH=${PYTHONPATH}:$CI_WORKSPACE/swell/lib/python$PYVER/site-packages - echo "experiment_id: $EXPERIMENT_ID" > $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml - echo "experiment_root: $CI_WORKSPACE_JOB" >> $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml + cat > $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml < $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml - echo "experiment_root: $CI_WORKSPACE_JOB" >> $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml - - rm -r -f $HOME/cylc-run/${EXPERIMENT_ID}-suite - - cd $CI_WORKSPACE_JOB - swell create ${SUITE_NAME} -m defaults -p nccs_discover_sles15 -o $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml - swell launch $CI_WORKSPACE_JOB/${EXPERIMENT_ID}/${EXPERIMENT_ID}-suite --no-detach --log_path $CI_WORKSPACE_JOB/${EXPERIMENT_ID} - - # Move experiment directory on failure - swell-tier_1-3dfgat_cycle-failure: - - runs-on: nccs-discover - timeout-minutes: 30 - needs: swell-tier_1-3dfgat_cycle - if: failure() - - steps: - - name: Fail hold for 3dfgat_cycle - run: | - SUITE_NAME=3dfgat_cycle - CI_WORKSPACE_JOB=/discover/nobackup/gmao_ci/swell/tier1/${GITHUB_RUN_ID}/${SUITE_NAME} - mv $CI_WORKSPACE_JOB ${CI_WORKSPACE_JOB}_FAILED - - name: Copy cylc Logs - - # Run hofx workflow - # ----------------- - swell-tier_1-hofx: - - runs-on: nccs-discover - timeout-minutes: 600 - needs: swell-tier_1-setup - - steps: - - - name: run-swell-hofx - run: | - CI_WORKSPACE=/discover/nobackup/gmao_ci/swell/tier1/${GITHUB_RUN_ID} - SUITE_NAME=hofx - CI_WORKSPACE_JOB=/discover/nobackup/gmao_ci/swell/tier1/${GITHUB_RUN_ID}/${SUITE_NAME} - EXPERIMENT_ID=swell-${SUITE_NAME}-${GITHUB_RUN_ID} - - mkdir -p $CI_WORKSPACE_JOB - - source /discover/nobackup/gmao_ci/swell/tier1/${GITHUB_RUN_ID}/modules - - # Get python version - PYVER=`python --version | awk '{print $2}' | awk -F. '{print $1"."$2}'` - - export PATH=$CI_WORKSPACE/swell/bin:$PATH - export PYTHONPATH=${PYTHONPATH}:$CI_WORKSPACE/swell/lib/python$PYVER/site-packages - - echo "experiment_id: $EXPERIMENT_ID" > $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml - echo "experiment_root: $CI_WORKSPACE_JOB" >> $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml - - rm -r -f $HOME/cylc-run/${EXPERIMENT_ID}-suite - - cd $CI_WORKSPACE_JOB - swell create ${SUITE_NAME} -m defaults -p nccs_discover_sles15 -o $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml - swell launch $CI_WORKSPACE_JOB/${EXPERIMENT_ID}/${EXPERIMENT_ID}-suite --no-detach --log_path $CI_WORKSPACE_JOB/${EXPERIMENT_ID} - - # Move experiment directory on failure - swell-tier_1-hofx-failure: - - runs-on: nccs-discover - timeout-minutes: 30 - needs: swell-tier_1-hofx - if: failure() - - steps: - - name: Fail hold for hofx - run: | - SUITE_NAME=hofx - CI_WORKSPACE_JOB=/discover/nobackup/gmao_ci/swell/tier1/${GITHUB_RUN_ID}/${SUITE_NAME} - mv $CI_WORKSPACE_JOB ${CI_WORKSPACE_JOB}_FAILED - - # Run 3dvar workflow - # ----------------- - swell-tier_1-3dvar: - - runs-on: nccs-discover - timeout-minutes: 600 - needs: swell-tier_1-setup - - steps: - - - name: run-swell-3dvar - run: | - CI_WORKSPACE=/discover/nobackup/gmao_ci/swell/tier1/${GITHUB_RUN_ID} - SUITE_NAME=3dvar - CI_WORKSPACE_JOB=/discover/nobackup/gmao_ci/swell/tier1/${GITHUB_RUN_ID}/${SUITE_NAME} - EXPERIMENT_ID=swell-${SUITE_NAME}-${GITHUB_RUN_ID} - - mkdir -p $CI_WORKSPACE_JOB - - source /discover/nobackup/gmao_ci/swell/tier1/${GITHUB_RUN_ID}/modules - - # Get python version - PYVER=`python --version | awk '{print $2}' | awk -F. '{print $1"."$2}'` - - export PATH=$CI_WORKSPACE/swell/bin:$PATH - export PYTHONPATH=${PYTHONPATH}:$CI_WORKSPACE/swell/lib/python$PYVER/site-packages - - echo "experiment_id: $EXPERIMENT_ID" > $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml - echo "experiment_root: $CI_WORKSPACE_JOB" >> $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml - - rm -r -f $HOME/cylc-run/${EXPERIMENT_ID}-suite - - cd $CI_WORKSPACE_JOB - swell create ${SUITE_NAME} -m defaults -p nccs_discover_sles15 -o $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml - swell launch $CI_WORKSPACE_JOB/${EXPERIMENT_ID}/${EXPERIMENT_ID}-suite --no-detach --log_path $CI_WORKSPACE_JOB/${EXPERIMENT_ID} - - # Move experiment directory on failure - swell-tier_1-3dvar-failure: - - runs-on: nccs-discover - timeout-minutes: 30 - needs: swell-tier_1-3dvar - if: failure() - - steps: - - name: Fail hold for 3dvar - run: | - SUITE_NAME=3dvar - CI_WORKSPACE_JOB=/discover/nobackup/gmao_ci/swell/tier1/${GITHUB_RUN_ID}/${SUITE_NAME} - mv $CI_WORKSPACE_JOB ${CI_WORKSPACE_JOB}_FAILED - - # Run 3dvar_atmos workflow - # ----------------- - swell-tier_1-3dvar_atmos: - - runs-on: nccs-discover - timeout-minutes: 600 - needs: swell-tier_1-setup - - steps: - - - name: run-swell-3dvar_atmos - run: | - CI_WORKSPACE=/discover/nobackup/gmao_ci/swell/tier1/${GITHUB_RUN_ID} - SUITE_NAME=3dvar_atmos - CI_WORKSPACE_JOB=/discover/nobackup/gmao_ci/swell/tier1/${GITHUB_RUN_ID}/${SUITE_NAME} - EXPERIMENT_ID=swell-${SUITE_NAME}-${GITHUB_RUN_ID} - - mkdir -p $CI_WORKSPACE_JOB - - source /discover/nobackup/gmao_ci/swell/tier1/${GITHUB_RUN_ID}/modules - - # Get python version - PYVER=`python --version | awk '{print $2}' | awk -F. '{print $1"."$2}'` - - export PATH=$CI_WORKSPACE/swell/bin:$PATH - export PYTHONPATH=${PYTHONPATH}:$CI_WORKSPACE/swell/lib/python$PYVER/site-packages - - echo "experiment_id: $EXPERIMENT_ID" > $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml - echo "experiment_root: $CI_WORKSPACE_JOB" >> $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml - - rm -r -f $HOME/cylc-run/${EXPERIMENT_ID}-suite - - cd $CI_WORKSPACE_JOB - swell create ${SUITE_NAME} -m defaults -p nccs_discover_sles15 -o $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml - swell launch $CI_WORKSPACE_JOB/${EXPERIMENT_ID}/${EXPERIMENT_ID}-suite --no-detach --log_path $CI_WORKSPACE_JOB/${EXPERIMENT_ID} - - # Move experiment directory on failure - swell-tier_1-3dvar_atmos-failure: - - runs-on: nccs-discover - timeout-minutes: 30 - needs: swell-tier_1-3dvar_atmos - if: failure() - - steps: - - name: Fail hold for 3dvar_atmos - run: | - SUITE_NAME=3dvar_atmos - CI_WORKSPACE_JOB=/discover/nobackup/gmao_ci/swell/tier1/${GITHUB_RUN_ID}/${SUITE_NAME} - - # Run 3dfgat_atmos workflow - # ----------------- - swell-tier_1-3dfgat_atmos: - - runs-on: nccs-discover - timeout-minutes: 600 - needs: swell-tier_1-setup - - steps: - - - name: run-swell-3dfgat_atmos - run: | - CI_WORKSPACE=/discover/nobackup/gmao_ci/swell/tier1/${GITHUB_RUN_ID} - SUITE_NAME=3dfgat_atmos - CI_WORKSPACE_JOB=/discover/nobackup/gmao_ci/swell/tier1/${GITHUB_RUN_ID}/${SUITE_NAME} - EXPERIMENT_ID=swell-${SUITE_NAME}-${GITHUB_RUN_ID} - - mkdir -p $CI_WORKSPACE_JOB - - source /discover/nobackup/gmao_ci/swell/tier1/${GITHUB_RUN_ID}/modules - - # Get python version - PYVER=`python --version | awk '{print $2}' | awk -F. '{print $1"."$2}'` - - export PATH=$CI_WORKSPACE/swell/bin:$PATH - export PYTHONPATH=${PYTHONPATH}:$CI_WORKSPACE/swell/lib/python$PYVER/site-packages - - echo "experiment_id: $EXPERIMENT_ID" > $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml - echo "experiment_root: $CI_WORKSPACE_JOB" >> $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml - - rm -r -f $HOME/cylc-run/${EXPERIMENT_ID}-suite - - cd $CI_WORKSPACE_JOB - swell create ${SUITE_NAME} -m defaults -p nccs_discover_sles15 -o $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml - swell launch $CI_WORKSPACE_JOB/${EXPERIMENT_ID}/${EXPERIMENT_ID}-suite --no-detach --log_path $CI_WORKSPACE_JOB/${EXPERIMENT_ID} - - # Move experiment directory on failure - swell-tier_1-3dfgat_atmos-failure: - - runs-on: nccs-discover - timeout-minutes: 30 - needs: swell-tier_1-3dfgat_atmos - if: failure() - - steps: - - name: Fail hold for 3dfgat_atmos - run: | - SUITE_NAME=3dfgat_atmos - CI_WORKSPACE_JOB=/discover/nobackup/gmao_ci/swell/tier1/${GITHUB_RUN_ID}/${SUITE_NAME} - mv $CI_WORKSPACE_JOB ${CI_WORKSPACE_JOB}_FAILED - - - # Run localensembleda workflow - # ----------------- - swell-tier_1-localensembleda: - - runs-on: nccs-discover - timeout-minutes: 600 - needs: swell-tier_1-setup - - steps: - - - name: run-swell-localensembleda - run: | - CI_WORKSPACE=/discover/nobackup/gmao_ci/swell/tier1/${GITHUB_RUN_ID} - SUITE_NAME=localensembleda - CI_WORKSPACE_JOB=/discover/nobackup/gmao_ci/swell/tier1/${GITHUB_RUN_ID}/${SUITE_NAME} - EXPERIMENT_ID=swell-${SUITE_NAME}-${GITHUB_RUN_ID} - - mkdir -p $CI_WORKSPACE_JOB - - source /discover/nobackup/gmao_ci/swell/tier1/${GITHUB_RUN_ID}/modules - - # Get python version - PYVER=`python --version | awk '{print $2}' | awk -F. '{print $1"."$2}'` - - export PATH=$CI_WORKSPACE/swell/bin:$PATH - export PYTHONPATH=${PYTHONPATH}:$CI_WORKSPACE/swell/lib/python$PYVER/site-packages - - echo "experiment_id: $EXPERIMENT_ID" > $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml - echo "experiment_root: $CI_WORKSPACE_JOB" >> $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml - - rm -r -f $HOME/cylc-run/${EXPERIMENT_ID}-suite - - cd $CI_WORKSPACE_JOB - swell create ${SUITE_NAME} -m defaults -p nccs_discover_sles15 -o $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml - swell launch $CI_WORKSPACE_JOB/${EXPERIMENT_ID}/${EXPERIMENT_ID}-suite --no-detach --log_path $CI_WORKSPACE_JOB/${EXPERIMENT_ID} - - # Move experiment directory on failure - swell-tier_1-localensembleda-failure: - - runs-on: nccs-discover - timeout-minutes: 30 - needs: swell-tier_1-localensembleda - if: failure() - - steps: - - name: Fail hold for localensembleda - run: | - SUITE_NAME=localensembleda - CI_WORKSPACE_JOB=/discover/nobackup/gmao_ci/swell/tier1/${GITHUB_RUN_ID}/${SUITE_NAME} - mv $CI_WORKSPACE_JOB ${CI_WORKSPACE_JOB}_FAILED - -# Perform all the clean up -# ------------------------ + CI_WORKSPACE_JOB=${{ env.CI_BASE }}/${GITHUB_RUN_ID}/${{ matrix.suite }} + mv $CI_WORKSPACE_JOB ${CI_WORKSPACE_JOB}_FAILED || true + # Cleanup on success swell-tier_1-clean_up_success: - runs-on: nccs-discover timeout-minutes: 30 - needs: [swell-tier_1-3dvar_cycle, swell-tier_1-3dfgat_cycle, swell-tier_1-hofx, swell-tier_1-3dvar, swell-tier_1-3dvar_atmos, swell-tier_1-3dfgat_atmos, swell-tier_1-localensembleda] - + needs: [define-matrix, swell-tier_1-test] steps: - - name: Remove the run directory - run: | - rm -r -f /discover/nobackup/gmao_ci/swell/tier1/${GITHUB_RUN_ID} + run: rm -rf ${{ env.CI_BASE }}/${GITHUB_RUN_ID} + # Cleanup always (cylc logs) swell-tier_1-clean_up_always: - runs-on: nccs-discover timeout-minutes: 30 - needs: [swell-tier_1-3dvar_cycle, swell-tier_1-3dfgat_cycle, swell-tier_1-hofx, swell-tier_1-3dvar, swell-tier_1-3dvar_atmos, swell-tier_1-3dfgat_atmos, swell-tier_1-localensembleda] - if: always() # Always run the clean up, even if failed or cancelled - + needs: [define-matrix, swell-tier_1-test] + if: always() steps: - - - name: Remove the cylc logging directories - run: | - rm -r -f $HOME/cylc-run/swell-hofx-${GITHUB_RUN_ID}-suite - rm -r -f $HOME/cylc-run/swell-3dvar-${GITHUB_RUN_ID}-suite - rm -r -f $HOME/cylc-run/swell-3dvar_cycle-${GITHUB_RUN_ID}-suite - rm -r -f $HOME/cylc-run/swell-3dfgat_cycle-${GITHUB_RUN_ID}-suite - rm -r -f $HOME/cylc-run/swell-3dvar_atmos-${GITHUB_RUN_ID}-suite - rm -r -f $HOME/cylc-run/swell-3dfgat_atmos-${GITHUB_RUN_ID}-suite - rm -r -f $HOME/cylc-run/swell-localensembleda-${GITHUB_RUN_ID}-suite - - - name: Remove the R2D2 experiment output + - name: Remove cylc logging directories run: | - rm -r -f /discover/nobackup/gmao_ci/R2D2DataStore/Local/ncdiag/ob/swell-hofx-${GITHUB_RUN_ID} + for suite in ${{ needs.define-matrix.outputs.suites_space }}; do + rm -rf $HOME/cylc-run/swell-${suite}-${GITHUB_RUN_ID}-suite + done From a22b213e3034fedf3924fb27bbfc7cbf9199d633 Mon Sep 17 00:00:00 2001 From: Michael Anstett Date: Wed, 11 Feb 2026 18:28:00 -0500 Subject: [PATCH 2/4] Use experiments from stable --- .github/workflows/swell-tier2_application_discover.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/swell-tier2_application_discover.yml b/.github/workflows/swell-tier2_application_discover.yml index 0e7f09e..b99d32b 100644 --- a/.github/workflows/swell-tier2_application_discover.yml +++ b/.github/workflows/swell-tier2_application_discover.yml @@ -226,7 +226,9 @@ jobs: echo "experiment_id: $EXPERIMENT_ID" > $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml echo "experiment_root: $CI_WORKSPACE_JOB" >> $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml - COMPARISON_EXP_PATH_1=$(realpath "/discover/nobackup/gmao_ci/SwellExperiments/current-3dfgat_cycle-comparison") + echo "publish_directory: ~/swell_publication_location" >> $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml + + COMPARISON_EXP_PATH_1=/discover/nobackup/gmao_ci/swell/tier2/stable/3dfgat_cycle-comparison/swell-3dfgat_cycle-comparison-*/swell-3dfgat_cycle-comparison-*-suite/experiment.yaml EXPERIMENT_ID_2=swell-${COMPARISON_SUITE}-${GITHUB_RUN_ID} COMPARISON_EXP_PATH_2=/discover/nobackup/gmao_ci/swell/tier2/${GITHUB_RUN_ID}/${COMPARISON_SUITE}/${EXPERIMENT_ID_2}/${EXPERIMENT_ID_2}-suite/experiment.yaml @@ -327,8 +329,9 @@ jobs: echo "experiment_id: $EXPERIMENT_ID" > $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml echo "experiment_root: $CI_WORKSPACE_JOB" >> $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml + echo "publish_directory: ~/swell_publication_location" >> $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml - COMPARISON_EXP_PATH_1=$(realpath "/discover/nobackup/gmao_ci/SwellExperiments/current-3dfgat_atmos-comparison") + COMPARISON_EXP_PATH_1=/discover/nobackup/gmao_ci/swell/tier2/stable/3dfgat_atmos-comparison/swell-3dfgat_atmos-comparison-*/swell-3dfgat_atmos-comparison-*-suite/experiment.yaml EXPERIMENT_ID_2=swell-${COMPARISON_SUITE}-${GITHUB_RUN_ID} COMPARISON_EXP_PATH_2=/discover/nobackup/gmao_ci/swell/tier2/${GITHUB_RUN_ID}/${COMPARISON_SUITE}/${EXPERIMENT_ID_2}/${EXPERIMENT_ID_2}-suite/experiment.yaml @@ -368,7 +371,7 @@ jobs: runs-on: nccs-discover timeout-minutes: 30 - needs: [swell-tier_2-hofx, swell-tier_2-3dfgat_cycle, swell-tier_2-3dfgat_atmos] + needs: [swell-tier_2-hofx, swell-tier_2-3dfgat_cycle, swell-tier_2-3dfgat_atmos, swell-tier_2-3dfgat_atmos-comparison, swell-tier_2-3dfgat_cycle-comparison] steps: - name: Replace link to stable with link to current run and remove old directory From 09f4b94a786d0040c7a188d077778e7161385807 Mon Sep 17 00:00:00 2001 From: Michael Anstett Date: Thu, 12 Feb 2026 13:52:11 -0500 Subject: [PATCH 3/4] Use matrices --- .../swell-tier2_application_discover.yml | 279 +++++------------- 1 file changed, 73 insertions(+), 206 deletions(-) diff --git a/.github/workflows/swell-tier2_application_discover.yml b/.github/workflows/swell-tier2_application_discover.yml index b99d32b..47d33b5 100644 --- a/.github/workflows/swell-tier2_application_discover.yml +++ b/.github/workflows/swell-tier2_application_discover.yml @@ -8,13 +8,40 @@ defaults: jobs: + # Define the suite list once + define-matrix: + runs-on: nccs-discover + outputs: + suites: ${{ steps.set-suites.outputs.suites }} + suites_space: ${{ steps.set-suites.outputs.suites_space }} + steps: + - name: Set suite list + id: set-suites + run: | + SUITES='["hofx", "3dfgat_cycle", "3dfgat_atmos"]' + echo "suites=$SUITES" >> $GITHUB_OUTPUT + echo "suites_space=hofx 3dfgat_cycle 3dfgat_atmos" >> $GITHUB_OUTPUT + + define-comparison-matrix: + runs-on: nccs-discover + outputs: + comparison_suites: ${{ steps.set-suites.out + comparison_suites_space: ${{ steps.set-comparison-suites.outputs.comparison_suites_space }} + steps: + - name: Set comparison suite list + id: set-comparison-suites + run: | + SUITES='["3dfgat_cycle", "3dfgat_atmos"]' + echo "comparison_suites=$SUITES" >> $GITHUB_OUTPUT + echo "comparison_suites_space=3dfgat_cycle-comparison 3dfgat_atmos-comparison" >> $GITHUB_OUTPUT + # Initialization needed by all the workflows # ------------------------------------------ swell-tier_2-setup: runs-on: nccs-discover timeout-minutes: 30 - + needs: define-matrix steps: - name: validate-workflow run: | @@ -99,116 +126,75 @@ jobs: # ---------------------------------------- # STEP2: RUN TESTING SUITES WITH NEW BUILD # ---------------------------------------- - - # Run hofx suite - swell-tier_2-hofx: - + swell-tier_2-test: runs-on: nccs-discover timeout-minutes: 600 - needs: swell-tier_2-build_jedi + needs: [define-matrix, swell-tier_2-setup, swell-tier_2-build_jedi] + strategy: + fail-fast: false + max-parallel: 5 + matrix: + suite: ${{ fromJson(needs.define-matrix.outputs.suites) }} steps: - - - name: run-swell-hofx + - name: run-swell-${{ matrix.suite }} run: | - CI_WORKSPACE=/discover/nobackup/gmao_ci/swell/tier2/${GITHUB_RUN_ID} - SUITE_NAME=hofx - CI_WORKSPACE_JOB=/discover/nobackup/gmao_ci/swell/tier2/${GITHUB_RUN_ID}/${SUITE_NAME} - EXPERIMENT_ID=swell-${SUITE_NAME}-${GITHUB_RUN_ID} + CI_WORKSPACE=${{ env.CI_BASE }}/${GITHUB_RUN_ID} + SUITE_NAME=${{ matrix.suite }} + CI_WORKSPACE_JOB=${CI_WORKSPACE}/${SUITE_NAME} + EXPERIMENT_ID=swell${SUITE_NAME}-${GITHUB_RUN_ID} mkdir -p $CI_WORKSPACE_JOB + source ${CI_WORKSPACE}/modules - source /discover/nobackup/gmao_ci/swell/tier2/${GITHUB_RUN_ID}/modules - - # Get python version PYVER=$(python3 -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')") - export PATH=$CI_WORKSPACE/swell/bin:$PATH - export PYTHONPATH=${PYTHONPATH}:$CI_WORKSPACE/swell/lib/python$PYVER/site-packages + export PATH=$CI_WORKSPACE/swell/bin:$PATH + export PYTHONPATH=${PYTHONPATH}:$CI_WORKSPACE/swell/lib/python$PYVER/site-packages - echo "experiment_id: $EXPERIMENT_ID" > $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml - echo "experiment_root: $CI_WORKSPACE_JOB" >> $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml - - # Point to the active build - echo "existing_jedi_source_directory: /discover/nobackup/gmao_ci/swell/tier2/${GITHUB_RUN_ID}/build_jedi/jedi_bundle/source" >> $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml - echo "existing_jedi_build_directory: /discover/nobackup/gmao_ci/swell/tier2/${GITHUB_RUN_ID}/build_jedi/jedi_bundle/build" >> $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml - - rm -r -f $HOME/cylc-run/${EXPERIMENT_ID}-suite + cat > $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml < $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml - echo "experiment_root: $CI_WORKSPACE_JOB" >> $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml - - # Point to the active build - echo "existing_jedi_source_directory: /discover/nobackup/gmao_ci/swell/tier2/${GITHUB_RUN_ID}/build_jedi/jedi_bundle/source" >> $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml - echo "existing_jedi_build_directory: /discover/nobackup/gmao_ci/swell/tier2/${GITHUB_RUN_ID}/build_jedi/jedi_bundle/build" >> $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml - - rm -r -f $HOME/cylc-run/${EXPERIMENT_ID}-suite - - cd $CI_WORKSPACE_JOB - swell create ${SUITE_NAME} -m defaults -p nccs_discover_sles15 -o $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml - swell launch $CI_WORKSPACE_JOB/${EXPERIMENT_ID}/${EXPERIMENT_ID}-suite --no-detach --log_path $CI_WORKSPACE_JOB/${EXPERIMENT_ID} - - swell-tier2-3dfgat_cycle-comparison: + swell-tier2-comparison: runs-on: nccs-discover timeout-minutes: 30 - needs: swell-tier_2-3dfgat_cycle + needs: [define-comparison-matrix, swell-tier_2-setup, swell-tier_2-test] + strategy: + fail-fast: false + max-parallel: 5 + matrix: + suite: ${{ fromJson(needs.define-comparison-matrix.outputs.comparison_suites) }} steps: - - name: run-swell-3dfgat_cycle-comparison + - name: run-swell-${{ matrix.suite }}-comparison run: | CI_WORKSPACE=/discover/nobackup/gmao_ci/swell/tier2/${GITHUB_RUN_ID} - COMPARISON_SUITE=3dfgat_cycle - SUITE_NAME=${COMPARISON_SUITE}-comparison + SUITE_NAME=${{ matrix.suite }}-comparison - CONFIG_NAME=compare_fgat_marine + if [ "${{ matrix.suite }}" = "3dfgat_cycle" ]; then + CONFIG_NAME=compare_fgat_marine + else + CONFIG_NAME=compare_variational_atmosphere CI_WORKSPACE_JOB=/discover/nobackup/gmao_ci/swell/tier2/${GITHUB_RUN_ID}/${SUITE_NAME} EXPERIMENT_ID=swell-${SUITE_NAME}-${GITHUB_RUN_ID} @@ -226,7 +212,7 @@ jobs: echo "experiment_id: $EXPERIMENT_ID" > $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml echo "experiment_root: $CI_WORKSPACE_JOB" >> $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml - echo "publish_directory: ~/swell_publication_location" >> $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml + echo "publish_directory: /discover/nobackup/gmao_ci/swell_publication_location" >> $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml COMPARISON_EXP_PATH_1=/discover/nobackup/gmao_ci/swell/tier2/stable/3dfgat_cycle-comparison/swell-3dfgat_cycle-comparison-*/swell-3dfgat_cycle-comparison-*-suite/experiment.yaml @@ -243,124 +229,6 @@ jobs: swell create ${CONFIG_NAME} -m defaults -p nccs_discover_sles15 -o $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml swell launch $CI_WORKSPACE_JOB/${EXPERIMENT_ID}/${EXPERIMENT_ID}-suite --no-detach --log_path $CI_WORKSPACE_JOB/${EXPERIMENT_ID} - # Move experiment directory on failure - swell-tier_2-3dfgat_cycle-failure: - - runs-on: nccs-discover - timeout-minutes: 30 - needs: swell-tier_2-3dfgat_cycle - if: failure() - - steps: - - name: Fail hold for 3dfgat_cycle - run: | - SUITE_NAME=3dfgat_cycle - CI_WORKSPACE_JOB=/discover/nobackup/gmao_ci/swell/tier2/${GITHUB_RUN_ID}/${SUITE_NAME} - mv $CI_WORKSPACE_JOB ${CI_WORKSPACE_JOB}_FAILED - - # Run 3dfgat_atmos suite - swell-tier_2-3dfgat_atmos: - - runs-on: nccs-discover - timeout-minutes: 600 - needs: swell-tier_2-build_jedi - - steps: - - - name: run-swell-3dfgat_atmos - run: | - CI_WORKSPACE=/discover/nobackup/gmao_ci/swell/tier2/${GITHUB_RUN_ID} - SUITE_NAME=3dfgat_atmos - CI_WORKSPACE_JOB=/discover/nobackup/gmao_ci/swell/tier2/${GITHUB_RUN_ID}/${SUITE_NAME} - EXPERIMENT_ID=swell-${SUITE_NAME}-${GITHUB_RUN_ID} - - mkdir -p $CI_WORKSPACE_JOB - - source /discover/nobackup/gmao_ci/swell/tier2/${GITHUB_RUN_ID}/modules - - # Get python version - PYVER=$(python3 -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')") - - export PATH=$CI_WORKSPACE/swell/bin:$PATH - export PYTHONPATH=${PYTHONPATH}:$CI_WORKSPACE/swell/lib/python$PYVER/site-packages - - echo "experiment_id: $EXPERIMENT_ID" > $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml - echo "experiment_root: $CI_WORKSPACE_JOB" >> $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml - - # Point to the active build - echo "existing_jedi_source_directory: /discover/nobackup/gmao_ci/swell/tier2/${GITHUB_RUN_ID}/build_jedi/jedi_bundle/source" >> $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml - echo "existing_jedi_build_directory: /discover/nobackup/gmao_ci/swell/tier2/${GITHUB_RUN_ID}/build_jedi/jedi_bundle/build" >> $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml - - rm -r -f $HOME/cylc-run/${EXPERIMENT_ID}-suite - - cd $CI_WORKSPACE_JOB - swell create ${SUITE_NAME}_tier2 -m defaults -p nccs_discover_sles15 -o $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml - swell launch $CI_WORKSPACE_JOB/${EXPERIMENT_ID}/${EXPERIMENT_ID}-suite --no-detach --log_path $CI_WORKSPACE_JOB/${EXPERIMENT_ID} - - swell-tier2-3dfgat_atmos-comparison: - - runs-on: nccs-discover - timeout-minutes: 30 - needs: swell-tier_2-3dfgat_atmos - - steps: - - name: run-swell-3dfgat_atmos-comparison - run: | - - CI_WORKSPACE=/discover/nobackup/gmao_ci/swell/tier2/${GITHUB_RUN_ID} - - COMPARISON_SUITE=3dfgat_atmos - SUITE_NAME=${COMPARISON_SUITE}-comparison - - CONFIG_NAME=compare_variational_atmosphere - - CI_WORKSPACE_JOB=/discover/nobackup/gmao_ci/swell/tier2/${GITHUB_RUN_ID}/${SUITE_NAME} - EXPERIMENT_ID=swell-${SUITE_NAME}-${GITHUB_RUN_ID} - - mkdir -p $CI_WORKSPACE_JOB - - source /discover/nobackup/gmao_ci/swell/tier2/${GITHUB_RUN_ID}/modules - - # Get python version - PYVER=$(python3 -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')") - - export PATH=$CI_WORKSPACE/swell/bin:$PATH - export PYTHONPATH=${PYTHONPATH}:$CI_WORKSPACE/swell/lib/python$PYVER/site-packages - - echo "experiment_id: $EXPERIMENT_ID" > $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml - echo "experiment_root: $CI_WORKSPACE_JOB" >> $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml - echo "publish_directory: ~/swell_publication_location" >> $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml - - COMPARISON_EXP_PATH_1=/discover/nobackup/gmao_ci/swell/tier2/stable/3dfgat_atmos-comparison/swell-3dfgat_atmos-comparison-*/swell-3dfgat_atmos-comparison-*-suite/experiment.yaml - - EXPERIMENT_ID_2=swell-${COMPARISON_SUITE}-${GITHUB_RUN_ID} - COMPARISON_EXP_PATH_2=/discover/nobackup/gmao_ci/swell/tier2/${GITHUB_RUN_ID}/${COMPARISON_SUITE}/${EXPERIMENT_ID_2}/${EXPERIMENT_ID_2}-suite/experiment.yaml - - echo "comparison_experiment_paths:" >> $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml - echo "- $COMPARISON_EXP_PATH_1" >> $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml - echo "- $COMPARISON_EXP_PATH_2" >> $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml - - rm -r -f $HOME/cylc-run/${EXPERIMENT_ID}-suite - - cd $CI_WORKSPACE_JOB - swell create ${CONFIG_NAME} -m defaults -p nccs_discover_sles15 -o $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml - swell launch $CI_WORKSPACE_JOB/${EXPERIMENT_ID}/${EXPERIMENT_ID}-suite --no-detach --log_path $CI_WORKSPACE_JOB/${EXPERIMENT_ID} - - # Move experiment directory on failure - swell-tier_2-3dfgat_atmos-failure: - - runs-on: nccs-discover - timeout-minutes: 30 - needs: swell-tier_2-3dfgat_atmos - if: failure() - - steps: - - name: Fail hold for 3dfgat_atmos - run: | - SUITE_NAME=3dfgat_atmos - CI_WORKSPACE_JOB=/discover/nobackup/gmao_ci/swell/tier2/${GITHUB_RUN_ID}/${SUITE_NAME} - mv $CI_WORKSPACE_JOB ${CI_WORKSPACE_JOB}_FAILED - # ------------------------------------------------------------- # STEP3: PERFORM UPDATES OF STABLE NIGHTLY POINTER AND CLEAN UP # ------------------------------------------------------------- @@ -371,7 +239,7 @@ jobs: runs-on: nccs-discover timeout-minutes: 30 - needs: [swell-tier_2-hofx, swell-tier_2-3dfgat_cycle, swell-tier_2-3dfgat_atmos, swell-tier_2-3dfgat_atmos-comparison, swell-tier_2-3dfgat_cycle-comparison] + needs: [swell-tier_2-test, swell-tier_2-comparison] steps: - name: Replace link to stable with link to current run and remove old directory @@ -401,11 +269,10 @@ jobs: runs-on: nccs-discover timeout-minutes: 30 - needs: [swell-tier_2-hofx, swell-tier_2-3dfgat_cycle, swell-tier_2-3dfgat_atmos, swell-tier2-3dfgat_cycle-comparison, swell-tier2-3dfgat_atmos-comparison] + needs: [swell-tier_2-test, swell-tier_2-comparison] if: always() # Always run the clean up, even if failed or cancelled steps: - - name: Remove the cylc logging directories run: | rm -r -f $HOME/cylc-run/swell-hofx-${GITHUB_RUN_ID}-suite From 9def79897231b60d8ca34dbbd33ba68af7b66136 Mon Sep 17 00:00:00 2001 From: Michael Anstett Date: Thu, 12 Feb 2026 16:56:49 -0500 Subject: [PATCH 4/4] Fixes --- .../swell-tier2_application_discover.yml | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/.github/workflows/swell-tier2_application_discover.yml b/.github/workflows/swell-tier2_application_discover.yml index 47d33b5..beb9b51 100644 --- a/.github/workflows/swell-tier2_application_discover.yml +++ b/.github/workflows/swell-tier2_application_discover.yml @@ -149,21 +149,21 @@ jobs: PYVER=$(python3 -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')") - export PATH=$CI_WORKSPACE/swell/bin:$PATH - export PYTHONPATH=${PYTHONPATH}:$CI_WORKSPACE/swell/lib/python$PYVER/site-packages + export PATH=$CI_WORKSPACE/swell/bin:$PATH + export PYTHONPATH=${PYTHONPATH}:$CI_WORKSPACE/swell/lib/python$PYVER/site-packages - cat > $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml < $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml <> $CI_WORKSPACE_JOB/${SUITE_NAME}-override.yaml - COMPARISON_EXP_PATH_1=/discover/nobackup/gmao_ci/swell/tier2/stable/3dfgat_cycle-comparison/swell-3dfgat_cycle-comparison-*/swell-3dfgat_cycle-comparison-*-suite/experiment.yaml + COMPARISON_EXP_PATH_1=/discover/nobackup/gmao_ci/swell/tier2/stable/${SUITE_NAME}/swell-${SUITE_NAME}-*/swell-${SUITE_NAME}-*-suite/experiment.yaml EXPERIMENT_ID_2=swell-${COMPARISON_SUITE}-${GITHUB_RUN_ID} COMPARISON_EXP_PATH_2=/discover/nobackup/gmao_ci/swell/tier2/${GITHUB_RUN_ID}/${COMPARISON_SUITE}/${EXPERIMENT_ID_2}/${EXPERIMENT_ID_2}-suite/experiment.yaml