diff --git a/DATA/production/configurations/asyncReco/async_pass.sh b/DATA/production/configurations/asyncReco/async_pass.sh index c83c65c83..d2665a647 100755 --- a/DATA/production/configurations/asyncReco/async_pass.sh +++ b/DATA/production/configurations/asyncReco/async_pass.sh @@ -412,67 +412,46 @@ fi echo "SETTING_ROOT_OUTPUT = $SETTING_ROOT_OUTPUT" # Enabling GPUs -if [[ -n "$ALIEN_JDL_USEGPUS" && $ALIEN_JDL_USEGPUS != 0 ]] ; then - echo "Enabling GPUS" - [[ -z ${GPUTYPE:-} ]] && export GPUTYPE="HIP" - [[ -z ${GPUMEMSIZE:-} ]] && export GPUMEMSIZE=$((25 << 30)) - if [[ "0$ASYNC_PASS_NO_OPTIMIZED_DEFAULTS" != "01" ]]; then +if [[ $ASYNC_PASS_NO_OPTIMIZED_DEFAULTS != 1 ]]; then + export OPTIMIZED_PARALLEL_ASYNC_AUTO_SHM_LIMIT=1 + if [[ $ALIEN_JDL_USEGPUS == 1 ]] ; then + echo "Enabling GPUS" + if [[ -z $ALIEN_JDL_SITEARCH ]]; then echo "ERROR: Must set ALIEN_JDL_SITEARCH to define GPU architecture!"; exit 1; fi + if [[ $ALIEN_JDL_SITEARCH == "NERSC" ]]; then # Disable mlock / ulimit / gpu memory registration - has performance impact, but doesn't work at NERSC for now + export SETENV_NO_ULIMIT=1 + export CONFIG_EXTRA_PROCESS_o2_gpu_reco_workflow+="GPU_proc.noGPUMemoryRegistration=1;" + fi + ALIEN_JDL_SITEARCH_TMP=$ALIEN_JDL_SITEARCH + if [[ $ALIEN_JDL_SITEARCH == "EPN_MI100" ]]; then + ALIEN_JDL_SITEARCH_TMP=EPN + export EPN_NODE_MI100=1 + elif [[ $ALIEN_JDL_SITEARCH == "EPN_MI50" ]]; then + ALIEN_JDL_SITEARCH_TMP=EPN + fi if [[ "ALIEN_JDL_USEFULLNUMADOMAIN" == 0 ]]; then if [[ $keep -eq 0 ]]; then if [[ $ALIEN_JDL_UNOPTIMIZEDGPUSETTINGS != 1 ]]; then - export OPTIMIZED_PARALLEL_ASYNC=pp_1gpu # sets the multiplicities to optimized defaults for this configuration (1 job with 1 gpu on EPNs) - export OPTIMIZED_PARALLEL_ASYNC_AUTO_SHM_LIMIT=1 + export OPTIMIZED_PARALLEL_ASYNC=pp_1gpu_${ALIEN_JDL_SITEARCH_TMP} # (16 cores, 1 gpu per job, pp) else - # forcing multiplicities to be 1 - export MULTIPLICITY_PROCESS_tof_matcher=1 - export MULTIPLICITY_PROCESS_mch_cluster_finder=1 - export MULTIPLICITY_PROCESS_tpc_entropy_decoder=1 - export MULTIPLICITY_PROCESS_itstpc_track_matcher=1 - export MULTIPLICITY_PROCESS_its_tracker=1 - export OMP_NUM_THREADS=4 - export TIMEFRAME_RATE_LIMIT=8 - export SHMSIZE=30000000000 + export OPTIMIZED_PARALLEL_ASYNC=pp_1gpu_${ALIEN_JDL_SITEARCH_TMP}_unoptimized # (16 cores, 1 gpu per job, pp, low CPU multiplicities) fi else - export TIMEFRAME_RATE_LIMIT=4 - export SHMSIZE=30000000000 + export OPTIMIZED_PARALLEL_ASYNC=keep_root fi else if [[ $BEAMTYPE == "pp" ]]; then - export OPTIMIZED_PARALLEL_ASYNC=pp_4gpu # sets the multiplicities to optimized defaults for this configuration (1 Numa, pp) - export OPTIMIZED_PARALLEL_ASYNC_AUTO_SHM_LIMIT=1 + export OPTIMIZED_PARALLEL_ASYNC=pp_4gpu_${ALIEN_JDL_SITEARCH_TMP} # (64 cores, 1 NUMA, 4 gpu per job, pp) else # PbPb - export OPTIMIZED_PARALLEL_ASYNC=PbPb_4gpu # sets the multiplicities to optimized defaults for this configuration (1 Numa, PbPb) - export OPTIMIZED_PARALLEL_ASYNC_AUTO_SHM_LIMIT=1 + export OPTIMIZED_PARALLEL_ASYNC=PbPb_4gpu_${ALIEN_JDL_SITEARCH_TMP} # (64 cores, 1 NUMA 4 gpu per job, PbPb) fi fi - fi -else - # David, Oct 13th - # the optimized settings for the 8 core GRID queue without GPU are - # (overwriting the values above) - # - if [[ "0$ASYNC_PASS_NO_OPTIMIZED_DEFAULTS" != "01" ]]; then - if [[ "$ALIEN_JDL_EPNFULLNUMACPUONLY" != 1 ]]; then - if [[ $BEAMTYPE == "pp" ]]; then - if (( $(echo "$RUN_IR > 800000" | bc -l) )); then - export TIMEFRAME_RATE_LIMIT=1 - elif (( $(echo "$RUN_IR < 50000" | bc -l) )); then - export TIMEFRAME_RATE_LIMIT=6 - else - export TIMEFRAME_RATE_LIMIT=3 - fi - export OPTIMIZED_PARALLEL_ASYNC=pp_8cpu # sets the multiplicities to optimized defaults for this configuration (grid) - export SHMSIZE=16000000000 - else # PbPb - export TIMEFRAME_RATE_LIMIT=2 - export OPTIMIZED_PARALLEL_ASYNC=pp_8cpu - export SHMSIZE=16000000000 - export SVERTEX_THREADS=5 - fi + else + export SETENV_NO_ULIMIT=1 + export DPL_DEFAULT_PIPELINE_LENGTH=16 # to avoid memory issues - affects performance, so don't do with GPUs + if [[ $ALIEN_JDL_EPNFULLNUMACPUONLY != 1 ]]; then + export OPTIMIZED_PARALLEL_ASYNC=8cpu # (8 cores per job, grid) else - export OPTIMIZED_PARALLEL_ASYNC=pp_64cpu # to use EPNs with full NUMA domain but without GPUs - export OPTIMIZED_PARALLEL_ASYNC_AUTO_SHM_LIMIT=1 + export OPTIMIZED_PARALLEL_ASYNC=pp_64cpu # (64 cores per job, 1 NUMA, EPN) fi fi fi diff --git a/DATA/production/configurations/asyncReco/setenv_extra.sh b/DATA/production/configurations/asyncReco/setenv_extra.sh index fb74d36f3..c9a822550 100644 --- a/DATA/production/configurations/asyncReco/setenv_extra.sh +++ b/DATA/production/configurations/asyncReco/setenv_extra.sh @@ -3,15 +3,6 @@ # process flags passed to the script -if [[ -z "$ALIEN_JDL_USEGPUS" || $ALIEN_JDL_USEGPUS != 1 ]]; then - export SETENV_NO_ULIMIT=1 -fi - -# to avoid memory issues - we don't do this on the EPNs, since it can affect the performance -if [[ $ALIEN_JDL_USEGPUS != 1 ]]; then - export DPL_DEFAULT_PIPELINE_LENGTH=16 -fi - # check if this is a production on skimmed data if grep -q /skimmed/ wn.xml ; then export ON_SKIMMED_DATA=1; diff --git a/DATA/production/workflow-multiplicities.sh b/DATA/production/workflow-multiplicities.sh index 3929e0456..76679ade9 100644 --- a/DATA/production/workflow-multiplicities.sh +++ b/DATA/production/workflow-multiplicities.sh @@ -53,11 +53,23 @@ if [[ ! -z ${OPTIMIZED_PARALLEL_ASYNC:-} ]]; then [[ ! -z ${TIMEFRAME_RATE_LIMIT:-} ]] && unset TIMEFRAME_RATE_LIMIT [[ ! -z ${SHMSIZE:-} ]] && unset SHMSIZE fi - if [[ $OPTIMIZED_PARALLEL_ASYNC == "pp_8cpu" ]]; then + if [[ $OPTIMIZED_PARALLEL_ASYNC == "8cpu" ]]; then [[ -z ${TIMEFRAME_RATE_LIMIT:-} ]] && TIMEFRAME_RATE_LIMIT=3 [[ -z ${SHMSIZE:-} ]] && SHMSIZE=16000000000 NGPURECOTHREADS=5 - elif [[ $OPTIMIZED_PARALLEL_ASYNC == "pp_16cpu" ]]; then + if [[ $BEAMTYPE == "pp" ]]; then + if (( $(echo "$RUN_IR > 800000" | bc -l) )); then + TIMEFRAME_RATE_LIMIT=1 + elif (( $(echo "$RUN_IR < 50000" | bc -l) )); then + TIMEFRAME_RATE_LIMIT=6 + else + TIMEFRAME_RATE_LIMIT=3 + fi + else # PbPb + TIMEFRAME_RATE_LIMIT=2 + SVERTEX_THREADS=5 + fi + elif [[ $OPTIMIZED_PARALLEL_ASYNC == "16cpu" ]]; then [[ -z ${TIMEFRAME_RATE_LIMIT:-} ]] && TIMEFRAME_RATE_LIMIT=8 [[ -z ${SHMSIZE:-} ]] && SHMSIZE=22000000000 NGPURECOTHREADS=9 @@ -78,20 +90,28 @@ if [[ ! -z ${OPTIMIZED_PARALLEL_ASYNC:-} ]]; then N_MCHCL=3 N_TOFMATCH=2 N_TPCENTDEC=3 - elif [[ $OPTIMIZED_PARALLEL_ASYNC == "pp_1gpu" ]]; then + elif [[ $OPTIMIZED_PARALLEL_ASYNC == "pp_1gpu_EPN" || $OPTIMIZED_PARALLEL_ASYNC == "pp_1gpu_EPN_unoptimized" ]]; then [[ -z ${TIMEFRAME_RATE_LIMIT:-} ]] && TIMEFRAME_RATE_LIMIT=8 [[ -z ${SHMSIZE:-} ]] && SHMSIZE=30000000000 - N_TOFMATCH=2 - N_MCHCL=3 - N_TPCENTDEC=2 - N_TPCITS=3 + NGPUS=1 + GPUTYPE=HIP + GPUMEMSIZE=$((25 << 30)) N_MCHTRK=2 - N_ITSTRK=3 - NGPURECOTHREADS=8 + N_TPCTRK=$NGPUS + if [[ $OPTIMIZED_PARALLEL_ASYNC == "pp_1gpu_EPN" ]]; then + N_TOFMATCH=2 + N_MCHCL=3 + N_TPCENTDEC=2 + N_TPCITS=3 + N_ITSTRK=3 + NGPURECOTHREADS=8 + else + NGPURECOTHREADS=4 + fi NTRDTRKTHREADS=3 ITSTRK_THREADS=2 ITSTPC_THREADS=2 - elif [[ $OPTIMIZED_PARALLEL_ASYNC =~ ^pp_4gpu(_|$) ]]; then + elif [[ $OPTIMIZED_PARALLEL_ASYNC == "pp_gpu_NERSC" || $OPTIMIZED_PARALLEL_ASYNC == "pp_4gpu_EPN" ]]; then if [[ -z ${TIMEFRAME_RATE_LIMIT:-} ]]; then if [[ ! -z ${ALIEN_JDL_LPMANCHORYEAR} && ${ALIEN_JDL_LPMANCHORYEAR} -lt 2023 ]]; then TIMEFRAME_RATE_LIMIT=45 @@ -100,19 +120,21 @@ if [[ ! -z ${OPTIMIZED_PARALLEL_ASYNC:-} ]]; then fi fi [[ -z ${SHMSIZE:-} ]] && SHMSIZE=100000000000 - if [[ $OPTIMIZED_PARALLEL_ASYNC == "pp_4gpu_NERSC" ]]; then + if [[ $OPTIMIZED_PARALLEL_ASYNC == "pp_gpu_NERSC" ]]; then NGPUS=1 GPUTYPE=CUDA else NGPUS=4 + GPUTYPE=HIP fi + GPUMEMSIZE=$((25 << 30)) NGPURECOTHREADS=8 NTRDTRKTHREADS=2 ITSTRK_THREADS=2 ITSTPC_THREADS=2 SVERTEX_THREADS=4 TPCTIMESERIES_THREADS=2 - N_TPCTRK=4 + N_TPCTRK=$NGPUS N_FWDMATCH=2 N_PRIMVTXMATCH=1 N_PRIMVTX=1 @@ -128,16 +150,18 @@ if [[ ! -z ${OPTIMIZED_PARALLEL_ASYNC:-} ]]; then N_ITSTRK=12 N_ITSCL=2 export DPL_SMOOTH_RATE_LIMITING=1 - elif [[ $OPTIMIZED_PARALLEL_ASYNC =~ ^PbPb_4gpu(_|$) ]]; then + elif [[ $OPTIMIZED_PARALLEL_ASYNC == "PbPb_gpu_NERSC" || $OPTIMIZED_PARALLEL_ASYNC == "PbPb_4gpu_EPN" ]]; then [[ -z ${TIMEFRAME_RATE_LIMIT:-} ]] && TIMEFRAME_RATE_LIMIT=35 [[ -z ${SHMSIZE:-} ]] && SHMSIZE=100000000000 # SHM_LIMIT 3/4 [[ -z ${TIMEFRAME_SHM_LIMIT:-} ]] && TIMEFRAME_SHM_LIMIT=$(($SHMSIZE / 3)) - if [[ $OPTIMIZED_PARALLEL_ASYNC == "PbPb_4gpu_NERSC" ]]; then + if [[ $OPTIMIZED_PARALLEL_ASYNC == "PbPb_gpu_NERSC" ]]; then NGPUS=1 GPUTYPE=CUDA else NGPUS=4 + GPUTYPE=HIP fi + GPUMEMSIZE=$((25 << 30)) NGPURECOTHREADS=8 NTRDTRKTHREADS=8 ITSTRK_THREADS=5 @@ -172,6 +196,9 @@ if [[ ! -z ${OPTIMIZED_PARALLEL_ASYNC:-} ]]; then N_MCHTRK=1 N_TOFMATCH=9 N_TPCTRK=6 + elif [[ $OPTIMIZED_PARALLEL_ASYNC == "keep_root" ]]; then + TIMEFRAME_RATE_LIMIT=4 + SHMSIZE=30000000000 else echo "Invalid optimized setting '$OPTIMIZED_PARALLEL_ASYNC'" 1>&2 exit 1