diff --git a/services/htcondor/LICENSE b/services/htcondor/LICENSE new file mode 100644 index 0000000..b99d89e --- /dev/null +++ b/services/htcondor/LICENSE @@ -0,0 +1,23 @@ +The HTCondor MJF module is licensed under the MIT License +[http://www.opensource.org/licenses/mit-license.php] + +Copyright (c) 2014 Igor Sfiligoi, isfiligoi@ucsd.edu + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. + diff --git a/services/htcondor/README b/services/htcondor/README new file mode 100644 index 0000000..d7489c3 --- /dev/null +++ b/services/htcondor/README @@ -0,0 +1,29 @@ +The htcondor JobMachineFeatures services package +contains both the software and the configuration templates +needed to add MJF features to a HTCondor pool. 
+ +The module has two parts: +*) src/ contains the code to be used by HTCondor +*) etc/ contains the configuration templates + used by the code above + +Here are summary instructions on how to use it: +0) get files from github +1) put etc/wlcg-mjf-htcondor.config into /etc + on all worker nodes + (no customizations should be needed) +2) create a customized version of etc/wlcg-mjf-hs06.map + (possibly using the file in github as a starting point) + and put it in /etc on all worker nodes +3) put src/bootstrap/create_hostdir.sh + in some system area, e.g. /usr/local/sbin + run it once + and also add it to the boot sequence +4) put src/wrapper/set_job_env.source.sh + in some system area, e.g. /usr/local/libexec + and configure HTCondor to source it in + the USER_JOB_WRAPPER used by the system setup + +See the README in each section for more details. + + diff --git a/services/htcondor/etc/README b/services/htcondor/etc/README new file mode 100644 index 0000000..7934a0e --- /dev/null +++ b/services/htcondor/etc/README @@ -0,0 +1,35 @@ +The htcondor JobMachineFeatures services package +contains both the software and the configuration templates +needed to add MJF features to a HTCondor pool. + +This etc subdirectory contains the configuration templates +used by the module. + +There are two files: +*) wlcg-mjf-htcondor.config +*) wlcg-mjf-hs06.map + +Both should be copied into the /etc directory of all worker nodes, +after being properly customized. + +The aim is to allow a site admin to use +the same identical file on all the worker nodes, +but a site admin can choose to make node-by-node +config changes as well. + +wlcg-mjf-htcondor.config +======================== +This is the main config file. +There is generally no need to modify this, +unless you want to override autodetection. + + +wlcg-mjf-hs06.map +================= +This file is used to estimate the HS06 number, +based on the processor model number. 
+The file in the repository is just a template; +the site admin must populate it with the values +appropriate to the HW it is operating. + + diff --git a/services/htcondor/etc/wlcg-mjf-hs06.map b/services/htcondor/etc/wlcg-mjf-hs06.map new file mode 100644 index 0000000..8769189 --- /dev/null +++ b/services/htcondor/etc/wlcg-mjf-hs06.map @@ -0,0 +1,13 @@ +# This file must contain the mapping from CPUID to HS06 +# if the HS06 is not explicitly set in the config file. +# +# The file in git contains a couple example lines +# The site is expected to replace them with values +# appropriate for their setup. +# +# The semantics is +# "CPUID" HS06NUM +# + +"Intel(R) Xeon(R) CPU X5650 @ 2.67GHz" 213 + diff --git a/services/htcondor/etc/wlcg-mjf-htcondor.config b/services/htcondor/etc/wlcg-mjf-htcondor.config new file mode 100644 index 0000000..6e826f5 --- /dev/null +++ b/services/htcondor/etc/wlcg-mjf-htcondor.config @@ -0,0 +1,46 @@ +# This file contains the configuration knobs needed +# by the HTCondor-specific WLCG MJF service tools +# +# Syntax is +# key=val +# No spaces allowed. 
+# +# The file is expected to reside in +# /etc/wlcg-mjf-htcondor.config +# + +# +# Host config location (absolute) +# +MACHINEFEATURES=/var/run/wlcg-mjf-host-features + +# +# Job config location (relative to job startup dir) +# +JOBFEATURES=wlcg-mjf-job-features + +# +# The directory used for the job->host communication +# +JOBSTATUS=wlcg-rmjf-job-status + +# +# Various global knobs that can be set by hand +# If not set, the system will try to auto-detect them +# + +# Host specific ones +# +HS06MAPFILE=/etc/wlcg-mjf-hs06.map +#HS06VAL=10 +#NUM_CPUS=1 +#NUM_HT_CPUS=2 +#NUM_SLOTS=1 + +# Job specific ones +# +#SLOT_CPUS=1 +#SLOT_MEM=2500 +#SLOT_DISK=20 +#SLOT_TIME=250000 + diff --git a/services/htcondor/src/README b/services/htcondor/src/README new file mode 100644 index 0000000..421df13 --- /dev/null +++ b/services/htcondor/src/README @@ -0,0 +1,24 @@ +The htcondor JobMachineFeatures services package +contains both the software and the configuration templates +needed to add MJF features to a HTCondor pool. + +This src subdirectory contains the code to be used by HTCondor. + +It contains 2 subdirectories: +*) bootstrap/ contains the code to configure the node +*) wrapper/ contains the code needed at each job startup + +Here are summary instructions on how they are to be used: +0) properly configure the node config files +1) put bootstrap/create_hostdir.sh + in some system area, e.g. /usr/local/sbin + run it once + and also add it to the boot sequence +2) put wrapper/set_job_env.source.sh + in some system area, e.g. /usr/local/libexec + and configure HTCondor to source it in + the USER_JOB_WRAPPER used by the system setup + +See the README in each section for more details. 
+ + diff --git a/services/htcondor/src/bootstrap/README b/services/htcondor/src/bootstrap/README new file mode 100644 index 0000000..33ea3c5 --- /dev/null +++ b/services/htcondor/src/bootstrap/README @@ -0,0 +1,29 @@ +The htcondor JobMachineFeatures services package +contains both the software and the configuration templates +needed to add MJF features to a HTCondor pool. + +This subdirectory contains the code to configure the node. + +There is only one file at this time: +*) create_hostdir.sh + +It should be copied into some system area, e.g. /usr/local/sbin, +run once, and also added to the boot sequence. + +create_hostdir.sh +======================== +This script parses the configuration files and creates +the directories and files that describe the host. + +If autodetection is enabled (the default), it will fail +if some of the values cannot be determined. + +There are two typical failure modes: +a) Cannot determine the HTCondor properties, + e.g. if HTCondor is not properly installed +b) Cannot determine the HS06 value, + e.g. if the CPU model is not listed in the mapfile + +Please note that successful completion is needed +for the package to work. 
#!/bin/bash
#
# services/htcondor/src/bootstrap/create_hostdir.sh
#
# Populate the WLCG MachineFeatures directory for this worker node.
#
# Reads /etc/wlcg-mjf-htcondor.config (key=val lines, no spaces) and writes
# one value per file into the directory named by its MACHINEFEATURES key:
#   log_cores, phys_cores, jobslots, hs06
# Any value not set in the config is auto-detected from /proc/cpuinfo,
# condor_config_val and the HS06 map file.
#
# Exit status:
#   1  MACHINEFEATURES is missing from the config, or no HS06 value was found
#   2  the MACHINEFEATURES directory could not be created or written to

CFG=/etc/wlcg-mjf-htcondor.config

# Print the value of config key $1 from $CFG (first matching line only).
# Everything after the first '=' is the value, so values may contain '='.
# Prints nothing when the key is absent or commented out.
cfg_get() {
  awk -v key="$1" '
    index($0, key "=") == 1 { print substr($0, length(key) + 2); exit }
  ' "$CFG"
}

#
# Find the directory
#

MACHINEFEATURES=$(cfg_get MACHINEFEATURES)
if [ -z "$MACHINEFEATURES" ]; then
  echo "Could not find the MACHINEFEATURES param in $CFG" 1>&2
  exit 1
fi

#
# Put in place CPU info
#
NUM_CPUS=$(cfg_get NUM_CPUS)
if [ -z "${NUM_CPUS}" ]; then
  # one "processor" stanza per CPU listed by the kernel
  NUM_CPUS=$(grep -c '^processor' /proc/cpuinfo)
fi

NUM_HT_CPUS=$(cfg_get NUM_HT_CPUS)
if [ -z "${NUM_HT_CPUS}" ]; then
  # use the "siblings" value of the first CPU stanza only
  NUM_HT_CPUS=$(awk '
    /^siblings/ {
      pos = index($0, ": ")
      if (pos) print substr($0, pos + 2)
      exit
    }' /proc/cpuinfo)
  if [ -z "${NUM_HT_CPUS}" ]; then
    NUM_HT_CPUS=${NUM_CPUS}
  fi
fi

#
# Find number of job slots
# Query HTCondor, if needed
#
NUM_SLOTS=$(cfg_get NUM_SLOTS)
if [ -z "${NUM_SLOTS}" ]; then
  # stderr is dropped on purpose: an unset knob or a missing
  # condor_config_val is handled by the fallbacks below
  NUM_SLOTS=$(condor_config_val NUM_SLOTS 2>/dev/null)
  if [ -z "${NUM_SLOTS}" ]; then
    # make the best guess
    NUM_SLOTS=$(condor_config_val NUM_CPUS 2>/dev/null)
    if [ -z "${NUM_SLOTS}" ]; then
      # just default to something, do not fail
      echo "Could not find the number of slots, defaulting to ${NUM_CPUS}" 1>&2
      NUM_SLOTS=${NUM_CPUS}
    fi
  fi
fi

#
# Find the HS06 number
#
HS06=$(cfg_get HS06VAL)
if [ -z "$HS06" ]; then
  HS06MAPFILE=$(cfg_get HS06MAPFILE)
  if [ -z "$HS06MAPFILE" ]; then
    HS06MAPFILE=/etc/wlcg-mjf-hs06.map
  fi

  # Map-file lines look like:  "model name string" HS06NUM
  # so the lookup key is the first "model name" value wrapped in quotes.
  CPUID=$(awk '
    /^model name/ {
      pos = index($0, ": ")
      if (pos) print "\"" substr($0, pos + 2) "\""
      exit
    }' /proc/cpuinfo)

  if [ -n "$CPUID" ]; then
    # Literal prefix match: the CPU model contains characters such as
    # '.' that must not be interpreted as a regular expression.
    HS06=$(awk -v key="${CPUID} " '
      index($0, key) == 1 {
        $0 = substr($0, length(key) + 1)
        print $1
        exit
      }' "$HS06MAPFILE")
  fi

  if [ -z "$HS06" ]; then
    # too hard to guess a valid default, fail
    echo "Could not find a HS06 mapping in $HS06MAPFILE" 1>&2
    echo "CPUID ${CPUID}" 1>&2
    exit 1
  fi
fi

#
# Now that we have all the info, put it on disk
#

umask 0022
if [ ! -d "$MACHINEFEATURES" ]; then
  if ! { mkdir -p -- "$MACHINEFEATURES" && chmod 0755 -- "$MACHINEFEATURES"; }; then
    echo "Failed to create $MACHINEFEATURES" 1>&2
    exit 2
  fi
fi

# NOTE(review): NUM_CPUS (count of "processor" entries) goes to log_cores and
# NUM_HT_CPUS ("siblings") goes to phys_cores; confirm that this mapping is
# the intended one for multi-socket / hyper-threaded hosts.
if ! {
  printf '%s\n' "${NUM_CPUS}" > "$MACHINEFEATURES/log_cores" &&
  printf '%s\n' "${NUM_HT_CPUS}" > "$MACHINEFEATURES/phys_cores" &&
  printf '%s\n' "${NUM_SLOTS}" > "$MACHINEFEATURES/jobslots" &&
  printf '%s\n' "${HS06}" > "$MACHINEFEATURES/hs06"
}; then
  echo "Failed to write into $MACHINEFEATURES" 1>&2
  exit 2
fi

# ---------------------------------------------------------------------------
# services/htcondor/src/wrapper/README  (next file in this patch bundle)
# ---------------------------------------------------------------------------
# The htcondor JobMachineFeatures services package
# contains both the software and the configuration templates
# needed to add MJF features to a HTCondor pool.
#
# This subdirectory contains the code needed at each job startup.
#
# There is only one file at this time:
# *) set_job_env.source.sh
#
# It should be copied in some system area,
# e.g. /usr/local/libexec
# and sourced as part of the
# HTCondor's USER_JOB_WRAPPER bash-based script (not included).
#
# set_job_env.source.sh
# ========================
# This script parses the configuration files,
# creates the necessary directories and files,
# and sets the environment variables as per the
# JobMachineFeatures specification, i.e.
# MACHINEFEATURES, JOBFEATURES and JOBSTATUS
#
# More details can be found at
# https://twiki.cern.ch/twiki/bin/view/LCG/WMTEGEnvironmentVariables
#
# If any problems are detected, the env variables will not be set.
# services/htcondor/src/wrapper/set_job_env.source.sh
#
# This file should be sourced before the job starts
# It expects to run inside bash
#
# It reads /etc/wlcg-mjf-htcondor.config and the HTCondor machine ad
# ($_CONDOR_MACHINE_AD, defaulting to $PWD/.machine.ad), creates the per-job
# feature and status directories under $PWD, fills in the job feature files
# and exports MACHINEFEATURES, JOBFEATURES and JOBSTATUS.
# A variable whose setup fails is unset rather than left pointing at an
# unusable location.

WLCG_MJF_CFG=/etc/wlcg-mjf-htcondor.config

# Print the value of key $1 from the key=val file $2 (first match only).
# Everything after the first '=' is the value; prints nothing if absent.
_wlcg_mjf_cfg_get() {
  awk -v key="$1" '
    index($0, key "=") == 1 { print substr($0, length(key) + 2); exit }
  ' "$2"
}

# Print the right-hand side of the "Attr = value" line for attribute $1 in
# the ClassAd file $2 (first match only). Prints nothing, and returns
# non-zero, when the file is not readable.
_wlcg_mjf_ad_get() {
  [ -r "$2" ] || return 1
  awk -v attr="$1" '
    index($0, attr " = ") == 1 { print substr($0, length(attr) + 4); exit }
  ' "$2"
}

if [ -z "$_CONDOR_MACHINE_AD" ]; then
  # just so it is defined for grepping
  _CONDOR_MACHINE_AD="$PWD/.machine.ad"
fi

MACHINEFEATURES=$(_wlcg_mjf_cfg_get MACHINEFEATURES "${WLCG_MJF_CFG}")
if [ -n "$MACHINEFEATURES" ]; then
  ################################# MACHINE FEATURES #####################
  LOCALJOBFEATURES=$(_wlcg_mjf_cfg_get JOBFEATURES "${WLCG_MJF_CFG}")
  if [ -z "$LOCALJOBFEATURES" ]; then
    # easy to pick a valid default
    LOCALJOBFEATURES=wlcg-mjf-job-features
  fi

  JOBFEATURES="${PWD}/${LOCALJOBFEATURES}"
  if mkdir "${JOBFEATURES}"; then
    ################################# JOB FEATURES #####################

    # make it /tmp like
    chmod 1777 "${JOBFEATURES}"

    # When needed values are not in the config file, look them up in the
    # MachineAd. If there is nothing there, put in reasonable defaults

    START_TIME=$(_wlcg_mjf_ad_get EnteredCurrentActivity "${_CONDOR_MACHINE_AD}")
    if [ -z "${START_TIME}" ]; then
      START_TIME=$(date +%s)
    fi

    SLOT_CPUS=$(_wlcg_mjf_cfg_get SLOT_CPUS "${WLCG_MJF_CFG}")
    if [ -z "${SLOT_CPUS}" ]; then
      SLOT_CPUS=$(_wlcg_mjf_ad_get Cpus "${_CONDOR_MACHINE_AD}")
      if [ -z "${SLOT_CPUS}" ]; then
        SLOT_CPUS=1
      fi
    fi

    SLOT_MEM=$(_wlcg_mjf_cfg_get SLOT_MEM "${WLCG_MJF_CFG}")
    if [ -z "${SLOT_MEM}" ]; then
      SLOT_MEM=$(_wlcg_mjf_ad_get Memory "${_CONDOR_MACHINE_AD}")
      if [ -z "${SLOT_MEM}" ]; then
        SLOT_MEM=2500
      fi
    fi

    SLOT_DISK=$(_wlcg_mjf_cfg_get SLOT_DISK "${WLCG_MJF_CFG}")
    if [ -z "${SLOT_DISK}" ]; then
      SLOT_DISK_RAW=$(_wlcg_mjf_ad_get Disk "${_CONDOR_MACHINE_AD}")
      if [ -n "${SLOT_DISK_RAW}" ]; then
        # scale the machine ad value down by 10^6 for disk_limit_GB
        SLOT_DISK=$(awk -v kb="${SLOT_DISK_RAW}" 'BEGIN { print int(kb / 1000000) }')
      fi
      if [ -z "${SLOT_DISK}" ]; then
        SLOT_DISK=20
      fi
    fi

    SLOT_TIME=$(_wlcg_mjf_cfg_get SLOT_TIME "${WLCG_MJF_CFG}")
    if [ -z "${SLOT_TIME}" ]; then
      SLOT_TIME_RAW=$(_wlcg_mjf_ad_get MaxJobRetirementTime "${_CONDOR_MACHINE_AD}")
      # The attribute may be a constant expression such as "3600 * 24".
      # Evaluate it only when it is made of digits and arithmetic operators;
      # anything else (empty, attribute references, function calls) falls
      # through to the default below. (( )) reports a malformed expression
      # without aborting the sourcing script.
      _wlcg_mjf_arith_re='^[0-9 ()*/+-]+$'
      if [[ "${SLOT_TIME_RAW}" == *[0-9]* && "${SLOT_TIME_RAW}" =~ ${_wlcg_mjf_arith_re} ]]; then
        (( SLOT_TIME = SLOT_TIME_RAW ))
      fi
      unset _wlcg_mjf_arith_re
      if [ -z "${SLOT_TIME}" ]; then
        SLOT_TIME=250000
      fi
    fi

    printf '%s\n' "${START_TIME}" > "${JOBFEATURES}/jobstart_secs"
    printf '%s\n' "${SLOT_CPUS}" > "${JOBFEATURES}/allocated_CPU"
    printf '%s\n' "${SLOT_MEM}" > "${JOBFEATURES}/mem_limit_MB"
    printf '%s\n' "${SLOT_DISK}" > "${JOBFEATURES}/disk_limit_GB"
    printf '%s\n' "${SLOT_TIME}" > "${JOBFEATURES}/wall_limit_secs"
    printf '%s\n' "${SLOT_TIME}" > "${JOBFEATURES}/wall_limit_secs_lrms"
    printf '%s\n' 1 > "${JOBFEATURES}/cpufactor_lrms"

    ################################# JOB FEATURES #####################
    export JOBFEATURES

    LOCALJOBSTATUS=$(_wlcg_mjf_cfg_get JOBSTATUS "${WLCG_MJF_CFG}")
    if [ -z "$LOCALJOBSTATUS" ]; then
      # easy to pick a valid default
      LOCALJOBSTATUS=wlcg-rmjf-job-status
    fi

    JOBSTATUS="${PWD}/${LOCALJOBSTATUS}"
    if mkdir "${JOBSTATUS}"; then
      ################################# JOB STATUS #####################

      # make sure it is world readable
      chmod 0755 "${JOBSTATUS}"

      ################################# JOB STATUS #####################
      export JOBSTATUS
    else
      # directory could not be created: do not advertise it
      unset JOBSTATUS
    fi # if jobstatus created

  else
    # directory could not be created: do not advertise it
    unset JOBFEATURES
  fi # if jobfeatures created

  ################################# MACHINE FEATURES #####################
  export MACHINEFEATURES
else
  # not configured: make sure an empty value is not left behind
  unset MACHINEFEATURES
fi # if MACHINEFEATURES