aboutsummaryrefslogtreecommitdiff
path: root/Tools
diff options
context:
space:
mode:
authorKris Kennaway <kris@FreeBSD.org>2004-12-28 05:40:15 +0000
committerKris Kennaway <kris@FreeBSD.org>2004-12-28 05:40:15 +0000
commite47e30512604dbb97bf30d841a5b77aa05875d73 (patch)
tree312b3c3598dd45388e0694acb0f015fe0a5bc8ed /Tools
parentb6626b754a7a5e13e0aecb5c36a33da9440aa7d9 (diff)
downloadports-e47e30512604dbb97bf30d841a5b77aa05875d73.tar.gz
ports-e47e30512604dbb97bf30d841a5b77aa05875d73.zip
Overhaul of the job scheduler. The new scheduler runs builds
synchronously instead of probabilistically scheduling jobs, which means that the job load on a machine never exceeds a desired threshold, and we can preferentially use faster machines when they are available. This has a dramatic effect on package build throughput, although I don't yet have precise measurements of the performance improvements. Specifically, the changes are: * Introduce the new variable maxjobs in portbuild. This replaces the build scheduling weights previously listed in the mlist file, which now changes format to list the build machines only, ranked in order of preference for job dispatches (i.e. faster machines first). * The ${arch}/queue directory is used to list machines available for jobs (file content is the number of jobs currently running on the machine). Changes to files in this directory are serialized using lockf on the .lock file. * Claim a machine with the getmachine script, with the .lock held. This picks the machine with the fewest number of jobs running; if multiple machines have equal load, the one listed highest in the mlist file is chosen. The job counter is incremented, and the file removed if the counter reaches ${maxjobs} for that machine. If all machines are busy, sleep for 15 seconds and retry. * After we have claimed a machine, we run claim-chroot on it to claim an empty chroot, as before. If the claim fails, release the job from the queue with the releasemachine script and retry after a 15 second wait. * When the build is finished, decrement the job counter with the releasemachine script, with .lock held. * The checkmachines script now exists only to poll the load averages for admin convenience (every 2 minutes), and to ping for unreachable machines. When a machine cannot be reached, remove the entry in the queue directory to stop further job dispatches to it. This needs more work to deal with reinitialization of machines after they become available again. 
Additional changes to this file: * Exit if passed a null package name, to avoid badness later on * Send a nag-mail if pkg-plist errors are detected in the build
Notes
Notes: svn path=/head/; revision=125316
Diffstat (limited to 'Tools')
-rwxr-xr-xTools/portbuild/scripts/pdispatch59
1 files changed, 33 insertions, 26 deletions
diff --git a/Tools/portbuild/scripts/pdispatch b/Tools/portbuild/scripts/pdispatch
index 799a86b4c7b9..56c5743f86f6 100755
--- a/Tools/portbuild/scripts/pdispatch
+++ b/Tools/portbuild/scripts/pdispatch
@@ -36,21 +36,12 @@ if grep -qxF ${pkgname} ${pb}/${arch}/${branch}/duds; then
exit 1
fi
-args=${1+"$@"}
-
-num=$(wc -w ${pb}/${arch}/ulist | awk '{print $1}')
-random=$(jot -r 1 1 ${num})
-mach=$(cat ${pb}/${arch}/ulist | cut -f ${random} -d ' ' )
-# If ulist is empty, then all build machines are busy, so try again in 15 seconds.
-if [ -z "${mach}" ]; then
- echo "All machines busy, sleeping"
- sleep 15
- echo "Retrying build of ${pkgname}"
- make ${pkgname}
- exit 0
+if [ -z "${pkgname}" ]; then
+ echo "null packagename"
+ exit 1
fi
-set $mach
+args=${1+"$@"}
flags=""
noclean=0
if [ "x$NOCLEAN" != "x" ]; then
@@ -75,20 +66,27 @@ fi
if [ "x$TRYBROKEN" != "x" ]; then
flags="${flags} -trybroken"
fi
-host=$1
-. ${pb}/${arch}/portbuild.${host}
-
-while [ -z "${chroot}" ]; do
- echo "Claiming a directory for ${pkgname} on ${host}"
- # May still fail if ssh times out?
- chroot=$(ssh -a -n ${client_user}@${host} ${sudo_cmd} ${pb}/scripts/claim-chroot ${arch} ${branch} ${pkgname})
- status=$?
- if [ ! ${status} ]; then
- echo "!!! Exiting from claim-chroot with status ${status} (${host} ${pkgname})"
- exit ${status}
+
+host=
+chroot=
+while [ -z "${host}" -o -z "${chroot}" ]; do
+ chroot=
+ host=$(lockf ${pb}/${arch}/queue/.lock ${pb}/scripts/getmachine ${pb} ${arch} ${branch})
+ # If ulist is empty, then all build machines are busy, so try again in 15 seconds.
+ if [ -z "${host}" ]; then
+ sleep 15
+ else
+ test -f ${pb}/${arch}/portbuild.${host} && . ${pb}/${arch}/portbuild.${host}
+ chroot=$(ssh -a -n ${client_user}@${host} ${sudo_cmd} ${pb}/scripts/claim-chroot ${arch} ${branch} ${pkgname})
+ if [ -z "${chroot}" ]; then
+ echo "Failed to claim chroot on ${host}"
+ lockf ${pb}/${arch}/queue/.lock ${pb}/scripts/releasemachine ${arch} ${host}
+ fi
fi
done
-echo "--> got directory ${chroot}"
+echo ${chroot}@${host}
+
+test -f ${pb}/${arch}/portbuild.${host} && . ${pb}/${arch}/portbuild.${host}
echo "dispatching: ssh -a -t -n ${client_user}@${host} ${sudo_cmd} ${command} ${arch} ${branch} ${chroot} ${flags} \"$ED\" \"$PD\" \"$FD\" \"$BD\" \"$RD\" ${args}"
${pb}/scripts/ptimeout.host $timeout ssh -a -t -n ${client_user}@${host} ${sudo_cmd} ${command} ${arch} ${branch} ${chroot} ${flags} \"$ED\" \"$PD\" \"$FD\" \"$BD\" \"$RD\" ${args}
@@ -113,9 +111,17 @@ if [ "${error}" = 0 ]; then
touch ${pb}/${arch}/${branch}/packages/All/${pkgname}${PKGSUFFIX}
rm -f ${pb}/${arch}/${branch}/errors/${pkgname}.log
lockf ${pb}/${arch}/${branch}/failure.lock ${pb}/scripts/buildsuccess ${arch} ${branch} ${pkgname}
- if grep -q "even though it is marked BROKEN" ${pb}/${arch}/${branch}/logs/$pkgname.log; then
+ log=${pb}/${arch}/${branch}/logs/$pkgname.log
+ if grep -q "even though it is marked BROKEN" ${log}; then
echo | mail -s "${pkgname} BROKEN but built on ${arch} ${branch}" kris@FreeBSD.org
fi
+ if [ "${arch}" = "i386" ]; then
+ if grep -q "^list of .*file" ${log}; then
+ buildlogdir=$(realpath ${pb}/${arch}/${branch}/logs/)
+ baselogdir=$(basename ${buildlogdir})
+ (sed -e '/^build started/,$d' $log;echo;echo "For the full build log, see"; echo; echo " http://${master}/errorlogs/${arch}-errorlogs/${baselogdir}/$(basename $log)";echo;sed -e '1,/^=== Checking filesystem state/d' $log) | mail -s "${pkgname} pkg-plist errors on ${arch} ${branch}" kris@FreeBSD.org
+ fi
+ fi
else
log=${pb}/${arch}/${branch}/errors/${pkgname}.log
scp ${client_user}@${host}:${chroot}/tmp/${pkgname}.log ${log} || (echo ${chroot}@${host}; ssh -a -n ${client_user}@${host} ls -laR ${chroot}/tmp) | mail -s "${pkgname} logfile not found" kris@FreeBSD.org
@@ -127,4 +133,5 @@ fi
ssh -a -n ${client_user}@${host} ${sudo_cmd} ${pb}/scripts/clean-chroot ${arch} ${branch} ${chroot} ${noclean}
+lockf ${pb}/${arch}/queue/.lock ${pb}/scripts/releasemachine ${arch} ${host}
exit ${error}