aboutsummaryrefslogtreecommitdiff
path: root/usr.sbin/watchdogd
diff options
context:
space:
mode:
author: Alfred Perlstein <alfred@FreeBSD.org> 2013-02-27 19:03:31 +0000
committer: Alfred Perlstein <alfred@FreeBSD.org> 2013-02-27 19:03:31 +0000
commit: 4b9b732ac0023efb10c6f3f61d804cfe1fb785e6 (patch)
tree: c811638778fdb39d8898681e0f6391ecf9ba3e04 /usr.sbin/watchdogd
parent: dc1558d1cd79a8244a37aaee3d50c97b0657b5ff (diff)
download: src-4b9b732ac0023efb10c6f3f61d804cfe1fb785e6.tar.gz
src-4b9b732ac0023efb10c6f3f61d804cfe1fb785e6.zip
watchdogd(8) and watchdog(4) enhancements.
The following support was added to watchdog(4):
- Support to query the outstanding timeout.
- Support to set a software pre-timeout function watchdog with an 'action'.
- Support to set a software only watchdog with a configurable 'action'.

'action' can be a mask specifying a single operation or a combination of: log(9), printf(9), panic(9) and/or kdb_enter(9).

Support the following in watchdogd:
- Support to utilize the new additions to watchdog(4).
- Support to warn if a watchdog script runs for too long.
- Support for "dry run" where we do not actually arm the watchdog, but only report on our timing.

Sponsored by: iXsystems, Inc.
MFC after: 1 month
Notes
Notes: svn path=/head/; revision=247405
Diffstat (limited to 'usr.sbin/watchdogd')
-rw-r--r-- usr.sbin/watchdogd/watchdogd.8 104
-rw-r--r-- usr.sbin/watchdogd/watchdogd.c 333
2 files changed, 418 insertions, 19 deletions
diff --git a/usr.sbin/watchdogd/watchdogd.8 b/usr.sbin/watchdogd/watchdogd.8
index b00fef17cdba..096eb9d55b76 100644
--- a/usr.sbin/watchdogd/watchdogd.8
+++ b/usr.sbin/watchdogd/watchdogd.8
@@ -1,3 +1,5 @@
+.\" Copyright (c) 2013 iXsystems.com,
+.\" author: Alfred Perlstein <alfred@freebsd.org>
.\" Copyright (c) 2004 Poul-Henning Kamp <phk@FreeBSD.org>
.\" Copyright (c) 2003 Sean M. Kelly <smkelly@FreeBSD.org>
.\" All rights reserved.
@@ -25,7 +27,7 @@
.\"
.\" $FreeBSD$
.\"
-.Dd September 2, 2006
+.Dd September 2, 2013
.Dt WATCHDOGD 8
.Os
.Sh NAME
@@ -33,11 +35,17 @@
.Nd watchdog daemon
.Sh SYNOPSIS
.Nm
-.Op Fl d
+.Op Fl dnw
+.Op Fl -debug
+.Op Fl -softtimeout
+.Op Fl -softtimeout-action Ar action
+.Op Fl -pretimeout Ar timeout
+.Op Fl -pretimeout-action Ar action
.Op Fl e Ar cmd
.Op Fl I Ar file
.Op Fl s Ar sleep
.Op Fl t Ar timeout
+.Op Fl T Ar script_timeout
.Sh DESCRIPTION
The
.Nm
@@ -62,6 +70,13 @@ is not specified, the daemon will perform a trivial file system
check instead.
.Pp
The
+.Fl n
+argument 'dry-run' will cause watchdog not to arm the system watchdog and
+instead only run the watchdog function and report on failures.
+This is useful for developing new watchdogd scripts as the system will not
+reboot if there are problems with the script.
+.Pp
+The
.Fl s Ar sleep
argument can be used to control the sleep period between each execution
of the check and defaults to one second.
@@ -78,6 +93,16 @@ If this occurs,
will no longer execute and thus the kernel's watchdog routines will take
action after a configurable timeout.
.Pp
+The
+.Fl T Ar script_timeout
+specifies the threshold (in seconds) at which the watchdogd will complain
+that its script has run for too long.
+If unset
+.Ar script_timeout
+defaults to the value specified by the
+.Fl s Ar sleep
+option.
+.Pp
Upon receiving the
.Dv SIGTERM
or
@@ -90,17 +115,85 @@ will terminate.
The
.Nm
utility recognizes the following runtime options:
-.Bl -tag -width ".Fl I Ar file"
+.Bl -tag -width ".Fl -softtimeout-action Ar action "
.It Fl I Ar file
Write the process ID of the
.Nm
utility in the specified file.
-.It Fl d
+.It Fl d Fl -debug
Do not fork.
When this option is specified,
.Nm
will not fork into the background at startup.
+.Pp
+.It Fl w
+Complain when the watchdog script takes too long.
+This flag will cause watchdogd to complain when the amount of time to
+execute the watchdog script exceeds the threshold of the 'sleep' option.
+.Pp
+.It Fl -pretimeout Ar timeout
+Set a "pretimeout" watchdog. At "timeout" seconds before the watchdog
+will fire, attempt an action. The action is set by the --pretimeout-action
+flag. The default is just to log a message (WD_SOFT_LOG) via
+.Xr log 9 .
+.Pp
+.It Fl -pretimeout-action Ar action
+Set the timeout action for the pretimeout. See the section
+.Sx Timeout Actions .
+.Pp
+.It Fl -softtimeout
+Instead of arming the various hardware watchdogs, only use a basic software
+watchdog. The default action is just to
+.Xr log 9
+a message (WD_SOFT_LOG).
+.Pp
+.It Fl -softtimeout-action Ar action
+Set the timeout action for the softtimeout. See the section
+.Sx Timeout Actions .
+.Pp
.El
+.Sh Timeout Actions
+The following timeout actions are available via the
+.Fl -pretimeout-action
+and
+.Fl -softtimeout-action
+flags:
+.Bl -tag -width ".Ar printf "
+.It Ar panic
+Call
+.Xr panic 9
+when the timeout is reached.
+.Pp
+.It Ar ddb
+Enter the kernel debugger via
+.Xr kdb_enter 9
+when the timeout is reached.
+.Pp
+.It Ar log
+Log a message using
+.Xr log 9
+when the timeout is reached.
+.Pp
+.It Ar printf
+Call the kernel
+.Xr printf 9
+to display a message to the console and
+.Xr dmesg 8
+buffer.
+.Pp
+.El
+Actions can be combined in a comma-separated list, like so:
+.Ar log,printf
+which would call both
+.Xr printf 9
+and
+.Xr log 9 ,
+sending messages both to
+.Xr dmesg 8
+and the kernel
+.Xr log 4
+device for
+.Xr syslog 8 .
.Sh FILES
.Bl -tag -width ".Pa /var/run/watchdogd.pid" -compact
.It Pa /var/run/watchdogd.pid
@@ -125,3 +218,6 @@ and
.Pp
Some contributions made by
.An Jeff Roberson Aq jeff@FreeBSD.org .
+.Pp
+The pretimeout and softtimeout action system was added by
+.An Alfred Perlstein Aq alfred@freebsd.org .
diff --git a/usr.sbin/watchdogd/watchdogd.c b/usr.sbin/watchdogd/watchdogd.c
index 50bdd91fa79d..bb923875a69d 100644
--- a/usr.sbin/watchdogd/watchdogd.c
+++ b/usr.sbin/watchdogd/watchdogd.c
@@ -1,5 +1,8 @@
/*-
* Copyright (c) 2003-2004 Sean M. Kelly <smkelly@FreeBSD.org>
+ * Copyright (c) 2013 iXsystems.com,
+ * author: Alfred Perlstein <alfred@freebsd.org>
+ *
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -50,8 +53,11 @@ __FBSDID("$FreeBSD$");
#include <string.h>
#include <strings.h>
#include <sysexits.h>
+#include <syslog.h>
#include <unistd.h>
+#include <getopt.h>
+
static void parseargs(int, char *[]);
static void sighandler(int);
static void watchdog_loop(void);
@@ -63,13 +69,39 @@ static void usage(void);
static int debugging = 0;
static int end_program = 0;
static const char *pidfile = _PATH_VARRUN "watchdogd.pid";
-static u_int timeout = WD_TO_16SEC;
+static u_int timeout = WD_TO_128SEC;
+static u_int pretimeout = 0;
static u_int passive = 0;
static int is_daemon = 0;
+static int is_dry_run = 0; /* do not arm the watchdog, only
+ report on timing of the watch
+ program */
+static int do_timedog = 0;
+static int do_syslog = 0;
static int fd = -1;
static int nap = 1;
+static int carp_thresh_seconds = -1;
static char *test_cmd = NULL;
+static const char *getopt_shortopts;
+
+static int pretimeout_set;
+static int pretimeout_act;
+static int pretimeout_act_set;
+
+static int softtimeout_set;
+static int softtimeout_act;
+static int softtimeout_act_set;
+
+static struct option longopts[] = {
+ { "debug", no_argument, &debugging, 1 },
+ { "pretimeout", required_argument, &pretimeout_set, 1 },
+ { "pretimeout-action", required_argument, &pretimeout_act_set, 1 },
+ { "softtimeout", no_argument, &softtimeout_set, 1 },
+ { "softtimeout-action", required_argument, &softtimeout_act_set, 1 },
+ { NULL, 0, NULL, 0}
+};
+
/*
* Ask malloc() to map minimum-sized chunks of virtual address space at a time,
* so that mlockall() won't needlessly wire megabytes of unused memory into the
@@ -93,12 +125,18 @@ main(int argc, char *argv[])
parseargs(argc, argv);
+ if (do_syslog) {
+ openlog("watchdogd", LOG_CONS|LOG_NDELAY|LOG_PERROR,
+ LOG_DAEMON);
+
+ }
+
rtp.type = RTP_PRIO_REALTIME;
rtp.prio = 0;
if (rtprio(RTP_SET, 0, &rtp) == -1)
err(EX_OSERR, "rtprio");
- if (watchdog_init() == -1)
+ if (!is_dry_run && watchdog_init() == -1)
errx(EX_SOFTWARE, "unable to initialize watchdog");
if (is_daemon) {
@@ -108,6 +146,7 @@ main(int argc, char *argv[])
pfh = pidfile_open(pidfile, 0600, &otherpid);
if (pfh == NULL) {
if (errno == EEXIST) {
+ watchdog_onoff(0);
errx(EX_SOFTWARE, "%s already running, pid: %d",
getprogname(), otherpid);
}
@@ -164,6 +203,9 @@ static int
watchdog_init(void)
{
+ if (is_dry_run)
+ return 0;
+
fd = open("/dev/" _PATH_WATCHDOG, O_RDWR);
if (fd >= 0)
return (0);
@@ -172,26 +214,98 @@ watchdog_init(void)
}
/*
+ * If we are doing timing, then get the time.
+ */
+static int
+watchdog_getuptime(struct timespec *tp)
+{
+ int error;
+
+ if (!do_timedog)
+ return 0;
+
+ error = clock_gettime(CLOCK_UPTIME_FAST, tp);
+ if (error)
+ warn("clock_gettime");
+ return (error);
+}
+
+static long
+watchdog_check_dogfunction_time(struct timespec *tp_start,
+ struct timespec *tp_end)
+{
+ struct timeval tv_start, tv_end, tv;
+ const char *cmd_prefix, *cmd;
+ int sec;
+
+ if (!do_timedog)
+ return (0);
+
+ TIMESPEC_TO_TIMEVAL(&tv_start, tp_start);
+ TIMESPEC_TO_TIMEVAL(&tv_end, tp_end);
+ timersub(&tv_end, &tv_start, &tv);
+ sec = tv.tv_sec;
+ if (sec < carp_thresh_seconds)
+ return (sec);
+
+ if (test_cmd) {
+ cmd_prefix = "Watchdog program";
+ cmd = test_cmd;
+ } else {
+ cmd_prefix = "Watchdog operation";
+ cmd = "stat(\"/etc\", &sb)";
+ }
+ if (do_syslog)
+ syslog(LOG_CRIT, "%s: '%s' took too long: "
+ "%d.%06ld seconds >= %d seconds threshhold",
+ cmd_prefix, cmd, sec, (long)tv.tv_usec,
+ carp_thresh_seconds);
+ warnx("%s: '%s' took too long: "
+ "%d.%06ld seconds >= %d seconds threshhold",
+ cmd_prefix, cmd, sec, (long)tv.tv_usec, carp_thresh_seconds);
+ return (sec);
+}
+
+
+/*
* Main program loop which is iterated every second.
*/
static void
watchdog_loop(void)
{
+ struct timespec ts_start, ts_end;
struct stat sb;
- int failed;
+ long waited;
+ int error, failed;
while (end_program != 2) {
failed = 0;
+ error = watchdog_getuptime(&ts_start);
+ if (error) {
+ end_program = 1;
+ goto try_end;
+ }
+
if (test_cmd != NULL)
failed = system(test_cmd);
else
failed = stat("/etc", &sb);
+ error = watchdog_getuptime(&ts_end);
+ if (error) {
+ end_program = 1;
+ goto try_end;
+ }
+
+ waited = watchdog_check_dogfunction_time(&ts_start, &ts_end);
+
if (failed == 0)
watchdog_patpat(timeout|WD_ACTIVE);
- sleep(nap);
+ if (nap - waited > 0)
+ sleep(nap - waited);
+try_end:
if (end_program != 0) {
if (watchdog_onoff(0) == 0) {
end_program = 2;
@@ -211,6 +325,9 @@ static int
watchdog_patpat(u_int t)
{
+ if (is_dry_run)
+ return 0;
+
return ioctl(fd, WDIOCPATPAT, &t);
}
@@ -221,11 +338,62 @@ watchdog_patpat(u_int t)
static int
watchdog_onoff(int onoff)
{
-
- if (onoff)
+ int error;
+
+ /* fake successful watchdog op if a dry run */
+ if (is_dry_run)
+ return 0;
+
+ if (onoff) {
+ /*
+ * Call the WDIOC_SETSOFT regardless of softtimeout_set
+ * because we'll need to turn it off if someone had turned
+ * it on.
+ */
+ error = ioctl(fd, WDIOC_SETSOFT, &softtimeout_set);
+ if (error) {
+ warn("setting WDIOC_SETSOFT %d", softtimeout_set);
+ return (error);
+ }
+ error = watchdog_patpat((timeout|WD_ACTIVE));
+ if (error) {
+ warn("watchdog_patpat failed");
+ goto failsafe;
+ }
+ if (softtimeout_act_set) {
+ error = ioctl(fd, WDIOC_SETSOFTTIMEOUTACT,
+ &softtimeout_act);
+ if (error) {
+ warn("setting WDIOC_SETSOFTTIMEOUTACT %d",
+ softtimeout_act);
+ goto failsafe;
+ }
+ }
+ if (pretimeout_set) {
+ error = ioctl(fd, WDIOC_SETPRETIMEOUT, &pretimeout);
+ if (error) {
+ warn("setting WDIOC_SETPRETIMEOUT %d",
+ pretimeout);
+ goto failsafe;
+ }
+ }
+ if (pretimeout_act_set) {
+ error = ioctl(fd, WDIOC_SETPRETIMEOUTACT,
+ &pretimeout_act);
+ if (error) {
+ warn("setting WDIOC_SETPRETIMEOUTACT %d",
+ pretimeout_act);
+ goto failsafe;
+ }
+ }
+ /* pat one more time for good measure */
return watchdog_patpat((timeout|WD_ACTIVE));
- else
+ } else {
return watchdog_patpat(0);
+ }
+failsafe:
+ watchdog_patpat(0);
+ return (error);
}
/*
@@ -235,27 +403,132 @@ static void
usage(void)
{
if (is_daemon)
- fprintf(stderr, "usage: watchdogd [-d] [-e cmd] [-I file] [-s sleep] [-t timeout]\n");
+ fprintf(stderr, "usage:\n"
+" watchdogd [-dnw] [-e cmd] [-I file] [-s sleep] [-t timeout]\n"
+" [-T script_timeout]\n"
+" [--debug]\n"
+" [--pretimeout seconds] [-pretimeout-action action]\n"
+" [--softtimeout] [-softtimeout-action action]\n"
+);
else
fprintf(stderr, "usage: watchdog [-d] [-t timeout]\n");
exit(EX_USAGE);
}
+static long
+fetchtimeout(int opt, const char *longopt, const char *myoptarg)
+{
+ const char *errstr;
+ char *p;
+ long rv;
+
+ errstr = NULL;
+ p = NULL;
+ errno = 0;
+ rv = strtol(myoptarg, &p, 0);
+ if ((p != NULL && *p != '\0') || errno != 0)
+ errstr = "is not a number";
+ if (rv <= 0)
+ errstr = "must be greater than zero";
+ if (errstr) {
+ if (longopt)
+ errx(EX_USAGE, "--%s argument %s", longopt, errstr);
+ else
+ errx(EX_USAGE, "-%c argument %s", opt, errstr);
+ }
+ return (rv);
+}
+
+struct act_tbl {
+ const char *at_act;
+ int at_value;
+};
+
+struct act_tbl act_tbl[] = {
+ { "panic", WD_SOFT_PANIC },
+ { "ddb", WD_SOFT_DDB },
+ { "log", WD_SOFT_LOG },
+ { "printf", WD_SOFT_PRINTF },
+ { NULL, 0 }
+};
+
+static void
+timeout_act_error(const char *lopt, const char *badact)
+{
+ char *opts, *oldopts;
+ int i;
+
+ opts = NULL;
+ for (i = 0; act_tbl[i].at_act != NULL; i++) {
+ oldopts = opts;
+ if (asprintf(&opts, "%s%s%s",
+ oldopts == NULL ? "" : oldopts,
+ oldopts == NULL ? "" : ", ",
+ act_tbl[i].at_act) == -1)
+ err(EX_OSERR, "malloc");
+ free(oldopts);
+ }
+ warnx("bad --%s argument '%s' must be one of (%s).",
+ lopt, badact, opts);
+ usage();
+}
+
+/*
+ * Take a comma-separated list of actions and OR the flags
+ * together for the ioctl.
+ */
+static int
+timeout_act_str2int(const char *lopt, const char *acts)
+{
+ int i;
+ char *dupacts, *tofree;
+ char *o;
+ int rv = 0;
+
+ tofree = dupacts = strdup(acts);
+ if (!tofree)
+ err(EX_OSERR, "malloc");
+ while ((o = strsep(&dupacts, ",")) != NULL) {
+ for (i = 0; act_tbl[i].at_act != NULL; i++) {
+ if (!strcmp(o, act_tbl[i].at_act)) {
+ rv |= act_tbl[i].at_value;
+ break;
+ }
+ }
+ if (act_tbl[i].at_act == NULL)
+ timeout_act_error(lopt, o);
+ }
+ free(tofree);
+ return rv;
+}
+
/*
* Handle the few command line arguments supported.
*/
static void
parseargs(int argc, char *argv[])
{
+ int longindex;
int c;
char *p;
+ const char *lopt;
double a;
+ /*
+ * if we end with a 'd' aka 'watchdogd' then we are the daemon program,
+ * otherwise run as a command line utility.
+ */
c = strlen(argv[0]);
if (argv[0][c - 1] == 'd')
is_daemon = 1;
- while ((c = getopt(argc, argv,
- is_daemon ? "I:de:s:t:?" : "dt:?")) != -1) {
+
+ if (is_daemon)
+ getopt_shortopts = "I:de:ns:t:ST:w?";
+ else
+ getopt_shortopts = "dt:?";
+
+ while ((c = getopt_long(argc, argv, getopt_shortopts, longopts,
+ &longindex)) != -1) {
switch (c) {
case 'I':
pidfile = optarg;
@@ -266,17 +539,19 @@ parseargs(int argc, char *argv[])
case 'e':
test_cmd = strdup(optarg);
break;
+ case 'n':
+ is_dry_run = 1;
+ break;
#ifdef notyet
case 'p':
passive = 1;
break;
#endif
case 's':
- p = NULL;
- errno = 0;
- nap = strtol(optarg, &p, 0);
- if ((p != NULL && *p != '\0') || errno != 0)
- errx(EX_USAGE, "-s argument is not a number");
+ nap = fetchtimeout(c, NULL, optarg);
+ break;
+ case 'S':
+ do_syslog = 1;
break;
case 't':
p = NULL;
@@ -286,6 +561,7 @@ parseargs(int argc, char *argv[])
errx(EX_USAGE, "-t argument is not a number");
if (a < 0)
errx(EX_USAGE, "-t argument must be positive");
+
if (a == 0)
timeout = WD_TO_NEVER;
else
@@ -294,12 +570,39 @@ parseargs(int argc, char *argv[])
printf("Timeout is 2^%d nanoseconds\n",
timeout);
break;
+ case 'T':
+ carp_thresh_seconds = fetchtimeout(c, "NULL", optarg);
+ break;
+ case 'w':
+ do_timedog = 1;
+ break;
+ case 0:
+ lopt = longopts[longindex].name;
+ if (!strcmp(lopt, "pretimeout")) {
+ pretimeout = fetchtimeout(0, lopt, optarg);
+ } else if (!strcmp(lopt, "pretimeout-action")) {
+ pretimeout_act = timeout_act_str2int(lopt,
+ optarg);
+ } else if (!strcmp(lopt, "softtimeout-action")) {
+ softtimeout_act = timeout_act_str2int(lopt,
+ optarg);
+ } else {
+ /* warnx("bad option at index %d: %s", optind,
+ argv[optind]);
+ usage();
+ */
+ }
+ break;
case '?':
default:
usage();
/* NOTREACHED */
}
}
+
+ if (carp_thresh_seconds == -1)
+ carp_thresh_seconds = nap;
+
if (argc != optind)
errx(EX_USAGE, "extra arguments.");
if (is_daemon && timeout < WD_TO_1SEC)