s6

Mirror/fork of https://skarnet.org/software/s6/
git clone https://ccx.te2000.cz/git/s6
Log | Files | Refs | README | LICENSE

commit 75c4223a7f5a8a073ba0d898ef9d841fdaef2f63
parent 91eba4b27017c3b7bf0d0bb548a12a694ed51a3b
Author: Laurent Bercot <ska-skaware@skarnet.org>
Date:   Wed, 21 Mar 2018 18:00:10 +0000

 Add s6-svdt-clear, s6-permafailon

Diffstat:
M.gitignore | 3+++
MNEWS | 3++-
Mdoc/index.html | 2++
Adoc/s6-permafailon.html | 98+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adoc/s6-svdt-clear.html | 58++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mdoc/s6-svdt.html | 6+++---
Mdoc/s6-svstat.html | 5++++-
Mdoc/upgrade.html | 6+++++-
Mpackage/deps.mak | 6++++++
Mpackage/modes | 2++
Mpackage/targets.mak | 2++
Asrc/supervision/deps-exe/s6-permafailon | 2++
Asrc/supervision/deps-exe/s6-svdt-clear | 2++
Asrc/supervision/s6-permafailon.c | 118+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/supervision/s6-svdt-clear.c | 15+++++++++++++++
15 files changed, 322 insertions(+), 6 deletions(-)

diff --git a/.gitignore b/.gitignore @@ -24,6 +24,9 @@ /s6-svlisten1 /s6-svlisten /s6-notifyoncheck +/s6-svdt +/s6-svdt-clear +/s6-permafailon /s6-envdir /s6-envuidgid /s6-fghack diff --git a/NEWS b/NEWS @@ -5,7 +5,8 @@ In 2.7.1.0 - Bugfixes. - s6-svwait et al. now exit 102 instead of hanging if s6-supervise dies. - - New command: s6-svdt + - New commands: s6-svdt, s6-svdt-clear, s6-permafailon + - s6-tai64nlocal can now print GMT times with the -g option. In 2.7.0.0 diff --git a/doc/index.html b/doc/index.html @@ -157,6 +157,8 @@ a user interface to control those processes and monitor service states. <li><a href="s6-svlisten.html">The <tt>s6-svlisten</tt> program</a></li> <li><a href="s6-notifyoncheck.html">The <tt>s6-notifyoncheck</tt> program</a></li> <li><a href="s6-svdt.html">The <tt>s6-svdt</tt> program</a></li> +<li><a href="s6-svdt-clear.html">The <tt>s6-svdt-clear</tt> program</a></li> +<li><a href="s6-permafailon.html">The <tt>s6-permafailon</tt> program</a></li> </ul> <h4> Daemontools-like utilities </h4> diff --git a/doc/s6-permafailon.html b/doc/s6-permafailon.html @@ -0,0 +1,98 @@ +<html> + <head> + <meta name="viewport" content="width=device-width, initial-scale=1.0" /> + <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /> + <meta http-equiv="Content-Language" content="en" /> + <title>s6: the s6-permafailon program</title> + <meta name="Description" content="s6: the s6-permafailon program" /> + <meta name="Keywords" content="s6 supervision finish permanent failure service" /> + <!-- <link rel="stylesheet" type="text/css" href="//skarnet.org/default.css" /> --> + </head> +<body> + +<p> +<a href="index.html">s6</a><br /> +<a href="//skarnet.org/software/">Software</a><br /> +<a href="//skarnet.org/">skarnet.org</a> +</p> + +<h1> The <tt>s6-permafailon</tt> program </h1> + +<p> +<tt>s6-permafailon</tt> is a program that is meant to be used +in the <tt>./finish</tt> script of a +<a href="servicedir.html">service directory</a> supervised 
by +<a href="s6-supervise.html">s6-supervise</a>. When used, it +reads and analyses the death tally of a service (i.e. the recent +process death events that happened), and if the death tally +matches a given pattern, it causes <em>permanent failure</em> +of the service, i.e. it tells the supervisor not to try and +restart it. +</p> + +<h2> Interface </h2> + +<pre> + s6-permafailon <em>secs</em> <em>deathcount</em> <em>events</em> <em>prog...</em> +</pre> + +<ul> + <li> <tt>s6-permafailon</tt> must have the service directory of the +tested service as its current directory. This is the default if it is +called from the <tt>finish</tt> script of the service. </li> + <li> It reads the <em>death tally</em> of the service, which is +maintained by <a href="s6-supervise.html">s6-supervise</a>. </li> + <li> If the supervised process has died at least <em>deathcount</em> +times in the last <em>secs</em> seconds with a cause listed in +<em>events</em>, then <tt>s6-permafailon</tt> exits 125. </li> + <li> Else <tt>s6-permafailon</tt> execs into <em>prog...</em>. </li> +</ul> + +<p> + <em>events</em> is a comma-separated list of events. An event can be +one of the following: +</p> + +<ul> + <li> An exit code, which is an integer between 0 and 255. Example: <tt>1</tt> </li> + <li> An exit code interval, which is two exit codes separated by a dash. Example: <tt>1-50</tt> </li> + <li> A signal name, or a signal number preceded by "SIG". Examples: <tt>SIGTERM</tt>, <tt>sigabrt</tt>, <tt>sig11</tt> </li> +</ul> + +<h2> Usage </h2> + +<ul> + <li> <a href="s6-supervise.html">s6-supervise</a> detects when the <tt>./finish</tt> +script of its service exits 125, and stops respawning the service. 
So, if the +<tt>./finish</tt> script is a chain-loading command line starting with a +<tt>s6-permafailon</tt> invocation (or containing such an invocation), when +<tt>s6-permafailon</tt> exits 125, then the <tt>./finish</tt> script also +exits 125 (because it is the same process), and the service is then marked as +failing permanently. </li> + <li> The <tt>./finish</tt> script is <em>naturally</em> a chain-loading +command line if it is written in the +<a href="//skarnet.org/software/execline/">execline</a> language. It +can also be made into a chain-loading command line from a shell script by using +<tt>exec s6-permafailon secs deathcount events rest-of-chainloading-cmdline...</tt> </li> + <li> Multiple invocations of <tt>s6-permafailon</tt> can be chained, in order +to test several death patterns. </li> + <li> If a permanent failure is triggered and <em>secs</em> is high, it is +possible that when the administrator manually launches the service again, +the next death triggers a permanent failure again. If this is not wanted, +the administrator should clear the death tally with the +<a href="s6-svdt-clear.html">s6-svdt-clear</a> command. </li> + <li> The current death tally can be viewed via the <a href="s6-svdt.html">s6-svdt</a> +command. </li> +</ul> + +<h2> Example </h2> + +<p> + <tt>s6-permafailon 60 5 1,101-103,SIGSEGV,SIGBUS <em>prog...</em></tt> +will exit 125 if the service has died at least 5 times in the last 60 seconds with +an exit code of 1, 101, 102 or 103, a SIGSEGV or a SIGBUS. Else it will +chainload into the <em>prog...</em> command line. 
+</p> + +</body> +</html> diff --git a/doc/s6-svdt-clear.html b/doc/s6-svdt-clear.html @@ -0,0 +1,58 @@ +<html> + <head> + <meta name="viewport" content="width=device-width, initial-scale=1.0" /> + <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /> + <meta http-equiv="Content-Language" content="en" /> + <title>s6: the s6-svdt-clear program</title> + <meta name="Description" content="s6: the s6-svdt-clear program" /> + <meta name="Keywords" content="s6 command s6-svdt-clear servicedir death tally clearing process supervision s6-supervise" /> + <!-- <link rel="stylesheet" type="text/css" href="//skarnet.org/default.css" /> --> + </head> +<body> + +<p> +<a href="index.html">s6</a><br /> +<a href="//skarnet.org/software/">Software</a><br /> +<a href="//skarnet.org/">skarnet.org</a> +</p> + +<h1> The <tt>s6-svdt-clear</tt> program </h1> + +<p> +<tt>s6-svdt-clear</tt> clears the recorded death tally of a service. +</p> + +<h2> Interface </h2> + +<pre> + s6-svdt-clear <em>servicedir</em> +</pre> + +<p> + s6-svdt-clear clears the recorded death tally of the service being +currently supervised at the <em>servicedir</em> +<a href="servicedir.html">service directory</a>. +</p> + +<ul> + <li> 0: success </li> + <li> 100: wrong usage </li> + <li> 111: system call failed </li> +</ul> + +<h2> Notes </h2> + +<ul> + <li> Use of <tt>s6-svdt-clear</tt> impacts the listings obtained +by the <a href="s6-svdt.html">s6-svdt</a> command. </li> + <li> It also impacts the behaviour of the +<a href="s6-permafailon.html">s6-permafailon</a> command. This is +the main reason to use <tt>s6-svdt-clear</tt>: once a service has +failed permanently due to an excessive number of deaths in a given +time, it can be useful to erase that record of deaths before +starting the service again, in order to avoid permanently failing +again too fast. 
</li> +</ul> + +</body> +</html> diff --git a/doc/s6-svdt.html b/doc/s6-svdt.html @@ -16,10 +16,10 @@ <a href="//skarnet.org/">skarnet.org</a> </p> -<h1> The s6-svdt program </h1> +<h1> The <tt>s6-svdt</tt> program </h1> <p> -s6-svstat prints the recorded death tally of a service, i.e. a list of the times +<tt>s6-svdt</tt> prints the recorded death tally of a service, i.e. a list of the times the process died, with the cause of death. </p> @@ -30,7 +30,7 @@ the process died, with the cause of death. </pre> <p> - s6-svdt prints the contents of the recorded death tally of the service being + <tt>s6-svdt</tt> prints the contents of the recorded death tally of the service being currently supervised at the <em>servicedir</em> <a href="servicedir.html">service directory</a>, then exits 0. </p> diff --git a/doc/s6-svstat.html b/doc/s6-svstat.html @@ -76,8 +76,11 @@ are as follows. names are the following: <ul> <li> <tt>up</tt>: print <tt>true</tt> if the service is up and <tt>false</tt> if it is down. +<!-- If the service is being throttled (i.e. technically up, but sleeping for a certain -amount of time before it is really launched), it prints <tt>throttled</tt> instead of <tt>true</tt>. </li> +amount of time before it is really launched), it prints <tt>throttled</tt> instead of <tt>true</tt>. +--> + </li> <li> <tt>wantedup</tt>: print <tt>true</tt> if <a href="s6-supervise.html">s6-supervise</a> is currently instructed to (re)start the service when it is down, and <tt>false</tt> if <a href="s6-supervise.html">s6-supervise</a> is currently instructed to leave the service alone. </li> diff --git a/doc/upgrade.html b/doc/upgrade.html @@ -22,7 +22,11 @@ <ul> <li> skalibs dependency bumped to 2.6.4.0. </li> - <li> New command: <a href="s6-svdt.html">s6-svdt</a>. 
</li> + <li> New commands: <a href="s6-svdt.html">s6-svdt</a>, +<a href="s6-svdt-clear.html">s6-svdt-clear</a>, +<a href="s6-permafailon.html">s6-permafailon</a> </li> + <li> <a href="s6-tai64nlocal.html">s6-tai64nlocal</a> can now +print GMT times with the <tt>-g</tt> option. </li> </ul> <h2> in 2.7.0.0 </h2> diff --git a/package/deps.mak b/package/deps.mak @@ -125,8 +125,10 @@ src/pipe-tools/s6-ftrig-notify.o src/pipe-tools/s6-ftrig-notify.lo: src/pipe-too src/pipe-tools/s6-ftrig-wait.o src/pipe-tools/s6-ftrig-wait.lo: src/pipe-tools/s6-ftrig-wait.c src/include/s6/ftrigr.h src/pipe-tools/s6-mkfifodir.o src/pipe-tools/s6-mkfifodir.lo: src/pipe-tools/s6-mkfifodir.c src/include/s6/ftrigw.h src/supervision/s6-notifyoncheck.o src/supervision/s6-notifyoncheck.lo: src/supervision/s6-notifyoncheck.c src/include/s6/ftrigr.h src/include/s6/s6-supervise.h +src/supervision/s6-permafailon.o src/supervision/s6-permafailon.lo: src/supervision/s6-permafailon.c src/include/s6/s6-supervise.h src/supervision/s6-supervise.o src/supervision/s6-supervise.lo: src/supervision/s6-supervise.c src/include/s6/ftrigw.h src/include/s6/s6-supervise.h src/supervision/s6-svc.o src/supervision/s6-svc.lo: src/supervision/s6-svc.c src/include/s6/config.h src/include/s6/s6-supervise.h +src/supervision/s6-svdt-clear.o src/supervision/s6-svdt-clear.lo: src/supervision/s6-svdt-clear.c src/include/s6/s6-supervise.h src/supervision/s6-svdt.o src/supervision/s6-svdt.lo: src/supervision/s6-svdt.c src/include/s6/s6-supervise.h src/supervision/s6-svlisten.o src/supervision/s6-svlisten.lo: src/supervision/s6-svlisten.c src/supervision/s6-svlisten.h src/supervision/s6-svlisten1.o src/supervision/s6-svlisten1.lo: src/supervision/s6-svlisten1.c src/supervision/s6-svlisten.h @@ -245,12 +247,16 @@ s6-mkfifodir: EXTRA_LIBS := s6-mkfifodir: src/pipe-tools/s6-mkfifodir.o ${LIBS6} -lskarnet s6-notifyoncheck: EXTRA_LIBS := ${SOCKET_LIB} ${TAINNOW_LIB} ${SPAWN_LIB} s6-notifyoncheck: src/supervision/s6-notifyoncheck.o 
${LIBS6} -lskarnet +s6-permafailon: EXTRA_LIBS := +s6-permafailon: src/supervision/s6-permafailon.o ${LIBS6} -lskarnet s6-supervise: EXTRA_LIBS := ${TAINNOW_LIB} s6-supervise: src/supervision/s6-supervise.o ${LIBS6} -lskarnet s6-svc: EXTRA_LIBS := s6-svc: src/supervision/s6-svc.o ${LIBS6} -lskarnet s6-svdt: EXTRA_LIBS := s6-svdt: src/supervision/s6-svdt.o ${LIBS6} -lskarnet +s6-svdt-clear: EXTRA_LIBS := +s6-svdt-clear: src/supervision/s6-svdt-clear.o ${LIBS6} -lskarnet s6-svlisten: EXTRA_LIBS := ${SOCKET_LIB} ${TAINNOW_LIB} ${SPAWN_LIB} s6-svlisten: src/supervision/s6-svlisten.o src/supervision/s6_svlisten_signal_handler.o src/supervision/s6_svlisten_loop.o ${LIBS6} -lexecline -lskarnet s6-svlisten1: EXTRA_LIBS := ${SOCKET_LIB} ${TAINNOW_LIB} ${SPAWN_LIB} diff --git a/package/modes b/package/modes @@ -15,10 +15,12 @@ s6-svscanctl 0755 s6-svok 0755 s6-svstat 0755 s6-svdt 0755 +s6-svdt-clear 0755 s6-svwait 0755 s6-svlisten1 0755 s6-svlisten 0755 s6-notifyoncheck 0755 +s6-permafailon 0755 s6-applyuidgid 0700 s6-envdir 0755 s6-envuidgid 0755 diff --git a/package/targets.mak b/package/targets.mak @@ -15,10 +15,12 @@ s6-svscanctl \ s6-svok \ s6-svstat \ s6-svdt \ +s6-svdt-clear \ s6-svwait \ s6-svlisten1 \ s6-svlisten \ s6-notifyoncheck \ +s6-permafailon \ s6-envdir \ s6-envuidgid \ s6-fghack \ diff --git a/src/supervision/deps-exe/s6-permafailon b/src/supervision/deps-exe/s6-permafailon @@ -0,0 +1,2 @@ +${LIBS6} +-lskarnet diff --git a/src/supervision/deps-exe/s6-svdt-clear b/src/supervision/deps-exe/s6-svdt-clear @@ -0,0 +1,2 @@ +${LIBS6} +-lskarnet diff --git a/src/supervision/s6-permafailon.c b/src/supervision/s6-permafailon.c @@ -0,0 +1,118 @@ +/* ISC license. 
*/
+
+#include <sys/stat.h>
+#include <string.h>
+#include <signal.h>
+#include <skalibs/types.h>
+#include <skalibs/strerr2.h>
+#include <skalibs/bitarray.h>
+#include <skalibs/sig.h>
+#include <skalibs/tai.h>
+#include <skalibs/djbunix.h>
+#include <s6/s6-supervise.h>
+
+#define USAGE "s6-permafailon seconds deathcount statuslist prog..."
+#define dieusage() strerr_dieusage(100, USAGE)
+
+/*
+   Parse the status list s into the two sets used to match deaths.
+   Elements are separated by any run of the characters ",; \n\r\t".
+   - A number in 0..255 is recorded as an exit code in codes.
+   - "a-b" with a <= b <= 255 records the whole interval in codes.
+   - Anything else is handed to sig0_scan() and the resulting signal
+     number is added to sigs.
+   Both sets are only ever added to, so the caller must pass them in
+   empty. Dies with exit code 100 on a malformed list.
+*/
+static void list_scan (char const *s, unsigned char *codes, sigset_t *sigs)
+{
+  size_t pos = 0 ;
+  while (s[pos])
+  {
+    unsigned int u ;
+    size_t len = uint_scan(s + pos, &u) ;
+    if (len)
+    {
+      if (u > 255) strerr_dief1x(100, "invalid exit code") ;
+      pos += len ;
+      if (s[pos] == '-')
+      {
+        unsigned int v ;
+        pos++ ;
+        len = uint_scan(s + pos, &v) ;
+        if (!len) strerr_dief1x(100, "invalid interval specification") ;
+        if (v > 255) strerr_dief1x(100, "invalid exit code") ;
+        if (v < u) strerr_dief1x(100, "invalid interval") ;
+        pos += len ;
+        bitarray_setn(codes, u, v - u + 1) ;
+      }
+      else bitarray_set(codes, u) ;
+    }
+    else
+    {
+      int sig ;
+      size_t next = pos ;
+      /* strchr() also matches the terminating NUL, so this stops at end of string */
+      while (!strchr(",; \n\r\t", s[next])) next++ ;
+      /* sig0_scan() gets a NUL-terminated copy of the single token */
+      char tmp[next - pos + 1] ;
+      memcpy(tmp, s + pos, next - pos) ;
+      tmp[next - pos] = 0 ;
+      len = sig0_scan(tmp, &sig) ;
+      if (!len) strerr_dief1x(100, "invalid status list specification") ;
+      pos += len ;
+      if (sigaddset(sigs, sig) < 0) strerr_dief1x(100, "invalid signal") ;
+    }
+    /* skip separators; the length 6 excludes the NUL, so this stops at end of string */
+    while (memchr(",; \n\r\t", s[pos], 6)) pos++ ;
+  }
+}
+
+/*
+   Usage: s6-permafailon seconds deathcount statuslist prog...
+   Reads the death tally from the current directory, which is expected
+   to be the service directory. Exits 125 if at least deathcount of the
+   recorded deaths match statuslist and are no older than seconds,
+   counted back from the last entry of the tally (presumably the most
+   recent death). In every other case, including a missing, invalid or
+   empty tally, execs into prog.
+*/
+int main (int argc, char const *const *argv, char const *const *envp)
+{
+  unsigned char codes[32] ;
+  sigset_t sigs ;
+  unsigned int total, seconds, n ;
+  struct stat st ;
+  PROG = "s6-permafailon" ;
+  if (argc < 4) dieusage() ;
+
+  if (!uint0_scan(argv[1], &seconds)) dieusage() ;
+  if (!uint0_scan(argv[2], &n)) dieusage() ;
+  if (!n) dieusage() ;
+  if (n > S6_MAX_DEATH_TALLY) n = S6_MAX_DEATH_TALLY ;
+  /* list_scan() only adds to the sets, so they must start out empty */
+  memset(codes, 0, sizeof codes) ;
+  sigemptyset(&sigs) ;
+  list_scan(argv[3], codes, &sigs) ;
+
+  if (stat(S6_DTALLY_FILENAME, &st) < 0)
+  {
+    strerr_warnwu2sys("stat ", S6_DTALLY_FILENAME) ;
+    goto cont ;
+  }
+  if (st.st_size % S6_DTALLY_PACK || st.st_size > S6_DTALLY_PACK * S6_MAX_DEATH_TALLY)
+  {
+    strerr_warnw2x("invalid ", S6_DTALLY_FILENAME) ;
+    goto cont ;
+  }
+  total = st.st_size / S6_DTALLY_PACK ;
+  /* an empty tally cannot match, and a zero-length VLA is undefined behaviour */
+  if (!total) goto cont ;
+  {
+    tain_t mintime ;
+    unsigned int matches = 0 ;
+    s6_dtally_t tab[total] ;
+    ssize_t r = s6_dtally_read(".", tab, total) ;
+    if (r <= 0)
+    {
+      if (r < 0) strerr_warnwu2sys("read ", S6_DTALLY_FILENAME) ;
+      goto cont ;
+    }
+    if (r < n) goto cont ;
+    /* mintime = stamp of the last entry minus seconds: older deaths are ignored */
+    tain_uint(&mintime, seconds) ;
+    tain_sub(&mintime, &tab[r-1].stamp, &mintime) ;
+
+    for (unsigned int i = 0 ; i < r ; i++)
+    {
+      /* NOTE(review): a death by a signal not in sigs is still tested against
+         codes through its exitcode field - confirm what s6-supervise stores there */
+      if (!tain_less(&tab[i].stamp, &mintime)
+       && ((tab[i].sig && sigismember(&sigs, tab[i].sig)) || bitarray_peek(codes, tab[i].exitcode))
+       && ++matches >= n)
+      {
+        char fmtevent[4] ;
+        char fmtseconds[UINT_FMT] ;
+        char fmtn[UINT_FMT] ;
+        fmtevent[uint_fmt(fmtevent, tab[i].sig ? tab[i].sig : tab[i].exitcode)] = 0 ;
+        fmtseconds[uint_fmt(fmtseconds, seconds)] = 0 ;
+        /* format n into its own buffer, not into fmtseconds */
+        fmtn[uint_fmt(fmtn, n)] = 0 ;
+        strerr_warni8x("PERMANENT FAILURE triggered after ", fmtn, " events involving ", tab[i].sig ? "signal " : "exit code ", fmtevent, " in the last ", fmtseconds, " seconds") ;
+        return 125 ;
+      }
+    }
+  }
+
+ cont:
+  xpathexec0_run(argv + 4, envp) ;
+}
diff --git a/src/supervision/s6-svdt-clear.c b/src/supervision/s6-svdt-clear.c
@@ -0,0 +1,22 @@
+/* ISC license. */
+
+#include <skalibs/strerr2.h>
+#include <s6/s6-supervise.h>
+
+#define USAGE "s6-svdt-clear servicedir"
+#define dieusage() strerr_dieusage(100, USAGE)
+
+/*
+   Usage: s6-svdt-clear servicedir
+   Clears the recorded death tally of servicedir by calling
+   s6_dtally_write() with zero entries.
+   Exits 0 on success, 100 on wrong usage, 111 if the write fails.
+*/
+int main (int argc, char const *const *argv)
+{
+  PROG = "s6-svdt-clear" ;
+  /* argc counts the program name, so servicedir is present only if argc >= 2 */
+  if (argc < 2) dieusage() ;
+  if (!s6_dtally_write(argv[1], 0, 0))
+    strerr_diefu2sys(111, "clear death tally for service ", argv[1]) ;
+  return 0 ;
+}