Count failures and send begin/end notifications - reed-alert - Lightweight agentless alerting system for server HTML git clone git://bitreich.org/reed-alert/ git://enlrupgkhuxnvlhsf6lc3fziv5h2hhfrinws65d7roiv6bfj7d652fid.onion/reed-alert/ DIR Log DIR Files DIR Refs DIR Tags DIR README DIR LICENSE --- DIR commit f352b8458e9b406ce8795bf00c704c260c511cd6 DIR parent 1b2f15bf2974f893f7dd55ff6b4742dd0c0430d2 HTML Author: Solene Rapenne <solene@perso.pw> Date: Wed, 17 Jan 2018 20:38:54 +0100 Count failures and send begin/end notifications Diffstat: M README | 37 ++++++++++++++++++++++++++++--- M config.lisp.sample | 6 +++--- M example.lisp | 8 ++++---- M functions.lisp | 53 +++++++++++++++++++++++++++---- 4 files changed, 88 insertions(+), 16 deletions(-) --- DIR diff --git a/README b/README @@ -63,9 +63,29 @@ The configuration is explained below. The Notification System ======================= -When a check return an error, a previously defined notifier will be -called. The notifier is a shell command with a name. The shell command -can contains variables from reed-alert. +When a check return a failure, a previously defined notifier will be +called. This will be triggered only after reed-alert find **3** +failures (not more or less) in a row for this check, this is a default +value that can be changed per probe with the :try parameter as +explained later in this document. This is to prevent reed-alert to +spam notifications for a long time (number of failures very high, like +a disk space usage that can't be fixed before a long time) OR +preventing reed-alert to send notifications about a check on the edge +of the limit like a ping almost working but failing from time to time +or the load average around the limit. + +reed-alert will use the notifier system when it reach its try number +and when the problem is fixed, so you know when it begins and when it +ends. + +reed-alert keep tracks of the count of failures with one file per +probe failing in the "states" folder. 
To ensure unique filenames, the +following format is used (+ means it's concatenated) : + + alert-name + probe-name + hash of probe parameters + +The notifier is a shell command with a name. The shell command can +contain variables from reed-alert. + %function% : the name of the probe + %date% : the current date with format YYYY/MM/DD hh:mm:ss @@ -76,6 +96,7 @@ can contains variables from reed-alert. + %level% : the type of notification used + %os% : the type of operating system (FreeBSD/Linux/OpenBSD) + %newline% : a newline character ++ %state% : "Start" / "End" when a problem happens / is solved Example Probe 1: 'Check For Load Average' @@ -119,6 +140,16 @@ does. It can be put in every probe. :desc "STRING" +The :try Parameter +------------------ +The :try parameter allows you to change how many failures to wait for +before the alert is triggered. By default, it's triggered after 3 +failures. Sometimes, when using ping for example, you want to be +notified when it fails a few cycles and not at the first failure. 
+ + :try INTEGER + + Overview -------- As of this commit, reed-alert ships with the following probes: DIR diff --git a/config.lisp.sample b/config.lisp.sample @@ -1,8 +1,8 @@ (load "functions.lisp") -(alert mail "echo -n 'Problem with %function% %date% %params%' | mail -s alarm mail@isp.net") -(alert sms "/home/user/sms.sh '%date% %function% %params% %hostname%") -(alert available-variables "REMINDER : %function% %params% %date% %hostname% %desc% %level% %os% %newline% %result%") +(alert mail "echo -n '[%state%] Problem with %function% %date% %params%' | mail -s '[%state%] alarm' mail@isp.net") +(alert sms "/home/user/sms.sh '%date% %state% %function% %params% %hostname%") +(alert available-variables "REMINDER : %function% %params% %date% %hostname% %desc% %level% %os% %newline% %result% %state%") (alert empty "") DIR diff --git a/example.lisp b/example.lisp @@ -1,9 +1,9 @@ (load "functions.lisp") -(alert dont-use-it "REMINDER %function% %params% %date% %hostname% %desc% %level% %os% %newline% _ %space% %result%") +(alert dont-use-it "REMINDER %state% %function% %params% %date% %hostname% %desc% %level% %os% %newline% _ %space% %result%") (alert empty "") (alert mail "") -(alert peroket "echo 'problem at %date% with %function% %params%'") +(alert peroket "echo '%state% problem at %date% with %function% %params% : %result%'") (alert sms "echo -n '%date% %function% CRITICAL on %hostname%' | curl http://somewebservice") ;(alert mail "echo -n '%date% %hostname% had problem on %function% %newline% %params% values %result% %newline% ; %desc%' | mail -s '[Error] %function% - %hostname%' foo@bar.com") @@ -15,8 +15,8 @@ (=> peroket disk-usage :path "/tmp" :limit 0) ;; failure ;; check if :path file exists -(=> mail file-exists :path "/bsd.rd" :desc "OpenBSD kernel /bsd.rd") -(=> empty file-exists :path "/non-existant-file") ;; failure file not found +(=> mail file-exists :path "/bsd.rd" :desc "OpenBSD kernel /bsd.rd") +(=> empty file-exists :path "/non-existant-file" :try 
1) ;; failure file not found ;; check if :path file exists and has been updated since :limit minutes (=> empty file-updated :path "/var/log/messages" :limit 400) DIR diff --git a/functions.lisp b/functions.lisp @@ -1,6 +1,8 @@ (require 'asdf) +(defparameter *tries* 3) (defparameter *alerts* '()) +(ensure-directories-exist "states/") (defun color(num1 num2) (format nil "~a[~a;~am" #\Escape num1 num2)) @@ -57,9 +59,10 @@ (push (list ',name ,string) *alerts*))) -(defun trigger-alert(level function params result) +(defun trigger-alert(level function params result state) (let* ((notifier-command (assoc level *alerts*)) (command-string (cadr notifier-command))) + (setf command-string (replace-all command-string "%state%" (if (eql 'error state) "Start" "End"))) (setf command-string (replace-all command-string "%result%" (format nil "~a" result))) (setf command-string (replace-all command-string "%hostname%" (machine-instance))) (setf command-string (replace-all command-string "%os%" (software-type))) @@ -85,15 +88,53 @@ (defun =>(level fonction &rest params) (format t "[~a~a ~20A~a] ~45A" *yellow* level fonction *white* (getf params :desc params)) - (let ((hash (fnv-hash (format nil "~{~a~}" (nconc (list level fonction) (remove-if #'symbolp params))))) - (result (funcall fonction params))) + (let* ((hash (fnv-hash (format nil "~{~a~}" (remove-if #'symbolp params)))) + (result (funcall fonction params)) + (filename (format nil "~a-~a-~a" level fonction hash)) + (filepath (format nil "states/~a" filename))) (if (not (listp result)) (progn - (format t " => ~asuccess~a~%" *green* *white*) + (if (probe-file filepath) + ;; last time was a failure + (progn + (uiop:run-program (trigger-alert level fonction params t 'success) :output t) + (delete-file filepath) + (format t " => ~afailure => success~a~%" *green* *white*)) + ;; last time was a success + (format t " => ~asuccess~a~%" *green* *white*)) + ;; we return t because it's ok t) + (progn - (format t " => ~aerror~a~%" *red* 
*white*) - (uiop:run-program (trigger-alert level fonction params (cadr result)) :output t) + (if (probe-file filepath) + ;; error before + ;; but how many ? + (with-open-file (stream filepath :direction :input) + (let ((tries (parse-integer (read-line stream 0 nil)))) + (format t " => ~aerror (~a failures before)~a~%" *red* tries *white*) + + ;; more error than limit, send alert once + (when (= tries (getf params :try *tries*)) + (uiop:run-program (trigger-alert level fonction params (cadr result) 'error) :output t)) + + ;; increment the file + (progn + (with-open-file (stream-out filepath :direction :output + :if-exists :supersede) + (format stream-out "~a~%~a~%" (+ 1 tries) params))))) + + ;; file doesn't exist + (with-open-file (stream-out filepath :direction :output + :if-exists :supersede) + (format t " => ~aerror (first failure)~a~%" *red* *white*) + + ;; maybe we would be warned at first error ? + ;; code is duplicated from above because it + ;; requires reading the non existent file + (when (= 1 (getf params :try *tries*)) + (uiop:run-program (trigger-alert level fonction params (cadr result) 'error) :output t)) + + (format stream-out "1~%~a~%" params))) nil)))) (load "probes.lisp")