summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMike Gabriel <mike.gabriel@das-netzwerkteam.de>2016-07-19 18:37:39 +0200
committerMike Gabriel <mike.gabriel@das-netzwerkteam.de>2016-07-19 18:38:11 +0200
commitd3eb24ecbe1c4445c6444c343b6c88c7ed7ceb67 (patch)
treeaca241aa542ff6654a64494043dac3eeea720854
parent0f80e23fb61ea6023ddfb66a925e0e9b9791ed81 (diff)
downloaditzks-systems-d3eb24ecbe1c4445c6444c343b6c88c7ed7ceb67.tar.gz
itzks-systems-d3eb24ecbe1c4445c6444c343b6c88c7ed7ceb67.tar.bz2
itzks-systems-d3eb24ecbe1c4445c6444c343b6c88c7ed7ceb67.zip
Monitoring: Add check_dirvish (must run via sudo).
-rw-r--r--debian/changelog3
-rw-r--r--debian/itzks-systems-backup.install2
-rwxr-xr-xdebian/itzks-systems-backup.postinst41
-rwxr-xr-xdebian/itzks-systems-backup.prerm35
-rw-r--r--etc/sudoers.d/itzks-systems-backup1
-rwxr-xr-xusr-lib-nagios-plugins/check_dirvish3
-rwxr-xr-xusr-lib-nagios-plugins/check_dirvish.py362
7 files changed, 446 insertions, 1 deletions
diff --git a/debian/changelog b/debian/changelog
index a66f9cf..b42f2ec 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -2,8 +2,9 @@ itzks-systems (2016.07.19.1) UNRELEASED; urgency=medium
* CRON: Silence calls to apt-get autoclean on system reboot. Such messages
are "flooding" root mails on TJENER.
+ * Monitoring: Add check_dirvish (must run via sudo).
- -- Mike Gabriel <mike.gabriel@das-netzwerkteam.de> Tue, 19 Jul 2016 18:26:19 +0200
+ -- Mike Gabriel <mike.gabriel@das-netzwerkteam.de> Tue, 19 Jul 2016 18:37:47 +0200
itzks-systems (2016.07.15.7) unstable; urgency=medium
diff --git a/debian/itzks-systems-backup.install b/debian/itzks-systems-backup.install
index 9301dd2..27ddb6e 100644
--- a/debian/itzks-systems-backup.install
+++ b/debian/itzks-systems-backup.install
@@ -1 +1,3 @@
usr-lib-nagios-plugins/check_md_raid usr/lib/nagios/plugins/
+usr-lib-nagios-plugins/check_dirvish* usr/lib/nagios/plugins/
+etc/sudoers.d/itzks-systems-backup etc/sudoers.d/
diff --git a/debian/itzks-systems-backup.postinst b/debian/itzks-systems-backup.postinst
new file mode 100755
index 0000000..5ace5ac
--- /dev/null
+++ b/debian/itzks-systems-backup.postinst
@@ -0,0 +1,41 @@
#!/bin/sh
# postinst script for itzks-systems-backup
#
# see: dh_installdeb(1)

set -e

# summary of how this script can be called:
#        * <postinst> `configure' <most-recently-configured-version>
#        * <old-postinst> `abort-upgrade' <new version>
#        * <conflictor's-postinst> `abort-remove' `in-favour' <package>
#          <new-version>
#        * <postinst> `abort-remove'
#        * <deconfigured's-postinst> `abort-deconfigure' `in-favour'
#          <failed-install-package> <version> `removing'
#          <conflicting-package> <version>
# for details, see http://www.debian.org/doc/debian-policy/ or
# the debian-policy package

# sudoers drop-in shipped by this package
sudoers_file=/etc/sudoers.d/itzks-systems-backup

case "$1" in
    configure)
        # Register owner root:root and mode 0440 for the drop-in, unless an
        # override for that path is already listed.
        dpkg-statoverride --list "$sudoers_file" >/dev/null || \
            dpkg-statoverride --add --update root root 0440 "$sudoers_file"
        ;;

    abort-upgrade|abort-remove|abort-deconfigure)
        # nothing to undo
        ;;

    *)
        echo "postinst called with unknown argument \`$1'" >&2
        exit 1
        ;;
esac

# dh_installdeb will replace this with shell code automatically
# generated by other debhelper scripts.

#DEBHELPER#

exit 0
diff --git a/debian/itzks-systems-backup.prerm b/debian/itzks-systems-backup.prerm
new file mode 100755
index 0000000..6bb0047
--- /dev/null
+++ b/debian/itzks-systems-backup.prerm
@@ -0,0 +1,35 @@
#!/bin/sh
# prerm script for itzks-systems-backup
#
# see: dh_installdeb(1)
# summary of how this script can be called:
#        * <prerm> `remove'
#        * <old-prerm> `upgrade' <new-version>
#        * <new-prerm> `failed-upgrade' <old-version>
#        * <conflictor's-prerm> `remove' `in-favour' <package> <new-version>
#        * <deconfigured's-prerm> `deconfigure' `in-favour'
#          <package-being-installed> <version> `removing'
#          <conflicting-package> <version>
# for details, see http://www.debian.org/doc/debian-policy/ or
# the debian-policy package

set -e

# sudoers drop-in shipped by this package
sudoers_file=/etc/sudoers.d/itzks-systems-backup

case "$1" in
    remove)
        # Drop the stat override again, but only if one is registered.
        if dpkg-statoverride --list "$sudoers_file" >/dev/null; then
            dpkg-statoverride --remove "$sudoers_file"
        fi
        ;;

    deconfigure|upgrade|failed-upgrade)
        # nothing to do
        :
        ;;

    *)
        echo "$0: didn't understand being called with \`$1'" >&2
        exit 1
        ;;
esac

# dh_installdeb will replace this with shell code automatically
# generated by other debhelper scripts.

#DEBHELPER#

exit 0
diff --git a/etc/sudoers.d/itzks-systems-backup b/etc/sudoers.d/itzks-systems-backup
new file mode 100644
index 0000000..cf5a04d
--- /dev/null
+++ b/etc/sudoers.d/itzks-systems-backup
@@ -0,0 +1 @@
+%nagios ALL=(root:root) NOPASSWD: /usr/lib/nagios/plugins/check_dirvish.py
diff --git a/usr-lib-nagios-plugins/check_dirvish b/usr-lib-nagios-plugins/check_dirvish
new file mode 100755
index 0000000..3d4e37a
--- /dev/null
+++ b/usr-lib-nagios-plugins/check_dirvish
@@ -0,0 +1,3 @@
#!/bin/bash

# Thin wrapper: run check_dirvish.py (located next to this script) as root via
# sudo and hand all command line arguments through unchanged.
#
# "$0" and "$@" are quoted so that paths and arguments containing whitespace
# are not word-split; exec makes sudo's exit status the wrapper's exit status.
exec sudo -u root "$(dirname "$0")/check_dirvish.py" "$@"
diff --git a/usr-lib-nagios-plugins/check_dirvish.py b/usr-lib-nagios-plugins/check_dirvish.py
new file mode 100755
index 0000000..5802bb2
--- /dev/null
+++ b/usr-lib-nagios-plugins/check_dirvish.py
@@ -0,0 +1,362 @@
+#!/usr/bin/python3
+
+"""Nagios plugin to check the existence and freshness of a valid backup"""
+
+import argparse
+import logging
+import subprocess
+import os
+import datetime
+import collections
+import re
+
+try:
+ import nagiosplugin
+except ImportError as e:
+ print("Please install python3-nagiosplugin")
+ raise e
+
+try:
+ import dateutil.parser
+except ImportError as e:
+ print("Please install python3-dateutil")
+ raise e
+
+
+_log = logging.getLogger('nagiosplugin')
+
+
class E_PathNotAccessible(Exception):
    """Raised when a directory cannot be read or entered.

    The offending path is kept in ``value``.
    """

    def __init__(self, value):
        super().__init__(value)
        self.value = value

    def __str__(self):
        # %r already applies repr(); wrapping the value in repr() again
        # produced doubly quoted paths in the message.
        return "Basepath %r is not accessible" % (self.value,)
+
class E_PathNoDir(Exception):
    """Raised when a path is accessible but is not a directory.

    The offending path is kept in ``value``.
    """

    def __init__(self, value):
        super().__init__(value)
        self.value = value

    def __str__(self):
        # %r already applies repr(); wrapping the value in repr() again
        # produced doubly quoted paths in the message.
        return "Basepath %r is not a directory" % (self.value,)
+
class E_HistoryFileNotFound(Exception):
    """Raised when the history file of a vault cannot be found.

    The path of the missing file is kept in ``value``.
    """

    def __init__(self, value):
        super().__init__(value)
        self.value = value

    def __str__(self):
        # %r already applies repr(); the former extra repr() doubled the
        # quoting. Also fixes the typo "at last one" in the hint.
        return "HistoryFile %r not found. Is there at least one Backup?" % (self.value,)
+
class E_BackupNotValid(Exception):
    """Raised when a backup image fails validation.

    ``value`` holds the reason that is appended to the message.
    """

    def __init__(self, value):
        super().__init__(value)
        self.value = value

    def __str__(self):
        # Wrap the value in a real one-element tuple: "(self.value)" is just
        # a parenthesised expression, so a tuple value was unpacked by the
        # % operator and raised TypeError.
        return "Backup is not valid. %s" % (self.value,)
+
class E_VaultIsNotDirvishDirectory(Exception):
    """Raised when a vault directory lacks a usable dirvish configuration.

    ``value`` holds the path that was checked.
    """

    def __init__(self, value):
        super().__init__(value)
        self.value = value

    def __str__(self):
        # %r already applies repr(); wrapping the value in repr() again
        # produced doubly quoted paths in the message.
        return "Dirvish config in %r not found!" % (self.value,)
+
class E_FileNotAccessible(Exception):
    """Raised when a file cannot be read.

    The offending path is kept in ``value``.
    """

    def __init__(self, value):
        super().__init__(value)
        self.value = value

    def __str__(self):
        # %r already applies repr(); wrapping the value in repr() again
        # produced doubly quoted paths in the message.
        return "File %r is not accessible" % (self.value,)
+
class Backup(nagiosplugin.Resource):
    """Domain model: the backup images of one dirvish vault.

    The vault lives in ``<base_path>/<vault>``. probe() walks its images
    newest-first (by image name) and yields these metrics:

    * ``last_success``       -- seconds since the begin of the newest image
                                whose status is ``success`` or ``warning``
    * ``last_try``           -- seconds since the begin of the newest
                                completed image, whatever its status
    * ``duration``           -- run time in seconds of the newest completed
                                image
    * ``running_backup_for`` -- seconds since the begin of the newest image
                                without a ``backup-complete`` entry
    * ``valid_backup_found`` -- 1 if any inspected image was valid, else 0
    """

    # Entries every directory must contain to be considered a backup image.
    _IMAGE_MUST_HAVE = frozenset(('log', 'summary', 'tree'))

    # Status value of a summary file. Examples seen in the dirvish code:
    #   (255) --
    #   success
    #   warning (24) -- file vanished on sender
    _STATUS_RE = re.compile(r"""
        (?P<status>\w+)? \s*               # success|warning|fatal|error|unknown
        (\((?P<rsyncexitcode>\d+)\))? \s*  # rsync exitcode
        (--)? \s*                          # separator
        (?P<description>.*)?               # description
        """, re.IGNORECASE | re.VERBOSE)

    def __init__(self, vault, base_path):
        self.vault = vault
        self.base_path = base_path
        self.vault_base_path = os.path.join(self.base_path, self.vault)
        self.valid_backup_found = 0
        self.backup_running_now = 0
        # Gathered by check_backups(); created here so that check_backups()
        # also works when it is not called through probe().
        self.duration = None
        self.last_try = None
        self.last_success = None

    @property
    def name(self):
        """Name of the check: class name plus the vault name up to its first dot."""
        return "%s %s" % (self.__class__.__name__, self.vault.split('.')[0])

    def check_path_accessible(self, directory):
        """Ensure *directory* is a readable and searchable directory.

        Raises:
            E_PathNotAccessible: the path lacks read or execute permission
                (or does not exist).
            E_PathNoDir: the path is accessible but not a directory.
        """
        _log.debug("Check if %r is accessible and a directory", directory)
        if not os.access(directory, os.R_OK | os.X_OK):
            raise E_PathNotAccessible(directory)
        if not os.path.isdir(directory):
            raise E_PathNoDir(directory)

    def check_file_accessible(self, filename):
        """Ensure *filename* is readable.

        Raises:
            E_FileNotAccessible: the file cannot be read.
        """
        _log.debug("Check if %r is accessible", filename)
        if not os.access(filename, os.R_OK):
            raise E_FileNotAccessible(filename)

    def backups(self):
        """Return the set of names of all possible backup images of the vault.

        Candidates come from two sources: the first tab separated column of
        ``dirvish/default.hist`` (if readable) and every direct sub-directory
        of the vault that contains ``log``, ``summary`` and ``tree``.
        """
        _log.debug('Finding the latest backup for vault "%s"', self.vault)
        self.history_file = os.path.join(self.vault_base_path, 'dirvish', 'default.hist')
        _log.debug('Check for %r', self.history_file)
        images = set()

        if os.access(self.history_file, os.R_OK):
            with open(self.history_file, errors='replace') as histfile:
                # The first line is skipped (presumably a column header --
                # TODO confirm against the dirvish history format).
                lines = histfile.readlines()[1:]
            for entry in reversed(lines):
                image = entry.strip().split('\t')[0]
                if not image:
                    # blank line, there is no image name to record
                    continue
                _log.info("Found next backup in %r", image)
                images.add(image)

        _log.info("Adding directories in %r", self.vault_base_path)
        try:
            candidates = os.listdir(self.vault_base_path)
        except OSError as err:
            _log.error("Could not list %r: %s", self.vault_base_path, err)
            candidates = []
        for directory in candidates:
            path = os.path.join(self.vault_base_path, directory)
            if not os.path.isdir(path):
                continue
            try:
                content = os.listdir(path)
            except OSError as err:
                # One unreadable directory must not abort the whole check.
                _log.debug("Skipping %r: %s", path, err)
                continue
            if self._IMAGE_MUST_HAVE.issubset(content):
                images.add(directory)

        _log.info("Found possible backups: %r", images)
        return images

    def parse_backup(self, backup, parameterL=('status', 'backup-begin', 'backup-complete')):
        """Read the summary file of image *backup*.

        Args:
            backup: name of the image directory below the vault.
            parameterL: names of the summary parameters to extract; they are
                compared case-insensitively via str.casefold.

        Returns:
            dict mapping the casefolded name of every requested parameter
            that was found to its value. Parameters missing from the summary
            are missing from the dict as well.

        Raises:
            E_PathNotAccessible, E_PathNoDir: the image directory or its
                ``tree`` sub-directory is not a usable directory.
            E_BackupNotValid: the summary file cannot be read.
        """
        _log.debug('Parsing backup: %r', backup)
        wanted = [s.casefold() for s in parameterL]
        _log.debug("Searching for parameters %r", wanted)
        found = dict()
        backup_image = os.path.join(self.vault_base_path, backup)
        self.check_path_accessible(backup_image)
        self.check_path_accessible(os.path.join(backup_image, 'tree'))
        summary_file = os.path.join(backup_image, 'summary')
        if not os.access(summary_file, os.R_OK):
            raise E_BackupNotValid('could not access summary file')
        # errors='replace': an undecodable byte anywhere in the summary must
        # not make the whole image unparsable.
        with open(summary_file, errors='replace') as summary:
            for line in summary:
                # Split at the first ': ' only, so that values which contain
                # ': ' themselves are kept verbatim.
                parts = line.strip().split(': ', 1)
                if len(parts) != 2:
                    continue
                parameter, value = parts
                parameter_casefold = parameter.casefold()
                _log.debug('Found parameter %r with value %r', parameter_casefold, value)
                if parameter_casefold in wanted:
                    _log.debug("Adding parameter %r to returnDict", parameter_casefold)
                    found[parameter_casefold] = value
        _log.info("parsed Backup to: %r", found)
        return found

    def check_backups(self):
        """Inspect the images newest-first and gather the metric values.

        Fills ``duration``, ``last_try``, ``last_success``,
        ``backup_running_now`` and ``valid_backup_found``. Images that cannot
        be parsed are skipped, they never abort the check.
        """
        backups = self.backups()
        if not backups:
            self.valid_backup_found = 0
            return
        for backup in sorted(backups, reverse=True):
            try:
                parsed_backup = self.parse_backup(backup, ['status', 'backup-begin', 'backup-complete'])
            except (E_PathNotAccessible, E_PathNoDir, E_BackupNotValid) as e:
                _log.debug("Exception thrown: %s", e)
                continue

            begin = self._parse_timestamp(parsed_backup.get('backup-begin'))
            if begin is None:
                _log.debug("Skipping %r: no usable backup-begin", backup)
                continue
            _log.debug("Backup begin %r to %r", parsed_backup['backup-begin'], begin)

            if parsed_backup.get('backup-complete') is None:
                # backup is probably still running or was killed hard!
                # Images are visited newest-first: keep the value of the
                # newest incomplete image instead of overwriting it with
                # older ones.
                if not self.backup_running_now:
                    self.backup_running_now = round((datetime.datetime.now() - begin).total_seconds())
                continue

            end = self._parse_timestamp(parsed_backup['backup-complete'])
            if end is None:
                _log.debug("Skipping %r: unparsable backup-complete", backup)
                continue
            _log.debug("Backup end %r to %r", parsed_backup.get('backup-complete'), end)
            dur = end - begin
            _log.debug("Duration is: %s", dur)
            if self.duration is None:
                self.duration = round(dur.total_seconds())
                _log.info('Gathered last duration to %s hours', dur)
            if self.last_try is None:
                age = datetime.datetime.now() - begin
                self.last_try = round(age.total_seconds())
                _log.info('Gathered last_try to %s days', age)
            # status_has_errors() is True for the *good* states, see there.
            if Backup.status_has_errors(parsed_backup.get('status')):
                _log.debug('Valid backup found: %r', backup)
                self.valid_backup_found = 1
                if self.last_success is None:
                    age = datetime.datetime.now() - begin
                    self.last_success = round(age.total_seconds())
                    _log.info('Gathered last_success to %s', age)
            # Compare against None: 0 seconds is a perfectly valid value.
            if None not in (self.duration, self.last_try, self.last_success):
                _log.info('I have all required Informations. Exiting backup loop')
                break

    @staticmethod
    def _parse_timestamp(text):
        """Return *text* parsed by dateutil, or None if it is missing or unparsable."""
        if not text:
            return None
        try:
            return dateutil.parser.parse(text)
        except (ValueError, OverflowError):
            return None

    @staticmethod
    def status_has_errors(status):
        """Return True if the summary status line *status* denotes a usable backup.

        NOTE: despite its name this returns True for the *good* states
        ``success`` and ``warning`` (compared case-insensitively) and False
        for everything else, including a missing or empty status. The name
        is kept so existing callers keep working.

        Status values seen in the dirvish code:
            (255) --
            success
            warning (24) -- file vanished on sender
        """
        if not status:
            return False
        # Every part of the pattern is optional, so search() always matches.
        state = Backup._STATUS_RE.search(status.strip()).group('status')
        return state is not None and state.casefold() in ('success', 'warning')

    def check_valid_dirvish_vault(self):
        """Ensure the vault has a ``dirvish`` directory with a readable ``default.conf``.

        Raises:
            E_VaultIsNotDirvishDirectory: directory or config file is missing
                or not accessible.
        """
        _log.debug("Check if %r is a dirvish vault", self.vault)
        dirvish_dir = os.path.join(self.vault_base_path, 'dirvish')
        try:
            self.check_path_accessible(dirvish_dir)
            self.check_file_accessible(os.path.join(dirvish_dir, 'default.conf'))
        except (E_PathNotAccessible, E_PathNoDir, E_FileNotAccessible):
            raise E_VaultIsNotDirvishDirectory(dirvish_dir)

    def probe(self):
        """Create check metric for Backups

        'last_success' is the metric for the last successful backup
        'last_try' is the metric for the last try
        'duration' is the metric for the duration of the last backup
        'running_backup_for' is only yielded while an incomplete image exists
        'valid_backup_found' is always yielded (0 or 1)

        Raises E_PathNotAccessible / E_PathNoDir if the bank or the vault
        directory is unusable and E_VaultIsNotDirvishDirectory if the vault
        has no dirvish configuration.
        """
        self.duration = None
        self.last_try = None
        self.last_success = None

        self.check_path_accessible(self.base_path)
        self.check_path_accessible(self.vault_base_path)
        self.check_valid_dirvish_vault()
        self.check_backups()

        # the order of metrices matters which human readable output you'll get!
        _log.debug('last_success is %r seconds ago <%r>', self.last_success, type(self.last_success))
        if isinstance(self.last_success, int):
            yield nagiosplugin.Metric('last_success', self.last_success, uom='s', min=0)
        _log.debug('last_try is %r seconds ago, <%r>', self.last_try, type(self.last_try))
        if isinstance(self.last_try, int):
            yield nagiosplugin.Metric('last_try', self.last_try, uom='s', min=0)
        _log.debug('duration is instance of: %r seconds <%r>', self.duration, type(self.duration))
        if isinstance(self.duration, int):
            yield nagiosplugin.Metric('duration', self.duration, uom='s', min=0)
        _log.debug('Running backup runs for: %r seconds <%r>', self.backup_running_now, type(self.backup_running_now))
        if self.backup_running_now:
            yield nagiosplugin.Metric('running_backup_for', self.backup_running_now, uom='s', min=0)
        _log.debug('Valid Backup found: %r <%r>', self.valid_backup_found, type(self.valid_backup_found))
        yield nagiosplugin.Metric('valid_backup_found', self.valid_backup_found, min=0, max=1)
+
class Duration_Fmt_Metric(object):
    """Metric formatter that renders a number of seconds human readable.

    Instances are callables taking ``(metric, context)``. The format string
    given to the constructor may use the fields ``name``, ``value``, ``uom``,
    ``valueunit``, ``min`` and ``max``; ``valueunit`` is the compact duration
    text such as ``07h30`` or ``6Y7d``.
    """

    def __init__(self, fmt_string):
        self.fmt_string = fmt_string

    @staticmethod
    def seconds_human_readable(seconds):
        """Render *seconds* using at most two units.

        Years (365 days), months (30 days) and days are emitted largest
        first; months only when there are more than two of them. If fewer
        than two of those were emitted the remainder is shown as hours
        (``HHh``), and as hours plus minutes (``HHhMM``) when none was.
        """
        one_day = 60 * 60 * 24
        # (suffix, unit length in seconds, count that must be exceeded)
        large_units = (
            ("Y", one_day * 365, 0),
            ("M", one_day * 30, 2),
            ("d", one_day, 0),
        )

        pieces = []
        units_left = 2
        for suffix, length, threshold in large_units:
            count, rest = divmod(seconds, length)
            if count > threshold:
                pieces.append("%s%s" % (count, suffix))
                seconds = rest
                units_left -= 1
                if units_left <= 0:
                    return "".join(pieces)

        hours, seconds = divmod(seconds, 60 * 60)
        minutes, seconds = divmod(seconds, 60)
        if units_left > 1:
            pieces.append("{0:0>2}h{1:0>2}".format(hours, minutes))
        else:
            pieces.append("{0:0>2}h".format(hours))
        assert seconds < 60
        return "".join(pieces)

    def __call__(self, metric, context):
        """Return the format string filled with the fields of *metric*.

        The metric must be measured in seconds (``uom == "s"``).
        """
        assert metric.uom == "s"
        fields = dict(
            name=metric.name,
            value=metric.value,
            uom=metric.uom,
            valueunit=self.seconds_human_readable(int(metric.value)),
            min=metric.min,
            max=metric.max,
        )
        return self.fmt_string.format(**fields)
+
class Bool_Fmt_Metric(object):
    """Metric formatter that maps a boolean-like metric to one of two texts.

    Instances are callables taking ``(metric, context)``; they return
    ``msg_success`` when the metric value is truthy and ``msg_fail``
    otherwise.
    """

    def __init__(self, msg_success, msg_fail):
        self.msg_success = msg_success
        self.msg_fail = msg_fail

    def __call__(self, metric, context):
        _log.debug('UOM: %r', metric.uom)
        return self.msg_success if metric.value else self.msg_fail
+
+
+
@nagiosplugin.guarded
def main():
    """Parse the command line, build the check and run it.

    The warning/critical ranges given with -w/-c apply to both the
    ``last_success`` and the ``last_try`` metric. ``--max-duration`` is the
    warning threshold of ``duration`` and ``running_backup_for``; three times
    that value is the critical threshold of ``running_backup_for``.
    """
    argp = argparse.ArgumentParser()
    argp.add_argument('-w', '--warning', metavar='RANGE',
                      help='warning if backup age is outside RANGE in seconds')
    argp.add_argument('-c', '--critical', metavar='RANGE',
                      help='critical if backup age is outside RANGE in seconds')
    argp.add_argument('-v', '--verbose', action='count', default=0,
                      help='increase output verbosity (use up to 3 times)')
    argp.add_argument('-t', '--timeout', type=int, default=10,
                      help='abort execution after TIMEOUT seconds')
    argp.add_argument('--base-path', default="/srv/backup/",
                      help="Path to the bank of the vault (/srv/backup)")
    # type=int: without it a value given on the command line stays a string
    # and "max_duration * 3" below repeats the string ("7200" would become
    # "720072007200") instead of multiplying the number.
    argp.add_argument('--max-duration', type=int, default=3600, metavar='SECONDS',
                      help="max time in seconds a backup may take (3600)")
    # Without a vault there is nothing to check; let argparse report that
    # instead of failing later when the vault path is joined.
    argp.add_argument('--vault', required=True,
                      help='Name of the vault to check')
    args = argp.parse_args()
    check = nagiosplugin.Check(
        Backup(args.vault, args.base_path),
        nagiosplugin.ScalarContext(
            'valid_backup_found', critical='0.5:1',
            fmt_metric=Bool_Fmt_Metric('Valid backup found!', 'No valid Backup found!')),
        nagiosplugin.ScalarContext(
            'last_success', args.warning, args.critical,
            Duration_Fmt_Metric('Last successful backup is {valueunit} old')),
        nagiosplugin.ScalarContext(
            'last_try', args.warning, args.critical,
            Duration_Fmt_Metric('Last backup tried {valueunit} ago')),
        nagiosplugin.ScalarContext(
            name='duration',
            warning=args.max_duration,
            fmt_metric=Duration_Fmt_Metric('Last backuprun took {valueunit}')),
        nagiosplugin.ScalarContext(
            name='running_backup_for',
            warning=args.max_duration,
            critical=args.max_duration * 3,
            fmt_metric=Duration_Fmt_Metric('Running backup since {valueunit}')),
    )
    check.main(args.verbose, args.timeout)
+
+if __name__ == '__main__':
+ main()