diff options
author | Mike Gabriel <mike.gabriel@das-netzwerkteam.de> | 2016-07-19 18:37:39 +0200 |
---|---|---|
committer | Mike Gabriel <mike.gabriel@das-netzwerkteam.de> | 2016-07-19 18:38:11 +0200 |
commit | d3eb24ecbe1c4445c6444c343b6c88c7ed7ceb67 (patch) | |
tree | aca241aa542ff6654a64494043dac3eeea720854 /usr-lib-nagios-plugins/check_dirvish.py | |
parent | 0f80e23fb61ea6023ddfb66a925e0e9b9791ed81 (diff) | |
download | itzks-systems-d3eb24ecbe1c4445c6444c343b6c88c7ed7ceb67.tar.gz itzks-systems-d3eb24ecbe1c4445c6444c343b6c88c7ed7ceb67.tar.bz2 itzks-systems-d3eb24ecbe1c4445c6444c343b6c88c7ed7ceb67.zip |
Monitoring: Add check_dirvish (must run via sudo).
Diffstat (limited to 'usr-lib-nagios-plugins/check_dirvish.py')
-rwxr-xr-x | usr-lib-nagios-plugins/check_dirvish.py | 362 |
1 files changed, 362 insertions, 0 deletions
diff --git a/usr-lib-nagios-plugins/check_dirvish.py b/usr-lib-nagios-plugins/check_dirvish.py new file mode 100755 index 0000000..5802bb2 --- /dev/null +++ b/usr-lib-nagios-plugins/check_dirvish.py @@ -0,0 +1,362 @@ +#!/usr/bin/python3 + +"""Nagios plugin to check the existence and freshness of a valid backup""" + +import argparse +import logging +import subprocess +import os +import datetime +import collections +import re + +try: + import nagiosplugin +except ImportError as e: + print("Please install python3-nagiosplugin") + raise e + +try: + import dateutil.parser +except ImportError as e: + print("Please install python3-dateutil") + raise e + + +_log = logging.getLogger('nagiosplugin') + + +class E_PathNotAccessible(Exception): + def __init__(self, value): + self.value = value + def __str__(self): + return "Basepath %r is not accessible" %repr(self.value) + +class E_PathNoDir(Exception): + def __init__(self, value): + self.value = value + def __str__(self): + return "Basepath %r is not a directory" %repr(self.value) + +class E_HistoryFileNotFound(Exception): + def __init__(self, value): + self.value = value + def __str__(self): + return "HistoryFile %r not found. Is there at last one Backup?" %repr(self.value) + +class E_BackupNotValid(Exception): + def __init__(self, value): + self.value = value + def __str__(self): + return "Backup is not valid. %s" % (self.value) + +class E_VaultIsNotDirvishDirectory(Exception): + def __init__(self, value): + self.value = value + def __str__(self): + return "Dirvish config in %r not found!" %repr(self.value) + +class E_FileNotAccessible(Exception): + def __init__(self, value): + self.value = value + def __str__(self): + return "File %r is not accessible" %repr(self.value) + +class Backup(nagiosplugin.Resource): + """Domain model: Dirvish vaults""" + + def __init__(self, vault, base_path): + self.vault = vault + self.base_path = base_path + self.vault_base_path = os.path.join(self.base_path, self.vault) + self.valid_backup_found = 0 + self.backup_running_now = 0 + + @property + def name(self): + """formatting the Testname (will be formatted as uppercase letters)""" + return "%s %s" % (self.__class__.__name__, self.vault.split('.')[0]) + + + def check_path_accessible(self, directory): + _log.debug("Check if %r is accessible and a directory", directory) + if not os.access(directory, os.R_OK | os.X_OK): + raise E_PathNotAccessible(directory) + if not os.path.isdir(directory): + raise E_PathNoDir(directory) + return + + def check_file_accessible(self, filename): + _log.debug("Check if %r is accessible", filename) + if not os.access(filename, os.R_OK): + raise E_FileNotAccessible(filename) + return + + def backups(self): + """Returns a iterable of backup-sub-directories""" + _log.debug('Finding the latest backup for vault "%s"', self.vault) + self.history_file = os.path.join(self.vault_base_path, 'dirvish', 'default.hist') + _log.debug('Check for %r' % self.history_file) + resultS = set() + if os.access(self.history_file, os.R_OK): + with open(self.history_file) as histfile: + lines = histfile.readlines()[1:] + for entry in reversed(lines): + try: + last_entry = entry.strip() + image = last_entry.split('\t')[0] + _log.info("Found next backup in %r", image) + except Exception as e: + _log.error("Something unexpected happened, while reading file %r", self.history_file) + next + resultS.add(image) + for dirname, dirnames, filenames in os.walk(self.vault_base_path): + _log.info("Adding directories in %r", self.vault_base_path) + # files that should be in every dirvish backup directory: + mustHaveS = {'log', 'summary', 'tree'} + for directory in dirnames: + dirCont = set(os.listdir(os.path.join(self.vault_base_path, directory))) + if mustHaveS.issubset(dirCont): + resultS.add(directory) + dirnames.clear() + _log.info("Found possible backups: %r", resultS) + return resultS + + def parse_backup(self, backup, parameterL = ['status', 'backup-begin', 'backup-complete']): + """ Check the last backup for validity. + Returns a dict with found keys in parameterL. + All parameters are treated as caseinsensitive via str.casefold + """ + _log.debug('Parsing backup: %r', backup) + _parameterL = [ s.casefold() for s in parameterL ] + _log.debug("Searching for parameters %r", _parameterL) + _resultD = dict() + backup_image = os.path.join(self.vault_base_path, backup) + self.check_path_accessible(backup_image) + self.check_path_accessible(os.path.join(backup_image, 'tree')) + summary_file = os.path.join(backup_image, 'summary') + if not os.access(summary_file, os.R_OK): + raise E_BackupNotValid('could not access summary file') + with open(summary_file) as summary: + for line in summary.readlines(): + parts = line.strip().split(': ') + if len(parts) >= 2: + # we have a definition + parameter = parts[0] + value = " ".join(parts[1:]) + _log.debug('Found parameter %r with value %r', parameter.casefold(), value) + parameter_casefold = parameter.casefold() + if parameter_casefold in _parameterL: + _log.debug("Adding parameter %r to returnDict", parameter_casefold) + _resultD[parameter_casefold] = value + _log.info("parsed Backup to: %r", _resultD) + return _resultD + + def check_backups(self): + backups = self.backups() + if len(backups) == 0: + self.valid_backup_found = 0 + return + for backup in reversed(sorted(backups)): + try: + parsed_backup = self.parse_backup(backup, ['status', 'backup-begin', 'backup-complete']) + except E_PathNotAccessible as e: + _log.debug("Exception thrown: %s", e) + continue + begin = dateutil.parser.parse(parsed_backup['backup-begin']) + _log.debug("Backup begin %r to %r", parsed_backup['backup-begin'], begin) + if parsed_backup.get('backup-complete') is None: + # backup is probably still running or was killed hard! + self.backup_running_now = round((datetime.datetime.now() - begin).total_seconds()) + continue + end = dateutil.parser.parse(parsed_backup['backup-complete']) + _log.debug("Backup end %r to %r", parsed_backup.get('backup-complete'), end) + dur = end - begin + _log.debug("Duration is: %s", dur) + if self.duration is None: + self.duration = round(dur.total_seconds()) + _log.info('Gathered last duration to %s hours', dur) + if self.last_try is None: + age = datetime.datetime.now() - begin + self.last_try = round(age.total_seconds()) + _log.info('Gathered last_try to %s days', age) + if Backup.status_has_errors(parsed_backup['status']): + _log.debug('Valid backup found: %r', backup) + self.valid_backup_found = 1 + if self.last_success is None: + age = datetime.datetime.now() - begin + self.last_success = round(age.total_seconds()) + _log.info('Gathered last_success to %s', age) + if self.duration and self.last_try and self.last_success: + _log.info('I have all required Informations. Exiting backup loop') + break + + @staticmethod + def status_has_errors(status): + """ this takes the status line and validates it. + check the gitolite code + status is something of: (value i have seen in dirvish code) + (255) -- + success + warning (24) -- file vanished on sender + """ + regexp = re.compile(r""" + (?P<status>\w+)? \s* # success|warning|fatal|error|unknown + (\((?P<rsyncexitcode>\d+)\))? \s* # rsync exitcode + (--)? \s* # separator + (?P<description>.*)? # description + """, re.IGNORECASE|re.VERBOSE) + statusD = regexp.search(status).groupdict() + if statusD['status'] in ['success', 'warning']: + return True + return False + + + + def check_valid_dirvish_vault(self): + _log.debug("Check if %r is a dirvish vault", self.vault) + dirvish_dir = os.path.join(self.vault_base_path, 'dirvish') + try: + self.check_path_accessible(dirvish_dir) + self.check_file_accessible(os.path.join(dirvish_dir, 'default.conf')) + except (E_PathNotAccessible, E_FileNotAccessible): + raise E_VaultIsNotDirvishDirectory(dirvish_dir) + + def probe(self): + """Create check metric for Backups + + 'last_success' is the metric for the lastsuccessful backup + 'last_try' is the metric for the last try + 'duraction' is the metric for the duration of the last backup + """ + self.duration = None + self.last_try = None + self.last_success = None + + self.check_path_accessible(self.base_path) + self.check_path_accessible(self.vault_base_path) + self.check_valid_dirvish_vault() + self.check_backups() + + # the order of metrices matters which human readable output you'll get! + _log.debug('last_success is %r seconds ago <%r>', self.last_success, type(self.last_success)) + if isinstance(self.last_success, int): + yield nagiosplugin.Metric('last_success', self.last_success, uom='s', min=0) + _log.debug('last_try is %r seconds ago, <%r>', self.last_try, type(self.last_try)) + if isinstance(self.last_try, int): + yield nagiosplugin.Metric('last_try', self.last_try, uom='s', min=0) + _log.debug('duration is instance of: %r seconds <%r>', self.duration, type(self.duration)) + if isinstance(self.duration, int): + yield nagiosplugin.Metric('duration', self.duration, uom='s', min=0) + _log.debug('Running backup runs for: %r seconds <%r>', self.backup_running_now, type(self.backup_running_now)) + if self.backup_running_now: + yield nagiosplugin.Metric('running_backup_for', self.backup_running_now, uom='s', min=0) + _log.debug('Valid Backup found: %r <%r>', self.valid_backup_found, type(self.valid_backup_found)) + yield nagiosplugin.Metric('valid_backup_found', self.valid_backup_found, min=0, max=1) + +class Duration_Fmt_Metric(object): + """ this class only use is to format a metric containing timedeltas + to print a human readable output like 7:30 or 6Y7d. """ + + def __init__(self, fmt_string): + self.fmt_string = fmt_string + + @staticmethod + def seconds_human_readable(seconds): + year = 60*60*24*365 + month = 60*60*24*30 + day = 60*60*24 + hour = 60*60 + minute = 60 + + string = "" + remaining_unitcount = 2 + years, remain = divmod(seconds, year) + if years > 0: + string += "%sY" % years + seconds = remain + remaining_unitcount -= 1 + if remaining_unitcount <=0: + return string + months, remain = divmod(seconds, month) + if months > 2: + string += "%sM" % months + seconds = remain + remaining_unitcount -= 1 + if remaining_unitcount <=0: + return string + days, remain = divmod(seconds, day) + if days > 0: + string += "%sd" % days + seconds = remain + remaining_unitcount -= 1 + if remaining_unitcount <=0: + return string + hours, seconds = divmod(seconds, hour) + minutes, seconds = divmod(seconds, minute) + if remaining_unitcount > 1: + string += "{0:0>2}h{1:0>2}".format(hours, minutes) + else: + string += "{0:0>2}h".format(hours) + assert seconds < 60 + return string + + def __call__(self, metric, context): + assert metric.uom == "s" + valueunit = self.seconds_human_readable(int(metric.value)) + return self.fmt_string.format( + name=metric.name, value=metric.value, uom=metric.uom, + valueunit=valueunit, min=metric.min, max=metric.max) + +class Bool_Fmt_Metric(object): + """print a message for a bool-metric """ + + def __init__(self, msg_success, msg_fail): + self.msg_success = msg_success + self.msg_fail = msg_fail + + def __call__(self, metric, context): + _log.debug('UOM: %r', metric.uom) + if metric.value: + return self.msg_success + else: + return self.msg_fail + + + +@nagiosplugin.guarded +def main(): + argp = argparse.ArgumentParser() + argp.add_argument('-w', '--warning', metavar='RANGE', + help='warning if backup age is outside RANGE in seconds'), + argp.add_argument('-c', '--critical', metavar='RANGE', + help='critical if backup age is outside RANGE in seconds') + argp.add_argument('-v', '--verbose', action='count', default=0, + help='increase output verbosity (use up to 3 times)') + argp.add_argument('-t', '--timeout', default=10, + help='abort execution after TIMEOUT seconds') + argp.add_argument('--base-path', default="/srv/backup/", + help="Path to the bank of the vault (/srv/backup)") + argp.add_argument('--max-duration', default=3600, metavar='RANGE', + help="max time in hours to take a backup (3600) in seconds") + argp.add_argument('--vault', help='Name of the vault to check') + args = argp.parse_args() + check = nagiosplugin.Check( + Backup(args.vault, args.base_path), + nagiosplugin.ScalarContext( 'valid_backup_found', critical='0.5:1', + fmt_metric = Bool_Fmt_Metric('Valid backup found!', 'No valid Backup found!')), + nagiosplugin.ScalarContext( 'last_success', args.warning, args.critical, + Duration_Fmt_Metric('Last successful backup is {valueunit} old')), + nagiosplugin.ScalarContext( 'last_try', args.warning, args.critical, + Duration_Fmt_Metric('Last backup tried {valueunit} ago')), + nagiosplugin.ScalarContext( name = 'duration', + warning = args.max_duration, + fmt_metric = Duration_Fmt_Metric('Last backuprun took {valueunit}')), + nagiosplugin.ScalarContext( name = 'running_backup_for', + warning = args.max_duration, + critical = args.max_duration*3, + fmt_metric = Duration_Fmt_Metric('Running backup since {valueunit}')),) + check.main(args.verbose, args.timeout) + +if __name__ == '__main__': + main() |