/usr/share/check_mk/checks/df.include

#!/usr/bin/python
# -*- encoding: utf-8; py-indent-offset: 4 -*-
# +------------------------------------------------------------------+
# |             ____ _               _        __  __ _  __           |
# |            / ___| |__   ___  ___| | __   |  \/  | |/ /           |
# |           | |   | '_ \ / _ \/ __| |/ /   | |\/| | ' /            |
# |           | |___| | | |  __/ (__|   <    | |  | | . \            |
# |            \____|_| |_|\___|\___|_|\_\___|_|  |_|_|\_\           |
# |                                                                  |
# | Copyright Mathias Kettner 2014             mk@mathias-kettner.de |
# +------------------------------------------------------------------+
#
# This file is part of Check_MK.
# The official homepage is at http://mathias-kettner.de/check_mk.
#
# check_mk is free software;  you can redistribute it and/or modify it
# under the  terms of the  GNU General Public License  as published by
# the Free Software Foundation in version 2.  check_mk is  distributed
# in the hope that it will be useful, but WITHOUT ANY WARRANTY;  with-
# out even the implied warranty of  MERCHANTABILITY  or  FITNESS FOR A
# PARTICULAR PURPOSE. See the  GNU General Public License for more de-
# tails. You should have  received  a copy of the  GNU  General Public
# License along with GNU Make; see the file  COPYING.  If  not,  write
# to the Free Software Foundation, Inc., 51 Franklin St,  Fifth Floor,
# Boston, MA 02110-1301 USA.

# Common include file for all filesystem checks (df, df_netapp, hr_fs, ...)

# Settings for filesystem checks (df, df_vms, df_netapp and maybe others)
filesystem_levels         = [] # obsolete. Just here to check config and warn if changed
filesystem_default_levels = {} # can also be dropped some day in future

# Filesystems to ignore (shouldn't be sent by agent anyway)
inventory_df_exclude_fs            = [ 'tmpfs', 'nfs', 'smbfs', 'cifs', 'iso9660' ]
inventory_df_exclude_mountpoints   = [ '/dev' ]

# Grouping of filesystems into groups that are monitored as one entity
# Example:
# filesystem_groups = [
#     ( [ ( "Storage pool", "/data/pool*" ) ], [ 'linux', 'prod' ], ALL_HOSTS ),
#     ( [ ( "Backup space 1", "/usr/backup/*.xyz" ),
#         ( "Backup space 2", "/usr/backup2/*.xyz" ) ], ALL_HOSTS ),
# ]
filesystem_groups = []

# Alternative syntax for parameters:
# {  "levels"         : (80, 90),  # levels in percent
#    "magic"          : 0.5,       # magic factor
#    "magic_normsize" : 20,        # normsize in GB
#    "levels_low"     : (50, 60),  # magic never lowers levels below this (percent)
#    "trend_range"    : 24,        # hours
#    "trend_mb"       : (10, 20),  # MB of change during trend_range
#    "trend_perc"     : (1, 2),    # Percent change during trend_range
#    "trend_timeleft" : (72, 48)   # run time left in hours until full
# }

factory_settings["filesystem_default_levels"] = {
    "levels"          : (80.0, 90.0), # warn/crit in percent
    "magic_normsize"  : 20,       # Standard size if 20 GB
    "levels_low"      : (50.0, 60.0), # Never move warn level below 50% due to magic factor
    "trend_range"     : 24,
    "trend_perfdata"  : True,    # do send performance data for trends
    "show_levels"     : "onmagic",
    "inodes_levels"   : (10.0, 5.0),
    "show_inodes"     : "onlow",
    "show_reserved"   : False,
}

def df_inventory(mplist):
    group_patterns = {}
    for line in host_extra_conf(g_hostname, filesystem_groups):
        for group_name, pattern in line:
            group_patterns.setdefault(group_name, []).append(pattern)

    # Create one service for each mount point that is not contained
    # in a filesystem group. And create one service for each non-empty
    # filesystem group.
    inventory = []
    have_groups = set([])
    for mp in mplist:
        in_group = False
        for group_name, patterns in group_patterns.items():
            for pattern in patterns:
                if fnmatch.fnmatch(mp, pattern):
                    have_groups.add(group_name)
                    in_group = True
                    break
        if not in_group:
            inventory.append((mp, {}))

    for group_name in have_groups:
        inventory.append((group_name, { "patterns" : group_patterns[group_name] }))

    return inventory


# Users might have set filesystem_default_levels to old format like (80, 90)

# needed by df, df_netapp and vms_df and maybe others in future:
# compute warning and critical levels. Takes into account the size of
# the filesystem and the magic number. Since the size is only known at
# check time this function's result cannot be precompiled.
# Note: this function is in our days only needed in order to support
# old style params a'la (80, 90). As soon as we drop support for that
# (can happen any decade now), we can get rid of this function.
def get_filesystem_levels(host, mountpoint, size_gb, params):
    mega = 1024 * 1024
    giga = mega * 1024
    # Start with factory settings
    levels = factory_settings["filesystem_default_levels"].copy()

    def convert_legacy_levels(value):
        if type(params) == tuple or not params.get("flex_levels"):
            return tuple(map(float, value))
        else:
            return value

    # convert default levels to dictionary. This is in order support
    # old style levels like (80, 90)
    if type(filesystem_default_levels) == dict:
        fs_default_levels = filesystem_default_levels.copy()
        fs_levels = fs_default_levels.get("levels")
        if fs_levels:
            fs_default_levels["levels"] = convert_legacy_levels(fs_levels)
        levels.update(filesystem_default_levels)
    else:
        levels = factory_settings["filesystem_default_levels"].copy()
        levels["levels"] = convert_legacy_levels(filesystem_default_levels[:2])
        if len(filesystem_default_levels) == 2:
            levels["magic"] = None
        else:
            levels["magic"] = filesystem_default_levels[2]

    # If params is a dictionary, make that override the default values
    if type(params) == dict:
        levels.update(params)

    else: # simple format - explicitely override levels and magic
        levels["levels"] = convert_legacy_levels(params[:2])
        if len(params) >= 3:
            levels["magic"] = params[2]

    # Determine real warn, crit levels
    if type(levels["levels"]) == tuple:
        warn, crit = levels["levels"]
    else:
        # A list of levels. Choose the correct one depending on the
        # size of the current filesystem. We do not make the first
        # rule match, but that with the largest size_gb. That way
        # the order of the entries is not important.
        found = False
        found_size = 0
        for to_size, this_levels in levels["levels"]:
            if size_gb * giga > to_size and to_size >= found_size:
                warn, crit = this_levels
                found_size = to_size
                found = True
        if not found:
            warn, crit = 100.0, 100.0 # entry not found in list


    # If the magic factor is used, take disk size and magic factor
    # into account in order to move levels
    magic = levels.get("magic")
    # We need a way to disable the magic factor so check
    # if magic not 1.0
    if magic and magic != 1.0:
        # convert warn/crit to percentage
        if type(warn) != float:
            warn = savefloat(warn * mega / float(size_gb * giga)) * 100
        if type(crit) != float:
            crit = savefloat(crit * mega / float(size_gb * giga)) * 100

        normsize = levels["magic_normsize"]
        hgb_size = size_gb / float(normsize)
        felt_size = hgb_size ** magic
        scale = felt_size / hgb_size
        warn_scaled = 100 - (( 100 - warn ) * scale)
        crit_scaled = 100 - (( 100 - crit ) * scale)

        # Make sure, levels do never get too low due to magic factor
        lowest_warning_level, lowest_critical_level = levels["levels_low"]
        if warn_scaled < lowest_warning_level:
            warn_scaled = lowest_warning_level
        if crit_scaled < lowest_critical_level:
            crit_scaled = lowest_critical_level
    else:
        if type(warn) != float:
            warn_scaled = savefloat(warn * mega / float(size_gb * giga)) * 100
        else:
            warn_scaled = warn

        if type(crit) != float:
            crit_scaled = savefloat(crit * mega / float(size_gb * giga)) * 100
        else:
            crit_scaled = crit

    size_mb     = size_gb * 1024
    warn_mb     = savefloat(size_mb * warn_scaled / 100)
    crit_mb     = savefloat(size_mb * crit_scaled / 100)
    levels["levels_mb"] = (warn_mb, crit_mb)
    if type(warn) == float:
        if warn_scaled < 0 and crit_scaled < 0:
            label = 'warn/crit at free space below'
            warn_scaled *= -1
            crit_scaled *= -1
        else:
            label = 'warn/crit at'
        levels["levels_text"] = "(%s %.2f/%.2f%%)" % (label, warn_scaled, crit_scaled)
    else:
        if warn * mega < 0 and crit * mega < 0:
            label = 'warn/crit at free space below'
            warn *= -1
            crit *= -1
        else:
            label = 'warn/crit at'
        warn_hr = get_bytes_human_readable(warn * mega)
        crit_hr = get_bytes_human_readable(crit * mega)
        levels["levels_text"] = "(%s %s/%s)" % (label, warn_hr, crit_hr)

    if "inodes_levels" in params:
        if type(levels["inodes_levels"]) == tuple:
            warn, crit = levels["inodes_levels"]
        else:
            # A list of inode levels. Choose the correct one depending on the
            # size of the current filesystem. We do not make the first
            # rule match, but that with the largest size_gb. That way
            # the order of the entries is not important.
            found = False
            found_size = 0
            for to_size, this_levels in levels["inodes_levels"]:
                if size_gb * giga > to_size and to_size >= found_size:
                    warn, crit = this_levels
                    found_size = to_size
                    found = True
            if not found:
                warn, crit = 100.0, 100.0 # entry not found in list
        levels["inodes_levels"] = warn, crit
    else:
        levels["inodes_levels"] = (None, None)

    return levels

# Legacy function for checks that do not support groups yet
def df_check_filesystem(hostname, mountpoint, size_mb, avail_mb, reserved_mb, params):
    return df_check_filesystem_list(mountpoint, params, [(mountpoint, size_mb, avail_mb, reserved_mb)])

# New function for checks that support groups.
def df_check_filesystem_list(item, params, fslist_blocks, fslist_inodes = None):
    if "patterns" in params:
        patterns = params["patterns"]
        count = 0
        total_size_mb      = 0
        total_avail_mb     = 0
        total_reserved_mb  = 0
        total_inodes       = 0
        total_inodes_avail = 0
        for idx, (mp, size_mb, avail_mb, reserved_mb) in enumerate(fslist_blocks):
            for pattern in patterns:
                if fnmatch.fnmatch(mp, pattern):
                    count += 1
                    total_size_mb     += size_mb
                    total_avail_mb    += avail_mb
                    total_reserved_mb += reserved_mb
                    if fslist_inodes:
                        total_inodes       += fslist_inodes[idx][1]
                        total_inodes_avail += fslist_inodes[idx][2]
                    break
        # If no filesystem has been found we cannot do the
        # actual check since the size is zero.
        if count == 0:
            return (3, "No filesystem matching the patterns")
        else:
            status, infotext, perfdata = df_check_filesystem_single(g_hostname, item, total_size_mb, total_avail_mb, total_reserved_mb, total_inodes, total_inodes_avail, params)
            infotext += " (%d filesystems)" % count
            return status, infotext, perfdata
    else:
        for idx, (mp, size_mb, avail_mb, reserved_mb) in enumerate(fslist_blocks):
            if mp == item:
                if fslist_inodes:
                    inodes_total, inodes_avail = fslist_inodes[idx][1], fslist_inodes[idx][2]
                else:
                    inodes_total, inodes_avail = None, None
                return df_check_filesystem_single(g_hostname, mp, size_mb, avail_mb, reserved_mb, inodes_total, inodes_avail, params)
        return (3, "filesystem not found")


def df_check_filesystem_single(hostname, mountpoint, size_mb, avail_mb, reserved_mb, inodes_total, inodes_avail, params, this_time = None):
    if size_mb == 0:
        return (1, "size of filesystem is 0 MB", [])

    used_mb   = size_mb - avail_mb
    used_perc = 100.0 * (float(used_mb) / size_mb)
    size_gb   = size_mb / 1024.0

    # Get warning and critical levels already with 'magic factor' applied
    levels = get_filesystem_levels(g_hostname, mountpoint, size_gb, params)
    warn_mb, crit_mb       = levels["levels_mb"]
    warn_inode, crit_inode = levels["inodes_levels"]

    # Take into account magic scaling factor (third optional argument
    # in check params). A factor of 1.0 changes nothing. Factor should
    # be > 0 and <= 1. A smaller factor raises levels for big file systems
    # bigger than 100 GB and lowers it for file systems smaller than 100 GB.
    # Please run df_magic_factor.py to understand how it works.

    used_hr = get_bytes_human_readable(used_mb * 1024 * 1024)
    size_hr = get_bytes_human_readable(size_mb * 1024 * 1024)
    # If both numbers end with both MB or GB or TB, then drop the first one
    if used_hr[-2:] == size_hr[-2:]:
        used_hr = used_hr[:-3]

    # Show enough decimal digits so that very small percentages are still
    # visible!
    infotext = "%s used (%s of %s)" % (
       get_percent_human_readable(used_perc), used_hr, size_hr)

    if warn_mb < 0.0:
        # Negative levels, so user configured thresholds based on space left. Calculate the
        # upper thresholds based on the size of the filesystem
        crit_mb = size_mb + crit_mb
        warn_mb = size_mb + warn_mb

    status = 0
    if used_mb >= crit_mb:
        status = 2
    elif used_mb >= warn_mb:
        status = max(1, status)

    # TODO: In some future version use a fixed name as perf variable
    perf_var = mountpoint.replace(" ", "_")
    perfdata = [(perf_var, str(used_mb) + 'MB', warn_mb, crit_mb, 0, size_mb)]
    perfdata.append(('fs_size', str(size_mb) + 'MB'))

    if type(params) == dict:
        show_levels = params.get("show_levels")
        if show_levels == "always" or \
            (show_levels == "onproblem" and status > 0) or \
            (show_levels == "onmagic" and (status > 0 or levels.get("magic", 1.0) != 1.0)):
            infotext += ", " + levels["levels_text"]

        if reserved_mb > 0 and params["show_reserved"]:
            reserved_perc = 100.0 * float(reserved_mb) / size_mb
            infotext += ", therein reserved for root: %.1f%% (%.2f MB)" % (reserved_perc, reserved_mb)
            perfdata.append(("reserved", reserved_mb))

    # Trends. The trends are computed in two steps. In the first step we
    # compute the delta to the last check, using a normal check_mk counter.
    # In the second step we compute an average over that counter and can
    # make a long-time prediction. If we have that values we can optionally
    # apply levels.

    # Trend parameters:
    #    "trend_range"    : 24,        # hours
    #    "trend_mb"       : (10, 20),  # MB of change during trend_range
    #    "trend_perc"     : (1, 2),    # Percent change during trend_range
    #    "trend_timeleft" : (72, 48)   # run time left in hours until full

    problems = []

    MB = 1024 * 1024.0
    H24 = 60 * 60 * 24

    if levels.get("trend_range"):
        try:
            range = levels["trend_range"] # in hours
            range_sec = range * 3600.0
            if not this_time:
                this_time = time.time()

            # first compute current rate in MB/s by computing delta since last check
            rate = get_rate("df.%s.delta" % mountpoint, this_time, used_mb, allow_negative=True, onwrap=ZERO)
            if levels.get("trend_perfdata"):
                # Change in 1.1.13i3: The trend perfdata always outputs
                # the growth in MB/24h, not any longer in MB/trendrange
                perfdata.append(("growth", rate * H24))

            # average trend, initialize with zero, rate_avg is in MB/s
            rate_avg = get_average("df.%s.trend" % mountpoint,
                                   this_time, rate, range_sec / 60.0, True)

            # rate_avg is growth in MB/s, trend is in MB per trend range hours
            trend = rate_avg * range_sec
            sign = trend > 0 and "+" or ""
            infotext += ", trend: %s%s / %g hours" % \
                        (sign, get_bytes_human_readable(trend * MB), range)

            # levels for performance data
            warn_perf, crit_perf = None, None

            # apply levels for absolute growth in MB / interval
            trend_mb = levels.get("trend_mb")
            if trend_mb:
                wa, cr = trend_mb
                warn_perf, crit_perf = wa, cr
                if trend >= wa:
                    problems.append("growing too fast (warn/crit at %s/%s per %.1f h)(!" %
                       ( get_bytes_human_readable(wa * MB), get_bytes_human_readable(cr * MB), range))
                    status = max(1, status)
                    if trend >= cr:
                        status = 2
                        problems[-1] += "!"
                    problems[-1] += ")"
            else:
                wa, cr = None, None

            # apply levels for growth relative to filesystem size
            trend_perc = levels.get("trend_perc")
            if trend_perc:
                wa_perc, cr_perc = trend_perc
                wa = wa_perc / 100.0 * size_mb
                cr = cr_perc / 100.0 * size_mb
                if warn_perf != None:
                    warn_perf = min(warn_perf, wa)
                    crit_perf = min(crit_perf, cr)
                else:
                    warn_perf, crit_perf = wa, cr
                if trend >= wa:
                    problems.append("growing too fast (warn/crit at %.3f%%/%.3f%% per %.1f h)(!" %
                       ( wa_perc, cr_perc, range))
                    status = max(1, status)
                    if trend >= cr:
                        status = 2
                        problems[-1] += "!"
                    problems[-1] += ")"


            # compute time until filesystem is full (only for positive trend, of course)

            # The start value of hours_left is negative. The pnp graph and the perfometer
            # will interpret this as inifinite -> not growing
            hours_left = -1
            if trend > 0:
                space_left = size_mb - used_mb
                hours_left = space_left / trend * range
                timeleft = levels.get("trend_timeleft")
                def format_hours(hours):
                    if hours > 365 * 24:
                        return "more than a year"
                    elif hours > 90 * 24:
                        return "%0d months" % (hours/ (30 * 24))
                    elif hours > 4 * 7 * 24: # 4 weeks
                        return "%0d weeks" % (hours/ (7 * 24))
                    elif hours > 7 * 24:   # 1 week
                        return "%0.1f weeks" % (hours/ (7 * 24))
                    elif hours > 2 * 24:   # 2 days
                        return "%0.1f days" % (hours/24)
                    else:
                        return "%d hours" % hours

                if timeleft:
                    wa, cr = timeleft
                    if hours_left <= cr:
                        status = 2
                        problems.append("only %s until disk full(!!)" % format_hours(hours_left))
                    elif hours_left <= wa:
                        status = max(status, 1)
                        problems.append("only %s until disk full(!)" % format_hours(hours_left))
                    elif hours_left <= wa * 2 or levels.get("trend_showtimeleft"):
                        problems.append("time left until disk full: %s" % format_hours(hours_left))
                elif levels.get("trend_showtimeleft"):
                    problems.append("time left until disk full: %s" % format_hours(hours_left))

            if levels.get("trend_perfdata"):
                # New in 1.1.13i3: output trend not as MB / trend_range, but as
                # MB / 24 hours. The same holds for the warn and crit information.
                # It is configured in MB / trend range but in the performance data
                # it's sent as MB / 24h.
                perfdata.append(("trend", rate_avg * H24,
                                warn_perf != None and (warn_perf / range_sec * H24) or None,
                                crit_perf != None and (crit_perf / range_sec * H24) or None,
                                0, size_mb / range))

            if levels.get("trend_showtimeleft"):
                perfdata.append(("trend_hoursleft", hours_left))


        except MKCounterWrapped:
            # need more data for computing a trend

            # In this case erase all perfdata to prevent an rrd file which has no space
            # for the trend information
            perfdata = []

    if problems:
        infotext += " - %s" % ", ".join(problems)
        problems = []

    # Check inode levels
    inode_status = 0
    if inodes_total:
        inodes_avail_perc = 100.0 * inodes_avail / inodes_total
        inodes_warn, inodes_crit = levels["inodes_levels"]
        if inodes_warn != None:
            # Levels in absolute numbers
            if type(inodes_warn) == int:
                if inodes_crit > inodes_avail:
                    inode_status = 2
                    problems.append("less than %dk inodes available(!!)" % (crit_inode / 1000))
                elif inodes_warn > inodes_avail:
                    inode_status = 1
                    problems.append("less than %dk inodes available(!)" % (warn_inode / 1000))
                inodes_warn_abs = inodes_warn
                inodes_crit_abs = inodes_crit

            # Levels in percent
            else:
                if inodes_crit > inodes_avail_perc:
                    inode_status = 2
                    problems.append("less than %0.2f%% inodes available(!!)" % inodes_crit)
                elif inodes_warn > inodes_avail_perc:
                    inode_status = 1
                    problems.append("less than %.02f%% inodes available(!)" % inodes_warn)
                inodes_warn_abs = (100 - inodes_warn) / 100.0 * inodes_total
                inodes_crit_abs = (100 - inodes_crit) / 100.0 * inodes_total

        else:
            inodes_warn_abs = None
            inodes_crit_abs = None

        # Only show inodes if they are at less then 50%
        status = max(status, inode_status)
        show_inodes = levels["show_inodes"]
        if show_inodes == "always" or \
            (show_inodes == "onlow" and (inode_status or inodes_avail_perc < 50)) or \
            (show_inodes == "onproblem" and inode_status):
            infotext += ", inodes available: %dk/%0.2f%%" % (inodes_avail / 1000, inodes_avail_perc)

        perfdata += [ ("inodes_used", inodes_total - inodes_avail, inodes_warn_abs, inodes_crit_abs, 0, inodes_total) ]

    if problems:
        infotext += " - %s" % ", ".join(problems)
        problems = []

    return (status, infotext, perfdata)
check-mk-server 1.2.8p16-1ubuntu0.1 / usr / share / check_mk / checks / df.include