/usr/share/check_mk/checks/diskstat.include

#!/usr/bin/python
# -*- encoding: utf-8; py-indent-offset: 4 -*-
# +------------------------------------------------------------------+
# |             ____ _               _        __  __ _  __           |
# |            / ___| |__   ___  ___| | __   |  \/  | |/ /           |
# |           | |   | '_ \ / _ \/ __| |/ /   | |\/| | ' /            |
# |           | |___| | | |  __/ (__|   <    | |  | | . \            |
# |            \____|_| |_|\___|\___|_|\_\___|_|  |_|_|\_\           |
# |                                                                  |
# | Copyright Mathias Kettner 2014             mk@mathias-kettner.de |
# +------------------------------------------------------------------+
#
# This file is part of Check_MK.
# The official homepage is at http://mathias-kettner.de/check_mk.
#
# check_mk is free software;  you can redistribute it and/or modify it
# under the  terms of the  GNU General Public License  as published by
# the Free Software Foundation in version 2.  check_mk is  distributed
# in the hope that it will be useful, but WITHOUT ANY WARRANTY;  with-
# out even the implied warranty of  MERCHANTABILITY  or  FITNESS FOR A
# PARTICULAR PURPOSE. See the  GNU General Public License for more de-
# tails. You should have  received  a copy of the  GNU  General Public
# License along with GNU Make; see the file  COPYING.  If  not,  write
# to the Free Software Foundation, Inc., 51 Franklin St,  Fifth Floor,
# Boston, MA 02110-1301 USA.

diskstat_inventory_mode = "rule" # "summary", "single", "legacy"

diskstat_default_levels = {
#    "read" :    (10, 20),   # MB/sec
#    "write" :   (20, 40),   # MB/sec
#    "average" : 15,         # min
#    "latency" : (10, 20),   # ms
#    "latency_perfdata" : True,
}

# Rule for controlling diskstat inventory more fine grained
diskstat_inventory = []

# Example
# diskstat_inventory = [
#  ( [], [ 'linux' ], ALL_HOST ), --> No diskstat on this host
#  ( [ 'summary', 'physical', 'lvm', 'vxvm' ], ALL_HOSTS ),
# ]

def inventory_diskstat_generic(info):
    # Skip over on empty data
    if not info:
        return

    # New style: use rule based configuration, defaulting to summary mode
    if diskstat_inventory_mode == "rule":
        hits = host_extra_conf(g_hostname, diskstat_inventory)
        if len(hits) > 0:
            modes = hits[0]
        else:
            modes = [ "summary" ]

    elif diskstat_inventory_mode == "single":
        modes = [ "physical" ]
    elif diskstat_inventory_mode == "summary":
        modes = [ "summary" ]
    else:
        modes = [ "legacy" ]

    inventory = []
    if "summary" in modes:
        inventory.append( ( "SUMMARY", "diskstat_default_levels" ) )

    if "legacy" in modes:
        inventory += [ ( "read", None ), ( "write", None ) ]

    if "physical" in modes:
        inventory += [ (line[1], "diskstat_default_levels")
                       for line in info
                       if not ' ' in line[1] ]

    if "lvm" in modes:
        inventory += [ (line[1], "diskstat_default_levels")
                       for line in info
                       if line[1].startswith("LVM ") ]

    if "vxvm" in modes:
        inventory += [ (line[1], "diskstat_default_levels")
                       for line in info
                       if line[1].startswith("VxVM ") ]

    return inventory



def check_diskstat_line(this_time, item, params, line, mode='sectors'):
    average_range = params.get("average")
    if average_range == 0:
        average_range = None # disable averaging when 0 is set

    perfdata = []
    infos = []
    status = 0
    node = line[0]
    if node != None and node != "":
        infos.append("Node %s" % node)
    prediction_perf = []
    for what, ctr in [ ("read",  line[2]), ("write", line[3]) ]:
        if node:
            countername = "diskstat.%s.%s.%s" % (node, item, what)
        else:
            countername = "diskstat.%s.%s" % (item, what)

        # unpack levels now, need also for perfdata
        levels = params.get(what)
        if type(levels) == tuple:
            warn, crit = levels
        else:
            warn, crit = None, None

        per_sec = get_rate(countername, this_time, int(ctr))
        if mode == 'sectors':
            # compute IO rate in bytes/sec
            bytes_per_sec = per_sec * 512
        elif mode == 'bytes':
            bytes_per_sec = per_sec

        infos.append("%s/sec %s" % (get_bytes_human_readable(bytes_per_sec), what))
        perfdata.append( (what, bytes_per_sec, warn, crit) )
        dsname = what

        # compute average of the rate over ___ minutes
        if average_range != None:
            avg = get_average(countername + ".avg", this_time, bytes_per_sec, average_range)
            dsname = what + ".avg"
            perfdata.append( (dsname, avg) )
            bytes_per_sec = avg

        # check levels
        state, text, extraperf = check_levels(bytes_per_sec, dsname, levels,
                                              unit = "MB/s", scale = 1048576, statemarkers=True)
        if text:
            infos.append(text)
        status = max(state, status)
        prediction_perf += extraperf

    # Add performance data for averaged IO
    if average_range != None:
        perfdata = [ perfdata[0], perfdata[2], perfdata[1], perfdata[3] ]

    # Process IOs when available
    ios_per_sec = None
    if len(line) >= 6 and line[4] >= 0 and line[5] > 0:
        reads, writes = map(int, line[4:6])
        ios = reads + writes
        ios_per_sec = get_rate(countername + ".ios", this_time, ios)
        infos.append("IOs: %.2f/sec" % ios_per_sec)

        if params.get("latency_perfdata"):
            perfdata.append(("ios", ios_per_sec))

    # Do Latency computation if this information is available:
    if len(line) >= 7 and line[6] >= 0:
        timems = int(line[6])
        timems_per_sec = get_rate(countername + ".time", this_time, timems)
        if not ios_per_sec:
            latency = 0.0
        else:
            latency = timems_per_sec / ios_per_sec
        infos.append("Latency: %.2fms" % latency)
        if "latency" in params:
            warn, crit = params["latency"]
            if latency >= crit:
                status = 2
                infos[-1] += "(!!)"
            elif latency >= warn:
                status = max(status, 1)
                infos[-1] += "(!)"
        else:
            warn, crit = None, None

        if params.get("latency_perfdata"):
            perfdata.append(("latency", latency, warn, crit))

    # Queue Lengths (currently only Windows). Windows uses counters here.
    # I have not understood, why....
    if len(line) >= 9:
        for what, ctr in [ ("read",  line[7]), ("write", line[8]) ]:
            countername = "diskstat.%s.ql.%s" % (item, what)
            levels = params.get(what + "_ql")
            if levels:
                warn, crit = levels
            else:
                warn, crit = None, None

            qlx = get_rate(countername, this_time, int(ctr))
            ql = qlx / 10000000.0
            infos.append(what.title() + " Queue: %.2f" % ql)

            # check levels
            if levels != None:
                if ql >= crit:
                    status = 2
                    infos[-1] += "(!!)"
                elif ql >= warn:
                    status = max(status, 1)
                    infos[-1] += "(!)"

            if params.get("ql_perfdata"):
                perfdata.append((what + "_ql", ql))

    perfdata += prediction_perf

    return (status, ", ".join(infos) , perfdata)


def check_diskstat_generic(item, params, this_time, info, mode='sectors'):
    # legacy version if item is "read" or "write"
    if item in [ 'read', 'write' ]:
        return check_diskstat_old(item, params, this_time, info)

    # Sum up either all physical disks (if item is "SUMMARY") or
    # all entries matching the item in question. It is not a bug if
    # a disk appears more than once. This can for example happen in
    # Windows clusters - even if they are no Check_MK clusters.

    summed_up = [0] * 13
    matching = 0

    for line in info:
        if item == 'SUMMARY' and line[0] != None:
            return 3, "summary mode not supported in a cluster"

        elif item == 'SUMMARY' and ' ' in line[1]:
            continue # skip non-physical disks

        elif item == 'SUMMARY' or line[1] == item:
            matching += 1
            summed_up = map(lambda e: e[0] + int(e[1]), zip(summed_up, line[2:]))

    if matching == 0:
        return 3, "No matching disk found"
    else:
        return check_diskstat_line(this_time, item, params, [None, ''] + summed_up, mode)


# This is the legacy version of diskstat as used in <= 1.1.10.
# We keep it here for a while in order to be compatible with
# old installations.
def check_diskstat_old(item, params, this_time, info):
    # sum up over all devices
    if item == 'read':
        index = 2 # sectors read
    elif item == 'write':
        index = 3 # sectors written
    else:
        return (3, "invalid item %s" % (item,))

    this_val = 0
    for line in info:
        if line[0] != None:
            return 3, "read/write mode not supported in a cluster"
        if ' ' not in line[1]:
            this_val += int(line[index])

    per_sec = get_rate("diskstat." + item, this_time, this_val)
    mb_per_s = per_sec / 2048.0    # Diskstat output is in sectors a 512 Byte
    kb_per_s = per_sec / 2.0
    perfdata = [ (item, "%f" % kb_per_s ) ]
    return (0, "%.1f MB/s" % mb_per_s, perfdata)


#.
#   .--Dict based API------------------------------------------------------.
#   |  ____  _      _     _                        _      _    ____ ___    |
#   | |  _ \(_) ___| |_  | |__   __ _ ___  ___  __| |    / \  |  _ \_ _|   |
#   | | | | | |/ __| __| | '_ \ / _` / __|/ _ \/ _` |   / _ \ | |_) | |    |
#   | | |_| | | (__| |_  | |_) | (_| \__ \  __/ (_| |  / ___ \|  __/| |    |
#   | |____/|_|\___|\__| |_.__/ \__,_|___/\___|\__,_| /_/   \_\_|  |___|   |
#   |                                                                      |
#   +----------------------------------------------------------------------+
#   |  The newest generation of Disk IO checks parse all informatin info   |
#   |  a dictionary, where counters are aleady resolved. Look at diskstat  |
#   |  (the Linux diskstat check) for an example.                          |
#   '----------------------------------------------------------------------'

def diskstat_select_disk(disks, item):

    # In summary mode we add up the throughput values, but
    # we average the other values for disks that have a throughput
    # > 0. Note: This is not very precise. Strictly spoken
    # we would need to do the summarization directly in the
    # parse function. But there we do not have information about
    # the physical multipath devices and would add up the traffic
    # of the paths with the traffice of the device itself....

    if item == "SUMMARY":
        summarized = {
            "node"                       : None,
            # We do not set these settings explictly because some
            # devices may not provide all of them.
            # "read_ios"                   : 0.0,
            # "write_ios"                  : 0.0,
            # "read_throughput"            : 0.0,
            # "write_throughput"           : 0.0,
            # "utilization"                : 0.0,
            # "latency"                    : 0.0,
            # "average_request_size"       : 0.0,
            # "average_wait"               : 0.0,
            # "average_read_wait"          : 0.0,
            # "average_read_request_size"  : 0.0,
            # "average_write_wait"         : 0.0,
            # "average_write_request_size" : 0.0,
            # "queue_length"               : 0.0,
        }

        if disks:
            num_averaged = 0
            for device, disk in disks.items():
                # If all disks are idle the summarized dict would have no keys
                # So we take care that at least all keys of this disk are set
                for key in disk.keys():
                    if key != "node":
                        summarized.setdefault(key, 0.0)

                if device.startswith("LVM "):
                    continue # skip LVM devices for summary

                if True or disk["read_throughput"] + disk["write_throughput"] > 0: # skip idle disks
                    num_averaged += 1
                    for key, value in disk.items():
                        if key != "node":
                            summarized[key] += value

            if num_averaged:
                for key, value in summarized.items():
                    if key.startswith("ave") or key in ("utilization", "latency", "queue_length"):
                        summarized[key] /= num_averaged

        return summarized

    elif item not in disks:
        return None

    else:
        return disks[item]

# New version for this diskstat checks that use the new dict
# format. The first one is "diskstat" - the Linux version of
# this check. Look there for examples of the format of the
# dictionary "disks". Example:
# disks = { "sda" : {
#       'node'                       : None,
#       'average_read_request_size'  : 0.0,
#       'average_read_wait'          : 0.0,
#       'average_request_size'       : 40569.90476190476,
#       'average_wait'               : 0.761904761904762,
#       'average_write_request_size' : 40569.90476190476,
#       'average_write_wait'         : 0.0007619047619047619,
#       'read_ios'                   : 0.0,
#       'read_throughput'            : 0.0,
#       'latency'                    : 0.00038095238095238096,
#       'utilization'                : 0.0006153846153846154,
#       'write_ios'                  : 1.6153846153846154,
#       'write_throughput'           : 65536.0,
#       'queue_length'               : 0.0,
# }}
def check_diskstat_dict(item, params, disks):
    # Take care of previously discovered services
    if item in ("read", "write"):
        yield 3, "Sorry, the new version of this check does not " \
                  "support one service for read and one for write anymore."
        return

    this_time = time.time()
    disk = diskstat_select_disk(disks, item)
    if not disk:
        return

    # Averaging
    # Note: this check uses a simple method of averaging: As soon as averaging
    # is turned on the actual metrics are *replaced* by the averaged ones. No
    # duplication of performance data or check output here. This is because we
    # have so many metrics...
    prefix = ""
    averaging = params.get("average") # in seconds here!
    if averaging:
        avg_disk = {} # Do not modify our arguments!!
        for key, value in disk.items():
            if type(value) in (int, float):
                avg_disk[key] = get_average("diskstat.%s.%s.avg" % (item, key), this_time, value, averaging / 60.0)
            else:
                avg_disk[key] = value
        disk = avg_disk
        prefix = "%s average: " % get_age_human_readable(averaging)


    # Utilization
    if "utilization" in disk:
        util = disk["utilization"]
        state, text, extraperf = check_levels(util, "disk_utilization", params.get("utilization"),
                                              unit = "%", scale = 0.01, statemarkers=False)
        yield state, "%sUtilization: %.1f%%%s" % (prefix, util * 100, text), extraperf


    # Throughput
    for what in "read", "write":
        if what + "_throughput" in disk:
            throughput = disk[what + "_throughput"]
            state, text, extraperf = check_levels(throughput, "disk_" + what + "_throughput", params.get(what),
                                                  unit = "MB/s", scale = 1048576, statemarkers=False)
            yield state, "%s: %s/s%s" % (what.title(), get_bytes_human_readable(throughput), text), extraperf


    # Average wait from end to end
    for what in [ "wait", "read_wait", "write_wait"]:
        if "average_" + what in disk:
            wait = disk["average_" + what]
            state, text, extraperf = check_levels(wait, what, params.get(what),
                                                  unit = "ms", scale = 0.001, statemarkers=False)
            yield state, "Average %s: %.2f ms%s" % (what.title().replace("_", " "), wait * 1000, text), extraperf

    # Average disk latency
    if "latency" in disk:
        latency = disk["latency"]
        state, text, extraperf = check_levels(latency, "disk_latency", params.get("latency"),
                                              unit = "ms", scale = 0.001, statemarkers=False)
        yield state, "Latency: %.2f ms%s" % (latency * 1000.0, text), extraperf

    # All the other metrics are currently not output in the plugin output - simply because
    # of their amount. They are present as performance data and will shown in graphs.

    # Send everything as performance data now. Sort keys alphabetically
    perfdata = []
    for key in sorted(disk.keys()):
        value = disk[key]
        if type(value) in (int, float):
            # Currently the levels are not shown in the perfdata
            perfdata.append(("disk_" + key, value))

    yield 0, None, perfdata
check-mk-server 1.2.8p16-1ubuntu0.1 / usr / share / check_mk / checks / diskstat.include