/usr/share/check_mk/checks/ps.include

#!/usr/bin/python
# -*- encoding: utf-8; py-indent-offset: 4 -*-
# +------------------------------------------------------------------+
# |             ____ _               _        __  __ _  __           |
# |            / ___| |__   ___  ___| | __   |  \/  | |/ /           |
# |           | |   | '_ \ / _ \/ __| |/ /   | |\/| | ' /            |
# |           | |___| | | |  __/ (__|   <    | |  | | . \            |
# |            \____|_| |_|\___|\___|_|\_\___|_|  |_|_|\_\           |
# |                                                                  |
# | Copyright Mathias Kettner 2014             mk@mathias-kettner.de |
# +------------------------------------------------------------------+
#
# This file is part of Check_MK.
# The official homepage is at http://mathias-kettner.de/check_mk.
#
# check_mk is free software;  you can redistribute it and/or modify it
# under the  terms of the  GNU General Public License  as published by
# the Free Software Foundation in version 2.  check_mk is  distributed
# in the hope that it will be useful, but WITHOUT ANY WARRANTY;  with-
# out even the implied warranty of  MERCHANTABILITY  or  FITNESS FOR A
# PARTICULAR PURPOSE. See the  GNU General Public License for more de-
# tails. You should have  received  a copy of the  GNU  General Public
# License along with GNU Make; see the file  COPYING.  If  not,  write
# to the Free Software Foundation, Inc., 51 Franklin St,  Fifth Floor,
# Boston, MA 02110-1301 USA.

# Default parameters for the process checks. "levels" is the 4-tuple
# (warnmin, okmin, okmax, warnmax) for the number of matching processes, in
# the order in which check_ps_common() unpacks it.
factory_settings["ps_default_levels"] = {
    "levels" : (1, 1, 99999, 99999),
}

def ps_get_inventory_specs(invdata, invrules):
    """Collect the process discovery specifications that apply to the current host.

    invdata:  entries of the old-style inventory_processes variables. Each entry
              is (descr, pattern, user) optionally followed by the four levels
              (warnmin, okmin, okmax, warnmax), and optionally prefixed with
              either a host list or with a tag list plus a host list.
    invrules: WATO style rules of the form (value_dict, taglist, hostlist[, options]).

    Returns a list of 5-tuples
    (descr, pattern, user, icon, default_params) where default_params is a dict.
    For old-style entries it contains "levels" only if the entry specifies them.
    """
    inventory_specs = []

    # Handle inventory_processes, inventory_processes_perf variables
    for inv_entry in invdata:
        inv_entry = tuple(inv_entry)

        # New in 1.1.7i1: inventory_processes may be prefixed with list of
        # host names or tags + ALL_HOSTS
        # New in 1.2.6b7: inventory_processes might have no levels specified
        tags = None
        hostlist = None
        if len(inv_entry) in (4, 8):
            tags = []
            hostlist = inv_entry[0]
            inv_entry = inv_entry[1:]
        elif len(inv_entry) in (5, 9):
            tags = inv_entry[0]
            hostlist = inv_entry[1]
            inv_entry = inv_entry[2:]

        # Filter out entries which do not match our current host
        if tags is not None and ( not hosttags_match_taglist(tags_of_host(g_hostname), tags) or
                                  not in_extraconf_hostlist(hostlist, g_hostname)):
            continue

        # The levels are determined per entry. They must never be carried over
        # from a previous entry, and an entry without levels must not emit a
        # "levels" key at all, since the check unpacks it as a 4-tuple.
        default_params = {}
        if len(inv_entry) == 7:
            default_params["levels"] = inv_entry[3:7]

        inventory_specs.append(inv_entry[:3] + (None, default_params))

    # Handle new WATO style inventory rules
    for rule in invrules:
        taglist, hostlist = rule[1:3]
        if len(rule) >= 4:
            options = rule[3]
            if options.get("disabled"):
                continue

        # Filter out entries which do not match our current host
        if not hosttags_match_taglist(tags_of_host(g_hostname), taglist) \
           or not in_extraconf_hostlist(hostlist, g_hostname):
            continue

        v = rule[0]

        # Without explicit 'default_params' the rule dict itself is handed on;
        # the consumer has to skip the keys that are no check parameters.
        inventory_specs.append((v['descr'], v.get('match'), v.get('user'), v.get('icon'), v.get('default_params', v)))

    return inventory_specs


def inventory_ps_common(invdata, invrules, parsed):
    """Discover process services from the parsed agent output.

    Every discovery specification that applies to the host is matched against
    every process line. Each distinct combination of instantiated service
    description and parameters is returned once, as a list of
    (service description, parameter dict) tuples.

    Raises MKGeneralException if a service description contains more '%s'
    placeholders than the process pattern has subexpressions.
    """
    # Keys of a rule dict that steer the discovery and are not check parameters
    non_param_keys = ( "descr", "match", "user", "perfdata" )

    found = []
    for servicedesc, pattern, userspec, icon, default_params in ps_get_inventory_specs(invdata, invrules):
        wanted_groups = servicedesc.count("%s")

        for line in parsed:
            # line[0] is the node name (None for non-clusters), the rest describes the process
            matches, grabbed_user = process_matches_for_discovery(line[1:], pattern, userspec)
            if matches is False:
                continue # this process does not match

            if len(matches) < wanted_groups:
                raise MKGeneralException(
                    "Invalid entry in inventory_processes: service description '%s' contains "
                    "%d times '%%s', but regular expression '%s' contains only %d subexpression(s)." %
                    (servicedesc, wanted_groups, pattern, len(matches)))

            item_user, item_descr = userspec, servicedesc
            if userspec == GRAB_USER:
                item_user  = grabbed_user
                item_descr = servicedesc.replace("%u", grabbed_user)

            # %1, %2, ... are replaced with the first, second, ... group, which
            # allows reordering the matched groups (new in 1.2.2b4)
            for index, group in enumerate(matches):
                item_descr = item_descr.replace("%%%d" % (index + 1), group)

            # The pattern may contain more subexpressions than the description
            # has '%s' (allowed since 1.1.4); only the leading ones are used.
            item_descr = item_descr % tuple(matches[:wanted_groups])

            inv_params = {
                # all subexpressions are replaced by the values of the found process
                "process" : instantiate_regex_pattern(pattern, matches),
                "user"    : item_user,
            }
            if icon:
                inv_params["icon"] = icon

            # default_params is either a clean dict of defaults or - from
            # version 1.2.4 - the rule dict itself, whose discovery keys must
            # not end up in the check parameters.
            for key, value in default_params.items():
                if key in non_param_keys:
                    continue
                inv_params.setdefault(key, value)

            candidate = ( item_descr, inv_params )
            if candidate not in found:
                found.append(candidate)

    return found

def instantiate_regex_pattern(pattern, matches):
    """Replace the subexpressions of pattern, one per entry of matches, in order.

    Each match text is handed to instantiate_regex_pattern_once() together with
    the result of the previous step. Without matches the pattern is returned
    unchanged.
    """
    instantiated = pattern
    for matched_text in matches:
        instantiated = instantiate_regex_pattern_once(instantiated, matched_text)
    return instantiated

def instantiate_regex_pattern_once(pattern, match):
    r"""Replace the first unescaped (...) group of pattern with the escaped match text.

    Escaped parentheses \( and \) are skipped correctly. A closing parenthesis
    inside a character class such as [^)] is not recognized - sorry.
    """
    escaped = escape_regex_chars(match)
    # A callable replacement is inserted verbatim. A replacement *string* would
    # be treated as a template by re.sub(), which turns the "\\" produced by the
    # escaping back into a single backslash and so corrupts e.g. Windows paths.
    return re.compile(r"(?<!\\)\(.*?(?<!\\)\)").sub(lambda _found: escaped, pattern, 1)

def escape_regex_chars(match):
    """Return match with every regex special character prefixed by a backslash."""
    special_chars = r"[]\().?{}|*^$+"
    return "".join([ c in special_chars and "\\" + c or c for c in match ])


# FIXME: Merge logic of process_matches_for_* functions
def process_matches_for_discovery(process_line, process_pattern, userspec):
    """Match one process against a discovery specification.

    process_line is the process info tuple (its first element is the owner)
    followed by the words of the command line.

    Returns (matches, grabbed_user). matches is False if the process does not
    match, an empty tuple for a match without regex groups, or the list of
    regex groups with unmatched groups replaced by "". grabbed_user is the
    owner of the process; it is None if the owner is unknown or did not match
    userspec.
    """
    user = process_line[0][0]
    command_line = process_line[1:]

    grabbed_user = None
    if user is not None:
        if userspec:
            # A leading '~' marks userspec as a regex, otherwise it must be equal
            if userspec.startswith('~'):
                user_ok = bool(re.match(userspec[1:], user))
            else:
                user_ok = userspec == user
            if not user_ok:
                return False, None
        grabbed_user = user

    if not process_pattern:
        return (), grabbed_user

    if process_pattern.startswith("~"):
        # regex on the complete command line, the "~" itself is skipped
        matchobject = regex(process_pattern[1:]).match(" ".join(command_line))
        if matchobject:
            return [ g or "" for g in matchobject.groups() ], grabbed_user
    elif command_line[0] == process_pattern:
        # exact match on the first word of the command line
        return (), grabbed_user

    return False, grabbed_user


# Returns True or False
def process_matches_for_check(process_line, process_pattern, user_pattern):
    """Tell whether one process matches the process and user pattern of a service.

    process_line is the process info tuple (its first element is the owner)
    followed by the words of the command line. Patterns starting with '~' are
    regular expressions, all others must be equal: the user pattern to the
    owner, the process pattern to the first word of the command line. An empty
    pattern matches everything.

    Always returns True or False.
    """
    user, command_line = process_line[0][0], process_line[1:]

    # Check process owner. Note: the first field with process information is
    # optional and might be missing (old agents).
    if user_pattern:
        if user is None:
            # Without owner information a requested owner cannot be confirmed.
            # This is what the exact comparison below yields as well, and it
            # keeps the regex from being applied to None.
            return False

        if user_pattern.startswith('~'):
            if not regex(user_pattern[1:]).match(user):
                return False

        elif user_pattern != user:
            return False

    if not process_pattern:
        # Process name not relevant
        return True

    if process_pattern.startswith("~"):
        # Regex for complete process command line, skip the "~"
        return bool(regex(process_pattern[1:]).match(" ".join(command_line)))

    # Exact match on name of executable. A line without any command can not match.
    return bool(command_line) and command_line[0] == process_pattern


# produce text or html output intended for the long output field of a check from details about a
# process.
# the input is expected to be a list (one per process) of lists (one per data field) of key-value
# tuples where the value is again a 2-field tuple, first is the value, second is the unit.
# This function is actually fairly generic so it could be used for other data structured the same
# way
def format_process_list(processes, html_output):
    """Render details about processes for the long output of a check.

    processes:   list (one entry per process) of lists of (key, (value, unit))
                 tuples.
    html_output: if true, an HTML table with one column per key is produced,
                 otherwise one text line "key value, key value, ..." per process.

    Float values are printed with two decimals. Returns the rendered string.
    """
    def format_value(value):
        value, unit = value
        if isinstance(value, float):
            return "%.2f%s" % (value, unit)
        return "%s%s" % (value, unit)

    if html_output:
        table_bracket  = "<table>%s</table>"
        line_bracket   = "<tr>%s</tr>"
        # "%.0s" swallows the key: it is shown in the table header, not in the cell
        cell_bracket   = "<td>%.0s%s</td>"
        cell_separator = ""

        # Collect the column headers in order of first appearance
        headers = []
        headers_found = set()
        for process in processes:
            for key, _value in process:
                if key not in headers_found:
                    headers.append(key)
                    headers_found.add(key)

        # Make sure each process has all fields of the table. A missing field
        # must be an empty (value, unit) pair, because format_value() unpacks
        # every cell into these two components.
        empty_cell = ("", "")
        processes_filled = []
        for process in processes:
            dictified = dict(process)
            processes_filled.append([
                (key, dictified.get(key, empty_cell)) for key in headers
            ])
        processes = processes_filled
        header_line = "<tr><th>" + "</th><th>".join(headers) + "</th></tr>"
    else:
        table_bracket  = "%s"
        line_bracket   = "%s\r\n"
        cell_bracket   = "%s %s"
        cell_separator = ", "
        header_line    = ""

    lines = []
    for process in processes:
        cells = [ cell_bracket % (key, format_value(value)) for key, value in process ]
        lines.append(line_bracket % cell_separator.join(cells))

    return table_bracket % (header_line + "".join(lines))


# New parameter format: dictionary. Example:
# {
#    "user" : "foo",
#    "process" : "/usr/bin/food",
#    "warnmin" : 1,
#    "okmin"   : 1,
#    "okmax"   : 1,
#    "warnmax" : 1,
# }

# Even newer format:
# {
#   "user" : "foo",
#   "levels" : (1, 1, 99999, 99999)
# }
# Parse time as output by ps into seconds.
# Example 1: "12:17"
# Example 2: "55:12:17"
# Example 3: "7-12:34:59" (with 7 days)
def parse_ps_time(text):
    """Convert a time as printed by ps into seconds.

    Accepted forms are "MM:SS", "HH:MM:SS" and "D-HH:MM:SS", for example
    "12:17", "55:12:17" and "7-12:34:59" (with 7 days).
    """
    days = 0
    if "-" in text:
        tokens = text.split("-")
        days   = int(tokens[0])
        text   = tokens[1]

    # A real list is needed here: the number of fields decides about the format
    parts = [ int(part) for part in text.split(":") ]
    if len(parts) == 3:
        hours, minutes, seconds = parts
    else:
        hours, minutes, seconds = 0, parts[0], parts[1]

    return 86400 * days + 3600 * hours + 60 * minutes + seconds


def ps_cleanup_params(params):
    """Convert all historic parameter formats into the current dictionary format.

    Accepted are
     - a list/tuple (procname, warnmin, okmin, okmax, warnmax),
     - a list/tuple (procname, user, warnmin, okmin, okmax, warnmax),
     - a dict with some of the keys "warnmin", "okmin", "okmax", "warnmax",
     - a dict already in the current format, which is returned as it is.

    The result has the process count levels in "levels" as the tuple
    (warnmin, okmin, okmax, warnmax). A dict passed in is never modified.

    Raises ValueError for a list/tuple of any other length.
    """
    if isinstance(params, (list, tuple)):
        if len(params) == 5:
            procname, warnmin, okmin, okmax, warnmax = params
            user = None
        elif len(params) == 6:
            procname, user, warnmin, okmin, okmax, warnmax = params
        else:
            raise ValueError("Invalid process check parameters %r: expected 5 or 6 elements" % (params,))

        cleaned = {
            "process" : procname,
            "levels"  : (warnmin, okmin, okmax, warnmax),
        }
        if user is not None:
            cleaned["user"] = user
        return cleaned

    legacy_keys = ( "okmin", "warnmin", "okmax", "warnmax" )
    if any(key in params for key in legacy_keys):
        # Work on a copy: the dict belongs to the caller and may be shared
        params = dict(params)
        params["levels"] = (
            params.get("warnmin", 1),
            params.get("okmin", 1),
            params.get("okmax", 99999),
            params.get("warnmax", 99999),
        )

    return params


def ps_has_extended_perfdata(process_info):
    """Tell whether a process info tuple carries extended performance data.

    Uses the same threshold as check_ps_common(), which reads user, virtual
    size, resident size and CPU from a process info of at least four fields.
    Both must agree so that the set of performance values is the same whether
    or not a matching process was found.
    """
    return len(process_info) >= 4


def check_ps_common(item, params, parsed, cpu_cores = 1, info_name = "processes", total_ram = None):
    """Check number, memory, CPU, handles and age of the processes matching params.

    item:      name of the service, used as key for the CPU average
    params:    check parameters in any format understood by ps_cleanup_params()
    parsed:    list of lines [node_name, process_info, command line words...]
    cpu_cores: divisor for the CPU usage computed from the user/kernel time
               counters of processes that report at least ten info fields
    info_name: noun used in the summary, e.g. "3 processes"
    total_ram: total RAM of the host, needed for "resident_levels_perc"; it is
               compared with the resident size converted from kB to bytes

    Returns the tuple (state, infotext, perfdata).
    """
    now = time.time()
    params = ps_cleanup_params(params)

    count = 0
    virtual_size   = 0
    resident_size  = 0
    handle_count   = 0
    percent_cpu    = 0.0
    max_elapsed    = None
    min_elapsed    = None
    extended_perfdata = False
    processes = []

    running_on_nodes = set() # collect information about nodes the processes run on
    for line in parsed:
        node_name, process_line = line[0], line[1:]
        process_info, command_line = process_line[0], process_line[1:]

        if not process_matches_for_check(process_line, params.get("process"), params.get("user")):
            continue

        process = []
        count += 1

        if node_name is not None:
            running_on_nodes.add(node_name)

        if process_info:
            if command_line:
                process.append(("name", (command_line[0], "")))

            if len(process_info) >= 4: # extended performance data: virtualsize, residentsize, %cpu
                extended_perfdata = True
                process.append(("user",          (process_info[0], "")))
                process.append(("virtual size",  (int(process_info[1]), "kB")))
                process.append(("resident size", (int(process_info[2]), "kB")))

                virtual_size  += int(process_info[1]) # kB
                resident_size += int(process_info[2]) # kB
                if len(process_info) >= 10:
                    # even more data: processId, pagefile_usage, usermodetime, kernelmodetime, threadCount, openHandles
                    pid, pagefile_usage, user_c, kernel_c, handle_c = map(saveint, process_info[4:9])

                    process.append(("pagefile usage", (pagefile_usage, "")))
                    process.append(("handle count",   (handle_c, "")))

                    # Both counters are always fed, so that both are initialized
                    # even if the first one reports a wrap.
                    counter_wrapped = False
                    try:
                        user_per_sec   = get_rate("ps_wmic.user.%d" % pid, now, user_c, onwrap=RAISE)
                    except MKCounterWrapped:
                        counter_wrapped = True

                    try:
                        kernel_per_sec = get_rate("ps_wmic.kernel.%d" % pid, now, kernel_c, onwrap=RAISE)
                    except MKCounterWrapped:
                        counter_wrapped = True

                    if counter_wrapped:
                        user_per_sec   = 0
                        kernel_per_sec = 0

                    user_perc = user_per_sec / 100000.0 / cpu_cores
                    kernel_perc = kernel_per_sec / 100000.0 / cpu_cores
                    percent_cpu += user_perc + kernel_perc
                    handle_count += handle_c
                    process.append(("cpu usage (user space)",   (user_perc, "%")))
                    process.append(("cpu_usage (kernel space)", (kernel_perc, "%")))
                    process.append(("pid", (pid, "")))
                else:
                    # process_info[3] contains the used CPU time and possibly, separated by /, also
                    # the total elapsed time since the birth of the process.
                    if ":" in process_info[3]:
                        if '/' in process_info[3]:
                            pcpu_text, elapsed_text = process_info[3].split('/')
                            elapsed = parse_ps_time(elapsed_text)
                            if max_elapsed is None or elapsed > max_elapsed:
                                max_elapsed = elapsed
                            if min_elapsed is None or elapsed < min_elapsed:
                                min_elapsed = elapsed
                        else:
                            pcpu_text = process_info[3]

                        total_seconds = parse_ps_time(pcpu_text)
                        pid = process_info[4]
                        try:
                            cputime = get_rate("ps_stat.pcpu.%s" % pid, now, total_seconds, onwrap=RAISE)
                        except MKCounterWrapped:
                            cputime = 0

                        pcpu = cputime * 100
                        process.append(("pid", (pid, "")))
                    else:
                        pcpu = savefloat(process_info[3])
                    percent_cpu += pcpu
                    process.append(("cpu usage", (pcpu, "%")))

        elif command_line:
            # otherwise we have only two fields, one of which is the node. if the remaining
            # field is not the process name, that ps makes no sense at all
            process.append(("name", (command_line[0], "")))

        processes.append(process)

    if "cpulevels" in params:
        warn_cpu, crit_cpu = params["cpulevels"]
    else:
        warn_cpu, crit_cpu = None, None

    warnmin, okmin, okmax, warnmax = params["levels"]
    perfdata = [ ("count", count, okmax+1, warnmax+1, 0) ]
    if count == 0 and not extended_perfdata:
        # No matching process found. Therefore we don't know yet,
        # whether the agent sends extended data for processes.
        # We need to know this in order to create the RRD with the
        # correct number of DSes. So just look at the first process
        # in the agent output to make sure. We assume that at least
        # one process is always present.
        extended_perfdata = ps_has_extended_perfdata(parsed[0][1])

    if extended_perfdata:
        warn_vsz, crit_vsz = params.get("virtual_levels", (None, None))
        warn_rss, crit_rss = params.get("resident_levels", (None, None))
        # each size is reported together with its own levels
        perfdata += [ ("vsz", virtual_size, warn_vsz, crit_vsz),
                      ("rss", resident_size, warn_rss, crit_rss)]
        perfdata.append(("pcpu", percent_cpu, warn_cpu, crit_cpu))

    infotext = "%d %s" % (count, info_name)

    if running_on_nodes:
        infotext += " [running on %s]" % ", ".join(running_on_nodes)

    state = 0
    if count > warnmax or count < warnmin:
        state = 2
        infotext += " (ok from %d to %d)(!!)" % (okmin, okmax)
    elif count > okmax or count < okmin:
        state = 1
        infotext += " (ok from %d to %d)(!)" % (okmin, okmax)

    if virtual_size:
        for size, name in [ (virtual_size, "virtual"), (resident_size, "resident") ]:
            infotext += " %.1f MB %s" % ((size / 1024.0), name)
            if "%s_levels" % name in params:
                warn_level, crit_level = params["%s_levels" % name]
                # sizes are in kB, they are compared after multiplying by 1024
                if size * 1024 >= crit_level:
                    state = 2
                    infotext += "(!!)"
                elif size * 1024 >= warn_level:
                    state = max(state, 1)
                    infotext += "(!)"
            infotext += ","

        # Check levels that are in percent of the total RAM of the host
        if "resident_levels_perc" in params:
            warn_perc, crit_perc = params["resident_levels_perc"]
            if not total_ram:
                infotext += ", percentual RAM levels, but total RAM is unknown"
                if state in (0, 1):
                    state = 3
            else:
                resident_perc = 100 * float(resident_size * 1024) / total_ram
                infotext += " %s of total RAM" % get_percent_human_readable(resident_perc)
                levelstext = " (warn/crit at %d%%/%d%%)" % (warn_perc, crit_perc)
                if resident_perc >= crit_perc:
                    state = 2
                    infotext += levelstext
                elif resident_perc >= warn_perc:
                    state = max(state, 1)
                    infotext += levelstext
                else:
                    infotext += levelstext

            infotext += ","

        infotext += " %.1f%% CPU" % percent_cpu

        if "cpu_average" in params:
            avg_cpu = get_average("ps.%s.cpu" % item, now, percent_cpu, params["cpu_average"], False)
            infotext += " (%d min average: %.1f%%)" % (params["cpu_average"], avg_cpu)
            percent_cpu = avg_cpu # use this for level comparison
            if extended_perfdata:
                perfdata.append(("pcpuavg", avg_cpu, warn_cpu, crit_cpu, 0, params["cpu_average"]))

        if "cpulevels" in params:
            if percent_cpu >= crit_cpu:
                state = 2
                infotext += "(!!)"
            elif percent_cpu >= warn_cpu:
                state = max(state, 1)
                infotext += "(!)"

        # only check handle_count if provided by wmic counters
        if handle_count:
            infotext += ", %d process handles" % handle_count
            if "handle_count" in params:
                warn_handle, crit_handle = params["handle_count"]
                if handle_count >= crit_handle:
                    state = 2
                    infotext += "(!!)"
                elif handle_count >= warn_handle:
                    state = max(state, 1)
                    infotext += "(!)"
            else:
                warn_handle, crit_handle = None, None
            perfdata.append(("process_handles", handle_count, warn_handle, crit_handle))

    # Check how long the process is running
    if min_elapsed is not None:
        if min_elapsed == max_elapsed:
            infotext += ", running for %s" % get_age_human_readable(min_elapsed)
        else:
            infotext += ", youngest running for %s" % get_age_human_readable(min_elapsed)
            infotext += ", oldest running for %s" % get_age_human_readable(max_elapsed)

    # The age levels can only be applied if an elapsed time is known at all
    if "max_age" in params and max_elapsed is not None:
        warn_age, crit_age = params["max_age"]
        if max_elapsed >= crit_age:
            state = 2
            infotext += "(!!)"
        elif max_elapsed >= warn_age:
            state = max(state, 1)
            infotext += "(!)"

    if params.get("process_info", None) is not None:
        infotext += "\r\n" + format_process_list(processes, params["process_info"] == "html")

    return state, infotext, perfdata
check-mk-server 1.2.8p16-1ubuntu0.1 / usr / share / check_mk / checks / ps.include