Index: trunk/debs/ganglios/debian/control |
— | — | @@ -0,0 +1,19 @@ |
| 2 | +Source: ganglios |
| 3 | +Section: net |
| 4 | +Priority: optional |
| 5 | +Maintainer: Lex Linden <lex@lindenlab.com> |
| 6 | +Build-Depends: debhelper (>= 5), python-support |
| 7 | +Standards-Version: 3.7.2 |
| 8 | + |
| 9 | +Package: ganglios |
| 10 | +Architecture: any |
| 11 | +Depends: ${shlibs:Depends}, ${misc:Depends}, ${python:Depends}, nagios3 |
| 12 | +Description: Scripts to help nagios act on data from ganglia |
| 13 | + Ganglios retrieves metrics from a gmond collector node and stores them in the |
| 14 | + local filesystem, one file per host. Nagios checks can refer to this data to |
| 15 | + perform checks on a wide range of hosts much more efficiently than SSHing into |
| 16 | + every node. |
| 17 | + . |
| 18 | + Ganglios includes a Python module, ganglios.py, to make acting on this data |
| 19 | + simpler. Example nagios checks are included to show how to use the data |
| 20 | + ganglios retrieves. |
Index: trunk/debs/ganglios/debian/dirs |
— | — | @@ -0,0 +1,3 @@ |
| 2 | +usr/sbin |
| 3 | +usr/lib/nagios/plugins |
| 4 | +usr/share/pyshared/ganglios |
Index: trunk/debs/ganglios/debian/pyversions |
— | — | @@ -0,0 +1 @@ |
| 2 | +2.4-2.6 |
Index: trunk/debs/ganglios/debian/compat |
— | — | @@ -0,0 +1 @@ |
| 2 | +5 |
Index: trunk/debs/ganglios/debian/ganglios.cron |
— | — | @@ -0,0 +1,6 @@ |
| 2 | +# |
| 3 | +# update /var/lib/ganglia/xmlcache/ every 2 minutes |
| 4 | +# |
| 5 | + |
| 6 | +*/2 * * * * nagios /usr/sbin/ganglia_parser |
| 7 | + |
Index: trunk/debs/ganglios/debian/postinst |
— | — | @@ -0,0 +1,39 @@ |
| 2 | +#!/bin/sh |
| 3 | +# postinst script for ganglios |
| 4 | +# |
| 5 | +# see: dh_installdeb(1) |
| 6 | + |
| 7 | +set -e |
| 8 | + |
| 9 | +# summary of how this script can be called: |
| 10 | +# * <postinst> `configure' <most-recently-configured-version> |
| 11 | +# * <old-postinst> `abort-upgrade' <new version> |
| 12 | +# * <conflictor's-postinst> `abort-remove' `in-favour' <package> |
| 13 | +# <new-version> |
| 14 | +# * <postinst> `abort-remove' |
| 15 | +# * <deconfigured's-postinst> `abort-deconfigure' `in-favour' |
| 16 | +# <failed-install-package> <version> `removing' |
| 17 | +# <conflicting-package> <version> |
| 18 | +# for details, see http://www.debian.org/doc/debian-policy/ or |
| 19 | +# the debian-policy package |
| 20 | + |
| 21 | + |
| 22 | +case "$1" in |
| 23 | + configure) |
| 24 | + ;; |
| 25 | + |
| 26 | + abort-upgrade|abort-remove|abort-deconfigure) |
| 27 | + ;; |
| 28 | + |
| 29 | + *) |
| 30 | + echo "postinst called with unknown argument \`$1'" >&2 |
| 31 | + exit 1 |
| 32 | + ;; |
| 33 | +esac |
| 34 | + |
| 35 | +# dh_installdeb will replace this with shell code automatically |
| 36 | +# generated by other debhelper scripts. |
| 37 | + |
| 38 | +#DEBHELPER# |
| 39 | + |
| 40 | +exit 0 |
Index: trunk/debs/ganglios/debian/prerm |
— | — | @@ -0,0 +1,38 @@ |
| 2 | +#!/bin/sh |
| 3 | +# prerm script for ganglios |
| 4 | +# |
| 5 | +# see: dh_installdeb(1) |
| 6 | + |
| 7 | +set -e |
| 8 | + |
| 9 | +# summary of how this script can be called: |
| 10 | +# * <prerm> `remove' |
| 11 | +# * <old-prerm> `upgrade' <new-version> |
| 12 | +# * <new-prerm> `failed-upgrade' <old-version> |
| 13 | +# * <conflictor's-prerm> `remove' `in-favour' <package> <new-version> |
| 14 | +# * <deconfigured's-prerm> `deconfigure' `in-favour' |
| 15 | +# <package-being-installed> <version> `removing' |
| 16 | +# <conflicting-package> <version> |
| 17 | +# for details, see http://www.debian.org/doc/debian-policy/ or |
| 18 | +# the debian-policy package |
| 19 | + |
| 20 | + |
| 21 | +case "$1" in |
| 22 | + remove|upgrade|deconfigure) |
| 23 | + ;; |
| 24 | + |
| 25 | + failed-upgrade) |
| 26 | + ;; |
| 27 | + |
| 28 | + *) |
| 29 | + echo "prerm called with unknown argument \`$1'" >&2 |
| 30 | + exit 1 |
| 31 | + ;; |
| 32 | +esac |
| 33 | + |
| 34 | +# dh_installdeb will replace this with shell code automatically |
| 35 | +# generated by other debhelper scripts. |
| 36 | + |
| 37 | +#DEBHELPER# |
| 38 | + |
| 39 | +exit 0 |
Index: trunk/debs/ganglios/debian/changelog |
— | — | @@ -0,0 +1,5 @@ |
| 2 | +ganglios (1.0-1) stable; urgency=low |
| 3 | + |
| 4 | + * Initial release |
| 5 | + |
| 6 | + -- Lex Linden <lex@lindenlab.com> Mon, 26 Oct 2009 10:17:24 -0400 |
Index: trunk/debs/ganglios/debian/copyright |
— | — | @@ -0,0 +1,18 @@ |
| 2 | +This package was debianized by Lex Linden <lex@lindenlab.com> on |
| 3 | +Mon, 26 Oct 2009 10:17:24 -0400. |
| 4 | + |
| 5 | +Copyright: |
| 6 | + |
| 7 | + Copyright (C) 2008 Linden Lab |
| 8 | + |
| 9 | +License: |
| 10 | + Released under the GPL v2 or later. |
| 11 | + For a full description of the license, please examine: |
| 12 | + /usr/share/common-licenses/GPL-2 |
| 13 | + |
| 14 | +The Debian packaging is: |
| 15 | + |
| 16 | + Copyright (C) 2009 Linden Lab |
| 17 | + |
| 18 | +and is licensed under the GPL v2 or later |
| 19 | + |
Index: trunk/debs/ganglios/debian/docs |
— | — | @@ -0,0 +1,2 @@ |
| 2 | +debian/README |
| 3 | +debian/ganglios.cron |
\ No newline at end of file |
Index: trunk/debs/ganglios/debian/rules |
— | — | @@ -0,0 +1,91 @@ |
| 2 | +#!/usr/bin/make -f |
| 3 | +# -*- makefile -*- |
| 4 | +# Sample debian/rules that uses debhelper. |
| 5 | +# This file was originally written by Joey Hess and Craig Small. |
| 6 | +# As a special exception, when this file is copied by dh-make into a |
| 7 | +# dh-make output file, you may use that output file without restriction. |
| 8 | +# This special exception was added by Craig Small in version 0.37 of dh-make. |
| 9 | + |
| 10 | +# Uncomment this to turn on verbose mode. |
| 11 | +#export DH_VERBOSE=1 |
| 12 | + |
| 13 | + |
| 14 | + |
| 15 | + |
| 16 | + |
| 17 | +configure: configure-stamp |
| 18 | +configure-stamp: |
| 19 | + dh_testdir |
| 20 | + # Add here commands to configure the package. |
| 21 | + |
| 22 | + touch configure-stamp |
| 23 | + |
| 24 | + |
| 25 | +build: build-stamp |
| 26 | + |
| 27 | +build-stamp: configure-stamp |
| 28 | + dh_testdir |
| 29 | + |
| 30 | + # Add here commands to compile the package. |
| 31 | + $(MAKE) |
| 32 | + #docbook-to-man debian/ganglios.sgml > ganglios.1 |
| 33 | + |
| 34 | + touch $@ |
| 35 | + |
| 36 | +clean: |
| 37 | + dh_testdir |
| 38 | + dh_testroot |
| 39 | + rm -f build-stamp configure-stamp |
| 40 | + |
| 41 | + # Add here commands to clean up after the build process. |
| 42 | + $(MAKE) clean |
| 43 | + |
| 44 | + dh_clean |
| 45 | + |
| 46 | +install: build |
| 47 | + dh_testdir |
| 48 | + dh_testroot |
| 49 | + # dh_prep |
| 50 | + dh_installdirs |
| 51 | + |
| 52 | + # Add here commands to install the package into debian/ganglios. |
| 53 | + $(MAKE) DESTDIR=$(CURDIR)/debian/ganglios install |
| 54 | + |
| 55 | + |
| 56 | +# Build architecture-independent files here. |
| 57 | +binary-indep: install |
| 58 | +# We have nothing to do by default. |
| 59 | + |
| 60 | +# Build architecture-dependent files here. |
| 61 | +binary-arch: install |
| 62 | + dh_testdir |
| 63 | + dh_testroot |
| 64 | + dh_installchangelogs |
| 65 | + dh_installdocs |
| 66 | + dh_installexamples |
| 67 | +# dh_install |
| 68 | +# dh_installmenu |
| 69 | +# dh_installdebconf |
| 70 | +# dh_installlogrotate |
| 71 | +# dh_installemacsen |
| 72 | +# dh_installpam |
| 73 | +# dh_installmime |
| 74 | + dh_pysupport |
| 75 | +# dh_installinit |
| 76 | +# dh_installcron |
| 77 | +# dh_installinfo |
| 78 | + dh_installman |
| 79 | + dh_link |
| 80 | + dh_strip |
| 81 | + dh_compress |
| 82 | + dh_fixperms |
| 83 | +# dh_perl |
| 84 | +# dh_makeshlibs |
| 85 | + dh_installdeb |
| 86 | + dh_shlibdeps |
| 87 | + dh_gencontrol |
| 88 | + dh_md5sums |
| 89 | + dh_builddeb |
| 90 | + |
| 91 | +binary: binary-indep binary-arch |
| 92 | +.PHONY: build clean binary-indep binary-arch binary install configure |
Property changes on: trunk/debs/ganglios/debian/rules |
___________________________________________________________________ |
Added: svn:executable |
1 | 93 | + * |
Index: trunk/debs/ganglios/debian/README |
— | — | @@ -0,0 +1,37 @@ |
| 2 | +################## |
| 3 | +## |
| 4 | +## Ganglios - Integration of Nagios with Ganglia |
| 5 | +## |
| 6 | +## This directory contains a number of scripts that make it easy to |
| 7 | +## configure nagios to take action based on data gathered by ganglia |
| 8 | +## |
| 9 | +## Written for Linden Lab, copyright 2008 |
| 10 | +## License to modify and redistribute granted under the GPL v2 or later |
| 11 | +## |
| 12 | +################## |
| 13 | + |
| 14 | + |
| 15 | +0. Table of Contents |
| 16 | +1. Overview |
| 17 | +2. Installation |
| 18 | + |
| 19 | + |
| 20 | +1. Overview |
| 21 | + |
| 22 | +This document assumes familiarity with both Ganglia (http://ganglia.info) and Nagios (http://www.nagios.org). |
| 23 | + |
| 24 | +Ganglios retrieves metrics from a gmond collector node and stores them in the local filesystem, one file per host. Nagios checks can refer to this data to perform checks on a wide range of hosts much more efficiently than SSHing into every node. |
| 25 | + |
| 26 | +Your ganglia configuration should have one (or more) gmond nodes that are serving as collector nodes. In a multicast environment, any gmond node will do. This nagios / ganglia integration works by retrieving all the ganglia stats from one or more gmond nodes and storing them locally for nagios to examine. The nagios plugins then examine the locally stored files to retrieve the relevant statistics, using the included python module that understands the on-disk ganglia stats. |
| 27 | + |
| 28 | +Note that you must only point ganglia_parser at multiple gmond collector nodes if they have different information, rather than for redundancy. If you have separate clusters in your environment, you should point ganglia_parser to one gmond for each cluster. |
| 29 | + |
| 30 | +2. Installation |
| 31 | + |
| 32 | +On your nagios host: |
| 33 | +2a. modify the listXMLSources() function in ganglia_parser to correctly return a list of your gmond collector nodes. |
| 34 | +2b. install ganglia_parser in /usr/local/sbin/ (or in a location of your choice) |
| 35 | +2c. install the 'ganglios' python module in /usr/local/lib/python2.3/site-packages/ganglios/ |
| 36 | +2d. install the cronjob (ganglios.cron), which collects the ganglia data, in /etc/cron.d/. Set the user this cronjob runs as to the owner of /var/lib/ganglia/ |
| 37 | + |
| 38 | +At this point, you should be populating /var/lib/ganglia/xmlcache/ every 2 minutes. Examine this data to make sure it seems right. You should have /var/lib/ganglia/xmlcache/<hostname>.xml (one for each collector node) and /var/lib/ganglia/xmlcache/hosts/<hostname> |
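Note: as a rough sketch of what a per-host nagios check built on this package looks like (the hostname, metric name, and threshold below are placeholders, not part of ganglios):

    #!/usr/bin/python -tt
    # Minimal single-host check sketch using the ganglios module (method (b) in ganglios.py).
    # 'examplehost.lindenlab.com', 'load_one' and the 10.0 threshold are placeholders.
    import sys
    import ganglios.ganglios as ganglios

    def main():
        host = 'examplehost.lindenlab.com'
        value = ganglios.get_metric_for_host(host, 'load_one')
        if value is None:
            sys.stdout.write('UNKNOWN: metric not found for %s' % host)
            ganglios.done(3)
        if float(value) > 10.0:
            sys.stdout.write('CRITICAL: load_one on %s is %s' % (host, value))
            ganglios.done(2)
        sys.stdout.write('OK: load_one on %s is %s' % (host, value))
        ganglios.done(0)

    if __name__ == '__main__':
        main()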
Index: trunk/debs/ganglios/src/check_ganglios_memory_v2 |
— | — | @@ -0,0 +1,95 @@ |
| 2 | +#!/usr/bin/python -tt |
| 3 | +# |
| 4 | +# Copyright (C) 2009 Linden Lab |
| 5 | +# |
| 6 | +# Released under the GPL v2 or later. For a full description of the license, |
| 7 | +# please visit http://www.gnu.org/licenses/gpl-2.0.html |
| 8 | +# |
| 9 | + |
| 10 | +""" |
| 11 | +@file check_ganglios_memory_v2 |
| 12 | +@brief a ganglios module that checks memory used for a host |
| 13 | + |
| 14 | +Copyright (c) 2008, Linden Research, Inc. |
| 15 | +$License$ |
| 16 | +""" |
| 17 | + |
| 18 | +import sys |
| 19 | +import ganglios.ganglios as ganglios |
| 20 | +import operator |
| 21 | +from optparse import OptionParser |
| 22 | + |
| 23 | +__revision__ = 1; |
| 24 | + |
| 25 | +def main(): |
| 26 | + """ |
| 27 | + Check that $host in the ganglia cache uses less than $limit memory. |
| 28 | + This check is interesting because memory used is not reported by ganglia. |
| 29 | + Only total and memory free etc. are reported. This module will do the |
| 30 | + right math to calculate memory used. |
| 31 | + """ |
| 32 | + |
| 33 | + parser = OptionParser() |
| 34 | + |
| 35 | + parser.add_option('-H', dest='hostname', help='hostname') |
| 36 | + parser.add_option('-u', '--used', action='store_true', dest='used', default=False, help='check memory used is less than threshold (default)') |
| 37 | + parser.add_option('-f', '--avail', action='store_true', dest='avail', default=False, help='check memory available is greater than threshold') |
| 38 | + parser.add_option('-w', dest='warning', help='warning threshold for the metric') |
| 39 | + parser.add_option('-c', dest='critical', help='critical threshold for the metric') |
| 40 | + |
| 41 | + options, args = parser.parse_args() |
| 42 | + host_name = options.hostname |
| 43 | + warn_value = float(options.warning) |
| 44 | + crit_value = float(options.critical) |
| 45 | + if( options.used and options.avail ): |
| 46 | + # error - both used and avail were set |
| 47 | + parser.error("options --used and --avail are mutually exclusive") |
| 48 | + |
| 49 | + mem_total_raw = ganglios.get_metric_for_host(host_name, 'mem_total') |
| 50 | + mem_shared_raw = ganglios.get_metric_for_host(host_name, 'mem_shared') |
| 51 | + mem_free_raw = ganglios.get_metric_for_host(host_name, 'mem_free') |
| 52 | + mem_cached_raw = ganglios.get_metric_for_host(host_name, 'mem_cached') |
| 53 | + mem_buffers_raw = ganglios.get_metric_for_host(host_name, 'mem_buffers') |
| 54 | + |
| 55 | + try: |
| 56 | + mem_total = float(mem_total_raw) |
| 57 | + mem_shared = float(mem_shared_raw) |
| 58 | + mem_free = float(mem_free_raw) |
| 59 | + mem_cached = float(mem_cached_raw) |
| 60 | + mem_buffers = float(mem_buffers_raw) |
| 61 | + except TypeError, e: |
| 62 | + # ganglios didn't return a number - probably None |
| 63 | + status = 2 |
| 64 | + output = "CRITICAL - check failed (returned '%s' when casting '%s, %s, %s, %s, %s')" % (e, mem_total_raw, mem_shared_raw, mem_free_raw, mem_cached_raw, mem_buffers_raw) |
| 65 | + sys.stdout.write(output) |
| 66 | + ganglios.done(2) |
| 67 | + |
| 68 | + if( options.avail ): |
| 69 | + # testing against available memory |
| 70 | + mem = mem_shared + mem_free + mem_cached + mem_buffers |
| 71 | + metric = 'mem_avail' |
| 72 | + op = 'lt' |
| 73 | + else: |
| 74 | + # testing against used memory. |
| 75 | + mem = mem_total - mem_shared - mem_free - mem_cached - mem_buffers |
| 76 | + metric = 'mem_used' |
| 77 | + op = 'gt' |
| 78 | + |
| 79 | + |
| 80 | + # compare the computed value against both thresholds with the chosen operator |
| 81 | + if getattr(operator, op)(mem, crit_value): |
| 82 | + status = 2 |
| 83 | + output = "CRITICAL: %s is %s (%s %s)" % (metric, mem, op, crit_value) |
| 84 | + elif getattr(operator, op)(mem, warn_value): |
| 85 | + status = 1 |
| 86 | + output = "WARN: %s is %s (%s %s)" % (metric, mem, op, warn_value) |
| 87 | + else: |
| 88 | + status = 0 |
| 89 | + output = "OK: %s is %s" % (metric, mem) |
| 90 | + |
| 91 | + sys.stdout.write(output) |
| 92 | + ganglios.done(status) |
| 93 | + |
| 94 | +if __name__ == "__main__": |
| 95 | + main() |
| 96 | + |
Property changes on: trunk/debs/ganglios/src/check_ganglios_memory_v2 |
___________________________________________________________________ |
Added: svn:executable |
1 | 97 | + * |
Index: trunk/debs/ganglios/src/check_ganglios_diskio |
— | — | @@ -0,0 +1,59 @@ |
| 2 | +#!/usr/bin/python -tt |
| 3 | +# |
| 4 | +# Copyright (C) 2009 Linden Lab |
| 5 | +# |
| 6 | +# Released under the GPL v2 or later. For a full description of the license, |
| 7 | +# please visit http://www.gnu.org/licenses/gpl-2.0.html |
| 8 | +# |
| 9 | + |
| 10 | +""" |
| 11 | +@file check_ganglios_diskio |
| 12 | +@brief look in ganglia for heavy disk-io |
| 13 | + |
| 14 | +Copyright (c) 2007, Linden Research, Inc. |
| 15 | +$License$ |
| 16 | +""" |
| 17 | + |
| 18 | +import sys |
| 19 | +import ganglios.ganglios as ganglios |
| 20 | + |
| 21 | +__revision__ = '0' |
| 22 | + |
| 23 | + |
| 24 | +def main (): |
| 25 | + """ |
| 26 | + checks all hosts for the 'disk_io' metric. |
| 27 | + Usage: |
| 28 | + check_ganglios_diskio threshold |
| 29 | + threshold should be an integer above which hosts will trigger the alert |
| 30 | + """ |
| 31 | + report = {} |
| 32 | + |
| 33 | + cutoff = int (sys.argv[ 1 ]) |
| 34 | + |
| 35 | + def thunk (host, metric, value): |
| 36 | + """ callback """ |
| 37 | + num = int (float (value)) |
| 38 | + if (num > cutoff): |
| 39 | + report[ host.replace ('.lindenlab.com', '') ] = str (num) |
| 40 | + |
| 41 | + status = 0 # OK |
| 42 | + ganglios.parse_ganglia (['disk_io'], thunk) |
| 43 | + |
| 44 | + hosts = report.keys () |
| 45 | + hosts.sort () |
| 46 | + if len (hosts) > 0: |
| 47 | + sys.stdout.write ('<b>disk-IO</b>:') |
| 48 | + for host in hosts: |
| 49 | + if not host.startswith ('sim'): |
| 50 | + sys.stdout.write ('%s:%s ' % (host, report[ host ])) |
| 51 | + for host in hosts: |
| 52 | + if host.startswith ('sim'): |
| 53 | + sys.stdout.write ('%s:%s ' % (host, report[ host ])) |
| 54 | + status = 1 # warn |
| 55 | + |
| 56 | + ganglios.done (status) |
| 57 | + |
| 58 | + |
| 59 | +if __name__ == "__main__": |
| 60 | + main () |
Property changes on: trunk/debs/ganglios/src/check_ganglios_diskio |
___________________________________________________________________ |
Added: svn:executable |
1 | 61 | + * |
Index: trunk/debs/ganglios/src/ganglia/ganglia_parser.log |
— | — | @@ -0,0 +1,6 @@ |
| 2 | +2011-10-03 23:01:28,936 CRITICAL Starting ganglia_parser |
| 3 | +2011-10-03 23:01:28,962 CRITICAL Failed to rename /tmp/tmp8TIOms to ./ganglia/xmlcache/spence.xml |
| 4 | +2011-10-03 23:01:28,973 CRITICAL Finished ganglia_parser |
| 5 | +2011-10-03 23:02:42,089 CRITICAL Starting ganglia_parser |
| 6 | +2011-10-03 23:02:42,106 CRITICAL Failed to rename /tmp/tmpTSpsNG to ./ganglia/xmlcache/spence.xml |
| 7 | +2011-10-03 23:02:42,107 CRITICAL Finished ganglia_parser |
Index: trunk/debs/ganglios/src/ganglios/__init__.py |
— | — | @@ -0,0 +1,8 @@ |
| 2 | +"""\ |
| 3 | +@file __init__.py |
| 4 | +@brief Initialization file for the ganglios module. |
| 5 | + |
| 6 | +Copyright (c) 2006-2007, Linden Research, Inc. |
| 7 | +License to use, modify, and distribute under the GPLv2 or later |
| 8 | +http://www.gnu.org/licenses/gpl-2.0.html |
| 9 | +""" |
Property changes on: trunk/debs/ganglios/src/ganglios/__init__.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 10 | + native |
Index: trunk/debs/ganglios/src/ganglios/ganglios.py |
— | — | @@ -0,0 +1,160 @@ |
| 2 | + |
| 3 | +""" |
| 4 | +@file ganglios.py |
| 5 | +@brief utilities to make ganglios checks (nagios plugins) less ugly |
| 6 | + |
| 7 | +Copyright (c) 2007, Linden Research, Inc. |
| 8 | +License to use, modify, and distribute under the GPLv2 or later |
| 9 | +http://www.gnu.org/licenses/gpl-2.0.html |
| 10 | + |
| 11 | +There are two ways to use this module. |
| 12 | +a) examine *all* hosts for a given metric and analyze them using a callback function |
| 13 | +b) examine a *single* host for a specific metric and return that metric |
| 14 | +(a) is appropriate if you want to be notified if any host crosses a certain |
| 15 | + threshold. It is more efficient than creating a nagios check for every host on |
| 16 | + that metric. An example would be checking to see if any host has >95% disk |
| 17 | + space used. To set this up, create a 'ganglia' host in the nagios config and |
| 18 | + put the service checks under that host. Note that you won't be able to resolve |
| 19 | + the nagios alert until *all* hosts are back below the threshold. This can lead |
| 20 | + to alerts just hanging out on your nagios page for too long. |
| 21 | +(b) is appropriate to check a specific host for a condition. If you have 1000 |
| 22 | + hosts, of which 20 are web servers, you should use this method to test your web |
| 23 | + servers for their specific conditions, avoiding wasted cycles on the other 980 hosts. |
| 24 | +""" |
| 25 | + |
| 26 | +import os |
| 27 | +import sys |
| 28 | +import time |
| 29 | +import stat |
| 30 | +import glob |
| 31 | +import socket |
| 32 | + |
| 33 | +import elementtree.ElementTree as ET |
| 34 | +import xml.parsers.expat as expat |
| 35 | + |
| 36 | +__revision__ = '0' |
| 37 | + |
| 38 | +_cachedir = '/var/lib/ganglia/xmlcache/' |
| 39 | +_stale_time = 300 |
| 40 | +_hostdir = os.path.join(_cachedir, 'hosts') |
| 41 | + |
| 42 | +def parse_ganglia (metrics, thunk): |
| 43 | + """ |
| 44 | + metrics is a list of strings. |
| 45 | + thunk is a callback. |
| 46 | + |
| 47 | + This parses the xml files in _cachedir (/var/lib/ganglia/xmlcache/) and calls thunk |
| 48 | + every time a METRIC with NAME in metrics is seen. Use this function for |
| 49 | + method (a) above |
| 50 | + |
| 51 | + thunk should take 3 arguments: (host-name, metric-name, value) |
| 52 | + """ |
| 53 | + status = 0 # ok |
| 54 | + bad = [] |
| 55 | + |
| 56 | + def go_bad (xml_file, bad): |
| 57 | + """ record this host as stale or unparseable so it is reported as bad """ |
| 58 | + bad_host = xml_file.replace ('.xml', '') |
| 59 | + if not bad_host in bad: |
| 60 | + bad += [bad_host] |
| 61 | + |
| 62 | + try: |
| 63 | + os.mkdir(_cachedir) |
| 64 | + except OSError: |
| 65 | + pass # the cache directory probably already exists |
| 66 | + |
| 67 | + for xml_file in os.listdir(_cachedir): |
| 68 | + filename = _cachedir+xml_file |
| 69 | + if xml_file.endswith ('.xml'): |
| 70 | + # make sure the data is fresh |
| 71 | + mod_time = os.stat (filename)[stat.ST_MTIME] |
| 72 | + if time.time () - mod_time > _stale_time: |
| 73 | + go_bad (xml_file, bad) |
| 74 | + status = 2 |
| 75 | + # read the xml file, look for certain metrics |
| 76 | + f_hndl = open (filename) |
| 77 | + try: |
| 78 | + tree = ET.parse (f_hndl) |
| 79 | + ganglia_xml = tree.getroot() |
| 80 | + for cluster in ganglia_xml.getchildren (): |
| 81 | + for host in cluster.getchildren (): |
| 82 | + for metric in host.getchildren (): |
| 83 | + # found a metric we care about. |
| 84 | + if metric.get ('NAME') in metrics: |
| 85 | + thunk (host.get ('NAME'), |
| 86 | + metric.get ('NAME'), |
| 87 | + metric.get ('VAL')) |
| 88 | + except expat.ExpatError: |
| 89 | + go_bad (xml_file, bad) |
| 90 | + status = 2 |
| 91 | + f_hndl.close () |
| 92 | + if len (bad) > 0: |
| 93 | + if status == 0: |
| 94 | + status = 2 # critical |
| 95 | + sys.stdout.write ('<b>STALE</b>:') |
| 96 | + for bad_host in bad: |
| 97 | + sys.stdout.write (bad_host + ' ') |
| 98 | + return status |
| 99 | + |
| 100 | +def get_metric_for_host(hostname, metricname): |
| 101 | + """ |
| 102 | + using the new-style (one file per host), this |
| 103 | + takes a hostname, looks up the metric, and returns its value |
| 104 | + This is method (b) above |
| 105 | + """ |
| 106 | + |
| 107 | + # first, find the canonical name for the host passed in |
| 108 | + # i.e. translate inv5-mysql.agni.lindenlab.com to db1c3.lindenlab.com |
| 109 | + try: |
| 110 | + new_hostname = socket.gethostbyaddr(hostname)[0] |
| 111 | + except (socket.gaierror, socket.herror): |
| 112 | + # name not found. it will probably fail anyway... but pass it through just in case |
| 113 | + new_hostname = hostname |
| 114 | + hostname = new_hostname |
| 115 | + |
| 116 | + #strip off leading int., eth0., etc. |
| 117 | + split_name = hostname.split('.') |
| 118 | + if(split_name[0] in ('int', 'eth0', 'eth1', 'tunnel0', 'tunnel1')): |
| 119 | + hostname = '.'.join(split_name[1:]) |
| 120 | + |
| 121 | + # find the one file that best matches what came in. checks for names like |
| 122 | + # int.$name and $name and complains if it doesn't find a unique match. |
| 123 | + filelist = glob.glob(os.path.join(_hostdir, "*.%s" % hostname)) |
| 124 | + if len(filelist) == 0: |
| 125 | + filelist = glob.glob(os.path.join(_hostdir, "%s" % hostname)) |
| 126 | +### |
| 127 | +### for the VPNs, it's a valid state that there exist >1 files for each vpn |
| 128 | +### (a tunnel address and a private interface). What's the right action to take |
| 129 | +### here? not sure... -green 2008-05-28 |
| 130 | +### |
| 131 | +# if len(filelist) != 1: |
| 132 | +# sys.stdout.write("not exactly one match for '%s' (found %s)" % (hostname, len(filelist))) |
| 133 | +# done(2) |
| 134 | + |
| 135 | + filename = filelist[0] # there can be only one |
| 136 | + # make sure it's not old data |
| 137 | + mod_time = os.stat(filename)[stat.ST_MTIME] |
| 138 | + if time.time () - mod_time > _stale_time: # seconds |
| 139 | + sys.stdout.write('STALE') |
| 140 | + done(2) |
| 141 | + |
| 142 | + # read the xml file, look for certain metrics |
| 143 | + f_hndl = open(filename) |
| 144 | + try: |
| 145 | + tree = ET.parse (f_hndl) |
| 146 | + host = tree.getroot() |
| 147 | + for metric in host.getchildren (): |
| 148 | + # found a metric we care about. |
| 149 | + if metric.get ('NAME') == metricname: |
| 150 | + return metric.get('VAL') |
| 151 | + except expat.ExpatError: |
| 152 | + sys.stdout.write("XML parse error") |
| 153 | + done(2) |
| 154 | + f_hndl.close() |
| 155 | + |
| 156 | + |
| 157 | + |
| 158 | +def done (status): |
| 159 | + """ print newline if needed, exit with status """ |
| 160 | + print '' |
| 161 | + sys.exit (status) |
Property changes on: trunk/debs/ganglios/src/ganglios/ganglios.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 162 | + native |
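As a hedged illustration of method (a) from the module docstring above (scan every host for one metric via a callback), modeled on check_ganglios_diskio; the metric name and cutoff are illustrative:

    #!/usr/bin/python -tt
    # Sketch of method (a): parse_ganglia() calls the callback once per
    # (host, metric) pair found in the cache; collect offenders and report.
    import sys
    import ganglios.ganglios as ganglios

    def main():
        cutoff = 95.0          # illustrative threshold
        offenders = {}

        def thunk(host, metric, value):
            """ callback: remember hosts whose disk_usage exceeds the cutoff """
            if float(value) > cutoff:
                offenders[host] = value

        status = ganglios.parse_ganglia(['disk_usage'], thunk)  # returns 2 if any cache file is stale
        if offenders:
            sys.stdout.write('<b>DISK</b>: ')
            for host in sorted(offenders):
                sys.stdout.write('%s:%s ' % (host, offenders[host]))
            if status == 0:
                status = 1     # warn
        ganglios.done(status)

    if __name__ == '__main__':
        main()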
Index: trunk/debs/ganglios/src/check_ganglios_generic_value |
— | — | @@ -0,0 +1,93 @@ |
| 2 | +#!/usr/bin/python -tt |
| 3 | +# |
| 4 | +# Copyright (C) 2009 Linden Lab |
| 5 | +# |
| 6 | +# Released under the GPL v2 or later. For a full description of the license, |
| 7 | +# please visit http://www.gnu.org/licenses/gpl-2.0.html |
| 8 | +# |
| 9 | + |
| 10 | +""" |
| 11 | +@file check_ganglios_generic_value |
| 12 | +@brief a ganglios module that checks a given metric against a reference value |
| 13 | + |
| 14 | +Copyright (c) 2008, Linden Research, Inc. |
| 15 | +$License$ |
| 16 | +""" |
| 17 | + |
| 18 | +import sys |
| 19 | +import ganglios.ganglios as ganglios |
| 20 | +import operator |
| 21 | +from optparse import OptionParser |
| 22 | + |
| 23 | +__revision__ = 1; |
| 24 | + |
| 25 | +def main(): |
| 26 | + """Check that $metric for $host in the ganglia cache satisfies $op against $value |
| 27 | +example: |
| 28 | +check_ganglios_generic_value -H alan.lindenlab.com -m disk_usage -o gt -w 80 -c 90 |
| 29 | + this will return CRIT if alan's disk_usage goes above 90 (and WARN above 80) |
| 30 | +possible ops: |
| 31 | + gt ge lt le eq ne |
| 32 | +each means "alert if $cur_value $op $threshold" |
| 33 | +so passing '-o gt -c 10' means CRIT if the host's value is greater than 10 |
| 34 | + |
| 35 | + |
| 36 | + """ |
| 37 | + # actually, any valid function supplied by the 'operator' module that returns a |
| 38 | + # boolean will probably work, but the above functions are what I expect to use |
| 39 | + |
| 40 | + parser = OptionParser() |
| 41 | + |
| 42 | + parser.add_option('-H', dest='hostname', help='hostname') |
| 43 | + parser.add_option('-m', dest='metric', help='metric to check') |
| 44 | + parser.add_option('-o', dest='op', help='valid operators are: gt ge lt le eq ne') |
| 45 | + parser.add_option('-w', dest='warning', help='warning threshold for the metric') |
| 46 | + parser.add_option('-c', dest='critical', help='critical threshold for the metric') |
| 47 | + parser.add_option('-g', dest='getmetric', action="store_true", help='only return the metric, don\'t do any comparisons') |
| 48 | + |
| 49 | + options, args = parser.parse_args() |
| 50 | + host_name = options.hostname |
| 51 | + metric = options.metric |
| 52 | + getmetric = options.getmetric |
| 53 | + |
| 54 | + cur_val_raw = ganglios.get_metric_for_host(host_name, metric) |
| 55 | + try: |
| 56 | + cur_val = float(cur_val_raw) |
| 57 | + except TypeError, e: |
| 58 | + # ganglios didn't return a number - probably None |
| 59 | + if (cur_val_raw is None): |
| 60 | + output = "UNKNOWN - check failed, metric not found" |
| 61 | + else: |
| 62 | + output = "UNKNOWN - check failed (returned '%s' when casting '%s')" % (e, cur_val_raw) |
| 63 | + status = 3 |
| 64 | + sys.stdout.write(output) |
| 65 | + ganglios.done(status) |
| 66 | + |
| 67 | + if(getmetric): |
| 68 | + # this feature was added for scripts that want the value for their own processing |
| 69 | + output = "%s" % cur_val |
| 70 | + sys.stdout.write(output) |
| 71 | + status = 0 |
| 72 | + ganglios.done(status) |
| 73 | + |
| 74 | + # if called with -g, these options are optional; they are read here, after the -g early exit, so casting an empty option doesn't raise an exception |
| 75 | + op = options.op |
| 76 | + warn_value = float(options.warning) |
| 77 | + crit_value = float(options.critical) |
| 78 | + |
| 79 | + if getattr(operator, op)(cur_val, crit_value): |
| 80 | + status = 2 |
| 81 | + output = "CRITICAL: %s is %s (%s %s)" % (metric, cur_val, op, crit_value) |
| 82 | + elif getattr(operator, op)(cur_val, warn_value): |
| 83 | + status = 1 |
| 84 | + output = "WARN: %s is %s (%s %s)" % (metric, cur_val, op, warn_value) |
| 85 | + else: |
| 86 | + status = 0 |
| 87 | + output = "OK: %s is %s" % (metric, cur_val) |
| 88 | + |
| 89 | + sys.stdout.write(output) |
| 90 | + ganglios.done(status) |
| 91 | + |
| 92 | +if __name__ == "__main__": |
| 93 | + main() |
| 94 | + |
Property changes on: trunk/debs/ganglios/src/check_ganglios_generic_value |
___________________________________________________________________ |
Added: svn:executable |
1 | 95 | + * |
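The generic check above dispatches the -o flag through Python's standard operator module; a quick sketch of that pattern (the values are made up):

    import operator

    # The -o option names a comparison function in the operator module;
    # getattr turns the name into the callable itself.
    op = 'gt'                          # e.g. from '-o gt'
    compare = getattr(operator, op)    # operator.gt
    print compare(95.0, 90.0)          # True  -> critical threshold crossed
    print compare(85.0, 90.0)          # False -> within bounds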
Index: trunk/debs/ganglios/src/check_ganglios_disk |
— | — | @@ -0,0 +1,149 @@ |
| 2 | +#!/usr/bin/python -tt |
| 3 | +# |
| 4 | +# Copyright (C) 2009 Linden Lab |
| 5 | +# |
| 6 | +# Released under the GPL v2 or later. For a full description of the license, |
| 7 | +# please visit http://www.gnu.org/licenses/gpl-2.0.html |
| 8 | +# |
| 9 | + |
| 10 | +""" |
| 11 | +@file check_ganglios_disk |
| 12 | +@brief look for over-full disks |
| 13 | + |
| 14 | +Copyright (c) 2007, Linden Research, Inc. |
| 15 | +$License$ |
| 16 | +$Id: check_ganglios_disk,v 1.12 2008/03/05 21:35:51 benoc Exp $ |
| 17 | +""" |
| 18 | + |
| 19 | +import sys |
| 20 | +import ganglios.ganglios as ganglios |
| 21 | +import socket |
| 22 | + |
| 23 | +__revision__ = '0' |
| 24 | + |
| 25 | + |
| 26 | +def main (): |
| 27 | + """ |
| 28 | + This check examines the 'disk_usage' ganglia metric on all hosts. |
| 29 | + |
| 30 | + Usage: |
| 31 | + check_ganglios_disk [warn|critical] threshold [include|exclude host list] |
| 32 | + threshold should be a number above which the alert will trigger |
| 33 | + include host list - only check the hosts listed (DEPRECATED - do not use) |
| 34 | + exclude host list - check all hosts *except* those listed. Useful if some unimportant hosts are permanently in the triggered state. |
| 35 | + """ |
| 36 | + if sys.argv[ 1 ] == 'warn': |
| 37 | + err_status = 1 |
| 38 | + elif sys.argv[ 1 ] == 'critical': |
| 39 | + err_status = 2 |
| 40 | + else: |
| 41 | + print 'bad arguments' |
| 42 | + sys.exit (2) |
| 43 | + |
| 44 | + cutoff = int (sys.argv[ 2 ]) |
| 45 | + try: |
| 46 | + inexcl_flag = sys.argv[ 3 ] |
| 47 | + except: |
| 48 | + inexcl_flag = '' |
| 49 | + |
| 50 | + # i want to be able to list cnames in the nagios config. turn |
| 51 | + # each alias into the real hostname (ganglios always reports by |
| 52 | + # the real host name) for the purpose of matching, but say |
| 53 | + # the alias in the output of this plugin. |
| 54 | + incl_excl_hosts = {} |
| 55 | + |
| 56 | + for host in sys.argv[ 4: ]: |
| 57 | + if ( not host.endswith('.com') and not host.endswith('.net')): |
| 58 | + host += '.lindenlab.com' |
| 59 | + |
| 60 | + if host.find ('%d') >= 0: |
| 61 | + # a %d in the hostname means to include all hosts with |
| 62 | + # the name and %d expanded to a number. pull the |
| 63 | + # actual names from DNS until we stop seeing them. |
| 64 | + i = 0 |
| 65 | + missed = 0 |
| 66 | + while True: |
| 67 | + try: |
| 68 | + invdb_name = host % i |
| 69 | + invdb_cname = socket.gethostbyname_ex (invdb_name) |
| 70 | + incl_excl_hosts[ invdb_cname[ 0 ] ] = invdb_name |
| 71 | + incl_excl_hosts[ "int." + invdb_cname[ 0 ] ] = "int." + invdb_name |
| 72 | + incl_excl_hosts[ "eth0-0." + invdb_cname[ 0 ] ] = "eth0-0." + invdb_name |
| 73 | + incl_excl_hosts[ "eth1-0." + invdb_cname[ 0 ] ] = "eth1-0." + invdb_name |
| 74 | + incl_excl_hosts[ "tunnel0." + invdb_cname[ 0 ] ] = "tunnel0." + invdb_name |
| 75 | + i += 1 |
| 76 | + except socket.gaierror: |
| 77 | + missed += 1 |
| 78 | + i += 1 |
| 79 | + if missed > 4: |
| 80 | + break |
| 81 | + else: |
| 82 | + # this is just a hostname (no expansion). figure out its |
| 83 | + # real hostname so we can spot its data in the ganglia data |
| 84 | + try: |
| 85 | + (name, aliaslist, addresslist) = socket.gethostbyname_ex (host) |
| 86 | + except socket.gaierror: |
| 87 | + sys.stdout.write ("CRITICAL: " + host + " does not resolve (fix the nagios config)") |
| 88 | + ganglios.done (2) |
| 89 | + incl_excl_hosts[ name ] = host |
| 90 | + |
| 91 | + # incl_excl_hosts now contains a dictionary of {cname => alias} eg. {db1.lindenlab.com => mysql.agni.lindenlab.com} |
| 92 | + |
| 93 | + ##### |
| 94 | + # for name in incl_excl_hosts.keys (): |
| 95 | + # print '%s -> %s' % (name, incl_excl_hosts[ name ]) |
| 96 | + # sys.exit (0) |
| 97 | + ##### |
| 98 | + |
| 99 | + report = {} |
| 100 | + hostmetrics = {} |
| 101 | + |
| 102 | + def build_metrics_include( host, metric, value): |
| 103 | + ''' callback from ganglios.parse_ganglia ''' |
| 104 | + if host in incl_excl_hosts: |
| 105 | + hostmetrics[ incl_excl_hosts[host] ] = int(value) |
| 106 | + |
| 107 | + def build_metrics_exclude( host, metric, value): |
| 108 | + ''' callback from ganglios.parse_ganglia ''' |
| 109 | + if not host in incl_excl_hosts: |
| 110 | + hostmetrics[ host ] = int(value) |
| 111 | + |
| 112 | + status = 0 # OK |
| 113 | + if( inexcl_flag == 'include' ): |
| 114 | + build_metrics = build_metrics_include |
| 115 | + else: |
| 116 | + # either exclude or empty |
| 117 | + build_metrics = build_metrics_exclude |
| 118 | + |
| 119 | + # build hostmetrics using build_metrics callback |
| 120 | + ganglios.parse_ganglia (['disk_usage'], build_metrics) |
| 121 | + |
| 122 | + # pull out hosts that have metric > cutoff |
| 123 | + report = dict([ |
| 124 | + (host.replace('.lindenlab.com',''), hostmetrics[host]) |
| 125 | + for host in hostmetrics |
| 126 | + if hostmetrics[host] >= cutoff |
| 127 | + ]) |
| 128 | + |
| 129 | + hosts = report.keys () |
| 130 | + hosts.sort () |
| 131 | + if len (hosts) > 0: |
| 132 | + sys.stdout.write ('<b>DISK</b>:') |
| 133 | + for host in hosts: |
| 134 | + sys.stdout.write ('%s:%s ' % (host, report[ host ])) |
| 135 | + status = err_status # warn or critical |
| 136 | + |
| 137 | + ganglios.done (status) |
| 138 | + |
| 139 | + |
| 140 | +if __name__ == "__main__": |
| 141 | + try: |
| 142 | + main () |
| 143 | + except SystemExit, exp: |
| 144 | + # this exception is raised when the check exits normally |
| 145 | + raise exp |
| 146 | + except: |
| 147 | + # if anything raises an exception in the test, die critical |
| 148 | + sys.stdout.write ("CRITICAL: check raised an exception!") |
| 149 | + ganglios.done (2) |
| 150 | + |
Property changes on: trunk/debs/ganglios/src/check_ganglios_disk |
___________________________________________________________________ |
Added: svn:executable |
1 | 151 | + * |
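check_ganglios_disk expands a '%d' in an include/exclude host argument by resolving successive numbers until a handful of DNS lookups fail; a standalone sketch of that expansion follows (the 'invdb%d.lindenlab.com' pattern is hypothetical):

    import socket

    # Sketch of the '%d' host expansion used above: try pattern % 0, pattern % 1, ...
    # and stop once several numbers have failed to resolve. The real check also
    # registers int./eth0-0./eth1-0./tunnel0. variants of each canonical name.
    def expand_numbered_hosts(pattern, max_misses=4):
        hosts = {}
        i = 0
        missed = 0
        while missed <= max_misses:
            name = pattern % i
            try:
                canonical = socket.gethostbyname_ex(name)[0]
                hosts[canonical] = name      # canonical name -> alias used in output
            except socket.gaierror:
                missed += 1
            i += 1
        return hosts

    # expand_numbered_hosts('invdb%d.lindenlab.com') might yield something like
    # {'db1c3.lindenlab.com': 'invdb0.lindenlab.com', ...}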
Index: trunk/debs/ganglios/src/ganglia_parser.orig |
— | — | @@ -0,0 +1,266 @@ |
| 2 | +#!/usr/bin/python -tt |
| 3 | +# |
| 4 | +# Copyright (C) 2009 Linden Lab |
| 5 | +# |
| 6 | +# Released under the GPL v2 or later. For a full description of the license, |
| 7 | +# please visit http://www.gnu.org/licenses/gpl-2.0.html |
| 8 | +# |
| 9 | + |
| 10 | + |
| 11 | +""" |
| 12 | +Iterates over all the gmond collector nodes and fetches metrics where available. |
| 13 | +Separates metrics into one file per host for easier processing by nagios |
| 14 | +plugins. |
| 15 | +""" |
| 16 | + |
| 17 | +import sys |
| 18 | +import socket |
| 19 | +import os |
| 20 | +import glob |
| 21 | +import tempfile |
| 22 | +import errno |
| 23 | +import re |
| 24 | +import logging |
| 25 | + |
| 26 | +__revision__ = '0' |
| 27 | + |
| 28 | +dataDir = './ganglia/xmlcache' |
| 29 | +logDir = './ganglia' |
| 30 | +logger = logging.getLogger('ganglia_parser') |
| 31 | +hdlr = logging.FileHandler('%s/ganglia_parser.log' % logDir) |
| 32 | +formatter = logging.Formatter('%(asctime)s %(levelname)-8s %(message)s') |
| 33 | +hdlr.setFormatter(formatter) |
| 34 | +logger.addHandler(hdlr) |
| 35 | +logger.setLevel(logging.WARNING) |
| 36 | + |
| 37 | +def unionMetrics( curMetrics, hostFilePath ): |
| 38 | + '''When a single host is present in more than one gmond collector node |
| 39 | + (because of DNS errors or something else), write out the union of metrics |
| 40 | + from each collector node, taking the more recent metric when the same |
| 41 | + metric is present in multiple nodes.''' |
| 42 | + metrics = dict() |
| 43 | + metricsAge = dict() |
| 44 | + # parseMetric grabs the metric name and age |
| 45 | + parseMetric = re.compile('^<METRIC NAME="([^"]*).*TN="([^"]*)') |
| 46 | + |
| 47 | + # populate metrics and metricsAge from the existing file |
| 48 | + oldFileHandle = open(hostFilePath, 'r') |
| 49 | + fileFirstLine = oldFileHandle.readline() |
| 50 | + for line in oldFileHandle: |
| 51 | + regMatch = parseMetric.match(line) |
| 52 | + if regMatch: |
| 53 | + met = regMatch.group(1) |
| 54 | + age = regMatch.group(2) |
| 55 | + metrics[met] = line |
| 56 | + metricsAge[met] = age |
| 57 | + else: |
| 58 | + # the match failed, which will happen on </HOST> but nowhere else? |
| 59 | + # theoretically, more logging could go in here |
| 60 | + pass |
| 61 | + |
| 62 | + # selectively overwrite metrics with stuff from the buf if it is newer |
| 63 | + bufFirstLine = curMetrics[0] |
| 64 | + for line in curMetrics[1:-1]: |
| 65 | + regMatch = parseMetric.match(line) |
| 66 | + if regMatch: |
| 67 | + met = regMatch.group(1) |
| 68 | + # age represents seconds since the metric was reported. larger == older |
| 69 | + age = regMatch.group(2) |
| 70 | + if metrics.has_key(met): |
| 71 | + try: |
| 72 | + if float(age) < float(metricsAge[met]): |
| 73 | + # if this metric exists and is older, replace it |
| 74 | + metrics[met] = line |
| 75 | + except Exception, e: |
| 76 | + logger.warning("exception caught when comparing TN values: %s", e) |
| 77 | + else: |
| 78 | + # if the metric doesn't exist, add it |
| 79 | + metrics[met] = line |
| 80 | + else: |
| 81 | + # the match failed, which shouldn't happen (because first and last lines are cut by the slice) |
| 82 | + logger.warning("regMatch failed unexpectedly. current line: %s" % (line)) |
| 83 | + |
| 84 | + |
| 85 | + # compare first lines to see which to use |
| 86 | + parseHost = re.compile('^<HOST.*TN="([^"]*)') |
| 87 | + fileAge = parseHost.match(fileFirstLine).group(1) |
| 88 | + bufAge = parseHost.match(bufFirstLine).group(1) |
| 89 | + if fileAge < bufAge: |
| 90 | + firstLine = fileFirstLine |
| 91 | + else: |
| 92 | + firstLine = bufFirstLine |
| 93 | + |
| 94 | + # construct array to pass back to the calling function |
| 95 | + newBuf = [] |
| 96 | + newBuf.append(firstLine) |
| 97 | + newBuf.extend(metrics.values()) |
| 98 | + newBuf.append('</HOST>\n') |
| 99 | + |
| 100 | + # return the union metrics buffer |
| 101 | + return newBuf |
| 102 | + |
| 103 | + |
| 104 | +def listXMLSources(): |
| 105 | + """ returns a list of hosts to fetch ganglia stats from (the gmond |
| 106 | + collector nodes). In a small network, this will likely be a single host. |
| 107 | + You could simply hardcode the list if there is no coherent naming scheme. |
| 108 | + In our network, the hosts are named 'nannybot1', 'nannybot2', etc. This |
| 109 | + function polls nannybots of increasing number until it finds 3 that don't |
| 110 | + respond and considers itself done.""" |
| 111 | + i = 0 |
| 112 | + missed = 0 |
| 113 | + nannybots = ['spence', ] |
| 114 | + logger.info("Retrieving list of nannybots...") |
| 115 | +# while True: |
| 116 | +# try: |
| 117 | +# nannybot_addr = socket.gethostbyname('nannybot%d.lindenlab.com' % i) |
| 118 | +# s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) |
| 119 | +# s.settimeout(15.0) |
| 120 | +# s.connect((nannybot_addr, 8649)) |
| 121 | +# s.close() |
| 122 | +# nannybots.append('nannybot%d.lindenlab.com' % i) |
| 123 | +# logger.info('Checking nannybot%d.lindenlab.com: [OK]' % i) |
| 124 | +# missed = 0 |
| 125 | +# except (socket.gaierror): |
| 126 | +# logger.info('Checking nannybot%d.lindenlab.com: [FAILED]' % i) |
| 127 | +# missed += 1 |
| 128 | +# if missed > 3: |
| 129 | +# break |
| 130 | +# except (socket.error, socket.timeout), e: |
| 131 | +# logger.warning('Checking nannybot%d.lindenlab.com: [FAILED] with error %s' % (i, e)) |
| 132 | +# # a connection error (rather than non-existent) should not count towards |
| 133 | +# # the three missing that mark the end of the nannybots. |
| 134 | +# except Exception, e: |
| 135 | +# logger.critical('Caught unexpected exception while checking nannybot%d: %s' % (i, e)) |
| 136 | +# logger.critical('EXITING...') |
| 137 | +# raise e |
| 138 | +# i += 1 |
| 139 | +# |
| 140 | + return nannybots |
| 141 | + |
| 142 | +def storeXMLData(srcHosts, dataDir): |
| 143 | + """ Fetch ganglia xml data from remote hosts and store it locally in $dataDir """ |
| 144 | + |
| 145 | + logger.info("Retrieving ganglia data from selected hosts...") |
| 146 | + for host in srcHosts: |
| 147 | + try: |
| 148 | + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) |
| 149 | + s.settimeout(15.0) |
| 150 | + s.connect((host, 8649)) |
| 151 | + (fdTF,tmpFile) = tempfile.mkstemp() |
| 152 | + fdTmpFile = os.fdopen(fdTF, "w+b") |
| 153 | + while True: |
| 154 | + data = s.recv(1024) |
| 155 | + if len(data) == 0: |
| 156 | + break |
| 157 | + fdTmpFile.write(data) |
| 158 | + s.close() |
| 159 | + fdTmpFile.close() |
| 160 | + |
| 161 | + # hostname is being used as the filename here |
| 162 | + targetfile = '%s/%s.xml' % (dataDir, host) |
| 163 | + try: |
| 164 | + os.rename(tmpFile, targetfile) |
| 165 | + os.chmod(targetfile, 0644) |
| 166 | + except: |
| 167 | + logger.critical("Failed to rename %s to %s" % (tmpFile, targetfile)) |
| 168 | + |
| 169 | + logger.info('parsing %s: [OK]' % host) |
| 170 | + except Exception, e: |
| 171 | + logger.warning('parsing %s: [FAILED] | reason: %s' % (host, e)) |
| 172 | + |
| 173 | +def splitXML(dataDir): |
| 174 | + """ find all .xml files in $dataDir and split them up on a per-host basis. |
| 175 | + Grid and Cluster are discarded. We don't care about the XML structure, so we |
| 176 | + can save CPU cycles by treating the file as plain text. """ |
| 177 | + |
| 178 | + # These patterns don't include whitespace, which might be a problem if |
| 179 | + # someday ganglia starts emitting formatted xml |
| 180 | + hostOpen = re.compile('^<HOST NAME="([^"]*)', re.I) #retrieve hostname |
| 181 | + hostClose = re.compile('^</HOST>$', re.I) |
| 182 | + totalHostCount = 0 |
| 183 | + try: |
| 184 | + os.mkdir("%s/hosts" % dataDir,0755) |
| 185 | + except OSError, e: |
| 186 | + if e.errno != errno.EEXIST: |
| 187 | + logger.critical('Problems creating %s. Aborting data splitting' % dataDir) |
| 188 | + sys.exit(2) |
| 189 | + |
| 190 | + logger.info("Splitting xml data in per-host files...") |
| 191 | + # Sean suggested this could backfire. Can't find a practical case myself |
| 192 | + xmlFiles = glob.glob('%s/*.xml' % dataDir) |
| 193 | + |
| 194 | + # flagDupeHost tells you whether this host has been seen before (in this run) |
| 195 | + # if it has, union the current host file and the stuff read in |
| 196 | + flagDupeHost = False |
| 197 | + |
| 198 | + hostsource = dict() |
| 199 | + for xmlFile in xmlFiles: |
| 200 | + fdXML = open(xmlFile) |
| 201 | + thisHostCount = 0 |
| 202 | + for line in fdXML: |
| 203 | + mHO = hostOpen.match(line) |
| 204 | + if mHO: |
| 205 | + hostname = mHO.group(1) |
| 206 | + if hostsource.has_key(hostname): |
| 207 | + hostsource[hostname].append(xmlFile) |
| 208 | + v = hostsource[hostname] |
| 209 | + flagDupeHost = True |
| 210 | + logger.info("host '%s' seen more than once: %s" % (hostname, v)) |
| 211 | + else: |
| 212 | + hostsource[hostname] = [xmlFile] |
| 213 | + |
| 214 | + hostFile = '%s/hosts/%s' % (dataDir, hostname) |
| 215 | + hostBuf = [line] |
| 216 | + # now we want to continue iterating over the same file |
| 217 | + # until we reach the next close line |
| 218 | + while True: |
| 219 | + line = fdXML.next() |
| 220 | + hostBuf.append(line) |
| 221 | + if hostClose.match(line): |
| 222 | + break |
| 223 | + |
| 224 | + if ( flagDupeHost): |
| 225 | + oldHostBufLen = len(hostBuf) |
| 226 | + try: |
| 227 | + hostBuf = unionMetrics(hostBuf, hostFile) |
| 228 | + except Exception, e: |
| 229 | + hostBuf = [] |
| 230 | + logger.warning("duped host: unionMetrics failed with exception %s" % e) |
| 231 | + newHostBufLen = len(hostBuf) |
| 232 | + logger.debug("duped host: old length: %s, new length: %s" % (oldHostBufLen, newHostBufLen)) |
| 233 | + flagDupeHost = False |
| 234 | + |
| 235 | + if ( len(hostBuf) != 0 ): |
| 236 | + # if hostBuf is empty, don't touch the existing file (so it will go stale rather than writing bad data) |
| 237 | + fdHostFile = open(hostFile, 'w+b') |
| 238 | + fdHostFile.write(''.join(hostBuf)) |
| 239 | + fdHostFile.close() |
| 240 | + else: |
| 241 | + logger.warning("didn't write file %s because hostBuf is empty" % hostFile) |
| 242 | + thisHostCount += 1 |
| 243 | + totalHostCount += 1 |
| 244 | + logger.info("Parsed hosts in source '%s': %s" % (xmlFile, thisHostCount)) |
| 245 | + logger.info("Parsed hosts total: %s" % totalHostCount) |
| 246 | + |
| 247 | +def main(): |
| 248 | + """ main docstring """ |
| 249 | + |
| 250 | + # logging at level crit to give verification that the process is running, but not spew too much. |
| 251 | + # this should probably be changed to logger.info |
| 252 | + logger.critical('Starting ganglia_parser') |
| 253 | + try: |
| 254 | + os.makedirs(dataDir, 02755) |
| 255 | + except OSError, e: |
| 256 | + if e.errno != errno.EEXIST: |
| 257 | + logger.critical('Problems creating %s. Aborting data splitting' % dataDir) |
| 258 | + sys.exit(2) |
| 259 | + |
| 260 | + |
| 261 | + storeXMLData(listXMLSources(), dataDir) |
| 262 | + splitXML(dataDir) |
| 263 | + logger.critical('Finished ganglia_parser') |
| 264 | + |
| 265 | + |
| 266 | +if __name__ == "__main__": |
| 267 | + main() |
Property changes on: trunk/debs/ganglios/src/ganglia_parser.orig |
___________________________________________________________________ |
Added: svn:executable |
1 | 268 | + * |
Index: trunk/debs/ganglios/src/ganglia_parser |
— | — | @@ -0,0 +1,266 @@ |
| 2 | +#!/usr/bin/python -tt |
| 3 | +# |
| 4 | +# Copyright (C) 2009 Linden Lab |
| 5 | +# |
| 6 | +# Released under the GPL v2 or later. For a full description of the license, |
| 7 | +# please visit http://www.gnu.org/licenses/gpl-2.0.html |
| 8 | +# |
| 9 | + |
| 10 | + |
| 11 | +""" |
| 12 | +Iterates over all the gmond collector nodes and fetches metrics where available. |
| 13 | +Separates metrics into one file per host for easier processing by nagios |
| 14 | +plugins. |
| 15 | +""" |
| 16 | + |
| 17 | +import sys |
| 18 | +import socket |
| 19 | +import os |
| 20 | +import glob |
| 21 | +import tempfile |
| 22 | +import errno |
| 23 | +import re |
| 24 | +import logging |
| 25 | + |
| 26 | +__revision__ = '0' |
| 27 | + |
| 28 | +dataDir = '/var/lib/ganglia/xmlcache' |
| 29 | +logDir = '/var/log/ganglia' |
| 30 | +logger = logging.getLogger('ganglia_parser') |
| 31 | +hdlr = logging.FileHandler('%s/ganglia_parser.log' % logDir) |
| 32 | +formatter = logging.Formatter('%(asctime)s %(levelname)-8s %(message)s') |
| 33 | +hdlr.setFormatter(formatter) |
| 34 | +logger.addHandler(hdlr) |
| 35 | +logger.setLevel(logging.WARNING) |
| 36 | + |
| 37 | +def unionMetrics( curMetrics, hostFilePath ): |
| 38 | + '''When a single host is present in more than one gmond collector node |
| 39 | + (because of DNS errors or something else), write out the union of metrics |
| 40 | + from each collector node, taking the more recent metric when the same |
| 41 | + metric is present in multiple nodes.''' |
| 42 | + metrics = dict() |
| 43 | + metricsAge = dict() |
| 44 | + # parseMetric grabs the metric name and age |
| 45 | + parseMetric = re.compile('^<METRIC NAME="([^"]*).*TN="([^"]*)') |
| 46 | + |
| 47 | + # populate metrics and metricsAge from the existing file |
| 48 | + oldFileHandle = open(hostFilePath, 'r') |
| 49 | + fileFirstLine = oldFileHandle.readline() |
| 50 | + for line in oldFileHandle: |
| 51 | + regMatch = parseMetric.match(line) |
| 52 | + if regMatch: |
| 53 | + met = regMatch.group(1) |
| 54 | + age = regMatch.group(2) |
| 55 | + metrics[met] = line |
| 56 | + metricsAge[met] = age |
| 57 | + else: |
| 58 | + # the match failed, which will happen on </HOST> but nowhere else? |
| 59 | + # theoretically, more logging could go in here |
| 60 | + pass |
| 61 | + |
| 62 | + # selectively overwrite metrics with stuff from the buf if it is newer |
| 63 | + bufFirstLine = curMetrics[0] |
| 64 | + for line in curMetrics[1:-1]: |
| 65 | + regMatch = parseMetric.match(line) |
| 66 | + if regMatch: |
| 67 | + met = regMatch.group(1) |
| 68 | + # age represents seconds since the metric was reported. larger == older |
| 69 | + age = regMatch.group(2) |
| 70 | + if metrics.has_key(met): |
| 71 | + try: |
| 72 | + if float(age) < float(metricsAge[met]): |
| 73 | + # if this metric exists and is older, replace it |
| 74 | + metrics[met] = line |
| 75 | + except Exception, e: |
| 76 | + logger.warning("exception caught when comparing TN values: %s", e) |
| 77 | + else: |
| 78 | + # if the metric doesn't exist, add it |
| 79 | + metrics[met] = line |
| 80 | + else: |
| 81 | + # the match failed, which shouldn't happen (because first and last lines are cut by the slice) |
| 82 | + logger.warning("regMatch failed unexpectedly. current line: %s" % (line)) |
| 83 | + |
| 84 | + |
| 85 | + # compare first lines to see which to use |
| 86 | + parseHost = re.compile('^<HOST.*TN="([^"]*)') |
| 87 | + fileAge = parseHost.match(fileFirstLine).group(1) |
| 88 | + bufAge = parseHost.match(bufFirstLine).group(1) |
| 89 | + if float(fileAge) < float(bufAge): |
| 90 | + firstLine = fileFirstLine |
| 91 | + else: |
| 92 | + firstLine = bufFirstLine |
| 93 | + |
| 94 | + # construct array to pass back to the calling function |
| 95 | + newBuf = [] |
| 96 | + newBuf.append(firstLine) |
| 97 | + newBuf.extend(metrics.values()) |
| 98 | + newBuf.append('</HOST>\n') |
| 99 | + |
| 100 | + # return the union metrics buffer |
| 101 | + return newBuf |
| 102 | + |
| 103 | + |
| 104 | +def listXMLSources(): |
| 105 | + """ returns a list of hosts to fetch ganglia stats from (the gmond |
| 106 | + collector nodes). In a small network, this will likely be a single host. |
| 107 | + You could simply hardcode the list if there is no coherent naming scheme. |
| 108 | + In our network, the hosts are named 'nannybot1', 'nannybot2', etc. This |
| 109 | + function polls nannybots of increasing number until it finds 3 that don't |
| 110 | + respond and considers itself done.""" |
| 111 | + i = 0 |
| 112 | + missed = 0 |
| 113 | + nannybots = [] |
| 114 | + logger.info("Retrieving list of nannybots...") |
| 115 | + while True: |
| 116 | + try: |
| 117 | + nannybot_addr = socket.gethostbyname('nannybot%d.lindenlab.com' % i) |
| 118 | + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) |
| 119 | + s.settimeout(15.0) |
| 120 | + s.connect((nannybot_addr, 8649)) |
| 121 | + s.close() |
| 122 | + nannybots.append('nannybot%d.lindenlab.com' % i) |
| 123 | + logger.info('Checking nannybot%d.lindenlab.com: [OK]' % i) |
| 124 | + missed = 0 |
| 125 | + except (socket.gaierror): |
| 126 | + logger.info('Checking nannybot%d.lindenlab.com: [FAILED]' % i) |
| 127 | + missed += 1 |
| 128 | + if missed > 3: |
| 129 | + break |
| 130 | + except (socket.error, socket.timeout), e: |
| 131 | + logger.warning('Checking nannybot%d.lindenlab.com: [FAILED] with error %s' % (i, e)) |
| 132 | + # a connection error (rather than non-existent) should not count towards |
| 133 | + # the three missing that mark the end of the nannybots. |
| 134 | + except Exception, e: |
| 135 | + logger.critical('Caught unexpected exception while checking nannybot%d: %s' % (i, e)) |
| 136 | + logger.critical('EXITING...') |
| 137 | + raise e |
| 138 | + i += 1 |
| 139 | + |
| 140 | + return nannybots |
| 141 | + |
| 142 | +def storeXMLData(srcHosts, dataDir): |
| 143 | + """ Fetch ganglia xml data from remote hosts and store it locally in $dataDir """ |
| 144 | + |
| 145 | + logger.info("Retrieving ganglia data from selected hosts...") |
| 146 | + for host in srcHosts: |
| 147 | + try: |
| 148 | + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) |
| 149 | + s.settimeout(15.0) |
| 150 | + s.connect((host, 8649)) |
| 151 | + (fdTF,tmpFile) = tempfile.mkstemp() |
| 152 | + fdTmpFile = os.fdopen(fdTF, "w+b") |
| 153 | + while True: |
| 154 | + data = s.recv(1024) |
| 155 | + if len(data) == 0: |
| 156 | + break |
| 157 | + fdTmpFile.write(data) |
| 158 | + s.close() |
| 159 | + fdTmpFile.close() |
| 160 | + |
| 161 | + # hostname is being used as the filename here |
| 162 | + targetfile = '%s/%s.xml' % (dataDir, host) |
| 163 | + try: |
| 164 | + os.rename(tmpFile, targetfile) |
| 165 | + os.chmod(targetfile, 0644) |
| 166 | + except: |
| 167 | + logger.critical("Failed to rename %s to %s" % (tmpFile, targetfile)) |
| 168 | + |
| 169 | + logger.info('parsing %s: [OK]' % host) |
| 170 | + except Exception, e: |
| 171 | + logger.warning('parsing %s: [FAILED] | reason: %s' % (host, e)) |
| 172 | + |
| 173 | +def splitXML(dataDir): |
| 174 | + """ find all .xml files in $dataDir and split them up on a per-host basis. |
| 175 | + Grid and Cluster are discarded. We dont care about the xml data, so we can |
| 176 | + save CPU cycle treating the file as txt one. """ |
| 177 | + |
| 178 | + # These patterns don't include whitespace, which might be a problem if |
| 179 | + # someday ganglia starts emitting formatted xml |
| 180 | + hostOpen = re.compile('^<HOST NAME="([^"]*)', re.I) #retrieve hostname |
| 181 | + hostClose = re.compile('^</HOST>$', re.I) |
| 182 | + totalHostCount = 0 |
| 183 | + try: |
| 184 | + os.mkdir("%s/hosts" % dataDir,0755) |
| 185 | + except OSError, e: |
| 186 | + if e.errno != errno.EEXIST: |
| 187 | + logger.critical('Problems creating %s. Aborting data splitting' % dataDir) |
| 188 | + sys.exit(2) |
| 189 | + |
| 190 | + logger.info("Splitting xml data in per-host files...") |
| 191 | + # Sean suggested this could backfire. Can't find a practical case myself |
| 192 | + xmlFiles = glob.glob('%s/*.xml' % dataDir) |
| 193 | + |
| 194 | + # flagDupeHost tells you whether this host has been seen before (in this run) |
| 195 | + # if it has, union the current host file and the stuff read in |
| 196 | + flagDupeHost = False |
| 197 | + |
| 198 | + hostsource = dict() |
| 199 | + for xmlFile in xmlFiles: |
| 200 | + fdXML = open(xmlFile) |
| 201 | + thisHostCount = 0 |
| 202 | + for line in fdXML: |
| 203 | + mHO = hostOpen.match(line) |
| 204 | + if mHO: |
| 205 | + hostname = mHO.group(1) |
| 206 | + if hostsource.has_key(hostname): |
| 207 | + hostsource[hostname].append(xmlFile) |
| 208 | + v = hostsource[hostname] |
| 209 | + flagDupeHost = True |
| 210 | + logger.info("host '%s' seen more than once: %s" % (hostname, v)) |
| 211 | + else: |
| 212 | + hostsource[hostname] = [xmlFile] |
| 213 | + |
| 214 | + hostFile = '%s/hosts/%s' % (dataDir, hostname) |
| 215 | + hostBuf = [line] |
| 216 | + # now we want to continue iterating over the same file |
| 217 | + # until we reach the next close line |
| 218 | + while True: |
| 219 | + line = fdXML.next() |
| 220 | + hostBuf.append(line) |
| 221 | + if hostClose.match(line): |
| 222 | + break |
| 223 | + |
| 224 | + if ( flagDupeHost): |
| 225 | + oldHostBufLen = len(hostBuf) |
| 226 | + try: |
| 227 | + hostBuf = unionMetrics(hostBuf, hostFile) |
| 228 | + except Exception, e: |
| 229 | + hostBuf = [] |
| 230 | + logger.warning("duped host: unionMetrics failed with exception %s" % e) |
| 231 | + newHostBufLen = len(hostBuf) |
| 232 | + logger.debug("duped host: old length: %s, new length: %s" % (oldHostBufLen, newHostBufLen)) |
| 233 | + flagDupeHost = False |
| 234 | + |
| 235 | + if ( len(hostBuf) != 0 ): |
| 236 | + # if hostBuf is empty, don't touch the existing file (so it will go stale rather than writing bad data) |
| 237 | + fdHostFile = open(hostFile, 'w+b') |
| 238 | + fdHostFile.write(''.join(hostBuf)) |
| 239 | + fdHostFile.close() |
| 240 | + else: |
| 241 | + logger.warning("didn't write file %s because hostBuf is empty" % hostFile) |
| 242 | + thisHostCount += 1 |
| 243 | + totalHostCount += 1 |
| 244 | + logger.info("Parsed hosts in source '%s': %s" % (xmlFile, thisHostCount)) |
| 245 | + logger.info("Parsed hosts total: %s" % totalHostCount) |
| 246 | + |
| 247 | +def main(): |
| 248 | + """ main docstring """ |
| 249 | + |
| 250 | + # logging at level crit to give verification that the process is running, but not spew too much. |
| 251 | + # this should probably be changed to logger.info |
| 252 | + logger.critical('Starting ganglia_parser') |
| 253 | + try: |
| 254 | + os.makedirs(dataDir, 02755) |
| 255 | + except OSError, e: |
| 256 | + if e.errno != errno.EEXIST: |
| 257 | + logger.critical('Problems creating %s. Aborting data splitting' % dataDir) |
| 258 | + sys.exit(2) |
| 259 | + |
| 260 | + |
| 261 | + storeXMLData(listXMLSources(), dataDir) |
| 262 | + splitXML(dataDir) |
| 263 | + logger.critical('Finished ganglia_parser') |
| 264 | + |
| 265 | + |
| 266 | +if __name__ == "__main__": |
| 267 | + main() |
Property changes on: trunk/debs/ganglios/src/ganglia_parser |
___________________________________________________________________ |
Added: svn:executable |
1 | 268 | + * |
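The unionMetrics() merge rule in ganglia_parser keeps, for each metric that shows up in more than one collector dump, the line whose TN value (seconds since the metric was reported) is smallest. A self-contained sketch of that rule, with made-up METRIC lines:

    import re

    # Same decision rule as unionMetrics(): when two sources report the same
    # metric for one host, keep the fresher line (smaller TN). Lines are made up.
    parse_metric = re.compile('^<METRIC NAME="([^"]*).*TN="([^"]*)')

    lines = [
        '<METRIC NAME="load_one" VAL="0.42" TN="120"/>',   # older report
        '<METRIC NAME="load_one" VAL="0.05" TN="30"/>',    # fresher report
    ]

    merged = {}
    ages = {}
    for line in lines:
        m = parse_metric.match(line)
        name, age = m.group(1), m.group(2)
        if name not in merged or float(age) < float(ages[name]):
            merged[name] = line
            ages[name] = age

    print merged['load_one']    # the TN="30" line wins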
Index: trunk/debs/ganglios/Makefile |
— | — | @@ -0,0 +1,28 @@ |
| 2 | +SRCDIR=src |
| 3 | + |
| 4 | +SCRIPTS=src/ganglia_parser |
| 5 | +PLUGINS=src/check_ganglios_disk src/check_ganglios_generic_value src/check_ganglios_diskio src/check_ganglios_memory_v2 |
| 6 | +MODULES=src/ganglios/__init__.py src/ganglios/ganglios.py |
| 7 | + |
| 8 | +all: |
| 9 | + |
| 10 | +install: |
| 11 | + install -d ${DESTDIR}/usr/sbin |
| 12 | + install -m 0755 ${SCRIPTS} ${DESTDIR}/usr/sbin |
| 13 | + |
| 14 | + install -d ${DESTDIR}/usr/lib/nagios/plugins |
| 15 | + install -m 0755 ${PLUGINS} ${DESTDIR}/usr/lib/nagios/plugins |
| 16 | + |
| 17 | + install -d ${DESTDIR}/usr/share/pyshared/ganglios |
| 18 | + install -m 0644 ${MODULES} ${DESTDIR}/usr/share/pyshared/ganglios |
| 19 | + |
| 20 | +clean: |
| 21 | + |
| 22 | +deb: |
| 23 | + debuild -uc -us -i -b |
| 24 | + |
| 25 | +source-deb: |
| 26 | + debuild -uc -us -i -S |
| 27 | + |
| 28 | +debclean: |
| 29 | + debuild clean |
Property changes on: trunk/debs/ganglios/Makefile |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 30 | + native |