Index: trunk/debs/ganglios/debian/changelog |
— | — | @@ -1,8 +1,14 @@ |
2 | | -ganglios (1.1) stable; urgency=low |
| 2 | +ganglios (1.2) stable; urgency=low |
3 | 3 | |
4 | | - * changed listXMLSources to read the list from /etc/ganglia/gmetad.conf |
| 4 | + * changed ganglia_parser to use gmetad.conf instead of nannybot# |
| 5 | + * updated ganglios.py to use the native xml etree instead of (deprecated) |
| 6 | + ElementTree |
| 7 | + * fixed broken memory and disk checks |
| 8 | + * added additional help text to the generic check |
| 9 | + * configured the ganglios package to install the cronjob (activating the |
| 10 | + parser) |
5 | 11 | |
6 | | - -- Ben Hartshorne <bhartshorne@wikimedia.org> Mon, 3 Oct 2011 16:35:00 -0700 |
| 12 | + -- Ben Hartshorne <bhartshorne@wikimedia.org> Fri, 7 Oct 2011 12:02:34 -0700 |
7 | 13 | |
8 | 14 | ganglios (1.0-1) stable; urgency=low |
9 | 15 | |
Index: trunk/debs/ganglios/debian/rules |
— | — | @@ -72,7 +72,7 @@ |
73 | 73 | # dh_installmime |
74 | 74 | dh_pysupport |
75 | 75 | # dh_installinit |
76 | | -# dh_installcron |
| 76 | + dh_installcron |
77 | 77 | # dh_installinfo |
78 | 78 | dh_installman |
79 | 79 | dh_link |
Index: trunk/debs/ganglios/src/check_ganglios_memory_v2 |
— | — | @@ -64,7 +64,7 @@ |
65 | 65 | sys.stdout.write(output) |
66 | 66 | ganglios.done(2) |
67 | 67 | |
68 | | - if( options.free ): |
| 68 | + if( options.avail ): |
69 | 69 | # testing against free memory |
70 | 70 | mem = mem_shared + mem_free + mem_cached + mem_buffers |
71 | 71 | metric = 'mem_avail' |
— | — | @@ -76,16 +76,15 @@ |
77 | 77 | op = 'gt' |
78 | 78 | |
79 | 79 | |
80 | | - if getattr(operator, op)(cur_val, crit_value): |
81 | | - if mem_used > crit_value: |
| 80 | + if getattr(operator, op)(mem, crit_value): |
82 | 81 | status = 2 |
83 | | - output = "CRITICAL: %s is %s (op %s)" % (metric, cur_val, op, crit_value) |
84 | | - elif mem_used > warn_value: |
| 82 | + output = "CRITICAL: %s is %s (%s %s)" % (metric, mem, op, crit_value) |
| 83 | + elif getattr(operator, op)(mem, warn_value): |
85 | 84 | status = 1 |
86 | | - output = "WARN: %s is %s (op %s)" % (metric, cur_val, op, warn_value) |
| 85 | + output = "WARN: %s is %s (%s %s)" % (metric, mem, op, warn_value) |
87 | 86 | else: |
88 | 87 | status = 0 |
89 | | - output = "OK: %s is %s" % (metric, cur_val) |
| 88 | + output = "OK: %s is %s" % (metric, mem) |
90 | 89 | |
91 | 90 | sys.stdout.write(output) |
92 | 91 | ganglios.done(status) |
Index: trunk/debs/ganglios/src/ganglios/ganglios.py |
— | — | @@ -29,7 +29,7 @@ |
30 | 30 | import glob |
31 | 31 | import socket |
32 | 32 | |
33 | | -import elementtree.ElementTree as ET |
| 33 | +import xml.etree.ElementTree as ET |
34 | 34 | import xml.parsers.expat as expat |
35 | 35 | |
36 | 36 | __revision__ = '0' |
— | — | @@ -52,6 +52,8 @@ |
53 | 53 | status = 0 # ok |
54 | 54 | bad = [] |
55 | 55 | |
| 56 | + # go_bad collects xml cache files that are old, broken or otherwise |
| 57 | + # unparseable and stops us from parsing them again in the future |
56 | 58 | def go_bad (xml_file, bad): |
57 | 59 | """ change status to bad, and output the stale nannybot """ |
58 | 60 | bad_host = xml_file.replace ('.xml', '') |
— | — | @@ -75,15 +77,19 @@ |
76 | 78 | f_hndl = open (filename) |
77 | 79 | try: |
78 | 80 | tree = ET.parse (f_hndl) |
79 | | - ganglia_xml = tree.getroot() |
80 | | - for cluster in ganglia_xml.getchildren (): |
81 | | - for host in cluster.getchildren (): |
82 | | - for metric in host.getchildren (): |
83 | | - # found a metric we care about. |
84 | | - if metric.get ('NAME') in metrics: |
85 | | - thunk (host.get ('NAME'), |
86 | | - metric.get ('NAME'), |
87 | | - metric.get ('VAL')) |
| 81 | + root = tree.getroot() |
| 82 | + clusters = list(root) |
| 83 | + for cluster in clusters: |
| 84 | + for host in cluster.findall('HOST'): |
| 85 | + for metric in host.findall('METRIC'): |
| 86 | + if metric.attrib['NAME'] in metrics: |
| 87 | + try: |
| 88 | + thunk( host.attrib['NAME'], |
| 89 | + metric.attrib['NAME'], |
| 90 | + metric.attrib['VAL']) |
| 91 | + except Exception, e: |
| 92 | + print "thunk threw an exception: %s" % e |
| 93 | + raise |
88 | 94 | except expat.ExpatError: |
89 | 95 | go_bad (xml_file, bad) |
90 | 96 | status = 2 |
— | — | @@ -122,6 +128,9 @@ |
123 | 129 | filelist = glob.glob(os.path.join(_hostdir, "*.%s" % hostname)) |
124 | 130 | if len(filelist) == 0: |
125 | 131 | filelist = glob.glob(os.path.join(_hostdir, "%s" % hostname)) |
| 132 | + # if there's still no match, complain host not found. |
| 133 | + if len(filelist) == 0: |
| 134 | + raise Exception("Host not found: %s." % hostname) |
126 | 135 | ### |
127 | 136 | ### for the VPNs, it's a valid state that there exist >1 files for each vpn |
128 | 137 | ### (a tunnel address and a private interface). What's the right action to take |
— | — | @@ -142,11 +151,10 @@ |
143 | 152 | f_hndl = open(filename) |
144 | 153 | try: |
145 | 154 | tree = ET.parse (f_hndl) |
146 | | - host = tree.getroot() |
147 | | - for metric in host.getchildren (): |
| 155 | + for metric in tree.findall('METRIC'): |
148 | 156 | # found a metric we care about. |
149 | | - if metric.get ('NAME') == metricname: |
150 | | - return metric.get('VAL') |
| 157 | + if metric.attrib['NAME'] == metricname: |
| 158 | + return metric.attrib['VAL'] |
151 | 159 | except expat.ExpatError: |
152 | 160 | sys.stdout.write("XML parse error") |
153 | 161 | done(2) |
Index: trunk/debs/ganglios/src/check_ganglios_generic_value |
— | — | @@ -36,7 +36,14 @@ |
37 | 37 | # actually, any valid function supplied by the 'operator' module that return a |
38 | 38 | # boolean will probably work, but the above functions are what I expect to use |
39 | 39 | |
40 | | - parser = OptionParser() |
| 40 | + description = """check_ganglios_generic_value -H hostname -m metric -w val -c val -o op |
| 41 | + |
| 42 | + Checks the value of a metric against the warning and crit values |
| 43 | + passed in using the operator supplied. An example call that checks that |
| 44 | + the load_one metric for foo.example.com. It will trigger WARN if load_one |
| 45 | + is greater than 2 and CRIT when greater than 4: |
| 46 | + ./check_ganglios_generic_value -H foo.example.com -m load_one -w 2 -c 4 -o gt""" |
| 47 | + parser = OptionParser(usage=description) |
41 | 48 | |
42 | 49 | parser.add_option('-H', dest='hostname', help='hostname') |
43 | 50 | parser.add_option('-m', dest='metric', help='metric to check') |
Index: trunk/debs/ganglios/src/check_ganglios_disk |
— | — | @@ -101,12 +101,12 @@ |
102 | 102 | def build_metrics_include( host, metric, value): |
103 | 103 | ''' callback from ganglios.parse_ganglia ''' |
104 | 104 | if host in incl_excl_hosts: |
105 | | - hostmetrics[ incl_excl_hosts[host] ] = int(value) |
| 105 | + hostmetrics[ incl_excl_hosts[host] ] = float(value) |
106 | 106 | |
107 | 107 | def build_metrics_exclude( host, metric, value): |
108 | 108 | ''' callback from ganglios.parse_ganglia ''' |
109 | 109 | if not host in incl_excl_hosts: |
110 | | - hostmetrics[ host ] = int(value) |
| 110 | + hostmetrics[ host ] = float(value) |
111 | 111 | |
112 | 112 | status = 0 # OK |
113 | 113 | if( inexcl_flag == 'include' ): |
Index: trunk/debs/ganglios/src/ganglia_parser |
— | — | @@ -31,7 +31,7 @@ |
32 | 32 | formatter = logging.Formatter('%(asctime)s %(levelname)-8s %(message)s') |
33 | 33 | hdlr.setFormatter(formatter) |
34 | 34 | logger.addHandler(hdlr) |
35 | | -logger.setLevel(logging.WARNING) |
| 35 | +logger.setLevel(logging.DEBUG) |
36 | 36 | |
37 | 37 | def unionMetrics( curMetrics, hostFilePath ): |
38 | 38 | '''When a single host is present in more than one gmond collector node |
— | — | @@ -110,13 +110,16 @@ |
111 | 111 | # so long as ganglios is running on the same host as the ganglia web ui, it |
112 | 112 | # can use ganglia's gmetad.conf to get the list of sources. |
113 | 113 | gmetadconf = open('/etc/ganglia/gmetad.conf') |
114 | | - datasourcere = re.compile('^data_source "(?P<name>[^"]*)" (?P<hostlist>.*)') |
| 114 | + # datasource is the string 'data_source' followed by a quoted string name followed by |
| 115 | + # an optional polling interval followed by a list of hostnames |
| 116 | + datasourcere = re.compile('^data_source "(?P<name>[^"]*)" (?P<pollint>\d+ )?(?P<hostlist>.*)') |
115 | 117 | for line in gmetadconf.readlines(): |
116 | 118 | match = datasourcere.match(line) |
117 | 119 | if match: |
118 | | - for host in match.group(2).split(): |
| 120 | + for host in match.group('hostlist').split(): |
119 | 121 | nannybots.append(host) |
120 | 122 | |
| 123 | + logger.info("nannybot list: %s" % nannybots) |
121 | 124 | return nannybots |
122 | 125 | |
123 | 126 | def storeXMLData(srcHosts, dataDir): |