Index: trunk/phase3/maintenance/storage/storageTypeStatsSum.py |
— | — | @@ -0,0 +1,113 @@ |
| 2 | +#!/usr/bin/python |
| 3 | + |
| 4 | + |
| 5 | +""" |
| 6 | + |
| 7 | + For more detail, see http://wikitech.wikimedia.org/view/Text_storage_data |
| 8 | + |
| 9 | + |
| 10 | + reads in a file which should contain the output of |
| 11 | + ben@hume:~$ /home/w/bin/foreachwiki maintenance/storage/storageTypeStats.php > /tmp/storageTypeStats.log |
| 12 | + Parses it and sums up the values for all wikis. |
| 13 | + prints this sum to stdout. |
| 14 | + |
| 15 | + Example content: |
| 16 | + |
| 17 | +ben@fenari:~/storageStats$ cat sample_output.txt |
| 18 | +----------------------------------------------------------------- |
| 19 | +aawiki |
| 20 | +----------------------------------------------------------------- |
| 21 | +aawiki: Using bin size of 100 |
| 22 | +aawiki: 0^M1000^M2000^M3000^M4000^M5000^M6000^M7000^M8000^M9000^M10000^M |
| 23 | +aawiki: |
| 24 | +aawiki: Flags Class Count old_id range |
| 25 | +aawiki: ------------------------------------------------------------------------------------------------------------------------ |
| 26 | +aawiki: gzip [none] 4568 0 - 4700 |
| 27 | +aawiki: [none] [none] 1615 4600 - 6300 |
| 28 | +aawiki: utf-8,gzip [none] 1883 5300 - 8300 |
| 29 | +aawiki: external,utf-8 CGZ pointer 626 6200 - 10300 |
| 30 | +aawiki: external,utf-8 DHB pointer 368 9100 - 10300 |
| 31 | +aawiki: utf-8,gzip,external simple pointer 975 8200 - 10400 |
| 32 | +aawiki: external,utf8 DHB pointer 211 9400 - 10200 |
| 33 | +----------------------------------------------------------------- |
| 34 | +aawikibooks |
| 35 | +----------------------------------------------------------------- |
| 36 | +aawikibooks: Using bin size of 100 |
| 37 | +aawikibooks: 0^M1000^M2000^M3000^M |
| 38 | +aawikibooks: |
| 39 | +aawikibooks: Flags Class Count old_id range |
| 40 | +aawikibooks: ------------------------------------------------------------------------------------------------------------------------ |
| 41 | +aawikibooks: [none] [none] 881 0 - 1000 |
| 42 | +aawikibooks: external,utf-8 CGZ pointer 187 0 - 3400 |
| 43 | +aawikibooks: external,utf-8 DHB pointer 34 3200 - 3400 |
| 44 | +aawikibooks: object historyblobcurstub 898 900 - 1900 |
| 45 | +aawikibooks: utf-8,gzip [none] 900 1800 - 2900 |
| 46 | +aawikibooks: utf-8,gzip,external simple pointer 431 2800 - 3400 |
| 47 | +aawikibooks: external,utf8 DHB pointer 25 3300 - 3400 |
| 48 | + |
| 49 | +""" |
| 50 | + |
| 51 | + |
| 52 | +import re |
| 53 | +import optparse |
| 54 | + |
##
## Command-line help text.  The input file is given as the single
## positional argument (there is no --input / -i option).
USAGE = "usage: %prog <input>"
DESC = """Sum the storage types across all wikis. The input file should
contain the output of:
 foreachwiki maintenance/storage/storageTypeStats.php
"""

# A section starts with nothing on the line but the name of the wiki db, eg:
#aawikibooks
START_SECTION = re.compile(r"^(?P<dbname>[a-z0-9_]+)$")
# A data row carries the flags, the class and the row count, eg:
#aawikibooks: external,utf-8 DHB pointer 34 3200 - 3400
# NOTE: the 'class' group allows one embedded space ("DHB pointer").  For a
# single-word class such as "[none]" it therefore captures the one space that
# follows the word, so those keys end in a space.  This is kept as-is so that
# results stay comparable with output saved by earlier runs.
COUNTER = re.compile(r"^[a-z0-9_]*: *(?P<flags>[^ ]+) +(?P<class>[^ ]+ [^ ]*) +(?P<count>\d+) +.*")


def sum_storage_types(lines):
    """Sum the per-wiki row counts for every flags/class combination.

    lines -- any iterable of text lines (an open file works) holding the
             output of foreachwiki storageTypeStats.php.

    Returns a tuple (wiki_count, counters): wiki_count is the number of
    section-start lines seen, counters maps "flags/class" to the summed count.
    Lines matching neither pattern are ignored.
    """
    wiki_count = 0
    content_counters = {}
    for line in lines:
        if START_SECTION.match(line):
            # Not used by the report yet; kept for future per-db stats.
            wiki_count += 1
            continue
        match = COUNTER.match(line)
        if match:
            # sum all unique class,flags combinations
            key = "%s/%s" % (match.group('flags'), match.group('class'))
            content_counters[key] = (content_counters.get(key, 0)
                                     + int(match.group('count')))
    return wiki_count, content_counters


def format_results(content_counters):
    """Return the report as a list of lines, one row per key in sorted order."""
    report = [
        "Results:",
        " Count Type",
        "------------------------------------------",
    ]
    for key in sorted(content_counters):
        report.append("%12d %s" % (content_counters[key], key))
    report.append("all done!")
    return report


def main(argv=None):
    """Parse the command line, sum the given file and print the report.

    argv -- list of arguments without the program name; None means use
            sys.argv[1:] (optparse's default).

    Returns the process exit status: 0 on success, 1 when the argument count
    is wrong.  An IOError while opening or reading the file is reported on
    stdout and then re-raised.
    """
    parser = optparse.OptionParser(usage=USAGE, description=DESC)
    (_opts, args) = parser.parse_args(argv)
    if len(args) != 1:
        print("I can't do anything without a file to parse. Sorry!")
        parser.print_help()
        return 1

    try:
        # 'with' guarantees the file is closed, even if parsing fails.
        with open(args[0], 'r') as stats_file:
            _wiki_count, content_counters = sum_storage_types(stats_file)
    except IOError as e:
        print("omg io error %s!" % e)
        # bare raise keeps the original traceback
        raise

    for line in format_results(content_counters):
        print(line)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
| 114 | + |
Property changes on: trunk/phase3/maintenance/storage/storageTypeStatsSum.py |
___________________________________________________________________ |
Added: svn:executable |
1 | 115 | + * |
Index: trunk/phase3/maintenance/storage/storageTypeStatsDiff.py |
— | — | @@ -0,0 +1,113 @@ |
| 2 | +#!/usr/bin/python |
| 3 | + |
| 4 | + |
| 5 | +""" |
| 6 | + |
| 7 | + For more detail, see http://wikitech.wikimedia.org/view/Text_storage_data |
| 8 | + |
| 9 | + reads in two files which should contain the output of storageTypeStatsSum.py |
| 10 | + Parses them both and calculates the difference for each storage type |
| 11 | + prints this to stdout. |
| 12 | + |
| 13 | + For best results, give the old and new files their dates for names, eg: |
| 14 | + ben@fenari:~/storageStats$ ./storageTypeStatsDiff.py 2010-02-18 2011-08-31 |
| 15 | + |
| 16 | + Example content: |
| 17 | + |
| 18 | +ben@fenari:~/storageStats$ cat 2010-02-18 |
| 19 | +Results: |
| 20 | + Count Type |
| 21 | +------------------------------------------ |
| 22 | + 9 0,external/simple pointer |
| 23 | + 435 0/[none] |
| 24 | + 1482941 [none]/[none] |
| 25 | + 968957 gzip/[none] |
| 26 | + 178234 object,external/simple pointer |
| 27 | + 1800 object,utf-8/[none] |
| 28 | + 17076928 utf-8,gzip/[none] |
| 29 | + 1269 utf-8/[none] |
| 30 | +all done! |
| 31 | + |
| 32 | +ben@fenari:~/storageStats$ cat 2011-08-31 |
| 33 | +Results: |
| 34 | + Count Type |
| 35 | +------------------------------------------ |
| 36 | + 9 0,external/simple pointer |
| 37 | + 1435 0/[none] |
| 38 | + 1002341 [none]/[none] |
| 39 | + 1234212 object,external/simple pointer |
| 40 | + 213 object,external/blob |
| 41 | + 20 object,utf-8/[none] |
| 42 | + 123428 utf-8,gzip/[none] |
| 43 | + 123 utf-8/[none] |
| 44 | +all done! |
| 45 | + |
| 46 | +""" |
| 47 | + |
| 48 | + |
| 49 | +import re |
| 50 | +import optparse |
| 51 | + |
##
## Command-line help text.
USAGE = "usage: %prog <old-stats-file> <new-stats-file>"
DESC = "Calculate the difference between two files containing storageTypeStatsSum.py output"

# match only the actual value / key lines; ignore everything else
VALUE_LINE = re.compile(r"^ *(?P<val>\d+) *(?P<desc>.*)$")

# placeholder shown when a storage type is missing from one of the files
MISSING = 'n/a'


def parse_stats(lines):
    """Collect the count for every storage type found in `lines`.

    lines -- any iterable of text lines (an open file works) holding
             storageTypeStatsSum.py output.

    Returns a dict mapping the type description to its integer count.  Lines
    that do not start with a number (headers, rulers, "all done!") are skipped.
    """
    stats = {}
    for line in lines:
        match = VALUE_LINE.match(line)
        if match:
            stats[match.group('desc')] = int(match.group('val'))
    return stats


def diff_stats(old_stats, new_stats):
    """Compare two stats dicts.

    Returns a list of (old, new, diff, key) tuples sorted by key, covering
    the union of keys in both dicts.  A value missing from one side is
    reported as 'n/a', and so is the diff for that key.
    """
    rows = []
    for key in sorted(set(old_stats) | set(new_stats)):
        oldval = old_stats.get(key, MISSING)
        newval = new_stats.get(key, MISSING)
        if key in old_stats and key in new_stats:
            delta = newval - oldval
        else:
            # this happens when a key only exists in one set
            delta = MISSING
        rows.append((oldval, newval, delta, key))
    return rows


def main(argv=None):
    """Parse the command line, diff the two files and print the table.

    argv -- list of arguments without the program name; None means use
            sys.argv[1:] (optparse's default).

    Returns the process exit status: 0 on success, 1 when the argument count
    is wrong or one of the files cannot be read.
    """
    parser = optparse.OptionParser(usage=USAGE, description=DESC)
    (_opts, args) = parser.parse_args(argv)
    # Require exactly two arguments
    if len(args) != 2:
        print("Two files needed.")
        parser.print_help()
        # a usage error is a failure, so do not exit with status 0
        return 1

    old_name, new_name = args
    try:
        # 'with' guarantees both files are closed again.
        with open(old_name, 'r') as oldfile:
            old_stats = parse_stats(oldfile)
        with open(new_name, 'r') as newfile:
            new_stats = parse_stats(newfile)
    except IOError as e:
        print("IOError trying to open %s or %s: %s\n" % (old_name, new_name, e))
        return 1

    # print out results
    print("%12s %12s %12s %s" % (old_name, new_name, 'Diff', 'Type'))
    print("---------------------------------------------------------------------")
    for row in diff_stats(old_stats, new_stats):
        print("%12s %12s %12s %s" % row)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
| 114 | + |
Property changes on: trunk/phase3/maintenance/storage/storageTypeStatsDiff.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 115 | + native |
Added: svn:executable |
2 | 116 | + * |