Index: trunk/phase3/maintenance/storage/storageTypeStatsSum.py |
— | — | @@ -1,113 +0,0 @@ |
2 | | -#!/usr/bin/python |
3 | | - |
4 | | - |
5 | | -""" |
6 | | - |
7 | | - For more detail, see http://wikitech.wikimedia.org/view/Text_storage_data |
8 | | - |
9 | | - |
10 | | - reads in a file which should contain the output of |
11 | | - ben@hume:~$ /home/w/bin/foreachwiki maintenance/storage/storageTypeStats.php > /tmp/storageTypeStats.log |
12 | | - Parses it and sums up the values for all wikis. |
13 | | - prints this sum to stdout. |
14 | | - |
15 | | - Example content: |
16 | | - |
17 | | -ben@fenari:~/storageStats$ cat sample_output.txt |
18 | | -aawiki |
19 | | -aawiki: Using bin size of 100 |
20 | | -aawiki: 0^M1000^M2000^M3000^M4000^M5000^M6000^M7000^M8000^M9000^M10000^M |
21 | | -aawiki: |
22 | | -aawiki: Flags Class Count old_id range |
23 | | -aawiki: ------------------------------------------------------------------------------------------------------------------------ |
24 | | -aawiki: gzip [none] 4568 0 - 4700 |
25 | | -aawiki: [none] [none] 1615 4600 - 6300 |
26 | | -aawiki: utf-8,gzip [none] 1883 5300 - 8300 |
27 | | -aawiki: external,utf-8 CGZ pointer 626 6200 - 10300 |
28 | | -aawiki: external,utf-8 DHB pointer 368 9100 - 10300 |
29 | | -aawiki: utf-8,gzip,external simple pointer 975 8200 - 10400 |
30 | | -aawiki: external,utf8 DHB pointer 211 9400 - 10200 |
31 | | -aawikibooks |
32 | | -aawikibooks: Using bin size of 100 |
33 | | -aawikibooks: 0^M1000^M2000^M3000^M |
34 | | -aawikibooks: |
35 | | -aawikibooks: Flags Class Count old_id range |
36 | | -aawikibooks: ------------------------------------------------------------------------------------------------------------------------ |
37 | | -aawikibooks: [none] [none] 881 0 - 1000 |
38 | | -aawikibooks: external,utf-8 CGZ pointer 187 0 - 3400 |
39 | | -aawikibooks: external,utf-8 DHB pointer 34 3200 - 3400 |
40 | | -aawikibooks: object historyblobcurstub 898 900 - 1900 |
41 | | -aawikibooks: utf-8,gzip [none] 900 1800 - 2900 |
42 | | -aawikibooks: utf-8,gzip,external simple pointer 431 2800 - 3400 |
43 | | -aawikibooks: external,utf8 DHB pointer 25 3300 - 3400 |
44 | | - |
45 | | -""" |
46 | | - |
47 | | - |
48 | | -import re |
49 | | -import optparse |
50 | | - |
51 | | -## |
52 | | -## set up argument parsing. Require --input (or -i) and a filename. |
53 | | -usage = "usage: %prog <input>" |
54 | | -desc = """Sum the storage types across all wikis. The input file should |
55 | | -contain the output of: |
56 | | - foreachwiki maintenance/storage/storageTypeStats.php |
57 | | -""" |
58 | | - |
59 | | -parser = optparse.OptionParser(usage=usage, description=desc) |
60 | | -(opts, args) = parser.parse_args() |
61 | | -if len(args) != 1: |
62 | | - print "I can't do anything without a file to parse. Sorry!" |
63 | | - parser.print_help() |
64 | | - exit(1) |
65 | | - |
66 | | -input = args[0] |
67 | | - |
68 | | -try: |
69 | | - file=open(input, 'r') |
70 | | - |
71 | | - # create a bunch of regexes to match various sections of the file |
72 | | - # a section starts with nothing on the line but the name of the wiki db |
73 | | - #aawikibooks |
74 | | - start_section = re.compile("^(?P<dbname>[a-z0-9_]+)$") |
75 | | - #aawikibooks: external,utf-8 DHB pointer 34 3200 - 3400 |
76 | | - counter = re.compile("^[a-z0-9_]*: *(?P<flags>[^ ]+) +(?P<class>[^ ]+ [^ ]*) +(?P<count>\d+) +.*") |
77 | | - |
78 | | - # create a bunch of counters |
79 | | - wiki_count=0 |
80 | | - content_counters = dict() |
81 | | - |
82 | | - # ok, parse the file and collect stats! |
83 | | - for line in file: |
84 | | - match = start_section.match(line) |
85 | | - if match: |
86 | | - # this isn't actually used yet, but is in here for when we |
87 | | - # want more interesting stats and collect per-db |
88 | | - wiki_count += 1 |
89 | | - db_name=match.group('dbname') |
90 | | - match = counter.match(line) |
91 | | - if match: |
92 | | - # sum all unique class,flags combinations |
93 | | - key = "%s/%s" % (match.group('flags'), match.group('class')) |
94 | | - try: |
95 | | - content_counters[key] += int(match.group('count')) |
96 | | - except KeyError: |
97 | | - content_counters[key] = int(match.group('count')) |
98 | | - |
99 | | - |
100 | | -except IOError, e: |
101 | | - print "omg io error %s!" % e |
102 | | - raise e |
103 | | - |
104 | | -print "Results:" |
105 | | -print " Count Type" |
106 | | -print "------------------------------------------" |
107 | | -for key in sorted(content_counters.keys()): |
108 | | - print "%12d %s" % (content_counters[key], key) |
109 | | -print "all done!" |
110 | | - |
Index: trunk/phase3/maintenance/storage/storageTypeStatsDiff.py |
— | — | @@ -1,113 +0,0 @@ |
2 | | -#!/usr/bin/python |
3 | | - |
4 | | - |
5 | | -""" |
6 | | - |
7 | | - For more detail, see http://wikitech.wikimedia.org/view/Text_storage_data |
8 | | - |
9 | | - reads in two files which should contain the output of storageTypeStatsSum.py |
10 | | - Parses them both and calculates the difference for each storage type |
11 | | - prints this to stdout. |
12 | | - |
13 | | - For best results, give the old and new files their dates for names, eg: |
14 | | - ben@fenari:~/storageStats$ ./storageTypeStatsDiff.py 2010-02-18 2011-08-31 |
15 | | - |
16 | | - Example content: |
17 | | - |
18 | | -ben@fenari:~/storageStats$ cat 2010-02-18 |
19 | | -Results: |
20 | | - Count Type |
21 | | - 9 0,external/simple pointer |
22 | | - 435 0/[none] |
23 | | - 1482941 [none]/[none] |
24 | | - 968957 gzip/[none] |
25 | | - 178234 object,external/simple pointer |
26 | | - 1800 object,utf-8/[none] |
27 | | - 17076928 utf-8,gzip/[none] |
28 | | - 1269 utf-8/[none] |
29 | | -all done! |
30 | | - |
31 | | -ben@fenari:~/storageStats$ cat 2011-08-31 |
32 | | -Results: |
33 | | - Count Type |
34 | | - 9 0,external/simple pointer |
35 | | - 1435 0/[none] |
36 | | - 1002341 [none]/[none] |
37 | | - 1234212 object,external/simple pointer |
38 | | - 213 object,external/blob |
39 | | - 20 object,utf-8/[none] |
40 | | - 123428 utf-8,gzip/[none] |
41 | | - 123 utf-8/[none] |
42 | | -all done! |
43 | | - |
44 | | -""" |
45 | | - |
46 | | - |
47 | | -import re |
48 | | -import optparse |
49 | | - |
50 | | -## |
51 | | -## set up argument parsing. |
52 | | -usage = "usage: %prog <old-stats-file> <new-stats-file>" |
53 | | -desc = "Calculate the difference between two files containing storageTypeStatsSum.py output" |
54 | | -parser = optparse.OptionParser(usage=usage, description=desc) |
55 | | -(opts, args) = parser.parse_args() |
56 | | -# Require exactly two arguments |
57 | | -if len(args) != 2: |
58 | | - print "Two files needed." |
59 | | - parser.print_help() |
60 | | - exit() |
61 | | - |
62 | | -try: |
63 | | - oldfile=open(args[0], 'r') |
64 | | - newfile=open(args[1], 'r') |
65 | | -except IOError, e: |
66 | | - print "IOError trying to open %s or %s: %s\n" % (args[0], args[1], e) |
67 | | - exit(1) |
68 | | - |
69 | | -# match only the actual value / key lines; ignore everything else |
70 | | -valueline = re.compile("^ *(?P<val>\d+) *(?P<desc>.*)$") |
71 | | - |
72 | | -files={} |
73 | | -# ok, parse the files and collect stats! |
74 | | -for file in (oldfile, newfile): |
75 | | - stats = {} |
76 | | - for line in file: |
77 | | - match = valueline.match(line) |
78 | | - if match: |
79 | | - stats[match.group('desc')] = int(match.group('val')) |
80 | | - #stats collected for one file, save it to the files dict |
81 | | - files[file.name] = stats |
82 | | - |
83 | | -# calculate the difference |
84 | | -diff = {} # contains numbers keyed on storage types |
85 | | -allkeys = [] |
86 | | -# collect keys from both sets in case they don't match |
87 | | -for stats in files.keys(): |
88 | | - # get the union of allkeys and this file's stats keys |
89 | | - allkeys = list( set(allkeys) | set(files[stats].keys()) ) |
90 | | -for key in allkeys: |
91 | | - try: |
92 | | - diff[key] = files[newfile.name][key] - files[oldfile.name][key] |
93 | | - except KeyError: |
94 | | - # this happens when a key only exists in one set |
95 | | - diff[key] = 'n/a' |
96 | | - |
97 | | -# print out results |
98 | | -print "%12s %12s %12s %s" % (oldfile.name, newfile.name, 'Diff', 'Type') |
99 | | -print "---------------------------------------------------------------------" |
100 | | -for key in sorted(allkeys): |
101 | | - try: |
102 | | - oldval = files[oldfile.name][key] |
103 | | - except KeyError: |
104 | | - oldval = 'n/a' |
105 | | - try: |
106 | | - newval = files[newfile.name][key] |
107 | | - except KeyError: |
108 | | - newval = 'n/a' |
109 | | - diffnum = diff[key] |
110 | | - name = key |
111 | | - print "%12s %12s %12s %s" % (oldval, newval, diffnum, name) |
112 | | - |
Index: trunk/extensions/WikimediaMaintenance/storage/storageTypeStatsSum.py |
— | — | @@ -0,0 +1,113 @@ |
| 2 | +#!/usr/bin/python |
| 3 | + |
| 4 | + |
| 5 | +""" |
| 6 | + |
| 7 | + For more detail, see http://wikitech.wikimedia.org/view/Text_storage_data |
| 8 | + |
| 9 | + |
| 10 | + reads in a file which should contain the output of |
| 11 | + ben@hume:~$ /home/w/bin/foreachwiki maintenance/storage/storageTypeStats.php > /tmp/storageTypeStats.log |
| 12 | + Parses it, sums up the count for each flags/class combination across all wikis, |
| 13 | + and prints these sums to stdout. |
| 14 | + |
| 15 | + Example content: |
| 16 | + |
| 17 | +ben@fenari:~/storageStats$ cat sample_output.txt |
| 18 | +----------------------------------------------------------------- |
| 19 | +aawiki |
| 20 | +----------------------------------------------------------------- |
| 21 | +aawiki: Using bin size of 100 |
| 22 | +aawiki: 0^M1000^M2000^M3000^M4000^M5000^M6000^M7000^M8000^M9000^M10000^M |
| 23 | +aawiki: |
| 24 | +aawiki: Flags Class Count old_id range |
| 25 | +aawiki: ------------------------------------------------------------------------------------------------------------------------ |
| 26 | +aawiki: gzip [none] 4568 0 - 4700 |
| 27 | +aawiki: [none] [none] 1615 4600 - 6300 |
| 28 | +aawiki: utf-8,gzip [none] 1883 5300 - 8300 |
| 29 | +aawiki: external,utf-8 CGZ pointer 626 6200 - 10300 |
| 30 | +aawiki: external,utf-8 DHB pointer 368 9100 - 10300 |
| 31 | +aawiki: utf-8,gzip,external simple pointer 975 8200 - 10400 |
| 32 | +aawiki: external,utf8 DHB pointer 211 9400 - 10200 |
| 33 | +----------------------------------------------------------------- |
| 34 | +aawikibooks |
| 35 | +----------------------------------------------------------------- |
| 36 | +aawikibooks: Using bin size of 100 |
| 37 | +aawikibooks: 0^M1000^M2000^M3000^M |
| 38 | +aawikibooks: |
| 39 | +aawikibooks: Flags Class Count old_id range |
| 40 | +aawikibooks: ------------------------------------------------------------------------------------------------------------------------ |
| 41 | +aawikibooks: [none] [none] 881 0 - 1000 |
| 42 | +aawikibooks: external,utf-8 CGZ pointer 187 0 - 3400 |
| 43 | +aawikibooks: external,utf-8 DHB pointer 34 3200 - 3400 |
| 44 | +aawikibooks: object historyblobcurstub 898 900 - 1900 |
| 45 | +aawikibooks: utf-8,gzip [none] 900 1800 - 2900 |
| 46 | +aawikibooks: utf-8,gzip,external simple pointer 431 2800 - 3400 |
| 47 | +aawikibooks: external,utf8 DHB pointer 25 3300 - 3400 |
| 48 | + |
| 49 | +""" |
| 50 | + |
| 51 | + |
| 52 | +import re |
| 53 | +import optparse |
| 54 | + |
| 55 | +## |
| 56 | +## set up argument parsing. Require exactly one positional argument: the input filename. |
| 57 | +usage = "usage: %prog <input>" |
| 58 | +desc = """Sum the storage types across all wikis. The input file should |
| 59 | +contain the output of: |
| 60 | + foreachwiki maintenance/storage/storageTypeStats.php |
| 61 | +""" |
| 62 | + |
| 63 | +parser = optparse.OptionParser(usage=usage, description=desc) |
| 64 | +(opts, args) = parser.parse_args() |
| 65 | +if len(args) != 1: |
| 66 | + print "I can't do anything without a file to parse. Sorry!" |
| 67 | + parser.print_help() |
| 68 | + exit(1) |
| 69 | + |
| 70 | +input = args[0] |
| 71 | + |
| 72 | +try: |
| 73 | + file=open(input, 'r') |
| 74 | + |
| 75 | + # create a bunch of regexes to match various sections of the file |
| 76 | + # a section starts with nothing on the line but the name of the wiki db |
| 77 | + #aawikibooks |
| 78 | + start_section = re.compile("^(?P<dbname>[a-z0-9_]+)$") |
| 79 | + #aawikibooks: external,utf-8 DHB pointer 34 3200 - 3400 |
| 80 | + counter = re.compile("^[a-z0-9_]*: *(?P<flags>[^ ]+) +(?P<class>[^ ]+ [^ ]*) +(?P<count>\d+) +.*") |
| 81 | + |
| 82 | + # create a bunch of counters |
| 83 | + wiki_count=0 |
| 84 | + content_counters = dict() |
| 85 | + |
| 86 | + # ok, parse the file and collect stats! |
| 87 | + for line in file: |
| 88 | + match = start_section.match(line) |
| 89 | + if match: |
| 90 | + # this isn't actually used yet, but is in here for when we |
| 91 | + # want more interesting stats and collect per-db |
| 92 | + wiki_count += 1 |
| 93 | + db_name=match.group('dbname') |
| 94 | + match = counter.match(line) |
| 95 | + if match: |
| 96 | + # sum all unique class,flags combinations |
| 97 | + key = "%s/%s" % (match.group('flags'), match.group('class')) |
| 98 | + try: |
| 99 | + content_counters[key] += int(match.group('count')) |
| 100 | + except KeyError: |
| 101 | + content_counters[key] = int(match.group('count')) |
| 102 | + |
| 103 | + |
| 104 | +except IOError, e: |
| 105 | + print "omg io error %s!" % e |
| 106 | + raise e |
| 107 | + |
| 108 | +print "Results:" |
| 109 | +print " Count Type" |
| 110 | +print "------------------------------------------" |
| 111 | +for key in sorted(content_counters.keys()): |
| 112 | + print "%12d %s" % (content_counters[key], key) |
| 113 | +print "all done!" |
| 114 | + |
Property changes on: trunk/extensions/WikimediaMaintenance/storage/storageTypeStatsSum.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 115 | + native |
Added: svn:executable |
2 | 116 | + * |
Index: trunk/extensions/WikimediaMaintenance/storage/storageTypeStatsDiff.py |
— | — | @@ -0,0 +1,113 @@ |
| 2 | +#!/usr/bin/python |
| 3 | + |
| 4 | + |
| 5 | +""" |
| 6 | + |
| 7 | + For more detail, see http://wikitech.wikimedia.org/view/Text_storage_data |
| 8 | + |
| 9 | + reads in two files which should contain the output of storageTypeStatsSum.py |
| 10 | + Parses them both, calculates the difference (new minus old) for each storage type, |
| 11 | + and prints the old value, new value and difference to stdout. |
| 12 | + |
| 13 | + For best results, give the old and new files their dates for names, eg: |
| 14 | + ben@fenari:~/storageStats$ ./storageTypeStatsDiff.py 2010-02-18 2011-08-31 |
| 15 | + |
| 16 | + Example content: |
| 17 | + |
| 18 | +ben@fenari:~/storageStats$ cat 2010-02-18 |
| 19 | +Results: |
| 20 | + Count Type |
| 21 | +------------------------------------------ |
| 22 | + 9 0,external/simple pointer |
| 23 | + 435 0/[none] |
| 24 | + 1482941 [none]/[none] |
| 25 | + 968957 gzip/[none] |
| 26 | + 178234 object,external/simple pointer |
| 27 | + 1800 object,utf-8/[none] |
| 28 | + 17076928 utf-8,gzip/[none] |
| 29 | + 1269 utf-8/[none] |
| 30 | +all done! |
| 31 | + |
| 32 | +ben@fenari:~/storageStats$ cat 2011-08-31 |
| 33 | +Results: |
| 34 | + Count Type |
| 35 | +------------------------------------------ |
| 36 | + 9 0,external/simple pointer |
| 37 | + 1435 0/[none] |
| 38 | + 1002341 [none]/[none] |
| 39 | + 1234212 object,external/simple pointer |
| 40 | + 213 object,external/blob |
| 41 | + 20 object,utf-8/[none] |
| 42 | + 123428 utf-8,gzip/[none] |
| 43 | + 123 utf-8/[none] |
| 44 | +all done! |
| 45 | + |
| 46 | +""" |
| 47 | + |
| 48 | + |
| 49 | +import re |
| 50 | +import optparse |
| 51 | + |
| 52 | +## |
| 53 | +## set up argument parsing. |
| 54 | +usage = "usage: %prog <old-stats-file> <new-stats-file>" |
| 55 | +desc = "Calculate the difference between two files containing storageTypeStatsSum.py output" |
| 56 | +parser = optparse.OptionParser(usage=usage, description=desc) |
| 57 | +(opts, args) = parser.parse_args() |
| 58 | +# Require exactly two arguments |
| 59 | +if len(args) != 2: |
| 60 | + print "Two files needed." |
| 61 | + parser.print_help() |
| 62 | + exit() |
| 63 | + |
| 64 | +try: |
| 65 | + oldfile=open(args[0], 'r') |
| 66 | + newfile=open(args[1], 'r') |
| 67 | +except IOError, e: |
| 68 | + print "IOError trying to open %s or %s: %s\n" % (args[0], args[1], e) |
| 69 | + exit(1) |
| 70 | + |
| 71 | +# match only the actual value / key lines; ignore everything else |
| 72 | +valueline = re.compile("^ *(?P<val>\d+) *(?P<desc>.*)$") |
| 73 | + |
| 74 | +files={} |
| 75 | +# ok, parse the files and collect stats! |
| 76 | +for file in (oldfile, newfile): |
| 77 | + stats = {} |
| 78 | + for line in file: |
| 79 | + match = valueline.match(line) |
| 80 | + if match: |
| 81 | + stats[match.group('desc')] = int(match.group('val')) |
| 82 | + #stats collected for one file, save it to the files dict |
| 83 | + files[file.name] = stats |
| 84 | + |
| 85 | +# calculate the difference |
| 86 | +diff = {} # contains numbers (or 'n/a' when a key exists in only one file) keyed on storage types |
| 87 | +allkeys = [] |
| 88 | +# collect keys from both sets in case they don't match |
| 89 | +for stats in files.keys(): |
| 90 | + # get the union of allkeys and this file's stats keys |
| 91 | + allkeys = list( set(allkeys) | set(files[stats].keys()) ) |
| 92 | +for key in allkeys: |
| 93 | + try: |
| 94 | + diff[key] = files[newfile.name][key] - files[oldfile.name][key] |
| 95 | + except KeyError: |
| 96 | + # this happens when a key only exists in one set |
| 97 | + diff[key] = 'n/a' |
| 98 | + |
| 99 | +# print out results |
| 100 | +print "%12s %12s %12s %s" % (oldfile.name, newfile.name, 'Diff', 'Type') |
| 101 | +print "---------------------------------------------------------------------" |
| 102 | +for key in sorted(allkeys): |
| 103 | + try: |
| 104 | + oldval = files[oldfile.name][key] |
| 105 | + except KeyError: |
| 106 | + oldval = 'n/a' |
| 107 | + try: |
| 108 | + newval = files[newfile.name][key] |
| 109 | + except KeyError: |
| 110 | + newval = 'n/a' |
| 111 | + diffnum = diff[key] |
| 112 | + name = key |
| 113 | + print "%12s %12s %12s %s" % (oldval, newval, diffnum, name) |
| 114 | + |
Property changes on: trunk/extensions/WikimediaMaintenance/storage/storageTypeStatsDiff.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 115 | + native |
Added: svn:executable |
2 | 116 | + * |