r76845 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r76844 | r76845 | r76846 >
Date:	23:07, 16 November 2010
Author:	diederik
Status:	deferred
Tags:
Comment:	This directory contains the final scripts to generate charts.
Modified paths:	/trunk/tools/editor_trends/analyses (added) (history) /trunk/tools/editor_trends/analyses/__init__.py (added) (history) /trunk/tools/editor_trends/analyses/cohort_charts.py (added) (history) /trunk/tools/editor_trends/analyses/file_size_reduction.py (added) (history)

Diff [purge]

Index: trunk/tools/editor_trends/analyses/__init__.py
Property changes on: trunk/tools/editor_trends/analyses/__init__.py
___________________________________________________________________
Added: svn:eol-style
1	1	+ native
Index: trunk/tools/editor_trends/analyses/cohort_charts.py
—	—	@@ -0,0 +1,30 @@
	2	+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
	3	+__author__email = 'dvanliere at gmail dot com'
	4	+__date__ = '2010-11-10'
	5	+__version__ = '0.1'
	6	+
	7	+import configuration
	8	+settings = configuration.Settings()
	9	+from utils import utils
	10	+
	11	+def prepare_cohort_dataset():
	12	+ dataset = utils.load_object(settings.binary_location, 'cohort_data.bin')
	13	+ fh = utils.create_txt_filehandle(settings.dataset_location, 'cohort_data.txt', 'w', settings.encoding)
	14	+
	15	+ years = dataset.keys()
	16	+ years.sort()
	17	+ periods = dataset[2001].keys()
	18	+ periods.sort()
	19	+ periods.remove('n')
	20	+ headers = ['months_%s' % i for i in periods]
	21	+ headers.insert(0, 'year')
	22	+ utils.write_list_to_csv(headers, fh)
	23	+ for year in years:
	24	+ n = float(dataset[year].pop('n'))
	25	+ obs = [100 * float(dataset[year][p]) / n for p in periods]
	26	+ raw = [dataset[year][p] for p in periods]
	27	+ print sum(obs)
	28	+ obs.insert(0, year)
	29	+ utils.write_list_to_csv(obs, fh, newline=False)
	30	+ utils.write_list_to_csv(raw, fh)
	31	+ fh.close()
Property changes on: trunk/tools/editor_trends/analyses/cohort_charts.py
___________________________________________________________________
Added: svn:eol-style
1	32	+ native
Index: trunk/tools/editor_trends/analyses/file_size_reduction.py
—	—	@@ -0,0 +1,99 @@
	2	+#!/usr/bin/python
	3	+# -*- coding: utf-8 -*-
	4	+'''
	5	+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
	6	+This program is free software; you can redistribute it and/or
	7	+modify it under the terms of the GNU General Public License version 2
	8	+as published by the Free Software Foundation.
	9	+This program is distributed in the hope that it will be useful,
	10	+but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
	12	+See the GNU General Public License for more details, at
	13	+http://www.fsf.org/licenses/gpl.html
	14	+'''
	15	+
	16	+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
	17	+__author__email = 'dvanliere at gmail dot com'
	18	+__date__ = '2010-11-15'
	19	+__version__ = '0.1'
	20	+
	21	+import sys
	22	+sys.path.append('..')
	23	+
	24	+import os
	25	+import xml.etree.cElementTree as cElementTree
	26	+
	27	+import configuration
	28	+from utils import utils
	29	+settings = configuration.Settings()
	30	+
	31	+
	32	+class DumpStatistics(object):
	33	+ ''' Simple class to keep track of XML tags, how often they occur,
	34	+ and the length of strings they contain. This is used to calculate the
	35	+ overhead.
	36	+ '''
	37	+ def __init__(self):
	38	+ self.tags = {}
	39	+
	40	+ def add_tag(self, kwargs):
	41	+ for kw in kwargs:
	42	+ if kw not in self.tags:
	43	+ self.tags[kw] = {}
	44	+ self.tags[kw]['n'] = 0
	45	+ self.tags[kw]['size'] = 0
	46	+ self.tags[kw]['n'] += 1
	47	+ self.tags[kw]['size'] += self.determine_length(kwargs[kw])
	48	+
	49	+ def average_size_text(self):
	50	+ avg = {}
	51	+ for kw in self.tags:
	52	+ avg[kw] = self.tags[kw]['size'] / self.tags[kw]['n']
	53	+ return avg
	54	+
	55	+ def total_size_text(self):
	56	+ return sum([self.tags[kw]['size'] for kw in self.tags])
	57	+
	58	+ def total_size_xml(self):
	59	+ # the x2 is for the opening and closing tag
	60	+ # the +5 is for 2x <, 2x > and 1x /
	61	+ return sum([(len(kw) * (self.tags[kw]['n'] * 2) + 5) for kw in self.tags])
	62	+
	63	+ def determine_length(self, text):
	64	+ if text == None:
	65	+ return 0
	66	+ else:
	67	+ return len(text)
	68	+
	69	+
	70	+def calculate_filesize_overhead(location, filename):
	71	+ counter = None
	72	+ ds = DumpStatistics()
	73	+ context = cElementTree.iterparse(filename, events=('start', 'end'))
	74	+ context = iter(context)
	75	+ event, root = context.next() #get the root element of the XML doc
	76	+
	77	+ try:
	78	+ for event, elem in context:
	79	+ if event == 'end':
	80	+ ds.add_tag({elem.tag:elem.text})
	81	+ root.clear() # when done parsing a section clear the tree to release memory
	82	+ except SyntaxError:
	83	+ pass
	84	+ utils.store_object(ds, settings.binary_location, 'ds')
	85	+ xml_size = ds.total_size_xml()
	86	+ text_size = ds.total_size_text()
	87	+ print text_size, xml_size
	88	+ print ds.tags
	89	+
	90	+
	91	+def output_dumpstatistics():
	92	+ ds = utils.load_object(settings.binary_location, 'ds.bin')
	93	+
	94	+ for key in ds.tags:
	95	+ print '%s\t%s' % (key, ds.tags[key])
	96	+
	97	+if __name__ == '__main__':
	98	+ output_dumpstatistics()
	99	+ #calculate_filesize_overhead(settings.input_location, settings.input_filename)
	100	+
Property changes on: trunk/tools/editor_trends/analyses/file_size_reduction.py
___________________________________________________________________
Added: svn:eol-style
1	101	+ native

Status & tagging log

10:07, 3 December 2010 Reedy (talk | contribs) changed the status of r76845 [removed: new added: deferred]