r76845 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r76844‎ | r76845 | r76846 >
Date:23:07, 16 November 2010
Author:diederik
Status:deferred
Tags:
Comment:
This directory contains the final scripts to generate charts.
Modified paths:
  • /trunk/tools/editor_trends/analyses (added) (history)
  • /trunk/tools/editor_trends/analyses/__init__.py (added) (history)
  • /trunk/tools/editor_trends/analyses/cohort_charts.py (added) (history)
  • /trunk/tools/editor_trends/analyses/file_size_reduction.py (added) (history)

Diff [purge]

Index: trunk/tools/editor_trends/analyses/__init__.py
Property changes on: trunk/tools/editor_trends/analyses/__init__.py
___________________________________________________________________
Added: svn:eol-style
11 + native
Index: trunk/tools/editor_trends/analyses/cohort_charts.py
@@ -0,0 +1,30 @@
 2+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 3+__author__email = 'dvanliere at gmail dot com'
 4+__date__ = '2010-11-10'
 5+__version__ = '0.1'
 6+
 7+import configuration
 8+settings = configuration.Settings()
 9+from utils import utils
 10+
def prepare_cohort_dataset():
    '''
    Turn the stored cohort data into a text dataset for charting.

    Loads 'cohort_data.bin' from settings.binary_location and writes
    'cohort_data.txt' to settings.dataset_location. The first row contains
    the headers: 'year' followed by one 'months_<period>' entry per period.
    For every year the function then writes the year, each period's value as
    a percentage of that year's 'n', and finally the raw per-period values.

    The sum of the percentages of each year is printed to stdout.
    '''
    dataset = utils.load_object(settings.binary_location, 'cohort_data.bin')
    fh = utils.create_txt_filehandle(settings.dataset_location,
                                     'cohort_data.txt', 'w',
                                     settings.encoding)
    try:
        years = sorted(dataset.keys())
        # NOTE(review): the period columns are taken from the 2001 entry only;
        # a dataset without that year raises KeyError -- confirm this is
        # intended. 'n' is the denominator of the percentages, not a period,
        # so it is excluded from the columns.
        periods = sorted([p for p in dataset[2001].keys() if p != 'n'])
        headers = ['year'] + ['months_%s' % i for i in periods]
        utils.write_list_to_csv(headers, fh)

        for year in years:
            cohort = dataset[year]
            # Read 'n' without popping it so the loaded dataset is not
            # modified as a side effect of exporting it.
            n = float(cohort['n'])
            if n:
                obs = [100 * float(cohort[p]) / n for p in periods]
            else:
                # A year with n == 0 has no share in any period; report 0%
                # instead of failing with ZeroDivisionError.
                obs = [0.0 for p in periods]
            raw = [cohort[p] for p in periods]
            print(sum(obs))
            obs.insert(0, year)
            # Percentages and raw values are written with two calls; the
            # first passes newline=False, presumably so that both end up on
            # the same output line.
            utils.write_list_to_csv(obs, fh, newline=False)
            utils.write_list_to_csv(raw, fh)
    finally:
        # Close the output file even when a lookup or write fails.
        fh.close()
Property changes on: trunk/tools/editor_trends/analyses/cohort_charts.py
___________________________________________________________________
Added: svn:eol-style
132 + native
Index: trunk/tools/editor_trends/analyses/file_size_reduction.py
@@ -0,0 +1,99 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__author__email = 'dvanliere at gmail dot com'
 18+__date__ = '2010-11-15'
 19+__version__ = '0.1'
 20+
 21+import sys
 22+sys.path.append('..')
 23+
 24+import os
 25+import xml.etree.cElementTree as cElementTree
 26+
 27+import configuration
 28+from utils import utils
 29+settings = configuration.Settings()
 30+
 31+
class DumpStatistics(object):
    ''' Simple class to keep track of XML tags, how often they occur,
    and the length of strings they contain. This is used to calculate the
    overhead.
    '''
    def __init__(self):
        # Maps a tag name to {'n': number of occurrences,
        #                     'size': summed length of the tag's text}.
        self.tags = {}

    def add_tag(self, kwargs):
        '''
        Register one occurrence of every tag in kwargs.

        kwargs is a dictionary that maps a tag name to the text found in that
        tag; the text may be None, which counts as length 0.
        '''
        for kw in kwargs:
            if kw not in self.tags:
                self.tags[kw] = {'n': 0, 'size': 0}
            self.tags[kw]['n'] += 1
            self.tags[kw]['size'] += self.determine_length(kwargs[kw])

    def average_size_text(self):
        '''
        Return a dictionary that maps each tag name to the summed text length
        divided by the number of occurrences of that tag.
        '''
        # Both operands are integers, so under Python 2 this division floors
        # the result.
        avg = {}
        for kw in self.tags:
            avg[kw] = self.tags[kw]['size'] / self.tags[kw]['n']
        return avg

    def total_size_text(self):
        ''' Return the summed text length over all tags. '''
        return sum([self.tags[kw]['size'] for kw in self.tags])

    def total_size_xml(self):
        ''' Return the number of characters spent on the XML markup itself. '''
        # Every occurrence costs the tag name twice (opening and closing tag)
        # plus 5 characters for 2x <, 2x > and 1x /, so the +5 has to be
        # multiplied by the number of occurrences as well.
        return sum([(2 * len(kw) + 5) * self.tags[kw]['n']
                    for kw in self.tags])

    def determine_length(self, text):
        ''' Return len(text), or 0 when text is None. '''
        if text is None:
            return 0
        return len(text)
 68+
 69+
def calculate_filesize_overhead(location, filename):
    '''
    Collect tag statistics for an XML file and print the size totals.

    The file is parsed incrementally; for every closed element the tag name
    and the length of its text are added to a DumpStatistics object. The
    object is saved with utils.store_object under the name 'ds' in
    settings.binary_location. Afterwards the total text size, the total XML
    markup size and the raw per-tag counters are printed.

    location -- NOTE(review): not used; only filename is opened. Confirm
                whether the file should be looked up inside location.
    filename -- path of the XML file to parse.
    '''
    ds = DumpStatistics()
    # Open the file ourselves so that it is closed deterministically, also
    # when parsing fails.
    source = open(filename, 'rb')
    try:
        context = iter(cElementTree.iterparse(source,
                                              events=('start', 'end')))
        event, root = next(context)  # get the root element of the XML doc
        try:
            for event, elem in context:
                if event == 'end':
                    ds.add_tag({elem.tag: elem.text})
                    # when done parsing a section clear the tree to release
                    # memory
                    root.clear()
        except SyntaxError:
            # Malformed or truncated input ends the scan early. Keep the
            # statistics gathered so far, but tell the user that the totals
            # are incomplete instead of failing silently.
            err = sys.exc_info()[1]
            sys.stderr.write('XML parsing of %s stopped early: %s\n'
                             % (filename, err))
    finally:
        source.close()

    utils.store_object(ds, settings.binary_location, 'ds')
    xml_size = ds.total_size_xml()
    text_size = ds.total_size_text()
    print('%s %s' % (text_size, xml_size))
    print(ds.tags)
 89+
 90+
def output_dumpstatistics():
    '''
    Load the object stored as 'ds.bin' in settings.binary_location and print
    one line per tag: the tag name, a tab, and the counters kept for it.
    '''
    stats = utils.load_object(settings.binary_location, 'ds.bin')
    for tag, counters in stats.tags.items():
        line = '%s\t%s' % (tag, counters)
        print(line)
 96+
if __name__ == '__main__':
    # When run as a script, print the tag statistics loaded from 'ds.bin'.
    output_dumpstatistics()
    # Disabled: parse the configured input file and store fresh statistics.
    #calculate_filesize_overhead(settings.input_location, settings.input_filename)
 100+
Property changes on: trunk/tools/editor_trends/analyses/file_size_reduction.py
___________________________________________________________________
Added: svn:eol-style
1101 + native

Status & tagging log