r85459 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r85458 | r85459 | r85460 >
Date:19:22, 5 April 2011
Author:diederik
Status:deferred
Tags:
Comment:
Cleanup, removing old code. Reorganization.
Modified paths:
  • /trunk/tools/editor_trends/analyses/adhoc/match_talkpage_article.py (deleted) (history)
  • /trunk/tools/editor_trends/classes/analytics.py (modified) (history)
  • /trunk/tools/editor_trends/classes/bots.py (modified) (history)
  • /trunk/tools/editor_trends/classes/runtime_settings.py (modified) (history)
  • /trunk/tools/editor_trends/code-snippets/match_talkpage_article.py (added) (history)
  • /trunk/tools/editor_trends/code-snippets/shaper.py (added) (history)
  • /trunk/tools/editor_trends/code-snippets/wikitree (added) (history)
  • /trunk/tools/editor_trends/code-snippets/wikitree/parser.py (replaced) (history)
  • /trunk/tools/editor_trends/database/cache.py (modified) (history)
  • /trunk/tools/editor_trends/etl/enricher.py (modified) (history)
  • /trunk/tools/editor_trends/etl/shaper.py (deleted) (history)
  • /trunk/tools/editor_trends/etl/transformer.py (modified) (history)
  • /trunk/tools/editor_trends/utils/data_converter.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/analyses/adhoc/match_talkpage_article.py
@@ -1,72 +0,0 @@
2 -#!/usr/bin/python
3 -# -*- coding: utf-8 -*-
4 -'''
5 -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
6 -This program is free software; you can redistribute it and/or
7 -modify it under the terms of the GNU General Public License version 2
8 -as published by the Free Software Foundation.
9 -This program is distributed in the hope that it will be useful,
10 -but WITHOUT ANY WARRANTY; without even the implied warranty of
11 -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 -See the GNU General Public License for more details, at
13 -http://www.fsf.org/licenses/gpl.html
14 -'''
15 -
16 -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
17 -__email__ = 'dvanliere at gmail dot com'
18 -__date__ = '2011-01-07'
19 -__version__ = '0.1'
20 -
21 -import sys
22 -import os
23 -if '..' not in sys.path:
24 - sys.path.append('..')
25 -
26 -from classes import settings
27 -settings = settings.Settings()
28 -from etl import extracter
29 -from utils import file_utils
30 -import wikitree
31 -
32 -try:
33 - import psyco
34 - psyco.full()
35 -except ImportError:
36 - pass
37 -
38 -class Article:
39 - def __init__(self, title, id, talk_id=None):
40 - self.title = title
41 - self.id = id
42 - self.talk_id = talk_id
43 -
44 -
45 -def parse_dumpfile(project, language_code, namespaces=['0', '1']):
46 - articles = {}
47 - ns = extracter.load_namespace(language_code)
48 - non_valid_namespaces = extracter.build_namespaces_locale(ns, namespaces)
49 -
50 -
51 - location = os.path.join(settings.input_location, language_code, project)
52 - fh = file_utils.create_txt_filehandle(location,
53 - '%s%s-latest-stub-meta-history.xml' % (language_code, project),
54 - 'r', 'utf-8')
55 -
56 - for page, article_size in wikitree.parser.read_input(fh):
57 - title = page.find('title')
58 - if extracter.verify_article_belongs_namespace(title, non_valid_namespaces):
59 - article_id = page.find('id').text
60 - title = title.text
61 - if title.startswith(ns['1'].get('canonical')):
62 - namespace = 'Talk'
63 - article = articles.get(article_id, Article(None, None, article_id))
64 - article.talk_id = article_id
65 - else:
66 - namespace = 'Main'
67 - article = articles.get(article_id, Article(title, article_id))
68 - articles[article_id] = article
69 -
70 - file_utils.store_object(articles, settings.binary_location, 'talk2article.bin')
71 -
72 -if __name__ == '__main__':
73 - parse_dumpfile('wiki', 'en')
Index: trunk/tools/editor_trends/etl/shaper.py
@@ -1,72 +0,0 @@
2 -#!/usr/bin/python
3 -# -*- coding: utf-8 -*-
4 -'''
5 -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
6 -This program is free software; you can redistribute it and/or
7 -modify it under the terms of the GNU General Public License version 2
8 -as published by the Free Software Foundation.
9 -This program is distributed in the hope that it will be useful,
10 -but WITHOUT ANY WARRANTY; without even the implied warranty of
11 -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 -See the GNU General Public License for more details, at
13 -http://www.fsf.org/licenses/gpl.html
14 -'''
15 -
16 -
17 -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
18 -__email__ = 'dvanliere at gmail dot com'
19 -__date__ = '2010-11-24'
20 -__version__ = '0.1'
21 -
22 -import datetime
23 -import math
24 -
25 -
26 -def add_datatype(datatype=0.0):
27 - if datatype == 'dict':
28 - d = dict()
29 - elif datatype == 'list':
30 - d = list()
31 - elif datatype == 'set':
32 - d = set()
33 - else:
34 - d = 0.0
35 - return d
36 -
37 -
38 -def create_datacontainer(first_year, final_year, datatype='dict'):
39 - '''
40 - This function initializes an empty dictionary with as key the year (starting
41 - 2001 and running through) and as value @datatype, in most cases this will
42 - be zero so the dictionary will act as a running tally for a variable but
43 - @datatype can also a list, [], or a dictionary, {}, or a set, set().
44 - '''
45 - data = {}
46 - for x in xrange(first_year, final_year):
47 - data[str(x)] = add_datatype(datatype)
48 - return data
49 -
50 -
51 -def add_windows_to_datacontainer(datacontainer, windows):
52 - for dc in datacontainer:
53 - for w in windows:
54 - datacontainer[dc][w] = add_datatype()
55 -
56 - return datacontainer
57 -
58 -
59 -def add_months_to_datacontainer(datacontainer, datatype):
60 - for dc in datacontainer:
61 - datacontainer[dc] = {}
62 - for x in xrange(1, 13):
63 - datacontainer[dc][x] = add_datatype(datatype)
64 -
65 - return datacontainer
66 -
67 -
68 -def add_years_to_datacontainer(first_year, final_year, datacontainer, datatype):
69 - for dc in datacontainer:
70 - datacontainer[dc] = {}
71 - for x in range(first_year, final_year):
72 - datacontainer[dc][x] = datatype
73 - return datacontainer
Index: trunk/tools/editor_trends/etl/enricher.py
@@ -86,13 +86,14 @@
8787 }
8888
8989 class Statistics:
90 - def __init__(self):
 90+ def __init__(self, process_id):
 91+ self.process_id = process_id
9192 self.count_articles = 0
9293 self.count_revisions = 0
9394
9495 def summary(self):
95 - print 'Number of articles: %s' % self.count_articles
96 - print 'Number of revisions: %s' % self.count_revisions
 96+ print 'Worker %s: Number of articles: %s' % (self.process_id, self.count_articles)
 97+ print 'Worker %s: Number of revisions: %s' % (self.process_id, self.count_revisions)
9798
9899 class Dummy:
99100 pass
@@ -108,20 +109,20 @@
109110
110111
111112 class Buffer:
112 - def __init__(self, storage, processs_id, rts=None, filehandles=None, locks=None):
 113+ def __init__(self, storage, process_id, rts=None, filehandles=None, locks=None):
113114 assert storage == 'cassandra' or storage == 'mongo' or storage == 'csv', \
114115 'Valid storage options are cassandra and mongo.'
115116 self.storage = storage
116117 self.revisions = {}
117118 self.comments = {}
118119 self.titles = {}
119 - self.processs_id = processs_id
 120+ self.process_id = process_id
120121 self.keyspace_name = 'enwiki'
121122 self.keys = ['revision_id', 'article_id', 'id', 'username', 'namespace',
122123 'title', 'timestamp', 'hash', 'revert', 'bot', 'cur_size',
123124 'delta']
124125 self.setup_storage()
125 - self.stats = Statistics()
 126+ self.stats = Statistics(self.process_id)
126127 if storage == 'csv' and locks != None:
127128 self.rts = rts
128129 self.lock1 = locks[0] #lock for generic data
@@ -579,8 +580,8 @@
580581 5: 'Wikipedia Talk',
581582 1: 'Talk',
582583 2: 'User',
583 - 4: 'Wikipedia'
584 - }
 584+ 4: 'Wikipedia'}
 585+
585586 title = parse_title(article['title'])
586587 namespaces = article['namespaces']
587588 namespace = determine_namespace(title, namespaces, include_ns, EXCLUDE_NAMESPACE)
@@ -641,30 +642,36 @@
642643 article = {}
643644 article['revisions'] = []
644645 id = False
645 - for event, elem in context:
646 - if event == 'end' and elem.tag.endswith('siteinfo'):
647 - xml_namespace = determine_xml_namespace(elem)
648 - namespaces = create_namespace_dict(elem, xml_namespace)
649 - article['namespaces'] = namespaces
650 - elif event == 'end' and elem.tag.endswith('title'):
651 - article['title'] = elem
652 - elif event == 'end' and elem.tag.endswith('revision'):
653 - article['revisions'].append(elem)
654 - elif event == 'end' and elem.tag.endswith('id') and id == False:
655 - article['id'] = elem
656 - id = True
657 - elif event == 'end' and elem.tag.endswith('page'):
658 - yield article, xml_namespace
659 - elem.clear()
660 - article = {}
661 - article['revisions'] = []
662 - article['namespaces'] = namespaces
663 - id = False
664 - elif rts.kaggle == True and event == 'end':
665 - print 'I am cleaning up'
666 - elem.clear()
667646
 647+ try:
 648+ for event, elem in context:
 649+ if event == 'end' and elem.tag.endswith('siteinfo'):
 650+ xml_namespace = determine_xml_namespace(elem)
 651+ namespaces = create_namespace_dict(elem, xml_namespace)
 652+ article['namespaces'] = namespaces
 653+ elif event == 'end' and elem.tag.endswith('title'):
 654+ article['title'] = elem
 655+ elif event == 'end' and elem.tag.endswith('revision'):
 656+ article['revisions'].append(elem)
 657+ elif event == 'end' and elem.tag.endswith('id') and id == False:
 658+ article['id'] = elem
 659+ id = True
 660+ elif event == 'end' and elem.tag.endswith('page'):
 661+ yield article, xml_namespace
 662+ elem.clear()
 663+ article = {}
 664+ article['revisions'] = []
 665+ article['namespaces'] = namespaces
 666+ id = False
 667+ elif rts.kaggle == True and event == 'end':
 668+ print 'I am cleaning up'
 669+ elem.clear()
 670+ except SyntaxError, error:
 671+ print 'Encountered invalid XML tag. Error message: %s' % error
 672+ dump(elem)
 673+ sys.exit(-1)
668674
 675+
669676 def stream_raw_xml(input_queue, storage, process_id, function, dataset, locks, rts):
670677 bots = detector.retrieve_bots('en')
671678 path = os.path.join(rts.location, 'txt')
@@ -708,7 +715,8 @@
709716 fh.close()
710717
711718 t1 = datetime.datetime.now()
712 - print 'Processing of %s took %s' % (filename, (t1 - t0))
 719+ print 'Worker %s: Processing of %s took %s' % (process_id, filename, (t1 - t0))
 720+ print 'There are %s files left in the queue' % (input_queue.qsize())
713721 t0 = t1
714722
715723 if dataset == 'training':
@@ -725,7 +733,7 @@
726734 filename = 'counts_%s.bin' % filename
727735 file_utils.store_object(counts, location, filename)
728736
729 - print 'Finished parsing bz2 archives'
 737+ print 'Finished parsing Wikipedia dump files.'
730738
731739
732740 def setup(storage, rts=None):
Index: trunk/tools/editor_trends/etl/transformer.py
@@ -28,9 +28,10 @@
2929 from database import db
3030 from utils import file_utils
3131 from utils import messages
 32+from utils import data_converter
3233 from classes import consumers
33 -import shaper
3434
 35+
3536 try:
3637 import psyco
3738 psyco.full()
@@ -84,7 +85,7 @@
8586 edit_count = determine_number_edits(edits, first_year, final_year)
8687
8788 totals = {}
88 - counts = shaper.create_datacontainer(first_year, final_year)
 89+ counts = data_converter.create_datacontainer(first_year, final_year)
8990 totals = calculate_totals(totals, counts, character_count, 'character_count')
9091 totals = calculate_totals(totals, counts, revert_count, 'revert_count')
9192 totals = calculate_totals(totals, counts, article_count, 'article_count')
@@ -95,12 +96,14 @@
9697 new_wikipedian = edits[cutoff]['date']
9798 else:
9899 new_wikipedian = False
 100+ cum_edit_count = len(edits)
99101 first_edit = edits[0]['date']
100102 final_edit = edits[-1]['date']
101103
102104 self.output_db.insert({'editor': self.id,
103105 'username': username,
104106 'new_wikipedian': new_wikipedian,
 107+ 'cum_edit_count': cum_edit_count,
105108 'final_edit': final_edit,
106109 'first_edit': first_edit,
107110 'last_edit_by_year': last_edit_by_year,
@@ -148,8 +151,8 @@
149152
150153
151154 def determine_number_edits(edits, first_year, final_year):
152 - dc = shaper.create_datacontainer(first_year, final_year)
153 - dc = shaper.add_months_to_datacontainer(dc, 'dict')
 155+ dc = data_converter.create_datacontainer(first_year, final_year)
 156+ dc = data_converter.add_months_to_datacontainer(dc, 'dict')
154157 for edit in edits:
155158 ns = edit['ns']
156159 year, month = str(edit['date'].year), edit['date'].month
@@ -161,8 +164,8 @@
162165
163166
164167 def determine_articles_workedon(edits, first_year, final_year):
165 - dc = shaper.create_datacontainer(first_year, final_year)
166 - dc = shaper.add_months_to_datacontainer(dc, 'dict')
 168+ dc = data_converter.create_datacontainer(first_year, final_year)
 169+ dc = data_converter.add_months_to_datacontainer(dc, 'dict')
167170 for year in edits:
168171 for edit in edits[year]:
169172 month = edit['date'].month
@@ -179,8 +182,8 @@
180183
181184
182185 def determine_namespaces_workedon(edits, first_year, final_year):
183 - dc = shaper.create_datacontainer(first_year, final_year)
184 - dc = shaper.add_months_to_datacontainer(dc, 'set')
 186+ dc = data_converter.create_datacontainer(first_year, final_year)
 187+ dc = data_converter.add_months_to_datacontainer(dc, 'set')
185188 for year in edits:
186189 for edit in edits[year]:
187190 month = edit['date'].month
@@ -194,8 +197,8 @@
195198
196199
197200 def determine_number_reverts(edits, first_year, final_year):
198 - dc = shaper.create_datacontainer(first_year, final_year)
199 - dc = shaper.add_months_to_datacontainer(dc, 'dict')
 201+ dc = data_converter.create_datacontainer(first_year, final_year)
 202+ dc = data_converter.add_months_to_datacontainer(dc, 'dict')
200203 for year in edits:
201204 for edit in edits[year]:
202205 month = edit['date'].month
@@ -213,8 +216,8 @@
214217 This function counts the number of characters added and remove by year
215218 by month by namespace for a particular editor.
216219 '''
217 - dc = shaper.create_datacontainer(first_year, final_year)
218 - dc = shaper.add_months_to_datacontainer(dc, 'dict')
 220+ dc = data_converter.create_datacontainer(first_year, final_year)
 221+ dc = data_converter.add_months_to_datacontainer(dc, 'dict')
219222 for year in edits:
220223 for edit in edits[year]:
221224 month = edit['date'].month
@@ -240,7 +243,7 @@
241244
242245
243246 def determine_last_edit_by_year(edits, first_year, final_year):
244 - dc = shaper.create_datacontainer(first_year, final_year, 0)
 247+ dc = data_converter.create_datacontainer(first_year, final_year, 0)
245248 for year in edits:
246249 for edit in edits[year]:
247250 date = str(edit['date'].year)
@@ -257,8 +260,8 @@
258261 This function counts the number of unique articles by year edited by a
259262 particular editor.
260263 '''
261 - dc = shaper.create_datacontainer(first_year, final_year)
262 - dc = shaper.add_months_to_datacontainer(dc, 'dict')
 264+ dc = data_converter.create_datacontainer(first_year, final_year)
 265+ dc = data_converter.add_months_to_datacontainer(dc, 'dict')
263266 for year in articles_edited:
264267 for month in articles_edited[year]:
265268 for ns in articles_edited[year][month]:
Index: trunk/tools/editor_trends/classes/runtime_settings.py
@@ -115,10 +115,13 @@
116116 if [True for kw in keywords if kw.find('=') > -1] != []:
117117 for kw in keywords:
118118 key, value = kw.split('=')
119 - try:
120 - value = int(value)
121 - except ValueError:
122 - pass
 119+ if value.find(';') > -1:
 120+ value = value.split(';')
 121+ else:
 122+ try:
 123+ value = int(value)
 124+ except ValueError:
 125+ pass
123126 d[key] = value
124127 return d
125128
Index: trunk/tools/editor_trends/classes/analytics.py
@@ -28,7 +28,7 @@
2929
3030 class Replicator:
3131 def __init__(self, plugin, time_unit, cutoff=None, cum_cutoff=None, **kwargs):
32 - #this is an ugly hack to prevent a circular import problem
 32+ #TODO this is an ugly hack to prevent a circular import problem
3333 #this needs a better fix.
3434 import manage
3535
Index: trunk/tools/editor_trends/classes/bots.py
@@ -27,7 +27,7 @@
2828 from classes import settings
2929 settings = settings.Settings()
3030
31 -from etl import shaper
 31+from utils import data_converter
3232 from utils import file_utils
3333
3434
@@ -36,7 +36,7 @@
3737 def __init__(self, name, **kwargs):
3838 self.name = name
3939 self.projects = []
40 - self.time = shaper.create_datacontainer(datatype='list')
 40+ self.time = data_converter.create_datacontainer(datatype='list')
4141 self.verified = True
4242 for kw in kwargs:
4343 setattr(self, kw, kwargs[kw])
@@ -45,7 +45,7 @@
4646 return self.name
4747
4848 def hours_active(self):
49 - self.clock = shaper.create_clock()
 49+ self.clock = data_converter.create_clock()
5050 years = self.time.keys()
5151 for year in years:
5252 for obs in self.time[year]:
Index: trunk/tools/editor_trends/utils/data_converter.py
@@ -19,22 +19,77 @@
2020 __version__ = '0.1'
2121
2222 import datetime
 23+import datetime
 24+import math
2325
 26+
 27+def add_datatype(datatype=0.0):
 28+ if datatype == 'dict':
 29+ d = dict()
 30+ elif datatype == 'list':
 31+ d = list()
 32+ elif datatype == 'set':
 33+ d = set()
 34+ else:
 35+ d = 0.0
 36+ return d
 37+
 38+
 39+def create_datacontainer(first_year, final_year, datatype='dict'):
 40+ '''
 41+ This function initializes an empty dictionary with as key the year (starting
 42+ 2001 and running through) and as value @datatype, in most cases this will
 43+ be zero so the dictionary will act as a running tally for a variable but
 44+ @datatype can also a list, [], or a dictionary, {}, or a set, set().
 45+ '''
 46+ data = {}
 47+ for x in xrange(first_year, final_year):
 48+ data[str(x)] = add_datatype(datatype)
 49+ return data
 50+
 51+
 52+def add_windows_to_datacontainer(datacontainer, windows):
 53+ for dc in datacontainer:
 54+ for w in windows:
 55+ datacontainer[dc][w] = add_datatype()
 56+
 57+ return datacontainer
 58+
 59+
 60+def add_months_to_datacontainer(datacontainer, datatype):
 61+ for dc in datacontainer:
 62+ datacontainer[dc] = {}
 63+ for x in xrange(1, 13):
 64+ datacontainer[dc][x] = add_datatype(datatype)
 65+
 66+ return datacontainer
 67+
 68+
 69+def add_years_to_datacontainer(first_year, final_year, datacontainer, datatype):
 70+ for dc in datacontainer:
 71+ datacontainer[dc] = {}
 72+ for x in range(first_year, final_year):
 73+ datacontainer[dc][x] = datatype
 74+ return datacontainer
 75+
 76+
2477 def create_windows(var, break_down_first_year=True):
2578 '''
2679 This function creates a list of months. If break_down_first_year = True then
2780 the first year will be split in 3, 6, 9 months as well.
2881 '''
29 - years = (var.max_year - var.min_year) +1
 82+ years = (var.max_year - var.min_year) + 1
3083 windows = [y * 12 for y in xrange(1, years)]
3184 if break_down_first_year:
3285 windows = [3, 6, 9] + windows
3386 return windows
3487
 88+
3589 def convert_seconds_to_date(secs):
3690 #return time.gmtime(secs)
3791 return datetime.datetime.fromtimestamp(secs)
3892
 93+
3994 def convert_dataset_to_lists(ds, caller):
4095 assert ds.format == 'long' or ds.format == 'wide', 'Format should either be long or wide.'
4196 data = []
@@ -88,15 +143,6 @@
89144 return headers
90145
91146
92 -#def make_data_rectangular(data, all_keys):
93 -# for i, d in enumerate(data):
94 -# for key in all_keys:
95 -# if key not in d:
96 -# d[key] = 0
97 -# data[i] = d
98 -# return data
99 -
100 -
101147 def get_all_props(var):
102148 all_keys = []
103149 for obs in var.obs.values():
Index: trunk/tools/editor_trends/database/cache.py
@@ -31,7 +31,7 @@
3232
3333 import db
3434 from utils import file_utils
35 -from etl import shaper
 35+from utils import data_converter
3636
3737 class EditorCache(object):
3838 def __init__(self, collection):
@@ -65,7 +65,7 @@
6666 if key not in self.editors:
6767 self.editors[key] = {}
6868 self.editors[key]['obs'] = 0
69 - self.editors[key]['edits'] = shaper.create_datacontainer(2001, self.final_year, 'list')
 69+ self.editors[key]['edits'] = data_converter.create_datacontainer(2001, self.final_year, 'list')
7070 self.editors[key]['username'] = value.pop('username')
7171 else:
7272 value.pop('username')
Index: trunk/tools/editor_trends/code-snippets/wikitree/parser.py
@@ -0,0 +1,173 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2010-10-21'
 19+__version__ = '0.1'
 20+
 21+import re
 22+import cStringIO
 23+import codecs
 24+import xml.etree.cElementTree as cElementTree
 25+from lxml import etree
 26+import sys
 27+import gzip
 28+
 29+if '..' not in sys.path:
 30+ sys.path.append('..')
 31+
 32+from classes import settings
 33+settings = settings.Settings()
 34+from utils import file_utils
 35+
 36+def convert_html_entities(text):
 37+ return file_utils.unescape(text)
 38+
 39+
 40+def extract_text(elem, **kwargs):
 41+ if elem != None and elem.text != None:
 42+ return u'%s' % elem.text
 43+ else:
 44+ return None
 45+
 46+
 47+def remove_xml_namespace(element, xml_namespace):
 48+ '''Remove namespace from the XML document.'''
 49+ ns = u'{%s}' % xml_namespace
 50+ nsl = len(ns)
 51+ for elem in element.getiterator():
 52+ if elem.tag.startswith(ns):
 53+ elem.tag = elem.tag[nsl:]
 54+ return element
 55+
 56+
 57+def determine_element(line):
 58+ pos = line.find(' ')
 59+ elem = line[:pos] + '>'
 60+
 61+
 62+def create_namespace_dict(namespaces):
 63+ d = {}
 64+ print 'Constructing namespace dictionary'
 65+ for ns in namespaces:
 66+ key = ns.get('key')
 67+ d[key] = extract_text(ns)
 68+ text = ns.text if ns.text != None else ''
 69+ try:
 70+ print key, text.encode('utf-8')
 71+ except UnicodeEncodeError:
 72+ print key
 73+ return d
 74+
 75+
 76+def extract_meta_information(fh):
 77+ '''
 78+ The purpose of this function is:
 79+ 1) Determine the version of the mediawiki dump file. Default is 0.4.
 80+ 2) Create a dictionary with the namespaces
 81+ '''
 82+ buffer = cStringIO.StringIO()
 83+ wrapper = codecs.getwriter('utf-8')(buffer)
 84+ wrapper.write("<?xml version='1.0' encoding='UTF-8' ?>\n")
 85+ re_version = re.compile('\"\d\.\d\"')
 86+ for x, raw_data in enumerate(fh):
 87+ raw_data = ''.join(raw_data.strip())
 88+ if x == 0:
 89+ version = re.findall(re_version, raw_data)[0]
 90+ version = version.replace('"', '')
 91+ wrapper.write(raw_data)
 92+ if raw_data.find('</siteinfo>') > -1:
 93+ wrapper.write('</mediawiki>')
 94+ article = wrapper.getvalue()
 95+ elem = cElementTree.XML(article)
 96+ break
 97+ xml_namespace = settings.xml_namespace.replace('0.4', version)
 98+ elem = remove_xml_namespace(elem, xml_namespace)
 99+ siteinfo = elem.find('siteinfo')
 100+ namespaces = siteinfo.find('namespaces')
 101+ namespaces = create_namespace_dict(namespaces)
 102+ return namespaces, xml_namespace
 103+
 104+
 105+def read_input(fh):
 106+ context = cElementTree.iterparse(fh, events=('end',))
 107+ context = iter(context)
 108+
 109+ article = {}
 110+ article['revisions'] = []
 111+ id = False
 112+ namespace = '{http://www.mediawiki.org/xml/export-0.4/}'
 113+
 114+ for event, elem in context:
 115+ if event == 'end' and elem.tag == '%s%s' % (namespace, 'title'):
 116+ article['title'] = elem
 117+ elif event == 'end' and elem.tag == '%s%s' % (namespace, 'revision'):
 118+ article['revisions'].append(elem)
 119+ elif event == 'end' and elem.tag == '%s%s' % (namespace, 'id') and id == False:
 120+ article['id'] = elem
 121+ id = True
 122+ elif event == 'end' and elem.tag == '%s%s' % (namespace, 'page'):
 123+ yield article, 0
 124+ elem.clear()
 125+ article = {}
 126+ article['revisions'] = []
 127+ id = False
 128+ elif event == 'end':
 129+ elem.clear()
 130+
 131+#def read_input(fh):
 132+# buffer = cStringIO.StringIO()
 133+# wrapper = codecs.getwriter('utf-8')(buffer)
 134+# wrapper.write("<?xml version='1.0' encoding='UTF-8' ?>\n")
 135+# start_parsing = False
 136+#
 137+# for raw_data in fh:
 138+# if raw_data == '\n':
 139+# continue
 140+# if start_parsing == False and raw_data.find('<page>') > -1:
 141+# start_parsing = True
 142+# if start_parsing:
 143+# raw_data = ''.join(raw_data.strip())
 144+# wrapper.write(raw_data)
 145+# if raw_data.find('</page>') > -1:
 146+# article = wrapper.getvalue()
 147+# size = len(article)
 148+# #article.encode('utf-8')
 149+# article = cElementTree.XML(article)
 150+# yield article, size
 151+# '''
 152+# #This looks counter intuitive but Python continues with this
 153+# call after it has finished the yield statement
 154+# '''
 155+# buffer = cStringIO.StringIO()
 156+# wrapper = codecs.getwriter('utf-8')(buffer)
 157+# wrapper.write("<?xml version='1.0' encoding='UTF-8' ?>\n")
 158+# fh.close()
 159+
 160+
 161+def debug():
 162+ #fh = codecs.open('c:\\wikimedia\\en\\wiki\dewiki-latest-stub-meta-history.xml', 'r', 'utf-8')
 163+ filename = 'c:\\wikimedia\\en\\wiki\\enwiki-latest-stub-meta-history10.xml.gz'
 164+ fh = gzip.GzipFile(filename, 'rb')
 165+
 166+ for raw_data in fh:
 167+ print raw_data
 168+
 169+
 170+ fh.close()
 171+
 172+
 173+if __name__ == '__main__':
 174+ debug()
Property changes on: trunk/tools/editor_trends/code-snippets/wikitree/parser.py
___________________________________________________________________
Added: svn:eol-style
1175 + native
Index: trunk/tools/editor_trends/code-snippets/wikitree/__init__.py
Property changes on: trunk/tools/editor_trends/code-snippets/wikitree/__init__.py
___________________________________________________________________
Added: svn:eol-style
2176 + native
Property changes on: trunk/tools/editor_trends/code-snippets/wikitree
___________________________________________________________________
Added: svn:ignore
3177 + wikistats
zips
notes.txt
*.pyc
*.xml
*.db
*.bin
*.zip
*.csv
datasets
errors
.settings
.project
.pydevproject
wiki.cfg
fabric.py
fabfile.py
deployment
data
Index: trunk/tools/editor_trends/code-snippets/match_talkpage_article.py
@@ -0,0 +1,72 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-01-07'
 19+__version__ = '0.1'
 20+
 21+import sys
 22+import os
 23+if '..' not in sys.path:
 24+ sys.path.append('..')
 25+
 26+from classes import settings
 27+settings = settings.Settings()
 28+from etl import extracter
 29+from utils import file_utils
 30+import wikitree
 31+
 32+try:
 33+ import psyco
 34+ psyco.full()
 35+except ImportError:
 36+ pass
 37+
 38+class Article:
 39+ def __init__(self, title, id, talk_id=None):
 40+ self.title = title
 41+ self.id = id
 42+ self.talk_id = talk_id
 43+
 44+
 45+def parse_dumpfile(project, language_code, namespaces=['0', '1']):
 46+ articles = {}
 47+ ns = extracter.load_namespace(language_code)
 48+ non_valid_namespaces = extracter.build_namespaces_locale(ns, namespaces)
 49+
 50+
 51+ location = os.path.join(settings.input_location, language_code, project)
 52+ fh = file_utils.create_txt_filehandle(location,
 53+ '%s%s-latest-stub-meta-history.xml' % (language_code, project),
 54+ 'r', 'utf-8')
 55+
 56+ for page, article_size in wikitree.parser.read_input(fh):
 57+ title = page.find('title')
 58+ if extracter.verify_article_belongs_namespace(title, non_valid_namespaces):
 59+ article_id = page.find('id').text
 60+ title = title.text
 61+ if title.startswith(ns['1'].get('canonical')):
 62+ namespace = 'Talk'
 63+ article = articles.get(article_id, Article(None, None, article_id))
 64+ article.talk_id = article_id
 65+ else:
 66+ namespace = 'Main'
 67+ article = articles.get(article_id, Article(title, article_id))
 68+ articles[article_id] = article
 69+
 70+ file_utils.store_object(articles, settings.binary_location, 'talk2article.bin')
 71+
 72+if __name__ == '__main__':
 73+ parse_dumpfile('wiki', 'en')
Property changes on: trunk/tools/editor_trends/code-snippets/match_talkpage_article.py
___________________________________________________________________
Added: svn:eol-style
174 + native
Index: trunk/tools/editor_trends/code-snippets/shaper.py
@@ -0,0 +1,72 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+
 17+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 18+__email__ = 'dvanliere at gmail dot com'
 19+__date__ = '2010-11-24'
 20+__version__ = '0.1'
 21+
 22+import datetime
 23+import math
 24+
 25+
 26+def add_datatype(datatype=0.0):
 27+ if datatype == 'dict':
 28+ d = dict()
 29+ elif datatype == 'list':
 30+ d = list()
 31+ elif datatype == 'set':
 32+ d = set()
 33+ else:
 34+ d = 0.0
 35+ return d
 36+
 37+
 38+def create_datacontainer(first_year, final_year, datatype='dict'):
 39+ '''
 40+ This function initializes an empty dictionary with as key the year (starting
 41+ 2001 and running through) and as value @datatype, in most cases this will
 42+ be zero so the dictionary will act as a running tally for a variable but
 43+ @datatype can also a list, [], or a dictionary, {}, or a set, set().
 44+ '''
 45+ data = {}
 46+ for x in xrange(first_year, final_year):
 47+ data[str(x)] = add_datatype(datatype)
 48+ return data
 49+
 50+
 51+def add_windows_to_datacontainer(datacontainer, windows):
 52+ for dc in datacontainer:
 53+ for w in windows:
 54+ datacontainer[dc][w] = add_datatype()
 55+
 56+ return datacontainer
 57+
 58+
 59+def add_months_to_datacontainer(datacontainer, datatype):
 60+ for dc in datacontainer:
 61+ datacontainer[dc] = {}
 62+ for x in xrange(1, 13):
 63+ datacontainer[dc][x] = add_datatype(datatype)
 64+
 65+ return datacontainer
 66+
 67+
 68+def add_years_to_datacontainer(first_year, final_year, datacontainer, datatype):
 69+ for dc in datacontainer:
 70+ datacontainer[dc] = {}
 71+ for x in range(first_year, final_year):
 72+ datacontainer[dc][x] = datatype
 73+ return datacontainer
Property changes on: trunk/tools/editor_trends/code-snippets/shaper.py
___________________________________________________________________
Added: svn:eol-style
174 + native