r85459 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r85458 | r85459 | r85460 >
Date:	19:22, 5 April 2011
Author:	diederik
Status:	deferred
Tags:
Comment:	Cleanup, removing old code. Reorganization.
Modified paths:	/trunk/tools/editor_trends/analyses/adhoc/match_talkpage_article.py (deleted) (history) /trunk/tools/editor_trends/classes/analytics.py (modified) (history) /trunk/tools/editor_trends/classes/bots.py (modified) (history) /trunk/tools/editor_trends/classes/runtime_settings.py (modified) (history) /trunk/tools/editor_trends/code-snippets/match_talkpage_article.py (added) (history) /trunk/tools/editor_trends/code-snippets/shaper.py (added) (history) /trunk/tools/editor_trends/code-snippets/wikitree (added) (history) /trunk/tools/editor_trends/code-snippets/wikitree/parser.py (replaced) (history) /trunk/tools/editor_trends/database/cache.py (modified) (history) /trunk/tools/editor_trends/etl/enricher.py (modified) (history) /trunk/tools/editor_trends/etl/shaper.py (deleted) (history) /trunk/tools/editor_trends/etl/transformer.py (modified) (history) /trunk/tools/editor_trends/utils/data_converter.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/analyses/adhoc/match_talkpage_article.py
—	—	@@ -1,72 +0,0 @@
2		~~-#!/usr/bin/python~~
3		~~-# -- coding: utf-8 --~~
4		~~-'''~~
5		~~-Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)~~
6		~~-This program is free software; you can redistribute it and/or~~
7		~~-modify it under the terms of the GNU General Public License version 2~~
8		~~-as published by the Free Software Foundation.~~
9		~~-This program is distributed in the hope that it will be useful,~~
10		~~-but WITHOUT ANY WARRANTY; without even the implied warranty of~~
11		~~-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.~~
12		~~-See the GNU General Public License for more details, at~~
13		~~-http://www.fsf.org/licenses/gpl.html~~
14		~~-'''~~
15		-
16		~~-__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])~~
17		~~-__email__ = 'dvanliere at gmail dot com'~~
18		~~-__date__ = '2011-01-07'~~
19		~~-__version__ = '0.1'~~
20		-
21		~~-import sys~~
22		~~-import os~~
23		~~-if '..' not in sys.path:~~
24		~~- sys.path.append('..')~~
25		-
26		~~-from classes import settings~~
27		~~-settings = settings.Settings()~~
28		~~-from etl import extracter~~
29		~~-from utils import file_utils~~
30		~~-import wikitree~~
31		-
32		~~-try:~~
33		~~- import psyco~~
34		~~- psyco.full()~~
35		~~-except ImportError:~~
36		~~- pass~~
37		-
38		~~-class Article:~~
39		~~- def __init__(self, title, id, talk_id=None):~~
40		~~- self.title = title~~
41		~~- self.id = id~~
42		~~- self.talk_id = talk_id~~
43		-
44		-
45		~~-def parse_dumpfile(project, language_code, namespaces=['0', '1']):~~
46		~~- articles = {}~~
47		~~- ns = extracter.load_namespace(language_code)~~
48		~~- non_valid_namespaces = extracter.build_namespaces_locale(ns, namespaces)~~
49		-
50		-
51		~~- location = os.path.join(settings.input_location, language_code, project)~~
52		~~- fh = file_utils.create_txt_filehandle(location,~~
53		~~- '%s%s-latest-stub-meta-history.xml' % (language_code, project),~~
54		~~- 'r', 'utf-8')~~
55		-
56		~~- for page, article_size in wikitree.parser.read_input(fh):~~
57		~~- title = page.find('title')~~
58		~~- if extracter.verify_article_belongs_namespace(title, non_valid_namespaces):~~
59		~~- article_id = page.find('id').text~~
60		~~- title = title.text~~
61		~~- if title.startswith(ns['1'].get('canonical')):~~
62		~~- namespace = 'Talk'~~
63		~~- article = articles.get(article_id, Article(None, None, article_id))~~
64		~~- article.talk_id = article_id~~
65		~~- else:~~
66		~~- namespace = 'Main'~~
67		~~- article = articles.get(article_id, Article(title, article_id))~~
68		~~- articles[article_id] = article~~
69		-
70		~~- file_utils.store_object(articles, settings.binary_location, 'talk2article.bin')~~
71		-
72		~~-if __name__ == '__main__':~~
73		~~- parse_dumpfile('wiki', 'en')~~
Index: trunk/tools/editor_trends/etl/shaper.py
—	—	@@ -1,72 +0,0 @@
2		~~-#!/usr/bin/python~~
3		~~-# -- coding: utf-8 --~~
4		~~-'''~~
5		~~-Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)~~
6		~~-This program is free software; you can redistribute it and/or~~
7		~~-modify it under the terms of the GNU General Public License version 2~~
8		~~-as published by the Free Software Foundation.~~
9		~~-This program is distributed in the hope that it will be useful,~~
10		~~-but WITHOUT ANY WARRANTY; without even the implied warranty of~~
11		~~-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.~~
12		~~-See the GNU General Public License for more details, at~~
13		~~-http://www.fsf.org/licenses/gpl.html~~
14		~~-'''~~
15		-
16		-
17		~~-__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])~~
18		~~-__email__ = 'dvanliere at gmail dot com'~~
19		~~-__date__ = '2010-11-24'~~
20		~~-__version__ = '0.1'~~
21		-
22		~~-import datetime~~
23		~~-import math~~
24		-
25		-
26		~~-def add_datatype(datatype=0.0):~~
27		~~- if datatype == 'dict':~~
28		~~- d = dict()~~
29		~~- elif datatype == 'list':~~
30		~~- d = list()~~
31		~~- elif datatype == 'set':~~
32		~~- d = set()~~
33		~~- else:~~
34		~~- d = 0.0~~
35		~~- return d~~
36		-
37		-
38		~~-def create_datacontainer(first_year, final_year, datatype='dict'):~~
39		~~- '''~~
40		~~- This function initializes an empty dictionary with as key the year (starting~~
41		~~- 2001 and running through) and as value @datatype, in most cases this will~~
42		~~- be zero so the dictionary will act as a running tally for a variable but~~
43		~~- @datatype can also a list, [], or a dictionary, {}, or a set, set().~~
44		~~- '''~~
45		~~- data = {}~~
46		~~- for x in xrange(first_year, final_year):~~
47		~~- data[str(x)] = add_datatype(datatype)~~
48		~~- return data~~
49		-
50		-
51		~~-def add_windows_to_datacontainer(datacontainer, windows):~~
52		~~- for dc in datacontainer:~~
53		~~- for w in windows:~~
54		~~- datacontainer[dc][w] = add_datatype()~~
55		-
56		~~- return datacontainer~~
57		-
58		-
59		~~-def add_months_to_datacontainer(datacontainer, datatype):~~
60		~~- for dc in datacontainer:~~
61		~~- datacontainer[dc] = {}~~
62		~~- for x in xrange(1, 13):~~
63		~~- datacontainer[dc][x] = add_datatype(datatype)~~
64		-
65		~~- return datacontainer~~
66		-
67		-
68		~~-def add_years_to_datacontainer(first_year, final_year, datacontainer, datatype):~~
69		~~- for dc in datacontainer:~~
70		~~- datacontainer[dc] = {}~~
71		~~- for x in range(first_year, final_year):~~
72		~~- datacontainer[dc][x] = datatype~~
73		~~- return datacontainer~~
Index: trunk/tools/editor_trends/etl/enricher.py
—	—	@@ -86,13 +86,14 @@
87	87	}
88	88
89	89	class Statistics:
90		~~- def __init__(self):~~
	90	+ def __init__(self, process_id):
	91	+ self.process_id = process_id
91	92	self.count_articles = 0
92	93	self.count_revisions = 0
93	94
94	95	def summary(self):
95		~~- print 'Number of articles: %s' % self.count_articles~~
96		~~- print 'Number of revisions: %s' % self.count_revisions~~
	96	+ print 'Worker %s: Number of articles: %s' % (self.process_id, self.count_articles)
	97	+ print 'Worker %s: Number of revisions: %s' % (self.process_id, self.count_revisions)
97	98
98	99	class Dummy:
99	100	pass
—	—	@@ -108,20 +109,20 @@
109	110
110	111
111	112	class Buffer:
112		~~- def __init__(self, storage, processs_id, rts=None, filehandles=None, locks=None):~~
	113	+ def __init__(self, storage, process_id, rts=None, filehandles=None, locks=None):
113	114	assert storage == 'cassandra' or storage == 'mongo' or storage == 'csv', \
114	115	'Valid storage options are cassandra and mongo.'
115	116	self.storage = storage
116	117	self.revisions = {}
117	118	self.comments = {}
118	119	self.titles = {}
119		~~- self.processs_id = processs_id~~
	120	+ self.process_id = process_id
120	121	self.keyspace_name = 'enwiki'
121	122	self.keys = ['revision_id', 'article_id', 'id', 'username', 'namespace',
122	123	'title', 'timestamp', 'hash', 'revert', 'bot', 'cur_size',
123	124	'delta']
124	125	self.setup_storage()
125		~~- self.stats = Statistics()~~
	126	+ self.stats = Statistics(self.process_id)
126	127	if storage == 'csv' and locks != None:
127	128	self.rts = rts
128	129	self.lock1 = locks[0] #lock for generic data
—	—	@@ -579,8 +580,8 @@
580	581	5: 'Wikipedia Talk',
581	582	1: 'Talk',
582	583	2: 'User',
583		~~- 4: 'Wikipedia'~~
584		~~- }~~
	584	+ 4: 'Wikipedia'}
	585	+
585	586	title = parse_title(article['title'])
586	587	namespaces = article['namespaces']
587	588	namespace = determine_namespace(title, namespaces, include_ns, EXCLUDE_NAMESPACE)
—	—	@@ -641,30 +642,36 @@
642	643	article = {}
643	644	article['revisions'] = []
644	645	id = False
645		~~- for event, elem in context:~~
646		~~- if event == 'end' and elem.tag.endswith('siteinfo'):~~
647		~~- xml_namespace = determine_xml_namespace(elem)~~
648		~~- namespaces = create_namespace_dict(elem, xml_namespace)~~
649		~~- article['namespaces'] = namespaces~~
650		~~- elif event == 'end' and elem.tag.endswith('title'):~~
651		~~- article['title'] = elem~~
652		~~- elif event == 'end' and elem.tag.endswith('revision'):~~
653		~~- article['revisions'].append(elem)~~
654		~~- elif event == 'end' and elem.tag.endswith('id') and id == False:~~
655		~~- article['id'] = elem~~
656		~~- id = True~~
657		~~- elif event == 'end' and elem.tag.endswith('page'):~~
658		~~- yield article, xml_namespace~~
659		~~- elem.clear()~~
660		~~- article = {}~~
661		~~- article['revisions'] = []~~
662		~~- article['namespaces'] = namespaces~~
663		~~- id = False~~
664		~~- elif rts.kaggle == True and event == 'end':~~
665		~~- print 'I am cleaning up'~~
666		~~- elem.clear()~~
667	646
	647	+ try:
	648	+ for event, elem in context:
	649	+ if event == 'end' and elem.tag.endswith('siteinfo'):
	650	+ xml_namespace = determine_xml_namespace(elem)
	651	+ namespaces = create_namespace_dict(elem, xml_namespace)
	652	+ article['namespaces'] = namespaces
	653	+ elif event == 'end' and elem.tag.endswith('title'):
	654	+ article['title'] = elem
	655	+ elif event == 'end' and elem.tag.endswith('revision'):
	656	+ article['revisions'].append(elem)
	657	+ elif event == 'end' and elem.tag.endswith('id') and id == False:
	658	+ article['id'] = elem
	659	+ id = True
	660	+ elif event == 'end' and elem.tag.endswith('page'):
	661	+ yield article, xml_namespace
	662	+ elem.clear()
	663	+ article = {}
	664	+ article['revisions'] = []
	665	+ article['namespaces'] = namespaces
	666	+ id = False
	667	+ elif rts.kaggle == True and event == 'end':
	668	+ print 'I am cleaning up'
	669	+ elem.clear()
	670	+ except SyntaxError, error:
	671	+ print 'Encountered invalid XML tag. Error message: %s' % error
	672	+ dump(elem)
	673	+ sys.exit(-1)
668	674
	675	+
669	676	def stream_raw_xml(input_queue, storage, process_id, function, dataset, locks, rts):
670	677	bots = detector.retrieve_bots('en')
671	678	path = os.path.join(rts.location, 'txt')
—	—	@@ -708,7 +715,8 @@
709	716	fh.close()
710	717
711	718	t1 = datetime.datetime.now()
712		~~- print 'Processing of %s took %s' % (filename, (t1 - t0))~~
	719	+ print 'Worker %s: Processing of %s took %s' % (process_id, filename, (t1 - t0))
	720	+ print 'There are %s files left in the queue' % (input_queue.qsize())
713	721	t0 = t1
714	722
715	723	if dataset == 'training':
—	—	@@ -725,7 +733,7 @@
726	734	filename = 'counts_%s.bin' % filename
727	735	file_utils.store_object(counts, location, filename)
728	736
729		~~- print 'Finished parsing bz2 archives'~~
	737	+ print 'Finished parsing Wikipedia dump files.'
730	738
731	739
732	740	def setup(storage, rts=None):
Index: trunk/tools/editor_trends/etl/transformer.py
—	—	@@ -28,9 +28,10 @@
29	29	from database import db
30	30	from utils import file_utils
31	31	from utils import messages
	32	+from utils import data_converter
32	33	from classes import consumers
33		~~-import shaper~~
34	34
	35	+
35	36	try:
36	37	import psyco
37	38	psyco.full()
—	—	@@ -84,7 +85,7 @@
85	86	edit_count = determine_number_edits(edits, first_year, final_year)
86	87
87	88	totals = {}
88		~~- counts = shaper.create_datacontainer(first_year, final_year)~~
	89	+ counts = data_converter.create_datacontainer(first_year, final_year)
89	90	totals = calculate_totals(totals, counts, character_count, 'character_count')
90	91	totals = calculate_totals(totals, counts, revert_count, 'revert_count')
91	92	totals = calculate_totals(totals, counts, article_count, 'article_count')
—	—	@@ -95,12 +96,14 @@
96	97	new_wikipedian = edits[cutoff]['date']
97	98	else:
98	99	new_wikipedian = False
	100	+ cum_edit_count = len(edits)
99	101	first_edit = edits[0]['date']
100	102	final_edit = edits[-1]['date']
101	103
102	104	self.output_db.insert({'editor': self.id,
103	105	'username': username,
104	106	'new_wikipedian': new_wikipedian,
	107	+ 'cum_edit_count': cum_edit_count,
105	108	'final_edit': final_edit,
106	109	'first_edit': first_edit,
107	110	'last_edit_by_year': last_edit_by_year,
—	—	@@ -148,8 +151,8 @@
149	152
150	153
151	154	def determine_number_edits(edits, first_year, final_year):
152		~~- dc = shaper.create_datacontainer(first_year, final_year)~~
153		~~- dc = shaper.add_months_to_datacontainer(dc, 'dict')~~
	155	+ dc = data_converter.create_datacontainer(first_year, final_year)
	156	+ dc = data_converter.add_months_to_datacontainer(dc, 'dict')
154	157	for edit in edits:
155	158	ns = edit['ns']
156	159	year, month = str(edit['date'].year), edit['date'].month
—	—	@@ -161,8 +164,8 @@
162	165
163	166
164	167	def determine_articles_workedon(edits, first_year, final_year):
165		~~- dc = shaper.create_datacontainer(first_year, final_year)~~
166		~~- dc = shaper.add_months_to_datacontainer(dc, 'dict')~~
	168	+ dc = data_converter.create_datacontainer(first_year, final_year)
	169	+ dc = data_converter.add_months_to_datacontainer(dc, 'dict')
167	170	for year in edits:
168	171	for edit in edits[year]:
169	172	month = edit['date'].month
—	—	@@ -179,8 +182,8 @@
180	183
181	184
182	185	def determine_namespaces_workedon(edits, first_year, final_year):
183		~~- dc = shaper.create_datacontainer(first_year, final_year)~~
184		~~- dc = shaper.add_months_to_datacontainer(dc, 'set')~~
	186	+ dc = data_converter.create_datacontainer(first_year, final_year)
	187	+ dc = data_converter.add_months_to_datacontainer(dc, 'set')
185	188	for year in edits:
186	189	for edit in edits[year]:
187	190	month = edit['date'].month
—	—	@@ -194,8 +197,8 @@
195	198
196	199
197	200	def determine_number_reverts(edits, first_year, final_year):
198		~~- dc = shaper.create_datacontainer(first_year, final_year)~~
199		~~- dc = shaper.add_months_to_datacontainer(dc, 'dict')~~
	201	+ dc = data_converter.create_datacontainer(first_year, final_year)
	202	+ dc = data_converter.add_months_to_datacontainer(dc, 'dict')
200	203	for year in edits:
201	204	for edit in edits[year]:
202	205	month = edit['date'].month
—	—	@@ -213,8 +216,8 @@
214	217	This function counts the number of characters added and remove by year
215	218	by month by namespace for a particular editor.
216	219	'''
217		~~- dc = shaper.create_datacontainer(first_year, final_year)~~
218		~~- dc = shaper.add_months_to_datacontainer(dc, 'dict')~~
	220	+ dc = data_converter.create_datacontainer(first_year, final_year)
	221	+ dc = data_converter.add_months_to_datacontainer(dc, 'dict')
219	222	for year in edits:
220	223	for edit in edits[year]:
221	224	month = edit['date'].month
—	—	@@ -240,7 +243,7 @@
241	244
242	245
243	246	def determine_last_edit_by_year(edits, first_year, final_year):
244		~~- dc = shaper.create_datacontainer(first_year, final_year, 0)~~
	247	+ dc = data_converter.create_datacontainer(first_year, final_year, 0)
245	248	for year in edits:
246	249	for edit in edits[year]:
247	250	date = str(edit['date'].year)
—	—	@@ -257,8 +260,8 @@
258	261	This function counts the number of unique articles by year edited by a
259	262	particular editor.
260	263	'''
261		~~- dc = shaper.create_datacontainer(first_year, final_year)~~
262		~~- dc = shaper.add_months_to_datacontainer(dc, 'dict')~~
	264	+ dc = data_converter.create_datacontainer(first_year, final_year)
	265	+ dc = data_converter.add_months_to_datacontainer(dc, 'dict')
263	266	for year in articles_edited:
264	267	for month in articles_edited[year]:
265	268	for ns in articles_edited[year][month]:
Index: trunk/tools/editor_trends/classes/runtime_settings.py
—	—	@@ -115,10 +115,13 @@
116	116	if [True for kw in keywords if kw.find('=') > -1] != []:
117	117	for kw in keywords:
118	118	key, value = kw.split('=')
119		~~- try:~~
120		~~- value = int(value)~~
121		~~- except ValueError:~~
122		~~- pass~~
	119	+ if value.find(';') > -1:
	120	+ value = value.split(';')
	121	+ else:
	122	+ try:
	123	+ value = int(value)
	124	+ except ValueError:
	125	+ pass
123	126	d[key] = value
124	127	return d
125	128
Index: trunk/tools/editor_trends/classes/analytics.py
—	—	@@ -28,7 +28,7 @@
29	29
30	30	class Replicator:
31	31	def __init__(self, plugin, time_unit, cutoff=None, cum_cutoff=None, **kwargs):
32		~~- #this is an ugly hack to prevent a circular import problem~~
	32	+ #TODO this is an ugly hack to prevent a circular import problem
33	33	#this needs a better fix.
34	34	import manage
35	35
Index: trunk/tools/editor_trends/classes/bots.py
—	—	@@ -27,7 +27,7 @@
28	28	from classes import settings
29	29	settings = settings.Settings()
30	30
31		~~-from etl import shaper~~
	31	+from utils import data_converter
32	32	from utils import file_utils
33	33
34	34
—	—	@@ -36,7 +36,7 @@
37	37	def __init__(self, name, **kwargs):
38	38	self.name = name
39	39	self.projects = []
40		~~- self.time = shaper.create_datacontainer(datatype='list')~~
	40	+ self.time = data_converter.create_datacontainer(datatype='list')
41	41	self.verified = True
42	42	for kw in kwargs:
43	43	setattr(self, kw, kwargs[kw])
—	—	@@ -45,7 +45,7 @@
46	46	return self.name
47	47
48	48	def hours_active(self):
49		~~- self.clock = shaper.create_clock()~~
	49	+ self.clock = data_converter.create_clock()
50	50	years = self.time.keys()
51	51	for year in years:
52	52	for obs in self.time[year]:
Index: trunk/tools/editor_trends/utils/data_converter.py
—	—	@@ -19,22 +19,77 @@
20	20	__version__ = '0.1'
21	21
22	22	import datetime
	23	+import datetime
	24	+import math
23	25
	26	+
	27	+def add_datatype(datatype=0.0):
	28	+ if datatype == 'dict':
	29	+ d = dict()
	30	+ elif datatype == 'list':
	31	+ d = list()
	32	+ elif datatype == 'set':
	33	+ d = set()
	34	+ else:
	35	+ d = 0.0
	36	+ return d
	37	+
	38	+
	39	+def create_datacontainer(first_year, final_year, datatype='dict'):
	40	+ '''
	41	+ This function initializes an empty dictionary with as key the year (starting
	42	+ 2001 and running through) and as value @datatype, in most cases this will
	43	+ be zero so the dictionary will act as a running tally for a variable but
	44	+ @datatype can also a list, [], or a dictionary, {}, or a set, set().
	45	+ '''
	46	+ data = {}
	47	+ for x in xrange(first_year, final_year):
	48	+ data[str(x)] = add_datatype(datatype)
	49	+ return data
	50	+
	51	+
	52	+def add_windows_to_datacontainer(datacontainer, windows):
	53	+ for dc in datacontainer:
	54	+ for w in windows:
	55	+ datacontainer[dc][w] = add_datatype()
	56	+
	57	+ return datacontainer
	58	+
	59	+
	60	+def add_months_to_datacontainer(datacontainer, datatype):
	61	+ for dc in datacontainer:
	62	+ datacontainer[dc] = {}
	63	+ for x in xrange(1, 13):
	64	+ datacontainer[dc][x] = add_datatype(datatype)
	65	+
	66	+ return datacontainer
	67	+
	68	+
	69	+def add_years_to_datacontainer(first_year, final_year, datacontainer, datatype):
	70	+ for dc in datacontainer:
	71	+ datacontainer[dc] = {}
	72	+ for x in range(first_year, final_year):
	73	+ datacontainer[dc][x] = datatype
	74	+ return datacontainer
	75	+
	76	+
24	77	def create_windows(var, break_down_first_year=True):
25	78	'''
26	79	This function creates a list of months. If break_down_first_year = True then
27	80	the first year will be split in 3, 6, 9 months as well.
28	81	'''
29		~~- years = (var.max_year - var.min_year) +1~~
	82	+ years = (var.max_year - var.min_year) + 1
30	83	windows = [y * 12 for y in xrange(1, years)]
31	84	if break_down_first_year:
32	85	windows = [3, 6, 9] + windows
33	86	return windows
34	87
	88	+
35	89	def convert_seconds_to_date(secs):
36	90	#return time.gmtime(secs)
37	91	return datetime.datetime.fromtimestamp(secs)
38	92
	93	+
39	94	def convert_dataset_to_lists(ds, caller):
40	95	assert ds.format == 'long' or ds.format == 'wide', 'Format should either be long or wide.'
41	96	data = []
—	—	@@ -88,15 +143,6 @@
89	144	return headers
90	145
91	146
92		~~-#def make_data_rectangular(data, all_keys):~~
93		~~-# for i, d in enumerate(data):~~
94		~~-# for key in all_keys:~~
95		~~-# if key not in d:~~
96		~~-# d[key] = 0~~
97		~~-# data[i] = d~~
98		~~-# return data~~
99		-
100		-
101	147	def get_all_props(var):
102	148	all_keys = []
103	149	for obs in var.obs.values():
Index: trunk/tools/editor_trends/database/cache.py
—	—	@@ -31,7 +31,7 @@
32	32
33	33	import db
34	34	from utils import file_utils
35		~~-from etl import shaper~~
	35	+from utils import data_converter
36	36
37	37	class EditorCache(object):
38	38	def __init__(self, collection):
—	—	@@ -65,7 +65,7 @@
66	66	if key not in self.editors:
67	67	self.editors[key] = {}
68	68	self.editors[key]['obs'] = 0
69		~~- self.editors[key]['edits'] = shaper.create_datacontainer(2001, self.final_year, 'list')~~
	69	+ self.editors[key]['edits'] = data_converter.create_datacontainer(2001, self.final_year, 'list')
70	70	self.editors[key]['username'] = value.pop('username')
71	71	else:
72	72	value.pop('username')
Index: trunk/tools/editor_trends/code-snippets/wikitree/parser.py
—	—	@@ -0,0 +1,173 @@
	2	+#!/usr/bin/python
	3	+# -- coding: utf-8 --
	4	+'''
	5	+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
	6	+This program is free software; you can redistribute it and/or
	7	+modify it under the terms of the GNU General Public License version 2
	8	+as published by the Free Software Foundation.
	9	+This program is distributed in the hope that it will be useful,
	10	+but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
	12	+See the GNU General Public License for more details, at
	13	+http://www.fsf.org/licenses/gpl.html
	14	+'''
	15	+
	16	+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
	17	+__email__ = 'dvanliere at gmail dot com'
	18	+__date__ = '2010-10-21'
	19	+__version__ = '0.1'
	20	+
	21	+import re
	22	+import cStringIO
	23	+import codecs
	24	+import xml.etree.cElementTree as cElementTree
	25	+from lxml import etree
	26	+import sys
	27	+import gzip
	28	+
	29	+if '..' not in sys.path:
	30	+ sys.path.append('..')
	31	+
	32	+from classes import settings
	33	+settings = settings.Settings()
	34	+from utils import file_utils
	35	+
	36	+def convert_html_entities(text):
	37	+ return file_utils.unescape(text)
	38	+
	39	+
	40	+def extract_text(elem, **kwargs):
	41	+ if elem != None and elem.text != None:
	42	+ return u'%s' % elem.text
	43	+ else:
	44	+ return None
	45	+
	46	+
	47	+def remove_xml_namespace(element, xml_namespace):
	48	+ '''Remove namespace from the XML document.'''
	49	+ ns = u'{%s}' % xml_namespace
	50	+ nsl = len(ns)
	51	+ for elem in element.getiterator():
	52	+ if elem.tag.startswith(ns):
	53	+ elem.tag = elem.tag[nsl:]
	54	+ return element
	55	+
	56	+
	57	+def determine_element(line):
	58	+ pos = line.find(' ')
	59	+ elem = line[:pos] + '>'
	60	+
	61	+
	62	+def create_namespace_dict(namespaces):
	63	+ d = {}
	64	+ print 'Constructing namespace dictionary'
	65	+ for ns in namespaces:
	66	+ key = ns.get('key')
	67	+ d[key] = extract_text(ns)
	68	+ text = ns.text if ns.text != None else ''
	69	+ try:
	70	+ print key, text.encode('utf-8')
	71	+ except UnicodeEncodeError:
	72	+ print key
	73	+ return d
	74	+
	75	+
	76	+def extract_meta_information(fh):
	77	+ '''
	78	+ The purpose of this function is:
	79	+ 1) Determine the version of the mediawiki dump file. Default is 0.4.
	80	+ 2) Create a dictionary with the namespaces
	81	+ '''
	82	+ buffer = cStringIO.StringIO()
	83	+ wrapper = codecs.getwriter('utf-8')(buffer)
	84	+ wrapper.write("<?xml version='1.0' encoding='UTF-8' ?>\n")
	85	+ re_version = re.compile('\"\d\.\d\"')
	86	+ for x, raw_data in enumerate(fh):
	87	+ raw_data = ''.join(raw_data.strip())
	88	+ if x == 0:
	89	+ version = re.findall(re_version, raw_data)[0]
	90	+ version = version.replace('"', '')
	91	+ wrapper.write(raw_data)
	92	+ if raw_data.find('</siteinfo>') > -1:
	93	+ wrapper.write('</mediawiki>')
	94	+ article = wrapper.getvalue()
	95	+ elem = cElementTree.XML(article)
	96	+ break
	97	+ xml_namespace = settings.xml_namespace.replace('0.4', version)
	98	+ elem = remove_xml_namespace(elem, xml_namespace)
	99	+ siteinfo = elem.find('siteinfo')
	100	+ namespaces = siteinfo.find('namespaces')
	101	+ namespaces = create_namespace_dict(namespaces)
	102	+ return namespaces, xml_namespace
	103	+
	104	+
	105	+def read_input(fh):
	106	+ context = cElementTree.iterparse(fh, events=('end',))
	107	+ context = iter(context)
	108	+
	109	+ article = {}
	110	+ article['revisions'] = []
	111	+ id = False
	112	+ namespace = '{http://www.mediawiki.org/xml/export-0.4/}'
	113	+
	114	+ for event, elem in context:
	115	+ if event == 'end' and elem.tag == '%s%s' % (namespace, 'title'):
	116	+ article['title'] = elem
	117	+ elif event == 'end' and elem.tag == '%s%s' % (namespace, 'revision'):
	118	+ article['revisions'].append(elem)
	119	+ elif event == 'end' and elem.tag == '%s%s' % (namespace, 'id') and id == False:
	120	+ article['id'] = elem
	121	+ id = True
	122	+ elif event == 'end' and elem.tag == '%s%s' % (namespace, 'page'):
	123	+ yield article, 0
	124	+ elem.clear()
	125	+ article = {}
	126	+ article['revisions'] = []
	127	+ id = False
	128	+ elif event == 'end':
	129	+ elem.clear()
	130	+
	131	+#def read_input(fh):
	132	+# buffer = cStringIO.StringIO()
	133	+# wrapper = codecs.getwriter('utf-8')(buffer)
	134	+# wrapper.write("<?xml version='1.0' encoding='UTF-8' ?>\n")
	135	+# start_parsing = False
	136	+#
	137	+# for raw_data in fh:
	138	+# if raw_data == '\n':
	139	+# continue
	140	+# if start_parsing == False and raw_data.find('<page>') > -1:
	141	+# start_parsing = True
	142	+# if start_parsing:
	143	+# raw_data = ''.join(raw_data.strip())
	144	+# wrapper.write(raw_data)
	145	+# if raw_data.find('</page>') > -1:
	146	+# article = wrapper.getvalue()
	147	+# size = len(article)
	148	+# #article.encode('utf-8')
	149	+# article = cElementTree.XML(article)
	150	+# yield article, size
	151	+# '''
	152	+# #This looks counter intuitive but Python continues with this
	153	+# call after it has finished the yield statement
	154	+# '''
	155	+# buffer = cStringIO.StringIO()
	156	+# wrapper = codecs.getwriter('utf-8')(buffer)
	157	+# wrapper.write("<?xml version='1.0' encoding='UTF-8' ?>\n")
	158	+# fh.close()
	159	+
	160	+
	161	+def debug():
	162	+ #fh = codecs.open('c:\\wikimedia\\en\\wiki\dewiki-latest-stub-meta-history.xml', 'r', 'utf-8')
	163	+ filename = 'c:\\wikimedia\\en\\wiki\\enwiki-latest-stub-meta-history10.xml.gz'
	164	+ fh = gzip.GzipFile(filename, 'rb')
	165	+
	166	+ for raw_data in fh:
	167	+ print raw_data
	168	+
	169	+
	170	+ fh.close()
	171	+
	172	+
	173	+if __name__ == '__main__':
	174	+ debug()
Property changes on: trunk/tools/editor_trends/code-snippets/wikitree/parser.py
___________________________________________________________________
Added: svn:eol-style
1	175	+ native
Index: trunk/tools/editor_trends/code-snippets/wikitree/__init__.py
Property changes on: trunk/tools/editor_trends/code-snippets/wikitree/__init__.py
___________________________________________________________________
Added: svn:eol-style
2	176	+ native
Property changes on: trunk/tools/editor_trends/code-snippets/wikitree
___________________________________________________________________
Added: svn:ignore
3	177	+ wikistats
zips
notes.txt
*.pyc
*.xml
*.db
*.bin
*.zip
*.csv
datasets
errors
.settings
.project
.pydevproject
wiki.cfg
fabric.py
fabfile.py
deployment
data
Index: trunk/tools/editor_trends/code-snippets/match_talkpage_article.py
—	—	@@ -0,0 +1,72 @@
	2	+#!/usr/bin/python
	3	+# -- coding: utf-8 --
	4	+'''
	5	+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
	6	+This program is free software; you can redistribute it and/or
	7	+modify it under the terms of the GNU General Public License version 2
	8	+as published by the Free Software Foundation.
	9	+This program is distributed in the hope that it will be useful,
	10	+but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
	12	+See the GNU General Public License for more details, at
	13	+http://www.fsf.org/licenses/gpl.html
	14	+'''
	15	+
	16	+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
	17	+__email__ = 'dvanliere at gmail dot com'
	18	+__date__ = '2011-01-07'
	19	+__version__ = '0.1'
	20	+
	21	+import sys
	22	+import os
	23	+if '..' not in sys.path:
	24	+ sys.path.append('..')
	25	+
	26	+from classes import settings
	27	+settings = settings.Settings()
	28	+from etl import extracter
	29	+from utils import file_utils
	30	+import wikitree
	31	+
	32	+try:
	33	+ import psyco
	34	+ psyco.full()
	35	+except ImportError:
	36	+ pass
	37	+
	38	+class Article:
	39	+ def __init__(self, title, id, talk_id=None):
	40	+ self.title = title
	41	+ self.id = id
	42	+ self.talk_id = talk_id
	43	+
	44	+
	45	+def parse_dumpfile(project, language_code, namespaces=['0', '1']):
	46	+ articles = {}
	47	+ ns = extracter.load_namespace(language_code)
	48	+ non_valid_namespaces = extracter.build_namespaces_locale(ns, namespaces)
	49	+
	50	+
	51	+ location = os.path.join(settings.input_location, language_code, project)
	52	+ fh = file_utils.create_txt_filehandle(location,
	53	+ '%s%s-latest-stub-meta-history.xml' % (language_code, project),
	54	+ 'r', 'utf-8')
	55	+
	56	+ for page, article_size in wikitree.parser.read_input(fh):
	57	+ title = page.find('title')
	58	+ if extracter.verify_article_belongs_namespace(title, non_valid_namespaces):
	59	+ article_id = page.find('id').text
	60	+ title = title.text
	61	+ if title.startswith(ns['1'].get('canonical')):
	62	+ namespace = 'Talk'
	63	+ article = articles.get(article_id, Article(None, None, article_id))
	64	+ article.talk_id = article_id
	65	+ else:
	66	+ namespace = 'Main'
	67	+ article = articles.get(article_id, Article(title, article_id))
	68	+ articles[article_id] = article
	69	+
	70	+ file_utils.store_object(articles, settings.binary_location, 'talk2article.bin')
	71	+
	72	+if __name__ == '__main__':
	73	+ parse_dumpfile('wiki', 'en')
Property changes on: trunk/tools/editor_trends/code-snippets/match_talkpage_article.py
___________________________________________________________________
Added: svn:eol-style
1	74	+ native
Index: trunk/tools/editor_trends/code-snippets/shaper.py
—	—	@@ -0,0 +1,72 @@
	2	+#!/usr/bin/python
	3	+# -- coding: utf-8 --
	4	+'''
	5	+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
	6	+This program is free software; you can redistribute it and/or
	7	+modify it under the terms of the GNU General Public License version 2
	8	+as published by the Free Software Foundation.
	9	+This program is distributed in the hope that it will be useful,
	10	+but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
	12	+See the GNU General Public License for more details, at
	13	+http://www.fsf.org/licenses/gpl.html
	14	+'''
	15	+
	16	+
	17	+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
	18	+__email__ = 'dvanliere at gmail dot com'
	19	+__date__ = '2010-11-24'
	20	+__version__ = '0.1'
	21	+
	22	+import datetime
	23	+import math
	24	+
	25	+
	26	+def add_datatype(datatype=0.0):
	27	+ if datatype == 'dict':
	28	+ d = dict()
	29	+ elif datatype == 'list':
	30	+ d = list()
	31	+ elif datatype == 'set':
	32	+ d = set()
	33	+ else:
	34	+ d = 0.0
	35	+ return d
	36	+
	37	+
	38	+def create_datacontainer(first_year, final_year, datatype='dict'):
	39	+ '''
	40	+ This function initializes an empty dictionary with as key the year (starting
	41	+ 2001 and running through) and as value @datatype, in most cases this will
	42	+ be zero so the dictionary will act as a running tally for a variable but
	43	+ @datatype can also a list, [], or a dictionary, {}, or a set, set().
	44	+ '''
	45	+ data = {}
	46	+ for x in xrange(first_year, final_year):
	47	+ data[str(x)] = add_datatype(datatype)
	48	+ return data
	49	+
	50	+
	51	+def add_windows_to_datacontainer(datacontainer, windows):
	52	+ for dc in datacontainer:
	53	+ for w in windows:
	54	+ datacontainer[dc][w] = add_datatype()
	55	+
	56	+ return datacontainer
	57	+
	58	+
	59	+def add_months_to_datacontainer(datacontainer, datatype):
	60	+ for dc in datacontainer:
	61	+ datacontainer[dc] = {}
	62	+ for x in xrange(1, 13):
	63	+ datacontainer[dc][x] = add_datatype(datatype)
	64	+
	65	+ return datacontainer
	66	+
	67	+
	68	+def add_years_to_datacontainer(first_year, final_year, datacontainer, datatype):
	69	+ for dc in datacontainer:
	70	+ datacontainer[dc] = {}
	71	+ for x in range(first_year, final_year):
	72	+ datacontainer[dc][x] = datatype
	73	+ return datacontainer
Property changes on: trunk/tools/editor_trends/code-snippets/shaper.py
___________________________________________________________________
Added: svn:eol-style
1	74	+ native