r78582 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r78581 | r78582 | r78583 >
Date: 22:10, 18 December 2010
Author: diederik
Status: deferred
Tags:
Comment:
Thanks to Nimish for giving me a number of suggestions to reduce the processing time. This commit is primarily focused on improving efficiency, for example by merging the split and extract phases into a single step.
Modified paths:
  • /trunk/tools/editor_trends/analyses/file_size_reduction.py (modified) (history)
  • /trunk/tools/editor_trends/bots/bots.py (modified) (history)
  • /trunk/tools/editor_trends/configuration.py (modified) (history)
  • /trunk/tools/editor_trends/etl/chunker.py (modified) (history)
  • /trunk/tools/editor_trends/etl/exporter.py (modified) (history)
  • /trunk/tools/editor_trends/etl/extract.py (modified) (history)
  • /trunk/tools/editor_trends/etl/extracter.py (added) (history)
  • /trunk/tools/editor_trends/etl/models.py (modified) (history)
  • /trunk/tools/editor_trends/manage.py (modified) (history)
  • /trunk/tools/editor_trends/wikitree/parser.py (added) (history)
  • /trunk/tools/editor_trends/wikitree/xml.py (deleted) (history)

Diff

Index: trunk/tools/editor_trends/analyses/file_size_reduction.py
@@ -69,6 +69,7 @@
 def calculate_filesize_overhead(location, filename):
     counter = None
     ds = DumpStatistics()
+    filename = os.path.join(location, filename)
     context = cElementTree.iterparse(filename, events=('start', 'end'))
     context = iter(context)
     event, root = context.next() #get the root element of the XML doc
@@ -80,20 +81,20 @@
             root.clear() # when done parsing a section clear the tree to release memory
     except SyntaxError:
         pass
-    utils.store_object(ds, settings.binary_location, 'ds')
+    utils.store_object(ds, settings.binary_location, 'ds')
     xml_size = ds.total_size_xml()
     text_size = ds.total_size_text()
     print text_size, xml_size
     print ds.tags
-
 
+
 def output_dumpstatistics():
     ds = utils.load_object(settings.binary_location, 'ds.bin')
-
+
     for key in ds.tags:
         print '%s\t%s' % (key, ds.tags[key])
-
+
 if __name__ == '__main__':
+    input = os.path.join(settings.input_location, 'en', 'wiki')
+    calculate_filesize_overhead(input, 'enwiki-latest-stub-meta-history.xml')
     output_dumpstatistics()
-    #calculate_filesize_overhead(settings.input_location, settings.input_filename)
-
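
The change above wires calculate_filesize_overhead to a concrete dump file; the function itself relies on cElementTree.iterparse plus root.clear() to keep memory flat while walking a multi-gigabyte XML dump. A minimal, self-contained sketch of that pattern (the filename and tag name are placeholders; this targets the Python 2 used by the repository):

import xml.etree.cElementTree as cElementTree

def iterate_elements(filename, tag='page'):
    # Stream the document: iterparse builds elements incrementally and
    # emits (event, element) pairs instead of loading the whole tree.
    context = cElementTree.iterparse(filename, events=('start', 'end'))
    context = iter(context)
    event, root = context.next()  # the first event yields the root element
    for event, elem in context:
        if event == 'end' and elem.tag.endswith(tag):
            yield elem
            root.clear()  # drop already-processed children to release memory
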
Index: trunk/tools/editor_trends/manage.py
@@ -182,9 +182,10 @@
     filename = kwargs.get('filename')
     extension = kwargs.get('extension')
     location = kwargs.get('location')
+    full_project = kwargs.get('full_project')
     pbar = get_value(args, 'progress')
     domain = settings.wp_dump_location
-    path = '/%s/latest/' % project
+    path = '/%s/latest/' % full_project
     extension = utils.determine_file_extension(filename)
     filemode = utils.determine_file_mode(extension)
     dump_downloader.download_wiki_file(domain, path, filename, location, filemode, pbar)
@@ -322,7 +323,7 @@
         ignore = ignore + ',extract'
 
     functions = ordered_dict.OrderedDict(((dump_downloader_launcher, 'download'),
-                                          (chunker_launcher, 'split'),
+                                          #(chunker_launcher, 'split'),
                                           (extract_launcher, 'extract'),
                                           (sort_launcher, 'sort'),
                                           (store_launcher, 'store'),
@@ -407,10 +408,9 @@
     parser_download = subparsers.add_parser('download', help='The download sub command allows you to download a Wikipedia dump file.')
     parser_download.set_defaults(func=dump_downloader_launcher)
 
-    parser_split = subparsers.add_parser('split', help='The split sub command splits the downloaded file in smaller chunks to parallelize extracting information.')
+    #parser_split = subparsers.add_parser('split', help='The split sub command splits the downloaded file in smaller chunks to parallelize extracting information.')
+    #parser_split.set_defaults(func=chunker_launcher)
 
-    parser_split.set_defaults(func=chunker_launcher)
-
     parser_create = subparsers.add_parser('extract', help='The store sub command parsers the XML chunk files, extracts the information and stores it in a MongoDB.')
     parser_create.set_defaults(func=extract_launcher)
Index: trunk/tools/editor_trends/wikitree/xml.py
@@ -1,55 +0,0 @@
-#!/usr/bin/python
-# -*- coding: utf-8 -*-
-'''
-Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License version 2
-as published by the Free Software Foundation.
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-See the GNU General Public License for more details, at
-http://www.fsf.org/licenses/gpl.html
-'''
-
-__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
-__author__email = 'dvanliere at gmail dot com'
-__date__ = '2010-10-21'
-__version__ = '0.1'
-
-from utils import utils
-import configuration
-settings = configuration.Settings()
-
-
-def convert_html_entities(text):
-    return utils.unescape(text)
-
-
-def extract_text(elem, **kwargs):
-    if elem != None and elem.text != None:
-        #try:
-        return elem.text #.decode(settings.encoding)
-        #except UnicodeDecodeError:
-        #return None
-
-
-def retrieve_xml_node(xml_nodes, name):
-    for xml_node in xml_nodes:
-        if xml_node.tag == name:
-            return xml_node
-    return None #maybe this should be replaced with an NotFoundError
-
-
-def read_input(file):
-    lines = []
-    for line in file:
-        lines.append(line)
-        if line.find('</page>') > -1:
-            yield lines
-            '''
-            #This looks counter intuitive but Python continues with this call
-            after it has finished the yield statement
-            '''
-            lines = []
-    file.close()
Index: trunk/tools/editor_trends/wikitree/parser.py
@@ -0,0 +1,70 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+'''
+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License version 2
+as published by the Free Software Foundation.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+See the GNU General Public License for more details, at
+http://www.fsf.org/licenses/gpl.html
+'''
+
+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
+__author__email = 'dvanliere at gmail dot com'
+__date__ = '2010-10-21'
+__version__ = '0.1'
+
+import xml.etree.cElementTree as cElementTree
+
+import configuration
+settings = configuration.Settings()
+from utils import utils
+
+def convert_html_entities(text):
+    return utils.unescape(text)
+
+
+def extract_text(elem, **kwargs):
+    if elem != None and elem.text != None:
+        #try:
+        return elem.text #.decode(settings.encoding)
+        #except UnicodeDecodeError:
+        #return None
+
+
+def retrieve_xml_node(xml_nodes, name):
+    for xml_node in xml_nodes:
+        if xml_node.tag == name:
+            return xml_node
+    return None #maybe this should be replaced with a NotFoundError
+
+def determine_element(line):
+    pos = line.find(' ')
+    elem = line[:pos] + '>'
+
+
+def read_input(file):
+    lines = []
+    start_parsing = False
+    for line in file:
+        if line == '\n':
+            continue
+        if start_parsing == False and line.find('<page>') > -1:
+            start_parsing = True
+        if start_parsing:
+            lines.append(line.strip())
+            if line.find('</page>') > -1:
+                #print lines
+                lines = '\n'.join(lines)
+                lines = lines.encode(settings.encoding)
+                xml_string = cElementTree.XML(lines)
+                yield xml_string
+                '''
+                #This looks counter intuitive but Python continues with this call
+                after it has finished the yield statement
+                '''
+                lines = []
+    file.close()
Property changes on: trunk/tools/editor_trends/wikitree/parser.py
___________________________________________________________________
Added: svn:eol-style
   + native
Added: svn:mime-type
   + text/plain
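
read_input() replaces the old line-buffering generator in wikitree/xml.py: instead of yielding raw lines, it joins each <page>...</page> block and parses it with cElementTree.XML, so callers receive ready-to-query elements. A usage sketch, assuming the package is on sys.path and substituting a tiny in-memory fragment (without the MediaWiki XML namespace, which the literal <page> match expects to be absent) for a real dump file:

import StringIO
import wikitree.parser

fragment = StringIO.StringIO(
    '<page>\n'
    '  <title>Example</title>\n'
    '  <id>12</id>\n'
    '</page>\n'
)

# Each iteration yields one parsed <page> element, so a full dump can be
# processed page by page without building the whole tree in memory.
for page in wikitree.parser.read_input(fragment):
    print page.find('title').text  # Example
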
Index: trunk/tools/editor_trends/etl/exporter.py
@@ -213,16 +213,12 @@
             break
         obs = editors.find_one({'editor': id}, {'first_edit': 1, 'final_edit': 1})
 
-        #for editor in tasks:
-        #    obs = tasks[editor]
         first_edit = obs['first_edit']
         last_edit = obs['final_edit']
         editor_dt = relativedelta(last_edit, first_edit)
         editor_dt = (editor_dt.years * 12) + editor_dt.months
         edits = []
         for year in xrange(2001, datetime.datetime.now().year + 1):
-            #if year == 2009 and editor == '2':
-            #    print 'debug'
             if first_edit.year > year or last_edit.year < year:
                 continue
             window_end = datetime.datetime(year, 12, 31)
@@ -251,11 +247,67 @@
 
 
 
-def generate_cohort_dataset(tasks, dbname, collection, **kwargs):
+def generate_cohort_dataset_forward(tasks, dbname, collection, **kwargs):
     mongo = db.init_mongo_db(dbname)
     editors = mongo[collection + '_dataset']
     windows = create_windows()
     data = shaper.create_datacontainer('dict')
+    final_year = datetime.datetime.now().year + 1
+    m1 = [1, 2, 3, 4, 5, 6]
+    m2 = [7, 8, 9, 10, 11, 12]
+    frames = [m1, m2]
+    while True:
+        id = tasks.get(block=False)
+        if id == None:
+            break
+        obs = editors.find_one({'editor': id}, {'new_wikipedian': 1, 'monthly_edits': 1, 'final_edit':1})
+        new_wikipedian = obs['new_wikipedian']
+        last_edit = obs['final_edit']
+        start_year = new_wikipedian.year
+        last_year = last_edit.year + 1
+        if new_wikipedian.month != 1:
+            continue
+        for year in xrange(start_year, last_year):
+            if year not in data[start_year]:
+                data[start_year][year] = {}
+            for x, frame in enumerate(frames):
+                if x not in data[start_year][year]:
+                    data[start_year][year][x] = 0
+                if 'n' not in data[start_year][year]:
+                    data[start_year][year]['n'] = 0
+
+                active = sum([obs['monthly_edits'][str(year)][str(m)] for m in frame])
+                data[start_year][year]['n'] += 1
+                if active > 0:
+                    data[start_year][year][x] += 1
+    filename = '%s_cohort_forward.csv' % dbname
+    fh = utils.create_txt_filehandle(settings.dataset_location, filename, 'w', settings.encoding)
+    frames.append('n')
+    headers = ["%s_%s" % (year, frame[0]) for year in xrange(2001, final_year) for frame in enumerate(frames)]
+    headers.insert(0, '\t')
+    utils.write_list_to_csv(headers, fh)
+
+    for obs_year in data:
+        obs = '%s\t' % obs_year
+        for year in xrange(2001, final_year):
+            values = data[obs_year].get(year, None)
+            if values != None:
+                for value in values:
+                    obs = '%s\t%s\t' % (obs, values[value])
+            else:
+                obs = '%s\t.\t.\t.\t' % obs
+
+        obs = '%s\n' % obs
+        fh.write(obs)
+    fh.close()
+
+
+
+def generate_cohort_dataset_backward(tasks, dbname, collection, **kwargs):
+    mongo = db.init_mongo_db(dbname)
+    editors = mongo[collection + '_dataset']
+    windows = create_windows()
+    data = shaper.create_datacontainer('dict')
     data = shaper.add_windows_to_datacontainer(data, windows)
 
     while True:
@@ -360,10 +412,12 @@
         '13':{'first_edit': datetime.datetime(2007, 2, 1), 'final_edit': datetime.datetime(2009, 4, 30)},
         }
     generate_cohort_dataset(editors, dbname, collection)
+
+
 if __name__ == '__main__':
     dbname = 'enwiki'
     collection = 'editors'
     #debug(dbname, collection)
-    dataset_launcher(dbname, collection, generate_cohort_dataset_howie)
+    dataset_launcher(dbname, collection, generate_cohort_dataset_forward)
     #dataset_launcher(dbname, collection, generate_long_editor_dataset)
     #dataset_launcher(dbname, collection, generate_wide_editor_dataset)
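
The first hunk removes debug leftovers around the tenure computation, which expresses an editor's active period in whole months via dateutil's relativedelta. A worked sketch of that calculation, reusing the made-up dates from the debug fixture above:

import datetime
from dateutil.relativedelta import relativedelta

first_edit = datetime.datetime(2007, 2, 1)
last_edit = datetime.datetime(2009, 4, 30)

# relativedelta decomposes the gap into calendar units; folding years
# into months gives the tenure figure used by the dataset generators.
editor_dt = relativedelta(last_edit, first_edit)
editor_dt = (editor_dt.years * 12) + editor_dt.months
print editor_dt  # 26
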
Index: trunk/tools/editor_trends/etl/extract.py
@@ -39,7 +39,7 @@
 from database import db_settings
 from database import db
 from database import cache
-from wikitree import xml
+import wikitree.parser
 from bots import bots
 from etl import models
 #from utils import process_constructor as pc
Index: trunk/tools/editor_trends/etl/chunker.py
@@ -34,7 +34,7 @@
 
 from utils import utils
 import extract
-from wikitree import xml
+import wikitree.parser
 from bots import bots
 
 
Index: trunk/tools/editor_trends/etl/extracter.py
@@ -0,0 +1,283 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+'''
+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License version 2
+as published by the Free Software Foundation.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+See the GNU General Public License for more details, at
+http://www.fsf.org/licenses/gpl.html
+'''
+
+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
+__author__email = 'dvanliere at gmail dot com'
+__date__ = '2010-12-13'
+__version__ = '0.1'
+
+import sys
+import re
+import json
+import os
+import xml.etree.cElementTree as cElementTree
+
+sys.path.append('..')
+import configuration
+settings = configuration.Settings()
+
+import wikitree.parser
+from bots import bots
+from utils import utils
+
+try:
+    import psyco
+    psyco.full()
+except ImportError:
+    pass
+
+
+RE_NUMERIC_CHARACTER = re.compile('&#(\d+);')
+
+
+def remove_numeric_character_references(text):
+    return re.sub(RE_NUMERIC_CHARACTER, lenient_deccharref, text).encode('utf-8')
+
+
+def lenient_deccharref(m):
+    try:
+        return unichr(int(m.group(1)))
+    except ValueError:
+        '''
+        There are a few articles that raise a Value Error here, the reason is
+        that I am using a narrow Python build (UCS2) instead of a wide build
+        (UCS4). The quick fix is to return an empty string...
+        Real solution is to rebuild Python with UCS4 support.....
+        '''
+        return ''
+
+
+def remove_namespace(element, namespace):
+    '''Remove namespace from the XML document.'''
+    ns = u'{%s}' % namespace
+    nsl = len(ns)
+    for elem in element.getiterator():
+        if elem.tag.startswith(ns):
+            elem.tag = elem.tag[nsl:]
+    return element
+
+
+def load_namespace(language):
+    file = '%s_ns.json' % language
+    fh = utils.create_txt_filehandle(settings.namespace_location, file, 'r', settings.encoding)
+    ns = json.load(fh)
+    fh.close()
+    ns = ns['query']['namespaces']
+    return ns
+
+
+def build_namespaces_locale(namespaces, include=['0']):
+    '''
+    @include is a list of namespace keys that should not be ignored, the default
+    setting is to ignore all namespaces except the main namespace.
+    '''
+    ns = []
+    for namespace in namespaces:
+        if namespace not in include:
+            value = namespaces[namespace].get(u'*', None)
+            ns.append(value)
+    return ns
+
+
+def parse_comments(revisions, function):
+    for revision in revisions:
+        comment = revision.find('{%s}comment' % settings.xml_namespace)
+        #timestamp = revision.find('{%s}timestamp' % settings.xml_namespace).text
+        if comment != None and comment.text != None:
+            comment.text = function(comment.text)
+    return revisions
+
+
+def is_article_main_namespace(elem, namespace):
+    '''
+    checks whether the article belongs to the main namespace
+    '''
+    title = elem.text
+    for ns in namespace:
+        if title.startswith(ns):
+            return False
+    return True
+
+def validate_hostname(address):
+    '''
+    This is not a foolproof solution at all. The problem is that it's really hard
+    to determine whether a string is a hostname or not **reliably**. This is a
+    very fast rule of thumb. Will lead to false positives, but that's life :)
+    '''
+    parts = address.split(".")
+    if len(parts) > 2:
+        return True
+    else:
+        return False
+
+
+def validate_ip(address):
+    parts = address.split(".")
+    if len(parts) != 4:
+        return False
+    parts = parts[:3]
+    for item in parts:
+        try:
+            if not 0 <= int(item) <= 255:
+                return False
+        except ValueError:
+            return False
+    return True
+
+
+def determine_username_is_bot(contributor, **kwargs):
+    '''
+    #contributor is an xml element containing the id of the contributor
+    @bots should have a dict with all the bot ids and bot names
+    @Return False if username id is not in bot dict id or True if username id
+    is a bot id.
+    '''
+    bots = kwargs.get('bots')
+    username = contributor.find('username')
+    if username == None:
+        return 0
+    else:
+        if username in bots:
+            return 1
+        else:
+            return 0
+
+
+def extract_username(contributor, **kwargs):
+    contributor = contributor.find('username')
+    if contributor != None:
+        return contributor.text
+    else:
+        return None
+
+
+def extract_contributor_id(contributor, **kwargs):
+    '''
+    @contributor is the xml contributor node containing a number of attributes
+    Currently, we are only interested in registered contributors, hence we
+    ignore anonymous editors.
+    '''
+    if contributor.get('deleted'):
+        return None # ASK: Not sure if this is the best way to code deleted contributors.
+    elem = contributor.find('id')
+    if elem != None:
+        return {'id':elem.text}
+    else:
+        elem = contributor.find('ip')
+        if elem != None and elem.text != None and validate_ip(elem.text) == False and validate_hostname(elem.text) == False:
+            return {'username':elem.text, 'id': elem.text}
+        else:
+            return None
+
+
+def output_editor_information(revisions, page, bots):
+    '''
+    @elem is an XML element containing 1 revision from a page
+    @output is where to store the data, a filehandle
+    @**kwargs contains extra information
+
+    the variable tags determines which attributes are being parsed, the values in
+    this dictionary are the functions used to extract the data.
+    '''
+    headers = ['id', 'date', 'article', 'username']
+    tags = {'contributor': {'id': extract_contributor_id,
+                            'bot': determine_username_is_bot,
+                            'username': extract_username,
+                            },
+            'timestamp': {'date': wikitree.parser.extract_text},
+            }
+    vars = {}
+    flat = []
+
+    for x, revision in enumerate(revisions):
+        #print len(revision.getchildren())
+        vars[x] = {}
+        vars[x]['article'] = page
+        for tag in tags:
+            el = revision.find('%s' % tag)
+            if el == None:
+                #print cElementTree.tostring(revision, settings.encoding)
+                del vars[x]
+                break
+            for function in tags[tag].keys():
+                f = tags[tag][function]
+                value = f(el, bots=bots)
+                if type(value) == type({}):
+                    for kw in value:
+                        vars[x][kw] = value[kw]
+                else:
+                    vars[x][function] = value
+
+    '''
+    This loop determines for each observation whether it should be stored or not.
+    '''
+    for x in vars:
+        if vars[x]['bot'] == 1 or vars[x]['id'] == None or vars[x]['username'] == None:
+            continue
+        else:
+            f = []
+            for head in headers:
+                f.append(vars[x][head])
+            flat.append(f)
+
+    return flat
+
+
+def parse_dumpfile(project, language_code, namespaces=['0']):
+    bot_ids = bots.retrieve_bots(language_code)
+    ns = load_namespace(language_code)
+    ns = build_namespaces_locale(ns, namespaces)
+
+    location = os.path.join(settings.input_location, language_code, project)
+    fh = utils.create_txt_filehandle(location, 'enwiki-latest-stub-meta-history.xml', 'r', settings.encoding)
+    for page in wikitree.parser.read_input(fh):
+        title = page.find('title')
+        if is_article_main_namespace(title, ns):
+            #cElementTree.dump(page)
+            article_id = page.find('id').text
+            revisions = page.findall('revision')
+            revisions = parse_comments(revisions, remove_numeric_character_references)
+            output = output_editor_information(revisions, article_id, bot_ids)
+            write_output(output, project, language_code)
+        page.clear()
+    fh.close()
+
+
+def write_output(output, project, language_code):
+    location = os.path.join(settings.input_location, language_code, project, 'txt')
+    for o in output:
+        file = '%s.csv' % hash(o[0])
+        try:
+            fh = utils.create_txt_filehandle(location, file, 'a', settings.encoding)
+            utils.write_list_to_csv(o, fh)
+            fh.close()
+        except Exception, error:
+            print error
+
+
+def hash(id):
+    '''
+    A very simple hash function based on modulo. The except clause has been
+    added because there are instances where the username is stored in the userid
+    tag and hence that's a string and not an integer.
+    '''
+    try:
+        return int(id) % 500
+    except:
+        return sum([ord(i) for i in id]) % 500
+
+if __name__ == '__main__':
+    project = 'wiki'
+    language_code = 'en'
+    parse_dumpfile(project, language_code)
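
Among the helpers above, remove_numeric_character_references() pairs the RE_NUMERIC_CHARACTER pattern with lenient_deccharref() to turn decimal entities such as &#233; back into characters before comments are stored. A standalone sketch of just that pair on Python 2 (which has unichr), with a made-up comment string:

# -*- coding: utf-8 -*-
import re

RE_NUMERIC_CHARACTER = re.compile('&#(\d+);')

def lenient_deccharref(m):
    # Decode one decimal character reference; fall back to an empty string
    # for code points a narrow (UCS2) build cannot represent.
    try:
        return unichr(int(m.group(1)))
    except ValueError:
        return ''

comment = u'reverted edit to caf&#233; article'
print re.sub(RE_NUMERIC_CHARACTER, lenient_deccharref, comment)
# reverted edit to café article
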
Index: trunk/tools/editor_trends/etl/models.py
@@ -27,7 +27,7 @@
 
 from utils import models
 from utils import utils
-from wikitree import xml
+import wikitree
 
 class TXTFile(object):
 
Index: trunk/tools/editor_trends/configuration.py
@@ -57,7 +57,7 @@
         #Change this to match your computers configuration (RAM / CPU)
         self.minimum_python_version = (2, 6)
         self.wp_dump_location = 'http://download.wikimedia.org'
-        self.xml_namespace = 'http://www.mediawiki.org/xml/export-0.3/'
+        self.xml_namespace = 'http://www.mediawiki.org/xml/export-0.4/'
         self.ascii_extensions = ['txt', 'csv', 'xml', 'sql', 'json']
         self.windows_register = {'7z.exe': 'Software\\7-Zip', }
         #Extensions of ascii files, this is used to determine the filemode to use
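
The namespace bump from export-0.3 to export-0.4 matters because ElementTree folds the default XML namespace into every tag, so lookups like the '{%s}comment' search in etl/extracter.py silently return None when the constant is stale. A small sketch of the mechanics:

import xml.etree.cElementTree as cElementTree

xml_namespace = 'http://www.mediawiki.org/xml/export-0.4/'
revision = cElementTree.XML(
    '<revision xmlns="%s"><comment>fix typo</comment></revision>' % xml_namespace
)

# Tags carry the namespace as a {uri} prefix, so a bare find('comment')
# returns None while the qualified lookup succeeds.
print revision.find('comment')
print revision.find('{%s}comment' % xml_namespace).text  # fix typo
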
Index: trunk/tools/editor_trends/bots/bots.py
@@ -28,7 +28,7 @@
 
 import configuration
 settings = configuration.Settings()
-from wikitree import xml
+import wikitree
 from database import db
 from utils import utils
 #from etl import extract

Follow-up revisions

Revision	Commit summary	Author	Date
r78583	Followup r78582, svn:eol-style native	reedy	22:19, 18 December 2010

Status & tagging log