r75884 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r75883 | r75884 | r75885 >
Date: 22:08, 2 November 2010
Author: diederik
Status: deferred
Tags:
Comment:
Significant code refactoring, debugging and performance improvements. In particular, the cache object is starting to pay off.
Modified paths:
  • /trunk/tools/editor_trends (modified) (history)
  • /trunk/tools/editor_trends/construct_datasets.py (modified) (history)
  • /trunk/tools/editor_trends/map_wiki_editors.py (modified) (history)
  • /trunk/tools/editor_trends/optimize_editors.py (added) (history)
  • /trunk/tools/editor_trends/settings.py (modified) (history)
  • /trunk/tools/editor_trends/split_xml_file.py (modified) (history)
  • /trunk/tools/editor_trends/utils/namespace_downloader.py (added) (history)
  • /trunk/tools/editor_trends/utils/process_constructor.py (modified) (history)
  • /trunk/tools/editor_trends/utils/utils.py (modified) (history)

Diff

Index: trunk/tools/editor_trends/optimize_editors.py
@@ -0,0 +1,120 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__author__email = 'dvanliere at gmail dot com'
 18+__date__ = '2010-11-02'
 19+__version__ = '0.1'
 20+
 21+
 22+
 23+import datetime
 24+from operator import itemgetter
 25+from Queue import Empty
 26+
 27+import settings
 28+import construct_datasets
 29+from database import db
 30+from utils import process_constructor as pc
 26+
 27+
 28+def create_datacontainer(init_value=0):
 29+ '''
 30+ This function initializes a dictionary with the year as key (starting in
 31+ 2001 and running through the current year) and @init_value as value. In
 32+ most cases this will be zero, so the dictionary acts as a running tally,
 33+ but @init_value can also be a list, [], a dictionary, {}, or a set, set().
 34+ '''
 35+ data = {}
 36+ year = datetime.datetime.now().year + 1
 37+ for x in xrange(2001, year):
 38+ data[str(x)] = init_value
 39+ return data
 40+
 41+
 42+def determine_edits_by_year(dates):
 43+ '''
 44+ This function counts the number of edits by year made by a particular editor.
 45+ '''
 46+ edits = create_datacontainer()
 47+ for date in dates:
 48+ year = str(date['date'].year)
 49+ edits[year] += 1
 50+ return edits
 51+
 52+
 53+def determine_articles_by_year(dates):
 54+ '''
 55+ This function counts the number of unique articles by year edited by a
 56+ particular editor.
 57+ '''
 58+ articles = create_datacontainer(set())
 59+ for date in dates:
 60+ year = str(date['date'].year)
 61+ articles[year].add(date['article'])
 62+ for year in articles:
 63+ articles[year] = len(articles[year])
 64+ return articles
 65+
 66+
 67+def optimize_editors(input_queue, result_queue, pbar, kwargs):
 68+ dbname = kwargs.pop('dbname')
 69+ mongo = db.init_mongo_db(dbname)
 70+ input = mongo['editors']
 71+ output = mongo['dataset']
 72+ output.ensure_index('editor')
 73+ output.ensure_index('year_joined')
 74+ definition = kwargs.pop('definition')
 75+ while True:
 76+ try:
 77+ id = input_queue.get(block=False)
 78+ editor = input.find_one({'editor': id})
 79+ edits = editor['edits']
 80+ edits = sorted(edits, key=itemgetter('date'))
 81+ edit_count = len(edits)
 82+ new_wikipedian = edits[9]['date'].year
 83+ first_edit = edits[0]['date']
 84+ final_edit = edits[-1]['date']
 85+ edits_by_year = determine_edits_by_year(edits)
 86+ articles_by_year = determine_articles_by_year(edits)
 87+ edits = edits[:10]
 88+
 89+ output.insert({'editor': id, 'edits': edits,
 90+ 'edits_by_year': edits_by_year,
 91+ 'year_joined': new_wikipedian,
 92+ 'edit_count': edit_count,
 93+ 'final_edit': final_edit,
 94+ 'first_edit': first_edit,
 95+ 'articles_by_year': articles_by_year})
 96+ print 'Items left: %s' % input_queue.qsize()
 97+ except Empty:
 98+ break
 99+
 100+def run_optimize_editors(dbname):
 101+ ids = construct_datasets.retrieve_editor_ids_mongo(dbname, 'editors')
 102+ kwargs = {'definition': 'traditional',
 103+ 'pbar': True,
 104+ 'dbname': dbname,
 105+ 'nr_input_processors': 2,
 106+ 'nr_output_processors': 0,
 107+ }
 108+ pc.build_scaffolding(pc.load_queue, optimize_editors, ids, False, False, **kwargs)
 109+
 110+
 111+def debug_optimize_editors(dbname):
 112+ ids = construct_datasets.retrieve_editor_ids_mongo(dbname, 'editors')
 113+ q = pc.load_queue(ids)
 114+ kwargs = {'definition': 'traditional',
 115+ 'dbname': dbname
 116+ }
 117+ optimize_editors(q, False, True, kwargs)
 118+
 119+
 120+if __name__ == '__main__':
 121+ run_optimize_editors('enwiki')
\ No newline at end of file
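One caveat on the new create_datacontainer(): the same @init_value object is assigned to every year key, so calling it with a mutable value such as set() (as determine_articles_by_year() does) makes all years share a single set. A minimal sketch of a safer variant, passing a factory instead of an instance; this variant is not part of the commit:

import datetime

def create_datacontainer(init_factory=int):
    '''
    Initialize a {year: value} dictionary from 2001 through the current
    year. Taking a factory (int, list, dict, set) instead of a value
    guarantees that every year gets its own fresh object.
    '''
    data = {}
    for x in xrange(2001, datetime.datetime.now().year + 1):
        data[str(x)] = init_factory()  # int() == 0, set() == set([])
    return data

articles = create_datacontainer(set)
articles['2001'].add('Amsterdam')
print articles['2002']  # set([]), no longer shared with 2001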
Index: trunk/tools/editor_trends/map_wiki_editors.py
@@ -21,11 +21,14 @@
2222 import sys
2323 import os
2424 import time
 25+import datetime
2526 import codecs
 27+import math
2628 import cStringIO
2729 import re
 30+from operator import itemgetter
2831 import xml.etree.cElementTree as cElementTree
29 -from multiprocessing import Queue
 32+from multiprocessing import Queue, JoinableQueue
3033 from Queue import Empty
3134 import pymongo
3235
@@ -34,6 +37,7 @@
3538 from utils import utils, models
3639 from database import db_settings
3740 from database import db
 41+from database import cache
3842 from wikitree import xml
3943 from statistics import dataset
4044 from utils import process_constructor as pc
@@ -45,13 +49,15 @@
4650 except ImportError:
4751 pass
4852
49 -#contributors = {}
5053
51 -RE_BOT = re.compile('bot', re.IGNORECASE)
52 -RE_SCRIPT = re.compile('script', re.IGNORECASE)
 54+def determine_username_is_bot(username, kwargs):
 55+ '''
 56+ @username is the xml element containing the id of the user
 57+ @kwargs should have a list with all the bot ids
5358
54 -
55 -def determine_username_is_bot(username, kwargs):
 59+ @Return False if the username id is not in the list of bot ids, True if
 60+ it is a bot id.
 61+ '''
5662 ids = kwargs.get('bots', [])
5763 if ids == None:
5864 ids = []
@@ -66,14 +72,14 @@
6773 def extract_contributor_id(contributor, kwargs):
6874 '''
6975 @contributor is the xml contributor node containing a number of attributes
70 -
 76+
7177 Currently, we are only interested in registered contributors, hence we
7278 ignore anonymous editors. If you are interested in collecting data on
7379 anonymous editors then add the string 'ip' to the tags variable.
7480 '''
7581 tags = ['id']
7682 if contributor.get('deleted'):
77 - return - 1 #Not sure if this is the best way to code deleted contributors.
 83+ return - 1 # ASK: Not sure if this is the best way to code deleted contributors.
7884 for elem in contributor:
7985 if elem.tag in tags:
8086 if elem.text != None:
@@ -83,6 +89,14 @@
8490
8591
8692 def output_editor_information(elem, data_queue, **kwargs):
 93+ '''
 94+ @elem is an XML element containing 1 revision from a page
 95+ @data_queue is where to store the data
 96+ @**kwargs contains extra information
 97+
 98+ The variable @tags determines which attributes are parsed; the values in
 99+ this dictionary are the functions used to extract the data.
 100+ '''
87101 tags = {'contributor': {'editor': extract_contributor_id, 'bot': determine_username_is_bot},
88102 'timestamp': {'date': xml.extract_text},
89103 }
@@ -104,10 +118,24 @@
105119 data_queue.put(vars)
106120 vars = {}
107121
108 -def parse_editors(xml_queue, data_queue, pbar, bots, debug=False, separator='\t'):
 122+
 123+def parse_editors(xml_queue, data_queue, pbar, bots, **kwargs):
 124+ '''
 125+ @xml_queue contains the filenames of the files to be parsed
 126+ @data_queue is an instance of Queue where the extracted data is stored for
 127+ further processing
 128+ @pbar is an instance of progressbar to display the progress
 129+ @bots is a list of ids of known Wikipedia bots
 130+ @debug is a flag to indicate whether the function is called for debugging.
 131+
 132+ Output is the data_queue that will be used by store_editors()
 133+ '''
 134+ file_location = os.path.join(settings.XML_FILE_LOCATION, kwargs.get('language', 'en'))
 135+ debug = kwargs.get('debug', None)
109136 if settings.DEBUG:
110137 messages = {}
111138 vars = {}
 139+
112140 while True:
113141 try:
114142 if debug:
@@ -117,12 +145,13 @@
118146 if file == None:
119147 print 'Swallowed a poison pill'
120148 break
121 - data = xml.read_input(utils.open_txt_file(settings.XML_FILE_LOCATION,
 149+ data = xml.read_input(utils.create_txt_filehandle(file_location,
122150 file, 'r',
123151 encoding=settings.ENCODING))
124152 for raw_data in data:
125153 xml_buffer = cStringIO.StringIO()
126154 raw_data.insert(0, '<?xml version="1.0" encoding="UTF-8" ?>\n')
 155+
127156 try:
128157 raw_data = ''.join(raw_data)
129158 xml_buffer.write(raw_data)
@@ -144,142 +173,122 @@
145174 if settings.DEBUG:
146175 utils.track_errors(xml_buffer, error, file, messages)
147176 except MemoryError, error:
148 - '''
149 - There is one xml file causing an out of memory file, not
150 - sure which one yet. This happens when raw_data =
151 - ''.join(raw_data) is called. 18-22
152 - '''
153177 print file, error
154178 print raw_data[:12]
155179 print 'String was supposed to be %s characters long' % sum([len(raw) for raw in raw_data])
156 - if settings.DEBUG:
157 - utils.track_errors(xml_buffer, error, file, messages)
158180
 181+ data_queue.put('NEXT')
159182 if pbar:
160 - #print xml_queue.qsize()
161 - utils.update_progressbar(pbar, xml_queue)
 183+ print file, xml_queue.qsize(), data_queue.qsize()
 184+ #utils.update_progressbar(pbar, xml_queue)
162185 if debug:
163186 break
164187
 188+ while True:
 189+ if data_queue.qsize() < 100000:
 190+ break
 191+ else:
 192+ time.sleep(10)
 193+ print 'Still sleeping, queue is %s items long' % data_queue.qsize()
 194+
165195 except Empty:
166196 break
167197
 198+ #for x in xrange(4):
 199+ data_queue.put(None)
 200+
168201 if settings.DEBUG:
169 - utils.report_error_messages(messages, lookup_new_editors)
 202+ utils.report_error_messages(messages, parse_editors)
170203
171204
172205 def store_editors(data_queue, pids, dbname):
 206+ '''
 207+ @data_queue is an instance of Queue containing information extracted by
 208+ parse_editors()
 209+ @pids is a list of PIDs used to check if other processes are finished
 210+ running
 211+ @dbname is the name of the MongoDB collection where to store the information.
 212+ '''
173213 mongo = db.init_mongo_db(dbname)
174214 collection = mongo['editors']
175215 mongo.collection.ensure_index('editor')
 216+ editor_cache = cache.EditorCache(collection)
176217 while True:
177218 try:
178219 edit = data_queue.get(block=False)
179 - contributor = edit['editor']
180 - value = {'date':edit['date'], 'article': edit['article']}
181 - collection.update({'editor': contributor}, {'$inc': {'edit_count': 1},
182 - '$push': {'edits': value}}, True)
 220+ data_queue.task_done()
 221+ if edit == None:
 222+ print 'Swallowing poison pill'
 223+ break
 224+ elif edit == 'NEXT':
 225+ editor_cache.add('NEXT', '')
 226+ else:
 227+ contributor = edit['editor']
 228+ value = {'date': edit['date'], 'article': edit['article']}
 229+ editor_cache.add(contributor, value)
 230+ #collection.update({'editor': contributor}, {'$push': {'edits': value}}, True)
 231+ #'$inc': {'edit_count': 1},
 232+
183233 except Empty:
184234 '''
185235 This checks whether the Queue is empty because the preprocessors are
186236 finished or because this function is faster in emptying the Queue
187 - then the preprocessors are able to fill it. If this preprocessors
 237+ than the preprocessors are able to fill it. If the preprocessors
188238 are finished and this Queue is empty then break, else wait for the
189239 Queue to fill.
190240 '''
191 - if all([utils.check_if_process_is_running(pid) for pid in pids]):
192 - pass
193 - #print 'Empty queue or not %s?' % data_queue.qsize()
194 - else:
195 - break
 241+ pass
196242
 243+ print 'Emptying entire cache.'
 244+ editor_cache.store()
 245+ print 'Time elapsed: %s and processed %s items.' % (datetime.datetime.now() - editor_cache.init_time, editor_cache.cumulative_n)
197246
198 -def optimize_editors(dbname, input_queue, **kwargs):
199 - mongo = db.init_mongo_db(dbname)
200 - collection = mongo['editors']
201 - definition = kwargs.pop('definition')
202 - while True:
203 - try:
204 - id = input_queue.get(block=False)
205 - #id = '94033'
206 - editor = collection.find_one({'editor': id})
207 - edits = editor['edits']
208 - edits.sort()
209 - year = edits[0]['date'].year
210 - new_wikipedian = dataset.determine_editor_is_new_wikipedian(edits, defintion)
211 - collection.update({'editor': id}, {'$set': {'edits': edits, 'year_joined': year, 'new_wikipedian': new_wikipedian}})
212 -
213 - except Empty:
214 - break
215247
 248+def load_bot_ids():
 249+ '''
 250+ Loader function to retrieve list of id's of known Wikipedia bots.
 251+ '''
 252+ ids = {}
 253+ mongo = db.init_mongo_db('bots')
 254+ bots = mongo['ids']
 255+ cursor = bots.find()
 256+ for bot in cursor:
 257+ ids[bot['id']] = bot['name']
 258+ return ids
216259
217 -def store_data_db(data_queue, pids):
218 - connection = db.init_database()
219 - cursor = connection.cursor()
220 - db.create_tables(cursor, db_settings.CONTRIBUTOR_TABLE)
221260
222 - empty = 0
223 -
224 - values = []
225 - while True:
226 - try:
227 - chunk = data_queue.get(block=False)
228 - contributor = chunk['contributor'].encode(settings.ENCODING)
229 - article = chunk['article']
230 - timestamp = chunk['timestamp'].encode(settings.ENCODING)
231 - bot = chunk['bot']
232 - values.append((contributor, article, timestamp, bot))
233 -
234 - if len(values) == 50000:
235 - cursor.executemany('INSERT INTO contributors VALUES (?,?,?,?)', values)
236 - connection.commit()
237 - #print 'Size of queue: %s' % data_queue.qsize()
238 - values = []
239 -
240 - except Empty:
241 - if all([utils.check_if_process_is_running(pid) for pid in pids]):
242 - pass
243 - else:
244 - break
245 - connection.close()
246 -
247 -
248 -def run_stand_alone(dbname):
249 - files = utils.retrieve_file_list(settings.XML_FILE_LOCATION, 'xml')
250 - #files = files[:2]
 261+def run_parse_editors(dbname, language):
 262+ ids = load_bot_ids()
251263 kwargs = {'bots': ids,
252264 'dbname': dbname,
253265 'pbar': True,
254 - 'definition': 'traditional'}
 266+ 'nr_input_processors': 1,
 267+ 'nr_output_processors': 1,
 268+ 'language': language,
 269+ }
 270+ chunks = {}
 271+ file_location = os.path.join(settings.XML_FILE_LOCATION, language)
 272+ files = utils.retrieve_file_list(file_location, 'xml')
 273+ parts = int(round(float(len(files)) / settings.NUMBER_OF_PROCESSES, 0))
 274+ a = 0
 275+ for x in xrange(settings.NUMBER_OF_PROCESSES):
 276+ b = a + parts
 277+ chunks[x] = files[a:b]
 278+ a = (x + 1) * parts
255279
256 - mongo = db.init_mongo_db('bots')
257 - bots = mongo['ids']
258 - ids = {}
259 - cursor = bots.find()
260 - for bot in cursor:
261 - ids[bot['id']] = bot['name']
262 -
263 - pc.build_scaffolding(pc.load_queue, parse_editors, files, store_editors, True, **kwargs)
264 - ids = retrieve_ids_mongo_new(dbname, 'editors')
265 - pc.build_scaffolding(pc.load_queue, optimize_editors, ids, False, False, **kwargs)
266280
267 -def debug_lookup_new_editors():
268 - q = Queue()
269 - import progressbar
270 - pbar = progressbar.ProgressBar().start()
 281+ for x in xrange(settings.NUMBER_OF_PROCESSES):
 282+ pc.build_scaffolding(pc.load_queue, parse_editors, chunks[x], store_editors, True, **kwargs)
 283+
 284+
 285+def debug_parse_editors(dbname):
 286+ q = JoinableQueue()
271287 #edits = db.init_mongo_db('editors')
272 - parse_editors('464.xml', q, None, None, True)
273 - store_data_mongo(q, [], 'test')
274 - #keys = ['editor']
275 - #for key in keys:
276 - # db.add_index_to_collection('editors', 'editors', key)
 288+ parse_editors('en\\522.xml', q, None, None, debug=True)
 289+ store_editors(q, [], dbname)
277290
 291+
278292 if __name__ == "__main__":
279 - #optimize_editors('enwiki')
280 - #debug_lookup_new_editors()
281 -
282 - if settings.RUN_MODE == 'stand_alone':
283 - run_stand_alone()
284 - print 'Finished processing XML files.'
285 - else:
286 - run_hadoop()
 293+ #debug_parse_editors('test')
 294+ run_parse_editors('test', 'en')
 295+ pass
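The cache object mentioned in the commit message lives in database/cache.py, which is not part of this diff. Judging from the calls above (add(), the 'NEXT' marker, store(), and the init_time and cumulative_n attributes), it batches edits in memory and flushes them to MongoDB in bulk. The class body below is a sketch built on those assumptions, not the committed implementation:

import datetime

class EditorCache(object):
    '''Sketch of a write-batching cache: collect edits per editor in
    memory and flush them to MongoDB in bulk instead of issuing one
    update per revision.'''
    def __init__(self, collection, threshold=10000):
        # threshold is an assumed knob, cf. settings.MAX_CACHE_SIZE
        self.collection = collection
        self.threshold = threshold
        self.editors = {}
        self.n = 0
        self.cumulative_n = 0
        self.init_time = datetime.datetime.now()

    def add(self, key, value):
        if key == 'NEXT':
            # A parser finished one chunk; a real implementation could
            # use this marker to flush editors that are complete.
            return
        self.editors.setdefault(key, []).append(value)
        self.n += 1
        if self.n >= self.threshold:
            self.store()

    def store(self):
        for editor, edits in self.editors.iteritems():
            self.collection.update({'editor': editor},
                                   {'$pushAll': {'edits': edits}}, True)
        self.cumulative_n += self.n
        self.n = 0
        self.editors = {}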
Index: trunk/tools/editor_trends/settings.py
@@ -30,21 +30,29 @@
3131
3232 #Setting up the environment
3333 ops = {platform.win32_ver: 'Windows',
34 - platform.linux_distribution: 'Linux',
35 - platform.mac_ver: 'OSX'}
 34+ platform.linux_distribution: 'Linux',
 35+ platform.mac_ver: 'OSX'}
 36+
3637 for op in ops:
3738 if op() != ('', '', '') and op() != ('', ('', '', ''), ''):
3839 OS = ops[op]
3940
40 -WORKING_DIRECTORY = os.getcwd()#[:-9]
 41+WORKING_DIRECTORY = os.getcwd()
4142 IGNORE_DIRS = ['wikistats', 'zips']
 43+ROOT = '/' if OS != 'Windows' else 'c:\\'
4244
43 -dirs = [name for name in os.listdir(WORKING_DIRECTORY) if os.path.isdir(os.path.join(WORKING_DIRECTORY, name))]
 45+
 46+dirs = [name for name in os.listdir(WORKING_DIRECTORY) if
 47+ os.path.isdir(os.path.join(WORKING_DIRECTORY, name))]
4448 for subdirname in dirs:
4549 if not subdirname.startswith('.') and subdirname not in IGNORE_DIRS:
4650 sys.path.append(os.path.join(WORKING_DIRECTORY, subdirname))
4751
 52+WINDOWS_ZIP = ['7z.exe']
4853
 54+OSX_ZIP = []
 55+
 56+LINUX_ZIP = []
4957 #General settings
5058
5159 # Valid values are 'stand-alone' and 'hadoop'
@@ -65,22 +73,23 @@
6674 #This section contains configuration variables for the different file locations.
6775
6876 # Location where to write xml chunks
69 -XML_FILE_LOCATION = 'C:/wikimedia/'
 77+XML_FILE_LOCATION = os.path.join(ROOT, 'wikimedia')
7078
7179 # Input file
72 -XML_FILE = 'C:/Source_Files/enwiki-20100916-stub-meta-history.xml'
 80+XML_FILE = os.path.join(ROOT, 'Source_Files', 'enwiki-20100916-stub-meta-history.xml')
7381
7482 # This is the place where error messages are stored for debugging purposes
75 -ERROR_MESSAGE_FILE_LOCATION = WORKING_DIRECTORY + '/errors/'
 83+ERROR_MESSAGE_FILE_LOCATION = os.path.join(WORKING_DIRECTORY, 'errors')
7684
77 -DATABASE_FILE_LOCATION = WORKING_DIRECTORY + '/data/database/'
 85+DATABASE_FILE_LOCATION = os.path.join(WORKING_DIRECTORY, 'data', 'database')
7886
79 -BINARY_OBJECT_FILE_LOCATION = WORKING_DIRECTORY + '/data/objects/'
 87+BINARY_OBJECT_FILE_LOCATION = os.path.join(WORKING_DIRECTORY, 'data', 'objects')
8088
81 -DATASETS_FILE_LOCATION = WORKING_DIRECTORY + '/datasets/'
 89+DATASETS_FILE_LOCATION = os.path.join(WORKING_DIRECTORY, 'datasets')
8290
83 -TXT_FILE_LOCATION = WORKING_DIRECTORY + '/csv/'
 91+TXT_FILE_LOCATION = os.path.join(WORKING_DIRECTORY, 'data', 'csv')
8492
 93+NAMESPACE_LOCATION = os.path.join(WORKING_DIRECTORY, 'namespaces')
8594 #This section contains configuration variables for parsing / encoding and
8695 #working with the XML files.
8796
@@ -92,12 +101,32 @@
93102 # Name space, do not change as this works for Mediawiki wikis
94103 NAME_SPACE = 'http://www.mediawiki.org/xml/export-0.4/'
95104
 105+
 106+WIKIMEDIA_PROJECTS = {'commons': 'commonswiki',
 107+ 'wikibooks': 'wikibooks',
 108+ 'wikinews': 'wikinews',
 109+ 'wikiquote': 'wikiquote',
 110+ 'wikisource': 'wikisource',
 111+ 'wikiversity': 'wikiversity',
 112+ 'wiktionary': 'wiktionary',
 113+ 'metawiki': 'metawiki',
 114+ 'wikispecies': 'specieswiki',
 115+ 'incubator': 'incubatorwiki',
 116+ 'foundation': 'foundationwiki',
 117+ 'mediawiki': 'mediawikiwiki',
 118+ 'outreach': 'outreachwiki',
 119+ 'strategic planning': 'strategywiki',
 120+ 'usability initiative': 'usabilitywiki',
 121+ 'multilingual wikisource': None
 122+ }
 123+
96124 #Multiprocess settings used to parallelize workload
97125 #Change this to match your computers configuration (RAM / CPU)
98126 NUMBER_OF_PROCESSES = cpu_count() * 1
99127
100 -#Extensions of ascii files, this is used to determine the filemode to use
 128+#Extensions of ascii files, this is used to determine the filemode to use
101129 ASCII = ['txt', 'csv', 'xml', 'sql']
102130
103131 WP_DUMP_LOCATION = 'http://download.wikimedia.org'
104132
 133+MAX_CACHE_SIZE = 1024 * 1024
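The path settings now go through os.path.join() and the new ROOT constant instead of hard-coded 'C:/' strings, so the same settings file works on Windows, Linux and OSX. A short sketch of the difference, using the dump path from the diff:

import os

ROOT = '/' if os.name != 'nt' else 'c:\\'  # same idea as the OS check above
# Old style: separator and drive are baked in, Windows-only.
old = 'C:/Source_Files/enwiki-20100916-stub-meta-history.xml'
# New style: composed per platform at import time.
XML_FILE = os.path.join(ROOT, 'Source_Files', 'enwiki-20100916-stub-meta-history.xml')
print XML_FILE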
Index: trunk/tools/editor_trends/utils/namespace_downloader.py
@@ -0,0 +1,43 @@
 2+
 3+
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__author__email = 'dvanliere at gmail dot com'
 18+__date__ = 'Oct 27, 2010'
 19+__version__ = '0.1'
 20+
 21+import languages
 22+import dump_downloader as dd
 23+import settings
 24+
 25+PATH = '/w/api.php?action=query&meta=siteinfo&siprop=namespaces|namespacealiases&format=json'
 26+LOCATION = settings.NAMESPACE_LOCATION
 27+
 28+def retrieve_json_namespace():
 29+ visited = set()
 30+ for language in languages.MAPPING:
 31+ language = languages.MAPPING[language]
 32+ filename = '%s_ns.json' % language
 33+ if language not in visited:
 34+ domain = 'http://%s.wikipedia.org' % language
 35+ dd.download_wiki_file(domain, PATH, filename, LOCATION, 'w', True)
 36+ visited.add(language)
 37+
 38+
 39+def launch_downloader():
 40+ retrieve_json_namespace()
 41+
 42+
 43+if __name__ == '__main__':
 44+ launch_downloader()
\ No newline at end of file
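dump_downloader.download_wiki_file() is not shown in this revision; for reference, a self-contained sketch of the same namespace fetch using only the standard library. The helper name and flow here are illustrative, not the committed API:

import json
import os
import urllib2

PATH = '/w/api.php?action=query&meta=siteinfo&siprop=namespaces|namespacealiases&format=json'

def download_namespace_json(language, location):
    '''Fetch the namespace definitions for one language and store them
    as <language>_ns.json, mirroring retrieve_json_namespace().'''
    domain = 'http://%s.wikipedia.org' % language
    data = urllib2.urlopen(domain + PATH).read()
    fh = open(os.path.join(location, '%s_ns.json' % language), 'w')
    fh.write(data)
    fh.close()
    return json.loads(data)['query']['namespaces']

# ns = download_namespace_json('en', '.')
# print ns['0']   # the main (article) namespace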
Index: trunk/tools/editor_trends/utils/utils.py
@@ -65,7 +65,7 @@
6666 return False
6767 else:
6868 os.kill(pid, 0)
69 - return Tru
 69+ return True
7070 except Exception, error:
7171 print error
7272 return False
@@ -132,7 +132,7 @@
133133 # read / write data related functions
134134 def read_data_from_csv(filename, encoding):
135135 if hasattr(filename, '__call__'):
136 - filename = construct_filename_from_function(filename)
 136+ filename = construct_filename(filename)
137137
138138 fh = open_txt_file(filename, 'r', encoding=encoding)
139139 for line in fh:
@@ -140,13 +140,15 @@
141141
142142 fh.close()
143143
144 -def create_directory(language):
 144+
 145+def create_directory(path):
145146 try:
146 - os.mkdir(settings.WORKING_DIRECTORY + '/' + language)
 147+ os.mkdir(path)
147148 return True
148 - except IOERROR:
 149+ except IOError:
149150 return False
150151
 152+
151153 def determine_file_extension(filename):
152154 pos = filename.rfind('.') + 1
153155 return filename[pos:]
@@ -158,10 +160,18 @@
159161 else:
160162 return 'wb'
161163
162 -
163 -def write_data_to_csv(data, location, function, encoding):
164 - filename = construct_filename_from_function(function, '.csv')
165 - fh = open_txt_file(location, filename, 'a', encoding=encoding)
 164+def write_list_to_csv(data, fh, recursive=False):
 165+ if recursive:
 166+ recursive = False
 167+ for d in data:
 168+ if isinstance(d, list):
 169+ recursive = write_list_to_csv(d, fh, True)
 170+ else:
 171+ fh.write('%s\t' % d)
 172+ if recursive:
 173+ return True
 174+
 175+def write_dict_to_csv(data, fh):
166176 keys = data.keys()
167177 for key in keys:
168178 fh.write('%s' % key)
@@ -172,45 +182,68 @@
173183 else:
174184 fh.write('\t%s' % (obs))
175185 fh.write('\n')
176 - fh.close()
177186
178187
179 -def open_txt_file(location, filename, mode, encoding):
180 - return codecs.open(location + filename, mode, encoding=encoding)
 188+def create_txt_filehandle(location, name, mode, encoding):
 189+ filename = construct_filename(name, '.csv')
 190+ path = os.path.join(location, filename)
 191+ return codecs.open(path, mode, encoding=encoding)
181192
182193
183 -def open_binary_file(location, filename, mode):
184 - return open(location + filename, mode)
 194+def create_binary_filehandle(location, filename, mode):
 195+ path = os.path.join(location, filename)
 196+ return open(path, mode)
185197
186 -def construct_filename_from_function(function, extension):
187 - return function.func_name + extension
188198
 199+def construct_filename(name, extension):
 200+ if hasattr(name, '__call__'):
 201+ return name.func_name + extension
 202+ else:
 203+ return name
189204
 205+
190206 def check_file_exists(location, filename):
191207 if hasattr(filename, '__call__'):
192 - filename = construct_filename_from_function(filename, '.bin')
193 - if os.path.exists(location + filename):
 208+ filename = construct_filename(filename, '.bin')
 209+ if os.path.exists(os.path.join(location, filename)):
194210 return True
195211 else:
196212 return False
197213
198214
 215+def which(program):
 216+ def is_exe(fpath):
 217+ return os.path.exists(fpath) and os.access(fpath, os.X_OK)
 218+
 219+ fpath, fname = os.path.split(program)
 220+ if fpath:
 221+ if is_exe(program):
 222+ return program
 223+ else:
 224+ for path in os.environ["PATH"].split(os.pathsep):
 225+ exe_file = os.path.join(path, program)
 226+ if is_exe(exe_file):
 227+ return exe_file
 228+
 229+ return None
 230+
 231+
199232 def store_object(object, location, filename):
200233 if hasattr(filename, '__call__'):
201 - filename = construct_filename_from_function(filename, '.bin')
 234+ filename = construct_filename(filename, '.bin')
202235 if not filename.endswith('.bin'):
203236 filename = filename + '.bin'
204 - fh = open(location + filename, 'wb')
 237+ fh = create_binary_filehandle(location, filename, 'wb')
205238 cPickle.dump(object, fh)
206239 fh.close()
207240
208241
209242 def load_object(location, filename):
210243 if hasattr(filename, '__call__'):
211 - filename = construct_filename_from_function(filename, '.bin')
 244+ filename = construct_filename(filename, '.bin')
212245 if not filename.endswith('.bin'):
213246 filename = filename + '.bin'
214 - fh = open(location + filename, 'rb')
 247+ fh = create_binary_filehandle(location, filename, 'rb')
215248 obj = cPickle.load(fh)
216249 fh.close()
217250 return obj
@@ -293,8 +326,8 @@
294327
295328
296329 def debug():
297 - dt = humanize_time_difference(64)
298 - print dt
299 -
 330+ #dt = humanize_time_difference(64)
 331+ #print dt
 332+ check_if_process_is_running(3012)
300333 if __name__ == '__main__':
301334 debug()
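The new which() helper mirrors the Unix command of the same name: it returns the full path of an executable if it is on the PATH (or is itself an executable path), and None otherwise. Presumably it will be used to locate the compressors listed in settings, e.g. the 7z.exe entry in WINDOWS_ZIP; a usage sketch:

from utils import utils

path = utils.which('7z.exe')
if path == None:
    print 'Could not find 7zip; is it installed and on your PATH?'
else:
    print 'Found compressor at %s' % path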
Index: trunk/tools/editor_trends/utils/process_constructor.py
@@ -17,7 +17,7 @@
1818 __date__ = '2010-10-21'
1919 __version__ = '0.1'
2020
21 -from multiprocessing import Process, Queue
 21+from multiprocessing import Process, Queue, JoinableQueue
2222 from Queue import Empty
2323
2424 import settings
@@ -53,29 +53,32 @@
5454 @kwargs is a dictionary with optional variables. Used to supply to main
5555 '''
5656
57 - input_queue = Queue()
 57+ nr_input_processors = kwargs.pop('nr_input_processors')
 58+ nr_output_processors = kwargs.pop('nr_output_processors')
 59+
5860 if result_queue:
59 - result_queue = Queue()
 61+ result_queue = JoinableQueue()
6062
61 - load_input_queue(input_queue, obj, poison_pill=True)
 63+ input_queue = load_input_queue(obj, poison_pill=True)
6264
6365 if settings.PROGRESS_BAR:
6466 pbar = progressbar.ProgressBar(maxval=input_queue.qsize()).start()
 67+ kwargs['pbar'] = pbar
6568 else:
6669 pbar = False
6770
6871
6972 input_processes = [models.ProcessInputQueue(main, input_queue, result_queue,
70 - **kwargs) for i in xrange(settings.NUMBER_OF_PROCESSES -1)]
 73+ **kwargs) for i in xrange(nr_input_processors)]
7174
7275 for input_process in input_processes:
7376 input_process.start()
7477 pids = [p.pid for p in input_processes]
7578 kwargs['pids'] = pids
76 -
 79+
7780 if result_queue:
7881 result_processes = [models.ProcessResultQueue(result_processor,
79 - result_queue, **kwargs) for i in xrange(24)]
 82+ result_queue, **kwargs) for i in xrange(nr_output_processors)]
8083 for result_process in result_processes:
8184 result_process.start()
8285
@@ -95,7 +98,7 @@
9699 print 'Total elapsed time: %s.' % (utils.humanize_time_difference(pbar.seconds_elapsed))
97100
98101
99 -def load_queue(input_queue, obj, poison_pill=False):
 102+def load_queue(obj, poison_pill=False):
100103 '''
101104 @input_queue should be an instance of multiprocessing.Queue
102105
@@ -103,7 +106,7 @@
104107
105108 @returns: queue with tasks
106109 '''
107 -
 110+ input_queue = Queue()
108111 if isinstance(obj, type(list)):
109112 data = utils.load_object(obj)
110113 else:
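load_queue() now builds and returns its own queue instead of filling one supplied by the caller; note also that isinstance(obj, type(list)) compares against type itself, so isinstance(obj, list) is presumably what is meant. A simplified sketch of the new calling convention (not the committed body, which also supports loading pickled objects):

from multiprocessing import Queue

def load_queue(obj, poison_pill=False):
    '''Build the input queue internally and return it.'''
    input_queue = Queue()
    if not isinstance(obj, list):
        obj = [obj]
    for item in obj:
        input_queue.put(item)
    if poison_pill:
        input_queue.put(None)  # consumers treat None as 'stop'
    return input_queue

# Usage, mirroring build_scaffolding():
q = load_queue(['0.xml', '1.xml'], poison_pill=True)
while True:
    item = q.get()
    if item == None:
        break
    print 'Task:', item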
Index: trunk/tools/editor_trends/split_xml_file.py
@@ -21,6 +21,12 @@
2222 import codecs
2323 import utils
2424 import re
 25+import json
 26+import os
 27+
 28+import progressbar
 29+
 30+from utils import utils
2531 import settings
2632
2733 try:
@@ -30,6 +36,7 @@
3137 pass
3238
3339
 40+
3441 RE_NUMERIC_CHARACTER = re.compile('&#(\d+);')
3542
3643
@@ -38,7 +45,16 @@
3946
4047
4148 def lenient_deccharref(m):
42 - return unichr(int(m.group(1)))
 49+ try:
 50+ return unichr(int(m.group(1)))
 51+ except ValueError:
 52+ '''
 53+ There are a few articles that raise a ValueError here; the reason is
 54+ that this is a narrow Python build (UCS2) instead of a wide build
 55+ (UCS4). The quick fix is to return an empty string; the real solution
 56+ is to rebuild Python with UCS4 support.
 57+ '''
 58+ return ''
4359
4460
4561 def remove_namespace(element, namespace):
@@ -50,42 +66,70 @@
5167 elem.tag = elem.tag[nsl:]
5268 return element
5369
 70+def load_namespace(language):
 71+ file = '%s_ns.json' % language
 72+ fh = utils.create_txt_filehandle(settings.NAMESPACE_LOCATION, file, 'r', settings.ENCODING)
 73+ ns = json.load(fh)
 74+ fh.close()
 75+ ns = ns['query']['namespaces']
 76+ return ns
5477
 78+
 79+def build_namespaces_locale(namespaces):
 80+ ns = []
 81+ for namespace in namespaces:
 82+ value = namespaces[namespace].get(u'canonical', None)
 83+ if value != None and not value.endswith('talk'):
 84+ ns.append(value)
 85+ return ns
 86+
 87+
5588 def parse_comments(xml, function):
5689 revisions = xml.findall('revision')
5790 for revision in revisions:
5891 comment = revision.find('comment')
5992 timestamp = revision.find('timestamp').text
60 -
6193 # text1 = remove_ascii_control_characters(text)
6294 # text2 = remove_numeric_character_references(text)
6395 # text3 = convert_html_entities(text)
64 -
6596 if comment != None and comment.text != None:
6697 comment.text = function(comment.text)
6798 return xml
6899
69100
 101+def is_article_main_namespace(elem, namespace):
 102+ title = elem.find('title').text
 103+ for ns in namespace:
 104+ if title.startswith(ns):
 105+ return False
 106+ return True
 107+
 108+
 109+
70110 def write_xml_file(element, fh, counter, language):
71111 '''Get file handle and write xml element to file'''
72112 size = len(cElementTree.tostring(element))
73 - fh, counter = create_xml_file_handle(fh, counter, size)
74 - fh.write(cElementTree.tostring(element))
 113+ fh, counter = create_xml_file_handle(fh, counter, size, language)
 114+ try:
 115+ fh.write(cElementTree.tostring(element))
 116+ except MemoryError:
 117+ print 'Add error capturing logic'
75118 fh.write('\n')
76119 return fh, counter
77120
78121
79 -def create_xml_file_handle(fh, counter, size):
 122+def create_xml_file_handle(fh, counter, size, language):
80123 '''Create file handle if none is supplied or if file size > max file size.'''
 81125 if not fh:
 82126 counter = 0
 83 - fh = codecs.open(settings.LOCATION + '/' + language + '/' + str(counter) + '.xml', 'w', encoding=settings.ENCODING)
  124+ path = os.path.join(settings.XML_FILE_LOCATION, language, '%s.xml' % counter)
  125+ fh = codecs.open(path, 'w', encoding=settings.ENCODING)
 84128 return fh, counter
 85129 elif (fh.tell() + size) > settings.MAX_XML_FILE_SIZE:
 86130 print 'Created chunk %s' % counter
 87131 fh.close()
 88132 counter += 1
 89 - fh = codecs.open(settings.LOCATION + '/' + language + '/' + str(counter) + '.xml', 'w', encoding=settings.ENCODING)
  133+ path = os.path.join(settings.XML_FILE_LOCATION, language, '%s.xml' % counter)
  134+ fh = codecs.open(path, 'w', encoding=settings.ENCODING)
90134 return fh, counter
91135 else:
92136 return fh, counter
@@ -93,14 +137,21 @@
94138
95139 def split_xml(language):
96140 '''Reads xml file and splits it in N chunks'''
97 - result = utils.create_directory(language)
 141+ location = os.path.join(settings.XML_FILE_LOCATION, language)
 142+ result = utils.check_file_exists(location, '')
 143+ if result == False:
 144+ result = utils.create_directory(location)
98145 if not result:
99146 return
100147
 148+ ns = load_namespace(language)
 149+ ns = build_namespaces_locale(ns)
 150+
 151+
101152 fh = None
102153 counter = None
103154 tag = '{%s}page' % settings.NAME_SPACE
104 -
 155+
105156 context = cElementTree.iterparse(settings.XML_FILE, events=('start', 'end'))
106157 context = iter(context)
107158 event, root = context.next() # get the root element of the XML doc
@@ -110,12 +161,16 @@
111162 if elem.tag == tag:
112163 elem = remove_namespace(elem, settings.NAME_SPACE)
113164 elem = parse_comments(elem, remove_numeric_character_references)
 165+
 166+ if is_article_main_namespace(elem, ns):
 167+ fh, counter = write_xml_file(elem, fh, counter, language)
 168+
 169+ root.clear() # when done parsing a section, clear the tree to save memory
 170+
114171 #elem = parse_comments(elem, convert_html_entities)
115172 #elem = parse_comments(elem, remove_ascii_control_characters)
116 - fh, counter = write_xml_file(elem, fh, counter, language)
117173 #print cElementTree.tostring(elem)
118 - root.clear() # when done parsing a section clear the tree to safe memory
119174
120175
121176 if __name__ == "__main__":
122 - split_xml('enwiki')
 177+ split_xml('en')
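split_xml() relies on cElementTree.iterparse() plus root.clear() to stream a multi-gigabyte dump with flat memory use, and on the namespace list to keep only main-namespace articles. A condensed sketch of the streaming pattern (the file name is illustrative):

import xml.etree.cElementTree as cElementTree

NAME_SPACE = 'http://www.mediawiki.org/xml/export-0.4/'

def iterate_pages(xml_file):
    '''Yield <page> elements one at a time, clearing the tree as we go
    so memory use stays constant regardless of dump size.'''
    tag = '{%s}page' % NAME_SPACE
    context = iter(cElementTree.iterparse(xml_file, events=('start', 'end')))
    event, root = context.next()  # grab the root element first
    for event, elem in context:
        if event == 'end' and elem.tag == tag:
            yield elem
            root.clear()  # drop pages we are done with

# for page in iterate_pages('enwiki-20100916-stub-meta-history.xml'):
#     print page.find('{%s}title' % NAME_SPACE).text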
Index: trunk/tools/editor_trends/construct_datasets.py
@@ -19,7 +19,6 @@
2020
2121 from multiprocessing import Queue
2222 from Queue import Empty
23 -import sqlite3
2423
2524 import progressbar
2625
@@ -35,46 +34,63 @@
3635 pass
3736
3837
39 -def retrieve_editor_ids_mongo(RANDOM_SAMPLE=True):
40 - raise DeprecatedError
41 -# if utils.check_file_exists(settings.BINARY_OBJECT_FILE_LOCATION,
42 -# retrieve_editor_ids_mongo):
43 -# contributors = utils.load_object(settings.BINARY_OBJECT_FILE_LOCATION,
44 -# retrieve_editor_ids_mongo)
45 -# else:
46 -# mongo = db.init_mongo_db('editors')
47 -# editors = mongo['editors']
48 -# contributors = set()
49 -# #ids = editors.find().distinct('editor')
50 -# ids = editors.find()
51 -# for x, id in enumerate(ids):
52 -# contributors.add(id['editor'])
53 -# if len(contributors) == 100000:
54 -# if RANDOM_SAMPLE:
55 -# break
56 -# if contributors != set():
57 -# utils.store_object(contributors, settings.BINARY_OBJECT_FILE_LOCATION, retrieve_editor_ids_mongo)
58 -# return contributors
59 -
60 -def retrieve_ids_mongo_new(dbname, collection):
61 - if utils.check_file_exists(settings.TXT_FILE_LOCATION,
 38+def retrieve_editor_ids_mongo(dbname, collection):
 39+ if utils.check_file_exists(settings.BINARY_OBJECT_FILE_LOCATION,
6240 retrieve_editor_ids_mongo):
63 - ids = utils.load_object(settings.TXT_FILE_LOCATION,
 41+ ids = utils.load_object(settings.BINARY_OBJECT_FILE_LOCATION,
6442 retrieve_editor_ids_mongo)
6543 else:
6644 mongo = db.init_mongo_db(dbname)
6745 editors = mongo[collection]
68 - ids = editors.distinct()
69 - utils.store_object(contributors, settings.TXT_FILE_LOCATION, retrieve_editor_ids_mongo)
 46+ ids = editors.distinct('editor')
 47+ utils.store_object(ids, settings.BINARY_OBJECT_FILE_LOCATION, retrieve_editor_ids_mongo)
7048 return ids
7149
 50+
 51+def expand_edits(edits):
 52+ data = []
 53+ for edit in edits:
 54+ data.append(edit['date'])
 55+ return data
 56+
 57+
 58+def expand_observations(obs, vars_to_expand):
 59+ for var in vars_to_expand:
 60+ if var == 'edits':
 61+ obs[var] = expand_edits(obs[var])
 62+ elif var == 'edits_by_year':
 63+ keys = obs[var].keys()
 64+ keys.sort()
 65+ edits = []
 66+ for key in keys:
 67+ edits.append(str(obs[var][key]))
 68+ obs[var] = edits
 69+ return obs
 70+
 71+
 72+def expand_headers(headers, vars_to_expand, obs):
 73+ for var in vars_to_expand:
 74+ l = len(obs[var])
 75+ pos = headers.index(var)
 76+ for i in xrange(l):
 77+ if var.endswith('year'):
 78+ suffix = 2001 + i
 79+ elif var.endswith('edits'):
 80+ suffix = 1 + i
 81+ headers.insert(pos+i, '%s_%s' % (var, suffix))
 82+ headers.remove(var)
 83+ return headers
 84+
 85+
7286 def generate_editor_dataset(input_queue, data_queue, pbar, kwargs):
73 - definition = kwargs.pop('definition')
74 - limit = kwargs.pop('limit')
7587 debug = kwargs.pop('debug')
76 - mongo = db.init_mongo_db('editors')
77 - editors = mongo['editors']
78 - data = {}
 88+ dbname = kwargs.pop('dbname')
 89+ mongo = db.init_mongo_db(dbname)
 90+ editors = mongo['dataset']
 91+ name = dbname + '_editors.csv'
 92+ fh = utils.create_txt_filehandle(settings.DATASETS_FILE_LOCATION, name, 'a', settings.ENCODING)
 93+ x = 0
 94+ vars_to_expand = ['edits', 'edits_by_year']
7995 while True:
8096 try:
8197 if debug:
@@ -83,115 +99,68 @@
84100 id = input_queue.get(block=False)
85101
86102 print input_queue.qsize()
87 - if definition == 'Traditional':
88103
89 - obs = editors.find({'editor': id}, {'date':1}).sort('date').limit(limit)
90 - contributors = []
91 - for ob in obs:
92 - contributors.append(ob['date'])
93 - obs = ''
94 - else:
95 - obs = editors.find({'editor': id}, {'date':1}).sort('date')
96 - contributors = set()
97 - for ob in obs:
98 - if len(contributors) == limit:
99 - break
100 - else:
101 - contributors.add(ob['date'])
102 - obs.close()
103 - if len(contributors) < limit:
104 - new_wikipedian = False
105 - else:
106 - new_wikipedian = True
107 - data[id] = [contributors, new_wikipedian]
 104+ obs = editors.find_one({'editor': id})
 105+ obs = expand_observations(obs, vars_to_expand)
 106+ if x == 0:
 107+ headers = obs.keys()
 108+ headers.sort()
 109+ headers = expand_headers(headers, vars_to_expand, obs)
 110+ utils.write_list_to_csv(headers, fh)
 111+ fh.write('\n')
 112+ data = []
 113+ keys = obs.keys()
 114+ keys.sort()
 115+ for key in keys:
 116+ data.append(obs[key])
 117+ utils.write_list_to_csv(data, fh)
 118+ fh.write('\n')
108119
109 -
 120+ x += 1
110121 except Empty:
111 - utils.write_data_to_csv(data, settings.DATASETS_FILE_LOCATION, generate_editor_dataset, settings.ENCODING)
112122 break
 123+ fh.close()
113124
114125
115 -def retrieve_editor_ids_db():
116 - contributors = set()
117 - connection = db.init_database()
118 - cursor = connection.cursor()
119 - if settings.PROGRESS_BAR:
120 - cursor.execute('SELECT MAX(ROWID) FROM contributors')
121 - for id in cursor:
122 - pass
123 - pbar = progressbar.ProgressBar(maxval=id[0]).start()
124 -
125 - cursor.execute('SELECT contributor FROM contributors WHERE bot=0')
126 -
127 - print 'Retrieving contributors...'
128 - for x, contributor in enumerate(cursor):
129 - contributors.add(contributor[0])
130 - if x % 100000 == 0:
131 - pbar.update(x)
132 - print 'Serializing contributors...'
133 - utils.store_object(contributors, 'contributors')
134 - print 'Finished serializing contributors...'
135 -
136 - if pbar:
137 - pbar.finish()
138 - print 'Total elapsed time: %s.' % (utils.humanize_time_difference(pbar.seconds_elapsed))
139 -
140 - connection.close()
141 -
142 -
143 -def retrieve_edits_by_contributor(input_queue, result_queue, pbar):
144 - connection = db.init_database()
145 - cursor = connection.cursor()
146 -
147 - while True:
148 - try:
149 - contributor = input_queue.get(block=False)
150 - if contributor == None:
151 - break
152 -
153 - cursor.execute('SELECT contributor, timestamp, bot FROM contributors WHERE contributor=?', (contributor,))
154 - edits = {}
155 - edits[contributor] = set()
156 - for edit, timestamp, bot in cursor:
157 - date = utils.convert_timestamp_to_date(timestamp)
158 - edits[contributor].add(date)
159 - #print edit, timestamp, bot
160 -
161 - utils.write_data_to_csv(edits, retrieve_edits_by_contributor)
162 - if pbar:
163 - utils.update_progressbar(pbar, input_queue)
164 -
165 - except Empty:
166 - pass
167 -
168 - connection.close()
169 -
170 -
171126 def retrieve_edits_by_contributor_launcher():
172127 pc.build_scaffolding(pc.load_queue, retrieve_edits_by_contributor, 'contributors')
173128
174129
175130 def debug_retrieve_edits_by_contributor_launcher():
176 - q = Queue()
177 - kwargs = {'definition':'Traditional',
178 - 'limit': 10,
179 - 'debug': False
 131+ kwargs = {'debug': False,
 132+ 'dbname': 'enwiki',
180133 }
181 - ids = retrieve_editor_ids_mongo()
182 - input_queue = pc.load_queue(q, ids)
183 - generate_editor_dataset(input_queue, False, False, kwargs)
 134+ ids = retrieve_editor_ids_mongo('enwiki', 'editors')
 135+ input_queue = pc.load_queue(ids)
 136+ q = Queue()
 137+ generate_editor_dataset(input_queue, q, False, kwargs)
184138 #generate_editor_dataset_launcher()
185139 #retrieve_list_contributors()
186140 #retrieve_edits_by_contributor()
187141
188142 def generate_editor_dataset_launcher():
189 - kwargs = {'definition':'Traditional',
190 - 'limit': 10,
191 - 'debug': False
 143+ kwargs = {'nr_input_processors': 1,
 144+ 'nr_output_processors': 1,
 145+ 'debug': False,
 146+ 'dbname': 'enwiki',
192147 }
193 - pc.build_scaffolding(pc.load_queue, generate_editor_dataset, ids, False, False, kwargs)
 148+ ids = retrieve_editor_ids_mongo('enwiki', 'editors')
 149+ pc.build_scaffolding(pc.load_queue, generate_editor_dataset, ids, False, False, **kwargs)
194150
195151
 152+def generate_editor_dataset_debug():
 153+ ids = retrieve_editor_ids_mongo('enwiki', 'editors')
 154+ input_queue = pc.load_queue(ids)
 155+ #write_dataset(input_queue, [], 'enwiki')
 156+ kwargs = {'nr_input_processors': 1,
 157+ 'nr_output_processors': 1,
 158+ 'debug': True,
 159+ 'dbname': 'enwiki',
 160+ }
 161+ generate_editor_dataset(input_queue, False, False, kwargs)
 162+
 163+
196164 if __name__ == '__main__':
197 - #generate_editor_dataset_launcher()
198 - debug_retrieve_edits_by_contributor_launcher()
 165+ #generate_editor_dataset_debug()
 166+ generate_editor_dataset_launcher()
 167+ #debug_retrieve_edits_by_contributor_launcher()
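expand_headers() turns each multi-valued variable into one column per year (or per edit) so the CSV stays rectangular. A small worked example of the expansion logic, with illustrative values:

headers = ['edit_count', 'edits_by_year', 'editor']
obs = {'edit_count': 42,
       'edits_by_year': ['1', '5', '36'],   # 2001, 2002, 2003
       'editor': '94033'}

for var in ['edits_by_year']:
    pos = headers.index(var)
    for i in xrange(len(obs[var])):
        headers.insert(pos + i, '%s_%s' % (var, 2001 + i))
    headers.remove(var)

print headers
# ['edit_count', 'edits_by_year_2001', 'edits_by_year_2002',
#  'edits_by_year_2003', 'editor']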
Property changes on: trunk/tools/editor_trends
___________________________________________________________________
Modified: svn:ignore
- wikistats
  zips
  notes.txt
  *.pyc
  datasets
  errors
  .settings
  .project
  .pydevproject
+ wikistats
  zips
  notes.txt
  *.pyc
  datasets
  errors
  .settings
  .project
  .pydevproject
  wiki.cfg
