r75053 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r75052‎ \| r75053 \| r75054 >
Date:	22:29, 19 October 2010
Author:	diederik
Status:	deferred
Tags:
Comment:	Initial commit Editor Trends Analytics Package Current functionality: 1) Splitting XML file in chunks 2) Multiprocessor layer to distribute tasks among different cores 3) Extract information from XML 4) Store data in MongoDB / Sqlite database More information about this project can be found at: http://strategy.wikimedia.org/wiki/Editor_Trends_Study For more information: see README.1ST This version is not finished, might not always work, etc. etc. Use at your own discretion.
Modified paths:	/trunk/tools/editor_trends (added) (history) /trunk/tools/editor_trends/.svn_ignore (added) (history) /trunk/tools/editor_trends/README.1ST (added) (history) /trunk/tools/editor_trends/__init__.py (added) (history) /trunk/tools/editor_trends/algorithms (added) (history) /trunk/tools/editor_trends/algorithms/__init__.py (added) (history) /trunk/tools/editor_trends/algorithms/red_wiki_editors.py (added) (history) /trunk/tools/editor_trends/bots (added) (history) /trunk/tools/editor_trends/bots/__init__.py (added) (history) /trunk/tools/editor_trends/construct_datasets.py (added) (history) /trunk/tools/editor_trends/data (added) (history) /trunk/tools/editor_trends/data/csv (added) (history) /trunk/tools/editor_trends/data/database (added) (history) /trunk/tools/editor_trends/data/objects (added) (history) /trunk/tools/editor_trends/database (added) (history) /trunk/tools/editor_trends/database/__init__.py (added) (history) /trunk/tools/editor_trends/database/db.py (added) (history) /trunk/tools/editor_trends/database/db_settings.py (added) (history) /trunk/tools/editor_trends/datasets (added) (history) /trunk/tools/editor_trends/errors (added) (history) /trunk/tools/editor_trends/init_bot_db.py (added) (history) /trunk/tools/editor_trends/map_wiki_editors.py (added) (history) /trunk/tools/editor_trends/requirements.txt (added) (history) /trunk/tools/editor_trends/run.bat (added) (history) /trunk/tools/editor_trends/settings.py (added) (history) /trunk/tools/editor_trends/split_xml_file.py (added) (history) /trunk/tools/editor_trends/utils (added) (history) /trunk/tools/editor_trends/utils/__init__.py (added) (history) /trunk/tools/editor_trends/utils/models.py (added) (history) /trunk/tools/editor_trends/utils/process_constructor.py (added) (history) /trunk/tools/editor_trends/utils/utils.py (added) (history) /trunk/tools/editor_trends/wikitree (added) (history) /trunk/tools/editor_trends/wikitree/__init__.py (added) (history) 
/trunk/tools/editor_trends/wikitree/xml.py (added) (history)

Diff [purge]

Index: trunk/tools/editor_trends/map_wiki_editors.py
—	—	@@ -0,0 +1,266 @@
	2	+#!/usr/bin/python
	3	+# -- coding: utf-8 --
	4	+'''
	5	+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
	6	+This program is free software; you can redistribute it and/or
	7	+modify it under the terms of the GNU General Public License version 2
	8	+as published by the Free Software Foundation.
	9	+This program is distributed in the hope that it will be useful,
	10	+but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
	12	+See the GNU General Public License for more details, at
	13	+http://www.fsf.org/licenses/gpl.html
	14	+'''
	15	+
	16	+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
	17	+
	18	+#Default Python libraries (Python >= 2.6)
	19	+import sys
	20	+import os
	21	+import time
	22	+import codecs
	23	+import cStringIO
	24	+import re
	25	+import xml.etree.cElementTree as cElementTree
	26	+from multiprocessing import Queue
	27	+from Queue import Empty
	28	+
	29	+# Custom written files
	30	+import settings
	31	+from utils import utils, models
	32	+from database import db_settings
	33	+from database import db
	34	+from wikitree import xml
	35	+from utils import process_constructor as pc
	36	+
	37	+
# psyco is an optional dependency: when it can be imported psyco.full() is
# called, otherwise the module carries on without it.
try:
    import psyco
    psyco.full()
except ImportError:
    pass

# Module-level dictionary; none of the functions defined in this module
# reference it.
contributors = {}

# Case-insensitive patterns for the substrings 'bot' and 'script'; none of the
# functions defined in this module reference them.
RE_BOT = re.compile('bot', re.IGNORECASE)
RE_SCRIPT = re.compile('script', re.IGNORECASE)
	48	+#RE_NUMERIC_CHARACTER = re.compile('&#[\d{1,5}]+;')
	49	+#
	50	+#def remove_numeric_character_references(text):
	51	+# return re.sub(RE_NUMERIC_CHARACTER, '', text)
	52	+#
	53	+
	54	+
	55	+def determine_username_is_bot(username, kwargs):
	56	+ ids = kwargs.get('bots', [])
	57	+ if ids == None:
	58	+ ids = []
	59	+ if username != None and username.text != None:
	60	+ id = username.text
	61	+ if id in ids:
	62	+ return 1
	63	+ else:
	64	+ return 0
	65	+
	66	+
	67	+def extract_contributor_id(contributor, kwargs):
	68	+ '''
	69	+ @contributor is the xml contributor node containing a number of attributes
	70	+
	71	+ Currently, we are only interested in registered contributors, hence we
	72	+ ignore anonymous editors. If you are interested in collecting data on
	73	+ anonymous editors then add the string 'ip' to the tags variable.
	74	+ '''
	75	+ tags = ['id']
	76	+ if contributor.get('deleted'):
	77	+ return -1 #Not sure if this is the best way to code deleted contributors.
	78	+ for elem in contributor:
	79	+ if elem.tag in tags:
	80	+ if elem.text != None:
	81	+ return elem.text.decode('utf-8')
	82	+ else:
	83	+ return -1
	84	+
	85	+
	86	+def output_editor_information(elem, data_queue, **kwargs):
	87	+ tags = {'contributor': {'editor': extract_contributor_id, 'bot': determine_username_is_bot},
	88	+ 'timestamp': {'date': xml.extract_text},
	89	+ }
	90	+ vars = {}
	91	+ vars['article'] = elem.find('id').text.decode(settings.ENCODING)
	92	+ revisions = elem.findall('revision')
	93	+ for revision in revisions:
	94	+ #print vars
	95	+ elements = revision.getchildren()
	96	+ for tag, functions in tags.iteritems():
	97	+ xml_node = xml.retrieve_xml_node(elements, tag)
	98	+ for var, function in functions.iteritems():
	99	+ vars[var] = function(xml_node, kwargs)
	100	+
	101	+ #if vars['editor'] == '11887479' or vars['editor'] == '518794':
	102	+ # print vars
	103	+ #print '%s\t%s\t%s\t%s\t' % (vars['article'], vars['contributor'], vars['timestamp'], vars['bot'])
	104	+ if vars['bot'] == 0 and vars['editor'] != -1 and vars['editor'] != None:
	105	+ vars.pop('bot')
	106	+ vars['date'] = utils.convert_timestamp_to_date(vars['date'])
	107	+ data_queue.put(vars)
	108	+ vars={}
	109	+
	110	+def lookup_new_editors(xml_queue, data_queue, pbar, bots, debug=False, separator='\t'):
	111	+ if settings.DEBUG:
	112	+ messages = {}
	113	+ vars = {}
	114	+ while True:
	115	+ try:
	116	+ if debug:
	117	+ file = xml_queue
	118	+ else:
	119	+ file = xml_queue.get(block=False)
	120	+ #print 'parsing %s' % file
	121	+ if file == None:
	122	+ break
	123	+
	124	+ data = xml.read_input(utils.open_txt_file(settings.XML_FILE_LOCATION
	125	+ + file, 'r', encoding=settings.ENCODING))
	126	+ #data = read_input(sys.stdin)
	127	+ #print xml_queue.qsize()
	128	+ for raw_data in data:
	129	+ xml_buffer = cStringIO.StringIO()
	130	+ raw_data.insert(0, '<?xml version="1.0" encoding="UTF-8" ?>\n')
	131	+ raw_data = ''.join(raw_data)
	132	+ xml_buffer.write(raw_data)
	133	+
	134	+ try:
	135	+ elem = cElementTree.XML(xml_buffer.getvalue())
	136	+ output_editor_information(elem, data_queue, bots=bots)
	137	+ except SyntaxError, error:
	138	+ print error
	139	+ #There are few cases with invalid tokens, they are fixed
	140	+ #here and then reinserted into the XML DOM
	141	+ #data = convert_html_entities(xml_buffer.getvalue())
	142	+ #elem = cElementTree.XML(data)
	143	+ #output_editor_information(elem)
	144	+ if settings.DEBUG:
	145	+ utils.track_errors(xml_buffer, error, file, messages)
	146	+ except UnicodeEncodeError, error:
	147	+ print error
	148	+ if settings.DEBUG:
	149	+ utils.track_errors(xml_buffer, error, file, messages)
	150	+ #finally:
	151	+
	152	+
	153	+ if pbar:
	154	+ print xml_queue.qsize()
	155	+ #utils.update_progressbar(pbar, xml_queue)
	156	+ if debug:
	157	+ break
	158	+
	159	+ except Empty:
	160	+ break
	161	+
	162	+ if settings.DEBUG:
	163	+ utils.report_error_messages(messages, lookup_new_editors)
	164	+
	165	+
	166	+def store_data_mongo(data_queue, pids):
	167	+ mongo = db.init_mongo_db('editors')
	168	+ collection = mongo['editors']
	169	+ values = []
	170	+ while True:
	171	+ try:
	172	+ chunk = data_queue.get(block=False)
	173	+ values.append(chunk)
	174	+ #print chunk
	175	+ if len(values) == 100000:
	176	+ collection.insert(values)
	177	+ values = []
	178	+ #print data_queue.qsize()
	179	+ data_queue.task_done()
	180	+ except Empty:
	181	+ # The queue is empty but store the remaining values if present
	182	+ if values != []:
	183	+ collection.insert(values)
	184	+ values = []
	185	+
	186	+ #print [utils.check_if_process_is_running(pid) for pid in pids]
	187	+ '''
	188	+ This checks whether the Queue is empty because the preprocessors are
	189	+ finished or because this function is faster in emptying the Queue
	190	+ then the preprocessors are able to fill it. If this preprocessors
	191	+ are finished and this Queue is empty than break, else wait for the
	192	+ Queue to fill.
	193	+ '''
	194	+ if all([utils.check_if_process_is_running(pid) for pid in pids]):
	195	+ pass
	196	+ else:
	197	+ break
	198	+
	199	+
	200	+def store_data_db(data_queue, pids):
	201	+ connection = db.init_database()
	202	+ cursor = connection.cursor()
	203	+ db.create_tables(cursor, db_settings.CONTRIBUTOR_TABLE)
	204	+
	205	+ empty = 0
	206	+
	207	+ values = []
	208	+ while True:
	209	+ try:
	210	+ chunk = data_queue.get(block=False)
	211	+ contributor = chunk['contributor'].encode(settings.ENCODING)
	212	+ article = chunk['article']
	213	+ timestamp = chunk['timestamp'].encode(settings.ENCODING)
	214	+ bot = chunk['bot']
	215	+ values.append((contributor, article, timestamp, bot))
	216	+
	217	+ if len(values) == 50000:
	218	+ cursor.executemany('INSERT INTO contributors VALUES (?,?,?,?)', values)
	219	+ connection.commit()
	220	+ #print 'Size of queue: %s' % data_queue.qsize()
	221	+ values = []
	222	+
	223	+ except Empty:
	224	+ if all([utils.check_if_process_is_running(pid) for pid in pids]):
	225	+ pass
	226	+ else:
	227	+ break
	228	+ connection.close()
	229	+
	230	+
	231	+def run_stand_alone():
	232	+ files = utils.retrieve_file_list(settings.XML_FILE_LOCATION, 'xml')
	233	+ #files = files[:2]
	234	+ mongo = db.init_mongo_db('bots')
	235	+ bots = mongo['ids']
	236	+ ids = {}
	237	+ cursor = bots.find()
	238	+
	239	+ for bot in cursor:
	240	+ ids[bot['id']] = bot['name']
	241	+ pc.build_scaffolding(pc.load_queue, lookup_new_editors, files, store_data_mongo, True, bots=ids)
	242	+ db.add_index_to_collection('editors', 'date')
	243	+ db.add_index_to_collection('editors', 'name')
	244	+
	245	+def debug_lookup_new_editors():
	246	+ q = Queue()
	247	+ import progressbar
	248	+ pbar = progressbar.ProgressBar().start()
	249	+ edits = db.init_mongo_db('editors')
	250	+ lookup_new_editors('1.xml', q, None, None, True)
	251	+ db.add_index_to_collection('editors', 'date')
	252	+ db.add_index_to_collection('editors', 'name')
	253	+
	254	+
	255	+
	256	+def run_hadoop():
	257	+ pass
	258	+
	259	+
	260	+if __name__ == "__main__":
	261	+ #debug_lookup_new_editors()
	262	+
	263	+ if settings.RUN_MODE == 'stand_alone':
	264	+ run_stand_alone()
	265	+ print 'Finished processing XML files.'
	266	+ else:
	267	+ run_hadoop()
Property changes on: trunk/tools/editor_trends/map_wiki_editors.py
___________________________________________________________________
Added: svn:mime-type
1	268	+ text/plain
Added: svn:eol-style
2	269	+ native
Index: trunk/tools/editor_trends/wikitree/__init__.py
Property changes on: trunk/tools/editor_trends/wikitree/__init__.py
___________________________________________________________________
Added: svn:mime-type
3	270	+ text/plain
Added: svn:eol-style
4	271	+ native
Index: trunk/tools/editor_trends/wikitree/xml.py
—	—	@@ -0,0 +1,49 @@
	2	+#!/usr/bin/python
	3	+# -- coding: utf-8 --
	4	+'''
	5	+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
	6	+This program is free software; you can redistribute it and/or
	7	+modify it under the terms of the GNU General Public License version 2
	8	+as published by the Free Software Foundation.
	9	+This program is distributed in the hope that it will be useful,
	10	+but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
	12	+See the GNU General Public License for more details, at
	13	+http://www.fsf.org/licenses/gpl.html
	14	+'''
	15	+
	16	+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
	17	+
	18	+from utils import utils
	19	+import settings
	20	+
	21	+
def convert_html_entities(text):
    '''
    Pass @text to utils.unescape and return its result.

    NOTE(review): no function named unescape is visible in the utils module
    that accompanies this revision -- confirm it exists before relying on
    this wrapper.
    '''
    return utils.unescape(text)
	24	+
	25	+
	26	+def extract_text(elem, kwargs):
	27	+ if elem != None and elem.text != None:
	28	+ return elem.text.decode(settings.ENCODING)
	29	+ return None
	30	+
	31	+
	32	+def retrieve_xml_node(xml_nodes, name):
	33	+ for xml_node in xml_nodes:
	34	+ if xml_node.tag == name:
	35	+ return xml_node
	36	+ return None #maybe this should be replaced with an NotFoundError
	37	+
	38	+
	39	+def read_input(file):
	40	+ lines = []
	41	+ for line in file:
	42	+ lines.append(line)
	43	+ if line.find('</page>') > -1:
	44	+ yield lines
	45	+ '''
	46	+ #This looks counter intuitive but Python continues with this call
	47	+ after it has finished the yield statement
	48	+ '''
	49	+ lines = []
	50	+ file.close()
Property changes on: trunk/tools/editor_trends/wikitree/xml.py
___________________________________________________________________
Added: svn:mime-type
1	51	+ text/plain
Added: svn:eol-style
2	52	+ native
Index: trunk/tools/editor_trends/__init__.py
—	—	@@ -0,0 +1,14 @@
	2	+import os
	3	+import sys
	4	+
	5	+WORKING_DIRECTORY = os.getcwd()#[:-9]
	6	+IGNORE_DIRS = ['wikistats', 'zips']
	7	+
	8	+dirs = [name for name in os.listdir(WORKING_DIRECTORY) if
	9	+ os.path.isdir(os.path.join(WORKING_DIRECTORY, name))]
	10	+
	11	+
	12	+for subdirname in dirs:
	13	+ if not subdirname.startswith('.') and subdirname not in IGNORE_DIRS:
	14	+ sys.path.append(os.path.join(WORKING_DIRECTORY, subdirname))
	15	+ #print os.path.join(WORKING_DIRECTORY, subdirname)
Property changes on: trunk/tools/editor_trends/__init__.py
___________________________________________________________________
Added: svn:mime-type
1	16	+ text/plain
Added: svn:eol-style
2	17	+ native
Index: trunk/tools/editor_trends/.svn_ignore
—	—	@@ -0,0 +1,12 @@
	2	+*.pyc
	3	+*.xml
	4	+*.db
	5	+*.bin
	6	+*.zip
	7	+*.csv
	8	+.*
	9	+zips/
	10	+wikistats/
	11	+datasets/
	12	+data/
	13	+notes.txt
\ No newline at end of file
Index: trunk/tools/editor_trends/settings.py
—	—	@@ -0,0 +1,91 @@
	2	+#!/usr/bin/python
	3	+# -- coding: utf-8 --
	4	+'''
	5	+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
	6	+This program is free software; you can redistribute it and/or
	7	+modify it under the terms of the GNU General Public License version 2
	8	+as published by the Free Software Foundation.
	9	+This program is distributed in the hope that it will be useful,
	10	+but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
	12	+See the GNU General Public License for more details, at
	13	+http://www.fsf.org/licenses/gpl.html
	14	+'''
	15	+
	16	+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
	17	+
	18	+
	19	+'''
	20	+This file contains settings that are used for constructing and analyzing
	21	+the datasets as part of the Editor Dynamics and Anti-Vandalism projects.
	22	+'''
	23	+
	24	+
	25	+from multiprocessing import cpu_count
	26	+import os
	27	+import sys
	28	+import platform
	29	+
	30	+#Setting up the environment
	31	+ops = {platform.win32_ver: 'Windows',
	32	+ platform.linux_distribution: 'Linux',
	33	+ platform.mac_ver: 'OSX'}
	34	+for op in ops:
	35	+ if op() != ('', '', '') and op() != ('', ('', '', ''), ''):
	36	+ OS = ops[op]
	37	+
	38	+WORKING_DIRECTORY = os.getcwd()#[:-9]
	39	+IGNORE_DIRS = ['wikistats', 'zips']
	40	+
	41	+dirs = [name for name in os.listdir(WORKING_DIRECTORY) if os.path.isdir(os.path.join(WORKING_DIRECTORY, name))]
	42	+for subdirname in dirs:
	43	+ if not subdirname.startswith('.') and subdirname not in IGNORE_DIRS:
	44	+ sys.path.append(os.path.join(WORKING_DIRECTORY, subdirname))
	45	+
	46	+
#General settings

# Run mode. The value is compared with the exact string 'stand_alone' (with
# an underscore); the other intended value is 'hadoop'.
RUN_MODE = 'stand_alone'

# If true then some more detailed debug information is collected
DEBUG = True

#If True then it will display a progress bar on the console.
PROGRESS_BAR = True

#Date format as used by Erik Zachte
DATE_FORMAT = '%Y-%m-%d'

# Timestamp format as generated by the MediaWiki dumps
DATETIME_FORMAT = '%Y-%m-%dT%H:%M:%SZ'

#This section contains configuration variables for the different file locations.

# Location where to write xml chunks. File names are appended to this string
# directly, so it has to end with a path separator.
XML_FILE_LOCATION = 'C:/wikimedia/'

# Input file
XML_FILE = 'C:/Source_Files/enwiki-20100916-stub-meta-history.xml'

# This is the place where error messages are stored for debugging purposes
ERROR_MESSAGE_FILE_LOCATION = WORKING_DIRECTORY + '/errors/'

# Directory for database files
DATABASE_FILE_LOCATION = WORKING_DIRECTORY + '/data/database/'

# Directory for pickled ('.bin') objects
BINARY_OBJECT_FILE_LOCATION = WORKING_DIRECTORY + '/data/objects/'

#This section contains configuration variables for parsing / encoding and
#working with the XML files.

# 64 MB (64 * 1024 * 1024 bytes), see http://hadoop.apache.org/common/docs/r0.20.0/hdfs_design.html#Large+Data+Sets for reason
MAX_XML_FILE_SIZE = 67108864

# Encoding used to read the XML files and to decode element text
ENCODING = 'utf-8'

# Name space, do not change as this works for Mediawiki wikis
NAME_SPACE = 'http://www.mediawiki.org/xml/export-0.4/'

#Multiprocess settings used to parallelize workload
#Change this to match your computers configuration (RAM / CPU)
# One worker process per CPU core reported by multiprocessing.cpu_count().
NUMBER_OF_PROCESSES = cpu_count() * 1
Property changes on: trunk/tools/editor_trends/settings.py
___________________________________________________________________
Added: svn:mime-type
1	93	+ text/plain
Added: svn:eol-style
2	94	+ native
Index: trunk/tools/editor_trends/utils/__init__.py
Property changes on: trunk/tools/editor_trends/utils/__init__.py
___________________________________________________________________
Added: svn:mime-type
3	95	+ text/plain
Added: svn:eol-style
4	96	+ native
Index: trunk/tools/editor_trends/utils/utils.py
—	—	@@ -0,0 +1,267 @@
	2	+#!/usr/bin/python
	3	+# -- coding: utf-8 --
	4	+'''
	5	+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
	6	+This program is free software; you can redistribute it and/or
	7	+modify it under the terms of the GNU General Public License version 2
	8	+as published by the Free Software Foundation.
	9	+This program is distributed in the hope that it will be useful,
	10	+but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
	12	+See the GNU General Public License for more details, at
	13	+http://www.fsf.org/licenses/gpl.html
	14	+'''
	15	+
	16	+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
	17	+
	18	+'''
	19	+The utils module contains helper functions that will be needed throughout.
	20	+It provides functions to read / write data to text and binary files, fix markup
	21	+and track error messages.
	22	+'''
	23	+
	24	+import re
	25	+import htmlentitydefs
	26	+import cPickle
	27	+import datetime
	28	+import codecs
	29	+import os
	30	+import ctypes
	31	+
	32	+import settings
	33	+
	34	+
# psyco is an optional dependency: when it can be imported psyco.full() is
# called, otherwise the module carries on without it.
try:
    import psyco
    psyco.full()
except ImportError:
    pass


# Matches a run of digits; extract_offending_string uses the first match in
# an error message as a line number.
RE_ERROR_LOCATION = re.compile('\d+')
# Matches entity-like tokens such as '&amp;' or '&#123;'; none of the
# functions defined in this module reference it.
RE_NUMERIC_CHARACTER = re.compile('&#?\w+;')
	44	+
	45	+
	46	+def convert_timestamp_to_date(timestamp):
	47	+ return datetime.datetime.strptime(timestamp[:10], settings.DATE_FORMAT)
	48	+
	49	+
	50	+def convert_timestamp_to_datetime(timestamp):
	51	+ return datetime.datetime.strptime(timestamp, settings.DATETIME_FORMAT)
	52	+
	53	+
	54	+def check_if_process_is_running(pid):
	55	+ try:
	56	+ if settings.OS == 'Windows':
	57	+ PROCESS_TERMINATE = 1
	58	+ handle = ctypes.windll.kernel32.OpenProcess(PROCESS_TERMINATE, False, pid)
	59	+ if handle != 0:
	60	+ return True
	61	+ else:
	62	+ return False
	63	+ else:
	64	+ os.kill(pid, 0)
	65	+ return Tru
	66	+ except Exception, error:
	67	+ print error
	68	+ return False
	69	+
	70	+
	71	+# error tracking related functions
	72	+def track_errors(xml_buffer, error, file, messages):
	73	+ text = extract_offending_string(xml_buffer.getvalue(), error)
	74	+
	75	+ vars = {}
	76	+ vars['file'] = file
	77	+ vars['error'] = error
	78	+ vars['text'] = text
	79	+ #print file, error, text
	80	+ key = remove_error_specific_information(error)
	81	+ if key not in messages:
	82	+ messages[key] = {}
	83	+ if messages[key] == {}:
	84	+ c = 0
	85	+ else:
	86	+ counters = messages[key].keys()
	87	+ counters.sort()
	88	+ counters.reverse()
	89	+ c = counters[-1]
	90	+
	91	+ messages[key][c] = {}
	92	+ for var in vars:
	93	+ messages[key][c][var] = vars[var]
	94	+
	95	+ return messages
	96	+
	97	+
	98	+def report_error_messages(messages, function):
	99	+ store_object(messages, settings.ERROR_MESSAGE_FILE_LOCATION, function.func_name)
	100	+ errors = messages.keys()
	101	+ for error in errors:
	102	+ for key, value in messages[error].iteritems():
	103	+ print error, key, value
	104	+
	105	+
	106	+def remove_error_specific_information(e):
	107	+ pos = e.args[0].find('line')
	108	+ if pos > -1:
	109	+ return e.args[0][:pos]
	110	+ else:
	111	+ return e.args[0]
	112	+
	113	+
	114	+def extract_offending_string(text, error):
	115	+ '''
	116	+ This function determines the string that causes an error when feeding it to
	117	+ the XML parser. This is only useful for debugging purposes.
	118	+ '''
	119	+ location = re.findall(RE_ERROR_LOCATION, error.args[0])
	120	+ if location != []:
	121	+ location = int(location[0]) - 1
	122	+ text = text.split('\n')[location]
	123	+ text = text.decode('utf-8')
	124	+ return text
	125	+ else:
	126	+ return ''
	127	+
	128	+
	129	+# read / write data related functions
	130	+def read_data_from_csv(filename, encoding):
	131	+ if hasattr(filename, '__call__'):
	132	+ filename = construct_filename_from_function(filename)
	133	+
	134	+ fh = open_txt_file(filename, 'r', encoding=encoding)
	135	+ for line in fh:
	136	+ yield line
	137	+
	138	+ fh.close()
	139	+
	140	+
	141	+def write_data_to_csv(data, function, encoding):
	142	+ filename = construct_filename_from_function(function, '.csv')
	143	+ fh = open_txt_file(filename, 'a', encoding=encoding)
	144	+ keys = data.keys()
	145	+ for key in keys:
	146	+ for value in data[key]:
	147	+ fh.write('%s\t%s\n' % (key, value))
	148	+ fh.close()
	149	+
	150	+
	151	+def open_txt_file(filename, mode, encoding):
	152	+ return codecs.open(filename, mode, encoding=encoding)
	153	+
	154	+def construct_filename_from_function(function, extension):
	155	+ return function.func_name + extension
	156	+
	157	+def check_file_exists(location, filename):
	158	+ if hasattr(filename, '__call__'):
	159	+ filename = construct_filename_from_function(filename, '.bin')
	160	+ if os.path.exists(location + filename):
	161	+ return True
	162	+ else:
	163	+ return False
	164	+
	165	+
	166	+def store_object(object, location, filename):
	167	+ if hasattr(filename, '__call__'):
	168	+ filename = construct_filename_from_function(filename, '.bin')
	169	+ if not filename.endswith('.bin'):
	170	+ filename = filename + '.bin'
	171	+ fh = open(location + filename, 'wb')
	172	+ cPickle.dump(object, fh)
	173	+ fh.close()
	174	+
	175	+
	176	+def load_object(location, filename):
	177	+ if hasattr(filename, '__call__'):
	178	+ filename = construct_filename_from_function(filename, '.bin')
	179	+ if not filename.endswith('.bin'):
	180	+ filename = filename + '.bin'
	181	+ fh = open(location + filename, 'rb')
	182	+ obj = cPickle.load(fh)
	183	+ fh.close()
	184	+ return obj
	185	+
	186	+
	187	+def clean_string(string):
	188	+ string = string.replace('\n', '')
	189	+ return string
	190	+
	191	+
	192	+def create_dict_from_csv_file(filename, encoding):
	193	+ d = {}
	194	+ for line in read_data_from_csv(filename, encoding):
	195	+ line = clean_string(line)
	196	+ value, key = line.split('\t')
	197	+ d[key] = value
	198	+
	199	+ return d
	200	+
	201	+
	202	+def retrieve_file_list(location, extension):
	203	+ all_files = os.listdir(location)
	204	+ if not extension.startswith('.'):
	205	+ extension = '.' + extension
	206	+ files = []
	207	+ for file in all_files:
	208	+ if file.endswith(extension):
	209	+ files.append(file)
	210	+
	211	+ return files
	212	+
	213	+
	214	+# Progress bar related functions
	215	+def update_progressbar(pbar, queue):
	216	+ '''
	217	+ Updates the progressbar by determining how much work is left in a queue
	218	+ '''
	219	+ x = pbar.maxval - queue.qsize()
	220	+ '''
	221	+ Currently, calling the pbar.update function gives the following error:
	222	+ File "build\bdist.win32\egg\progressbar.py", line 352, in update
	223	+ self.fd.write(self._format_line() + '\r')
	224	+ ValueError: I/O operation on closed file
	225	+ Not sure how to fix this, that's why the line is commented.
	226	+ '''
	227	+ #pbar.update(x)
	228	+
	229	+
	230	+def humanize_time_difference(seconds_elapsed):
	231	+ """
	232	+ Returns a humanized string representing time difference.
	233	+ It will only output the first two time units, so days and
	234	+ hours, or hours and minutes, except when there are only
	235	+ seconds.
	236	+ """
	237	+ seconds_elapsed = int(seconds_elapsed)
	238	+ humanized_time = {}
	239	+ time_units = [('days', 86400), ('hours', 3600), ('minutes', 60), ('seconds', 1)]
	240	+ for time, unit in time_units:
	241	+ dt = seconds_elapsed / unit
	242	+ if dt > 0:
	243	+ humanized_time[time] = dt
	244	+ seconds_elapsed = seconds_elapsed - (unit * humanized_time[time])
	245	+ #humanized_time['seconds'] = seconds_elapsed
	246	+
	247	+ x = 0
	248	+ if len(humanized_time) == 1:
	249	+ return '%s %s' % (humanized_time['seconds'], 'seconds')
	250	+ else:
	251	+ obs = []
	252	+ for time, unit in time_units:
	253	+ if time in humanized_time:
	254	+ unit = humanized_time.get(time, None)
	255	+ if humanized_time[time] == 1:
	256	+ time = time[:-1]
	257	+ obs.append((time, unit))
	258	+ x += 1
	259	+ if x == 2:
	260	+ return '%s %s and %s %s' % (obs[0][1], obs[0][0], obs[1][1], obs[1][0])
	261	+
	262	+
	263	+def debug():
	264	+ dt = humanize_time_difference(64)
	265	+ print dt
	266	+
	267	+if __name__ == '__main__':
	268	+ debug()
Property changes on: trunk/tools/editor_trends/utils/utils.py
___________________________________________________________________
Added: svn:mime-type
1	269	+ text/plain
Added: svn:eol-style
2	270	+ native
Index: trunk/tools/editor_trends/utils/models.py
—	—	@@ -0,0 +1,54 @@
	2	+#!/usr/bin/python
	3	+# -- coding: utf-8 --
	4	+'''
	5	+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
	6	+This program is free software; you can redistribute it and/or
	7	+modify it under the terms of the GNU General Public License version 2
	8	+as published by the Free Software Foundation.
	9	+This program is distributed in the hope that it will be useful,
	10	+but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
	12	+See the GNU General Public License for more details, at
	13	+http://www.fsf.org/licenses/gpl.html
	14	+'''
	15	+
	16	+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
	17	+
	18	+import multiprocessing
	19	+
	20	+
	21	+class ProcessInputQueue(multiprocessing.Process):
	22	+
	23	+ def __init__(self, target, input_queue, result_queue, pbar, **kwargs):
	24	+ multiprocessing.Process.__init__(self)
	25	+ self.input_queue = input_queue
	26	+ self.result_queue = result_queue
	27	+ self.target = target
	28	+ self.progressbar = pbar
	29	+ for kw in kwargs:
	30	+ setattr(self, kw, kwargs[kw])
	31	+
	32	+ def run(self):
	33	+ proc_name = self.name
	34	+ kwargs = {}
	35	+ IGNORE = [self.input_queue, self.result_queue, self.target,
	36	+ self.progressbar]
	37	+ for kw in self.__dict__:
	38	+ if kw not in IGNORE and not kw.startswith('_'):
	39	+ kwargs[kw] = getattr(self, kw)
	40	+
	41	+ self.target(self.input_queue, self.result_queue, self.progressbar, kwargs)
	42	+
	43	+
	44	+class ProcessResultQueue(multiprocessing.Process):
	45	+
	46	+ def __init__(self, target, result_queue, pids, pbar):
	47	+ multiprocessing.Process.__init__(self)
	48	+ self.result_queue = result_queue
	49	+ self.target = target
	50	+ self.progressbar = pbar
	51	+ self.pids = pids
	52	+
	53	+ def run(self):
	54	+ proc_name = self.name
	55	+ self.target(self.result_queue, self.pids)
Property changes on: trunk/tools/editor_trends/utils/models.py
___________________________________________________________________
Added: svn:mime-type
1	56	+ text/plain
Added: svn:eol-style
2	57	+ native
Index: trunk/tools/editor_trends/utils/process_constructor.py
—	—	@@ -0,0 +1,113 @@
	2	+#!/usr/bin/python
	3	+# -- coding: utf-8 --
	4	+'''
	5	+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
	6	+This program is free software; you can redistribute it and/or
	7	+modify it under the terms of the GNU General Public License version 2
	8	+as published by the Free Software Foundation.
	9	+This program is distributed in the hope that it will be useful,
	10	+but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
	12	+See the GNU General Public License for more details, at
	13	+http://www.fsf.org/licenses/gpl.html
	14	+'''
	15	+
	16	+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
	17	+
	18	+from multiprocessing import Process, Queue, JoinableQueue
	19	+from Queue import Empty
	20	+
	21	+import settings
	22	+import utils
	23	+import models
	24	+
	25	+#3rd party dependency
	26	+import progressbar
	27	+
	28	+
def build_scaffolding(load_input_queue, main, obj, result_processor=False, result_queue=False, **kwargs):
    '''
    Generic producer/consumer process launcher. It can launch two types of
    processes:
    a) Processes that take a task from a queue and do their thing
    b) Processes that take a task from a queue and put the result in the
       result_queue.
    If result_queue is False then a) is assumed.

    @load_input_queue is a function that is used to insert jobs into the
    queue. It is called as load_input_queue(input_queue, obj, poison_pill=True).

    @main is the function that will process the input_queue

    @obj can be a pickled object or an enumerable variable that will be loaded
    into the input_queue

    @result_processor is the function that processes the @result_queue. It is
    only used when @result_queue is truthy.

    @result_queue, if truthy, is replaced by a JoinableQueue which is handed
    to the input processes and to the single result process. If False, no
    result process is started.

    @kwargs are optional keyword arguments, forwarded to
    models.ProcessInputQueue.

    The function blocks until all started processes have been joined.
    '''

    input_queue = Queue()
    if result_queue:
        result_queue = JoinableQueue()

    load_input_queue(input_queue, obj, poison_pill=True)

    # The progress bar is sized on the number of queued items at this point,
    # which includes whatever load_input_queue added as poison pills.
    if settings.PROGRESS_BAR:
        pbar = progressbar.ProgressBar(maxval=input_queue.qsize()).start()
    else:
        pbar = False


    input_processes = [models.ProcessInputQueue(main, input_queue, result_queue,
        pbar, **kwargs) for i in xrange(settings.NUMBER_OF_PROCESSES)]

    for input_process in input_processes:
        input_process.start()
    # The pids of the input processes are passed on to the result process.
    pids = [p.pid for p in input_processes]

    if result_queue:
        result_processes = [models.ProcessResultQueue(result_processor,
            result_queue, pids, pbar) for i in xrange(1)]
        for result_process in result_processes:
            result_process.start()

    for input_process in input_processes:
        print 'Waiting for input process to finish'
        input_process.join()
        print 'Input process finished'

    if result_queue:
        for result_process in result_processes:
            print 'Waiting for result process to finish.'
            result_process.join()
            print 'Result process finished'

    if pbar:
        pbar.finish()
        print 'Total elapsed time: %s.' % (utils.humanize_time_difference(pbar.seconds_elapsed))
	93	+
	94	+
	95	+def load_queue(input_queue, obj, poison_pill=False):
	96	+ '''
	97	+ @input_queue should be an instance of multiprocessing.Queue
	98	+
	99	+ @obj either pickled or enumerable variable that contains the tasks
	100	+
	101	+ @returns: queue with tasks
	102	+ '''
	103	+
	104	+ if isinstance(obj, type(list)):
	105	+ data = utils.load_object(obj)
	106	+ else:
	107	+ data = obj
	108	+ for d in data:
	109	+ input_queue.put(d)
	110	+
	111	+ if poison_pill:
	112	+ for p in xrange(settings.NUMBER_OF_PROCESSES):
	113	+ input_queue.put(None)
	114	+ return input_queue
Property changes on: trunk/tools/editor_trends/utils/process_constructor.py
___________________________________________________________________
Added: svn:mime-type
1	115	+ text/plain
Added: svn:eol-style
2	116	+ native
Index: trunk/tools/editor_trends/requirements.txt
—	—	@@ -0,0 +1,2 @@
	2	+progressbar==2.3-dev
	3	+psyco==1.6
Property changes on: trunk/tools/editor_trends/requirements.txt
___________________________________________________________________
Added: svn:mime-type
1	4	+ text/plain
Added: svn:eol-style
2	5	+ native
Index: trunk/tools/editor_trends/split_xml_file.py
—	—	@@ -0,0 +1,127 @@
	2	+#!/usr/bin/python
	3	+# -- coding: utf-8 --
	4	+'''
	5	+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
	6	+This program is free software; you can redistribute it and/or
	7	+modify it under the terms of the GNU General Public License version 2
	8	+as published by the Free Software Foundation.
	9	+This program is distributed in the hope that it will be useful,
	10	+but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
	12	+See the GNU General Public License for more details, at
	13	+http://www.fsf.org/licenses/gpl.html
	14	+'''
	15	+
	16	+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
	17	+
	18	+
	19	+import xml.etree.cElementTree as cElementTree
	20	+import codecs
	21	+import utils
	22	+import re
	23	+import settings
	24	+
	25	+try:
	26	+ import psyco
	27	+ psyco.full()
	28	+except ImportError:
	29	+ pass
	30	+
	31	+
	32	+RE_NUMERIC_CHARACTER = re.compile('&#(\d+);')
	33	+
	34	+#def convert_html_entities(text):
	35	+# return utils.unescape(text)
	36	+
	37	+
	38	+def remove_numeric_character_references(text):
	39	+ return re.sub(RE_NUMERIC_CHARACTER, lenient_deccharref, text).encode('utf-8')
	40	+
	41	+
	42	+def lenient_deccharref(m):
	43	+ return unichr(int(m.group(1)))
	44	+
	45	+
	46	+def remove_namespace(element, namespace):
	47	+ '''Remove namespace from the document.'''
	48	+ ns = u'{%s}' % namespace
	49	+ nsl = len(ns)
	50	+ for elem in element.getiterator():
	51	+ if elem.tag.startswith(ns):
	52	+ elem.tag = elem.tag[nsl:]
	53	+ return element
	54	+
	55	+
	56	+def parse_comments(xml, function):
	57	+ revisions = xml.findall('revision')
	58	+ for revision in revisions:
	59	+ comment = revision.find('comment')
	60	+ timestamp = revision.find('timestamp').text
	61	+ #if timestamp == '2007-11-25T09:21:11Z':
	62	+ # print 'debug'
	63	+ # text = comment.text
	64	+ #test2 = text.encode('utf-8')
	65	+ #test = text.decode('utf-8')
	66	+
	67	+# text1 = remove_ascii_control_characters(text)
	68	+# text2 = remove_numeric_character_references(text)
	69	+# text3 = convert_html_entities(text)
	70	+
	71	+ if comment != None and comment.text != None:
	72	+ #print comment.text.encode('utf-8')
	73	+
	74	+ comment.text = function(comment.text)
	75	+ #text = comment.text
	76	+ #print text
	77	+ return xml
	78	+
	79	+
	80	+def write_xml_file(element, fh, counter):
	81	+ '''Get file handle and write xml element to file'''
	82	+ size = len(cElementTree.tostring(element))
	83	+ fh, counter = create_xml_file_handle(fh, counter, size)
	84	+ fh.write(cElementTree.tostring(element))
	85	+ fh.write('\n')
	86	+ return fh, counter
	87	+
	88	+
	89	+def create_xml_file_handle(fh, counter, size):
	90	+ '''Create file handle if none is supplied or if file size > max file size.'''
	91	+ if not fh:
	92	+ counter = 0
	93	+ fh = codecs.open(settings.LOCATION + str(counter) + '.xml', 'w', encoding=settings.ENCODING)
	94	+ return fh, counter
	95	+ elif (fh.tell() + size) > settings.MAX_XML_FILE_SIZE:
	96	+ print 'Created chunk %s' % counter
	97	+ fh.close
	98	+ counter += 1
	99	+ fh = codecs.open(settings.LOCATION + str(counter) + '.xml', 'w', encoding=settings.ENCODING)
	100	+ return fh, counter
	101	+ else:
	102	+ return fh, counter
	103	+
	104	+
	105	+def split_xml():
	106	+ '''Reads xml file and splits it in N chunks'''
	107	+ fh = None
	108	+ counter = None
	109	+ tag = '{%s}page' % settings.NAME_SPACE
	110	+
	111	+ context = cElementTree.iterparse(settings.XML_FILE, events=('start', 'end'))
	112	+ context = iter(context)
	113	+ event, root = context.next() # get the root element of the XML doc
	114	+
	115	+ for event, elem in context:
	116	+ if event == 'end':
	117	+ if elem.tag == tag:
	118	+ elem = remove_namespace(elem, settings.NAME_SPACE)
	119	+ elem = parse_comments(elem, remove_numeric_character_references)
	120	+ #elem = parse_comments(elem, convert_html_entities)
	121	+ #elem = parse_comments(elem, remove_ascii_control_characters)
	122	+ fh, counter = write_xml_file(elem, fh, counter)
	123	+ #print cElementTree.tostring(elem)
	124	+ root.clear() # when done parsing a section clear the tree to safe memory
	125	+
	126	+
	127	+if __name__ == "__main__":
	128	+ split_xml()
Property changes on: trunk/tools/editor_trends/split_xml_file.py
___________________________________________________________________
Added: svn:mime-type
1	129	+ text/plain
Added: svn:eol-style
2	130	+ native
Index: trunk/tools/editor_trends/README.1ST
—	—	@@ -0,0 +1,65 @@
	2	+===============================================================================
	3	+
	4	+ Wikipedia Editor Trends Analytics
	5	+
	6	+===============================================================================
	7	+
	8	+BACKGROUND:
	9	+This package offers a set of tools used to create datasets to analyze Editor
	10	+Trends. By Editor Trends we refer to the overall pattern of entering and leaving
	11	+a Wikipedia site. The main information source for this package is:
	12	+ http://strategy.wikimedia.org/wiki/Editor_Trends_Study
	13	+
	14	+REQUIREMENTS:
	15	+
	16	+* Python 2.6 or higher (this code has not been tested with Python 3.x)
	17	+
	18	+OPTIONAL
	19	+* MongoDB
	20	+
	21	+If you don't want to install / use MongoDB then the package will use the built-in
	22	+Sqlite library. However, this is not optimized for speed and may take a serious
	23	+amount of time. If possible, install MongoDB.
	24	+
	25	+INSTALLING USING VIRTUALENV
	26	+It's recommended to use Python virtualenv. If you are not familiar with
	27	+virtualenv then have a look over here:
	28	+ http://groups.google.com/group/python-virtualenv/browse_thread/thread/f2f19d2cc93a844e
	29	+
	30	+To install Editor Trends Analytics:
	31	+
	32	+ virtualenv --no-site-packages --distribute editor_trends
	33	+ pip install -E editor_trends -r /editor_trends/requirements.txt
	34	+
	35	+
	36	+The first command creates a new virtualenv called editor_trends and the second
	37	+command installs the dependencies. Currently the dependencies are:
	38	+* PyMongo
	39	+* Progressbar
	40	+
	41	+INSTALLING WITHOUT VIRTUALENV
	42	+If you don't like virtualenv then do the following:
	43	+
	44	+ easy_install pymongo
	45	+ easy_install progressbar
	46	+
	47	+IMPORTANT MONGODB NOTES
	48	+If you decide to use MongoDB to store the results then you have to install the
	49	+64-bit version. 32-bit versions of MongoDB are limited to 2GB of data and the
	50	+databases created by this package will definitely be larger than that. For more
	51	+background information on this limitation, please read:
	52	+ http://blog.mongodb.org/post/137788967/32-bit-limitations
	53	+
	54	+
	55	+CONFIGURATION:
	56	+If you would like to create a dataset for your own analyses then you should
	57	+first make the appropriate changes to settings.py. Settings.py contains
	58	+configuration variables such as the location of input and output files. Most
	59	+settings are self-explanatory but in cases of any questions please drop me a
	60	+line.
	61	+
	62	+CODE:
	63	+The Python code adheres to PEP8. Function names are deliberately expressive to
	64	+ease understanding what's going on. If you find a bug please email me at dvanliere
	65	+at gmail dot com or leave a message on my Talk page.
	66	+
Property changes on: trunk/tools/editor_trends/README.1ST
___________________________________________________________________
Added: native
1	67	+ svn:eol-style=native
Index: trunk/tools/editor_trends/construct_datasets.py
—	—	@@ -0,0 +1,159 @@
	2	+#!/usr/bin/python
	3	+# -- coding: utf-8 --
	4	+'''
	5	+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
	6	+This program is free software; you can redistribute it and/or
	7	+modify it under the terms of the GNU General Public License version 2
	8	+as published by the Free Software Foundation.
	9	+This program is distributed in the hope that it will be useful,
	10	+but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
	12	+See the GNU General Public License for more details, at
	13	+http://www.fsf.org/licenses/gpl.html
	14	+'''
	15	+
	16	+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
	17	+
	18	+from multiprocessing import Queue
	19	+from Queue import Empty
	20	+import sqlite3
	21	+
	22	+import progressbar
	23	+
	24	+import settings
	25	+from utils import models, utils
	26	+from database import db
	27	+from utils import process_constructor as pc
	28	+
	29	+try:
	30	+ import psyco
	31	+ psyco.full()
	32	+except ImportError:
	33	+ pass
	34	+
	35	+
def retrieve_editor_ids_mongo():
    '''
    Return the distinct editor ids, using a stored copy when one exists.

    When utils.check_file_exists reports a stored object for this function
    under settings.BINARY_OBJECT_FILE_LOCATION, the ids are read back with
    utils.load_object. Otherwise they are queried from the 'editors'
    collection of the 'editors' MongoDB database and, when the result is not
    empty, saved with utils.store_object.

    @returns: the ids
    '''
    # NOTE(review): the function object itself is passed to the utils helpers;
    # presumably they derive a file name from it -- confirm in utils.
    if utils.check_file_exists(settings.BINARY_OBJECT_FILE_LOCATION,
                               retrieve_editor_ids_mongo):
        ids = utils.load_object(settings.BINARY_OBJECT_FILE_LOCATION,
                               retrieve_editor_ids_mongo)
    else:
        mongo = db.init_mongo_db('editors')
        editors = mongo['editors']
        ids = editors.find().distinct('editor')
        print ids
        if ids != []:
            utils.store_object(ids, settings.BINARY_OBJECT_FILE_LOCATION, retrieve_editor_ids_mongo)
    return ids
	49	+
	50	+
	51	+def generate_editor_dataset(input_queue, data_queue, pbar, kwargs):
	52	+ definition = kwargs.pop('definition')
	53	+ limit = kwargs.pop('limit')
	54	+ debug = kwargs.pop('debug')
	55	+ mongo = db.init_mongo_db('editors')
	56	+ editors = mongo['editors']
	57	+ while True:
	58	+ try:
	59	+ if debug:
	60	+ id = u'99797'
	61	+ else:
	62	+ id = input_queue.get(block=False)
	63	+
	64	+ contributors = set()
	65	+ if definition == 'Traditional':
	66	+ obs = editors.find({'editor': id}).limit(limit) #.sort({'date': 1}).limit(limit)
	67	+ for ob in obs:
	68	+ contributors.add(ob)
	69	+ else:
	70	+ obs = editors.find({'editor': id}).sort({'date': 1})
	71	+ for ob in obs:
	72	+ if len(dates) > limit:
	73	+ break
	74	+ else:
	75	+ if edit.date not in dates:
	76	+ set.add(edit)
	77	+ utils.write_data_to_csv(contributors, generate_editor_dataset, settings.ENCODING)
	78	+
	79	+ except Empty:
	80	+ break
	81	+
	82	+
	83	+def retrieve_editor_ids_db():
	84	+ contributors = set()
	85	+ connection = db.init_database()
	86	+ cursor = connection.cursor()
	87	+ if settings.PROGRESS_BAR:
	88	+ cursor.execute('SELECT MAX(ROWID) FROM contributors')
	89	+ for id in cursor:
	90	+ pass
	91	+ pbar = progressbar.ProgressBar(maxval=id[0]).start()
	92	+
	93	+ cursor.execute('SELECT contributor FROM contributors WHERE bot=0')
	94	+
	95	+ print 'Retrieving contributors...'
	96	+ for x, contributor in enumerate(cursor):
	97	+ contributors.add(contributor[0])
	98	+ if x % 100000 == 0:
	99	+ pbar.update(x)
	100	+ print 'Serializing contributors...'
	101	+ utils.store_object(contributors, 'contributors')
	102	+ print 'Finished serializing contributors...'
	103	+
	104	+ if pbar:
	105	+ pbar.finish()
	106	+ print 'Total elapsed time: %s.' % (utils.humanize_time_difference(pbar.seconds_elapsed))
	107	+
	108	+ connection.close()
	109	+
	110	+
	111	+def retrieve_edits_by_contributor(input_queue, result_queue, pbar):
	112	+ connection = db.init_database()
	113	+ cursor = connection.cursor()
	114	+
	115	+ while True:
	116	+ try:
	117	+ contributor = input_queue.get(block=False)
	118	+ if contributor == None:
	119	+ break
	120	+
	121	+ cursor.execute('SELECT contributor, timestamp, bot FROM contributors WHERE contributor=?', (contributor,))
	122	+ edits = {}
	123	+ edits[contributor] = set()
	124	+ for edit, timestamp, bot in cursor:
	125	+ date = utils.convert_timestamp_to_date(timestamp)
	126	+ edits[contributor].add(date)
	127	+ #print edit, timestamp, bot
	128	+
	129	+ utils.write_data_to_csv(edits, retrieve_edits_by_contributor)
	130	+ if pbar:
	131	+ utils.update_progressbar(pbar, input_queue)
	132	+
	133	+ except Empty:
	134	+ pass
	135	+
	136	+ connection.close()
	137	+
	138	+
	139	+def retrieve_edits_by_contributor_launcher():
	140	+ pc.build_scaffolding(pc.load_queue, retrieve_edits_by_contributor, 'contributors')
	141	+
	142	+
	143	+def debug_retrieve_edits_by_contributor_launcher():
	144	+ input_queue = Queue()
	145	+ kwargs = {'definition':'Traditional',
	146	+ 'limit': 10,
	147	+ 'debug': True
	148	+ }
	149	+ generate_editor_dataset(input_queue, False, False, kwargs)
	150	+ generate_editor_dataset_launcher()
	151	+ #retrieve_list_contributors()
	152	+ #retrieve_edits_by_contributor()
	153	+
	154	+def generate_editor_dataset_launcher():
	155	+ ids = retrieve_editor_ids_mongo()
	156	+ pc.build_scaffolding(pc.load_queue, generate_editor_dataset, ids, False, False, definition='Traditional', limit=10)
	157	+
	158	+
	159	+if __name__ == '__main__':
	160	+ debug_retrieve_edits_by_contributor_launcher()
Property changes on: trunk/tools/editor_trends/construct_datasets.py
___________________________________________________________________
Added: svn:mime-type
1	161	+ text/plain
Added: svn:eol-style
2	162	+ native
Index: trunk/tools/editor_trends/init_bot_db.py
—	—	@@ -0,0 +1,196 @@
	2	+#!/usr/bin/python
	3	+# -- coding: utf-8 --
	4	+'''
	5	+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
	6	+This program is free software; you can redistribute it and/or
	7	+modify it under the terms of the GNU General Public License version 2
	8	+as published by the Free Software Foundation.
	9	+This program is distributed in the hope that it will be useful,
	10	+but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
	12	+See the GNU General Public License for more details, at
	13	+http://www.fsf.org/licenses/gpl.html
	14	+'''
	15	+
	16	+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
	17	+
	18	+import os
	19	+import cStringIO
	20	+import xml.etree.cElementTree as cElementTree
	21	+
	22	+
	23	+import settings
	24	+from wikitree import xml
	25	+from database import db
	26	+from database import db_settings
	27	+from utils import utils
	28	+from utils import process_constructor as pc
	29	+
	30	+try:
	31	+ import psyco
	32	+ psyco.full()
	33	+except ImportError:
	34	+ pass
	35	+
	36	+
	37	+def create_bot_ids_db_mongo():
	38	+ ids = utils.create_dict_from_csv_file(add_id_to_botnames, settings.ENCODING)
	39	+ mongo = db.init_mongo_db('bots')
	40	+ collection = mongo['ids']
	41	+
	42	+ db.remove_documents_from_mongo_db(collection, None)
	43	+
	44	+ for id, name in ids.iteritems():
	45	+ collection.insert({'id': id, 'name': name})
	46	+
	47	+ print collection.count()
	48	+
	49	+
	50	+def create_bots_db(db_name):
	51	+ '''
	52	+ This function reads the csv file provided by Erik Zachte and constructs a
	53	+ sqlite memory database. The reason for this is that I suspect I will need
	54	+ some simple querying capabilities in the future, else a dictionary would
	55	+ suffice.
	56	+ '''
	57	+ connection = db.init_database('db_name')
	58	+ #connection = db.init_database('data/database/bots.db')
	59	+ cursor = connection.cursor()
	60	+ db.create_tables(cursor, db_settings.BOT_TABLE)
	61	+ values = []
	62	+ fields = [field[0] for field in db_settings.BOT_TABLE['bots']]
	63	+ for line in utils.read_data_from_csv('data/csv/StatisticsBots.csv', settings.ENCODING):
	64	+ line = line.split(',')
	65	+ row = []
	66	+ for x, (field, value) in enumerate(zip(fields, line)):
	67	+ if db_settings.BOT_TABLE['bots'][x][1] == 'INTEGER':
	68	+ value = int(value)
	69	+ elif db_settings.BOT_TABLE['bots'][x][1] == 'TEXT':
	70	+ value = value.replace('/', '-')
	71	+ #print field, value
	72	+ row.append(value)
	73	+ values.append(row)
	74	+
	75	+ cursor.executemany('INSERT INTO bots VALUES (?,?,?,?,?,?,?,?,?,?);', values)
	76	+ connection.commit()
	77	+ if db_name == ':memory':
	78	+ return cursor
	79	+ else:
	80	+ connection.close()
	81	+
	82	+
	83	+def retrieve_botnames_without_id(cursor, language):
	84	+ return cursor.execute('SELECT name FROM bots WHERE language=?', (language,)).fetchall()
	85	+
	86	+
def lookup_username(input_queue, result_queue, progressbar, bots, debug=False):
    '''
    This function is used to find the id's belonging to the different bots that
    are patrolling the Wikipedia sites.

    @input_queue contains the names of the xml files to parse (relative to
    settings.XML_FILE_LOCATION). When @debug is True it is not a queue but a
    single file name.

    @result_queue should be set to false as the results are directly written to
    a csv file.

    @progressbar depends on settings; it is not used inside this function.

    @bots is a dictionary; its 'bots' entry is the dictionary keyed by the
    names of the bots to look up. Found names are removed from that
    dictionary.

    @debug, if True, treats @input_queue as a file name (see above).

    The function returns as soon as every bot name has been found.
    '''

    #if len(bots.keys()) == 1:
    bots = bots['bots']
    #print bots.keys()

    if settings.DEBUG:
        messages = {}

    while True:
        # NOTE(review): in debug mode `file` is the same name on every pass,
        # so the loop only ends through the 'Mission accomplished' return.
        # NOTE(review): a non-blocking get raises Queue.Empty on an empty
        # queue and nothing here catches it.
        if debug:
            file = input_queue
        else:
            file = input_queue.get(block=False)

        # None is the end-of-work marker.
        if file == None:
            break

        data = xml.read_input(utils.open_txt_file(settings.XML_FILE_LOCATION +
                              file, 'r', encoding=settings.ENCODING))

        for raw_data in data:
            # Each item is a list of text fragments; prepend an xml
            # declaration and join them into one UTF-8 encoded document.
            xml_buffer = cStringIO.StringIO()
            raw_data.insert(0, '<?xml version="1.0" encoding="UTF-8" ?>\n')
            raw_data = ''.join(raw_data)
            raw_data = raw_data.encode('utf-8')
            xml_buffer.write(raw_data)

            try:
                xml_nodes = cElementTree.XML(xml_buffer.getvalue())
                revisions = xml_nodes.findall('revision')
                for revision in revisions:
                    contributor = xml.retrieve_xml_node(revision, 'contributor')
                    username = contributor.find('username')
                    # Revisions without a username element are skipped.
                    if username == None:
                        continue
                    username = xml.extract_text(username)
                    #print username.encode('utf-8')

                    if username in bots:
                        id = contributor.find('id')
                        id = xml.extract_text(id)
                        #print username.encode('utf-8'), id
                        utils.write_data_to_csv({username: [id]}, add_id_to_botnames, settings.ENCODING)
                        # Each bot is recorded once; stop when none are left.
                        bots.pop(username)
                        if bots == {}:
                            print 'Mission accomplished'
                            return
            except Exception, error:
                # Any failure while handling this item is printed and the
                # loop moves on to the next item.
                print error
                if settings.DEBUG:
                    messages = utils.track_errors(xml_buffer, error, file,
                        messages)

    if settings.DEBUG:
        utils.report_error_messages(messages, lookup_username)
	155	+
	156	+
	157	+def add_id_to_botnames():
	158	+ '''
	159	+ This is the worker function for the multi-process version of
	160	+ lookup_username.First, the names of the bots are retrieved, then the
	161	+ multiprocess is launched by makinga call to pc.build_scaffolding. This is a
	162	+ generic launcher that takes as input the function to load the input_queue,
	163	+ the function that will do the main work and the objects to be put in the
	164	+ input_queue. The launcher also accepts optional keyword arguments.
	165	+ '''
	166	+ cursor = create_bots_db(':memory')
	167	+ files = utils.retrieve_file_list(settings.XML_FILE_LOCATION, 'xml')
	168	+
	169	+ botnames = retrieve_botnames_without_id(cursor, 'en')
	170	+ bots = {}
	171	+ for botname in botnames:
	172	+ bots[botname[0]] = 1
	173	+ pc.build_scaffolding(pc.load_queue, lookup_username, files, bots=bots)
	174	+ cursor.close()
	175	+
	176	+
	177	+def debug_lookup_username():
	178	+ '''
	179	+ This function launches the lookup_username function but then single
	180	+ threaded, this eases debugging. That's also the reason why the queue
	181	+ parameters are set to None. When launching this function make sure that
	182	+ debug=False when calling lookup_username
	183	+ '''
	184	+ cursor = create_bots_db(':memory')
	185	+ botnames = retrieve_botnames_without_id(cursor, 'en')
	186	+ bots = {}
	187	+ for botname in botnames:
	188	+ bots[botname[0]] = 1
	189	+
	190	+ lookup_username('12.xml', None, None, bots, debug=True)
	191	+ cursor.close()
	192	+
	193	+
	194	+if __name__ == '__main__':
	195	+ #debug()
	196	+ #add_id_to_botnames()
	197	+ create_bot_ids_db_mongo()
Property changes on: trunk/tools/editor_trends/init_bot_db.py
___________________________________________________________________
Added: svn:mime-type
1	198	+ text/plain
Added: svn:eol-style
2	199	+ native
Index: trunk/tools/editor_trends/database/__init__.py
Property changes on: trunk/tools/editor_trends/database/__init__.py
___________________________________________________________________
Added: svn:mime-type
3	200	+ text/plain
Added: svn:eol-style
4	201	+ native
Index: trunk/tools/editor_trends/database/db.py
—	—	@@ -0,0 +1,83 @@
	2	+#!/usr/bin/python
	3	+# -- coding: utf-8 --
	4	+'''
	5	+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
	6	+This program is free software; you can redistribute it and/or
	7	+modify it under the terms of the GNU General Public License version 2
	8	+as published by the Free Software Foundation.
	9	+This program is distributed in the hope that it will be useful,
	10	+but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
	12	+See the GNU General Public License for more details, at
	13	+http://www.fsf.org/licenses/gpl.html
	14	+'''
	15	+
	16	+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
	17	+
	18	+import sqlite3 as sqlite
	19	+from pymongo import Connection
	20	+
	21	+
	22	+import settings
	23	+from database import db_settings
	24	+
	25	+
	26	+def init_mongo_db(db):
	27	+ connection = Connection()
	28	+ db = connection[db]
	29	+ return db
	30	+
	31	+
def remove_documents_from_mongo_db(collection, ids):
    '''
    Remove documents from @collection by passing @ids straight on to
    collection.remove().
    '''
    collection.remove(ids)
	34	+
	35	+
	36	+def add_index_to_collection(db, collection, keys):
	37	+ '''
	38	+ @db is the name of the mongodb
	39	+ @collection is the name of the 'table' in mongodb
	40	+ @keys should be a list of keys used to create the index
	41	+ '''
	42	+
	43	+ mongo = init_mongo_db(db)
	44	+ collection = mongo[collection]
	45	+ mongo.collection.create_index(keys)
	46	+ mongo.collection.ensure_index(keys)
	47	+
	48	+
	49	+def init_database(db=None):
	50	+ '''
	51	+ This function initializes the connection with a sqlite db.
	52	+ If the database already exists then it returns False to indicate
	53	+ that the db already exists, else it returns True to indicate
	54	+ that it's an empty database without tables.
	55	+ '''
	56	+ if db == None:
	57	+ db = settings.DATABASE_NAME
	58	+
	59	+ return sqlite.connect(db, check_same_thread=False)
	60	+
	61	+
	62	+def create_tables(cursor, tables):
	63	+ '''
	64	+ Tables is expected to be a dictionary, with key
	65	+ table name and value another dictionary. This second
	66	+ dictionary contains variable names and datatypes.
	67	+ '''
	68	+ for table in tables:
	69	+ vars = '('
	70	+ for var, datatype in tables[table]:
	71	+ vars = vars + '%s %s,' % (var, datatype)
	72	+ vars = vars[:-1]
	73	+ vars = vars + ')'
	74	+ cursor.execute('CREATE TABLE IF NOT EXISTS ? ?' % (table, vars))
	75	+
	76	+
	77	+def debug():
	78	+ connection = init_database()
	79	+ cursor = connection.cursor()
	80	+ create_tables(cursor, settings.TABLES)
	81	+
	82	+
	83	+if __name__ == '__main__':
	84	+ debug()
Property changes on: trunk/tools/editor_trends/database/db.py
___________________________________________________________________
Added: svn:mime-type
1	85	+ text/plain
Added: svn:eol-style
2	86	+ native
Index: trunk/tools/editor_trends/database/db_settings.py
—	—	@@ -0,0 +1,38 @@
	2	+#!/usr/bin/python
	3	+# -- coding: utf-8 --
	4	+'''
	5	+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
	6	+This program is free software; you can redistribute it and/or
	7	+modify it under the terms of the GNU General Public License version 2
	8	+as published by the Free Software Foundation.
	9	+This program is distributed in the hope that it will be useful,
	10	+but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
	12	+See the GNU General Public License for more details, at
	13	+http://www.fsf.org/licenses/gpl.html
	14	+'''
	15	+
	16	+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
	17	+
	18	+'''
	19	+This is a settings file that contains the layout of different tables. The main
	20	+key will be used as the tablename while its values contain tuples containing
	21	+fieldname and datatype. This is only used for sqlite.
	22	+'''
	23	+CONTRIBUTOR_TABLE = {'contributors': []}
	24	+CONTRIBUTOR_TABLE['contributors'].append(('contributor', 'VARCHAR(64)'))
	25	+CONTRIBUTOR_TABLE['contributors'].append(('article', 'INTEGER'))
	26	+CONTRIBUTOR_TABLE['contributors'].append(('timestamp', 'TEXT'))
	27	+CONTRIBUTOR_TABLE['contributors'].append(('bot', 'INTEGER'))
	28	+
	29	+BOT_TABLE = {'bots': []}
	30	+BOT_TABLE['bots'].append(('language', 'VARCHAR(12)'))
	31	+BOT_TABLE['bots'].append(('name', 'VARCHAR(64)'))
	32	+BOT_TABLE['bots'].append(('edits_namespace_a', 'INTEGER'))
	33	+BOT_TABLE['bots'].append(('edits_namespace_x', 'INTEGER'))
	34	+BOT_TABLE['bots'].append(('rank_now', 'INTEGER'))
	35	+BOT_TABLE['bots'].append(('rank_prev', 'INTEGER'))
	36	+BOT_TABLE['bots'].append(('first_date', 'TEXT'))
	37	+BOT_TABLE['bots'].append(('days_first', 'INTEGER'))
	38	+BOT_TABLE['bots'].append(('last_date', 'TEXT'))
	39	+BOT_TABLE['bots'].append(('days_last', 'INTEGER'))
Property changes on: trunk/tools/editor_trends/database/db_settings.py
___________________________________________________________________
Added: svn:mime-type
1	40	+ text/plain
Added: svn:eol-style
2	41	+ native
Index: trunk/tools/editor_trends/bots/__init__.py
Property changes on: trunk/tools/editor_trends/bots/__init__.py
___________________________________________________________________
Added: svn:mime-type
3	42	+ text/plain
Added: svn:eol-style
4	43	+ native
Property changes on: trunk/tools/editor_trends/data/database
___________________________________________________________________
Added: svn:ignore
5	44	+ *.db
Property changes on: trunk/tools/editor_trends/data/objects
___________________________________________________________________
Added: svn:ignore
6	45	+ *.bin
Property changes on: trunk/tools/editor_trends/data/csv
___________________________________________________________________
Added: svn:ignore
7	46	+ *.csv
Index: trunk/tools/editor_trends/run.bat
—	—	@@ -0,0 +1,3 @@
	2	+@echo off
	3	+python split_xml_file.py
	4	+python map_wiki_editors.py
Index: trunk/tools/editor_trends/algorithms/__init__.py
Property changes on: trunk/tools/editor_trends/algorithms/__init__.py
___________________________________________________________________
Added: svn:mime-type
1	5	+ text/plain
Added: svn:eol-style
2	6	+ native
Index: trunk/tools/editor_trends/algorithms/red_wiki_editors.py
—	—	@@ -0,0 +1,40 @@
	2	+import re
	3	+import settings
	4	+
	5	+try:
	6	+ import psyco
	7	+ psyco.full()
	8	+except ImportError:
	9	+ pass
	10	+
	11	+
	12	+RE_ID = re.compile('\d*')
	13	+RE_IP = re.compile('(?:\d{1,3}\.){2,3}\d{1,3}') #Some of the addresses have the last 3 digits blocked as xxx
	14	+
	15	+
	16	+
	17	+def determine_contributor_type(id):
	18	+ if len(re.findall(RE_ID, id)) == 1:
	19	+ return 'id'
	20	+ elif len(re.findall(RE_IP, id)) == 1:
	21	+ return 'ip'
	22	+ else:
	23	+ return 'name'
	24	+
	25	+def open_file_handles():
	26	+ fh1, fh2,fh3 = None, None, None
	27	+ handles = {'id.txt': fh1,
	28	+ 'ip.txt': fh2,
	29	+ 'name.txt': fh3
	30	+ }
	31	+ for handle, var in handles.iteritems():
	32	+ var = codecs.open(handle, 'w', encoding=settings.ENCODING)
	33	+
	34	+ return handles
	35	+
	36	+def close_file_handles(handles):
	37	+ for handle, var in handles.iteritems():
	38	+ var.close()
	39	+
def write_data(vars):
    '''Placeholder that accepts @vars and does nothing; not implemented yet.'''
    pass
\ No newline at end of file
Property changes on: trunk/tools/editor_trends/algorithms/red_wiki_editors.py
___________________________________________________________________
Added: svn:mime-type
1	42	+ text/plain
Added: svn:eol-style
2	43	+ native
Property changes on: trunk/tools/editor_trends/errors
___________________________________________________________________
Added: svn:ignore
3	44	+ *.bin
Property changes on: trunk/tools/editor_trends
___________________________________________________________________
Added: svn:ignore
4	45	+ wikistats
zips
notes.txt
*.pyc
datasets
errors
Added: native
5	46	+ svn:eol-style=native

Status & tagging log

10:07, 3 December 2010 Reedy (talk | contribs) changed the status of r75053 [removed: new added: deferred]