r75053 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r75052‎ | r75053 | r75054 >
Date:22:29, 19 October 2010
Author:diederik
Status:deferred
Tags:
Comment:
Initial commit Editor Trends Analytics Package

Current functionality:
1) Splitting the XML file into chunks
2) Multiprocessor layer to distribute tasks among different cores
3) Extract information from XML
4) Store data in MongoDB / Sqlite database

More information about this project can be found at: http://strategy.wikimedia.org/wiki/Editor_Trends_Study

For more information: see README.1ST

This version is not finished and might not always work. Use at your own discretion.
Modified paths:
  • /trunk/tools/editor_trends (added) (history)
  • /trunk/tools/editor_trends/.svn_ignore (added) (history)
  • /trunk/tools/editor_trends/README.1ST (added) (history)
  • /trunk/tools/editor_trends/__init__.py (added) (history)
  • /trunk/tools/editor_trends/algorithms (added) (history)
  • /trunk/tools/editor_trends/algorithms/__init__.py (added) (history)
  • /trunk/tools/editor_trends/algorithms/red_wiki_editors.py (added) (history)
  • /trunk/tools/editor_trends/bots (added) (history)
  • /trunk/tools/editor_trends/bots/__init__.py (added) (history)
  • /trunk/tools/editor_trends/construct_datasets.py (added) (history)
  • /trunk/tools/editor_trends/data (added) (history)
  • /trunk/tools/editor_trends/data/csv (added) (history)
  • /trunk/tools/editor_trends/data/database (added) (history)
  • /trunk/tools/editor_trends/data/objects (added) (history)
  • /trunk/tools/editor_trends/database (added) (history)
  • /trunk/tools/editor_trends/database/__init__.py (added) (history)
  • /trunk/tools/editor_trends/database/db.py (added) (history)
  • /trunk/tools/editor_trends/database/db_settings.py (added) (history)
  • /trunk/tools/editor_trends/datasets (added) (history)
  • /trunk/tools/editor_trends/errors (added) (history)
  • /trunk/tools/editor_trends/init_bot_db.py (added) (history)
  • /trunk/tools/editor_trends/map_wiki_editors.py (added) (history)
  • /trunk/tools/editor_trends/requirements.txt (added) (history)
  • /trunk/tools/editor_trends/run.bat (added) (history)
  • /trunk/tools/editor_trends/settings.py (added) (history)
  • /trunk/tools/editor_trends/split_xml_file.py (added) (history)
  • /trunk/tools/editor_trends/utils (added) (history)
  • /trunk/tools/editor_trends/utils/__init__.py (added) (history)
  • /trunk/tools/editor_trends/utils/models.py (added) (history)
  • /trunk/tools/editor_trends/utils/process_constructor.py (added) (history)
  • /trunk/tools/editor_trends/utils/utils.py (added) (history)
  • /trunk/tools/editor_trends/wikitree (added) (history)
  • /trunk/tools/editor_trends/wikitree/__init__.py (added) (history)
  • /trunk/tools/editor_trends/wikitree/xml.py (added) (history)

Diff [purge]

Index: trunk/tools/editor_trends/map_wiki_editors.py
@@ -0,0 +1,266 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+
 18+#Default Python libraries (Python >= 2.6)
 19+import sys
 20+import os
 21+import time
 22+import codecs
 23+import cStringIO
 24+import re
 25+import xml.etree.cElementTree as cElementTree
 26+from multiprocessing import Queue
 27+from Queue import Empty
 28+
 29+# Custom written files
 30+import settings
 31+from utils import utils, models
 32+from database import db_settings
 33+from database import db
 34+from wikitree import xml
 35+from utils import process_constructor as pc
 36+
 37+
 38+try:
 39+ import psyco
 40+ psyco.full()
 41+except ImportError:
 42+ pass
 43+
 44+contributors = {}
 45+
 46+RE_BOT = re.compile('bot', re.IGNORECASE)
 47+RE_SCRIPT = re.compile('script', re.IGNORECASE)
 48+#RE_NUMERIC_CHARACTER = re.compile('&#[\d{1,5}]+;')
 49+#
 50+#def remove_numeric_character_references(text):
 51+# return re.sub(RE_NUMERIC_CHARACTER, '', text)
 52+#
 53+
 54+
def determine_username_is_bot(username, kwargs):
    '''
    Report whether the text of an XML node matches a known bot id.

    @username is an XML node (or None) whose text is looked up

    @kwargs is a dictionary; its optional 'bots' entry holds the known ids

    @returns: 1 when the text is a known id, 0 when it is not, and None when
    the node or its text is missing.
    '''
    known_ids = kwargs.get('bots', [])
    if known_ids == None:
        known_ids = []
    if username == None or username.text == None:
        return None
    return 1 if username.text in known_ids else 0
 65+
 66+
def extract_contributor_id(contributor, kwargs):
    '''
    Return the id of a registered contributor.

    @contributor is the xml contributor node containing a number of attributes

    @kwargs is accepted for signature compatibility and is not used

    Only registered contributors are of interest, so anonymous editors are
    ignored. To collect data on anonymous editors as well, add the string
    'ip' to the wanted_tags variable.

    @returns: the decoded text of the first matching child, -1 when the
    contributor is flagged as deleted or the matching child has no text, and
    None when there is no matching child at all.
    '''
    wanted_tags = ['id']
    if contributor.get('deleted'):
        #Not sure if this is the best way to code deleted contributors.
        return -1
    matches = [child for child in contributor if child.tag in wanted_tags]
    if not matches:
        return None
    first = matches[0]
    if first.text == None:
        return -1
    return first.text.decode('utf-8')
 84+
 85+
def output_editor_information(elem, data_queue, **kwargs):
    '''
    Put one observation on data_queue for every qualifying revision of a page.

    @elem is the parsed page node; its 'id' child gives the article id and
    its 'revision' children are inspected one by one

    @data_queue receives a dictionary with the keys 'article', 'editor' and
    'date' for each revision whose editor is not flagged as a bot and whose
    editor id is neither -1 nor None

    @kwargs is handed unchanged to the extractor functions (e.g. bots=...)
    '''
    tags = {'contributor': {'editor': extract_contributor_id, 'bot': determine_username_is_bot},
            'timestamp': {'date': xml.extract_text},
            }
    article = elem.find('id').text.decode(settings.ENCODING)
    for revision in elem.findall('revision'):
        # Build a fresh dictionary per revision. Previously a single shared
        # dictionary was reset to {} after the first queued revision, so later
        # revisions of the same page lost their 'article' key.
        observation = {'article': article}
        elements = list(revision)
        for tag, functions in tags.items():
            xml_node = xml.retrieve_xml_node(elements, tag)
            for var, function in functions.items():
                observation[var] = function(xml_node, kwargs)

        is_bot = observation.pop('bot')
        editor = observation['editor']
        if is_bot != 0 or editor == -1 or editor == None:
            continue
        if observation['date'] == None:
            # A revision without a timestamp cannot be dated; skip it instead
            # of letting the date conversion raise and abort the whole page.
            continue
        observation['date'] = utils.convert_timestamp_to_date(observation['date'])
        data_queue.put(observation)
 109+
def lookup_new_editors(xml_queue, data_queue, pbar, bots, debug=False, separator='\t'):
    '''
    Parse XML chunk files and queue the editor information found in them.

    @xml_queue is a queue of file names relative to settings.XML_FILE_LOCATION;
    when @debug is True it is treated as a single file name instead

    @data_queue is passed on to output_editor_information

    @pbar, when truthy, makes the function print xml_queue.qsize()

    @bots is passed on to output_editor_information as bots=...

    @debug, when True, processes exactly one file and then stops

    @separator is not used by this function

    The loop ends when a None entry is read from the queue or when the queue
    raises Empty.
    '''
    if settings.DEBUG:
        messages = {}
    vars = {}
    while True:
        try:
            if debug:
                file = xml_queue
            else:
                file = xml_queue.get(block=False)
            #print 'parsing %s' % file
            if file == None:
                # None is the end-of-work marker
                break

            data = xml.read_input(utils.open_txt_file(settings.XML_FILE_LOCATION
                            + file, 'r', encoding=settings.ENCODING))
            #data = read_input(sys.stdin)
            #print xml_queue.qsize()
            for raw_data in data:
                # raw_data is the list of lines of one chunk; prepend an XML
                # declaration so the chunk can be parsed on its own
                xml_buffer = cStringIO.StringIO()
                raw_data.insert(0, '<?xml version="1.0" encoding="UTF-8" ?>\n')
                raw_data = ''.join(raw_data)
                xml_buffer.write(raw_data)

                try:
                    elem = cElementTree.XML(xml_buffer.getvalue())
                    output_editor_information(elem, data_queue, bots=bots)
                except SyntaxError, error:
                    print error
                    #There are few cases with invalid tokens, they are fixed
                    #here and then reinserted into the XML DOM
                    #data = convert_html_entities(xml_buffer.getvalue())
                    #elem = cElementTree.XML(data)
                    #output_editor_information(elem)
                    if settings.DEBUG:
                        utils.track_errors(xml_buffer, error, file, messages)
                except UnicodeEncodeError, error:
                    print error
                    if settings.DEBUG:
                        utils.track_errors(xml_buffer, error, file, messages)
                #finally:


            if pbar:
                print xml_queue.qsize()
                #utils.update_progressbar(pbar, xml_queue)
            if debug:
                break

        except Empty:
            # NOTE(review): a non-blocking get also raises Empty when the
            # queue is only momentarily empty -- confirm this cannot end a
            # worker too early.
            break

    if settings.DEBUG:
        utils.report_error_messages(messages, lookup_new_editors)
 164+
 165+
def store_data_mongo(data_queue, pids):
    '''
    Move observations from data_queue into the 'editors' MongoDB collection.

    @data_queue is the queue filled by the preprocessors; task_done() is
    called on it for every item taken

    @pids are the process ids of the preprocessors that fill the queue

    Observations are inserted in batches of 100000; a partial batch is
    inserted whenever the queue is found empty.
    '''
    mongo = db.init_mongo_db('editors')
    collection = mongo['editors']
    values = []
    producers_done = False
    while True:
        try:
            chunk = data_queue.get(block=False)
        except Empty:
            # The queue is empty but store the remaining values if present
            if values:
                collection.insert(values)
                values = []
            # An empty queue means either that the preprocessors are finished
            # or that this function drains the queue faster than they fill
            # it. Only stop once *no* preprocessor is running any more (the
            # previous all() test stopped as soon as one of them had exited),
            # and only after one more look at the queue, so items queued just
            # before the last preprocessor exited are not lost.
            if producers_done:
                break
            producers_done = not any(utils.check_if_process_is_running(pid)
                                     for pid in pids)
            continue

        values.append(chunk)
        if len(values) == 100000:
            collection.insert(values)
            values = []
        data_queue.task_done()
 198+
 199+
def store_data_db(data_queue, pids):
    '''
    Move observations from data_queue into the 'contributors' database table.

    @data_queue is the queue filled by the preprocessors

    @pids are the process ids of the preprocessors that fill the queue

    Rows are inserted and committed in batches of 50000; a partial batch is
    written whenever the queue is found empty, so the final rows are no
    longer dropped when the function ends.
    '''
    connection = db.init_database()
    cursor = connection.cursor()
    db.create_tables(cursor, db_settings.CONTRIBUTOR_TABLE)

    insert_statement = 'INSERT INTO contributors VALUES (?,?,?,?)'
    values = []
    producers_done = False
    while True:
        try:
            chunk = data_queue.get(block=False)
        except Empty:
            if values:
                cursor.executemany(insert_statement, values)
                connection.commit()
                values = []
            # Stop only when no preprocessor is running any more (the previous
            # all() test stopped as soon as one of them had exited), and look
            # at the queue one final time before leaving.
            if producers_done:
                break
            producers_done = not any(utils.check_if_process_is_running(pid)
                                     for pid in pids)
            continue

        # NOTE(review): output_editor_information queues the keys 'article',
        # 'editor' and 'date'; the keys read here look out of date -- confirm
        # against the table definition before using this function.
        contributor = chunk['contributor'].encode(settings.ENCODING)
        article = chunk['article']
        timestamp = chunk['timestamp'].encode(settings.ENCODING)
        bot = chunk['bot']
        values.append((contributor, article, timestamp, bot))

        if len(values) == 50000:
            cursor.executemany(insert_statement, values)
            connection.commit()
            values = []
    connection.close()
 229+
 230+
def run_stand_alone():
    '''
    Process every XML chunk file on this machine and index the results.

    The bot ids are read from the 'ids' collection of the 'bots' database and
    handed to the worker processes as bots=...
    '''
    xml_files = utils.retrieve_file_list(settings.XML_FILE_LOCATION, 'xml')
    #xml_files = xml_files[:2]
    bot_collection = db.init_mongo_db('bots')['ids']
    bot_ids = dict((bot['id'], bot['name']) for bot in bot_collection.find())

    pc.build_scaffolding(pc.load_queue, lookup_new_editors, xml_files,
                         store_data_mongo, True, bots=bot_ids)
    for field in ('date', 'name'):
        db.add_index_to_collection('editors', field)
 244+
def debug_lookup_new_editors():
    '''
    Run lookup_new_editors on the single file '1.xml' in the current process,
    then add the 'date' and 'name' indexes to the 'editors' collection.
    '''
    q = Queue()
    import progressbar
    # The progress bar and the database handle are created but not passed on
    pbar = progressbar.ProgressBar().start()
    edits = db.init_mongo_db('editors')
    # debug=True makes lookup_new_editors treat '1.xml' as the file name
    lookup_new_editors('1.xml', q, None, None, True)
    db.add_index_to_collection('editors', 'date')
    db.add_index_to_collection('editors', 'name')
 253+
 254+
 255+
def run_hadoop():
    '''
    Placeholder for the run mode selected when settings.RUN_MODE is not
    'stand_alone'; it currently does nothing.
    '''
    pass
 258+
 259+
 260+if __name__ == "__main__":
 261+ #debug_lookup_new_editors()
 262+
 263+ if settings.RUN_MODE == 'stand_alone':
 264+ run_stand_alone()
 265+ print 'Finished processing XML files.'
 266+ else:
 267+ run_hadoop()
Property changes on: trunk/tools/editor_trends/map_wiki_editors.py
___________________________________________________________________
Added: svn:mime-type
1268 + text/plain
Added: svn:eol-style
2269 + native
Index: trunk/tools/editor_trends/wikitree/__init__.py
Property changes on: trunk/tools/editor_trends/wikitree/__init__.py
___________________________________________________________________
Added: svn:mime-type
3270 + text/plain
Added: svn:eol-style
4271 + native
Index: trunk/tools/editor_trends/wikitree/xml.py
@@ -0,0 +1,49 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+
 18+from utils import utils
 19+import settings
 20+
 21+
def convert_html_entities(text):
    '''
    Return @text as converted by utils.unescape.
    '''
    # NOTE(review): no unescape function is visible in utils/utils.py --
    # confirm it exists before calling this.
    return utils.unescape(text)
 24+
 25+
def extract_text(elem, kwargs):
    '''
    Return the decoded text of an XML node.

    @elem is an XML node or None

    @kwargs is accepted for signature compatibility and is not used

    @returns: the node text decoded with settings.ENCODING, or None when the
    node or its text is missing.
    '''
    if elem == None or elem.text == None:
        return None
    return elem.text.decode(settings.ENCODING)
 30+
 31+
def retrieve_xml_node(xml_nodes, name):
    '''
    Return the first node in @xml_nodes whose tag equals @name.

    @returns: the matching node, or None when there is none (maybe this
    should be replaced with a NotFoundError).
    '''
    matching = (node for node in xml_nodes if node.tag == name)
    return next(matching, None)
 37+
 38+
def read_input(file):
    '''
    Yield the lines of @file grouped per page.

    @file is an open, line-iterable file object

    Each yielded value is the list of lines up to and including a line that
    contains '</page>'. Lines after the last such line are not yielded. The
    file is closed when the generator finishes, is closed by its consumer,
    or fails.
    '''
    lines = []
    try:
        for line in file:
            lines.append(line)
            if '</page>' in line:
                yield lines
                # Execution resumes here on the next request, so the buffer
                # is reset only after the consumer has received the page.
                lines = []
    finally:
        # Also runs when the consumer stops early or iteration raises, so
        # the file handle is never left open.
        file.close()
Property changes on: trunk/tools/editor_trends/wikitree/xml.py
___________________________________________________________________
Added: svn:mime-type
151 + text/plain
Added: svn:eol-style
252 + native
Index: trunk/tools/editor_trends/__init__.py
@@ -0,0 +1,14 @@
 2+import os
 3+import sys
 4+
 5+WORKING_DIRECTORY = os.getcwd()#[:-9]
 6+IGNORE_DIRS = ['wikistats', 'zips']
 7+
 8+dirs = [name for name in os.listdir(WORKING_DIRECTORY) if
 9+ os.path.isdir(os.path.join(WORKING_DIRECTORY, name))]
 10+
 11+
 12+for subdirname in dirs:
 13+ if not subdirname.startswith('.') and subdirname not in IGNORE_DIRS:
 14+ sys.path.append(os.path.join(WORKING_DIRECTORY, subdirname))
 15+ #print os.path.join(WORKING_DIRECTORY, subdirname)
Property changes on: trunk/tools/editor_trends/__init__.py
___________________________________________________________________
Added: svn:mime-type
116 + text/plain
Added: svn:eol-style
217 + native
Index: trunk/tools/editor_trends/.svn_ignore
@@ -0,0 +1,12 @@
 2+*.pyc
 3+*.xml
 4+*.db
 5+*.bin
 6+*.zip
 7+*.csv
 8+.*
 9+zips/
 10+wikistats/
 11+datasets/
 12+data/
 13+notes.txt
\ No newline at end of file
Index: trunk/tools/editor_trends/settings.py
@@ -0,0 +1,91 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+
 18+
 19+'''
 20+This file contains settings that are used for constructing and analyzing
 21+the datasets as part of the Editor Dynamics and Anti-Vandalism projects.
 22+'''
 23+
 24+
 25+from multiprocessing import cpu_count
 26+import os
 27+import sys
 28+import platform
 29+
#Setting up the environment
# platform.system() names the running operating system directly. The earlier
# probing of win32_ver / linux_distribution / mac_ver could pick 'Windows' on
# any machine, because win32_ver() returns a 4-tuple of empty strings that
# never equals the 3-tuple it was compared with.
_SYSTEM_NAMES = {'Windows': 'Windows', 'Linux': 'Linux', 'Darwin': 'OSX'}
# Unknown systems keep their platform.system() name so OS is always defined.
OS = _SYSTEM_NAMES.get(platform.system(), platform.system())
 37+
 38+WORKING_DIRECTORY = os.getcwd()#[:-9]
 39+IGNORE_DIRS = ['wikistats', 'zips']
 40+
 41+dirs = [name for name in os.listdir(WORKING_DIRECTORY) if os.path.isdir(os.path.join(WORKING_DIRECTORY, name))]
 42+for subdirname in dirs:
 43+ if not subdirname.startswith('.') and subdirname not in IGNORE_DIRS:
 44+ sys.path.append(os.path.join(WORKING_DIRECTORY, subdirname))
 45+
 46+
 47+#General settings
 48+
 49+# Valid values are 'stand_alone' and 'hadoop'
 50+RUN_MODE = 'stand_alone'
 51+
 52+# If true then some more detailed debug information is collected
 53+DEBUG = True
 54+
 55+#If True then it will display a progress bar on the console.
 56+PROGRESS_BAR = True
 57+
 58+#Date format as used by Erik Zachte
 59+DATE_FORMAT = '%Y-%m-%d'
 60+
 61+# Timestamp format as generated by the MediaWiki dumps
 62+DATETIME_FORMAT = '%Y-%m-%dT%H:%M:%SZ'
 63+
 64+#This section contains configuration variables for the different file locations.
 65+
 66+# Location where to write xml chunks
 67+XML_FILE_LOCATION = 'C:/wikimedia/'
 68+
 69+# Input file
 70+XML_FILE = 'C:/Source_Files/enwiki-20100916-stub-meta-history.xml'
 71+
 72+# This is the place where error messages are stored for debugging purposes
 73+ERROR_MESSAGE_FILE_LOCATION = WORKING_DIRECTORY + '/errors/'
 74+
 75+DATABASE_FILE_LOCATION = WORKING_DIRECTORY + '/data/database/'
 76+
 77+BINARY_OBJECT_FILE_LOCATION = WORKING_DIRECTORY + '/data/objects/'
 78+
 79+#This section contains configuration variables for parsing / encoding and
 80+#working with the XML files.
 81+
 82+# ==64Mb, see http://hadoop.apache.org/common/docs/r0.20.0/hdfs_design.html#Large+Data+Sets for reason
 83+MAX_XML_FILE_SIZE = 67108864
 84+
 85+ENCODING = 'utf-8'
 86+
 87+# Name space, do not change as this works for Mediawiki wikis
 88+NAME_SPACE = 'http://www.mediawiki.org/xml/export-0.4/'
 89+
 90+#Multiprocess settings used to parallelize workload
 91+#Change this to match your computers configuration (RAM / CPU)
 92+NUMBER_OF_PROCESSES = cpu_count() * 1
Property changes on: trunk/tools/editor_trends/settings.py
___________________________________________________________________
Added: svn:mime-type
193 + text/plain
Added: svn:eol-style
294 + native
Index: trunk/tools/editor_trends/utils/__init__.py
Property changes on: trunk/tools/editor_trends/utils/__init__.py
___________________________________________________________________
Added: svn:mime-type
395 + text/plain
Added: svn:eol-style
496 + native
Index: trunk/tools/editor_trends/utils/utils.py
@@ -0,0 +1,267 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+
 18+'''
 19+The utils module contains helper functions that will be needed throughout.
 20+It provides functions to read / write data to text and binary files, fix markup
 21+and track error messages.
 22+'''
 23+
 24+import re
 25+import htmlentitydefs
 26+import cPickle
 27+import datetime
 28+import codecs
 29+import os
 30+import ctypes
 31+
 32+import settings
 33+
 34+
 35+try:
 36+ import psyco
 37+ psyco.full()
 38+except ImportError:
 39+ pass
 40+
 41+
 42+RE_ERROR_LOCATION = re.compile('\d+')
 43+RE_NUMERIC_CHARACTER = re.compile('&#?\w+;')
 44+
 45+
def convert_timestamp_to_date(timestamp):
    '''
    Parse the first ten characters of @timestamp with settings.DATE_FORMAT.

    @returns: a datetime.datetime object
    '''
    day = timestamp[:10]
    return datetime.datetime.strptime(day, settings.DATE_FORMAT)
 48+
 49+
def convert_timestamp_to_datetime(timestamp):
    '''
    Parse the whole of @timestamp with settings.DATETIME_FORMAT.

    @returns: a datetime.datetime object
    '''
    parse = datetime.datetime.strptime
    return parse(timestamp, settings.DATETIME_FORMAT)
 52+
 53+
def check_if_process_is_running(pid):
    '''
    Report whether a process with the given id currently exists.

    @pid is the process id to look up

    @returns: True when the process exists, False otherwise. Unexpected
    errors are printed and reported as False.
    '''
    import errno

    try:
        # os.name is used so the check does not depend on platform probing
        # done elsewhere.
        if os.name == 'nt':
            PROCESS_TERMINATE = 1
            kernel32 = ctypes.windll.kernel32
            handle = kernel32.OpenProcess(PROCESS_TERMINATE, False, pid)
            if handle == 0:
                return False
            # Release the handle again; it used to be leaked on every call.
            kernel32.CloseHandle(handle)
            return True
        # Signal 0 performs the existence and permission checks without
        # delivering a signal.
        os.kill(pid, 0)
        return True
    except OSError as error:
        # EPERM means the process exists but belongs to someone else.
        return error.errno == errno.EPERM
    except Exception as error:
        print(error)
        return False
 69+
 70+
 71+# error tracking related functions
def track_errors(xml_buffer, error, file, messages):
    '''
    Record one parse error in @messages.

    @xml_buffer is a buffer whose getvalue() returns the text that failed

    @error is the exception that was raised

    @file is the name of the file being processed

    @messages maps a generalised error message to a dictionary of numbered
    occurrences; it is updated in place

    @returns: @messages
    '''
    text = extract_offending_string(xml_buffer.getvalue(), error)
    key = remove_error_specific_information(error)
    occurrences = messages.setdefault(key, {})
    # Use the next free counter. The previous code always resolved to the
    # smallest existing counter, so every new occurrence overwrote entry 0.
    if occurrences:
        c = max(occurrences) + 1
    else:
        c = 0
    occurrences[c] = {'file': file, 'error': error, 'text': text}
    return messages
 96+
 97+
def report_error_messages(messages, function):
    '''
    Store @messages under the name of @function in
    settings.ERROR_MESSAGE_FILE_LOCATION and print every recorded occurrence.
    '''
    store_object(messages, settings.ERROR_MESSAGE_FILE_LOCATION, function.func_name)
    for error, occurrences in messages.items():
        for key, value in occurrences.items():
            print('%s %s %s' % (error, key, value))
 104+
 105+
def remove_error_specific_information(e):
    '''
    Return the message of exception @e up to the first occurrence of 'line'.

    The whole message is returned when it does not contain 'line'.
    '''
    message = e.args[0]
    head, _marker, _tail = message.partition('line')
    return head
 112+
 113+
def extract_offending_string(text, error):
    '''
    Return the line of @text that made the XML parser fail; this is only
    useful for debugging purposes.

    The first number in the message of @error is taken as the one-based line
    number. An empty string is returned when the message has no number.
    '''
    numbers = re.findall(RE_ERROR_LOCATION, error.args[0])
    if numbers == []:
        return ''
    line_index = int(numbers[0]) - 1
    offending = text.split('\n')[line_index]
    return offending.decode('utf-8')
 127+
 128+
 129+# read / write data related functions
def read_data_from_csv(filename, encoding):
    '''
    Yield the lines of a text file one by one.

    @filename is either a file name or a function; for a function the file
    name is the function name plus '.csv', matching write_data_to_csv

    @encoding is the text encoding used to open the file
    '''
    if hasattr(filename, '__call__'):
        # The extension argument used to be missing here, which raised a
        # TypeError for every function argument.
        filename = construct_filename_from_function(filename, '.csv')

    fh = open_txt_file(filename, 'r', encoding=encoding)
    try:
        for line in fh:
            yield line
    finally:
        # Close the file even when the consumer stops reading early.
        fh.close()
 139+
 140+
def write_data_to_csv(data, function, encoding):
    '''
    Append the contents of @data to the file named after @function.

    @data maps each key to an iterable of values; one tab-separated
    'key<TAB>value' line is written per value

    @function provides the file name (function name plus '.csv')

    @encoding is the text encoding used to open the file
    '''
    target = construct_filename_from_function(function, '.csv')
    fh = open_txt_file(target, 'a', encoding=encoding)
    for key, values in data.items():
        for value in values:
            fh.write('%s\t%s\n' % (key, value))
    fh.close()
 149+
 150+
def open_txt_file(filename, mode, encoding):
    '''
    Open @filename in @mode through codecs.open with the given @encoding.

    @returns: the opened file object
    '''
    handle = codecs.open(filename, mode, encoding=encoding)
    return handle
 153+
def construct_filename_from_function(function, extension):
    '''
    Build a file name from the name of @function followed by @extension.

    @function is any object with a __name__ attribute

    @extension is appended as is (e.g. '.csv' or '.bin')
    '''
    # __name__ holds the same value as func_name for plain functions, and is
    # also available on builtins and on Python 3.
    return function.__name__ + extension
 156+
def check_file_exists(location, filename):
    '''
    Report whether @location + @filename exists.

    @location is the directory prefix; it is concatenated as is

    @filename is either a file name or a function; for a function the file
    name is the function name plus '.bin'

    @returns: True or False
    '''
    if hasattr(filename, '__call__'):
        filename = construct_filename_from_function(filename, '.bin')
    return os.path.exists(location + filename)
 164+
 165+
def store_object(object, location, filename):
    '''
    Pickle @object to the file @location + @filename.

    @location is the directory prefix; it is concatenated as is

    @filename is either a file name or a function; for a function the file
    name is the function name plus '.bin'. The '.bin' extension is appended
    when it is missing.
    '''
    if hasattr(filename, '__call__'):
        filename = construct_filename_from_function(filename, '.bin')
    if not filename.endswith('.bin'):
        filename = filename + '.bin'
    fh = open(location + filename, 'wb')
    try:
        cPickle.dump(object, fh)
    finally:
        # Close the file even when the object cannot be pickled.
        fh.close()
 174+
 175+
def load_object(location, filename):
    '''
    Load a pickled object from the file @location + @filename.

    @location is the directory prefix; it is concatenated as is

    @filename is either a file name or a function; for a function the file
    name is the function name plus '.bin'. The '.bin' extension is appended
    when it is missing.

    @returns: the unpickled object
    '''
    if hasattr(filename, '__call__'):
        filename = construct_filename_from_function(filename, '.bin')
    if not filename.endswith('.bin'):
        filename = filename + '.bin'
    fh = open(location + filename, 'rb')
    try:
        # NOTE(review): unpickling executes code embedded in the file; only
        # load files written by this package.
        obj = cPickle.load(fh)
    finally:
        # Close the file even when unpickling fails.
        fh.close()
    return obj
 185+
 186+
def clean_string(string):
    '''
    Return @string with every newline character removed.
    '''
    pieces = string.split('\n')
    return ''.join(pieces)
 190+
 191+
def create_dict_from_csv_file(filename, encoding):
    '''
    Build a dictionary from a two-column, tab-separated file.

    Each line is read as 'value<TAB>key'; the second column becomes the
    dictionary key and the first column its value.

    @returns: the dictionary
    '''
    mapping = {}
    for raw_line in read_data_from_csv(filename, encoding):
        value, key = clean_string(raw_line).split('\t')
        mapping[key] = value
    return mapping
 200+
 201+
def retrieve_file_list(location, extension):
    '''
    List the entries of directory @location whose name ends with @extension.

    @extension may be given with or without the leading dot

    @returns: a list of entry names in the order reported by os.listdir
    '''
    suffix = extension if extension.startswith('.') else '.' + extension
    return [name for name in os.listdir(location) if name.endswith(suffix)]
 212+
 213+
 214+# Progress bar related functions
def update_progressbar(pbar, queue):
    '''
    Determine how much work has been done from what is left in @queue.

    The result is computed but not shown: calling pbar.update currently fails
    with
    File "build\\bdist.win32\\egg\\progressbar.py", line 352, in update
    self.fd.write(self._format_line() + '\\r')
    ValueError: I/O operation on closed file
    so that call stays commented out until the cause is known.
    '''
    finished = pbar.maxval - queue.qsize()
    #pbar.update(finished)
 228+
 229+
def humanize_time_difference(seconds_elapsed):
    '''
    Return a humanized string representing a time difference.

    @seconds_elapsed is the number of seconds; it is truncated to an integer
    and negative values are treated as zero

    Only the two largest non-zero time units are shown, e.g. days and hours
    or hours and minutes. A single unit is shown on its own and zero is
    reported as '0 seconds'.
    '''
    seconds_elapsed = max(int(seconds_elapsed), 0)
    time_units = [('days', 86400), ('hours', 3600), ('minutes', 60), ('seconds', 1)]
    parts = []
    for name, size in time_units:
        amount, seconds_elapsed = divmod(seconds_elapsed, size)
        if amount > 0:
            if amount == 1:
                # singular form: 'days' -> 'day'
                name = name[:-1]
            parts.append('%s %s' % (amount, name))

    if not parts:
        return '0 seconds'
    # A lone unit used to be looked up as 'seconds' whatever it was, which
    # raised a KeyError for values such as exactly one minute.
    return ' and '.join(parts[:2])
 261+
 262+
def debug():
    '''
    Print the humanized form of 64 seconds as a quick manual check.
    '''
    print(humanize_time_difference(64))
 266+
 267+if __name__ == '__main__':
 268+ debug()
Property changes on: trunk/tools/editor_trends/utils/utils.py
___________________________________________________________________
Added: svn:mime-type
1269 + text/plain
Added: svn:eol-style
2270 + native
Index: trunk/tools/editor_trends/utils/models.py
@@ -0,0 +1,54 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+
 18+import multiprocessing
 19+
 20+
class ProcessInputQueue(multiprocessing.Process):
    '''
    Process that runs @target on an input queue and a result queue.

    Extra keyword arguments given to the constructor are stored as attributes
    and handed to @target as one dictionary when the process runs.
    '''

    # Names of the attributes set by __init__ itself; they are passed to the
    # target explicitly and must not appear in the keyword dictionary.
    RESERVED = ('input_queue', 'result_queue', 'target', 'progressbar')

    def __init__(self, target, input_queue, result_queue, pbar, **kwargs):
        multiprocessing.Process.__init__(self)
        self.input_queue = input_queue
        self.result_queue = result_queue
        self.target = target
        self.progressbar = pbar
        for kw in kwargs:
            setattr(self, kw, kwargs[kw])

    def run(self):
        '''
        Call target(input_queue, result_queue, progressbar, kwargs).
        '''
        # Collect the extra attributes by *name*. The previous check compared
        # attribute names with the attribute objects, so nothing was ever
        # excluded and the queues, target and progress bar ended up in the
        # dictionary as well.
        kwargs = dict((name, value) for name, value in self.__dict__.items()
                      if name not in self.RESERVED and not name.startswith('_'))

        # NOTE(review): the dictionary is passed as one positional argument,
        # not expanded with ** -- confirm that targets expect this.
        self.target(self.input_queue, self.result_queue, self.progressbar, kwargs)
 42+
 43+
class ProcessResultQueue(multiprocessing.Process):
    '''
    Process that runs @target on a result queue and a list of process ids.
    '''

    def __init__(self, target, result_queue, pids, pbar):
        super(ProcessResultQueue, self).__init__()
        self.target = target
        self.result_queue = result_queue
        self.pids = pids
        self.progressbar = pbar

    def run(self):
        '''
        Call target(result_queue, pids).
        '''
        consume = self.target
        consume(self.result_queue, self.pids)
Property changes on: trunk/tools/editor_trends/utils/models.py
___________________________________________________________________
Added: svn:mime-type
156 + text/plain
Added: svn:eol-style
257 + native
Index: trunk/tools/editor_trends/utils/process_constructor.py
@@ -0,0 +1,113 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+
 18+from multiprocessing import Process, Queue, JoinableQueue
 19+from Queue import Empty
 20+
 21+import settings
 22+import utils
 23+import models
 24+
 25+#3rd party dependency
 26+import progressbar
 27+
 28+
def build_scaffolding(load_input_queue, main, obj, result_processor=False, result_queue=False, **kwargs):
    '''
    Generic producer/consumer process launcher. It can launch two types of
    processes:
    a) Processes that take a task from a queue and do their thing
    b) Processes that take a task from a queue and put the result in the
    result_queue.
    If result_queue is False then a) is assumed.

    @load_input_queue is a function that is used to insert jobs into queue

    @main is the function that will process the input_queue

    @obj can be a pickled object or an enumerable variable that will be loaded
    into the input_queue

    @result_processor, name of the function to process the @result_queue

    @result_queue, if set to True will become a true queue and will be provided
    to main whose job it is to fill with new tasks. If False then this variable
    is ignored.

    @kwargs is a dictionary with optional variables. Used to supply to main
    '''
    input_queue = Queue()
    if result_queue:
        result_queue = JoinableQueue()

    load_input_queue(input_queue, obj, poison_pill=True)

    pbar = False
    if settings.PROGRESS_BAR:
        pbar = progressbar.ProgressBar(maxval=input_queue.qsize()).start()

    # One worker per configured process, all started before anything else
    input_processes = []
    for _ in xrange(settings.NUMBER_OF_PROCESSES):
        input_processes.append(models.ProcessInputQueue(main, input_queue,
                                                        result_queue, pbar,
                                                        **kwargs))
    for worker in input_processes:
        worker.start()
    pids = [worker.pid for worker in input_processes]

    if result_queue:
        # A single process consumes the results of all workers
        result_processes = [models.ProcessResultQueue(result_processor,
                                                      result_queue, pids, pbar)]
        for consumer in result_processes:
            consumer.start()

    for worker in input_processes:
        print('Waiting for input process to finish')
        worker.join()
        print('Input process finished')

    if result_queue:
        for consumer in result_processes:
            print('Waiting for result process to finish.')
            consumer.join()
            print('Result process finished')

    if pbar:
        pbar.finish()
        elapsed = utils.humanize_time_difference(pbar.seconds_elapsed)
        print('Total elapsed time: %s.' % (elapsed))
 93+
 94+
def load_queue(input_queue, obj, poison_pill=False):
    '''
    @input_queue should be an instance of multiprocessing.Queue

    @obj either pickled or enumerable variable that contains the tasks

    @poison_pill, if True, appends one None per configured process after the
    tasks

    @returns: queue with tasks
    '''
    # NOTE(review): type(list) is the metaclass 'type', so this branch is only
    # taken when obj is itself a class -- confirm the intended check.
    tasks = utils.load_object(obj) if isinstance(obj, type(list)) else obj
    for task in tasks:
        input_queue.put(task)

    if poison_pill:
        for _ in xrange(settings.NUMBER_OF_PROCESSES):
            input_queue.put(None)
    return input_queue
Property changes on: trunk/tools/editor_trends/utils/process_constructor.py
___________________________________________________________________
Added: svn:mime-type
1115 + text/plain
Added: svn:eol-style
2116 + native
Index: trunk/tools/editor_trends/requirements.txt
@@ -0,0 +1,2 @@
 2+progressbar==2.3-dev
 3+psyco==1.6
Property changes on: trunk/tools/editor_trends/requirements.txt
___________________________________________________________________
Added: svn:mime-type
14 + text/plain
Added: svn:eol-style
25 + native
Index: trunk/tools/editor_trends/split_xml_file.py
@@ -0,0 +1,127 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+
 18+
 19+import xml.etree.cElementTree as cElementTree
 20+import codecs
 21+import utils
 22+import re
 23+import settings
 24+
 25+try:
 26+ import psyco
 27+ psyco.full()
 28+except ImportError:
 29+ pass
 30+
 31+
 32+RE_NUMERIC_CHARACTER = re.compile('&#(\d+);')
 33+
 34+#def convert_html_entities(text):
 35+# return utils.unescape(text)
 36+
 37+
def remove_numeric_character_references(text):
    '''
    Replace every decimal character reference (&#NNN;) in text through
    lenient_deccharref and return the result encoded as utf-8.
    '''
    decoded = RE_NUMERIC_CHARACTER.sub(lenient_deccharref, text)
    return decoded.encode('utf-8')
 40+
 41+
def lenient_deccharref(m):
    '''
    Convert one decimal character reference match to the character it
    denotes.

    @m is a regex match object whose first group holds the decimal digits

    @returns: the unicode character, or the matched text unchanged when the
    number is not a valid code point for this Python build
    '''
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3
    try:
        return to_char(int(m.group(1)))
    except (ValueError, OverflowError):
        # Be lenient: a reference that cannot be converted is kept verbatim
        # instead of aborting the whole run.
        return m.group(0)
 44+
 45+
def remove_namespace(element, namespace):
    '''
    Remove namespace from the document.

    @element is the root of the (sub)tree to clean; it is modified in place

    @namespace is the namespace uri without the surrounding braces

    @returns: element
    '''
    ns = u'{%s}' % namespace
    nsl = len(ns)
    # getiterator() is deprecated and missing from newer ElementTree
    # versions; older versions do not have iter() yet.
    walk = getattr(element, 'iter', None) or element.getiterator
    for elem in walk():
        if elem.tag.startswith(ns):
            elem.tag = elem.tag[nsl:]
    return element
 54+
 55+
def parse_comments(xml, function):
    '''
    Apply function to the text of every revision comment.

    @xml is an element whose direct 'revision' children are inspected; it is
    modified in place

    @function receives the comment text and returns the replacement text

    @returns: xml
    '''
    for revision in xml.findall('revision'):
        comment = revision.find('comment')
        # Compare with None explicitly: an Element without children is falsy.
        # Revisions without a comment, or with an empty one, are left alone.
        if comment is not None and comment.text is not None:
            comment.text = function(comment.text)
    return xml
 78+
 79+
def write_xml_file(element, fh, counter):
    '''
    Get file handle and write xml element to file

    @element is the xml element to serialize

    @fh is the current file handle, or None when no chunk has been opened yet

    @counter is the number of the current chunk

    @returns: the (possibly new) file handle and chunk counter
    '''
    # Serialize once and reuse the result for both the size check and the
    # actual write.
    serialized = cElementTree.tostring(element)
    fh, counter = create_xml_file_handle(fh, counter, len(serialized))
    fh.write(serialized)
    fh.write('\n')
    return fh, counter
 87+
 88+
def create_xml_file_handle(fh, counter, size):
    '''
    Create file handle if none is supplied or if file size > max file size.

    @fh is the current file handle or None

    @counter is the number of the current chunk; it restarts at 0 when no
    handle is supplied

    @size is the number of bytes that is about to be written

    @returns: the file handle to write to and its chunk number
    '''
    if not fh:
        counter = 0
    elif (fh.tell() + size) > settings.MAX_XML_FILE_SIZE:
        print('Created chunk %s' % counter)
        # The original referenced fh.close without calling it, so finished
        # chunks were never closed.
        fh.close()
        counter += 1
    else:
        return fh, counter

    fh = codecs.open(settings.LOCATION + str(counter) + '.xml', 'w', encoding=settings.ENCODING)
    return fh, counter
 103+
 104+
def split_xml():
    '''
    Reads xml file and splits it in N chunks

    Every page element of settings.XML_FILE is stripped of its namespace,
    has the numeric character references in its revision comments replaced
    and is then appended to the current chunk file.
    '''
    fh = None
    counter = None
    tag = '{%s}page' % settings.NAME_SPACE

    context = iter(cElementTree.iterparse(settings.XML_FILE, events=('start', 'end')))
    event, root = next(context)  # get the root element of the XML doc

    try:
        for event, elem in context:
            if event != 'end' or elem.tag != tag:
                continue
            elem = remove_namespace(elem, settings.NAME_SPACE)
            elem = parse_comments(elem, remove_numeric_character_references)
            fh, counter = write_xml_file(elem, fh, counter)
            root.clear()  # when done parsing a section clear the tree to save memory
    finally:
        # Close the last chunk as well, otherwise its tail may never reach
        # the disk.
        if fh:
            fh.close()
 125+
 126+
 127+if __name__ == "__main__":
 128+ split_xml()
Property changes on: trunk/tools/editor_trends/split_xml_file.py
___________________________________________________________________
Added: svn:mime-type
1129 + text/plain
Added: svn:eol-style
2130 + native
Index: trunk/tools/editor_trends/README.1ST
@@ -0,0 +1,65 @@
 2+===============================================================================
 3+
 4+ Wikipedia Editor Trends Analytics
 5+
 6+===============================================================================
 7+
 8+BACKGROUND:
 9+This package offers a set of tools used to create datasets to analyze Editor
 10+Trends. By Editor Trends we refer to the overall pattern of entering and leaving
 11+a Wikipedia site. The main information source for this package is:
 12+ http://strategy.wikimedia.org/wiki/Editor_Trends_Study
 13+
 14+REQUIREMENTS:
 15+
 16+* Python 2.6 or higher (this code has not been tested with Python 3.x)
 17+
 18+OPTIONAL
 19+* MongoDB
 20+
 21+If you don't want to install / use MongoDB then the package will use the built-in
 22+Sqlite library. However, this is not optimized for speed and may take a serious
 23+amount of time. If possible, install MongoDB.
 24+
 25+INSTALLING USING VIRTUALENV
 26+It's recommended to use Python virtualenv. If you are not familiar with
 27+virtualenv then have a look over here:
 28+ http://groups.google.com/group/python-virtualenv/browse_thread/thread/f2f19d2cc93a844e
 29+
 30+To install Editor Trends Analytics:
 31+
 32+ virtualenv --no-site-packages --distribute editor_trends
 33+ pip install -E editor_trends -r /editor_trends/requirements.txt
 34+
 35+
 36+The first command creates a new virtualenv called editor_trends and the second
 37+command installs the dependencies. Currently the dependencies are:
 38+* PyMongo
 39+* Progressbar
 40+
 41+INSTALLING WITHOUT VIRTUALENV
 42+If you don't like virtualenv then do the following:
 43+
 44+ easy_install pymongo
 45+ easy_install progressbar
 46+
 47+IMPORTANT MONGODB NOTES
 48+If you decide to use MongoDB to store the results then you have to install the
 49+64-bit version. 32-bit versions of MongoDB are limited to 2GB of data and the
 50+databases created by this package will definitely be larger than that. For more
 51+background information on this limitation, please read:
 52+ http://blog.mongodb.org/post/137788967/32-bit-limitations
 53+
 54+
 55+CONFIGURATION:
 56+If you would like to create a dataset for your own analyses then you should
 57+first make the appropriate changes to settings.py. Settings.py contains
 58+configuration variables such as the location of input and output files. Most
 59+settings are self-explanatory but in cases of any questions please drop me a
 60+line.
 61+
 62+CODE:
 63+The Python code adheres to PEP8. Function names are deliberately expressive to
 64+ease understanding what's going on. If you find a bug please email me at dvanliere
 65+at gmail dot com or leave a message on my Talk page.
 66+
Property changes on: trunk/tools/editor_trends/README.1ST
___________________________________________________________________
Added: native
167 + svn:eol-style=native
Index: trunk/tools/editor_trends/construct_datasets.py
@@ -0,0 +1,159 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+
 18+from multiprocessing import Queue
 19+from Queue import Empty
 20+import sqlite3
 21+
 22+import progressbar
 23+
 24+import settings
 25+from utils import models, utils
 26+from database import db
 27+from utils import process_constructor as pc
 28+
 29+try:
 30+ import psyco
 31+ psyco.full()
 32+except ImportError:
 33+ pass
 34+
 35+
def retrieve_editor_ids_mongo():
    '''
    Return the distinct 'editor' values of the editors collection.

    A previously stored object is loaded when utils.check_file_exists
    reports one; otherwise MongoDB is queried, the result is printed and,
    unless it is an empty list, stored for the next call.
    '''
    location = settings.BINARY_OBJECT_FILE_LOCATION
    if utils.check_file_exists(location, retrieve_editor_ids_mongo):
        return utils.load_object(location, retrieve_editor_ids_mongo)

    ids = db.init_mongo_db('editors')['editors'].find().distinct('editor')
    print(ids)
    if ids != []:
        utils.store_object(ids, location, retrieve_editor_ids_mongo)
    return ids
 49+
 50+
def generate_editor_dataset(input_queue, data_queue, pbar, kwargs):
    '''
    Worker that writes the observations of each editor to a csv file.

    @input_queue supplies editor ids; a None value or an empty queue ends
    the worker

    @data_queue is not used by this function

    @pbar is not used by this function

    @kwargs is a dictionary with the keys 'definition' ('Traditional' or
    anything else), 'limit' and, optionally, 'debug' (process one hard-coded
    editor id and stop)
    '''
    definition = kwargs.pop('definition')
    limit = kwargs.pop('limit')
    # generate_editor_dataset_launcher does not pass 'debug', so it has to be
    # optional.
    debug = kwargs.pop('debug', False)
    mongo = db.init_mongo_db('editors')
    editors = mongo['editors']
    while True:
        if debug:
            id = u'99797'
        else:
            try:
                id = input_queue.get(block=False)
            except Empty:
                break
            if id is None:
                # Poison pill put on the queue by the loader.
                break

        # pymongo hands back documents as dictionaries, which cannot be put
        # in a set, hence a list.
        # NOTE(review): confirm that utils.write_data_to_csv accepts a list.
        contributors = []
        if definition == 'Traditional':
            for ob in editors.find({'editor': id}).limit(limit):
                contributors.append(ob)
        else:
            # sort() takes a key and a direction, not a dictionary.
            dates = set()
            for ob in editors.find({'editor': id}).sort('date', 1):
                if len(dates) > limit:
                    break
                # NOTE(review): the original referenced the undefined names
                # `dates` and `edit`; this assumes one observation per
                # distinct 'date' value was intended - confirm.
                if ob['date'] not in dates:
                    dates.add(ob['date'])
                    contributors.append(ob)
        utils.write_data_to_csv(contributors, generate_editor_dataset, settings.ENCODING)

        if debug:
            # The debug id never changes, so one pass is all there is to do.
            break
 81+
 82+
def retrieve_editor_ids_db():
    '''
    Collect the contributor value of every row with bot=0 from the sqlite
    contributors table and store the resulting set as 'contributors'.

    A progress bar is shown when settings.PROGRESS_BAR is set and the table
    is not empty.
    '''
    contributors = set()
    # Defined up front: the original only created pbar when the progress bar
    # was enabled but used it unconditionally.
    pbar = None
    connection = db.init_database()
    try:
        cursor = connection.cursor()
        if settings.PROGRESS_BAR:
            cursor.execute('SELECT MAX(ROWID) FROM contributors')
            max_rowid = cursor.fetchone()[0]
            if max_rowid:
                pbar = progressbar.ProgressBar(maxval=max_rowid).start()

        cursor.execute('SELECT contributor FROM contributors WHERE bot=0')

        print('Retrieving contributors...')
        for x, contributor in enumerate(cursor):
            contributors.add(contributor[0])
            if pbar and x % 100000 == 0:
                pbar.update(x)
        print('Serializing contributors...')
        utils.store_object(contributors, 'contributors')
        print('Finished serializing contributors...')

        if pbar:
            pbar.finish()
            print('Total elapsed time: %s.' % (utils.humanize_time_difference(pbar.seconds_elapsed)))
    finally:
        connection.close()
 109+
 110+
def retrieve_edits_by_contributor(input_queue, result_queue, pbar):
    '''
    Worker that writes, per contributor, the set of converted edit
    timestamps to a csv file.

    @input_queue supplies contributors; a None value ends the worker, an
    empty queue does not

    @result_queue is not used by this function

    @pbar is passed to utils.update_progressbar when it is truthy
    '''
    query = 'SELECT contributor, timestamp, bot FROM contributors WHERE contributor=?'
    connection = db.init_database()
    cursor = connection.cursor()

    while True:
        try:
            contributor = input_queue.get(block=False)
            if contributor is None:
                break

            cursor.execute(query, (contributor,))
            dates = set()
            for _edit, timestamp, _bot in cursor:
                dates.add(utils.convert_timestamp_to_date(timestamp))

            utils.write_data_to_csv({contributor: dates}, retrieve_edits_by_contributor)
            if pbar:
                utils.update_progressbar(pbar, input_queue)
        except Empty:
            # Keep polling until the None sentinel shows up.
            continue

    connection.close()
 137+
 138+
def retrieve_edits_by_contributor_launcher():
    '''
    Start retrieve_edits_by_contributor through pc.build_scaffolding, with
    pc.load_queue as queue loader and 'contributors' as the object to load.
    '''
    pc.build_scaffolding(
        pc.load_queue,
        retrieve_edits_by_contributor,
        'contributors')
 141+
 142+
def debug_retrieve_edits_by_contributor_launcher():
    '''
    Call generate_editor_dataset directly with debug settings and an empty
    queue, then start generate_editor_dataset_launcher.
    '''
    debug_settings = dict(definition='Traditional', limit=10, debug=True)
    generate_editor_dataset(Queue(), False, False, debug_settings)
    generate_editor_dataset_launcher()
 153+
def generate_editor_dataset_launcher():
    '''
    Retrieve the editor ids and start generate_editor_dataset through
    pc.build_scaffolding with the 'Traditional' definition and a limit of 10.
    '''
    ids = retrieve_editor_ids_mongo()
    # generate_editor_dataset pops 'debug' from its keyword dictionary, so the
    # key has to be supplied here as well.
    pc.build_scaffolding(pc.load_queue, generate_editor_dataset, ids, False, False,
                         definition='Traditional', limit=10, debug=False)
 157+
 158+
 159+if __name__ == '__main__':
 160+ debug_retrieve_edits_by_contributor_launcher()
Property changes on: trunk/tools/editor_trends/construct_datasets.py
___________________________________________________________________
Added: svn:mime-type
1161 + text/plain
Added: svn:eol-style
2162 + native
Index: trunk/tools/editor_trends/init_bot_db.py
@@ -0,0 +1,196 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+
 18+import os
 19+import cStringIO
 20+import xml.etree.cElementTree as cElementTree
 21+
 22+
 23+import settings
 24+from wikitree import xml
 25+from database import db
 26+from database import db_settings
 27+from utils import utils
 28+from utils import process_constructor as pc
 29+
 30+try:
 31+ import psyco
 32+ psyco.full()
 33+except ImportError:
 34+ pass
 35+
 36+
def create_bot_ids_db_mongo():
    '''
    Fill the 'ids' collection of the 'bots' MongoDB database with one
    {'id': ..., 'name': ...} document per entry of the dictionary read by
    utils.create_dict_from_csv_file, then print the document count.
    '''
    bot_ids = utils.create_dict_from_csv_file(add_id_to_botnames, settings.ENCODING)
    collection = db.init_mongo_db('bots')['ids']

    # presumably removing with None as selector empties the collection -
    # verify against the pymongo version in use
    db.remove_documents_from_mongo_db(collection, None)

    for bot_id, bot_name in bot_ids.iteritems():
        collection.insert({'id': bot_id, 'name': bot_name})

    print(collection.count())
 48+
 49+
def create_bots_db(db_name):
    '''
    This function reads the csv file provided by Erik Zachte and constructs a
    sqlite database. The reason for this is that I suspect I will need
    some simple querying capabilities in the future, else a dictionary would
    suffice.

    @db_name is the sqlite database to fill; ':memory' (as used by the
    callers in this file) or ':memory:' selects an in-memory database

    @returns: a cursor when the database lives in memory, in which case the
    connection stays open; None otherwise
    '''
    in_memory = db_name in (':memory', ':memory:')
    # The original passed the literal string 'db_name', which always opened a
    # file with that name. sqlite itself only recognises ':memory:'.
    connection = db.init_database(':memory:' if in_memory else db_name)
    cursor = connection.cursor()
    db.create_tables(cursor, db_settings.BOT_TABLE)

    columns = db_settings.BOT_TABLE['bots']
    values = []
    for line in utils.read_data_from_csv('data/csv/StatisticsBots.csv', settings.ENCODING):
        row = []
        for (field, datatype), value in zip(columns, line.split(',')):
            if datatype == 'INTEGER':
                value = int(value)
            elif datatype == 'TEXT':
                value = value.replace('/', '-')
            row.append(value)
        values.append(row)

    cursor.executemany('INSERT INTO bots VALUES (?,?,?,?,?,?,?,?,?,?);', values)
    connection.commit()
    if in_memory:
        return cursor
    connection.close()
 81+
 82+
def retrieve_botnames_without_id(cursor, language):
    '''
    Return the name of every row in the bots table with the given language.

    @cursor is a database cursor with access to the bots table

    @language is the value to match against the language column

    @returns: list with one 1-tuple per matching row
    '''
    query = 'SELECT name FROM bots WHERE language=?'
    cursor.execute(query, (language,))
    return cursor.fetchall()
 85+
 86+
def lookup_username(input_queue, result_queue, progressbar, bots, debug=False):
    '''
    This function is used to find the id's belonging to the different bots that
    are patrolling the Wikipedia sites.

    @input_queue contains the names of the xml files to parse; a None value
    ends the worker. In debug mode it is the name of a single xml file.

    @result_queue should be set to false as the results are directly written to
    a csv file.

    @progressbar is not used by this function

    @bots is a dictionary containing the names of the bots to lookup, either
    directly or wrapped as {'bots': {...}}. Names are removed from it as their
    ids are found.

    @debug when True, input_queue is treated as one filename and the function
    stops after that file
    '''
    # The original always unwrapped bots['bots'], which fails for the plain
    # dictionary that debug_lookup_username passes in.
    if isinstance(bots.get('bots'), dict):
        bots = bots['bots']

    if settings.DEBUG:
        messages = {}

    while True:
        if debug:
            file = input_queue
        else:
            file = input_queue.get(block=False)

        if file is None:
            break

        data = xml.read_input(utils.open_txt_file(settings.XML_FILE_LOCATION +
                              file, 'r', encoding=settings.ENCODING))

        for raw_data in data:
            xml_buffer = cStringIO.StringIO()
            raw_data.insert(0, '<?xml version="1.0" encoding="UTF-8" ?>\n')
            raw_data = ''.join(raw_data)
            raw_data = raw_data.encode('utf-8')
            xml_buffer.write(raw_data)

            try:
                xml_nodes = cElementTree.XML(xml_buffer.getvalue())
                for revision in xml_nodes.findall('revision'):
                    contributor = xml.retrieve_xml_node(revision, 'contributor')
                    username = contributor.find('username')
                    if username is None:
                        continue
                    username = xml.extract_text(username)

                    if username in bots:
                        id = xml.extract_text(contributor.find('id'))
                        utils.write_data_to_csv({username: [id]}, add_id_to_botnames, settings.ENCODING)
                        bots.pop(username)
                        if bots == {}:
                            print('Mission accomplished')
                            return
            except Exception as error:
                # Best effort: report the problem and carry on with the next
                # chunk of xml.
                print(error)
                if settings.DEBUG:
                    messages = utils.track_errors(xml_buffer, error, file,
                                                  messages)

        if debug:
            # There is only one file in debug mode; without this the same
            # file would be parsed over and over again.
            break

    if settings.DEBUG:
        utils.report_error_messages(messages, lookup_username)
 155+
 156+
def add_id_to_botnames():
    '''
    This is the launcher for the multi-process version of lookup_username.
    First, the names of the bots are retrieved, then the multiprocess is
    launched by making a call to pc.build_scaffolding. This is a generic
    launcher that takes as input the function to load the input_queue, the
    function that will do the main work and the objects to be put in the
    input_queue. The launcher also accepts optional keyword arguments.
    '''
    cursor = create_bots_db(':memory')
    files = utils.retrieve_file_list(settings.XML_FILE_LOCATION, 'xml')

    rows = retrieve_botnames_without_id(cursor, 'en')
    bots = dict((row[0], 1) for row in rows)
    pc.build_scaffolding(pc.load_queue, lookup_username, files, bots=bots)
    cursor.close()
 175+
 176+
def debug_lookup_username():
    '''
    This function launches the lookup_username function in the current
    process, which eases debugging. That's also the reason why the queue
    parameters are not real queues: the file name '12.xml' is passed instead
    of the input queue and lookup_username is called with debug=True.
    '''
    cursor = create_bots_db(':memory')
    rows = retrieve_botnames_without_id(cursor, 'en')
    bots = dict((row[0], 1) for row in rows)

    lookup_username('12.xml', None, None, bots, debug=True)
    cursor.close()
 192+
 193+
 194+if __name__ == '__main__':
 195+ #debug()
 196+ #add_id_to_botnames()
 197+ create_bot_ids_db_mongo()
Property changes on: trunk/tools/editor_trends/init_bot_db.py
___________________________________________________________________
Added: svn:mime-type
1198 + text/plain
Added: svn:eol-style
2199 + native
Index: trunk/tools/editor_trends/database/__init__.py
Property changes on: trunk/tools/editor_trends/database/__init__.py
___________________________________________________________________
Added: svn:mime-type
3200 + text/plain
Added: svn:eol-style
4201 + native
Index: trunk/tools/editor_trends/database/db.py
@@ -0,0 +1,83 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+
 18+import sqlite3 as sqlite
 19+from pymongo import Connection
 20+
 21+
 22+import settings
 23+from database import db_settings
 24+
 25+
def init_mongo_db(db):
    '''
    @db is the name of the mongodb

    @returns: the database with that name, taken from a new pymongo Connection
    '''
    return Connection()[db]
 30+
 31+
def remove_documents_from_mongo_db(collection, ids):
    '''
    @collection is the object whose remove() method is called

    @ids is handed to collection.remove() unchanged
    '''
    collection.remove(ids)
 34+
 35+
def add_index_to_collection(db, collection, keys):
    '''
    @db is the name of the mongodb
    @collection is the name of the 'table' in mongodb
    @keys should be a list of keys used to create the index
    '''
    mongo = init_mongo_db(db)
    # mongo.collection would address a collection that is literally called
    # 'collection'; the index belongs on the collection that was looked up.
    target = mongo[collection]
    # The original followed create_index with an ensure_index call for the
    # same keys, which added nothing.
    target.create_index(keys)
 47+
 48+
def init_database(db=None):
    '''
    This function initializes the connection with a sqlite db.

    @db is the database to connect to; settings.DATABASE_NAME is used when
    it is None

    @returns: a sqlite connection opened with check_same_thread=False
    '''
    name = settings.DATABASE_NAME if db is None else db
    return sqlite.connect(name, check_same_thread=False)
 60+
 61+
def create_tables(cursor, tables):
    '''
    Create every table described in tables, unless it already exists.

    @cursor is a sqlite cursor

    @tables is expected to be a dictionary, with key table name and value a
    sequence of (variable name, datatype) tuples.

    Table and column names are pasted into the statement; the ? placeholder
    binds values, not identifiers, and combining it with the % operator
    raised a TypeError. Only pass trusted table definitions.
    '''
    for table, columns in tables.items():
        definition = ', '.join('%s %s' % (var, datatype) for var, datatype in columns)
        cursor.execute('CREATE TABLE IF NOT EXISTS %s (%s)' % (table, definition))
 75+
 76+
def debug():
    '''
    Create the tables listed in settings.TABLES in the default database.
    '''
    cursor = init_database().cursor()
    create_tables(cursor, settings.TABLES)
 81+
 82+
 83+if __name__ == '__main__':
 84+ debug()
Property changes on: trunk/tools/editor_trends/database/db.py
___________________________________________________________________
Added: svn:mime-type
185 + text/plain
Added: svn:eol-style
286 + native
Index: trunk/tools/editor_trends/database/db_settings.py
@@ -0,0 +1,38 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+
 18+'''
 19+This is a settings file that contains the layout of different tables. The main
 20+key will be used as the tablename while its values contain tuples containing
 21+fieldname and datatype. This is only used for sqlite.
 22+'''
# Layout of the contributors table: (fieldname, datatype) in column order.
CONTRIBUTOR_TABLE = {
    'contributors': [
        ('contributor', 'VARCHAR(64)'),
        ('article', 'INTEGER'),
        ('timestamp', 'TEXT'),
        ('bot', 'INTEGER'),
    ],
}

# Layout of the bots table: (fieldname, datatype) in column order.
BOT_TABLE = {
    'bots': [
        ('language', 'VARCHAR(12)'),
        ('name', 'VARCHAR(64)'),
        ('edits_namespace_a', 'INTEGER'),
        ('edits_namespace_x', 'INTEGER'),
        ('rank_now', 'INTEGER'),
        ('rank_prev', 'INTEGER'),
        ('first_date', 'TEXT'),
        ('days_first', 'INTEGER'),
        ('last_date', 'TEXT'),
        ('days_last', 'INTEGER'),
    ],
}
Property changes on: trunk/tools/editor_trends/database/db_settings.py
___________________________________________________________________
Added: svn:mime-type
140 + text/plain
Added: svn:eol-style
241 + native
Index: trunk/tools/editor_trends/bots/__init__.py
Property changes on: trunk/tools/editor_trends/bots/__init__.py
___________________________________________________________________
Added: svn:mime-type
342 + text/plain
Added: svn:eol-style
443 + native
Property changes on: trunk/tools/editor_trends/data/database
___________________________________________________________________
Added: svn:ignore
544 + *.db
Property changes on: trunk/tools/editor_trends/data/objects
___________________________________________________________________
Added: svn:ignore
645 + *.bin
Property changes on: trunk/tools/editor_trends/data/csv
___________________________________________________________________
Added: svn:ignore
746 + *.csv
Index: trunk/tools/editor_trends/run.bat
@@ -0,0 +1,3 @@
 2+@echo off
 3+python split_xml_file.py
 4+python map_wiki_editors.py
Index: trunk/tools/editor_trends/algorithms/__init__.py
Property changes on: trunk/tools/editor_trends/algorithms/__init__.py
___________________________________________________________________
Added: svn:mime-type
15 + text/plain
Added: svn:eol-style
26 + native
Index: trunk/tools/editor_trends/algorithms/red_wiki_editors.py
@@ -0,0 +1,40 @@
 2+import re
 3+import settings
 4+
 5+try:
 6+ import psyco
 7+ psyco.full()
 8+except ImportError:
 9+ pass
 10+
 11+
# A numeric id consists of digits only. The original pattern '\d*' also
# matched the empty string, so findall never returned exactly one hit for a
# number.
RE_ID = re.compile(r'\d+\Z')
RE_IP = re.compile(r'(?:\d{1,3}\.){2,3}\d{1,3}')  # Some of the addresses have the last 3 digits blocked as xxx


def determine_contributor_type(id):
    '''
    Classify a contributor string.

    @id is the contributor as text

    @returns: 'id' when the text consists of digits only, 'ip' when it
    contains exactly one ip-like sequence, 'name' in all other cases
    '''
    if RE_ID.match(id):
        return 'id'
    if len(RE_IP.findall(id)) == 1:
        return 'ip'
    return 'name'
 24+
def open_file_handles():
    '''
    Open id.txt, ip.txt and name.txt for writing.

    @returns: dictionary with the file name as key and the opened file
    handle as value
    '''
    import codecs  # this module does not import codecs at the top

    # Store the handle under its file name; rebinding the loop variable, as
    # the original did, left every value in the dictionary at None.
    handles = {}
    for filename in ('id.txt', 'ip.txt', 'name.txt'):
        handles[filename] = codecs.open(filename, 'w', encoding=settings.ENCODING)
    return handles
 35+
def close_file_handles(handles):
    '''
    Close every file handle stored as a value in handles.

    @handles is a dictionary with file handles as values
    '''
    for _filename, file_handle in handles.iteritems():
        file_handle.close()
 39+
def write_data(vars):
    '''Placeholder: accepts vars and does nothing with it.'''
    return None
\ No newline at end of file
Property changes on: trunk/tools/editor_trends/algorithms/red_wiki_editors.py
___________________________________________________________________
Added: svn:mime-type
142 + text/plain
Added: svn:eol-style
243 + native
Property changes on: trunk/tools/editor_trends/errors
___________________________________________________________________
Added: svn:ignore
344 + *.bin
Property changes on: trunk/tools/editor_trends
___________________________________________________________________
Added: svn:ignore
445 + wikistats
zips
notes.txt
*.pyc
datasets
errors
Added: native
546 + svn:eol-style=native

Status & tagging log