Index: trunk/tools/editor_trends/map_wiki_editors.py |
— | — | @@ -0,0 +1,266 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | + |
| 18 | +#Default Python libraries (Python => 2.6) |
| 19 | +import sys |
| 20 | +import os |
| 21 | +import time |
| 22 | +import codecs |
| 23 | +import cStringIO |
| 24 | +import re |
| 25 | +import xml.etree.cElementTree as cElementTree |
| 26 | +from multiprocessing import Queue |
| 27 | +from Queue import Empty |
| 28 | + |
| 29 | +# Custom written files |
| 30 | +import settings |
| 31 | +from utils import utils, models |
| 32 | +from database import db_settings |
| 33 | +from database import db |
| 34 | +from wikitree import xml |
| 35 | +from utils import process_constructor as pc |
| 36 | + |
| 37 | + |
| 38 | +try: |
| 39 | + import psyco |
| 40 | + psyco.full() |
| 41 | +except ImportError: |
| 42 | + pass |
| 43 | + |
| 44 | +contributors = {} |
| 45 | + |
| 46 | +RE_BOT = re.compile('bot', re.IGNORECASE) |
| 47 | +RE_SCRIPT = re.compile('script', re.IGNORECASE) |
| 48 | +#RE_NUMERIC_CHARACTER = re.compile('&#[\d{1,5}]+;') |
| 49 | +# |
| 50 | +#def remove_numeric_character_references(text): |
| 51 | +# return re.sub(RE_NUMERIC_CHARACTER, '', text) |
| 52 | +# |
| 53 | + |
| 54 | + |
| 55 | +def determine_username_is_bot(username, kwargs): |
| 56 | + ids = kwargs.get('bots', []) |
| 57 | + if ids == None: |
| 58 | + ids = [] |
| 59 | + if username != None and username.text != None: |
| 60 | + id = username.text |
| 61 | + if id in ids: |
| 62 | + return 1 |
| 63 | + else: |
| 64 | + return 0 |
| 65 | + |
| 66 | + |
| 67 | +def extract_contributor_id(contributor, kwargs): |
| 68 | + ''' |
| 69 | + @contributor is the xml contributor node containing a number of attributes |
| 70 | + |
| 71 | + Currently, we are only interested in registered contributors, hence we |
| 72 | + ignore anonymous editors. If you are interested in collecting data on |
| 73 | + anonymous editors then add the string 'ip' to the tags variable. |
| 74 | + ''' |
| 75 | + tags = ['id'] |
| 76 | + if contributor.get('deleted'): |
| 77 | + return -1 #Not sure if this is the best way to code deleted contributors. |
| 78 | + for elem in contributor: |
| 79 | + if elem.tag in tags: |
| 80 | + if elem.text != None: |
| 81 | + return elem.text.decode('utf-8') |
| 82 | + else: |
| 83 | + return -1 |
| 84 | + |
| 85 | + |
| 86 | +def output_editor_information(elem, data_queue, **kwargs): |
| 87 | + tags = {'contributor': {'editor': extract_contributor_id, 'bot': determine_username_is_bot}, |
| 88 | + 'timestamp': {'date': xml.extract_text}, |
| 89 | + } |
| 90 | + vars = {} |
| 91 | + vars['article'] = elem.find('id').text.decode(settings.ENCODING) |
| 92 | + revisions = elem.findall('revision') |
| 93 | + for revision in revisions: |
| 94 | + #print vars |
| 95 | + elements = revision.getchildren() |
| 96 | + for tag, functions in tags.iteritems(): |
| 97 | + xml_node = xml.retrieve_xml_node(elements, tag) |
| 98 | + for var, function in functions.iteritems(): |
| 99 | + vars[var] = function(xml_node, kwargs) |
| 100 | + |
| 101 | + #if vars['editor'] == '11887479' or vars['editor'] == '518794': |
| 102 | + # print vars |
| 103 | + #print '%s\t%s\t%s\t%s\t' % (vars['article'], vars['contributor'], vars['timestamp'], vars['bot']) |
| 104 | + if vars['bot'] == 0 and vars['editor'] != -1 and vars['editor'] != None: |
| 105 | + vars.pop('bot') |
| 106 | + vars['date'] = utils.convert_timestamp_to_date(vars['date']) |
| 107 | + data_queue.put(vars) |
| 108 | + vars={} |
| 109 | + |
| 110 | +def lookup_new_editors(xml_queue, data_queue, pbar, bots, debug=False, separator='\t'): |
| 111 | + if settings.DEBUG: |
| 112 | + messages = {} |
| 113 | + vars = {} |
| 114 | + while True: |
| 115 | + try: |
| 116 | + if debug: |
| 117 | + file = xml_queue |
| 118 | + else: |
| 119 | + file = xml_queue.get(block=False) |
| 120 | + #print 'parsing %s' % file |
| 121 | + if file == None: |
| 122 | + break |
| 123 | + |
| 124 | + data = xml.read_input(utils.open_txt_file(settings.XML_FILE_LOCATION |
| 125 | + + file, 'r', encoding=settings.ENCODING)) |
| 126 | + #data = read_input(sys.stdin) |
| 127 | + #print xml_queue.qsize() |
| 128 | + for raw_data in data: |
| 129 | + xml_buffer = cStringIO.StringIO() |
| 130 | + raw_data.insert(0, '<?xml version="1.0" encoding="UTF-8" ?>\n') |
| 131 | + raw_data = ''.join(raw_data) |
| 132 | + xml_buffer.write(raw_data) |
| 133 | + |
| 134 | + try: |
| 135 | + elem = cElementTree.XML(xml_buffer.getvalue()) |
| 136 | + output_editor_information(elem, data_queue, bots=bots) |
| 137 | + except SyntaxError, error: |
| 138 | + print error |
| 139 | + #There are few cases with invalid tokens, they are fixed |
| 140 | + #here and then reinserted into the XML DOM |
| 141 | + #data = convert_html_entities(xml_buffer.getvalue()) |
| 142 | + #elem = cElementTree.XML(data) |
| 143 | + #output_editor_information(elem) |
| 144 | + if settings.DEBUG: |
| 145 | + utils.track_errors(xml_buffer, error, file, messages) |
| 146 | + except UnicodeEncodeError, error: |
| 147 | + print error |
| 148 | + if settings.DEBUG: |
| 149 | + utils.track_errors(xml_buffer, error, file, messages) |
| 150 | + #finally: |
| 151 | + |
| 152 | + |
| 153 | + if pbar: |
| 154 | + print xml_queue.qsize() |
| 155 | + #utils.update_progressbar(pbar, xml_queue) |
| 156 | + if debug: |
| 157 | + break |
| 158 | + |
| 159 | + except Empty: |
| 160 | + break |
| 161 | + |
| 162 | + if settings.DEBUG: |
| 163 | + utils.report_error_messages(messages, lookup_new_editors) |
| 164 | + |
| 165 | + |
| 166 | +def store_data_mongo(data_queue, pids): |
| 167 | + mongo = db.init_mongo_db('editors') |
| 168 | + collection = mongo['editors'] |
| 169 | + values = [] |
| 170 | + while True: |
| 171 | + try: |
| 172 | + chunk = data_queue.get(block=False) |
| 173 | + values.append(chunk) |
| 174 | + #print chunk |
| 175 | + if len(values) == 100000: |
| 176 | + collection.insert(values) |
| 177 | + values = [] |
| 178 | + #print data_queue.qsize() |
| 179 | + data_queue.task_done() |
| 180 | + except Empty: |
| 181 | + # The queue is empty but store the remaining values if present |
| 182 | + if values != []: |
| 183 | + collection.insert(values) |
| 184 | + values = [] |
| 185 | + |
| 186 | + #print [utils.check_if_process_is_running(pid) for pid in pids] |
| 187 | + ''' |
| 188 | + This checks whether the Queue is empty because the preprocessors are |
| 189 | + finished or because this function is faster in emptying the Queue |
| 190 | + then the preprocessors are able to fill it. If this preprocessors |
| 191 | + are finished and this Queue is empty than break, else wait for the |
| 192 | + Queue to fill. |
| 193 | + ''' |
| 194 | + if all([utils.check_if_process_is_running(pid) for pid in pids]): |
| 195 | + pass |
| 196 | + else: |
| 197 | + break |
| 198 | + |
| 199 | + |
| 200 | +def store_data_db(data_queue, pids): |
| 201 | + connection = db.init_database() |
| 202 | + cursor = connection.cursor() |
| 203 | + db.create_tables(cursor, db_settings.CONTRIBUTOR_TABLE) |
| 204 | + |
| 205 | + empty = 0 |
| 206 | + |
| 207 | + values = [] |
| 208 | + while True: |
| 209 | + try: |
| 210 | + chunk = data_queue.get(block=False) |
| 211 | + contributor = chunk['contributor'].encode(settings.ENCODING) |
| 212 | + article = chunk['article'] |
| 213 | + timestamp = chunk['timestamp'].encode(settings.ENCODING) |
| 214 | + bot = chunk['bot'] |
| 215 | + values.append((contributor, article, timestamp, bot)) |
| 216 | + |
| 217 | + if len(values) == 50000: |
| 218 | + cursor.executemany('INSERT INTO contributors VALUES (?,?,?,?)', values) |
| 219 | + connection.commit() |
| 220 | + #print 'Size of queue: %s' % data_queue.qsize() |
| 221 | + values = [] |
| 222 | + |
| 223 | + except Empty: |
| 224 | + if all([utils.check_if_process_is_running(pid) for pid in pids]): |
| 225 | + pass |
| 226 | + else: |
| 227 | + break |
| 228 | + connection.close() |
| 229 | + |
| 230 | + |
| 231 | +def run_stand_alone(): |
| 232 | + files = utils.retrieve_file_list(settings.XML_FILE_LOCATION, 'xml') |
| 233 | + #files = files[:2] |
| 234 | + mongo = db.init_mongo_db('bots') |
| 235 | + bots = mongo['ids'] |
| 236 | + ids = {} |
| 237 | + cursor = bots.find() |
| 238 | + |
| 239 | + for bot in cursor: |
| 240 | + ids[bot['id']] = bot['name'] |
| 241 | + pc.build_scaffolding(pc.load_queue, lookup_new_editors, files, store_data_mongo, True, bots=ids) |
| 242 | + db.add_index_to_collection('editors', 'date') |
| 243 | + db.add_index_to_collection('editors', 'name') |
| 244 | + |
| 245 | +def debug_lookup_new_editors(): |
| 246 | + q = Queue() |
| 247 | + import progressbar |
| 248 | + pbar = progressbar.ProgressBar().start() |
| 249 | + edits = db.init_mongo_db('editors') |
| 250 | + lookup_new_editors('1.xml', q, None, None, True) |
| 251 | + db.add_index_to_collection('editors', 'date') |
| 252 | + db.add_index_to_collection('editors', 'name') |
| 253 | + |
| 254 | + |
| 255 | + |
| 256 | +def run_hadoop(): |
| 257 | + pass |
| 258 | + |
| 259 | + |
| 260 | +if __name__ == "__main__": |
| 261 | + #debug_lookup_new_editors() |
| 262 | + |
| 263 | + if settings.RUN_MODE == 'stand_alone': |
| 264 | + run_stand_alone() |
| 265 | + print 'Finished processing XML files.' |
| 266 | + else: |
| 267 | + run_hadoop() |
Property changes on: trunk/tools/editor_trends/map_wiki_editors.py |
___________________________________________________________________ |
Added: svn:mime-type |
1 | 268 | + text/plain |
Added: svn:eol-style |
2 | 269 | + native |
Index: trunk/tools/editor_trends/wikitree/__init__.py |
Property changes on: trunk/tools/editor_trends/wikitree/__init__.py |
___________________________________________________________________ |
Added: svn:mime-type |
3 | 270 | + text/plain |
Added: svn:eol-style |
4 | 271 | + native |
Index: trunk/tools/editor_trends/wikitree/xml.py |
— | — | @@ -0,0 +1,49 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | + |
| 18 | +from utils import utils |
| 19 | +import settings |
| 20 | + |
| 21 | + |
| 22 | +def convert_html_entities(text): |
| 23 | + return utils.unescape(text) |
| 24 | + |
| 25 | + |
| 26 | +def extract_text(elem, kwargs): |
| 27 | + if elem != None and elem.text != None: |
| 28 | + return elem.text.decode(settings.ENCODING) |
| 29 | + return None |
| 30 | + |
| 31 | + |
| 32 | +def retrieve_xml_node(xml_nodes, name): |
| 33 | + for xml_node in xml_nodes: |
| 34 | + if xml_node.tag == name: |
| 35 | + return xml_node |
| 36 | + return None #maybe this should be replaced with an NotFoundError |
| 37 | + |
| 38 | + |
| 39 | +def read_input(file): |
| 40 | + lines = [] |
| 41 | + for line in file: |
| 42 | + lines.append(line) |
| 43 | + if line.find('</page>') > -1: |
| 44 | + yield lines |
| 45 | + ''' |
| 46 | + #This looks counter intuitive but Python continues with this call |
| 47 | + after it has finished the yield statement |
| 48 | + ''' |
| 49 | + lines = [] |
| 50 | + file.close() |
Property changes on: trunk/tools/editor_trends/wikitree/xml.py |
___________________________________________________________________ |
Added: svn:mime-type |
1 | 51 | + text/plain |
Added: svn:eol-style |
2 | 52 | + native |
Index: trunk/tools/editor_trends/__init__.py |
— | — | @@ -0,0 +1,14 @@ |
| 2 | +import os |
| 3 | +import sys |
| 4 | + |
| 5 | +WORKING_DIRECTORY = os.getcwd()#[:-9] |
| 6 | +IGNORE_DIRS = ['wikistats', 'zips'] |
| 7 | + |
| 8 | +dirs = [name for name in os.listdir(WORKING_DIRECTORY) if |
| 9 | + os.path.isdir(os.path.join(WORKING_DIRECTORY, name))] |
| 10 | + |
| 11 | + |
| 12 | +for subdirname in dirs: |
| 13 | + if not subdirname.startswith('.') and subdirname not in IGNORE_DIRS: |
| 14 | + sys.path.append(os.path.join(WORKING_DIRECTORY, subdirname)) |
| 15 | + #print os.path.join(WORKING_DIRECTORY, subdirname) |
Property changes on: trunk/tools/editor_trends/__init__.py |
___________________________________________________________________ |
Added: svn:mime-type |
1 | 16 | + text/plain |
Added: svn:eol-style |
2 | 17 | + native |
Index: trunk/tools/editor_trends/.svn_ignore |
— | — | @@ -0,0 +1,12 @@ |
| 2 | +*.pyc
|
| 3 | +*.xml
|
| 4 | +*.db
|
| 5 | +*.bin
|
| 6 | +*.zip
|
| 7 | +*.csv
|
| 8 | +.*
|
| 9 | +zips/
|
| 10 | +wikistats/
|
| 11 | +datasets/
|
| 12 | +data/
|
| 13 | +notes.txt |
\ No newline at end of file |
Index: trunk/tools/editor_trends/settings.py |
— | — | @@ -0,0 +1,91 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | + |
| 18 | + |
| 19 | +''' |
| 20 | +This file contains settings that are used for constructing and analyzing |
| 21 | +the datasets as part of the Editor Dynamics and Anti-Vandalism projects. |
| 22 | +''' |
| 23 | + |
| 24 | + |
| 25 | +from multiprocessing import cpu_count |
| 26 | +import os |
| 27 | +import sys |
| 28 | +import platform |
| 29 | + |
| 30 | +#Setting up the environment |
| 31 | +ops = {platform.win32_ver: 'Windows', |
| 32 | + platform.linux_distribution: 'Linux', |
| 33 | + platform.mac_ver: 'OSX'} |
| 34 | +for op in ops: |
| 35 | + if op() != ('', '', '') and op() != ('', ('', '', ''), '') and op() != ('', '', '', ''): |
| 36 | + OS = ops[op] |
| 37 | + |
| 38 | +WORKING_DIRECTORY = os.getcwd()#[:-9] |
| 39 | +IGNORE_DIRS = ['wikistats', 'zips'] |
| 40 | + |
| 41 | +dirs = [name for name in os.listdir(WORKING_DIRECTORY) if os.path.isdir(os.path.join(WORKING_DIRECTORY, name))] |
| 42 | +for subdirname in dirs: |
| 43 | + if not subdirname.startswith('.') and subdirname not in IGNORE_DIRS: |
| 44 | + sys.path.append(os.path.join(WORKING_DIRECTORY, subdirname)) |
| 45 | + |
| 46 | + |
| 47 | +#General settings |
| 48 | + |
| 49 | +# Valid values are 'stand_alone' and 'hadoop' |
| 50 | +RUN_MODE = 'stand_alone' |
| 51 | + |
| 52 | +# If true then some more detailed debug information is collected |
| 53 | +DEBUG = True |
| 54 | + |
| 55 | +#If True then it will display a progress bar on the console. |
| 56 | +PROGRESS_BAR = True |
| 57 | + |
| 58 | +#Date format as used by Erik Zachte |
| 59 | +DATE_FORMAT = '%Y-%m-%d' |
| 60 | + |
| 61 | +# Timestamp format as generated by the MediaWiki dumps |
| 62 | +DATETIME_FORMAT = '%Y-%m-%dT%H:%M:%SZ' |
| 63 | + |
| 64 | +#This section contains configuration variables for the different file locations. |
| 65 | + |
| 66 | +# Location where to write xml chunks |
| 67 | +XML_FILE_LOCATION = 'C:/wikimedia/' |
| 68 | + |
| 69 | +# Input file |
| 70 | +XML_FILE = 'C:/Source_Files/enwiki-20100916-stub-meta-history.xml' |
| 71 | + |
| 72 | +# This is the place where error messages are stored for debugging purposes |
| 73 | +ERROR_MESSAGE_FILE_LOCATION = WORKING_DIRECTORY + '/errors/' |
| 74 | + |
| 75 | +DATABASE_FILE_LOCATION = WORKING_DIRECTORY + '/data/database/' |
| 76 | + |
| 77 | +BINARY_OBJECT_FILE_LOCATION = WORKING_DIRECTORY + '/data/objects/' |
| 78 | + |
| 79 | +#This section contains configuration variables for parsing / encoding and |
| 80 | +#working with the XML files. |
| 81 | + |
| 82 | +# ==64Mb, see http://hadoop.apache.org/common/docs/r0.20.0/hdfs_design.html#Large+Data+Sets for reason |
| 83 | +MAX_XML_FILE_SIZE = 67108864 |
| 84 | + |
| 85 | +ENCODING = 'utf-8' |
| 86 | + |
| 87 | +# Name space, do not change as this works for Mediawiki wikis |
| 88 | +NAME_SPACE = 'http://www.mediawiki.org/xml/export-0.4/' |
| 89 | + |
| 90 | +#Multiprocess settings used to parallelize workload |
| 91 | +#Change this to match your computers configuration (RAM / CPU) |
| 92 | +NUMBER_OF_PROCESSES = cpu_count() * 1 |
Property changes on: trunk/tools/editor_trends/settings.py |
___________________________________________________________________ |
Added: svn:mime-type |
1 | 93 | + text/plain |
Added: svn:eol-style |
2 | 94 | + native |
Index: trunk/tools/editor_trends/utils/__init__.py |
Property changes on: trunk/tools/editor_trends/utils/__init__.py |
___________________________________________________________________ |
Added: svn:mime-type |
3 | 95 | + text/plain |
Added: svn:eol-style |
4 | 96 | + native |
Index: trunk/tools/editor_trends/utils/utils.py |
— | — | @@ -0,0 +1,267 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | + |
| 18 | +''' |
| 19 | +The utils module contains helper functions that will be needed throughout. |
| 20 | +It provides functions to read / write data to text and binary files, fix markup |
| 21 | +and track error messages. |
| 22 | +''' |
| 23 | + |
| 24 | +import re |
| 25 | +import htmlentitydefs |
| 26 | +import cPickle |
| 27 | +import datetime |
| 28 | +import codecs |
| 29 | +import os |
| 30 | +import ctypes |
| 31 | + |
| 32 | +import settings |
| 33 | + |
| 34 | + |
| 35 | +try: |
| 36 | + import psyco |
| 37 | + psyco.full() |
| 38 | +except ImportError: |
| 39 | + pass |
| 40 | + |
| 41 | + |
| 42 | +RE_ERROR_LOCATION = re.compile('\d+') |
| 43 | +RE_NUMERIC_CHARACTER = re.compile('&#?\w+;') |
| 44 | + |
| 45 | + |
| 46 | +def convert_timestamp_to_date(timestamp): |
| 47 | + return datetime.datetime.strptime(timestamp[:10], settings.DATE_FORMAT) |
| 48 | + |
| 49 | + |
| 50 | +def convert_timestamp_to_datetime(timestamp): |
| 51 | + return datetime.datetime.strptime(timestamp, settings.DATETIME_FORMAT) |
| 52 | + |
| 53 | + |
| 54 | +def check_if_process_is_running(pid): |
| 55 | + try: |
| 56 | + if settings.OS == 'Windows': |
| 57 | + PROCESS_TERMINATE = 1 |
| 58 | + handle = ctypes.windll.kernel32.OpenProcess(PROCESS_TERMINATE, False, pid) |
| 59 | + if handle != 0: |
| 60 | + return True |
| 61 | + else: |
| 62 | + return False |
| 63 | + else: |
| 64 | + os.kill(pid, 0) |
| 65 | + return True |
| 66 | + except Exception, error: |
| 67 | + print error |
| 68 | + return False |
| 69 | + |
| 70 | + |
| 71 | +# error tracking related functions |
| 72 | +def track_errors(xml_buffer, error, file, messages): |
| 73 | + text = extract_offending_string(xml_buffer.getvalue(), error) |
| 74 | + |
| 75 | + vars = {} |
| 76 | + vars['file'] = file |
| 77 | + vars['error'] = error |
| 78 | + vars['text'] = text |
| 79 | + #print file, error, text |
| 80 | + key = remove_error_specific_information(error) |
| 81 | + if key not in messages: |
| 82 | + messages[key] = {} |
| 83 | + if messages[key] == {}: |
| 84 | + c = 0 |
| 85 | + else: |
| 86 | + counters = messages[key].keys() |
| 87 | + counters.sort() |
| 88 | + counters.reverse() |
| 89 | + c = counters[0] + 1 |
| 90 | + |
| 91 | + messages[key][c] = {} |
| 92 | + for var in vars: |
| 93 | + messages[key][c][var] = vars[var] |
| 94 | + |
| 95 | + return messages |
| 96 | + |
| 97 | + |
| 98 | +def report_error_messages(messages, function): |
| 99 | + store_object(messages, settings.ERROR_MESSAGE_FILE_LOCATION, function.func_name) |
| 100 | + errors = messages.keys() |
| 101 | + for error in errors: |
| 102 | + for key, value in messages[error].iteritems(): |
| 103 | + print error, key, value |
| 104 | + |
| 105 | + |
| 106 | +def remove_error_specific_information(e): |
| 107 | + pos = e.args[0].find('line') |
| 108 | + if pos > -1: |
| 109 | + return e.args[0][:pos] |
| 110 | + else: |
| 111 | + return e.args[0] |
| 112 | + |
| 113 | + |
| 114 | +def extract_offending_string(text, error): |
| 115 | + ''' |
| 116 | + This function determines the string that causes an error when feeding it to |
| 117 | + the XML parser. This is only useful for debugging purposes. |
| 118 | + ''' |
| 119 | + location = re.findall(RE_ERROR_LOCATION, error.args[0]) |
| 120 | + if location != []: |
| 121 | + location = int(location[0]) - 1 |
| 122 | + text = text.split('\n')[location] |
| 123 | + text = text.decode('utf-8') |
| 124 | + return text |
| 125 | + else: |
| 126 | + return '' |
| 127 | + |
| 128 | + |
| 129 | +# read / write data related functions |
| 130 | +def read_data_from_csv(filename, encoding): |
| 131 | + if hasattr(filename, '__call__'): |
| 132 | + filename = construct_filename_from_function(filename, '.csv') |
| 133 | + |
| 134 | + fh = open_txt_file(filename, 'r', encoding=encoding) |
| 135 | + for line in fh: |
| 136 | + yield line |
| 137 | + |
| 138 | + fh.close() |
| 139 | + |
| 140 | + |
| 141 | +def write_data_to_csv(data, function, encoding): |
| 142 | + filename = construct_filename_from_function(function, '.csv') |
| 143 | + fh = open_txt_file(filename, 'a', encoding=encoding) |
| 144 | + keys = data.keys() |
| 145 | + for key in keys: |
| 146 | + for value in data[key]: |
| 147 | + fh.write('%s\t%s\n' % (key, value)) |
| 148 | + fh.close() |
| 149 | + |
| 150 | + |
| 151 | +def open_txt_file(filename, mode, encoding): |
| 152 | + return codecs.open(filename, mode, encoding=encoding) |
| 153 | + |
| 154 | +def construct_filename_from_function(function, extension): |
| 155 | + return function.func_name + extension |
| 156 | + |
| 157 | +def check_file_exists(location, filename): |
| 158 | + if hasattr(filename, '__call__'): |
| 159 | + filename = construct_filename_from_function(filename, '.bin') |
| 160 | + if os.path.exists(location + filename): |
| 161 | + return True |
| 162 | + else: |
| 163 | + return False |
| 164 | + |
| 165 | + |
| 166 | +def store_object(object, location, filename): |
| 167 | + if hasattr(filename, '__call__'): |
| 168 | + filename = construct_filename_from_function(filename, '.bin') |
| 169 | + if not filename.endswith('.bin'): |
| 170 | + filename = filename + '.bin' |
| 171 | + fh = open(location + filename, 'wb') |
| 172 | + cPickle.dump(object, fh) |
| 173 | + fh.close() |
| 174 | + |
| 175 | + |
| 176 | +def load_object(location, filename): |
| 177 | + if hasattr(filename, '__call__'): |
| 178 | + filename = construct_filename_from_function(filename, '.bin') |
| 179 | + if not filename.endswith('.bin'): |
| 180 | + filename = filename + '.bin' |
| 181 | + fh = open(location + filename, 'rb') |
| 182 | + obj = cPickle.load(fh) |
| 183 | + fh.close() |
| 184 | + return obj |
| 185 | + |
| 186 | + |
| 187 | +def clean_string(string): |
| 188 | + string = string.replace('\n', '') |
| 189 | + return string |
| 190 | + |
| 191 | + |
| 192 | +def create_dict_from_csv_file(filename, encoding): |
| 193 | + d = {} |
| 194 | + for line in read_data_from_csv(filename, encoding): |
| 195 | + line = clean_string(line) |
| 196 | + value, key = line.split('\t') |
| 197 | + d[key] = value |
| 198 | + |
| 199 | + return d |
| 200 | + |
| 201 | + |
| 202 | +def retrieve_file_list(location, extension): |
| 203 | + all_files = os.listdir(location) |
| 204 | + if not extension.startswith('.'): |
| 205 | + extension = '.' + extension |
| 206 | + files = [] |
| 207 | + for file in all_files: |
| 208 | + if file.endswith(extension): |
| 209 | + files.append(file) |
| 210 | + |
| 211 | + return files |
| 212 | + |
| 213 | + |
| 214 | +# Progress bar related functions |
| 215 | +def update_progressbar(pbar, queue): |
| 216 | + ''' |
| 217 | + Updates the progressbar by determining how much work is left in a queue |
| 218 | + ''' |
| 219 | + x = pbar.maxval - queue.qsize() |
| 220 | + ''' |
| 221 | + Currently, calling the pbar.update function gives the following error: |
| 222 | + File "build\bdist.win32\egg\progressbar.py", line 352, in update |
| 223 | + self.fd.write(self._format_line() + '\r') |
| 224 | + ValueError: I/O operation on closed file |
| 225 | + Not sure how to fix this, that's why the line is commented. |
| 226 | + ''' |
| 227 | + #pbar.update(x) |
| 228 | + |
| 229 | + |
| 230 | +def humanize_time_difference(seconds_elapsed): |
| 231 | + """ |
| 232 | + Returns a humanized string representing time difference. |
| 233 | + It will only output the first two time units, so days and |
| 234 | + hours, or hours and minutes, except when there are only |
| 235 | + seconds. |
| 236 | + """ |
| 237 | + seconds_elapsed = int(seconds_elapsed) |
| 238 | + humanized_time = {} |
| 239 | + time_units = [('days', 86400), ('hours', 3600), ('minutes', 60), ('seconds', 1)] |
| 240 | + for time, unit in time_units: |
| 241 | + dt = seconds_elapsed / unit |
| 242 | + if dt > 0: |
| 243 | + humanized_time[time] = dt |
| 244 | + seconds_elapsed = seconds_elapsed - (unit * humanized_time[time]) |
| 245 | + #humanized_time['seconds'] = seconds_elapsed |
| 246 | + |
| 247 | + x = 0 |
| 248 | + if len(humanized_time) == 1: |
| 249 | + return '%s %s' % (humanized_time.values()[0], humanized_time.keys()[0]) |
| 250 | + else: |
| 251 | + obs = [] |
| 252 | + for time, unit in time_units: |
| 253 | + if time in humanized_time: |
| 254 | + unit = humanized_time.get(time, None) |
| 255 | + if humanized_time[time] == 1: |
| 256 | + time = time[:-1] |
| 257 | + obs.append((time, unit)) |
| 258 | + x += 1 |
| 259 | + if x == 2: |
| 260 | + return '%s %s and %s %s' % (obs[0][1], obs[0][0], obs[1][1], obs[1][0]) |
| 261 | + |
| 262 | + |
| 263 | +def debug(): |
| 264 | + dt = humanize_time_difference(64) |
| 265 | + print dt |
| 266 | + |
| 267 | +if __name__ == '__main__': |
| 268 | + debug() |
Property changes on: trunk/tools/editor_trends/utils/utils.py |
___________________________________________________________________ |
Added: svn:mime-type |
1 | 269 | + text/plain |
Added: svn:eol-style |
2 | 270 | + native |
Index: trunk/tools/editor_trends/utils/models.py |
— | — | @@ -0,0 +1,54 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | + |
| 18 | +import multiprocessing |
| 19 | + |
| 20 | + |
| 21 | +class ProcessInputQueue(multiprocessing.Process): |
| 22 | + |
| 23 | + def __init__(self, target, input_queue, result_queue, pbar, **kwargs): |
| 24 | + multiprocessing.Process.__init__(self) |
| 25 | + self.input_queue = input_queue |
| 26 | + self.result_queue = result_queue |
| 27 | + self.target = target |
| 28 | + self.progressbar = pbar |
| 29 | + for kw in kwargs: |
| 30 | + setattr(self, kw, kwargs[kw]) |
| 31 | + |
| 32 | + def run(self): |
| 33 | + proc_name = self.name |
| 34 | + kwargs = {} |
| 35 | + IGNORE = [self.input_queue, self.result_queue, self.target, |
| 36 | + self.progressbar] |
| 37 | + for kw in self.__dict__: |
| 38 | + if kw not in IGNORE and not kw.startswith('_'): |
| 39 | + kwargs[kw] = getattr(self, kw) |
| 40 | + |
| 41 | + self.target(self.input_queue, self.result_queue, self.progressbar, kwargs) |
| 42 | + |
| 43 | + |
| 44 | +class ProcessResultQueue(multiprocessing.Process): |
| 45 | + |
| 46 | + def __init__(self, target, result_queue, pids, pbar): |
| 47 | + multiprocessing.Process.__init__(self) |
| 48 | + self.result_queue = result_queue |
| 49 | + self.target = target |
| 50 | + self.progressbar = pbar |
| 51 | + self.pids = pids |
| 52 | + |
| 53 | + def run(self): |
| 54 | + proc_name = self.name |
| 55 | + self.target(self.result_queue, self.pids) |
Property changes on: trunk/tools/editor_trends/utils/models.py |
___________________________________________________________________ |
Added: svn:mime-type |
1 | 56 | + text/plain |
Added: svn:eol-style |
2 | 57 | + native |
Index: trunk/tools/editor_trends/utils/process_constructor.py |
— | — | @@ -0,0 +1,113 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | + |
| 18 | +from multiprocessing import Process, Queue, JoinableQueue |
| 19 | +from Queue import Empty |
| 20 | + |
| 21 | +import settings |
| 22 | +import utils |
| 23 | +import models |
| 24 | + |
| 25 | +#3rd party dependency |
| 26 | +import progressbar |
| 27 | + |
| 28 | + |
def build_scaffolding(load_input_queue, main, obj, result_processor=False, result_queue=False, **kwargs):
    '''
    This a generic producer/consumer process launcher. It can launch two types
    of processes:
    a) Processes that take a task from a queue and do their thing
    b) Processes that take a task from a queue and put the result in the
    result_queue.
    If result_queue is False then a) is assumed.

    @load_input_queue is a function that is used to insert jobs into queue

    @main is the function that will process the input_queue

    @obj can be a pickled object or an enumerable variable that will be loaded
    into the input_queue

    @result_queue, if set to True will become a true queue and will be provided
    to main whose job it is to fill with new tasks. If False then this variable
    is ignored.

    @result_processor, name of the function to process the @result_queue

    @kwargs is a dictionary with optional variables. Used to supply to main
    '''

    input_queue = Queue()
    # The truthy flag is replaced by a real queue object: from here on
    # `result_queue` is either False or a JoinableQueue instance.
    if result_queue:
        result_queue = JoinableQueue()

    # Pre-load every task plus one None poison pill per worker so consumers
    # know when to shut down.
    load_input_queue(input_queue, obj, poison_pill=True)

    if settings.PROGRESS_BAR:
        # NOTE(review): qsize() includes the poison pills, so maxval slightly
        # overstates the amount of real work.
        pbar = progressbar.ProgressBar(maxval=input_queue.qsize()).start()
    else:
        pbar = False


    input_processes = [models.ProcessInputQueue(main, input_queue, result_queue,
        pbar, **kwargs) for i in xrange(settings.NUMBER_OF_PROCESSES)]

    for input_process in input_processes:
        input_process.start()
    pids = [p.pid for p in input_processes]

    if result_queue:
        # A single consumer drains the result queue; it is given @pids so it
        # can tell when all producers have exited.
        result_processes = [models.ProcessResultQueue(result_processor,
            result_queue, pids, pbar) for i in xrange(1)]
        for result_process in result_processes:
            result_process.start()

    # Wait for producers first, then (if any) the result consumer.
    for input_process in input_processes:
        print 'Waiting for input process to finish'
        input_process.join()
        print 'Input process finished'

    if result_queue:
        for result_process in result_processes:
            print 'Waiting for result process to finish.'
            result_process.join()
            print 'Result process finished'

    if pbar:
        pbar.finish()
        print 'Total elapsed time: %s.' % (utils.humanize_time_difference(pbar.seconds_elapsed))
| 93 | + |
| 94 | + |
def load_queue(input_queue, obj, poison_pill=False):
    '''
    Fill a multiprocessing queue with tasks.

    @input_queue should be an instance of multiprocessing.Queue

    @obj is either the filename of a pickled iterable (a string) or an
    iterable that directly contains the tasks

    @poison_pill, when True, appends one None sentinel per worker process so
    that consumers know when to terminate

    @returns: queue with tasks
    '''
    # Bug fix: the old check `isinstance(obj, type(list))` compared obj
    # against the metaclass `type`, so the pickled-filename branch was
    # unreachable and strings were iterated character by character.
    # NOTE(review): py2 callers may also pass unicode filenames — confirm.
    if isinstance(obj, str):
        data = utils.load_object(obj)
    else:
        data = obj
    for task in data:
        input_queue.put(task)

    if poison_pill:
        for _ in xrange(settings.NUMBER_OF_PROCESSES):
            input_queue.put(None)
    return input_queue
Property changes on: trunk/tools/editor_trends/utils/process_constructor.py |
___________________________________________________________________ |
Added: svn:mime-type |
1 | 115 | + text/plain |
Added: svn:eol-style |
2 | 116 | + native |
Index: trunk/tools/editor_trends/requirements.txt |
— | — | @@ -0,0 +1,2 @@ |
| 2 | +progressbar==2.3-dev |
| 3 | +psyco==1.6 |
Property changes on: trunk/tools/editor_trends/requirements.txt |
___________________________________________________________________ |
Added: svn:mime-type |
1 | 4 | + text/plain |
Added: svn:eol-style |
2 | 5 | + native |
Index: trunk/tools/editor_trends/split_xml_file.py |
— | — | @@ -0,0 +1,127 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | + |
| 18 | + |
| 19 | +import xml.etree.cElementTree as cElementTree |
| 20 | +import codecs |
| 21 | +import utils |
| 22 | +import re |
| 23 | +import settings |
| 24 | + |
| 25 | +try: |
| 26 | + import psyco |
| 27 | + psyco.full() |
| 28 | +except ImportError: |
| 29 | + pass |
| 30 | + |
| 31 | + |
| 32 | +RE_NUMERIC_CHARACTER = re.compile('&#(\d+);') |
| 33 | + |
| 34 | +#def convert_html_entities(text): |
| 35 | +# return utils.unescape(text) |
| 36 | + |
| 37 | + |
def remove_numeric_character_references(text):
    '''Expand decimal numeric character references (&#NNN;) in `text` into
    the characters they denote and return the result UTF-8 encoded.'''
    expanded = RE_NUMERIC_CHARACTER.sub(lenient_deccharref, text)
    return expanded.encode('utf-8')
| 40 | + |
| 41 | + |
def lenient_deccharref(m):
    '''Decode one decimal character-reference regex match to its character.'''
    codepoint = int(m.group(1))
    return unichr(codepoint)
| 44 | + |
| 45 | + |
def remove_namespace(element, namespace):
    '''Remove the given namespace prefix from every tag in the tree.

    @element is the root Element, rewritten in place
    @namespace is the namespace URI to strip
    @returns the same element, for convenient chaining
    '''
    prefix = u'{%s}' % namespace
    prefix_length = len(prefix)
    # getiterator() is deprecated (removed in Python 3.9); iter() walks the
    # same nodes. NOTE(review): Element.iter() needs Python 2.7+.
    for elem in element.iter():
        if elem.tag.startswith(prefix):
            elem.tag = elem.tag[prefix_length:]
    return element
| 54 | + |
| 55 | + |
def parse_comments(xml, function):
    '''Apply `function` to the text of every revision comment in `xml`.

    @xml is a <page> element containing <revision> children
    @function takes the comment text and returns the cleaned-up text
    @returns the same element with comment texts rewritten in place
    '''
    revisions = xml.findall('revision')
    for revision in revisions:
        comment = revision.find('comment')
        # A revision may have no <comment> node, or an empty one; both are
        # skipped. (Dead debug code and an unused timestamp lookup removed.)
        if comment is not None and comment.text is not None:
            comment.text = function(comment.text)
    return xml
| 78 | + |
| 79 | + |
def write_xml_file(element, fh, counter):
    '''Serialize `element` and append it to the current chunk file.

    Rolls over to a new chunk when the size limit would be exceeded.
    @returns the (possibly new) file handle and chunk counter.
    '''
    # Serialize once; the original called cElementTree.tostring() twice,
    # doing the whole serialization work a second time just for the write.
    data = cElementTree.tostring(element)
    fh, counter = create_xml_file_handle(fh, counter, len(data))
    fh.write(data)
    fh.write('\n')
    return fh, counter
| 87 | + |
| 88 | + |
| 89 | +def create_xml_file_handle(fh, counter, size): |
| 90 | + '''Create file handle if none is supplied or if file size > max file size.''' |
| 91 | + if not fh: |
| 92 | + counter = 0 |
| 93 | + fh = codecs.open(settings.LOCATION + str(counter) + '.xml', 'w', encoding=settings.ENCODING) |
| 94 | + return fh, counter |
| 95 | + elif (fh.tell() + size) > settings.MAX_XML_FILE_SIZE: |
| 96 | + print 'Created chunk %s' % counter |
| 97 | + fh.close |
| 98 | + counter += 1 |
| 99 | + fh = codecs.open(settings.LOCATION + str(counter) + '.xml', 'w', encoding=settings.ENCODING) |
| 100 | + return fh, counter |
| 101 | + else: |
| 102 | + return fh, counter |
| 103 | + |
| 104 | + |
def split_xml():
    '''Reads xml file and splits it in N chunks'''
    # fh/counter start as None; create_xml_file_handle lazily opens chunk 0
    # on the first write.
    fh = None
    counter = None
    tag = '{%s}page' % settings.NAME_SPACE

    # Stream the dump with iterparse so the whole file never sits in memory.
    context = cElementTree.iterparse(settings.XML_FILE, events=('start', 'end'))
    context = iter(context)
    event, root = context.next() # get the root element of the XML doc

    for event, elem in context:
        if event == 'end':
            if elem.tag == tag:
                elem = remove_namespace(elem, settings.NAME_SPACE)
                elem = parse_comments(elem, remove_numeric_character_references)
                #elem = parse_comments(elem, convert_html_entities)
                #elem = parse_comments(elem, remove_ascii_control_characters)
                fh, counter = write_xml_file(elem, fh, counter)
                #print cElementTree.tostring(elem)
            root.clear() # when done parsing a section clear the tree to save memory
| 125 | + |
| 126 | + |
if __name__ == "__main__":
    # Entry point: chop the configured XML dump into smaller chunk files.
    split_xml()
Property changes on: trunk/tools/editor_trends/split_xml_file.py |
___________________________________________________________________ |
Added: svn:mime-type |
1 | 129 | + text/plain |
Added: svn:eol-style |
2 | 130 | + native |
Index: trunk/tools/editor_trends/README.1ST |
— | — | @@ -0,0 +1,65 @@ |
| 2 | +===============================================================================
|
| 3 | +
|
| 4 | + Wikipedia Editor Trends Analytics
|
| 5 | +
|
| 6 | +===============================================================================
|
| 7 | +
|
| 8 | +BACKGROUND:
|
| 9 | +This package offers a set of tools used to create datasets to analyze Editor
|
| 10 | +Trends. By Editor Trends we refer to the overall pattern of entering and leaving
|
| 11 | +a Wikipedia site. The main information source for this package is:
|
| 12 | + http://strategy.wikimedia.org/wiki/Editor_Trends_Study
|
| 13 | +
|
| 14 | +REQUIREMENTS:
|
| 15 | +
|
| 16 | +* Python 2.6 or higher (this code has not been tested with Python 3.x)
|
| 17 | +
|
| 18 | +OPTIONAL
|
| 19 | +* MongoDB
|
| 20 | +
|
If you don't want to install / use MongoDB then the package will use the
built-in Sqlite library. However, this is not optimized for speed and may take
a serious amount of time. If possible, install MongoDB.
|
| 24 | +
|
| 25 | +INSTALLING USING VIRTUALENV
|
| 26 | +It's recommended to use Python virtualenv. If you are not familiar with
|
| 27 | +virtualenv then have a look over here:
|
| 28 | + http://groups.google.com/group/python-virtualenv/browse_thread/thread/f2f19d2cc93a844e
|
| 29 | +
|
| 30 | +To install Editor Trends Analytics:
|
| 31 | +
|
| 32 | + virtualenv --no-site-packages --distribute editor_trends
|
| 33 | + pip install -E editor_trends -r /editor_trends/requirements.txt
|
| 34 | +
|
| 35 | +
|
| 36 | +The first command creates a new virtualenv called editor_trends and the second
|
| 37 | +command installs the dependencies. Currently the dependencies are:
|
| 38 | +* PyMongo
|
| 39 | +* Progressbar
|
| 40 | +
|
| 41 | +INSTALLING WITHOUT VIRTUALENV
|
| 42 | +If you don't like virtualenv then do the following:
|
| 43 | +
|
| 44 | + easy_install pymongo
|
| 45 | + easy_install progressbar
|
| 46 | +
|
| 47 | +IMPORTANT MONGODB NOTES
|
If you decide to use MongoDB to store the results then you have to install the
|
| 49 | +64-bit version. 32-bit versions of MongoDB are limited to 2GB of data and the
|
| 50 | +databases created by this package will definitely be larger than that. For more
|
| 51 | +background information on this limitation, please read:
|
| 52 | + http://blog.mongodb.org/post/137788967/32-bit-limitations
|
| 53 | +
|
| 54 | +
|
| 55 | +CONFIGURATION:
|
| 56 | +If you would like to create a dataset for your own analyses then you should
|
| 57 | +first make the appropriate changes to settings.py. Settings.py contains
|
| 58 | +configuration variables such as the location of input and output files. Most
|
| 59 | +settings are self-explanatory but in cases of any questions please drop me a
|
| 60 | +line.
|
| 61 | +
|
| 62 | +CODE:
|
The Python code adheres to PEP8. Function names are deliberately expressive to
ease understanding what's going on. If you find a bug please email me at
dvanliere at gmail dot com or leave a message on my Talk page.
|
| 66 | +
|
Property changes on: trunk/tools/editor_trends/README.1ST |
___________________________________________________________________ |
Added: native |
1 | 67 | + svn:eol-style=native |
Index: trunk/tools/editor_trends/construct_datasets.py |
— | — | @@ -0,0 +1,159 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | + |
| 18 | +from multiprocessing import Queue |
| 19 | +from Queue import Empty |
| 20 | +import sqlite3 |
| 21 | + |
| 22 | +import progressbar |
| 23 | + |
| 24 | +import settings |
| 25 | +from utils import models, utils |
| 26 | +from database import db |
| 27 | +from utils import process_constructor as pc |
| 28 | + |
| 29 | +try: |
| 30 | + import psyco |
| 31 | + psyco.full() |
| 32 | +except ImportError: |
| 33 | + pass |
| 34 | + |
| 35 | + |
def retrieve_editor_ids_mongo():
    '''Return the distinct editor ids, using a cached binary object on disk
    when available and (re)building the cache from MongoDB otherwise.'''
    if utils.check_file_exists(settings.BINARY_OBJECT_FILE_LOCATION,
        retrieve_editor_ids_mongo):
        # Cache hit: the function object itself is used as the cache key.
        ids = utils.load_object(settings.BINARY_OBJECT_FILE_LOCATION,
            retrieve_editor_ids_mongo)
    else:
        mongo = db.init_mongo_db('editors')
        editors = mongo['editors']
        ids = editors.find().distinct('editor')
        print ids
        # Only cache a non-empty result.
        if ids != []:
            utils.store_object(ids, settings.BINARY_OBJECT_FILE_LOCATION, retrieve_editor_ids_mongo)
    return ids
| 49 | + |
| 50 | + |
def generate_editor_dataset(input_queue, data_queue, pbar, kwargs):
    '''Worker: pull editor ids from the queue and write their edits to csv.

    @kwargs must contain 'definition' ('Traditional' or otherwise),
    'limit' (max observations / distinct dates per editor) and 'debug'.
    '''
    definition = kwargs.pop('definition')
    limit = kwargs.pop('limit')
    debug = kwargs.pop('debug')
    mongo = db.init_mongo_db('editors')
    editors = mongo['editors']
    while True:
        try:
            if debug:
                id = u'99797'
            else:
                id = input_queue.get(block=False)

            contributors = set()
            if definition == 'Traditional':
                # First `limit` observations for this editor.
                obs = editors.find({'editor': id}).limit(limit)
                for ob in obs:
                    contributors.add(ob)
            else:
                # Bug fix: the original referenced undefined names (`dates`,
                # `edit`) and called the builtin `set.add`. Collect edits
                # made on up to `limit` distinct dates instead.
                dates = set()
                obs = editors.find({'editor': id}).sort({'date': 1})
                for ob in obs:
                    if len(dates) > limit:
                        break
                    if ob['date'] not in dates:
                        dates.add(ob['date'])
                        contributors.add(ob)
            utils.write_data_to_csv(contributors, generate_editor_dataset, settings.ENCODING)
            if debug:
                # Bug fix: debug mode never touched the queue, so the loop
                # previously ran forever on the same hard-coded id.
                break

        except Empty:
            break
| 81 | + |
| 82 | + |
| 83 | +def retrieve_editor_ids_db(): |
| 84 | + contributors = set() |
| 85 | + connection = db.init_database() |
| 86 | + cursor = connection.cursor() |
| 87 | + if settings.PROGRESS_BAR: |
| 88 | + cursor.execute('SELECT MAX(ROWID) FROM contributors') |
| 89 | + for id in cursor: |
| 90 | + pass |
| 91 | + pbar = progressbar.ProgressBar(maxval=id[0]).start() |
| 92 | + |
| 93 | + cursor.execute('SELECT contributor FROM contributors WHERE bot=0') |
| 94 | + |
| 95 | + print 'Retrieving contributors...' |
| 96 | + for x, contributor in enumerate(cursor): |
| 97 | + contributors.add(contributor[0]) |
| 98 | + if x % 100000 == 0: |
| 99 | + pbar.update(x) |
| 100 | + print 'Serializing contributors...' |
| 101 | + utils.store_object(contributors, 'contributors') |
| 102 | + print 'Finished serializing contributors...' |
| 103 | + |
| 104 | + if pbar: |
| 105 | + pbar.finish() |
| 106 | + print 'Total elapsed time: %s.' % (utils.humanize_time_difference(pbar.seconds_elapsed)) |
| 107 | + |
| 108 | + connection.close() |
| 109 | + |
| 110 | + |
def retrieve_edits_by_contributor(input_queue, result_queue, pbar):
    '''Worker: for each contributor id from the queue, collect the distinct
    dates on which they edited and append them to a csv file.

    Terminates on a None poison pill; an empty queue is NOT treated as done.
    '''
    connection = db.init_database()
    cursor = connection.cursor()

    while True:
        try:
            contributor = input_queue.get(block=False)
            if contributor == None:
                break

            cursor.execute('SELECT contributor, timestamp, bot FROM contributors WHERE contributor=?', (contributor,))
            edits = {}
            edits[contributor] = set()
            for edit, timestamp, bot in cursor:
                date = utils.convert_timestamp_to_date(timestamp)
                edits[contributor].add(date)
                #print edit, timestamp, bot

            utils.write_data_to_csv(edits, retrieve_edits_by_contributor)
            if pbar:
                utils.update_progressbar(pbar, input_queue)

        # NOTE(review): Empty is swallowed and the loop retries, which
        # busy-waits until a None pill arrives; relies on load_queue's
        # poison pills for termination.
        except Empty:
            pass

    connection.close()
| 137 | + |
| 138 | + |
def retrieve_edits_by_contributor_launcher():
    # Fan retrieve_edits_by_contributor out over worker processes; the
    # string names the pickled 'contributors' object built by
    # retrieve_editor_ids_db.
    pc.build_scaffolding(pc.load_queue, retrieve_edits_by_contributor, 'contributors')
| 141 | + |
| 142 | + |
def debug_retrieve_edits_by_contributor_launcher():
    '''Single-process debug entry point for dataset generation.'''
    input_queue = Queue()
    kwargs = {'definition':'Traditional',
              'limit': 10,
              'debug': True
              }
    # Run the worker inline (no scaffolding) against the hard-coded debug id.
    generate_editor_dataset(input_queue, False, False, kwargs)
    # NOTE(review): this then also kicks off the full multi-process run —
    # confirm whether that is intentional for a debug helper.
    generate_editor_dataset_launcher()
    #retrieve_list_contributors()
    #retrieve_edits_by_contributor()
| 153 | + |
def generate_editor_dataset_launcher():
    '''Launch the multi-process generation of the editor dataset.'''
    ids = retrieve_editor_ids_mongo()
    # Bug fix: generate_editor_dataset does kwargs.pop('debug') with no
    # default, so omitting debug raised a KeyError in every worker.
    pc.build_scaffolding(pc.load_queue, generate_editor_dataset, ids, False, False, definition='Traditional', limit=10, debug=False)
| 157 | + |
| 158 | + |
if __name__ == '__main__':
    # Default action: run the single-process debug path.
    debug_retrieve_edits_by_contributor_launcher()
Property changes on: trunk/tools/editor_trends/construct_datasets.py |
___________________________________________________________________ |
Added: svn:mime-type |
1 | 161 | + text/plain |
Added: svn:eol-style |
2 | 162 | + native |
Index: trunk/tools/editor_trends/init_bot_db.py |
— | — | @@ -0,0 +1,196 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | + |
| 18 | +import os |
| 19 | +import cStringIO |
| 20 | +import xml.etree.cElementTree as cElementTree |
| 21 | + |
| 22 | + |
| 23 | +import settings |
| 24 | +from wikitree import xml |
| 25 | +from database import db |
| 26 | +from database import db_settings |
| 27 | +from utils import utils |
| 28 | +from utils import process_constructor as pc |
| 29 | + |
| 30 | +try: |
| 31 | + import psyco |
| 32 | + psyco.full() |
| 33 | +except ImportError: |
| 34 | + pass |
| 35 | + |
| 36 | + |
def create_bot_ids_db_mongo():
    '''Load the bot name->id csv (written by add_id_to_botnames) into the
    MongoDB 'bots.ids' collection, replacing any previous contents.'''
    ids = utils.create_dict_from_csv_file(add_id_to_botnames, settings.ENCODING)
    mongo = db.init_mongo_db('bots')
    collection = mongo['ids']

    # None as the filter removes every existing document first.
    db.remove_documents_from_mongo_db(collection, None)

    for id, name in ids.iteritems():
        collection.insert({'id': id, 'name': name})

    print collection.count()
| 49 | + |
def create_bots_db(db_name):
    '''
    This function reads the csv file provided by Erik Zachte and constructs a
    sqlite memory database. The reason for this is that I suspect I will need
    some simple querying capabilities in the future, else a dictionary would
    suffice.

    Returns the open cursor when db_name is ':memory', otherwise closes the
    connection after loading.
    '''
    # Bug fix: the name was passed as the literal string 'db_name', so every
    # caller got a database file literally called "db_name" on disk.
    # NOTE(review): callers pass ':memory' but sqlite's in-memory name is
    # ':memory:' (trailing colon); ':memory' creates a disk file — confirm.
    connection = db.init_database(db_name)
    cursor = connection.cursor()
    db.create_tables(cursor, db_settings.BOT_TABLE)
    values = []
    fields = [field[0] for field in db_settings.BOT_TABLE['bots']]
    for line in utils.read_data_from_csv('data/csv/StatisticsBots.csv', settings.ENCODING):
        line = line.split(',')
        row = []
        # Coerce each csv column to the type declared in the schema.
        for x, (field, value) in enumerate(zip(fields, line)):
            if db_settings.BOT_TABLE['bots'][x][1] == 'INTEGER':
                value = int(value)
            elif db_settings.BOT_TABLE['bots'][x][1] == 'TEXT':
                value = value.replace('/', '-')
            row.append(value)
        values.append(row)

    cursor.executemany('INSERT INTO bots VALUES (?,?,?,?,?,?,?,?,?,?);', values)
    connection.commit()
    if db_name == ':memory':
        return cursor
    else:
        connection.close()
| 81 | + |
| 82 | + |
def retrieve_botnames_without_id(cursor, language):
    '''Return all bot names registered for the given language code.'''
    query = 'SELECT name FROM bots WHERE language=?'
    return cursor.execute(query, (language,)).fetchall()
| 85 | + |
| 86 | + |
def lookup_username(input_queue, result_queue, progressbar, bots, debug=False):
    '''
    This function is used to find the id's belonging to the different bots that
    are patrolling the Wikipedia sites.
    @input_queue contains a list of xml files to parse

    @result_queue should be set to false as the results are directly written to
    a csv file.

    @progressbar depends on settings

    @bots is a dictionary containing the names of the bots to lookup

    @debug, when True, @input_queue is a single filename instead of a queue
    '''

    #if len(bots.keys()) == 1:
    bots = bots['bots']
    #print bots.keys()

    if settings.DEBUG:
        messages = {}

    while True:
        # In debug mode the "queue" is a single filename.
        # NOTE(review): get(block=False) raises Queue.Empty if the queue
        # drains before a None pill arrives — confirm pills are guaranteed.
        if debug:
            file = input_queue
        else:
            file = input_queue.get(block=False)

        if file == None:
            break

        data = xml.read_input(utils.open_txt_file(settings.XML_FILE_LOCATION +
            file, 'r', encoding=settings.ENCODING))

        for raw_data in data:
            # Re-wrap the chunk as a standalone UTF-8 document for parsing.
            xml_buffer = cStringIO.StringIO()
            raw_data.insert(0, '<?xml version="1.0" encoding="UTF-8" ?>\n')
            raw_data = ''.join(raw_data)
            raw_data = raw_data.encode('utf-8')
            xml_buffer.write(raw_data)

            try:
                xml_nodes = cElementTree.XML(xml_buffer.getvalue())
                revisions = xml_nodes.findall('revision')
                for revision in revisions:
                    contributor = xml.retrieve_xml_node(revision, 'contributor')
                    username = contributor.find('username')
                    # Anonymous edits have no <username>; skip them.
                    if username == None:
                        continue
                    username = xml.extract_text(username)
                    #print username.encode('utf-8')

                    if username in bots:
                        id = contributor.find('id')
                        id = xml.extract_text(id)
                        #print username.encode('utf-8'), id
                        utils.write_data_to_csv({username: [id]}, add_id_to_botnames, settings.ENCODING)
                        # Each bot only needs to be resolved once.
                        bots.pop(username)
                        if bots == {}:
                            print 'Mission accomplished'
                            return
            except Exception, error:
                # Malformed chunks are logged (optionally tracked) and skipped.
                print error
                if settings.DEBUG:
                    messages = utils.track_errors(xml_buffer, error, file,
                        messages)

    if settings.DEBUG:
        utils.report_error_messages(messages, lookup_username)
| 155 | + |
| 156 | + |
def add_id_to_botnames():
    '''
    This is the worker function for the multi-process version of
    lookup_username. First, the names of the bots are retrieved, then the
    multiprocess is launched by making a call to pc.build_scaffolding. This is
    a generic launcher that takes as input the function to load the
    input_queue, the function that will do the main work and the objects to be
    put in the input_queue. The launcher also accepts optional keyword
    arguments.
    '''
    cursor = create_bots_db(':memory')
    files = utils.retrieve_file_list(settings.XML_FILE_LOCATION, 'xml')

    # Build the lookup dict of botname -> sentinel; lookup_username pops
    # entries as it resolves their ids.
    botnames = retrieve_botnames_without_id(cursor, 'en')
    bots = {}
    for botname in botnames:
        bots[botname[0]] = 1
    pc.build_scaffolding(pc.load_queue, lookup_username, files, bots=bots)
    cursor.close()
| 175 | + |
| 176 | + |
def debug_lookup_username():
    '''
    This function launches the lookup_username function but then single
    threaded, this eases debugging. That's also the reason why the queue
    parameters are set to None. When launching this function make sure that
    debug=False when calling lookup_username
    '''
    cursor = create_bots_db(':memory')
    botnames = retrieve_botnames_without_id(cursor, 'en')
    bots = {}
    for botname in botnames:
        bots[botname[0]] = 1

    # Single file, no queues, debug=True: lookup_username treats the first
    # argument as a filename rather than a queue.
    lookup_username('12.xml', None, None, bots, debug=True)
    cursor.close()
| 192 | + |
| 193 | + |
if __name__ == '__main__':
    #debug()
    #add_id_to_botnames()
    # Default action: push the bot name->id mapping into MongoDB.
    create_bot_ids_db_mongo()
Property changes on: trunk/tools/editor_trends/init_bot_db.py |
___________________________________________________________________ |
Added: svn:mime-type |
1 | 198 | + text/plain |
Added: svn:eol-style |
2 | 199 | + native |
Index: trunk/tools/editor_trends/database/__init__.py |
Property changes on: trunk/tools/editor_trends/database/__init__.py |
___________________________________________________________________ |
Added: svn:mime-type |
3 | 200 | + text/plain |
Added: svn:eol-style |
4 | 201 | + native |
Index: trunk/tools/editor_trends/database/db.py |
— | — | @@ -0,0 +1,83 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | + |
| 18 | +import sqlite3 as sqlite |
| 19 | +from pymongo import Connection |
| 20 | + |
| 21 | + |
| 22 | +import settings |
| 23 | +from database import db_settings |
| 24 | + |
| 25 | + |
def init_mongo_db(db):
    '''Connect to the local MongoDB server and return the named database.'''
    return Connection()[db]
| 30 | + |
| 31 | + |
def remove_documents_from_mongo_db(collection, ids):
    # Remove the documents matching `ids`; passing ids=None deletes every
    # document in the collection.
    collection.remove(ids)
| 34 | + |
| 35 | + |
def add_index_to_collection(db, collection, keys):
    '''
    @db is the name of the mongodb
    @collection is the name of the 'table' in mongodb
    @keys should be a list of keys used to create the index
    '''
    mongo = init_mongo_db(db)
    collection = mongo[collection]
    # Bug fix: the old code called mongo.collection.create_index(), which
    # indexed a collection literally named 'collection' instead of the one
    # looked up above. ensure_index creates the index only when missing.
    collection.ensure_index(keys)
| 47 | + |
| 48 | + |
def init_database(db=None):
    '''
    Open a connection to a sqlite database.

    Falls back to settings.DATABASE_NAME when no name is supplied.
    check_same_thread=False lets the connection be shared across threads.
    '''
    name = settings.DATABASE_NAME if db is None else db
    return sqlite.connect(name, check_same_thread=False)
| 60 | + |
| 61 | + |
def create_tables(cursor, tables):
    '''
    Create the given tables if they do not exist yet.

    @tables is a dictionary mapping table name to a list of
    (field name, datatype) tuples.
    '''
    for table in tables:
        columns = ', '.join('%s %s' % (var, datatype)
                            for var, datatype in tables[table])
        # Bug fix: identifiers (table/column names) cannot be bound with '?'
        # placeholders, and the old "'... ? ?' % (table, vars)" had no
        # %-specifiers so it raised TypeError before reaching sqlite.
        cursor.execute('CREATE TABLE IF NOT EXISTS %s (%s)' % (table, columns))
| 75 | + |
| 76 | + |
def debug():
    # Smoke test: open the default database and create the configured tables.
    # NOTE(review): settings.TABLES is not visible in this file's imports —
    # confirm it exists (db_settings defines CONTRIBUTOR_TABLE / BOT_TABLE).
    connection = init_database()
    cursor = connection.cursor()
    create_tables(cursor, settings.TABLES)


if __name__ == '__main__':
    debug()
Property changes on: trunk/tools/editor_trends/database/db.py |
___________________________________________________________________ |
Added: svn:mime-type |
1 | 85 | + text/plain |
Added: svn:eol-style |
2 | 86 | + native |
Index: trunk/tools/editor_trends/database/db_settings.py |
— | — | @@ -0,0 +1,38 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | + |
| 18 | +''' |
This is a settings file that contains the layout of different tables. The main
key will be used as the table name while its values contain tuples of
field name and datatype. This is only used for sqlite.
| 22 | +''' |
# Schema for the sqlite 'contributors' table: one row per observed edit.
CONTRIBUTOR_TABLE = {'contributors': []}
CONTRIBUTOR_TABLE['contributors'].append(('contributor', 'VARCHAR(64)'))
CONTRIBUTOR_TABLE['contributors'].append(('article', 'INTEGER'))
CONTRIBUTOR_TABLE['contributors'].append(('timestamp', 'TEXT'))
CONTRIBUTOR_TABLE['contributors'].append(('bot', 'INTEGER'))

# Schema for the sqlite 'bots' table, loaded from Erik Zachte's
# StatisticsBots.csv (see init_bot_db.create_bots_db). Column order must
# match the csv column order.
# NOTE(review): the exact meaning of edits_namespace_a/_x comes from the
# source csv — confirm before relying on it.
BOT_TABLE = {'bots': []}
BOT_TABLE['bots'].append(('language', 'VARCHAR(12)'))
BOT_TABLE['bots'].append(('name', 'VARCHAR(64)'))
BOT_TABLE['bots'].append(('edits_namespace_a', 'INTEGER'))
BOT_TABLE['bots'].append(('edits_namespace_x', 'INTEGER'))
BOT_TABLE['bots'].append(('rank_now', 'INTEGER'))
BOT_TABLE['bots'].append(('rank_prev', 'INTEGER'))
BOT_TABLE['bots'].append(('first_date', 'TEXT'))
BOT_TABLE['bots'].append(('days_first', 'INTEGER'))
BOT_TABLE['bots'].append(('last_date', 'TEXT'))
BOT_TABLE['bots'].append(('days_last', 'INTEGER'))
Property changes on: trunk/tools/editor_trends/database/db_settings.py |
___________________________________________________________________ |
Added: svn:mime-type |
1 | 40 | + text/plain |
Added: svn:eol-style |
2 | 41 | + native |
Index: trunk/tools/editor_trends/bots/__init__.py |
Property changes on: trunk/tools/editor_trends/bots/__init__.py |
___________________________________________________________________ |
Added: svn:mime-type |
3 | 42 | + text/plain |
Added: svn:eol-style |
4 | 43 | + native |
Property changes on: trunk/tools/editor_trends/data/database |
___________________________________________________________________ |
Added: svn:ignore |
5 | 44 | + *.db |
Property changes on: trunk/tools/editor_trends/data/objects |
___________________________________________________________________ |
Added: svn:ignore |
6 | 45 | + *.bin |
Property changes on: trunk/tools/editor_trends/data/csv |
___________________________________________________________________ |
Added: svn:ignore |
7 | 46 | + *.csv |
Index: trunk/tools/editor_trends/run.bat |
— | — | @@ -0,0 +1,3 @@ |
@echo off
rem Stage 1: split the full XML dump into smaller chunks.
python split_xml_file.py
rem Stage 2: extract editor activity from the chunks.
python map_wiki_editors.py
Index: trunk/tools/editor_trends/algorithms/__init__.py |
Property changes on: trunk/tools/editor_trends/algorithms/__init__.py |
___________________________________________________________________ |
Added: svn:mime-type |
1 | 5 | + text/plain |
Added: svn:eol-style |
2 | 6 | + native |
Index: trunk/tools/editor_trends/algorithms/red_wiki_editors.py |
— | — | @@ -0,0 +1,40 @@ |
import codecs
import re

import settings

try:
    import psyco
    psyco.full()
except ImportError:
    pass
| 10 | + |
| 11 | + |
# Matches identifiers made up solely of digits, i.e. numeric user ids.
# (The previous unanchored '\d*' matched the empty string at every
# position, so the findall-count test could never identify a real id
# and classified the empty string as 'id'.)
RE_ID = re.compile(r'^\d+$')
# IPv4 addresses; some entries have the last octet anonymised as 'xxx'.
RE_IP = re.compile(r'^(?:\d{1,3}\.){2,3}(?:\d{1,3}|xxx)$')


def determine_contributor_type(id):
    '''
    Classify a raw contributor identifier.

    Returns 'id' for an all-numeric user id, 'ip' for an (optionally
    xxx-anonymised) IPv4 address, and 'name' for anything else
    (including the empty string).
    '''
    if RE_ID.match(id):
        return 'id'
    elif RE_IP.match(id):
        return 'ip'
    else:
        return 'name'
| 24 | + |
def open_file_handles():
    '''
    Open one output file per contributor type ('id', 'ip', 'name').

    Returns a dict mapping filename -> writable file handle opened with
    settings.ENCODING.  The previous implementation only rebound the
    loop variable instead of storing the opened handle back into the
    dict, so every returned value was None.
    '''
    handles = {}
    for filename in ('id.txt', 'ip.txt', 'name.txt'):
        handles[filename] = codecs.open(filename, 'w',
                                        encoding=settings.ENCODING)
    return handles
| 35 | + |
def close_file_handles(handles):
    '''Close every file handle previously opened for output.'''
    for fh in handles.itervalues():
        fh.close()
| 39 | + |
def write_data(vars):
    # TODO: not implemented -- placeholder for writing the classified
    # contributor data to the per-type output files.
    # NOTE(review): the parameter name shadows the builtin vars();
    # consider renaming when this is implemented.
    pass
Property changes on: trunk/tools/editor_trends/algorithms/red_wiki_editors.py |
___________________________________________________________________ |
Added: svn:mime-type |
1 | 42 | + text/plain |
Added: svn:eol-style |
2 | 43 | + native |
Property changes on: trunk/tools/editor_trends/errors |
___________________________________________________________________ |
Added: svn:ignore |
3 | 44 | + *.bin |
Property changes on: trunk/tools/editor_trends |
___________________________________________________________________ |
Added: svn:ignore |
4 | 45 | + wikistats |
zips |
notes.txt |
*.pyc |
datasets |
errors |
Added: native |
5 | 46 | + svn:eol-style=native |