Index: trunk/tools/editor_trends/etl/differ.py |
@@ -0,0 +1,1017 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__email__ = 'dvanliere at gmail dot com' |
| 18 | +__date__ = '2011-04-10' |
| 19 | +__version__ = '0.1' |
| 20 | + |
| 21 | +import json |
| 22 | +import cStringIO |
| 23 | +import codecs |
| 24 | +import sys |
| 25 | +import os |
| 26 | +import difflib |
| 27 | +from multiprocessing import JoinableQueue, Process, cpu_count |
| 28 | +from xml.etree.cElementTree import iterparse, dump |
| 28 | + |
| 29 | +if '..' not in sys.path: |
| 30 | + sys.path.append('../') |
| 31 | + |
| 32 | +from utils import file_utils |
| 33 | +from etl import variables |
| 34 | +from classes import exceptions |
| 35 | + |
| 36 | + |
| 37 | +def parse_xml(fh, format, process_id, location): |
| 38 | + ''' |
| 39 | + This function initializes the XML parser and calls the appropriate function |
| 40 | + to extract / construct the variables from the XML stream. |
| 41 | + ''' |
| 42 | + include_ns = {3: 'User Talk', |
| 43 | + 5: 'Wikipedia Talk', |
| 44 | + 1: 'Talk', |
| 45 | + } |
| 46 | + |
| 47 | + start = 'start'; end = 'end' |
| 48 | + context = iterparse(fh, events=(start, end)) |
| 49 | + context = iter(context) |
| 50 | + |
| 51 | + article = {} |
| 52 | + count_articles = 0 |
| 53 | + id = False |
| 54 | + ns = False |
| 55 | + parse = False |
| 56 | + rev1 = None |
| 57 | + rev2 = None |
| 58 | + file_id, fh_output = None, None |
| 59 | + |
| 60 | + try: |
| 61 | + for event, elem in context: |
| 62 | + if event is end and elem.tag.endswith('siteinfo'): |
| 63 | + ''' |
| 64 | + This event happens once for every dump file and is used to |
| 65 | + determine the version of the generator used to generate the XML |
| 66 | + file. |
| 67 | + ''' |
| 68 | + xml_namespace = variables.determine_xml_namespace(elem) |
| 69 | + namespaces = variables.create_namespace_dict(elem, xml_namespace) |
| 70 | + ns = True |
| 71 | + elem.clear() |
| 72 | + |
| 73 | + elif event is end and elem.tag.endswith('title'): |
| 74 | +                ''' |
| 75 | +                This branch determines the title of an article and the |
| 76 | +                namespace to which it belongs. If the namespace is one we |
| 77 | +                are interested in, parse is set to True so that the article |
| 78 | +                is processed; otherwise the article is skipped. |
| 79 | +                ''' |
| 80 | +                title = variables.parse_title(elem) |
| 81 | +                article['title'] = title |
| 82 | +                current_namespace = variables.determine_namespace(title, namespaces, include_ns) |
| 83 | +                if current_namespace in include_ns: |
| 84 | + parse = True |
| 85 | + article['namespace'] = current_namespace |
| 86 | + count_articles += 1 |
| 87 | + if count_articles % 10000 == 0: |
| 88 | + print 'Worker %s parsed %s articles' % (process_id, count_articles) |
| 89 | + elem.clear() |
| 90 | + |
| 91 | +            elif elem.tag.endswith('revision'): |
| 92 | +                ''' |
| 93 | +                This branch does the actual analysis of an individual |
| 94 | +                revision: it extracts the editor and timestamp and computes |
| 95 | +                the text diff between this revision and the previous one. |
| 96 | +                ''' |
| 97 | +                if parse: |
| 98 | +                    if event is start: |
| 99 | +                        clear = False |
| 100 | +                    else: |
| 101 | +                        rev_id = elem.find('%s%s' % (xml_namespace, 'id')) |
| 102 | +                        timestamp = elem.find('%s%s' % (xml_namespace, 'timestamp')).text |
| 103 | +                        contributor = elem.find('%s%s' % (xml_namespace, 'contributor')) |
| 104 | +                        editor = variables.parse_contributor(contributor, None, xml_namespace) |
| 105 | +                        if editor: |
| 106 | +                            rev_id = variables.extract_revision_id(rev_id) |
| 107 | + |
| 108 | +                            if rev1 == None: |
| 109 | +                                #First revision of this article: the diff is the full text. |
| 110 | +                                diff = variables.extract_revision_text(elem, xml_namespace) |
| 111 | +                            else: |
| 112 | +                                #Diff the current revision against the previous one. |
| 113 | +                                diff = diff_revision(rev1, elem, xml_namespace) |
| 114 | + |
| 115 | +                            article[rev_id] = {} |
| 116 | +                            article[rev_id].update(editor) |
| 117 | +                            article[rev_id]['timestamp'] = timestamp |
| 118 | +                            article[rev_id]['diff'] = diff |
| 119 | + |
| 120 | +                            clear = True |
| 121 | +                            if clear: |
| 122 | +                                #Keep the current revision as the base for the |
| 123 | +                                #next diff and free the revision before it. |
| 124 | +                                if rev2 != None: |
| 125 | +                                    rev2.clear() |
| 126 | +                                rev2 = rev1 |
| 127 | +                                rev1 = elem |
| 128 | +                        else: |
| 129 | +                            elem.clear() |
| 127 | + elif event is end and elem.tag.endswith('id') and id == False: |
| 128 | + ''' |
| 129 | + Determine id of article |
| 130 | + ''' |
| 131 | + article['article_id'] = elem.text |
| 132 | + id = True |
| 133 | + elem.clear() |
| 134 | + |
| 135 | + elif event is end and elem.tag.endswith('page'): |
| 136 | +                ''' |
| 137 | +                We have reached the end of an article; reset all variables |
| 138 | +                and free memory. |
| 139 | +                ''' |
| 140 | +                elem.clear() |
| 141 | +                #Write the diff of the text to file |
| 142 | +                if parse: |
| 143 | +                    fh_output, file_id = assign_filehandle(fh_output, file_id, location, process_id, format) |
| 144 | +                    write_diff(fh_output, article, format) |
| 145 | +                #Reset all variables for the next article |
| 146 | +                article = {} |
| 147 | +                if rev1 != None: |
| 148 | +                    rev1.clear() |
| 149 | +                if rev2 != None: |
| 150 | +                    rev2.clear() |
| 151 | +                rev1 = None |
| 152 | +                rev2 = None |
| 153 | +                id = False |
| 154 | +                parse = False |
| 154 | + |
| 155 | + except SyntaxError, error: |
| 156 | + print 'Encountered invalid XML tag. Error message: %s' % error |
| 157 | + dump(elem) |
| 158 | + sys.exit(-1) |
| 159 | + except IOError, error: |
| 160 | + print '''Archive file is possibly corrupted. Please delete this archive |
| 161 | + and retry downloading. Error message: %s''' % error |
| 162 | + sys.exit(-1) |
| 163 | + print 'Finished parsing Wikipedia dump file.' |
| 164 | + |
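
#A minimal, self-contained sketch of the streaming pattern parse_xml() relies
#on: iterparse() yields (event, element) pairs and elem.clear() releases each
#processed subtree so memory stays flat on multi-gigabyte dumps. The helper
#and its tag handling are illustrative only, not used by the rest of the code.
def example_count_pages(fh):
    pages = 0
    for event, elem in iterparse(fh, events=('start', 'end')):
        if event == 'end' and elem.tag.endswith('page'):
            pages += 1
            elem.clear()    #free the finished <page> subtree
    return pages
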
| 165 | + |
| 166 | +def assign_filehandle(fh, file_id, location, process_id, format): |
| 167 | + if not fh: |
| 168 | + file_id = 0 |
| 169 | + filename = '%s_%s.%s' % (file_id, process_id, format) |
| 170 | + fh = file_utils.create_txt_filehandle(location, filename, 'w', 'utf-8') |
| 171 | + else: |
| 172 | + size = fh.tell() |
| 173 | + max_size = 1024 * 1024 * 64 |
| 174 | + if size > max_size: |
| 175 | + fh.close() |
| 176 | + file_id += 1 |
| 177 | + filename = '%s_%s.%s' % (file_id, process_id, format) |
| 178 | + fh = file_utils.create_txt_filehandle(location, filename, 'w', 'utf-8') |
| 179 | + |
| 180 | + return fh, file_id |
| 181 | + |
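
#A minimal sketch of the calling pattern assign_filehandle() expects: the
#handle and file_id are threaded through the loop so that output rotates to a
#new file once the current one passes the 64MB limit. The articles argument
#and this helper are illustrative only.
def example_write_articles(articles, location, process_id=0, format='json'):
    fh_output, file_id = None, None
    for article in articles:
        fh_output, file_id = assign_filehandle(fh_output, file_id, location,
                                               process_id, format)
        write_diff(fh_output, article, format)
    if fh_output:
        fh_output.close()
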
| 182 | +def write_xml_diff(fh, article): |
| 183 | + pass |
| 184 | + |
| 185 | + |
| 186 | +def write_json_diff(fh, article): |
| 187 | + json.dump(article, fh) |
| 188 | + |
| 189 | + |
| 190 | +def write_diff(fh, article, format): |
| 191 | + if format == 'xml': |
| 192 | + write_xml_diff(fh, article) |
| 193 | + elif format == 'json': |
| 194 | + write_json_diff(fh, article) |
| 195 | + else: |
| 196 | + raise exceptions.OutputNotSupported() |
| 197 | + |
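
#A sketch of the article dict that write_diff() serializes: parse_xml() stores
#the title, namespace and article id plus one sub-dict per revision id. The
#editor fields shown here (e.g. 'username') are assumed from
#variables.parse_contributor() and may differ in practice.
def example_article_record():
    return {'title': 'Talk:Example', 'namespace': 1, 'article_id': '12345',
            '54321': {'username': 'ExampleUser',
                      'timestamp': '2011-01-01T00:00:00Z',
                      'diff': '+a newly added sentence'}}
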
| 198 | + |
| 199 | +def diff_revision(rev1, rev2, xml_namespace): |
| 200 | +    ''' |
| 201 | +    Return a unified diff between the text of two revision elements. |
| 202 | +    ''' |
| 203 | +    buffer = cStringIO.StringIO() |
| 204 | +    text1 = variables.extract_revision_text(rev1, xml_namespace) |
| 205 | +    text2 = variables.extract_revision_text(rev2, xml_namespace) |
| 206 | +    if text1 != None and text2 != None: |
| 207 | +        #unified_diff compares sequences of lines, so split the text first. |
| 208 | +        diff = difflib.unified_diff(text1.splitlines(1), text2.splitlines(1), n=0, lineterm='') |
| 209 | +        for line in diff: |
| 210 | +            if len(line) > 3: |
| 211 | +                buffer.write(line) |
| 212 | + |
| 213 | +    return buffer.getvalue() |
| 209 | + |
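
#A small demonstration of why diff_revision() splits revision text into lines
#first: difflib.unified_diff() compares sequences of lines, so feeding it raw
#strings would diff them character by character. The sample strings are
#illustrative only.
def example_unified_diff():
    old = 'first line\nsecond line\n'.splitlines(1)
    new = 'first line, changed\nsecond line\n'.splitlines(1)
    for line in difflib.unified_diff(old, new, n=0, lineterm=''):
        print line
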
| 210 | + |
| 211 | +def launcher(rts): |
| 212 | +    ''' |
| 213 | +    This function sets up the multiprocessing workers and loads the input |
| 214 | +    queue with the compressed XML files. |
| 215 | +    ''' |
| 216 | + input_queue = JoinableQueue() |
| 217 | + |
| 218 | + files = file_utils.retrieve_file_list(rts.input_location) |
| 219 | + |
| 220 | + if len(files) > cpu_count(): |
| 221 | + processors = cpu_count() - 1 |
| 222 | + else: |
| 223 | + processors = len(files) |
| 224 | + |
| 225 | + fhd = buffer.FileHandleDistributor(rts.max_filehandles, processors) |
| 226 | + |
| 227 | + for filename in files: |
| 228 | + filename = os.path.join(rts.input_location, filename) |
| 229 | + print filename |
| 230 | + input_queue.put(filename) |
| 231 | + |
| 232 | + for x in xrange(processors): |
| 233 | + print 'Inserting poison pill %s...' % x |
| 234 | + input_queue.put(None) |
| 235 | + |
| 236 | + extracters = [Process(target=stream_raw_xml, args=[input_queue, process_id, |
| 237 | + fhd, rts]) |
| 238 | + for process_id in xrange(processors)] |
| 239 | + for extracter in extracters: |
| 240 | + extracter.start() |
| 241 | + |
| 242 | + input_queue.join() |
| 243 | + |
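
#launcher() hands the queue to stream_raw_xml() and a buffer.FileHandleDistributor,
#both of which are expected to live in other modules of this package; the sketch
#below only illustrates the poison-pill consumer pattern such a worker follows
#when fed from a JoinableQueue.
def example_worker(input_queue, process_id):
    while True:
        filename = input_queue.get()
        if filename == None:
            #Poison pill received: mark it done and stop this worker.
            input_queue.task_done()
            break
        print 'Worker %s would parse %s here' % (process_id, filename)
        input_queue.task_done()
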
| 244 | + |
| 245 | +def launcher_simple(): |
| 246 | + location = 'c:\\wikimedia\\nl\\wiki\\' |
| 247 | + output_location = 'c:\\wikimedia\\nl\\wiki\\diffs\\' |
| 248 | + files = file_utils.retrieve_file_list(location) |
| 249 | + process_id = 0 |
| 250 | + format = 'json' |
| 251 | + for filename in files: |
| 252 | + fh = file_utils.create_streaming_buffer(os.path.join(location, filename)) |
| 253 | + #fh = codecs.open(os.path.join(location, filename), 'r', 'utf-8') |
| 254 | + parse_xml(fh, format, process_id, output_location) |
| 255 | + fh.close() |
| 256 | + |
| 257 | + |
| 258 | +def debug(): |
| 259 | + str1 = """ |
| 260 | + '''Welcome to Wikilytics ! |
| 261 | +''' |
| 262 | +== Background == |
| 263 | +This package offers a set of tools used to create datasets to analyze Editor |
| 264 | +Trends. By Editor Trends we refer to the overall pattern of entering and leaving |
| 265 | +a Wikipedia site. The main information source for this package is [[:strategy:Editor Trends Study|Editor Trends Study]] |
| 266 | + |
| 267 | +== High-level Overview Editor Trends Analytics == |
| 268 | + |
| 269 | +The Python scripts to create the dataset to answer the question '''“Which editors are the ones that are leaving - -are they the new editors or the more tenured ones?”''' consists of three separate phases: |
| 270 | +* Chunk the XML dump file in smaller parts |
| 271 | +** and discard all non-zero namespace revisions. |
| 272 | +* Parse XML chunks by taking the following steps: |
| 273 | +** read XML chunk |
| 274 | +** construct XML DOM |
| 275 | +** iterate over each article in XML DOM |
| 276 | +** iterate over each revision in each article |
| 277 | +** extract from each revision |
| 278 | +*** username id |
| 279 | +*** date edit |
| 280 | +*** article id |
| 281 | +** determine if username belongs to bot, discard information if yes |
| 282 | +** store data in MongoDB |
| 283 | +* Create dataset from MongoDB database |
| 284 | +** Create list with unique username id’s |
| 285 | +** Loop over each id |
| 286 | +*** determine year of first edit |
| 287 | +*** determine year of last edit |
| 288 | +*** count total number of edits by year |
| 289 | +*** sort edits by date and keep first 10 edits |
| 290 | +** Write to CSV file. |
| 291 | + |
| 292 | +== Schema of Editor Trends Database == |
| 293 | +Each person who has contributed to Wikipedia has its own document in the [http://www.mongodb.org MongoDB]. A document is a bit similar to a row in a [http://en.wikipedia.org/wiki/SQL SQL] database but there are important differences. The document has the following structure: |
| 294 | + |
| 295 | +<source lang='javascript'> |
| 296 | +{'editor': id, |
| 297 | + 'year_joined': year, |
| 298 | + 'new_wikipedian': True, |
| 299 | + 'total_edits': n, |
| 300 | + 'edits': { |
| 301 | + 'date': date, |
| 302 | + 'article': article_id, |
| 303 | + } |
| 304 | +} |
| 305 | +</source> |
| 306 | +The edits variable is a sub document containing all the edits made by that person. The edits variable is date sorted, so the first observation is the first edit made by that person while the last observation is the final edit made by that person. This structure allows for quickly querying |
| 307 | +the database: |
| 308 | + |
| 309 | +<pre> |
| 310 | +use enwiki |
| 311 | +editors = enwiki['editors'] |
| 312 | +enwiki.editors.find_one({'editor': '35252'}, {'edits': 1})[0] |
| 313 | +</pre> |
| 314 | + |
| 315 | + |
| 316 | +Because we know that each editor has their own document, we do not need to scan the entire table to find all relevant matches. Hence, we can use the find_one() function which results in considerable speed improvements. |
| 317 | + |
| 318 | +== Installation == |
| 319 | + |
| 320 | +=== Step-by-Step Movie Tutorial === |
| 321 | +There is an online tutorial available at [http://vimeo.com/16850312 Vimeo]. You cannot install the Editor Trends toolkit on OSX at the moment; I will try to code around some OSX restrictions regarding multiprocessing. |
| 322 | + |
| 323 | +=== Dependencies === |
| 324 | + |
| 325 | +Follow the next steps if you would like to replicate the analysis on a Wikipedia of your choice. |
| 326 | + |
| 327 | +# Download and install [http://www.mongodb.com MongoDB], preferably the 64 bit version. |
| 328 | +# Download and install [http://www.python.org/download Python] 2.6 or 2.7 (The code is not Python 3 compliant and it has not been tested using Python < 2.6) |
| 329 | +#: Linux users may need to install the packages python-argparse, python-progressbar and pymongo if that functionality is not installed by default with python. |
| 330 | +# Download and install [http://www.sliksvn.com/en/download Subversion] client |
| 331 | +# Depending on your platform make sure you have one of the following extraction utilities installed: |
| 332 | +:* Windows: [http://www.7zip.com 7zip] |
| 333 | +:* Linux: tar (should be installed by default) |
| 334 | + |
| 335 | +To verify that you have installed the required dependencies, do the following: |
| 336 | +<pre> |
| 337 | +<prompt>:: mongo |
| 338 | +MongoDB shell version: 1.6.3 |
| 339 | +connecting to: test |
| 340 | +<prompt> (in mongo shell) exit |
| 341 | + |
| 342 | +<prompt>:: python |
| 343 | +Python 2.6.2 (r262:71605, Apr 14 2009, 22:40:02) [MSC v.1500 32 bit (Intel)] on |
| 344 | +win32 |
| 345 | +Type "help", "copyright", "credits" or "license" for more information. |
| 346 | +<prompt> (in python) exit() |
| 347 | + |
| 348 | +<prompt>:: 7z or tar (depending on your platform) |
| 349 | +7-Zip [64] 4.65 Copyright (c) 1999-2009 Igor Pavlov 2009-02-03 |
| 350 | + |
| 351 | +<prompt>:: svn |
| 352 | + |
| 353 | +</pre> |
| 354 | +Output on the console might look different depending on your OS and installed version. |
| 355 | + |
| 356 | +'''For Windows Users, add the following directories to the path''' |
| 357 | +<pre>c:\python26;c:\python26\scripts;c:\mongodb\bin;</pre> |
| 358 | + |
| 359 | +To finish the Mongodb configuration, do the following: |
| 360 | +<pre> |
| 361 | +cd \ |
| 362 | +mkdir data |
| 363 | +mkdir data\db |
| 364 | +cd \mongodb\bin |
| 365 | +mongod --install --logpath c:\mongodb\logs |
| 366 | +net start mongodb |
| 367 | +</pre> |
| 368 | + |
| 369 | +Prepare your Python environment by taking the following steps: |
| 370 | +1 Check whether easy_install is installed by issuing the command: |
| 371 | +<pre> |
| 372 | +easy_install |
| 373 | +</pre> |
| 374 | +If easy_install is not installed then enter the following command: |
| 375 | +<pre> |
| 376 | +sudo apt-get install python-setuptools |
| 377 | +</pre> |
| 378 | +2 Check whether virtualenv is installed by the issuing the following command: |
| 379 | +<pre> |
| 380 | +virtualenv |
| 381 | +</pre> |
| 382 | +If virtualenv is not installed enter this command: |
| 383 | +<pre> |
| 384 | +sudo easy_install virtualenv |
| 385 | +</pre> |
| 386 | +Go to the directory where you want to install your virtual Python, it's okay to go to the parent directory of editor_trends. Then, issue this command: |
| 387 | +<pre> |
| 388 | +virtualenv editor_trends |
| 389 | +</pre> |
| 390 | +This will copy the Python executable and libraries to editor_trends/bin and editor_trends/libs |
| 391 | +Now, we have to activate our virtual Python: |
| 392 | +<pre> |
| 393 | +source bin/activate |
| 394 | +</pre> |
| 395 | +You will see that your command prompt has changed to indicate that you are working with the virtual Python installation instead of working with the systems default installation. |
| 396 | +If you now install dependencies then these dependencies will be installed in your virtual Python installation instead of in the system Python installation. This will keep everybody happy. |
| 397 | +Finally, enter the following commands: |
| 398 | +<pre> |
| 399 | +easy_install progressbar |
| 400 | +easy_install pymongo |
| 401 | +easy_install argparse |
| 402 | +easy_install python-dateutil |
| 403 | +easy_install texttable |
| 404 | +</pre> |
| 405 | +Python is installed and you are ready to go! |
| 406 | + |
| 407 | +If everything is running, then you are ready to go. |
| 408 | +==== Important MongoDB Notes ==== |
| 409 | +If you decide to use MongoDB to store the results then you have to install the |
| 410 | +64-bit version. 32-bit versions of MongoDB are limited to 2GB of data and the |
| 411 | +databases created by this package will definitely be larger than that. For more |
| 412 | +background information on this limitation, please read [http://blog.mongodb.org/post/137788967/32-bit-limitations MongoDB 32-bit limitations] |
| 413 | + |
| 414 | +=== Install Editor Trend Analytics === |
| 415 | +First, download Editor Trend Analytics |
| 416 | +* Windows: svn checkout http://svn.wikimedia.org/svnroot/mediawiki/trunk/tools/editor_trends/ editor_trends |
| 417 | +* Linux: svn checkout http://svn.wikimedia.org/svnroot/mediawiki/trunk/tools/editor_trends/ editor_trends |
| 418 | + |
| 419 | +=== Getting started === |
| 420 | +By now, you should have Editor Trend Analytics up and running. The first thing you need to do is to download a Wikipedia dump file. |
| 421 | +<blockquote>From now on, I'll assume that you are located in the directory where you installed Editor Trend Analytics.</blockquote> |
| 422 | + |
| 423 | +==== Download Wikipedia dump file ==== |
| 424 | +To download a dump file enter the following command: |
| 425 | +<pre> |
| 426 | +python manage.py download |
| 427 | +</pre> |
| 428 | +You can also specify the language (either using the English name or the local name) of the Wikipedia project that you would like to analyze: |
| 429 | +<pre> |
| 430 | +python manage.py -l Spanish download |
| 431 | +python manage.py -l Español download |
| 432 | +</pre> |
| 433 | +Or, if you want to download a non Wikipedia dump file, enter the following command: |
| 434 | +<pre> |
| 435 | +python manage.py -l Spanish download {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary} |
| 436 | +</pre> |
| 437 | + |
| 438 | +To obtain a list of all supported languages, enter: |
| 439 | +<pre> |
| 440 | +manage show_languages |
| 441 | +</pre> |
| 442 | +or to obtain all languages starting with 'x', enter: |
| 443 | +<pre> |
| 444 | +python manage.py show_languages --first x |
| 445 | +</pre> |
| 446 | + |
| 447 | + |
| 448 | +==== Extract Wikipedia dump file ==== |
| 449 | +'''WARNING''': This process might take hours to days, depending on the configuration of your system. |
| 450 | +The Wikipedia dump file is extracted and split into smaller chunks to speed up the processing. Enter the following command: |
| 451 | +<pre> |
| 452 | +python manage.py extract (for extracting data from the Wikipedia dump file and storing it in smaller chunks) |
| 453 | +</pre> |
| 454 | +or, for one of the other Wikimedia projects, enter |
| 455 | +<pre> |
| 456 | +python manage.py -l Spanish -p commons extract |
| 457 | +</pre> |
| 458 | +Valid project choices are: {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary} |
| 459 | + |
| 460 | +'''Note:''' The extract process may need to be run twice. Once to unzip the dump file, then again to extract the data from the dump file. |
| 461 | + |
| 462 | + |
| 463 | +==== Sort Wikipedia dump file ==== |
| 464 | +'''WARNING''': This process might take a few hours. |
| 465 | +The chunks must be sorted before being added to the MongoDB. Enter the following command: |
| 466 | +<pre> |
| 467 | +python manage.py sort (for sorting the chunks as generated by the 'manage extract' step) |
| 468 | +</pre> |
| 469 | +or, for one of the other Wikimedia projects, enter |
| 470 | +<pre> |
| 471 | +python manage.py -l Spanish sort {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary} |
| 472 | +</pre> |
| 473 | + |
| 474 | + |
| 475 | +==== Store Wikipedia dump file ==== |
| 476 | +'''WARNING''': This process might take hours to days, depending on the configuration of your system. |
| 477 | +Now, we are ready to extract the required information from the Wikipedia dump file chunks and store it in the MongoDB. Enter the following command: |
| 478 | +<pre> |
| 479 | +python manage.py store |
| 480 | +python manage.py -l Spanish store |
| 481 | +</pre> |
| 482 | +or, for one of the other Wikimedia projects, enter |
| 483 | +<pre> |
| 484 | +python manage.py -l Spanish store {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary} |
| 485 | +</pre> |
| 486 | + |
| 487 | +==== Transform dataset ==== |
| 488 | +'''WARNING''': This process might take a couple of hours. |
| 489 | +Finally, the raw data needs to be transformed into useful variables. Issue the following command: |
| 490 | +<pre> |
| 491 | +python manage.py transform |
| 492 | +python manage.py -l Spanish transform |
| 493 | +</pre> |
| 494 | + |
| 495 | +==== Create dataset ==== |
| 496 | +'''WARNING''': This process might take a couple of hours to days depending on the configuration of your computer. |
| 497 | +We are almost there, the data is in the database and now we need to export the data to a [[:en:CSV|CSV]] file so we can import it using a statistical program such as [[:en:R (programming language)]], [[:en:Stata]] or [[:en:SPSS]]. |
| 498 | + |
| 499 | +Enter the following command: |
| 500 | +<pre> |
| 501 | +python manage.py dataset |
| 502 | +python manage.py -l Spanish dataset |
| 503 | +</pre> |
| 504 | +or, for one of the other Wikimedia projects, enter |
| 505 | +<pre> |
| 506 | +manage -l Spanish {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary} dataset |
| 507 | +</pre> |
| 508 | + |
| 509 | +==== Everything in one shot ==== |
| 510 | +'''WARNING''': This process might take a couple of days or even more than a week depending on the configuration of your computer. |
| 511 | +If you don't feel like monitoring your computer and you just want to create a dataset from scratch, enter the following command: |
| 512 | +<pre> |
| 513 | +python manage.py all language |
| 514 | +python manage.py -l Spanish all |
| 515 | +</pre> |
| 516 | +<pre> |
| 517 | +python manage.py -p {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary} all |
| 518 | +</pre> |
| 519 | + |
| 520 | + |
| 521 | +=== Benchmarks === |
| 522 | +{| border=0 |
| 523 | + |+ ''Benchmark German Wiki'' |
| 524 | +|- |
| 525 | + ! Task |
| 526 | + ! Configuration 1 |
| 527 | + ! Configuration 2 |
| 528 | +|- |
| 529 | + | Download |
| 530 | + | |
| 531 | + | 1 minute 14 seconds |
| 532 | +|- |
| 533 | + | Extract |
| 534 | + | |
| 535 | + | 4-6 hours |
| 536 | +|- |
| 537 | + | Sort |
| 538 | + | |
| 539 | + | ~30 minutes |
| 540 | +|- |
| 541 | + | Store |
| 542 | + | |
| 543 | + | 4-5 hours |
| 544 | +|- |
| 545 | + | Transform |
| 546 | + | |
| 547 | + | 2-3 hours |
| 548 | +|- |
| 549 | + | Total time |
| 550 | + | |
| 551 | + | 10-14 hours |
| 552 | + |
| 553 | +|} |
| 554 | + |
| 555 | + |
| 556 | +{| border=0 |
| 557 | + |+ ''Benchmark English Wiki'' |
| 558 | +|- |
| 559 | + ! Task |
| 560 | + ! Configuration 1 |
| 561 | + ! Configuration 2 |
| 562 | +|- |
| 563 | + | Download |
| 564 | + | |
| 565 | + | 15 minutes |
| 566 | +|- |
| 567 | + | Extract |
| 568 | + | |
| 569 | + | ~36 hours |
| 570 | +|- |
| 571 | + | Sort |
| 572 | + | |
| 573 | + | 10.5 hours |
| 574 | +|- |
| 575 | + | Store |
| 576 | + | |
| 577 | + | 21 hours |
| 578 | +|- |
| 579 | + | Transform |
| 580 | + | |
| 581 | + | 14.3 hours |
| 582 | +|- |
| 583 | + | Total time |
| 584 | + | |
| 585 | + | 3.4 days |
| 586 | + |
| 587 | +|} |
| 588 | + |
| 589 | + |
| 590 | +{| width="300" border="1" |
| 591 | + |+ ''Benchmark Hungarian Wiki'' |
| 592 | +|- |
| 593 | + ! Task |
| 594 | + ! Configuration 3 |
| 595 | +|- |
| 596 | + | Download |
| 597 | + | 1-2 minutes |
| 598 | +|- |
| 599 | + | Extract |
| 600 | + | 24.5 minutes |
| 601 | +|- |
| 602 | + | Sort |
| 603 | + | 1.5 minutes |
| 604 | +|- |
| 605 | + | Store |
| 606 | + | 7-8 minutes |
| 607 | +|- |
| 608 | + | Transform |
| 609 | + | 11 minutes |
| 610 | +|- |
| 611 | + | Total time |
| 612 | + | ~45 minutes |
| 613 | +|} |
| 614 | + |
| 615 | + |
| 616 | +;Configuration 2 |
| 617 | +''Amazon Web Services Large EC2 Instance'' |
| 618 | +* Ubuntu 64-bit |
| 619 | +* 4 EC2 Compute Units (2 virtual cores) |
| 620 | +* 7.5GB memory |
| 621 | +* 850GB storage |
| 622 | + |
| 623 | +;Configuration 3 |
| 624 | +* Win7 64 bit |
| 625 | +* Intel i7 CPU (8 virtual core) |
| 626 | +* 6GB memory |
| 627 | +* 1TB storage |
| 628 | +* 100/100Mb/s internet connection |
| 629 | + |
| 630 | + |
| 631 | +[[Category:Editor Trends Study]] |
| 632 | +""".splitlines(1) |
| 633 | + |
| 634 | + str2 = """ |
| 635 | +Welcome to '''Wikilytics''', a free and open source software toolkit for doing analysis of editing trends in Wikipedia and other Wikimedia projects. |
| 636 | + |
| 637 | +== Background == |
| 638 | +This package offers a set of tools used to create datasets to analyze editing trends. It was first created expressly for the [[:strategy:Editor Trends Study|Editor Trends Study]], but is well-suited to a variety of research into editing trends. It is thus free to use (as in beer and freedom) if you're interested in expanding on the [[:strategy:Editor Trends Study/Results|results of Editor Trend Study]] or if you'd like to participate in other [[Research/Projects|research projects]]. |
| 639 | + |
| 640 | +== High-level Overview Editor Trends Analytics == |
| 641 | + |
| 642 | +The Python scripts to create the dataset to answer the question '''“Which editors are the ones that are leaving - -are they the new editors or the more tenured ones?”''' consists of three separate phases: |
| 643 | +* Chunk the XML dump file in smaller parts |
| 644 | +** and discard all non-zero namespace revisions. |
| 645 | +* Parse XML chunks by taking the following steps: |
| 646 | +** read XML chunk |
| 647 | +** construct XML DOM |
| 648 | +** iterate over each article in XML DOM |
| 649 | +** iterate over each revision in each article |
| 650 | +** extract from each revision |
| 651 | +*** username id |
| 652 | +*** date edit |
| 653 | +*** article id |
| 654 | +** determine if username belongs to bot, discard information if yes |
| 655 | +** store data in MongoDB |
| 656 | +* Create dataset from MongoDB database |
| 657 | +** Create list with unique username id’s |
| 658 | +** Loop over each id |
| 659 | +*** determine year of first edit |
| 660 | +*** determine year of last edit |
| 661 | +*** count total number of edits by year |
| 662 | +*** sort edits by date and keep first 10 edits |
| 663 | +** Write to CSV file. |
| 664 | + |
| 665 | +== Schema of Editor Trends Database == |
| 666 | +Each person who has contributed to Wikipedia has its own document in the [http://www.mongodb.org MongoDB]. A document is a bit similar to a row in a [http://en.wikipedia.org/wiki/SQL SQL] database but there are important differences. The document has the following structure: |
| 667 | + |
| 668 | +<source lang='javascript'> |
| 669 | +{'editor': id, |
| 670 | + 'year_joined': year, |
| 671 | + 'new_wikipedian': True, |
| 672 | + 'total_edits': n, |
| 673 | + 'edits': { |
| 674 | + 'date': date, |
| 675 | + 'article': article_id, |
| 676 | + } |
| 677 | +} |
| 678 | +</source> |
| 679 | +The edits variable is a sub document containing all the edits made by that person. The edits variable is date sorted, so the first observation is the first edit made by that person while the last observation is the final edit made by that person. This structure allows for quickly querying |
| 680 | +the database: |
| 681 | + |
| 682 | +<pre> |
| 683 | +use wikilitycs |
| 684 | +db.editors_dataset.find_one({'editor': '35252'}, {'edits': 1}) |
| 685 | +</pre> |
| 686 | + |
| 687 | + |
| 688 | +Because we know that each editor has their own document, we do not need to scan the entire table to find all relevant matches. Hence, we can use the find_one() function which results in considerable speed improvements. |
| 689 | + |
| 690 | +== Installation == |
| 691 | + |
| 692 | +=== Step-by-Step Movie Tutorial === |
| 693 | +There is an online tutorial available at [http://vimeo.com/16850312 Vimeo]. You cannot install the Editor Trends toolkit on OSX at the moment; I will try to code around some OSX restrictions regarding multiprocessing. |
| 694 | + |
| 695 | +=== Dependencies === |
| 696 | + |
| 697 | +Follow the next steps if you would like to replicate the analysis on a Wikipedia of your choice. |
| 698 | + |
| 699 | +# Download and install [http://www.mongodb.com MongoDB], preferably the 64 bit version. |
| 700 | +# Download and install [http://www.python.org/download Python] 2.6 or 2.7 (The code is not Python 3 compliant and it has not been tested using Python < 2.6) |
| 701 | +#: Linux users may need to install the packages python-argparse, python-progressbar and pymongo if that functionality is not installed by default with python. |
| 702 | +# Download and install [http://www.sliksvn.com/en/download Subversion] client |
| 703 | +# Depending on your platform make sure you have one of the following extraction utilities installed: |
| 704 | +:* Windows: [http://www.7zip.com 7zip] |
| 705 | +:* Linux: tar (should be installed by default) |
| 706 | + |
| 707 | +To verify that you have installed the required dependencies, do the following: |
| 708 | +<pre> |
| 709 | +<prompt>:: mongo |
| 710 | +MongoDB shell version: 1.6.3 |
| 711 | +connecting to: test |
| 712 | +<prompt> (in mongo shell) exit |
| 713 | + |
| 714 | +<prompt>:: python |
| 715 | +Python 2.6.2 (r262:71605, Apr 14 2009, 22:40:02) [MSC v.1500 32 bit (Intel)] on |
| 716 | +win32 |
| 717 | +Type "help", "copyright", "credits" or "license" for more information. |
| 718 | +<prompt> (in python) exit() |
| 719 | + |
| 720 | +<prompt>:: 7z or tar (depending on your platform) |
| 721 | +7-Zip [64] 4.65 Copyright (c) 1999-2009 Igor Pavlov 2009-02-03 |
| 722 | + |
| 723 | +<prompt>:: svn |
| 724 | + |
| 725 | +</pre> |
| 726 | +Output on the console might look different depending on your OS and installed version. |
| 727 | + |
| 728 | +'''For Windows Users, add the following directories to the path''' |
| 729 | +<pre>c:\python26;c:\python26\scripts;c:\mongodb\bin;</pre> |
| 730 | + |
| 731 | +To finish the Mongodb configuration, do the following: |
| 732 | +<pre> |
| 733 | +cd \ |
| 734 | +mkdir data |
| 735 | +mkdir data\db |
| 736 | +cd \mongodb\bin |
| 737 | +mongod --install --logpath c:\mongodb\logs |
| 738 | +net start mongodb |
| 739 | +</pre> |
| 740 | + |
| 741 | +Prepare your Python environment by taking the following steps: |
| 742 | +1 Check whether easy_install is installed by issuing the command: |
| 743 | +<pre> |
| 744 | +easy_install |
| 745 | +</pre> |
| 746 | +If easy_install is not installed then enter the following command: |
| 747 | +<pre> |
| 748 | +sudo apt-get install python-setuptools |
| 749 | +</pre> |
| 750 | +2 Check whether virtualenv is installed by the issuing the following command: |
| 751 | +<pre> |
| 752 | +virtualenv |
| 753 | +</pre> |
| 754 | +If virtualenv is not installed enter this command: |
| 755 | +<pre> |
| 756 | +sudo easy_install virtualenv |
| 757 | +</pre> |
| 758 | +Go to the directory where you want to install your virtual Python, it's okay to go to the parent directory of editor_trends. Then, issue this command: |
| 759 | +<pre> |
| 760 | +virtualenv editor_trends |
| 761 | +</pre> |
| 762 | +This will copy the Python executable and libraries to editor_trends/bin and editor_trends/libs |
| 763 | +Now, we have to activate our virtual Python: |
| 764 | +<pre> |
| 765 | +source bin/activate |
| 766 | +</pre> |
| 767 | +You will see that your command prompt has changed to indicate that you are working with the virtual Python installation instead of working with the systems default installation. |
| 768 | +If you now install dependencies then these dependencies will be installed in your virtual Python installation instead of in the system Python installation. This will keep everybody happy. |
| 769 | +Finally, enter the following commands: |
| 770 | +<pre> |
| 771 | +easy_install progressbar |
| 772 | +easy_install pymongo |
| 773 | +easy_install argparse |
| 774 | +easy_install python-dateutil |
| 775 | +easy_install texttable |
| 776 | +</pre> |
| 777 | +Python is installed and you are ready to go! |
| 778 | + |
| 779 | +If everything is running, then you are ready to go. |
| 780 | +==== Important MongoDB Notes ==== |
| 781 | +If you decide to use MongoDB to store the results then you have to install the |
| 782 | +64-bit version. 32-bit versions of MongoDB are limited to 2GB of data and the |
| 783 | +databases created by this package will definitely be larger than that. For more |
| 784 | +background information on this limitation, please read [http://blog.mongodb.org/post/137788967/32-bit-limitations MongoDB 32-bit limitations] |
| 785 | + |
| 786 | +=== Install Editor Trend Analytics === |
| 787 | +First, download Editor Trend Analytics |
| 788 | +* Windows: svn checkout http://svn.wikimedia.org/svnroot/mediawiki/trunk/tools/editor_trends/ editor_trends |
| 789 | +* Linux: svn checkout http://svn.wikimedia.org/svnroot/mediawiki/trunk/tools/editor_trends/ editor_trends |
| 790 | + |
| 791 | +=== Getting started === |
| 792 | +By now, you should have Editor Trend Analytics up and running. The first thing you need to do is to download a Wikipedia dump file. |
| 793 | +<blockquote>From now on, I'll assume that you are located in the directory where you installed Editor Trend Analytics.</blockquote> |
| 794 | + |
| 795 | +==== Download Wikipedia dump file ==== |
| 796 | +To download a dump file enter the following command: |
| 797 | +<pre> |
| 798 | +python manage.py download |
| 799 | +</pre> |
| 800 | +You can also specify the language (either using the English name or the local name) of the Wikipedia project that you would like to analyze: |
| 801 | +<pre> |
| 802 | +python manage.py -l Spanish download |
| 803 | +python manage.py -l Español download |
| 804 | +</pre> |
| 805 | +Or, if you want to download a non Wikipedia dump file, enter the following command: |
| 806 | +<pre> |
| 807 | +python manage.py -l Spanish download {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary} |
| 808 | +</pre> |
| 809 | + |
| 810 | +To obtain a list of all supported languages, enter: |
| 811 | +<pre> |
| 812 | +manage show_languages |
| 813 | +</pre> |
| 814 | +or to obtain all languages starting with 'x', enter: |
| 815 | +<pre> |
| 816 | +python manage.py show_languages --first x |
| 817 | +</pre> |
| 818 | + |
| 819 | + |
| 820 | +==== Extract Wikipedia dump file ==== |
| 821 | +'''WARNING''': This process might take hours to days, depending on the configuration of your system. |
| 822 | +The Wikipedia dump file is extracted and split into smaller chunks to speed up the processing. Enter the following command: |
| 823 | +<pre> |
| 824 | +python manage.py extract (for extracting data from the Wikipedia dump file and storing it in smaller chunks) |
| 825 | +</pre> |
| 826 | +or, for one of the other Wikimedia projects, enter |
| 827 | +<pre> |
| 828 | +python manage.py -l Spanish -p commons extract |
| 829 | +</pre> |
| 830 | +Valid project choices are: {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary} |
| 831 | + |
| 832 | +'''Note:''' The extract process may need to be run twice. Once to unzip the dump file, then again to extract the data from the dump file. |
| 833 | + |
| 834 | + |
| 835 | +==== Sort Wikipedia dump file ==== |
| 836 | +'''WARNING''': This process might take a few hours. |
| 837 | +The chunks must be sorted before being added to the MongoDB. Enter the following command: |
| 838 | +<pre> |
| 839 | +python manage.py sort (for sorting the chunks as generated by the 'manage extract' step) |
| 840 | +</pre> |
| 841 | +or, for one of the other Wikimedia projects, enter |
| 842 | +<pre> |
| 843 | +python manage.py -l Spanish sort {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary} |
| 844 | +</pre> |
| 845 | + |
| 846 | + |
| 847 | +==== Store Wikipedia dump file ==== |
| 848 | +'''WARNING''': This process might take hours to days, depending on the configuration of your system. |
| 849 | +Now, we are ready to extract the required information from the Wikipedia dump file chunks and store it in the MongoDB. Enter the following command: |
| 850 | +<pre> |
| 851 | +python manage.py store |
| 852 | +python manage.py -l Spanish store |
| 853 | +</pre> |
| 854 | +or, for one of the other Wikimedia projects, enter |
| 855 | +<pre> |
| 856 | +python manage.py -l Spanish store {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary} |
| 857 | +</pre> |
| 858 | + |
| 859 | +==== Transform dataset ==== |
| 860 | +'''WARNING''': This process might take a couple of hours. |
| 861 | +Finally, the raw data needs to be transformed into useful variables. Issue the following command: |
| 862 | +<pre> |
| 863 | +python manage.py transform |
| 864 | +python manage.py -l Spanish transform |
| 865 | +</pre> |
| 866 | + |
| 867 | +==== Create dataset ==== |
| 868 | +'''WARNING''': This process might take a couple of hours to days depending on the configuration of your computer. |
| 869 | +We are almost there, the data is in the database and now we need to export the data to a [[:en:CSV|CSV]] file so we can import it using a statistical program such as [[:en:R (programming language)]], [[:en:Stata]] or [[:en:SPSS]]. |
| 870 | + |
| 871 | +Enter the following command: |
| 872 | +<pre> |
| 873 | +python manage.py dataset |
| 874 | +python manage.py -l Spanish dataset |
| 875 | +</pre> |
| 876 | +or, for one of the other Wikimedia projects, enter |
| 877 | +<pre> |
| 878 | +manage -l Spanish {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary} dataset |
| 879 | +</pre> |
| 880 | + |
| 881 | +==== Everything in one shot ==== |
| 882 | +'''WARNING''': This process might take a couple of days or even more than a week depending on the configuration of your computer. |
| 883 | +If you don't feel like monitoring your computer and you just want to create a dataset from scratch, enter the following command: |
| 884 | +<pre> |
| 885 | +python manage.py all language |
| 886 | +python manage.py -l Spanish all |
| 887 | +</pre> |
| 888 | +<pre> |
| 889 | +python manage.py -p {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary} all |
| 890 | +</pre> |
| 891 | + |
| 892 | + |
| 893 | +=== Benchmarks === |
| 894 | +{| border=0 |
| 895 | + |+ ''Benchmark German Wiki'' |
| 896 | +|- |
| 897 | + ! Task |
| 898 | + ! Configuration 1 |
| 899 | + ! Configuration 2 |
| 900 | +|- |
| 901 | + | Download |
| 902 | + | |
| 903 | + | 1 minute 14 seconds |
| 904 | +|- |
| 905 | + | Extract |
| 906 | + | |
| 907 | + | 4-6 hours |
| 908 | +|- |
| 909 | + | Sort |
| 910 | + | |
| 911 | + | ~30 minutes |
| 912 | +|- |
| 913 | + | Store |
| 914 | + | |
| 915 | + | 4-5 hours |
| 916 | +|- |
| 917 | + | Transform |
| 918 | + | |
| 919 | + | 2-3 hours |
| 920 | +|- |
| 921 | + | Total time |
| 922 | + | |
| 923 | + | 10-14 hours |
| 924 | + |
| 925 | +|} |
| 926 | + |
| 927 | + |
| 928 | +{| border=0 |
| 929 | + |+ ''Benchmark English Wiki'' |
| 930 | +|- |
| 931 | + ! Task |
| 932 | + ! Configuration 1 |
| 933 | + ! Configuration 2 |
| 934 | +|- |
| 935 | + | Download |
| 936 | + | |
| 937 | + | 15 minutes |
| 938 | +|- |
| 939 | + | Extract |
| 940 | + | |
| 941 | + | ~36 hours |
| 942 | +|- |
| 943 | + | Sort |
| 944 | + | |
| 945 | + | 10.5 hours |
| 946 | +|- |
| 947 | + | Store |
| 948 | + | |
| 949 | + | 21 hours |
| 950 | +|- |
| 951 | + | Transform |
| 952 | + | |
| 953 | + | 14.3 hours |
| 954 | +|- |
| 955 | + | Total time |
| 956 | + | |
| 957 | + | 3.4 days |
| 958 | + |
| 959 | +|} |
| 960 | + |
| 961 | + |
| 962 | +{| width="300" border="1" |
| 963 | + |+ ''Benchmark Hungarian Wiki'' |
| 964 | +|- |
| 965 | + ! Task |
| 966 | + ! Configuration 3 |
| 967 | +|- |
| 968 | + | Download |
| 969 | + | 1-2 minutes |
| 970 | +|- |
| 971 | + | Extract |
| 972 | + | 24.5 minutes |
| 973 | +|- |
| 974 | + | Sort |
| 975 | + | 1.5 minutes |
| 976 | +|- |
| 977 | + | Store |
| 978 | + | 7-8 minutes |
| 979 | +|- |
| 980 | + | Transform |
| 981 | + | 11 minutes |
| 982 | +|- |
| 983 | + | Total time |
| 984 | + | ~45 minutes |
| 985 | +|} |
| 986 | + |
| 987 | + |
| 988 | +;Configuration 2 |
| 989 | +''Amazon Web Services Large EC2 Instance'' |
| 990 | +* Ubuntu 64-bit |
| 991 | +* 4 EC2 Compute Units (2 virtual cores) |
| 992 | +* 7.5GB memory |
| 993 | +* 850GB storage |
| 994 | + |
| 995 | +;Configuration 3 |
| 996 | +* Win7 64 bit |
| 997 | +* Intel i7 CPU (8 virtual core) |
| 998 | +* 6GB memory |
| 999 | +* 1TB storage |
| 1000 | +* 100/100Mb/s internet connection |
| 1001 | + |
| 1002 | +==See also== |
| 1003 | +* [[Wikilytics Dataset]] |
| 1004 | +* [[Wikilytics Plugins]] |
| 1005 | + |
| 1006 | +[[Category:Wikilytics]] |
| 1007 | + |
| 1008 | +""".splitlines(1) |
| 1009 | + |
| 1010 | + diff = difflib.unified_diff(str1, str2, n=0, lineterm='') |
| 1011 | + for line in diff: |
| 1012 | + if len(line) > 3: |
| 1013 | + print line |
| 1014 | +# print result |
| 1015 | + |
| 1016 | +if __name__ == '__main__': |
| 1017 | + launcher_simple() |
| 1018 | + #debug() |
Property changes on: trunk/tools/editor_trends/etl/differ.py |
___________________________________________________________________ |
Added: svn:eol-style |
| 1019 | + native |