r88952 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: r88951 | r88952 | r88953 >
Date: 23:36, 26 May 2011
Author: diederik
Status: deferred
Tags:
Comment:
First commit of the differ. The differ creates text diffs between revisions by streaming compressed XML data dump files and comparing each revision with its predecessor. For every revision it outputs article_title, article_id, ns, revision_id, user_id, username, timestamp and the diff against the previous revision. Currently it writes the output as JSON; XML support will follow shortly.
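A single article record might look roughly like this (a minimal sketch with hypothetical values; the keys mirror the article dict built in parse_xml):

  {"title": "Talk:Foo", "namespace": 1, "article_id": "12345",
   "54321": {"username": "ExampleUser", "user_id": "7", "timestamp": "2011-05-26T23:36:00Z", "diff": "@@ -2 +2 @@-old line+new line"}}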
Modified paths:
  • /trunk/tools/editor_trends/etl/differ.py (added) (history)

Diff

Index: trunk/tools/editor_trends/etl/differ.py
@@ -0,0 +1,1017 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-04-10'
 19+__version__ = '0.1'
 20+
+import json
+import codecs
+import sys
+import os
+import difflib
+from xml.etree.cElementTree import iterparse, dump
+from multiprocessing import JoinableQueue, Process, cpu_count
 28+
+if '..' not in sys.path:
+    sys.path.append('..')
 31+
+from utils import file_utils
+from etl import variables
+from classes import exceptions
+from classes import buffer  # assumed location of FileHandleDistributor used in launcher()
 35+
 36+
 37+def parse_xml(fh, format, process_id, location):
+    '''
+    This function initializes the XML parser and calls the appropriate
+    function to extract / construct the variables from the XML stream.
+    '''
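+    # Expected input shape (a simplified sketch of a MediaWiki XML export;
+    # this code does not validate it):
+    #   <page>
+    #     <title/> <id/>
+    #     <revision><id/><timestamp/><contributor/><text/></revision>
+    #     ...
+    #   </page>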
+    include_ns = {3: 'User Talk',
+                  5: 'Wikipedia Talk',
+                  1: 'Talk',
+                  }
+
+    start = 'start'; end = 'end'
+    context = iterparse(fh, events=(start, end))
+    context = iter(context)
 50+
+    article = {}
+    count_articles = 0
+    id = False
+    ns = False
+    parse = False
+    prev_text = None
+    file_id, fh_output = None, None
 59+
+    try:
+        for event, elem in context:
+            if event == end and elem.tag.endswith('siteinfo'):
+                '''
+                This event happens once per dump file and is used to determine
+                the version of the generator that created the XML file.
+                '''
+                xml_namespace = variables.determine_xml_namespace(elem)
+                namespaces = variables.create_namespace_dict(elem, xml_namespace)
+                ns = True
+                elem.clear()
+
+            elif event == end and elem.tag.endswith('title'):
+                '''
+                Determine the title of the article and the namespace to which
+                it belongs. If the namespace is one we are interested in, set
+                parse to True so the article is parsed; otherwise skip it.
+                '''
+                title = variables.parse_title(elem)
+                article['title'] = title
+                current_namespace = variables.determine_namespace(title, namespaces, include_ns)
+                if current_namespace in include_ns:
+                    parse = True
+                    article['namespace'] = current_namespace
+                    count_articles += 1
+                    if count_articles % 10000 == 0:
+                        print 'Worker %s parsed %s articles' % (process_id, count_articles)
+                elem.clear()
 90+
+            elif elem.tag.endswith('revision'):
+                '''
+                This block analyzes an individual revision: it extracts the
+                revision id, timestamp and contributor, and computes a text
+                diff between this revision and the previous one.
+                '''
+                if parse:
+                    if event == end:
+                        rev_id = elem.find('%s%s' % (xml_namespace, 'id'))
+                        timestamp = elem.find('%s%s' % (xml_namespace, 'timestamp')).text
+                        contributor = elem.find('%s%s' % (xml_namespace, 'contributor'))
+                        editor = variables.parse_contributor(contributor, None, xml_namespace)
+                        if editor:
+                            rev_id = variables.extract_revision_id(rev_id)
+                            text = variables.extract_revision_text(elem, xml_namespace)
+                            # The first revision of an article has no parent, so
+                            # its full text serves as the diff; later revisions
+                            # are diffed against the previous revision's text.
+                            if prev_text is None:
+                                diff = text
+                            else:
+                                diff = diff_revision(prev_text, text)
+                            prev_text = text
+
+                            article[rev_id] = {}
+                            article[rev_id].update(editor)
+                            article[rev_id]['timestamp'] = timestamp
+                            article[rev_id]['diff'] = diff
+                        # Clear only fully built elements; at a start event the
+                        # revision has not been read yet.
+                        elem.clear()
+                elif event == end:
+                    elem.clear()
 126+
+            elif event == end and elem.tag.endswith('id') and not id:
+                '''
+                Determine the id of the article.
+                '''
+                article['article_id'] = elem.text
+                id = True
+                elem.clear()
 134+
+            elif event == end and elem.tag.endswith('page'):
+                '''
+                We have reached the end of an article; reset all variables and
+                free memory.
+                '''
+                elem.clear()
+                # Write the diffs of this article to file.
+                if parse:
+                    fh_output, file_id = assign_filehandle(fh_output, file_id, location, process_id, format)
+                    write_diff(fh_output, article, format)
+                # Reset all variables for the next article.
+                article = {}
+                prev_text = None
+                id = False
+                parse = False
 154+
+    except SyntaxError, error:
+        print 'Encountered invalid XML tag. Error message: %s' % error
+        dump(elem)
+        sys.exit(-1)
+    except IOError, error:
+        print '''Archive file is possibly corrupted. Please delete this archive
+        and retry downloading. Error message: %s''' % error
+        sys.exit(-1)
+    print 'Finished parsing Wikipedia dump file.'
 164+
 165+
+def assign_filehandle(fh, file_id, location, process_id, format):
+    if not fh:
+        file_id = 0
+        filename = '%s_%s.%s' % (file_id, process_id, format)
+        fh = file_utils.create_txt_filehandle(location, filename, 'w', 'utf-8')
+    else:
+        size = fh.tell()
+        max_size = 1024 * 1024 * 64
+        if size > max_size:
+            fh.close()
+            file_id += 1
+            filename = '%s_%s.%s' % (file_id, process_id, format)
+            fh = file_utils.create_txt_filehandle(location, filename, 'w', 'utf-8')
+
+    return fh, file_id
 181+
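+# Example of the rotation in assign_filehandle (hypothetical run): worker 3
+# writing JSON gets 0_3.json first; once a file grows past the 64MB cap the
+# handle is closed and 1_3.json, 2_3.json, ... follow.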
+def write_xml_diff(fh, article):
+    # Placeholder: XML output is not implemented yet (JSON only for now).
+    pass
 184+
 185+
+def write_json_diff(fh, article):
+    # One JSON object per article; objects are written back-to-back into the
+    # current output file.
+    json.dump(article, fh)
 188+
 189+
+def write_diff(fh, article, format):
+    if format == 'xml':
+        write_xml_diff(fh, article)
+    elif format == 'json':
+        write_json_diff(fh, article)
+    else:
+        raise exceptions.OutputNotSupported()
 197+
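+# For example, parse_xml calls write_diff(fh_output, article, 'json') when the
+# worker was started with format='json'; 'xml' currently falls through to the
+# write_xml_diff placeholder above.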
 198+
+def diff_revision(prev_text, text):
+    '''
+    Return a unified diff (n=0, i.e. no context lines) between the previous
+    and the current revision text.
+    '''
+    lines = []
+    if prev_text is not None and text is not None:
+        # unified_diff expects sequences of lines, not raw strings;
+        # splitlines(1) keeps the line endings intact.
+        diff = difflib.unified_diff(prev_text.splitlines(1),
+                                    text.splitlines(1), n=0, lineterm='')
+        for line in diff:
+            if len(line) > 3:  # drop very short diff lines (noise filter)
+                lines.append(line)
+    # Joining a list sidesteps cStringIO's ASCII-only limitation when the
+    # revision text is unicode.
+    return ''.join(lines)
 209+
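+# A quick sketch (hypothetical texts): diff_revision('foo\nbar\n',
+# 'foo\nbaz\n') returns the concatenated diff lines: the '---'/'+++'
+# headers, the hunk header '@@ -2 +2 @@', and the lines '-bar' and '+baz';
+# one-character changes such as '-a' are dropped by the length filter.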
 210+
+def launcher(rts):
+    '''
+    Initialize the multiprocessing workers and load the input queue with the
+    compressed XML files.
+    '''
+    input_queue = JoinableQueue()
+
+    files = file_utils.retrieve_file_list(rts.input_location)
+
+    if len(files) > cpu_count():
+        processors = cpu_count() - 1
+    else:
+        processors = len(files)
+
+    fhd = buffer.FileHandleDistributor(rts.max_filehandles, processors)
+
+    for filename in files:
+        filename = os.path.join(rts.input_location, filename)
+        print filename
+        input_queue.put(filename)
+
+    for x in xrange(processors):
+        print 'Inserting poison pill %s...' % x
+        input_queue.put(None)
+
+    # NOTE: stream_raw_xml is not defined in this module; it is assumed to be
+    # provided by a sibling etl module.
+    extracters = [Process(target=stream_raw_xml,
+                          args=[input_queue, process_id, fhd, rts])
+                  for process_id in xrange(processors)]
+    for extracter in extracters:
+        extracter.start()
+
+    input_queue.join()
 243+
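+# Sketch of the intended flow (assuming a worker like stream_raw_xml): each
+# worker pulls filenames off input_queue until it reads a None poison pill,
+# calls task_done() for every item it consumes, and input_queue.join() above
+# returns once every queued item has been marked done.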
 244+
+def launcher_simple():
+    location = 'c:\\wikimedia\\nl\\wiki\\'
+    output_location = 'c:\\wikimedia\\nl\\wiki\\diffs\\'
+    files = file_utils.retrieve_file_list(location)
+    process_id = 0
+    format = 'json'
+    for filename in files:
+        fh = file_utils.create_streaming_buffer(os.path.join(location, filename))
+        #fh = codecs.open(os.path.join(location, filename), 'r', 'utf-8')
+        parse_xml(fh, format, process_id, output_location)
+        fh.close()
 256+
 257+
+def debug():
+    str1 = """
 260+ '''Welcome to Wikilytics !
 261+'''
 262+== Background ==
 263+This package offers a set of tools used to create datasets to analyze Editor
 264+Trends. By Editor Trends we refer to the overall pattern of entering and leaving
 265+a Wikipedia site. The main information source for this package is [[:strategy:Editor Trends Study|Editor Trends Study]]
 266+
 267+== High-level Overview Editor Trends Analytics ==
 268+
+The Python scripts to create the dataset to answer the question '''“Which editors are the ones that are leaving --are they the new editors or the more tenured ones?”''' consist of three separate phases:
 270+* Chunk the XML dump file in smaller parts
 271+** and discard all non-zero namespace revisions.
 272+* Parse XML chunks by taking the following steps:
 273+** read XML chunk
 274+** construct XML DOM
 275+** iterate over each article in XML DOM
 276+** iterate over each revision in each article
 277+** extract from each revision
 278+*** username id
 279+*** date edit
 280+*** article id
 281+** determine if username belongs to bot, discard information if yes
 282+** store data in MongoDB
 283+* Create dataset from MongoDB database
 284+** Create list with unique username id’s
 285+** Loop over each id
 286+*** determine year of first edit
 287+*** determine year of last edit
 288+*** count total number of edits by year
 289+*** sort edits by date and keep first 10 edits
 290+** Write to CSV file.
 291+
 292+== Schema of Editor Trends Database ==
+Each person who has contributed to Wikipedia has its own document in the [http://www.mongodb.org MongoDB]. A document is a bit similar to a row in a [http://en.wikipedia.org/wiki/SQL SQL] database, but there are important differences. The document has the following structure:
 294+
 295+<source lang='javascript'>
 296+{'editor': id,
 297+ 'year_joined': year,
 298+ 'new_wikipedian': True,
 299+ 'total_edits': n,
 300+ 'edits': {
 301+ 'date': date,
 302+ 'article': article_id,
 303+ }
 304+}
 305+</source>
+The edits variable is a subdocument containing all the edits made by that person. It is sorted by date, so the first observation is the first edit made by that person while the last observation is the final edit. This structure allows for quickly querying
 307+the database:
 308+
 309+<pre>
 310+use enwiki
 311+editors = enwiki['editors']
 312+enwiki.editors.find_one({'editor': '35252'}, {'edits': 1})[0]
 313+</pre>
 314+
 315+
 316+Because we know that each editor has their own document, we do not need to scan the entire table to find all relevant matches. Hence, we can use the find_one() function which results in considerable speed improvements.
 317+
 318+== Installation ==
 319+
 320+=== Step-by-Step Movie Tutorial ===
+There is an online tutorial available at [http://vimeo.com/16850312 Vimeo]. You cannot install the Editor Trends toolkit on OSX at the moment; I will try to code around some OSX restrictions regarding multiprocessing.
 322+
 323+=== Dependencies ===
 324+
 325+Follow the next steps if you would like to replicate the analysis on a Wikipedia of your choice.
 326+
 327+# Download and install [http://www.mongodb.com MongoDB], preferably the 64 bit version.
 328+# Download and install [http://www.python.org/download Python] 2.6 or 2.7 (The code is not Python 3 compliant and it has not been tested using Python < 2.6)
 329+#: Linux users may need to install the packages python-argparse, python-progressbar and pymongo if that functionality is not installed by default with python.
 330+# Download and install [http://www.sliksvn.com/en/download Subversion] client
 331+# Depending on your platform make sure you have one of the following extraction utilities installed:
 332+:* Windows: [http://www.7zip.com 7zip]
 333+:* Linux: tar (should be installed by default)
 334+
 335+To verify that you have installed the required dependencies, do the following:
 336+<pre>
 337+<prompt>:: mongo
 338+MongoDB shell version: 1.6.3
 339+connecting to: test
 340+<prompt> (in mongo shell) exit
 341+
 342+<prompt>:: python
 343+Python 2.6.2 (r262:71605, Apr 14 2009, 22:40:02) [MSC v.1500 32 bit (Intel)] on
 344+win32
 345+Type "help", "copyright", "credits" or "license" for more information.
 346+<prompt> (in python) exit()
 347+
 348+<prompt>:: 7z or tar (depending on your platform)
 349+7-Zip [64] 4.65 Copyright (c) 1999-2009 Igor Pavlov 2009-02-03
 350+
 351+<prompt>:: svn
 352+
 353+</pre>
 354+Output on the console might look different depending on your OS and installed version.
 355+
 356+'''For Windows Users, add the following directories to the path'''
 357+<pre>c:\python26;c:\python26\scripts;c:\mongodb\bin;</pre>
 358+
+To finish the MongoDB configuration, do the following:
 360+<pre>
 361+cd \
 362+mkdir data
 363+mkdir data\db
 364+cd \mongodb\bin
 365+mongod --install --logpath c:\mongodb\logs
 366+net start mongodb
 367+</pre>
 368+
 369+Prepare your Python environment by taking the following steps:
 370+1 Check whether easy_install is installed by issuing the command:
 371+<pre>
 372+easy_install
 373+</pre>
 374+If easy_install is not installed then enter the following command:
 375+<pre>
 376+sudo apt-get install python-setuptools
 377+</pre>
+2 Check whether virtualenv is installed by issuing the following command:
 379+<pre>
 380+virtualenv
 381+</pre>
 382+If virtualenv is not installed enter this command:
 383+<pre>
 384+sudo easy_install virtualenv
 385+</pre>
 386+Go to the directory where you want to install your virtual Python, it's okay to go to the parent directory of editor_trends. Then, issue this command:
 387+<pre>
 388+virtualenv editor_trends
 389+</pre>
 390+This will copy the Python executable and libraries to editor_trends/bin and editor_trends/libs
 391+Now, we have to activate our virtual Python:
 392+<pre>
 393+source bin/activate
 394+</pre>
+You will see that your command prompt has changed to indicate that you are working with the virtual Python installation instead of the system's default installation.
 396+If you now install dependencies then these dependencies will be installed in your virtual Python installation instead of in the system Python installation. This will keep everybody happy.
 397+Finally, enter the following commands:
 398+<pre>
 399+easy_install progressbar
 400+easy_install pymongo
 401+easy_install argparse
 402+easy_install python-dateutil
 403+easy_install texttable
 404+</pre>
 405+Python is installed and you are ready to go!
 406+
 407+If everything is running, then you are ready to go.
 408+==== Important MongoDB Notes ====
 409+If you decide to use MongoDB to store the results then you have to install the
 410+64-bit version. 32-bit versions of MongoDB are limited to 2GB of data and the
 411+databases created by this package will definitely be larger than that. For more
 412+background information on this limitation, please read [http://blog.mongodb.org/post/137788967/32-bit-limitations MongoDB 32-bit limitations]
 413+
 414+=== Install Editor Trend Analytics ===
 415+First, download Editor Trend Analytics
 416+* Windows: svn checkout http://svn.wikimedia.org/svnroot/mediawiki/trunk/tools/editor_trends/ editor_trends
 417+* Linux: svn checkout http://svn.wikimedia.org/svnroot/mediawiki/trunk/tools/editor_trends/ editor_trends
 418+
 419+=== Getting started ===
 420+By now, you should have Editor Trend Analytics up and running. The first thing you need to do is to download a Wikipedia dump file.
+<blockquote>From now on, I'll assume that you are located in the directory where you installed Editor Trend Analytics.</blockquote>
 422+
 423+==== Download Wikipedia dump file ====
 424+To download a dump file enter the following command:
 425+<pre>
 426+python manage.py download
 427+</pre>
 428+You can also specify the language (either using the English name or the local name) of the Wikipedia project that you would like to analyze:
 429+<pre>
 430+python manage.py -l Spanish download
 431+python manage.py -l Español download
 432+</pre>
+Or, if you want to download a non-Wikipedia dump file, enter the following command:
 434+<pre>
 435+python manage.py -l Spanish download {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary}
 436+</pre>
 437+
 438+To obtain a list of all supported languages, enter:
 439+<pre>
 440+manage show_languages
 441+</pre>
 442+or to obtain all languages starting with 'x', enter:
 443+<pre>
 444+python manage.py show_languages --first x
 445+</pre>
 446+
 447+
 448+==== Extract Wikipedia dump file ====
 449+'''WARNING''': This process might take hours to days, depending on the configuration of your system.
 450+The Wikipedia dump file is extracted and split into smaller chunks to speed up the processing. Enter the following command:
 451+<pre>
 452+python manage.py extract (for extracting data from the Wikipedia dump file and storing it in smaller chunks)
 453+</pre>
 454+or, for one of the other Wikimedia projects, enter
 455+<pre>
 456+python manage.py -l Spanish -p commons extract
 457+</pre>
 458+Valid project choices are: {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary}
 459+
 460+'''Note:''' The extract process may need to be run twice. Once to unzip the dump file, then again to extract the data from the dump file.
 461+
 462+
 463+==== Sort Wikipedia dump file ====
 464+'''WARNING''': This process might take a few hours.
 465+The chunks must be sorted before being added to the MongoDB. Enter the following command:
 466+<pre>
 467+python manage.py sort (for sorting the chunks as generated by the 'manage extract' step)
 468+</pre>
 469+or, for one of the other Wikimedia projects, enter
 470+<pre>
 471+python manage.py -l Spanish sort {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary}
 472+</pre>
 473+
 474+
 475+==== Store Wikipedia dump file ====
 476+'''WARNING''': This process might take hours to days, depending on the configuration of your system.
 477+Now, we are ready to extract the required information from the Wikipedia dump file chunks and store it in the MongoDB. Enter the following command:
 478+<pre>
 479+python manage.py store
 480+python manage.py -l Spanish store
 481+</pre>
 482+or, for one of the other Wikimedia projects, enter
 483+<pre>
 484+python manage.py -l Spanish store {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary}
 485+</pre>
 486+
 487+==== Transform dataset ====
 488+'''WARNING''': This process might take a couple of hours.
+Finally, the raw data needs to be transformed into useful variables. Issue the following command:
 490+<pre>
 491+python manage.py transform
 492+python manage.py -l Spanish transform
 493+</pre>
 494+
 495+==== Create dataset ====
 496+'''WARNING''': This process might take a couple of hours to days depending on the configuration of your computer.
 497+We are almost there, the data is in the database and now we need to export the data to a [[:en:CSV|CSV]] file so we can import it using a statistical program such as [[:en:R (programming language)]], [[:en:Stata]] or [[:en:SPSS]].
 498+
 499+Enter the following command:
 500+<pre>
 501+python manage.py dataset
 502+python manage.py -l Spanish dataset
 503+</pre>
 504+or, for one of the other Wikimedia projects, enter
 505+<pre>
 506+manage -l Spanish {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary} dataset
 507+</pre>
 508+
 509+==== Everything in one shot ====
 510+'''WARNING''': This process might take a couple of days or even more than a week depending on the configuration of your computer.
 511+If you don't feel like monitoring your computer and you just want to create a dataset from scratch, enter the following command:
 512+<pre>
 513+python manage.py all language
 514+python manage.py -l Spanish all
 515+</pre>
 516+<pre>
 517+python manage.py -p {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary} all
 518+</pre>
 519+
 520+
 521+=== Benchmarks ===
 522+{| border=0
 523+ |+ ''Benchmark German Wiki''
 524+|-
 525+ ! Task
 526+ ! Configuration 1
 527+ ! Configuration 2
 528+|-
 529+ | Download
 530+ |
 531+ | 1 minute 14 seconds
 532+|-
 533+ | Extract
 534+ |
 535+ | 4-6 hours
 536+|-
 537+ | Sort
 538+ |
 539+ | ~30 minutes
 540+|-
 541+ | Store
 542+ |
 543+ | 4-5 hours
 544+|-
 545+ | Transform
 546+ |
 547+ | 2-3 hours
 548+|-
 549+ | Total time
 550+ |
 551+ | 10-14 hours
 552+
 553+|}
 554+
 555+
 556+{| border=0
 557+ |+ ''Benchmark English Wiki''
 558+|-
 559+ ! Task
 560+ ! Configuration 1
 561+ ! Configuration 2
 562+|-
 563+ | Download
 564+ |
 565+ | 15 minutes
 566+|-
 567+ | Extract
 568+ |
 569+ | ~36 hours
 570+|-
 571+ | Sort
 572+ |
 573+ | 10.5 hours
 574+|-
 575+ | Store
 576+ |
 577+ | 21 hours
 578+|-
 579+ | Transform
 580+ |
 581+ | 14.3 hours
 582+|-
 583+ | Total time
 584+ |
 585+ | 3.4 days
 586+
 587+|}
 588+
 589+
 590+{| width="300" border="1"
 591+ |+ ''Benchmark Hungarian Wiki''
 592+|-
 593+ ! Task
 594+ ! Configuration 3
 595+|-
 596+ | Download
 597+ | 1-2 minutes
 598+|-
 599+ | Extract
 600+ | 24.5 minutes
 601+|-
 602+ | Sort
 603+ | 1.5 minutes
 604+|-
 605+ | Store
 606+ | 7-8 minutes
 607+|-
 608+ | Transform
 609+ | 11 minutes
 610+|-
 611+ | Total time
 612+ | ~45 minutes
 613+|}
 614+
 615+
 616+;Configuration 2
 617+''Amazon Web Services Large EC2 Instance''
 618+* Ubuntu 64-bit
 619+* 4 EC2 Compute Units (2 virtual cores)
 620+* 7.5GB memory
 621+* 850GB storage
 622+
 623+;Configuration 3
 624+* Win7 64 bit
 625+* Intel i7 CPU (8 virtual core)
 626+* 6GB memory
 627+* 1TB storage
 628+* 100/100Mb/s internet connection
 629+
 630+
 631+[[Category:Editor Trends Study]]
 632+""".splitlines(1)
 633+
 634+ str2 = """
 635+Welcome to '''Wikilytics''', a free and open source software toolkit for doing analysis of editing trends in Wikipedia and other Wikimedia projects.
 636+
 637+== Background ==
 638+This package offers a set of tools used to create datasets to analyze editing trends. It was first created expressly for the [[:strategy:Editor Trends Study|Editor Trends Study]], but is well-suited to a variety of research into editing trends. It is thus free to use (as in beer and freedom) if you're interested in expanding on the [[:strategy:Editor Trends Study/Results|results of Editor Trend Study]] or if you'd like to participate in other [[Research/Projects|research projects]].
 639+
 640+== High-level Overview Editor Trends Analytics ==
 641+
+The Python scripts to create the dataset to answer the question '''“Which editors are the ones that are leaving --are they the new editors or the more tenured ones?”''' consist of three separate phases:
 643+* Chunk the XML dump file in smaller parts
 644+** and discard all non-zero namespace revisions.
 645+* Parse XML chunks by taking the following steps:
 646+** read XML chunk
 647+** construct XML DOM
 648+** iterate over each article in XML DOM
 649+** iterate over each revision in each article
 650+** extract from each revision
 651+*** username id
 652+*** date edit
 653+*** article id
 654+** determine if username belongs to bot, discard information if yes
 655+** store data in MongoDB
 656+* Create dataset from MongoDB database
 657+** Create list with unique username id’s
 658+** Loop over each id
 659+*** determine year of first edit
 660+*** determine year of last edit
 661+*** count total number of edits by year
 662+*** sort edits by date and keep first 10 edits
 663+** Write to CSV file.
 664+
 665+== Schema of Editor Trends Database ==
+Each person who has contributed to Wikipedia has its own document in the [http://www.mongodb.org MongoDB]. A document is a bit similar to a row in a [http://en.wikipedia.org/wiki/SQL SQL] database, but there are important differences. The document has the following structure:
 667+
 668+<source lang='javascript'>
 669+{'editor': id,
 670+ 'year_joined': year,
 671+ 'new_wikipedian': True,
 672+ 'total_edits': n,
 673+ 'edits': {
 674+ 'date': date,
 675+ 'article': article_id,
 676+ }
 677+}
 678+</source>
+The edits variable is a subdocument containing all the edits made by that person. It is sorted by date, so the first observation is the first edit made by that person while the last observation is the final edit. This structure allows for quickly querying
 680+the database:
 681+
 682+<pre>
 683+use wikilitycs
 684+db.editors_dataset.find_one({'editor': '35252'}, {'edits': 1})
 685+</pre>
 686+
 687+
 688+Because we know that each editor has their own document, we do not need to scan the entire table to find all relevant matches. Hence, we can use the find_one() function which results in considerable speed improvements.
 689+
 690+== Installation ==
 691+
 692+=== Step-by-Step Movie Tutorial ===
+There is an online tutorial available at [http://vimeo.com/16850312 Vimeo]. You cannot install the Editor Trends toolkit on OSX at the moment; I will try to code around some OSX restrictions regarding multiprocessing.
 694+
 695+=== Dependencies ===
 696+
 697+Follow the next steps if you would like to replicate the analysis on a Wikipedia of your choice.
 698+
 699+# Download and install [http://www.mongodb.com MongoDB], preferably the 64 bit version.
 700+# Download and install [http://www.python.org/download Python] 2.6 or 2.7 (The code is not Python 3 compliant and it has not been tested using Python < 2.6)
 701+#: Linux users may need to install the packages python-argparse, python-progressbar and pymongo if that functionality is not installed by default with python.
 702+# Download and install [http://www.sliksvn.com/en/download Subversion] client
 703+# Depending on your platform make sure you have one of the following extraction utilities installed:
 704+:* Windows: [http://www.7zip.com 7zip]
 705+:* Linux: tar (should be installed by default)
 706+
 707+To verify that you have installed the required dependencies, do the following:
 708+<pre>
 709+<prompt>:: mongo
 710+MongoDB shell version: 1.6.3
 711+connecting to: test
 712+<prompt> (in mongo shell) exit
 713+
 714+<prompt>:: python
 715+Python 2.6.2 (r262:71605, Apr 14 2009, 22:40:02) [MSC v.1500 32 bit (Intel)] on
 716+win32
 717+Type "help", "copyright", "credits" or "license" for more information.
 718+<prompt> (in python) exit()
 719+
 720+<prompt>:: 7z or tar (depending on your platform)
 721+7-Zip [64] 4.65 Copyright (c) 1999-2009 Igor Pavlov 2009-02-03
 722+
 723+<prompt>:: svn
 724+
 725+</pre>
 726+Output on the console might look different depending on your OS and installed version.
 727+
 728+'''For Windows Users, add the following directories to the path'''
 729+<pre>c:\python26;c:\python26\scripts;c:\mongodb\bin;</pre>
 730+
+To finish the MongoDB configuration, do the following:
 732+<pre>
 733+cd \
 734+mkdir data
 735+mkdir data\db
 736+cd \mongodb\bin
 737+mongod --install --logpath c:\mongodb\logs
 738+net start mongodb
 739+</pre>
 740+
 741+Prepare your Python environment by taking the following steps:
 742+1 Check whether easy_install is installed by issuing the command:
 743+<pre>
 744+easy_install
 745+</pre>
 746+If easy_install is not installed then enter the following command:
 747+<pre>
 748+sudo apt-get install python-setuptools
 749+</pre>
+2 Check whether virtualenv is installed by issuing the following command:
 751+<pre>
 752+virtualenv
 753+</pre>
 754+If virtualenv is not installed enter this command:
 755+<pre>
 756+sudo easy_install virtualenv
 757+</pre>
 758+Go to the directory where you want to install your virtual Python, it's okay to go to the parent directory of editor_trends. Then, issue this command:
 759+<pre>
 760+virtualenv editor_trends
 761+</pre>
 762+This will copy the Python executable and libraries to editor_trends/bin and editor_trends/libs
 763+Now, we have to activate our virtual Python:
 764+<pre>
 765+source bin/activate
 766+</pre>
+You will see that your command prompt has changed to indicate that you are working with the virtual Python installation instead of the system's default installation.
 768+If you now install dependencies then these dependencies will be installed in your virtual Python installation instead of in the system Python installation. This will keep everybody happy.
 769+Finally, enter the following commands:
 770+<pre>
 771+easy_install progressbar
 772+easy_install pymongo
 773+easy_install argparse
 774+easy_install python-dateutil
 775+easy_install texttable
 776+</pre>
 777+Python is installed and you are ready to go!
 778+
 779+If everything is running, then you are ready to go.
 780+==== Important MongoDB Notes ====
 781+If you decide to use MongoDB to store the results then you have to install the
 782+64-bit version. 32-bit versions of MongoDB are limited to 2GB of data and the
 783+databases created by this package will definitely be larger than that. For more
 784+background information on this limitation, please read [http://blog.mongodb.org/post/137788967/32-bit-limitations MongoDB 32-bit limitations]
 785+
 786+=== Install Editor Trend Analytics ===
 787+First, download Editor Trend Analytics
 788+* Windows: svn checkout http://svn.wikimedia.org/svnroot/mediawiki/trunk/tools/editor_trends/ editor_trends
 789+* Linux: svn checkout http://svn.wikimedia.org/svnroot/mediawiki/trunk/tools/editor_trends/ editor_trends
 790+
 791+=== Getting started ===
 792+By now, you should have Editor Trend Analytics up and running. The first thing you need to do is to download a Wikipedia dump file.
+<blockquote>From now on, I'll assume that you are located in the directory where you installed Editor Trend Analytics.</blockquote>
 794+
 795+==== Download Wikipedia dump file ====
 796+To download a dump file enter the following command:
 797+<pre>
 798+python manage.py download
 799+</pre>
 800+You can also specify the language (either using the English name or the local name) of the Wikipedia project that you would like to analyze:
 801+<pre>
 802+python manage.py -l Spanish download
 803+python manage.py -l Español download
 804+</pre>
+Or, if you want to download a non-Wikipedia dump file, enter the following command:
 806+<pre>
 807+python manage.py -l Spanish download {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary}
 808+</pre>
 809+
 810+To obtain a list of all supported languages, enter:
 811+<pre>
 812+manage show_languages
 813+</pre>
 814+or to obtain all languages starting with 'x', enter:
 815+<pre>
 816+python manage.py show_languages --first x
 817+</pre>
 818+
 819+
 820+==== Extract Wikipedia dump file ====
 821+'''WARNING''': This process might take hours to days, depending on the configuration of your system.
 822+The Wikipedia dump file is extracted and split into smaller chunks to speed up the processing. Enter the following command:
 823+<pre>
 824+python manage.py extract (for extracting data from the Wikipedia dump file and storing it in smaller chunks)
 825+</pre>
 826+or, for one of the other Wikimedia projects, enter
 827+<pre>
 828+python manage.py -l Spanish -p commons extract
 829+</pre>
 830+Valid project choices are: {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary}
 831+
 832+'''Note:''' The extract process may need to be run twice. Once to unzip the dump file, then again to extract the data from the dump file.
 833+
 834+
 835+==== Sort Wikipedia dump file ====
 836+'''WARNING''': This process might take a few hours.
 837+The chunks must be sorted before being added to the MongoDB. Enter the following command:
 838+<pre>
 839+python manage.py sort (for sorting the chunks as generated by the 'manage extract' step)
 840+</pre>
 841+or, for one of the other Wikimedia projects, enter
 842+<pre>
 843+python manage.py -l Spanish sort {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary}
 844+</pre>
 845+
 846+
 847+==== Store Wikipedia dump file ====
 848+'''WARNING''': This process might take hours to days, depending on the configuration of your system.
 849+Now, we are ready to extract the required information from the Wikipedia dump file chunks and store it in the MongoDB. Enter the following command:
 850+<pre>
 851+python manage.py store
 852+python manage.py -l Spanish store
 853+</pre>
 854+or, for one of the other Wikimedia projects, enter
 855+<pre>
 856+python manage.py -l Spanish store {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary}
 857+</pre>
 858+
 859+==== Transform dataset ====
 860+'''WARNING''': This process might take a couple of hours.
+Finally, the raw data needs to be transformed into useful variables. Issue the following command:
 862+<pre>
 863+python manage.py transform
 864+python manage.py -l Spanish transform
 865+</pre>
 866+
 867+==== Create dataset ====
 868+'''WARNING''': This process might take a couple of hours to days depending on the configuration of your computer.
 869+We are almost there, the data is in the database and now we need to export the data to a [[:en:CSV|CSV]] file so we can import it using a statistical program such as [[:en:R (programming language)]], [[:en:Stata]] or [[:en:SPSS]].
 870+
 871+Enter the following command:
 872+<pre>
 873+python manage.py dataset
 874+python manage.py -l Spanish dataset
 875+</pre>
 876+or, for one of the other Wikimedia projects, enter
 877+<pre>
 878+manage -l Spanish {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary} dataset
 879+</pre>
 880+
 881+==== Everything in one shot ====
 882+'''WARNING''': This process might take a couple of days or even more than a week depending on the configuration of your computer.
 883+If you don't feel like monitoring your computer and you just want to create a dataset from scratch, enter the following command:
 884+<pre>
 885+python manage.py all language
 886+python manage.py -l Spanish all
 887+</pre>
 888+<pre>
 889+python manage.py -p {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary} all
 890+</pre>
 891+
 892+
 893+=== Benchmarks ===
 894+{| border=0
 895+ |+ ''Benchmark German Wiki''
 896+|-
 897+ ! Task
 898+ ! Configuration 1
 899+ ! Configuration 2
 900+|-
 901+ | Download
 902+ |
 903+ | 1 minute 14 seconds
 904+|-
 905+ | Extract
 906+ |
 907+ | 4-6 hours
 908+|-
 909+ | Sort
 910+ |
 911+ | ~30 minutes
 912+|-
 913+ | Store
 914+ |
 915+ | 4-5 hours
 916+|-
 917+ | Transform
 918+ |
 919+ | 2-3 hours
 920+|-
 921+ | Total time
 922+ |
 923+ | 10-14 hours
 924+
 925+|}
 926+
 927+
 928+{| border=0
 929+ |+ ''Benchmark English Wiki''
 930+|-
 931+ ! Task
 932+ ! Configuration 1
 933+ ! Configuration 2
 934+|-
 935+ | Download
 936+ |
 937+ | 15 minutes
 938+|-
 939+ | Extract
 940+ |
 941+ | ~36 hours
 942+|-
 943+ | Sort
 944+ |
 945+ | 10.5 hours
 946+|-
 947+ | Store
 948+ |
 949+ | 21 hours
 950+|-
 951+ | Transform
 952+ |
 953+ | 14.3 hours
 954+|-
 955+ | Total time
 956+ |
 957+ | 3.4 days
 958+
 959+|}
 960+
 961+
 962+{| width="300" border="1"
 963+ |+ ''Benchmark Hungarian Wiki''
 964+|-
 965+ ! Task
 966+ ! Configuration 3
 967+|-
 968+ | Download
 969+ | 1-2 minutes
 970+|-
 971+ | Extract
 972+ | 24.5 minutes
 973+|-
 974+ | Sort
 975+ | 1.5 minutes
 976+|-
 977+ | Store
 978+ | 7-8 minutes
 979+|-
 980+ | Transform
 981+ | 11 minutes
 982+|-
 983+ | Total time
 984+ | ~45 minutes
 985+|}
 986+
 987+
 988+;Configuration 2
 989+''Amazon Web Services Large EC2 Instance''
 990+* Ubuntu 64-bit
 991+* 4 EC2 Compute Units (2 virtual cores)
 992+* 7.5GB memory
 993+* 850GB storage
 994+
 995+;Configuration 3
 996+* Win7 64 bit
 997+* Intel i7 CPU (8 virtual core)
 998+* 6GB memory
 999+* 1TB storage
 1000+* 100/100Mb/s internet connection
 1001+
 1002+==See also==
 1003+* [[Wikilytics Dataset]]
 1004+* [[Wikilytics Plugins]]
 1005+
 1006+[[Category:Wikilytics]]
 1007+
 1008+""".splitlines(1)
 1009+
 1010+ diff = difflib.unified_diff(str1, str2, n=0, lineterm='')
 1011+ for line in diff:
 1012+ if len(line) > 3:
 1013+ print line
 1014+# print result
 1015+
 1016+if __name__ == '__main__':
 1017+ launcher_simple()
 1018+ #debug()
Property changes on: trunk/tools/editor_trends/etl/differ.py
___________________________________________________________________
Added: svn:eol-style
   + native