r88952 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: r88951 | r88952 | r88953 >
Date: 23:36, 26 May 2011
Author: diederik
Status: deferred
Tags:
Comment:
First commit of the differ. The differ creates text diffs between revisions by streaming compressed XML data dump files and comparing each revision with its predecessor. For every revision it outputs article_title, article_id, ns, revision_id, user_id, username, timestamp and the diff against the previous revision. Currently it writes the output as JSON; XML support will follow shortly.
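A single article record might look roughly like this (a minimal sketch with hypothetical values; the keys mirror the article dict built in parse_xml):

  {"title": "Talk:Foo", "namespace": 1, "article_id": "12345",
   "54321": {"username": "ExampleUser", "user_id": "7", "timestamp": "2011-05-26T23:36:00Z", "diff": "@@ -2 +2 @@-old line+new line"}}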
Modified paths:
  • /trunk/tools/editor_trends/etl/differ.py (added) (history)

Diff

Index: trunk/tools/editor_trends/etl/differ.py
@@ -0,0 +1,1017 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-04-10'
 19+__version__ = '0.1'
 20+
+import json
+import codecs
+import sys
+import os
+import difflib
+from xml.etree.cElementTree import iterparse, dump
+from multiprocessing import JoinableQueue, Process, cpu_count
 28+
+if '..' not in sys.path:
+    sys.path.append('..')
 31+
+from utils import file_utils
+from etl import variables
+from classes import exceptions
+from classes import buffer  # assumed location of FileHandleDistributor used in launcher()
 35+
 36+
 37+def parse_xml(fh, format, process_id, location):
+    '''
+    This function initializes the XML parser and calls the appropriate
+    function to extract / construct the variables from the XML stream.
+    '''
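+    # Expected input shape (a simplified sketch of a MediaWiki XML export;
+    # this code does not validate it):
+    #   <page>
+    #     <title/> <id/>
+    #     <revision><id/><timestamp/><contributor/><text/></revision>
+    #     ...
+    #   </page>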
+    include_ns = {3: 'User Talk',
+                  5: 'Wikipedia Talk',
+                  1: 'Talk',
+                  }
+
+    start = 'start'; end = 'end'
+    context = iterparse(fh, events=(start, end))
+    context = iter(context)
 50+
+    article = {}
+    count_articles = 0
+    id = False
+    ns = False
+    parse = False
+    prev_text = None
+    file_id, fh_output = None, None
 59+
+    try:
+        for event, elem in context:
+            if event == end and elem.tag.endswith('siteinfo'):
+                '''
+                This event happens once per dump file and is used to determine
+                the version of the generator that created the XML file.
+                '''
+                xml_namespace = variables.determine_xml_namespace(elem)
+                namespaces = variables.create_namespace_dict(elem, xml_namespace)
+                ns = True
+                elem.clear()
+
+            elif event == end and elem.tag.endswith('title'):
+                '''
+                Determine the title of the article and the namespace to which
+                it belongs. If the namespace is one we are interested in, set
+                parse to True so the article is parsed; otherwise skip it.
+                '''
+                title = variables.parse_title(elem)
+                article['title'] = title
+                current_namespace = variables.determine_namespace(title, namespaces, include_ns)
+                if current_namespace in include_ns:
+                    parse = True
+                    article['namespace'] = current_namespace
+                    count_articles += 1
+                    if count_articles % 10000 == 0:
+                        print 'Worker %s parsed %s articles' % (process_id, count_articles)
+                elem.clear()
 90+
+            elif elem.tag.endswith('revision'):
+                '''
+                This block analyzes an individual revision: it extracts the
+                revision id, timestamp and contributor, and computes a text
+                diff between this revision and the previous one.
+                '''
+                if parse:
+                    if event == end:
+                        rev_id = elem.find('%s%s' % (xml_namespace, 'id'))
+                        timestamp = elem.find('%s%s' % (xml_namespace, 'timestamp')).text
+                        contributor = elem.find('%s%s' % (xml_namespace, 'contributor'))
+                        editor = variables.parse_contributor(contributor, None, xml_namespace)
+                        if editor:
+                            rev_id = variables.extract_revision_id(rev_id)
+                            text = variables.extract_revision_text(elem, xml_namespace)
+                            # The first revision of an article has no parent, so
+                            # its full text serves as the diff; later revisions
+                            # are diffed against the previous revision's text.
+                            if prev_text is None:
+                                diff = text
+                            else:
+                                diff = diff_revision(prev_text, text)
+                            prev_text = text
+
+                            article[rev_id] = {}
+                            article[rev_id].update(editor)
+                            article[rev_id]['timestamp'] = timestamp
+                            article[rev_id]['diff'] = diff
+                        # Clear only fully built elements; at a start event the
+                        # revision has not been read yet.
+                        elem.clear()
+                elif event == end:
+                    elem.clear()
 126+
+            elif event == end and elem.tag.endswith('id') and not id:
+                '''
+                Determine the id of the article.
+                '''
+                article['article_id'] = elem.text
+                id = True
+                elem.clear()
 134+
+            elif event == end and elem.tag.endswith('page'):
+                '''
+                We have reached the end of an article; reset all variables and
+                free memory.
+                '''
+                elem.clear()
+                # Write the diffs of this article to file.
+                if parse:
+                    fh_output, file_id = assign_filehandle(fh_output, file_id, location, process_id, format)
+                    write_diff(fh_output, article, format)
+                # Reset all variables for the next article.
+                article = {}
+                prev_text = None
+                id = False
+                parse = False
 154+
+    except SyntaxError, error:
+        print 'Encountered invalid XML tag. Error message: %s' % error
+        dump(elem)
+        sys.exit(-1)
+    except IOError, error:
+        print '''Archive file is possibly corrupted. Please delete this archive
+        and retry downloading. Error message: %s''' % error
+        sys.exit(-1)
+    print 'Finished parsing Wikipedia dump file.'
 164+
 165+
+def assign_filehandle(fh, file_id, location, process_id, format):
+    if not fh:
+        file_id = 0
+        filename = '%s_%s.%s' % (file_id, process_id, format)
+        fh = file_utils.create_txt_filehandle(location, filename, 'w', 'utf-8')
+    else:
+        size = fh.tell()
+        max_size = 1024 * 1024 * 64
+        if size > max_size:
+            fh.close()
+            file_id += 1
+            filename = '%s_%s.%s' % (file_id, process_id, format)
+            fh = file_utils.create_txt_filehandle(location, filename, 'w', 'utf-8')
+
+    return fh, file_id
 181+
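+# Example of the rotation in assign_filehandle (hypothetical run): worker 3
+# writing JSON gets 0_3.json first; once a file grows past the 64MB cap the
+# handle is closed and 1_3.json, 2_3.json, ... follow.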
+def write_xml_diff(fh, article):
+    # Placeholder: XML output is not implemented yet (JSON only for now).
+    pass
 184+
 185+
+def write_json_diff(fh, article):
+    # One JSON object per article; objects are written back-to-back into the
+    # current output file.
+    json.dump(article, fh)
 188+
 189+
+def write_diff(fh, article, format):
+    if format == 'xml':
+        write_xml_diff(fh, article)
+    elif format == 'json':
+        write_json_diff(fh, article)
+    else:
+        raise exceptions.OutputNotSupported()
 197+
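+# For example, parse_xml calls write_diff(fh_output, article, 'json') when the
+# worker was started with format='json'; 'xml' currently falls through to the
+# write_xml_diff placeholder above.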
 198+
+def diff_revision(prev_text, text):
+    '''
+    Return a unified diff (n=0, i.e. no context lines) between the previous
+    and the current revision text.
+    '''
+    lines = []
+    if prev_text is not None and text is not None:
+        # unified_diff expects sequences of lines, not raw strings;
+        # splitlines(1) keeps the line endings intact.
+        diff = difflib.unified_diff(prev_text.splitlines(1),
+                                    text.splitlines(1), n=0, lineterm='')
+        for line in diff:
+            if len(line) > 3:  # drop very short diff lines (noise filter)
+                lines.append(line)
+    # Joining a list sidesteps cStringIO's ASCII-only limitation when the
+    # revision text is unicode.
+    return ''.join(lines)
 209+
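+# A quick sketch (hypothetical texts): diff_revision('foo\nbar\n',
+# 'foo\nbaz\n') returns the concatenated diff lines: the '---'/'+++'
+# headers, the hunk header '@@ -2 +2 @@', and the lines '-bar' and '+baz';
+# one-character changes such as '-a' are dropped by the length filter.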
 210+
+def launcher(rts):
+    '''
+    Initialize the multiprocessing workers and load the input queue with the
+    compressed XML files.
+    '''
+    input_queue = JoinableQueue()
+
+    files = file_utils.retrieve_file_list(rts.input_location)
+
+    if len(files) > cpu_count():
+        processors = cpu_count() - 1
+    else:
+        processors = len(files)
+
+    fhd = buffer.FileHandleDistributor(rts.max_filehandles, processors)
+
+    for filename in files:
+        filename = os.path.join(rts.input_location, filename)
+        print filename
+        input_queue.put(filename)
+
+    for x in xrange(processors):
+        print 'Inserting poison pill %s...' % x
+        input_queue.put(None)
+
+    # NOTE: stream_raw_xml is not defined in this module; it is assumed to be
+    # provided by a sibling etl module.
+    extracters = [Process(target=stream_raw_xml,
+                          args=[input_queue, process_id, fhd, rts])
+                  for process_id in xrange(processors)]
+    for extracter in extracters:
+        extracter.start()
+
+    input_queue.join()
 243+
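+# Sketch of the intended flow (assuming a worker like stream_raw_xml): each
+# worker pulls filenames off input_queue until it reads a None poison pill,
+# calls task_done() for every item it consumes, and input_queue.join() above
+# returns once every queued item has been marked done.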
 244+
+def launcher_simple():
+    location = 'c:\\wikimedia\\nl\\wiki\\'
+    output_location = 'c:\\wikimedia\\nl\\wiki\\diffs\\'
+    files = file_utils.retrieve_file_list(location)
+    process_id = 0
+    format = 'json'
+    for filename in files:
+        fh = file_utils.create_streaming_buffer(os.path.join(location, filename))
+        #fh = codecs.open(os.path.join(location, filename), 'r', 'utf-8')
+        parse_xml(fh, format, process_id, output_location)
+        fh.close()
 256+
 257+
+def debug():
+    str1 = """
 260+ '''Welcome to Wikilytics !
 261+'''
 262+== Background ==
 263+This package offers a set of tools used to create datasets to analyze Editor
 264+Trends. By Editor Trends we refer to the overall pattern of entering and leaving
 265+a Wikipedia site. The main information source for this package is [[:strategy:Editor Trends Study|Editor Trends Study]]
 266+
 267+== High-level Overview Editor Trends Analytics ==
 268+
+The Python scripts to create the dataset to answer the question '''“Which editors are the ones that are leaving --are they the new editors or the more tenured ones?”''' consist of three separate phases:
 270+* Chunk the XML dump file in smaller parts
 271+** and discard all non-zero namespace revisions.
 272+* Parse XML chunks by taking the following steps:
 273+** read XML chunk
 274+** construct XML DOM
 275+** iterate over each article in XML DOM
 276+** iterate over each revision in each article
 277+** extract from each revision
 278+*** username id
 279+*** date edit
 280+*** article id
 281+** determine if username belongs to bot, discard information if yes
 282+** store data in MongoDB
 283+* Create dataset from MongoDB database
 284+** Create list with unique username id’s
 285+** Loop over each id
 286+*** determine year of first edit
 287+*** determine year of last edit
 288+*** count total number of edits by year
 289+*** sort edits by date and keep first 10 edits
 290+** Write to CSV file.
 291+
 292+== Schema of Editor Trends Database ==
+Each person who has contributed to Wikipedia has its own document in the [http://www.mongodb.org MongoDB]. A document is a bit similar to a row in a [http://en.wikipedia.org/wiki/SQL SQL] database, but there are important differences. The document has the following structure:
 294+
 295+<source lang='javascript'>
 296+{'editor': id,
 297+ 'year_joined': year,
 298+ 'new_wikipedian': True,
 299+ 'total_edits': n,
 300+ 'edits': {
 301+ 'date': date,
 302+ 'article': article_id,
 303+ }
 304+}
 305+</source>
+The edits variable is a subdocument containing all the edits made by that person. It is sorted by date, so the first observation is the first edit made by that person while the last observation is the final edit. This structure allows for quickly querying
 307+the database:
 308+
 309+<pre>
 310+use enwiki
 311+editors = enwiki['editors']
 312+enwiki.editors.find_one({'editor': '35252'}, {'edits': 1})[0]
 313+</pre>
 314+
 315+
 316+Because we know that each editor has their own document, we do not need to scan the entire table to find all relevant matches. Hence, we can use the find_one() function which results in considerable speed improvements.
 317+
 318+== Installation ==
 319+
 320+=== Step-by-Step Movie Tutorial ===
+There is an online tutorial available at [http://vimeo.com/16850312 Vimeo]. You cannot install the Editor Trends toolkit on OSX at the moment; I will try to code around some OSX restrictions regarding multiprocessing.
 322+
 323+=== Dependencies ===
 324+
 325+Follow the next steps if you would like to replicate the analysis on a Wikipedia of your choice.
 326+
 327+# Download and install [http://www.mongodb.com MongoDB], preferably the 64 bit version.
 328+# Download and install [http://www.python.org/download Python] 2.6 or 2.7 (The code is not Python 3 compliant and it has not been tested using Python < 2.6)
 329+#: Linux users may need to install the packages python-argparse, python-progressbar and pymongo if that functionality is not installed by default with python.
 330+# Download and install [http://www.sliksvn.com/en/download Subversion] client
 331+# Depending on your platform make sure you have one of the following extraction utilities installed:
 332+:* Windows: [http://www.7zip.com 7zip]
 333+:* Linux: tar (should be installed by default)
 334+
 335+To verify that you have installed the required dependencies, do the following:
 336+<pre>
 337+<prompt>:: mongo
 338+MongoDB shell version: 1.6.3
 339+connecting to: test
 340+<prompt> (in mongo shell) exit
 341+
 342+<prompt>:: python
 343+Python 2.6.2 (r262:71605, Apr 14 2009, 22:40:02) [MSC v.1500 32 bit (Intel)] on
 344+win32
 345+Type "help", "copyright", "credits" or "license" for more information.
 346+<prompt> (in python) exit()
 347+
 348+<prompt>:: 7z or tar (depending on your platform)
 349+7-Zip [64] 4.65 Copyright (c) 1999-2009 Igor Pavlov 2009-02-03
 350+
 351+<prompt>:: svn
 352+
 353+</pre>
 354+Output on the console might look different depending on your OS and installed version.
 355+
 356+'''For Windows Users, add the following directories to the path'''
 357+<pre>c:\python26;c:\python26\scripts;c:\mongodb\bin;</pre>
 358+
+To finish the MongoDB configuration, do the following:
 360+<pre>
 361+cd \
 362+mkdir data
 363+mkdir data\db
 364+cd \mongodb\bin
 365+mongod --install --logpath c:\mongodb\logs
 366+net start mongodb
 367+</pre>
 368+
 369+Prepare your Python environment by taking the following steps:
 370+1 Check whether easy_install is installed by issuing the command:
 371+<pre>
 372+easy_install
 373+</pre>
 374+If easy_install is not installed then enter the following command:
 375+<pre>
 376+sudo apt-get install python-setuptools
 377+</pre>
+2 Check whether virtualenv is installed by issuing the following command:
 379+<pre>
 380+virtualenv
 381+</pre>
 382+If virtualenv is not installed enter this command:
 383+<pre>
 384+sudo easy_install virtualenv
 385+</pre>
 386+Go to the directory where you want to install your virtual Python, it's okay to go to the parent directory of editor_trends. Then, issue this command:
 387+<pre>
 388+virtualenv editor_trends
 389+</pre>
 390+This will copy the Python executable and libraries to editor_trends/bin and editor_trends/libs
 391+Now, we have to activate our virtual Python:
 392+<pre>
 393+source bin/activate
 394+</pre>
+You will see that your command prompt has changed to indicate that you are working with the virtual Python installation instead of the system's default installation.
 396+If you now install dependencies then these dependencies will be installed in your virtual Python installation instead of in the system Python installation. This will keep everybody happy.
 397+Finally, enter the following commands:
 398+<pre>
 399+easy_install progressbar
 400+easy_install pymongo
 401+easy_install argparse
 402+easy_install python-dateutil
 403+easy_install texttable
 404+</pre>
 405+Python is installed and you are ready to go!
 406+
 407+If everything is running, then you are ready to go.
 408+==== Important MongoDB Notes ====
 409+If you decide to use MongoDB to store the results then you have to install the
 410+64-bit version. 32-bit versions of MongoDB are limited to 2GB of data and the
 411+databases created by this package will definitely be larger than that. For more
 412+background information on this limitation, please read [http://blog.mongodb.org/post/137788967/32-bit-limitations MongoDB 32-bit limitations]
 413+
 414+=== Install Editor Trend Analytics ===
 415+First, download Editor Trend Analytics
 416+* Windows: svn checkout http://svn.wikimedia.org/svnroot/mediawiki/trunk/tools/editor_trends/ editor_trends
 417+* Linux: svn checkout http://svn.wikimedia.org/svnroot/mediawiki/trunk/tools/editor_trends/ editor_trends
 418+
 419+=== Getting started ===
 420+By now, you should have Editor Trend Analytics up and running. The first thing you need to do is to download a Wikipedia dump file.
+<blockquote>From now on, I'll assume that you are located in the directory where you installed Editor Trend Analytics.</blockquote>
 422+
 423+==== Download Wikipedia dump file ====
 424+To download a dump file enter the following command:
 425+<pre>
 426+python manage.py download
 427+</pre>
 428+You can also specify the language (either using the English name or the local name) of the Wikipedia project that you would like to analyze:
 429+<pre>
 430+python manage.py -l Spanish download
 431+python manage.py -l Español download
 432+</pre>
+Or, if you want to download a non-Wikipedia dump file, enter the following command:
 434+<pre>
 435+python manage.py -l Spanish download {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary}
 436+</pre>
 437+
 438+To obtain a list of all supported languages, enter:
 439+<pre>
 440+manage show_languages
 441+</pre>
 442+or to obtain all languages starting with 'x', enter:
 443+<pre>
 444+python manage.py show_languages --first x
 445+</pre>
 446+
 447+
 448+==== Extract Wikipedia dump file ====
 449+'''WARNING''': This process might take hours to days, depending on the configuration of your system.
 450+The Wikipedia dump file is extracted and split into smaller chunks to speed up the processing. Enter the following command:
 451+<pre>
 452+python manage.py extract (for extracting data from the Wikipedia dump file and storing it in smaller chunks)
 453+</pre>
 454+or, for one of the other Wikimedia projects, enter
 455+<pre>
 456+python manage.py -l Spanish -p commons extract
 457+</pre>
 458+Valid project choices are: {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary}
 459+
 460+'''Note:''' The extract process may need to be run twice. Once to unzip the dump file, then again to extract the data from the dump file.
 461+
 462+
 463+==== Sort Wikipedia dump file ====
 464+'''WARNING''': This process might take a few hours.
 465+The chunks must be sorted before being added to the MongoDB. Enter the following command:
 466+<pre>
 467+python manage.py sort (for sorting the chunks as generated by the 'manage extract' step)
 468+</pre>
 469+or, for one of the other Wikimedia projects, enter
 470+<pre>
 471+python manage.py -l Spanish sort {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary}
 472+</pre>
 473+
 474+
 475+==== Store Wikipedia dump file ====
 476+'''WARNING''': This process might take hours to days, depending on the configuration of your system.
 477+Now, we are ready to extract the required information from the Wikipedia dump file chunks and store it in the MongoDB. Enter the following command:
 478+<pre>
 479+python manage.py store
 480+python manage.py -l Spanish store
 481+</pre>
 482+or, for one of the other Wikimedia projects, enter
 483+<pre>
 484+python manage.py -l Spanish store {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary}
 485+</pre>
 486+
 487+==== Transform dataset ====
 488+'''WARNING''': This process might take a couple of hours.
+Finally, the raw data needs to be transformed into useful variables. Issue the following command:
 490+<pre>
 491+python manage.py transform
 492+python manage.py -l Spanish transform
 493+</pre>
 494+
 495+==== Create dataset ====
 496+'''WARNING''': This process might take a couple of hours to days depending on the configuration of your computer.
 497+We are almost there, the data is in the database and now we need to export the data to a [[:en:CSV|CSV]] file so we can import it using a statistical program such as [[:en:R (programming language)]], [[:en:Stata]] or [[:en:SPSS]].
 498+
 499+Enter the following command:
 500+<pre>
 501+python manage.py dataset
 502+python manage.py -l Spanish dataset
 503+</pre>
 504+or, for one of the other Wikimedia projects, enter
 505+<pre>
 506+manage -l Spanish {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary} dataset
 507+</pre>
 508+
 509+==== Everything in one shot ====
 510+'''WARNING''': This process might take a couple of days or even more than a week depending on the configuration of your computer.
 511+If you don't feel like monitoring your computer and you just want to create a dataset from scratch, enter the following command:
 512+<pre>
 513+python manage.py all language
 514+python manage.py -l Spanish all
 515+</pre>
 516+<pre>
 517+python manage.py -p {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary} all
 518+</pre>
 519+
 520+
 521+=== Benchmarks ===
 522+{| border=0
 523+ |+ ''Benchmark German Wiki''
 524+|-
 525+ ! Task
 526+ ! Configuration 1
 527+ ! Configuration 2
 528+|-
 529+ | Download
 530+ |
 531+ | 1 minute 14 seconds
 532+|-
 533+ | Extract
 534+ |
 535+ | 4-6 hours
 536+|-
 537+ | Sort
 538+ |
 539+ | ~30 minutes
 540+|-
 541+ | Store
 542+ |
 543+ | 4-5 hours
 544+|-
 545+ | Transform
 546+ |
 547+ | 2-3 hours
 548+|-
 549+ | Total time
 550+ |
 551+ | 10-14 hours
 552+
 553+|}
 554+
 555+
 556+{| border=0
 557+ |+ ''Benchmark English Wiki''
 558+|-
 559+ ! Task
 560+ ! Configuration 1
 561+ ! Configuration 2
 562+|-
 563+ | Download
 564+ |
 565+ | 15 minutes
 566+|-
 567+ | Extract
 568+ |
 569+ | ~36 hours
 570+|-
 571+ | Sort
 572+ |
 573+ | 10.5 hours
 574+|-
 575+ | Store
 576+ |
 577+ | 21 hours
 578+|-
 579+ | Transform
 580+ |
 581+ | 14.3 hours
 582+|-
 583+ | Total time
 584+ |
 585+ | 3.4 days
 586+
 587+|}
 588+
 589+
 590+{| width="300" border="1"
 591+ |+ ''Benchmark Hungarian Wiki''
 592+|-
 593+ ! Task
 594+ ! Configuration 3
 595+|-
 596+ | Download
 597+ | 1-2 minutes
 598+|-
 599+ | Extract
 600+ | 24.5 minutes
 601+|-
 602+ | Sort
 603+ | 1.5 minutes
 604+|-
 605+ | Store
 606+ | 7-8 minutes
 607+|-
 608+ | Transform
 609+ | 11 minutes
 610+|-
 611+ | Total time
 612+ | ~45 minutes
 613+|}
 614+
 615+
 616+;Configuration 2
 617+''Amazon Web Services Large EC2 Instance''
 618+* Ubuntu 64-bit
 619+* 4 EC2 Compute Units (2 virtual cores)
 620+* 7.5GB memory
 621+* 850GB storage
 622+
 623+;Configuration 3
 624+* Win7 64 bit
 625+* Intel i7 CPU (8 virtual core)
 626+* 6GB memory
 627+* 1TB storage
 628+* 100/100Mb/s internet connection
 629+
 630+
 631+[[Category:Editor Trends Study]]
 632+""".splitlines(1)
 633+
 634+ str2 = """
 635+Welcome to '''Wikilytics''', a free and open source software toolkit for doing analysis of editing trends in Wikipedia and other Wikimedia projects.
 636+
 637+== Background ==
 638+This package offers a set of tools used to create datasets to analyze editing trends. It was first created expressly for the [[:strategy:Editor Trends Study|Editor Trends Study]], but is well-suited to a variety of research into editing trends. It is thus free to use (as in beer and freedom) if you're interested in expanding on the [[:strategy:Editor Trends Study/Results|results of Editor Trend Study]] or if you'd like to participate in other [[Research/Projects|research projects]].
 639+
 640+== High-level Overview Editor Trends Analytics ==
 641+
+The Python scripts to create the dataset to answer the question '''“Which editors are the ones that are leaving --are they the new editors or the more tenured ones?”''' consist of three separate phases:
 643+* Chunk the XML dump file in smaller parts
 644+** and discard all non-zero namespace revisions.
 645+* Parse XML chunks by taking the following steps:
 646+** read XML chunk
 647+** construct XML DOM
 648+** iterate over each article in XML DOM
 649+** iterate over each revision in each article
 650+** extract from each revision
 651+*** username id
 652+*** date edit
 653+*** article id
 654+** determine if username belongs to bot, discard information if yes
 655+** store data in MongoDB
 656+* Create dataset from MongoDB database
 657+** Create list with unique username id’s
 658+** Loop over each id
 659+*** determine year of first edit
 660+*** determine year of last edit
 661+*** count total number of edits by year
 662+*** sort edits by date and keep first 10 edits
 663+** Write to CSV file.
 664+
 665+== Schema of Editor Trends Database ==
+Each person who has contributed to Wikipedia has its own document in the [http://www.mongodb.org MongoDB]. A document is a bit similar to a row in a [http://en.wikipedia.org/wiki/SQL SQL] database, but there are important differences. The document has the following structure:
 667+
 668+<source lang='javascript'>
 669+{'editor': id,
 670+ 'year_joined': year,
 671+ 'new_wikipedian': True,
 672+ 'total_edits': n,
 673+ 'edits': {
 674+ 'date': date,
 675+ 'article': article_id,
 676+ }
 677+}
 678+</source>
+The edits variable is a subdocument containing all the edits made by that person. It is sorted by date, so the first observation is the first edit made by that person while the last observation is the final edit. This structure allows for quickly querying
 680+the database:
 681+
 682+<pre>
 683+use wikilitycs
 684+db.editors_dataset.find_one({'editor': '35252'}, {'edits': 1})
 685+</pre>
 686+
 687+
 688+Because we know that each editor has their own document, we do not need to scan the entire table to find all relevant matches. Hence, we can use the find_one() function which results in considerable speed improvements.
 689+
 690+== Installation ==
 691+
 692+=== Step-by-Step Movie Tutorial ===
+There is an online tutorial available at [http://vimeo.com/16850312 Vimeo]. You cannot install the Editor Trends toolkit on OSX at the moment; I will try to code around some OSX restrictions regarding multiprocessing.
 694+
 695+=== Dependencies ===
 696+
 697+Follow the next steps if you would like to replicate the analysis on a Wikipedia of your choice.
 698+
 699+# Download and install [http://www.mongodb.com MongoDB], preferably the 64 bit version.
 700+# Download and install [http://www.python.org/download Python] 2.6 or 2.7 (The code is not Python 3 compliant and it has not been tested using Python < 2.6)
 701+#: Linux users may need to install the packages python-argparse, python-progressbar and pymongo if that functionality is not installed by default with python.
 702+# Download and install [http://www.sliksvn.com/en/download Subversion] client
 703+# Depending on your platform make sure you have one of the following extraction utilities installed:
 704+:* Windows: [http://www.7zip.com 7zip]
 705+:* Linux: tar (should be installed by default)
 706+
 707+To verify that you have installed the required dependencies, do the following:
 708+<pre>
 709+<prompt>:: mongo
 710+MongoDB shell version: 1.6.3
 711+connecting to: test
 712+<prompt> (in mongo shell) exit
 713+
 714+<prompt>:: python
 715+Python 2.6.2 (r262:71605, Apr 14 2009, 22:40:02) [MSC v.1500 32 bit (Intel)] on
 716+win32
 717+Type "help", "copyright", "credits" or "license" for more information.
 718+<prompt> (in python) exit()
 719+
 720+<prompt>:: 7z or tar (depending on your platform)
 721+7-Zip [64] 4.65 Copyright (c) 1999-2009 Igor Pavlov 2009-02-03
 722+
 723+<prompt>:: svn
 724+
 725+</pre>
 726+Output on the console might look different depending on your OS and installed version.
 727+
 728+'''For Windows Users, add the following directories to the path'''
 729+<pre>c:\python26;c:\python26\scripts;c:\mongodb\bin;</pre>
 730+
+To finish the MongoDB configuration, do the following:
 732+<pre>
 733+cd \
 734+mkdir data
 735+mkdir data\db
 736+cd \mongodb\bin
 737+mongod --install --logpath c:\mongodb\logs
 738+net start mongodb
 739+</pre>
 740+
 741+Prepare your Python environment by taking the following steps:
 742+1 Check whether easy_install is installed by issuing the command:
 743+<pre>
 744+easy_install
 745+</pre>
 746+If easy_install is not installed then enter the following command:
 747+<pre>
 748+sudo apt-get install python-setuptools
 749+</pre>
+2 Check whether virtualenv is installed by issuing the following command:
 751+<pre>
 752+virtualenv
 753+</pre>
 754+If virtualenv is not installed enter this command:
 755+<pre>
 756+sudo easy_install virtualenv
 757+</pre>
 758+Go to the directory where you want to install your virtual Python, it's okay to go to the parent directory of editor_trends. Then, issue this command:
 759+<pre>
 760+virtualenv editor_trends
 761+</pre>
 762+This will copy the Python executable and libraries to editor_trends/bin and editor_trends/libs
 763+Now, we have to activate our virtual Python:
 764+<pre>
 765+source bin/activate
 766+</pre>
+You will see that your command prompt has changed to indicate that you are working with the virtual Python installation instead of the system's default installation.
 768+If you now install dependencies then these dependencies will be installed in your virtual Python installation instead of in the system Python installation. This will keep everybody happy.
 769+Finally, enter the following commands:
 770+<pre>
 771+easy_install progressbar
 772+easy_install pymongo
 773+easy_install argparse
 774+easy_install python-dateutil
 775+easy_install texttable
 776+</pre>
 777+Python is installed and you are ready to go!
 778+
 779+If everything is running, then you are ready to go.
 780+==== Important MongoDB Notes ====
 781+If you decide to use MongoDB to store the results then you have to install the
 782+64-bit version. 32-bit versions of MongoDB are limited to 2GB of data and the
 783+databases created by this package will definitely be larger than that. For more
 784+background information on this limitation, please read [http://blog.mongodb.org/post/137788967/32-bit-limitations MongoDB 32-bit limitations]
 785+
 786+=== Install Editor Trend Analytics ===
 787+First, download Editor Trend Analytics
 788+* Windows: svn checkout http://svn.wikimedia.org/svnroot/mediawiki/trunk/tools/editor_trends/ editor_trends
 789+* Linux: svn checkout http://svn.wikimedia.org/svnroot/mediawiki/trunk/tools/editor_trends/ editor_trends
 790+
 791+=== Getting started ===
 792+By now, you should have Editor Trend Analytics up and running. The first thing you need to do is to download a Wikipedia dump file.
+<blockquote>From now on, I'll assume that you are located in the directory where you installed Editor Trend Analytics.</blockquote>
 794+
 795+==== Download Wikipedia dump file ====
 796+To download a dump file enter the following command:
 797+<pre>
 798+python manage.py download
 799+</pre>
 800+You can also specify the language (either using the English name or the local name) of the Wikipedia project that you would like to analyze:
 801+<pre>
 802+python manage.py -l Spanish download
 803+python manage.py -l Español download
 804+</pre>
+Or, if you want to download a non-Wikipedia dump file, enter the following command:
 806+<pre>
 807+python manage.py -l Spanish download {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary}
 808+</pre>
 809+
 810+To obtain a list of all supported languages, enter:
 811+<pre>
 812+manage show_languages
 813+</pre>
 814+or to obtain all languages starting with 'x', enter:
 815+<pre>
 816+python manage.py show_languages --first x
 817+</pre>
 818+
 819+
 820+==== Extract Wikipedia dump file ====
 821+'''WARNING''': This process might take hours to days, depending on the configuration of your system.
 822+The Wikipedia dump file is extracted and split into smaller chunks to speed up the processing. Enter the following command:
 823+<pre>
 824+python manage.py extract (for extracting data from the Wikipedia dump file and storing it in smaller chunks)
 825+</pre>
 826+or, for one of the other Wikimedia projects, enter
 827+<pre>
 828+python manage.py -l Spanish -p commons extract
 829+</pre>
 830+Valid project choices are: {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary}
 831+
 832+'''Note:''' The extract process may need to be run twice. Once to unzip the dump file, then again to extract the data from the dump file.
 833+
 834+
 835+==== Sort Wikipedia dump file ====
 836+'''WARNING''': This process might take a few hours.
 837+The chunks must be sorted before being added to the MongoDB. Enter the following command:
 838+<pre>
 839+python manage.py sort (for sorting the chunks as generated by the 'manage extract' step)
 840+</pre>
 841+or, for one of the other Wikimedia projects, enter
 842+<pre>
 843+python manage.py -l Spanish sort {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary}
 844+</pre>
 845+
 846+
 847+==== Store Wikipedia dump file ====
 848+'''WARNING''': This process might take hours to days, depending on the configuration of your system.
 849+Now, we are ready to extract the required information from the Wikipedia dump file chunks and store it in the MongoDB. Enter the following command:
 850+<pre>
 851+python manage.py store
 852+python manage.py -l Spanish store
 853+</pre>
 854+or, for one of the other Wikimedia projects, enter
 855+<pre>
 856+python manage.py -l Spanish store {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary}
 857+</pre>
 858+
 859+==== Transform dataset ====
 860+'''WARNING''': This process might take a couple of hours.
+Finally, the raw data needs to be transformed into useful variables. Issue the following command:
 862+<pre>
 863+python manage.py transform
 864+python manage.py -l Spanish transform
 865+</pre>
 866+
 867+==== Create dataset ====
 868+'''WARNING''': This process might take a couple of hours to days depending on the configuration of your computer.
 869+We are almost there, the data is in the database and now we need to export the data to a [[:en:CSV|CSV]] file so we can import it using a statistical program such as [[:en:R (programming language)]], [[:en:Stata]] or [[:en:SPSS]].
 870+
 871+Enter the following command:
 872+<pre>
 873+python manage.py dataset
 874+python manage.py -l Spanish dataset
 875+</pre>
 876+or, for one of the other Wikimedia projects, enter
 877+<pre>
 878+manage -l Spanish {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary} dataset
 879+</pre>
 880+
 881+==== Everything in one shot ====
 882+'''WARNING''': This process might take a couple of days or even more than a week depending on the configuration of your computer.
 883+If you don't feel like monitoring your computer and you just want to create a dataset from scratch, enter the following command:
 884+<pre>
 885+python manage.py all language
 886+python manage.py -l Spanish all
 887+</pre>
 888+<pre>
 889+python manage.py -p {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary} all
 890+</pre>
 891+
 892+
 893+=== Benchmarks ===
 894+{| border=0
 895+ |+ ''Benchmark German Wiki''
 896+|-
 897+ ! Task
 898+ ! Configuration 1
 899+ ! Configuration 2
 900+|-
 901+ | Download
 902+ |
 903+ | 1 minute 14 seconds
 904+|-
 905+ | Extract
 906+ |
 907+ | 4-6 hours
 908+|-
 909+ | Sort
 910+ |
 911+ | ~30 minutes
 912+|-
 913+ | Store
 914+ |
 915+ | 4-5 hours
 916+|-
 917+ | Transform
 918+ |
 919+ | 2-3 hours
 920+|-
 921+ | Total time
 922+ |
 923+ | 10-14 hours
 924+
 925+|}
 926+
 927+
 928+{| border=0
 929+ |+ ''Benchmark English Wiki''
 930+|-
 931+ ! Task
 932+ ! Configuration 1
 933+ ! Configuration 2
 934+|-
 935+ | Download
 936+ |
 937+ | 15 minutes
 938+|-
 939+ | Extract
 940+ |
 941+ | ~36 hours
 942+|-
 943+ | Sort
 944+ |
 945+ | 10.5 hours
 946+|-
 947+ | Store
 948+ |
 949+ | 21 hours
 950+|-
 951+ | Transform
 952+ |
 953+ | 14.3 hours
 954+|-
 955+ | Total time
 956+ |
 957+ | 3.4 days
 958+
 959+|}
 960+
 961+
 962+{| width="300" border="1"
 963+ |+ ''Benchmark Hungarian Wiki''
 964+|-
 965+ ! Task
 966+ ! Configuration 3
 967+|-
 968+ | Download
 969+ | 1-2 minutes
 970+|-
 971+ | Extract
 972+ | 24.5 minutes
 973+|-
 974+ | Sort
 975+ | 1.5 minutes
 976+|-
 977+ | Store
 978+ | 7-8 minutes
 979+|-
 980+ | Transform
 981+ | 11 minutes
 982+|-
 983+ | Total time
 984+ | ~45 minutes
 985+|}
 986+
 987+
 988+;Configuration 2
 989+''Amazon Web Services Large EC2 Instance''
 990+* Ubuntu 64-bit
 991+* 4 EC2 Compute Units (2 virtual cores)
 992+* 7.5GB memory
 993+* 850GB storage
 994+
 995+;Configuration 3
 996+* Win7 64 bit
 997+* Intel i7 CPU (8 virtual core)
 998+* 6GB memory
 999+* 1TB storage
 1000+* 100/100Mb/s internet connection
 1001+
 1002+==See also==
 1003+* [[Wikilytics Dataset]]
 1004+* [[Wikilytics Plugins]]
 1005+
 1006+[[Category:Wikilytics]]
 1007+
 1008+""".splitlines(1)
 1009+
 1010+ diff = difflib.unified_diff(str1, str2, n=0, lineterm='')
 1011+ for line in diff:
 1012+ if len(line) > 3:
 1013+ print line
 1014+# print result
 1015+
 1016+if __name__ == '__main__':
 1017+ launcher_simple()
 1018+ #debug()
Property changes on: trunk/tools/editor_trends/etl/differ.py
___________________________________________________________________
Added: svn:eol-style
   + native