r88956 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: r88956 (previous: r88955, next: r88957)
Date: 03:21, 27 May 2011
Author: diederik
Status: deferred
Tags:
Comment: Added multiprocessing support to differ
Modified paths:
  • /trunk/tools/editor_trends/classes/exceptions.py (modified)
  • /trunk/tools/editor_trends/classes/runtime_settings.py (modified)
  • /trunk/tools/editor_trends/etl/differ.py (modified)
  • /trunk/tools/editor_trends/etl/variables.py (modified)

Diff

Index: trunk/tools/editor_trends/etl/variables.py
@@ -205,10 +205,13 @@
     '''
     username = extract_username(revision, xml_namespace)
     user_id = extract_contributor_id(revision, xml_namespace)
-    bot = determine_username_is_bot(revision, bots, xml_namespace)
     editor = {}
     editor['username'] = username
-    editor['bot'] = bot
+
+    if bots:
+        bot = determine_username_is_bot(revision, bots, xml_namespace)
+        editor['bot'] = bot
+
     if user_id != None:
         editor.update(user_id)
     else:
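
The change makes the bot flag optional: determine_username_is_bot is only called, and the 'bot' key only added, when a bots lookup was actually passed in, so downstream code has to treat the key as optional. A minimal sketch of that behaviour; build_editor and the is_bot callable are stand-ins for illustration, not functions from variables.py, which works on a revision element and an XML namespace.

# Sketch only: build_editor and is_bot are hypothetical stand-ins for the
# extract/determine helpers in variables.py.
def build_editor(username, user_id, bots, is_bot):
    editor = {'username': username}
    if bots:
        # Only flag bots when a bot lookup is available; otherwise the key is absent.
        editor['bot'] = is_bot(username, bots)
    if user_id is not None:
        editor.update(user_id)
    return editor

is_bot = lambda username, bots: username in bots
print(build_editor('ExampleBot', {'id': 42}, set(['ExampleBot']), is_bot))
print(build_editor('ExampleBot', {'id': 42}, None, is_bot))
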
Index: trunk/tools/editor_trends/etl/differ.py
@@ -24,7 +24,10 @@
 import os
 import difflib
 from xml.etree.cElementTree import iterparse, dump
+from multiprocessing import JoinableQueue, Process, cpu_count
+from datetime import datetime
 
+
 if '..' not in sys.path:
     sys.path.append('../')
 
@@ -206,14 +209,39 @@
 
     return buffer.getvalue()
 
+def stream_raw_xml(input_queue, process_id, rts, format):
+    '''
+    This function fetches an XML file from the queue and launches the processor.
+    '''
+    t0 = datetime.now()
+    file_id = 0
 
+    while True:
+        filename = input_queue.get()
+        input_queue.task_done()
+        if filename == None:
+            print '%s files left in the queue' % input_queue.qsize()
+            break
+
+        print filename
+        fh = file_utils.create_streaming_buffer(filename)
+        parse_xml(fh, format, process_id, rts.input_location)
+        fh.close()
+
+        t1 = datetime.now()
+        print 'Worker %s: Processing of %s took %s' % (process_id, filename, (t1 - t0))
+        print 'There are %s files left in the queue' % (input_queue.qsize())
+        t0 = t1
+
+
+
 def launcher(rts):
     '''
     This function initializes the multiprocessor, and loading the queue with
     the compressed XML files.
     '''
     input_queue = JoinableQueue()
-
+    format = 'json'
     files = file_utils.retrieve_file_list(rts.input_location)
 
     if len(files) > cpu_count():
@@ -221,8 +249,6 @@
     else:
         processors = len(files)
 
-    fhd = buffer.FileHandleDistributor(rts.max_filehandles, processors)
-
     for filename in files:
         filename = os.path.join(rts.input_location, filename)
         print filename
@@ -233,7 +259,7 @@
         input_queue.put(None)
 
     extracters = [Process(target=stream_raw_xml, args=[input_queue, process_id,
-                                                       fhd, rts])
+                                                       rts, format])
                   for process_id in xrange(processors)]
    for extracter in extracters:
        extracter.start()
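
Taken together, launcher() and stream_raw_xml() follow the standard multiprocessing producer-consumer pattern: the parent process fills a JoinableQueue with the chunk filenames plus one None sentinel per worker, and every worker keeps pulling filenames until it sees a sentinel. A self-contained sketch of the same pattern, with the actual parsing replaced by a print so it can be run as-is; file_utils.retrieve_file_list and parse_xml are what the real worker delegates to.

from multiprocessing import JoinableQueue, Process, cpu_count


def worker(queue, process_id):
    # Pull filenames until the None sentinel arrives.
    while True:
        filename = queue.get()
        queue.task_done()
        if filename is None:
            break
        print('worker %s would parse %s' % (process_id, filename))


def launch(files):
    queue = JoinableQueue()
    # Same sizing rule as launcher(): never start more workers than there are files.
    processors = min(cpu_count(), len(files))
    for filename in files:
        queue.put(filename)
    for _ in range(processors):
        queue.put(None)  # one sentinel per worker
    workers = [Process(target=worker, args=(queue, process_id))
               for process_id in range(processors)]
    for w in workers:
        w.start()
    queue.join()  # blocks until task_done() has been called for every item
    for w in workers:
        w.join()


if __name__ == '__main__':
    launch(['enwiki-chunk-1.xml.gz', 'enwiki-chunk-2.xml.gz', 'enwiki-chunk-3.xml.gz'])
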
@@ -249,769 +275,41 @@
250276 format = 'json'
251277 for filename in files:
252278 fh = file_utils.create_streaming_buffer(os.path.join(location, filename))
253 - #fh = codecs.open(os.path.join(location, filename), 'r', 'utf-8')
254279 parse_xml(fh, format, process_id, output_location)
255280 fh.close()
256281
257282
258283 def debug():
259284 str1 = """
260 - '''Welcome to Wikilytics !
261 -'''
262 -== Background ==
263 -This package offers a set of tools used to create datasets to analyze Editor
264 -Trends. By Editor Trends we refer to the overall pattern of entering and leaving
265 -a Wikipedia site. The main information source for this package is [[:strategy:Editor Trends Study|Editor Trends Study]]
 285+ '''Welcome to Wikilytics !
 286+ '''
 287+ == Background ==
 288+ This package offers a set of tools used to create datasets to analyze Editor
 289+ Trends. By Editor Trends we refer to the overall pattern of entering and leaving
 290+ a Wikipedia site. The main information source for this package is [[:strategy:Editor Trends Study|Editor Trends Study]]
 291+
 292+ == High-level Overview Editor Trends Analytics ==
 293+ """.splitlines(1)
266294
267 -== High-level Overview Editor Trends Analytics ==
268 -
269 -The Python scripts to create the dataset to answer the question '''“Which editors are the ones that are leaving - -are they the new editors or the more tenured ones?”''' consists of three separate phases:
270 -* Chunk the XML dump file in smaller parts
271 -** and discard all non-zero namespace revisions.
272 -* Parse XML chunks by taking the following steps:
273 -** read XML chunk
274 -** construct XML DOM
275 -** iterate over each article in XML DOM
276 -** iterate over each revision in each article
277 -** extract from each revision
278 -*** username id
279 -*** date edit
280 -*** article id
281 -** determine if username belongs to bot, discard information if yes
282 -** store data in MongoDB
283 -* Create dataset from MongoDB database
284 -** Create list with unique username id’s
285 -** Loop over each id
286 -*** determine year of first edit
287 -*** determine year of last edit
288 -*** count total number of edits by year
289 -*** sort edits by date and keep first 10 edits
290 -** Write to CSV file.
291 -
292 -== Schema of Editor Trends Database ==
293 -Each person who has contributed to Wikipedia has it's own document in the [http://www.mongodb.org MongoDB]. A document is a bit similar to a row in a [http://en.wikipedia.org/wiki/SQL SQL] database but there are important differences. The document has the following structure:
294 -
295 -<source lang='javascript'>
296 -{'editor': id,
297 - 'year_joined': year,
298 - 'new_wikipedian': True,
299 - 'total_edits': n,
300 - 'edits': {
301 - 'date': date,
302 - 'article': article_id,
303 - }
304 -}
305 -</source>
306 -The edits variable is a sub document containing all the edits made by that person. The edits variable is date sorted, so the first observation is the first edit made by that person while the last observation is the final edit made by that person. This structure allows for quickly querying
307 -the database:
308 -
309 -<pre>
310 -use enwiki
311 -editors = enwiki['editors']
312 -enwiki.editors.find_one({'editor': '35252'}, {'edits': 1})[0]
313 -</pre>
314 -
315 -
316 -Because we know that each editor has their own document, we do not need to scan the entire table to find all relevant matches. Hence, we can use the find_one() function which results in considerable speed improvements.
317 -
318 -== Installation ==
319 -
320 -=== Step-by-Step Movie Tutorial ===
321 -There is a online tutorial available at [http://vimeo.com/16850312 Vimeo]. You cannot install Editor Trends toolkit on OSX at the moment, I will try to code around some OSX restrictions regarding multiprocessing.
322 -
323 -=== Dependencies ===
324 -
325 -Follow the next steps if you would like to replicate the analysis on a Wikipedia of your choice.
326 -
327 -# Download and install [http://www.mongodb.com MongoDB], preferably the 64 bit version.
328 -# Download and install [http://www.python.org/download Python] 2.6 or 2.7 (The code is not Python 3 compliant and it has not been tested using Python < 2.6)
329 -#: Linux users may need to install the packages python-argparse, python-progressbar and pymongo if that functionality is not installed by default with python.
330 -# Download and install [http://www.sliksvn.com/en/download Subversion] client
331 -# Depending on your platform make sure you have one of the following extraction utilities installed:
332 -:* Windows: [http://www.7zip.com 7zip]
333 -:* Linux: tar (should be installed by default)
334 -
335 -To verify that you have installed the required dependencies, do the following:
336 -<pre>
337 -<prompt>:: mongo
338 -MongoDB shell version: 1.6.3
339 -connecting to: test
340 -<prompt> (in mongo shell) exit
341 -
342 -<prompt>:: python
343 -Python 2.6.2 (r262:71605, Apr 14 2009, 22:40:02) [MSC v.1500 32 bit (Intel)] on
344 -win32
345 -Type "help", "copyright", "credits" or "license" for more information.
346 -<prompt> (in python) exit()
347 -
348 -<prompt>:: 7z or tar (depending on your platform)
349 -7-Zip [64] 4.65 Copyright (c) 1999-2009 Igor Pavlov 2009-02-03
350 -
351 -<prompt>:: svn
352 -
353 -</pre>
354 -Output on the console might look different depending on your OS and installed version.
355 -
356 -'''For Windows Users, add the following directories to the path'''
357 -<pre>c:\python26;c:\python26\scripts;c:\mongodb\bin;</pre>
358 -
359 -To finish the Mongodb configuration, do the following:
360 -<pre>
361 -cd \
362 -mkdir data
363 -mkdir data\db
364 -cd \mongodb\bin
365 -mongod --install --logpath c:\mongodb\logs
366 -net start mongodb
367 -</pre>
368 -
369 -Prepare your Python environment by taking the following steps:
370 -1 Check whether easy_install is installed by issuing the command:
371 -<pre>
372 -easy_install
373 -</pre>
374 -If easy_install is not installed then enter the following command:
375 -<pre>
376 -sudo apt-get install python-setuptools
377 -</pre>
378 -2 Check whether virtualenv is installed by the issuing the following command:
379 -<pre>
380 -virtualenv
381 -</pre>
382 -If virtualenv is not installed enter this command:
383 -<pre>
384 -sudo easy_install virtualenv
385 -</pre>
386 -Go to the directory where you want to install your virtual Python, it's okay to go to the parent directory of editor_trends. Then, issue this command:
387 -<pre>
388 -virtualenv editor_trends
389 -</pre>
390 -This will copy the Python executable and libraries to editor_trends/bin and editor_trends/libs
391 -Now, we have to activate our virtual Python:
392 -<pre>
393 -source bin/activate
394 -</pre>
395 -You will see that your command prompt has changed to indicate that you are working with the virtual Python installation instead of working with the systems default installation.
396 -If you now install dependencies then these dependencies will be installed in your virtual Python installation instead of in the system Python installation. This will keep everybody happy.
397 -Finally, enter the following commands:
398 -<pre>
399 -easy_install progressbar
400 -easy_install pymongo
401 -easy_install argparse
402 -easy_install python-dateutil
403 -easy_install texttable
404 -</pre>
405 -Python is installed and you are ready to go!
406 -
407 -If everything is running, then you are ready to go.
408 -==== Important MongoDB Notes ====
409 -If you decide to use MongoDB to store the results then you have to install the
410 -64-bit version. 32-bit versions of MongoDB are limited to 2GB of data and the
411 -databases created by this package will definitely be larger than that. For more
412 -background information on this limitation, please read [http://blog.mongodb.org/post/137788967/32-bit-limitations MongoDB 32-bit limitations]
413 -
414 -=== Install Editor Trend Analytics ===
415 -First, download Editor Trend Analytics
416 -* Windows: svn checkout http://svn.wikimedia.org/svnroot/mediawiki/trunk/tools/editor_trends/ editor_trends
417 -* Linux: svn checkout http://svn.wikimedia.org/svnroot/mediawiki/trunk/tools/editor_trends/ editor_trends
418 -
419 -=== Getting started ===
420 -By now, you should have Editor Trend Analytics up and running. The first thing you need to do is to download a Wikipedia dump file.
421 -<blockquote>From now on, I'll assume that you are locate in the directory where you installed Editor Trend Analytics.</blockquote>
422 -
423 -==== Download Wikipedia dump file ====
424 -To download a dump file enter the following command:
425 -<pre>
426 -python manage.py download
427 -</pre>
428 -You can also specify the language (either using the English name or the local name) of the Wikipedia project that you would like to analyze:
429 -<pre>
430 -python manage.py -l Spanish download
431 -python manage.py -l Español download
432 -</pre>
433 -Or, if you want to download a non Wikipedia dump file, enter the following command:
434 -<pre>
435 -python manage.py -l Spanish download {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary}
436 -</pre>
437 -
438 -To obtain a list of all supported languages, enter:
439 -<pre>
440 -manage show_languages
441 -</pre>
442 -or to obtain all languages starting with 'x', enter:
443 -<pre>
444 -python manage.py show_languages --first x
445 -</pre>
446 -
447 -
448 -==== Extract Wikipedia dump file ====
449 -'''WARNING''': This process might take hours to days, depending on the configuration of your system.
450 -The Wikipedia dump file is extracted and split into smaller chunks to speed up the processing. Enter the following command:
451 -<pre>
452 -python manage.py extract (for extracting data from the Wikipedia dump file and storing it in smaller chunks)
453 -</pre>
454 -or, for one of the other Wikimedia projects, enter
455 -<pre>
456 -python manage.py -l Spanish -p commons extract
457 -</pre>
458 -Valid project choices are: {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary}
459 -
460 -'''Note:''' The extract process may need to be run twice. Once to unzip the dump file, then again to extract the data from the dump file.
461 -
462 -
463 -==== Sort Wikipedia dump file ====
464 -'''WARNING''': This process might take a few hours.
465 -The chunks must be sorted before being added to the MongoDB. Enter the following command:
466 -<pre>
467 -python manage.py sort (for sorting the chunks as generated by the 'manage extract' step)
468 -</pre>
469 -or, for one of the other Wikimedia projects, enter
470 -<pre>
471 -python manage.py -l Spanish sort {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary}
472 -</pre>
473 -
474 -
475 -==== Store Wikipedia dump file ====
476 -'''WARNING''': This process might take hours to days, depending on the configuration of your system.
477 -Now, we are ready to extract the required information from the Wikipedia dump file chunks and store it in the MongoDB. Enter the following command:
478 -<pre>
479 -python manage.py store
480 -python manage.py -l Spanish store
481 -</pre>
482 -or, for one of the other Wikimedia projects, enter
483 -<pre>
484 -python manage.py -l Spanish store {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary}
485 -</pre>
486 -
487 -==== Transform dataset ====
488 -'''WARNING''': This process might take a couple of hours.
489 -Finally, the raw data needs to be transformed in useful variables. Issue the following command:
490 -<pre>
491 -python manage.py transform
492 -python manage.py -l Spanish transform
493 -</pre>
494 -
495 -==== Create dataset ====
496 -'''WARNING''': This process might take a couple of hours to days depending on the configuration of your computer.
497 -We are almost there, the data is in the database and now we need to export the data to a [[:en:CSV|CSV]] file so we can import it using a statistical program such as [[:en:R (programming language)]], [[:en:Stata]] or [[:en:SPSS]].
498 -
499 -Enter the following command:
500 -<pre>
501 -python manage.py dataset
502 -python manage.py -l Spanish dataset
503 -</pre>
504 -or, for one of the other Wikimedia projects, enter
505 -<pre>
506 -manage -l Spanish {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary} dataset
507 -</pre>
508 -
509 -==== Everything in one shot ====
510 -'''WARNING''': This process might take a couple of days or even more than a week depending on the configuration of your computer.
511 -If you don't feel like monitoring your computer and you just want to create a dataset from scratch, enter the following command:
512 -<pre>
513 -python manage.py all language
514 -python manage.py -l Spanish all
515 -</pre>
516 -<pre>
517 -python manage.py -p {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary} all
518 -</pre>
519 -
520 -
521 -=== Benchmarks ===
522 -{| border=0
523 - |+ ''Benchmark German Wiki''
524 -|-
525 - ! Task
526 - ! Configuration 1
527 - ! Configuration 2
528 -|-
529 - | Download
530 - |
531 - | 1 minute 14 seconds
532 -|-
533 - | Extract
534 - |
535 - | 4-6 hours
536 -|-
537 - | Sort
538 - |
539 - | ~30 minutes
540 -|-
541 - | Store
542 - |
543 - | 4-5 hours
544 -|-
545 - | Transform
546 - |
547 - | 2-3 hours
548 -|-
549 - | Total time
550 - |
551 - | 10-14 hours
552 -
553 -|}
554 -
555 -
556 -{| border=0
557 - |+ ''Benchmark English Wiki''
558 -|-
559 - ! Task
560 - ! Configuration 1
561 - ! Configuration 2
562 -|-
563 - | Download
564 - |
565 - | 15 minutes
566 -|-
567 - | Extract
568 - |
569 - | ~36 hours
570 -|-
571 - | Sort
572 - |
573 - | 10.5 hours
574 -|-
575 - | Store
576 - |
577 - | 21 hours
578 -|-
579 - | Transform
580 - |
581 - | 14.3 hours
582 -|-
583 - | Total time
584 - |
585 - | 3.4 days
586 -
587 -|}
588 -
589 -
590 -{| width="300" border="1"
591 - |+ ''Benchmark Hungarian Wiki''
592 -|-
593 - ! Task
594 - ! Configuration 3
595 -|-
596 - | Download
597 - | 1-2 minutes
598 -|-
599 - | Extract
600 - | 24.5 minutes
601 -|-
602 - | Sort
603 - | 1.5 minutes
604 -|-
605 - | Store
606 - | 7-8 minutes
607 -|-
608 - | Transform
609 - | 11 minutes
610 -|-
611 - | Total time
612 - | ~45 minutes
613 -|}
614 -
615 -
616 -;Configuration 2
617 -''Amazon Web Services Large EC2 Instance''
618 -* Ubuntu 64-bit
619 -* 4 EC2 Compute Units (2 virtual cores)
620 -* 7.5GB memory
621 -* 850GB storage
622 -
623 -;Configuration 3
624 -* Win7 64 bit
625 -* Intel i7 CPU (8 virtual core)
626 -* 6GB memory
627 -* 1TB storage
628 -* 100/100Mb/s internet connection
629 -
630 -
631 -[[Category:Editor Trends Study]]
632 -""".splitlines(1)
633 -
634295 str2 = """
635 -Welcome to '''Wikilytics''', a free and open source software toolkit for doing analysis of editing trends in Wikipedia and other Wikimedia projects.
 296+ Welcome to '''Wikilytics''', a free and open source software toolkit for doing analysis of editing trends in Wikipedia and other Wikimedia projects.
 297+
 298+ == Background ==
 299+ This package offers a set of tools used to create datasets to analyze editing trends. It was first created expressly for the [[:strategy:Editor Trends Study|Editor Trends Study]], but is well-suited to a variety of research into editing trends. It is thus free to use (as in beer and freedom) if you're interested in expanding on the [[:strategy:Editor Trends Study/Results|results of Editor Trend Study]] or if you'd like to participate in other [[Research/Projects|research projects]].
 300+
 301+ == High-level Overview Editor Trends Analytics ==
 302+
 303+ The Python scripts to create the dataset to answer the question '''“Which editors are the ones that are leaving - -are they the new editors or the more tenured ones?”''' consists of three separate phases:
 304+ * Chunk the XML dump file in smaller parts
 305+ ** and discard all non-zero namespace revisions.
 306+ """.splitlines(1)
636307
637 -== Background ==
638 -This package offers a set of tools used to create datasets to analyze editing trends. It was first created expressly for the [[:strategy:Editor Trends Study|Editor Trends Study]], but is well-suited to a variety of research into editing trends. It is thus free to use (as in beer and freedom) if you're interested in expanding on the [[:strategy:Editor Trends Study/Results|results of Editor Trend Study]] or if you'd like to participate in other [[Research/Projects|research projects]].
639 -
640 -== High-level Overview Editor Trends Analytics ==
641 -
642 -The Python scripts to create the dataset to answer the question '''“Which editors are the ones that are leaving - -are they the new editors or the more tenured ones?”''' consists of three separate phases:
643 -* Chunk the XML dump file in smaller parts
644 -** and discard all non-zero namespace revisions.
645 -* Parse XML chunks by taking the following steps:
646 -** read XML chunk
647 -** construct XML DOM
648 -** iterate over each article in XML DOM
649 -** iterate over each revision in each article
650 -** extract from each revision
651 -*** username id
652 -*** date edit
653 -*** article id
654 -** determine if username belongs to bot, discard information if yes
655 -** store data in MongoDB
656 -* Create dataset from MongoDB database
657 -** Create list with unique username id’s
658 -** Loop over each id
659 -*** determine year of first edit
660 -*** determine year of last edit
661 -*** count total number of edits by year
662 -*** sort edits by date and keep first 10 edits
663 -** Write to CSV file.
664 -
665 -== Schema of Editor Trends Database ==
666 -Each person who has contributed to Wikipedia has it's own document in the [http://www.mongodb.org MongoDB]. A document is a bit similar to a row in a [http://en.wikipedia.org/wiki/SQL SQL] database but there are important differences. The document has the following structure:
667 -
668 -<source lang='javascript'>
669 -{'editor': id,
670 - 'year_joined': year,
671 - 'new_wikipedian': True,
672 - 'total_edits': n,
673 - 'edits': {
674 - 'date': date,
675 - 'article': article_id,
676 - }
677 -}
678 -</source>
679 -The edits variable is a sub document containing all the edits made by that person. The edits variable is date sorted, so the first observation is the first edit made by that person while the last observation is the final edit made by that person. This structure allows for quickly querying
680 -the database:
681 -
682 -<pre>
683 -use wikilitycs
684 -db.editors_dataset.find_one({'editor': '35252'}, {'edits': 1})
685 -</pre>
686 -
687 -
688 -Because we know that each editor has their own document, we do not need to scan the entire table to find all relevant matches. Hence, we can use the find_one() function which results in considerable speed improvements.
689 -
690 -== Installation ==
691 -
692 -=== Step-by-Step Movie Tutorial ===
693 -There is a online tutorial available at [http://vimeo.com/16850312 Vimeo]. You cannot install Editor Trends toolkit on OSX at the moment, I will try to code around some OSX restrictions regarding multiprocessing.
694 -
695 -=== Dependencies ===
696 -
697 -Follow the next steps if you would like to replicate the analysis on a Wikipedia of your choice.
698 -
699 -# Download and install [http://www.mongodb.com MongoDB], preferably the 64 bit version.
700 -# Download and install [http://www.python.org/download Python] 2.6 or 2.7 (The code is not Python 3 compliant and it has not been tested using Python < 2.6)
701 -#: Linux users may need to install the packages python-argparse, python-progressbar and pymongo if that functionality is not installed by default with python.
702 -# Download and install [http://www.sliksvn.com/en/download Subversion] client
703 -# Depending on your platform make sure you have one of the following extraction utilities installed:
704 -:* Windows: [http://www.7zip.com 7zip]
705 -:* Linux: tar (should be installed by default)
706 -
707 -To verify that you have installed the required dependencies, do the following:
708 -<pre>
709 -<prompt>:: mongo
710 -MongoDB shell version: 1.6.3
711 -connecting to: test
712 -<prompt> (in mongo shell) exit
713 -
714 -<prompt>:: python
715 -Python 2.6.2 (r262:71605, Apr 14 2009, 22:40:02) [MSC v.1500 32 bit (Intel)] on
716 -win32
717 -Type "help", "copyright", "credits" or "license" for more information.
718 -<prompt> (in python) exit()
719 -
720 -<prompt>:: 7z or tar (depending on your platform)
721 -7-Zip [64] 4.65 Copyright (c) 1999-2009 Igor Pavlov 2009-02-03
722 -
723 -<prompt>:: svn
724 -
725 -</pre>
726 -Output on the console might look different depending on your OS and installed version.
727 -
728 -'''For Windows Users, add the following directories to the path'''
729 -<pre>c:\python26;c:\python26\scripts;c:\mongodb\bin;</pre>
730 -
731 -To finish the Mongodb configuration, do the following:
732 -<pre>
733 -cd \
734 -mkdir data
735 -mkdir data\db
736 -cd \mongodb\bin
737 -mongod --install --logpath c:\mongodb\logs
738 -net start mongodb
739 -</pre>
740 -
741 -Prepare your Python environment by taking the following steps:
742 -1 Check whether easy_install is installed by issuing the command:
743 -<pre>
744 -easy_install
745 -</pre>
746 -If easy_install is not installed then enter the following command:
747 -<pre>
748 -sudo apt-get install python-setuptools
749 -</pre>
750 -2 Check whether virtualenv is installed by the issuing the following command:
751 -<pre>
752 -virtualenv
753 -</pre>
754 -If virtualenv is not installed enter this command:
755 -<pre>
756 -sudo easy_install virtualenv
757 -</pre>
758 -Go to the directory where you want to install your virtual Python, it's okay to go to the parent directory of editor_trends. Then, issue this command:
759 -<pre>
760 -virtualenv editor_trends
761 -</pre>
762 -This will copy the Python executable and libraries to editor_trends/bin and editor_trends/libs
763 -Now, we have to activate our virtual Python:
764 -<pre>
765 -source bin/activate
766 -</pre>
767 -You will see that your command prompt has changed to indicate that you are working with the virtual Python installation instead of working with the systems default installation.
768 -If you now install dependencies then these dependencies will be installed in your virtual Python installation instead of in the system Python installation. This will keep everybody happy.
769 -Finally, enter the following commands:
770 -<pre>
771 -easy_install progressbar
772 -easy_install pymongo
773 -easy_install argparse
774 -easy_install python-dateutil
775 -easy_install texttable
776 -</pre>
777 -Python is installed and you are ready to go!
778 -
779 -If everything is running, then you are ready to go.
780 -==== Important MongoDB Notes ====
781 -If you decide to use MongoDB to store the results then you have to install the
782 -64-bit version. 32-bit versions of MongoDB are limited to 2GB of data and the
783 -databases created by this package will definitely be larger than that. For more
784 -background information on this limitation, please read [http://blog.mongodb.org/post/137788967/32-bit-limitations MongoDB 32-bit limitations]
785 -
786 -=== Install Editor Trend Analytics ===
787 -First, download Editor Trend Analytics
788 -* Windows: svn checkout http://svn.wikimedia.org/svnroot/mediawiki/trunk/tools/editor_trends/ editor_trends
789 -* Linux: svn checkout http://svn.wikimedia.org/svnroot/mediawiki/trunk/tools/editor_trends/ editor_trends
790 -
791 -=== Getting started ===
792 -By now, you should have Editor Trend Analytics up and running. The first thing you need to do is to download a Wikipedia dump file.
793 -<blockquote>From now on, I'll assume that you are locate in the directory where you installed Editor Trend Analytics.</blockquote>
794 -
795 -==== Download Wikipedia dump file ====
796 -To download a dump file enter the following command:
797 -<pre>
798 -python manage.py download
799 -</pre>
800 -You can also specify the language (either using the English name or the local name) of the Wikipedia project that you would like to analyze:
801 -<pre>
802 -python manage.py -l Spanish download
803 -python manage.py -l Español download
804 -</pre>
805 -Or, if you want to download a non Wikipedia dump file, enter the following command:
806 -<pre>
807 -python manage.py -l Spanish download {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary}
808 -</pre>
809 -
810 -To obtain a list of all supported languages, enter:
811 -<pre>
812 -manage show_languages
813 -</pre>
814 -or to obtain all languages starting with 'x', enter:
815 -<pre>
816 -python manage.py show_languages --first x
817 -</pre>
818 -
819 -
820 -==== Extract Wikipedia dump file ====
821 -'''WARNING''': This process might take hours to days, depending on the configuration of your system.
822 -The Wikipedia dump file is extracted and split into smaller chunks to speed up the processing. Enter the following command:
823 -<pre>
824 -python manage.py extract (for extracting data from the Wikipedia dump file and storing it in smaller chunks)
825 -</pre>
826 -or, for one of the other Wikimedia projects, enter
827 -<pre>
828 -python manage.py -l Spanish -p commons extract
829 -</pre>
830 -Valid project choices are: {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary}
831 -
832 -'''Note:''' The extract process may need to be run twice. Once to unzip the dump file, then again to extract the data from the dump file.
833 -
834 -
835 -==== Sort Wikipedia dump file ====
836 -'''WARNING''': This process might take a few hours.
837 -The chunks must be sorted before being added to the MongoDB. Enter the following command:
838 -<pre>
839 -python manage.py sort (for sorting the chunks as generated by the 'manage extract' step)
840 -</pre>
841 -or, for one of the other Wikimedia projects, enter
842 -<pre>
843 -python manage.py -l Spanish sort {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary}
844 -</pre>
845 -
846 -
847 -==== Store Wikipedia dump file ====
848 -'''WARNING''': This process might take hours to days, depending on the configuration of your system.
849 -Now, we are ready to extract the required information from the Wikipedia dump file chunks and store it in the MongoDB. Enter the following command:
850 -<pre>
851 -python manage.py store
852 -python manage.py -l Spanish store
853 -</pre>
854 -or, for one of the other Wikimedia projects, enter
855 -<pre>
856 -python manage.py -l Spanish store {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary}
857 -</pre>
858 -
859 -==== Transform dataset ====
860 -'''WARNING''': This process might take a couple of hours.
861 -Finally, the raw data needs to be transformed in useful variables. Issue the following command:
862 -<pre>
863 -python manage.py transform
864 -python manage.py -l Spanish transform
865 -</pre>
866 -
867 -==== Create dataset ====
868 -'''WARNING''': This process might take a couple of hours to days depending on the configuration of your computer.
869 -We are almost there, the data is in the database and now we need to export the data to a [[:en:CSV|CSV]] file so we can import it using a statistical program such as [[:en:R (programming language)]], [[:en:Stata]] or [[:en:SPSS]].
870 -
871 -Enter the following command:
872 -<pre>
873 -python manage.py dataset
874 -python manage.py -l Spanish dataset
875 -</pre>
876 -or, for one of the other Wikimedia projects, enter
877 -<pre>
878 -manage -l Spanish {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary} dataset
879 -</pre>
880 -
881 -==== Everything in one shot ====
882 -'''WARNING''': This process might take a couple of days or even more than a week depending on the configuration of your computer.
883 -If you don't feel like monitoring your computer and you just want to create a dataset from scratch, enter the following command:
884 -<pre>
885 -python manage.py all language
886 -python manage.py -l Spanish all
887 -</pre>
888 -<pre>
889 -python manage.py -p {commons|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikitionary} all
890 -</pre>
891 -
892 -
893 -=== Benchmarks ===
894 -{| border=0
895 - |+ ''Benchmark German Wiki''
896 -|-
897 - ! Task
898 - ! Configuration 1
899 - ! Configuration 2
900 -|-
901 - | Download
902 - |
903 - | 1 minute 14 seconds
904 -|-
905 - | Extract
906 - |
907 - | 4-6 hours
908 -|-
909 - | Sort
910 - |
911 - | ~30 minutes
912 -|-
913 - | Store
914 - |
915 - | 4-5 hours
916 -|-
917 - | Transform
918 - |
919 - | 2-3 hours
920 -|-
921 - | Total time
922 - |
923 - | 10-14 hours
924 -
925 -|}
926 -
927 -
928 -{| border=0
929 - |+ ''Benchmark English Wiki''
930 -|-
931 - ! Task
932 - ! Configuration 1
933 - ! Configuration 2
934 -|-
935 - | Download
936 - |
937 - | 15 minutes
938 -|-
939 - | Extract
940 - |
941 - | ~36 hours
942 -|-
943 - | Sort
944 - |
945 - | 10.5 hours
946 -|-
947 - | Store
948 - |
949 - | 21 hours
950 -|-
951 - | Transform
952 - |
953 - | 14.3 hours
954 -|-
955 - | Total time
956 - |
957 - | 3.4 days
958 -
959 -|}
960 -
961 -
962 -{| width="300" border="1"
963 - |+ ''Benchmark Hungarian Wiki''
964 -|-
965 - ! Task
966 - ! Configuration 3
967 -|-
968 - | Download
969 - | 1-2 minutes
970 -|-
971 - | Extract
972 - | 24.5 minutes
973 -|-
974 - | Sort
975 - | 1.5 minutes
976 -|-
977 - | Store
978 - | 7-8 minutes
979 -|-
980 - | Transform
981 - | 11 minutes
982 -|-
983 - | Total time
984 - | ~45 minutes
985 -|}
986 -
987 -
988 -;Configuration 2
989 -''Amazon Web Services Large EC2 Instance''
990 -* Ubuntu 64-bit
991 -* 4 EC2 Compute Units (2 virtual cores)
992 -* 7.5GB memory
993 -* 850GB storage
994 -
995 -;Configuration 3
996 -* Win7 64 bit
997 -* Intel i7 CPU (8 virtual core)
998 -* 6GB memory
999 -* 1TB storage
1000 -* 100/100Mb/s internet connection
1001 -
1002 -==See also==
1003 -* [[Wikilytics Dataset]]
1004 -* [[Wikilytics Plugins]]
1005 -
1006 -[[Category:Wikilytics]]
1007 -
1008 -""".splitlines(1)
1009 -
1010308 diff = difflib.unified_diff(str1, str2, n=0, lineterm='')
1011309 for line in diff:
1012310 if len(line) > 3:
1013311 print line
1014 -# print result
1015312
 313+
1016314 if __name__ == '__main__':
1017315 launcher_simple()
1018316 #debug()
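
The debug() helper now keeps only a short sample of the two wiki pages and diffs them with difflib.unified_diff. A standalone illustration of that call with made-up two-line samples; n=0 suppresses context lines and lineterm='' stops difflib from appending newlines to the control lines.

import difflib

str1 = "== Background ==\nOld introduction line.\nShared line.\n".splitlines(True)
str2 = "== Background ==\nNew introduction line.\nShared line.\n".splitlines(True)

for line in difflib.unified_diff(str1, str2, n=0, lineterm=''):
    if len(line) > 3:  # same minimum-length filter as debug()
        print(line)
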
Index: trunk/tools/editor_trends/classes/exceptions.py
@@ -90,6 +90,14 @@
         return '''You need either to install Mongo or Cassandra to use
         Wikiltyics.'''
 
+class OutputNotSupported(Exception):
+    def __init__(self, format):
+        super(OutputNotSupported, self).__init_()
+        self.format = format
+
+    def __str__(self):
+        return '''Output format %s is not supported.''' % format
+
 class UnknownPluginError(Exception):
     '''Exception to notify the user that the requested plugin does not exist.'''
     def __init__(self, plugin, plugins):
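
As committed, OutputNotSupported.__init__ calls __init_() (missing the trailing underscore), which would raise an AttributeError when the exception is constructed, and __str__ interpolates the global name format rather than self.format, so the message would not show the offending value. A corrected sketch of what the class appears to intend; the list of supported formats in the usage example is purely illustrative.

class OutputNotSupported(Exception):
    '''Raised when the requested output format has no writer.'''
    def __init__(self, format):
        super(OutputNotSupported, self).__init__()
        self.format = format

    def __str__(self):
        return 'Output format %s is not supported.' % self.format


try:
    output_format = 'xml'
    if output_format not in ('json', 'csv'):  # illustrative list of writers
        raise OutputNotSupported(output_format)
except OutputNotSupported as e:
    print(e)
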
Index: trunk/tools/editor_trends/classes/runtime_settings.py
@@ -72,11 +72,13 @@
         self.dataset = os.path.join(self.dataset_location, self.project.name)
         self.txt = os.path.join(self.output_location, 'txt')
         self.sorted = os.path.join(self.output_location, 'sorted')
+        self.diffs = os.path.join(self.output_location, 'diffs')
 
         self.directories = [self.output_location,
                             self.txt,
                             self.sorted,
-                            self.dataset]
+                            self.dataset,
+                            self.diffs]
         self.verify_environment(self.directories)
 
         #Wikidump file related variables
@@ -88,6 +90,7 @@
         self.editors_raw = '%s%s_editors_raw' % (self.language.code, self.project.name)
         self.editors_dataset = '%s%s_editors_dataset' % (self.language.code, self.project.name)
         self.articles_raw = '%s%s_articles_raw' % (self.language.code, self.project.name)
+        self.diffs_dataset = '%s%s_diffs_dataset' % (self.language.code, self.project.name)
 
 
 
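
The runtime settings now carry a diffs output directory and a per-project diffs_dataset collection name, both registered before verify_environment() creates any missing directories. A reduced, runnable sketch of that bookkeeping; the class name, constructor arguments and the os.makedirs-based verify_environment here are simplifications of the real runtime_settings class, which derives these values from the run-time configuration.

import os


class Settings(object):
    # Trimmed stand-in for the runtime settings object: only the paths and
    # collection names touched by this revision.
    def __init__(self, output_location, language_code, project_name):
        self.output_location = output_location
        self.txt = os.path.join(output_location, 'txt')
        self.sorted = os.path.join(output_location, 'sorted')
        self.diffs = os.path.join(output_location, 'diffs')  # new in this revision
        self.diffs_dataset = '%s%s_diffs_dataset' % (language_code, project_name)
        self.verify_environment([self.output_location, self.txt,
                                 self.sorted, self.diffs])

    def verify_environment(self, directories):
        # Approximation of the real verify_environment(): create whatever is missing.
        for directory in directories:
            if not os.path.exists(directory):
                os.makedirs(directory)


settings = Settings('output', 'en', 'wiki')
print('%s -> %s' % (settings.diffs, settings.diffs_dataset))
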