Index: trunk/tools/editor_trends/.svn_ignore
===================================================================
@@ -1,12 +0,0 @@
-*.pyc
-*.xml
-*.db
-*.bin
-*.zip
-*.csv
-.*
-zips/
-wikistats/
-datasets/
-data/
-notes.txt
\ No newline at end of file
Property changes on: trunk/tools/editor_trends/namespaces
___________________________________________________________________
Added: svn:ignore
   + wikistats
zips
notes.txt
*.pyc
*.xml
*.db
*.bin
*.zip
*.csv
datasets
errors
.settings
.project
.pydevproject
wiki.cfg
fabric.py
fabfile.py
deployment
data
Index: trunk/tools/editor_trends/analyses/cohort_confidence_intervals.py
===================================================================
@@ -1,49 +1,49 @@
-__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
-__author__email = 'dvanliere at gmail dot com'
-__date__ = '2010-11-24'
-__version__ = '0.1'
-
-import sys
-sys.path.append('..')
-
-import configuration
-settings = configuration.Settings()
-from utils import utils
-from database import db
-
-
-#def dataset_edits_by_month(dbname, **kwargs):
-#    dbname = kwargs.pop('dbname')
-#    mongo = db.init_mongo_db(dbname)
-#    editors = mongo['dataset']
-#    name = dbname + '_edits_by_month.csv'
-#    fh = utils.create_txt_filehandle(settings.dataset_location, name, 'w', settings.encoding)
-#    x = 0
-#    vars_to_expand = ['monthly_edits']
-#    while True:
-#        try:
-#            id = input_queue.get(block=False)
-#            print input_queue.qsize()
-#            obs = editors.find_one({'editor': id})
-#            obs = expand_observations(obs, vars_to_expand)
-#            if x == 0:
-#                headers = obs.keys()
-#                headers.sort()
-#                headers = expand_headers(headers, vars_to_expand, obs)
-#                utils.write_list_to_csv(headers, fh)
-#            data = []
-#            keys = obs.keys()
-#            keys.sort()
-#            for key in keys:
-#                data.append(obs[key])
-#            utils.write_list_to_csv(data, fh)
-#
-#            x += 1
-#        except Empty:
-#            break
-#    fh.close()
-
-
-if __name__ == '__main__':
-
+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
+__author__email = 'dvanliere at gmail dot com'
+__date__ = '2010-11-24'
+__version__ = '0.1'
+
+import sys
+sys.path.append('..')
+
+import configuration
+settings = configuration.Settings()
+from utils import utils
+from database import db
+
+
+#def dataset_edits_by_month(dbname, **kwargs):
+#    dbname = kwargs.pop('dbname')
+#    mongo = db.init_mongo_db(dbname)
+#    editors = mongo['dataset']
+#    name = dbname + '_edits_by_month.csv'
+#    fh = utils.create_txt_filehandle(settings.dataset_location, name, 'w', settings.encoding)
+#    x = 0
+#    vars_to_expand = ['monthly_edits']
+#    while True:
+#        try:
+#            id = input_queue.get(block=False)
+#            print input_queue.qsize()
+#            obs = editors.find_one({'editor': id})
+#            obs = expand_observations(obs, vars_to_expand)
+#            if x == 0:
+#                headers = obs.keys()
+#                headers.sort()
+#                headers = expand_headers(headers, vars_to_expand, obs)
+#                utils.write_list_to_csv(headers, fh)
+#            data = []
+#            keys = obs.keys()
+#            keys.sort()
+#            for key in keys:
+#                data.append(obs[key])
+#            utils.write_list_to_csv(data, fh)
+#
+#            x += 1
+#        except Empty:
+#            break
+#    fh.close()
+
+
+if __name__ == '__main__':
+
 
\ No newline at end of file
Property changes on: trunk/tools/editor_trends/analyses/cohort_confidence_intervals.py
___________________________________________________________________
Added: svn:eol-style
   + native
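The commented-out dataset_edits_by_month above drains a shared queue with non-blocking gets until the Empty exception signals exhaustion, writing one CSV row per editor. A minimal runnable sketch of that drain pattern (Python 2, matching the codebase; a thread-style Queue and plain csv writer stand in for the project's multiprocessing queue and utils helpers, and the editor ids are placeholders):

    # Non-blocking queue drain (Python 2); placeholder data, not the real utils/db API.
    import csv
    from Queue import Queue, Empty  # Empty is raised when a non-blocking get finds nothing

    input_queue = Queue()
    for editor_id in ['editor_a', 'editor_b', 'editor_c']:
        input_queue.put(editor_id)

    fh = open('edits_by_month.csv', 'wb')
    writer = csv.writer(fh)
    while True:
        try:
            editor_id = input_queue.get(block=False)
            writer.writerow([editor_id])  # the real code writes the expanded observation
        except Empty:
            break
    fh.close()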
Property changes on: trunk/tools/editor_trends/analyses
___________________________________________________________________
Added: svn:ignore
   + wikistats
zips
notes.txt
*.pyc
*.xml
*.db
*.bin
*.zip
*.csv
datasets
errors
.settings
.project
.pydevproject
wiki.cfg
fabric.py
fabfile.py
deployment
data
Property changes on: trunk/tools/editor_trends/wikitree
___________________________________________________________________
Added: svn:ignore
   + wikistats
zips
notes.txt
*.pyc
*.xml
*.db
*.bin
*.zip
*.csv
datasets
errors
.settings
.project
.pydevproject
wiki.cfg
fabric.py
fabfile.py
deployment
data
Property changes on: trunk/tools/editor_trends/etl
___________________________________________________________________
Added: svn:ignore
   + wikistats
zips
notes.txt
*.pyc
*.xml
*.db
*.bin
*.zip
*.csv
datasets
errors
.settings
.project
.pydevproject
wiki.cfg
fabric.py
fabfile.py
deployment
data
Property changes on: trunk/tools/editor_trends/scripts
___________________________________________________________________
Added: svn:ignore
   + wikistats
zips
notes.txt
*.pyc
*.xml
*.db
*.bin
*.zip
*.csv
datasets
errors
.settings
.project
.pydevproject
wiki.cfg
fabric.py
fabfile.py
deployment
data
Property changes on: trunk/tools/editor_trends/experience
___________________________________________________________________
Added: svn:ignore
   + wikistats
zips
notes.txt
*.pyc
*.xml
*.db
*.bin
*.zip
*.csv
datasets
errors
.settings
.project
.pydevproject
wiki.cfg
fabric.py
fabfile.py
deployment
data
Property changes on: trunk/tools/editor_trends/statistics/r
___________________________________________________________________
Added: svn:ignore
   + wikistats
zips
notes.txt
*.pyc
*.xml
*.db
*.bin
*.zip
*.csv
datasets
errors
.settings
.project
.pydevproject
wiki.cfg
fabric.py
fabfile.py
deployment
data
Index: trunk/tools/editor_trends/statistics/stata/cohort_charts.do
===================================================================
@@ -1,25 +1,25 @@
-label var months_3 "3 Months"
-label var months_6 "6 Months"
-label var months_9 "9 Months"
-label var months_12 "1 Year"
-label var months_24 "2 Years"
-label var months_36 "3 Years"
-label var months_48 "4 Years"
-label var months_60 "5 Years"
-label var months_72 "6 Years"
-label var months_84 "7 Years"
-label var months_96 "8 Years"
-label var months_108 "9 Years"
-generate one_year_exp = months_3 + months_6 + months_9 + months_12
-
-generate fewer_one_year_abs = (one_year_exp/100) * n
-generate more_one_year_abs = n - fewer_one_year_abs
-label var fewer_one_year_abs "Editors with less than one year experience"
-label var more_one_year_abs "Editors with more than one year experience"
-
-graph bar (asis) months_3 months_6 months_9 months_12 months_24 months_36 months_48 months_60 months_72 months_84 months_96 months_108, over(year, label(labsize(small))) stack ylabel(, labsize(vsmall) format(%9.0g)) title(Wikipedia Age Composition by Year) subtitle(Editors are getting older and influx of new editors has stagnated) note("Based on English Wikipedia, 345.000 editors." "An editor is a person who has made at least 10 edits in the main namespace.", size(tiny)) legend(nocolfirst rowgap(tiny) colgap(tiny) size(vsmall))
-
-twoway (line one_year_exp year), ytitle(%) ytitle(, size(vsmall)) xtitle() xlabel(2001(1)2010, labsize(vsmall)) title(Percentage of Wikipedia editors with 1 year experience) note("Based on the English Wikipedia, dataset 345.000 editors.", size(vsmall))
-
-
-graph bar (asis) fewer_one_year_abs more_one_year_abs, over(year, label(labsize(vsmall))) stack blabel(bar, size(tiny) position(inside) format(%9.0f)) ylabel(, labsize(vsmall) format(%9.0g)) title(Editors with one year vs multiple years of experience) legend(colfirst cols(1))
+label var months_3 "3 Months"
+label var months_6 "6 Months"
+label var months_9 "9 Months"
+label var months_12 "1 Year"
+label var months_24 "2 Years"
+label var months_36 "3 Years"
+label var months_48 "4 Years"
+label var months_60 "5 Years"
+label var months_72 "6 Years"
+label var months_84 "7 Years"
+label var months_96 "8 Years"
+label var months_108 "9 Years"
+generate one_year_exp = months_3 + months_6 + months_9 + months_12
+
+generate fewer_one_year_abs = (one_year_exp/100) * n
+generate more_one_year_abs = n - fewer_one_year_abs
+label var fewer_one_year_abs "Editors with less than one year experience"
+label var more_one_year_abs "Editors with more than one year experience"
+
+graph bar (asis) months_3 months_6 months_9 months_12 months_24 months_36 months_48 months_60 months_72 months_84 months_96 months_108, over(year, label(labsize(small))) stack ylabel(, labsize(vsmall) format(%9.0g)) title(Wikipedia Age Composition by Year) subtitle(Editors are getting older and influx of new editors has stagnated) note("Based on English Wikipedia, 345.000 editors." "An editor is a person who has made at least 10 edits in the main namespace.", size(tiny)) legend(nocolfirst rowgap(tiny) colgap(tiny) size(vsmall))
+
+twoway (line one_year_exp year), ytitle(%) ytitle(, size(vsmall)) xtitle() xlabel(2001(1)2010, labsize(vsmall)) title(Percentage of Wikipedia editors with 1 year experience) note("Based on the English Wikipedia, dataset 345.000 editors.", size(vsmall))
+
+
+graph bar (asis) fewer_one_year_abs more_one_year_abs, over(year, label(labsize(vsmall))) stack blabel(bar, size(tiny) position(inside) format(%9.0f)) ylabel(, labsize(vsmall) format(%9.0g)) title(Editors with one year vs multiple years of experience) legend(colfirst cols(1))
Property changes on: trunk/tools/editor_trends/statistics/stata/cohort_charts.do
___________________________________________________________________
Added: svn:eol-style
   + native
Property changes on: trunk/tools/editor_trends/statistics/stata
___________________________________________________________________
Added: svn:ignore
   + wikistats
zips
notes.txt
*.pyc
*.xml
*.db
*.bin
*.zip
*.csv
datasets
errors
.settings
.project
.pydevproject
wiki.cfg
fabric.py
fabfile.py
deployment
data
Property changes on: trunk/tools/editor_trends/statistics
___________________________________________________________________
Added: svn:ignore
   + wikistats
zips
notes.txt
*.pyc
*.xml
*.db
*.bin
*.zip
*.csv
datasets
errors
.settings
.project
.pydevproject
wiki.cfg
fabric.py
fabfile.py
deployment
data
Index: trunk/tools/editor_trends/utils/namespace_downloader.py
===================================================================
@@ -1,44 +1,44 @@
-
-
-'''
-Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License version 2
-as published by the Free Software Foundation.
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-See the GNU General Public License for more details, at
-http://www.fsf.org/licenses/gpl.html
-'''
-
-__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
-__author__email = 'dvanliere at gmail dot com'
-__date__ = 'Oct 27, 2010'
-__version__ = '0.1'
-
-import languages
-import dump_downloader as dd
-import configuration
-settings = configuration.Settings()
-
-
-
-def retrieve_json_namespace():
-    path = '/w/api.php?action=query&meta=siteinfo&siprop=namespaces|namespacealiases&format=json'
-    visited = set()
-    for language in languages.MAPPING:
-        language = languages.MAPPING[language]
-        filename = '%s_ns.json' % language
-        if language not in visited:
-            domain = 'http://%s.wikipedia.org' % language
-            dd.download_wiki_file(domain, path, filename, settings.namespace_location, 'w', True)
-            visited.add(language)
-
-
-def launch_downloader():
-    retrieve_json_namespace()
-
-
-if __name__ == '__main__':
+
+
+'''
+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License version 2
+as published by the Free Software Foundation.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+See the GNU General Public License for more details, at
+http://www.fsf.org/licenses/gpl.html
+'''
+
+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
+__author__email = 'dvanliere at gmail dot com'
+__date__ = 'Oct 27, 2010'
+__version__ = '0.1'
+
+import languages
+import dump_downloader as dd
+import configuration
+settings = configuration.Settings()
+
+
+
+def retrieve_json_namespace():
+    path = '/w/api.php?action=query&meta=siteinfo&siprop=namespaces|namespacealiases&format=json'
+    visited = set()
+    for language in languages.MAPPING:
+        language = languages.MAPPING[language]
+        filename = '%s_ns.json' % language
+        if language not in visited:
+            domain = 'http://%s.wikipedia.org' % language
+            dd.download_wiki_file(domain, path, filename, settings.namespace_location, 'w', True)
+            visited.add(language)
+
+
+def launch_downloader():
+    retrieve_json_namespace()
+
+
+if __name__ == '__main__':
     launch_downloader()
\ No newline at end of file
Property changes on: trunk/tools/editor_trends/utils/namespace_downloader.py
___________________________________________________________________
Added: svn:eol-style
   + native
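retrieve_json_namespace above saves the MediaWiki siteinfo response (namespaces and their aliases) once per language, deduplicating via the visited set. A standalone sketch of one such request using only the Python 2 standard library (fetch_namespaces and the output filename are illustrative stand-ins; dd.download_wiki_file is the project's actual downloader and may behave differently):

    # Fetch one wiki's namespace definitions as JSON (Python 2 stdlib only).
    import urllib2

    def fetch_namespaces(language, target):
        domain = 'http://%s.wikipedia.org' % language
        path = ('/w/api.php?action=query&meta=siteinfo'
                '&siprop=namespaces|namespacealiases&format=json')
        response = urllib2.urlopen(domain + path)
        fh = open(target, 'w')
        fh.write(response.read())
        fh.close()

    fetch_namespaces('en', 'en_ns.json')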
Property changes on: trunk/tools/editor_trends/utils
___________________________________________________________________
Added: svn:ignore
   + wikistats
zips
notes.txt
*.pyc
*.xml
*.db
*.bin
*.zip
*.csv
datasets
errors
.settings
.project
.pydevproject
wiki.cfg
fabric.py
fabfile.py
deployment
data
Property changes on: trunk/tools/editor_trends/tests/mongodb
___________________________________________________________________
Added: svn:ignore
   + wikistats
zips
notes.txt
*.pyc
*.xml
*.db
*.bin
*.zip
*.csv
datasets
errors
.settings
.project
.pydevproject
wiki.cfg
fabric.py
fabfile.py
deployment
data
Property changes on: trunk/tools/editor_trends/tests
___________________________________________________________________
Added: svn:ignore
   + wikistats
zips
notes.txt
*.pyc
*.xml
*.db
*.bin
*.zip
*.csv
datasets
errors
.settings
.project
.pydevproject
wiki.cfg
fabric.py
fabfile.py
deployment
data
Index: trunk/tools/editor_trends/README.1ST
===================================================================
@@ -1,89 +1,89 @@
-===============================================================================
-
-                      Wikipedia Editor Trends Analytics
-
-===============================================================================
-
-BACKGROUND:
-This package offers a set of tools used to create datasets to analyze Editor
-Trends. By Editor Trends we refer to the overall pattern of entering and leaving
-a Wikipedia site. The main information source for this package is:
-    http://strategy.wikimedia.org/wiki/Editor_Trends_Study
-
-REQUIREMENTS:
-
-* Python 2.6 or higher (this code has not been tested with Python 3.x)
-
-OPTIONAL
-* MongoDB
-
-If you don't want to install / use MongoDB then the package will use the built-in
-Sqlite library. However, this is not optimized for speed and may take a serious
-amount of time. If possible, install MongoDB.
-
-INSTALLING USING VIRTUALENV
-It's recommended to use Python virtualenv. If you are not familiar with
-virtualenv then have a look over here:
-    http://groups.google.com/group/python-virtualenv/browse_thread/thread/f2f19d2cc93a844e
-
-To install Editor Trends Analytics:
-
-    virtualenv --no-site-packages --distribute editor_trends
-    pip install -E editor_trends -r /editor_trends/requirements.txt
-
-
-The first command creates a new virtualenv called editor_trends and the second
-command installs the dependencies. Currently the dependencies are:
-* PyMongo
-* Progressbar
-
-INSTALLING WITHOUT VIRTUALENV
-If you don't like virtualenv then do the following:
-
-    easy_install pymongo
-    easy_install progressbar
-
-IMPORTANT MONGODB NOTES
-If you decide to use MongoDB to store the results then you have to install the
-64-bit version. 32-bit versions of MongoDB are limited to 2GB of data and the
-databases created by this package will definitely be larger than that. For more
-background information on this limitation, please read:
-    http://blog.mongodb.org/post/137788967/32-bit-limitations
-
-
-CONFIGURATION:
-If you would like to create a dataset for your own analyses then you should
-first make the appropriate changes to settings.py. Settings.py contains
-configuration variables such as the location of input and output files. Most
-settings are self-explanatory but in case of any questions please drop me a
-line.
-
-PROCESSING TIMES:
-
-CONFIG  NAMESPACE  FILENAME           CHUNKING  STORING  INDEXING  RETRIEVING  TOTAL
-1       0          stub-meta-history  7         3        1         ?           11
-
-
-*CHUNKING == splitting the XML file into smaller pieces
-*STORING == parsing xml files and storing them in MongoDB
-*INDEXING == creating an index in MongoDB
-*RETRIEVING == generating a dataset
-*TOTAL == sum of all parts
-
-MACHINE CONFIGURATIONS
-
-ID  OS         VERSION  MEMORY  PROCESSOR  SPEED
-1   Windows 7  64-bit   4GB     Duo Core   2.8GHZ
-Please add your processing times plus configuration to help improve performance.
-
-HARDDISK REQUIREMENTS
-You will need at least 3x the size of the xml dump file in free space on your hard
-disk if you want to create the databases and datasets to run your own analyses.
-The English stub-meta-history.xml is about 15Gb so you need about 45Gb of free
-diskspace.
-
-CODE:
-The Python code adheres to PEP8. Function names are deliberately expressive to
-ease understanding what's going on. If you find a bug please email me at dvanliere
-at gmail dot com or leave a message on my Talk page.
-
+===============================================================================
+
+                      Wikipedia Editor Trends Analytics
+
+===============================================================================
+
+BACKGROUND:
+This package offers a set of tools used to create datasets to analyze Editor
+Trends. By Editor Trends we refer to the overall pattern of entering and leaving
+a Wikipedia site. The main information source for this package is:
+    http://strategy.wikimedia.org/wiki/Editor_Trends_Study
+
+REQUIREMENTS:
+
+* Python 2.6 or higher (this code has not been tested with Python 3.x)
+
+OPTIONAL
+* MongoDB
+
+If you don't want to install / use MongoDB then the package will use the built-in
+Sqlite library. However, this is not optimized for speed and may take a serious
+amount of time. If possible, install MongoDB.
+
+INSTALLING USING VIRTUALENV
+It's recommended to use Python virtualenv. If you are not familiar with
+virtualenv then have a look over here:
+    http://groups.google.com/group/python-virtualenv/browse_thread/thread/f2f19d2cc93a844e
+
+To install Editor Trends Analytics:
+
+    virtualenv --no-site-packages --distribute editor_trends
+    pip install -E editor_trends -r /editor_trends/requirements.txt
+
+
+The first command creates a new virtualenv called editor_trends and the second
+command installs the dependencies. Currently the dependencies are:
+* PyMongo
+* Progressbar
+
+INSTALLING WITHOUT VIRTUALENV
+If you don't like virtualenv then do the following:
+
+    easy_install pymongo
+    easy_install progressbar
+
+IMPORTANT MONGODB NOTES
+If you decide to use MongoDB to store the results then you have to install the
+64-bit version. 32-bit versions of MongoDB are limited to 2GB of data and the
+databases created by this package will definitely be larger than that. For more
+background information on this limitation, please read:
+    http://blog.mongodb.org/post/137788967/32-bit-limitations
+
+
+CONFIGURATION:
+If you would like to create a dataset for your own analyses then you should
+first make the appropriate changes to settings.py. Settings.py contains
+configuration variables such as the location of input and output files. Most
+settings are self-explanatory but in case of any questions please drop me a
+line.
+
+PROCESSING TIMES:
+
+CONFIG  NAMESPACE  FILENAME           CHUNKING  STORING  INDEXING  RETRIEVING  TOTAL
+1       0          stub-meta-history  7         3        1         ?           11
+
+
+*CHUNKING == splitting the XML file into smaller pieces
+*STORING == parsing xml files and storing them in MongoDB
+*INDEXING == creating an index in MongoDB
+*RETRIEVING == generating a dataset
+*TOTAL == sum of all parts
+
+MACHINE CONFIGURATIONS
+
+ID  OS         VERSION  MEMORY  PROCESSOR  SPEED
+1   Windows 7  64-bit   4GB     Duo Core   2.8GHZ
+Please add your processing times plus configuration to help improve performance.
+
+HARDDISK REQUIREMENTS
+You will need at least 3x the size of the xml dump file in free space on your hard
+disk if you want to create the databases and datasets to run your own analyses.
+The English stub-meta-history.xml is about 15Gb so you need about 45Gb of free
+diskspace.
+
+CODE:
+The Python code adheres to PEP8. Function names are deliberately expressive to
+ease understanding what's going on. If you find a bug please email me at dvanliere
+at gmail dot com or leave a message on my Talk page.
+
Property changes on: trunk/tools/editor_trends/README.1ST
___________________________________________________________________
Added: svn:eol-style
   + native
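The README's MongoDB-or-SQLite choice boils down to selecting a backend at import time; a hedged sketch of that idea (Python 2; the Connection call matches the pymongo API of this era, but the database name and fallback logic here are illustrative, not the package's real configuration code):

    # Use MongoDB when pymongo is importable, else the built-in sqlite3 module.
    try:
        import pymongo
        def init_database():
            return pymongo.Connection()['editor_trends']
    except ImportError:
        import sqlite3
        def init_database():
            return sqlite3.connect('editor_trends.db')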
Property changes on: trunk/tools/editor_trends/logs
___________________________________________________________________
Modified: svn:ignore
   - *.bin
split_xml
   + wikistats
zips
notes.txt
*.pyc
*.xml
*.db
*.bin
*.zip
*.csv
datasets
errors
.settings
.project
.pydevproject
wiki.cfg
fabric.py
fabfile.py
deployment
data
Index: trunk/tools/editor_trends/database/sqlite_logic.py
===================================================================
@@ -1,156 +1,156 @@
-def retrieve_editor_ids_db():
-    contributors = set()
-    connection = db.init_database()
-    cursor = connection.cursor()
-    if settings.PROGRESS_BAR:
-        cursor.execute('SELECT MAX(ROWID) FROM contributors')
-        for id in cursor:
-            pass
-        pbar = progressbar.ProgressBar(maxval=id[0]).start()
-
-    cursor.execute('SELECT contributor FROM contributors WHERE bot=0')
-
-    print 'Retrieving contributors...'
-    for x, contributor in enumerate(cursor):
-        contributors.add(contributor[0])
-        if x % 100000 == 0:
-            pbar.update(x)
-    print 'Serializing contributors...'
-    utils.store_object(contributors, 'contributors')
-    print 'Finished serializing contributors...'
-
-    if pbar:
-        pbar.finish()
-        print 'Total elapsed time: %s.' % (utils.humanize_time_difference(pbar.seconds_elapsed))
-
-    connection.close()
-
-def retrieve_edits_by_contributor(input_queue, result_queue, pbar):
-    connection = db.init_database()
-    cursor = connection.cursor()
-
-    while True:
-        try:
-            contributor = input_queue.get(block=False)
-            if contributor == None:
-                break
-
-            cursor.execute('SELECT contributor, timestamp, bot FROM contributors WHERE contributor=?', (contributor,))
-            edits = {}
-            edits[contributor] = set()
-            for edit, timestamp, bot in cursor:
-                date = utils.convert_timestamp_to_date(timestamp)
-                edits[contributor].add(date)
-                #print edit, timestamp, bot
-
-            utils.write_data_to_csv(edits, retrieve_edits_by_contributor)
-            if pbar:
-                utils.update_progressbar(pbar, input_queue)
-
-        except Empty:
-            pass
-
-    connection.close()
-
-
-def store_data_db(data_queue, pids):
-    connection = db.init_database()
-    cursor = connection.cursor()
-    db.create_tables(cursor, db_settings.CONTRIBUTOR_TABLE)
-    empty = 0
-    values = []
-    while True:
-        try:
-            chunk = data_queue.get(block=False)
-            contributor = chunk['contributor'].encode(settings.encoding)
-            article = chunk['article']
-            timestamp = chunk['timestamp'].encode(settings.encoding)
-            bot = chunk['bot']
-            values.append((contributor, article, timestamp, bot))
-
-            if len(values) == 50000:
-                cursor.executemany('INSERT INTO contributors VALUES (?,?,?,?)', values)
-                connection.commit()
-                #print 'Size of queue: %s' % data_queue.qsize()
-                values = []
-
-        except Empty:
-
-            if all([utils.check_if_process_is_running(pid) for pid in pids]):
-                pass
-            else:
-                break
-    connection.close()
-
-
-def create_bots_db(db_name):
-    '''
-    This function reads the csv file provided by Erik Zachte and constructs a
-    sqlite memory database. The reason for this is that I suspect I will need
-    some simple querying capabilities in the future, else a dictionary would
-    suffice.
-    '''
-    connection = db.init_database(db_name)
-    #connection = db.init_database('data/database/bots.db')
-    cursor = connection.cursor()
-    db.create_tables(cursor, db_settings.BOT_TABLE)
-    values = []
-    fields = [field[0] for field in db_settings.BOT_TABLE['bots']]
-    for line in utils.read_data_from_csv('data/csv/StatisticsBots.csv', settings.encoding):
-        line = line.split(',')
-        row = []
-        for x, (field, value) in enumerate(zip(fields, line)):
-            if db_settings.BOT_TABLE['bots'][x][1] == 'INTEGER':
-                value = int(value)
-            elif db_settings.BOT_TABLE['bots'][x][1] == 'TEXT':
-                value = value.replace('/', '-')
-            #print field, value
-            row.append(value)
-        values.append(row)
-
-    cursor.executemany('INSERT INTO bots VALUES (?,?,?,?,?,?,?,?,?,?);', values)
-    connection.commit()
-    if db_name == ':memory:':
-        return cursor
-    else:
-        connection.close()
-
-def retrieve_botnames_without_id(cursor, language):
-    return cursor.execute('SELECT name FROM bots WHERE language=?', (language,)).fetchall()
-
-
-def add_id_to_botnames():
-    '''
-    This is the worker function for the multi-process version of
-    lookup_username. First, the names of the bots are retrieved, then the
-    multiprocess is launched by making a call to pc.build_scaffolding. This is a
-    generic launcher that takes as input the function to load the input_queue,
-    the function that will do the main work and the objects to be put in the
-    input_queue. The launcher also accepts optional keyword arguments.
-    '''
-    cursor = create_bots_db(':memory:')
-    files = utils.retrieve_file_list(settings.input_location, 'xml')
-
-    botnames = retrieve_botnames_without_id(cursor, 'en')
-    bots = {}
-    for botname in botnames:
-        bots[botname[0]] = 1
-    pc.build_scaffolding(pc.load_queue, lookup_username, files, bots=bots)
-    cursor.close()
-
-
-def debug_lookup_username():
-    '''
-    This function launches the lookup_username function but then single
-    threaded, this eases debugging. That's also the reason why the queue
-    parameters are set to None. When launching this function make sure that
-    debug=False when calling lookup_username
-    '''
-    cursor = create_bots_db(':memory:')
-    botnames = retrieve_botnames_without_id(cursor, 'en')
-    bots = {}
-    for botname in botnames:
-        bots[botname[0]] = 1
-
-    lookup_username('12.xml', None, None, bots, debug=True)
-    cursor.close()
+def retrieve_editor_ids_db():
+    contributors = set()
+    connection = db.init_database()
+    cursor = connection.cursor()
+    if settings.PROGRESS_BAR:
+        cursor.execute('SELECT MAX(ROWID) FROM contributors')
+        for id in cursor:
+            pass
+        pbar = progressbar.ProgressBar(maxval=id[0]).start()
+
+    cursor.execute('SELECT contributor FROM contributors WHERE bot=0')
+
+    print 'Retrieving contributors...'
+    for x, contributor in enumerate(cursor):
+        contributors.add(contributor[0])
+        if x % 100000 == 0:
+            pbar.update(x)
+    print 'Serializing contributors...'
+    utils.store_object(contributors, 'contributors')
+    print 'Finished serializing contributors...'
+
+    if pbar:
+        pbar.finish()
+        print 'Total elapsed time: %s.' % (utils.humanize_time_difference(pbar.seconds_elapsed))
+
+    connection.close()
+
+def retrieve_edits_by_contributor(input_queue, result_queue, pbar):
+    connection = db.init_database()
+    cursor = connection.cursor()
+
+    while True:
+        try:
+            contributor = input_queue.get(block=False)
+            if contributor == None:
+                break
+
+            cursor.execute('SELECT contributor, timestamp, bot FROM contributors WHERE contributor=?', (contributor,))
+            edits = {}
+            edits[contributor] = set()
+            for edit, timestamp, bot in cursor:
+                date = utils.convert_timestamp_to_date(timestamp)
+                edits[contributor].add(date)
+                #print edit, timestamp, bot
+
+            utils.write_data_to_csv(edits, retrieve_edits_by_contributor)
+            if pbar:
+                utils.update_progressbar(pbar, input_queue)
+
+        except Empty:
+            pass
+
+    connection.close()
+
+
+def store_data_db(data_queue, pids):
+    connection = db.init_database()
+    cursor = connection.cursor()
+    db.create_tables(cursor, db_settings.CONTRIBUTOR_TABLE)
+    empty = 0
+    values = []
+    while True:
+        try:
+            chunk = data_queue.get(block=False)
+            contributor = chunk['contributor'].encode(settings.encoding)
+            article = chunk['article']
+            timestamp = chunk['timestamp'].encode(settings.encoding)
+            bot = chunk['bot']
+            values.append((contributor, article, timestamp, bot))
+
+            if len(values) == 50000:
+                cursor.executemany('INSERT INTO contributors VALUES (?,?,?,?)', values)
+                connection.commit()
+                #print 'Size of queue: %s' % data_queue.qsize()
+                values = []
+
+        except Empty:
+
+            if all([utils.check_if_process_is_running(pid) for pid in pids]):
+                pass
+            else:
+                break
+    connection.close()
+
+
+def create_bots_db(db_name):
+    '''
+    This function reads the csv file provided by Erik Zachte and constructs a
+    sqlite memory database. The reason for this is that I suspect I will need
+    some simple querying capabilities in the future, else a dictionary would
+    suffice.
+    '''
+    connection = db.init_database(db_name)
+    #connection = db.init_database('data/database/bots.db')
+    cursor = connection.cursor()
+    db.create_tables(cursor, db_settings.BOT_TABLE)
+    values = []
+    fields = [field[0] for field in db_settings.BOT_TABLE['bots']]
+    for line in utils.read_data_from_csv('data/csv/StatisticsBots.csv', settings.encoding):
+        line = line.split(',')
+        row = []
+        for x, (field, value) in enumerate(zip(fields, line)):
+            if db_settings.BOT_TABLE['bots'][x][1] == 'INTEGER':
+                value = int(value)
+            elif db_settings.BOT_TABLE['bots'][x][1] == 'TEXT':
+                value = value.replace('/', '-')
+            #print field, value
+            row.append(value)
+        values.append(row)
+
+    cursor.executemany('INSERT INTO bots VALUES (?,?,?,?,?,?,?,?,?,?);', values)
+    connection.commit()
+    if db_name == ':memory:':
+        return cursor
+    else:
+        connection.close()
+
+def retrieve_botnames_without_id(cursor, language):
+    return cursor.execute('SELECT name FROM bots WHERE language=?', (language,)).fetchall()
+
+
+def add_id_to_botnames():
+    '''
+    This is the worker function for the multi-process version of
+    lookup_username. First, the names of the bots are retrieved, then the
+    multiprocess is launched by making a call to pc.build_scaffolding. This is a
+    generic launcher that takes as input the function to load the input_queue,
+    the function that will do the main work and the objects to be put in the
+    input_queue. The launcher also accepts optional keyword arguments.
+    '''
+    cursor = create_bots_db(':memory:')
+    files = utils.retrieve_file_list(settings.input_location, 'xml')
+
+    botnames = retrieve_botnames_without_id(cursor, 'en')
+    bots = {}
+    for botname in botnames:
+        bots[botname[0]] = 1
+    pc.build_scaffolding(pc.load_queue, lookup_username, files, bots=bots)
+    cursor.close()
+
+
+def debug_lookup_username():
+    '''
+    This function launches the lookup_username function but then single
+    threaded, this eases debugging. That's also the reason why the queue
+    parameters are set to None. When launching this function make sure that
+    debug=False when calling lookup_username
+    '''
+    cursor = create_bots_db(':memory:')
+    botnames = retrieve_botnames_without_id(cursor, 'en')
+    bots = {}
+    for botname in botnames:
+        bots[botname[0]] = 1
+
+    lookup_username('12.xml', None, None, bots, debug=True)
+    cursor.close()
Property changes on: trunk/tools/editor_trends/database/sqlite_logic.py
___________________________________________________________________
Added: svn:eol-style
   + native
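store_data_db above trades latency for throughput by buffering 50,000 rows and flushing them with a single executemany plus commit; note that rows still buffered when the loop exits are never written, which is worth double-checking. A self-contained sketch of the batching idea against plain sqlite3, including a final flush (Python 2; the table layout, batch size, and generated rows are illustrative):

    # Batched inserts: one executemany + commit per batch is far cheaper
    # than committing per row; flush the remainder before closing.
    import sqlite3

    BATCH_SIZE = 50000
    connection = sqlite3.connect(':memory:')
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE contributors '
                   '(contributor TEXT, article TEXT, timestamp TEXT, bot INTEGER)')

    values = []
    for i in xrange(120000):
        values.append(('editor_%d' % i, 'article_%d' % i, '2010-11-24T00:00:00Z', 0))
        if len(values) == BATCH_SIZE:
            cursor.executemany('INSERT INTO contributors VALUES (?,?,?,?)', values)
            connection.commit()
            values = []
    if values:  # final partial batch
        cursor.executemany('INSERT INTO contributors VALUES (?,?,?,?)', values)
        connection.commit()
    connection.close()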
Property changes on: trunk/tools/editor_trends/database
___________________________________________________________________
Added: svn:ignore
   + wikistats
zips
notes.txt
*.pyc
*.xml
*.db
*.bin
*.zip
*.csv
datasets
errors
.settings
.project
.pydevproject
wiki.cfg
fabric.py
fabfile.py
deployment
data
Property changes on: trunk/tools/editor_trends/bots
___________________________________________________________________
Added: svn:ignore
   + wikistats
zips
notes.txt
*.pyc
*.xml
*.db
*.bin
*.zip
*.csv
datasets
errors
.settings
.project
.pydevproject
wiki.cfg
fabric.py
fabfile.py
deployment
data
Property changes on: trunk/tools/editor_trends/datasets
___________________________________________________________________
Modified: svn:ignore
   - cohort_data.txt
cohorts.dta
difference observations erik vs diederik.ods
difference observations erik vs diederik.xlsx
editors.dta
enwiki_editors.csv
enwiki_long_editors.csv
enwiki_wide_editors.csv
   + wikistats
zips
notes.txt
*.pyc
*.xml
*.db
*.bin
*.zip
*.csv
datasets
errors
.settings
.project
.pydevproject
wiki.cfg
fabric.py
fabfile.py
deployment
data
Property changes on: trunk/tools/editor_trends/documentation
___________________________________________________________________
Modified: svn:ignore
   - language_codes.xlsx
   + wikistats
zips
notes.txt
*.pyc
*.xml
*.db
*.bin
*.zip
*.csv
datasets
errors
.settings
.project
.pydevproject
wiki.cfg
fabric.py
fabfile.py
deployment
data
Property changes on: trunk/tools/editor_trends/data/database
___________________________________________________________________
Modified: svn:ignore
   - *.db
   + wikistats
zips
notes.txt
*.pyc
*.xml
*.db
*.bin
*.zip
*.csv
datasets
errors
.settings
.project
.pydevproject
wiki.cfg
fabric.py
fabfile.py
deployment
data
Property changes on: trunk/tools/editor_trends/data/objects
___________________________________________________________________
Modified: svn:ignore
   - *.bin
   + wikistats
zips
notes.txt
*.pyc
*.xml
*.db
*.bin
*.zip
*.csv
datasets
errors
.settings
.project
.pydevproject
wiki.cfg
fabric.py
fabfile.py
deployment
data
Property changes on: trunk/tools/editor_trends/data/csv
___________________________________________________________________
Modified: svn:ignore
   - *.csv
   + wikistats
zips
notes.txt
*.pyc
*.xml
*.db
*.bin
*.zip
*.csv
datasets
errors
.settings
.project
.pydevproject
wiki.cfg
fabric.py
fabfile.py
deployment
data
Property changes on: trunk/tools/editor_trends/data
___________________________________________________________________
Added: svn:ignore
   + wikistats
zips
notes.txt
*.pyc
*.xml
*.db
*.bin
*.zip
*.csv
datasets
errors
.settings
.project
.pydevproject
wiki.cfg
fabric.py
fabfile.py
deployment
data
Property changes on: trunk/tools/editor_trends/algorithms
___________________________________________________________________
Added: svn:ignore
   + wikistats
zips
notes.txt
*.pyc
*.xml
*.db
*.bin
*.zip
*.csv
datasets
errors
.settings
.project
.pydevproject
wiki.cfg
fabric.py
fabfile.py
deployment
data
Index: trunk/tools/editor_trends/run.bat
===================================================================
@@ -1,3 +1,3 @@
-@echo off
-python split_xml_file.py
-python map_wiki_editors.py
+@echo off
+python split_xml_file.py
+python map_wiki_editors.py
Property changes on: trunk/tools/editor_trends/run.bat
___________________________________________________________________
Added: svn:eol-style
   + native
Property changes on: trunk/tools/editor_trends
___________________________________________________________________
Modified: svn:ignore
   - wikistats
zips
notes.txt
*.pyc
datasets
errors
.settings
.project
.pydevproject
wiki.cfg
fabric.py
fabfile.py
deployment
   + wikistats
zips
notes.txt
*.pyc
*.xml
*.db
*.bin
*.zip
*.csv
datasets
errors
.settings
.project
.pydevproject
wiki.cfg
fabric.py
fabfile.py
deployment
data