r90935 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r90934 | r90935 | r90936 >
Date: 04:19, 28 June 2011
Author: diederik
Status: deferred
Tags:
Comment:
Backlog of small fixes.
Modified paths:
  • /trunk/tools/editor_trends/analyses/adhoc/benchmarker_queue.py (added) (history)
  • /trunk/tools/editor_trends/analyses/adhoc/community_graph.py (deleted) (history)
  • /trunk/tools/editor_trends/analyses/network (added) (history)
  • /trunk/tools/editor_trends/analyses/network/community_graph.py (added) (history)
  • /trunk/tools/editor_trends/analyses/network/graph_db.py (added) (history)
  • /trunk/tools/editor_trends/analyses/plugins/kaggle_correlation.py (added) (history)
  • /trunk/tools/editor_trends/analyses/plugins/kaggle_sanity_check.py (added) (history)
  • /trunk/tools/editor_trends/analyses/plugins/kaggle_sanity_check_edits.py (added) (history)
  • /trunk/tools/editor_trends/analyses/plugins/sor_newbie_treatment.py (added) (history)
  • /trunk/tools/editor_trends/etl/sort.py (modified) (history)
  • /trunk/tools/editor_trends/etl/store.py (modified) (history)
  • /trunk/tools/editor_trends/kaggle/training.py (deleted) (history)
  • /trunk/tools/editor_trends/kaggle/training_db.py (added) (history)
  • /trunk/tools/editor_trends/kaggle/training_file.py (added) (history)

Diff

Index: trunk/tools/editor_trends/analyses/plugins/kaggle_correlation.py
@@ -0,0 +1,49 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-04-19'
 19+__version__ = '0.1'
 20+
 21+from datetime import datetime
 22+from dateutil.relativedelta import *
 23+
 24+
 25+def kaggle_correlation(var, editor, **kwargs):
 26+ end_date = datetime(2011, 2, 1)
 27+ cutoff_date = datetime(2010, 9, 1)
 28+ start_date = datetime(2009, 9, 1)
 29+ edits = editor['edit_count']
 30+ username = editor['username']
 31+
 32+ pre, after = 0, 0
 33+
 34+ while start_date < cutoff_date:
 35+ year = str(start_date.year)
 36+ month = str(start_date.month)
 37+ pre += edits.get(year, {}).get(month, {}).get('0', 0)
 38+ start_date = start_date + relativedelta(months= +1)
 39+
 40+ start_date = datetime(2010, 9, 1)
 41+ while start_date < end_date:
 42+ year = str(start_date.year)
 43+ month = str(start_date.month)
 44+ after += edits.get(year, {}).get(month, {}).get('0', 0)
 45+ start_date = start_date + relativedelta(months= +1)
 46+
 47+ if pre > 0:
 48+ var.add(end_date, pre, {'after': after, 'username': username})
 49+
 50+ return var
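
Note: kaggle_correlation assumes editor['edit_count'] is a nested dict keyed by string year, then string month, then string namespace. A minimal sketch of that traversal on hypothetical data (the real documents come from the wikilytics MongoDB):

from datetime import datetime
from dateutil.relativedelta import relativedelta

# Hypothetical edit_count document: year -> month -> namespace -> count,
# every key stored as a string, as the plugin's .get() chain expects.
edits = {'2010': {'9': {'0': 12, '3': 4}, '10': {'0': 7}}}

def count_main_ns_edits(edits, start, end):
    total = 0
    while start < end:
        total += edits.get(str(start.year), {}).get(str(start.month), {}).get('0', 0)
        start = start + relativedelta(months=1)
    return total

print(count_main_ns_edits(edits, datetime(2010, 9, 1), datetime(2011, 2, 1)))  # 19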
Index: trunk/tools/editor_trends/analyses/plugins/kaggle_sanity_check_edits.py
@@ -0,0 +1,41 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-01-28'
 19+__version__ = '0.1'
 20+
 21+
 22+from datetime import datetime
 23+from dateutil.relativedelta import *
 24+
 25+
 26+def kaggle_sanity_check_edits(var, editor, **kwargs):
 27+ end_date = datetime(2011, 2, 1)
 28+ start_date = datetime(2010, 9, 1)
 29+ edits = editor['edit_count']
 30+ username = editor['username']
 31+
 32+ count = 0
 33+ while start_date < end_date:
 34+ year = str(start_date.year)
 35+ month = str(start_date.month)
 36+ count += edits.get(year, {}).get(month, {}).get('0', 0)
 37+ start_date = start_date + relativedelta(months= +1)
 38+
 39+ if count > 0:
 40+ var.add(end_date, count, {'editor': username})
 41+
 42+ return var
Index: trunk/tools/editor_trends/analyses/plugins/kaggle_sanity_check.py
@@ -0,0 +1,49 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-01-28'
 19+__version__ = '0.1'
 20+
 21+
 22+from datetime import datetime
 23+from dateutil.relativedelta import *
 24+
 25+
 26+def kaggle_sanity_check(var, editor, **kwargs):
 27+ end_date = datetime(2011, 2, 1)
 28+ cutoff = datetime(2010, 9, 1)
 29+ start_date = datetime(2009, 9, 1)
 30+ edits = editor['edit_count']
 31+ active = 0
 32+ count = 0
 33+ while start_date < cutoff:
 34+ year = str(start_date.year)
 35+ month = str(start_date.month)
 36+ #namespaces = edits.get(year, {}).get(month, {}).keys()
 37+ #for ns in namespaces:
 38+ count += edits.get(year, {}).get(month, {}).get('0', 0)
 39+ start_date = start_date + relativedelta(months= +1)
 40+
 41+ if count > 0:
 42+ while start_date < end_date:
 43+ year = str(start_date.year)
 44+ month = str(start_date.month)
 45+ active += edits.get(year, {}).get(month, {}).get('0', 0)
 46+ start_date = start_date + relativedelta(months= +1)
 47+ if active > 0:
 48+ var.add(cutoff, 1)
 49+
 50+ return var
Index: trunk/tools/editor_trends/analyses/plugins/sor_newbie_treatment.py
@@ -0,0 +1,42 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-01-25'
 19+__version__ = '0.1'
 20+
 21+from classes import storage
 22+
 23+def sor_newbie_treatment(editor, var, **kwargs):
 24+ rts = kwargs.pop('rts')
 25+ tenth_edit = editor['new_wikipedian']
 26+ title = ':%s' % editor['username']
 27+ collection = '%s%s_diffs_dataset' % (rts.language.code, rts.project.name)
 28+ db = storage.init_database(rts.storage, rts.dbname, collection)
 29+
 30+ if tenth_edit != False:
 31+ qualifier = {'ns': 3, 'timestamp': {'$lt':tenth_edit}}
 32+ observations = db.find_one(qualifier)
 33+ else:
 34+ observations = db.find_one('editor', editor)
 35+
 36+ if observations != None:
 37+ for obs in observations:
 38+ if obs['ns'] == 3:
 39+ values = obs.values()
 40+ print values
 41+
 42+
 43+
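
Note on sor_newbie_treatment: if find_one() behaves like pymongo's and returns a single document, then 'for obs in observations' iterates the document's keys and obs['ns'] raises a TypeError. A cursor-returning find() is probably the intent; a hedged sketch, assuming the storage wrapper mirrors pymongo's find/find_one split:

def newbie_observations(db, tenth_edit):
    # find() yields documents (dicts); find_one() returns a single dict,
    # whose iteration would yield key strings instead of documents.
    qualifier = {'ns': 3, 'timestamp': {'$lt': tenth_edit}}
    for obs in db.find(qualifier):
        if obs['ns'] == 3:
            print(obs.values())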
Index: trunk/tools/editor_trends/analyses/network/community_graph.py
@@ -0,0 +1,63 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-01-10'
 19+__version__ = '0.1'
 20+
 21+import sys
 22+if '../../' not in sys.path:
 23+ sys.path.append('../../')
 24+
 25+from classes import settings
 26+settings = settings.Settings()
 27+from classes import storage
 28+from utils import file_utils
 29+
 30+try:
 31+ import psyco
 32+ psyco.full()
 33+except ImportError:
 34+ pass
 35+
 36+def create_articles_set(edits):
 37+ s = set()
 38+ years = edits.keys()
 39+ for year in years:
 40+ for edit in edits[year]:
 41+ s.add(edit['article'])
 42+ return s
 43+
 44+
 45+def create_edgelist(project, collection):
 46+ db = storage.init_database('mongo', project, collection)
 47+ ids = db.retrieve_distinct_keys('editor')
 48+ ids.sort()
 49+ fh = file_utils.create_txt_filehandle(settings.dataset_location, '%s_edgelist.csv' % project, 'w', 'utf-8')
 50+ for i in ids:
 51+ author_i = db.find_one({'editor': i})
 52+ if author_i != None:
 53+ article_i = create_articles_set(author_i['edits'])
 54+ for j in ids:
 55+ if i > j:
 56+ author_j = db.find_one({'editor': j})
 57+ article_j = create_articles_set(author_j['edits'])
 58+ common = article_i.intersection(article_j)
 59+ if len(common) > 0:
 60+ file_utils.write_list_to_csv([i, j, len(common)], fh, recursive=False, newline=True)
 61+ fh.close()
 62+
 63+if __name__ == '__main__':
 64+ create_edgelist('wikilytics', 'enwiki_editors_raw')
Property changes on: trunk/tools/editor_trends/analyses/network/community_graph.py
___________________________________________________________________
Added: svn:eol-style
165 + native
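
The rewritten create_edgelist enumerates each unordered pair of editors once via the 'i > j' guard and weights an edge by the number of co-edited articles. A condensed sketch of that pairing logic on toy data:

# Toy article sets per editor id; the real sets come from create_articles_set().
articles = {'a': set([1, 2, 3]), 'b': set([2, 3]), 'c': set([4])}
ids = sorted(articles)
for i in ids:
    for j in ids:
        if i > j:  # visit each unordered pair exactly once
            common = articles[i] & articles[j]
            if common:
                print('%s\t%s\t%s' % (i, j, len(common)))  # prints 'b\ta\t2'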
Index: trunk/tools/editor_trends/analyses/network/graph_db.py
@@ -0,0 +1,82 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2010-11-25'
 19+__version__ = '0.1'
 20+
 21+import codecs
 22+from neo4jrestclient import GraphDatabase, NotFoundError, request
 23+
 24+request.CACHE = True
 25+
 26+class IDGenerator:
 27+ def __init__(self):
 28+ self.n = 0
 29+ self.ids = {}
 30+ self.inverted_ids = {}
 31+
 32+ def invert_dict(self):
 33+ return dict((v, k) for k, v in self.ids.iteritems())
 34+
 35+ def get_id(self, n):
 36+ if n not in self.ids:
 37+ self.ids[n] = self.n
 38+ self.n += 1
 39+ return self.ids[n]
 40+
 41+ def reverse_lookup(self, n):
 42+ if self.inverted_ids == {}:
 43+ self.inverted_ids = self.invert_dict()
 44+ return self.inverted_ids[n]
 45+
 46+
 47+def read_edgelist():
 48+ fh = codecs.open('C:\\Users\\diederik.vanliere\\Dropbox\\wsor\\diederik\\wikilytics_edgelist.csv', 'r', 'utf-8')
 49+ for line in fh:
 50+ line = line.strip()
 51+ line = line.split('\t')
 52+ actor_a = line[0]
 53+ actor_b = line[1]
 54+ weight = int(line[2])
 55+ yield (actor_a, actor_b, weight)
 56+ fh.close()
 57+
 58+def init_db():
 59+ gdb = GraphDatabase("http://localhost:7474/db/data/")
 60+ return gdb
 61+
 62+def get_node(gdb, idg, node):
 63+ node = idg.get_id(node)
 64+ try:
 65+ #n = gdb.nodes.get('id', node)
 66+ n = gdb.nodes[node]
 67+ except NotFoundError:
 68+ n = gdb.nodes.create(id=node)
 69+ n['id'] = node
 70+
 71+ return n
 72+
 73+def load_data():
 74+ idg = IDGenerator()
 75+ gdb = init_db()
 76+ for (actor_a, actor_b, weight) in read_edgelist():
 77+ n1 = get_node(gdb, idg, actor_a)
 78+ n2 = get_node(gdb, idg, actor_b)
 79+ n1.relationships.create("cognitive_distance", n2, weight=weight)
 80+
 81+if __name__ == '__main__':
 82+ load_data()
 83+
Property changes on: trunk/tools/editor_trends/analyses/network/graph_db.py
___________________________________________________________________
Added: svn:eol-style
184 + native
Index: trunk/tools/editor_trends/analyses/adhoc/community_graph.py
@@ -1,62 +0,0 @@
2 -#!/usr/bin/python
3 -# -*- coding: utf-8 -*-
4 -'''
5 -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
6 -This program is free software; you can redistribute it and/or
7 -modify it under the terms of the GNU General Public License version 2
8 -as published by the Free Software Foundation.
9 -This program is distributed in the hope that it will be useful,
10 -but WITHOUT ANY WARRANTY; without even the implied warranty of
11 -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 -See the GNU General Public License for more details, at
13 -http://www.fsf.org/licenses/gpl.html
14 -'''
15 -
16 -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
17 -__email__ = 'dvanliere at gmail dot com'
18 -__date__ = '2011-01-10'
19 -__version__ = '0.1'
20 -
21 -import sys
22 -if '..' not in sys.path:
23 - sys.path.append('..')
24 -
25 -from classes import settings
26 -settings = settings.Settings()
27 -from classes import storage
28 -from utils import file_utils
29 -
30 -try:
31 - import psyco
32 - psyco.full()
33 -except ImportError:
34 - pass
35 -
36 -def create_articles_set(edits):
37 - s = set()
38 - years = edits.keys()
39 - for year in years:
40 - for edit in edits[year]:
41 - s.add(edit['article'])
42 - return s
43 -
44 -
45 -def create_edgelist(project, collection):
46 - db = storage.init_database(rts.storage, project, collection)
47 - ids = db.retrieve_distinct_keys('editor')
48 - ids.sort()
49 - fh = file_utils.create_txt_filehandle(settings.dataset_location, '%s_edgelist.csv' % project, 'w', 'utf-8')
50 - for i in ids:
51 - author_i = conn[collection].find_one({'editor': i})
52 - article_i = create_articles_set(author_i['edits'])
53 - for j in ids:
54 - if i > j:
55 - author_j = conn[collection].find_one({'editor': j})
56 - article_j = create_articles_set(author_j['edits'])
57 - common = article_i.intersection(article_j)
58 - if len(common) > 0:
59 - file_utils.write_list_to_csv([i, j, len(common)], fh, recursive=False, newline=True)
60 - fh.close()
61 -
62 -if __name__ == '__main__':
63 - create_edgelist('enwiki', 'editors')
Index: trunk/tools/editor_trends/analyses/adhoc/benchmarker_queue.py
@@ -0,0 +1,24 @@
 2+from Queue import Queue
 3+#import cProfile
 4+from guppy import hpy
 5+h = hpy()
 6+
 7+q1, q2, q3 = Queue(), Queue(), Queue()
 8+h.heap()
 9+print 'ughh'
 10+for x in xrange(1000):
 11+ q1.put(x)
 12+ q2.put({})
 13+ q3.put([])
 14+ #h = hpy()
 15+hpy().doc
 16+h.heap()
 17+# for x in xrange(100):
 18+# a = q1.get()
 19+# b = q2.get()
 20+# c = q3.get()
 21+# h.heap()
 22+
 23+#if __name__ == '__main__':
 24+# main()
 25+ #cProfile.run('main()')
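
benchmarker_queue.py compares whole-heap snapshots taken before and after filling three queues. A tighter variant, assuming guppy's documented setrelheap(), reports only the allocations made after a baseline:

from Queue import Queue
from guppy import hpy

h = hpy()
q = Queue()
h.setrelheap()         # treat the current heap as the baseline
for x in xrange(1000):
    q.put(x)
print(h.heap())        # shows only objects allocated since the baseline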
Index: trunk/tools/editor_trends/etl/store.py
@@ -79,7 +79,7 @@
80 80 date = text_utils.convert_timestamp_to_datetime_utc(line[6])
81 81 md5 = line[7]
82 82 revert = int(line[8])
83 - reverted_user = int(line[9])
 83+ reverted_user = line[9]
84 84 reverted_rev_id = int(line[10])
85 85 bot = int(line[11])
86 86 cur_size = int(line[12])
@@ -96,12 +96,10 @@
97 97 'cur_size':cur_size,
98 98 'delta':delta,
99 99 'bot':bot,
 100+ 'reverted_user': reverted_user,
 101+ 'reverted_rev_id': reverted_rev_id
100 102 }
101 103
102 - if reverted_user > -1:
103 - data['reverted_user'] = reverted_user,
104 - data['reverted_rev_id'] = reverted_rev_id
105 -
106 104 return data
107 105
108 106
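
This store.py hunk makes two changes: reverted_user is no longer coerced with int() (presumably because the field is not always numeric), and both revert-related keys are written unconditionally so every document shares one schema. It also retires a subtle bug: the old assignment data['reverted_user'] = reverted_user, ended in a stray comma and therefore stored a one-element tuple. A sketch of the resulting document, with hypothetical values:

data = {
    'cur_size': 2048,
    'delta': -12,
    'bot': 0,
    'reverted_user': '-1',    # raw string, present even when nothing was reverted
    'reverted_rev_id': 123456,
}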
Index: trunk/tools/editor_trends/etl/sort.py
@@ -55,11 +55,6 @@
56 56 fh.close()
57 57 for x, d in enumerate(data):
58 58 d = d.strip().split('\t')
59 - #TEMP FIX:
60 - #editor = d[2]
61 - #d[2] = d[0]
62 - #d[0] = editor
63 - #END TEMP FIX
64 59 data[x] = d
65 60 #data = [d.strip() for d in data]
66 61 #data = [d.split('\t') for d in data]
@@ -153,7 +148,7 @@
154 149 pbar = progressbar.ProgressBar(maxval=len(files)).start()
155 150 tasks = multiprocessing.JoinableQueue()
156 151 result = multiprocessing.JoinableQueue()
157 - number_of_processes = 3
 152+ number_of_processes = 2
158 153 sorters = [Sorter(rts, tasks, result) for x in xrange(number_of_processes)]
159154
160 155 for filename in files:
@@ -166,16 +161,14 @@
167 162 sorter.start()
168 163
169 164 ppills = number_of_processes
170 - while True:
171 - while ppills > 0:
172 - try:
173 - res = result.get(block=True)
174 - if res == True:
175 - pbar.update(pbar.currval + 1)
176 - else:
177 - ppills -= 1
178 - except Empty:
179 - pass
180 - break
 165+ while ppills > 0:
 166+ try:
 167+ res = result.get()
 168+ if res == True:
 169+ pbar.update(pbar.currval + 1)
 170+ else:
 171+ ppills -= 1
 172+ except Empty:
 173+ pass
181 174
182 175 tasks.join()
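
This sort.py hunk flattens the consumer side of the poison-pill protocol: each Sorter puts one sentinel on the result queue when it exits, so the consumer only has to loop until it has collected one sentinel per process. With a blocking result.get() the Empty handler is effectively dead code; a minimal sketch of the pattern:

def drain_results(result, number_of_processes, pbar):
    ppills = number_of_processes
    while ppills > 0:
        res = result.get()                 # blocking get, so Empty never fires
        if res == True:
            pbar.update(pbar.currval + 1)  # one input file finished
        else:
            ppills -= 1                    # a worker sent its poison pill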
Index: trunk/tools/editor_trends/kaggle/training.py
@@ -1,141 +0,0 @@
2 -#!/usr/bin/python
3 -# -*- coding: utf-8 -*-
4 -'''
5 -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
6 -This program is free software; you can redistribute it and/or
7 -modify it under the terms of the GNU General Public License version 2
8 -as published by the Free Software Foundation.
9 -This program is distributed in the hope that it will be useful,
10 -but WITHOUT ANY WARRANTY; without even the implied warranty of
11 -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 -See the GNU General Public License for more details, at
13 -http://www.fsf.org/licenses/gpl.html
14 -'''
15 -
16 -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
17 -__email__ = 'dvanliere at gmail dot com'
18 -__date__ = '2011-04-12'
19 -__version__ = '0.1'
20 -
21 -import os
22 -import sys
23 -import cPickle
24 -import codecs
25 -from datetime import datetime
26 -sys.path.append('../')
27 -
28 -from classes import storage
29 -
30 -location = '/home/diederik/wikimedia/en/wiki/kaggle'
31 -files = os.listdir(location)
32 -files.reverse()
33 -
34 -max_size = 2147483648
35 -max_size_reached = False
36 -
37 -t0 = datetime.now()
38 -titles = {}
39 -ids = set()
40 -dates = {}
41 -edits = {}
42 -ignore_ids = set()
43 -size = 0
44 -cnt_obs = 0
45 -cutoff_date = datetime(2010, 8, 31)
46 -
47 -print 'Constructing training dataset...'
48 -db = storage.init_database('mongo', 'wikilytics', 'enwiki_editors_dataset')
49 -dataset = codecs.open('training.tsv', 'w', 'utf-8')
50 -for filename in files:
51 - if not filename.startswith('comments') and not filename.startswith('articles'):
52 - fh = codecs.open(os.path.join(location, filename))
53 - if max_size_reached == True:
54 - break
55 - for line in fh:
56 - line = line.strip()
57 - line = line.split('\t')
58 - if len(line) != 12:
59 - continue
60 - if line[10] == '1':
61 - continue
62 - timestamp = datetime.strptime(line[6], '%Y-%m-%dT%H:%M:%SZ')
63 - if timestamp > cutoff_date:
64 - continue
65 - username = line[3].lower()
66 - if username.endswith('bot') or username.find('script') > -1:
67 - #line[10] = '1'
68 - continue
69 - id = line[2]
70 - if id not in ids and id not in ignore_ids:
71 - res = db.find_one({'editor': id})
72 - if res == None:
73 - ignore_ids.add(id)
74 - continue
75 - cnt_obs += 1
76 - title_id = line[1]
77 - ids.add(id)
78 - simple_date = '%s-%s' % (timestamp.year, timestamp.month)
79 - dates.setdefault(simple_date, 0)
80 - dates[simple_date] += 1
81 - title = line.pop(5)
82 - titles[title_id] = title
83 - line.append('\n')
84 - line = '\t'.join(line)
85 - size += len(line)
86 - if size > max_size:
87 - max_size_reached = True
88 - dataset.write(line.decode('utf-8'))
89 -
90 -dataset.close()
91 -
92 -print 'Constructing title dataset...'
93 -fh = codecs.open('titles.tsv', 'w', 'utf-8')
94 -for id, title in titles.iteritems():
95 - fh.write('%s\t%s\n' % (id, title.decode('utf-8')))
96 -fh.close()
97 -
98 -
99 -print 'Constructing solution dataset...'
100 -x = 0
101 -fh = codecs.open('solutions.tsv', 'w', 'utf-8')
102 -for id in ids:
103 - if id not in ignore_ids:
104 - obs = db.find_one({'editor': str(id)}, 'cum_edit_count_main_ns')
105 - if obs != None:
106 - x += 1
107 - n = obs['cum_edit_count_main_ns']
108 - fh.write('%s,%s\n' % (id.decode('utf-8'), n))
109 - edits.setdefault(n, 0)
110 - edits[n] += 1
111 - else:
112 - print id
113 -fh.close()
114 -
115 -print 'Storing date histogram'
116 -fh = open('histogram_dates.bin', 'wb')
117 -cPickle.dump(dates, fh)
118 -fh.close()
119 -
120 -
121 -fh = open('histogram_dates.tsv', 'w')
122 -for date, n in dates.iteritems():
123 - fh.write('%s\t%s\n' % (date, n))
124 -fh.close()
125 -
126 -
127 -print 'Storing edit histogram'
128 -fh = open('histogram_edits.bin', 'wb')
129 -cPickle.dump(edits, fh)
130 -fh.close()
131 -
132 -fh = open('histogram_edits.tsv', 'w')
133 -for edit, n in edits.iteritems():
134 - fh.write('%s\t%s\n' % (edit, n))
135 -fh.close()
136 -
137 -
138 -t1 = datetime.now()
139 -print 'Descriptives:'
140 -print 'Number of editors: %s' % x
141 -print 'Number of edits: %s' % cnt_obs
142 -print 'It took %s to construct the Kaggle training set' % (t1 - t0)
Index: trunk/tools/editor_trends/kaggle/training_file.py
@@ -0,0 +1,430 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)'])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-04-12'
 19+__version__ = '0.1'
 20+
 21+import os
 22+import sys
 23+import cPickle
 24+import codecs
 25+import random
 26+from itertools import izip_longest
 27+from datetime import datetime
 28+from dateutil.relativedelta import *
 29+sys.path.append('../')
 30+import resource
 31+
 32+random.seed(1024)
 33+from classes import storage
 34+
 35+headers = ['user_id', 'article_id', 'revision_id', 'namespace', 'timestamp',
 36+ 'md5', 'revert', 'reverted_user', 'reverted_rev_id', 'delta', 'cur_size']
 37+keys = ['user_id', 'article_id', 'rev_id', 'ns', 'date',
 38+ 'hash', 'revert', 'reverted_user', 'reverted_rev_id', 'delta', 'cur_size']
 39+
 40+size = 0 #current size of file
 41+#max_size = 2147483648
 42+max_size = 5000000
 43+editors_seen = {}
 44+cnt_obs = 0 #count of number of edits
 45+revs = {}
 46+titles = {}
 47+predictions = {}
 48+
 49+t0 = datetime.now()
 50+location = '/home/diederik/wikimedia/xml/en/wiki/txt/'
 51+txt_files = '/home/diederik/wikimedia/xml/en/wiki/sorted/'
 52+files = os.listdir(location)
 53+max_file_handles = resource.getrlimit(resource.RLIMIT_NOFILE)[0] - 100
 54+#files.sort()
 55+#files.reverse()
 56+
 57+cutoff_date = datetime(2010, 8, 31) #operator is >
 58+end_date = datetime(2011, 2, 1) #operator is <
 59+cutoff_date_training = datetime(2010, 1, 31) #operator is >
 60+end_date_training = datetime(2010, 9, 1) # operator is <
 61+
 62+
 63+class IDGenerator:
 64+ def __init__(self):
 65+ self.n = 0
 66+ self.ids = {}
 67+ self.rnd_ids = {}
 68+ self.inverted_ids = None
 69+
 70+ def invert_dict(self, dictionary):
 71+ return dict((v, k) for k, v in dictionary.iteritems())
 72+
 73+ def get_id(self, n):
 74+ if n not in self.ids:
 75+ self.n += 1
 76+ while len(self.rnd_ids) < self.n :
 77+ rnd_id = self.get_random_id()
 78+ if self.rnd_ids.get(rnd_id, False) == False:
 79+ self.rnd_ids[rnd_id] = True
 80+ self.ids[n] = rnd_id
 81+ return self.ids[n]
 82+
 83+ def get_random_id(self):
 84+ return random.randrange(0, 1000000)
 85+
 86+ def reverse_lookup(self, n):
 87+ self.inverted_ids = self.invert_dict(self.ids)
 88+ return self.inverted_ids[n]
 89+
 90+
 91+def construct_article_meta(fh_articles, files):
 92+ print 'Constructing title dataset...'
 93+ headers = ['article_id', 'category', 'timestamp', 'namespace', 'redirect', 'title', 'related_page']
 94+ write_headers(fh_articles, headers)
 95+ #fh_articles.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % ('article_id', 'category', 'timestamp', 'namespace', 'redirect', 'title', 'related_page'))
 96+ article_meta = {}
 97+ for filename in files:
 98+ if filename.startswith('articles_meta'):
 99+ fh = codecs.open(os.path.join(location, filename))
 100+ for line in fh:
 101+ line = line.strip()
 102+ line = line.split('\t')
 103+ category = line[1]
 104+ if category != 'List':
 105+ title = line[2]
 106+ title = title.split('/')
 107+ article_meta.setdefault(title[-1], {})
 108+ article_meta[title[-1]]['category'] = category
 109+ article_meta[title[-1]]['id'] = line[0]
 110+ fh.close()
 111+ return article_meta
 112+
 113+
 114+def determine_active(edits, start_date, end_date):
 115+ active = 0
 116+ namespaces = ['0', '1', '2', '3', '4', '5']
 117+ if start_date == datetime(2009, 9, 1):
 118+ if '2009' not in edits and '2010' not in edits:
 119+ return active
 120+ elif start_date == datetime(2010, 9, 1):
 121+ if '2010' not in edits and '2011' not in edits:
 122+ return active
 123+
 124+ while start_date < end_date:
 125+ year = str(start_date.year)
 126+ month = str(start_date.month)
 127+ for ns in namespaces:
 128+ active += edits.get(year, {}).get(month, {}).get(ns, 0)
 129+ if active > 0: #we don't need to know how many edits, just if active
 130+ return active
 131+ start_date = start_date + relativedelta(months= +1)
 132+ return active
 133+
 134+def load_binary_file(filename):
 135+ fh = open(filename, 'rb')
 136+ obj = cPickle.load(fh)
 137+ fh.close()
 138+ return obj
 139+
 140+
 141+def convert_tz_to_mysql_tz(tz):
 142+ iso = tz.__str__()
 143+ tz = iso[0:4] + '-' + iso[4:6] + '-' + iso[6:]
 144+ return tz
 145+
 146+
 147+def check_reverter(idg, reverter):
 148+ try:
 149+ reverter = int(reverter)
 150+ if reverter != -1:
 151+ reverter = idg.get_id(reverter)
 152+ return reverter
 153+ except ValueError:
 154+ pass
 155+ return -1
 156+
 157+
 158+def check_user_id(user_id):
 159+ try:
 160+ int(user_id)
 161+ except ValueError:
 162+ return False
 163+ return True
 164+
 165+
 166+def check_username(username):
 167+ username = username.lower()
 168+ if username.endswith('bot') or username.find('script') > -1:
 169+ return False #exclude more bots and scripts
 170+ return True
 171+
 172+
 173+def determine_editors(db):
 174+ start_date_pre = datetime(2009, 9, 1)
 175+ end_date_pre = datetime(2010, 9, 1)
 176+ end_date = datetime(2011, 2, 1)
 177+ pre_editors = set()
 178+ post_editors = set()
 179+ #cursor = db.find({'date': {'$gte': start_date_pre, '$lt': end_date_pre}}, 'first_edit,edit_count,user_id,username')
 180+ cursor = db.find({}, 'first_edit,edit_count,user_id,username')
 181+ x, y, z = 0, 0, 0
 182+ for editor in cursor:
 183+ x += 1
 184+ if 'first_edit' not in editor:
 185+ continue
 186+ if editor['first_edit'] >= end_date_pre:
 187+ continue
 188+ if check_username(editor['username']) == False:
 189+ continue
 190+ if check_user_id(editor['editor']) == False:
 191+ continue
 192+
 193+ #print editor['edit_count']
 194+ active = determine_active(editor['edit_count'], start_date_pre, end_date_pre)
 195+ if active > 0:
 196+ pre_editors.add(editor['editor'])
 197+ y += 1
 198+ active = determine_active(editor['edit_count'], end_date_pre, end_date)
 199+ if active > 0:
 200+ post_editors.add(editor['editor'])
 201+ z += 1
 202+ if x % 100000 == 0:
 203+ print 'Retrieved %s pre_editors / %s post_editors / %s total editors...' % (y, z, x)
 204+
 205+ #set_a = pre_editors.difference(post_editors)
 206+ post_editors = pre_editors.intersection(post_editors)
 207+
 208+ return pre_editors, post_editors
 209+
 210+
 211+def write_headers(fh, headers):
 212+ for i, key in enumerate(headers):
 213+ fh.write('%s' % key)
 214+ if (i + 1) != len(headers):
 215+ fh.write('\t')
 216+ else:
 217+ fh.write('\n')
 218+
 219+def write_revision(dataset, revision):
 220+ size = 0
 221+ for i, key in enumerate(keys):
 222+ #print key, revision[key]
 223+# if key == 'reverted_user' or key == 'reverted_rev_id':
 224+# revision[key] = revision[key][0]
 225+ if type(revision[key]) == type(0):
 226+ revision[key] = str(revision[key])
 227+
 228+ dataset.write('%s' % revision[key].decode('utf-8'))
 229+ size += len(revision[key])
 230+ if (i + 1) != len(keys):
 231+ dataset.write('\t')
 232+ else:
 233+ dataset.write('\n')
 234+ return size
 235+
 236+
 237+print 'Constructing training dataset...'
 238+db_dataset = storage.init_database('mongo', 'wikilytics', 'enwiki_editors_dataset')
 239+print 'Loading editors...'
 240+if not os.path.exists('set_a.bin'):
 241+ pre_editors, post_editors = determine_editors(db_dataset)
 242+ fh = open('set_a.bin', 'wb')
 243+ cPickle.dump(pre_editors, fh)
 244+ fh.close()
 245+
 246+ fh = open('set_b.bin', 'wb')
 247+ cPickle.dump(post_editors, fh)
 248+ fh.close()
 249+else:
 250+ pre_editors = load_binary_file('set_a.bin')
 251+ post_editors = load_binary_file('set_b.bin')
 252+
 253+
 254+dataset = codecs.open('training.tsv', 'w', 'utf-8')
 255+write_headers(dataset, headers)
 256+idg = IDGenerator()
 257+
 258+
 259+
 260+print 'Parsing revisions...'
 261+db_raw = storage.init_database('mongo', 'wikilytics', 'enwiki_editors_raw')
 262+seen_editors = {}
 263+for editors in izip_longest(post_editors, pre_editors, fillvalue=None):
 264+ for editor in editors:
 265+ go = editors_seen.get(editor, True)
 266+ if go:
 267+ #if editor:
 268+ editors_seen[editor] = False
 269+ print 'Parsing editor %s...' % editor
 270+ #revisions = db_raw.find({'user_id': editor})
 271+ file_id = int(editor) % max_file_handles
 272+ fh = codecs.open(os.path.join(txt_files, '%s.csv' % file_id), 'r', 'utf-8')
 273+ for line in fh:
 274+ line = line.strip()
 275+ line = line.split('\t')
 276+ if line[0] != editor:
 277+ continue
 278+ revision = {}
 279+ revision['user_id'] = int(line[0])
 280+ revision['article_id'] = int(line[1])
 281+ revision['rev_id'] = int(line[2])
 282+ revision['ns'] = line[4]
 283+ revision['date'] = datetime.strptime(line[6], '%Y-%m-%dT%H:%M:%SZ')
 284+ revision['hash'] = line[7]
 285+ revision['revert'] = line[8]
 286+ revision['reverted_user'] = line[9]
 287+ revision['reverted_rev_id'] = line[10]
 288+ revision['cur_size'] = line[12]
 289+ revision['delta'] = line[13]
 290+ #print line
 291+ #print revision
 292+
 293+ #'user_id', 'article_id', 'rev_id', 'ns', 'date',
 294+ #'hash', 'revert', 'reverted_user', 'reverted_rev_id', 'delta', 'cur_size'
 295+ #print 'Editor %s made % edits' % (editor, len(revisions))
 296+ #for revision in revisions:
 297+ user_id = idg.get_id(revision['user_id'])
 298+ revision['user_id'] = user_id #recode id to make it harder to look up answers
 299+ if revision['ns'] < 0:
 300+ continue
 301+ timestamp = revision['date']
 302+ #revision['date'] = convert_tz_to_mysql_tz(timestamp)
 303+
 304+ predictions.setdefault(user_id, {})
 305+ predictions[user_id].setdefault('solution', 0)
 306+ predictions[user_id].setdefault('training', 0)
 307+
 308+ if timestamp > cutoff_date and timestamp < end_date:
 309+ predictions[user_id]['solution'] += 1
 310+ elif timestamp > cutoff_date_training and timestamp < end_date_training:
 311+ predictions[user_id]['training'] += 1
 312+ if timestamp > cutoff_date: #exclude edits after cut off date
 313+ continue
 314+
 315+ revision['reverted_user'] = check_reverter(idg, revision.get('reverted_user', -1))
 316+ #revision.pop('_id')
 317+ #revision.pop('username')
 318+ revision['date'] = revision['date'].__str__()
 319+ titles[revision['article_id']] = True
 320+ revs[revision['rev_id']] = True
 321+ size += write_revision(dataset, revision)
 322+ cnt_obs += 1
 323+ if cnt_obs % 10000 == 0:
 324+ print 'Parsed %s revisions...' % cnt_obs
 325+ fh.close()
 326+ if size > max_size:
 327+ break
 328+if size > max_size:
 329+ print 'Reached maximum filesize...'
 330+else:
 331+ print 'Parsed all available editors in post set...'
 332+dataset.close()
 333+
 334+
 335+
 336+print 'Constructing solution dataset...'
 337+fh = codecs.open('solutions.csv', 'w', 'utf-8')
 338+keys = predictions.keys()
 339+keys.sort()
 340+fh.write('%s,%s\n' % ('editor_id', 'solution'))
 341+for key in keys:
 342+ fh.write('%s,%s\n' % (key, predictions[key]['solution']))
 343+fh.close()
 344+
 345+
 346+print 'Constructing test dataset...'
 347+fh = codecs.open('test.csv', 'w', 'utf-8')
 348+fh.write('%s,%s\n' % ('editor_id', 'test'))
 349+for key, value in predictions.iteritems():
 350+ fh.write('%s,%s\n' % (key, value['training']))
 351+fh.close()
 352+
 353+
 354+print 'Constructing article file...'
 355+fh_articles = codecs.open('titles.tsv', 'w', 'utf-8')
 356+article_meta = construct_article_meta(fh_articles, files)
 357+for filename in files:
 358+ if filename.startswith('articles') and not filename.startswith('articles_meta'):
 359+ fh = codecs.open(os.path.join(location, filename))
 360+ for line in fh:
 361+ line = line.strip()
 362+ line = line.split('\t')
 363+ if len(line) == 6:
 364+ article_id = int(line[0])
 365+ title = titles.get(article_id, None)
 366+ if title:
 367+ title = line[-1]
 368+ meta = article_meta.get(title, None)
 369+ parent_id = -1
 370+ category = 'Null'
 371+ if meta:
 372+ parent_id = meta['id']
 373+ category = meta['category']
 374+
 375+ line[1] = category
 376+ line[2] = convert_tz_to_mysql_tz(line[2])
 377+ line[-1] = line[-1].decode('utf-8')
 378+ line.append(str(parent_id))
 379+ line.append('\n')
 380+ fh_articles.write('\t'.join(line))
 381+ fh.close()
 382+fh_articles.close()
 383+
 384+
 385+print 'Constructing comment dataset...'
 386+fh_comments = codecs.open('comments.tsv', 'w', 'utf-8')
 387+fh_comments.write('%s\t%s\n' % ('rev_id', 'text'))
 388+cnt = len(revs.keys())
 389+for filename in files:
 390+ if filename.startswith('comments'):
 391+ fh = codecs.open(os.path.join(location, filename))
 392+ for line in fh:
 393+ if cnt == 0:
 394+ break
 395+ line = line.strip()
 396+ line = line.split('\t')
 397+ if len(line) == 2: #some lines are missing rev id, not sure why.
 398+ try:
 399+ rev_id = int(line[0])
 400+ exists = revs.get(rev_id, None)
 401+ if exists:
 402+ fh_comments.write('%s\t%s\n' % (rev_id, line[1].decode('utf-8')))
 403+ cnt -= 1
 404+ except (ValueError, KeyError), error:
 405+ print error
 406+ fh.close()
 407+fh_comments.close()
 408+
 409+print 'Storing random ids...'
 410+fh = open('random_ids.bin', 'wb')
 411+cPickle.dump(idg, fh)
 412+fh.close()
 413+
 414+
 415+fh = open('descriptives.tsv', 'w')
 416+fh.write('Number of unique editors: %s\n' % idg.n)
 417+fh.write('Number of revisions: %s\n' % cnt_obs)
 418+fh.write('Number of pre-editors: %s\n' % len(pre_editors))
 419+fh.write('Number of post-editors: %s\n' % len(post_editors))
 420+fh.write('Number of editors with zero edits after August 30th, 2010: %s' % (len(pre_editors) - len(post_editors)))
 421+fh.close()
 422+
 423+
 424+t1 = datetime.now()
 425+print 'Descriptives:'
 426+print 'Number of unique editors: %s' % idg.n
 427+print 'Number of revisions: %s' % cnt_obs
 428+print 'Number of pre-editors: %s' % len(pre_editors)
 429+print 'Number of post-editors: %s' % len(post_editors)
 430+print 'Number of editors with zero edits after August 30th, 2010: %s' % (len(pre_editors) - len(post_editors))
 431+print 'It took %s to construct the Kaggle training set' % (t1 - t0)
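
Both Kaggle scripts recode editors through a random id map so that published ids cannot be joined back to Wikipedia accounts; random.seed(1024) keeps the mapping reproducible across runs. A condensed, self-contained version of the RandomIDGenerator idea:

import random

random.seed(1024)  # reproducible mapping, as in the scripts above

class RandomIDGenerator(object):
    def __init__(self):
        self.ids = {}      # real id -> random id
        self.used = set()  # random ids already handed out

    def get_id(self, real_id):
        if real_id not in self.ids:
            rnd = random.randrange(0, 1000000)
            while rnd in self.used:  # retry until unique
                rnd = random.randrange(0, 1000000)
            self.used.add(rnd)
            self.ids[real_id] = rnd
        return self.ids[real_id]

idg = RandomIDGenerator()
assert idg.get_id(42) == idg.get_id(42)  # stable per editor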
Index: trunk/tools/editor_trends/kaggle/training_db.py
@@ -0,0 +1,452 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)'])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-04-12'
 19+__version__ = '0.1'
 20+
 21+import os
 22+import sys
 23+import cPickle
 24+import codecs
 25+import random
 26+from itertools import izip
 27+from datetime import datetime
 28+from dateutil.relativedelta import *
 29+sys.path.append('../')
 30+
 31+random.seed(1024)
 32+from classes import storage
 33+
 34+headers = ['user_id', 'article_id', 'revision_id', 'namespace', 'timestamp',
 35+ 'md5', 'reverted', 'reverted_user_id', 'reverted_revision_id', 'delta', 'cur_size']
 36+keys = ['user_id', 'article_id', 'rev_id', 'ns', 'date',
 37+ 'hash', 'revert', 'reverted_user', 'reverted_rev_id', 'delta', 'cur_size']
 38+
 39+max_size = 2147483648
 40+#max_size = 2000000
 41+cnt_obs = 0 #count of number of edits
 42+revs = {}
 43+titles = {}
 44+predictions = {}
 45+
 46+t0 = datetime.now()
 47+location = '/home/diederik/wikimedia/xml/en/wiki/txt/'
 48+files = os.listdir(location)
 49+#files.sort()
 50+#files.reverse()
 51+editors_seen = {}
 52+cutoff_date = datetime(2010, 9, 1) #operator is >
 53+end_date = datetime(2011, 2, 1) #operator is <
 54+cutoff_date_training = datetime(2010, 1, 31) #operator is >
 55+end_date_training = datetime(2010, 9, 1) # operator is <
 56+
 57+class IDGenerator:
 58+ def __init__(self):
 59+ self.n = 0
 60+ self.ids = {}
 61+
 62+ def get_id(self, n):
 63+ if n not in self.ids:
 64+ self.ids[n] = self.n
 65+ self.n += 1
 66+ return str(self.ids[n])
 67+
 68+class RandomIDGenerator:
 69+ def __init__(self):
 70+ self.n = 0
 71+ self.ids = {}
 72+ self.rnd_ids = {}
 73+ self.inverted_ids = None
 74+
 75+ def invert_dict(self, dictionary):
 76+ return dict((v, k) for k, v in dictionary.iteritems())
 77+
 78+ def get_id(self, n):
 79+ if n not in self.ids:
 80+ self.n += 1
 81+ while len(self.rnd_ids) < self.n :
 82+ rnd_id = self.get_random_id()
 83+ if self.rnd_ids.get(rnd_id, False) == False:
 84+ self.rnd_ids[rnd_id] = True
 85+ self.ids[n] = rnd_id
 86+ return self.ids[n]
 87+
 88+ def get_random_id(self):
 89+ return random.randrange(0, 1000000)
 90+
 91+ def reverse_lookup(self, n):
 92+ self.inverted_ids = self.invert_dict(self.ids)
 93+ return self.inverted_ids[n]
 94+
 95+
 96+def construct_article_meta(fh_articles, files):
 97+ print 'Constructing title dataset...'
 98+ headers = ['article_id', 'category', 'timestamp', 'namespace', 'redirect', 'title', 'related_page']
 99+ write_headers(fh_articles, headers)
 100+ #fh_articles.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % ('article_id', 'category', 'timestamp', 'namespace', 'redirect', 'title', 'related_page'))
 101+ article_meta = {}
 102+ for filename in files:
 103+ if filename.startswith('articles_meta'):
 104+ fh = codecs.open(os.path.join(location, filename))
 105+ for line in fh:
 106+ line = line.strip()
 107+ line = line.split('\t')
 108+ category = line[1]
 109+ if category != 'List':
 110+ title = line[2]
 111+ title = title.split('/')
 112+ article_meta.setdefault(title[-1], {})
 113+ article_meta[title[-1]]['category'] = category
 114+ article_meta[title[-1]]['id'] = line[0]
 115+ fh.close()
 116+ return article_meta
 117+
 118+
 119+def determine_active(edits, start_date, end_date):
 120+ active = 0
 121+ if start_date == datetime(2009, 9, 1):
 122+ if '2009' not in edits and '2010' not in edits:
 123+ return active
 124+# elif start_date == datetime(2010, 9, 1):
 125+# if '2010' not in edits and '2011' not in edits:
 126+# return active
 127+
 128+
 129+ namespaces = ['0', '1', '2', '3', '4', '5']
 130+ while start_date < end_date:
 131+ year = str(start_date.year)
 132+ month = str(start_date.month)
 133+ for ns in namespaces:
 134+ active += edits.get(year, {}).get(month, {}).get(ns, 0)
 135+ if active > 0: #we don't need to know how many edits, just if active
 136+ return active
 137+ start_date = start_date + relativedelta(months= +1)
 138+ return active
 139+
 140+
 141+def load_binary_file(filename):
 142+ fh = open(filename, 'rb')
 143+ obj = cPickle.load(fh)
 144+ fh.close()
 145+ return obj
 146+
 147+
 148+def convert_tz_to_mysql_tz(tz):
 149+ return tz.__str__()
 150+
 151+
 152+def check_reverter(idg, reverter):
 153+ try:
 154+ if reverter != -1:
 155+ reverter = idg.get_id(reverter)
 156+ return reverter
 157+ except ValueError:
 158+ pass
 159+ return -1
 160+
 161+
 162+def check_user_id(user_id):
 163+ try:
 164+ int(user_id)
 165+ except ValueError:
 166+ return False
 167+ return True
 168+
 169+
 170+def check_username(username):
 171+ username = username.lower()
 172+ if username.endswith('bot') or username.find('script') > -1:
 173+ return False #exclude more bots and scripts
 174+ return True
 175+
 176+
 177+def determine_editors(db):
 178+ start_date_pre = datetime(2009, 9, 1)
 179+ end_date_pre = datetime(2010, 9, 1)
 180+ end_date = datetime(2011, 2, 1)
 181+ pre_editors = set()
 182+ post_editors = set()
 183+ cursor = db.find({}, 'first_edit,edit_count,user_id,username')
 184+ x, y, z = 0, 0, 0
 185+ for editor in cursor:
 186+ x += 1
 187+ if 'first_edit' not in editor:
 188+ continue
 189+ if editor['first_edit'] > end_date_pre:
 190+ continue
 191+ if check_username(editor['username']) == False:
 192+ continue
 193+ if check_user_id(editor['user_id']) == False:
 194+ continue
 195+
 196+ active_pre = determine_active(editor['edit_count'], start_date_pre, end_date_pre)
 197+ if x % 100000 == 0:
 198+ print 'Retrieved %s pre_editors / %s post_editors / %s total editors...' % (y, z, x)
 199+
 200+ if active_pre == 0:
 201+ continue #exclude editors who are not active in the year before the cutoff date
 202+ else:
 203+ active_post = determine_active(editor['edit_count'], end_date_pre, end_date)
 204+ if active_post == 0:
 205+ pre_editors.add(editor['user_id'])
 206+ y += 1
 207+ else:
 208+ post_editors.add(editor['user_id'])
 209+ z += 1
 210+ print 'Retrieved %s pre_editors / %s post_editors / %s total editors...' % (y, z, x)
 211+ return pre_editors, post_editors
 212+
 213+
 214+def write_headers(fh, headers):
 215+ for i, key in enumerate(headers):
 216+ fh.write('%s' % key)
 217+ if (i + 1) != len(headers):
 218+ fh.write('\t')
 219+ else:
 220+ fh.write('\n')
 221+
 222+def write_revision(dataset, revision):
 223+ for i, key in enumerate(keys):
 224+ if type(revision[key]) == type(0):
 225+ revision[key] = str(revision[key])
 226+ dataset.write('%s' % revision[key].decode('utf-8'))
 227+ if (i + 1) != len(keys):
 228+ dataset.write('\t')
 229+ else:
 230+ dataset.write('\n')
 231+
 232+
 233+print 'Constructing training dataset...'
 234+db_dataset = storage.init_database('mongo', 'wikilytics', 'enwiki_editors_dataset')
 235+print 'Loading editors...'
 236+if not os.path.exists('set_a.bin'):
 237+ pre_editors, post_editors = determine_editors(db_dataset)
 238+ fh = open('set_a.bin', 'wb')
 239+ cPickle.dump(pre_editors, fh)
 240+ fh.close()
 241+
 242+ fh = open('set_b.bin', 'wb')
 243+ cPickle.dump(post_editors, fh)
 244+ fh.close()
 245+else:
 246+ pre_editors = load_binary_file('set_a.bin')
 247+ post_editors = load_binary_file('set_b.bin')
 248+
 249+
 250+dataset = codecs.open('training.tsv', 'w', 'utf-8')
 251+write_headers(dataset, headers)
 252+idg = RandomIDGenerator()
 253+
 254+namespaces = IDGenerator()
 255+print 'Parsing revisions...'
 256+db_raw = storage.init_database('mongo', 'wikilytics', 'enwiki_editors_raw')
 257+seen_editors = {}
 258+editors = {}
 259+x = 1
 260+for editor in post_editors:
 261+ #print editor
 262+ editors[x] = editor
 263+ x += 2
 264+x = 0
 265+z = len(post_editors)
 266+for y, editor in enumerate(pre_editors):
 267+ #print editor
 268+ editors[x] = editor
 269+ x += 2
 270+ if z == y:
 271+ break
 272+
 273+editor_keys = editors.keys()
 274+editor_keys.sort()
 275+for key in editor_keys:
 276+ #print editors
 277+ #for editor in editors:
 278+ editor = editors[key]
 279+ #print editor
 280+ go = editors_seen.get(editor, True)
 281+ if go:
 282+ editors_seen[editor] = False
 283+ user_id = idg.get_id(editor)
 284+ print 'Parsing editor %s (%s) ...' % (editor, user_id)
 285+ revisions = db_raw.find({'user_id': str(editor)})
 286+
 287+ predictions.setdefault(user_id, {})
 288+ predictions[user_id].setdefault('solution', 0)
 289+ predictions[user_id].setdefault('training', 0)
 290+
 291+ for revision in revisions:
 292+ revision['user_id'] = user_id #recode id to make it harder to look up answers
 293+ if revision['ns'] < 0 or revision['ns'] > 5:
 294+ continue
 295+ #revision['ns'] = namespaces.get_id(revision['ns'])
 296+ timestamp = revision['date']
 297+ revision['date'] = convert_tz_to_mysql_tz(timestamp)
 298+
 299+
 300+
 301+ if timestamp > cutoff_date:
 302+ #print editor, user_id, timestamp, revision['date']
 303+ if timestamp < end_date:
 304+ predictions[user_id]['solution'] += 1
 305+ elif timestamp > cutoff_date_training:
 306+ if timestamp < end_date_training:
 307+ predictions[user_id]['training'] += 1
 308+
 309+ if timestamp > cutoff_date: #exclude edits after cut off date
 310+ continue
 311+ if revision['revert'] == 1:
 312+ revision['reverted_user'] = check_reverter(idg, revision.get('reverted_user', -1))
 313+ revision.pop('_id')
 314+ revision.pop('username')
 315+ titles[revision['article_id']] = True
 316+ revs[revision['rev_id']] = True
 317+ write_revision(dataset, revision)
 318+ cnt_obs += 1
 319+ if cnt_obs % 10000 == 0:
 320+ print 'Parsed %s revisions...' % cnt_obs
 321+ if dataset.tell() > max_size:
 322+ break
 323+if dataset.tell() > max_size:
 324+ print 'Reached maximum filesize...'
 325+else:
 326+ print 'Parsed all available editors in post set...'
 327+dataset.close()
 328+
 329+
 330+
 331+print 'Constructing solution dataset...'
 332+fh = codecs.open('solutions.csv', 'w', 'utf-8')
 333+editor_keys = predictions.keys()
 334+editor_keys.sort()
 335+fh.write('%s,%s\n' % ('user_id', 'solution'))
 336+for key in editor_keys:
 337+ fh.write('%s,%s\n' % (key, predictions[key]['solution']))
 338+ print key, predictions[key]['solution']
 339+fh.close()
 340+
 341+
 342+print 'Constructing test dataset...'
 343+fh = codecs.open('test.csv', 'w', 'utf-8')
 344+fh.write('%s,%s\n' % ('user_id', 'test'))
 345+for key, value in predictions.iteritems():
 346+ fh.write('%s,%s\n' % (key, value['training']))
 347+fh.close()
 348+
 349+print 'Constructing article file...'
 350+fh_articles = codecs.open('titles.tsv', 'w', 'utf-8')
 351+article_meta = construct_article_meta(fh_articles, files)
 352+categories = IDGenerator()
 353+for filename in files:
 354+ if filename.startswith('articles') and not filename.startswith('articles_meta'):
 355+ fh = codecs.open(os.path.join(location, filename))
 356+ for line in fh:
 357+ line = line.strip()
 358+ line = line.split('\t')
 359+ if len(line) == 6:
 360+ article_id = int(line[0])
 361+ title = titles.pop(article_id, None)
 362+ if title:
 363+ title = line[-1]
 364+ meta = article_meta.get(title, None)
 365+ parent_id = '-1'
 366+ category = -1
 367+ redirect = line[4]
 368+ if redirect == 'False':
 369+ redirect = '0'
 370+ else:
 371+ redirect = '1'
 372+ line[4] = redirect
 373+ if meta:
 374+ parent_id = meta['id']
 375+ category = meta['category']
 376+
 377+
 378+ line[1] = categories.get_id(category)
 379+ tz = datetime.strptime(line[2], '%Y-%m-%dT%H:%M:%SZ')
 380+ line[2] = convert_tz_to_mysql_tz(tz)
 381+ line[-1] = line[-1].decode('utf-8')
 382+ line.append(parent_id)
 383+ line.append('\n')
 384+ fh_articles.write('\t'.join(line))
 385+ fh.close()
 386+fh_articles.close()
 387+
 388+
 389+print 'Constructing comment dataset...'
 390+fh_comments = codecs.open('comments.tsv', 'w', 'utf-8')
 391+fh_comments.write('%s\t%s\n' % ('revision_id', 'comment'))
 392+cnt = len(revs.keys())
 393+for filename in files:
 394+ if filename.startswith('comments'):
 395+ fh = codecs.open(os.path.join(location, filename))
 396+ for line in fh:
 397+ if cnt == 0:
 398+ break
 399+ line = line.strip()
 400+ line = line.split('\t')
 401+ if len(line) == 2: #some lines are missing rev id, not sure why.
 402+ try:
 403+ rev_id = int(line[0])
 404+ exists = revs.get(rev_id, None)
 405+ if exists:
 406+ fh_comments.write('%s\t%s\n' % (rev_id, line[1].decode('utf-8')))
 407+ cnt -= 1
 408+ except (ValueError, KeyError), error:
 409+ print error
 410+ fh.close()
 411+fh_comments.close()
 412+
 413+print 'Storing random ids...'
 414+fh = open('random_ids.bin', 'wb')
 415+cPickle.dump(idg, fh)
 416+fh.close()
 417+
 418+fh = codecs.open('namespaces.tsv', 'w', 'utf-8')
 419+write_headers(fh, ['key', 'namespace'])
 420+namespaces = {'0':'Main',
 421+ '1':'Talk',
 422+ '2':'User',
 423+ '3':'User Talk',
 424+ '4':'Wikipedia',
 425+ '5':'Wikipedia Talk'
 426+ }
 427+for key, value in namespaces.iteritems():
 428+ fh.write('%s\t%s\n' % (key, value))
 429+fh.close()
 430+
 431+fh = codecs.open('categories.tsv', 'w', 'utf-8')
 432+write_headers(fh, ['id', 'name'])
 433+for key, value in categories.ids.iteritems():
 434+ fh.write('%s\t%s\n' % (value, key))
 435+fh.close()
 436+
 437+fh = open('descriptives.tsv', 'w')
 438+fh.write('Number of unique editors: %s\n' % idg.n)
 439+fh.write('Number of revisions: %s\n' % cnt_obs)
 440+fh.write('Number of pre-editors: %s\n' % len(pre_editors))
 441+fh.write('Number of post-editors: %s\n' % len(post_editors))
 442+fh.write('Number of editors with zero edits after August 30th, 2010: %s' % (len(pre_editors) - len(post_editors)))
 443+fh.close()
 444+
 445+
 446+t1 = datetime.now()
 447+print 'Descriptives:'
 448+print 'Number of unique editors: %s' % idg.n
 449+print 'Number of revisions: %s' % cnt_obs
 450+print 'Number of pre-editors: %s' % len(pre_editors)
 451+print 'Number of post-editors: %s' % len(post_editors)
 452+print 'Number of editors with zero edits after August 30th, 2010: %s' % (len(pre_editors) - len(post_editors))
 453+print 'It took %s to construct the Kaggle training set' % (t1 - t0)
Property changes on: trunk/tools/editor_trends/kaggle/training_db.py
___________________________________________________________________
Added: svn:eol-style
1454 + native
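
A note on the editor ordering in training_db.py: post_editors are placed at odd keys and pre_editors at even keys, so iterating the sorted keys alternates the two populations and both stay represented even if the max_size cap stops the loop early. A condensed sketch (the real script also caps the number of pre_editors at the size of post_editors):

post_editors = ['p1', 'p2']
pre_editors = ['q1', 'q2', 'q3']

editors = {}
x = 1
for editor in post_editors:  # odd slots: 1, 3, ...
    editors[x] = editor
    x += 2
x = 0
for editor in pre_editors:   # even slots: 0, 2, ...
    editors[x] = editor
    x += 2

print([editors[k] for k in sorted(editors)])  # ['q1', 'p1', 'q2', 'p2', 'q3']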