r90935 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r90934 | r90935 | r90936 >
Date: 04:19, 28 June 2011
Author: diederik
Status: deferred
Tags:
Comment:
Backlog of small fixes.
Modified paths:
  • /trunk/tools/editor_trends/analyses/adhoc/benchmarker_queue.py (added) (history)
  • /trunk/tools/editor_trends/analyses/adhoc/community_graph.py (deleted) (history)
  • /trunk/tools/editor_trends/analyses/network (added) (history)
  • /trunk/tools/editor_trends/analyses/network/community_graph.py (added) (history)
  • /trunk/tools/editor_trends/analyses/network/graph_db.py (added) (history)
  • /trunk/tools/editor_trends/analyses/plugins/kaggle_correlation.py (added) (history)
  • /trunk/tools/editor_trends/analyses/plugins/kaggle_sanity_check.py (added) (history)
  • /trunk/tools/editor_trends/analyses/plugins/kaggle_sanity_check_edits.py (added) (history)
  • /trunk/tools/editor_trends/analyses/plugins/sor_newbie_treatment.py (added) (history)
  • /trunk/tools/editor_trends/etl/sort.py (modified) (history)
  • /trunk/tools/editor_trends/etl/store.py (modified) (history)
  • /trunk/tools/editor_trends/kaggle/training.py (deleted) (history)
  • /trunk/tools/editor_trends/kaggle/training_db.py (added) (history)
  • /trunk/tools/editor_trends/kaggle/training_file.py (added) (history)

Diff

Index: trunk/tools/editor_trends/analyses/plugins/kaggle_correlation.py
@@ -0,0 +1,49 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-04-19'
 19+__version__ = '0.1'
 20+
 21+from datetime import datetime
 22+from dateutil.relativedelta import *
 23+
 24+
 25+def kaggle_correlation(var, editor, **kwargs):
 26+ end_date = datetime(2011, 2, 1)
 27+ cutoff_date = datetime(2010, 9, 1)
 28+ start_date = datetime(2009, 9, 1)
 29+ edits = editor['edit_count']
 30+ username = editor['username']
 31+
 32+ pre, after = 0, 0
 33+
 34+ while start_date < cutoff_date:
 35+ year = str(start_date.year)
 36+ month = str(start_date.month)
 37+ pre += edits.get(year, {}).get(month, {}).get('0', 0)
 38+ start_date = start_date + relativedelta(months= +1)
 39+
 40+ start_date = datetime(2010, 9, 1)
 41+ while start_date < end_date:
 42+ year = str(start_date.year)
 43+ month = str(start_date.month)
 44+ after += edits.get(year, {}).get(month, {}).get('0', 0)
 45+ start_date = start_date + relativedelta(months= +1)
 46+
 47+ if pre > 0:
 48+ var.add(end_date, pre, {'after': after, 'username': username})
 49+
 50+ return var
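
Note: kaggle_correlation assumes editor['edit_count'] is a nested dict keyed by string year, then string month, then string namespace. A minimal sketch of that traversal on hypothetical data (the real documents come from the wikilytics MongoDB):

from datetime import datetime
from dateutil.relativedelta import relativedelta

# Hypothetical edit_count document: year -> month -> namespace -> count,
# every key stored as a string, as the plugin's .get() chain expects.
edits = {'2010': {'9': {'0': 12, '3': 4}, '10': {'0': 7}}}

def count_main_ns_edits(edits, start, end):
    total = 0
    while start < end:
        total += edits.get(str(start.year), {}).get(str(start.month), {}).get('0', 0)
        start = start + relativedelta(months=1)
    return total

print(count_main_ns_edits(edits, datetime(2010, 9, 1), datetime(2011, 2, 1)))  # 19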
Index: trunk/tools/editor_trends/analyses/plugins/kaggle_sanity_check_edits.py
@@ -0,0 +1,41 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-01-28'
 19+__version__ = '0.1'
 20+
 21+
 22+from datetime import datetime
 23+from dateutil.relativedelta import *
 24+
 25+
 26+def kaggle_sanity_check_edits(var, editor, **kwargs):
 27+ end_date = datetime(2011, 2, 1)
 28+ start_date = datetime(2010, 9, 1)
 29+ edits = editor['edit_count']
 30+ username = editor['username']
 31+
 32+ count = 0
 33+ while start_date < end_date:
 34+ year = str(start_date.year)
 35+ month = str(start_date.month)
 36+ count += edits.get(year, {}).get(month, {}).get('0', 0)
 37+ start_date = start_date + relativedelta(months= +1)
 38+
 39+ if count > 0:
 40+ var.add(end_date, count, {'editor': username})
 41+
 42+ return var
Index: trunk/tools/editor_trends/analyses/plugins/kaggle_sanity_check.py
@@ -0,0 +1,49 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-01-28'
 19+__version__ = '0.1'
 20+
 21+
 22+from datetime import datetime
 23+from dateutil.relativedelta import *
 24+
 25+
 26+def kaggle_sanity_check(var, editor, **kwargs):
 27+ end_date = datetime(2011, 2, 1)
 28+ cutoff = datetime(2010, 9, 1)
 29+ start_date = datetime(2009, 9, 1)
 30+ edits = editor['edit_count']
 31+ active = 0
 32+ count = 0
 33+ while start_date < cutoff:
 34+ year = str(start_date.year)
 35+ month = str(start_date.month)
 36+ #namespaces = edits.get(year, {}).get(month, {}).keys()
 37+ #for ns in namespaces:
 38+ count += edits.get(year, {}).get(month, {}).get('0', 0)
 39+ start_date = start_date + relativedelta(months= +1)
 40+
 41+ if count > 0:
 42+ while start_date < end_date:
 43+ year = str(start_date.year)
 44+ month = str(start_date.month)
 45+ active += edits.get(year, {}).get(month, {}).get('0', 0)
 46+ start_date = start_date + relativedelta(months= +1)
 47+ if active > 0:
 48+ var.add(cutoff, 1)
 49+
 50+ return var
Index: trunk/tools/editor_trends/analyses/plugins/sor_newbie_treatment.py
@@ -0,0 +1,42 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-01-25'
 19+__version__ = '0.1'
 20+
 21+from classes import storage
 22+
 23+def sor_newbie_treatment(editor, var, **kwargs):
 24+ rts = kwargs.pop('rts')
 25+ tenth_edit = editor['new_wikipedian']
 26+ title = ':%s' % editor['username']
 27+ collection = '%s%s_diffs_dataset' % (rts.language.code, rts.project.name)
 28+ db = storage.init_database(rts.storage, rts.dbname, collection)
 29+
 30+ if tenth_edit != False:
 31+ qualifier = {'ns': 3, 'timestamp': {'$lt':tenth_edit}}
 32+ observations = db.find_one(qualifier)
 33+ else:
 34+ observations = db.find_one('editor', editor)
 35+
 36+ if observations != None:
 37+ for obs in observations:
 38+ if obs['ns'] == 3:
 39+ values = obs.values()
 40+ print values
 41+
 42+
 43+
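
Note on sor_newbie_treatment: if find_one() behaves like pymongo's and returns a single document, then 'for obs in observations' iterates the document's keys and obs['ns'] raises a TypeError. A cursor-returning find() is probably the intent; a hedged sketch, assuming the storage wrapper mirrors pymongo's find/find_one split:

def newbie_observations(db, tenth_edit):
    # find() yields documents (dicts); find_one() returns a single dict,
    # whose iteration would yield key strings instead of documents.
    qualifier = {'ns': 3, 'timestamp': {'$lt': tenth_edit}}
    for obs in db.find(qualifier):
        if obs['ns'] == 3:
            print(obs.values())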
Index: trunk/tools/editor_trends/analyses/network/community_graph.py
@@ -0,0 +1,63 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-01-10'
 19+__version__ = '0.1'
 20+
 21+import sys
 22+if '../../' not in sys.path:
 23+ sys.path.append('../../')
 24+
 25+from classes import settings
 26+settings = settings.Settings()
 27+from classes import storage
 28+from utils import file_utils
 29+
 30+try:
 31+ import psyco
 32+ psyco.full()
 33+except ImportError:
 34+ pass
 35+
 36+def create_articles_set(edits):
 37+ s = set()
 38+ years = edits.keys()
 39+ for year in years:
 40+ for edit in edits[year]:
 41+ s.add(edit['article'])
 42+ return s
 43+
 44+
 45+def create_edgelist(project, collection):
 46+ db = storage.init_database('mongo', project, collection)
 47+ ids = db.retrieve_distinct_keys('editor')
 48+ ids.sort()
 49+ fh = file_utils.create_txt_filehandle(settings.dataset_location, '%s_edgelist.csv' % project, 'w', 'utf-8')
 50+ for i in ids:
 51+ author_i = db.find_one({'editor': i})
 52+ if author_i != None:
 53+ article_i = create_articles_set(author_i['edits'])
 54+ for j in ids:
 55+ if i > j:
 56+ author_j = db.find_one({'editor': j})
 57+ article_j = create_articles_set(author_j['edits'])
 58+ common = article_i.intersection(article_j)
 59+ if len(common) > 0:
 60+ file_utils.write_list_to_csv([i, j, len(common)], fh, recursive=False, newline=True)
 61+ fh.close()
 62+
 63+if __name__ == '__main__':
 64+ create_edgelist('wikilytics', 'enwiki_editors_raw')
Property changes on: trunk/tools/editor_trends/analyses/network/community_graph.py
___________________________________________________________________
Added: svn:eol-style
165 + native
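
The rewritten create_edgelist enumerates each unordered pair of editors once via the 'i > j' guard and weights an edge by the number of co-edited articles. A condensed sketch of that pairing logic on toy data:

# Toy article sets per editor id; the real sets come from create_articles_set().
articles = {'a': set([1, 2, 3]), 'b': set([2, 3]), 'c': set([4])}
ids = sorted(articles)
for i in ids:
    for j in ids:
        if i > j:  # visit each unordered pair exactly once
            common = articles[i] & articles[j]
            if common:
                print('%s\t%s\t%s' % (i, j, len(common)))  # prints 'b\ta\t2'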
Index: trunk/tools/editor_trends/analyses/network/graph_db.py
@@ -0,0 +1,82 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2010-11-25'
 19+__version__ = '0.1'
 20+
 21+import codecs
 22+from neo4jrestclient import GraphDatabase, NotFoundError, request
 23+
 24+request.CACHE = True
 25+
 26+class IDGenerator:
 27+ def __init__(self):
 28+ self.n = 0
 29+ self.ids = {}
 30+ self.inverted_ids = {}
 31+
 32+ def invert_dict(self):
 33+ return dict((v, k) for k, v in self.ids.iteritems())
 34+
 35+ def get_id(self, n):
 36+ if n not in self.ids:
 37+ self.ids[n] = self.n
 38+ self.n += 1
 39+ return self.ids[n]
 40+
 41+ def reverse_lookup(self, n):
 42+ if self.inverted_ids == {}:
 43+ self.inverted_ids = self.invert_dict()
 44+ return self.inverted_ids[n]
 45+
 46+
 47+def read_edgelist():
 48+ fh = codecs.open('C:\\Users\\diederik.vanliere\\Dropbox\\wsor\\diederik\\wikilytics_edgelist.csv', 'r', 'utf-8')
 49+ for line in fh:
 50+ line = line.strip()
 51+ line = line.split('\t')
 52+ actor_a = line[0]
 53+ actor_b = line[1]
 54+ weight = int(line[2])
 55+ yield (actor_a, actor_b, weight)
 56+ fh.close()
 57+
 58+def init_db():
 59+ gdb = GraphDatabase("http://localhost:7474/db/data/")
 60+ return gdb
 61+
 62+def get_node(gdb, idg, node):
 63+ node = idg.get_id(node)
 64+ try:
 65+ #n = gdb.nodes.get('id', node)
 66+ n = gdb.nodes[node]
 67+ except NotFoundError:
 68+ n = gdb.nodes.create(id=node)
 69+ n['id'] = node
 70+
 71+ return n
 72+
 73+def load_data():
 74+ idg = IDGenerator()
 75+ gdb = init_db()
 76+ for (actor_a, actor_b, weight) in read_edgelist():
 77+ n1 = get_node(gdb, idg, actor_a)
 78+ n2 = get_node(gdb, idg, actor_b)
 79+ n1.relationships.create("cognitive_distance", n2, weight=weight)
 80+
 81+if __name__ == '__main__':
 82+ load_data()
 83+
Property changes on: trunk/tools/editor_trends/analyses/network/graph_db.py
___________________________________________________________________
Added: svn:eol-style
184 + native
Index: trunk/tools/editor_trends/analyses/adhoc/community_graph.py
@@ -1,62 +0,0 @@
2 -#!/usr/bin/python
3 -# -*- coding: utf-8 -*-
4 -'''
5 -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
6 -This program is free software; you can redistribute it and/or
7 -modify it under the terms of the GNU General Public License version 2
8 -as published by the Free Software Foundation.
9 -This program is distributed in the hope that it will be useful,
10 -but WITHOUT ANY WARRANTY; without even the implied warranty of
11 -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 -See the GNU General Public License for more details, at
13 -http://www.fsf.org/licenses/gpl.html
14 -'''
15 -
16 -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
17 -__email__ = 'dvanliere at gmail dot com'
18 -__date__ = '2011-01-10'
19 -__version__ = '0.1'
20 -
21 -import sys
22 -if '..' not in sys.path:
23 - sys.path.append('..')
24 -
25 -from classes import settings
26 -settings = settings.Settings()
27 -from classes import storage
28 -from utils import file_utils
29 -
30 -try:
31 - import psyco
32 - psyco.full()
33 -except ImportError:
34 - pass
35 -
36 -def create_articles_set(edits):
37 - s = set()
38 - years = edits.keys()
39 - for year in years:
40 - for edit in edits[year]:
41 - s.add(edit['article'])
42 - return s
43 -
44 -
45 -def create_edgelist(project, collection):
46 - db = storage.init_database(rts.storage, project, collection)
47 - ids = db.retrieve_distinct_keys('editor')
48 - ids.sort()
49 - fh = file_utils.create_txt_filehandle(settings.dataset_location, '%s_edgelist.csv' % project, 'w', 'utf-8')
50 - for i in ids:
51 - author_i = conn[collection].find_one({'editor': i})
52 - article_i = create_articles_set(author_i['edits'])
53 - for j in ids:
54 - if i > j:
55 - author_j = conn[collection].find_one({'editor': j})
56 - article_j = create_articles_set(author_j['edits'])
57 - common = article_i.intersection(article_j)
58 - if len(common) > 0:
59 - file_utils.write_list_to_csv([i, j, len(common)], fh, recursive=False, newline=True)
60 - fh.close()
61 -
62 -if __name__ == '__main__':
63 - create_edgelist('enwiki', 'editors')
Index: trunk/tools/editor_trends/analyses/adhoc/benchmarker_queue.py
@@ -0,0 +1,24 @@
 2+from Queue import Queue
 3+#import cProfile
 4+from guppy import hpy
 5+h = hpy()
 6+
 7+q1, q2, q3 = Queue(), Queue(), Queue()
 8+h.heap()
 9+print 'ughh'
 10+for x in xrange(1000):
 11+ q1.put(x)
 12+ q2.put({})
 13+ q3.put([])
 14+ #h = hpy()
 15+hpy().doc
 16+h.heap()
 17+# for x in xrange(100):
 18+# a = q1.get()
 19+# b = q2.get()
 20+# c = q3.get()
 21+# h.heap()
 22+
 23+#if __name__ == '__main__':
 24+# main()
 25+ #cProfile.run('main()')
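
benchmarker_queue.py compares whole-heap snapshots taken before and after filling three queues. A tighter variant, assuming guppy's documented setrelheap(), reports only the allocations made after a baseline:

from Queue import Queue
from guppy import hpy

h = hpy()
q = Queue()
h.setrelheap()         # treat the current heap as the baseline
for x in xrange(1000):
    q.put(x)
print(h.heap())        # shows only objects allocated since the baseline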
Index: trunk/tools/editor_trends/etl/store.py
@@ -79,7 +79,7 @@
80 80 date = text_utils.convert_timestamp_to_datetime_utc(line[6])
81 81 md5 = line[7]
82 82 revert = int(line[8])
83 - reverted_user = int(line[9])
 83+ reverted_user = line[9]
84 84 reverted_rev_id = int(line[10])
85 85 bot = int(line[11])
86 86 cur_size = int(line[12])
@@ -96,12 +96,10 @@
97 97 'cur_size':cur_size,
98 98 'delta':delta,
99 99 'bot':bot,
 100+ 'reverted_user': reverted_user,
 101+ 'reverted_rev_id': reverted_rev_id
100 102 }
101 103
102 - if reverted_user > -1:
103 - data['reverted_user'] = reverted_user,
104 - data['reverted_rev_id'] = reverted_rev_id
105 -
106 104 return data
107 105
108 106
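
This store.py hunk makes two changes: reverted_user is no longer coerced with int() (presumably because the field is not always numeric), and both revert-related keys are written unconditionally so every document shares one schema. It also retires a subtle bug: the old assignment data['reverted_user'] = reverted_user, ended in a stray comma and therefore stored a one-element tuple. A sketch of the resulting document, with hypothetical values:

data = {
    'cur_size': 2048,
    'delta': -12,
    'bot': 0,
    'reverted_user': '-1',    # raw string, present even when nothing was reverted
    'reverted_rev_id': 123456,
}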
Index: trunk/tools/editor_trends/etl/sort.py
@@ -55,11 +55,6 @@
56 56 fh.close()
57 57 for x, d in enumerate(data):
58 58 d = d.strip().split('\t')
59 - #TEMP FIX:
60 - #editor = d[2]
61 - #d[2] = d[0]
62 - #d[0] = editor
63 - #END TEMP FIX
64 59 data[x] = d
65 60 #data = [d.strip() for d in data]
66 61 #data = [d.split('\t') for d in data]
@@ -153,7 +148,7 @@
154 149 pbar = progressbar.ProgressBar(maxval=len(files)).start()
155 150 tasks = multiprocessing.JoinableQueue()
156 151 result = multiprocessing.JoinableQueue()
157 - number_of_processes = 3
 152+ number_of_processes = 2
158 153 sorters = [Sorter(rts, tasks, result) for x in xrange(number_of_processes)]
159154
160 155 for filename in files:
@@ -166,16 +161,14 @@
167 162 sorter.start()
168 163
169 164 ppills = number_of_processes
170 - while True:
171 - while ppills > 0:
172 - try:
173 - res = result.get(block=True)
174 - if res == True:
175 - pbar.update(pbar.currval + 1)
176 - else:
177 - ppills -= 1
178 - except Empty:
179 - pass
180 - break
 165+ while ppills > 0:
 166+ try:
 167+ res = result.get()
 168+ if res == True:
 169+ pbar.update(pbar.currval + 1)
 170+ else:
 171+ ppills -= 1
 172+ except Empty:
 173+ pass
181 174
182 175 tasks.join()
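
This sort.py hunk flattens the consumer side of the poison-pill protocol: each Sorter puts one sentinel on the result queue when it exits, so the consumer only has to loop until it has collected one sentinel per process. With a blocking result.get() the Empty handler is effectively dead code; a minimal sketch of the pattern:

def drain_results(result, number_of_processes, pbar):
    ppills = number_of_processes
    while ppills > 0:
        res = result.get()                 # blocking get, so Empty never fires
        if res == True:
            pbar.update(pbar.currval + 1)  # one input file finished
        else:
            ppills -= 1                    # a worker sent its poison pill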
Index: trunk/tools/editor_trends/kaggle/training.py
@@ -1,141 +0,0 @@
2 -#!/usr/bin/python
3 -# -*- coding: utf-8 -*-
4 -'''
5 -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
6 -This program is free software; you can redistribute it and/or
7 -modify it under the terms of the GNU General Public License version 2
8 -as published by the Free Software Foundation.
9 -This program is distributed in the hope that it will be useful,
10 -but WITHOUT ANY WARRANTY; without even the implied warranty of
11 -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 -See the GNU General Public License for more details, at
13 -http://www.fsf.org/licenses/gpl.html
14 -'''
15 -
16 -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
17 -__email__ = 'dvanliere at gmail dot com'
18 -__date__ = '2011-04-12'
19 -__version__ = '0.1'
20 -
21 -import os
22 -import sys
23 -import cPickle
24 -import codecs
25 -from datetime import datetime
26 -sys.path.append('../')
27 -
28 -from classes import storage
29 -
30 -location = '/home/diederik/wikimedia/en/wiki/kaggle'
31 -files = os.listdir(location)
32 -files.reverse()
33 -
34 -max_size = 2147483648
35 -max_size_reached = False
36 -
37 -t0 = datetime.now()
38 -titles = {}
39 -ids = set()
40 -dates = {}
41 -edits = {}
42 -ignore_ids = set()
43 -size = 0
44 -cnt_obs = 0
45 -cutoff_date = datetime(2010, 8, 31)
46 -
47 -print 'Constructing training dataset...'
48 -db = storage.init_database('mongo', 'wikilytics', 'enwiki_editors_dataset')
49 -dataset = codecs.open('training.tsv', 'w', 'utf-8')
50 -for filename in files:
51 - if not filename.startswith('comments') and not filename.startswith('articles'):
52 - fh = codecs.open(os.path.join(location, filename))
53 - if max_size_reached == True:
54 - break
55 - for line in fh:
56 - line = line.strip()
57 - line = line.split('\t')
58 - if len(line) != 12:
59 - continue
60 - if line[10] == '1':
61 - continue
62 - timestamp = datetime.strptime(line[6], '%Y-%m-%dT%H:%M:%SZ')
63 - if timestamp > cutoff_date:
64 - continue
65 - username = line[3].lower()
66 - if username.endswith('bot') or username.find('script') > -1:
67 - #line[10] = '1'
68 - continue
69 - id = line[2]
70 - if id not in ids and id not in ignore_ids:
71 - res = db.find_one({'editor': id})
72 - if res == None:
73 - ignore_ids.add(id)
74 - continue
75 - cnt_obs += 1
76 - title_id = line[1]
77 - ids.add(id)
78 - simple_date = '%s-%s' % (timestamp.year, timestamp.month)
79 - dates.setdefault(simple_date, 0)
80 - dates[simple_date] += 1
81 - title = line.pop(5)
82 - titles[title_id] = title
83 - line.append('\n')
84 - line = '\t'.join(line)
85 - size += len(line)
86 - if size > max_size:
87 - max_size_reached = True
88 - dataset.write(line.decode('utf-8'))
89 -
90 -dataset.close()
91 -
92 -print 'Constructing title dataset...'
93 -fh = codecs.open('titles.tsv', 'w', 'utf-8')
94 -for id, title in titles.iteritems():
95 - fh.write('%s\t%s\n' % (id, title.decode('utf-8')))
96 -fh.close()
97 -
98 -
99 -print 'Constructing solution dataset...'
100 -x = 0
101 -fh = codecs.open('solutions.tsv', 'w', 'utf-8')
102 -for id in ids:
103 - if id not in ignore_ids:
104 - obs = db.find_one({'editor': str(id)}, 'cum_edit_count_main_ns')
105 - if obs != None:
106 - x += 1
107 - n = obs['cum_edit_count_main_ns']
108 - fh.write('%s,%s\n' % (id.decode('utf-8'), n))
109 - edits.setdefault(n, 0)
110 - edits[n] += 1
111 - else:
112 - print id
113 -fh.close()
114 -
115 -print 'Storing date histogram'
116 -fh = open('histogram_dates.bin', 'wb')
117 -cPickle.dump(dates, fh)
118 -fh.close()
119 -
120 -
121 -fh = open('histogram_dates.tsv', 'w')
122 -for date, n in dates.iteritems():
123 - fh.write('%s\t%s\n' % (date, n))
124 -fh.close()
125 -
126 -
127 -print 'Storing edit histogram'
128 -fh = open('histogram_edits.bin', 'wb')
129 -cPickle.dump(edits, fh)
130 -fh.close()
131 -
132 -fh = open('histogram_edits.tsv', 'w')
133 -for edit, n in edits.iteritems():
134 - fh.write('%s\t%s\n' % (edit, n))
135 -fh.close()
136 -
137 -
138 -t1 = datetime.now()
139 -print 'Descriptives:'
140 -print 'Number of editors: %s' % x
141 -print 'Number of edits: %s' % cnt_obs
142 -print 'It took %s to construct the Kaggle training set' % (t1 - t0)
Index: trunk/tools/editor_trends/kaggle/training_file.py
@@ -0,0 +1,430 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)'])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-04-12'
 19+__version__ = '0.1'
 20+
 21+import os
 22+import sys
 23+import cPickle
 24+import codecs
 25+import random
 26+from itertools import izip_longest
 27+from datetime import datetime
 28+from dateutil.relativedelta import *
 29+sys.path.append('../')
 30+import resource
 31+
 32+random.seed(1024)
 33+from classes import storage
 34+
 35+headers = ['user_id', 'article_id', 'revision_id', 'namespace', 'timestamp',
 36+ 'md5', 'revert', 'reverted_user', 'reverted_rev_id', 'delta', 'cur_size']
 37+keys = ['user_id', 'article_id', 'rev_id', 'ns', 'date',
 38+ 'hash', 'revert', 'reverted_user', 'reverted_rev_id', 'delta', 'cur_size']
 39+
 40+size = 0 #current size of file
 41+#max_size = 2147483648
 42+max_size = 5000000
 43+editors_seen = {}
 44+cnt_obs = 0 #count of number of edits
 45+revs = {}
 46+titles = {}
 47+predictions = {}
 48+
 49+t0 = datetime.now()
 50+location = '/home/diederik/wikimedia/xml/en/wiki/txt/'
 51+txt_files = '/home/diederik/wikimedia/xml/en/wiki/sorted/'
 52+files = os.listdir(location)
 53+max_file_handles = resource.getrlimit(resource.RLIMIT_NOFILE)[0] - 100
 54+#files.sort()
 55+#files.reverse()
 56+
 57+cutoff_date = datetime(2010, 8, 31) #operator is >
 58+end_date = datetime(2011, 2, 1) #operator is <
 59+cutoff_date_training = datetime(2010, 1, 31) #operator is >
 60+end_date_training = datetime(2010, 9, 1) # operator is <
 61+
 62+
 63+class IDGenerator:
 64+ def __init__(self):
 65+ self.n = 0
 66+ self.ids = {}
 67+ self.rnd_ids = {}
 68+ self.inverted_ids = None
 69+
 70+ def invert_dict(self, dictionary):
 71+ return dict((v, k) for k, v in dictionary.iteritems())
 72+
 73+ def get_id(self, n):
 74+ if n not in self.ids:
 75+ self.n += 1
 76+ while len(self.rnd_ids) < self.n :
 77+ rnd_id = self.get_random_id()
 78+ if self.rnd_ids.get(rnd_id, False) == False:
 79+ self.rnd_ids[rnd_id] = True
 80+ self.ids[n] = rnd_id
 81+ return self.ids[n]
 82+
 83+ def get_random_id(self):
 84+ return random.randrange(0, 1000000)
 85+
 86+ def reverse_lookup(self, n):
 87+ self.inverted_ids = self.invert_dict(self.ids)
 88+ return self.inverted_ids[n]
 89+
 90+
 91+def construct_article_meta(fh_articles, files):
 92+ print 'Constructing title dataset...'
 93+ headers = ['article_id', 'category', 'timestamp', 'namespace', 'redirect', 'title', 'related_page']
 94+ write_headers(fh_articles, headers)
 95+ #fh_articles.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % ('article_id', 'category', 'timestamp', 'namespace', 'redirect', 'title', 'related_page'))
 96+ article_meta = {}
 97+ for filename in files:
 98+ if filename.startswith('articles_meta'):
 99+ fh = codecs.open(os.path.join(location, filename))
 100+ for line in fh:
 101+ line = line.strip()
 102+ line = line.split('\t')
 103+ category = line[1]
 104+ if category != 'List':
 105+ title = line[2]
 106+ title = title.split('/')
 107+ article_meta.setdefault(title[-1], {})
 108+ article_meta[title[-1]]['category'] = category
 109+ article_meta[title[-1]]['id'] = line[0]
 110+ fh.close()
 111+ return article_meta
 112+
 113+
 114+def determine_active(edits, start_date, end_date):
 115+ active = 0
 116+ namespaces = ['0', '1', '2', '3', '4', '5']
 117+ if start_date == datetime(2009, 9, 1):
 118+ if '2009' not in edits and '2010' not in edits:
 119+ return active
 120+ elif start_date == datetime(2010, 9, 1):
 121+ if '2010' not in edits and '2011' not in edits:
 122+ return active
 123+
 124+ while start_date < end_date:
 125+ year = str(start_date.year)
 126+ month = str(start_date.month)
 127+ for ns in namespaces:
 128+ active += edits.get(year, {}).get(month, {}).get(ns, 0)
 129+ if active > 0: #we don't need to know how many edits, just if active
 130+ return active
 131+ start_date = start_date + relativedelta(months= +1)
 132+ return active
 133+
 134+def load_binary_file(filename):
 135+ fh = open(filename, 'rb')
 136+ obj = cPickle.load(fh)
 137+ fh.close()
 138+ return obj
 139+
 140+
 141+def convert_tz_to_mysql_tz(tz):
 142+ iso = tz.__str__()
 143+ tz = iso[0:4] + '-' + iso[4:6] + '-' + iso[6:]
 144+ return tz
 145+
 146+
 147+def check_reverter(idg, reverter):
 148+ try:
 149+ reverter = int(reverter)
 150+ if reverter != -1:
 151+ reverter = idg.get_id(reverter)
 152+ return reverter
 153+ except ValueError:
 154+ pass
 155+ return -1
 156+
 157+
 158+def check_user_id(user_id):
 159+ try:
 160+ int(user_id)
 161+ except ValueError:
 162+ return False
 163+ return True
 164+
 165+
 166+def check_username(username):
 167+ username = username.lower()
 168+ if username.endswith('bot') or username.find('script') > -1:
 169+ return False #exclude more bots and scripts
 170+ return True
 171+
 172+
 173+def determine_editors(db):
 174+ start_date_pre = datetime(2009, 9, 1)
 175+ end_date_pre = datetime(2010, 9, 1)
 176+ end_date = datetime(2011, 2, 1)
 177+ pre_editors = set()
 178+ post_editors = set()
 179+ #cursor = db.find({'date': {'$gte': start_date_pre, '$lt': end_date_pre}}, 'first_edit,edit_count,user_id,username')
 180+ cursor = db.find({}, 'first_edit,edit_count,user_id,username')
 181+ x, y, z = 0, 0, 0
 182+ for editor in cursor:
 183+ x += 1
 184+ if 'first_edit' not in editor:
 185+ continue
 186+ if editor['first_edit'] >= end_date_pre:
 187+ continue
 188+ if check_username(editor['username']) == False:
 189+ continue
 190+ if check_user_id(editor['editor']) == False:
 191+ continue
 192+
 193+ #print editor['edit_count']
 194+ active = determine_active(editor['edit_count'], start_date_pre, end_date_pre)
 195+ if active > 0:
 196+ pre_editors.add(editor['editor'])
 197+ y += 1
 198+ active = determine_active(editor['edit_count'], end_date_pre, end_date)
 199+ if active > 0:
 200+ post_editors.add(editor['editor'])
 201+ z += 1
 202+ if x % 100000 == 0:
 203+ print 'Retrieved %s pre_editors / %s post_editors / %s total editors...' % (y, z, x)
 204+
 205+ #set_a = pre_editors.difference(post_editors)
 206+ post_editors = pre_editors.intersection(post_editors)
 207+
 208+ return pre_editors, post_editors
 209+
 210+
 211+def write_headers(fh, headers):
 212+ for i, key in enumerate(headers):
 213+ fh.write('%s' % key)
 214+ if (i + 1) != len(headers):
 215+ fh.write('\t')
 216+ else:
 217+ fh.write('\n')
 218+
 219+def write_revision(dataset, revision):
 220+ size = 0
 221+ for i, key in enumerate(keys):
 222+ #print key, revision[key]
 223+# if key == 'reverted_user' or key == 'reverted_rev_id':
 224+# revision[key] = revision[key][0]
 225+ if type(revision[key]) == type(0):
 226+ revision[key] = str(revision[key])
 227+
 228+ dataset.write('%s' % revision[key].decode('utf-8'))
 229+ size += len(revision[key])
 230+ if (i + 1) != len(keys):
 231+ dataset.write('\t')
 232+ else:
 233+ dataset.write('\n')
 234+ return size
 235+
 236+
 237+print 'Constructing training dataset...'
 238+db_dataset = storage.init_database('mongo', 'wikilytics', 'enwiki_editors_dataset')
 239+print 'Loading editors...'
 240+if not os.path.exists('set_a.bin'):
 241+ pre_editors, post_editors = determine_editors(db_dataset)
 242+ fh = open('set_a.bin', 'wb')
 243+ cPickle.dump(pre_editors, fh)
 244+ fh.close()
 245+
 246+ fh = open('set_b.bin', 'wb')
 247+ cPickle.dump(post_editors, fh)
 248+ fh.close()
 249+else:
 250+ pre_editors = load_binary_file('set_a.bin')
 251+ post_editors = load_binary_file('set_b.bin')
 252+
 253+
 254+dataset = codecs.open('training.tsv', 'w', 'utf-8')
 255+write_headers(dataset, headers)
 256+idg = IDGenerator()
 257+
 258+
 259+
 260+print 'Parsing revisions...'
 261+db_raw = storage.init_database('mongo', 'wikilytics', 'enwiki_editors_raw')
 262+seen_editors = {}
 263+for editors in izip_longest(post_editors, pre_editors, fillvalue=None):
 264+ for editor in editors:
 265+ go = editors_seen.get(editor, True)
 266+ if go:
 267+ #if editor:
 268+ editors_seen[editor] = False
 269+ print 'Parsing editor %s...' % editor
 270+ #revisions = db_raw.find({'user_id': editor})
 271+ file_id = int(editor) % max_file_handles
 272+ fh = codecs.open(os.path.join(txt_files, '%s.csv' % file_id), 'r', 'utf-8')
 273+ for line in fh:
 274+ line = line.strip()
 275+ line = line.split('\t')
 276+ if line[0] != editor:
 277+ continue
 278+ revision = {}
 279+ revision['user_id'] = int(line[0])
 280+ revision['article_id'] = int(line[1])
 281+ revision['rev_id'] = int(line[2])
 282+ revision['ns'] = line[4]
 283+ revision['date'] = datetime.strptime(line[6], '%Y-%m-%dT%H:%M:%SZ')
 284+ revision['hash'] = line[7]
 285+ revision['revert'] = line[8]
 286+ revision['reverted_user'] = line[9]
 287+ revision['reverted_rev_id'] = line[10]
 288+ revision['cur_size'] = line[12]
 289+ revision['delta'] = line[13]
 290+ #print line
 291+ #print revision
 292+
 293+ #'user_id', 'article_id', 'rev_id', 'ns', 'date',
 294+ #'hash', 'revert', 'reverted_user', 'reverted_rev_id', 'delta', 'cur_size'
 295+ #print 'Editor %s made % edits' % (editor, len(revisions))
 296+ #for revision in revisions:
 297+ user_id = idg.get_id(revision['user_id'])
 298+ revision['user_id'] = user_id #recode id to make it harder to look up answers
 299+ if revision['ns'] < 0:
 300+ continue
 301+ timestamp = revision['date']
 302+ #revision['date'] = convert_tz_to_mysql_tz(timestamp)
 303+
 304+ predictions.setdefault(user_id, {})
 305+ predictions[user_id].setdefault('solution', 0)
 306+ predictions[user_id].setdefault('training', 0)
 307+
 308+ if timestamp > cutoff_date and timestamp < end_date:
 309+ predictions[user_id]['solution'] += 1
 310+ elif timestamp > cutoff_date_training and timestamp < end_date_training:
 311+ predictions[user_id]['training'] += 1
 312+ if timestamp > cutoff_date: #exclude edits after cut off date
 313+ continue
 314+
 315+ revision['reverted_user'] = check_reverter(idg, revision.get('reverted_user', -1))
 316+ #revision.pop('_id')
 317+ #revision.pop('username')
 318+ revision['date'] = revision['date'].__str__()
 319+ titles[revision['article_id']] = True
 320+ revs[revision['rev_id']] = True
 321+ size += write_revision(dataset, revision)
 322+ cnt_obs += 1
 323+ if cnt_obs % 10000 == 0:
 324+ print 'Parsed %s revisions...' % cnt_obs
 325+ fh.close()
 326+ if size > max_size:
 327+ break
 328+if size > max_size:
 329+ print 'Reached maximum filesize...'
 330+else:
 331+ print 'Parsed all available editors in post set...'
 332+dataset.close()
 333+
 334+
 335+
 336+print 'Constructing solution dataset...'
 337+fh = codecs.open('solutions.csv', 'w', 'utf-8')
 338+keys = predictions.keys()
 339+keys.sort()
 340+fh.write('%s,%s\n' % ('editor_id', 'solution'))
 341+for key in keys:
 342+ fh.write('%s,%s\n' % (key, predictions[key]['solution']))
 343+fh.close()
 344+
 345+
 346+print 'Constructing test dataset...'
 347+fh = codecs.open('test.csv', 'w', 'utf-8')
 348+fh.write('%s,%s\n' % ('editor_id', 'test'))
 349+for key, value in predictions.iteritems():
 350+ fh.write('%s,%s\n' % (key, value['training']))
 351+fh.close()
 352+
 353+
 354+print 'Constructing article file...'
 355+fh_articles = codecs.open('titles.tsv', 'w', 'utf-8')
 356+article_meta = construct_article_meta(fh_articles, files)
 357+for filename in files:
 358+ if filename.startswith('articles') and not filename.startswith('articles_meta'):
 359+ fh = codecs.open(os.path.join(location, filename))
 360+ for line in fh:
 361+ line = line.strip()
 362+ line = line.split('\t')
 363+ if len(line) == 6:
 364+ article_id = int(line[0])
 365+ title = titles.get(article_id, None)
 366+ if title:
 367+ title = line[-1]
 368+ meta = article_meta.get(title, None)
 369+ parent_id = -1
 370+ category = 'Null'
 371+ if meta:
 372+ parent_id = meta['id']
 373+ category = meta['category']
 374+
 375+ line[1] = category
 376+ line[2] = convert_tz_to_mysql_tz(line[2])
 377+ line[-1] = line[-1].decode('utf-8')
 378+ line.append(str(parent_id))
 379+ line.append('\n')
 380+ fh_articles.write('\t'.join(line))
 381+ fh.close()
 382+fh_articles.close()
 383+
 384+
 385+print 'Constructing comment dataset...'
 386+fh_comments = codecs.open('comments.tsv', 'w', 'utf-8')
 387+fh_comments.write('%s\t%s\n' % ('rev_id', 'text'))
 388+cnt = len(revs.keys())
 389+for filename in files:
 390+ if filename.startswith('comments'):
 391+ fh = codecs.open(os.path.join(location, filename))
 392+ for line in fh:
 393+ if cnt == 0:
 394+ break
 395+ line = line.strip()
 396+ line = line.split('\t')
 397+ if len(line) == 2: #some lines are missing rev id, not sure why.
 398+ try:
 399+ rev_id = int(line[0])
 400+ exists = revs.get(rev_id, None)
 401+ if exists:
 402+ fh_comments.write('%s\t%s\n' % (rev_id, line[1].decode('utf-8')))
 403+ cnt -= 1
 404+ except (ValueError, KeyError), error:
 405+ print error
 406+ fh.close()
 407+fh_comments.close()
 408+
 409+print 'Storing random ids...'
 410+fh = open('random_ids.bin', 'wb')
 411+cPickle.dump(idg, fh)
 412+fh.close()
 413+
 414+
 415+fh = open('descriptives.tsv', 'w')
 416+fh.write('Number of unique editors: %s\n' % idg.n)
 417+fh.write('Number of revisions: %s\n' % cnt_obs)
 418+fh.write('Number of pre-editors: %s\n' % len(pre_editors))
 419+fh.write('Number of post-editors: %s\n' % len(post_editors))
 420+fh.write('Number of editors with zero edits after August 30th, 2010: %s' % (len(pre_editors) - len(post_editors)))
 421+fh.close()
 422+
 423+
 424+t1 = datetime.now()
 425+print 'Descriptives:'
 426+print 'Number of unique editors: %s' % idg.n
 427+print 'Number of revisions: %s' % cnt_obs
 428+print 'Number of pre-editors: %s' % len(pre_editors)
 429+print 'Number of post-editors: %s' % len(post_editors)
 430+print 'Number of editors with zero edits after August 30th, 2010: %s' % (len(pre_editors) - len(post_editors))
 431+print 'It took %s to construct the Kaggle training set' % (t1 - t0)
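
Both Kaggle scripts recode editors through a random id map so that published ids cannot be joined back to Wikipedia accounts; random.seed(1024) keeps the mapping reproducible across runs. A condensed, self-contained version of the RandomIDGenerator idea:

import random

random.seed(1024)  # reproducible mapping, as in the scripts above

class RandomIDGenerator(object):
    def __init__(self):
        self.ids = {}      # real id -> random id
        self.used = set()  # random ids already handed out

    def get_id(self, real_id):
        if real_id not in self.ids:
            rnd = random.randrange(0, 1000000)
            while rnd in self.used:  # retry until unique
                rnd = random.randrange(0, 1000000)
            self.used.add(rnd)
            self.ids[real_id] = rnd
        return self.ids[real_id]

idg = RandomIDGenerator()
assert idg.get_id(42) == idg.get_id(42)  # stable per editor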
Index: trunk/tools/editor_trends/kaggle/training_db.py
@@ -0,0 +1,452 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)'])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-04-12'
 19+__version__ = '0.1'
 20+
 21+import os
 22+import sys
 23+import cPickle
 24+import codecs
 25+import random
 26+from itertools import izip
 27+from datetime import datetime
 28+from dateutil.relativedelta import *
 29+sys.path.append('../')
 30+
 31+random.seed(1024)
 32+from classes import storage
 33+
 34+headers = ['user_id', 'article_id', 'revision_id', 'namespace', 'timestamp',
 35+ 'md5', 'reverted', 'reverted_user_id', 'reverted_revision_id', 'delta', 'cur_size']
 36+keys = ['user_id', 'article_id', 'rev_id', 'ns', 'date',
 37+ 'hash', 'revert', 'reverted_user', 'reverted_rev_id', 'delta', 'cur_size']
 38+
 39+max_size = 2147483648
 40+#max_size = 2000000
 41+cnt_obs = 0 #count of number of edits
 42+revs = {}
 43+titles = {}
 44+predictions = {}
 45+
 46+t0 = datetime.now()
 47+location = '/home/diederik/wikimedia/xml/en/wiki/txt/'
 48+files = os.listdir(location)
 49+#files.sort()
 50+#files.reverse()
 51+editors_seen = {}
 52+cutoff_date = datetime(2010, 9, 1) #operator is >
 53+end_date = datetime(2011, 2, 1) #operator is <
 54+cutoff_date_training = datetime(2010, 1, 31) #operator is >
 55+end_date_training = datetime(2010, 9, 1) # operator is <
 56+
 57+class IDGenerator:
 58+ def __init__(self):
 59+ self.n = 0
 60+ self.ids = {}
 61+
 62+ def get_id(self, n):
 63+ if n not in self.ids:
 64+ self.ids[n] = self.n
 65+ self.n += 1
 66+ return str(self.ids[n])
 67+
 68+class RandomIDGenerator:
 69+ def __init__(self):
 70+ self.n = 0
 71+ self.ids = {}
 72+ self.rnd_ids = {}
 73+ self.inverted_ids = None
 74+
 75+ def invert_dict(self, dictionary):
 76+ return dict((v, k) for k, v in dictionary.iteritems())
 77+
 78+ def get_id(self, n):
 79+ if n not in self.ids:
 80+ self.n += 1
 81+ while len(self.rnd_ids) < self.n :
 82+ rnd_id = self.get_random_id()
 83+ if self.rnd_ids.get(rnd_id, False) == False:
 84+ self.rnd_ids[rnd_id] = True
 85+ self.ids[n] = rnd_id
 86+ return self.ids[n]
 87+
 88+ def get_random_id(self):
 89+ return random.randrange(0, 1000000)
 90+
 91+ def reverse_lookup(self, n):
 92+ self.inverted_ids = self.invert_dict(self.ids)
 93+ return self.inverted_ids[n]
 94+
 95+
 96+def construct_article_meta(fh_articles, files):
 97+ print 'Constructing title dataset...'
 98+ headers = ['article_id', 'category', 'timestamp', 'namespace', 'redirect', 'title', 'related_page']
 99+ write_headers(fh_articles, headers)
 100+ #fh_articles.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % ('article_id', 'category', 'timestamp', 'namespace', 'redirect', 'title', 'related_page'))
 101+ article_meta = {}
 102+ for filename in files:
 103+ if filename.startswith('articles_meta'):
 104+ fh = codecs.open(os.path.join(location, filename))
 105+ for line in fh:
 106+ line = line.strip()
 107+ line = line.split('\t')
 108+ category = line[1]
 109+ if category != 'List':
 110+ title = line[2]
 111+ title = title.split('/')
 112+ article_meta.setdefault(title[-1], {})
 113+ article_meta[title[-1]]['category'] = category
 114+ article_meta[title[-1]]['id'] = line[0]
 115+ fh.close()
 116+ return article_meta
 117+
 118+
 119+def determine_active(edits, start_date, end_date):
 120+ active = 0
 121+ if start_date == datetime(2009, 9, 1):
 122+ if '2009' not in edits and '2010' not in edits:
 123+ return active
 124+# elif start_date == datetime(2010, 9, 1):
 125+# if '2010' not in edits and '2011' not in edits:
 126+# return active
 127+
 128+
 129+ namespaces = ['0', '1', '2', '3', '4', '5']
 130+ while start_date < end_date:
 131+ year = str(start_date.year)
 132+ month = str(start_date.month)
 133+ for ns in namespaces:
 134+ active += edits.get(year, {}).get(month, {}).get(ns, 0)
 135+ if active > 0: #we don't need to know how many edits, just if active
 136+ return active
 137+ start_date = start_date + relativedelta(months= +1)
 138+ return active
 139+
 140+
 141+def load_binary_file(filename):
 142+ fh = open(filename, 'rb')
 143+ obj = cPickle.load(fh)
 144+ fh.close()
 145+ return obj
 146+
 147+
 148+def convert_tz_to_mysql_tz(tz):
 149+ return tz.__str__()
 150+
 151+
 152+def check_reverter(idg, reverter):
 153+ try:
 154+ if reverter != -1:
 155+ reverter = idg.get_id(reverter)
 156+ return reverter
 157+ except ValueError:
 158+ pass
 159+ return -1
 160+
 161+
 162+def check_user_id(user_id):
 163+ try:
 164+ int(user_id)
 165+ except ValueError:
 166+ return False
 167+ return True
 168+
 169+
 170+def check_username(username):
 171+ username = username.lower()
 172+ if username.endswith('bot') or username.find('script') > -1:
 173+ return False #exclude more bots and scripts
 174+ return True
 175+
 176+
 177+def determine_editors(db):
 178+ start_date_pre = datetime(2009, 9, 1)
 179+ end_date_pre = datetime(2010, 9, 1)
 180+ end_date = datetime(2011, 2, 1)
 181+ pre_editors = set()
 182+ post_editors = set()
 183+ cursor = db.find({}, 'first_edit,edit_count,user_id,username')
 184+ x, y, z = 0, 0, 0
 185+ for editor in cursor:
 186+ x += 1
 187+ if 'first_edit' not in editor:
 188+ continue
 189+ if editor['first_edit'] > end_date_pre:
 190+ continue
 191+ if check_username(editor['username']) == False:
 192+ continue
 193+ if check_user_id(editor['user_id']) == False:
 194+ continue
 195+
 196+ active_pre = determine_active(editor['edit_count'], start_date_pre, end_date_pre)
 197+ if x % 100000 == 0:
 198+ print 'Retrieved %s pre_editors / %s post_editors / %s total editors...' % (y, z, x)
 199+
 200+ if active_pre == 0:
 201+ continue #exclude editors who are not active in the year before the cutoff date
 202+ else:
 203+ active_post = determine_active(editor['edit_count'], end_date_pre, end_date)
 204+ if active_post == 0:
 205+ pre_editors.add(editor['user_id'])
 206+ y += 1
 207+ else:
 208+ post_editors.add(editor['user_id'])
 209+ z += 1
 210+ print 'Retrieved %s pre_editors / %s post_editors / %s total editors...' % (y, z, x)
 211+ return pre_editors, post_editors
 212+
 213+
 214+def write_headers(fh, headers):
 215+ for i, key in enumerate(headers):
 216+ fh.write('%s' % key)
 217+ if (i + 1) != len(headers):
 218+ fh.write('\t')
 219+ else:
 220+ fh.write('\n')
 221+
 222+def write_revision(dataset, revision):
 223+ for i, key in enumerate(keys):
 224+ if type(revision[key]) == type(0):
 225+ revision[key] = str(revision[key])
 226+ dataset.write('%s' % revision[key].decode('utf-8'))
 227+ if (i + 1) != len(keys):
 228+ dataset.write('\t')
 229+ else:
 230+ dataset.write('\n')
 231+
 232+
 233+print 'Constructing training dataset...'
 234+db_dataset = storage.init_database('mongo', 'wikilytics', 'enwiki_editors_dataset')
 235+print 'Loading editors...'
 236+if not os.path.exists('set_a.bin'):
 237+ pre_editors, post_editors = determine_editors(db_dataset)
 238+ fh = open('set_a.bin', 'wb')
 239+ cPickle.dump(pre_editors, fh)
 240+ fh.close()
 241+
 242+ fh = open('set_b.bin', 'wb')
 243+ cPickle.dump(post_editors, fh)
 244+ fh.close()
 245+else:
 246+ pre_editors = load_binary_file('set_a.bin')
 247+ post_editors = load_binary_file('set_b.bin')
 248+
 249+
 250+dataset = codecs.open('training.tsv', 'w', 'utf-8')
 251+write_headers(dataset, headers)
 252+idg = RandomIDGenerator()
 253+
 254+namespaces = IDGenerator()
 255+print 'Parsing revisions...'
 256+db_raw = storage.init_database('mongo', 'wikilytics', 'enwiki_editors_raw')
 257+seen_editors = {}
 258+editors = {}
 259+x = 1
 260+for editor in post_editors:
 261+ #print editor
 262+ editors[x] = editor
 263+ x += 2
 264+x = 0
 265+z = len(post_editors)
 266+for y, editor in enumerate(pre_editors):
 267+ #print editor
 268+ editors[x] = editor
 269+ x += 2
 270+ if z == y:
 271+ break
 272+
 273+editor_keys = editors.keys()
 274+editor_keys.sort()
 275+for key in editor_keys:
 276+ #print editors
 277+ #for editor in editors:
 278+ editor = editors[key]
 279+ #print editor
 280+ go = editors_seen.get(editor, True)
 281+ if go:
 282+ editors_seen[editor] = False
 283+ user_id = idg.get_id(editor)
 284+ print 'Parsing editor %s (%s) ...' % (editor, user_id)
 285+ revisions = db_raw.find({'user_id': str(editor)})
 286+
 287+ predictions.setdefault(user_id, {})
 288+ predictions[user_id].setdefault('solution', 0)
 289+ predictions[user_id].setdefault('training', 0)
 290+
 291+ for revision in revisions:
 292+ revision['user_id'] = user_id #recode id to make it harder to look up answers
 293+ if revision['ns'] < 0 or revision['ns'] > 5:
 294+ continue
 295+ #revision['ns'] = namespaces.get_id(revision['ns'])
 296+ timestamp = revision['date']
 297+ revision['date'] = convert_tz_to_mysql_tz(timestamp)
 298+
 299+
 300+
 301+ if timestamp > cutoff_date:
 302+ #print editor, user_id, timestamp, revision['date']
 303+ if timestamp < end_date:
 304+ predictions[user_id]['solution'] += 1
 305+ elif timestamp > cutoff_date_training:
 306+ if timestamp < end_date_training:
 307+ predictions[user_id]['training'] += 1
 308+
 309+ if timestamp > cutoff_date: #exclude edits after cut off date
 310+ continue
 311+ if revision['revert'] == 1:
 312+ revision['reverted_user'] = check_reverter(idg, revision.get('reverted_user', -1))
 313+ revision.pop('_id')
 314+ revision.pop('username')
 315+ titles[revision['article_id']] = True
 316+ revs[revision['rev_id']] = True
 317+ write_revision(dataset, revision)
 318+ cnt_obs += 1
 319+ if cnt_obs % 10000 == 0:
 320+ print 'Parsed %s revisions...' % cnt_obs
 321+ if dataset.tell() > max_size:
 322+ break
 323+if dataset.tell() > max_size:
 324+ print 'Reached maximum filesize...'
 325+else:
 326+ print 'Parsed all available editors in post set...'
 327+dataset.close()
 328+
 329+
 330+
 331+print 'Constructing solution dataset...'
 332+fh = codecs.open('solutions.csv', 'w', 'utf-8')
 333+editor_keys = predictions.keys()
 334+editor_keys.sort()
 335+fh.write('%s,%s\n' % ('user_id', 'solution'))
 336+for key in editor_keys:
 337+ fh.write('%s,%s\n' % (key, predictions[key]['solution']))
 338+ print key, predictions[key]['solution']
 339+fh.close()
 340+
 341+
 342+print 'Constructing test dataset...'
 343+fh = codecs.open('test.csv', 'w', 'utf-8')
 344+fh.write('%s,%s\n' % ('user_id', 'test'))
 345+for key, value in predictions.iteritems():
 346+ fh.write('%s,%s\n' % (key, value['training']))
 347+fh.close()
 348+
 349+print 'Constructing article file...'
 350+fh_articles = codecs.open('titles.tsv', 'w', 'utf-8')
 351+article_meta = construct_article_meta(fh_articles, files)
 352+categories = IDGenerator()
 353+for filename in files:
 354+ if filename.startswith('articles') and not filename.startswith('articles_meta'):
 355+ fh = codecs.open(os.path.join(location, filename))
 356+ for line in fh:
 357+ line = line.strip()
 358+ line = line.split('\t')
 359+ if len(line) == 6:
 360+ article_id = int(line[0])
 361+ title = titles.pop(article_id, None)
 362+ if title:
 363+ title = line[-1]
 364+ meta = article_meta.get(title, None)
 365+ parent_id = '-1'
 366+ category = -1
 367+ redirect = line[4]
 368+ if redirect == 'False':
 369+ redirect = '0'
 370+ else:
 371+ redirect = '1'
 372+ line[4] = redirect
 373+ if meta:
 374+ parent_id = meta['id']
 375+ category = meta['category']
 376+
 377+
 378+ line[1] = categories.get_id(category)
 379+ tz = datetime.strptime(line[2], '%Y-%m-%dT%H:%M:%SZ')
 380+ line[2] = convert_tz_to_mysql_tz(tz)
 381+ line[-1] = line[-1].decode('utf-8')
 382+ line.append(parent_id)
 383+ line.append('\n')
 384+ fh_articles.write('\t'.join(line))
 385+ fh.close()
 386+fh_articles.close()
 387+
 388+
 389+print 'Constructing comment dataset...'
 390+fh_comments = codecs.open('comments.tsv', 'w', 'utf-8')
 391+fh_comments.write('%s\t%s\n' % ('revision_id', 'comment'))
 392+cnt = len(revs.keys())
 393+for filename in files:
 394+ if filename.startswith('comments'):
 395+ fh = codecs.open(os.path.join(location, filename))
 396+ for line in fh:
 397+ if cnt == 0:
 398+ break
 399+ line = line.strip()
 400+ line = line.split('\t')
 401+ if len(line) == 2: #some lines are missing rev id, not sure why.
 402+ try:
 403+ rev_id = int(line[0])
 404+ exists = revs.get(rev_id, None)
 405+ if exists:
 406+ fh_comments.write('%s\t%s\n' % (rev_id, line[1].decode('utf-8')))
 407+ cnt -= 1
 408+ except (ValueError, KeyError), error:
 409+ print error
 410+ fh.close()
 411+fh_comments.close()
 412+
 413+print 'Storing random ids...'
 414+fh = open('random_ids.bin', 'wb')
 415+cPickle.dump(idg, fh)
 416+fh.close()
 417+
 418+fh = codecs.open('namespaces.tsv', 'w', 'utf-8')
 419+write_headers(fh, ['key', 'namespace'])
 420+namespaces = {'0':'Main',
 421+ '1':'Talk',
 422+ '2':'User',
 423+ '3':'User Talk',
 424+ '4':'Wikipedia',
 425+ '5':'Wikipedia Talk'
 426+ }
 427+for key, value in namespaces.iteritems():
 428+ fh.write('%s\t%s\n' % (key, value))
 429+fh.close()
 430+
 431+fh = codecs.open('categories.tsv', 'w', 'utf-8')
 432+write_headers(fh, ['id', 'name'])
 433+for key, value in categories.ids.iteritems():
 434+ fh.write('%s\t%s\n' % (value, key))
 435+fh.close()
 436+
 437+fh = open('descriptives.tsv', 'w')
 438+fh.write('Number of unique editors: %s\n' % idg.n)
 439+fh.write('Number of revisions: %s\n' % cnt_obs)
 440+fh.write('Number of pre-editors: %s\n' % len(pre_editors))
 441+fh.write('Number of post-editors: %s\n' % len(post_editors))
 442+fh.write('Number of editors with zero edits after August 30th, 2010: %s' % (len(pre_editors) - len(post_editors)))
 443+fh.close()
 444+
 445+
 446+t1 = datetime.now()
 447+print 'Descriptives:'
 448+print 'Number of unique editors: %s' % idg.n
 449+print 'Number of revisions: %s' % cnt_obs
 450+print 'Number of pre-editors: %s' % len(pre_editors)
 451+print 'Number of post-editors: %s' % len(post_editors)
 452+print 'Number of editors with zero edits after August 30th, 2010: %s' % (len(pre_editors) - len(post_editors))
 453+print 'It took %s to construct the Kaggle training set' % (t1 - t0)
Property changes on: trunk/tools/editor_trends/kaggle/training_db.py
___________________________________________________________________
Added: svn:eol-style
1454 + native
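
A note on the editor ordering in training_db.py: post_editors are placed at odd keys and pre_editors at even keys, so iterating the sorted keys alternates the two populations and both stay represented even if the max_size cap stops the loop early. A condensed sketch (the real script also caps the number of pre_editors at the size of post_editors):

post_editors = ['p1', 'p2']
pre_editors = ['q1', 'q2', 'q3']

editors = {}
x = 1
for editor in post_editors:  # odd slots: 1, 3, ...
    editors[x] = editor
    x += 2
x = 0
for editor in pre_editors:   # even slots: 0, 2, ...
    editors[x] = editor
    x += 2

print([editors[k] for k in sorted(editors)])  # ['q1', 'p1', 'q2', 'p2', 'q3']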