Index: trunk/tools/editor_trends/analyses/plugins/kaggle_correlation.py |
— | — | @@ -0,0 +1,49 @@ |
| 2 | +#!/usr/bin/python
|
| 3 | +# -*- coding: utf-8 -*-
|
| 4 | +'''
|
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
|
| 6 | +This program is free software; you can redistribute it and/or
|
| 7 | +modify it under the terms of the GNU General Public License version 2
|
| 8 | +as published by the Free Software Foundation.
|
| 9 | +This program is distributed in the hope that it will be useful,
|
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
| 12 | +See the GNU General Public License for more details, at
|
| 13 | +http://www.fsf.org/licenses/gpl.html
|
| 14 | +'''
|
| 15 | +
|
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
|
| 17 | +__email__ = 'dvanliere at gmail dot com'
|
| 18 | +__date__ = '2011-04-19'
|
| 19 | +__version__ = '0.1'
|
| 20 | +
|
| 21 | +from datetime import datetime
|
| 22 | +from dateutil.relativedelta import relativedelta
|
| 23 | +
|
| 24 | +
|
| 25 | +def kaggle_correlation(var, editor, **kwargs):
|
| 26 | + end_date = datetime(2011, 2, 1)
|
| 27 | + cutoff_date = datetime(2010, 9, 1)
|
| 28 | + start_date = datetime(2009, 9, 1)
|
| 29 | + edits = editor['edit_count']
|
| 30 | + username = editor['username']
|
| 31 | +
|
| 32 | + pre, after = 0, 0
|
| 33 | +
|
| 34 | + while start_date < cutoff_date:
|
| 35 | + year = str(start_date.year)
|
| 36 | + month = str(start_date.month)
|
| 37 | + pre += edits.get(year, {}).get(month, {}).get('0', 0)
|
| 38 | +        start_date = start_date + relativedelta(months=1)
|
| 39 | +
|
| 40 | + start_date = datetime(2010, 9, 1)
|
| 41 | + while start_date < end_date:
|
| 42 | + year = str(start_date.year)
|
| 43 | + month = str(start_date.month)
|
| 44 | + after += edits.get(year, {}).get(month, {}).get('0', 0)
|
| 45 | +        start_date = start_date + relativedelta(months=1)
|
| 46 | +
|
| 47 | + if pre > 0:
|
| 48 | + var.add(end_date, pre, {'after': after, 'username': username})
|
| 49 | +
|
| 50 | + return var
|
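Note: the plugin above walks a nested edit_count mapping of year -> month -> namespace -> count, with string keys throughout. A minimal sketch of that shape (values hypothetical), showing what the pre/after windows would tally:

    # Hypothetical editor document, shaped as the .get() chains above imply:
    editor = {
        'username': 'ExampleUser',
        'edit_count': {
            '2010': {'9': {'0': 12, '3': 4}},  # Sept 2010: 12 main-ns edits, 4 user-talk edits
            '2011': {'1': {'0': 7}},           # Jan 2011: 7 main-ns edits
        },
    }
    # Only namespace '0' is counted, so pre = 0 (no edits before Sept 2010)
    # and after = 12 + 7 = 19 (Sept 2010 through Jan 2011).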
Index: trunk/tools/editor_trends/analyses/plugins/kaggle_sanity_check_edits.py |
— | — | @@ -0,0 +1,41 @@ |
| 2 | +#!/usr/bin/python
|
| 3 | +# -*- coding: utf-8 -*-
|
| 4 | +'''
|
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
|
| 6 | +This program is free software; you can redistribute it and/or
|
| 7 | +modify it under the terms of the GNU General Public License version 2
|
| 8 | +as published by the Free Software Foundation.
|
| 9 | +This program is distributed in the hope that it will be useful,
|
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
| 12 | +See the GNU General Public License for more details, at
|
| 13 | +http://www.fsf.org/licenses/gpl.html
|
| 14 | +'''
|
| 15 | +
|
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
|
| 17 | +__email__ = 'dvanliere at gmail dot com'
|
| 18 | +__date__ = '2011-01-28'
|
| 19 | +__version__ = '0.1'
|
| 20 | +
|
| 21 | +
|
| 22 | +from datetime import datetime
|
| 23 | +from dateutil.relativedelta import relativedelta
|
| 24 | +
|
| 25 | +
|
| 26 | +def kaggle_sanity_check_edits(var, editor, **kwargs):
|
| 27 | + end_date = datetime(2011, 2, 1)
|
| 28 | + start_date = datetime(2010, 9, 1)
|
| 29 | + edits = editor['edit_count']
|
| 30 | + username = editor['username']
|
| 31 | +
|
| 32 | + count = 0
|
| 33 | + while start_date < end_date:
|
| 34 | + year = str(start_date.year)
|
| 35 | + month = str(start_date.month)
|
| 36 | + count += edits.get(year, {}).get(month, {}).get('0', 0)
|
| 37 | +        start_date = start_date + relativedelta(months=1)
|
| 38 | +
|
| 39 | + if count > 0:
|
| 40 | + var.add(end_date, count, {'editor': username})
|
| 41 | +
|
| 42 | + return var
|
Index: trunk/tools/editor_trends/analyses/plugins/kaggle_sanity_check.py |
— | — | @@ -0,0 +1,49 @@ |
| 2 | +#!/usr/bin/python
|
| 3 | +# -*- coding: utf-8 -*-
|
| 4 | +'''
|
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
|
| 6 | +This program is free software; you can redistribute it and/or
|
| 7 | +modify it under the terms of the GNU General Public License version 2
|
| 8 | +as published by the Free Software Foundation.
|
| 9 | +This program is distributed in the hope that it will be useful,
|
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
| 12 | +See the GNU General Public License for more details, at
|
| 13 | +http://www.fsf.org/licenses/gpl.html
|
| 14 | +'''
|
| 15 | +
|
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
|
| 17 | +__email__ = 'dvanliere at gmail dot com'
|
| 18 | +__date__ = '2011-01-28'
|
| 19 | +__version__ = '0.1'
|
| 20 | +
|
| 21 | +
|
| 22 | +from datetime import datetime
|
| 23 | +from dateutil.relativedelta import relativedelta
|
| 24 | +
|
| 25 | +
|
| 26 | +def kaggle_sanity_check(var, editor, **kwargs):
|
| 27 | + end_date = datetime(2011, 2, 1)
|
| 28 | + cutoff = datetime(2010, 9, 1)
|
| 29 | + start_date = datetime(2009, 9, 1)
|
| 30 | + edits = editor['edit_count']
|
| 31 | + active = 0
|
| 32 | + count = 0
|
| 33 | + while start_date < cutoff:
|
| 34 | + year = str(start_date.year)
|
| 35 | + month = str(start_date.month)
|
| 36 | + #namespaces = edits.get(year, {}).get(month, {}).keys()
|
| 37 | + #for ns in namespaces:
|
| 38 | + count += edits.get(year, {}).get(month, {}).get('0', 0)
|
| 39 | +        start_date = start_date + relativedelta(months=1)
|
| 40 | +
|
| 41 | + if count > 0:
|
| 42 | + while start_date < end_date:
|
| 43 | + year = str(start_date.year)
|
| 44 | + month = str(start_date.month)
|
| 45 | + active += edits.get(year, {}).get(month, {}).get('0', 0)
|
| 46 | +            start_date = start_date + relativedelta(months=1)
|
| 47 | +        if active > 0:
|
| 48 | + var.add(cutoff, 1)
|
| 49 | +
|
| 50 | + return var
|
Index: trunk/tools/editor_trends/analyses/plugins/sor_newbie_treatment.py |
— | — | @@ -0,0 +1,42 @@ |
| 2 | +#!/usr/bin/python
|
| 3 | +# -*- coding: utf-8 -*-
|
| 4 | +'''
|
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
|
| 6 | +This program is free software; you can redistribute it and/or
|
| 7 | +modify it under the terms of the GNU General Public License version 2
|
| 8 | +as published by the Free Software Foundation.
|
| 9 | +This program is distributed in the hope that it will be useful,
|
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
| 12 | +See the GNU General Public License for more details, at
|
| 13 | +http://www.fsf.org/licenses/gpl.html
|
| 14 | +'''
|
| 15 | +
|
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
|
| 17 | +__email__ = 'dvanliere at gmail dot com'
|
| 18 | +__date__ = '2011-01-25'
|
| 19 | +__version__ = '0.1'
|
| 20 | +
|
| 21 | +from classes import storage
|
| 22 | +
|
| 23 | +def sor_newbie_treatment(var, editor, **kwargs):  # argument order matches the other analysis plugins
|
| 24 | + rts = kwargs.pop('rts')
|
| 25 | + tenth_edit = editor['new_wikipedian']
|
| 26 | + title = ':%s' % editor['username']
|
| 27 | + collection = '%s%s_diffs_dataset' % (rts.language.code, rts.project.name)
|
| 28 | + db = storage.init_database(rts.storage, rts.dbname, collection)
|
| 29 | +
|
| 30 | + if tenth_edit != False:
|
| 31 | +        qualifier = {'ns': 3, 'timestamp': {'$lt': tenth_edit}}
|
| 32 | +        observations = db.find(qualifier)  # find() yields all matching documents; find_one() returns just one
|
| 33 | + else:
|
| 34 | +        observations = db.find({'editor': editor['username']})  # assumes diff documents are keyed by username
|
| 35 | +
|
| 36 | + if observations != None:
|
| 37 | + for obs in observations:
|
| 38 | + if obs['ns'] == 3:
|
| 39 | +                values = obs.values()
|
| 40 | +                print values
|
| 41 | +
|
| 42 | +    return var  # return the accumulator, as the other plugins do
|
| 43 | +
|
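Note: the qualifier above is a plain MongoDB filter. A minimal pymongo equivalent of the ns-3 query, for reference (the connection, database, and collection names are assumptions, and tenth_edit stands in for the editor's new_wikipedian date):

    import pymongo
    from datetime import datetime

    client = pymongo.MongoClient()                       # assumed local MongoDB
    coll = client['wikilytics']['enwiki_diffs_dataset']  # hypothetical collection name
    tenth_edit = datetime(2010, 9, 1)                    # hypothetical cutoff
    # All user-talk (ns 3) observations recorded before the tenth edit:
    for obs in coll.find({'ns': 3, 'timestamp': {'$lt': tenth_edit}}):
        print obs['timestamp']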
Index: trunk/tools/editor_trends/analyses/network/community_graph.py |
— | — | @@ -0,0 +1,63 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__email__ = 'dvanliere at gmail dot com' |
| 18 | +__date__ = '2011-01-10' |
| 19 | +__version__ = '0.1' |
| 20 | + |
| 21 | +import sys |
| 22 | +if '../../' not in sys.path: |
| 23 | + sys.path.append('../../') |
| 24 | + |
| 25 | +from classes import settings |
| 26 | +settings = settings.Settings() |
| 27 | +from classes import storage |
| 28 | +from utils import file_utils |
| 29 | + |
| 30 | +try: |
| 31 | + import psyco |
| 32 | + psyco.full() |
| 33 | +except ImportError: |
| 34 | + pass |
| 35 | + |
| 36 | +def create_articles_set(edits): |
| 37 | + s = set() |
| 38 | + years = edits.keys() |
| 39 | + for year in years: |
| 40 | + for edit in edits[year]: |
| 41 | + s.add(edit['article']) |
| 42 | + return s |
| 43 | + |
| 44 | + |
| 45 | +def create_edgelist(project, collection): |
| 46 | + db = storage.init_database('mongo', project, collection) |
| 47 | + ids = db.retrieve_distinct_keys('editor') |
| 48 | + ids.sort() |
| 49 | + fh = file_utils.create_txt_filehandle(settings.dataset_location, '%s_edgelist.csv' % project, 'w', 'utf-8') |
| 50 | + for i in ids: |
| 51 | + author_i = db.find_one({'editor': i}) |
| 52 | + if author_i != None: |
| 53 | + article_i = create_articles_set(author_i['edits']) |
| 54 | + for j in ids: |
| 55 | + if i > j: |
| 56 | + author_j = db.find_one({'editor': j}) |
| 57 | + article_j = create_articles_set(author_j['edits']) |
| 58 | + common = article_i.intersection(article_j) |
| 59 | + if len(common) > 0: |
| 60 | + file_utils.write_list_to_csv([i, j, len(common)], fh, recursive=False, newline=True) |
| 61 | + fh.close() |
| 62 | + |
| 63 | +if __name__ == '__main__': |
| 64 | + create_edgelist('wikilytics', 'enwiki_editors_raw') |
Property changes on: trunk/tools/editor_trends/analyses/network/community_graph.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 65 | + native |
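Note: the i > j guard in create_edgelist() enumerates each unordered editor pair exactly once, so the function performs n*(n-1)/2 set intersections and writes one row (editor_i, editor_j, overlap) per pair with a non-empty overlap. A self-contained sketch of the same pairing logic (names hypothetical):

    def common_article_counts(editor_articles):
        '''editor_articles: dict mapping editor id -> set of article ids.'''
        ids = sorted(editor_articles)
        for x, i in enumerate(ids):
            for j in ids[:x]:                   # j < i mirrors the i > j guard above
                common = editor_articles[i] & editor_articles[j]
                if common:
                    yield i, j, len(common)     # one edge-list row per overlapping pair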
Index: trunk/tools/editor_trends/analyses/network/graph_db.py |
— | — | @@ -0,0 +1,82 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__email__ = 'dvanliere at gmail dot com' |
| 18 | +__date__ = '2010-11-25' |
| 19 | +__version__ = '0.1' |
| 20 | + |
| 21 | +import codecs |
| 22 | +import neo4jrestclient.request |
| 23 | +from neo4jrestclient import GraphDatabase, NotFoundError |
| 24 | +neo4jrestclient.request.CACHE = True |
| 25 | + |
| 26 | +class IDGenerator: |
| 27 | + def __init__(self): |
| 28 | + self.n = 0 |
| 29 | + self.ids = {} |
| 30 | + self.inverted_ids = {} |
| 31 | + |
| 32 | + def invert_dict(self): |
| 33 | + return dict((v, k) for k, v in self.ids.iteritems()) |
| 34 | + |
| 35 | + def get_id(self, n): |
| 36 | + if n not in self.ids: |
| 37 | + self.ids[n] = self.n |
| 38 | + self.n += 1 |
| 39 | + return self.ids[n] |
| 40 | + |
| 41 | + def reverse_lookup(self, n): |
| 42 | + if self.inverted_ids == {}: |
| 43 | + self.inverted_ids = self.invert_dict() |
| 44 | + return self.inverted_ids[n] |
| 45 | + |
| 46 | + |
| 47 | +def read_edgelist(): |
| 48 | + fh = codecs.open('C:\\Users\\diederik.vanliere\\Dropbox\\wsor\\diederik\\wikilytics_edgelist.csv', 'r', 'utf-8') |
| 49 | + for line in fh: |
| 50 | + line = line.strip() |
| 51 | + line = line.split('\t') |
| 52 | + actor_a = line[0] |
| 53 | + actor_b = line[1] |
| 54 | + weight = int(line[2]) |
| 55 | + yield (actor_a, actor_b, weight) |
| 56 | + fh.close() |
| 57 | + |
| 58 | +def init_db(): |
| 59 | + gdb = GraphDatabase("http://localhost:7474/db/data/") |
| 60 | + return gdb |
| 61 | + |
| 62 | +def get_node(gdb, idg, node): |
| 63 | + node = idg.get_id(node) |
| 64 | + try: |
| 65 | + #n = gdb.nodes.get('id', node) |
| 66 | + n = gdb.nodes[node] |
| 67 | + except NotFoundError: |
| 68 | + n = gdb.nodes.create(id=node) |
| 69 | + n['id'] = node |
| 70 | + |
| 71 | + return n |
| 72 | + |
| 73 | +def load_data(): |
| 74 | + idg = IDGenerator() |
| 75 | + gdb = init_db() |
| 76 | + for (actor_a, actor_b, weight) in read_edgelist(): |
| 77 | + n1 = get_node(gdb, idg, actor_a) |
| 78 | + n2 = get_node(gdb, idg, actor_b) |
| 79 | + n1.relationships.create("cognitive_distance", n2, weight=weight) |
| 80 | + |
| 81 | +if __name__ == '__main__': |
| 82 | + load_data() |
| 83 | + |
Property changes on: trunk/tools/editor_trends/analyses/network/graph_db.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 84 | + native |
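Note: read_edgelist() expects one tab-separated edge per line, in the format produced by community_graph.py above. A hypothetical three-line input illustrating what it parses:

    # wikilytics_edgelist.csv (columns: actor_a, actor_b, weight; tab-separated)
    12345	67890	3
    12345	11111	1
    67890	11111	7

Each row becomes one "cognitive_distance" relationship between the two (re-numbered) nodes, with the article overlap stored as the weight property.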
Index: trunk/tools/editor_trends/analyses/adhoc/community_graph.py |
— | — | @@ -1,62 +0,0 @@ |
2 | | -#!/usr/bin/python |
3 | | -# -*- coding: utf-8 -*- |
4 | | -''' |
5 | | -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
6 | | -This program is free software; you can redistribute it and/or |
7 | | -modify it under the terms of the GNU General Public License version 2 |
8 | | -as published by the Free Software Foundation. |
9 | | -This program is distributed in the hope that it will be useful, |
10 | | -but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | | -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
12 | | -See the GNU General Public License for more details, at |
13 | | -http://www.fsf.org/licenses/gpl.html |
14 | | -''' |
15 | | - |
16 | | -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
17 | | -__email__ = 'dvanliere at gmail dot com' |
18 | | -__date__ = '2011-01-10' |
19 | | -__version__ = '0.1' |
20 | | - |
21 | | -import sys |
22 | | -if '..' not in sys.path: |
23 | | - sys.path.append('..') |
24 | | - |
25 | | -from classes import settings |
26 | | -settings = settings.Settings() |
27 | | -from classes import storage |
28 | | -from utils import file_utils |
29 | | - |
30 | | -try: |
31 | | - import psyco |
32 | | - psyco.full() |
33 | | -except ImportError: |
34 | | - pass |
35 | | - |
36 | | -def create_articles_set(edits): |
37 | | - s = set() |
38 | | - years = edits.keys() |
39 | | - for year in years: |
40 | | - for edit in edits[year]: |
41 | | - s.add(edit['article']) |
42 | | - return s |
43 | | - |
44 | | - |
45 | | -def create_edgelist(project, collection): |
46 | | - db = storage.init_database(rts.storage, project, collection) |
47 | | - ids = db.retrieve_distinct_keys('editor') |
48 | | - ids.sort() |
49 | | - fh = file_utils.create_txt_filehandle(settings.dataset_location, '%s_edgelist.csv' % project, 'w', 'utf-8') |
50 | | - for i in ids: |
51 | | - author_i = conn[collection].find_one({'editor': i}) |
52 | | - article_i = create_articles_set(author_i['edits']) |
53 | | - for j in ids: |
54 | | - if i > j: |
55 | | - author_j = conn[collection].find_one({'editor': j}) |
56 | | - article_j = create_articles_set(author_j['edits']) |
57 | | - common = article_i.intersection(article_j) |
58 | | - if len(common) > 0: |
59 | | - file_utils.write_list_to_csv([i, j, len(common)], fh, recursive=False, newline=True) |
60 | | - fh.close() |
61 | | - |
62 | | -if __name__ == '__main__': |
63 | | - create_edgelist('enwiki', 'editors') |
Index: trunk/tools/editor_trends/analyses/adhoc/benchmarker_queue.py |
— | — | @@ -0,0 +1,24 @@ |
| 2 | +from Queue import Queue
|
| 3 | +#import cProfile
|
| 4 | +from guppy import hpy
|
| 5 | +h = hpy()
|
| 6 | +
|
| 7 | +q1, q2, q3 = Queue(), Queue(), Queue()
|
| 8 | +h.heap()
|
| 9 | +print 'ughh'
|
| 10 | +for x in xrange(1000):
|
| 11 | + q1.put(x)
|
| 12 | + q2.put({})
|
| 13 | + q3.put([])
|
| 14 | + #h = hpy()
|
| 15 | +hpy().doc
|
| 16 | +h.heap()
|
| 17 | +# for x in xrange(100):
|
| 18 | +# a = q1.get()
|
| 19 | +# b = q2.get()
|
| 20 | +# c = q3.get()
|
| 21 | +# h.heap()
|
| 22 | +
|
| 23 | +#if __name__ == '__main__':
|
| 24 | +# main()
|
| 25 | + #cProfile.run('main()')
|
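Note: hpy().heap() returns a snapshot of every reachable object, which makes the two raw snapshots above hard to compare by eye. guppy's setrelheap() gives a cleaner before/after measurement; a minimal sketch of that pattern (same guppy API as used above):

    from Queue import Queue
    from guppy import hpy

    h = hpy()
    h.setrelheap()        # subsequent heap() calls are relative to this point
    q = Queue()
    for x in xrange(1000):
        q.put(x)
    print h.heap()        # shows only the objects allocated since setrelheap()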
Index: trunk/tools/editor_trends/etl/store.py |
— | — | @@ -79,7 +79,7 @@ |
80 | 80 | date = text_utils.convert_timestamp_to_datetime_utc(line[6]) |
81 | 81 | md5 = line[7] |
82 | 82 | revert = int(line[8]) |
83 | | - reverted_user = int(line[9]) |
| 83 | + reverted_user = line[9] |
84 | 84 | reverted_rev_id = int(line[10]) |
85 | 85 | bot = int(line[11]) |
86 | 86 | cur_size = int(line[12]) |
— | — | @@ -96,12 +96,10 @@ |
97 | 97 | 'cur_size':cur_size, |
98 | 98 | 'delta':delta, |
99 | 99 | 'bot':bot, |
| 100 | + 'reverted_user': reverted_user, |
| 101 | + 'reverted_rev_id': reverted_rev_id |
100 | 102 | } |
101 | 103 | |
102 | | - if reverted_user > -1: |
103 | | - data['reverted_user'] = reverted_user, |
104 | | - data['reverted_rev_id'] = reverted_rev_id |
105 | | - |
106 | 104 | return data |
107 | 105 | |
108 | 106 | |
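Note: with this change reverted_user is kept as the raw string from the input line (it can be a username rather than a numeric id, which is presumably why the int() cast was dropped), and both revert fields are now stored unconditionally instead of only when reverted_user > -1. A hypothetical record illustrating the resulting shape (values invented):

    data = {'revert': 1,
            'bot': 0,
            'cur_size': 2048,
            'delta': -35,
            'reverted_user': 'ExampleUser',  # raw string, no int() cast
            'reverted_rev_id': 987654}       # still parsed with int(line[10])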
Index: trunk/tools/editor_trends/etl/sort.py |
— | — | @@ -55,11 +55,6 @@ |
56 | 56 | fh.close() |
57 | 57 | for x, d in enumerate(data): |
58 | 58 | d = d.strip().split('\t') |
59 | | - #TEMP FIX: |
60 | | - #editor = d[2] |
61 | | - #d[2] = d[0] |
62 | | - #d[0] = editor |
63 | | - #END TEMP FIX |
64 | 59 | data[x] = d |
65 | 60 | #data = [d.strip() for d in data] |
66 | 61 | #data = [d.split('\t') for d in data] |
— | — | @@ -153,7 +148,7 @@ |
154 | 149 | pbar = progressbar.ProgressBar(maxval=len(files)).start() |
155 | 150 | tasks = multiprocessing.JoinableQueue() |
156 | 151 | result = multiprocessing.JoinableQueue() |
157 | | - number_of_processes = 3 |
| 152 | + number_of_processes = 2 |
158 | 153 | sorters = [Sorter(rts, tasks, result) for x in xrange(number_of_processes)] |
159 | 154 | |
160 | 155 | for filename in files: |
— | — | @@ -166,16 +161,14 @@ |
167 | 162 | sorter.start() |
168 | 163 | |
169 | 164 | ppills = number_of_processes |
170 | | - while True: |
171 | | - while ppills > 0: |
172 | | - try: |
173 | | - res = result.get(block=True) |
174 | | - if res == True: |
175 | | - pbar.update(pbar.currval + 1) |
176 | | - else: |
177 | | - ppills -= 1 |
178 | | - except Empty: |
179 | | - pass |
180 | | - break |
| 165 | + while ppills > 0: |
| 166 | + try: |
| 167 | + res = result.get() |
| 168 | + if res == True: |
| 169 | + pbar.update(pbar.currval + 1) |
| 170 | + else: |
| 171 | + ppills -= 1 |
| 172 | + except Empty: |
| 173 | + pass |
181 | 174 | |
182 | 175 | tasks.join() |
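Note: the rewritten loop is the standard poison-pill pattern: each Sorter puts True on the result queue per finished file and one non-True sentinel when it exits, and the parent loops until it has collected one sentinel per process. (Since result.get() blocks by default, the Empty handler is effectively unreachable.) A minimal self-contained sketch of the worker side (names hypothetical):

    def worker(tasks, results):
        while True:
            filename = tasks.get()
            if filename is None:      # poison pill: no more work
                results.put(False)    # sentinel telling the parent this worker exited
                tasks.task_done()
                break
            sort_file(filename)       # hypothetical per-file work
            results.put(True)         # one progress tick per completed file
            tasks.task_done()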
Index: trunk/tools/editor_trends/kaggle/training.py |
— | — | @@ -1,141 +0,0 @@ |
2 | | -#!/usr/bin/python |
3 | | -# -*- coding: utf-8 -*- |
4 | | -''' |
5 | | -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
6 | | -This program is free software; you can redistribute it and/or |
7 | | -modify it under the terms of the GNU General Public License version 2 |
8 | | -as published by the Free Software Foundation. |
9 | | -This program is distributed in the hope that it will be useful, |
10 | | -but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | | -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
12 | | -See the GNU General Public License for more details, at |
13 | | -http://www.fsf.org/licenses/gpl.html |
14 | | -''' |
15 | | - |
16 | | -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
17 | | -__email__ = 'dvanliere at gmail dot com' |
18 | | -__date__ = '2011-04-12' |
19 | | -__version__ = '0.1' |
20 | | - |
21 | | -import os |
22 | | -import sys |
23 | | -import cPickle |
24 | | -import codecs |
25 | | -from datetime import datetime |
26 | | -sys.path.append('../') |
27 | | - |
28 | | -from classes import storage |
29 | | - |
30 | | -location = '/home/diederik/wikimedia/en/wiki/kaggle' |
31 | | -files = os.listdir(location) |
32 | | -files.reverse() |
33 | | - |
34 | | -max_size = 2147483648 |
35 | | -max_size_reached = False |
36 | | - |
37 | | -t0 = datetime.now() |
38 | | -titles = {} |
39 | | -ids = set() |
40 | | -dates = {} |
41 | | -edits = {} |
42 | | -ignore_ids = set() |
43 | | -size = 0 |
44 | | -cnt_obs = 0 |
45 | | -cutoff_date = datetime(2010, 8, 31) |
46 | | - |
47 | | -print 'Constructing training dataset...' |
48 | | -db = storage.init_database('mongo', 'wikilytics', 'enwiki_editors_dataset') |
49 | | -dataset = codecs.open('training.tsv', 'w', 'utf-8') |
50 | | -for filename in files: |
51 | | - if not filename.startswith('comments') and not filename.startswith('articles'): |
52 | | - fh = codecs.open(os.path.join(location, filename)) |
53 | | - if max_size_reached == True: |
54 | | - break |
55 | | - for line in fh: |
56 | | - line = line.strip() |
57 | | - line = line.split('\t') |
58 | | - if len(line) != 12: |
59 | | - continue |
60 | | - if line[10] == '1': |
61 | | - continue |
62 | | - timestamp = datetime.strptime(line[6], '%Y-%m-%dT%H:%M:%SZ') |
63 | | - if timestamp > cutoff_date: |
64 | | - continue |
65 | | - username = line[3].lower() |
66 | | - if username.endswith('bot') or username.find('script') > -1: |
67 | | - #line[10] = '1' |
68 | | - continue |
69 | | - id = line[2] |
70 | | - if id not in ids and id not in ignore_ids: |
71 | | - res = db.find_one({'editor': id}) |
72 | | - if res == None: |
73 | | - ignore_ids.add(id) |
74 | | - continue |
75 | | - cnt_obs += 1 |
76 | | - title_id = line[1] |
77 | | - ids.add(id) |
78 | | - simple_date = '%s-%s' % (timestamp.year, timestamp.month) |
79 | | - dates.setdefault(simple_date, 0) |
80 | | - dates[simple_date] += 1 |
81 | | - title = line.pop(5) |
82 | | - titles[title_id] = title |
83 | | - line.append('\n') |
84 | | - line = '\t'.join(line) |
85 | | - size += len(line) |
86 | | - if size > max_size: |
87 | | - max_size_reached = True |
88 | | - dataset.write(line.decode('utf-8')) |
89 | | - |
90 | | -dataset.close() |
91 | | - |
92 | | -print 'Constructing title dataset...' |
93 | | -fh = codecs.open('titles.tsv', 'w', 'utf-8') |
94 | | -for id, title in titles.iteritems(): |
95 | | - fh.write('%s\t%s\n' % (id, title.decode('utf-8'))) |
96 | | -fh.close() |
97 | | - |
98 | | - |
99 | | -print 'Constructing solution dataset...' |
100 | | -x = 0 |
101 | | -fh = codecs.open('solutions.tsv', 'w', 'utf-8') |
102 | | -for id in ids: |
103 | | - if id not in ignore_ids: |
104 | | - obs = db.find_one({'editor': str(id)}, 'cum_edit_count_main_ns') |
105 | | - if obs != None: |
106 | | - x += 1 |
107 | | - n = obs['cum_edit_count_main_ns'] |
108 | | - fh.write('%s,%s\n' % (id.decode('utf-8'), n)) |
109 | | - edits.setdefault(n, 0) |
110 | | - edits[n] += 1 |
111 | | - else: |
112 | | - print id |
113 | | -fh.close() |
114 | | - |
115 | | -print 'Storing date histogram' |
116 | | -fh = open('histogram_dates.bin', 'wb') |
117 | | -cPickle.dump(dates, fh) |
118 | | -fh.close() |
119 | | - |
120 | | - |
121 | | -fh = open('histogram_dates.tsv', 'w') |
122 | | -for date, n in dates.iteritems(): |
123 | | - fh.write('%s\t%s\n' % (date, n)) |
124 | | -fh.close() |
125 | | - |
126 | | - |
127 | | -print 'Storing edit histogram' |
128 | | -fh = open('histogram_edits.bin', 'wb') |
129 | | -cPickle.dump(edits, fh) |
130 | | -fh.close() |
131 | | - |
132 | | -fh = open('histogram_edits.tsv', 'w') |
133 | | -for edit, n in edits.iteritems(): |
134 | | - fh.write('%s\t%s\n' % (edit, n)) |
135 | | -fh.close() |
136 | | - |
137 | | - |
138 | | -t1 = datetime.now() |
139 | | -print 'Descriptives:' |
140 | | -print 'Number of editors: %s' % x |
141 | | -print 'Number of edits: %s' % cnt_obs |
142 | | -print 'It took %s to construct the Kaggle training set' % (t1 - t0) |
Index: trunk/tools/editor_trends/kaggle/training_file.py |
— | — | @@ -0,0 +1,430 @@ |
| 2 | +#!/usr/bin/python
|
| 3 | +# -*- coding: utf-8 -*-
|
| 4 | +'''
|
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
|
| 6 | +This program is free software; you can redistribute it and/or
|
| 7 | +modify it under the terms of the GNU General Public License version 2
|
| 8 | +as published by the Free Software Foundation.
|
| 9 | +This program is distributed in the hope that it will be useful,
|
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
| 12 | +See the GNU General Public License for more details, at
|
| 13 | +http://www.fsf.org/licenses/gpl.html
|
| 14 | +'''
|
| 15 | +
|
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)'])
|
| 17 | +__email__ = 'dvanliere at gmail dot com'
|
| 18 | +__date__ = '2011-04-12'
|
| 19 | +__version__ = '0.1'
|
| 20 | +
|
| 21 | +import os
|
| 22 | +import sys
|
| 23 | +import cPickle
|
| 24 | +import codecs
|
| 25 | +import random
|
| 26 | +from itertools import izip_longest
|
| 27 | +from datetime import datetime
|
| 28 | +from dateutil.relativedelta import relativedelta
|
| 29 | +sys.path.append('../')
|
| 30 | +import resource
|
| 31 | +
|
| 32 | +random.seed(1024)
|
| 33 | +from classes import storage
|
| 34 | +
|
| 35 | +headers = ['user_id', 'article_id', 'revision_id', 'namespace', 'timestamp',
|
| 36 | + 'md5', 'revert', 'reverted_user', 'reverted_rev_id', 'delta', 'cur_size']
|
| 37 | +keys = ['user_id', 'article_id', 'rev_id', 'ns', 'date',
|
| 38 | + 'hash', 'revert', 'reverted_user', 'reverted_rev_id', 'delta', 'cur_size']
|
| 39 | +
|
| 40 | +size = 0 #current size of file
|
| 41 | +#max_size = 2147483648
|
| 42 | +max_size = 5000000
|
| 43 | +editors_seen = {}
|
| 44 | +cnt_obs = 0 #count of number of edits
|
| 45 | +revs = {}
|
| 46 | +titles = {}
|
| 47 | +predictions = {}
|
| 48 | +
|
| 49 | +t0 = datetime.now()
|
| 50 | +location = '/home/diederik/wikimedia/xml/en/wiki/txt/'
|
| 51 | +txt_files = '/home/diederik/wikimedia/xml/en/wiki/sorted/'
|
| 52 | +files = os.listdir(location)
|
| 53 | +max_file_handles = resource.getrlimit(resource.RLIMIT_NOFILE)[0] - 100
|
| 54 | +#files.sort()
|
| 55 | +#files.reverse()
|
| 56 | +
|
| 57 | +cutoff_date = datetime(2010, 8, 31) #operator is >
|
| 58 | +end_date = datetime(2011, 2, 1) #operator is <
|
| 59 | +cutoff_date_training = datetime(2010, 1, 31) #operator is >
|
| 60 | +end_date_training = datetime(2010, 9, 1) # operator is <
|
| 61 | +
|
| 62 | +
|
| 63 | +class IDGenerator:
|
| 64 | + def __init__(self):
|
| 65 | + self.n = 0
|
| 66 | + self.ids = {}
|
| 67 | + self.rnd_ids = {}
|
| 68 | + self.inverted_ids = None
|
| 69 | +
|
| 70 | + def invert_dict(self, dictionary):
|
| 71 | + return dict((v, k) for k, v in dictionary.iteritems())
|
| 72 | +
|
| 73 | + def get_id(self, n):
|
| 74 | + if n not in self.ids:
|
| 75 | + self.n += 1
|
| 76 | +            while len(self.rnd_ids) < self.n:
|
| 77 | + rnd_id = self.get_random_id()
|
| 78 | + if self.rnd_ids.get(rnd_id, False) == False:
|
| 79 | + self.rnd_ids[rnd_id] = True
|
| 80 | + self.ids[n] = rnd_id
|
| 81 | + return self.ids[n]
|
| 82 | +
|
| 83 | + def get_random_id(self):
|
| 84 | + return random.randrange(0, 1000000)
|
| 85 | +
|
| 86 | + def reverse_lookup(self, n):
|
| 87 | + self.inverted_ids = self.invert_dict(self.ids)
|
| 88 | + return self.inverted_ids[n]
|
| 89 | +
|
| 90 | +
|
| 91 | +def construct_article_meta(fh_articles, files):
|
| 92 | + print 'Constructing title dataset...'
|
| 93 | + headers = ['article_id', 'category', 'timestamp', 'namespace', 'redirect', 'title', 'related_page']
|
| 94 | + write_headers(fh_articles, headers)
|
| 95 | + #fh_articles.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % ('article_id', 'category', 'timestamp', 'namespace', 'redirect', 'title', 'related_page'))
|
| 96 | + article_meta = {}
|
| 97 | + for filename in files:
|
| 98 | + if filename.startswith('articles_meta'):
|
| 99 | + fh = codecs.open(os.path.join(location, filename))
|
| 100 | + for line in fh:
|
| 101 | + line = line.strip()
|
| 102 | + line = line.split('\t')
|
| 103 | + category = line[1]
|
| 104 | + if category != 'List':
|
| 105 | + title = line[2]
|
| 106 | + title = title.split('/')
|
| 107 | + article_meta.setdefault(title[-1], {})
|
| 108 | + article_meta[title[-1]]['category'] = category
|
| 109 | + article_meta[title[-1]]['id'] = line[0]
|
| 110 | + fh.close()
|
| 111 | + return article_meta
|
| 112 | +
|
| 113 | +
|
| 114 | +def determine_active(edits, start_date, end_date):
|
| 115 | + active = 0
|
| 116 | + namespaces = ['0', '1', '2', '3', '4', '5']
|
| 117 | + if start_date == datetime(2009, 9, 1):
|
| 118 | + if '2009' not in edits and '2010' not in edits:
|
| 119 | + return active
|
| 120 | + elif start_date == datetime(2010, 9, 1):
|
| 121 | + if '2010' not in edits and '2011' not in edits:
|
| 122 | + return active
|
| 123 | +
|
| 124 | + while start_date < end_date:
|
| 125 | + year = str(start_date.year)
|
| 126 | + month = str(start_date.month)
|
| 127 | + for ns in namespaces:
|
| 128 | + active += edits.get(year, {}).get(month, {}).get(ns, 0)
|
| 129 | + if active > 0: #we don't need to know how many edits,just if active
|
| 130 | + return active
|
| 131 | +        start_date = start_date + relativedelta(months=1)
|
| 132 | + return active
|
| 133 | +
|
| 134 | +def load_binary_file(filename):
|
| 135 | +    fh = open(filename, 'rb')  # was hard-coded to 'set_b.bin', ignoring the filename argument
|
| 136 | + obj = cPickle.load(fh)
|
| 137 | + fh.close()
|
| 138 | + return obj
|
| 139 | +
|
| 140 | +
|
| 141 | +def convert_tz_to_mysql_tz(tz):
|
| 142 | + iso = tz.__str__()
|
| 143 | +    tz = iso.replace('T', ' ').rstrip('Z')  # '2010-09-01T12:00:00Z' -> MySQL-style '2010-09-01 12:00:00'
|
| 144 | + return tz
|
| 145 | +
|
| 146 | +
|
| 147 | +def check_reverter(idg, reverter):
|
| 148 | + try:
|
| 149 | + reverter = int(reverter)
|
| 150 | + if reverter != -1:
|
| 151 | + reverter = idg.get_id(reverter)
|
| 152 | + return reverter
|
| 153 | + except ValueError:
|
| 154 | + pass
|
| 155 | + return -1
|
| 156 | +
|
| 157 | +
|
| 158 | +def check_user_id(user_id):
|
| 159 | + try:
|
| 160 | + int(user_id)
|
| 161 | + except ValueError:
|
| 162 | + return False
|
| 163 | + return True
|
| 164 | +
|
| 165 | +
|
| 166 | +def check_username(username):
|
| 167 | + username = username.lower()
|
| 168 | + if username.endswith('bot') or username.find('script') > -1:
|
| 169 | + return False #exclude more bots and scripts
|
| 170 | + return True
|
| 171 | +
|
| 172 | +
|
| 173 | +def determine_editors(db):
|
| 174 | + start_date_pre = datetime(2009, 9, 1)
|
| 175 | + end_date_pre = datetime(2010, 9, 1)
|
| 176 | + end_date = datetime(2011, 2, 1)
|
| 177 | + pre_editors = set()
|
| 178 | + post_editors = set()
|
| 179 | + #cursor = db.find({'date': {'$gte': start_date_pre, '$lt': end_date_pre}}, 'first_edit,edit_count,user_id,username')
|
| 180 | + cursor = db.find({}, 'first_edit,edit_count,user_id,username')
|
| 181 | + x, y, z = 0, 0, 0
|
| 182 | + for editor in cursor:
|
| 183 | + x += 1
|
| 184 | + if 'first_edit' not in editor:
|
| 185 | + continue
|
| 186 | + if editor['first_edit'] >= end_date_pre:
|
| 187 | + continue
|
| 188 | + if check_username(editor['username']) == False:
|
| 189 | + continue
|
| 190 | + if check_user_id(editor['editor']) == False:
|
| 191 | + continue
|
| 192 | +
|
| 193 | + #print editor['edit_count']
|
| 194 | + active = determine_active(editor['edit_count'], start_date_pre, end_date_pre)
|
| 195 | + if active > 0:
|
| 196 | + pre_editors.add(editor['editor'])
|
| 197 | + y += 1
|
| 198 | + active = determine_active(editor['edit_count'], end_date_pre, end_date)
|
| 199 | + if active > 0:
|
| 200 | + post_editors.add(editor['editor'])
|
| 201 | + z += 1
|
| 202 | + if x % 100000 == 0:
|
| 203 | + print 'Retrieved %s pre_editors / %s post_editors / %s total editors...' % (y, z, x)
|
| 204 | +
|
| 205 | + #set_a = pre_editors.difference(post_editors)
|
| 206 | + post_editors = pre_editors.intersection(post_editors)
|
| 207 | +
|
| 208 | + return pre_editors, post_editors
|
| 209 | +
|
| 210 | +
|
| 211 | +def write_headers(fh, headers):
|
| 212 | + for i, key in enumerate(headers):
|
| 213 | + fh.write('%s' % key)
|
| 214 | +        if (i + 1) != len(headers):  # compare against headers, not the module-level keys list
|
| 215 | + fh.write('\t')
|
| 216 | + else:
|
| 217 | + fh.write('\n')
|
| 218 | +
|
| 219 | +def write_revision(dataset, revision):
|
| 220 | + size = 0
|
| 221 | + for i, key in enumerate(keys):
|
| 222 | + #print key, revision[key]
|
| 223 | +# if key == 'reverted_user' or key == 'reverted_rev_id':
|
| 224 | +# revision[key] = revision[key][0]
|
| 225 | + if type(revision[key]) == type(0):
|
| 226 | + revision[key] = str(revision[key])
|
| 227 | +
|
| 228 | + dataset.write('%s' % revision[key].decode('utf-8'))
|
| 229 | + size += len(revision[key])
|
| 230 | + if (i + 1) != len(keys):
|
| 231 | + dataset.write('\t')
|
| 232 | + else:
|
| 233 | + dataset.write('\n')
|
| 234 | + return size
|
| 235 | +
|
| 236 | +
|
| 237 | +print 'Constructing training dataset...'
|
| 238 | +db_dataset = storage.init_database('mongo', 'wikilytics', 'enwiki_editors_dataset')
|
| 239 | +print 'Loading editors...'
|
| 240 | +if not os.path.exists('set_a.bin'):
|
| 241 | + pre_editors, post_editors = determine_editors(db_dataset)
|
| 242 | + fh = open('set_a.bin', 'wb')
|
| 243 | + cPickle.dump(pre_editors, fh)
|
| 244 | + fh.close()
|
| 245 | +
|
| 246 | + fh = open('set_b.bin', 'wb')
|
| 247 | + cPickle.dump(post_editors, fh)
|
| 248 | + fh.close()
|
| 249 | +else:
|
| 250 | + pre_editors = load_binary_file('set_a.bin')
|
| 251 | + post_editors = load_binary_file('set_b.bin')
|
| 252 | +
|
| 253 | +
|
| 254 | +dataset = codecs.open('training.tsv', 'w', 'utf-8')
|
| 255 | +write_headers(dataset, headers)
|
| 256 | +idg = IDGenerator()
|
| 257 | +
|
| 258 | +
|
| 259 | +
|
| 260 | +print 'Parsing revisions...'
|
| 261 | +db_raw = storage.init_database('mongo', 'wikilytics', 'enwiki_editors_raw')
|
| 262 | +seen_editors = {}
|
| 263 | +for editors in izip_longest(post_editors, pre_editors, fillvalue=None):
|
| 264 | + for editor in editors:
|
| 265 | + go = editors_seen.get(editor, True)
|
| 266 | +        if go and editor is not None:  # izip_longest pads the shorter set with None
|
| 268 | + editors_seen[editor] = False
|
| 269 | + print 'Parsing editor %s...' % editor
|
| 270 | + #revisions = db_raw.find({'user_id': editor})
|
| 271 | + file_id = int(editor) % max_file_handles
|
| 272 | + fh = codecs.open(os.path.join(txt_files, '%s.csv' % file_id), 'r', 'utf-8')
|
| 273 | + for line in fh:
|
| 274 | + line = line.strip()
|
| 275 | + line = line.split('\t')
|
| 276 | + if line[0] != editor:
|
| 277 | + continue
|
| 278 | + revision = {}
|
| 279 | + revision['user_id'] = int(line[0])
|
| 280 | + revision['article_id'] = int(line[1])
|
| 281 | + revision['rev_id'] = int(line[2])
|
| 282 | +                revision['ns'] = int(line[4])  # parse to int so the ns < 0 check below compares numbers
|
| 283 | + revision['date'] = datetime.strptime(line[6], '%Y-%m-%dT%H:%M:%SZ')
|
| 284 | + revision['hash'] = line[7]
|
| 285 | + revision['revert'] = line[8]
|
| 286 | + revision['reverted_user'] = line[9]
|
| 287 | + revision['reverted_rev_id'] = line[10]
|
| 288 | + revision['cur_size'] = line[12]
|
| 289 | + revision['delta'] = line[13]
|
| 290 | + #print line
|
| 291 | + #print revision
|
| 292 | +
|
| 293 | + #'user_id', 'article_id', 'rev_id', 'ns', 'date',
|
| 294 | + #'hash', 'revert', 'reverted_user', 'reverted_rev_id', 'delta', 'cur_size'
|
| 295 | + #print 'Editor %s made % edits' % (editor, len(revisions))
|
| 296 | + #for revision in revisions:
|
| 297 | + user_id = idg.get_id(revision['user_id'])
|
| 298 | + revision['user_id'] = user_id #recode id to make it harder to look up answers
|
| 299 | + if revision['ns'] < 0:
|
| 300 | + continue
|
| 301 | + timestamp = revision['date']
|
| 302 | + #revision['date'] = convert_tz_to_mysql_tz(timestamp)
|
| 303 | +
|
| 304 | + predictions.setdefault(user_id, {})
|
| 305 | + predictions[user_id].setdefault('solution', 0)
|
| 306 | + predictions[user_id].setdefault('training', 0)
|
| 307 | +
|
| 308 | + if timestamp > cutoff_date and timestamp < end_date:
|
| 309 | + predictions[user_id]['solution'] += 1
|
| 310 | + elif timestamp > cutoff_date_training and timestamp < end_date_training:
|
| 311 | + predictions[user_id]['training'] += 1
|
| 312 | + if timestamp > cutoff_date: #exclude edits after cut off date
|
| 313 | + continue
|
| 314 | +
|
| 315 | + revision['reverted_user'] = check_reverter(idg, revision.get('reverted_user', -1))
|
| 316 | + #revision.pop('_id')
|
| 317 | + #revision.pop('username')
|
| 318 | + revision['date'] = revision['date'].__str__()
|
| 319 | + titles[revision['article_id']] = True
|
| 320 | + revs[revision['rev_id']] = True
|
| 321 | + size += write_revision(dataset, revision)
|
| 322 | + cnt_obs += 1
|
| 323 | + if cnt_obs % 10000 == 0:
|
| 324 | + print 'Parsed %s revisions...' % cnt_obs
|
| 325 | + fh.close()
|
| 326 | + if size > max_size:
|
| 327 | + break
|
| 328 | +if size > max_size:
|
| 329 | + print 'Reached maximum filesize...'
|
| 330 | +else:
|
| 331 | + print 'Parsed all available editors in post set...'
|
| 332 | +dataset.close()
|
| 333 | +
|
| 334 | +
|
| 335 | +
|
| 336 | +print 'Constructing solution dataset...'
|
| 337 | +fh = codecs.open('solutions.csv', 'w', 'utf-8')
|
| 338 | +keys = predictions.keys()
|
| 339 | +keys.sort()
|
| 340 | +fh.write('%s,%s\n' % ('editor_id', 'solution'))
|
| 341 | +for key in keys:
|
| 342 | + fh.write('%s,%s\n' % (key, predictions[key]['solution']))
|
| 343 | +fh.close()
|
| 344 | +
|
| 345 | +
|
| 346 | +print 'Constructing test dataset...'
|
| 347 | +fh = codecs.open('test.csv', 'w', 'utf-8')
|
| 348 | +fh.write('%s,%s\n' % ('editor_id', 'test'))
|
| 349 | +for key, value in predictions.iteritems():
|
| 350 | + fh.write('%s,%s\n' % (key, value['training']))
|
| 351 | +fh.close()
|
| 352 | +
|
| 353 | +
|
| 354 | +print 'Constructing article file...'
|
| 355 | +fh_articles = codecs.open('titles.tsv', 'w', 'utf-8')
|
| 356 | +article_meta = construct_article_meta(fh_articles, files)
|
| 357 | +for filename in files:
|
| 358 | + if filename.startswith('articles') and not filename.startswith('articles_meta'):
|
| 359 | + fh = codecs.open(os.path.join(location, filename))
|
| 360 | + for line in fh:
|
| 361 | + line = line.strip()
|
| 362 | + line = line.split('\t')
|
| 363 | + if len(line) == 6:
|
| 364 | + article_id = int(line[0])
|
| 365 | + title = titles.get(article_id, None)
|
| 366 | + if title:
|
| 367 | + title = line[-1]
|
| 368 | + meta = article_meta.get(title, None)
|
| 369 | + parent_id = -1
|
| 370 | + category = 'Null'
|
| 371 | + if meta:
|
| 372 | + parent_id = meta['id']
|
| 373 | + category = meta['category']
|
| 374 | +
|
| 375 | + line[1] = category
|
| 376 | + line[2] = convert_tz_to_mysql_tz(line[2])
|
| 377 | + line[-1] = line[-1].decode('utf-8')
|
| 378 | + line.append(str(parent_id))
|
| 379 | + line.append('\n')
|
| 380 | + fh_articles.write('\t'.join(line))
|
| 381 | + fh.close()
|
| 382 | +fh_articles.close()
|
| 383 | +
|
| 384 | +
|
| 385 | +print 'Constructing comment dataset...'
|
| 386 | +fh_comments = codecs.open('comments.tsv', 'w', 'utf-8')
|
| 387 | +fh_comments.write('%s\t%s\n' % ('rev_id', 'text'))
|
| 388 | +cnt = len(revs.keys())
|
| 389 | +for filename in files:
|
| 390 | + if filename.startswith('comments'):
|
| 391 | + fh = codecs.open(os.path.join(location, filename))
|
| 392 | + for line in fh:
|
| 393 | + if cnt == 0:
|
| 394 | + break
|
| 395 | + line = line.strip()
|
| 396 | + line = line.split('\t')
|
| 397 | + if len(line) == 2: #some lines are missing rev id, not sure why.
|
| 398 | + try:
|
| 399 | + rev_id = int(line[0])
|
| 400 | + exists = revs.get(rev_id, None)
|
| 401 | + if exists:
|
| 402 | + fh_comments.write('%s\t%s\n' % (rev_id, line[1].decode('utf-8')))
|
| 403 | + cnt -= 1
|
| 404 | + except (ValueError, KeyError), error:
|
| 405 | + print error
|
| 406 | + fh.close()
|
| 407 | +fh_comments.close()
|
| 408 | +
|
| 409 | +print 'Storing random ids...'
|
| 410 | +fh = open('random_ids.bin', 'wb')
|
| 411 | +cPickle.dump(idg, fh)
|
| 412 | +fh.close()
|
| 413 | +
|
| 414 | +
|
| 415 | +fh = open('descriptives.tsv', 'w')
|
| 416 | +fh.write('Number of unique editors: %s\n' % idg.n)
|
| 417 | +fh.write('Number of revisions: %s\n' % cnt_obs)
|
| 418 | +fh.write('Number of pre-editors: %s\n' % len(pre_editors))
|
| 419 | +fh.write('Number of post-editors: %s\n' % len(post_editors))
|
| 420 | +fh.write('Number of editors with zero edits after August 30th, 2010: %s' % (len(pre_editors) - len(post_editors)))
|
| 421 | +fh.close()
|
| 422 | +
|
| 423 | +
|
| 424 | +t1 = datetime.now()
|
| 425 | +print 'Descriptives:'
|
| 426 | +print 'Number of unique editors: %s' % idg.n
|
| 427 | +print 'Number of revisions: %s' % cnt_obs
|
| 428 | +print 'Number of pre-editors: %s' % len(pre_editors)
|
| 429 | +print 'Number of post-editors: %s' % len(post_editors)
|
| 430 | +print 'Number of editors with zero edits after August 30th, 2010: %s' % (len(pre_editors) - len(post_editors))
|
| 431 | +print 'It took %s to construct the Kaggle training set' % (t1 - t0)
|
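Note: IDGenerator assigns each real user id a stable pseudonymous id drawn from randrange(0, 1000000); with random.seed(1024) at the top of the script the mapping is reproducible, and the generator pickled to random_ids.bin allows the organizers to invert it later. A minimal usage sketch:

    idg = IDGenerator()
    a = idg.get_id(42)                   # first call draws an unused random id
    b = idg.get_id(42)                   # repeated calls return the same id
    assert a == b
    assert idg.reverse_lookup(a) == 42   # invert the mapping, e.g. for grading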
Index: trunk/tools/editor_trends/kaggle/training_db.py |
— | — | @@ -0,0 +1,452 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)']) |
| 17 | +__email__ = 'dvanliere at gmail dot com' |
| 18 | +__date__ = '2011-04-12' |
| 19 | +__version__ = '0.1' |
| 20 | + |
| 21 | +import os |
| 22 | +import sys |
| 23 | +import cPickle |
| 24 | +import codecs |
| 25 | +import random |
| 26 | +from itertools import izip |
| 27 | +from datetime import datetime |
| 28 | +from dateutil.relativedelta import relativedelta |
| 29 | +sys.path.append('../') |
| 30 | + |
| 31 | +random.seed(1024) |
| 32 | +from classes import storage |
| 33 | + |
| 34 | +headers = ['user_id', 'article_id', 'revision_id', 'namespace', 'timestamp', |
| 35 | + 'md5', 'reverted', 'reverted_user_id', 'reverted_revision_id', 'delta', 'cur_size'] |
| 36 | +keys = ['user_id', 'article_id', 'rev_id', 'ns', 'date', |
| 37 | + 'hash', 'revert', 'reverted_user', 'reverted_rev_id', 'delta', 'cur_size'] |
| 38 | + |
| 39 | +max_size = 2147483648 |
| 40 | +#max_size = 2000000 |
| 41 | +cnt_obs = 0 #count of number of edits |
| 42 | +revs = {} |
| 43 | +titles = {} |
| 44 | +predictions = {} |
| 45 | + |
| 46 | +t0 = datetime.now() |
| 47 | +location = '/home/diederik/wikimedia/xml/en/wiki/txt/' |
| 48 | +files = os.listdir(location) |
| 49 | +#files.sort() |
| 50 | +#files.reverse() |
| 51 | +editors_seen = {} |
| 52 | +cutoff_date = datetime(2010, 9, 1) #operator is > |
| 53 | +end_date = datetime(2011, 2, 1) #operator is < |
| 54 | +cutoff_date_training = datetime(2010, 1, 31) #operator is > |
| 55 | +end_date_training = datetime(2010, 9, 1) # operator is < |
| 56 | + |
| 57 | +class IDGenerator: |
| 58 | + def __init__(self): |
| 59 | + self.n = 0 |
| 60 | + self.ids = {} |
| 61 | + |
| 62 | + def get_id(self, n): |
| 63 | + if n not in self.ids: |
| 64 | + self.ids[n] = self.n |
| 65 | + self.n += 1 |
| 66 | + return str(self.ids[n]) |
| 67 | + |
| 68 | +class RandomIDGenerator: |
| 69 | + def __init__(self): |
| 70 | + self.n = 0 |
| 71 | + self.ids = {} |
| 72 | + self.rnd_ids = {} |
| 73 | + self.inverted_ids = None |
| 74 | + |
| 75 | + def invert_dict(self, dictionary): |
| 76 | + return dict((v, k) for k, v in dictionary.iteritems()) |
| 77 | + |
| 78 | + def get_id(self, n): |
| 79 | + if n not in self.ids: |
| 80 | + self.n += 1 |
| 81 | +            while len(self.rnd_ids) < self.n: |
| 82 | + rnd_id = self.get_random_id() |
| 83 | + if self.rnd_ids.get(rnd_id, False) == False: |
| 84 | + self.rnd_ids[rnd_id] = True |
| 85 | + self.ids[n] = rnd_id |
| 86 | + return self.ids[n] |
| 87 | + |
| 88 | + def get_random_id(self): |
| 89 | + return random.randrange(0, 1000000) |
| 90 | + |
| 91 | + def reverse_lookup(self, n): |
| 92 | + self.inverted_ids = self.invert_dict(self.ids) |
| 93 | + return self.inverted_ids[n] |
| 94 | + |
| 95 | + |
| 96 | +def construct_article_meta(fh_articles, files): |
| 97 | + print 'Constructing title dataset...' |
| 98 | + headers = ['article_id', 'category', 'timestamp', 'namespace', 'redirect', 'title', 'related_page'] |
| 99 | + write_headers(fh_articles, headers) |
| 100 | + #fh_articles.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % ('article_id', 'category', 'timestamp', 'namespace', 'redirect', 'title', 'related_page')) |
| 101 | + article_meta = {} |
| 102 | + for filename in files: |
| 103 | + if filename.startswith('articles_meta'): |
| 104 | + fh = codecs.open(os.path.join(location, filename)) |
| 105 | + for line in fh: |
| 106 | + line = line.strip() |
| 107 | + line = line.split('\t') |
| 108 | + category = line[1] |
| 109 | + if category != 'List': |
| 110 | + title = line[2] |
| 111 | + title = title.split('/') |
| 112 | + article_meta.setdefault(title[-1], {}) |
| 113 | + article_meta[title[-1]]['category'] = category |
| 114 | + article_meta[title[-1]]['id'] = line[0] |
| 115 | + fh.close() |
| 116 | + return article_meta |
| 117 | + |
| 118 | + |
| 119 | +def determine_active(edits, start_date, end_date): |
| 120 | + active = 0 |
| 121 | + if start_date == datetime(2009, 9, 1): |
| 122 | + if '2009' not in edits and '2010' not in edits: |
| 123 | + return active |
| 124 | +# elif start_date == datetime(2010, 9, 1): |
| 125 | +# if '2010' not in edits and '2011' not in edits: |
| 126 | +# return active |
| 127 | + |
| 128 | + |
| 129 | + namespaces = ['0', '1', '2', '3', '4', '5'] |
| 130 | + while start_date < end_date: |
| 131 | + year = str(start_date.year) |
| 132 | + month = str(start_date.month) |
| 133 | + for ns in namespaces: |
| 134 | + active += edits.get(year, {}).get(month, {}).get(ns, 0) |
| 135 | + if active > 0: #we don't need to know how many edits,just if active |
| 136 | + return active |
| 137 | +        start_date = start_date + relativedelta(months=1) |
| 138 | + return active |
| 139 | + |
| 140 | + |
| 141 | +def load_binary_file(filename): |
| 142 | + fh = open(filename, 'rb') |
| 143 | + obj = cPickle.load(fh) |
| 144 | + fh.close() |
| 145 | + return obj |
| 146 | + |
| 147 | + |
| 148 | +def convert_tz_to_mysql_tz(tz): |
| 149 | + return tz.__str__() |
| 150 | + |
| 151 | + |
| 152 | +def check_reverter(idg, reverter): |
| 153 | + try: |
| 154 | + if reverter != -1: |
| 155 | + reverter = idg.get_id(reverter) |
| 156 | + return reverter |
| 157 | + except ValueError: |
| 158 | + pass |
| 159 | + return -1 |
| 160 | + |
| 161 | + |
| 162 | +def check_user_id(user_id): |
| 163 | + try: |
| 164 | + int(user_id) |
| 165 | + except ValueError: |
| 166 | + return False |
| 167 | + return True |
| 168 | + |
| 169 | + |
| 170 | +def check_username(username): |
| 171 | + username = username.lower() |
| 172 | + if username.endswith('bot') or username.find('script') > -1: |
| 173 | + return False #exclude more bots and scripts |
| 174 | + return True |
| 175 | + |
| 176 | + |
| 177 | +def determine_editors(db): |
| 178 | + start_date_pre = datetime(2009, 9, 1) |
| 179 | + end_date_pre = datetime(2010, 9, 1) |
| 180 | + end_date = datetime(2011, 2, 1) |
| 181 | + pre_editors = set() |
| 182 | + post_editors = set() |
| 183 | + cursor = db.find({}, 'first_edit,edit_count,user_id,username') |
| 184 | + x, y, z = 0, 0, 0 |
| 185 | + for editor in cursor: |
| 186 | + x += 1 |
| 187 | + if 'first_edit' not in editor: |
| 188 | + continue |
| 189 | + if editor['first_edit'] > end_date_pre: |
| 190 | + continue |
| 191 | + if check_username(editor['username']) == False: |
| 192 | + continue |
| 193 | + if check_user_id(editor['user_id']) == False: |
| 194 | + continue |
| 195 | + |
| 196 | + active_pre = determine_active(editor['edit_count'], start_date_pre, end_date_pre) |
| 197 | + if x % 100000 == 0: |
| 198 | + print 'Retrieved %s pre_editors / %s post_editors / %s total editors...' % (y, z, x) |
| 199 | + |
| 200 | + if active_pre == 0: |
| 201 | + continue #exclude editors who are not active in the year before the cutoff date |
| 202 | + else: |
| 203 | + active_post = determine_active(editor['edit_count'], end_date_pre, end_date) |
| 204 | + if active_post == 0: |
| 205 | +                pre_editors.add(editor['user_id'])  # bare user_id is undefined here; use the projected field |
| 206 | + y += 1 |
| 207 | + else: |
| 208 | +                post_editors.add(editor['user_id']) |
| 209 | + z += 1 |
| 210 | + print 'Retrieved %s pre_editors / %s post_editors / %s total editors...' % (y, z, x) |
| 211 | + return pre_editors, post_editors |
| 212 | + |
| 213 | + |
| 214 | +def write_headers(fh, headers): |
| 215 | + for i, key in enumerate(headers): |
| 216 | + fh.write('%s' % key) |
| 217 | + if (i + 1) != len(headers): |
| 218 | + fh.write('\t') |
| 219 | + else: |
| 220 | + fh.write('\n') |
| 221 | + |
| 222 | +def write_revision(dataset, revision): |
| 223 | + for i, key in enumerate(keys): |
| 224 | + if type(revision[key]) == type(0): |
| 225 | + revision[key] = str(revision[key]) |
| 226 | + dataset.write('%s' % revision[key].decode('utf-8')) |
| 227 | + if (i + 1) != len(keys): |
| 228 | + dataset.write('\t') |
| 229 | + else: |
| 230 | + dataset.write('\n') |
| 231 | + |
| 232 | + |
| 233 | +print 'Constructing training dataset...' |
| 234 | +db_dataset = storage.init_database('mongo', 'wikilytics', 'enwiki_editors_dataset') |
| 235 | +print 'Loading editors...' |
| 236 | +if not os.path.exists('set_a.bin'): |
| 237 | + pre_editors, post_editors = determine_editors(db_dataset) |
| 238 | + fh = open('set_a.bin', 'wb') |
| 239 | + cPickle.dump(pre_editors, fh) |
| 240 | + fh.close() |
| 241 | + |
| 242 | + fh = open('set_b.bin', 'wb') |
| 243 | + cPickle.dump(post_editors, fh) |
| 244 | + fh.close() |
| 245 | +else: |
| 246 | + pre_editors = load_binary_file('set_a.bin') |
| 247 | + post_editors = load_binary_file('set_b.bin') |
| 248 | + |
| 249 | + |
| 250 | +dataset = codecs.open('training.tsv', 'w', 'utf-8') |
| 251 | +write_headers(dataset, headers) |
| 252 | +idg = RandomIDGenerator() |
| 253 | + |
| 254 | +namespaces = IDGenerator() |
| 255 | +print 'Parsing revisions...' |
| 256 | +db_raw = storage.init_database('mongo', 'wikilytics', 'enwiki_editors_raw') |
| 257 | +seen_editors = {} |
| 258 | +editors = {} |
| 259 | +x = 1 |
| 260 | +for editor in post_editors: |
| 261 | + #print editor |
| 262 | + editors[x] = editor |
| 263 | + x += 2 |
| 264 | +x = 0 |
| 265 | +z = len(post_editors) |
| 266 | +for y, editor in enumerate(pre_editors): |
| 267 | + #print editor |
| 268 | + editors[x] = editor |
| 269 | + x += 2 |
| 270 | + if z == y: |
| 271 | + break |
| 272 | + |
| 273 | +editor_keys = editors.keys() |
| 274 | +editor_keys.sort() |
| 275 | +for key in editor_keys: |
| 276 | + #print editors |
| 277 | + #for editor in editors: |
| 278 | + editor = editors[key] |
| 279 | + #print editor |
| 280 | + go = editors_seen.get(editor, True) |
| 281 | + if go: |
| 282 | + editors_seen[editor] = False |
| 283 | + user_id = idg.get_id(editor) |
| 284 | + print 'Parsing editor %s (%s) ...' % (editor, user_id) |
| 285 | + revisions = db_raw.find({'user_id': str(editor)}) |
| 286 | + |
| 287 | + predictions.setdefault(user_id, {}) |
| 288 | + predictions[user_id].setdefault('solution', 0) |
| 289 | + predictions[user_id].setdefault('training', 0) |
| 290 | + |
| 291 | + for revision in revisions: |
| 292 | + revision['user_id'] = user_id #recode id to make it harder to look up answers |
| 293 | + if revision['ns'] < 0 or revision['ns'] > 5: |
| 294 | + continue |
| 295 | + #revision['ns'] = namespaces.get_id(revision['ns']) |
| 296 | + timestamp = revision['date'] |
| 297 | + revision['date'] = convert_tz_to_mysql_tz(timestamp) |
| 298 | + |
| 299 | + |
| 300 | + |
| 301 | + if timestamp > cutoff_date: |
| 302 | + #print editor, user_id, timestamp, revision['date'] |
| 303 | + if timestamp < end_date: |
| 304 | + predictions[user_id]['solution'] += 1 |
| 305 | + elif timestamp > cutoff_date_training: |
| 306 | + if timestamp < end_date_training: |
| 307 | + predictions[user_id]['training'] += 1 |
| 308 | + |
| 309 | + if timestamp > cutoff_date: #exclude edits after cut off date |
| 310 | + continue |
| 311 | + if revision['revert'] == 1: |
| 312 | + revision['reverted_user'] = check_reverter(idg, revision.get('reverted_user', -1)) |
| 313 | + revision.pop('_id') |
| 314 | + revision.pop('username') |
| 315 | + titles[revision['article_id']] = True |
| 316 | + revs[revision['rev_id']] = True |
| 317 | + write_revision(dataset, revision) |
| 318 | + cnt_obs += 1 |
| 319 | + if cnt_obs % 10000 == 0: |
| 320 | + print 'Parsed %s revisions...' % cnt_obs |
| 321 | + if dataset.tell() > max_size: |
| 322 | + break |
| 323 | +if dataset.tell() > max_size: |
| 324 | + print 'Reached maximum filesize...' |
| 325 | +else: |
| 326 | + print 'Parsed all available editors in post set...' |
| 327 | +dataset.close() |
| 328 | + |
| 329 | + |
| 330 | + |
| 331 | +print 'Constructing solution dataset...' |
| 332 | +fh = codecs.open('solutions.csv', 'w', 'utf-8') |
| 333 | +editor_keys = predictions.keys() |
| 334 | +editor_keys.sort() |
| 335 | +fh.write('%s,%s\n' % ('user_id', 'solution')) |
| 336 | +for key in editor_keys: |
| 337 | + fh.write('%s,%s\n' % (key, predictions[key]['solution'])) |
| 338 | + print key, predictions[key]['solution'] |
| 339 | +fh.close() |
| 340 | + |
| 341 | + |
| 342 | +print 'Constructing test dataset...' |
| 343 | +fh = codecs.open('test.csv', 'w', 'utf-8') |
| 344 | +fh.write('%s,%s\n' % ('user_id', 'test')) |
| 345 | +for key, value in predictions.iteritems(): |
| 346 | + fh.write('%s,%s\n' % (key, value['training'])) |
| 347 | +fh.close() |
| 348 | + |
| 349 | +print 'Constructing article file...' |
| 350 | +fh_articles = codecs.open('titles.tsv', 'w', 'utf-8') |
| 351 | +article_meta = construct_article_meta(fh_articles, files) |
| 352 | +categories = IDGenerator() |
| 353 | +for filename in files: |
| 354 | + if filename.startswith('articles') and not filename.startswith('articles_meta'): |
| 355 | + fh = codecs.open(os.path.join(location, filename)) |
| 356 | + for line in fh: |
| 357 | + line = line.strip() |
| 358 | + line = line.split('\t') |
| 359 | + if len(line) == 6: |
| 360 | + article_id = int(line[0]) |
| 361 | + title = titles.pop(article_id, None) |
| 362 | + if title: |
| 363 | + title = line[-1] |
| 364 | + meta = article_meta.get(title, None) |
| 365 | + parent_id = '-1' |
| 366 | + category = -1 |
| 367 | + redirect = line[4] |
| 368 | + if redirect == 'False': |
| 369 | + redirect = '0' |
| 370 | + else: |
| 371 | + redirect = '1' |
| 372 | + line[4] = redirect |
| 373 | + if meta: |
| 374 | + parent_id = meta['id'] |
| 375 | + category = meta['category'] |
| 376 | + |
| 377 | + |
| 378 | + line[1] = categories.get_id(category) |
| 379 | + tz = datetime.strptime(line[2], '%Y-%m-%dT%H:%M:%SZ') |
| 380 | + line[2] = convert_tz_to_mysql_tz(tz) |
| 381 | + line[-1] = line[-1].decode('utf-8') |
| 382 | + line.append(parent_id) |
| 383 | + line.append('\n') |
| 384 | + fh_articles.write('\t'.join(line)) |
| 385 | + fh.close() |
| 386 | +fh_articles.close() |
| 387 | + |
| 388 | + |
| 389 | +print 'Constructing comment dataset...' |
| 390 | +fh_comments = codecs.open('comments.tsv', 'w', 'utf-8') |
| 391 | +fh_comments.write('%s\t%s\n' % ('revision_id', 'comment')) |
| 392 | +cnt = len(revs.keys()) |
| 393 | +for filename in files: |
| 394 | + if filename.startswith('comments'): |
| 395 | + fh = codecs.open(os.path.join(location, filename)) |
| 396 | + for line in fh: |
| 397 | + if cnt == 0: |
| 398 | + break |
| 399 | + line = line.strip() |
| 400 | + line = line.split('\t') |
| 401 | + if len(line) == 2: #some lines are missing rev id, not sure why. |
| 402 | + try: |
| 403 | + rev_id = int(line[0]) |
| 404 | + exists = revs.get(rev_id, None) |
| 405 | + if exists: |
| 406 | + fh_comments.write('%s\t%s\n' % (rev_id, line[1].decode('utf-8'))) |
| 407 | + cnt -= 1 |
| 408 | + except (ValueError, KeyError), error: |
| 409 | + print error |
| 410 | + fh.close() |
| 411 | +fh_comments.close() |
| 412 | + |
| 413 | +print 'Storing random ids...' |
| 414 | +fh = open('random_ids.bin', 'wb') |
| 415 | +cPickle.dump(idg, fh) |
| 416 | +fh.close() |
| 417 | + |
| 418 | +fh = codecs.open('namespaces.tsv', 'w', 'utf-8') |
| 419 | +write_headers(fh, ['key', 'namespace']) |
| 420 | +namespaces = {'0':'Main', |
| 421 | + '1':'Talk', |
| 422 | + '2':'User', |
| 423 | + '3':'User Talk', |
| 424 | + '4':'Wikipedia', |
| 425 | + '5':'Wikipedia Talk' |
| 426 | + } |
| 427 | +for key, value in namespaces.iteritems(): |
| 428 | + fh.write('%s\t%s\n' % (key, value)) |
| 429 | +fh.close() |
| 430 | + |
| 431 | +fh = codecs.open('categories.tsv', 'w', 'utf-8') |
| 432 | +write_headers(fh, ['id', 'name']) |
| 433 | +for key, value in categories.ids.iteritems(): |
| 434 | + fh.write('%s\t%s\n' % (value, key)) |
| 435 | +fh.close() |
| 436 | + |
| 437 | +fh = open('descriptives.tsv', 'w') |
| 438 | +fh.write('Number of unique editors: %s\n' % idg.n) |
| 439 | +fh.write('Number of revisions: %s\n' % cnt_obs) |
| 440 | +fh.write('Number of pre-editors: %s\n' % len(pre_editors)) |
| 441 | +fh.write('Number of post-editors: %s\n' % len(post_editors)) |
| 442 | +fh.write('Number of editors with zero edits after August 30th, 2010: %s' % (len(pre_editors) - len(post_editors))) |
| 443 | +fh.close() |
| 444 | + |
| 445 | + |
| 446 | +t1 = datetime.now() |
| 447 | +print 'Descriptives:' |
| 448 | +print 'Number of unique editors: %s' % idg.n |
| 449 | +print 'Number of revisions: %s' % cnt_obs |
| 450 | +print 'Number of pre-editors: %s' % len(pre_editors) |
| 451 | +print 'Number of post-editors: %s' % len(post_editors) |
| 452 | +print 'Number of editors with zero edits after August 30th, 2010: %s' % (len(pre_editors) - len(post_editors)) |
| 453 | +print 'It took %s to construct the Kaggle training set' % (t1 - t0) |
Property changes on: trunk/tools/editor_trends/kaggle/training_db.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 454 | + native |
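Note: the two loops that build the editors dict park post_editors on odd keys and pre_editors on even keys, so iterating the sorted keys alternates retained and churned editors until the 2 GB file cap is hit; this keeps both classes represented even in a truncated sample. A hypothetical illustration (set iteration order is arbitrary):

    # post_editors = {'p1', 'p2'}, pre_editors = {'q1', 'q2', 'q3'}
    # editors -> {1: 'p1', 3: 'p2', 0: 'q1', 2: 'q2', 4: 'q3'}
    # sorted(editors.keys()) -> [0, 1, 2, 3, 4] -> q1, p1, q2, p2, q3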