Index: trunk/tools/editor_trends/run.py |
— | — | @@ -1,37 +1,37 @@ |
2 | | -import os
|
3 | | -
|
4 | | -import settings
|
5 | | -#from utils import namespace_downloader as nd
|
6 | | -#nd.launch_downloader()
|
7 | | -
|
8 | | -
|
9 | | -#def which(program):
|
10 | | -# import os
|
11 | | -# def is_exe(fpath):
|
12 | | -# return os.path.exists(fpath) and os.access(fpath, os.X_OK)
|
13 | | -#
|
14 | | -# fpath, fname = os.path.split(program)
|
15 | | -# if fpath:
|
16 | | -# if is_exe(program):
|
17 | | -# return program
|
18 | | -# else:
|
19 | | -# for path in os.environ["PATH"].split(os.pathsep):
|
20 | | -# exe_file = os.path.join(path, program)
|
21 | | -# if is_exe(exe_file):
|
22 | | -# return exe_file
|
23 | | -#
|
24 | | -# return None
|
25 | | -#
|
26 | | -#
|
27 | | -#result = which('7z.exe')
|
28 | | -#print result
|
29 | | -
|
30 | | -#from database import launcher
|
31 | | -#launcher.launcher()
|
32 | | -from utils import sort
|
33 | | -input = os.path.join(settings.XML_FILE_LOCATION, 'en', 'wiki', 'txt')
|
34 | | -output = os.path.join(settings.XML_FILE_LOCATION, 'en', 'wiki', 'sorted')
|
35 | | -dbname = 'enwiki'
|
36 | | -#sort.debug_mergesort_feeder(input, output)
|
37 | | -#sort.mergesort_launcher(input, output)
|
| 2 | +import os |
| 3 | + |
| 4 | +import settings |
| 5 | +#from utils import namespace_downloader as nd |
| 6 | +#nd.launch_downloader() |
| 7 | + |
| 8 | + |
| 9 | +#def which(program): |
| 10 | +# import os |
| 11 | +# def is_exe(fpath): |
| 12 | +# return os.path.exists(fpath) and os.access(fpath, os.X_OK) |
| 13 | +# |
| 14 | +# fpath, fname = os.path.split(program) |
| 15 | +# if fpath: |
| 16 | +# if is_exe(program): |
| 17 | +# return program |
| 18 | +# else: |
| 19 | +# for path in os.environ["PATH"].split(os.pathsep): |
| 20 | +# exe_file = os.path.join(path, program) |
| 21 | +# if is_exe(exe_file): |
| 22 | +# return exe_file |
| 23 | +# |
| 24 | +# return None |
| 25 | +# |
| 26 | +# |
| 27 | +#result = which('7z.exe') |
| 28 | +#print result |
| 29 | + |
| 30 | +#from database import launcher |
| 31 | +#launcher.launcher() |
from utils import sort
# Directory with the unsorted text files and directory for the sorted output,
# both located under <XML_FILE_LOCATION>/en/wiki.
input = os.path.join(settings.XML_FILE_LOCATION, 'en', 'wiki', 'txt')
output = os.path.join(settings.XML_FILE_LOCATION, 'en', 'wiki', 'sorted')
dbname = 'enwiki'
#sort.debug_mergesort_feeder(input, output)
#sort.mergesort_launcher(input, output)
# NOTE(review): `output` is passed as both the second and the third argument
# and `input` is not used by this call -- confirm that this is intended.
sort.mergesort_external_launcher(dbname, output, output)
\ No newline at end of file |
Property changes on: trunk/tools/editor_trends/run.py |
___________________________________________________________________ |
Added: svn:eol-style |
39 | 39 | + native |
Index: trunk/tools/editor_trends/optimize_editors.py |
— | — | @@ -1,145 +1,145 @@ |
2 | | -#!/usr/bin/python
|
3 | | -# -*- coding: utf-8 -*-
|
4 | | -'''
|
5 | | -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
|
6 | | -This program is free software; you can redistribute it and/or
|
7 | | -modify it under the terms of the GNU General Public License version 2
|
8 | | -as published by the Free Software Foundation.
|
9 | | -This program is distributed in the hope that it will be useful,
|
10 | | -but WITHOUT ANY WARRANTY; without even the implied warranty of
|
11 | | -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
12 | | -See the GNU General Public License for more details, at
|
13 | | -http://www.fsf.org/licenses/gpl.html
|
14 | | -'''
|
15 | | -
|
16 | | -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
|
17 | | -__author__email = 'dvanliere at gmail dot com'
|
18 | | -__date__ = '2010-11-02'
|
19 | | -__version__ = '0.1'
|
20 | | -
|
21 | | -from multiprocessing import Queue
|
22 | | -from Queue import Empty
|
23 | | -from operator import itemgetter
|
24 | | -import datetime
|
25 | | -
|
26 | | -import settings
|
27 | | -from database import db
|
28 | | -from utils import process_constructor as pc
|
29 | | -from utils import utils
|
30 | | -import construct_datasets
|
31 | | -
|
32 | | -
|
33 | | -def create_datacontainer(init_value=0):
|
34 | | - '''
|
35 | | - This function initializes an empty dictionary with as key the year (starting
|
36 | | - 2001 and running through) and as value @init_value, in most cases this will
|
37 | | - be zero so the dictionary will act as a running tally for a variable but
|
38 | | - @init_value can also a list, [], or a dictionary, {}, or a set, set().
|
39 | | - '''
|
40 | | - data = {}
|
41 | | - year = datetime.datetime.now().year + 1
|
42 | | - for x in xrange(2001, year):
|
43 | | - data[str(x)] = init_value
|
44 | | - return data
|
45 | | -
|
46 | | -
|
47 | | -def determine_edits_by_year(dates):
|
48 | | - '''
|
49 | | - This function counts the number of edits by year made by a particular editor.
|
50 | | - '''
|
51 | | - edits = create_datacontainer()
|
52 | | - for date in dates:
|
53 | | - year = str(date['date'].year)
|
54 | | - edits[year] += 1
|
55 | | - return edits
|
56 | | -
|
57 | | -
|
58 | | -def determine_articles_by_year(dates):
|
59 | | - '''
|
60 | | - This function counts the number of unique articles by year edited by a
|
61 | | - particular editor.
|
62 | | - '''
|
63 | | - articles = create_datacontainer(set())
|
64 | | - for date in dates:
|
65 | | - year = str(date['date'].year)
|
66 | | - articles[year].add(date['article'])
|
67 | | - for article in articles:
|
68 | | - articles[article] = len(article)
|
69 | | - return articles
|
70 | | -
|
71 | | -
|
72 | | -def sort_edits(edits):
|
73 | | - edits = utils.merge_list(edits)
|
74 | | - return sorted(edits, key=itemgetter('date'))
|
75 | | -
|
76 | | -
|
77 | | -def optimize_editors(input_queue, result_queue, pbar, **kwargs):
|
78 | | - dbname = kwargs.pop('dbname')
|
79 | | - mongo = db.init_mongo_db(dbname)
|
80 | | - input = mongo['editors']
|
81 | | - output = mongo['dataset']
|
82 | | - output.ensure_index('editor')
|
83 | | - output.ensure_index('year_joined')
|
84 | | - definition = kwargs.pop('definition')
|
85 | | - while True:
|
86 | | - try:
|
87 | | - id = input_queue.get(block=False)
|
88 | | - editor = input.find_one({'editor': id})
|
89 | | - edits = editor['edits']
|
90 | | - edits = sort_edits(edits)
|
91 | | - edit_count = len(edits)
|
92 | | - new_wikipedian = edits[9]['date']
|
93 | | - first_edit = edits[0]['date']
|
94 | | - final_edit = edits[-1]['date']
|
95 | | - edits_by_year = determine_edits_by_year(edits)
|
96 | | - articles_by_year = determine_articles_by_year(edits)
|
97 | | - edits = edits[:10]
|
98 | | -
|
99 | | - output.insert({'editor': id, 'edits': edits,
|
100 | | - 'edits_by_year': edits_by_year,
|
101 | | - 'new_wikipedian': new_wikipedian,
|
102 | | - 'edit_count': edit_count,
|
103 | | - 'final_edit': final_edit,
|
104 | | - 'first_edit': first_edit,
|
105 | | - 'articles_by_year': articles_by_year})
|
106 | | - print 'Items left: %s' % input_queue.qsize()
|
107 | | - except Empty:
|
108 | | - break
|
109 | | -
|
110 | | -
|
111 | | -def run_optimize_editors(dbname):
|
112 | | - ids = construct_datasets.retrieve_editor_ids_mongo(dbname, 'editors')
|
113 | | - kwargs = {'definition': 'traditional',
|
114 | | - 'pbar': True,
|
115 | | - 'dbname': 'enwiki',
|
116 | | - 'nr_input_processors': 1,
|
117 | | - 'nr_output_processors': 0,
|
118 | | - }
|
119 | | - print len(ids)
|
120 | | - ids = list(ids)
|
121 | | - chunks = utils.split_list(ids, settings.NUMBER_OF_PROCESSES)
|
122 | | -# chunks = {}
|
123 | | -# parts = int(round(float(len(ids)) / 1, 0))
|
124 | | -# a = 0
|
125 | | -# for x in xrange(settings.NUMBER_OF_PROCESSES):
|
126 | | -# b = a + parts
|
127 | | -# chunks[x] = ids[a:b]
|
128 | | -# a = (x + 1) * parts
|
129 | | -# if a >= len(ids):
|
130 | | -# break
|
131 | | -
|
132 | | - pc.build_scaffolding(pc.load_queue, optimize_editors, chunks, False, False, **kwargs)
|
133 | | -
|
134 | | -
|
135 | | -def debug_optimize_editors(dbname):
|
136 | | - ids = construct_datasets.retrieve_editor_ids_mongo(dbname, 'editors')
|
137 | | - q = pc.load_queue(ids)
|
138 | | - kwargs = {'definition': 'traditional',
|
139 | | - 'dbname': dbname
|
140 | | - }
|
141 | | - optimize_editors(q, False, True, kwargs)
|
142 | | -
|
143 | | -
|
144 | | -if __name__ == '__main__':
|
145 | | - #debug_optimize_editors('test')
|
146 | | - run_optimize_editors('enwiki')
|
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__author__email = 'dvanliere at gmail dot com' |
| 18 | +__date__ = '2010-11-02' |
| 19 | +__version__ = '0.1' |
| 20 | + |
| 21 | +from multiprocessing import Queue |
| 22 | +from Queue import Empty |
| 23 | +from operator import itemgetter |
| 24 | +import datetime |
| 25 | + |
| 26 | +import settings |
| 27 | +from database import db |
| 28 | +from utils import process_constructor as pc |
| 29 | +from utils import utils |
| 30 | +import construct_datasets |
| 31 | + |
| 32 | + |
def create_datacontainer(init_value=0):
    '''
    This function initializes a dictionary with as key the year (as a string,
    starting at 2001 and running through the current year) and as value
    @init_value. In most cases this will be zero so the dictionary acts as a
    running tally for a variable, but @init_value can also be a list, [], a
    dictionary, {}, or a set, set().

    Every year receives its own copy of @init_value, so that updating the
    container of one year does not leak into the containers of other years.
    '''
    import copy

    # range() excludes its upper bound, hence the + 1 to include this year.
    final_year = datetime.datetime.now().year + 1
    return dict((str(year), copy.deepcopy(init_value))
                for year in range(2001, final_year))
| 45 | + |
| 46 | + |
def determine_edits_by_year(dates):
    '''
    Tally the edits of a single editor per calendar year.

    @dates is an iterable of edits; the year is taken from the 'date' value
    of each edit. The result maps the year (as a string) to the number of
    edits made in that year.
    '''
    tally = create_datacontainer()
    for edit in dates:
        tally[str(edit['date'].year)] += 1
    return tally
| 56 | + |
| 57 | + |
def determine_articles_by_year(dates):
    '''
    This function counts the number of unique articles by year edited by a
    particular editor.

    @dates is an iterable of edits, each providing a 'date' and an 'article'
    value. The result maps the year (as a string) to the number of distinct
    articles edited in that year.
    '''
    # Give every year its own set; sharing a single set between the years
    # would mix the articles of all years together.
    articles = dict((year, set()) for year in create_datacontainer())
    for edit in dates:
        articles[str(edit['date'].year)].add(edit['article'])
    # Count the articles collected for each year (not the length of the
    # year key itself).
    return dict((year, len(titles)) for year, titles in articles.items())
| 70 | + |
| 71 | + |
def sort_edits(edits):
    '''
    Pass @edits through utils.merge_list and return the result as a new list
    sorted in ascending order of the 'date' value of each edit.
    '''
    by_date = itemgetter('date')
    merged = utils.merge_list(edits)
    return sorted(merged, key=by_date)
| 75 | + |
| 76 | + |
def optimize_editors(input_queue, result_queue, pbar, **kwargs):
    '''
    Condense the edit history of every editor whose id is waiting in
    @input_queue into one summary document in the 'dataset' collection.

    @input_queue supplies editor ids; ids are taken without blocking and the
    function returns as soon as the queue reports Empty.
    @result_queue and @pbar are not used by this function.
    @kwargs must contain 'dbname' (the Mongo database to use) and
    'definition' (read but currently not used).

    The stored document holds the first ten edits, the dates of the first,
    tenth ('new_wikipedian') and final edit, the total edit count and the
    per-year tallies of edits and articles. Editors that cannot be found or
    that have no edits are skipped; 'new_wikipedian' is False for editors
    with fewer than ten edits.
    '''
    dbname = kwargs.pop('dbname')
    mongo = db.init_mongo_db(dbname)
    editors = mongo['editors']
    dataset = mongo['dataset']
    dataset.ensure_index('editor')
    dataset.ensure_index('year_joined')
    definition = kwargs.pop('definition')
    while True:
        # Only the queue read can signal that the work is done, so keep the
        # try block limited to that call.
        try:
            editor_id = input_queue.get(block=False)
        except Empty:
            break

        editor = editors.find_one({'editor': editor_id})
        if editor is None:
            # No document for this id, nothing to summarize.
            continue
        edits = sort_edits(editor['edits'])
        if not edits:
            continue

        edit_count = len(edits)
        # The tenth edit only exists for editors with at least ten edits;
        # indexing it unconditionally would raise IndexError.
        if edit_count > 9:
            new_wikipedian = edits[9]['date']
        else:
            new_wikipedian = False
        first_edit = edits[0]['date']
        final_edit = edits[-1]['date']
        edits_by_year = determine_edits_by_year(edits)
        articles_by_year = determine_articles_by_year(edits)

        dataset.insert({'editor': editor_id, 'edits': edits[:10],
                        'edits_by_year': edits_by_year,
                        'new_wikipedian': new_wikipedian,
                        'edit_count': edit_count,
                        'final_edit': final_edit,
                        'first_edit': first_edit,
                        'articles_by_year': articles_by_year})
        print('Items left: %s' % input_queue.qsize())
| 109 | + |
| 110 | + |
def run_optimize_editors(dbname):
    '''
    Retrieve all editor ids from the 'editors' collection of @dbname, split
    them into settings.NUMBER_OF_PROCESSES chunks and hand the chunks to
    pc.build_scaffolding with optimize_editors as the worker function.

    @dbname is the name of the Mongo database; it is used both to retrieve
    the ids and, through the keyword arguments, by the workers.
    '''
    ids = construct_datasets.retrieve_editor_ids_mongo(dbname, 'editors')
    kwargs = {'definition': 'traditional',
              'pbar': True,
              # Use the requested database instead of a hard-coded name so
              # that the workers read from where the ids were retrieved.
              'dbname': dbname,
              'nr_input_processors': 1,
              'nr_output_processors': 0,
              }
    print(len(ids))
    ids = list(ids)
    chunks = utils.split_list(ids, settings.NUMBER_OF_PROCESSES)
    pc.build_scaffolding(pc.load_queue, optimize_editors, chunks, False, False, **kwargs)
| 133 | + |
| 134 | + |
def debug_optimize_editors(dbname):
    '''
    Run optimize_editors directly in the current process, without the process
    scaffolding, on all editor ids of the 'editors' collection of @dbname.
    '''
    ids = construct_datasets.retrieve_editor_ids_mongo(dbname, 'editors')
    q = pc.load_queue(ids)
    kwargs = {'definition': 'traditional',
              'dbname': dbname
              }
    # optimize_editors accepts only three positional arguments; the settings
    # have to be unpacked as keyword arguments.
    optimize_editors(q, False, True, **kwargs)
| 142 | + |
| 143 | + |
# Script entry point: process the editors of the 'enwiki' database; the
# commented-out call runs the single-process debug variant on 'test'.
if __name__ == '__main__':
    #debug_optimize_editors('test')
    run_optimize_editors('enwiki')
Property changes on: trunk/tools/editor_trends/optimize_editors.py |
___________________________________________________________________ |
Added: svn:eol-style |
147 | 147 | + native |