Index: trunk/tools/editor_trends/manage.py |
— | — | @@ -22,8 +22,8 @@ |
23 | 23 | import subprocess |
24 | 24 | from argparse import ArgumentParser |
25 | 25 | from argparse import RawTextHelpFormatter |
| 26 | +import locale |
26 | 27 | |
27 | | - |
28 | 28 | import progressbar |
29 | 29 | |
30 | 30 | import settings |
— | — | @@ -43,6 +43,11 @@ |
44 | 44 | config.load_configuration(args) |
45 | 45 | |
46 | 46 | |
def determine_default_language():
    '''
    Derive a default language from the locale of the machine.

    Returns the part of the default locale name before the first underscore,
    e.g. 'nl' for 'nl_NL'. When the locale cannot be determined 'en' is
    returned instead of raising an exception.
    '''
    try:
        language_code = locale.getdefaultlocale()[0]
    except ValueError:
        # getdefaultlocale() raises ValueError for locale names it cannot parse
        language_code = None
    if not language_code:
        # (None, None) is returned for the C/POSIX locale or when no locale
        # environment variable is set; calling .split on None would crash.
        return 'en'
    return language_code.split('_')[0]
| 50 | + |
| 51 | + |
47 | 52 | def retrieve_projectname(args): |
48 | 53 | language_code = retrieve_language(args) |
49 | 54 | if language_code == None: |
— | — | @@ -53,13 +58,16 @@ |
54 | 59 | if project == None: |
55 | 60 | print 'Entered project: %s is not valid Wikipedia project.' % get_value(args, 'project') |
56 | 61 | sys.exit(-1) |
| 62 | + if project == 'commonswiki': |
| 63 | + return project |
| 64 | + else: |
| 65 | + return '%s%s' % (language_code, project) |
57 | 66 | |
58 | | - return '%s%s' % (language_code, project) |
59 | 67 | |
def retrieve_language(args):
    '''
    Translate the 'language' argument into a language code.

    The argument is first title-cased and looked up as a key of
    languages.MAPPING. If that fails, the argument is accepted as-is (lower
    cased) when it already occurs among the values of languages.MAPPING; this
    is the form that determine_default_language() produces. Anything else
    falls back to 'en'.
    '''
    language = get_value(args, 'language')
    code = languages.MAPPING.get(language.title())
    if code is not None:
        return code
    # A locale-derived default such as 'nl' is a code, not a language name;
    # without this check it would always be replaced by 'en'.
    lowered = language.lower()
    if lowered in languages.MAPPING.values():
        return lowered
    return 'en'
64 | 72 | |
65 | 73 | |
66 | 74 | def retrieve_project(args): |
— | — | @@ -75,13 +83,24 @@ |
76 | 84 | |
def determine_file_locations(args):
    '''
    Build the dictionary that main() passes on as keyword arguments.

    Returns a dictionary with the keys 'language_code', 'location', 'project'
    and 'filename'. When the 'location' argument is None,
    settings.XML_FILE_LOCATION is used as base directory; the language code
    is appended to the base directory in both cases.
    '''
    location = get_value(args, 'location')
    if location is None:
        location = settings.XML_FILE_LOCATION
    # Look the language up once and reuse it for both entries.
    language_code = retrieve_language(args)
    locations = {}
    locations['language_code'] = language_code
    locations['location'] = os.path.join(location, language_code)
    locations['project'] = retrieve_projectname(args)
    locations['filename'] = generate_wikidump_filename(args)
    return locations
84 | 93 | |
85 | 94 | |
def show_settings(args, location, filename, project, language_code):
    '''
    Print the project, the language and the input directory of this run.

    @args and @filename are part of the signature but are not used in the
    body. @language_code must be one of the values of languages.MAPPING,
    otherwise the language lookup raises a KeyError.
    '''
    project_label = settings.WIKIMEDIA_PROJECTS.get(project, 'wiki').title()
    code_to_language = utils.invert_dict(languages.MAPPING)
    # print with a single parenthesised argument writes the same text as the
    # print statement.
    print('Project: %s' % project_label)
    print('Language: %s' % code_to_language[language_code])
    print('Input directory: %s' % location)
    print('Output directory: TODO')
| 103 | + |
| 104 | + |
86 | 105 | def dump_downloader_launcher(args, location, filename, project, language_code): |
87 | 106 | print 'dump downloader' |
88 | 107 | pbar = get_value(args, 'progress') |
— | — | @@ -113,8 +132,8 @@ |
114 | 133 | path = config.detect_installed_program('7zip') |
115 | 134 | |
116 | 135 | source = os.path.join(location, file) |
117 | | - retcode = subprocess.Popen(['%s%s' % (path, '7z.exe'), 'e', '-o%s\\' % location, '%s' % (source,)]) |
118 | | - return retcode |
| 136 | + p = subprocess.Popen(['%s%s' % (path, '7z.exe'), 'e', '-o%s\\' % location, '%s' % (source,)]) |
| 137 | + return p |
119 | 138 | |
120 | 139 | |
121 | 140 | def mongodb_script_launcher(args, location, filename, project, language_code): |
— | — | @@ -153,6 +172,7 @@ |
154 | 173 | |
155 | 174 | |
156 | 175 | def main(): |
| 176 | + default_language = determine_default_language() |
157 | 177 | file_choices = ('stub-meta-history.xml.gz', |
158 | 178 | 'stub-meta-current.xml.gz', |
159 | 179 | 'pages-meta-history.xml.7z', |
— | — | @@ -188,7 +208,7 @@ |
189 | 209 | parser.add_argument('-l', '--language', action='store', |
190 | 210 | help='Example of valid languages.', |
191 | 211 | choices=supported_languages(), |
192 | | - default='Russian') |
| 212 | + default=default_language) |
193 | 213 | |
194 | 214 | parser.add_argument('-p', '--project', action='store', |
195 | 215 | help='Specify the Wikimedia project that you would like to download', |
— | — | @@ -210,6 +230,7 @@ |
211 | 231 | args = parser.parse_args() |
212 | 232 | config.load_configuration(args) |
213 | 233 | locations = determine_file_locations(args) |
| 234 | + show_settings(args, **locations) |
214 | 235 | args.func(args, **locations) |
215 | 236 | |
216 | 237 | |
Index: trunk/tools/editor_trends/optimize_editors.py |
— | — | @@ -17,11 +17,15 @@ |
18 | 18 | __date__ = '2010-11-02'
|
19 | 19 | __version__ = '0.1'
|
20 | 20 |
|
| 21 | +from multiprocessing import Queue
|
| 22 | +from Queue import Empty
|
| 23 | +from operator import itemgetter
|
| 24 | +import datetime
|
21 | 25 |
|
22 | | -
|
23 | 26 | import settings
|
24 | 27 | from database import db
|
25 | 28 | from utils import process_constructor as pc
|
| 29 | +import construct_datasets
|
26 | 30 |
|
27 | 31 |
|
28 | 32 | def create_datacontainer(init_value=0):
|
— | — | @@ -37,7 +41,7 @@ |
38 | 42 | data[str(x)] = init_value
|
39 | 43 | return data
|
40 | 44 |
|
41 | | -
|
| 45 | +
|
42 | 46 | def determine_edits_by_year(dates):
|
43 | 47 | '''
|
44 | 48 | This function counts the number of edits by year made by a particular editor.
|
— | — | @@ -87,7 +91,7 @@ |
88 | 92 |
|
89 | 93 | output.insert({'editor': id, 'edits': edits,
|
90 | 94 | 'edits_by_year': edits_by_year,
|
91 | | - 'year_joined': year,
|
| 95 | + 'year_joined': new_wikipedian,
|
92 | 96 | 'edit_count': edit_count,
|
93 | 97 | 'final_edit': final_edit,
|
94 | 98 | 'first_edit': first_edit,
|
— | — | @@ -101,20 +105,31 @@ |
102 | 106 | kwargs = {'definition': 'traditional',
|
103 | 107 | 'pbar': True,
|
104 | 108 | 'dbname': 'enwiki',
|
105 | | - 'nr_input_processors': 2,
|
| 109 | + 'nr_input_processors': 1,
|
106 | 110 | 'nr_output_processors': 0,
|
107 | 111 | }
|
108 | | - pc.build_scaffolding(pc.load_queue, optimize_editors, ids, False, False, **kwargs)
|
| 112 | + chunks = {}
|
| 113 | + parts = int(round(float(len(ids)) / 1, 0))
|
| 114 | + a = 0
|
| 115 | + for x in xrange(settings.NUMBER_OF_PROCESSES):
|
| 116 | + b = a + parts
|
| 117 | + chunks[x] = ids[a:b]
|
| 118 | + a = (x + 1) * parts
|
| 119 | + if a >= len(ids):
|
| 120 | + break
|
109 | 121 |
|
| 122 | + pc.build_scaffolding(pc.load_queue, optimize_editors, chunks, False, False, **kwargs)
|
110 | 123 |
|
| 124 | +
|
def debug_optimize_editors(dbname):
    '''
    Call optimize_editors directly, without pc.build_scaffolding, on a queue
    filled with the editor ids retrieved from the 'editors' collection of
    @dbname.
    '''
    editor_ids = construct_datasets.retrieve_editor_ids_mongo(dbname, 'editors')
    queue = pc.load_queue(editor_ids)
    options = dict(definition='traditional', dbname=dbname)
    # The options dictionary is passed as one positional argument, not unpacked.
    optimize_editors(queue, False, True, options)
|
118 | 132 |
|
119 | 133 |
|
120 | 134 | if __name__ == '__main__':
|
121 | | - run_optimize_editors('enwiki') |
\ No newline at end of file |
| 135 | + debug_optimize_editors('test')
|
| 136 | + #run_optimize_editors('test')
|
Index: trunk/tools/editor_trends/map_wiki_editors.py |
— | — | @@ -244,6 +244,26 @@ |
245 | 245 | print 'Time elapsed: %s and processed %s items.' % (datetime.datetime.now() - editor_cache.init_time, editor_cache.cumulative_n) |
246 | 246 | |
247 | 247 | |
def load_cache_objects():
    '''
    Load the objects stored in settings.BINARY_OBJECT_FILE_LOCATION.

    The files are the ones utils.retrieve_file_list returns for '.bin'.
    Returns a dictionary that maps the position of each file in that list to
    the object utils.load_object returned for it.
    '''
    location = settings.BINARY_OBJECT_FILE_LOCATION
    filenames = utils.retrieve_file_list(location, '.bin')
    return dict((index, utils.load_object(location, filename))
                for index, filename in enumerate(filenames))
| 254 | + |
| 255 | + |
def search_cache_for_missed_editors(dbname):
    '''
    Add every editor found in the stored cache objects to an EditorCache
    that is built on the 'editors' collection of @dbname.

    The cache objects come from load_cache_objects(); each one is iterated
    as a mapping from editor to the value that is handed to
    EditorCache.add.
    '''
    mongo = db.init_mongo_db(dbname)
    collection = mongo['editors']
    editor_cache = cache.EditorCache(collection)
    # This local must not be named 'cache': assigning to that name anywhere
    # in the function makes it local, and the cache.EditorCache lookup above
    # would then fail with UnboundLocalError.
    cached_objects = load_cache_objects()
    for key in cached_objects:
        editors = cached_objects[key]
        for editor in editors:
            editor_cache.add(editor, editors[editor])
        # Replace the processed object so its memory can be reclaimed.
        cached_objects[key] = {}
    # NOTE(review): 'NEXT' looks like a sentinel that makes EditorCache
    # flush what it still holds -- confirm against EditorCache.add.
    editor_cache.add('NEXT', '')
| 266 | + |
| 267 | + |
248 | 268 | def load_bot_ids(): |
249 | 269 | ''' |
250 | 270 | Loader function to retrieve list of id's of known Wikipedia bots. |
— | — | @@ -267,7 +287,6 @@ |
268 | 288 | 'language': language, |
269 | 289 | } |
270 | 290 | chunks = {} |
271 | | - #file_location = os.path.join(settings.XML_FILE_LOCATION, language) |
272 | 291 | files = utils.retrieve_file_list(location, 'xml') |
273 | 292 | parts = int(round(float(len(files)) / settings.NUMBER_OF_PROCESSES, 0)) |
274 | 293 | a = 0 |
— | — | @@ -277,12 +296,14 @@ |
278 | 297 | a = (x + 1) * parts |
279 | 298 | |
280 | 299 | pc.build_scaffolding(pc.load_queue, parse_editors, chunks, store_editors, True, **kwargs) |
| 300 | + search_cache_for_missed_editors(dbname) |
281 | 301 | |
282 | 302 | |
def debug_parse_editors(dbname):
    '''
    Run parse_editors and store_editors one after the other on the single
    file 'en\\522.xml', then process the stored cache objects for @dbname.
    '''
    result_queue = JoinableQueue()
    parse_editors('en\\522.xml', result_queue, None, None, True)
    store_editors(result_queue, [], dbname)
    search_cache_for_missed_editors(dbname)
287 | 308 | |
288 | 309 | |
289 | 310 | if __name__ == "__main__": |
Index: trunk/tools/editor_trends/config.py |
— | — | @@ -20,13 +20,14 @@ |
21 | 21 | |
22 | 22 | import os |
23 | 23 | import ConfigParser |
24 | | -from _winreg import * |
25 | 24 | |
| 25 | + |
26 | 26 | import settings |
27 | 27 | from utils import utils |
28 | 28 | |
29 | 29 | |
30 | 30 | def detect_windows_program(program): |
| 31 | + from _winreg import * |
31 | 32 | entry = settings.WINDOWS_REGISTER[program] |
32 | 33 | try: |
33 | 34 | key = OpenKey(HKEY_CURRENT_USER, entry, 0, KEY_READ) |
Index: trunk/tools/editor_trends/utils/utils.py |
— | — | @@ -32,6 +32,7 @@ |
33 | 33 | import ctypes |
34 | 34 | |
35 | 35 | import settings |
| 36 | +import exceptions |
36 | 37 | |
37 | 38 | |
38 | 39 | try: |
— | — | @@ -160,6 +161,7 @@ |
161 | 162 | else: |
162 | 163 | return 'wb' |
163 | 164 | |
| 165 | + |
164 | 166 | def write_list_to_csv(data, fh, recursive=False): |
165 | 167 | if recursive: |
166 | 168 | recursive = False |
— | — | @@ -170,6 +172,7 @@ |
171 | 173 | fh.write('%s\t' % d) |
172 | 174 | if recursive: |
173 | 175 | return True |
| 176 | + |
174 | 177 | |
175 | 178 | def write_dict_to_csv(data, fh): |
176 | 179 | keys = data.keys() |
— | — | @@ -225,7 +228,7 @@ |
226 | 229 | if is_exe(exe_file): |
227 | 230 | return exe_file |
228 | 231 | |
229 | | - return None |
| 232 | + raise exceptions.FileNotFoundException(program) |
230 | 233 | |
231 | 234 | |
232 | 235 | def store_object(object, location, filename): |
— | — | @@ -254,6 +257,15 @@ |
255 | 258 | return string |
256 | 259 | |
257 | 260 | |
def invert_dict(dictionary):
    '''
    @dictionary is a simple dictionary containing simple (hashable) values,
    i.e. no lists or other dictionaries.
    output: dictionary where key and value are swapped. When several keys
    share the same value only one of them is kept.
    '''
    return dict((value, key) for key, value in dictionary.items())
| 268 | + |
| 269 | + |
258 | 270 | def create_dict_from_csv_file(filename, encoding): |
259 | 271 | d = {} |
260 | 272 | for line in read_data_from_csv(filename, encoding): |
Index: trunk/tools/editor_trends/utils/process_constructor.py |
— | — | @@ -57,14 +57,16 @@ |
58 | 58 | nr_output_processors = kwargs.pop('nr_output_processors') |
59 | 59 | input_queues = {} |
60 | 60 | result_queues = {} |
61 | | - assert len(obj) == nr_input_processors |
62 | | - if result_queue: |
63 | | - assert len(obj)== nr_output_processors |
| 61 | + #assert len(obj) == nr_input_processors |
| 62 | + #if result_queue: |
| 63 | + # assert len(obj)== nr_output_processors |
64 | 64 | |
65 | 65 | for i, o in enumerate(obj): |
66 | 66 | input_queues[i] = load_input_queue(obj[o], poison_pill=True) |
67 | 67 | if result_queue: |
68 | 68 | result_queues[i] = JoinableQueue() |
| 69 | + else: |
| 70 | + result_queues[i] = False |
69 | 71 | |
70 | 72 | if settings.PROGRESS_BAR: |
71 | 73 | size = sum([input_queues[q].qsize() for q in input_queues]) |
Index: trunk/tools/editor_trends/split_xml_file.py |
— | — | @@ -172,7 +172,7 @@ |
173 | 173 | #elem = parse_comments(elem, remove_ascii_control_characters) |
174 | 174 | #print cElementTree.tostring(elem) |
175 | 175 | except SyntaxError: |
176 | | - fh = utils.create_txt_filehandle(ERROR_MESSAGE_FILE_LOCATION, 'split_xml', 'w', settings.ENCODING) |
| 176 | + fh = utils.create_txt_filehandle(settings.ERROR_MESSAGE_FILE_LOCATION, 'split_xml', 'w', settings.ENCODING) |
177 | 177 | fh.write(cElementTree.tostring(elem)) |
178 | 178 | fh.close() |
179 | 179 | |