r79958 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r79957‎ | r79958 | r79959 >
Date:21:06, 10 January 2011
Author:diederik
Status:deferred
Tags:
Comment:
Numerous small fixes
Modified paths:
  • /trunk/tools/editor_trends/analyses/aggregates.py (modified) (history)
  • /trunk/tools/editor_trends/analyses/cohort_charts.py (modified) (history)
  • /trunk/tools/editor_trends/config.py (modified) (history)
  • /trunk/tools/editor_trends/configuration.py (modified) (history)
  • /trunk/tools/editor_trends/database/db_settings.py (deleted) (history)
  • /trunk/tools/editor_trends/database/launcher.py (modified) (history)
  • /trunk/tools/editor_trends/etl/exporter.py (modified) (history)
  • /trunk/tools/editor_trends/etl/extracter.py (modified) (history)
  • /trunk/tools/editor_trends/etl/shaper.py (modified) (history)
  • /trunk/tools/editor_trends/etl/sort.py (modified) (history)
  • /trunk/tools/editor_trends/languages.py (modified) (history)
  • /trunk/tools/editor_trends/manage.py (modified) (history)
  • /trunk/tools/editor_trends/utils/consumers.py (added) (history)
  • /trunk/tools/editor_trends/utils/messages.py (modified) (history)
  • /trunk/tools/editor_trends/utils/models.py (deleted) (history)

Diff [purge]

Index: trunk/tools/editor_trends/manage.py
@@ -46,8 +46,12 @@
4747 from etl import transformer
4848 from etl import exporter
4949
 50+datasets = {'forward': 'generate_cohort_dataset_forward',
 51+ 'backward': 'generate_cohort_dataset_backward',
 52+ 'backward_custom': 'generate_cohort_dataset_backward_custom',
 53+ 'wide': 'generate_wide_editor_dataset',
 54+ }
5055
51 -
5256 class Timer(object):
5357 def __init__(self):
5458 self.t0 = datetime.datetime.now()
@@ -102,26 +106,31 @@
103107 else:
104108 return namespaces
105109
 110+
106111 def write_message_to_log(logger, args, message=None, verb=None, **kwargs):
107112 function = get_value(args, 'func')
108 - logger.debug('Starting %s task' % function.func_name)
 113+ logger.debug('%s\tStarting %s task' % (datetime.datetime.now(), function.func_name))
109114 if message:
110 - logger.debug(message)
 115+ logger.debug('%s\t%s' % (datetime.datetime.now(), message))
111116
112117 max_length = max([len(kw) for kw in kwargs])
113 - #max_tab = max_length / 4
 118+ max_tabs = max_length // settings.tab_width
 119+ res = max_length % settings.tab_width
 120+ if res > 0:
 121+ max_tabs += 1
 122+ pos = max_tabs * settings.tab_width
114123 for kw in kwargs:
115124 if verb:
116 - logger.debug('Action: %s\tSetting: %s' % (verb, kwargs[kw]))
 125+ logger.debug('%s\tAction: %s\tSetting: %s' % (datetime.datetime.now(), verb, kwargs[kw]))
117126 else:
118 - tabs = (max_length - len(kw)) / 4
119 - if tabs == 0:
120 - tabs = 1
 127+ tabs = (pos - len(kw)) // settings.tab_width
 128+ res = len(kw) % settings.tab_width
 129+ if res > 0 or tabs == 0:
 130+ tabs += 1
121131 tabs = ''.join(['\t' for t in xrange(tabs)])
122 - logger.debug('\tKey: %s%sSetting: %s' % (kw, tabs, kwargs[kw]))
 132+ logger.debug('%s\t\tKey: %s%sSetting: %s' % (datetime.datetime.now(), kw, tabs, kwargs[kw]))
123133
124134
125 -
126135 def get_project(args):
127136 project = get_value(args, 'project')
128137 if project != 'wiki':
@@ -142,21 +151,17 @@
143152 config['language_code'] = language_code
144153 config['language'] = get_value(args, 'language')
145154 config['location'] = os.path.join(location, language_code, project)
146 - #config['chunks'] = os.path.join(config['location'], 'chunks')
147155 config['txt'] = os.path.join(config['location'], 'txt')
148156 config['sorted'] = os.path.join(config['location'], 'sorted')
149 - config['dbready'] = os.path.join(config['location'], 'dbready')
150157 config['project'] = project
151158 config['full_project'] = get_projectname(args)
152159 config['filename'] = generate_wikidump_filename(language_code, project, args)
153160 config['collection'] = get_value(args, 'collection')
154161 config['namespaces'] = get_namespaces(args)
155 - config['directories'] = [config['location'], config['txt'], config['sorted'], config['dbready']]
 162+ config['directories'] = [config['location'], config['txt'], config['sorted']]
156163
157164 message = 'Settings as generated from the configuration module.'
158165 write_message_to_log(logger, args, message, None, **config)
159 - #for c in config:
160 - # logger.debug('Key: %s - Setting: %s' % (c, config[c]))
161166 return config
162167
163168
@@ -170,10 +175,11 @@
171176 config['Input directory'] = '%s' % kwargs.get('location')
172177 config['Output directory'] = '%s and subdirectories' % kwargs.get('location')
173178
 179+ max_length_key = max([len(key) for key in config.keys()])
174180 message = 'Final settings after parsing command line arguments:'
175181 write_message_to_log(logger, args, message, None, **config)
176182 for c in config:
177 - print '%s\t%s' % (c, config[c])
 183+ print '%s: %s' % (c.rjust(max_length_key), config[c])
178184
179185
180186 def dump_downloader_launcher(args, logger, **kwargs):
@@ -244,10 +250,8 @@
245251 location = kwargs.pop('location')
246252 input = os.path.join(location, 'txt')
247253 output = os.path.join(location, 'sorted')
248 - final_output = os.path.join(location, 'dbready')
249 - write_message_to_log(logger, args, location=location, input=input, output=output, final_output=final_output)
 254+ write_message_to_log(logger, args, location=location, input=input, output=output)
250255 sort.mergesort_launcher(input, output)
251 - #loader.mergesort_external_launcher(output, final_output)
252256 timer.elapsed()
253257
254258
@@ -264,7 +268,6 @@
265269 write_message_to_log(logger, args, verb='Storing', location=location, input=input, project=project, collection=collection)
266270 store.launcher(input, project, collection)
267271 cnt_editors = db.count_records(project, collection)
268 - #assert num_editors == cnt_editors
269272 timer.elapsed()
270273
271274
@@ -282,6 +285,7 @@
283286 def debug_launcher(args, logger, **kwargs):
284287 pass
285288
 289+
286290 def exporter_launcher(args, logger, **kwargs):
287291 print 'Start exporting dataset'
288292 timer = Timer()
@@ -291,6 +295,7 @@
292296 targets = targets.split(',')
293297 for target in targets:
294298 write_message_to_log(logger, args, verb='Exporting', target=target, dbname=dbname, collection=collection)
 299+ target = datasets[target]
295300 exporter.dataset_launcher(dbname, collection, target)
296301 timer.elapsed()
297302
@@ -309,8 +314,9 @@
310315 write_message_to_log(logger, args, verb='Deleting', file=file)
311316 utils.delete_file(settings.binary_location, file)
312317
 318+
313319 def all_launcher(args, logger, **kwargs):
314 - print 'all_launcher'
 320+ print 'The entire data processing chain has been called, this will take a couple of hours (at least) to complete.'
315321 timer = Timer()
316322 full_project = kwargs.get('full_project', None)
317323 message = 'Start of building %s dataset.' % full_project
@@ -323,9 +329,6 @@
324330 if clean:
325331 cleanup(logger, args, **kwargs)
326332
327 - #if format != 'xml':
328 - # ignore = ignore + ',extract'
329 -
330333 functions = ordered_dict.OrderedDict(((dump_downloader_launcher, 'download'),
331334 #(chunker_launcher, 'split'),
332335 (extract_launcher, 'extract'),
@@ -337,7 +340,6 @@
338341 for function, callname in functions.iteritems():
339342 if callname not in ignore:
340343 function(args, logger, **kwargs)
341 -
342344 timer.elapsed()
343345
344346
@@ -374,9 +376,9 @@
375377
376378
377379 def about():
378 - print 'Editor Trends Software is (c) 2010 by the Wikimedia Foundation.'
 380+ print '\nEditor Trends Software is (c) 2010 by the Wikimedia Foundation.'
379381 print 'Written by Diederik van Liere (dvanliere@gmail.com).'
380 - print 'This software comes with ABSOLUTELY NO WARRANTY. This is free software, and you are welcome to distribute it under certain conditions.'
 382+ print 'This software comes with ABSOLUTELY NO WARRANTY.\nThis is free software, and you are welcome to distribute it\nunder certain conditions.'
381383 print 'See the README.1ST file for more information.'
382384 print '\n'
383385
@@ -384,15 +386,11 @@
385387 def main():
386388 default_language = determine_default_language()
387389
388 - datasets = {'forward': 'generate_cohort_dataset_forward',
389 - 'backward': 'generate_cohort_dataset_backward',
390 - 'wide': 'generate_wide_editor_dataset',
391 - }
392 -
393390 file_choices = ('stub-meta-history.xml.gz',
394391 'stub-meta-current.xml.gz',
395392 'pages-meta-history.xml.7z',
396 - 'pages-meta-current.xml.bz2')
 393+ 'pages-meta-current.xml.bz2',
 394+ )
397395
398396
399397 parser = ArgumentParser(prog='manage', formatter_class=RawTextHelpFormatter)
@@ -418,7 +416,7 @@
419417 parser_create = subparsers.add_parser('extract', help='The store sub command parsers the XML chunk files, extracts the information and stores it in a MongoDB.')
420418 parser_create.set_defaults(func=extract_launcher)
421419
422 - parser_sort = subparsers.add_parser('sort', help='By presorting the data, significant processing time reducations are achieved.')
 420+ parser_sort = subparsers.add_parser('sort', help='By presorting the data, significant processing time reductions are achieved.')
423421 parser_sort.set_defaults(func=sort_launcher)
424422
425423 parser_store = subparsers.add_parser('store', help='The store sub command parsers the XML chunk files, extracts the information and stores it in a MongoDB.')
@@ -435,59 +433,75 @@
436434
437435 parser_all = subparsers.add_parser('all', help='The all sub command runs the download, split, store and dataset commands.\n\nWARNING: THIS COULD TAKE DAYS DEPENDING ON THE CONFIGURATION OF YOUR MACHINE AND THE SIZE OF THE WIKIMEDIA DUMP FILE.')
438436 parser_all.set_defaults(func=all_launcher)
439 - parser_all.add_argument('-e', '--except', action='store',
 437+ parser_all.add_argument('-e', '--except',
 438+ action='store',
440439 help='Should be a list of functions that are to be ignored when executing \'all\'.',
441 - default=[])
 440+ default=[]
 441+ )
442442
443 - parser_all.add_argument('-n', '--new', action='store_true',
 443+ parser_all.add_argument('-n', '--new',
 444+ action='store_true',
444445 help='This will delete all previous output and starts from scratch. Mostly useful for debugging purposes.',
445 - default=False)
 446+ default=False
 447+ )
446448
447 - parser.add_argument('-l', '--language', action='store',
 449+ parser.add_argument('-l', '--language',
 450+ action='store',
448451 help='Example of valid languages.',
449452 choices=supported_languages(),
450 - default=default_language)
 453+ default=default_language
 454+ )
451455
452 - parser.add_argument('-p', '--project', action='store',
 456+ parser.add_argument('-p', '--project',
 457+ action='store',
453458 help='Specify the Wikimedia project that you would like to download',
454459 choices=settings.projects.keys(),
455 - default='wiki')
 460+ default='wiki'
 461+ )
456462
457463 parser.add_argument('-c', '--collection', action='store',
458464 help='Name of MongoDB collection',
459465 default='editors')
460466
461467
462 - parser.add_argument('-o', '--location', action='store',
 468+ parser.add_argument('-o', '--location',
 469+ action='store',
463470 help='Indicate where you want to store the downloaded file.',
464471 default=settings.input_location
465472 )
466473
467 - parser.add_argument('-ns', '--namespace', action='store',
 474+ parser.add_argument('-ns', '--namespace',
 475+ action='store',
468476 help='A list of namespaces to include for analysis.',
469 - default='0')
 477+ default='0'
 478+ )
470479
471 - #parser.add_argument('-fo', '--format', action='store',
472 - # help='Indicate which format the chunks should be stored. Valid options are xml and txt.',
473 - # default='txt')
474 -
475 - parser.add_argument('-f', '--file', action='store',
 480+ parser.add_argument('-f', '--file',
 481+ action='store',
476482 choices=file_choices,
477483 help='Indicate which dump you want to download. Valid choices are:\n %s' % ''.join([f + ',\n' for f in file_choices]),
478 - default='stub-meta-history.xml.gz')
 484+ default='stub-meta-history.xml.gz'
 485+ )
479486
480 - parser.add_argument('-dv', '--dumpversion', action='store',
 487+ parser.add_argument('-dv', '--dumpversion',
 488+ action='store',
481489 choices=settings.dumpversions.keys(),
482490 help='Indicate the Wikidump version that you are parsing.',
483 - default=settings.dumpversions['0'])
 491+ default=settings.dumpversions['0']
 492+ )
484493
485 - parser.add_argument('-d', '--datasets', action='store',
 494+ parser.add_argument('-d', '--datasets',
 495+ action='store',
486496 choices=datasets.keys(),
487497 help='Indicate what type of data should be exported.',
488 - default=datasets['backward'])
 498+ default='backward'
 499+ )
489500
490 - parser.add_argument('-prog', '--progress', action='store_true', default=True,
491 - help='Indicate whether you want to have a progressbar.')
 501+ parser.add_argument('-prog', '--progress',
 502+ action='store_true',
 503+ default=True, \
 504+ help='Indicate whether you want to have a progressbar.'
 505+ )
492506
493507 args = parser.parse_args()
494508 #initialize logger
Index: trunk/tools/editor_trends/analyses/aggregates.py
@@ -31,11 +31,6 @@
3232 from utils import messages
3333
3434
35 -class Dataset:
36 - def __init__(self):
37 - pass
38 -
39 -
4035 def new_editor_count(editors, dbname, collection, month=12):
4136 '''
4237 @month should be an integer in the range of 1-12.
@@ -105,6 +100,7 @@
106101 utils.write_dict_to_csv(data, fh, keys, write_key=False, newline=True)
107102 fh.close()
108103
 104+
109105 def active_editor_count_launcher(dbname, collection):
110106 editors = db.retrieve_distinct_keys(dbname, collection, 'editor')
111107 tasks = multiprocessing.JoinableQueue()
Index: trunk/tools/editor_trends/analyses/cohort_charts.py
@@ -25,8 +25,8 @@
2626 settings = configuration.Settings()
2727 from utils import utils
2828
29 -def prepare_cohort_dataset(dbname):
30 - dataset = utils.load_object(settings.binary_location, dbname + '_cohort_data.bin')
 29+def prepare_cohort_dataset(dbname, filename):
 30+ dataset = utils.load_object(settings.binary_location, '%s_%s' % (dbname, filename))
3131 fh = utils.create_txt_filehandle(settings.dataset_location, dbname + '_cohort_data.txt', 'w', settings.encoding)
3232
3333 years = dataset.keys()
Index: trunk/tools/editor_trends/etl/exporter.py
@@ -20,8 +20,8 @@
2121 import os
2222 import sys
2323 import datetime
 24+import calendar
2425 from dateutil.relativedelta import *
25 -import calendar
2626 import multiprocessing
2727 from Queue import Empty
2828
@@ -51,7 +51,13 @@
5252 '''
5353 def __init__(self, var):
5454 self.name = var
 55+ self.obs = []
5556 self.stats = ['n', 'avg', 'sd', 'min', 'max']
 57+
 58+ def __repr__(self):
 59+ return self.name
 60+
 61+ def descriptives(self):
5662 self.time = shaper.create_datacontainer()
5763 self.time = shaper.add_months_to_datacontainer(getattr(self, 'time'), datatype='dict')
5864
@@ -59,10 +65,6 @@
6066 setattr(self, var, shaper.create_datacontainer())
6167 setattr(self, var, shaper.add_months_to_datacontainer(getattr(self, var), datatype='list'))
6268
63 - def __repr__(self):
64 - return self.name
65 -
66 - def descriptives(self):
6769 for year in self.time:
6870 for month in self.time[year]:
6971 data = [self.time[year][month][k] for k in self.time[year][month].keys()]
@@ -78,8 +80,8 @@
7981 This class acts as a container for the Variable class and has some methods
8082 to output the dataset to a csv file.
8183 '''
82 - def __init__(self, vars):
83 - self.name = 'long_dataset.tsv'
 84+ def __init__(self, vars, name):
 85+ self.name = name
8486 self.vars = []
8587 for var in vars:
8688 setattr(self, var, Variable(var))
@@ -92,8 +94,9 @@
9395 fh.write('_time\t')
9496 for var in self.vars:
9597 var = getattr(self, var)
96 - for stat in var.stats:
97 - fh.write('%s_%s\t' % (var.name, stat))
 98+ fh.write('%s\t' % var.name)
 99+ #for stat in var.stats:
 100+ # fh.write('%s_%s\t' % (var.name, stat))
98101 fh.write('\n')
99102
100103 def convert_to_longitudinal_data(self, id, obs, vars):
@@ -108,32 +111,43 @@
109112 if id not in ds.time[year][m] and obs[var][year][m] > 0:
110113 ds.time[year][m][id] = obs[var][year][m]
111114
112 - def write_longitudinal_data(self):
 115+ def write_longitudinal_data(self, write_time=True):
113116 fh = utils.create_txt_filehandle(settings.dataset_location, self.name, 'w', settings.encoding)
114117 self.write_headers(fh)
115 - dc = shaper.create_datacontainer()
116 - dc = shaper.add_months_to_datacontainer(dc)
117 -
118118 for var in self.vars:
119119 var = getattr(self, var)
120 - var.descriptives()
121 - years = dc.keys()
122 - years.sort()
123 - for year in years:
124 - months = dc[year].keys()
125 - months.sort()
126 - for month in months:
127 - d = calendar.monthrange(int(year), int(month))[1] #determines the number of days in a given month/year
128 - date = datetime.date(int(year), int(month), d)
129 - fh.write('%s\t' % date)
130 - for var in self.vars:
131 - var = getattr(self, var)
132 - #data = ['%s_%s\t' % (var.name, getattr(var, stat)[year][month]) for stat in var.stats]
133 - fh.write(''.join(['%s\t' % (getattr(var, stat)[year][month],) for stat in var.stats]))
134 - fh.write('\n')
 120+ for o in var.obs:
 121+ if write_time:
 122+ fh.write('%s\t%s\n' % (o[0], o[1]))
 123+ else:
 124+ fh.write('%s\n' % (o[1]))
135125 fh.close()
136126
 127+# windows = create_windows()
 128+# dc = shaper.create_datacontainer()
 129+# dc = shaper.add_months_to_datacontainer(dc, windows)
 130+#
 131+## for var in self.vars:
 132+## var = getattr(self, var)
 133+## var.descriptives()
 134+# years = dc.keys()
 135+# years.sort()
 136+# for year in years:
 137+# months = dc[year].keys()
 138+# months.sort()
 139+# for month in months:
 140+# d = calendar.monthrange(int(year), int(month))[1] #determines the number of days in a given month/year
 141+# date = datetime.date(int(year), int(month), d)
 142+# fh.write('%s\t' % date)
 143+# for var in self.vars:
 144+# var = getattr(self, var)
 145+# #data = ['%s_%s\t' % (var.name, getattr(var, stat)[year][month]) for stat in var.stats]
 146+# fh.write(''.join([ % s\t]))
 147+# #fh.write(''.join(['%s\t' % (getattr(var, stat)[year][month],) for stat in var.stats]))
 148+# fh.write('\n')
137149
 150+
 151+
138152 def expand_edits(edits):
139153 data = []
140154 for edit in edits:
@@ -201,110 +215,113 @@
202216 return windows
203217
204218
205 -#def generate_cohort_dataset_old(tasks, dbname, collection, **kwargs):
206 -# mongo = db.init_mongo_db(dbname)
207 -# editors = mongo[collection + '_dataset']
208 -# windows = create_windows()
209 -# data = shaper.create_datacontainer('dict')
210 -# data = shaper.add_windows_to_datacontainer(data, windows)
211 -#
212 -# while True:
213 -# id = tasks.get(block=False)
214 -# tasks.task_done()
215 -# if id == None:
216 -# break
217 -# obs = editors.find_one({'editor': id}, {'first_edit': 1, 'final_edit': 1})
218 -#
219 -# first_edit = obs['first_edit']
220 -# last_edit = obs['final_edit']
221 -# editor_dt = relativedelta(last_edit, first_edit)
222 -# editor_dt = (editor_dt.years * 12) + editor_dt.months
223 -# edits = []
224 -# for year in xrange(2001, datetime.datetime.now().year + 1):
225 -# if first_edit.year > year or last_edit.year < year:
226 -# continue
227 -# window_end = datetime.datetime(year, 12, 31)
228 -# for window in windows:
229 -# window_start = window_end - relativedelta(months=window)
230 -# if window_start < datetime.datetime(2001, 1, 1):
231 -# window_start = datetime.datetime(2001, 1, 1)
232 -#
233 -# if editor_dt > 11:
234 -# if date_falls_in_window(window_start, window_end, first_edit):
235 -# edits.append(window)
236 -# elif window > editor_dt:
237 -# data[year][window] += 1
238 -# break
239 -#
240 -# if edits != []:
241 -# w = min(edits)
242 -# data[year][w] += 1
243 -# edits = []
244 -#
245 -#
246 -# print 'Storing data as %s' % os.path.join(settings.binary_location, dbname + '_cohort_data.bin')
247 -# utils.store_object(data, settings.binary_location, dbname + '_cohort_data.bin')
248 -# cohort_charts.prepare_cohort_dataset(dbname)
249219
 220+def diff_month(d1, d2):
 221+ return (d1.year - d2.year) * 12 + d1.month - d2.month
250222
251223
 224+def generate_cohort_dataset_raw(tasks, dbname, collection):
 225+ mongo = db.init_mongo_db(dbname)
 226+ editors = mongo['%s%s' % (collection, '_dataset')]
 227+ windows = create_windows()
 228+ data = shaper.create_datacontainer('dict')
 229+ final_year = datetime.datetime.now().year + 1
 230+ ld = LongDataset(['experience'], '%s_forward_cohort.csv' % dbname)
 231+ while True:
 232+ id = tasks.get(block=False)
 233+ tasks.task_done()
 234+ if id == None:
 235+ break
 236+ obs = editors.find_one({'editor': id},
 237+ {'new_wikipedian': 1,
 238+ 'monthly_edits': 1,
 239+ 'final_edit':1
 240+ })
252241
253 -def generate_cohort_dataset_forward(tasks, dbname, collection, **kwargs):
 242+ new_wikipedian = obs['new_wikipedian']
 243+ last_edit = obs['final_edit']
 244+ dt = diff_month(last_edit, new_wikipedian)
 245+ day = calendar.monthrange(new_wikipedian.year, new_wikipedian.month)[1]
 246+ tenth_edit = datetime.date(new_wikipedian.year, new_wikipedian.month, day)
 247+ ld.experience.obs.append([tenth_edit, dt])
 248+
 249+ ld.write_longitudinal_data()
 250+
 251+def generate_cohort_dataset_forward(tasks, dbname, collection):
254252 mongo = db.init_mongo_db(dbname)
255253 editors = mongo[collection + '_dataset']
 254+ final_year = datetime.datetime.now().year + 1
256255 windows = create_windows()
257256 data = shaper.create_datacontainer('dict')
258 - final_year = datetime.datetime.now().year + 1
259 - m1 = [1, 2, 3, 4, 5, 6]
260 - m2 = [7, 8, 9, 10, 11, 12]
261 - frames = [m1, m2]
262257 while True:
263258 id = tasks.get(block=False)
264259 if id == None:
265260 break
 261+
266262 obs = editors.find_one({'editor': id}, {'new_wikipedian': 1, 'monthly_edits': 1, 'final_edit':1})
267263 new_wikipedian = obs['new_wikipedian']
 264+ year = new_wikipedian.year
 265+
268266 last_edit = obs['final_edit']
269 - start_year = new_wikipedian.year
270 - last_year = last_edit.year + 1
271 - if new_wikipedian.month != 1:
272 - continue
273 - for year in xrange(start_year, last_year):
274 - if year not in data[start_year]:
275 - data[start_year][year] = {}
276 - for x, frame in enumerate(frames):
277 - if x not in data[start_year][year]:
278 - data[start_year][year][x] = 0
279 - if 'n' not in data[start_year][year]:
280 - data[start_year][year]['n'] = 0
 267+ edits = obs['monthly_edits']
281268
282 - active = sum([obs['monthly_edits'][str(year)][str(m)] for m in frame])
283 - data[start_year][year]['n'] += 1
284 - if active > 0:
285 - data[start_year][year][x] += 1
286 - filename = '%s_cohort_forward.csv' % dbname
287 - fh = utils.create_txt_filehandle(settings.dataset_location, filename, 'w', settings.encoding)
288 - frames.append('n')
289 - headers = ["%s_%s" % (year, frame[0]) for year in xrange(2001, final_year) for frame in enumerate(frames)]
290 - headers.insert(0, '\t')
291 - utils.write_list_to_csv(headers, fh)
 269+ if new_wikipedian.month not in data[new_wikipedian.year]:
 270+ data[new_wikipedian.year][new_wikipedian.month] = {}
 271+ for i, year in enumerate(xrange(new_wikipedian.year, final_year)):
 272+ months = edits.get(str(year), [])
 273+ if i == 0:
 274+ months = months.keys()
 275+ months = [int(m) for m in months]
 276+ months.sort()
 277+ months = months[new_wikipedian.month - 1:]
 278+ months = [str(m) for m in months]
 279+ for month in months:
 280+ experience = str(i * 12 + int(month))
 281+ if experience not in data[new_wikipedian.year][new_wikipedian.month]:
 282+ data[new_wikipedian.year][new_wikipedian.month][experience] = 0
 283+ data[new_wikipedian.year][new_wikipedian.month][experience] += 1 if edits[str(year)][month] > 0 else 0
292284
293 - for obs_year in data:
294 - obs = '%s\t' % obs_year
295 - for year in xrange(2001, final_year):
296 - values = data[obs_year].get(year, None)
297 - if values != None:
298 - for value in values:
299 - obs = '%s\t%s\t' % (obs, values[value])
300 - else:
301 - obs = '%s\t.\t.\t.\t' % obs
302 -
303 - obs = '%s\n' % obs
304 - fh.write(obs)
 285+ fh = utils.create_txt_filehandle(settings.dataset_location, '%s_cohort_data_forward.csv' % (dbname), 'w', settings.encoding)
 286+ for year in data:
 287+ for month in data[year]:
 288+ obs = data[year][month].keys()
 289+ obs.sort()
 290+ for o in obs:
 291+ utils.write_list_to_csv(['%s-%s' % (month, year), o, data[year][month][o]], fh, recursive=False, newline=True)
305292 fh.close()
306293
307294
 295+def generate_cohort_dataset_backward_custom(tasks, dbname, collection):
 296+ mongo = db.init_mongo_db(dbname)
 297+ editors = mongo[collection + '_dataset']
 298+ windows = create_windows()
 299+ data = shaper.create_datacontainer('dict')
 300+ data = shaper.add_windows_to_datacontainer(data, windows)
308301
 302+ while True:
 303+ id = tasks.get(block=False)
 304+ tasks.task_done()
 305+ if id == None:
 306+ break
 307+ obs = editors.find_one({'editor': id}, {'first_edit': 1, 'final_edit': 1, 'monthly_edits':1, 'edits_by_year': 1, 'last_edit_by_year': 1})
 308+ first_edit = obs['first_edit']
 309+
 310+ if obs['monthly_edits']['2010']['8'] > 0:
 311+ for year in xrange(2001, datetime.datetime.now().year + 1):
 312+ if obs['edits_by_year'].get(year, 0) > 0:
 313+ last_edit = obs['last_edit_by_year'][year]
 314+ editor_dt = relativedelta(last_edit, first_edit)
 315+ editor_dt = (editor_dt.years * 12) + editor_dt.months
 316+ for w in windows:
 317+ if w >= editor_dt:
 318+ data[int(year)][w] += 1
 319+ break
 320+ filename = '_august_2010_cohort_data_.bin'
 321+ utils.store_object(data, settings.binary_location, '%s_%s' % (dbname, filename))
 322+ cohort_charts.prepare_cohort_dataset(dbname, filename)
 323+
 324+
 325+
309326 def generate_cohort_dataset_backward(tasks, dbname, collection, **kwargs):
310327 mongo = db.init_mongo_db(dbname)
311328 editors = mongo[collection + '_dataset']
@@ -334,15 +351,6 @@
335352 cohort_charts.prepare_cohort_dataset(dbname)
336353
337354
338 -
339 -
340 -def date_falls_in_window(window_start, window_end, first_edit):
341 - if first_edit >= window_start and first_edit <= window_end:
342 - return True
343 - else:
344 - return False
345 -
346 -
347355 def generate_wide_editor_dataset(tasks, dbname, collection, **kwargs):
348356 mongo = db.init_mongo_db(dbname)
349357 editors = mongo[collection + '_dataset']
Index: trunk/tools/editor_trends/etl/extracter.py
@@ -101,13 +101,16 @@
102102 return revisions
103103
104104
105 -def is_article_main_namespace(elem, namespace):
 105+def verify_article_belongs_namespace(elem, namespaces):
106106 '''
107 - checks whether the article belongs to the main namespace
 107+ @namespaces is a list of namespaces that should be ignored, hence if the
 108+ title of article starts with the namespace then return False else return True
108109 '''
109110 title = elem.text
110 - for ns in namespace:
111 - if title.startswith(ns):
 111+ if title == None:
 112+ return False
 113+ for namespace in namespaces:
 114+ if title.startswith(namespace):
112115 return False
113116 return True
114117
@@ -249,7 +252,7 @@
250253 for page in wikitree.parser.read_input(fh):
251254 title = page.find('title')
252255 total_pages += 1
253 - if is_article_main_namespace(title, ns):
 256+ if verify_article_belongs_namespace(title, ns):
254257 #cElementTree.dump(page)
255258 article_id = page.find('id').text
256259 revisions = page.findall('revision')
Index: trunk/tools/editor_trends/etl/shaper.py
@@ -1,3 +1,16 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
215
316
417 __author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
@@ -27,7 +40,7 @@
2841 d[i] = 0.0
2942 return d
3043
31 -def create_datacontainer(datatype):
 44+def create_datacontainer(datatype='dict'):
3245 '''
3346 This function initializes an empty dictionary with as key the year (starting
3447 2001 and running through) and as value @datatype, in most cases this will
Index: trunk/tools/editor_trends/etl/sort.py
@@ -162,10 +162,9 @@
163163
164164 if __name__ == '__main__':
165165 input = os.path.join(settings.input_location, 'en', 'wiki', 'txt')
166 - intermediate_output = os.path.join(settings.input_location, 'en', 'wiki', 'sorted')
167 - output = os.path.join(settings.input_location, 'en', 'wiki', 'dbready')
 166+ output = os.path.join(settings.input_location, 'en', 'wiki', 'sorted')
168167 dbname = 'enwiki'
169168 collection = 'editors'
170 - mergesort_launcher(input, intermediate_output)
 169+ mergesort_launcher(input, output)
171170 #mergesort_external_launcher(intermediate_output, output)
172171 #num_editors = store_editors(output, dbname, collection)
Index: trunk/tools/editor_trends/config.py
@@ -24,15 +24,14 @@
2525 from utils import utils
2626 import languages
2727
 28+
2829 def show_choices(settings, attr):
2930 choices = getattr(settings, attr).items()
3031 choices.sort()
3132 choices = ['%s\t%s' % (choice[0], choice[1]) for choice in choices]
32 - #print '\n'.join(choices)
3333 return choices
34 - #for choice in choices:
35 - # print '%s\t%s' % (choice[0], choice[1])
3634
 35+
3736 def create_configuration(settings, args):
3837 force = getattr(args, 'force', False)
3938 if not os.path.exists('wiki.cfg') or force:
@@ -62,9 +61,6 @@
6362 if len(dumpversion) == 0:
6463 dumpversion = settings.dumpversions['0']
6564
66 -
67 - #dumpversion = dumpversion if dumpversion in settings.dumpversions.keys() else args.dumpversion
68 -
6965 dumpversion = settings.dumpversions[dumpversion]
7066 input_location = input_location if len(input_location) > 0 else settings.input_location
7167 working_directory = working_directory if len(working_directory) > 0 else os.getcwd()
Index: trunk/tools/editor_trends/languages.py
@@ -18,595 +18,596 @@
1919 __version__ = '0.1'
2020
2121 '''
22 -This file provides mapper between language name and locale language name and
23 -Wikipedia acronym.
24 -Gothic and Birmese are not yet supported, see rows 450 and 554.
 22+This file provides a mapping between language name, locale language name and
 23+Wikipedia acronym.
 24+Gothic and Burmese are not yet supported, see rows 559 and 455.
2525 '''
2626
2727 from utils import ordered_dict as odict
2828 from utils import utils
2929
3030 MAPPING = odict.OrderedDict([
31 -(u'English','en'),
32 -(u'German','de'),
33 -(u'French','fr'),
34 -(u'Italian','it'),
35 -(u'Polish','pl'),
36 -(u'Japanese','ja'),
37 -(u'Spanish','es'),
38 -(u'Dutch','nl'),
39 -(u'Portuguese','pt'),
40 -(u'Russian','ru'),
41 -(u'Swedish','sv'),
42 -(u'Chinese','zh'),
43 -(u'Catalan','ca'),
44 -(u'Norwegian','no'),
45 -(u'Bokmål','no'),
46 -(u'Finnish','fi'),
47 -(u'Ukrainian','uk'),
48 -(u'Hungarian','hu'),
49 -(u'Czech','cs'),
50 -(u'Romanian','ro'),
51 -(u'Turkish','tr'),
52 -(u'Korean','ko'),
53 -(u'Vietnamese','vi'),
54 -(u'Danish','da'),
55 -(u'Arabic','ar'),
56 -(u'Esperanto','eo'),
57 -(u'Serbian','sr'),
58 -(u'Indonesian','id'),
59 -(u'Lithuanian','lt'),
60 -(u'Volapük','vo'),
61 -(u'Slovak','sk'),
62 -(u'Hebrew','he'),
63 -(u'Bulgarian','bg'),
64 -(u'Persian','fa'),
65 -(u'Slovenian','sl'),
66 -(u'Waray-Waray','war'),
67 -(u'Croatian','hr'),
68 -(u'Estonian','et'),
69 -(u'Malay','ms'),
70 -(u'Newar','new'),
71 -(u'Nepal Bhasa','new'),
72 -(u'Simple English','simple'),
73 -(u'Galician','gl'),
74 -(u'Thai','th'),
75 -(u'Aromanian','roa-rup'),
76 -(u'Nynorsk','nn'),
77 -(u'Basque','eu'),
78 -(u'Hindi','hi'),
79 -(u'Greek','el'),
80 -(u'Haitian','ht'),
81 -(u'Latin','la'),
82 -(u'Telugu','te'),
83 -(u'Georgian','ka'),
84 -(u'Cebuano','ceb'),
85 -(u'Macedonian','mk'),
86 -(u'Azeri','az'),
87 -(u'Tagalog','tl'),
88 -(u'Breton','br'),
89 -(u'Serbo-Croatian','sh'),
90 -(u'Marathi','mr'),
91 -(u'Luxembourgish','lb'),
92 -(u'Javanese','jv'),
93 -(u'Latvian','lv'),
94 -(u'Bosnian','bs'),
95 -(u'Icelandic','is'),
96 -(u'Welsh','cy'),
97 -(u'Belarusian','be-x-old'),
98 -(u'Taraškievica','be-x-old'),
99 -(u'Piedmontese','pms'),
100 -(u'Albanian','sq'),
101 -(u'Tamil','ta'),
102 -(u'Bishnupriya Manipuri','bpy'),
103 -(u'Belarusian','be'),
104 -(u'Aragonese','an'),
105 -(u'Occitan','oc'),
106 -(u'Bengali','bn'),
107 -(u'Swahili','sw'),
108 -(u'Ido','io'),
109 -(u'Ripuarian','ksh'),
110 -(u'Lombard','lmo'),
111 -(u'West Frisian','fy'),
112 -(u'Gujarati','gu'),
113 -(u'Low Saxon','nds'),
114 -(u'Afrikaans','af'),
115 -(u'Sicilian','scn'),
116 -(u'Quechua','qu'),
117 -(u'Kurdish','ku'),
118 -(u'Urdu','ur'),
119 -(u'Sundanese','su'),
120 -(u'Malayalam','ml'),
121 -(u'Cantonese','zh-yue'),
122 -(u'Asturian','ast'),
123 -(u'Neapolitan','nap'),
124 -(u'Samogitian','bat-smg'),
125 -(u'Walloon','wa'),
126 -(u'Chuvash','cv'),
127 -(u'Irish','ga'),
128 -(u'Armenian','hy'),
129 -(u'Yoruba','yo'),
130 -(u'Kannada','kn'),
131 -(u'Tajik','tg'),
132 -(u'Tarantino','roa-tara'),
133 -(u'Venetian','vec'),
134 -(u'Western Panjabi','pnb'),
135 -(u'Nepali','ne'),
136 -(u'Scottish Gaelic','gd'),
137 -(u'Yiddish','yi'),
138 -(u'Min Nan','zh-min-nan'),
139 -(u'Uzbek','uz'),
140 -(u'Tatar','tt'),
141 -(u'Kapampangan','pam'),
142 -(u'Ossetian','os'),
143 -(u'Sakha','sah'),
144 -(u'Alemannic','als'),
145 -(u'Maori','mi'),
146 -(u'Egyptian Arabic','arz'),
147 -(u'Kazakh','kk'),
148 -(u'Nahuatl','nah'),
149 -(u'Limburgian','li'),
150 -(u'Upper Sorbian','hsb'),
151 -(u'Gilaki','glk'),
152 -(u'Corsican','co'),
153 -(u'Gan','gan'),
154 -(u'Amharic','am'),
155 -(u'Mongolian','mn'),
156 -(u'Interlingua','ia'),
157 -(u'Central Bicolano','bcl'),
158 -(u'Võro','fiu-vro'),
159 -(u'Dutch Low Saxon','nds-nl'),
160 -(u'Faroese','fo'),
161 -(u'Turkmen','tk'),
162 -(u'Scots','sco'),
163 -(u'West Flemish','vls'),
164 -(u'Sinhalese','si'),
165 -(u'Sanskrit','sa'),
166 -(u'Bavarian','bar'),
167 -(u'Burmese','my'),
168 -(u'Manx','gv'),
169 -(u'Divehi','dv'),
170 -(u'Norman','nrm'),
171 -(u'Pangasinan','pag'),
172 -(u'Romansh','rm'),
173 -(u'Banyumasan','map-bms'),
174 -(u'Zazaki','diq'),
175 -(u'Sorani','ckb'),
176 -(u'Northern Sami','se'),
177 -(u'Mazandarani','mzn'),
178 -(u'Wu','wuu'),
179 -(u'Uyghur','ug'),
180 -(u'Friulian','fur'),
181 -(u'Ligurian','lij'),
182 -(u'Maltese','mt'),
183 -(u'Bihari','bh'),
184 -(u'Novial','nov'),
185 -(u'Malagasy','mg'),
186 -(u'Kashubian','csb'),
187 -(u'Ilokano','ilo'),
188 -(u'Sardinian','sc'),
189 -(u'Classical Chinese','zh-classical'),
190 -(u'Khmer','km'),
191 -(u'Ladino','lad'),
192 -(u'Pali','pi'),
193 -(u'Anglo-Saxon','ang'),
194 -(u'Zamboanga Chavacano','cbk-zam'),
195 -(u'Tibetan','bo'),
196 -(u'Fiji Hindi','hif'),
197 -(u'Franco-Provençal','frp'),
198 -(u'Arpitan','frp'),
199 -(u'Hakka','hak'),
200 -(u'Cornish','kw'),
201 -(u'Punjabi','pa'),
202 -(u'Pashto','ps'),
203 -(u'Kalmyk','xal'),
204 -(u'Silesian','szl'),
205 -(u'Pennsylvania German','pdc'),
206 -(u'Hawaiian','haw'),
207 -(u'Saterland Frisian','stq'),
208 -(u'Interlingue','ie'),
209 -(u'Navajo','nv'),
210 -(u'Fijian','fj'),
211 -(u'Crimean Tatar','crh'),
212 -(u'Komi','kv'),
213 -(u'Tongan','to'),
214 -(u'Acehnese','ace'),
215 -(u'Somali','so'),
216 -(u'Erzya','myv'),
217 -(u'Guarani','gn'),
218 -(u'Karachay-Balkar','krc'),
219 -(u'Extremaduran','ext'),
220 -(u'Lingala','ln'),
221 -(u'Kirghiz','ky'),
222 -(u'Meadow Mari','mhr'),
223 -(u'Assyrian Neo-Aramaic','arc'),
224 -(u'Emilian-Romagnol','eml'),
225 -(u'Lojban','jbo'),
226 -(u'Picard','pcd'),
227 -(u'Aymara','ay'),
228 -(u'Wolof','wo'),
229 -(u'Tumbuka','tum'),
230 -(u'Kabyle','kab'),
231 -(u'Bashkir','ba'),
232 -(u'North Frisian','frr'),
233 -(u'Tahitian','ty'),
234 -(u'Tok Pisin','tpi'),
235 -(u'Papiamentu','pap'),
236 -(u'Zealandic','zea'),
237 -(u'Sranan','srn'),
238 -(u'Greenlandic','kl'),
239 -(u'Udmurt','udm'),
240 -(u'Chechen','ce'),
241 -(u'Igbo','ig'),
242 -(u'Komi-Permyak','koi'),
243 -(u'Oriya','or'),
244 -(u'Lower Sorbian','dsb'),
245 -(u'Kongo','kg'),
246 -(u'Lao','lo'),
247 -(u'Abkhazian','ab'),
248 -(u'Moksha','mdf'),
249 -(u'Romani','rmy'),
250 -(u'Hill Mari','mrj'),
251 -(u'Banjar','bjn'),
252 -(u'Old Church Slavonic','cu'),
253 -(u'Mirandese','mwl'),
254 -(u'Karakalpak','kaa'),
255 -(u'Samoan','sm'),
256 -(u'Moldovan','mo'),
257 -(u'Tetum','tet'),
258 -(u'Avar','av'),
259 -(u'Kashmiri','ks'),
260 -(u'Gothic','got'),
261 -(u'Sindhi','sd'),
262 -(u'Bambara','bm'),
263 -(u'Nauruan','na'),
264 -(u'Norfolk','pih'),
265 -(u'Pontic','pnt'),
266 -(u'Inuktitut','iu'),
267 -(u'Inupiak','ik'),
268 -(u'Bislama','bi'),
269 -(u'Cherokee','chr'),
270 -(u'Assamese','as'),
271 -(u'Min Dong','cdo'),
272 -(u'Ewe','ee'),
273 -(u'Swati','ss'),
274 -(u'Oromo','om'),
275 -(u'Zhuang','za'),
276 -(u'Zulu','zu'),
277 -(u'Tigrinya','ti'),
278 -(u'Venda','ve'),
279 -(u'Tsonga','ts'),
280 -(u'Hausa','ha'),
281 -(u'Dzongkha','dz'),
282 -(u'Sango','sg'),
283 -(u'Chamorro','ch'),
284 -(u'Cree','cr'),
285 -(u'Xhosa','xh'),
286 -(u'Akan','ak'),
287 -(u'Sesotho','st'),
288 -(u'Kinyarwanda','rw'),
289 -(u'Tswana','tn'),
290 -(u'Kikuyu','ki'),
291 -(u'Buryat','bxr'),
292 -(u'Buginese','bug'),
293 -(u'Chichewa','ny'),
294 -(u'Lak','lbe'),
295 -(u'Twi','tw'),
296 -(u'Shona','sn'),
297 -(u'Kirundi','rn'),
298 -(u'Fula','ff'),
299 -(u'Cheyenne','chy'),
300 -(u'Luganda','lg'),
301 -(u'Ndonga','ng'),
302 -(u'Sichuan Yi','ii'),
303 -(u'Choctaw','cho'),
304 -(u'Marshallese','mh'),
305 -(u'Afar','aa'),
306 -(u'Kuanyama','kj'),
307 -(u'Hiri Motu','ho'),
308 -(u'Muscogee','mus'),
309 -(u'Kanuri','kr'),
310 -(u'Herero','hz'),
311 -(u'English','en'),
312 -(u'Deutsch','de'),
313 -(u'Français','fr'),
314 -(u'Italiano','it'),
315 -(u'Polski','pl'),
316 -(u'日本語','ja'),
317 -(u'Español','es'),
318 -(u'Nederlands','nl'),
319 -(u'Português','pt'),
320 -(u'Русский','ru'),
321 -(u'Svenska','sv'),
322 -(u'中文','zh'),
323 -(u'Català','ca'),
324 -(u'Norsk','no'),
325 -(u'Bokmål','no'),
326 -(u'Suomi','fi'),
327 -(u'Українська','uk'),
328 -(u'Magyar','hu'),
329 -(u'Čeština','cs'),
330 -(u'Română','ro'),
331 -(u'Türkçe','tr'),
332 -(u'한국어','ko'),
333 -(u'Tiếng Việt','vi'),
334 -(u'Dansk','da'),
335 -(u'العربية','ar'),
336 -(u'Esperanto','eo'),
337 -(u'Српски','sr'),
338 -(u'Srpski','sr'),
339 -(u'Bahasa Indonesia','id'),
340 -(u'Lietuvių','lt'),
341 -(u'Volapük','vo'),
342 -(u'Slovenčina','sk'),
343 -(u'עברית','he'),
344 -(u'Български','bg'),
345 -(u'فارسی','fa'),
346 -(u'Slovenščina','sl'),
347 -(u'Winaray','war'),
348 -(u'Hrvatski','hr'),
349 -(u'Eesti','et'),
350 -(u'Bahasa Melayu','ms'),
351 -(u'नेपाल भाषा','new'),
352 -(u'Simple English','simple'),
353 -(u'Galego','gl'),
354 -(u'ไทย','th'),
355 -(u'Armãneashce','roa-rup'),
356 -(u'Nynorsk','nn'),
357 -(u'Euskara','eu'),
358 -(u'हिन्दी','hi'),
359 -(u'Ελληνικά','el'),
360 -(u'Krèyol ayisyen','ht'),
361 -(u'Latina','la'),
362 -(u'తెలుగు','te'),
363 -(u'ქართული','ka'),
364 -(u'Sinugboanong Binisaya','ceb'),
365 -(u'Македонски','mk'),
366 -(u'Azərbaycan','az'),
367 -(u'Tagalog','tl'),
368 -(u'Brezhoneg','br'),
369 -(u'Srpskohrvatski','sh'),
370 -(u'Српскохрватски','sh'),
371 -(u'मराठी','mr'),
372 -(u'Lëtzebuergesch','lb'),
373 -(u'Basa Jawa','jv'),
374 -(u'Latviešu','lv'),
375 -(u'Bosanski','bs'),
376 -(u'Íslenska','is'),
377 -(u'Cymraeg','cy'),
378 -(u'Беларуская','be-x-old'),
379 -(u'тарашкевіца','be-x-old'),
380 -(u'Piemontèis','pms'),
381 -(u'Shqip','sq'),
382 -(u'தமிழ்','ta'),
383 -(u'ইমার ঠার','bpy'),
384 -(u'বিষ্ণুপ্রিয়া মণিপুরী','bpy'),
385 -(u'Беларуская','be'),
386 -(u'Aragonés','an'),
387 -(u'Occitan','oc'),
388 -(u'বাংলা','bn'),
389 -(u'Kiswahili','sw'),
390 -(u'Ido','io'),
391 -(u'Ripoarisch','ksh'),
392 -(u'Lumbaart','lmo'),
393 -(u'Frysk','fy'),
394 -(u'ગુજરાતી','gu'),
395 -(u'Plattdüütsch','nds'),
396 -(u'Afrikaans','af'),
397 -(u'Sicilianu','scn'),
398 -(u'Runa Simi','qu'),
399 -(u'Kurdî','ku'),
400 -(u'كوردی','ku'),
401 -(u'اردو','ur'),
402 -(u'Basa Sunda','su'),
403 -(u'മലയാളം','ml'),
404 -(u'粵語','zh-yue'),
405 -(u'Asturianu','ast'),
406 -(u'Nnapulitano','nap'),
407 -(u'Žemaitėška','bat-smg'),
408 -(u'Walon','wa'),
409 -(u'Чăваш','cv'),
410 -(u'Gaeilge','ga'),
411 -(u'Հայերեն','hy'),
412 -(u'Yorùbá','yo'),
413 -(u'ಕನ್ನಡ','kn'),
414 -(u'Тоҷикӣ','tg'),
415 -(u'Tarandíne','roa-tara'),
416 -(u'Vèneto','vec'),
417 -(u'شاہ مکھی پنجابی','pnb'),
418 -(u'Shāhmukhī Pañjābī','pnb'),
419 -(u'नेपाली','ne'),
420 -(u'Gàidhlig','gd'),
421 -(u'ייִדיש','yi'),
422 -(u'Bân-lâm-gú','zh-min-nan'),
423 -(u'O‘zbek','uz'),
424 -(u'Tatarça','tt'),
425 -(u'Татарча','tt'),
426 -(u'Kapampangan','pam'),
427 -(u'Иронау','os'),
428 -(u'Саха тыла','sah'),
429 -(u'Saxa Tyla','sah'),
430 -(u'Alemannisch','als'),
431 -(u'Māori','mi'),
432 -(u'مصرى','arz'),
433 -(u'Maṣrī','arz'),
434 -(u'Қазақша','kk'),
435 -(u'Nāhuatl','nah'),
436 -(u'Limburgs','li'),
437 -(u'Hornjoserbsce','hsb'),
438 -(u'گیلکی','glk'),
439 -(u'Corsu','co'),
440 -(u'贛語','gan'),
441 -(u'አማርኛ','am'),
442 -(u'Монгол','mn'),
443 -(u'Interlingua','ia'),
444 -(u'Bikol','bcl'),
445 -(u'Võro','fiu-vro'),
446 -(u'Nedersaksisch','nds-nl'),
447 -(u'Føroyskt','fo'),
448 -(u'تركمن ','tk'),
449 -(u'Туркмен','tk'),
450 -(u'Scots','sco'),
451 -(u'West-Vlams','vls'),
452 -(u'සිංහල','si'),
453 -(u'संस्कृतम्','sa'),
454 -(u'Boarisch','bar'),
455 -(u'မ္ရန္‌မာစာ','my'), #Needs fix
456 -(u'Gaelg','gv'),
457 -(u'ދިވެހިބަސް','dv'),
458 -(u'Nouormand','nrm'),
459 -(u'Normaund','nrm'),
460 -(u'Pangasinan','pag'),
461 -(u'Rumantsch','rm'),
462 -(u'Basa Banyumasan','map-bms'),
463 -(u'Zazaki','diq'),
464 -(u'Soranî','ckb'),
465 -(u'کوردی','ckb'),
466 -(u'Sámegiella','se'),
467 -(u'مَزِروني','mzn'),
468 -(u'吴语','wuu'),
469 -(u'Oyghurque','ug'),
470 -(u'Furlan','fur'),
471 -(u'Líguru','lij'),
472 -(u'Malti','mt'),
473 -(u'भोजपुरी','bh'),
474 -(u'Novial','nov'),
475 -(u'Malagasy','mg'),
476 -(u'Kaszëbsczi','csb'),
477 -(u'Ilokano','ilo'),
478 -(u'Sardu','sc'),
479 -(u'古文','zh-classical'),
480 -(u'文言文','zh-classical'),
481 -(u'ភាសាខ្មែរ','km'),
482 -(u'Dzhudezmo','lad'),
483 -(u'पाऴि','pi'),
484 -(u'Englisc','ang'),
485 -(u'Chavacano de Zamboanga','cbk-zam'),
486 -(u'བོད་སྐད','bo'),
487 -(u'Fiji Hindi','hif'),
488 -(u'Arpitan','frp'),
489 -(u'Hak-kâ-fa','hak'),
490 -(u'客家話','hak'),
491 -(u'Kernewek','kw'),
492 -(u'Karnuack','kw'),
493 -(u'ਪੰਜਾਬੀ','pa'),
494 -(u'پښتو','ps'),
495 -(u'Хальмг','xal'),
496 -(u'Ślůnski','szl'),
497 -(u'Deitsch','pdc'),
498 -(u'Hawai`i','haw'),
499 -(u'Seeltersk','stq'),
500 -(u'Interlingue','ie'),
501 -(u'Diné bizaad','nv'),
502 -(u'Na Vosa Vakaviti','fj'),
503 -(u'Qırımtatarca','crh'),
504 -(u'Коми','kv'),
505 -(u'faka Tonga','to'),
506 -(u'Bahsa Acèh','ace'),
507 -(u'Soomaaliga','so'),
508 -(u'Эрзянь','myv'),
509 -(u'Erzjanj Kelj','myv'),
510 -(u"Avañe'ẽ",'gn'),
511 -(u'Къарачай-Малкъар','krc'),
512 -(u'Qarachay-Malqar','krc'),
513 -(u'Estremeñu','ext'),
514 -(u'Lingala','ln'),
515 -(u'Кыргызча','ky'),
516 -(u'Олык Марий','mhr'),
517 -(u'Olyk Marij','mhr'),
518 -(u'ܐܪܡܝܐ','arc'),
519 -(u'Emiliàn e rumagnòl','eml'),
520 -(u'Lojban','jbo'),
521 -(u'Picard','pcd'),
522 -(u'Aymar','ay'),
523 -(u'Wolof','wo'),
524 -(u'chiTumbuka','tum'),
525 -(u'Taqbaylit','kab'),
526 -(u'Башҡорт','ba'),
527 -(u'Frasch','frr'),
528 -(u'Reo Mā`ohi','ty'),
529 -(u'Tok Pisin','tpi'),
530 -(u'Papiamentu','pap'),
531 -(u'Zeêuws','zea'),
532 -(u'Sranantongo','srn'),
533 -(u'Kalaallisut','kl'),
534 -(u'Удмурт кыл','udm'),
535 -(u'Нохчийн','ce'),
536 -(u'Igbo','ig'),
537 -(u'Перем Коми','koi'),
538 -(u'Perem Komi','koi'),
539 -(u'ଓଡ଼ିଆ','or'),
540 -(u'Dolnoserbski','dsb'),
541 -(u'KiKongo','kg'),
542 -(u'ລາວ','lo'),
543 -(u'Аҧсуа','ab'),
544 -(u'Мокшень','mdf'),
545 -(u'Mokshanj Kälj','mdf'),
546 -(u'romani - रोमानी','rmy'),
547 -(u'Кырык Мары','mrj'),
548 -(u'Kyryk Mary','mrj'),
549 -(u'Bahasa Banjar','bjn'),
550 -(u'Словѣньскъ','cu'),
551 -(u'Páigina Percipal','mwl'),
552 -(u'Qaraqalpaqsha','kaa'),
553 -(u'Gagana Samoa','sm'),
554 -(u'Молдовеняскэ','mo'),
555 -(u'Tetun','tet'),
556 -(u'Авар','av'),
557 -(u'कश्मीरी','ks'),
558 -(u'كشميري','ks'),
559 -(u'𐌲𐌿𐍄𐌹𐍃𐌺','got'), #Needs fix
560 -(u'سنڌي، سندھی ، सिन्ध','sd'),
561 -(u'Bamanankan','bm'),
562 -(u'dorerin Naoero','na'),
563 -(u'Norfuk','pih'),
564 -(u'Ποντιακά','pnt'),
565 -(u'ᐃᓄᒃᑎᑐᑦ','iu'),
566 -(u'Iñupiak','ik'),
567 -(u'Bislama','bi'),
568 -(u'ᏣᎳᎩ','chr'),
569 -(u'অসমীয়া','as'),
570 -(u'Mìng-dĕ̤ng-ngṳ̄','cdo'),
571 -(u'Eʋegbe','ee'),
572 -(u'SiSwati','ss'),
573 -(u'Oromoo','om'),
574 -(u'Cuengh','za'),
575 -(u'isiZulu','zu'),
576 -(u'ትግርኛ','ti'),
577 -(u'Tshivenda','ve'),
578 -(u'Xitsonga','ts'),
579 -(u'هَوُسَ','ha'),
580 -(u'ཇོང་ཁ','dz'),
581 -(u'Sängö','sg'),
582 -(u'Chamoru','ch'),
583 -(u'Nehiyaw','cr'),
584 -(u'isiXhosa','xh'),
585 -(u'Akana','ak'),
586 -(u'Sesotho','st'),
587 -(u'Ikinyarwanda','rw'),
588 -(u'Setswana','tn'),
589 -(u'Gĩkũyũ','ki'),
590 -(u'Буряад','bxr'),
591 -(u'Basa Ugi','bug'),
592 -(u'Chi-Chewa','ny'),
593 -(u'Лакку','lbe'),
594 -(u'Twi','tw'),
595 -(u'chiShona','sn'),
596 -(u'Kirundi','rn'),
597 -(u'Fulfulde','ff'),
598 -(u'Tsetsêhestâhese','chy'),
599 -(u'Luganda','lg'),
600 -(u'Oshiwambo','ng'),
601 -(u'ꆇꉙ','ii'),
602 -(u'Choctaw','cho'),
603 -(u'Ebon','mh'),
604 -(u'Afar','aa'),
605 -(u'Kuanyama','kj'),
606 -(u'Hiri Motu','ho'),
607 -(u'Muskogee','mus'),
608 -(u'Kanuri','kr'),
609 -(u'Otsiherero','hz'),
 31+(u'English', 'en'),
 32+(u'German', 'de'),
 33+(u'French', 'fr'),
 34+(u'Italian', 'it'),
 35+(u'Polish', 'pl'),
 36+(u'Japanese', 'ja'),
 37+(u'Spanish', 'es'),
 38+(u'Dutch', 'nl'),
 39+(u'Portuguese', 'pt'),
 40+(u'Russian', 'ru'),
 41+(u'Swedish', 'sv'),
 42+(u'Chinese', 'zh'),
 43+(u'Catalan', 'ca'),
 44+(u'Norwegian', 'no'),
 45+(u'Bokmål', 'no'),
 46+(u'Finnish', 'fi'),
 47+(u'Ukrainian', 'uk'),
 48+(u'Hungarian', 'hu'),
 49+(u'Czech', 'cs'),
 50+(u'Romanian', 'ro'),
 51+(u'Turkish', 'tr'),
 52+(u'Korean', 'ko'),
 53+(u'Vietnamese', 'vi'),
 54+(u'Danish', 'da'),
 55+(u'Arabic', 'ar'),
 56+(u'Esperanto', 'eo'),
 57+(u'Serbian', 'sr'),
 58+(u'Indonesian', 'id'),
 59+(u'Lithuanian', 'lt'),
 60+(u'Volapük', 'vo'),
 61+(u'Slovak', 'sk'),
 62+(u'Hebrew', 'he'),
 63+(u'Bulgarian', 'bg'),
 64+(u'Persian', 'fa'),
 65+(u'Slovenian', 'sl'),
 66+(u'Waray-Waray', 'war'),
 67+(u'Croatian', 'hr'),
 68+(u'Estonian', 'et'),
 69+(u'Malay', 'ms'),
 70+(u'Newar', 'new'),
 71+(u'Nepal Bhasa', 'new'),
 72+(u'Simple English', 'simple'),
 73+(u'Galician', 'gl'),
 74+(u'Thai', 'th'),
 75+(u'Aromanian', 'roa-rup'),
 76+(u'Nynorsk', 'nn'),
 77+(u'Basque', 'eu'),
 78+(u'Hindi', 'hi'),
 79+(u'Greek', 'el'),
 80+(u'Haitian', 'ht'),
 81+(u'Latin', 'la'),
 82+(u'Telugu', 'te'),
 83+(u'Georgian', 'ka'),
 84+(u'Cebuano', 'ceb'),
 85+(u'Macedonian', 'mk'),
 86+(u'Azeri', 'az'),
 87+(u'Tagalog', 'tl'),
 88+(u'Breton', 'br'),
 89+(u'Serbo-Croatian', 'sh'),
 90+(u'Marathi', 'mr'),
 91+(u'Luxembourgish', 'lb'),
 92+(u'Javanese', 'jv'),
 93+(u'Latvian', 'lv'),
 94+(u'Bosnian', 'bs'),
 95+(u'Icelandic', 'is'),
 96+(u'Welsh', 'cy'),
 97+(u'Belarusian', 'be-x-old'),
 98+(u'Taraškievica', 'be-x-old'),
 99+(u'Piedmontese', 'pms'),
 100+(u'Albanian', 'sq'),
 101+(u'Tamil', 'ta'),
 102+(u'Bishnupriya Manipuri', 'bpy'),
 103+(u'Belarusian', 'be'),
 104+(u'Aragonese', 'an'),
 105+(u'Occitan', 'oc'),
 106+(u'Bengali', 'bn'),
 107+(u'Swahili', 'sw'),
 108+(u'Ido', 'io'),
 109+(u'Ripuarian', 'ksh'),
 110+(u'Lombard', 'lmo'),
 111+(u'West Frisian', 'fy'),
 112+(u'Gujarati', 'gu'),
 113+(u'Low Saxon', 'nds'),
 114+(u'Afrikaans', 'af'),
 115+(u'Sicilian', 'scn'),
 116+(u'Quechua', 'qu'),
 117+(u'Kurdish', 'ku'),
 118+(u'Urdu', 'ur'),
 119+(u'Sundanese', 'su'),
 120+(u'Malayalam', 'ml'),
 121+(u'Cantonese', 'zh-yue'),
 122+(u'Asturian', 'ast'),
 123+(u'Neapolitan', 'nap'),
 124+(u'Samogitian', 'bat-smg'),
 125+(u'Walloon', 'wa'),
 126+(u'Chuvash', 'cv'),
 127+(u'Irish', 'ga'),
 128+(u'Armenian', 'hy'),
 129+(u'Yoruba', 'yo'),
 130+(u'Kannada', 'kn'),
 131+(u'Tajik', 'tg'),
 132+(u'Tarantino', 'roa-tara'),
 133+(u'Venetian', 'vec'),
 134+(u'Western Panjabi', 'pnb'),
 135+(u'Nepali', 'ne'),
 136+(u'Scottish Gaelic', 'gd'),
 137+(u'Yiddish', 'yi'),
 138+(u'Min Nan', 'zh-min-nan'),
 139+(u'Uzbek', 'uz'),
 140+(u'Tatar', 'tt'),
 141+(u'Kapampangan', 'pam'),
 142+(u'Ossetian', 'os'),
 143+(u'Sakha', 'sah'),
 144+(u'Alemannic', 'als'),
 145+(u'Maori', 'mi'),
 146+(u'Egyptian Arabic', 'arz'),
 147+(u'Kazakh', 'kk'),
 148+(u'Nahuatl', 'nah'),
 149+(u'Limburgian', 'li'),
 150+(u'Upper Sorbian', 'hsb'),
 151+(u'Gilaki', 'glk'),
 152+(u'Corsican', 'co'),
 153+(u'Gan', 'gan'),
 154+(u'Amharic', 'am'),
 155+(u'Mongolian', 'mn'),
 156+(u'Interlingua', 'ia'),
 157+(u'Central Bicolano', 'bcl'),
 158+(u'Võro', 'fiu-vro'),
 159+(u'Dutch Low Saxon', 'nds-nl'),
 160+(u'Faroese', 'fo'),
 161+(u'Turkmen', 'tk'),
 162+(u'Scots', 'sco'),
 163+(u'West Flemish', 'vls'),
 164+(u'Sinhalese', 'si'),
 165+(u'Sanskrit', 'sa'),
 166+(u'Bavarian', 'bar'),
 167+(u'Burmese', 'my'),
 168+(u'Manx', 'gv'),
 169+(u'Divehi', 'dv'),
 170+(u'Norman', 'nrm'),
 171+(u'Pangasinan', 'pag'),
 172+(u'Romansh', 'rm'),
 173+(u'Banyumasan', 'map-bms'),
 174+(u'Zazaki', 'diq'),
 175+(u'Sorani', 'ckb'),
 176+(u'Northern Sami', 'se'),
 177+(u'Mazandarani', 'mzn'),
 178+(u'Wu', 'wuu'),
 179+(u'Uyghur', 'ug'),
 180+(u'Friulian', 'fur'),
 181+(u'Ligurian', 'lij'),
 182+(u'Maltese', 'mt'),
 183+(u'Bihari', 'bh'),
 184+(u'Novial', 'nov'),
 185+(u'Malagasy', 'mg'),
 186+(u'Kashubian', 'csb'),
 187+(u'Ilokano', 'ilo'),
 188+(u'Sardinian', 'sc'),
 189+(u'Classical Chinese', 'zh-classical'),
 190+(u'Khmer', 'km'),
 191+(u'Ladino', 'lad'),
 192+(u'Pali', 'pi'),
 193+(u'Anglo-Saxon', 'ang'),
 194+(u'Zamboanga Chavacano', 'cbk-zam'),
 195+(u'Tibetan', 'bo'),
 196+(u'Fiji Hindi', 'hif'),
 197+(u'Franco-Provençal', 'frp'),
 198+(u'Arpitan', 'frp'),
 199+(u'Hakka', 'hak'),
 200+(u'Cornish', 'kw'),
 201+(u'Punjabi', 'pa'),
 202+(u'Pashto', 'ps'),
 203+(u'Kalmyk', 'xal'),
 204+(u'Silesian', 'szl'),
 205+(u'Pennsylvania German', 'pdc'),
 206+(u'Hawaiian', 'haw'),
 207+(u'Saterland Frisian', 'stq'),
 208+(u'Interlingue', 'ie'),
 209+(u'Navajo', 'nv'),
 210+(u'Fijian', 'fj'),
 211+(u'Crimean Tatar', 'crh'),
 212+(u'Komi', 'kv'),
 213+(u'Tongan', 'to'),
 214+(u'Acehnese', 'ace'),
 215+(u'Somali', 'so'),
 216+(u'Erzya', 'myv'),
 217+(u'Guarani', 'gn'),
 218+(u'Karachay-Balkar', 'krc'),
 219+(u'Extremaduran', 'ext'),
 220+(u'Lingala', 'ln'),
 221+(u'Kirghiz', 'ky'),
 222+(u'Meadow Mari', 'mhr'),
 223+(u'Assyrian Neo-Aramaic', 'arc'),
 224+(u'Emilian-Romagnol', 'eml'),
 225+(u'Lojban', 'jbo'),
 226+(u'Picard', 'pcd'),
 227+(u'Aymara', 'ay'),
 228+(u'Wolof', 'wo'),
 229+(u'Tumbuka', 'tum'),
 230+(u'Kabyle', 'kab'),
 231+(u'Bashkir', 'ba'),
 232+(u'North Frisian', 'frr'),
 233+(u'Tahitian', 'ty'),
 234+(u'Tok Pisin', 'tpi'),
 235+(u'Papiamentu', 'pap'),
 236+(u'Zealandic', 'zea'),
 237+(u'Sranan', 'srn'),
 238+(u'Greenlandic', 'kl'),
 239+(u'Udmurt', 'udm'),
 240+(u'Chechen', 'ce'),
 241+(u'Igbo', 'ig'),
 242+(u'Komi-Permyak', 'koi'),
 243+(u'Oriya', 'or'),
 244+(u'Lower Sorbian', 'dsb'),
 245+(u'Kongo', 'kg'),
 246+(u'Lao', 'lo'),
 247+(u'Abkhazian', 'ab'),
 248+(u'Moksha', 'mdf'),
 249+(u'Romani', 'rmy'),
 250+(u'Hill Mari', 'mrj'),
 251+(u'Banjar', 'bjn'),
 252+(u'Old Church Slavonic', 'cu'),
 253+(u'Mirandese', 'mwl'),
 254+(u'Karakalpak', 'kaa'),
 255+(u'Samoan', 'sm'),
 256+(u'Moldovan', 'mo'),
 257+(u'Tetum', 'tet'),
 258+(u'Avar', 'av'),
 259+(u'Kashmiri', 'ks'),
 260+(u'Gothic', 'got'),
 261+(u'Sindhi', 'sd'),
 262+(u'Bambara', 'bm'),
 263+(u'Nauruan', 'na'),
 264+(u'Norfolk', 'pih'),
 265+(u'Pontic', 'pnt'),
 266+(u'Inuktitut', 'iu'),
 267+(u'Inupiak', 'ik'),
 268+(u'Bislama', 'bi'),
 269+(u'Cherokee', 'chr'),
 270+(u'Assamese', 'as'),
 271+(u'Min Dong', 'cdo'),
 272+(u'Ewe', 'ee'),
 273+(u'Swati', 'ss'),
 274+(u'Oromo', 'om'),
 275+(u'Zhuang', 'za'),
 276+(u'Zulu', 'zu'),
 277+(u'Tigrinya', 'ti'),
 278+(u'Venda', 've'),
 279+(u'Tsonga', 'ts'),
 280+(u'Hausa', 'ha'),
 281+(u'Dzongkha', 'dz'),
 282+(u'Sango', 'sg'),
 283+(u'Chamorro', 'ch'),
 284+(u'Cree', 'cr'),
 285+(u'Xhosa', 'xh'),
 286+(u'Akan', 'ak'),
 287+(u'Sesotho', 'st'),
 288+(u'Kinyarwanda', 'rw'),
 289+(u'Tswana', 'tn'),
 290+(u'Kikuyu', 'ki'),
 291+(u'Buryat', 'bxr'),
 292+(u'Buginese', 'bug'),
 293+(u'Chichewa', 'ny'),
 294+(u'Lak', 'lbe'),
 295+(u'Twi', 'tw'),
 296+(u'Shona', 'sn'),
 297+(u'Kirundi', 'rn'),
 298+(u'Fula', 'ff'),
 299+(u'Cheyenne', 'chy'),
 300+(u'Luganda', 'lg'),
 301+(u'Ndonga', 'ng'),
 302+(u'Sichuan Yi', 'ii'),
 303+(u'Choctaw', 'cho'),
 304+(u'Marshallese', 'mh'),
 305+(u'Afar', 'aa'),
 306+(u'Kuanyama', 'kj'),
 307+(u'Hiri Motu', 'ho'),
 308+(u'Muscogee', 'mus'),
 309+(u'Kanuri', 'kr'),
 310+(u'Herero', 'hz'),
 311+(u'English', 'en'),
 312+(u'Deutsch', 'de'),
 313+(u'Français', 'fr'),
 314+(u'Italiano', 'it'),
 315+(u'Polski', 'pl'),
 316+(u'日本語', 'ja'),
 317+(u'Español', 'es'),
 318+(u'Nederlands', 'nl'),
 319+(u'Português', 'pt'),
 320+(u'Русский', 'ru'),
 321+(u'Svenska', 'sv'),
 322+(u'中文', 'zh'),
 323+(u'Català', 'ca'),
 324+(u'Norsk', 'no'),
 325+(u'Bokmål', 'no'),
 326+(u'Suomi', 'fi'),
 327+(u'Українська', 'uk'),
 328+(u'Magyar', 'hu'),
 329+(u'Čeština', 'cs'),
 330+(u'Română', 'ro'),
 331+(u'Türkçe', 'tr'),
 332+(u'한국어', 'ko'),
 333+(u'Tiếng Việt', 'vi'),
 334+(u'Dansk', 'da'),
 335+(u'العربية', 'ar'),
 336+(u'Esperanto', 'eo'),
 337+(u'Српски', 'sr'),
 338+(u'Srpski', 'sr'),
 339+(u'Bahasa Indonesia', 'id'),
 340+(u'Lietuvių', 'lt'),
 341+(u'Volapük', 'vo'),
 342+(u'Slovenčina', 'sk'),
 343+(u'עברית', 'he'),
 344+(u'Български', 'bg'),
 345+(u'فارسی', 'fa'),
 346+(u'Slovenščina', 'sl'),
 347+(u'Winaray', 'war'),
 348+(u'Hrvatski', 'hr'),
 349+(u'Eesti', 'et'),
 350+(u'Bahasa Melayu', 'ms'),
 351+(u'नेपाल भाषा', 'new'),
 352+(u'Simple English', 'simple'),
 353+(u'Galego', 'gl'),
 354+(u'ไทย', 'th'),
 355+(u'Armãneashce', 'roa-rup'),
 356+(u'Nynorsk', 'nn'),
 357+(u'Euskara', 'eu'),
 358+(u'हिन्दी', 'hi'),
 359+(u'Ελληνικά', 'el'),
 360+(u'Krèyol ayisyen', 'ht'),
 361+(u'Latina', 'la'),
 362+(u'తెలుగు', 'te'),
 363+(u'ქართული', 'ka'),
 364+(u'Sinugboanong Binisaya', 'ceb'),
 365+(u'Македонски', 'mk'),
 366+(u'Azərbaycan', 'az'),
 367+(u'Tagalog', 'tl'),
 368+(u'Brezhoneg', 'br'),
 369+(u'Srpskohrvatski', 'sh'),
 370+(u'Српскохрватски', 'sh'),
 371+(u'मराठी', 'mr'),
 372+(u'Lëtzebuergesch', 'lb'),
 373+(u'Basa Jawa', 'jv'),
 374+(u'Latviešu', 'lv'),
 375+(u'Bosanski', 'bs'),
 376+(u'Íslenska', 'is'),
 377+(u'Cymraeg', 'cy'),
 378+(u'Беларуская', 'be-x-old'),
 379+(u'тарашкевіца', 'be-x-old'),
 380+(u'Piemontèis', 'pms'),
 381+(u'Shqip', 'sq'),
 382+(u'தமிழ்', 'ta'),
 383+(u'ইমার ঠার', 'bpy'),
 384+(u'বিষ্ণুপ্রিয়া মণিপুরী', 'bpy'),
 385+(u'Беларуская', 'be'),
 386+(u'Aragonés', 'an'),
 387+(u'Occitan', 'oc'),
 388+(u'বাংলা', 'bn'),
 389+(u'Kiswahili', 'sw'),
 390+(u'Ido', 'io'),
 391+(u'Ripoarisch', 'ksh'),
 392+(u'Lumbaart', 'lmo'),
 393+(u'Frysk', 'fy'),
 394+(u'ગુજરાતી', 'gu'),
 395+(u'Plattdüütsch', 'nds'),
 396+(u'Afrikaans', 'af'),
 397+(u'Sicilianu', 'scn'),
 398+(u'Runa Simi', 'qu'),
 399+(u'Kurdî', 'ku'),
 400+(u'كوردی', 'ku'),
 401+(u'اردو', 'ur'),
 402+(u'Basa Sunda', 'su'),
 403+(u'മലയാളം', 'ml'),
 404+(u'粵語', 'zh-yue'),
 405+(u'Asturianu', 'ast'),
 406+(u'Nnapulitano', 'nap'),
 407+(u'Žemaitėška', 'bat-smg'),
 408+(u'Walon', 'wa'),
 409+(u'Чăваш', 'cv'),
 410+(u'Gaeilge', 'ga'),
 411+(u'Հայերեն', 'hy'),
 412+(u'Yorùbá', 'yo'),
 413+(u'ಕನ್ನಡ', 'kn'),
 414+(u'Тоҷикӣ', 'tg'),
 415+(u'Tarandíne', 'roa-tara'),
 416+(u'Vèneto', 'vec'),
 417+(u'شاہ مکھی پنجابی', 'pnb'),
 418+(u'Shāhmukhī Pañjābī', 'pnb'),
 419+(u'नेपाली', 'ne'),
 420+(u'Gàidhlig', 'gd'),
 421+(u'ייִדיש', 'yi'),
 422+(u'Bân-lâm-gú', 'zh-min-nan'),
 423+(u'O‘zbek', 'uz'),
 424+(u'Tatarça', 'tt'),
 425+(u'Татарча', 'tt'),
 426+(u'Kapampangan', 'pam'),
 427+(u'Иронау', 'os'),
 428+(u'Саха тыла', 'sah'),
 429+(u'Saxa Tyla', 'sah'),
 430+(u'Alemannisch', 'als'),
 431+(u'Māori', 'mi'),
 432+(u'مصرى', 'arz'),
 433+(u'Maṣrī', 'arz'),
 434+(u'Қазақша', 'kk'),
 435+(u'Nāhuatl', 'nah'),
 436+(u'Limburgs', 'li'),
 437+(u'Hornjoserbsce', 'hsb'),
 438+(u'گیلکی', 'glk'),
 439+(u'Corsu', 'co'),
 440+(u'贛語', 'gan'),
 441+(u'አማርኛ', 'am'),
 442+(u'Монгол', 'mn'),
 443+(u'Interlingua', 'ia'),
 444+(u'Bikol', 'bcl'),
 445+(u'Võro', 'fiu-vro'),
 446+(u'Nedersaksisch', 'nds-nl'),
 447+(u'Føroyskt', 'fo'),
 448+(u'تركمن ', 'tk'),
 449+(u'Туркмен', 'tk'),
 450+(u'Scots', 'sco'),
 451+(u'West-Vlams', 'vls'),
 452+(u'සිංහල', 'si'),
 453+(u'संस्कृतम्', 'sa'),
 454+(u'Boarisch', 'bar'),
 455+(u'မ္ရန္‌မာစာ', 'my'), #Needs fix
 456+(u'Gaelg', 'gv'),
 457+(u'ދިވެހިބަސް', 'dv'),
 458+(u'Nouormand', 'nrm'),
 459+(u'Normaund', 'nrm'),
 460+(u'Pangasinan', 'pag'),
 461+(u'Rumantsch', 'rm'),
 462+(u'Basa Banyumasan', 'map-bms'),
 463+(u'Zazaki', 'diq'),
 464+(u'Soranî', 'ckb'),
 465+(u'کوردی', 'ckb'),
 466+(u'Sámegiella', 'se'),
 467+(u'مَزِروني', 'mzn'),
 468+(u'吴语', 'wuu'),
 469+(u'Oyghurque', 'ug'),
 470+(u'Furlan', 'fur'),
 471+(u'Líguru', 'lij'),
 472+(u'Malti', 'mt'),
 473+(u'भोजपुरी', 'bh'),
 474+(u'Novial', 'nov'),
 475+(u'Malagasy', 'mg'),
 476+(u'Kaszëbsczi', 'csb'),
 477+(u'Ilokano', 'ilo'),
 478+(u'Sardu', 'sc'),
 479+(u'古文', 'zh-classical'),
 480+(u'文言文', 'zh-classical'),
 481+(u'ភាសាខ្មែរ', 'km'),
 482+(u'Dzhudezmo', 'lad'),
 483+(u'पाऴि', 'pi'),
 484+(u'Englisc', 'ang'),
 485+(u'Chavacano de Zamboanga', 'cbk-zam'),
 486+(u'བོད་སྐད', 'bo'),
 487+(u'Fiji Hindi', 'hif'),
 488+(u'Arpitan', 'frp'),
 489+(u'Hak-kâ-fa', 'hak'),
 490+(u'客家話', 'hak'),
 491+(u'Kernewek', 'kw'),
 492+(u'Karnuack', 'kw'),
 493+(u'ਪੰਜਾਬੀ', 'pa'),
 494+(u'پښتو', 'ps'),
 495+(u'Хальмг', 'xal'),
 496+(u'Ślůnski', 'szl'),
 497+(u'Deitsch', 'pdc'),
 498+(u'Hawai`i', 'haw'),
 499+(u'Seeltersk', 'stq'),
 500+(u'Interlingue', 'ie'),
 501+(u'Diné bizaad', 'nv'),
 502+(u'Na Vosa Vakaviti', 'fj'),
 503+(u'Qırımtatarca', 'crh'),
 504+(u'Коми', 'kv'),
 505+(u'faka Tonga', 'to'),
 506+(u'Bahsa Acèh', 'ace'),
 507+(u'Soomaaliga', 'so'),
 508+(u'Эрзянь', 'myv'),
 509+(u'Erzjanj Kelj', 'myv'),
 510+(u"Avañe'ẽ", 'gn'),
 511+(u'Къарачай-Малкъар', 'krc'),
 512+(u'Qarachay-Malqar', 'krc'),
 513+(u'Estremeñu', 'ext'),
 514+(u'Lingala', 'ln'),
 515+(u'Кыргызча', 'ky'),
 516+(u'Олык Марий', 'mhr'),
 517+(u'Olyk Marij', 'mhr'),
 518+(u'ܐܪܡܝܐ', 'arc'),
 519+(u'Emiliàn e rumagnòl', 'eml'),
 520+(u'Lojban', 'jbo'),
 521+(u'Picard', 'pcd'),
 522+(u'Aymar', 'ay'),
 523+(u'Wolof', 'wo'),
 524+(u'chiTumbuka', 'tum'),
 525+(u'Taqbaylit', 'kab'),
 526+(u'Башҡорт', 'ba'),
 527+(u'Frasch', 'frr'),
 528+(u'Reo Mā`ohi', 'ty'),
 529+(u'Tok Pisin', 'tpi'),
 530+(u'Papiamentu', 'pap'),
 531+(u'Zeêuws', 'zea'),
 532+(u'Sranantongo', 'srn'),
 533+(u'Kalaallisut', 'kl'),
 534+(u'Удмурт кыл', 'udm'),
 535+(u'Нохчийн', 'ce'),
 536+(u'Igbo', 'ig'),
 537+(u'Перем Коми', 'koi'),
 538+(u'Perem Komi', 'koi'),
 539+(u'ଓଡ଼ିଆ', 'or'),
 540+(u'Dolnoserbski', 'dsb'),
 541+(u'KiKongo', 'kg'),
 542+(u'ລາວ', 'lo'),
 543+(u'Аҧсуа', 'ab'),
 544+(u'Мокшень', 'mdf'),
 545+(u'Mokshanj Kälj', 'mdf'),
 546+(u'romani - रोमानी', 'rmy'),
 547+(u'Кырык Мары', 'mrj'),
 548+(u'Kyryk Mary', 'mrj'),
 549+(u'Bahasa Banjar', 'bjn'),
 550+(u'Словѣньскъ', 'cu'),
 551+(u'Páigina Percipal', 'mwl'),
 552+(u'Qaraqalpaqsha', 'kaa'),
 553+(u'Gagana Samoa', 'sm'),
 554+(u'Молдовеняскэ', 'mo'),
 555+(u'Tetun', 'tet'),
 556+(u'Авар', 'av'),
 557+(u'कश्मीरी', 'ks'),
 558+(u'كشميري', 'ks'),
 559+(u'𐌲𐌿𐍄𐌹𐍃𐌺', 'got'), #Needs fix
 560+(u'سنڌي، سندھی ، सिन्ध', 'sd'),
 561+(u'Bamanankan', 'bm'),
 562+(u'dorerin Naoero', 'na'),
 563+(u'Norfuk', 'pih'),
 564+(u'Ποντιακά', 'pnt'),
 565+(u'ᐃᓄᒃᑎᑐᑦ', 'iu'),
 566+(u'Iñupiak', 'ik'),
 567+(u'Bislama', 'bi'),
 568+(u'ᏣᎳᎩ', 'chr'),
 569+(u'অসমীয়া', 'as'),
 570+(u'Mìng-dĕ̤ng-ngṳ̄', 'cdo'),
 571+(u'Eʋegbe', 'ee'),
 572+(u'SiSwati', 'ss'),
 573+(u'Oromoo', 'om'),
 574+(u'Cuengh', 'za'),
 575+(u'isiZulu', 'zu'),
 576+(u'ትግርኛ', 'ti'),
 577+(u'Tshivenda', 've'),
 578+(u'Xitsonga', 'ts'),
 579+(u'هَوُسَ', 'ha'),
 580+(u'ཇོང་ཁ', 'dz'),
 581+(u'Sängö', 'sg'),
 582+(u'Chamoru', 'ch'),
 583+(u'Nehiyaw', 'cr'),
 584+(u'isiXhosa', 'xh'),
 585+(u'Akana', 'ak'),
 586+(u'Sesotho', 'st'),
 587+(u'Ikinyarwanda', 'rw'),
 588+(u'Setswana', 'tn'),
 589+(u'Gĩkũyũ', 'ki'),
 590+(u'Буряад', 'bxr'),
 591+(u'Basa Ugi', 'bug'),
 592+(u'Chi-Chewa', 'ny'),
 593+(u'Лакку', 'lbe'),
 594+(u'Twi', 'tw'),
 595+(u'chiShona', 'sn'),
 596+(u'Kirundi', 'rn'),
 597+(u'Fulfulde', 'ff'),
 598+(u'Tsetsêhestâhese', 'chy'),
 599+(u'Luganda', 'lg'),
 600+(u'Oshiwambo', 'ng'),
 601+(u'ꆇꉙ', 'ii'),
 602+(u'Choctaw', 'cho'),
 603+(u'Ebon', 'mh'),
 604+(u'Afar', 'aa'),
 605+(u'Kuanyama', 'kj'),
 606+(u'Hiri Motu', 'ho'),
 607+(u'Muskogee', 'mus'),
 608+(u'Kanuri', 'kr'),
 609+(u'Otsiherero', 'hz'),
610610 ])
611611
 612+
612613 def language_map():
613 - return utils.invert_dict(MAPPING)
\ No newline at end of file
 614+ return utils.invert_dict(MAPPING)
Index: trunk/tools/editor_trends/configuration.py
@@ -49,12 +49,19 @@
5050 self.debug = debug
5151 self.progressbar = True
5252 self.encoding = 'utf-8'
53 - self.date_format = '%Y-%m-%d' #Date format as used by Erik Zachte
54 - self.timestamp_format = '%Y-%m-%dT%H:%M:%SZ' # Timestamp format as generated by the MediaWiki dumps
5553
56 - self.max_xmlfile_size = 4096 * 1024 #67108864 # ==64Mb, see http://hadoop.apache.org/common/docs/r0.20.0/hdfs_design.html#Large+Data+Setsfor reason
 54+ #Date format as used by Erik Zachte
 55+ self.date_format = '%Y-%m-%d'
 56+
 57+ # Timestamp format as generated by the MediaWiki dumps
 58+ self.timestamp_format = '%Y-%m-%dT%H:%M:%SZ'
 59+
 60+ #67108864 # ==64Mb, see http://hadoop.apache.org/common/docs/r0.20.0/hdfs_design.html#Large+Data+Sets for reason
 61+ self.max_xmlfile_size = 4096 * 1024
 62+
 63+ #Change this to match your computer's configuration (RAM / CPU)
5764 self.number_of_processes = cpu_count() * process_multiplier
58 - #Change this to match your computers configuration (RAM / CPU)
 65+
5966 self.minimum_python_version = (2, 6)
6067 self.wp_dump_location = 'http://download.wikimedia.org'
6168 self.xml_namespace = 'http://www.mediawiki.org/xml/export-0.4/'
@@ -97,7 +104,6 @@
98105 }
99106
100107
101 -
102108 def set_custom_settings(self, **kwargs):
103109 for kw in kwargs:
104110 setattr(self, kw, kwargs[kw])
@@ -119,11 +125,11 @@
120126 return cwd
121127
122128 def determine_platform(self):
123 - os = platform.system()
124 - if os == 'Darwin':
125 - return 'OSX'
126 - else:
127 - return os
 129+ os = platform.system()
 130+ if os == 'Darwin':
 131+ return 'OSX'
 132+ else:
 133+ return os
128134
129135 #def determine_path_ziptool(self):
130136 # return self.detect_installed_program(self.determine_ziptool())
@@ -165,7 +171,7 @@
166172 if self.platform == 'Windows' and self.architecture == 'i386':
167173 return win32file._getmaxstdio()
168174 elif self.platform != 'Windows':
169 - return resource.getrlimit(resource.RLIMIT_NOFILE)[0]
 175+ return resource.getrlimit(resource.RLIMIT_NOFILE)[0] - 100
170176 else:
171177 return 500
172178
@@ -175,11 +181,10 @@
176182 os.path.isdir(os.path.join(self.working_directory, name))]
177183 for subdirname in dirs:
178184 if not subdirname.startswith('.') and subdirname not in IGNORE_DIRS:
179 - sys.path.append(os.path.join(self.working_directory,
 185+ sys.path.append(os.path.join(self.working_directory,
180186 subdirname))
181187
182188
183 -
184189 def set_file_locations(self):
185190 self.input_location = os.path.join(self.root, 'wikimedia')
186191 self.input_filename = os.path.join(self.input_location, 'en',
Index: trunk/tools/editor_trends/utils/models.py
@@ -1,63 +0,0 @@
2 -#!/usr/bin/python
3 -# -*- coding: utf-8 -*-
4 -'''
5 -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
6 -This program is free software; you can redistribute it and/or
7 -modify it under the terms of the GNU General Public License version 2
8 -as published by the Free Software Foundation.
9 -This program is distributed in the hope that it will be useful,
10 -but WITHOUT ANY WARRANTY; without even the implied warranty of
11 -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 -See the GNU General Public License for more details, at
13 -http://www.fsf.org/licenses/gpl.html
14 -'''
15 -
16 -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
17 -__author__email = 'dvanliere at gmail dot com'
18 -__date__ = '2010-11-09'
19 -__version__ = '0.1'
20 -
21 -import multiprocessing
22 -
23 -
24 -class BaseConsumer(multiprocessing.Process):
25 -
26 - def __init__(self, task_queue, result_queue):
27 - multiprocessing.Process.__init__(self)
28 - self.task_queue = task_queue
29 - self.result_queue = result_queue
30 -
31 -
32 -
33 -
34 -# for kw in kwargs:
35 -# setattr(self, kw, kwargs[kw])
36 -#
37 -# def run(self):
38 -# proc_name = self.name
39 -# kwargs = {}
40 -# IGNORE = ['input_queue', 'result_queue', 'target']
41 -# for kw in self.__dict__:
42 -# if kw not in IGNORE and not kw.startswith('_'):
43 -# kwargs[kw] = getattr(self, kw)
44 -# self.target(self.input_queue, self.result_queue, **kwargs)
45 -
46 -
47 -class ProcessResultQueue(multiprocessing.Process):
48 -
49 - def __init__(self, target, result_queue, **kwargs):
50 - multiprocessing.Process.__init__(self)
51 - self.result_queue = result_queue
52 - self.target = target
53 - for kw in kwargs:
54 - setattr(self, kw, kwargs[kw])
55 -
56 -
57 - def run(self):
58 - proc_name = self.name
59 - kwargs = {}
60 - IGNORE = ['result_queue', 'target']
61 - for kw in self.__dict__:
62 - if kw not in IGNORE and not kw.startswith('_'):
63 - kwargs[kw] = getattr(self, kw)
64 - self.target(self.result_queue, **kwargs)
Index: trunk/tools/editor_trends/utils/consumers.py
@@ -0,0 +1,61 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__author__email = 'dvanliere at gmail dot com'
 18+__date__ = '2010-11-09'
 19+__version__ = '0.1'
 20+
 21+import multiprocessing
 22+
 23+
 24+class BaseConsumer(multiprocessing.Process):
 25+
 26+ def __init__(self, task_queue, result_queue):
 27+ multiprocessing.Process.__init__(self)
 28+ self.task_queue = task_queue
 29+ self.result_queue = result_queue
 30+
 31+
 32+# for kw in kwargs:
 33+# setattr(self, kw, kwargs[kw])
 34+#
 35+# def run(self):
 36+# proc_name = self.name
 37+# kwargs = {}
 38+# IGNORE = ['input_queue', 'result_queue', 'target']
 39+# for kw in self.__dict__:
 40+# if kw not in IGNORE and not kw.startswith('_'):
 41+# kwargs[kw] = getattr(self, kw)
 42+# self.target(self.input_queue, self.result_queue, **kwargs)
 43+
 44+
 45+class ProcessResultQueue(multiprocessing.Process):
 46+
 47+ def __init__(self, target, result_queue, **kwargs):
 48+ multiprocessing.Process.__init__(self)
 49+ self.result_queue = result_queue
 50+ self.target = target
 51+ for kw in kwargs:
 52+ setattr(self, kw, kwargs[kw])
 53+
 54+
 55+ def run(self):
 56+ proc_name = self.name
 57+ kwargs = {}
 58+ IGNORE = ['result_queue', 'target']
 59+ for kw in self.__dict__:
 60+ if kw not in IGNORE and not kw.startswith('_'):
 61+ kwargs[kw] = getattr(self, kw)
 62+ self.target(self.result_queue, **kwargs)
Property changes on: trunk/tools/editor_trends/utils/consumers.py
___________________________________________________________________
Added: svn:eol-style
163 + native
Added: svn:mime-type
264 + text/plain
Index: trunk/tools/editor_trends/utils/messages.py
@@ -26,7 +26,12 @@
2727
2828
2929 def show(func):
 30+ '''
 31+ @func should be the qsize() method of a task queue. qsize() is not supported
 32+ on OSX, hence this simple workaround to make sure that we can continue supporting
 33+ OSX.
 34+ '''
3035 try:
31 - func()
 36+ return func()
3237 except:
33 - print 'Calling function %s caused an error, probably your platform is not supporting this function' % func
 38+ return 'unknown'
Index: trunk/tools/editor_trends/database/db_settings.py
@@ -1,38 +0,0 @@
2 -#!/usr/bin/python
3 -# -*- coding: utf-8 -*-
4 -'''
5 -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
6 -This program is free software; you can redistribute it and/or
7 -modify it under the terms of the GNU General Public License version 2
8 -as published by the Free Software Foundation.
9 -This program is distributed in the hope that it will be useful,
10 -but WITHOUT ANY WARRANTY; without even the implied warranty of
11 -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 -See the GNU General Public License for more details, at
13 -http://www.fsf.org/licenses/gpl.html
14 -'''
15 -
16 -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
17 -
18 -'''
19 -This is a settings file that contains the layout of different tables. The main
20 -key will be used as the tablename while it,s values contain tuples containing
21 -fieldname and datatype This is only be used for sqlite.
22 -'''
23 -CONTRIBUTOR_TABLE = {'contributors': []}
24 -CONTRIBUTOR_TABLE['contributors'].append(('contributor', 'VARCHAR(64)'))
25 -CONTRIBUTOR_TABLE['contributors'].append(('article', 'INTEGER'))
26 -CONTRIBUTOR_TABLE['contributors'].append(('timestamp', 'TEXT'))
27 -CONTRIBUTOR_TABLE['contributors'].append(('bot', 'INTEGER'))
28 -
29 -BOT_TABLE = {'bots': []}
30 -BOT_TABLE['bots'].append(('language', 'VARCHAR(12)'))
31 -BOT_TABLE['bots'].append(('name', 'VARCHAR(64)'))
32 -BOT_TABLE['bots'].append(('edits_namespace_a', 'INTEGER'))
33 -BOT_TABLE['bots'].append(('edits_namespace_x', 'INTEGER'))
34 -BOT_TABLE['bots'].append(('rank_now', 'INTEGER'))
35 -BOT_TABLE['bots'].append(('rank_prev', 'INTEGER'))
36 -BOT_TABLE['bots'].append(('first_date', 'TEXT'))
37 -BOT_TABLE['bots'].append(('days_first', 'INTEGER'))
38 -BOT_TABLE['bots'].append(('last_date', 'TEXT'))
39 -BOT_TABLE['bots'].append(('days_last', 'INTEGER'))
Index: trunk/tools/editor_trends/database/launcher.py
@@ -27,14 +27,14 @@
2828 from utils import utils
2929
3030
31 -def start_mongodb_server(platform, x, path):
 31+def start_mongodb_server(x, path):
3232 default_port = 27017
3333 port = default_port + x
34 - if platform == 'Windows':
 34+ if settings.platform == 'Windows':
3535 p = subprocess.Popen([path, '--port', str(port), '--dbpath', 'c:\data\db', '--logpath', 'c:\mongodb\logs'])
36 - elif platform == 'Linux':
 36+ elif settings.platform == 'Linux':
3737 subprocess.Popen([path, '--port %s' % port])
38 - elif platform == 'OSX':
 38+ elif settings.platform == 'OSX':
3939 raise NotImplementedError
4040 else:
4141 raise exceptions.PlatformNotSupportedError(platform)

Status & tagging log