r75879 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r75878‎ | r75879 | r75880 >
Date:21:10, 2 November 2010
Author:diederik
Status:deferred
Tags:
Comment:
Improved command-line utility
* supports all Wikipedia projects
* in all languages
Modified paths:
  • /trunk/tools/editor_trends/config.py (added) (history)
  • /trunk/tools/editor_trends/languages.py (modified) (history)
  • /trunk/tools/editor_trends/manage.py (modified) (history)
  • /trunk/tools/editor_trends/utils/dump_downloader.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/manage.py
@@ -17,9 +17,12 @@
1818 __date__ = '2010-10-21'
1919 __version__ = '0.1'
2020
 21+import os
 22+import sys
2123 from argparse import ArgumentParser
2224 from argparse import RawTextHelpFormatter
2325
 26+
2427 import progressbar
2528
2629 import settings
@@ -27,44 +30,68 @@
2831 from utils import utils
2932 from utils import dump_downloader
3033 import split_xml_file
 34+import config
3135
 36+
3237 def get_value(args, key):
3338 return getattr(args, key, None)
3439
 40+
3541 def config_launcher(args):
36 - pass
 42+ config.load_configuration(args)
3743
 44+
 45+def retrieve_language(args):
 46+ language = get_value(args, 'language')
 47+ language = language.title()
 48+ return languages.MAPPING.get(language, None)
 49+
 50+def retrieve_project(args):
 51+ project = get_value(args, 'project')
 52+ if project != 'wiki':
 53+ project = settings.WIKIMEDIA_PROJECTS.get(project, None)
 54+ return project
 55+
 56+
3857 def dump_downloader_launcher(args):
3958 print 'dump downloader'
40 - language = get_value(args, 'language')
41 - location = get_value(args, 'store')
42 - filename = '%s-%s-%s' % (create_dbname(args), 'latest', get_value(args, 'file'))
 59+ config.load_configuration(args)
 60+ language_code = retrieve_language(args)
 61+ if language_code == None:
 62+ print 'Entered language: %s is not a valid Wikipedia language' % get_value(args, 'language')
 63+ sys.exit(-1)
 64+ project = retrieve_project(args)
 65+ if project == None:
 66+ print 'Entered project: %s is not valid Wikipedia project.' % get_value(args, 'project')
 67+ sys.exit(-1)
 68+ location = os.path.join(get_value(args, 'location'), language_code)
 69+ project = language_code + project
 70+ filename = '%s-%s-%s' % (project, 'latest', get_value(args, 'file'))
4371 pbar = get_value(args, 'progress')
4472
4573 domain = settings.WP_DUMP_LOCATION
46 - path = '/%s/latest/' % create_dbname(args)
 74+ path = '/%s/latest/' % language_code
4775
4876 extension = utils.determine_file_extension(filename)
4977 filemode = utils.determine_file_mode(extension)
5078
5179 dump_downloader.download_wp_dump(domain, path, filename, location, filemode, pbar)
5280
53 -def create_dbname(args):
54 - language = get_value(args, 'language')
55 - return languages.MAPPING[language] + 'wiki'
5681
5782 def split_xml_file_launcher(args):
5883 print 'split_xml_file_launcher'
5984 dbname = create_dbname(args)
6085 split_xml_file.split_xml(dbname)
6186
 87+
6288 def mongodb_script_launcher(args):
6389 print 'mongodb_script_launcher'
 90+ config.load_configuration(args)
6491 dbname = create_dbname(args)
65 -
6692 #map_wiki_editors.run_stand_alone(dbname)
6793 #print args
6894
 95+
6996 def all_launcher(args):
7097 print 'all_launcher'
7198 config_launcher(args)
@@ -72,19 +99,30 @@
73100 split_xml_file_launcher(args)
74101 mongodb_script_launcher(args)
75102
76 -def supported_languages(first_letter=False):
77 - if first_letter == False:
78 - choices = languages.MAPPING.keys()[:10]
79 - else:
80 - choices = languages.MAPPING.keys()
81 - choices = [c for c in choices if c.startswith(first_letter)]
 103+
 104+def supported_languages():
 105+ choices = languages.MAPPING.keys()
82106 choices = [c.encode(settings.ENCODING) for c in choices]
83 -
84107 return tuple(choices)
85108
86109
 110+def show_languages(args):
 111+ first = get_value(args, 'first')
 112+ if first != None:
 113+ first = first.title()
 114+ choices = supported_languages()
 115+ languages = []
 116+ for choice in choices:
 117+ languages.append(choice)
 118+ languages.sort()
 119+ for language in languages:
 120+ if first == None:
 121+ print '%s' % language
 122+ elif first != None and language.startswith(first):
 123+ print '%s' % language
 124+
 125+
87126 def main():
88 -
89127 file_choices = ('stub-meta-history.xml.gz',
90128 'stub-meta-current.xml.gz',
91129 'pages-meta-history.xml.7z',
@@ -92,64 +130,46 @@
93131
94132 parser = ArgumentParser(prog='manage', formatter_class=RawTextHelpFormatter)
95133 subparsers = parser.add_subparsers(help='sub-command help')
96 -
97 - parser.add_argument('language', action='store',
98 - help='Example of valid languages. To see more languages, add the first character of the language you are interested in.',
99 - choices=supported_languages(),
100 - default='Russian')
101 -
102134 parser.add_argument('-p', '--progress', action='store_true', default=True,
103135 help='Indicate whether you want to have a progressbar.')
104136
 137+ parser_languages = subparsers.add_parser('show_languages', help='Overview of all valid languages.')
 138+ parser_languages.set_defaults(func=show_languages)
 139+ parser_languages.add_argument('-f', '--first', action='store', help='Enter the first letter of a language to see which languages are available.')
 140+
105141 parser_config = subparsers.add_parser('config', help='The config sub command allows you set the data location of where to store files.')
106142 parser_config.set_defaults(func=config_launcher)
107143
108 -
109 -
110144 parser_download = subparsers.add_parser('download', help='The download sub command allows you to download a Wikipedia dump file.')
111 -
 145+ parser_download.add_argument('language', action='store',
 146+ help='Example of valid languages.',
 147+ choices=supported_languages(),
 148+ default='Russian')
 149+ parser_download.add_argument('-p', '--project', action='store', help='Specify the Wikimedia project that you would like to download',
 150+ choices=settings.WIKIMEDIA_PROJECTS.keys(),
 151+ default='wiki')
112152 parser_download.add_argument('-l', '--location', action='store',
113153 help='Indicate where you want to store the downloaded file.',
114154 default=settings.XML_FILE_LOCATION)
115 -
116 -
117 - parser_download.add_argument('file', action='store',
 155+ parser_download.add_argument('-f', '--file', action='store',
118156 choices=file_choices,
119 - help='Indicate which dump you want to download. Valid choices are:\n %s' % ''.join([f + ',\n' for f in file_choices]),
120 - default='user_groups.sql.gz')
121 -
 157+ help='Indicate which dump you want to download. Valid choices are:\n %s' % ''.join([f + ',\n' for f in file_choices]),
 158+ default='stub-meta-current.xml.gz')
122159 parser_download.set_defaults(func=dump_downloader_launcher)
123160
124 -
125161 parser_split = subparsers.add_parser('split', help='The split sub command splits the downloaded file in smaller chunks to parallelize extracting information.')
126162 parser_split.set_defaults(func=split_xml_file_launcher)
127163
128164 parser_create = subparsers.add_parser('store', help='The store sub command parsers the XML chunk files, extracts the information and stores it in a MongoDB.')
129165 parser_create.set_defaults(func=mongodb_script_launcher)
130166
131 -
132 -
133167 parser_all = subparsers.add_parser('all', help='The all sub command runs the download, split, store and dataset commands.\n\nWARNING: THIS COULD TAKE DAYS DEPENDING ON THE CONFIGURATION OF YOUR MACHINE AND THE SIZE OF THE WIKIMEDIA DUMP FILE.')
134168 parser_all.set_defaults(func=all_launcher)
135169
136 -
137 -
138 - #parser_create.add_argument()
139 -
140 -
141 - #('-c', '--create',
142 - # help='This will start the scripts to create a dataset\\\
143 - # from the MongoDB', type=mongodb_script_launcher)
144 - #.add_argument('-d', '--download',
145 - # help='This will start downloading the dump file.',
146 - # )
147 -
148 -
149170 args = parser.parse_args()
150171 args.func(args)
151172
152173
153 -
154174 if __name__ == '__main__':
155175 #args = ['download', '-l', 'Russian']
156176 main()
Index: trunk/tools/editor_trends/config.py
@@ -0,0 +1,61 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__author__email = 'dvanliere at gmail dot com'
 18+__date__ = '2010-10-21'
 19+__version__ = '0.1'
 20+
 21+
 22+import os
 23+import ConfigParser
 24+
 25+import settings
 26+from utils import utils
 27+
 28+
 29+def load_configuration(args):
 30+ config = ConfigParser.RawConfigParser()
 31+ if not utils.check_file_exists(settings.WORKING_DIRECTORY, 'wiki.cfg'):
 32+ working_directory = raw_input('Please indicate where you installed Editor Trends Analytics.\nCurrent location is %s\nPress Enter to accept default.' % os.getcwd())
 33+ if working_directory == '':
 34+ working_directory = os.getcwd()
 35+
 36+ xml_file_location = raw_input('Please indicate where to store the Wikipedia dump files.\nDefault is: %s\nPress Enter to accept default.' % settings.XML_FILE_LOCATION)
 37+ if xml_file_location == '':
 38+ xml_file_location = settings.XML_FILE_LOCATION
 39+
 40+ create_configuration(WORKING_DIRECTORY=working_directory, XML_FILE_LOCATION=xml_file_location)
 41+
 42+ config.read('wiki.cfg')
 43+ settings.WORKING_DIRECTORY = config.get('file_locations', 'WORKING_DIRECTORY')
 44+ settings.XML_FILE_LOCATION = config.get('file_locations', 'XML_FILE_LOCATION')
 45+
 46+
 47+def create_configuration(**kwargs):
 48+ working_directory = kwargs.get('WORKING_DIRECTORY', settings.WORKING_DIRECTORY)
 49+ config = ConfigParser.RawConfigParser()
 50+ config.add_section('file_locations')
 51+ config.set('file_locations', 'WORKING_DIRECTORY', working_directory)
 52+ config.set('file_locations', 'XML_FILE_LOCATION', kwargs.get('XML_FILE_LOCATION', settings.XML_FILE_LOCATION))
 53+
 54+ fh = utils.create_binary_filehandle(working_directory, 'wiki.cfg', 'wb')
 55+ config.write(fh)
 56+ fh.close()
 57+
 58+
 59+if __name__ == '__main__':
 60+ load_configuration([])
 61+
 62+
Property changes on: trunk/tools/editor_trends/config.py
___________________________________________________________________
Added: svn:eol-style
163 + native
Index: trunk/tools/editor_trends/languages.py
@@ -1,5 +1,5 @@
22 #!/usr/bin/python
3 -# -*- coding: utf-8 -*-
 3+# coding=utf-8
44 '''
55 Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
66 This program is free software; you can redistribute it and/or
@@ -9,7 +9,7 @@
1010 but WITHOUT ANY WARRANTY; without even the implied warranty of
1111 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
1212 See the GNU General Public License for more details, at
13 -http://www.fsf.org/licenses/gpl.html
 13+http://www.fsf.org/licenses/gpl.html
1414 '''
1515
1616 __author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
@@ -23,584 +23,585 @@
2424 Gothic and Burmese are not yet supported, see rows 450 and 554.
2525 '''
2626
27 -LANGUAGES = {
28 -'English':'en',
29 -'German':'de',
30 -'French':'fr',
31 -'Italian':'it',
32 -'Polish':'pl',
33 -'Japanese':'ja',
34 -'Spanish':'es',
35 -'Dutch':'nl',
36 -'Portuguese':'pt',
37 -'Russian':'ru',
38 -'Swedish':'sv',
39 -'Chinese':'zh',
40 -'Catalan':'ca',
41 -'Norwegian':'no',
42 -'Bokmål':'no',
43 -'Finnish':'fi',
44 -'Ukrainian':'uk',
45 -'Hungarian':'hu',
46 -'Czech':'cs',
47 -'Romanian':'ro',
48 -'Turkish':'tr',
49 -'Korean':'ko',
50 -'Vietnamese':'vi',
51 -'Danish':'da',
52 -'Arabic':'ar',
53 -'Esperanto':'eo',
54 -'Serbian':'sr',
55 -'Indonesian':'id',
56 -'Lithuanian':'lt',
57 -'Volapük':'vo',
58 -'Slovak':'sk',
59 -'Hebrew':'he',
60 -'Bulgarian':'bg',
61 -'Persian':'fa',
62 -'Slovenian':'sl',
63 -'Waray-Waray':'war',
64 -'Croatian':'hr',
65 -'Estonian':'et',
66 -'Malay':'ms',
67 -'Newar':'new',
68 -'Nepal Bhasa':'new',
69 -'Simple English':'simple',
70 -'Galician':'gl',
71 -'Thai':'th',
72 -'Aromanian':'roa-rup',
73 -'Nynorsk':'nn',
74 -'Basque':'eu',
75 -'Hindi':'hi',
76 -'Greek':'el',
77 -'Haitian':'ht',
78 -'Latin':'la',
79 -'Telugu':'te',
80 -'Georgian':'ka',
81 -'Cebuano':'ceb',
82 -'Macedonian':'mk',
83 -'Azeri':'az',
84 -'Tagalog':'tl',
85 -'Breton':'br',
86 -'Serbo-Croatian':'sh',
87 -'Marathi':'mr',
88 -'Luxembourgish':'lb',
89 -'Javanese':'jv',
90 -'Latvian':'lv',
91 -'Bosnian':'bs',
92 -'Icelandic':'is',
93 -'Welsh':'cy',
94 -'Belarusian':'be-x-old',
95 -'Taraškievica':'be-x-old',
96 -'Piedmontese':'pms',
97 -'Albanian':'sq',
98 -'Tamil':'ta',
99 -'Bishnupriya Manipuri':'bpy',
100 -'Belarusian':'be',
101 -'Aragonese':'an',
102 -'Occitan':'oc',
103 -'Bengali':'bn',
104 -'Swahili':'sw',
105 -'Ido':'io',
106 -'Ripuarian':'ksh',
107 -'Lombard':'lmo',
108 -'West Frisian':'fy',
109 -'Gujarati':'gu',
110 -'Low Saxon':'nds',
111 -'Afrikaans':'af',
112 -'Sicilian':'scn',
113 -'Quechua':'qu',
114 -'Kurdish':'ku',
115 -'Urdu':'ur',
116 -'Sundanese':'su',
117 -'Malayalam':'ml',
118 -'Cantonese':'zh-yue',
119 -'Asturian':'ast',
120 -'Neapolitan':'nap',
121 -'Samogitian':'bat-smg',
122 -'Walloon':'wa',
123 -'Chuvash':'cv',
124 -'Irish':'ga',
125 -'Armenian':'hy',
126 -'Yoruba':'yo',
127 -'Kannada':'kn',
128 -'Tajik':'tg',
129 -'Tarantino':'roa-tara',
130 -'Venetian':'vec',
131 -'Western Panjabi':'pnb',
132 -'Nepali':'ne',
133 -'Scottish Gaelic':'gd',
134 -'Yiddish':'yi',
135 -'Min Nan':'zh-min-nan',
136 -'Uzbek':'uz',
137 -'Tatar':'tt',
138 -'Kapampangan':'pam',
139 -'Ossetian':'os',
140 -'Sakha':'sah',
141 -'Alemannic':'als',
142 -'Maori':'mi',
143 -'Egyptian Arabic':'arz',
144 -'Kazakh':'kk',
145 -'Nahuatl':'nah',
146 -'Limburgian':'li',
147 -'Upper Sorbian':'hsb',
148 -'Gilaki':'glk',
149 -'Corsican':'co',
150 -'Gan':'gan',
151 -'Amharic':'am',
152 -'Mongolian':'mn',
153 -'Interlingua':'ia',
154 -'Central Bicolano':'bcl',
155 -'Võro':'fiu-vro',
156 -'Dutch Low Saxon':'nds-nl',
157 -'Faroese':'fo',
158 -'Turkmen':'tk',
159 -'Scots':'sco',
160 -'West Flemish':'vls',
161 -'Sinhalese':'si',
162 -'Sanskrit':'sa',
163 -'Bavarian':'bar',
164 -'Burmese':'my',
165 -'Manx':'gv',
166 -'Divehi':'dv',
167 -'Norman':'nrm',
168 -'Pangasinan':'pag',
169 -'Romansh':'rm',
170 -'Banyumasan':'map-bms',
171 -'Zazaki':'diq',
172 -'Sorani':'ckb',
173 -'Northern Sami':'se',
174 -'Mazandarani':'mzn',
175 -'Wu':'wuu',
176 -'Uyghur':'ug',
177 -'Friulian':'fur',
178 -'Ligurian':'lij',
179 -'Maltese':'mt',
180 -'Bihari':'bh',
181 -'Novial':'nov',
182 -'Malagasy':'mg',
183 -'Kashubian':'csb',
184 -'Ilokano':'ilo',
185 -'Sardinian':'sc',
186 -'Classical Chinese':'zh-classical',
187 -'Khmer':'km',
188 -'Ladino':'lad',
189 -'Pali':'pi',
190 -'Anglo-Saxon':'ang',
191 -'Zamboanga Chavacano':'cbk-zam',
192 -'Tibetan':'bo',
193 -'Fiji Hindi':'hif',
194 -'Franco-Provençal':'frp',
195 -'Arpitan':'frp',
196 -'Hakka':'hak',
197 -'Cornish':'kw',
198 -'Punjabi':'pa',
199 -'Pashto':'ps',
200 -'Kalmyk':'xal',
201 -'Silesian':'szl',
202 -'Pennsylvania German':'pdc',
203 -'Hawaiian':'haw',
204 -'Saterland Frisian':'stq',
205 -'Interlingue':'ie',
206 -'Navajo':'nv',
207 -'Fijian':'fj',
208 -'Crimean Tatar':'crh',
209 -'Komi':'kv',
210 -'Tongan':'to',
211 -'Acehnese':'ace',
212 -'Somali':'so',
213 -'Erzya':'myv',
214 -'Guarani':'gn',
215 -'Karachay-Balkar':'krc',
216 -'Extremaduran':'ext',
217 -'Lingala':'ln',
218 -'Kirghiz':'ky',
219 -'Meadow Mari':'mhr',
220 -'Assyrian Neo-Aramaic':'arc',
221 -'Emilian-Romagnol':'eml',
222 -'Lojban':'jbo',
223 -'Picard':'pcd',
224 -'Aymara':'ay',
225 -'Wolof':'wo',
226 -'Tumbuka':'tum',
227 -'Kabyle':'kab',
228 -'Bashkir':'ba',
229 -'North Frisian':'frr',
230 -'Tahitian':'ty',
231 -'Tok Pisin':'tpi',
232 -'Papiamentu':'pap',
233 -'Zealandic':'zea',
234 -'Sranan':'srn',
235 -'Greenlandic':'kl',
236 -'Udmurt':'udm',
237 -'Chechen':'ce',
238 -'Igbo':'ig',
239 -'Komi-Permyak':'koi',
240 -'Oriya':'or',
241 -'Lower Sorbian':'dsb',
242 -'Kongo':'kg',
243 -'Lao':'lo',
244 -'Abkhazian':'ab',
245 -'Moksha':'mdf',
246 -'Romani':'rmy',
247 -'Hill Mari':'mrj',
248 -'Banjar':'bjn',
249 -'Old Church Slavonic':'cu',
250 -'Mirandese':'mwl',
251 -'Karakalpak':'kaa',
252 -'Samoan':'sm',
253 -'Moldovan':'mo',
254 -'Tetum':'tet',
255 -'Avar':'av',
256 -'Kashmiri':'ks',
257 -'Gothic':'got',
258 -'Sindhi':'sd',
259 -'Bambara':'bm',
260 -'Nauruan':'na',
261 -'Norfolk':'pih',
262 -'Pontic':'pnt',
263 -'Inuktitut':'iu',
264 -'Inupiak':'ik',
265 -'Bislama':'bi',
266 -'Cherokee':'chr',
267 -'Assamese':'as',
268 -'Min Dong':'cdo',
269 -'Ewe':'ee',
270 -'Swati':'ss',
271 -'Oromo':'om',
272 -'Zhuang':'za',
273 -'Zulu':'zu',
274 -'Tigrinya':'ti',
275 -'Venda':'ve',
276 -'Tsonga':'ts',
277 -'Hausa':'ha',
278 -'Dzongkha':'dz',
279 -'Sango':'sg',
280 -'Chamorro':'ch',
281 -'Cree':'cr',
282 -'Xhosa':'xh',
283 -'Akan':'ak',
284 -'Sesotho':'st',
285 -'Kinyarwanda':'rw',
286 -'Tswana':'tn',
287 -'Kikuyu':'ki',
288 -'Buryat':'bxr',
289 -'Buginese':'bug',
290 -'Chichewa':'ny',
291 -'Lak':'lbe',
292 -'Twi':'tw',
293 -'Shona':'sn',
294 -'Kirundi':'rn',
295 -'Fula':'ff',
296 -'Cheyenne':'chy',
297 -'Luganda':'lg',
298 -'Ndonga':'ng',
299 -'Sichuan Yi':'ii',
300 -'Choctaw':'cho',
301 -'Marshallese':'mh',
302 -'Afar':'aa',
303 -'Kuanyama':'kj',
304 -'Hiri Motu':'ho',
305 -'Muscogee':'mus',
306 -'Kanuri':'kr',
307 -'Herero':'hz',
308 -'English':'en',
309 -'Deutsch':'de',
310 -'Français':'fr',
311 -'Italiano':'it',
312 -'Polski':'pl',
313 -'日本語':'ja',
314 -'Español':'es',
315 -'Nederlands':'nl',
316 -'Português':'pt',
317 -'Русский':'ru',
318 -'Svenska':'sv',
319 -'中文':'zh',
320 -'Català':'ca',
321 -'Norsk':'no',
322 -'Bokmål':'no',
323 -'Suomi':'fi',
324 -'Українська':'uk',
325 -'Magyar':'hu',
326 -'Čeština':'cs',
327 -'Română':'ro',
328 -'Türkçe':'tr',
329 -'한국어':'ko',
330 -'Tiếng Việt':'vi',
331 -'Dansk':'da',
332 -'العربية':'ar',
333 -'Esperanto':'eo',
334 -'Српски':'sr',
335 -'Srpski':'sr',
336 -'Bahasa Indonesia':'id',
337 -'Lietuvių':'lt',
338 -'Volapük':'vo',
339 -'Slovenčina':'sk',
340 -'עברית':'he',
341 -'Български':'bg',
342 -'فارسی':'fa',
343 -'Slovenščina':'sl',
344 -'Winaray':'war',
345 -'Hrvatski':'hr',
346 -'Eesti':'et',
347 -'Bahasa Melayu':'ms',
348 -'नेपाल भाषा':'new',
349 -'Simple English':'simple',
350 -'Galego':'gl',
351 -'ไทย':'th',
352 -'Armãneashce':'roa-rup',
353 -'Nynorsk':'nn',
354 -'Euskara':'eu',
355 -'हिन्दी':'hi',
356 -'Ελληνικά':'el',
357 -'Krèyol ayisyen':'ht',
358 -'Latina':'la',
359 -'తెలుగు':'te',
360 -'ქართული':'ka',
361 -'Sinugboanong Binisaya':'ceb',
362 -'Македонски':'mk',
363 -'Azərbaycan':'az',
364 -'Tagalog':'tl',
365 -'Brezhoneg':'br',
366 -'Srpskohrvatski':'sh',
367 -'Српскохрватски':'sh',
368 -'मराठी':'mr',
369 -'Lëtzebuergesch':'lb',
370 -'Basa Jawa':'jv',
371 -'Latviešu':'lv',
372 -'Bosanski':'bs',
373 -'Íslenska':'is',
374 -'Cymraeg':'cy',
375 -'Беларуская':'be-x-old',
376 -'тарашкевіца':'be-x-old',
377 -'Piemontèis':'pms',
378 -'Shqip':'sq',
379 -'தமிழ்':'ta',
380 -'ইমার ঠার':'bpy',
381 -'বিষ্ণুপ্রিয়া মণিপুরী':'bpy',
382 -'Беларуская':'be',
383 -'Aragonés':'an',
384 -'Occitan':'oc',
385 -'বাংলা':'bn',
386 -'Kiswahili':'sw',
387 -'Ido':'io',
388 -'Ripoarisch':'ksh',
389 -'Lumbaart':'lmo',
390 -'Frysk':'fy',
391 -'ગુજરાતી':'gu',
392 -'Plattdüütsch':'nds',
393 -'Afrikaans':'af',
394 -'Sicilianu':'scn',
395 -'Runa Simi':'qu',
396 -'Kurdî':'ku',
397 -'كوردی':'ku',
398 -'اردو':'ur',
399 -'Basa Sunda':'su',
400 -'മലയാളം':'ml',
401 -'粵語':'zh-yue',
402 -'Asturianu':'ast',
403 -'Nnapulitano':'nap',
404 -'Žemaitėška':'bat-smg',
405 -'Walon':'wa',
406 -'Чăваш':'cv',
407 -'Gaeilge':'ga',
408 -'Հայերեն':'hy',
409 -'Yorùbá':'yo',
410 -'ಕನ್ನಡ':'kn',
411 -'Тоҷикӣ':'tg',
412 -'Tarandíne':'roa-tara',
413 -'Vèneto':'vec',
414 -'شاہ مکھی پنجابی':'pnb',
415 -'Shāhmukhī Pañjābī':'pnb',
416 -'नेपाली':'ne',
417 -'Gàidhlig':'gd',
418 -'ייִדיש':'yi',
419 -'Bân-lâm-gú':'zh-min-nan',
420 -'O‘zbek':'uz',
421 -'Tatarça':'tt',
422 -'Татарча':'tt',
423 -'Kapampangan':'pam',
424 -'Иронау':'os',
425 -'Саха тыла':'sah',
426 -'Saxa Tyla':'sah',
427 -'Alemannisch':'als',
428 -'Māori':'mi',
429 -'مصرى':'arz',
430 -'Maṣrī':'arz',
431 -'Қазақша':'kk',
432 -'Nāhuatl':'nah',
433 -'Limburgs':'li',
434 -'Hornjoserbsce':'hsb',
435 -'گیلکی':'glk',
436 -'Corsu':'co',
437 -'贛語':'gan',
438 -'አማርኛ':'am',
439 -'Монгол':'mn',
440 -'Interlingua':'ia',
441 -'Bikol':'bcl',
442 -'Võro':'fiu-vro',
443 -'Nedersaksisch':'nds-nl',
444 -'Føroyskt':'fo',
445 -'تركمن ':'tk',
446 -'Туркмен':'tk',
447 -'Scots':'sco',
448 -'West-Vlams':'vls',
449 -'සිංහල':'si',
450 -'संस्कृतम्':'sa',
451 -'Boarisch':'bar',
452 -'မ္ရန္‌မာစာ':'my', #Needs fix
453 -'Gaelg':'gv',
454 -'ދިވެހިބަސް':'dv',
455 -'Nouormand':'nrm',
456 -'Normaund':'nrm',
457 -'Pangasinan':'pag',
458 -'Rumantsch':'rm',
459 -'Basa Banyumasan':'map-bms',
460 -'Zazaki':'diq',
461 -'Soranî':'ckb',
462 -'کوردی':'ckb',
463 -'Sámegiella':'se',
464 -'مَزِروني':'mzn',
465 -'吴语':'wuu',
466 -'Oyghurque':'ug',
467 -'Furlan':'fur',
468 -'Líguru':'lij',
469 -'Malti':'mt',
470 -'भोजपुरी':'bh',
471 -'Novial':'nov',
472 -'Malagasy':'mg',
473 -'Kaszëbsczi':'csb',
474 -'Ilokano':'ilo',
475 -'Sardu':'sc',
476 -'古文':'zh-classical',
477 -'文言文':'zh-classical',
478 -'ភាសាខ្មែរ':'km',
479 -'Dzhudezmo':'lad',
480 -'पाऴि':'pi',
481 -'Englisc':'ang',
482 -'Chavacano de Zamboanga':'cbk-zam',
483 -'བོད་སྐད':'bo',
484 -'Fiji Hindi':'hif',
485 -'Arpitan':'frp',
486 -'Hak-kâ-fa':'hak',
487 -'客家話':'hak',
488 -'Kernewek':'kw',
489 -'Karnuack':'kw',
490 -'ਪੰਜਾਬੀ':'pa',
491 -'پښتو':'ps',
492 -'Хальмг':'xal',
493 -'Ślůnski':'szl',
494 -'Deitsch':'pdc',
495 -'Hawai`i':'haw',
496 -'Seeltersk':'stq',
497 -'Interlingue':'ie',
498 -'Diné bizaad':'nv',
499 -'Na Vosa Vakaviti':'fj',
500 -'Qırımtatarca':'crh',
501 -'Коми':'kv',
502 -'faka Tonga':'to',
503 -'Bahsa Acèh':'ace',
504 -'Soomaaliga':'so',
505 -'Эрзянь':'myv',
506 -'Erzjanj Kelj':'myv',
507 -"Avañe'ẽ":'gn',
508 -'Къарачай-Малкъар':'krc',
509 -'Qarachay-Malqar':'krc',
510 -'Estremeñu':'ext',
511 -'Lingala':'ln',
512 -'Кыргызча':'ky',
513 -'Олык Марий':'mhr',
514 -'Olyk Marij':'mhr',
515 -'ܐܪܡܝܐ':'arc',
516 -'Emiliàn e rumagnòl':'eml',
517 -'Lojban':'jbo',
518 -'Picard':'pcd',
519 -'Aymar':'ay',
520 -'Wolof':'wo',
521 -'chiTumbuka':'tum',
522 -'Taqbaylit':'kab',
523 -'Башҡорт':'ba',
524 -'Frasch':'frr',
525 -'Reo Mā`ohi':'ty',
526 -'Tok Pisin':'tpi',
527 -'Papiamentu':'pap',
528 -'Zeêuws':'zea',
529 -'Sranantongo':'srn',
530 -'Kalaallisut':'kl',
531 -'Удмурт кыл':'udm',
532 -'Нохчийн':'ce',
533 -'Igbo':'ig',
534 -'Перем Коми':'koi',
535 -'Perem Komi':'koi',
536 -'ଓଡ଼ିଆ':'or',
537 -'Dolnoserbski':'dsb',
538 -'KiKongo':'kg',
539 -'ລາວ':'lo',
540 -'Аҧсуа':'ab',
541 -'Мокшень':'mdf',
542 -'Mokshanj Kälj':'mdf',
543 -'romani - रोमानी':'rmy',
544 -'Кырык Мары':'mrj',
545 -'Kyryk Mary':'mrj',
546 -'Bahasa Banjar':'bjn',
547 -'Словѣньскъ':'cu',
548 -'Páigina Percipal':'mwl',
549 -'Qaraqalpaqsha':'kaa',
550 -'Gagana Samoa':'sm',
551 -'Молдовеняскэ':'mo',
552 -'Tetun':'tet',
553 -'Авар':'av',
554 -'कश्मीरी':'ks',
555 -'كشميري':'ks',
556 -'𐌲𐌿𐍄𐌹𐍃𐌺':'got', #Needs fix
557 -'سنڌي، سندھی ، सिन्ध':'sd',
558 -'Bamanankan':'bm',
559 -'dorerin Naoero':'na',
560 -'Norfuk':'pih',
561 -'Ποντιακά':'pnt',
562 -'ᐃᓄᒃᑎᑐᑦ':'iu',
563 -'Iñupiak':'ik',
564 -'Bislama':'bi',
565 -'ᏣᎳᎩ':'chr',
566 -'অসমীয়া':'as',
567 -'Mìng-dĕ̤ng-ngṳ̄':'cdo',
568 -'Eʋegbe':'ee',
569 -'SiSwati':'ss',
570 -'Oromoo':'om',
571 -'Cuengh':'za',
572 -'isiZulu':'zu',
573 -'ትግርኛ':'ti',
574 -'Tshivenda':'ve',
575 -'Xitsonga':'ts',
576 -'هَوُسَ':'ha',
577 -'ཇོང་ཁ':'dz',
578 -'Sängö':'sg',
579 -'Chamoru':'ch',
580 -'Nehiyaw':'cr',
581 -'isiXhosa':'xh',
582 -'Akana':'ak',
583 -'Sesotho':'st',
584 -'Ikinyarwanda':'rw',
585 -'Setswana':'tn',
586 -'Gĩkũyũ':'ki',
587 -'Буряад':'bxr',
588 -'Basa Ugi':'bug',
589 -'Chi-Chewa':'ny',
590 -'Лакку':'lbe',
591 -'Twi':'tw',
592 -'chiShona':'sn',
593 -'Kirundi':'rn',
594 -'Fulfulde':'ff',
595 -'Tsetsêhestâhese':'chy',
596 -'Luganda':'lg',
597 -'Oshiwambo':'ng',
598 -'ꆇꉙ':'ii',
599 -'Choctaw':'cho',
600 -'Ebon':'mh',
601 -'Afar':'aa',
602 -'Kuanyama':'kj',
603 -'Hiri Motu':'ho',
604 -'Muskogee':'mus',
605 -'Kanuri':'kr',
606 -'Otsiherero':'hz',
607 -}
\ No newline at end of file
 27+from utils import ordered_dict as odict
 28+MAPPING = odict.OrderedDict([
 29+(u'English','en'),
 30+(u'German','de'),
 31+(u'French','fr'),
 32+(u'Italian','it'),
 33+(u'Polish','pl'),
 34+(u'Japanese','ja'),
 35+(u'Spanish','es'),
 36+(u'Dutch','nl'),
 37+(u'Portuguese','pt'),
 38+(u'Russian','ru'),
 39+(u'Swedish','sv'),
 40+(u'Chinese','zh'),
 41+(u'Catalan','ca'),
 42+(u'Norwegian','no'),
 43+(u'Bokmål','no'),
 44+(u'Finnish','fi'),
 45+(u'Ukrainian','uk'),
 46+(u'Hungarian','hu'),
 47+(u'Czech','cs'),
 48+(u'Romanian','ro'),
 49+(u'Turkish','tr'),
 50+(u'Korean','ko'),
 51+(u'Vietnamese','vi'),
 52+(u'Danish','da'),
 53+(u'Arabic','ar'),
 54+(u'Esperanto','eo'),
 55+(u'Serbian','sr'),
 56+(u'Indonesian','id'),
 57+(u'Lithuanian','lt'),
 58+(u'Volapük','vo'),
 59+(u'Slovak','sk'),
 60+(u'Hebrew','he'),
 61+(u'Bulgarian','bg'),
 62+(u'Persian','fa'),
 63+(u'Slovenian','sl'),
 64+(u'Waray-Waray','war'),
 65+(u'Croatian','hr'),
 66+(u'Estonian','et'),
 67+(u'Malay','ms'),
 68+(u'Newar','new'),
 69+(u'Nepal Bhasa','new'),
 70+(u'Simple English','simple'),
 71+(u'Galician','gl'),
 72+(u'Thai','th'),
 73+(u'Aromanian','roa-rup'),
 74+(u'Nynorsk','nn'),
 75+(u'Basque','eu'),
 76+(u'Hindi','hi'),
 77+(u'Greek','el'),
 78+(u'Haitian','ht'),
 79+(u'Latin','la'),
 80+(u'Telugu','te'),
 81+(u'Georgian','ka'),
 82+(u'Cebuano','ceb'),
 83+(u'Macedonian','mk'),
 84+(u'Azeri','az'),
 85+(u'Tagalog','tl'),
 86+(u'Breton','br'),
 87+(u'Serbo-Croatian','sh'),
 88+(u'Marathi','mr'),
 89+(u'Luxembourgish','lb'),
 90+(u'Javanese','jv'),
 91+(u'Latvian','lv'),
 92+(u'Bosnian','bs'),
 93+(u'Icelandic','is'),
 94+(u'Welsh','cy'),
 95+(u'Belarusian','be-x-old'),
 96+(u'Taraškievica','be-x-old'),
 97+(u'Piedmontese','pms'),
 98+(u'Albanian','sq'),
 99+(u'Tamil','ta'),
 100+(u'Bishnupriya Manipuri','bpy'),
 101+(u'Belarusian','be'),
 102+(u'Aragonese','an'),
 103+(u'Occitan','oc'),
 104+(u'Bengali','bn'),
 105+(u'Swahili','sw'),
 106+(u'Ido','io'),
 107+(u'Ripuarian','ksh'),
 108+(u'Lombard','lmo'),
 109+(u'West Frisian','fy'),
 110+(u'Gujarati','gu'),
 111+(u'Low Saxon','nds'),
 112+(u'Afrikaans','af'),
 113+(u'Sicilian','scn'),
 114+(u'Quechua','qu'),
 115+(u'Kurdish','ku'),
 116+(u'Urdu','ur'),
 117+(u'Sundanese','su'),
 118+(u'Malayalam','ml'),
 119+(u'Cantonese','zh-yue'),
 120+(u'Asturian','ast'),
 121+(u'Neapolitan','nap'),
 122+(u'Samogitian','bat-smg'),
 123+(u'Walloon','wa'),
 124+(u'Chuvash','cv'),
 125+(u'Irish','ga'),
 126+(u'Armenian','hy'),
 127+(u'Yoruba','yo'),
 128+(u'Kannada','kn'),
 129+(u'Tajik','tg'),
 130+(u'Tarantino','roa-tara'),
 131+(u'Venetian','vec'),
 132+(u'Western Panjabi','pnb'),
 133+(u'Nepali','ne'),
 134+(u'Scottish Gaelic','gd'),
 135+(u'Yiddish','yi'),
 136+(u'Min Nan','zh-min-nan'),
 137+(u'Uzbek','uz'),
 138+(u'Tatar','tt'),
 139+(u'Kapampangan','pam'),
 140+(u'Ossetian','os'),
 141+(u'Sakha','sah'),
 142+(u'Alemannic','als'),
 143+(u'Maori','mi'),
 144+(u'Egyptian Arabic','arz'),
 145+(u'Kazakh','kk'),
 146+(u'Nahuatl','nah'),
 147+(u'Limburgian','li'),
 148+(u'Upper Sorbian','hsb'),
 149+(u'Gilaki','glk'),
 150+(u'Corsican','co'),
 151+(u'Gan','gan'),
 152+(u'Amharic','am'),
 153+(u'Mongolian','mn'),
 154+(u'Interlingua','ia'),
 155+(u'Central Bicolano','bcl'),
 156+(u'Võro','fiu-vro'),
 157+(u'Dutch Low Saxon','nds-nl'),
 158+(u'Faroese','fo'),
 159+(u'Turkmen','tk'),
 160+(u'Scots','sco'),
 161+(u'West Flemish','vls'),
 162+(u'Sinhalese','si'),
 163+(u'Sanskrit','sa'),
 164+(u'Bavarian','bar'),
 165+(u'Burmese','my'),
 166+(u'Manx','gv'),
 167+(u'Divehi','dv'),
 168+(u'Norman','nrm'),
 169+(u'Pangasinan','pag'),
 170+(u'Romansh','rm'),
 171+(u'Banyumasan','map-bms'),
 172+(u'Zazaki','diq'),
 173+(u'Sorani','ckb'),
 174+(u'Northern Sami','se'),
 175+(u'Mazandarani','mzn'),
 176+(u'Wu','wuu'),
 177+(u'Uyghur','ug'),
 178+(u'Friulian','fur'),
 179+(u'Ligurian','lij'),
 180+(u'Maltese','mt'),
 181+(u'Bihari','bh'),
 182+(u'Novial','nov'),
 183+(u'Malagasy','mg'),
 184+(u'Kashubian','csb'),
 185+(u'Ilokano','ilo'),
 186+(u'Sardinian','sc'),
 187+(u'Classical Chinese','zh-classical'),
 188+(u'Khmer','km'),
 189+(u'Ladino','lad'),
 190+(u'Pali','pi'),
 191+(u'Anglo-Saxon','ang'),
 192+(u'Zamboanga Chavacano','cbk-zam'),
 193+(u'Tibetan','bo'),
 194+(u'Fiji Hindi','hif'),
 195+(u'Franco-Provençal','frp'),
 196+(u'Arpitan','frp'),
 197+(u'Hakka','hak'),
 198+(u'Cornish','kw'),
 199+(u'Punjabi','pa'),
 200+(u'Pashto','ps'),
 201+(u'Kalmyk','xal'),
 202+(u'Silesian','szl'),
 203+(u'Pennsylvania German','pdc'),
 204+(u'Hawaiian','haw'),
 205+(u'Saterland Frisian','stq'),
 206+(u'Interlingue','ie'),
 207+(u'Navajo','nv'),
 208+(u'Fijian','fj'),
 209+(u'Crimean Tatar','crh'),
 210+(u'Komi','kv'),
 211+(u'Tongan','to'),
 212+(u'Acehnese','ace'),
 213+(u'Somali','so'),
 214+(u'Erzya','myv'),
 215+(u'Guarani','gn'),
 216+(u'Karachay-Balkar','krc'),
 217+(u'Extremaduran','ext'),
 218+(u'Lingala','ln'),
 219+(u'Kirghiz','ky'),
 220+(u'Meadow Mari','mhr'),
 221+(u'Assyrian Neo-Aramaic','arc'),
 222+(u'Emilian-Romagnol','eml'),
 223+(u'Lojban','jbo'),
 224+(u'Picard','pcd'),
 225+(u'Aymara','ay'),
 226+(u'Wolof','wo'),
 227+(u'Tumbuka','tum'),
 228+(u'Kabyle','kab'),
 229+(u'Bashkir','ba'),
 230+(u'North Frisian','frr'),
 231+(u'Tahitian','ty'),
 232+(u'Tok Pisin','tpi'),
 233+(u'Papiamentu','pap'),
 234+(u'Zealandic','zea'),
 235+(u'Sranan','srn'),
 236+(u'Greenlandic','kl'),
 237+(u'Udmurt','udm'),
 238+(u'Chechen','ce'),
 239+(u'Igbo','ig'),
 240+(u'Komi-Permyak','koi'),
 241+(u'Oriya','or'),
 242+(u'Lower Sorbian','dsb'),
 243+(u'Kongo','kg'),
 244+(u'Lao','lo'),
 245+(u'Abkhazian','ab'),
 246+(u'Moksha','mdf'),
 247+(u'Romani','rmy'),
 248+(u'Hill Mari','mrj'),
 249+(u'Banjar','bjn'),
 250+(u'Old Church Slavonic','cu'),
 251+(u'Mirandese','mwl'),
 252+(u'Karakalpak','kaa'),
 253+(u'Samoan','sm'),
 254+(u'Moldovan','mo'),
 255+(u'Tetum','tet'),
 256+(u'Avar','av'),
 257+(u'Kashmiri','ks'),
 258+(u'Gothic','got'),
 259+(u'Sindhi','sd'),
 260+(u'Bambara','bm'),
 261+(u'Nauruan','na'),
 262+(u'Norfolk','pih'),
 263+(u'Pontic','pnt'),
 264+(u'Inuktitut','iu'),
 265+(u'Inupiak','ik'),
 266+(u'Bislama','bi'),
 267+(u'Cherokee','chr'),
 268+(u'Assamese','as'),
 269+(u'Min Dong','cdo'),
 270+(u'Ewe','ee'),
 271+(u'Swati','ss'),
 272+(u'Oromo','om'),
 273+(u'Zhuang','za'),
 274+(u'Zulu','zu'),
 275+(u'Tigrinya','ti'),
 276+(u'Venda','ve'),
 277+(u'Tsonga','ts'),
 278+(u'Hausa','ha'),
 279+(u'Dzongkha','dz'),
 280+(u'Sango','sg'),
 281+(u'Chamorro','ch'),
 282+(u'Cree','cr'),
 283+(u'Xhosa','xh'),
 284+(u'Akan','ak'),
 285+(u'Sesotho','st'),
 286+(u'Kinyarwanda','rw'),
 287+(u'Tswana','tn'),
 288+(u'Kikuyu','ki'),
 289+(u'Buryat','bxr'),
 290+(u'Buginese','bug'),
 291+(u'Chichewa','ny'),
 292+(u'Lak','lbe'),
 293+(u'Twi','tw'),
 294+(u'Shona','sn'),
 295+(u'Kirundi','rn'),
 296+(u'Fula','ff'),
 297+(u'Cheyenne','chy'),
 298+(u'Luganda','lg'),
 299+(u'Ndonga','ng'),
 300+(u'Sichuan Yi','ii'),
 301+(u'Choctaw','cho'),
 302+(u'Marshallese','mh'),
 303+(u'Afar','aa'),
 304+(u'Kuanyama','kj'),
 305+(u'Hiri Motu','ho'),
 306+(u'Muscogee','mus'),
 307+(u'Kanuri','kr'),
 308+(u'Herero','hz'),
 309+(u'English','en'),
 310+(u'Deutsch','de'),
 311+(u'Français','fr'),
 312+(u'Italiano','it'),
 313+(u'Polski','pl'),
 314+(u'日本語','ja'),
 315+(u'Español','es'),
 316+(u'Nederlands','nl'),
 317+(u'Português','pt'),
 318+(u'Русский','ru'),
 319+(u'Svenska','sv'),
 320+(u'中文','zh'),
 321+(u'Català','ca'),
 322+(u'Norsk','no'),
 323+(u'Bokmål','no'),
 324+(u'Suomi','fi'),
 325+(u'Українська','uk'),
 326+(u'Magyar','hu'),
 327+(u'Čeština','cs'),
 328+(u'Română','ro'),
 329+(u'Türkçe','tr'),
 330+(u'한국어','ko'),
 331+(u'Tiếng Việt','vi'),
 332+(u'Dansk','da'),
 333+(u'العربية','ar'),
 334+(u'Esperanto','eo'),
 335+(u'Српски','sr'),
 336+(u'Srpski','sr'),
 337+(u'Bahasa Indonesia','id'),
 338+(u'Lietuvių','lt'),
 339+(u'Volapük','vo'),
 340+(u'Slovenčina','sk'),
 341+(u'עברית','he'),
 342+(u'Български','bg'),
 343+(u'فارسی','fa'),
 344+(u'Slovenščina','sl'),
 345+(u'Winaray','war'),
 346+(u'Hrvatski','hr'),
 347+(u'Eesti','et'),
 348+(u'Bahasa Melayu','ms'),
 349+(u'नेपाल भाषा','new'),
 350+(u'Simple English','simple'),
 351+(u'Galego','gl'),
 352+(u'ไทย','th'),
 353+(u'Armãneashce','roa-rup'),
 354+(u'Nynorsk','nn'),
 355+(u'Euskara','eu'),
 356+(u'हिन्दी','hi'),
 357+(u'Ελληνικά','el'),
 358+(u'Krèyol ayisyen','ht'),
 359+(u'Latina','la'),
 360+(u'తెలుగు','te'),
 361+(u'ქართული','ka'),
 362+(u'Sinugboanong Binisaya','ceb'),
 363+(u'Македонски','mk'),
 364+(u'Azərbaycan','az'),
 365+(u'Tagalog','tl'),
 366+(u'Brezhoneg','br'),
 367+(u'Srpskohrvatski','sh'),
 368+(u'Српскохрватски','sh'),
 369+(u'मराठी','mr'),
 370+(u'Lëtzebuergesch','lb'),
 371+(u'Basa Jawa','jv'),
 372+(u'Latviešu','lv'),
 373+(u'Bosanski','bs'),
 374+(u'Íslenska','is'),
 375+(u'Cymraeg','cy'),
 376+(u'Беларуская','be-x-old'),
 377+(u'тарашкевіца','be-x-old'),
 378+(u'Piemontèis','pms'),
 379+(u'Shqip','sq'),
 380+(u'தமிழ்','ta'),
 381+(u'ইমার ঠার','bpy'),
 382+(u'বিষ্ণুপ্রিয়া মণিপুরী','bpy'),
 383+(u'Беларуская','be'),
 384+(u'Aragonés','an'),
 385+(u'Occitan','oc'),
 386+(u'বাংলা','bn'),
 387+(u'Kiswahili','sw'),
 388+(u'Ido','io'),
 389+(u'Ripoarisch','ksh'),
 390+(u'Lumbaart','lmo'),
 391+(u'Frysk','fy'),
 392+(u'ગુજરાતી','gu'),
 393+(u'Plattdüütsch','nds'),
 394+(u'Afrikaans','af'),
 395+(u'Sicilianu','scn'),
 396+(u'Runa Simi','qu'),
 397+(u'Kurdî','ku'),
 398+(u'كوردی','ku'),
 399+(u'اردو','ur'),
 400+(u'Basa Sunda','su'),
 401+(u'മലയാളം','ml'),
 402+(u'粵語','zh-yue'),
 403+(u'Asturianu','ast'),
 404+(u'Nnapulitano','nap'),
 405+(u'Žemaitėška','bat-smg'),
 406+(u'Walon','wa'),
 407+(u'Чăваш','cv'),
 408+(u'Gaeilge','ga'),
 409+(u'Հայերեն','hy'),
 410+(u'Yorùbá','yo'),
 411+(u'ಕನ್ನಡ','kn'),
 412+(u'Тоҷикӣ','tg'),
 413+(u'Tarandíne','roa-tara'),
 414+(u'Vèneto','vec'),
 415+(u'شاہ مکھی پنجابی','pnb'),
 416+(u'Shāhmukhī Pañjābī','pnb'),
 417+(u'नेपाली','ne'),
 418+(u'Gàidhlig','gd'),
 419+(u'ייִדיש','yi'),
 420+(u'Bân-lâm-gú','zh-min-nan'),
 421+(u'O‘zbek','uz'),
 422+(u'Tatarça','tt'),
 423+(u'Татарча','tt'),
 424+(u'Kapampangan','pam'),
 425+(u'Иронау','os'),
 426+(u'Саха тыла','sah'),
 427+(u'Saxa Tyla','sah'),
 428+(u'Alemannisch','als'),
 429+(u'Māori','mi'),
 430+(u'مصرى','arz'),
 431+(u'Maṣrī','arz'),
 432+(u'Қазақша','kk'),
 433+(u'Nāhuatl','nah'),
 434+(u'Limburgs','li'),
 435+(u'Hornjoserbsce','hsb'),
 436+(u'گیلکی','glk'),
 437+(u'Corsu','co'),
 438+(u'贛語','gan'),
 439+(u'አማርኛ','am'),
 440+(u'Монгол','mn'),
 441+(u'Interlingua','ia'),
 442+(u'Bikol','bcl'),
 443+(u'Võro','fiu-vro'),
 444+(u'Nedersaksisch','nds-nl'),
 445+(u'Føroyskt','fo'),
 446+(u'تركمن ','tk'),
 447+(u'Туркмен','tk'),
 448+(u'Scots','sco'),
 449+(u'West-Vlams','vls'),
 450+(u'සිංහල','si'),
 451+(u'संस्कृतम्','sa'),
 452+(u'Boarisch','bar'),
 453+(u'မ္ရန္‌မာစာ','my'), #Needs fix
 454+(u'Gaelg','gv'),
 455+(u'ދިވެހިބަސް','dv'),
 456+(u'Nouormand','nrm'),
 457+(u'Normaund','nrm'),
 458+(u'Pangasinan','pag'),
 459+(u'Rumantsch','rm'),
 460+(u'Basa Banyumasan','map-bms'),
 461+(u'Zazaki','diq'),
 462+(u'Soranî','ckb'),
 463+(u'کوردی','ckb'),
 464+(u'Sámegiella','se'),
 465+(u'مَزِروني','mzn'),
 466+(u'吴语','wuu'),
 467+(u'Oyghurque','ug'),
 468+(u'Furlan','fur'),
 469+(u'Líguru','lij'),
 470+(u'Malti','mt'),
 471+(u'भोजपुरी','bh'),
 472+(u'Novial','nov'),
 473+(u'Malagasy','mg'),
 474+(u'Kaszëbsczi','csb'),
 475+(u'Ilokano','ilo'),
 476+(u'Sardu','sc'),
 477+(u'古文','zh-classical'),
 478+(u'文言文','zh-classical'),
 479+(u'ភាសាខ្មែរ','km'),
 480+(u'Dzhudezmo','lad'),
 481+(u'पाऴि','pi'),
 482+(u'Englisc','ang'),
 483+(u'Chavacano de Zamboanga','cbk-zam'),
 484+(u'བོད་སྐད','bo'),
 485+(u'Fiji Hindi','hif'),
 486+(u'Arpitan','frp'),
 487+(u'Hak-kâ-fa','hak'),
 488+(u'客家話','hak'),
 489+(u'Kernewek','kw'),
 490+(u'Karnuack','kw'),
 491+(u'ਪੰਜਾਬੀ','pa'),
 492+(u'پښتو','ps'),
 493+(u'Хальмг','xal'),
 494+(u'Ślůnski','szl'),
 495+(u'Deitsch','pdc'),
 496+(u'Hawai`i','haw'),
 497+(u'Seeltersk','stq'),
 498+(u'Interlingue','ie'),
 499+(u'Diné bizaad','nv'),
 500+(u'Na Vosa Vakaviti','fj'),
 501+(u'Qırımtatarca','crh'),
 502+(u'Коми','kv'),
 503+(u'faka Tonga','to'),
 504+(u'Bahsa Acèh','ace'),
 505+(u'Soomaaliga','so'),
 506+(u'Эрзянь','myv'),
 507+(u'Erzjanj Kelj','myv'),
 508+(u"Avañe'ẽ",'gn'),
 509+(u'Къарачай-Малкъар','krc'),
 510+(u'Qarachay-Malqar','krc'),
 511+(u'Estremeñu','ext'),
 512+(u'Lingala','ln'),
 513+(u'Кыргызча','ky'),
 514+(u'Олык Марий','mhr'),
 515+(u'Olyk Marij','mhr'),
 516+(u'ܐܪܡܝܐ','arc'),
 517+(u'Emiliàn e rumagnòl','eml'),
 518+(u'Lojban','jbo'),
 519+(u'Picard','pcd'),
 520+(u'Aymar','ay'),
 521+(u'Wolof','wo'),
 522+(u'chiTumbuka','tum'),
 523+(u'Taqbaylit','kab'),
 524+(u'Башҡорт','ba'),
 525+(u'Frasch','frr'),
 526+(u'Reo Mā`ohi','ty'),
 527+(u'Tok Pisin','tpi'),
 528+(u'Papiamentu','pap'),
 529+(u'Zeêuws','zea'),
 530+(u'Sranantongo','srn'),
 531+(u'Kalaallisut','kl'),
 532+(u'Удмурт кыл','udm'),
 533+(u'Нохчийн','ce'),
 534+(u'Igbo','ig'),
 535+(u'Перем Коми','koi'),
 536+(u'Perem Komi','koi'),
 537+(u'ଓଡ଼ିଆ','or'),
 538+(u'Dolnoserbski','dsb'),
 539+(u'KiKongo','kg'),
 540+(u'ລາວ','lo'),
 541+(u'Аҧсуа','ab'),
 542+(u'Мокшень','mdf'),
 543+(u'Mokshanj Kälj','mdf'),
 544+(u'romani - रोमानी','rmy'),
 545+(u'Кырык Мары','mrj'),
 546+(u'Kyryk Mary','mrj'),
 547+(u'Bahasa Banjar','bjn'),
 548+(u'Словѣньскъ','cu'),
 549+(u'Páigina Percipal','mwl'),
 550+(u'Qaraqalpaqsha','kaa'),
 551+(u'Gagana Samoa','sm'),
 552+(u'Молдовеняскэ','mo'),
 553+(u'Tetun','tet'),
 554+(u'Авар','av'),
 555+(u'कश्मीरी','ks'),
 556+(u'كشميري','ks'),
 557+(u'𐌲𐌿𐍄𐌹𐍃𐌺','got'), #Needs fix
 558+(u'سنڌي، سندھی ، सिन्ध','sd'),
 559+(u'Bamanankan','bm'),
 560+(u'dorerin Naoero','na'),
 561+(u'Norfuk','pih'),
 562+(u'Ποντιακά','pnt'),
 563+(u'ᐃᓄᒃᑎᑐᑦ','iu'),
 564+(u'Iñupiak','ik'),
 565+(u'Bislama','bi'),
 566+(u'ᏣᎳᎩ','chr'),
 567+(u'অসমীয়া','as'),
 568+(u'Mìng-dĕ̤ng-ngṳ̄','cdo'),
 569+(u'Eʋegbe','ee'),
 570+(u'SiSwati','ss'),
 571+(u'Oromoo','om'),
 572+(u'Cuengh','za'),
 573+(u'isiZulu','zu'),
 574+(u'ትግርኛ','ti'),
 575+(u'Tshivenda','ve'),
 576+(u'Xitsonga','ts'),
 577+(u'هَوُسَ','ha'),
 578+(u'ཇོང་ཁ','dz'),
 579+(u'Sängö','sg'),
 580+(u'Chamoru','ch'),
 581+(u'Nehiyaw','cr'),
 582+(u'isiXhosa','xh'),
 583+(u'Akana','ak'),
 584+(u'Sesotho','st'),
 585+(u'Ikinyarwanda','rw'),
 586+(u'Setswana','tn'),
 587+(u'Gĩkũyũ','ki'),
 588+(u'Буряад','bxr'),
 589+(u'Basa Ugi','bug'),
 590+(u'Chi-Chewa','ny'),
 591+(u'Лакку','lbe'),
 592+(u'Twi','tw'),
 593+(u'chiShona','sn'),
 594+(u'Kirundi','rn'),
 595+(u'Fulfulde','ff'),
 596+(u'Tsetsêhestâhese','chy'),
 597+(u'Luganda','lg'),
 598+(u'Oshiwambo','ng'),
 599+(u'ꆇꉙ','ii'),
 600+(u'Choctaw','cho'),
 601+(u'Ebon','mh'),
 602+(u'Afar','aa'),
 603+(u'Kuanyama','kj'),
 604+(u'Hiri Motu','ho'),
 605+(u'Muskogee','mus'),
 606+(u'Kanuri','kr'),
 607+(u'Otsiherero','hz'),
 608+])
\ No newline at end of file
Index: trunk/tools/editor_trends/utils/dump_downloader.py
@@ -46,7 +46,7 @@
4747 return - 1
4848
4949
50 -def download_wp_dump(domain, path, filename, location, filemode, pbar):
 50+def download_wiki_file(domain, path, filename, location, filemode, pbar):
5151 '''
5252 This is a very simple replacement for wget and curl because Windows does
5353 not support these tools.
@@ -57,10 +57,13 @@
5858 @pbar is an instance of progressbar.ProgressBar()
5959 '''
6060 chunk = 4096
 61+ result = utils.check_file_exists(location, '')
 62+ if result == False:
 63+ utils.create_directory(os.path.join(location))
6164 if filemode == 'w':
62 - fh = utils.open_txt_file(location, filename, filemode, settings.ENCODING)
 65+ fh = utils.create_txt_filehandle(location, filename, filemode, settings.ENCODING)
6366 else:
64 - fh = utils.open_binary_file(location, filename, filemode)
 67+ fh = utils.create_binary_filehandle(location, filename, 'wb')
6568
6669 filesize = determine_remote_filesize(domain, path + filename)
6770
@@ -73,9 +76,12 @@
7477 pbar = progressbar.ProgressBar(widgets=widgets,maxval=filesize).start()
7578 else:
7679 pbar = False
77 -
78 - req = urllib2.Request(domain + path + filename)
 80+
7981 try:
 82+ if filename.endswith('json'):
 83+ req = urllib2.Request(domain + path)
 84+ else:
 85+ req = urllib2.Request(domain + path + filename)
8086 response = urllib2.urlopen(req)
8187 while True:
8288 data = response.read(chunk)

Status & tagging log