Index: trunk/tools/editor_trends/manage.py |
— | — | @@ -23,14 +23,16 @@ |
24 | 24 | from argparse import ArgumentParser |
25 | 25 | from argparse import RawTextHelpFormatter |
26 | 26 | import locale |
27 | | - |
28 | 27 | import progressbar |
29 | 28 | |
30 | | -import settings |
| 29 | +sys.path.append('..') |
| 30 | +import configuration |
| 31 | +settings = configuration.Settings() |
| 32 | + |
31 | 33 | import languages |
32 | 34 | from utils import utils |
33 | 35 | from utils import dump_downloader |
34 | | -import split_xml_file |
| 36 | +from etl import chunker |
35 | 37 | import map_wiki_editors |
36 | 38 | import optimize_editors |
37 | 39 | import construct_datasets |
— | — | @@ -85,10 +87,11 @@ |
86 | 88 | |
87 | 89 | def determine_file_locations(args): |
88 | 90 | locations = {} |
89 | | - location = get_value(args, 'location') if get_value(args, 'location') != None else settings.XML_FILE_LOCATION |
| 91 | + location = get_value(args, 'location') if get_value(args, 'location') != None else settings.input_location |
90 | 92 | project = retrieve_project(args) |
91 | 93 | language_code = retrieve_language(args) |
92 | 94 | locations['language_code'] = language_code |
| 95 | + locations['language'] = get_value(args, 'language') |
93 | 96 | locations['location'] = os.path.join(location, language_code, project) |
94 | 97 | locations['project'] = project |
95 | 98 | locations['full_project'] = retrieve_projectname(args) |
— | — | @@ -96,65 +99,61 @@ |
97 | 100 | return locations |
98 | 101 | |
99 | 102 | |
100 | | -def prepare_file_locations(location): |
101 | | - result = utils.check_file_exists(location, '') |
102 | | - if result == False: |
103 | | - utils.create_directory(os.path.join(location)) |
104 | 103 | |
105 | 104 | |
106 | | -def show_settings(args, location, filename, project, full_project, language_code): |
107 | | - project = settings.WIKIMEDIA_PROJECTS.get(project, 'wiki') |
| 105 | +def show_settings(args, location, filename, project, full_project, language_code, language): |
| 106 | + project = settings.projects.get(project, 'wiki') |
108 | 107 | project = project.title() |
109 | 108 | language_map = utils.invert_dict(languages.MAPPING) |
110 | 109 | print 'Project: %s' % (project) |
111 | | - print 'Language: %s' % language_map[language_code].decode('utf-8') |
| 110 | + print 'Language: %s / %s' % (language_map[language_code].decode(settings.encoding), language.decode(settings.encoding)) |
112 | 111 | print 'Input directory: %s' % location |
113 | 112 | print 'Output directory: %s and subdirectories' % location |
114 | 113 | |
115 | 114 | |
116 | | -def dump_downloader_launcher(args, location, filename, project, full_project, language_code): |
| 115 | +def dump_downloader_launcher(args, location, filename, project, full_project, language_code, language): |
117 | 116 | print 'dump downloader' |
118 | 117 | pbar = get_value(args, 'progress') |
119 | | - domain = settings.WP_DUMP_LOCATION |
| 118 | + domain = settings.wp_dump_location |
120 | 119 | path = '/%s/latest/' % project |
121 | 120 | extension = utils.determine_file_extension(filename) |
122 | 121 | filemode = utils.determine_file_mode(extension) |
123 | 122 | dump_downloader.download_wiki_file(domain, path, filename, location, filemode, pbar) |
124 | 123 | |
125 | 124 | |
126 | | -def split_xml_file_launcher(args, location, filename, project, full_project, language_code): |
127 | | - print 'split_xml_file_launcher' |
| 125 | +def cruncher_launcher(args, location, filename, project, full_project, language_code, language): |
| 126 | + print 'split_settings.input_filename_launcher' |
128 | 127 | ext = utils.determine_file_extension(filename) |
129 | | - if ext in settings.COMPRESSION_EXTENSIONS: |
| 128 | + if ext in settings.compression_extensions: |
130 | 129 | ext = '.%s' % ext |
131 | 130 | file = filename.replace(ext, '') |
132 | 131 | result = utils.check_file_exists(location, file) |
133 | 132 | if not result: |
134 | | - retcode = extract_xml_file(args, location, filename) |
| 133 | + retcode = launch_zip_extractor(args, location, filename) |
135 | 134 | else: |
136 | 135 | retcode = 0 |
137 | 136 | if retcode != 0: |
138 | 137 | sys.exit(retcode) |
139 | | - split_xml_file.split_xml(location, file, project, language_code) |
| 138 | + chunker.split_file(location, file, project, language_code, language) |
140 | 139 | |
141 | 140 | |
142 | | -def extract_xml_file(args, location, file): |
143 | | - path = config.detect_installed_program('7zip') |
| 141 | +def launch_zip_extractor(args, location, file): |
| 142 | + path = settings.detect_installed_program('7zip') |
144 | 143 | source = os.path.join(location, file) |
145 | 144 | p = None |
146 | 145 | |
147 | | - if settings.OS == 'Windows': |
| 146 | + if settings.platform == 'Windows': |
148 | 147 | p = subprocess.Popen(['%s%s' % (path, '7z.exe'), 'e', '-o%s\\' % location, '%s' % (source,)], shell=True).wait() |
149 | | - elif settings.OS == 'Linux': |
| 148 | + elif settings.platform == 'Linux': |
150 | 149 | raise NotImplementedError |
151 | | - elif settings.OS == 'OSX': |
| 150 | + elif settings.platform == 'OSX': |
152 | 151 | raise NotImplementedError |
153 | 152 | else: |
154 | 153 | raise exceptions.PlatformNotSupportedError |
155 | 154 | return p |
156 | 155 | |
157 | 156 | |
158 | | -def mongodb_script_launcher(args, location, filename, project, full_project, language_code): |
| 157 | +def mongodb_script_launcher(args, location, filename, project, full_project, language_code, language): |
159 | 158 | print 'mongodb_script_launcher' |
160 | 159 | map_wiki_editors.run_parse_editors(project, language_code, location) |
161 | 160 | |
— | — | @@ -169,21 +168,21 @@ |
170 | 169 | construct_datasets.generate_editor_dataset_launcher(project) |
171 | 170 | |
172 | 171 | |
173 | | -def all_launcher(args, location, filename, project, full_project, language_code): |
| 172 | +def all_launcher(args, location, filename, project, full_project, language_code, language): |
174 | 173 | print 'all_launcher' |
175 | 174 | dump_downloader_launcher(args, location, filename, project, language_code) |
176 | | - split_xml_file_launcher(args, location, filename, project, language_code) |
| 175 | + split_settings.input_filename_launcher(args, location, filename, project, language_code) |
177 | 176 | mongodb_script_launcher(args, location, filename, project, language_code) |
178 | 177 | dataset_launcher(args, location, filename, project, language_code) |
179 | 178 | |
180 | 179 | |
181 | 180 | def supported_languages(): |
182 | 181 | choices = languages.MAPPING.keys() |
183 | | - choices = [c.encode(settings.ENCODING) for c in choices] |
| 182 | + choices = [c.encode(settings.encoding) for c in choices] |
184 | 183 | return tuple(choices) |
185 | 184 | |
186 | 185 | |
187 | | -def show_languages(args, location, filename, project, full_project, language_code): |
| 186 | +def show_languages(args, location, filename, project, full_project, language_code, language): |
188 | 187 | first = get_value(args, 'startswith') |
189 | 188 | if first != None: |
190 | 189 | first = first.title() |
— | — | @@ -195,16 +194,16 @@ |
196 | 195 | for language in languages: |
197 | 196 | try: |
198 | 197 | if first != None and language.startswith(first): |
199 | | - print '%s' % language.decode('utf-8') |
| 198 | + print '%s' % language.decode(settings.encoding) |
200 | 199 | elif first == None: |
201 | | - print '%s' % language.decode('utf-8') |
| 200 | + print '%s' % language.decode(settings.encoding) |
202 | 201 | except UnicodeEncodeError: |
203 | 202 | print '%s' % language |
204 | 203 | |
205 | 204 | |
206 | 205 | def detect_python_version(): |
207 | 206 | version = sys.version_info[0:2] |
208 | | - if version < settings.MINIMUM_PYTHON_VERSION: |
| 207 | + if version < settings.minimum_python_version: |
209 | 208 | raise 'Please upgrade to Python 2.6 or higher (but not Python 3.x).' |
210 | 209 | |
211 | 210 | def about(): |
— | — | @@ -238,7 +237,7 @@ |
239 | 238 | parser_download.set_defaults(func=dump_downloader_launcher) |
240 | 239 | |
241 | 240 | parser_split = subparsers.add_parser('split', help='The split sub command splits the downloaded file in smaller chunks to parallelize extracting information.') |
242 | | - parser_split.set_defaults(func=split_xml_file_launcher) |
| 241 | + parser_split.set_defaults(func=cruncher_launcher) |
243 | 242 | |
244 | 243 | parser_sort = subparsers.add_parser('sort', help='By presorting the data, significant processing time reducations are achieved.') |
245 | 244 | parser_sort.set_defaults(func=sort_launcher) |
— | — | @@ -259,12 +258,12 @@ |
260 | 259 | |
261 | 260 | parser.add_argument('-p', '--project', action='store', |
262 | 261 | help='Specify the Wikimedia project that you would like to download', |
263 | | - choices=settings.WIKIMEDIA_PROJECTS.keys(), |
| 262 | + choices=settings.projects.keys(), |
264 | 263 | default='wiki') |
265 | 264 | |
266 | 265 | parser.add_argument('-o', '--location', action='store', |
267 | 266 | help='Indicate where you want to store the downloaded file.', |
268 | | - default=settings.XML_FILE_LOCATION) |
| 267 | + default=settings.input_location) |
269 | 268 | |
270 | 269 | parser.add_argument('-f', '--file', action='store', |
271 | 270 | choices=file_choices, |
— | — | @@ -275,11 +274,12 @@ |
276 | 275 | help='Indicate whether you want to have a progressbar.') |
277 | 276 | |
278 | 277 | detect_python_version() |
| 278 | + about() |
279 | 279 | args = parser.parse_args() |
280 | 280 | config.load_configuration(args) |
281 | 281 | locations = determine_file_locations(args) |
282 | | - prepare_file_locations(locations['location']) |
283 | | - about() |
| 282 | + #prepare_file_locations(locations['location']) |
| 283 | + settings.verify_environment([locations['location']]) |
284 | 284 | show_settings(args, **locations) |
285 | 285 | args.func(args, **locations) |
286 | 286 | |