Index: trunk/tools/editor_trends/config.py |
— | — | @@ -13,7 +13,7 @@ |
14 | 14 | ''' |
15 | 15 | |
16 | 16 | __author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
17 | | -__author__email = 'dvanliere at gmail dot com' |
| 17 | +__email__ = 'dvanliere at gmail dot com' |
18 | 18 | __date__ = '2010-10-21' |
19 | 19 | __version__ = '0.1' |
20 | 20 | |
— | — | @@ -22,7 +22,7 @@ |
23 | 23 | import ConfigParser |
24 | 24 | |
25 | 25 | from utils import utils |
26 | | -import languages |
| 26 | +from classes import wikiprojects |
27 | 27 | |
28 | 28 | |
29 | 29 | def show_choices(settings, attr): |
— | — | @@ -34,12 +34,13 @@ |
35 | 35 | |
36 | 36 | def create_configuration(settings, args): |
37 | 37 | force = getattr(args, 'force', False) |
| 38 | + |
38 | 39 | if not os.path.exists('wiki.cfg') or force: |
39 | 40 | config = ConfigParser.RawConfigParser() |
40 | 41 | project = None |
41 | 42 | language = None |
42 | 43 | dumpversion = None |
43 | | - language_map = languages.language_map() |
| 44 | + #language_map = languages.language_map() |
44 | 45 | working_directory = raw_input('Please indicate where you installed Editor Trends Analytics.\nCurrent location is %s\nPress Enter to accept default.\n' % os.getcwd()) |
45 | 46 | input_location = raw_input('Please indicate where to store the Wikipedia dump files.\nDefault is: %s\nPress Enter to accept default.\n' % settings.input_location) |
46 | 47 | |
— | — | @@ -49,19 +50,20 @@ |
50 | 51 | if project not in settings.projects.keys(): |
51 | 52 | print 'Valid choices for a project are: %s' % ','.join(settings.projects.keys()) |
52 | 53 | |
53 | | - while language not in languages.MAPPING: |
| 54 | + wiki = wikiprojects.Wiki(settings.encoding, project=project) |
| 55 | + while language not in wiki.valid_languages: |
54 | 56 | language = raw_input('Please indicate which language of project %s you would like to analyze.\nDefault is: %s\nPress Enter to accept default.\n' % (settings.projects[project].capitalize(), language_map[args.language])) |
55 | 57 | if len(language) == 0: |
56 | 58 | language = language_map[args.language] |
57 | | - language = language if language in languages.MAPPING else args.language |
| 59 | + language = language if language in wiki.valid_languages else args.language |
58 | 60 | |
59 | | - while dumpversion not in settings.dumpversions.keys(): |
60 | | - choices = '\n'.join(show_choices(settings, 'dumpversions')) |
61 | | - dumpversion = raw_input('Please indicate the version of the Wikipedia project you are analyzing.\nValid choices are:\n%s\nDefault is: 0 (%s)\nPress Enter to accept default.\n' % (choices, settings.dumpversions['0'])) |
62 | | - if len(dumpversion) == 0: |
63 | | - dumpversion = settings.dumpversions['0'] |
| 61 | +# while dumpversion not in settings.dumpversions.keys(): |
| 62 | +# choices = '\n'.join(show_choices(settings, 'dumpversions')) |
| 63 | +# dumpversion = raw_input('Please indicate the version of the Wikipedia project you are analyzing.\nValid choices are:\n%s\nDefault is: 0 (%s)\nPress Enter to accept default.\n' % (choices, settings.dumpversions['0'])) |
| 64 | +# if len(dumpversion) == 0: |
| 65 | +# dumpversion = settings.dumpversions['0'] |
64 | 66 | |
65 | | - dumpversion = settings.dumpversions[dumpversion] |
| 67 | + #dumpversion = settings.dumpversions[dumpversion] |
66 | 68 | input_location = input_location if len(input_location) > 0 else settings.input_location |
67 | 69 | working_directory = working_directory if len(working_directory) > 0 else os.getcwd() |
68 | 70 | |
— | — | @@ -72,15 +74,15 @@ |
73 | 75 | config.add_section('wiki') |
74 | 76 | config.set('wiki', 'project', project) |
75 | 77 | config.set('wiki', 'language', language) |
76 | | - config.set('wiki', 'dumpversion', dumpversion) |
| 78 | + #config.set('wiki', 'dumpversion', dumpversion) |
77 | 79 | |
78 | | - fh = utils.create_binary_filehandle(working_directory, 'wiki.cfg', 'wb') |
| 80 | + fh = file_utils.create_binary_filehandle(working_directory, 'wiki.cfg', 'wb') |
79 | 81 | config.write(fh) |
80 | 82 | fh.close() |
81 | 83 | |
82 | 84 | settings.working_directory = config.get('file_locations', 'working_directory') |
83 | 85 | settings.input_location = config.get('file_locations', 'input_location') |
84 | | - settings.xml_namespace = config.get('wiki', 'dumpversion') |
| 86 | + #settings.xml_namespace = config.get('wiki', 'dumpversion') |
85 | 87 | return settings |
86 | 88 | |
87 | 89 | |
Index: trunk/tools/editor_trends/configuration.py |
— | — | @@ -13,7 +13,7 @@ |
14 | 14 | ''' |
15 | 15 | |
16 | 16 | __author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
17 | | -__author__email = 'dvanliere at gmail dot com' |
| 17 | +__email__ = 'dvanliere at gmail dot com' |
18 | 18 | __date__ = '2010-10-21' |
19 | 19 | __version__ = '0.1' |
20 | 20 | |
— | — | @@ -29,6 +29,9 @@ |
30 | 30 | import platform |
31 | 31 | import subprocess |
32 | 32 | |
| 33 | + |
| 34 | +from classes import exceptions |
| 35 | + |
33 | 36 | try: |
34 | 37 | from _winreg import * |
35 | 38 | from pywin import win32file |
— | — | @@ -58,156 +61,146 @@ |
59 | 62 | else: |
60 | 63 | return cls.instance |
61 | 64 | |
62 | | -class Settings(object): |
63 | | - __metaclass__ = Singleton |
| 65 | +class Settings: |
| 66 | + __metaclass__ = Singleton |
64 | 67 | |
65 | | - def __init__(self, debug=True, process_multiplier=1, **kwargs): |
66 | | - self.debug = debug |
67 | | - self.progressbar = True |
68 | | - self.encoding = 'utf-8' |
| 68 | + def __init__(self, debug=True, process_multiplier=1, **kwargs): |
| 69 | + self.minimum_python_version = (2, 6) |
| 70 | + self.detect_python_version() |
| 71 | + self.debug = debug |
| 72 | + self.progressbar = True |
| 73 | + self.encoding = 'utf-8' |
69 | 74 | |
70 | | - #Date format as used by Erik Zachte |
71 | | - self.date_format = '%Y-%m-%d' |
| 75 | + #Date format as used by Erik Zachte |
| 76 | + self.date_format = '%Y-%m-%d' |
72 | 77 | |
73 | | - # Timestamp format as generated by the MediaWiki dumps |
74 | | - self.timestamp_format = '%Y-%m-%dT%H:%M:%SZ' |
| 78 | + # Timestamp format as generated by the MediaWiki dumps |
| 79 | + self.timestamp_format = '%Y-%m-%dT%H:%M:%SZ' |
75 | 80 | |
76 | | - #67108864 # ==64Mb, see http://hadoop.apache.org/common/docs/r0.20.0/hdfs_design.html#Large+Data+Setsfor reason |
77 | | - self.max_xmlfile_size = 4096 * 1024 |
| 81 | + #67108864 # ==64Mb, see http://hadoop.apache.org/common/docs/r0.20.0/hdfs_design.html#Large+Data+Setsfor reason |
| 82 | + self.max_xmlfile_size = 4096 * 1024 |
78 | 83 | |
79 | | - #Change this to match your computers configuration (RAM / CPU) |
80 | | - self.number_of_processes = cpu_count() * process_multiplier |
| 84 | + #Change this to match your computers configuration (RAM / CPU) |
| 85 | + self.number_of_processes = cpu_count() * process_multiplier |
81 | 86 | |
82 | | - self.minimum_python_version = (2, 6) |
83 | | - self.wp_dump_location = 'http://download.wikimedia.org' |
84 | | - self.xml_namespace = 'http://www.mediawiki.org/xml/export-0.4/' |
85 | | - self.ascii_extensions = ['txt', 'csv', 'xml', 'sql', 'json'] |
86 | | - self.windows_register = {'7z.exe': 'Software\\7-Zip', } |
87 | | - #Extensions of ascii files, this is used to determine the filemode to use |
88 | | - self.platform = self.determine_platform() |
| 87 | + self.wp_dump_location = 'http://download.wikimedia.org' |
| 88 | + self.xml_namespace = 'http://www.mediawiki.org/xml/export-0.4/' |
| 89 | + self.ascii_extensions = ['txt', 'csv', 'xml', 'sql', 'json'] |
| 90 | + self.windows_register = {'7z.exe': 'Software\\7-Zip', } |
| 91 | + #Extensions of ascii files, this is used to determine the filemode to use |
| 92 | + self.platform = self.determine_platform() |
89 | 93 | |
90 | | - self.architecture = platform.machine() |
91 | | - self.working_directory = self.determine_working_directory() |
92 | | - self.update_python_path() |
| 94 | + self.architecture = platform.machine() |
| 95 | + self.working_directory = self.determine_working_directory() |
| 96 | + self.update_python_path() |
93 | 97 | |
94 | | - self.root = '/' if self.platform != 'Windows' else 'c:\\' |
95 | | - self.file_locations = self.set_file_locations() |
96 | | - self.max_filehandles = self.determine_max_filehandles_open() |
97 | | - self.tab_width = 4 if self.platform == 'Windows' else 8 |
| 98 | + self.root = '/' if self.platform != 'Windows' else 'c:\\' |
| 99 | + self.max_filehandles = self.determine_max_filehandles_open() |
| 100 | + self.tab_width = 4 if self.platform == 'Windows' else 8 |
98 | 101 | |
99 | | - self.load_configuration() |
100 | | - self.set_custom_settings(**kwargs) |
101 | | - self.dumpversions = {'0': 'http://www.mediawiki.org/xml/export-0.4/', |
102 | | - '1': 'http://www.mediawiki.org/xml/export-0.3/', |
103 | | - } |
104 | | - self.projects = {'wiki': 'wikipedia', |
105 | | - 'commons': 'commonswiki', |
106 | | - 'books': 'wikibooks', |
107 | | - 'news': 'wikinews', |
108 | | - 'quote': 'wikiquote', |
109 | | - 'source': 'wikisource', |
110 | | - 'versity': 'wikiversity', |
111 | | - 'tionary': 'wiktionary', |
112 | | - 'meta': 'metawiki', |
113 | | - 'species': 'specieswiki', |
114 | | - 'incubator': 'incubatorwiki', |
115 | | - 'foundation': 'foundationwiki', |
116 | | - 'mediawiki': 'mediawikiwiki', |
117 | | - 'outreach': 'outreachwiki', |
118 | | - 'strategic_planning': 'strategywiki', |
119 | | - 'usability_initiative': 'usabilitywiki', |
120 | | - 'multilingual_wikisource': None |
121 | | - } |
| 102 | + self.load_configuration() |
| 103 | + self.set_custom_settings(**kwargs) |
122 | 104 | |
123 | 105 | |
124 | | - def set_custom_settings(self, **kwargs): |
125 | | - for kw in kwargs: |
126 | | - setattr(self, kw, kwargs[kw]) |
| 106 | + self.input_location = os.path.join(self.root, 'wikimedia') |
| 107 | + # Default Input file |
| 108 | + self.input_filename = os.path.join(self.input_location, 'en', |
| 109 | + 'wiki', |
| 110 | + 'enwiki-20100916-stub-meta-history.xml') |
| 111 | + # This is the place where error messages are stored for debugging purposes |
| 112 | + self.log_location = os.path.join(self.working_directory, |
| 113 | + 'logs') |
| 114 | + self.csv_location = os.path.join(self.working_directory, |
| 115 | + 'data', 'csv') |
| 116 | + self.dataset_location = os.path.join(self.working_directory, 'datasets') |
| 117 | + self.binary_location = os.path.join(self.working_directory, |
| 118 | + 'data', 'objects') |
| 119 | + self.namespace_location = os.path.join(self.working_directory, |
| 120 | + 'namespaces') |
| 121 | + self.chart_location = os.path.join(self.working_directory, 'statistics', |
| 122 | + 'charts') |
| 123 | + self.file_choices = ('stub-meta-history.xml.gz', |
| 124 | + 'stub-meta-current.xml.gz', |
| 125 | + 'pages-meta-history.xml.7z', |
| 126 | + 'pages-meta-current.xml.bz2',) |
127 | 127 | |
128 | | - def load_configuration(self): |
129 | | - if os.path.exists(os.path.join(self.working_directory, 'wiki.cfg')): |
130 | | - config = ConfigParser.RawConfigParser() |
131 | | - config.read(os.path.join(self.working_directory, 'wiki.cfg')) |
132 | | - self.working_directory = config.get('file_locations', 'working_directory') |
133 | | - self.input_location = config.get('file_locations', 'input_location') |
134 | | - self.default_project = config.get('wiki', 'project') |
135 | | - self.default_language = config.get('wiki', 'language') |
| 128 | + def set_custom_settings(self, **kwargs): |
| 129 | + for kw in kwargs: |
| 130 | + setattr(self, kw, kwargs[kw]) |
136 | 131 | |
137 | | - def determine_working_directory(self): |
138 | | - cwd = os.getcwd() |
139 | | - if not cwd.endswith('editor_trends%s' % os.sep): |
140 | | - pos = cwd.find('editor_trends') + 14 |
141 | | - cwd = cwd[:pos] |
142 | | - return cwd |
| 132 | + def load_configuration(self): |
| 133 | + if os.path.exists(os.path.join(self.working_directory, 'wiki.cfg')): |
| 134 | + config = ConfigParser.RawConfigParser() |
| 135 | + config.read(os.path.join(self.working_directory, 'wiki.cfg')) |
| 136 | + self.working_directory = config.get('file_locations', 'working_directory') |
| 137 | + self.input_location = config.get('file_locations', 'input_location') |
| 138 | + self.default_project = config.get('wiki', 'project') |
| 139 | + self.default_language = config.get('wiki', 'language') |
143 | 140 | |
144 | | - def determine_platform(self): |
145 | | - os = platform.system() |
146 | | - if os == 'Darwin': |
147 | | - return 'OSX' |
148 | | - else: |
149 | | - return os |
| 141 | + def determine_working_directory(self): |
| 142 | + cwd = os.getcwd() |
| 143 | + if not cwd.endswith('editor_trends%s' % os.sep): |
| 144 | + pos = cwd.find('editor_trends') + 14 |
| 145 | + cwd = cwd[:pos] |
| 146 | + return cwd |
150 | 147 | |
151 | | - def verify_environment(self, directories): |
152 | | - for dir in directories: |
153 | | - if not os.path.exists(dir): |
154 | | - try: |
155 | | - os.makedirs(dir) |
156 | | - except IOError: |
157 | | - raise 'Configuration Error, could not create directory.' |
| 148 | + def detect_python_version(self): |
| 149 | + version = sys.version_info[0:2] |
| 150 | + #logger.debug('Python version: %s' % '.'.join(str(version))) |
| 151 | + if version < self.minimum_python_version: |
| 152 | + raise exceptions.OutDatedPythonVersionError |
158 | 153 | |
159 | | - def detect_windows_program(self, program): |
160 | | - entry = self.windows_register.get(program, None) |
161 | | - try: |
162 | | - key = OpenKey(HKEY_CURRENT_USER, entry, 0, KEY_READ) |
163 | | - return QueryValueEx(key, 'Path')[0] |
164 | | - except WindowsError: |
165 | | - return None |
| 154 | + def determine_platform(self): |
| 155 | + if platform.system() == 'Darwin': |
| 156 | + return 'OSX' |
| 157 | + else: |
| 158 | + return platform.system() |
166 | 159 | |
167 | | - def detect_linux_program(self, program): |
168 | | - path = subprocess.Popen(['which', '%s' % program], stdout=subprocess.PIPE).communicate()[0] |
169 | | - return path.replace('\n', '') |
| 160 | + def verify_environment(self, directories): |
| 161 | + for dir in directories: |
| 162 | + if not os.path.exists(dir): |
| 163 | + try: |
| 164 | + os.makedirs(dir) |
| 165 | + except IOError: |
| 166 | + raise 'Configuration Error, could not create directory.' |
170 | 167 | |
171 | | - def detect_installed_program(self, program): |
172 | | - if self.platform == 'Windows': |
173 | | - if not program.endswith('.exe'): |
174 | | - program = program + '.exe' |
175 | | - path = self.detect_windows_program(program) |
176 | | - if path != None: |
177 | | - path = path + program |
178 | | - elif self.platform == 'Linux': |
179 | | - path = self.detect_linux_program(program) |
| 168 | + def detect_windows_program(self, program): |
| 169 | + entry = self.windows_register.get(program, None) |
| 170 | + try: |
| 171 | + key = OpenKey(HKEY_CURRENT_USER, entry, 0, KEY_READ) |
| 172 | + return QueryValueEx(key, 'Path')[0] |
| 173 | + except WindowsError: |
| 174 | + return None |
180 | 175 | |
181 | | - return path |
| 176 | + def detect_linux_program(self, program): |
| 177 | + path = subprocess.Popen(['which', '%s' % program], stdout=subprocess.PIPE).communicate()[0] |
| 178 | + return path.strip() |
182 | 179 | |
183 | | - def determine_max_filehandles_open(self): |
184 | | - if self.platform == 'Windows' and self.architecture == 'i386': |
185 | | - return win32file._getmaxstdio() |
186 | | - elif self.platform != 'Windows': |
187 | | - return resource.getrlimit(resource.RLIMIT_NOFILE)[0] - 100 |
188 | | - else: |
189 | | - return 500 |
| 180 | + def detect_installed_program(self, program): |
| 181 | + if self.platform == 'Windows': |
| 182 | + if not program.endswith('.exe'): |
| 183 | + program = program + '.exe' |
| 184 | + path = self.detect_windows_program(program) |
| 185 | + if path != None: |
| 186 | + path = path + program |
| 187 | + elif self.platform == 'Linux': |
| 188 | + path = self.detect_linux_program(program) |
190 | 189 | |
191 | | - def update_python_path(self): |
192 | | - IGNORE_DIRS = ['wikistats', 'zips'] |
193 | | - dirs = [name for name in os.listdir(self.working_directory) if |
194 | | - os.path.isdir(os.path.join(self.working_directory, name))] |
195 | | - for subdirname in dirs: |
196 | | - if not subdirname.startswith('.') and subdirname not in IGNORE_DIRS: |
197 | | - sys.path.append(os.path.join(self.working_directory, |
198 | | - subdirname)) |
| 190 | + return path |
199 | 191 | |
| 192 | + def determine_max_filehandles_open(self): |
| 193 | + if self.platform == 'Windows' and self.architecture == 'i386': |
| 194 | + return win32file._getmaxstdio() |
| 195 | + elif self.platform != 'Windows': |
| 196 | + return resource.getrlimit(resource.RLIMIT_NOFILE)[0] - 100 |
| 197 | + else: |
| 198 | + return 500 |
200 | 199 | |
201 | | - def set_file_locations(self): |
202 | | - self.input_location = os.path.join(self.root, 'wikimedia') |
203 | | - self.input_filename = os.path.join(self.input_location, 'en', |
204 | | -'wiki', 'enwiki-20100916-stub-meta-history.xml') # Default Input file |
205 | | - self.log_location = os.path.join(self.working_directory, |
206 | | -'logs') # This is the place where error messages are stored for debugging purposes |
207 | | - self.csv_location = os.path.join(self.working_directory, |
208 | | -'data', 'csv') |
209 | | - self.dataset_location = os.path.join(self.working_directory, 'datasets') |
210 | | - self.binary_location = os.path.join(self.working_directory, |
211 | | -'data', 'objects') |
212 | | - self.namespace_location = os.path.join(self.working_directory, |
213 | | -'namespaces') |
214 | | - self.chart_location = os.path.join(self.working_directory, 'statistics', 'charts') |
| 200 | + def update_python_path(self): |
| 201 | + IGNORE_DIRS = ['wikistats', 'zips'] |
| 202 | + dirs = [name for name in os.listdir(self.working_directory) if |
| 203 | + os.path.isdir(os.path.join(self.working_directory, name))] |
| 204 | + for subdirname in dirs: |
| 205 | + if not subdirname.startswith('.') and subdirname not in IGNORE_DIRS: |
| 206 | + sys.path.append(os.path.join(self.working_directory, |
| 207 | + subdirname)) |
Index: trunk/tools/editor_trends/cronjobs.py |
— | — | @@ -21,7 +21,7 @@ |
22 | 22 | |
23 | 23 | |
24 | 24 | from database import db |
25 | | -from utils import wikiprojects |
| 25 | +from classes import wikiprojects |
26 | 26 | import manage as manager |
27 | 27 | |
28 | 28 | def launch_editor_trends_toolkit(task): |