r80883 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r80882 | r80883 | r80884 >
Date: 17:25, 24 January 2011
Author: diederik
Status: deferred
Tags:
Comment:
Too many small changes to recall.
Modified paths:
  • /trunk/tools/editor_trends/config.py (modified) (history)
  • /trunk/tools/editor_trends/configuration.py (modified) (history)
  • /trunk/tools/editor_trends/cronjobs.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/config.py
@@ -13,7 +13,7 @@
1414 '''
1515
1616 __author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
17 -__author__email = 'dvanliere at gmail dot com'
 17+__email__ = 'dvanliere at gmail dot com'
1818 __date__ = '2010-10-21'
1919 __version__ = '0.1'
2020
@@ -22,7 +22,7 @@
2323 import ConfigParser
2424
2525 from utils import utils
26 -import languages
 26+from classes import wikiprojects
2727
2828
2929 def show_choices(settings, attr):
@@ -34,12 +34,13 @@
3535
3636 def create_configuration(settings, args):
3737 force = getattr(args, 'force', False)
 38+
3839 if not os.path.exists('wiki.cfg') or force:
3940 config = ConfigParser.RawConfigParser()
4041 project = None
4142 language = None
4243 dumpversion = None
43 - language_map = languages.language_map()
 44+ #language_map = languages.language_map()
4445 working_directory = raw_input('Please indicate where you installed Editor Trends Analytics.\nCurrent location is %s\nPress Enter to accept default.\n' % os.getcwd())
4546 input_location = raw_input('Please indicate where to store the Wikipedia dump files.\nDefault is: %s\nPress Enter to accept default.\n' % settings.input_location)
4647
@@ -49,19 +50,20 @@
5051 if project not in settings.projects.keys():
5152 print 'Valid choices for a project are: %s' % ','.join(settings.projects.keys())
5253
53 - while language not in languages.MAPPING:
 54+ wiki = wikiprojects.Wiki(settings.encoding, project=project)
 55+ while language not in wiki.valid_languages:
5456 language = raw_input('Please indicate which language of project %s you would like to analyze.\nDefault is: %s\nPress Enter to accept default.\n' % (settings.projects[project].capitalize(), language_map[args.language]))
5557 if len(language) == 0:
5658 language = language_map[args.language]
57 - language = language if language in languages.MAPPING else args.language
 59+ language = language if language in wiki.valid_languages else args.language
5860
59 - while dumpversion not in settings.dumpversions.keys():
60 - choices = '\n'.join(show_choices(settings, 'dumpversions'))
61 - dumpversion = raw_input('Please indicate the version of the Wikipedia project you are analyzing.\nValid choices are:\n%s\nDefault is: 0 (%s)\nPress Enter to accept default.\n' % (choices, settings.dumpversions['0']))
62 - if len(dumpversion) == 0:
63 - dumpversion = settings.dumpversions['0']
 61+# while dumpversion not in settings.dumpversions.keys():
 62+# choices = '\n'.join(show_choices(settings, 'dumpversions'))
 63+# dumpversion = raw_input('Please indicate the version of the Wikipedia project you are analyzing.\nValid choices are:\n%s\nDefault is: 0 (%s)\nPress Enter to accept default.\n' % (choices, settings.dumpversions['0']))
 64+# if len(dumpversion) == 0:
 65+# dumpversion = settings.dumpversions['0']
6466
65 - dumpversion = settings.dumpversions[dumpversion]
 67+ #dumpversion = settings.dumpversions[dumpversion]
6668 input_location = input_location if len(input_location) > 0 else settings.input_location
6769 working_directory = working_directory if len(working_directory) > 0 else os.getcwd()
6870
@@ -72,15 +74,15 @@
7375 config.add_section('wiki')
7476 config.set('wiki', 'project', project)
7577 config.set('wiki', 'language', language)
76 - config.set('wiki', 'dumpversion', dumpversion)
 78+ #config.set('wiki', 'dumpversion', dumpversion)
7779
78 - fh = utils.create_binary_filehandle(working_directory, 'wiki.cfg', 'wb')
 80+ fh = file_utils.create_binary_filehandle(working_directory, 'wiki.cfg', 'wb')
7981 config.write(fh)
8082 fh.close()
8183
8284 settings.working_directory = config.get('file_locations', 'working_directory')
8385 settings.input_location = config.get('file_locations', 'input_location')
84 - settings.xml_namespace = config.get('wiki', 'dumpversion')
 86+ #settings.xml_namespace = config.get('wiki', 'dumpversion')
8587 return settings
8688
8789
Index: trunk/tools/editor_trends/configuration.py
@@ -13,7 +13,7 @@
1414 '''
1515
1616 __author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
17 -__author__email = 'dvanliere at gmail dot com'
 17+__email__ = 'dvanliere at gmail dot com'
1818 __date__ = '2010-10-21'
1919 __version__ = '0.1'
2020
@@ -29,6 +29,9 @@
3030 import platform
3131 import subprocess
3232
 33+
 34+from classes import exceptions
 35+
3336 try:
3437 from _winreg import *
3538 from pywin import win32file
@@ -58,156 +61,146 @@
5962 else:
6063 return cls.instance
6164
62 -class Settings(object):
63 - __metaclass__ = Singleton
 65+class Settings:
 66+ __metaclass__ = Singleton
6467
65 - def __init__(self, debug=True, process_multiplier=1, **kwargs):
66 - self.debug = debug
67 - self.progressbar = True
68 - self.encoding = 'utf-8'
 68+ def __init__(self, debug=True, process_multiplier=1, **kwargs):
 69+ self.minimum_python_version = (2, 6)
 70+ self.detect_python_version()
 71+ self.debug = debug
 72+ self.progressbar = True
 73+ self.encoding = 'utf-8'
6974
70 - #Date format as used by Erik Zachte
71 - self.date_format = '%Y-%m-%d'
 75+ #Date format as used by Erik Zachte
 76+ self.date_format = '%Y-%m-%d'
7277
73 - # Timestamp format as generated by the MediaWiki dumps
74 - self.timestamp_format = '%Y-%m-%dT%H:%M:%SZ'
 78+ # Timestamp format as generated by the MediaWiki dumps
 79+ self.timestamp_format = '%Y-%m-%dT%H:%M:%SZ'
7580
76 - #67108864 # ==64Mb, see http://hadoop.apache.org/common/docs/r0.20.0/hdfs_design.html#Large+Data+Setsfor reason
77 - self.max_xmlfile_size = 4096 * 1024
 81+ #67108864 # ==64Mb, see http://hadoop.apache.org/common/docs/r0.20.0/hdfs_design.html#Large+Data+Setsfor reason
 82+ self.max_xmlfile_size = 4096 * 1024
7883
79 - #Change this to match your computers configuration (RAM / CPU)
80 - self.number_of_processes = cpu_count() * process_multiplier
 84+ #Change this to match your computers configuration (RAM / CPU)
 85+ self.number_of_processes = cpu_count() * process_multiplier
8186
82 - self.minimum_python_version = (2, 6)
83 - self.wp_dump_location = 'http://download.wikimedia.org'
84 - self.xml_namespace = 'http://www.mediawiki.org/xml/export-0.4/'
85 - self.ascii_extensions = ['txt', 'csv', 'xml', 'sql', 'json']
86 - self.windows_register = {'7z.exe': 'Software\\7-Zip', }
87 - #Extensions of ascii files, this is used to determine the filemode to use
88 - self.platform = self.determine_platform()
 87+ self.wp_dump_location = 'http://download.wikimedia.org'
 88+ self.xml_namespace = 'http://www.mediawiki.org/xml/export-0.4/'
 89+ self.ascii_extensions = ['txt', 'csv', 'xml', 'sql', 'json']
 90+ self.windows_register = {'7z.exe': 'Software\\7-Zip', }
 91+ #Extensions of ascii files, this is used to determine the filemode to use
 92+ self.platform = self.determine_platform()
8993
90 - self.architecture = platform.machine()
91 - self.working_directory = self.determine_working_directory()
92 - self.update_python_path()
 94+ self.architecture = platform.machine()
 95+ self.working_directory = self.determine_working_directory()
 96+ self.update_python_path()
9397
94 - self.root = '/' if self.platform != 'Windows' else 'c:\\'
95 - self.file_locations = self.set_file_locations()
96 - self.max_filehandles = self.determine_max_filehandles_open()
97 - self.tab_width = 4 if self.platform == 'Windows' else 8
 98+ self.root = '/' if self.platform != 'Windows' else 'c:\\'
 99+ self.max_filehandles = self.determine_max_filehandles_open()
 100+ self.tab_width = 4 if self.platform == 'Windows' else 8
98101
99 - self.load_configuration()
100 - self.set_custom_settings(**kwargs)
101 - self.dumpversions = {'0': 'http://www.mediawiki.org/xml/export-0.4/',
102 - '1': 'http://www.mediawiki.org/xml/export-0.3/',
103 - }
104 - self.projects = {'wiki': 'wikipedia',
105 - 'commons': 'commonswiki',
106 - 'books': 'wikibooks',
107 - 'news': 'wikinews',
108 - 'quote': 'wikiquote',
109 - 'source': 'wikisource',
110 - 'versity': 'wikiversity',
111 - 'tionary': 'wiktionary',
112 - 'meta': 'metawiki',
113 - 'species': 'specieswiki',
114 - 'incubator': 'incubatorwiki',
115 - 'foundation': 'foundationwiki',
116 - 'mediawiki': 'mediawikiwiki',
117 - 'outreach': 'outreachwiki',
118 - 'strategic_planning': 'strategywiki',
119 - 'usability_initiative': 'usabilitywiki',
120 - 'multilingual_wikisource': None
121 - }
 102+ self.load_configuration()
 103+ self.set_custom_settings(**kwargs)
122104
123105
124 - def set_custom_settings(self, **kwargs):
125 - for kw in kwargs:
126 - setattr(self, kw, kwargs[kw])
 106+ self.input_location = os.path.join(self.root, 'wikimedia')
 107+ # Default Input file
 108+ self.input_filename = os.path.join(self.input_location, 'en',
 109+ 'wiki',
 110+ 'enwiki-20100916-stub-meta-history.xml')
 111+ # This is the place where error messages are stored for debugging purposes
 112+ self.log_location = os.path.join(self.working_directory,
 113+ 'logs')
 114+ self.csv_location = os.path.join(self.working_directory,
 115+ 'data', 'csv')
 116+ self.dataset_location = os.path.join(self.working_directory, 'datasets')
 117+ self.binary_location = os.path.join(self.working_directory,
 118+ 'data', 'objects')
 119+ self.namespace_location = os.path.join(self.working_directory,
 120+ 'namespaces')
 121+ self.chart_location = os.path.join(self.working_directory, 'statistics',
 122+ 'charts')
 123+ self.file_choices = ('stub-meta-history.xml.gz',
 124+ 'stub-meta-current.xml.gz',
 125+ 'pages-meta-history.xml.7z',
 126+ 'pages-meta-current.xml.bz2',)
127127
128 - def load_configuration(self):
129 - if os.path.exists(os.path.join(self.working_directory, 'wiki.cfg')):
130 - config = ConfigParser.RawConfigParser()
131 - config.read(os.path.join(self.working_directory, 'wiki.cfg'))
132 - self.working_directory = config.get('file_locations', 'working_directory')
133 - self.input_location = config.get('file_locations', 'input_location')
134 - self.default_project = config.get('wiki', 'project')
135 - self.default_language = config.get('wiki', 'language')
 128+ def set_custom_settings(self, **kwargs):
 129+ for kw in kwargs:
 130+ setattr(self, kw, kwargs[kw])
136131
137 - def determine_working_directory(self):
138 - cwd = os.getcwd()
139 - if not cwd.endswith('editor_trends%s' % os.sep):
140 - pos = cwd.find('editor_trends') + 14
141 - cwd = cwd[:pos]
142 - return cwd
 132+ def load_configuration(self):
 133+ if os.path.exists(os.path.join(self.working_directory, 'wiki.cfg')):
 134+ config = ConfigParser.RawConfigParser()
 135+ config.read(os.path.join(self.working_directory, 'wiki.cfg'))
 136+ self.working_directory = config.get('file_locations', 'working_directory')
 137+ self.input_location = config.get('file_locations', 'input_location')
 138+ self.default_project = config.get('wiki', 'project')
 139+ self.default_language = config.get('wiki', 'language')
143140
144 - def determine_platform(self):
145 - os = platform.system()
146 - if os == 'Darwin':
147 - return 'OSX'
148 - else:
149 - return os
 141+ def determine_working_directory(self):
 142+ cwd = os.getcwd()
 143+ if not cwd.endswith('editor_trends%s' % os.sep):
 144+ pos = cwd.find('editor_trends') + 14
 145+ cwd = cwd[:pos]
 146+ return cwd
150147
151 - def verify_environment(self, directories):
152 - for dir in directories:
153 - if not os.path.exists(dir):
154 - try:
155 - os.makedirs(dir)
156 - except IOError:
157 - raise 'Configuration Error, could not create directory.'
 148+ def detect_python_version(self):
 149+ version = sys.version_info[0:2]
 150+ #logger.debug('Python version: %s' % '.'.join(str(version)))
 151+ if version < self.minimum_python_version:
 152+ raise exceptions.OutDatedPythonVersionError
158153
159 - def detect_windows_program(self, program):
160 - entry = self.windows_register.get(program, None)
161 - try:
162 - key = OpenKey(HKEY_CURRENT_USER, entry, 0, KEY_READ)
163 - return QueryValueEx(key, 'Path')[0]
164 - except WindowsError:
165 - return None
 154+ def determine_platform(self):
 155+ if platform.system() == 'Darwin':
 156+ return 'OSX'
 157+ else:
 158+ return platform.system()
166159
167 - def detect_linux_program(self, program):
168 - path = subprocess.Popen(['which', '%s' % program], stdout=subprocess.PIPE).communicate()[0]
169 - return path.replace('\n', '')
 160+ def verify_environment(self, directories):
 161+ for dir in directories:
 162+ if not os.path.exists(dir):
 163+ try:
 164+ os.makedirs(dir)
 165+ except IOError:
 166+ raise 'Configuration Error, could not create directory.'
170167
171 - def detect_installed_program(self, program):
172 - if self.platform == 'Windows':
173 - if not program.endswith('.exe'):
174 - program = program + '.exe'
175 - path = self.detect_windows_program(program)
176 - if path != None:
177 - path = path + program
178 - elif self.platform == 'Linux':
179 - path = self.detect_linux_program(program)
 168+ def detect_windows_program(self, program):
 169+ entry = self.windows_register.get(program, None)
 170+ try:
 171+ key = OpenKey(HKEY_CURRENT_USER, entry, 0, KEY_READ)
 172+ return QueryValueEx(key, 'Path')[0]
 173+ except WindowsError:
 174+ return None
180175
181 - return path
 176+ def detect_linux_program(self, program):
 177+ path = subprocess.Popen(['which', '%s' % program], stdout=subprocess.PIPE).communicate()[0]
 178+ return path.strip()
182179
183 - def determine_max_filehandles_open(self):
184 - if self.platform == 'Windows' and self.architecture == 'i386':
185 - return win32file._getmaxstdio()
186 - elif self.platform != 'Windows':
187 - return resource.getrlimit(resource.RLIMIT_NOFILE)[0] - 100
188 - else:
189 - return 500
 180+ def detect_installed_program(self, program):
 181+ if self.platform == 'Windows':
 182+ if not program.endswith('.exe'):
 183+ program = program + '.exe'
 184+ path = self.detect_windows_program(program)
 185+ if path != None:
 186+ path = path + program
 187+ elif self.platform == 'Linux':
 188+ path = self.detect_linux_program(program)
190189
191 - def update_python_path(self):
192 - IGNORE_DIRS = ['wikistats', 'zips']
193 - dirs = [name for name in os.listdir(self.working_directory) if
194 - os.path.isdir(os.path.join(self.working_directory, name))]
195 - for subdirname in dirs:
196 - if not subdirname.startswith('.') and subdirname not in IGNORE_DIRS:
197 - sys.path.append(os.path.join(self.working_directory,
198 - subdirname))
 190+ return path
199191
 192+ def determine_max_filehandles_open(self):
 193+ if self.platform == 'Windows' and self.architecture == 'i386':
 194+ return win32file._getmaxstdio()
 195+ elif self.platform != 'Windows':
 196+ return resource.getrlimit(resource.RLIMIT_NOFILE)[0] - 100
 197+ else:
 198+ return 500
200199
201 - def set_file_locations(self):
202 - self.input_location = os.path.join(self.root, 'wikimedia')
203 - self.input_filename = os.path.join(self.input_location, 'en',
204 -'wiki', 'enwiki-20100916-stub-meta-history.xml') # Default Input file
205 - self.log_location = os.path.join(self.working_directory,
206 -'logs') # This is the place where error messages are stored for debugging purposes
207 - self.csv_location = os.path.join(self.working_directory,
208 -'data', 'csv')
209 - self.dataset_location = os.path.join(self.working_directory, 'datasets')
210 - self.binary_location = os.path.join(self.working_directory,
211 -'data', 'objects')
212 - self.namespace_location = os.path.join(self.working_directory,
213 -'namespaces')
214 - self.chart_location = os.path.join(self.working_directory, 'statistics', 'charts')
 200+ def update_python_path(self):
 201+ IGNORE_DIRS = ['wikistats', 'zips']
 202+ dirs = [name for name in os.listdir(self.working_directory) if
 203+ os.path.isdir(os.path.join(self.working_directory, name))]
 204+ for subdirname in dirs:
 205+ if not subdirname.startswith('.') and subdirname not in IGNORE_DIRS:
 206+ sys.path.append(os.path.join(self.working_directory,
 207+ subdirname))
Index: trunk/tools/editor_trends/cronjobs.py
@@ -21,7 +21,7 @@
2222
2323
2424 from database import db
25 -from utils import wikiprojects
 25+from classes import wikiprojects
2626 import manage as manager
2727
2828 def launch_editor_trends_toolkit(task):

Status & tagging log