r85632 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r85631 | r85632 | r85633 >
Date:19:38, 7 April 2011
Author:diederik
Status:deferred
Tags:
Comment:
Added storage parameter to manage.py config
Modified paths:
  • /trunk/tools/editor_trends/classes/projects.py (modified) (history)
  • /trunk/tools/editor_trends/classes/runtime_settings.py (modified) (history)
  • /trunk/tools/editor_trends/etl/enricher.py (modified) (history)
  • /trunk/tools/editor_trends/manage.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/manage.py
@@ -42,17 +42,6 @@
4343 from analyses import inventory
4444
4545
46 -def show_choices(settings, attr):
47 - '''
48 - Show possible choices in console, for example output valid languages or
49 - valid projects.
50 - '''
51 - choices = getattr(settings, attr).items()
52 - choices.sort()
53 - choices = ['%s\t%s' % (choice[0], choice[1]) for choice in choices]
54 - return choices
55 -
56 -
5746 def init_args_parser():
5847 '''
5948 Entry point for parsing command line and launching the needed function(s).
@@ -210,10 +199,18 @@
211200 config = ConfigParser.RawConfigParser()
212201 project = None
213202 language = None
 203+ valid_storage = ['mongo', 'cassandra']
214204 working_directory = raw_input('Please indicate where you installed Wikilytics.\nCurrent location is %s\nPress Enter to accept default.\n' % os.getcwd())
215205 input_location = raw_input('Please indicate where the Wikipedia dump files are or will be located.\nDefault is: %s\nPress Enter to accept default.\n' % rts.input_location)
216206 output_location = raw_input('Please indicate where to store all Wikilytics project files.\nDefault is: %s\nPress Enter to accept default.\n' % rts.output_location)
217207
 208+
 209+ while db not in valid_storage:
 210+ db = raw_input('Please indicate what database you are using for storage. \nDefault is: Mongo\n')
 211+ db = 'mongo' if len(db) == 0 else db.lower()
 212+ if db not in valid_storage:
 213+ print 'Valid choices are: %s' % ','.join(valid_storage)
 214+
218215 while project not in pc.projects.keys():
219216 project = raw_input('Please indicate which project you would like to analyze.\nDefault is: %s\nPress Enter to accept default.\n' % rts.project.full_name)
220217 project = project if len(project) > 0 else rts.project.name
@@ -238,15 +235,13 @@
239236 config.add_section('wiki')
240237 config.set('wiki', 'project', project)
241238 config.set('wiki', 'language', language)
 239+ config.add_section('storage')
 240+ config.set('db', 'type', db)
242241
243242 fh = file_utils.create_binary_filehandle(working_directory, 'wiki.cfg', 'wb')
244243 config.write(fh)
245244 fh.close()
246245
247 - rts.working_directory = config.get('file_locations', 'working_directory')
248 - rts.input_location = config.get('file_locations', 'input_location')
249 - rts.output_location = config.get('file_locations', 'output_location')
250 -
251246 log.to_csv(logger, rts, 'New configuration', 'Creating',
252247 config_launcher,
253248 working_directory=working_directory,
Index: trunk/tools/editor_trends/etl/enricher.py
@@ -28,7 +28,6 @@
2929 from xml.etree.cElementTree import iterparse, dump
3030 from collections import deque
3131
32 -
3332 if '..' not in sys.path:
3433 sys.path.append('..')
3534
@@ -94,7 +93,7 @@
9594 class DummyRTS:
9695 def __init__(self, location, path):
9796 self.input_location = location
98 - self.location = path
 97+ self.output_location = path
9998 self.language = Dummy()
10099 self.project = Dummy()
101100 self.language.code = 'en'
@@ -520,10 +519,9 @@
521520 This function determines the local names of the different namespaces.
522521 '''
523522 namespaces = {}
 523+ print 'Detected xml namespace: %s' % xml_namespace
524524 print 'Constructing namespace dictionary'
525 - print xml_namespace
526525 elements = siteinfo.find('%s%s' % (xml_namespace, 'namespaces'))
527 - print elements
528526 for elem in elements.getchildren():
529527 key = int(elem.get('key'))
530528 namespaces[key] = elem.text #extract_text(ns)
@@ -677,7 +675,7 @@
678676
679677 def stream_raw_xml(input_queue, storage, process_id, function, dataset, locks, rts):
680678 bots = bot_detector.retrieve_bots(rts.language.code)
681 - path = os.path.join(rts.location, 'txt')
 679+ path = os.path.join(rts.output_location, 'txt')
682680
683681 filehandles = [file_utils.create_txt_filehandle(path, '%s.csv' % fh, 'a',
684682 'utf-8') for fh in xrange(rts.max_filehandles)]
@@ -746,18 +744,13 @@
747745 preparations are made including setting up namespaces and cleaning up old
748746 files.
749747 '''
750 - keyspace_name = 'enwiki'
751748 if storage == 'cassandra':
 749+ keyspace_name = 'enwiki'
752750 cassandra.install_schema(keyspace_name, drop_first=True)
753751 elif storage == 'csv':
754 - output_articles = os.path.join(rts.input_location, rts.language.code,
755 - rts.project.name)
756 - output_txt = os.path.join(rts.input_location, rts.language.code,
757 - rts.project.name, 'txt')
758 - res = file_utils.delete_file(output_articles, 'articles.csv')
759 - res = file_utils.delete_file(output_txt, None, directory=True)
 752+ res = file_utils.delete_file(rts.txt, None, directory=True)
760753 if res:
761 - res = file_utils.create_directory(output_txt)
 754+ res = file_utils.create_directory(rts.txt)
762755
763756
764757 def multiprocessor_launcher(function, dataset, storage, locks, rts):
Index: trunk/tools/editor_trends/classes/projects.py
@@ -17,7 +17,12 @@
1818 __date__ = '2011-01-26'
1919 __version__ = '0.1'
2020
 21+import sys
 22+if '..' not in sys.path:
 23+ sys.path.append('..')
 24+
2125 import languages
 26+from utils import text_utils
2227
2328 class Project:
2429 def __init__(self, name, urlname, full_name):
@@ -81,8 +86,7 @@
8287 return self.projects.get(name, None)
8388
8489 def supported_projects(self):
85 - choices = ([(key, d['full_name']) for key, d in self.wikis.iteritems()])
86 - return tuple(choices)
 90+ return ([(d['full_name'], key) for key, d in self.wikis.iteritems()])
8791
8892 def project_supports_language(self, urlname):
8993 valid_languages_wiki = ['ace', 'af', 'als', 'an', 'roa-rup', 'ast', 'gn', 'av', 'ay', 'az', 'id', 'ms', 'bm', 'zh-min-nan', 'jv', 'map-bms', 'su', 'bug', 'bi', 'bar', 'bs', 'br', 'ca', 'cbk-zam', 'ch', 'cs', 'ny', 'sn', 'tum', 've', 'co', 'za', 'cy', 'da', 'pdc', 'de', 'nv', 'na', 'lad', 'et', 'ang', 'en', 'es', 'eo', 'ext', 'eu', 'to', 'fo', 'fr', 'frp', 'fy', 'ff', 'fur', 'ga', 'gv', 'sm', 'gd', 'gl', 'got', 'hak', 'haw', 'hsb', 'hr', 'io', 'ilo', 'ig', 'ia', 'ie', 'ik', 'xh', 'zu', 'is', 'it', 'mh', 'kl', 'pam', 'csb', 'kw', 'kg', 'ki', 'rw', 'ky', 'rn', 'sw', 'ht', 'ku', 'la', 'lv', 'lb', 'lt', 'lij', 'li', 'ln', 'jbo', 'lg', 'lmo', 'hu', 'mg', 'mt', 'mi', 'cdo', 'my', 'nah', 'fj', 'nl', 'cr', 'ne', 'nap', 'frr', 'pih', 'no', 'nn', 'nrm', 'oc', 'om', 'pag', 'pi', 'pap', 'pms', 'nds', 'pl', 'pt', 'ty', 'ksh', 'ro', 'rmy', 'rm', 'qu', 'se', 'sg', 'sc', 'sco', 'st', 'tn', 'sq', 'scn', 'simple', 'ceb', 'ss', 'sk', 'sl', 'so', 'sh', 'fi', 'sv', 'tl', 'tt', 'tet', 'vi', 'tpi', 'chy', 'tr', 'tk', 'tw', 'vec', 'vo', 'fiu-vro', 'wa', 'vls', 'war', 'wo', 'ts', 'yo', 'bat-smg', 'el', 'ab', 'ba', 'be', 'bg', 'bxr', 'cu', 'os', 'kk', 'kv', 'mk', 'mn', 'ce', 'ru', 'sr', 'tg', 'udm', 'uk', 'uz', 'xal', 'cv', 'hy', 'ka', 'he', 'yi', 'ar', 'fa', 'ha', 'ps', 'sd', 'ur', 'ug', 'arc', 'dv', 'as', 'bn', 'bpy', 'gu', 'bh', 'hi', 'ks', 'mr', 'kn', 'ne', 'new', 'sa', 'ml', 'or', 'pa', 'ta', 'te', 'bo', 'dz', 'si', 'km', 'lo', 'th', 'am', 'ti', 'iu', 'chr', 'ko', 'ja', 'zh', 'wuu', 'lzh', 'yue']
@@ -100,9 +104,13 @@
101105 except KeyError:
102106 return []
103107
 108+def debug():
 109+ pc = ProjectContainer()
 110+ pc.supported_projects()
 111+
104112 def init():
105113 pc = ProjectContainer()
106114 return pc.get_project('wiki')
107115
108116 if __name__ == '__main__':
109 - init()
 117+ debug()
Index: trunk/tools/editor_trends/classes/runtime_settings.py
@@ -56,40 +56,45 @@
5757 self.hash = self.secs_since_epoch()
5858 #print self.settings.input_location
5959 #print self.get_value('location')
60 - self.input_location = self.input_location if \
61 - self.input_location != None else self.get_value('location')
6260 self.project = self.update_project_settings()
6361 self.language = self.update_language_settings()
64 - self.charts = self.determine_chart(self.get_value('charts'))
65 - self.keywords = self.split_keywords(self.get_value('keywords'))
 62+
 63+ self.input_location = self.set_input_location()
 64+ self.output_location = self.set_output_location()
 65+
 66+
 67+ self.charts = self.determine_chart()
 68+ self.keywords = self.split_keywords()
 69+ self.namespaces = self.get_namespaces()
 70+
6671 self.kaggle = self.get_value('kaggle')
6772 self.function = self.get_value('func')
68 -
6973 self.ignore = self.get_value('except')
7074 self.force = self.get_value('force')
71 - self.location = self.get_project_location()
72 - self.filename = self.generate_wikidump_filename()
73 - self.namespaces = self.get_namespaces()
 75+ self.analyzer_collection = self.get_value('collection')
7476
75 - self.dataset = os.path.join(self.dataset_location,
76 - self.project.name)
 77+ self.dataset = os.path.join(self.dataset_location, self.project.name)
 78+ self.txt = os.path.join(self.output_location, 'txt')
 79+ self.sorted = os.path.join(self.output_location, 'sorted')
7780
78 - self.txt = os.path.join(self.location, 'txt')
79 - self.sorted = os.path.join(self.location, 'sorted')
80 -
81 - self.directories = [self.location,
 81+ self.directories = [self.output_location,
8282 self.txt,
8383 self.sorted,
8484 self.dataset]
 85+ self.verify_environment(self.directories)
 86+
 87+ #Wikidump file related variables
8588 self.dump_filename = self.generate_wikidump_filename()
8689 self.dump_relative_path = self.set_dump_path()
8790 self.dump_absolute_path = self.set_dump_path(absolute=True)
 91+
 92+ #Collection names
8893 self.editors_raw = '%s%s_editors_raw' % (self.language.code, self.project.name)
8994 self.editors_dataset = '%s%s_editors_dataset' % (self.language.code, self.project.name)
9095 self.articles_raw = '%s%s_articles_raw' % (self.language.code, self.project.name)
91 - self.analyzer_collection = self.get_value('collection')
92 - self.verify_environment(self.directories)
9396
 97+
 98+
9499 def __str__(self):
95100 return 'Runtime Settings for project %s %s' % (self.language.name,
96101 self.project.full_name)
@@ -107,7 +112,8 @@
108113 props[prop] = getattr(self, prop)
109114 return props
110115
111 - def split_keywords(self, keywords):
 116+ def split_keywords(self):
 117+ keywords = self.get_value('keywords')
112118 d = {}
113119 if keywords != None:
114120 keywords = keywords.split(',')
@@ -124,10 +130,11 @@
125131 d[key] = value
126132 return d
127133
128 - def determine_chart(self, chart):
 134+ def determine_chart(self):
 135+ charts = self.get_value('charts')
129136 requested_charts = []
130 - if chart != None and getattr(chart, 'func_name', None) == None:
131 - charts = chart.split(',')
 137+ if charts != None and getattr(charts, 'func_name', None) == None:
 138+ charts = charts.split(',')
132139 available_charts = inventory.available_analyses()
133140 for chart in charts:
134141 if chart not in available_charts:
@@ -135,11 +142,26 @@
136143 sys.exit(-1)
137144 else:
138145 requested_charts.append(chart)
139 - elif getattr(chart, 'func_name', None) != None:
 146+ elif getattr(charts, 'func_name', None) != None:
140147 requested_charts.append(chart.func_name)
141148 return requested_charts
142149
143 - def get_project_location(self):
 150+
 151+ def set_input_location(self):
 152+ files = os.listdir(self.input_location)
 153+ extensions = ['gz', '7z', 'bz2']
 154+ valid = False
 155+ for ext in extensions:
 156+ if ext in files:
 157+ valid = True
 158+ if valid:
 159+ #ABS path case: check if files are stored here
 160+ return input_location
 161+ else:
 162+ return os.path.join(self.input_location, self.language.code,
 163+ self.project.name)
 164+
 165+ def set_output_location(self):
144166 '''
145167 Construct the full project location
146168 '''
@@ -156,7 +178,7 @@
157179 self.language.locale,
158180 self.language.code)
159181 about['Input directory'] = '%s' % self.input_location
160 - about['Output directory'] = '%s and subdirectories' % self.location
 182+ about['Output directory'] = '%s and subdirectories' % self.output_location
161183
162184 max_length_key = max([len(key) for key in about.keys()])
163185