Index: trunk/tools/editor_trends/manage.py |
— | — | @@ -42,17 +42,6 @@ |
43 | 43 | from analyses import inventory |
44 | 44 | |
45 | 45 | |
46 | | -def show_choices(settings, attr): |
47 | | - ''' |
48 | | - Show possible choices in console, for example output valid languages or |
49 | | - valid projects. |
50 | | - ''' |
51 | | - choices = getattr(settings, attr).items() |
52 | | - choices.sort() |
53 | | - choices = ['%s\t%s' % (choice[0], choice[1]) for choice in choices] |
54 | | - return choices |
55 | | - |
56 | | - |
57 | 46 | def init_args_parser(): |
58 | 47 | ''' |
59 | 48 | Entry point for parsing command line and launching the needed function(s). |
— | — | @@ -210,10 +199,18 @@ |
211 | 200 | config = ConfigParser.RawConfigParser() |
212 | 201 | project = None |
213 | 202 | language = None |
| 203 | + valid_storage = ['mongo', 'cassandra'] |
214 | 204 | working_directory = raw_input('Please indicate where you installed Wikilytics.\nCurrent location is %s\nPress Enter to accept default.\n' % os.getcwd()) |
215 | 205 | input_location = raw_input('Please indicate where the Wikipedia dump files are or will be located.\nDefault is: %s\nPress Enter to accept default.\n' % rts.input_location) |
216 | 206 | output_location = raw_input('Please indicate where to store all Wikilytics project files.\nDefault is: %s\nPress Enter to accept default.\n' % rts.output_location) |
217 | 207 | |
| 208 | + db = None #not a valid choice, so the prompt loop below runs at least once |
| 209 | + while db not in valid_storage: |
| 210 | + db = raw_input('Please indicate what database you are using for storage. \nDefault is: Mongo\n') |
| 211 | + db = 'mongo' if len(db) == 0 else db.lower() |
| 212 | + if db not in valid_storage: |
| 213 | + print 'Valid choices are: %s' % ','.join(valid_storage) |
| 214 | + |
218 | 215 | while project not in pc.projects.keys(): |
219 | 216 | project = raw_input('Please indicate which project you would like to analyze.\nDefault is: %s\nPress Enter to accept default.\n' % rts.project.full_name) |
220 | 217 | project = project if len(project) > 0 else rts.project.name |
— | — | @@ -238,15 +235,13 @@ |
239 | 236 | config.add_section('wiki') |
240 | 237 | config.set('wiki', 'project', project) |
241 | 238 | config.set('wiki', 'language', language) |
| 239 | + config.add_section('storage') |
| 240 | + config.set('storage', 'db', db) #option must live in the section created on the previous line |
242 | 241 | |
243 | 242 | fh = file_utils.create_binary_filehandle(working_directory, 'wiki.cfg', 'wb') |
244 | 243 | config.write(fh) |
245 | 244 | fh.close() |
246 | 245 | |
247 | | - rts.working_directory = config.get('file_locations', 'working_directory') |
248 | | - rts.input_location = config.get('file_locations', 'input_location') |
249 | | - rts.output_location = config.get('file_locations', 'output_location') |
250 | | - |
251 | 246 | log.to_csv(logger, rts, 'New configuration', 'Creating', |
252 | 247 | config_launcher, |
253 | 248 | working_directory=working_directory, |
Index: trunk/tools/editor_trends/etl/enricher.py |
— | — | @@ -28,7 +28,6 @@ |
29 | 29 | from xml.etree.cElementTree import iterparse, dump |
30 | 30 | from collections import deque |
31 | 31 | |
32 | | - |
33 | 32 | if '..' not in sys.path: |
34 | 33 | sys.path.append('..') |
35 | 34 | |
— | — | @@ -94,7 +93,7 @@ |
95 | 94 | class DummyRTS: |
96 | 95 | def __init__(self, location, path): |
97 | 96 | self.input_location = location |
98 | | - self.location = path |
| 97 | + self.output_location = path |
99 | 98 | self.language = Dummy() |
100 | 99 | self.project = Dummy() |
101 | 100 | self.language.code = 'en' |
— | — | @@ -520,10 +519,9 @@ |
521 | 520 | This function determines the local names of the different namespaces. |
522 | 521 | ''' |
523 | 522 | namespaces = {} |
| 523 | + print 'Detected xml namespace: %s' % xml_namespace |
524 | 524 | print 'Constructing namespace dictionary' |
525 | | - print xml_namespace |
526 | 525 | elements = siteinfo.find('%s%s' % (xml_namespace, 'namespaces')) |
527 | | - print elements |
528 | 526 | for elem in elements.getchildren(): |
529 | 527 | key = int(elem.get('key')) |
530 | 528 | namespaces[key] = elem.text #extract_text(ns) |
— | — | @@ -677,7 +675,7 @@ |
678 | 676 | |
679 | 677 | def stream_raw_xml(input_queue, storage, process_id, function, dataset, locks, rts): |
680 | 678 | bots = bot_detector.retrieve_bots(rts.language.code) |
681 | | - path = os.path.join(rts.location, 'txt') |
| 679 | + path = os.path.join(rts.output_location, 'txt') |
682 | 680 | |
683 | 681 | filehandles = [file_utils.create_txt_filehandle(path, '%s.csv' % fh, 'a', |
684 | 682 | 'utf-8') for fh in xrange(rts.max_filehandles)] |
— | — | @@ -746,18 +744,13 @@ |
747 | 745 | preparations are made including setting up namespaces and cleaning up old |
748 | 746 | files. |
749 | 747 | ''' |
750 | | - keyspace_name = 'enwiki' |
751 | 748 | if storage == 'cassandra': |
| 749 | + keyspace_name = 'enwiki' |
752 | 750 | cassandra.install_schema(keyspace_name, drop_first=True) |
753 | 751 | elif storage == 'csv': |
754 | | - output_articles = os.path.join(rts.input_location, rts.language.code, |
755 | | - rts.project.name) |
756 | | - output_txt = os.path.join(rts.input_location, rts.language.code, |
757 | | - rts.project.name, 'txt') |
758 | | - res = file_utils.delete_file(output_articles, 'articles.csv') |
759 | | - res = file_utils.delete_file(output_txt, None, directory=True) |
| 752 | + res = file_utils.delete_file(rts.txt, None, directory=True) |
760 | 753 | if res: |
761 | | - res = file_utils.create_directory(output_txt) |
| 754 | + res = file_utils.create_directory(rts.txt) |
762 | 755 | |
763 | 756 | |
764 | 757 | def multiprocessor_launcher(function, dataset, storage, locks, rts): |
Index: trunk/tools/editor_trends/classes/projects.py |
— | — | @@ -17,7 +17,12 @@ |
18 | 18 | __date__ = '2011-01-26' |
19 | 19 | __version__ = '0.1' |
20 | 20 | |
| 21 | +import sys |
| 22 | +if '..' not in sys.path: |
| 23 | + sys.path.append('..') |
| 24 | + |
21 | 25 | import languages |
| 26 | +from utils import text_utils |
22 | 27 | |
23 | 28 | class Project: |
24 | 29 | def __init__(self, name, urlname, full_name): |
— | — | @@ -81,8 +86,7 @@ |
82 | 87 | return self.projects.get(name, None) |
83 | 88 | |
84 | 89 | def supported_projects(self): |
85 | | - choices = ([(key, d['full_name']) for key, d in self.wikis.iteritems()]) |
86 | | - return tuple(choices) |
| 90 | + return ([(d['full_name'], key) for key, d in self.wikis.iteritems()]) |
87 | 91 | |
88 | 92 | def project_supports_language(self, urlname): |
89 | 93 | valid_languages_wiki = ['ace', 'af', 'als', 'an', 'roa-rup', 'ast', 'gn', 'av', 'ay', 'az', 'id', 'ms', 'bm', 'zh-min-nan', 'jv', 'map-bms', 'su', 'bug', 'bi', 'bar', 'bs', 'br', 'ca', 'cbk-zam', 'ch', 'cs', 'ny', 'sn', 'tum', 've', 'co', 'za', 'cy', 'da', 'pdc', 'de', 'nv', 'na', 'lad', 'et', 'ang', 'en', 'es', 'eo', 'ext', 'eu', 'to', 'fo', 'fr', 'frp', 'fy', 'ff', 'fur', 'ga', 'gv', 'sm', 'gd', 'gl', 'got', 'hak', 'haw', 'hsb', 'hr', 'io', 'ilo', 'ig', 'ia', 'ie', 'ik', 'xh', 'zu', 'is', 'it', 'mh', 'kl', 'pam', 'csb', 'kw', 'kg', 'ki', 'rw', 'ky', 'rn', 'sw', 'ht', 'ku', 'la', 'lv', 'lb', 'lt', 'lij', 'li', 'ln', 'jbo', 'lg', 'lmo', 'hu', 'mg', 'mt', 'mi', 'cdo', 'my', 'nah', 'fj', 'nl', 'cr', 'ne', 'nap', 'frr', 'pih', 'no', 'nn', 'nrm', 'oc', 'om', 'pag', 'pi', 'pap', 'pms', 'nds', 'pl', 'pt', 'ty', 'ksh', 'ro', 'rmy', 'rm', 'qu', 'se', 'sg', 'sc', 'sco', 'st', 'tn', 'sq', 'scn', 'simple', 'ceb', 'ss', 'sk', 'sl', 'so', 'sh', 'fi', 'sv', 'tl', 'tt', 'tet', 'vi', 'tpi', 'chy', 'tr', 'tk', 'tw', 'vec', 'vo', 'fiu-vro', 'wa', 'vls', 'war', 'wo', 'ts', 'yo', 'bat-smg', 'el', 'ab', 'ba', 'be', 'bg', 'bxr', 'cu', 'os', 'kk', 'kv', 'mk', 'mn', 'ce', 'ru', 'sr', 'tg', 'udm', 'uk', 'uz', 'xal', 'cv', 'hy', 'ka', 'he', 'yi', 'ar', 'fa', 'ha', 'ps', 'sd', 'ur', 'ug', 'arc', 'dv', 'as', 'bn', 'bpy', 'gu', 'bh', 'hi', 'ks', 'mr', 'kn', 'ne', 'new', 'sa', 'ml', 'or', 'pa', 'ta', 'te', 'bo', 'dz', 'si', 'km', 'lo', 'th', 'am', 'ti', 'iu', 'chr', 'ko', 'ja', 'zh', 'wuu', 'lzh', 'yue'] |
— | — | @@ -100,9 +104,13 @@ |
101 | 105 | except KeyError: |
102 | 106 | return [] |
103 | 107 | |
| 108 | +def debug(): |
| 109 | + pc = ProjectContainer() |
| 110 | + pc.supported_projects() |
| 111 | + |
104 | 112 | def init(): |
105 | 113 | pc = ProjectContainer() |
106 | 114 | return pc.get_project('wiki') |
107 | 115 | |
108 | 116 | if __name__ == '__main__': |
109 | | - init() |
| 117 | + debug() |
Index: trunk/tools/editor_trends/classes/runtime_settings.py |
— | — | @@ -56,40 +56,45 @@ |
57 | 57 | self.hash = self.secs_since_epoch() |
58 | 58 | #print self.settings.input_location |
59 | 59 | #print self.get_value('location') |
60 | | - self.input_location = self.input_location if \ |
61 | | - self.input_location != None else self.get_value('location') |
62 | 60 | self.project = self.update_project_settings() |
63 | 61 | self.language = self.update_language_settings() |
64 | | - self.charts = self.determine_chart(self.get_value('charts')) |
65 | | - self.keywords = self.split_keywords(self.get_value('keywords')) |
| 62 | + |
| 63 | + self.input_location = self.set_input_location() |
| 64 | + self.output_location = self.set_output_location() |
| 65 | + |
| 66 | + |
| 67 | + self.charts = self.determine_chart() |
| 68 | + self.keywords = self.split_keywords() |
| 69 | + self.namespaces = self.get_namespaces() |
| 70 | + |
66 | 71 | self.kaggle = self.get_value('kaggle') |
67 | 72 | self.function = self.get_value('func') |
68 | | - |
69 | 73 | self.ignore = self.get_value('except') |
70 | 74 | self.force = self.get_value('force') |
71 | | - self.location = self.get_project_location() |
72 | | - self.filename = self.generate_wikidump_filename() |
73 | | - self.namespaces = self.get_namespaces() |
| 75 | + self.analyzer_collection = self.get_value('collection') |
74 | 76 | |
75 | | - self.dataset = os.path.join(self.dataset_location, |
76 | | - self.project.name) |
| 77 | + self.dataset = os.path.join(self.dataset_location, self.project.name) |
| 78 | + self.txt = os.path.join(self.output_location, 'txt') |
| 79 | + self.sorted = os.path.join(self.output_location, 'sorted') |
77 | 80 | |
78 | | - self.txt = os.path.join(self.location, 'txt') |
79 | | - self.sorted = os.path.join(self.location, 'sorted') |
80 | | - |
81 | | - self.directories = [self.location, |
| 81 | + self.directories = [self.output_location, |
82 | 82 | self.txt, |
83 | 83 | self.sorted, |
84 | 84 | self.dataset] |
| 85 | + self.verify_environment(self.directories) |
| 86 | + |
| 87 | + #Wikidump file related variables |
85 | 88 | self.dump_filename = self.generate_wikidump_filename() |
86 | 89 | self.dump_relative_path = self.set_dump_path() |
87 | 90 | self.dump_absolute_path = self.set_dump_path(absolute=True) |
| 91 | + |
| 92 | + #Collection names |
88 | 93 | self.editors_raw = '%s%s_editors_raw' % (self.language.code, self.project.name) |
89 | 94 | self.editors_dataset = '%s%s_editors_dataset' % (self.language.code, self.project.name) |
90 | 95 | self.articles_raw = '%s%s_articles_raw' % (self.language.code, self.project.name) |
91 | | - self.analyzer_collection = self.get_value('collection') |
92 | | - self.verify_environment(self.directories) |
93 | 96 | |
| 97 | + |
| 98 | + |
94 | 99 | def __str__(self): |
95 | 100 | return 'Runtime Settings for project %s %s' % (self.language.name, |
96 | 101 | self.project.full_name) |
— | — | @@ -107,7 +112,8 @@ |
108 | 113 | props[prop] = getattr(self, prop) |
109 | 114 | return props |
110 | 115 | |
111 | | - def split_keywords(self, keywords): |
| 116 | + def split_keywords(self): |
| 117 | + keywords = self.get_value('keywords') |
112 | 118 | d = {} |
113 | 119 | if keywords != None: |
114 | 120 | keywords = keywords.split(',') |
— | — | @@ -124,10 +130,11 @@ |
125 | 131 | d[key] = value |
126 | 132 | return d |
127 | 133 | |
128 | | - def determine_chart(self, chart): |
| 134 | + def determine_chart(self): |
| 135 | + charts = self.get_value('charts') |
129 | 136 | requested_charts = [] |
130 | | - if chart != None and getattr(chart, 'func_name', None) == None: |
131 | | - charts = chart.split(',') |
| 137 | + if charts != None and getattr(charts, 'func_name', None) == None: |
| 138 | + charts = charts.split(',') |
132 | 139 | available_charts = inventory.available_analyses() |
133 | 140 | for chart in charts: |
134 | 141 | if chart not in available_charts: |
— | — | @@ -135,11 +142,26 @@ |
136 | 143 | sys.exit(-1) |
137 | 144 | else: |
138 | 145 | requested_charts.append(chart) |
139 | | - elif getattr(chart, 'func_name', None) != None: |
| 146 | + elif getattr(charts, 'func_name', None) != None: |
140 | | - requested_charts.append(chart.func_name) |
| 147 | + requested_charts.append(charts.func_name) #charts is a single function object in this branch |
141 | 148 | return requested_charts |
142 | 149 | |
143 | | - def get_project_location(self): |
| 150 | + |
| 151 | + def set_input_location(self): |
| 152 | + ''' |
| 153 | + Return input_location itself when it already holds dump archives, |
| 154 | + else the <language code>/<project name> subdirectory below it. |
| 155 | + ''' |
| 156 | + extensions = ('.gz', '.7z', '.bz2') |
| 157 | + files = os.listdir(self.input_location) |
| 158 | + if any(filename.endswith(extensions) for filename in files): |
| 159 | + #ABS path case: the dump files are stored right here |
| 160 | + return self.input_location |
| 161 | + else: |
| 162 | + return os.path.join(self.input_location, self.language.code, |
| 163 | + self.project.name) |
| 164 | + |
| 165 | + def set_output_location(self): |
144 | 166 | ''' |
145 | 167 | Construct the full project location |
146 | 168 | ''' |
— | — | @@ -156,7 +178,7 @@ |
157 | 179 | self.language.locale, |
158 | 180 | self.language.code) |
159 | 181 | about['Input directory'] = '%s' % self.input_location |
160 | | - about['Output directory'] = '%s and subdirectories' % self.location |
| 182 | + about['Output directory'] = '%s and subdirectories' % self.output_location |
161 | 183 | |
162 | 184 | max_length_key = max([len(key) for key in about.keys()]) |
163 | 185 | |