Index: trunk/tools/editor_trends/analyses/plugins/taxonomy_list_makers.py |
— | — | @@ -71,4 +71,4 @@ |
72 | 72 | cursor = db.find('category', 'List') |
73 | 73 | for c in cursor: |
74 | 74 | data[c['id']] = 1 |
75 | | - return data |
| 75 | + return data, rts |
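Note: the list maker now returns the runtime settings alongside the data, so any caller receives a 2-tuple instead of a bare dict. The unchanged context line above also still uses the old key/qualifier calling convention for storage.find, which this same changeset replaces in classes/storage.py; under the new conditions-dict API the lookup would presumably need to become:

    # Sketch under the new storage.find(conditions, vars=None) signature
    # introduced in classes/storage.py below:
    cursor = db.find({'category': 'List'})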
Index: trunk/tools/editor_trends/analyses/analyzer.py |
— | — | @@ -24,6 +24,7 @@ |
25 | 25 | import types |
26 | 26 | import sys |
27 | 27 | import cPickle |
| 28 | +import pymongo |
28 | 29 | import gc |
29 | 30 | import os |
30 | 31 | import progressbar |
— | — | @@ -65,7 +66,7 @@ |
66 | 67 | def feedback(plugin, rts): |
67 | 68 | print 'Exporting data for chart: %s' % plugin |
68 | 69 | print 'Project: %s' % rts.dbname |
69 | | - print 'Dataset: %s' % rts.editors_dataset |
| 70 | + print 'Dataset: %s' % rts.collection |
70 | 71 | |
71 | 72 | |
72 | 73 | def write_output(ds, rts, stopwatch): |
— | — | @@ -97,7 +98,8 @@ |
98 | 99 | plugin = retrieve_plugin(func) |
99 | 100 | |
100 | 101 | if not plugin: |
101 | | - raise exceptions.UnknownPluginError(plugin, self.available_plugins) |
| 102 | + available_plugins = inventory.available_analyses() |
| 103 | + raise exceptions.UnknownPluginError(plugin, available_plugins) |
102 | 104 | plugin = getattr(plugin, func) |
103 | 105 | |
104 | 106 | feedback(func, rts) |
— | — | @@ -110,15 +112,16 @@ |
111 | 113 | obs = dict() |
112 | 114 | obs_proxy = mgr.dict(obs) |
113 | 115 | |
114 | | - db = storage.init_database(rts.storage, rts.dbname, rts.editors_dataset) |
| 116 | + db = storage.init_database(rts.storage, rts.dbname, rts.collection) |
115 | 117 | editors = db.retrieve_distinct_keys('editor') |
116 | 118 | #editors = editors[:500] |
117 | | - min_year, max_year = determine_project_year_range(db, 'new_wikipedian') |
| 119 | + if rts.collection.find('editors_dataset') > -1: |
| 120 | + min_year, max_year = determine_project_year_range(db, 'new_wikipedian') |
| 121 | + kwargs['min_year'] = min_year |
| 122 | + kwargs['max_year'] = max_year |
118 | 123 | |
119 | 124 | fmt = kwargs.pop('format', 'long') |
120 | 125 | time_unit = kwargs.pop('time_unit', 'year') |
121 | | - kwargs['min_year'] = min_year |
122 | | - kwargs['max_year'] = max_year |
123 | 126 | |
124 | 127 | |
125 | 128 | var = dataset.Variable('count', time_unit, lock, obs_proxy, **kwargs) |
— | — | @@ -153,24 +156,21 @@ |
154 | 157 | |
155 | 158 | |
156 | 159 | ppills = cpu_count() |
157 | | - while True: |
158 | | - while ppills > 0: |
159 | | - try: |
160 | | - res = result.get() |
161 | | - if res == True: |
162 | | - pbar.update(pbar.currval + 1) |
163 | | - else: |
164 | | - ppills -= 1 |
165 | | - var = res |
166 | | - print ppills |
167 | | - except Empty: |
168 | | - pass |
169 | | - break |
170 | | - print 'Waiting for tasks...' |
| 160 | + while ppills > 0: |
| 161 | + try: |
| 162 | + res = result.get() |
| 163 | + if res == True: |
| 164 | + pbar.update(pbar.currval + 1) |
| 165 | + else: |
| 166 | + ppills -= 1 |
| 167 | + var = res |
| 168 | + except Empty: |
| 169 | + pass |
| 170 | + |
171 | 171 | tasks.join() |
172 | 172 | |
173 | 173 | var = reconstruct_observations(var) |
174 | | - ds = dataset.Dataset(plugin.func_name, rts, format=fmt, **kwargs) |
| 174 | + ds = dataset.Dataset(func, rts, format=fmt, **kwargs) |
175 | 175 | ds.add_variable(var) |
176 | 176 | |
177 | 177 | stopwatch.elapsed() |
— | — | @@ -178,8 +178,8 @@ |
179 | 179 | |
180 | 180 | ds.summary() |
181 | 181 | |
182 | | - for n, c in get_refcounts()[:100]: |
183 | | - print '%10d %s' % (n, c.__name__) |
| 182 | + #for n, c in get_refcounts()[:100]: |
| 183 | + # print '%10d %s' % (n, c.__name__) |
184 | 184 | |
185 | 185 | |
186 | 186 | def get_refcounts(): |
— | — | @@ -205,9 +205,12 @@ |
206 | 206 | Determine the first and final year for the observed data |
207 | 207 | ''' |
208 | 208 | try: |
209 | | - obs = db.find(var, qualifier='max') |
| 209 | + conditions = {var : {'$ne' : False}} |
| 210 | + |
| 211 | + obs = db.find(conditions).sort(var, pymongo.ASCENDING).limit(1)[0] |
210 | 212 | max_year = obs[var].year + 1 |
211 | | - obs = db.find(var, qualifier='min') |
| 213 | + |
| 214 | + obs = db.find(conditions).sort(var, pymongo.DESCENDING).limit(1)[0] |
212 | 215 | min_year = obs[var].year |
213 | 216 | except KeyError: |
214 | 217 | min_year = 2001 |
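Note: with the min/max qualifiers removed from storage.find, determine_project_year_range now builds the boundary queries itself. As applied, though, the sort directions look inverted relative to the old behaviour (qualifier 'max' used to sort DESCENDING, 'min' ASCENDING). A sketch of the presumably intended helper, assuming db.find returns a pymongo cursor:

    import pymongo

    def determine_project_year_range(db, var):
        '''Determine the first and final year for the observed data.'''
        conditions = {var: {'$ne': False}}
        # largest value first -> maximum year (the hunk above sorts
        # ASCENDING here, which would yield the minimum instead)
        obs = db.find(conditions).sort(var, pymongo.DESCENDING).limit(1)[0]
        max_year = obs[var].year + 1
        # smallest value first -> minimum year
        obs = db.find(conditions).sort(var, pymongo.ASCENDING).limit(1)[0]
        min_year = obs[var].year
        return min_year, max_year

The year range is also now only computed when the target collection name contains 'editors_dataset', presumably because other collections (such as the diffs dataset) lack the 'new_wikipedian' field.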
Index: trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py |
— | — | @@ -57,7 +57,7 @@ |
58 | 58 | |
59 | 59 | |
60 | 60 | def retrieve_variables(obs, username, date): |
61 | | - data = db.find_one('username', username) |
| 61 | + data = db.find_one({'username': username}) |
62 | 62 | year = str(date.year) |
63 | 63 | month = str(date.month) |
64 | 64 | if data: |
Index: trunk/tools/editor_trends/manage.py |
— | — | @@ -150,14 +150,8 @@ |
151 | 151 | language = languages.init(language_code) |
152 | 152 | project = projects.init(project) |
153 | 153 | pjc = projects.ProjectContainer() |
154 | | - #rts = runtime_settings.RunTimeSettings(project, language) |
| 154 | + rts = runtime_settings.RunTimeSettings(project, language) |
155 | 155 | |
156 | | - file_choices = {'meta-full': 'stub-meta-history.xml.gz', |
157 | | - 'meta-current': 'stub-meta-current.xml.gz', |
158 | | - 'history-full': 'pages-meta-history.xml.7z', |
159 | | - 'history-current': 'pages-meta-current.xml.bz2' |
160 | | - } |
161 | | - |
162 | 156 | #Init Argument Parser |
163 | 157 | parser = ArgumentParser(prog='manage', formatter_class=RawTextHelpFormatter) |
164 | 158 | subparsers = parser.add_subparsers(help='sub - command help') |
— | — | @@ -218,7 +212,7 @@ |
219 | 213 | parser_dataset.add_argument('-c', '--charts', |
220 | 214 | action='store', |
221 | 215 | help='Should be a valid function name that matches one of the plugin functions', |
222 | | - default=inventory.available_analyses()['new_editor_count']) |
| 216 | + default='new_editor_count') |
223 | 217 | |
224 | 218 | parser_dataset.add_argument('-k', '--keywords', |
225 | 219 | action='store', |
— | — | @@ -256,6 +250,13 @@ |
257 | 251 | help='Indicate whether the output is for Kaggle or not', |
258 | 252 | default=False) |
259 | 253 | |
| 254 | + |
| 255 | + parser.add_argument('-t', '--collection', |
| 256 | + action='store', |
| 257 | + help='Name of default collection', |
| 258 | + default='editors_dataset' |
| 259 | + ) |
| 260 | + |
260 | 261 | parser.add_argument('-l', '--language', |
261 | 262 | action='store', |
262 | 263 | help='Example of valid languages.', |
— | — | @@ -269,28 +270,17 @@ |
270 | 271 | choices=pjc.supported_projects(), |
271 | 272 | default='wiki') |
272 | 273 | |
273 | | - parser.add_argument('-c', '--collection', |
274 | | - action='store', |
275 | | - help='Name of MongoDB collection', |
276 | | - default='editors_raw') |
277 | | - |
278 | | - |
279 | 274 | parser.add_argument('-ns', '--namespace', |
280 | 275 | action='store', |
281 | 276 | help='A list of namespaces to include for analysis.', |
282 | 277 | default='0') |
283 | 278 | |
284 | | - parser.add_argument('-db', '--database', |
285 | | - action='store', |
286 | | - help='Specify the database that you want to use. Valid choices are mongo and cassandra.', |
287 | | - default='mongo') |
288 | | - |
289 | 279 | parser.add_argument('-f', '--file', |
290 | 280 | action='store', |
291 | | - choices=file_choices, |
| 281 | + choices=rts.file_choices, |
292 | 282 | help='Indicate which dump you want to download. Valid choices are:\n \ |
293 | | - %s' % ''.join([f + ',\n' for f in file_choices]), |
294 | | - default=file_choices['meta-full']) |
| 283 | + %s' % ''.join([f + ',\n' for f in rts.file_choices]), |
| 284 | + default='meta-full') |
295 | 285 | |
296 | 286 | return parser |
297 | 287 | |
— | — | @@ -353,6 +343,7 @@ |
354 | 344 | log.to_db(rts, 'dataset', 'store', stopwatch, event='start') |
355 | 345 | log.to_csv(logger, rts, 'Start', 'Store', store_launcher) |
356 | 346 | store.launcher(rts) |
| 347 | + #store.launcher_articles(rts) |
357 | 348 | stopwatch.elapsed() |
358 | 349 | log.to_db(rts, 'dataset', 'store', stopwatch, event='finish') |
359 | 350 | log.to_csv(logger, rts, 'Finish', 'Store', store_launcher) |
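Note: the collection selector moves from -c, which collided with -c/--charts on the dataset subparser, to -t on the top-level parser; the --charts default becomes the plain plugin name instead of a function object pulled from inventory.available_analyses(); and the file_choices map migrates into RunTimeSettings, with --file now taking the symbolic key. A hypothetical invocation under the new flags, assuming the subparser is registered as 'dataset' (argparse requires top-level options to precede the subcommand):

    python manage.py -t editors_dataset -l en dataset -c new_editor_count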
Index: trunk/tools/editor_trends/etl/variables.py |
— | — | @@ -68,21 +68,22 @@ |
69 | 69 | return title.text |
70 | 70 | |
71 | 71 | |
72 | | -def parse_title_meta_data(title, namespace): |
| 72 | +def parse_title_meta_data(title, ns, namespaces): |
73 | 73 | ''' |
74 | 74 | This function categorizes an article to assist the Wikimedia Taxonomy |
75 | 75 | project. See |
76 | 76 | http://meta.wikimedia.org/wiki/Contribution_Taxonomy_Project/Research_Questions |
77 | 77 | ''' |
78 | 78 | title_meta = {} |
79 | | - if not namespace: |
| 79 | + if not ns: |
80 | 80 | return title_meta |
81 | | - |
| 81 | + namespace = '%s:' % namespaces[ns] |
| 82 | + title = title.replace(namespace, '') |
82 | 83 | title_meta['title'] = title |
83 | | - title_meta['ns'] = namespace |
| 84 | + title_meta['ns'] = ns |
84 | 85 | if title.startswith('List of'): |
85 | 86 | title_meta['category'] = 'List' |
86 | | - elif namespace == 4 or namespace == 5: |
| 87 | + elif ns == 4 or ns == 5: |
87 | 88 | if title.find('Articles for deletion') > -1: |
88 | 89 | title_meta['category'] = 'Deletion' |
89 | 90 | elif title.find('Mediation Committee') > -1: |
— | — | @@ -105,6 +106,7 @@ |
106 | 107 | title_meta['category'] = 'Featured Topic' |
107 | 108 | elif title.find('Good Article') > -1: |
108 | 109 | title_meta['category'] = 'Good Article' |
| 110 | + #print title_meta |
109 | 111 | return title_meta |
110 | 112 | |
111 | 113 | |
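Note: parse_title_meta_data now receives the numeric namespace id plus the id-to-name lookup so it can strip the localized namespace prefix before categorizing the title. Worked through with a hypothetical lookup:

    namespaces = {4: 'Wikipedia'}     # hypothetical id -> name lookup
    ns = 4
    title = 'Wikipedia:Articles for deletion/Example'

    prefix = '%s:' % namespaces[ns]   # 'Wikipedia:'
    title = title.replace(prefix, '') # 'Articles for deletion/Example'
    # resulting metadata: {'title': 'Articles for deletion/Example',
    #                      'ns': 4, 'category': 'Deletion'}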
Index: trunk/tools/editor_trends/etl/store.py |
— | — | @@ -44,32 +44,31 @@ |
45 | 45 | while True: |
46 | 46 | try: |
47 | 47 | filename = self.tasks.get(block=False) |
48 | | - except Empty: |
49 | | - break |
| 48 | + self.tasks.task_done() |
| 49 | + if filename == None: |
| 50 | + self.result.put(None) |
| 51 | + break |
50 | 52 | |
51 | | - self.tasks.task_done() |
52 | | - if filename == None: |
53 | | - self.result.put(None) |
54 | | - break |
| 53 | + fh = file_utils.create_txt_filehandle(self.rts.sorted, filename, |
| 54 | + 'r', 'utf-8') |
| 55 | + for line in file_utils.read_raw_data(fh): |
| 56 | + if len(line) == 1 or len(line) == 4: |
| 57 | + continue |
| 58 | + editor = line[0] |
| 59 | + #print 'Parsing %s' % editor |
| 60 | + if prev_editor != editor and prev_editor != -1: |
| 61 | + editor_cache.add(prev_editor, 'NEXT') |
55 | 62 | |
56 | | - fh = file_utils.create_txt_filehandle(self.rts.sorted, filename, |
57 | | - 'r', 'utf-8') |
58 | | - for line in file_utils.read_raw_data(fh): |
59 | | - if len(line) == 1 or len(line) == 4: |
60 | | - continue |
61 | | - editor = line[0] |
62 | | - #print 'Parsing %s' % editor |
63 | | - if prev_editor != editor and prev_editor != -1: |
64 | | - editor_cache.add(prev_editor, 'NEXT') |
| 63 | + data = prepare_data(line) |
| 64 | + #print editor, data['username'] |
| 65 | + editor_cache.add(editor, data) |
| 66 | + prev_editor = editor |
| 67 | + fh.close() |
| 68 | + self.result.put(True) |
| 69 | + except Empty: |
| 70 | + pass |
65 | 71 | |
66 | | - data = prepare_data(line) |
67 | | - #print editor, data['username'] |
68 | | - editor_cache.add(editor, data) |
69 | | - prev_editor = editor |
70 | | - fh.close() |
71 | | - self.result.put(True) |
72 | 72 | |
73 | | - |
74 | 73 | def prepare_data(line): |
75 | 74 | ''' |
76 | 75 | Prepare a single line to store in the database, this entails converting |
— | — | @@ -103,34 +102,34 @@ |
104 | 103 | while True: |
105 | 104 | try: |
106 | 105 | filename = tasks.get(block=False) |
| 106 | + if filename == None: |
| 107 | + self.result.put(None) |
| 108 | + break |
| 109 | + print 'Processing %s...' % filename |
| 110 | + fh = file_utils.create_txt_filehandle(rts.txt, filename, 'r', 'utf-8') |
| 111 | + for line in fh: |
| 112 | + line = line.strip() |
| 113 | + line = line.split('\t') |
| 114 | + data = {} |
| 115 | + x, y = 0, 1 |
| 116 | + while y < len(line): |
| 117 | + key, value = line[x], line[y] |
| 118 | + if key == 'ns' or key == 'id': |
| 119 | + data[key] = int(value) |
| 120 | + else: |
| 121 | + data[key] = value |
| 122 | + x += 2 |
| 123 | + y += 2 |
| 124 | + db.insert(data) |
| 125 | + fh.close() |
107 | 126 | except Empty: |
108 | | - continue |
109 | | - |
110 | | - if filename == None: |
111 | | - break |
112 | | - print 'Processing %s...' % filename |
113 | | - fh = file_utils.create_txt_filehandle(rts.txt, filename, 'r', 'utf-8') |
114 | | - for line in fh: |
115 | | - line = line.strip() |
116 | | - line = line.split('\t') |
117 | | - data = {} |
118 | | - x, y = 0, 1 |
119 | | - while y < len(line): |
120 | | - key, value = line[x], line[y] |
121 | | - if key == 'ns' or key == 'id': |
122 | | - data[key] = int(value) |
123 | | - else: |
124 | | - data[key] = value |
125 | | - x += 2 |
126 | | - y += 2 |
127 | | - db.insert(data) |
128 | | - fh.close() |
| 127 | + pass |
129 | 128 | print 'Done storing articles...' |
130 | 129 | |
131 | 130 | |
132 | 131 | def launcher_articles(rts): |
133 | 132 | ''' |
134 | | - This function reads titles.csv and stores it in a separate collection. |
| 133 | + This function reads articles.csv and stores it in a separate collection. |
135 | 134 | Besides containing the title of an article, it also includes: |
136 | 135 | * namespace |
137 | 136 | * category (if any) |
— | — | @@ -172,7 +171,6 @@ |
173 | 172 | This is the main entry point and creates a number of workers and launches |
174 | 173 | them. |
175 | 174 | ''' |
176 | | - #launcher_articles(rts) |
177 | 175 | print 'Input directory is: %s ' % rts.sorted |
178 | 176 | db = storage.init_database(rts.storage, rts.dbname, rts.editors_raw) |
179 | 177 | db.drop_collection() |
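Note: both worker loops in store.py are restructured around the same poison-pill pattern: acknowledge the task as soon as it is dequeued, break on None, and treat an empty queue as a reason to keep polling rather than to exit. Condensed to its skeleton (process() is a stand-in for the per-file body):

    from Queue import Empty  # Python 2; tasks is a multiprocessing.JoinableQueue

    def consume(tasks, result):
        while True:
            try:
                filename = tasks.get(block=False)
                tasks.task_done()
                if filename is None:      # poison pill: propagate and stop
                    result.put(None)
                    break
                process(filename)         # stand-in for the parsing body
                result.put(True)
            except Empty:
                pass                      # queue momentarily empty; poll again

Two caveats: get(block=False) combined with pass busy-waits whenever the producer is slower than the consumers (a blocking get with a timeout would avoid spinning), and the second hunk adds self.result.put(None) inside what appears to be a plain function with no self in scope, which would raise a NameError once the poison pill arrives.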
Index: trunk/tools/editor_trends/etl/downloader.py |
— | — | @@ -29,7 +29,7 @@ |
30 | 30 | from utils import log |
31 | 31 | |
32 | 32 | |
33 | | -def download_wiki_file(task_queue, properties): |
| 33 | +def download_wiki_file(task_queue, rts): |
34 | 34 | ''' |
35 | 35 | This is a very simple replacement for wget and curl because Windows does |
36 | 36 | not have these tools installed by default |
— | — | @@ -46,34 +46,34 @@ |
47 | 47 | widgets = log.init_progressbar_widgets(filename) |
48 | 48 | extension = os.path.splitext(filename)[1] |
49 | 49 | filemode = file_utils.determine_file_mode(extension) |
50 | | - filesize = http_utils.determine_remote_filesize(properties.wp_dump_location, |
51 | | - properties.dump_relative_path, |
| 50 | + filesize = http_utils.determine_remote_filesize(rts.wp_dump_location, |
| 51 | + rts.dump_relative_path, |
52 | 52 | filename) |
53 | 53 | |
54 | | - mod_date = http_utils.determine_modified_date(properties.wp_dump_location, |
55 | | - properties.dump_relative_path, |
| 54 | + mod_date = http_utils.determine_modified_date(rts.wp_dump_location, |
| 55 | + rts.dump_relative_path, |
56 | 56 | filename) |
57 | | - mod_date = text_utils.convert_timestamp_to_datetime_naive(mod_date, properties.timestamp_server) |
58 | | - if file_utils.check_file_exists(properties.input_location, filename): |
59 | | - mod_loc = file_utils.get_modified_date(properties.input_location, filename) |
60 | | - if mod_loc == mod_date and (properties.force == False or properties.force == None): |
61 | | - print 'You already have downloaded the most recent %s%s dumpfile.' % (properties.language.code, properties.project.name) |
| 57 | + mod_date = text_utils.convert_timestamp_to_datetime_naive(mod_date, rts.timestamp_server) |
| 58 | + if file_utils.check_file_exists(rts.input_location, filename): |
| 59 | + mod_loc = file_utils.get_modified_date(rts.input_location, filename) |
| 60 | + if mod_loc == mod_date and (rts.force == False or rts.force == None): |
| 61 | + print 'You already have downloaded the most recent %s%s dumpfile.' % (rts.language.code, rts.project.name) |
62 | 62 | continue |
63 | 63 | |
64 | 64 | if filemode == 'w': |
65 | | - fh = file_utils.create_txt_filehandle(properties.input_location, |
| 65 | + fh = file_utils.create_txt_filehandle(rts.input_location, |
66 | 66 | filename, |
67 | 67 | filemode, |
68 | | - properties.encoding) |
| 68 | + rts.encoding) |
69 | 69 | else: |
70 | | - fh = file_utils.create_binary_filehandle(properties.input_location, filename, 'wb') |
| 70 | + fh = file_utils.create_binary_filehandle(rts.input_location, filename, 'wb') |
71 | 71 | |
72 | 72 | if filesize != -1: |
73 | 73 | pbar = progressbar.ProgressBar(widgets=widgets, maxval=filesize).start() |
74 | 74 | else: |
75 | 75 | pbar = progressbar.ProgressBar(widgets=widgets).start() |
76 | 76 | try: |
77 | | - path = '%s%s' % (properties.dump_absolute_path, filename) |
| 77 | + path = '%s%s' % (rts.dump_absolute_path, filename) |
78 | 78 | req = urllib2.Request(path) |
79 | 79 | response = urllib2.urlopen(req) |
80 | 80 | while True: |
— | — | @@ -94,24 +94,24 @@ |
95 | 95 | print 'Error: %s' % error |
96 | 96 | finally: |
97 | 97 | fh.close() |
98 | | - file_utils.set_modified_data(mod_date, properties.input_location, filename) |
| 98 | + file_utils.set_modified_data(mod_date, rts.input_location, filename) |
99 | 99 | |
100 | 100 | |
101 | 101 | |
102 | | -def launcher(properties, logger): |
| 102 | +def launcher(rts, logger): |
103 | 103 | print 'Creating list of files to be downloaded...' |
104 | | - tasks = http_utils.create_list_dumpfiles(properties.wp_dump_location, |
105 | | - properties.dump_relative_path, |
106 | | - properties.dump_filename) |
| 104 | + tasks = http_utils.create_list_dumpfiles(rts.wp_dump_location, |
| 105 | + rts.dump_relative_path, |
| 106 | + rts.dump_filename) |
107 | 107 | #print tasks.qsize() |
108 | | - #if tasks.qsize() < properties.settings.number_of_processes: |
109 | | - # properties..number_of_processes = tasks.qsize() |
| 108 | + #if tasks.qsize() < rts.settings.number_of_processes: |
| 109 | + # rts..number_of_processes = tasks.qsize() |
110 | 110 | if tasks.qsize() > 2: |
111 | 111 | consumers = [multiprocessing.Process(target=download_wiki_file, |
112 | | - args=(tasks, properties)) |
113 | | - for i in xrange(properties.number_of_processes)] |
| 112 | + args=(tasks, rts)) |
| 113 | + for i in xrange(rts.number_of_processes)] |
114 | 114 | else: consumers = [multiprocessing.Process(target=download_wiki_file, |
115 | | - args=(tasks, properties)) |
| 115 | + args=(tasks, rts)) |
116 | 116 | for i in xrange(1)] |
117 | 117 | print 'Starting consumers to download files...' |
118 | 118 | for w in consumers: |
Index: trunk/tools/editor_trends/etl/differ.py |
— | — | @@ -213,7 +213,7 @@ |
214 | 214 | |
215 | 215 | def store_json_diffs(rts): |
216 | 216 | files = os.listdir(rts.diffs) |
217 | | - print files, rts.diffs |
| 217 | + #print files, rts.diffs |
218 | 218 | db = storage.init_database(rts.storage, rts.dbname, rts.diffs_dataset) |
219 | 219 | buffer = cStringIO.StringIO() |
220 | 220 | |
— | — | @@ -226,12 +226,10 @@ |
227 | 227 | obj = json.loads(obj) |
228 | 228 | obj[0]['article_id'] = int(obj[0]['article_id']) |
229 | 229 | for key, value in obj[0].iteritems(): |
230 | | - if type(value) == type(dict()): |
231 | | - value['timestamp'] = datetime.strptime(value['timestamp'], '%Y-%m-%dT%H:%M:%S') |
| 230 | + if key == 'timestamp': |
| 231 | + value = datetime.strptime(value, '%Y-%m-%dT%H:%M:%S') |
232 | 232 | obj[0][key] = value |
233 | 233 | obj = obj[0] |
234 | | - #print obj |
235 | | - #print len(obj) |
236 | 234 | try: |
237 | 235 | db.save(obj) |
238 | 236 | except bson.errors.InvalidDocument, error: |
— | — | @@ -279,6 +277,7 @@ |
280 | 278 | |
281 | 279 | def store_diffs_debug(rts): |
282 | 280 | db = storage.init_database(rts) |
| 281 | + db.drop_collection() |
283 | 282 | files = os.listdir(rts.diffs) |
284 | 283 | for filename in files: |
285 | 284 | fh = file_utils.create_txt_filehandle(rts.diffs, filename, 'r', 'utf-8') |
— | — | @@ -335,20 +334,22 @@ |
336 | 335 | print 'Inserting poison pill %s...' % x |
337 | 336 | input_queue.put(None) |
338 | 337 | |
339 | | - extracters = [Process(target=stream_raw_xml, args=[input_queue, process_id, |
340 | | - rts, format]) |
341 | | - for process_id in xrange(processors)] |
342 | | - for extracter in extracters: |
343 | | - extracter.start() |
| 338 | +# extracters = [Process(target=stream_raw_xml, args=[input_queue, process_id, |
| 339 | +# rts, format]) |
| 340 | +# for process_id in xrange(processors)] |
| 341 | +# for extracter in extracters: |
| 342 | +# extracter.start() |
| 343 | +# |
| 344 | +# input_queue.join() |
344 | 345 | |
345 | | - input_queue.join() |
346 | | - |
347 | 346 | store_json_diffs(rts) |
348 | 347 | db = storage.init_database(rts.storage, rts.dbname, rts.diffs_dataset) |
| 348 | + |
349 | 349 | db.add_index('title') |
350 | 350 | db.add_index('timestamp') |
351 | 351 | db.add_index('username') |
352 | 352 | db.add_index('ns') |
| 353 | + db.add_index('editor') |
353 | 354 | |
354 | 355 | |
355 | 356 | def launcher_simple(): |
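Note: timestamps in the stored JSON diffs are now parsed at the top level of each revision object (keyed 'timestamp') instead of inside nested dict values, the extraction processes are temporarily commented out so a run starts directly at store_json_diffs, the debug loader drops its collection before refilling it, and an index on 'editor' is added. The new conversion, applied to a hypothetical record:

    from datetime import datetime

    obj = {'article_id': '12345',               # hypothetical JSON diff
           'timestamp': '2011-05-01T12:00:00',
           'username': 'Example'}
    obj['article_id'] = int(obj['article_id'])
    for key, value in obj.iteritems():
        if key == 'timestamp':
            value = datetime.strptime(value, '%Y-%m-%dT%H:%M:%S')
        obj[key] = value  # rebinds values of existing keys only, so
                          # mutating during iteritems() is safe here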
Index: trunk/tools/editor_trends/etl/extracter.py |
— | — | @@ -72,8 +72,8 @@ |
73 | 73 | text = variables.extract_revision_text(revision, xml_namespace) |
74 | 74 | article.update(contributor) |
75 | 75 | |
76 | | - comment = variables.extract_comment_text(revision_id, revision) |
77 | | - cache.comments.update(comment) |
| 76 | + #comment = variables.extract_comment_text(revision_id, revision) |
| 77 | + #cache.comments.update(comment) |
78 | 78 | |
79 | 79 | timestamp = revision.find('%s%s' % (xml_namespace, 'timestamp')).text |
80 | 80 | article['timestamp'] = timestamp |
— | — | @@ -139,7 +139,7 @@ |
140 | 140 | title = variables.parse_title(elem) |
141 | 141 | article['title'] = title |
142 | 142 | current_namespace = variables.determine_namespace(title, namespaces, include_ns) |
143 | | - title_meta = variables.parse_title_meta_data(title, current_namespace) |
| 143 | + title_meta = variables.parse_title_meta_data(title, current_namespace, namespaces) |
144 | 144 | if current_namespace < 6: |
145 | 145 | parse = True |
146 | 146 | article['namespace'] = current_namespace |
— | — | @@ -172,7 +172,7 @@ |
173 | 173 | Determine id of article |
174 | 174 | ''' |
175 | 175 | article['article_id'] = elem.text |
176 | | - if isinstance(current_namespace, int): |
| 176 | + if isinstance(current_namespace, int) and title_meta != {}: |
177 | 177 | cache.articles[article['article_id']] = title_meta |
178 | 178 | id = True |
179 | 179 | elem.clear() |
Index: trunk/tools/editor_trends/kaggle/training.py |
— | — | @@ -26,7 +26,7 @@ |
27 | 27 | |
28 | 28 | from classes import storage |
29 | 29 | |
30 | | -location = '/home/diederik/wikimedia/en/wiki/kaggle_prediction_solution' |
| 30 | +location = '/home/diederik/wikimedia/en/wiki/kaggle' |
31 | 31 | files = os.listdir(location) |
32 | 32 | files.reverse() |
33 | 33 | |
— | — | @@ -67,7 +67,7 @@ |
68 | 68 | continue |
69 | 69 | id = line[2] |
70 | 70 | if id not in ids and id not in ignore_ids: |
71 | | - res = db.find_one('editor', id) |
| 71 | + res = db.find_one({'editor': id}) |
72 | 72 | if res == None: |
73 | 73 | ignore_ids.add(id) |
74 | 74 | continue |
— | — | @@ -100,7 +100,7 @@ |
101 | 101 | fh = codecs.open('solutions.tsv', 'w', 'utf-8') |
102 | 102 | for id in ids: |
103 | 103 | if id not in ignore_ids: |
104 | | - obs = db.find_one('editor', str(id), 'cum_edit_count_main_ns') |
| 104 | + obs = db.find_one({'editor': str(id)}, 'cum_edit_count_main_ns') |
105 | 105 | if obs != None: |
106 | 106 | x += 1 |
107 | 107 | n = obs['cum_edit_count_main_ns'] |
Index: trunk/tools/editor_trends/classes/buffer.py |
— | — | @@ -124,7 +124,9 @@ |
125 | 125 | def simplify(self, revision): |
126 | 126 | row = [] |
127 | 127 | for key in self.keys: |
128 | | - row.append(revision[key].decode('utf-8')) |
| 128 | + value = revision.get(key, None) |
| 129 | + if value != None: |
| 130 | + row.append(value.decode('utf-8')) |
129 | 131 | return row |
130 | 132 | |
131 | 133 | def stringify(self, revision): |
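Note: simplify() used to raise KeyError when a revision lacked one of the expected keys; it now skips missing values. Skipping does shift every subsequent column left, so consumers that index rows positionally could silently misalign; padding with an empty string would keep the row length stable. A sketch of that alternative, under the same assumptions (self.keys lists the expected fields, values are utf-8 byte strings):

    def simplify(self, revision):
        row = []
        for key in self.keys:
            value = revision.get(key)
            # pad instead of skip, so len(row) always equals len(self.keys)
            row.append(value.decode('utf-8') if value is not None else u'')
        return row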
Index: trunk/tools/editor_trends/classes/runtime_settings.py |
— | — | @@ -50,53 +50,57 @@ |
51 | 51 | self.project = project |
52 | 52 | self.language = language |
53 | 53 | self.dbname = 'wikilytics' |
| 54 | + self.file_choices = {'meta-full': 'stub-meta-history.xml.gz', |
| 55 | + 'meta-current': 'stub-meta-current.xml.gz', |
| 56 | + 'history-full': 'pages-meta-history.xml.7z', |
| 57 | + 'history-current': 'pages-meta-current.xml.bz2' |
| 58 | + } |
| 59 | + if args: |
| 60 | + self.args = args |
| 61 | + self.id = '%s%s_%s' % (self.language.code, self.project.name, 'current_month') |
| 62 | + #print self.settings.input_location |
| 63 | + #print self.get_value('location') |
| 64 | + self.project = self.update_project_settings() |
| 65 | + self.language = self.update_language_settings() |
54 | 66 | |
55 | | - #if args: |
56 | | - self.args = args |
57 | | - self.id = '%s%s_%s' % (self.language.code, self.project.name, 'current_month') |
58 | | - #print self.settings.input_location |
59 | | - #print self.get_value('location') |
60 | | - self.project = self.update_project_settings() |
61 | | - self.language = self.update_language_settings() |
| 67 | + self.input_location = self.set_input_location() |
| 68 | + self.output_location = self.set_output_location() |
62 | 69 | |
63 | | - self.input_location = self.set_input_location() |
64 | | - self.output_location = self.set_output_location() |
| 70 | + self.plugins = self.set_plugin() |
| 71 | + self.keywords = self.split_keywords() |
| 72 | + self.namespaces = self.get_namespaces() |
65 | 73 | |
66 | | - self.plugins = self.set_plugin() |
67 | | - self.keywords = self.split_keywords() |
68 | | - self.namespaces = self.get_namespaces() |
| 74 | + #self.kaggle = self.get_value('kaggle') |
| 75 | + self.function = self.get_value('func') |
| 76 | + self.ignore = self.get_value('except') |
| 77 | + self.force = self.get_value('force') |
| 78 | + self.analyzer_collection = self.get_value('collection') |
69 | 79 | |
70 | | - #self.kaggle = self.get_value('kaggle') |
71 | | - self.function = self.get_value('func') |
72 | | - self.ignore = self.get_value('except') |
73 | | - self.force = self.get_value('force') |
74 | | - self.analyzer_collection = self.get_value('collection') |
| 80 | + self.dataset = os.path.join(self.dataset_location, self.project.name) |
| 81 | + self.txt = os.path.join(self.output_location, 'txt') |
| 82 | + self.sorted = os.path.join(self.output_location, 'sorted') |
| 83 | + self.diffs = os.path.join(self.output_location, 'diffs') |
75 | 84 | |
76 | | - self.dataset = os.path.join(self.dataset_location, self.project.name) |
77 | | - self.txt = os.path.join(self.output_location, 'txt') |
78 | | - self.sorted = os.path.join(self.output_location, 'sorted') |
79 | | - self.diffs = os.path.join(self.output_location, 'diffs') |
| 85 | + self.directories = [self.output_location, |
| 86 | + self.txt, |
| 87 | + self.sorted, |
| 88 | + self.dataset, |
| 89 | + self.diffs] |
| 90 | + self.verify_environment(self.directories) |
80 | 91 | |
81 | | - self.directories = [self.output_location, |
82 | | - self.txt, |
83 | | - self.sorted, |
84 | | - self.dataset, |
85 | | - self.diffs] |
86 | | - self.verify_environment(self.directories) |
| 92 | + #Wikidump file related variables |
| 93 | + self.dump_filename = self.generate_wikidump_filename() |
| 94 | + self.dump_relative_path = self.set_dump_path() |
| 95 | + self.dump_absolute_path = self.set_dump_path(absolute=True) |
87 | 96 | |
88 | | - #Wikidump file related variables |
89 | | - self.dump_filename = self.generate_wikidump_filename() |
90 | | - self.dump_relative_path = self.set_dump_path() |
91 | | - self.dump_absolute_path = self.set_dump_path(absolute=True) |
| 97 | + #Collection names |
| 98 | + self.editors_raw = '%s%s_editors_raw' % (self.language.code, self.project.name) |
| 99 | + self.editors_dataset = '%s%s_editors_dataset' % (self.language.code, self.project.name) |
| 100 | + self.articles_raw = '%s%s_articles_raw' % (self.language.code, self.project.name) |
| 101 | + self.diffs_dataset = '%s%s_diffs_dataset' % (self.language.code, self.project.name) |
| 102 | + self.collection = self.set_collection() |
92 | 103 | |
93 | | - #Collection names |
94 | | - self.editors_raw = '%s%s_editors_raw' % (self.language.code, self.project.name) |
95 | | - self.editors_dataset = '%s%s_editors_dataset' % (self.language.code, self.project.name) |
96 | | - self.articles_raw = '%s%s_articles_raw' % (self.language.code, self.project.name) |
97 | | - self.diffs_dataset = '%s%s_diffs_dataset' % (self.language.code, self.project.name) |
98 | 104 | |
99 | | - |
100 | | - |
101 | 105 | def __str__(self): |
102 | 106 | return 'Runtime Settings for project %s %s' % (self.language.name, |
103 | 107 | self.project.full_name) |
— | — | @@ -105,14 +109,8 @@ |
106 | 110 | for item in self.__dict__: |
107 | 111 | yield item |
108 | 112 | |
109 | | - def dict(self): |
110 | | - ''' |
111 | | - Return a dictionary with all properties and their values |
112 | | - ''' |
113 | | - props = {} |
114 | | - for prop in self: |
115 | | - props[prop] = getattr(self, prop) |
116 | | - return props |
| 113 | + def set_collection(self): |
| 114 | + return getattr(self, self.get_value('collection')) |
117 | 115 | |
118 | 116 | def split_keywords(self): |
119 | 117 | ''' |
— | — | @@ -141,7 +139,7 @@ |
142 | 140 | ''' |
143 | 141 | plugin = self.get_value('charts') |
144 | 142 | requested_plugins = [] |
145 | | - if plugin != None and isinstance(plugin, type('module')) == False: |
| 143 | + if plugin != None: |
146 | 144 | plugins = plugin.split(',') |
147 | 145 | available_plugins = inventory.available_analyses() |
148 | 146 | for plugin in plugins: |
— | — | @@ -220,8 +218,9 @@ |
221 | 219 | ''' |
222 | 220 | Generate the main name of the wikidump file to be downloaded. |
223 | 221 | ''' |
| 222 | + choice = self.get_value('file') |
224 | 223 | return '%s%s-latest-%s' % (self.language.code, self.project.name, |
225 | | - self.get_value('file')) |
| 224 | + self.file_choices[choice]) |
226 | 225 | |
227 | 226 | def update_language_settings(self): |
228 | 227 | ''' |
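Note: the constructor now guards all argument-dependent setup behind `if args:`, absorbs the file_choices map from manage.py, and replaces the removed dict() helper with set_collection(), which resolves the --collection flag by attribute name. Worked through with hypothetical values for English Wikipedia:

    # --collection defaults to 'editors_dataset' (see manage.py above)
    rts.editors_dataset               # 'enwiki_editors_dataset'
    rts.collection                    # getattr(rts, 'editors_dataset')
                                      # -> 'enwiki_editors_dataset'

    # --file now carries a symbolic key, resolved on demand:
    rts.file_choices['meta-full']     # 'stub-meta-history.xml.gz'
    rts.generate_wikidump_filename()  # 'enwiki-latest-stub-meta-history.xml.gz'

set_plugin also no longer needs to special-case module objects now that --charts defaults to a string.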
Index: trunk/tools/editor_trends/classes/storage.py |
— | — | @@ -160,29 +160,22 @@ |
161 | 161 | assert isinstance(data, dict), 'You need to feed me dictionaries.' |
162 | 162 | self.db[self.collection].update({key: value}, {'$set': data}) |
163 | 163 | |
164 | | - def find(self, key=None, qualifier=None): |
165 | | - if qualifier == 'min': |
166 | | - return self.db[self.collection].find({ |
167 | | - key : {'$ne' : False}}).sort(key, pymongo.ASCENDING).limit(1)[0] |
168 | | - elif qualifier == 'max': |
169 | | - return self.db[self.collection].find({ |
170 | | - key : {'$ne' : False}}).sort(key, pymongo.DESCENDING).limit(1)[0] |
171 | | - elif qualifier: |
172 | | - return self.db[self.collection].find({key : qualifier}) |
173 | | - elif key != None: |
174 | | - return self.db[self.collection].find({}, fields=[key]) |
| 164 | + def find(self, conditions, vars=None): |
| 165 | + if conditions: |
| 166 | + return self.db[self.collection].find(conditions, fields=vars) |
175 | 167 | else: |
176 | 168 | return self.db[self.collection].find() |
177 | 169 | |
178 | | - def find_one(self, key, value, vars=None): |
| 170 | + def find_one(self, conditions, vars=None): |
179 | 171 | if vars: |
180 | 172 | #if you only want to retrieve a specific variable(s) then you need to |
181 | 173 | #specify vars, if vars is None then you will get the entire BSON object |
182 | 174 | vars = vars.split(',') |
183 | 175 | vars = dict([(var, 1) for var in vars]) |
184 | | - return self.db[self.collection].find_one({key: value}, vars) |
| 176 | + return self.db[self.collection].find_one(conditions, vars) |
185 | 177 | else: |
186 | | - return self.db[self.collection].find_one({key: value}) |
| 178 | + #conditions should be a dictionary |
| 179 | + return self.db[self.collection].find_one(conditions) |
187 | 180 | |
188 | 181 | |
189 | 182 | def drop_collection(self): |
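Note: find() and find_one() now accept a MongoDB-style conditions dict plus an optional projection instead of the old key/value/qualifier arguments; the min/max qualifiers are gone, and callers such as determine_project_year_range sort a cursor themselves. This is the contract change that every call-site edit in this changeset tracks (analyses, kaggle/training, classes/analytics, utils/log). Side by side:

    # old key/qualifier style:
    db.find_one('editor', '12345')
    db.find_one('editor', str(id), 'cum_edit_count_main_ns')
    db.find('new_wikipedian', qualifier='max')

    # new conditions-dict style:
    db.find_one({'editor': '12345'})
    db.find_one({'editor': str(id)}, 'cum_edit_count_main_ns')
    db.find({'new_wikipedian': {'$ne': False}}).sort(
        'new_wikipedian', pymongo.DESCENDING).limit(1)[0]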
Index: trunk/tools/editor_trends/classes/dataset.py |
— | — | @@ -176,6 +176,7 @@ |
177 | 177 | #self.date = date |
178 | 178 | self.data = 0 |
179 | 179 | self.time_unit = time_unit |
| 180 | + self.date = date |
180 | 181 | self.t1, self.t0 = self.set_date_range(date) |
181 | 182 | self.id = id |
182 | 183 | self.props = [] |
— | — | @@ -515,7 +516,7 @@ |
516 | 517 | variable.max = get_max(data) |
517 | 518 | variable.num_obs = variable.number_of_obs() |
518 | 519 | variable.num_dates = len(variable) |
519 | | - #variable.first_obs, variable.last_obs = variable.get_date_range() |
| 520 | + variable.first_obs, variable.last_obs = variable.get_date_range() |
520 | 521 | |
521 | 522 | def summary(self): |
522 | 523 | ''' |
Index: trunk/tools/editor_trends/classes/analytics.py |
— | — | @@ -64,10 +64,8 @@ |
65 | 65 | |
66 | 66 | def __call__(self): |
67 | 67 | project = 'wiki' |
68 | | - #rts = runtime_settings.init_environment('wiki', 'en', args) |
69 | 68 | for lang in self.languages: |
70 | 69 | self.rts = runtime_settings.init_environment(project, lang, self.args) |
71 | | - #self.rts.editors_dataset = 'editors_dataset' |
72 | 70 | |
73 | 71 | self.rts.dbname = '%s%s' % (lang, project) |
74 | 72 | for cum_cutoff in self.cum_cutoff: |
— | — | @@ -91,15 +89,16 @@ |
92 | 90 | Generic loop function that loops over all the editors of a Wikipedia |
93 | 91 | project and then calls the plugin that does the actual mapping. |
94 | 92 | ''' |
95 | | - db = storage.init_database(self.rts.storage, self.rts.dbname, self.rts.editors_dataset) |
| 93 | + db = storage.init_database(self.rts.storage, self.rts.dbname, self.rts.collection) |
96 | 94 | while True: |
97 | 95 | try: |
98 | 96 | editor_id = self.tasks.get(block=False) |
| 97 | + self.tasks.task_done() |
99 | 98 | if editor_id == None: |
100 | 99 | self.result.put(self.var) |
101 | 100 | break |
102 | | - editor = db.find_one('editor', editor_id) |
103 | | - self.plugin(self.var, editor, dbname=self.rts.dbname, data=self.data) |
| 101 | + editor = db.find_one({'editor': editor_id}) |
| 102 | + self.plugin(self.var, editor, rts=self.rts, data=self.data) |
104 | 103 | self.result.put(True) |
105 | 104 | except Empty: |
106 | 105 | pass |
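Note: the generic mapper now resolves its collection from rts.collection, so --collection determines which dataset an analysis runs over, acknowledges each task as soon as it is dequeued, and hands plugins the full runtime settings rather than just the database name. The implied plugin contract, sketched with a hypothetical plugin body:

    def new_editor_count(var, editor, **kwargs):
        rts = kwargs.pop('rts')    # previously: dbname = kwargs.pop('dbname')
        data = kwargs.pop('data')
        # ... tally observations on var from the editor document ...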
Index: trunk/tools/editor_trends/utils/log.py |
— | — | @@ -31,8 +31,8 @@ |
32 | 32 | def to_db(rts, jobtype, task, timer, event='start'): |
33 | 33 | db = storage.init_database(rts.storage, rts.dbname, 'jobs') |
34 | 34 | created = datetime.datetime.now() |
35 | | - job = db.find_one('hash', rts.id) |
36 | | - |
| 35 | + job = db.find_one({'hash': rts.id}) |
| 36 | + #print job |
37 | 37 | data = {'hash': rts.id, |
38 | 38 | 'created': created, |
39 | 39 | 'jobtype': jobtype, |
— | — | @@ -50,7 +50,7 @@ |
51 | 51 | data['finished'] = True |
52 | 52 | _id = db.save(data) |
53 | 53 | |
54 | | - job = db.find_one('_id', _id) |
| 54 | + job = db.find_one({'_id': _id}) |
55 | 55 | |
56 | 56 | tasks = job['tasks'] |
57 | 57 | t = tasks.get(task, {}) |