r89242 MediaWiki - Code Review archive

Repository:MediaWiki
Revision: < r89241 | r89242 | r89243 >
Date:00:02, 1 June 2011
Author:diederik
Status:deferred
Tags:
Comment:
Preparing for Summer of Research, part 2
Modified paths:
  • /trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py (modified) (history)
  • /trunk/tools/editor_trends/analyses/analyzer.py (modified) (history)
  • /trunk/tools/editor_trends/analyses/plugins/taxonomy_list_makers.py (modified) (history)
  • /trunk/tools/editor_trends/classes/analytics.py (modified) (history)
  • /trunk/tools/editor_trends/classes/buffer.py (modified) (history)
  • /trunk/tools/editor_trends/classes/dataset.py (modified) (history)
  • /trunk/tools/editor_trends/classes/runtime_settings.py (modified) (history)
  • /trunk/tools/editor_trends/classes/storage.py (modified) (history)
  • /trunk/tools/editor_trends/etl/differ.py (modified) (history)
  • /trunk/tools/editor_trends/etl/downloader.py (modified) (history)
  • /trunk/tools/editor_trends/etl/extracter.py (modified) (history)
  • /trunk/tools/editor_trends/etl/store.py (modified) (history)
  • /trunk/tools/editor_trends/etl/variables.py (modified) (history)
  • /trunk/tools/editor_trends/kaggle/training.py (modified) (history)
  • /trunk/tools/editor_trends/manage.py (modified) (history)
  • /trunk/tools/editor_trends/utils/log.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/analyses/plugins/taxonomy_list_makers.py
@@ -71,4 +71,4 @@
7272 cursor = db.find('category', 'List')
7373 for c in cursor:
7474 data[c['id']] = 1
75 - return data
 75+ return data, rts
Index: trunk/tools/editor_trends/analyses/analyzer.py
@@ -24,6 +24,7 @@
2525 import types
2626 import sys
2727 import cPickle
 28+import pymongo
2829 import gc
2930 import os
3031 import progressbar
@@ -65,7 +66,7 @@
6667 def feedback(plugin, rts):
6768 print 'Exporting data for chart: %s' % plugin
6869 print 'Project: %s' % rts.dbname
69 - print 'Dataset: %s' % rts.editors_dataset
 70+ print 'Dataset: %s' % rts.collection
7071
7172
7273 def write_output(ds, rts, stopwatch):
@@ -97,7 +98,8 @@
9899 plugin = retrieve_plugin(func)
99100
100101 if not plugin:
101 - raise exceptions.UnknownPluginError(plugin, self.available_plugins)
 102+ available_plugins = inventory.available_analyses()
 103+ raise exceptions.UnknownPluginError(plugin, available_plugins)
102104 plugin = getattr(plugin, func)
103105
104106 feedback(func, rts)
@@ -110,15 +112,16 @@
111113 obs = dict()
112114 obs_proxy = mgr.dict(obs)
113115
114 - db = storage.init_database(rts.storage, rts.dbname, rts.editors_dataset)
 116+ db = storage.init_database(rts.storage, rts.dbname, rts.collection)
115117 editors = db.retrieve_distinct_keys('editor')
116118 #editors = editors[:500]
117 - min_year, max_year = determine_project_year_range(db, 'new_wikipedian')
 119+ if rts.collection.find('editors_dataset') > -1:
 120+ min_year, max_year = determine_project_year_range(db, 'new_wikipedian')
 121+ kwargs['min_year'] = min_year
 122+ kwargs['max_year'] = max_year
118123
119124 fmt = kwargs.pop('format', 'long')
120125 time_unit = kwargs.pop('time_unit', 'year')
121 - kwargs['min_year'] = min_year
122 - kwargs['max_year'] = max_year
123126
124127
125128 var = dataset.Variable('count', time_unit, lock, obs_proxy, **kwargs)
@@ -153,24 +156,21 @@
154157
155158
156159 ppills = cpu_count()
157 - while True:
158 - while ppills > 0:
159 - try:
160 - res = result.get()
161 - if res == True:
162 - pbar.update(pbar.currval + 1)
163 - else:
164 - ppills -= 1
165 - var = res
166 - print ppills
167 - except Empty:
168 - pass
169 - break
170 - print 'Waiting for tasks...'
 160+ while ppills > 0:
 161+ try:
 162+ res = result.get()
 163+ if res == True:
 164+ pbar.update(pbar.currval + 1)
 165+ else:
 166+ ppills -= 1
 167+ var = res
 168+ except Empty:
 169+ pass
 170+
171171 tasks.join()
172172
173173 var = reconstruct_observations(var)
174 - ds = dataset.Dataset(plugin.func_name, rts, format=fmt, **kwargs)
 174+ ds = dataset.Dataset(func, rts, format=fmt, **kwargs)
175175 ds.add_variable(var)
176176
177177 stopwatch.elapsed()
@@ -178,8 +178,8 @@
179179
180180 ds.summary()
181181
182 - for n, c in get_refcounts()[:100]:
183 - print '%10d %s' % (n, c.__name__)
 182+ #for n, c in get_refcounts()[:100]:
 183+ # print '%10d %s' % (n, c.__name__)
184184
185185
186186 def get_refcounts():
@@ -205,9 +205,12 @@
206206 Determine the first and final year for the observed data
207207 '''
208208 try:
209 - obs = db.find(var, qualifier='max')
 209+ conditions = {var : {'$ne' : False}}
 210+
 211+ obs = db.find(conditions).sort(var, pymongo.ASCENDING).limit(1)[0]
210212 max_year = obs[var].year + 1
211 - obs = db.find(var, qualifier='min')
 213+
 214+ obs = db.find(conditions).sort(var, pymongo.DESCENDING).limit(1)[0]
212215 min_year = obs[var].year
213216 except KeyError:
214217 min_year = 2001
Index: trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py
@@ -57,7 +57,7 @@
5858
5959
6060 def retrieve_variables(obs, username, date):
61 - data = db.find_one('username', username)
 61+ data = db.find_one({'username': username})
6262 year = str(date.year)
6363 month = str(date.month)
6464 if data:
Index: trunk/tools/editor_trends/manage.py
@@ -150,14 +150,8 @@
151151 language = languages.init(language_code)
152152 project = projects.init(project)
153153 pjc = projects.ProjectContainer()
154 - #rts = runtime_settings.RunTimeSettings(project, language)
 154+ rts = runtime_settings.RunTimeSettings(project, language)
155155
156 - file_choices = {'meta-full': 'stub-meta-history.xml.gz',
157 - 'meta-current': 'stub-meta-current.xml.gz',
158 - 'history-full': 'pages-meta-history.xml.7z',
159 - 'history-current': 'pages-meta-current.xml.bz2'
160 - }
161 -
162156 #Init Argument Parser
163157 parser = ArgumentParser(prog='manage', formatter_class=RawTextHelpFormatter)
164158 subparsers = parser.add_subparsers(help='sub - command help')
@@ -218,7 +212,7 @@
219213 parser_dataset.add_argument('-c', '--charts',
220214 action='store',
221215 help='Should be a valid function name that matches one of the plugin functions',
222 - default=inventory.available_analyses()['new_editor_count'])
 216+ default='new_editor_count')
223217
224218 parser_dataset.add_argument('-k', '--keywords',
225219 action='store',
@@ -256,6 +250,13 @@
257251 help='Indicate whether the output is for Kaggle or not',
258252 default=False)
259253
 254+
 255+ parser.add_argument('-t', '--collection',
 256+ action='store',
 257+ help='Name of default collection',
 258+ default='editors_dataset'
 259+ )
 260+
260261 parser.add_argument('-l', '--language',
261262 action='store',
262263 help='Example of valid languages.',
@@ -269,28 +270,17 @@
270271 choices=pjc.supported_projects(),
271272 default='wiki')
272273
273 - parser.add_argument('-c', '--collection',
274 - action='store',
275 - help='Name of MongoDB collection',
276 - default='editors_raw')
277 -
278 -
279274 parser.add_argument('-ns', '--namespace',
280275 action='store',
281276 help='A list of namespaces to include for analysis.',
282277 default='0')
283278
284 - parser.add_argument('-db', '--database',
285 - action='store',
286 - help='Specify the database that you want to use. Valid choices are mongo and cassandra.',
287 - default='mongo')
288 -
289279 parser.add_argument('-f', '--file',
290280 action='store',
291 - choices=file_choices,
 281+ choices=rts.file_choices,
292282 help='Indicate which dump you want to download. Valid choices are:\n \
293 - %s' % ''.join([f + ',\n' for f in file_choices]),
294 - default=file_choices['meta-full'])
 283+ %s' % ''.join([f + ',\n' for f in rts.file_choices]),
 284+ default='meta-full')
295285
296286 return parser
297287
@@ -353,6 +343,7 @@
354344 log.to_db(rts, 'dataset', 'store', stopwatch, event='start')
355345 log.to_csv(logger, rts, 'Start', 'Store', store_launcher)
356346 store.launcher(rts)
 347+ #store.launcher_articles(rts)
357348 stopwatch.elapsed()
358349 log.to_db(rts, 'dataset', 'store', stopwatch, event='finish')
359350 log.to_csv(logger, rts, 'Finish', 'Store', store_launcher)
Index: trunk/tools/editor_trends/etl/variables.py
@@ -68,21 +68,22 @@
6969 return title.text
7070
7171
72 -def parse_title_meta_data(title, namespace):
 72+def parse_title_meta_data(title, ns, namespaces):
7373 '''
7474 This function categorizes an article to assist the Wikimedia Taxonomy
7575 project. See
7676 http://meta.wikimedia.org/wiki/Contribution_Taxonomy_Project/Research_Questions
7777 '''
7878 title_meta = {}
79 - if not namespace:
 79+ if not ns:
8080 return title_meta
81 -
 81+ namespace = '%s:' % namespaces[ns]
 82+ title = title.replace(namespace, '')
8283 title_meta['title'] = title
83 - title_meta['ns'] = namespace
 84+ title_meta['ns'] = ns
8485 if title.startswith('List of'):
8586 title_meta['category'] = 'List'
86 - elif namespace == 4 or namespace == 5:
 87+ elif ns == 4 or ns == 5:
8788 if title.find('Articles for deletion') > -1:
8889 title_meta['category'] = 'Deletion'
8990 elif title.find('Mediation Committee') > -1:
@@ -105,6 +106,7 @@
106107 title_meta['category'] = 'Featured Topic'
107108 elif title.find('Good Article') > -1:
108109 title_meta['category'] = 'Good Article'
 110+ #print title_meta
109111 return title_meta
110112
111113
Index: trunk/tools/editor_trends/etl/store.py
@@ -44,32 +44,31 @@
4545 while True:
4646 try:
4747 filename = self.tasks.get(block=False)
48 - except Empty:
49 - break
 48+ self.tasks.task_done()
 49+ if filename == None:
 50+ self.result.put(None)
 51+ break
5052
51 - self.tasks.task_done()
52 - if filename == None:
53 - self.result.put(None)
54 - break
 53+ fh = file_utils.create_txt_filehandle(self.rts.sorted, filename,
 54+ 'r', 'utf-8')
 55+ for line in file_utils.read_raw_data(fh):
 56+ if len(line) == 1 or len(line) == 4:
 57+ continue
 58+ editor = line[0]
 59+ #print 'Parsing %s' % editor
 60+ if prev_editor != editor and prev_editor != -1:
 61+ editor_cache.add(prev_editor, 'NEXT')
5562
56 - fh = file_utils.create_txt_filehandle(self.rts.sorted, filename,
57 - 'r', 'utf-8')
58 - for line in file_utils.read_raw_data(fh):
59 - if len(line) == 1 or len(line) == 4:
60 - continue
61 - editor = line[0]
62 - #print 'Parsing %s' % editor
63 - if prev_editor != editor and prev_editor != -1:
64 - editor_cache.add(prev_editor, 'NEXT')
 63+ data = prepare_data(line)
 64+ #print editor, data['username']
 65+ editor_cache.add(editor, data)
 66+ prev_editor = editor
 67+ fh.close()
 68+ self.result.put(True)
 69+ except Empty:
 70+ pass
6571
66 - data = prepare_data(line)
67 - #print editor, data['username']
68 - editor_cache.add(editor, data)
69 - prev_editor = editor
70 - fh.close()
71 - self.result.put(True)
7272
73 -
7473 def prepare_data(line):
7574 '''
7675 Prepare a single line to store in the database, this entails converting
@@ -103,34 +102,34 @@
104103 while True:
105104 try:
106105 filename = tasks.get(block=False)
 106+ if filename == None:
 107+ self.result.put(None)
 108+ break
 109+ print 'Processing %s...' % filename
 110+ fh = file_utils.create_txt_filehandle(rts.txt, filename, 'r', 'utf-8')
 111+ for line in fh:
 112+ line = line.strip()
 113+ line = line.split('\t')
 114+ data = {}
 115+ x, y = 0, 1
 116+ while y < len(line):
 117+ key, value = line[x], line[y]
 118+ if key == 'ns' or key == 'id':
 119+ data[key] = int(value)
 120+ else:
 121+ data[key] = value
 122+ x += 2
 123+ y += 2
 124+ db.insert(data)
 125+ fh.close()
107126 except Empty:
108 - continue
109 -
110 - if filename == None:
111 - break
112 - print 'Processing %s...' % filename
113 - fh = file_utils.create_txt_filehandle(rts.txt, filename, 'r', 'utf-8')
114 - for line in fh:
115 - line = line.strip()
116 - line = line.split('\t')
117 - data = {}
118 - x, y = 0, 1
119 - while y < len(line):
120 - key, value = line[x], line[y]
121 - if key == 'ns' or key == 'id':
122 - data[key] = int(value)
123 - else:
124 - data[key] = value
125 - x += 2
126 - y += 2
127 - db.insert(data)
128 - fh.close()
 127+ pass
129128 print 'Done storing articles...'
130129
131130
132131 def launcher_articles(rts):
133132 '''
134 - This function reads titles.csv and stores it in a separate collection.
 133+ This function reads articles.csv and stores it in a separate collection.
135134 Besides containing the title of an article, it also includes:
136135 * namespace
137136 * category (if any)
@@ -172,7 +171,6 @@
173172 This is the main entry point and creates a number of workers and launches
174173 them.
175174 '''
176 - #launcher_articles(rts)
177175 print 'Input directory is: %s ' % rts.sorted
178176 db = storage.init_database(rts.storage, rts.dbname, rts.editors_raw)
179177 db.drop_collection()
Index: trunk/tools/editor_trends/etl/downloader.py
@@ -29,7 +29,7 @@
3030 from utils import log
3131
3232
33 -def download_wiki_file(task_queue, properties):
 33+def download_wiki_file(task_queue, rts):
3434 '''
3535 This is a very simple replacement for wget and curl because Windows does
3636 not have these tools installed by default
@@ -46,34 +46,34 @@
4747 widgets = log.init_progressbar_widgets(filename)
4848 extension = os.path.splitext(filename)[1]
4949 filemode = file_utils.determine_file_mode(extension)
50 - filesize = http_utils.determine_remote_filesize(properties.wp_dump_location,
51 - properties.dump_relative_path,
 50+ filesize = http_utils.determine_remote_filesize(rts.wp_dump_location,
 51+ rts.dump_relative_path,
5252 filename)
5353
54 - mod_date = http_utils.determine_modified_date(properties.wp_dump_location,
55 - properties.dump_relative_path,
 54+ mod_date = http_utils.determine_modified_date(rts.wp_dump_location,
 55+ rts.dump_relative_path,
5656 filename)
57 - mod_date = text_utils.convert_timestamp_to_datetime_naive(mod_date, properties.timestamp_server)
58 - if file_utils.check_file_exists(properties.input_location, filename):
59 - mod_loc = file_utils.get_modified_date(properties.input_location, filename)
60 - if mod_loc == mod_date and (properties.force == False or properties.force == None):
61 - print 'You already have downloaded the most recent %s%s dumpfile.' % (properties.language.code, properties.project.name)
 57+ mod_date = text_utils.convert_timestamp_to_datetime_naive(mod_date, rts.timestamp_server)
 58+ if file_utils.check_file_exists(rts.input_location, filename):
 59+ mod_loc = file_utils.get_modified_date(rts.input_location, filename)
 60+ if mod_loc == mod_date and (rts.force == False or rts.force == None):
 61+ print 'You already have downloaded the most recent %s%s dumpfile.' % (rts.language.code, rts.project.name)
6262 continue
6363
6464 if filemode == 'w':
65 - fh = file_utils.create_txt_filehandle(properties.input_location,
 65+ fh = file_utils.create_txt_filehandle(rts.input_location,
6666 filename,
6767 filemode,
68 - properties.encoding)
 68+ rts.encoding)
6969 else:
70 - fh = file_utils.create_binary_filehandle(properties.input_location, filename, 'wb')
 70+ fh = file_utils.create_binary_filehandle(rts.input_location, filename, 'wb')
7171
7272 if filesize != -1:
7373 pbar = progressbar.ProgressBar(widgets=widgets, maxval=filesize).start()
7474 else:
7575 pbar = progressbar.ProgressBar(widgets=widgets).start()
7676 try:
77 - path = '%s%s' % (properties.dump_absolute_path, filename)
 77+ path = '%s%s' % (rts.dump_absolute_path, filename)
7878 req = urllib2.Request(path)
7979 response = urllib2.urlopen(req)
8080 while True:
@@ -94,24 +94,24 @@
9595 print 'Error: %s' % error
9696 finally:
9797 fh.close()
98 - file_utils.set_modified_data(mod_date, properties.input_location, filename)
 98+ file_utils.set_modified_data(mod_date, rts.input_location, filename)
9999
100100
101101
102 -def launcher(properties, logger):
 102+def launcher(rts, logger):
103103 print 'Creating list of files to be downloaded...'
104 - tasks = http_utils.create_list_dumpfiles(properties.wp_dump_location,
105 - properties.dump_relative_path,
106 - properties.dump_filename)
 104+ tasks = http_utils.create_list_dumpfiles(rts.wp_dump_location,
 105+ rts.dump_relative_path,
 106+ rts.dump_filename)
107107 #print tasks.qsize()
108 - #if tasks.qsize() < properties.settings.number_of_processes:
109 - # properties..number_of_processes = tasks.qsize()
 108+ #if tasks.qsize() < rts.settings.number_of_processes:
 109+ # rts..number_of_processes = tasks.qsize()
110110 if tasks.qsize() > 2:
111111 consumers = [multiprocessing.Process(target=download_wiki_file,
112 - args=(tasks, properties))
113 - for i in xrange(properties.number_of_processes)]
 112+ args=(tasks, rts))
 113+ for i in xrange(rts.number_of_processes)]
114114 else: consumers = [multiprocessing.Process(target=download_wiki_file,
115 - args=(tasks, properties))
 115+ args=(tasks, rts))
116116 for i in xrange(1)]
117117 print 'Starting consumers to download files...'
118118 for w in consumers:
Index: trunk/tools/editor_trends/etl/differ.py
@@ -213,7 +213,7 @@
214214
215215 def store_json_diffs(rts):
216216 files = os.listdir(rts.diffs)
217 - print files, rts.diffs
 217+ #print files, rts.diffs
218218 db = storage.init_database(rts.storage, rts.dbname, rts.diffs_dataset)
219219 buffer = cStringIO.StringIO()
220220
@@ -226,12 +226,10 @@
227227 obj = json.loads(obj)
228228 obj[0]['article_id'] = int(obj[0]['article_id'])
229229 for key, value in obj[0].iteritems():
230 - if type(value) == type(dict()):
231 - value['timestamp'] = datetime.strptime(value['timestamp'], '%Y-%m-%dT%H:%M:%S')
 230+ if key == 'timestamp':
 231+ value = datetime.strptime(value, '%Y-%m-%dT%H:%M:%S')
232232 obj[0][key] = value
233233 obj = obj[0]
234 - #print obj
235 - #print len(obj)
236234 try:
237235 db.save(obj)
238236 except bson.errors.InvalidDocument, error:
@@ -279,6 +277,7 @@
280278
281279 def store_diffs_debug(rts):
282280 db = storage.init_database(rts)
 281+ db.drop_collection()
283282 files = os.listdir(rts.diffs)
284283 for filename in files:
285284 fh = file_utils.create_txt_filehandle(rts.diffs, filename, 'r', 'utf-8')
@@ -335,20 +334,22 @@
336335 print 'Inserting poison pill %s...' % x
337336 input_queue.put(None)
338337
339 - extracters = [Process(target=stream_raw_xml, args=[input_queue, process_id,
340 - rts, format])
341 - for process_id in xrange(processors)]
342 - for extracter in extracters:
343 - extracter.start()
 338+# extracters = [Process(target=stream_raw_xml, args=[input_queue, process_id,
 339+# rts, format])
 340+# for process_id in xrange(processors)]
 341+# for extracter in extracters:
 342+# extracter.start()
 343+#
 344+# input_queue.join()
344345
345 - input_queue.join()
346 -
347346 store_json_diffs(rts)
348347 db = storage.init_database(rts.storage, rts.dbname, rts.diffs_dataset)
 348+
349349 db.add_index('title')
350350 db.add_index('timestamp')
351351 db.add_index('username')
352352 db.add_index('ns')
 353+ db.add_index('editor')
353354
354355
355356 def launcher_simple():
Index: trunk/tools/editor_trends/etl/extracter.py
@@ -72,8 +72,8 @@
7373 text = variables.extract_revision_text(revision, xml_namespace)
7474 article.update(contributor)
7575
76 - comment = variables.extract_comment_text(revision_id, revision)
77 - cache.comments.update(comment)
 76+ #comment = variables.extract_comment_text(revision_id, revision)
 77+ #cache.comments.update(comment)
7878
7979 timestamp = revision.find('%s%s' % (xml_namespace, 'timestamp')).text
8080 article['timestamp'] = timestamp
@@ -139,7 +139,7 @@
140140 title = variables.parse_title(elem)
141141 article['title'] = title
142142 current_namespace = variables.determine_namespace(title, namespaces, include_ns)
143 - title_meta = variables.parse_title_meta_data(title, current_namespace)
 143+ title_meta = variables.parse_title_meta_data(title, current_namespace, namespaces)
144144 if current_namespace < 6:
145145 parse = True
146146 article['namespace'] = current_namespace
@@ -172,7 +172,7 @@
173173 Determine id of article
174174 '''
175175 article['article_id'] = elem.text
176 - if isinstance(current_namespace, int):
 176+ if isinstance(current_namespace, int) and title_meta != {}:
177177 cache.articles[article['article_id']] = title_meta
178178 id = True
179179 elem.clear()
Index: trunk/tools/editor_trends/kaggle/training.py
@@ -26,7 +26,7 @@
2727
2828 from classes import storage
2929
30 -location = '/home/diederik/wikimedia/en/wiki/kaggle_prediction_solution'
 30+location = '/home/diederik/wikimedia/en/wiki/kaggle'
3131 files = os.listdir(location)
3232 files.reverse()
3333
@@ -67,7 +67,7 @@
6868 continue
6969 id = line[2]
7070 if id not in ids and id not in ignore_ids:
71 - res = db.find_one('editor', id)
 71+ res = db.find_one({'editor': id})
7272 if res == None:
7373 ignore_ids.add(id)
7474 continue
@@ -100,7 +100,7 @@
101101 fh = codecs.open('solutions.tsv', 'w', 'utf-8')
102102 for id in ids:
103103 if id not in ignore_ids:
104 - obs = db.find_one('editor', str(id), 'cum_edit_count_main_ns')
 104+ obs = db.find_one({'editor': str(id)}, 'cum_edit_count_main_ns')
105105 if obs != None:
106106 x += 1
107107 n = obs['cum_edit_count_main_ns']
Index: trunk/tools/editor_trends/classes/buffer.py
@@ -124,7 +124,9 @@
125125 def simplify(self, revision):
126126 row = []
127127 for key in self.keys:
128 - row.append(revision[key].decode('utf-8'))
 128+ value = revision.get(key, None)
 129+ if value != None:
 130+ row.append(value.decode('utf-8'))
129131 return row
130132
131133 def stringify(self, revision):
Index: trunk/tools/editor_trends/classes/runtime_settings.py
@@ -50,53 +50,57 @@
5151 self.project = project
5252 self.language = language
5353 self.dbname = 'wikilytics'
 54+ self.file_choices = {'meta-full': 'stub-meta-history.xml.gz',
 55+ 'meta-current': 'stub-meta-current.xml.gz',
 56+ 'history-full': 'pages-meta-history.xml.7z',
 57+ 'history-current': 'pages-meta-current.xml.bz2'
 58+ }
 59+ if args:
 60+ self.args = args
 61+ self.id = '%s%s_%s' % (self.language.code, self.project.name, 'current_month')
 62+ #print self.settings.input_location
 63+ #print self.get_value('location')
 64+ self.project = self.update_project_settings()
 65+ self.language = self.update_language_settings()
5466
55 - #if args:
56 - self.args = args
57 - self.id = '%s%s_%s' % (self.language.code, self.project.name, 'current_month')
58 - #print self.settings.input_location
59 - #print self.get_value('location')
60 - self.project = self.update_project_settings()
61 - self.language = self.update_language_settings()
 67+ self.input_location = self.set_input_location()
 68+ self.output_location = self.set_output_location()
6269
63 - self.input_location = self.set_input_location()
64 - self.output_location = self.set_output_location()
 70+ self.plugins = self.set_plugin()
 71+ self.keywords = self.split_keywords()
 72+ self.namespaces = self.get_namespaces()
6573
66 - self.plugins = self.set_plugin()
67 - self.keywords = self.split_keywords()
68 - self.namespaces = self.get_namespaces()
 74+ #self.kaggle = self.get_value('kaggle')
 75+ self.function = self.get_value('func')
 76+ self.ignore = self.get_value('except')
 77+ self.force = self.get_value('force')
 78+ self.analyzer_collection = self.get_value('collection')
6979
70 - #self.kaggle = self.get_value('kaggle')
71 - self.function = self.get_value('func')
72 - self.ignore = self.get_value('except')
73 - self.force = self.get_value('force')
74 - self.analyzer_collection = self.get_value('collection')
 80+ self.dataset = os.path.join(self.dataset_location, self.project.name)
 81+ self.txt = os.path.join(self.output_location, 'txt')
 82+ self.sorted = os.path.join(self.output_location, 'sorted')
 83+ self.diffs = os.path.join(self.output_location, 'diffs')
7584
76 - self.dataset = os.path.join(self.dataset_location, self.project.name)
77 - self.txt = os.path.join(self.output_location, 'txt')
78 - self.sorted = os.path.join(self.output_location, 'sorted')
79 - self.diffs = os.path.join(self.output_location, 'diffs')
 85+ self.directories = [self.output_location,
 86+ self.txt,
 87+ self.sorted,
 88+ self.dataset,
 89+ self.diffs]
 90+ self.verify_environment(self.directories)
8091
81 - self.directories = [self.output_location,
82 - self.txt,
83 - self.sorted,
84 - self.dataset,
85 - self.diffs]
86 - self.verify_environment(self.directories)
 92+ #Wikidump file related variables
 93+ self.dump_filename = self.generate_wikidump_filename()
 94+ self.dump_relative_path = self.set_dump_path()
 95+ self.dump_absolute_path = self.set_dump_path(absolute=True)
8796
88 - #Wikidump file related variables
89 - self.dump_filename = self.generate_wikidump_filename()
90 - self.dump_relative_path = self.set_dump_path()
91 - self.dump_absolute_path = self.set_dump_path(absolute=True)
 97+ #Collection names
 98+ self.editors_raw = '%s%s_editors_raw' % (self.language.code, self.project.name)
 99+ self.editors_dataset = '%s%s_editors_dataset' % (self.language.code, self.project.name)
 100+ self.articles_raw = '%s%s_articles_raw' % (self.language.code, self.project.name)
 101+ self.diffs_dataset = '%s%s_diffs_dataset' % (self.language.code, self.project.name)
 102+ self.collection = self.set_collection()
92103
93 - #Collection names
94 - self.editors_raw = '%s%s_editors_raw' % (self.language.code, self.project.name)
95 - self.editors_dataset = '%s%s_editors_dataset' % (self.language.code, self.project.name)
96 - self.articles_raw = '%s%s_articles_raw' % (self.language.code, self.project.name)
97 - self.diffs_dataset = '%s%s_diffs_dataset' % (self.language.code, self.project.name)
98104
99 -
100 -
101105 def __str__(self):
102106 return 'Runtime Settings for project %s %s' % (self.language.name,
103107 self.project.full_name)
@@ -105,14 +109,8 @@
106110 for item in self.__dict__:
107111 yield item
108112
109 - def dict(self):
110 - '''
111 - Return a dictionary with all properties and their values
112 - '''
113 - props = {}
114 - for prop in self:
115 - props[prop] = getattr(self, prop)
116 - return props
 113+ def set_collection(self):
 114+ return getattr(self, self.get_value('collection'))
117115
118116 def split_keywords(self):
119117 '''
@@ -141,7 +139,7 @@
142140 '''
143141 plugin = self.get_value('charts')
144142 requested_plugins = []
145 - if plugin != None and isinstance(plugin, type('module')) == False:
 143+ if plugin != None:
146144 plugins = plugin.split(',')
147145 available_plugins = inventory.available_analyses()
148146 for plugin in plugins:
@@ -220,8 +218,9 @@
221219 '''
222220 Generate the main name of the wikidump file to be downloaded.
223221 '''
 222+ choice = self.get_value('file')
224223 return '%s%s-latest-%s' % (self.language.code, self.project.name,
225 - self.get_value('file'))
 224+ self.file_choices[choice])
226225
227226 def update_language_settings(self):
228227 '''
Index: trunk/tools/editor_trends/classes/storage.py
@@ -160,29 +160,22 @@
161161 assert isinstance(data, dict), 'You need to feed me dictionaries.'
162162 self.db[self.collection].update({key: value}, {'$set': data})
163163
164 - def find(self, key=None, qualifier=None):
165 - if qualifier == 'min':
166 - return self.db[self.collection].find({
167 - key : {'$ne' : False}}).sort(key, pymongo.ASCENDING).limit(1)[0]
168 - elif qualifier == 'max':
169 - return self.db[self.collection].find({
170 - key : {'$ne' : False}}).sort(key, pymongo.DESCENDING).limit(1)[0]
171 - elif qualifier:
172 - return self.db[self.collection].find({key : qualifier})
173 - elif key != None:
174 - return self.db[self.collection].find({}, fields=[key])
 164+ def find(self, conditions, vars=None):
 165+ if conditions:
 166+ return self.db[self.collection].find(conditions, fields=vars)
175167 else:
176168 return self.db[self.collection].find()
177169
178 - def find_one(self, key, value, vars=None):
 170+ def find_one(self, conditions, vars=None):
179171 if vars:
180172 #if you only want to retrieve a specific variable(s) then you need to
181173 #specify vars, if vars is None then you will get the entire BSON object
182174 vars = vars.split(',')
183175 vars = dict([(var, 1) for var in vars])
184 - return self.db[self.collection].find_one({key: value}, vars)
 176+ return self.db[self.collection].find_one(conditions, vars)
185177 else:
186 - return self.db[self.collection].find_one({key: value})
 178+ #conditions should be a dictionary
 179+ return self.db[self.collection].find_one(conditions)
187180
188181
189182 def drop_collection(self):
Index: trunk/tools/editor_trends/classes/dataset.py
@@ -176,6 +176,7 @@
177177 #self.date = date
178178 self.data = 0
179179 self.time_unit = time_unit
 180+ self.date = date
180181 self.t1, self.t0 = self.set_date_range(date)
181182 self.id = id
182183 self.props = []
@@ -515,7 +516,7 @@
516517 variable.max = get_max(data)
517518 variable.num_obs = variable.number_of_obs()
518519 variable.num_dates = len(variable)
519 - #variable.first_obs, variable.last_obs = variable.get_date_range()
 520+ variable.first_obs, variable.last_obs = variable.get_date_range()
520521
521522 def summary(self):
522523 '''
Index: trunk/tools/editor_trends/classes/analytics.py
@@ -64,10 +64,8 @@
6565
6666 def __call__(self):
6767 project = 'wiki'
68 - #rts = runtime_settings.init_environment('wiki', 'en', args)
6968 for lang in self.languages:
7069 self.rts = runtime_settings.init_environment(project, lang, self.args)
71 - #self.rts.editors_dataset = 'editors_dataset'
7270
7371 self.rts.dbname = '%s%s' % (lang, project)
7472 for cum_cutoff in self.cum_cutoff:
@@ -91,15 +89,16 @@
9290 Generic loop function that loops over all the editors of a Wikipedia
9391 project and then calls the plugin that does the actual mapping.
9492 '''
95 - db = storage.init_database(self.rts.storage, self.rts.dbname, self.rts.editors_dataset)
 93+ db = storage.init_database(self.rts.storage, self.rts.dbname, self.rts.collection)
9694 while True:
9795 try:
9896 editor_id = self.tasks.get(block=False)
 97+ self.tasks.task_done()
9998 if editor_id == None:
10099 self.result.put(self.var)
101100 break
102 - editor = db.find_one('editor', editor_id)
103 - self.plugin(self.var, editor, dbname=self.rts.dbname, data=self.data)
 101+ editor = db.find_one({'editor': editor_id})
 102+ self.plugin(self.var, editor, rts=self.rts, data=self.data)
104103 self.result.put(True)
105104 except Empty:
106105 pass
Index: trunk/tools/editor_trends/utils/log.py
@@ -31,8 +31,8 @@
3232 def to_db(rts, jobtype, task, timer, event='start'):
3333 db = storage.init_database(rts.storage, rts.dbname, 'jobs')
3434 created = datetime.datetime.now()
35 - job = db.find_one('hash', rts.id)
36 -
 35+ job = db.find_one({'hash': rts.id})
 36+ #print job
3737 data = {'hash': rts.id,
3838 'created': created,
3939 'jobtype': jobtype,
@@ -50,7 +50,7 @@
5151 data['finished'] = True
5252 _id = db.save(data)
5353
54 - job = db.find_one('_id', _id)
 54+ job = db.find_one({'_id': _id})
5555
5656 tasks = job['tasks']
5757 t = tasks.get(task, {})