r82206 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r82205 | r82206 | r82207 >
Date:00:24, 16 February 2011
Author:diederik
Status:deferred
Tags:
Comment:
more progress...
Modified paths:
  • /trunk/tools/editor_trends/analyses/adhoc/community_graph.py (modified) (history)
  • /trunk/tools/editor_trends/analyses/adhoc/file_size_reduction.py (modified) (history)
  • /trunk/tools/editor_trends/analyses/adhoc/match_talkpage_article.py (modified) (history)
  • /trunk/tools/editor_trends/analyses/json_encoders.py (modified) (history)
  • /trunk/tools/editor_trends/bots/detector.py (modified) (history)
  • /trunk/tools/editor_trends/classes/bots.py (modified) (history)
  • /trunk/tools/editor_trends/classes/consumers.py (modified) (history)
  • /trunk/tools/editor_trends/classes/dataset.py (modified) (history)
  • /trunk/tools/editor_trends/classes/exceptions.py (modified) (history)
  • /trunk/tools/editor_trends/classes/languages.py (modified) (history)
  • /trunk/tools/editor_trends/classes/runtime_settings.py (modified) (history)
  • /trunk/tools/editor_trends/classes/settings.py (modified) (history)
  • /trunk/tools/editor_trends/cronjobs.py (modified) (history)
  • /trunk/tools/editor_trends/database/cache.py (modified) (history)
  • /trunk/tools/editor_trends/database/db.py (modified) (history)
  • /trunk/tools/editor_trends/database/launcher.py (modified) (history)
  • /trunk/tools/editor_trends/etl/downloader.py (modified) (history)
  • /trunk/tools/editor_trends/etl/extracter.py (modified) (history)
  • /trunk/tools/editor_trends/etl/sort.py (modified) (history)
  • /trunk/tools/editor_trends/etl/store.py (modified) (history)
  • /trunk/tools/editor_trends/manage.py (modified) (history)
  • /trunk/tools/editor_trends/mapreduce/xml2pig.py (modified) (history)
  • /trunk/tools/editor_trends/utils/compression.py (modified) (history)
  • /trunk/tools/editor_trends/utils/file_utils.py (modified) (history)
  • /trunk/tools/editor_trends/utils/http_utils.py (modified) (history)
  • /trunk/tools/editor_trends/utils/inventory.py (modified) (history)
  • /trunk/tools/editor_trends/utils/log.py (modified) (history)
  • /trunk/tools/editor_trends/utils/messages.py (modified) (history)
  • /trunk/tools/editor_trends/utils/text_utils.py (modified) (history)
  • /trunk/tools/editor_trends/utils/timer.py (modified) (history)
  • /trunk/tools/editor_trends/wikitree/parser.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/analyses/json_encoders.py
@@ -19,6 +19,7 @@
2020
2121 import sys
2222 import types
 23+import re
2324
2425 if '..' not in sys.path:
2526 sys.path.append('..')
@@ -27,35 +28,46 @@
2829 from classes import exceptions
2930 from utils import data_converter
3031
 32+HISTOGRAM = re.compile('histogram')
 33+#BAR = re.compile('bar')
 34+STACKED_BAR = re.compile('stacked_bar')
 35+RES = [HISTOGRAM, STACKED_BAR]
3136
32 -def available_json_encoders():
 37+def determine_chart_type(chart):
 38+ for res in RES:
 39+ match = re.findall(res, chart)
 40+ if len(match) > 0:
 41+ return match[0]
 42+ #Bar charts is the default chart,
 43+ return 'bar'
 44+
 45+def get_json_encoder(chart):
 46+ chart = determine_chart_type(chart)
3347 functions = globals()
34 - d = {}
35 - ignore = ['transform_to_json']
 48+ ignore = ['transform_to_json', 'available_charts', 'init_options']
3649 for func in functions:
37 - if func.endswith('json') \
38 - and func not in ignore:
39 - d[func] = func
 50+ if func.endswith('json'):
 51+ if func not in ignore:
 52+ encoder = func.replace('to_', '')
 53+ encoder = encoder.replace('_json', '')
 54+ if encoder == chart:
 55+ return func, chart, RES
 56+ return None, chart, RES
4057
41 - return d
4258
43 -
4459 def transform_to_json(ds):
4560 analyses = analyzer.available_analyses()
46 - json_encoders = available_json_encoders()
47 - analysis = '%s_%s_%s' % ('transform_to', ds.encoder, 'json')
48 - print analysis
 61+ encoder = get_json_encoder(ds.chart)
 62+ #analysis = '%s_%s_%s' % ('transform_to', ds.encoder, 'json')
 63+ #print analysis
4964 encoder = getattr(locals(), analysis, None)
5065 if encoder == None:
5166 encoder = to_bar_json
5267
5368 data = encoder(ds)
54 -# except Exception, e:
55 -# print e
56 -# raise exceptions.UnknownJSONEncoderError(analysis)
57 -
5869 return data
5970
 71+
6072 def init_options():
6173 options = {}
6274 options['xaxis'] = {}
@@ -101,6 +113,10 @@
102114 return json
103115
104116
 117+def to_histogram_json(ds):
 118+ pass
 119+
 120+
105121 def to_stacked_bar_json(ds):
106122 '''
107123 This function outputs data in a format that is understood by jquery
Index: trunk/tools/editor_trends/analyses/adhoc/community_graph.py
@@ -18,11 +18,11 @@
1919 __version__ = '0.1'
2020
2121 import sys
22 -sys.path.append('..')
 22+if '..' not in sys.path:
 23+ sys.path.append('..')
2324
24 -import configuration
25 -settings = configuration.Settings()
26 -
 25+from classes import settings
 26+settings = settings.Settings()
2727 from database import db
2828 from utils import file_utils
2929
Index: trunk/tools/editor_trends/analyses/adhoc/file_size_reduction.py
@@ -17,17 +17,20 @@
1818 __date__ = '2010-11-15'
1919 __version__ = '0.1'
2020
21 -import sys
22 -sys.path.append('..')
2321
2422 import os
2523 import xml.etree.cElementTree as cElementTree
2624
27 -import configuration
 25+if '..' not in sys.path:
 26+ sys.path.append('..')
 27+
 28+from classes import settings
 29+settings = settings.Settings()
 30+
2831 from utils import file_utils
29 -settings = configuration.Settings()
3032
3133
 34+
3235 class DumpStatistics(object):
3336 ''' Simple class to keep track of XML tags, how often they occur,
3437 and the length of strings they contain. This is used to calculate the
Index: trunk/tools/editor_trends/analyses/adhoc/match_talkpage_article.py
@@ -19,11 +19,11 @@
2020
2121 import sys
2222 import os
23 -sys.path.append('..')
 23+if '..' not in sys.path:
 24+ sys.path.append('..')
2425
25 -import configuration
26 -settings = configuration.Settings()
27 -
 26+from classes import settings
 27+settings = settings.Settings()
2828 from etl import extracter
2929 from utils import file_utils
3030 import wikitree
Index: trunk/tools/editor_trends/manage.py
@@ -26,7 +26,6 @@
2727 from argparse import RawTextHelpFormatter
2828 import ConfigParser
2929
30 -#import configuration
3130 from utils import file_utils
3231 from utils import ordered_dict
3332 from utils import log
@@ -268,9 +267,9 @@
269268
270269 def about_statement():
271270 print ''
272 - print 'Editor Trends Software is (c) 2010-2011 by the Wikimedia Foundation.'
 271+ print 'Wikilytics is (c) 2010-2011 by the Wikimedia Foundation.'
273272 print 'Written by Diederik van Liere (dvanliere@gmail.com).'
274 - print '''This software comes with ABSOLUTELY NO WARRANTY.\nThis is
 273+ print '''This software comes with ABSOLUTELY NO WARRANTY. This is
275274 free software, and you are welcome to distribute it under certain
276275 conditions.'''
277276 print 'See the README.1ST file for more information.'
@@ -281,7 +280,6 @@
282281 '''
283282 Entry point for parsing command line and launching the needed function(s).
284283 '''
285 - #settings = configuration.Settings()
286284 language = languages.init()
287285 project = projects.init()
288286 pjc = projects.ProjectContainer()
Index: trunk/tools/editor_trends/wikitree/parser.py
@@ -23,10 +23,11 @@
2424 import xml.etree.cElementTree as cElementTree
2525 import sys
2626
27 -sys.path.append('..')
 27+if '..' not in sys.path:
 28+ sys.path.append('..')
2829
29 -import configuration
30 -settings = configuration.Settings()
 30+from classes import settings
 31+settings = settings.Settings()
3132 from utils import file_utils
3233
3334 def convert_html_entities(text):
Index: trunk/tools/editor_trends/etl/store.py
@@ -21,100 +21,100 @@
2222 import multiprocessing
2323 import sys
2424 import os
 25+import progressbar
2526
2627 from utils import file_utils
2728 from utils import text_utils
2829 from database import cache
 30+from database import db
 31+from classes import consumers
2932 from utils import messages
30 -from database import db
3133
3234
33 -def store_articles(rts):
34 - location = os.path.join(rts.input_location, rts.language.code, rts.project.name)
35 - fh = file_utils.create_txt_filehandle(location, 'articles.csv', 'r', rts.encoding)
36 - headers = ['id', 'title']
37 - data = file_utils.read_unicode_text(fh)
38 - fh.close()
3935
40 - mongo = db.init_mongo_db(rts.dbname)
41 - collection = mongo[rts.articles_raw]
 36+class Storer(consumers.BaseConsumer):
 37+ def run(self):
 38+ '''
 39+ This function is called by multiple consumers who each take a sorted
 40+ file and create a cache object. If the number of edits made by an
 41+ editor is above the treshold then the cache object stores the data in
 42+ Mongo, else the data is discarded.
 43+ The treshold is currently more than 9 edits and is not yet configurable.
 44+ '''
 45+ mongo = db.init_mongo_db(self.rts.dbname)
 46+ collection = mongo[self.rts.editors_raw]
4247
43 - articles = {}
44 - for x, d in enumerate(data):
45 - d = d.split('\t')
46 - x = str(x)
47 - articles[x] = {}
48 - for k, v in zip(headers, d):
49 - articles[x][k] = v
 48+ editor_cache = cache.EditorCache(collection)
 49+ prev_contributor = -1
 50+ while True:
 51+ try:
 52+ filename = self.tasks.get(block=False)
 53+ except Empty:
 54+ break
5055
51 - collection.insert(articles)
 56+ self.tasks.task_done()
 57+ if filename == None:
 58+ self.result.put(None)
 59+ break
5260
 61+ fh = file_utils.create_txt_filehandle(self.rts.sorted, filename,
 62+ 'r', self.rts.encoding)
 63+ for line in file_utils.read_raw_data(fh):
 64+ if len(line) > 1:
 65+ contributor = line[0]
 66+ #print 'Parsing %s' % contributor
 67+ if prev_contributor != contributor and prev_contributor != -1:
 68+ editor_cache.add(prev_contributor, 'NEXT')
 69+ date = text_utils.convert_timestamp_to_datetime_utc(line[1])
 70+ article_id = int(line[2])
 71+ username = line[3].encode(self.rts.encoding)
 72+ ns = int(line[4])
 73+ value = {'date': date,
 74+ 'article': article_id,
 75+ 'username': username,
 76+ 'ns': ns}
 77+ editor_cache.add(contributor, value)
 78+ prev_contributor = contributor
 79+ fh.close()
 80+ self.result.put(True)
5381
54 -def store_editors(tasks, rts):
55 - '''
56 - This function is called by multiple consumers who each take a sorted file
57 - and create a cache object. If the number of edits made by an editor is above
58 - the treshold then the cache object stores the data in Mongo, else the data
59 - is discarded.
60 - The treshold is currently more than 9 edits and is not yet configurable.
61 - '''
 82+
 83+def store_articles(rts):
6284 mongo = db.init_mongo_db(rts.dbname)
63 - collection = mongo[rts.editors_raw]
 85+ collection = mongo[rts.articles_raw]
6486
65 - editor_cache = cache.EditorCache(collection)
66 - prev_contributor = -1
67 - while True:
68 - try:
69 - filename = tasks.get(block=False)
70 - except Empty:
71 - break
 87+ location = os.path.join(rts.input_location, rts.language.code, rts.project.name)
 88+ fh = file_utils.create_txt_filehandle(location, 'articles.csv', 'r', rts.encoding)
 89+ print 'Storing article titles...'
 90+ for line in fh:
 91+ line = line.strip()
 92+ id, title = line.split('\t')
 93+ collection.insert({'id':id, 'title':title})
 94+ fh.close()
 95+ print 'Done...'
7296
73 - tasks.task_done()
74 - if filename == None:
75 - print 'Swallowing a poison pill.'
76 - break
77 - print '%s files left in the queue.' % messages.show(tasks.qsize)
7897
79 - fh = file_utils.create_txt_filehandle(rts.sorted, filename, 'r', rts.encoding)
80 - for line in file_utils.read_raw_data(fh):
81 - if len(line) > 1:
82 - contributor = line[0]
83 - #print 'Parsing %s' % contributor
84 - if prev_contributor != contributor and prev_contributor != -1:
85 - editor_cache.add(prev_contributor, 'NEXT')
86 - date = text_utils.convert_timestamp_to_datetime_utc(line[1])
87 - article_id = int(line[2])
88 - username = line[3].encode(rts.encoding)
89 - ns = int(line[4])
90 - value = {'date': date,
91 - 'article': article_id,
92 - 'username': username,
93 - 'ns': ns}
94 - editor_cache.add(contributor, value)
95 - prev_contributor = contributor
96 - fh.close()
97 - #print editor_cache.n
98 -
99 -
10098 def launcher(rts):
10199 '''
102100 This is the main entry point and creates a number of workers and launches
103101 them.
104102 '''
105 - #rts.sorted, rts.dbname, rts.collection
 103+ store_articles(rts)
 104+ print 'Input directory is: %s ' % rts.sorted
106105 mongo = db.init_mongo_db(rts.dbname)
107106 coll = mongo[rts.editors_raw]
108107 coll.ensure_index('editor')
109108 coll.create_index('editor')
110109
111110 files = file_utils.retrieve_file_list(rts.sorted, 'csv')
 111+ pbar = progressbar.ProgressBar(maxval=len(files)).start()
112112
113 - print 'Input directory is: %s ' % rts.sorted
114113 tasks = multiprocessing.JoinableQueue()
115 - consumers = [multiprocessing.Process(target=store_editors,
116 - args=(tasks, rts))
117 - for i in xrange(rts.number_of_processes)]
 114+ result = multiprocessing.JoinableQueue()
118115
 116+ consumers = [Storer(rts, tasks, result) for
 117+ x in xrange(rts.number_of_processes)]
 118+
119119 for filename in files:
120120 tasks.put(filename)
121121
@@ -124,8 +124,20 @@
125125 for w in consumers:
126126 w.start()
127127
 128+ ppills = rts.number_of_processes
 129+ while True:
 130+ while ppills > 0:
 131+ try:
 132+ res = result.get(block=True)
 133+ if res == True:
 134+ pbar.update(pbar.currval + 1)
 135+ else:
 136+ ppills -= 1
 137+ except Empty:
 138+ pass
 139+ break
 140+
128141 tasks.join()
129 - store_articles(rts)
130142
131143
132144 def debug():
Index: trunk/tools/editor_trends/etl/downloader.py
@@ -22,15 +22,12 @@
2323 import multiprocessing
2424 import sys
2525
26 -#sys.path.append('..')
27 -#import configuration
28 -#settings = configuration.Settings()
29 -
3026 from utils import file_utils
3127 from utils import http_utils
3228 from utils import text_utils
3329 from utils import log
3430
 31+
3532 def download_wiki_file(task_queue, properties):
3633 '''
3734 This is a very simple replacement for wget and curl because Windows does
Index: trunk/tools/editor_trends/etl/extracter.py
@@ -380,7 +380,6 @@
381381 no data pollution and finally it will start the parser to actually extract
382382 the variables from the different dump files.
383383 '''
384 - result = True
385384 tasks = unzip(rts)
386385 if not tasks:
387386 return False
@@ -413,10 +412,14 @@
414413 tasks.join()
415414 filehandles = [fh.close() for fh in filehandles]
416415
417 - result = all([consumer.exitcode for consumer in consumers])
418 - return result
 416+ result = sum([consumer.exitcode for consumer in consumers])
419417
 418+ if restult == 0:
 419+ return True
 420+ else:
 421+ return False
420422
 423+
421424 def debug():
422425 project = 'wiki'
423426 language_code = 'sv'
Index: trunk/tools/editor_trends/etl/sort.py
@@ -22,12 +22,45 @@
2323 import sys
2424 import os
2525 import multiprocessing
 26+import progressbar
2627 from Queue import Empty
2728
2829 from utils import file_utils
2930 from utils import messages
 31+from classes import consumers
3032
3133
 34+class Sorter(consumers.BaseConsumer):
 35+ def run(self):
 36+ '''
 37+ The feeder function is called by the launcher and gives it a task to
 38+ complete.
 39+ '''
 40+ while True:
 41+ try:
 42+ filename = self.tasks.get(block=False)
 43+ self.tasks.task_done()
 44+ if filename == None:
 45+ self.result.put(None)
 46+ break
 47+
 48+ fh = file_utils.create_txt_filehandle(self.rts.txt,
 49+ filename,
 50+ 'r',
 51+ self.rts.encoding)
 52+ data = file_utils.read_unicode_text(fh)
 53+ fh.close()
 54+ data = [d.strip() for d in data]
 55+ data = [d.split('\t') for d in data]
 56+ sorted_data = mergesort(data)
 57+ write_sorted_file(sorted_data, filename, self.rts)
 58+ self.result.put(True)
 59+ except UnicodeDecodeError, e:
 60+ print e
 61+ except Empty:
 62+ pass
 63+
 64+
3265 def quick_sort(obs):
3366 '''
3467 Quicksort is a sorting algorithm developed by C. A. R. Hoare that, on \
@@ -74,7 +107,6 @@
75108 return result
76109
77110
78 -
79111 def merge_sorted_files(target, files, iteration, rts):
80112 '''
81113 Merges smaller sorted files in one big file, Only used for creating
@@ -105,48 +137,20 @@
106138 fh.close()
107139
108140
109 -def mergesort_feeder(tasks, rts):
110 - '''
111 - The feeder function is called by the launcher and gives it a task to
112 - complete.
113 - '''
114 - while True:
115 - try:
116 - filename = tasks.get(block=False)
117 - tasks.task_done()
118 - if filename == None:
119 - print 'Swallowed a poison pill'
120 - print tasks.qsize()
121 - break
122 -
123 - fh = file_utils.create_txt_filehandle(rts.txt,
124 - filename,
125 - 'r',
126 - rts.encoding)
127 - #print fh
128 - #data = fh.readlines()
129 - data = file_utils.read_unicode_text(fh)
130 - fh.close()
131 - data = [d.strip() for d in data]
132 - data = [d.split('\t') for d in data]
133 - sorted_data = mergesort(data)
134 - write_sorted_file(sorted_data, filename, rts)
135 - print filename, messages.show(tasks.qsize)
136 - except UnicodeDecodeError, e:
137 - print e
138 - except Empty:
139 - pass
140 -
141 -
142141 def launcher(rts):
143142 '''
144143 rts is an instance of RunTimeSettings
145144 '''
146145 files = file_utils.retrieve_file_list(rts.txt, 'csv')
 146+ #files = files[0:6]
 147+
 148+ pbar = progressbar.ProgressBar(maxval=len(files)).start()
147149 tasks = multiprocessing.JoinableQueue()
148 - consumers = [multiprocessing.Process(target=mergesort_feeder,
149 - args=(tasks, rts))
150 - for x in xrange(rts.number_of_processes)]
 150+ result = multiprocessing.JoinableQueue()
 151+
 152+ consumers = [Sorter(rts, tasks, result) for
 153+ x in xrange(rts.number_of_processes)]
 154+
151155 for filename in files:
152156 tasks.put(filename)
153157
@@ -156,17 +160,17 @@
157161 for w in consumers:
158162 w.start()
159163
 164+ ppills = rts.number_of_processes
 165+ while True:
 166+ while ppills > 0:
 167+ try:
 168+ res = result.get(block=True)
 169+ if res == True:
 170+ pbar.update(pbar.currval + 1)
 171+ else:
 172+ ppills -= 1
 173+ except Empty:
 174+ pass
 175+ break
 176+
160177 tasks.join()
161 -
162 -
163 -def debug():
164 - '''
165 - Simple test function
166 - '''
167 - source = os.path.join(settings.input_location, 'en', 'wiki', 'txt')
168 - target = os.path.join(settings.input_location, 'en', 'wiki', 'sorted')
169 - mergesort_launcher(source, target)
170 -
171 -
172 -if __name__ == '__main__':
173 - debug()
Index: trunk/tools/editor_trends/classes/consumers.py
@@ -19,153 +19,12 @@
2020
2121 import multiprocessing
2222
23 -'''
24 -This needs a cleanup
25 -'''
2623 class BaseConsumer(multiprocessing.Process):
2724
28 - def __init__(self, task_queue, result_queue):
 25+ def __init__(self, rts, tasks, result=None):
2926 multiprocessing.Process.__init__(self)
30 - self.task_queue = task_queue
31 - self.result_queue = result_queue
 27+ self.rts = rts
 28+ self.tasks = tasks
 29+ self.result = result
3230
3331
34 -# for kw in kwargs:
35 -# setattr(self, kw, kwargs[kw])
36 -#
37 -# def run(self):
38 -# proc_name = self.name
39 -# kwargs = {}
40 -# IGNORE = ['input_queue', 'result_queue', 'target']
41 -# for kw in self.__dict__:
42 -# if kw not in IGNORE and not kw.startswith('_'):
43 -# kwargs[kw] = getattr(self, kw)
44 -# self.target(self.input_queue, self.result_queue, **kwargs)
45 -
46 -
47 -class ProcessResultQueue(multiprocessing.Process):
48 -
49 - def __init__(self, target, result_queue, **kwargs):
50 - multiprocessing.Process.__init__(self)
51 - self.result_queue = result_queue
52 - self.target = target
53 - for kw in kwargs:
54 - setattr(self, kw, kwargs[kw])
55 -
56 -
57 - def run(self):
58 - proc_name = self.name
59 - kwargs = {}
60 - IGNORE = ['result_queue', 'target']
61 - for kw in self.__dict__:
62 - if kw not in IGNORE and not kw.startswith('_'):
63 - kwargs[kw] = getattr(self, kw)
64 - self.target(self.result_queue, **kwargs)
65 -
66 -class TXTFile(object):
67 -
68 - def __init__(self, file, location, output, output_file, target, **kwargs):
69 - self.file = file
70 - self.location = location
71 - self.target = target
72 - self.output = output
73 - self.output_file = output_file
74 - for kw in kwargs:
75 - setattr(self, kw, kwargs[kw])
76 -
77 - def __str__(self):
78 - return '%s' % (self.file)
79 -
80 - def __call__(self, bots):
81 - self.bots = bots
82 - self.fr = file_utils.create_txt_filehandle(self.location, self.file, 'r', settings.encoding)
83 - self.fw = file_utils.create_txt_filehandle(self.output, self.output_file, 'w', settings.encoding)
84 - for line in self.fr:
85 - line = line.strip()
86 - if line == '':
87 - continue
88 - line = line.split('\t')
89 - self.bots = self.target(line, self.fw, self.bots, self.keys)
90 - if self.bots == {}:
91 - break
92 - self.fr.close()
93 - self.fw.close()
94 - return self.bots
95 -
96 -
97 -class XMLFileConsumer(BaseConsumer):
98 -
99 - def run(self):
100 - while True:
101 - new_xmlfile = self.task_queue.get()
102 - self.task_queue.task_done()
103 - if new_xmlfile == None:
104 - print 'Swallowed a poison pill'
105 - break
106 - print 'Queue is %s files long...' % (messages.show(self.task_queue.qsize) - settings.number_of_processes)
107 - new_xmlfile()
108 -
109 -
110 -class XMLFile(object):
111 - def __init__(self, file, location, output, output_file, target, ** kwargs):
112 - self.file = file
113 - self.location = location
114 - self.output = output
115 - self.target = target
116 - self.output_file = output_file
117 - for kw in kwargs:
118 - setattr(self, kw, kwargs[kw])
119 -
120 - def create_file_handle(self):
121 - self.mode = 'a'
122 - if self.output_file == None:
123 - self.mode = 'w'
124 - self.output_file = self.file[:-4] + '.txt'
125 -
126 - self.fh = file_utils.create_txt_filehandle(self.output, self.output_file, self.mode, settings.encoding)
127 -
128 - def __str__(self):
129 - return '%s' % (self.file)
130 -
131 - def __call__(self, bots=None):
132 - if bots != {} and bots != None:
133 - self.bots = bots
134 - if settings.debug:
135 - messages = {}
136 - vars = {}
137 -
138 - data = xml.read_input(utils.create_txt_filehandle(self.location,
139 - self.file, 'r',
140 - encoding=settings.encoding))
141 - self.create_file_handle()
142 - for raw_data in data:
143 - xml_buffer = cStringIO.StringIO()
144 - raw_data.insert(0, '<?xml version="1.0" encoding="UTF-8" ?>\n')
145 - try:
146 - raw_data = ''.join(raw_data)
147 - xml_buffer.write(raw_data)
148 - elem = cElementTree.XML(xml_buffer.getvalue())
149 - bots = self.target(elem, fh=self.fh, bots=self.bots)
150 - except SyntaxError, error:
151 - print error
152 - '''
153 - There are few cases with invalid tokens, they are ignored
154 - '''
155 - if settings.debug:
156 - file_utils.track_errors(xml_buffer, error, self.file, messages)
157 - except UnicodeEncodeError, error:
158 - print error
159 - if settings.debug:
160 - file_utils.track_errors(xml_buffer, error, self.file, messages)
161 - except MemoryError, error:
162 - print self.file, error
163 - print raw_data[:12]
164 - print 'String was supposed to be %s characters long' % sum([len(raw) for raw in raw_data])
165 - else:
166 - self.fh.close()
167 -
168 - if settings.debug:
169 - file_utils.report_error_messages(messages, self.target)
170 -
171 - return bots
172 -
Index: trunk/tools/editor_trends/classes/settings.py
@@ -143,7 +143,8 @@
144144 try:
145145 os.makedirs(directory)
146146 except IOError:
147 - print 'Configuration Error, could not create directory %s.' % directory
 147+ print 'Configuration Error, could not create directory %s.'\
 148+ % directory
148149
149150 def detect_windows_program(self, program):
150151 entry = self.windows_register.get(program, None)
@@ -154,7 +155,8 @@
155156 return None
156157
157158 def detect_linux_program(self, program):
158 - path = subprocess.Popen(['which', '%s' % program], stdout=subprocess.PIPE).communicate()[0]
 159+ path = subprocess.Popen(['which', '%s' % program],
 160+ stdout=subprocess.PIPE).communicate()[0]
159161 return path.strip()
160162
161163 def detect_installed_program(self, program):
Index: trunk/tools/editor_trends/classes/exceptions.py
@@ -62,6 +62,15 @@
6363 return 'There is no JSON encoder called %s, please make sure that you \
6464 entered the right name' % self.func
6565
 66+class UnknownChartError(Error):
 67+ def __init__(self, chart, charts):
 68+ self.chart = chart
 69+ self.charts = charts
 70+
 71+ def __str__(self):
 72+ return 'Currently, chart type %s is not supported. Please choose one of \
 73+ the following charts: %s' % (self.chart, self.charts)
 74+
6675 class NotYetImplementedError(Error):
6776 def __init__(self, func):
6877 self.func = func
Index: trunk/tools/editor_trends/classes/runtime_settings.py
@@ -20,7 +20,6 @@
2121 '''
2222 This file provides mapper between language name and locale language name and
2323 Wikipedia acronym.
24 -Gothic and Birmese are not yet supported, see rows 450 and 554.
2524 '''
2625
2726 import os
@@ -33,7 +32,8 @@
3433 from settings import Settings
3534 from utils import text_utils
3635 from utils import ordered_dict as odict
37 -from classes import languages
 36+import languages
 37+import projects
3838
3939
4040 class RunTimeSettings(Settings):
@@ -129,7 +129,8 @@
130130 '''
131131 Construct the full project location
132132 '''
133 - return os.path.join(self.input_location, self.language.code, self.project.name)
 133+ return os.path.join(self.input_location, self.language.code,
 134+ self.project.name)
134135
135136 def show_settings(self):
136137 '''
@@ -137,7 +138,9 @@
138139 '''
139140 about = {}
140141 about['Project'] = '%s' % self.project.full_name.title()
141 - about['Language'] = '%s / %s / %s' % (self.language.name, self.language.locale, self.language.code)
 142+ about['Language'] = '%s / %s / %s' % (self.language.name,
 143+ self.language.locale,
 144+ self.language.code)
142145 about['Input directory'] = '%s' % self.location
143146 about['Output directory'] = '%s and subdirectories' % self.location
144147
@@ -155,7 +158,9 @@
156159
157160 def set_dump_path(self, absolute=False):
158161 if absolute:
159 - return '%s/%s%s/latest/' % (self.wp_dump_location, self.language.code, self.project.name)
 162+ return '%s/%s%s/latest/' % (self.wp_dump_location,
 163+ self.language.code,
 164+ self.project.name)
160165 else:
161166 return '/%s%s/latest/' % (self.language.code, self.project.name)
162167
@@ -163,7 +168,8 @@
164169 '''
165170 Generate the main name of the wikidump file to be downloaded.
166171 '''
167 - return '%s%s-latest-%s' % (self.language.code, self.project.name, self.get_value('file'))
 172+ return '%s%s-latest-%s' % (self.language.code, self.project.name,
 173+ self.get_value('file'))
168174
169175 def update_language_settings(self):
170176 '''
@@ -193,7 +199,8 @@
194200
195201 def get_projectname(self):
196202 '''
197 - Determine the full project name based on the project acronym and language.
 203+ Determine the full project name based on the project acronym
 204+ and language.
198205 '''
199206 #language_code = self.get_language()
200207 print self.language.code, self.project.name
@@ -223,3 +230,15 @@
224231 return namespaces.split(',')
225232 else:
226233 return ['0'] #Assume that the mainspace is of interest
 234+
 235+
 236+def init_environment(project, language_code, args):
 237+ pjc = projects.ProjectContainer()
 238+ project = pjc.get_project(project)
 239+ lnc = languages.LanguageContainer()
 240+ language = lnc.get_language(language_code)
 241+
 242+ args.language = language.name
 243+ args.project = project.name
 244+ rts = RunTimeSettings(project, language, args)
 245+ return rts
Index: trunk/tools/editor_trends/classes/dataset.py
@@ -23,18 +23,23 @@
2424 import math
2525 import operator
2626 import sys
 27+import hashlib
2728 from pymongo.son_manipulator import SONManipulator
2829 from multiprocessing import Lock
2930 from texttable import Texttable
3031
31 -sys.path.append('..')
32 -import configuration
33 -settings = configuration.Settings()
3432
 33+if '..' not in sys.path:
 34+ sys.path.append('..')
 35+
 36+from classes import settings
 37+settings = settings.Settings()
 38+
3539 from utils import file_utils
3640 from utils import data_converter
3741 from database import db
3842 from analyses import json_encoders
 43+from classes import exceptions
3944
4045 class Transform(SONManipulator):
4146 '''
@@ -82,10 +87,14 @@
8388 def __hash__(self, vars):
8489 '''
8590 This is a generic hash function that expects a list of variables, used
86 - to lookup an observation or Variable.
 91+ to lookup an Observation or Variable.
8792 '''
88 - id = ''.join([str(var) for var in vars])
89 - return hash(id)
 93+ id = '_'.join([str(var) for var in vars])
 94+ m = hashlib.md5()
 95+ m.update(id)
 96+ #print id, m.hexdigest()
 97+ return m.hexdigest()
 98+ #return ''.join([str(var) for var in vars])
9099
91100 def encode_to_bson(self, data=None):
92101 '''
@@ -176,9 +185,6 @@
177186
178187 def add(self, value):
179188 '''
180 - If update == True then data[i] will be incremented else data[i] will be
181 - created, in that case make sure that i is unique. Update is useful for
182 - tallying a variable.
183189 '''
184190 self.lock.acquire()
185191 try:
@@ -192,6 +198,7 @@
193199 self.count += 1
194200 self.lock.release()
195201
 202+
196203 def get_date_range(self):
197204 return '%s-%s-%s:%s-%s-%s' % (self.t0.month, self.t0.day, self.t0.year, \
198205 self.t1.month, self.t1.day, self.t1.year)
@@ -200,9 +207,10 @@
201208 '''
202209 This class constructs a time-based variable.
203210 '''
204 - lock = Lock()
 211+
205212 def __init__(self, name, time_unit, **kwargs):
206213 self.name = name
 214+ self.lock = Lock()
207215 self.obs = {}
208216 self.time_unit = time_unit
209217 self.groupbys = []
@@ -278,11 +286,16 @@
279287 values.insert(0, end)
280288 values.insert(0, start)
281289 id = self.__hash__(values)
 290+# print values
 291+ self.lock.acquire()
 292+ try:
 293+ obs = self.get_observation(id, date, meta)
 294+ obs.add(value)
 295+ self.obs[id] = obs
 296+ finally:
 297+ self.lock.release()
 298+ print len(self.obs)
282299
283 - obs = self.get_observation(id, date, meta)
284 - obs.add(value)
285 - self.obs[id] = obs
286 -
287300 def number_of_obs(self):
288301 n = 0
289302 for obs in self.obs:
@@ -327,19 +340,19 @@
328341 to output the dataset to a csv file, mongodb and display statistics.
329342 '''
330343
331 - def __init__(self, name, project, collection, language_code, encoder, vars=None, **kwargs):
332 - encoders = json_encoders.available_json_encoders()
333 - if encoder not in encoders:
334 - raise exception.UnknownJSONEncoderError(encoder)
335 - else:
336 - self.encoder = encoder
337 - self.name = name
338 - self.project = project
339 - self.collection = collection
340 - self.language_code = language_code
 344+ def __init__(self, chart, rts, vars=None, **kwargs):
 345+ #project, collection, language_code
 346+ self.encoder, chart, charts = json_encoders.get_json_encoder(chart)
 347+ if self.encoder == None:
 348+ raise exceptions.UnknownChartError(chart, charts)
 349+ self.chart = chart
 350+ self.name = 'Dataset to construct %s' % self.chart
 351+ self.project = rts.project.name
 352+ self.collection = rts.editors_dataset
 353+ self.language_code = rts.language.code
341354 self.hash = self.name
342355 self._type = 'dataset'
343 - self.created = datetime.datetime.now()
 356+ self.created = datetime.datetime.today()
344357 self.format = 'long'
345358 for kw in kwargs:
346359 setattr(self, kw, kwargs[kw])
Index: trunk/tools/editor_trends/classes/languages.py
@@ -16,6 +16,8 @@
1717 __date__ = '2011-01-26'
1818 __version__ = '0.1'
1919
 20+'Gothic and Birmese are not yet supported, see rows 450 and 554.'
 21+
2022 import locale
2123 import sys
2224 sys.path.append('..')
Index: trunk/tools/editor_trends/classes/bots.py
@@ -20,11 +20,13 @@
2121
2222 import datetime
2323 import sys
24 -sys.path.append('..')
2524
26 -import configuration
27 -settings = configuration.Settings()
 25+if '..' not in sys.path:
 26+ sys.path.append('..')
2827
 28+from classes import settings
 29+settings = settings.Settings()
 30+
2931 from etl import shaper
3032 from utils import file_utils
3133
Index: trunk/tools/editor_trends/utils/inventory.py
@@ -21,10 +21,11 @@
2222 import sys
2323 from threading import Thread
2424 from HTMLParser import HTMLParser
25 -sys.path.append('..')
 25+if '..' not in sys.path:
 26+ sys.path.append('..')
2627
27 -import configuration
28 -settings = configuration.Settings()
 28+from classes import settings
 29+settings = settings.Settings()
2930
3031 from database import db
3132 from utils import http_utils
Index: trunk/tools/editor_trends/utils/file_utils.py
@@ -35,12 +35,12 @@
3636 import shutil
3737 import multiprocessing
3838
39 -sys.path.append('..')
 39+if '..' not in sys.path:
 40+ sys.path.append('..')
4041
 42+from classes import settings
 43+settings = settings.Settings()
4144
42 -import configuration
43 -settings = configuration.Settings()
44 -
4545 from classes import exceptions
4646 import messages
4747 import text_utils
Index: trunk/tools/editor_trends/utils/http_utils.py
@@ -23,10 +23,11 @@
2424 import httplib
2525 import multiprocessing
2626
 27+if '..' not in sys.path:
 28+ sys.path.append('..')
2729
28 -sys.path.append('..')
29 -import configuration
30 -settings = configuration.Settings()
 30+from classes import settings
 31+settings = settings.Settings()
3132 import file_utils
3233 import text_utils
3334 import log
Index: trunk/tools/editor_trends/utils/timer.py
@@ -33,6 +33,7 @@
3434 self.stop()
3535 print 'Processing time: %s' % (self.t1 - self.t0)
3636
 37+
3738 def humanize_time_difference(seconds_elapsed):
3839 """
3940 Returns a humanized string representing time difference.
Index: trunk/tools/editor_trends/utils/compression.py
@@ -20,10 +20,9 @@
2121 import sys
2222 import subprocess
2323 import os
24 -sys.path.append('..')
 24+if '..' not in sys.path:
 25+ sys.path.append('..')
2526
26 -#import configuration
27 -#settings = configuration.Settings()
2827 from classes import settings
2928 settings = settings.Settings()
3029 from classes import exceptions
Index: trunk/tools/editor_trends/utils/log.py
@@ -20,10 +20,11 @@
2121 import datetime
2222 import sys
2323 import progressbar
24 -sys.path.append('..')
 24+if '..' not in sys.path:
 25+ sys.path.append('..')
2526
26 -import configuration
27 -settings = configuration.Settings()
 27+from classes import settings
 28+settings = settings.Settings()
2829
2930 from database import db
3031
@@ -47,8 +48,10 @@
4849 elif jobtype == 'chart':
4950 _id = coll.save({'hash': hash, 'created': created,
5051 'jobtype': jobtype,
51 - 'project': rts.project,
52 - 'language_code': rts.language_code,
 52+ 'finished': True,
 53+ 'in_progress': True,
 54+ 'project': rts.project.name,
 55+ 'language_code': rts.language.code,
5356 'tasks': {}})
5457
5558 job = coll.find_one({'_id': _id})
Index: trunk/tools/editor_trends/utils/text_utils.py
@@ -21,11 +21,13 @@
2222 import time
2323 import sys
2424
25 -sys.path.append('..')
26 -import configuration
27 -settings = configuration.Settings()
 25+if '..' not in sys.path:
 26+ sys.path.append('..')
2827
 28+from classes import settings
 29+settings = settings.Settings()
2930
 31+
3032 def convert_timestamp_to_date(timestamp):
3133 return datetime.datetime.strptime(timestamp[:10], settings.date_format)
3234
Index: trunk/tools/editor_trends/utils/messages.py
@@ -17,14 +17,7 @@
1818 __date__ = '2011-01-05'
1919 __version__ = '0.1'
2020
21 -#import sys
22 -#sys.path.append('..')
23 -#
24 -#import configuration
25 -#settings = configuration.Settings()
2621
27 -
28 -
2922 def show(func):
3023 '''
3124 @func should be a qsize() belonging to a task queue. qsize() is not supported
Index: trunk/tools/editor_trends/database/launcher.py
@@ -21,8 +21,9 @@
2222 import subprocess
2323 import os
2424
25 -import configuration
26 -settings = configuration.Settings()
 25+
 26+from classes import settings
 27+settings = settings.Settings()
2728 from classes import exceptions
2829 from utils import file_utils
2930
Index: trunk/tools/editor_trends/database/cache.py
@@ -20,12 +20,14 @@
2121
2222 import datetime
2323 import sys
24 -sys.path.append('..')
2524 import bson
2625
27 -import configuration
28 -settings = configuration.Settings()
 26+if '..' not in sys.path:
 27+ sys.path.append('..')
2928
 29+from classes import settings
 30+settings = settings.Settings()
 31+
3032 import db
3133 from utils import file_utils
3234 from etl import shaper
Index: trunk/tools/editor_trends/database/db.py
@@ -17,15 +17,18 @@
1818 __date__ = '2010-10-21'
1919 __version__ = '0.1'
2020
 21+import sys
2122 import pymongo
2223 from bson.code import Code
23 -import sys
24 -sys.path.append('..')
2524
26 -import configuration
27 -settings = configuration.Settings()
 25+if '..' not in sys.path:
 26+ sys.path.append('..')
 27+
 28+from classes import settings
 29+settings = settings.Settings()
2830 import file_utils
2931
 32+
3033 def init_mongo_db(dbname):
3134 connection = pymongo.Connection()
3235 db = connection[dbname]
Index: trunk/tools/editor_trends/mapreduce/xml2pig.py
@@ -17,13 +17,15 @@
1818 __date__ = '2010-11-15'
1919 __version__ = '0.1'
2020
21 -import sys
22 -sys.path.append('..')
2321
2422 import os
2523 import xml.etree.cElementTree as cElementTree
2624
27 -import configuration
28 -settings = configuration.Settings()
 25+if '..' not in sys.path:
 26+ sys.path.append('..')
2927
 28+from classes import settings
 29+settings = settings.Settings()
3030
 31+
 32+
Index: trunk/tools/editor_trends/cronjobs.py
@@ -43,22 +43,22 @@
4444 return rts
4545
4646
47 -def launch_editor_trends_toolkit(task):
 47+def launch_editor_trends_toolkit(task, args):
4848 '''
4949 This function should only be called as a cronjob and not directly.
5050 '''
51 - rts = init_environment(task)
 51+ rts = runtime_settings.init_environment(task['project'], task['language_code'], args)
5252 res = manager.all_launcher(rts, None)
5353 return res
5454
5555
56 -def launch_chart(task):
 56+def launch_chart(task, args):
5757 '''
5858 This function should only be called as a cronjob and not directly.
5959 '''
6060 res = True
6161 try:
62 - rts = init_environment(task)
 62+ rts = runtime_settings.init_environment(task['project'], task['language_code'], args)
6363 func = task['jobtype']
6464 time_unit = 'month' #FIXME hardcoded string
6565 cutoff = 1 #FIXME hardcoded string
@@ -89,6 +89,8 @@
9090 mongo = db.init_mongo_db('wikilytics')
9191 coll = mongo['jobs']
9292 tasks = []
 93+ project, language, parser = manager.init_args_parser()
 94+ args = parser.parse_args(['django'])
9395 jobs = coll.find({'finished': False, 'in_progress': False, 'error': False})
9496 for job in jobs:
9597 tasks.append(job)
@@ -96,11 +98,11 @@
9799 for task in tasks:
98100 if task['jobtype'] == 'dataset':
99101 print 'Launching the Editor Trends Analytics Toolkit.'
100 - res = launch_editor_trends_toolkit(task)
 102+ res = launch_editor_trends_toolkit(task, args)
101103 #res = False
102104 else:
103105 print 'Launching %s.' % task['jobtype']
104 - res = launch_chart(task)
 106+ res = launch_chart(task, args)
105107
106108 if res:
107109 coll.update({'_id': task['_id']}, {'$set': {'finished': True}})
Index: trunk/tools/editor_trends/bots/detector.py
@@ -24,10 +24,14 @@
2525 import xml.etree.cElementTree as cElementTree
2626 import sys
2727 from Queue import Empty
28 -sys.path.append('..')
2928
30 -import configuration
31 -settings = configuration.Settings()
 29+if '..' not in sys.path:
 30+ sys.path.append('..')
 31+
 32+from classes import settings
 33+settings = settings.Settings()
 34+
 35+
3236 import wikitree
3337 from database import db
3438 from utils import file_utils

Status & tagging log