Index: trunk/tools/editor_trends/manage.py |
— | — | @@ -153,7 +153,7 @@ |
154 | 154 | print 'Start storing data in MongoDB' |
155 | 155 | stopwatch = timer.Timer() |
156 | 156 | log.log_to_mongo(properties, 'dataset', 'store', stopwatch, event='start') |
157 | | - db.cleanup_database(properties.project, logger) |
| 157 | + db.cleanup_database(properties.project.name, logger) |
158 | 158 | # write_message_to_log(logger, settings, |
159 | 159 | # message=None, |
160 | 160 | # verb='Storing', |
— | — | @@ -164,7 +164,7 @@ |
165 | 165 | # collection=properties.collection) |
166 | 166 | # for key in properties: |
167 | 167 | # print key, getattr(properties, key) |
168 | | - store.launcher(properties.sorted, properties.project, properties.collection) |
| 168 | + store.launcher(properties.sorted, properties.project.name, properties.collection) |
169 | 169 | stopwatch.elapsed() |
170 | 170 | log.log_to_mongo(properties, 'dataset', 'store', stopwatch, event='finish') |
171 | 171 | |
— | — | @@ -300,7 +300,7 @@ |
301 | 301 | action='store', |
302 | 302 | help='Enter the first letter of a language to see which languages are \ |
303 | 303 | available.') |
304 | | - parser_languages.set_defaults(func=language.show_languages) |
| 304 | + parser_languages.set_defaults(func=language.show_languages, args=[settings, project]) |
305 | 305 | |
306 | 306 | #CONFIG |
307 | 307 | parser_config = subparsers.add_parser('config', |
Index: trunk/tools/editor_trends/wikilytics/api/views.py |
— | — | @@ -13,6 +13,7 @@ |
14 | 14 | from wikilytics.api.forms import SearchForm, AnalysisForm |
15 | 15 | from wikilytics.api.models import Editor, Dataset, Job, Dump |
16 | 16 | import wikilytics.api.helpers as helpers |
| 17 | +from editor_trends.analyses import json_encoders |
17 | 18 | |
18 | 19 | |
19 | 20 | def search(request): |
— | — | @@ -76,7 +77,7 @@ |
77 | 78 | return HttpResponseRedirect(reverse('chart_generator', args=[project, language, chart])) |
78 | 79 | elif xhr: |
79 | 80 | dthandler = lambda obj:'new Date("%s")' % datetime.date.ctime(obj) if isinstance(obj, datetime.datetime) else obj |
80 | | - data = helpers.transform_to_json(ds) |
| 81 | + data = json_encoders.transform_to_json(ds) |
81 | 82 | return HttpResponse(json.dumps(data, default=dthandler), mimetype='application/json') |
82 | 83 | else: |
83 | 84 | |
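Note: json.dumps only invokes its default= hook for objects it cannot serialize itself, which is how the dthandler lambda above turns datetime objects into strings the chart template can evaluate. A self-contained sketch of the idiom:

    import datetime
    import json

    dthandler = lambda obj: 'new Date("%s")' % obj.ctime() \
        if isinstance(obj, datetime.datetime) else obj

    data = {'first_edit': datetime.datetime(2010, 3, 15, 7, 7, 30)}
    print json.dumps(data, default=dthandler)
    # {"first_edit": "new Date(\"Mon Mar 15 07:07:30 2010\")"}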
Index: trunk/tools/editor_trends/wikilytics/templates/chart.html |
— | — | @@ -19,6 +19,7 @@ |
20 | 20 | var options = json['options']; |
21 | 21 | var css_id = "#wikilytics"; |
22 | 22 | console.log(data); |
| 23 | + console.log(options); |
23 | 24 | $("#status > h1").hide(); |
24 | 25 | $.plot($(css_id), data, options); |
25 | 26 | } |
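Note: $.plot(placeholder, data, options) is Flot's entry point, so the JSON logged above needs a 'data' key (an array of series) and an 'options' key. A hypothetical minimal payload, built server-side in Python (the real keys come from json_encoders and are not shown in this diff; Flot's time mode expects x values in epoch milliseconds):

    payload = {
        'data': [{'label': 'new editors',
                  'data': [[1262304000000, 10], [1264982400000, 14]]}],
        'options': {'xaxis': {'mode': 'time'}},
    }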
Index: trunk/tools/editor_trends/configuration.py |
— | — | @@ -64,7 +64,7 @@ |
65 | 65 | class Settings: |
66 | 66 | __metaclass__ = Singleton |
67 | 67 | |
68 | | - def __init__(self, process_multiplier=1, **kwargs): |
| 68 | + def __init__(self, process_multiplier=1): |
69 | 69 | self.minimum_python_version = (2, 6) |
70 | 70 | self.detect_python_version() |
71 | 71 | self.encoding = 'utf-8' |
— | — | @@ -74,7 +74,7 @@ |
75 | 75 | |
76 | 76 | # Timestamp format as generated by the MediaWiki dumps |
77 | 77 | self.timestamp_format = '%Y-%m-%dT%H:%M:%SZ' |
78 | | - |
| 78 | + self.timestamp_server = '%a, %d %b %Y %H:%M:%S %Z'
79 | 79 | #67108864 # ==64Mb, see http://hadoop.apache.org/common/docs/r0.20.0/hdfs_design.html#Large+Data+Setsfor reason |
80 | 80 | self.max_xmlfile_size = 4096 * 1024 |
81 | 81 | |
Index: trunk/tools/editor_trends/utils/file_utils.py |
— | — | @@ -37,8 +37,9 @@ |
38 | 38 | import configuration |
39 | 39 | settings = configuration.Settings() |
40 | 40 | |
41 | | -import exceptions |
| 41 | +from classes import exceptions |
42 | 42 | import messages |
| 43 | +import text_utils |
43 | 44 | |
44 | 45 | try: |
45 | 46 | import psyco |
— | — | @@ -74,16 +75,10 @@ |
75 | 76 | ''' |
76 | 77 | for line in fh: |
77 | 78 | line = line.strip() |
78 | | - if line == '': |
79 | | - continue |
80 | | - else: |
81 | | - line = line.split('\t') |
82 | | - yield line |
| 79 | + line = line.split('\t') |
| 80 | + yield line |
83 | 81 | |
84 | 82 | |
85 | | - |
86 | | - |
87 | | - |
88 | 83 | # read / write data related functions |
89 | 84 | def read_data_from_csv(location, filename, encoding): |
90 | 85 | ''' |
— | — | @@ -237,6 +232,22 @@ |
238 | 233 | return os.path.getsize(path) |
239 | 234 | |
240 | 235 | |
| 236 | +def set_modified_data(mod_rem, location, filename):
| 237 | + '''
| 238 | + mod_rem is the last-modified date of the remote file (the Wikimedia
| 239 | + dump file), e.g. 'Mon, 15 Mar 2010 07:07:30 GMT'.
| 240 | + '''
| 241 | + import calendar # stdlib; timegm turns a GMT struct_time into epoch seconds
| 242 | + path = os.path.join(location, filename)
| 243 | + mod_rem = text_utils.convert_timestamp_to_datetime_naive(mod_rem, settings.timestamp_server)
| 244 | + mod_rem = calendar.timegm(mod_rem.timetuple())
| 245 | + os.utime(path, (mod_rem, mod_rem))
| 246 | +
| 247 | +def get_modified_date(location, filename): |
| 248 | + path = os.path.join(location, filename) |
| 249 | + return os.stat(path).st_mtime |
| 250 | + |
| 251 | + |
241 | 252 | def check_file_exists(location, filename): |
242 | 253 | if hasattr(filename, '__call__'): |
243 | 254 | filename = construct_filename(filename, '.bin') |
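Note: set_modified_data and get_modified_date together allow an incremental downloader to skip dumps that have not changed. A hypothetical wiring of the pieces added in this commit, assuming domain, path, filename and location are in scope and that http_utils.determine_modified_date returns the raw Last-Modified string:

    import datetime

    mod_rem = http_utils.determine_modified_date(domain, path, filename)
    mod_rem = text_utils.convert_timestamp_to_datetime_naive(
        mod_rem, settings.timestamp_server)
    mod_loc = datetime.datetime.utcfromtimestamp(
        file_utils.get_modified_date(location, filename))
    if mod_rem <= mod_loc:
        print 'Local copy of %s is up to date.' % filename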
Index: trunk/tools/editor_trends/utils/http_utils.py |
— | — | @@ -31,8 +31,6 @@ |
32 | 32 | import log |
33 | 33 | |
34 | 34 | |
35 | | - |
36 | | - |
37 | 35 | def read_data_from_http_connection(domain, path): |
38 | 36 | if not domain.startswith('http://'): |
39 | 37 | domain = 'http://%s' % domain |
— | — | @@ -50,7 +48,6 @@ |
51 | 49 | return data |
52 | 50 | |
53 | 51 | |
54 | | - |
55 | 52 | def retrieve_md5_hashes(domain, project, date): |
56 | 53 | path = '%s/%s/%s-%s-md5sums.txt' % (project, date, project, date) |
57 | 54 | data = read_data_from_http_connection(domain, path) |
— | — | @@ -68,7 +65,7 @@ |
69 | 66 | canonical_filename = file_utils.determine_canonical_name(filename) |
70 | 67 | for x in xrange(1, 100): |
71 | 68 | f = '%s%s.xml.%s' % (canonical_filename, x, ext) |
72 | | - res = check_remote_path_exists(domain, path, f) |
| 69 | + res = get_headers(domain, path, f) |
73 | 70 | if res == None or res.status != 200: |
74 | 71 | if x == 1: |
75 | 72 | task_queue.put(filename) |
— | — | @@ -83,8 +80,7 @@ |
84 | 81 | return task_queue |
85 | 82 | |
86 | 83 | |
87 | | - |
88 | | -def check_remote_path_exists(domain, path, filename): |
| 84 | +def get_headers(domain, path, filename): |
89 | 85 | ''' |
90 | 86 | @path is the full path of the file to be downloaded |
91 | 87 | @filename is the name of the file to be downloaded |
— | — | @@ -104,11 +100,20 @@ |
105 | 101 | |
106 | 102 | except httplib.socket.error: |
107 | 103 | raise httplib.NotConnected('It seems that %s is temporarily \ |
108 | | - unavailable, please try again later.' % url) |
| 104 | + unavailable, please try again later.' % url) |
109 | 105 | |
110 | 106 | |
| 107 | +def determine_modified_date(domain, path, filename):
| 108 | + '''Returns the Last-Modified header (a string), or -1 if unavailable.'''
| 109 | + res = get_headers(domain, path, filename)
| 110 | + if res != None and res.status == 200:
| 111 | + return res.getheader('last-modified', -1)
| 112 | + else:
| 113 | + return -1
| 114 | + |
| 115 | + |
111 | 116 | def determine_remote_filesize(domain, path, filename): |
112 | | - res = check_remote_path_exists(domain, path, filename) |
| 117 | + res = get_headers(domain, path, filename) |
113 | 118 | if res != None and res.status == 200: |
114 | 119 | return int(res.getheader('content-length', -1)) |
115 | 120 | else: |
— | — | @@ -116,9 +121,10 @@ |
117 | 122 | |
118 | 123 | |
119 | 124 | def debug(): |
120 | | - domain = 'download.wikimedia.org' |
121 | | - path = 'enwikinews' |
122 | | - filename = None |
| 125 | + domain = 'http://download.wikimedia.org' |
| 126 | + path = '/enwikinews/20100315/' |
| 127 | + filename = 'enwikinews-20100315-all-titles-in-ns0.gz' |
| 128 | + determine_modified_date(domain, path, filename) |
123 | 129 | #check_remote_path_exists(domain, path, filename) |
124 | 130 | #read_directory_contents(domain, path) |
125 | 131 | # download_wp_dump('http://download.wikimedia.org/enwiki/latest', |
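Note: the rename to get_headers reflects that the helper is useful beyond existence checks; both determine_modified_date and determine_remote_filesize now read response headers from it. Its body is not shown in this diff, but a minimal HEAD-request sketch of what it presumably does (hypothetical helper name; the 20100315 dump path mirrors the debug() values above and may no longer exist):

    import httplib

    def head(domain, path, filename):
        # HEAD fetches headers only, so no dump bytes are transferred
        conn = httplib.HTTPConnection(domain)
        conn.request('HEAD', '%s%s' % (path, filename))
        res = conn.getresponse()
        conn.close()
        return res

    res = head('download.wikimedia.org', '/enwikinews/20100315/',
               'enwikinews-20100315-all-titles-in-ns0.gz')
    if res.status == 200:
        print res.getheader('last-modified', -1)
        print res.getheader('content-length', -1)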
Index: trunk/tools/editor_trends/utils/compression.py |
— | — | @@ -25,7 +25,7 @@ |
26 | 26 | import configuration |
27 | 27 | settings = configuration.Settings() |
28 | 28 | import file_utils |
29 | | -import exceptions |
| 29 | +from classes import exceptions |
30 | 30 | import timer |
31 | 31 | import log |
32 | 32 | |
Index: trunk/tools/editor_trends/utils/text_utils.py |
— | — | @@ -30,18 +30,19 @@ |
31 | 31 | return datetime.datetime.strptime(timestamp[:10], settings.date_format) |
32 | 32 | |
33 | 33 | |
34 | | -def convert_timestamp_to_datetime_naive(timestamp): |
35 | | - return datetime.datetime.strptime(timestamp, settings.timestamp_format) |
| 34 | +def convert_timestamp_to_datetime_naive(timestamp, timestamp_format): |
| 35 | + return datetime.datetime.strptime(timestamp, timestamp_format) |
36 | 36 | |
37 | 37 | |
38 | 38 | def convert_timestamp_to_datetime_utc(timestamp): |
39 | 39 | tz = datetime.tzinfo('utc') |
40 | | - d = convert_timestamp_to_datetime_naive(timestamp) |
| 40 | + d = convert_timestamp_to_datetime_naive(timestamp, settings.timestamp_format) |
41 | 41 | #return d.replace(tzinfo=tz) #enabling this line crashes pymongo |
42 | 42 | return d |
43 | 43 | |
44 | 44 | |
45 | 45 | |
| 46 | + |
46 | 47 | def invert_dict(dictionary): |
47 | 48 | ''' |
48 | 49 | @dictionary is a simple dictionary containing simple values, ie. no lists, |