Index: trunk/tools/editor_trends/etl/downloader.py |
— | — | @@ -37,39 +37,42 @@ |
38 | 38 | ''' |
39 | 39 | success = True |
40 | 40 | chunk = 1024 * 4 |
| 41 | + |
| 42 | + |
41 | 43 | while True: |
42 | 44 | filename = task_queue.get(block=False) |
43 | 45 | task_queue.task_done() |
44 | 46 | if filename == None: |
45 | 47 | print 'Swallowed a poison pill' |
46 | 48 | break |
| 49 | + widgets = log.init_progressbar_widgets(filename) |
47 | 50 | extension = file_utils.determine_file_extension(filename) |
48 | 51 | filemode = file_utils.determine_file_mode(extension) |
49 | 52 | filesize = http_utils.determine_remote_filesize(properties.settings.wp_dump_location, |
50 | 53 | properties.dump_relative_path, |
51 | 54 | filename) |
52 | 55 | |
53 | | -# mod_rem = http_utils.determine_modified_date(properties.settings.wp_dump_location, |
54 | | -# properties.dump_relative_path, |
55 | | -# filename) |
| 56 | + mod_rem = http_utils.determine_modified_date(properties.settings.wp_dump_location, |
| 57 | + properties.dump_relative_path, |
| 58 | + filename) |
56 | 59 | |
57 | 60 | if file_utils.check_file_exists(properties.location, filename): |
58 | 61 | #This can be activated as soon as bug 21575 is fixed. |
59 | | - #mod_loc = file_utils.get_modified_date(properties.location, filename) |
60 | | - #if mod_loc != mod_rem: |
61 | | - print 'Swallowed a poison pill' |
62 | | - break |
| 62 | + properties.force = True |
| 63 | + mod_loc = file_utils.get_modified_date(properties.location, filename) |
| 64 | + if mod_loc != mod_rem and properties.force == False: |
| 65 | + print 'You already have downloaded the most recent %s%s dumpfile.' % (properties.language.code, properties.project.name) |
| 66 | + break |
63 | 67 | |
64 | 68 | if filemode == 'w': |
65 | 69 | fh = file_utils.create_txt_filehandle(properties.location, filename, filemode, properties.settings.encoding) |
66 | | - |
67 | 70 | else: |
68 | 71 | fh = file_utils.create_binary_filehandle(properties.location, filename, 'wb') |
69 | 72 | |
70 | 73 | if filesize != -1: |
71 | | - widgets = log.init_progressbar_widgets(filename) |
72 | 74 | pbar = progressbar.ProgressBar(widgets=widgets, maxval=filesize).start() |
73 | | - |
| 75 | + else: |
| 76 | + pbar = progressbar.ProgressBar(widgets=widgets).start() |
74 | 77 | try: |
75 | 78 | path = '%s%s' % (properties.dump_absolute_path, filename) |
76 | 79 | req = urllib2.Request(path) |
— | — | @@ -94,9 +97,8 @@ |
95 | 98 | success = False |
96 | 99 | finally: |
97 | 100 | fh.close() |
| 101 | + file_utils.set_modified_data(mod_rem, properties.location, filename) |
98 | 102 | |
99 | | - #file_utils.set_modified_data(mod_rem, properties.location, filename) |
100 | | - |
101 | 103 | return success |
102 | 104 | |
103 | 105 | |
— | — | @@ -109,9 +111,13 @@ |
110 | 112 | #print tasks.qsize() |
111 | 113 | #if tasks.qsize() < properties.settings.number_of_processes: |
112 | 114 | # properties.settings.number_of_processes = tasks.qsize() |
113 | | - consumers = [multiprocessing.Process(target=download_wiki_file, |
114 | | - args=(tasks, properties)) |
115 | | - for i in xrange(properties.settings.number_of_processes + 1)] |
| 115 | + if tasks.qsize() > 1: |
| 116 | + consumers = [multiprocessing.Process(target=download_wiki_file, |
| 117 | + args=(tasks, properties)) |
| 118 | + for i in xrange(properties.settings.number_of_processes)] |
| 119 | + else: consumers = [multiprocessing.Process(target=download_wiki_file, |
| 120 | + args=(tasks, properties)) |
| 121 | + for i in xrange(1)] |
116 | 122 | print 'Starting consumers to download files...' |
117 | 123 | for w in consumers: |
118 | 124 | w.start() |
Index: trunk/tools/editor_trends/configuration.py |
— | — | @@ -74,14 +74,14 @@ |
75 | 75 | |
76 | 76 | # Timestamp format as generated by the MediaWiki dumps |
77 | 77 | self.timestamp_format = '%Y-%m-%dT%H:%M:%SZ' |
78 | | - self.timestamp_server = '%D, %d %M %Y %H:M%:%SZ' |
| 78 | + self.timestamp_server = '%a, %d %b %Y %H:%M:%S %Z' |
79 | 79 | #67108864 # ==64Mb, see http://hadoop.apache.org/common/docs/r0.20.0/hdfs_design.html#Large+Data+Setsfor reason |
80 | 80 | self.max_xmlfile_size = 4096 * 1024 |
81 | 81 | |
82 | 82 | #Change this to match your computers configuration (RAM / CPU) |
83 | 83 | self.number_of_processes = cpu_count() * process_multiplier |
84 | 84 | |
85 | | - self.wp_dump_location = 'http://download.wikimedia.org' |
| 85 | + self.wp_dump_location = 'http://dumps.wikimedia.org' |
86 | 86 | self.xml_namespace = 'http://www.mediawiki.org/xml/export-0.4/' |
87 | 87 | self.ascii_extensions = ['txt', 'csv', 'xml', 'sql', 'json'] |
88 | 88 | self.windows_register = {'7z.exe': 'Software\\7-Zip', } |
Index: trunk/tools/editor_trends/utils/http_utils.py |
— | — | @@ -28,6 +28,7 @@ |
29 | 29 | import configuration |
30 | 30 | settings = configuration.Settings() |
31 | 31 | import file_utils |
| 32 | +import text_utils |
32 | 33 | import log |
33 | 34 | |
34 | 35 | |
— | — | @@ -73,10 +74,10 @@ |
74 | 75 | else: |
75 | 76 | print 'Added chunk to download: %s' % f |
76 | 77 | task_queue.put(f) |
77 | | - if x < settings.number_of_processes: |
78 | | - settings.number_of_processes = x |
79 | | - for x in xrange(settings.number_of_processes): |
80 | | - task_queue.put(None) |
| 78 | +# if x < settings.number_of_processes: |
| 79 | +# settings.number_of_processes = x |
| 80 | + for x in xrange(settings.number_of_processes): |
| 81 | + task_queue.put(None) |
81 | 82 | return task_queue |
82 | 83 | |
83 | 84 | |
— | — | @@ -106,25 +107,28 @@ |
def determine_modified_date(domain, path, filename):
    '''
    Return the value of the Last-Modified HTTP header for the remote file
    identified by domain + path + filename.

    @domain: hostname of the dump server (e.g. 'http://dumps.wikimedia.org')
    @path: relative path on the server
    @filename: name of the dump file
    Returns the raw Last-Modified header string (server timestamp format),
    or -1 if the header is absent or the request failed.
    '''
    res = get_headers(domain, path, filename)
    # NOTE(review): 301 is accepted alongside 200 — presumably because the
    # dump server may answer with a redirect that still carries a usable
    # last-modified header; confirm against the server's actual behaviour.
    if res is not None and res.status in (200, 301):
        # Return the header as-is; callers convert it with the
        # timestamp_server format defined in configuration.py.
        return res.getheader('last-modified', -1)
    else:
        return -1
114 | 115 | |
115 | 116 | |
def determine_remote_filesize(domain, path, filename):
    '''
    Return the size in bytes of the remote file identified by
    domain + path + filename, or -1 if it cannot be determined.

    @domain: hostname of the dump server
    @path: relative path on the server
    @filename: name of the dump file
    '''
    res = get_headers(domain, path, filename)
    # Bug fix: this condition used `or`, which evaluates res.status even
    # when res is None and raises AttributeError. `and` short-circuits,
    # so res.status is only read on a real response object.
    if res is not None and res.status == 200:
        return int(res.getheader('content-length', -1))
    else:
        return -1
122 | 123 | |
123 | 124 | |
124 | 125 | def debug(): |
125 | | - domain = 'http://download.wikimedia.org' |
| 126 | + domain = 'http://dumps.wikimedia.org' |
126 | 127 | path = '/enwikinews/20100315/' |
127 | 128 | filename = 'enwikinews-20100315-all-titles-in-ns0.gz' |
128 | | - determine_modified_date(domain, path, filename) |
| 129 | + mod_date = determine_modified_date(domain, path, filename) |
| 130 | + print mod_date |
| 131 | + mod_date = text_utils.convert_timestamp_to_datetime_naive(mod_date, '%a, %d %b %Y %H:%M:%S %Z') |
| 132 | + print mod_date |
129 | 133 | #check_remote_path_exists(domain, path, filename) |
130 | 134 | #read_directory_contents(domain, path) |
131 | 135 | # download_wp_dump('http://download.wikimedia.org/enwiki/latest', |