r81303 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r81302 | r81303 | r81304 >
Date:03:56, 1 February 2011
Author:diederik
Status:deferred
Tags:
Comment:
Downloader now checks if a new dump file is available by comparing the Last-Modified HTTP header of the remote file and the modified date of the local file. The modified date of the local file is set to the Last-Modified HTTP date when the file is downloaded for the first time.
Modified paths:
  • /trunk/tools/editor_trends/configuration.py (modified) (history)
  • /trunk/tools/editor_trends/etl/downloader.py (modified) (history)
  • /trunk/tools/editor_trends/utils/http_utils.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/etl/downloader.py
@@ -37,39 +37,42 @@
3838 '''
3939 success = True
4040 chunk = 1024 * 4
 41+
 42+
4143 while True:
4244 filename = task_queue.get(block=False)
4345 task_queue.task_done()
4446 if filename == None:
4547 print 'Swallowed a poison pill'
4648 break
 49+ widgets = log.init_progressbar_widgets(filename)
4750 extension = file_utils.determine_file_extension(filename)
4851 filemode = file_utils.determine_file_mode(extension)
4952 filesize = http_utils.determine_remote_filesize(properties.settings.wp_dump_location,
5053 properties.dump_relative_path,
5154 filename)
5255
53 -# mod_rem = http_utils.determine_modified_date(properties.settings.wp_dump_location,
54 -# properties.dump_relative_path,
55 -# filename)
 56+ mod_rem = http_utils.determine_modified_date(properties.settings.wp_dump_location,
 57+ properties.dump_relative_path,
 58+ filename)
5659
5760 if file_utils.check_file_exists(properties.location, filename):
5861 #This can be activated as soon as bug 21575 is fixed.
59 - #mod_loc = file_utils.get_modified_date(properties.location, filename)
60 - #if mod_loc != mod_rem:
61 - print 'Swallowed a poison pill'
62 - break
 62+ properties.force = True
 63+ mod_loc = file_utils.get_modified_date(properties.location, filename)
 64+ if mod_loc != mod_rem and properties.force == False:
 65+ print 'You already have downloaded the most recent %s%s dumpfile.' % (properties.language.code, properties.project.name)
 66+ break
6367
6468 if filemode == 'w':
6569 fh = file_utils.create_txt_filehandle(properties.location, filename, filemode, properties.settings.encoding)
66 -
6770 else:
6871 fh = file_utils.create_binary_filehandle(properties.location, filename, 'wb')
6972
7073 if filesize != -1:
71 - widgets = log.init_progressbar_widgets(filename)
7274 pbar = progressbar.ProgressBar(widgets=widgets, maxval=filesize).start()
73 -
 75+ else:
 76+ pbar = progressbar.ProgressBar(widgets=widgets).start()
7477 try:
7578 path = '%s%s' % (properties.dump_absolute_path, filename)
7679 req = urllib2.Request(path)
@@ -94,9 +97,8 @@
9598 success = False
9699 finally:
97100 fh.close()
 101+ file_utils.set_modified_data(mod_rem, properties.location, filename)
98102
99 - #file_utils.set_modified_data(mod_rem, properties.location, filename)
100 -
101103 return success
102104
103105
@@ -109,9 +111,13 @@
110112 #print tasks.qsize()
111113 #if tasks.qsize() < properties.settings.number_of_processes:
112114 # properties.settings.number_of_processes = tasks.qsize()
113 - consumers = [multiprocessing.Process(target=download_wiki_file,
114 - args=(tasks, properties))
115 - for i in xrange(properties.settings.number_of_processes + 1)]
 115+ if tasks.qsize() > 1:
 116+ consumers = [multiprocessing.Process(target=download_wiki_file,
 117+ args=(tasks, properties))
 118+ for i in xrange(properties.settings.number_of_processes)]
 119+ else: consumers = [multiprocessing.Process(target=download_wiki_file,
 120+ args=(tasks, properties))
 121+ for i in xrange(1)]
116122 print 'Starting consumers to download files...'
117123 for w in consumers:
118124 w.start()
Index: trunk/tools/editor_trends/configuration.py
@@ -74,14 +74,14 @@
7575
7676 # Timestamp format as generated by the MediaWiki dumps
7777 self.timestamp_format = '%Y-%m-%dT%H:%M:%SZ'
78 - self.timestamp_server = '%D, %d %M %Y %H:M%:%SZ'
 78+ self.timestamp_server = '%a, %d %b %Y %H:%M:%S %Z'
7979 #67108864 # ==64Mb, see http://hadoop.apache.org/common/docs/r0.20.0/hdfs_design.html#Large+Data+Setsfor reason
8080 self.max_xmlfile_size = 4096 * 1024
8181
8282 #Change this to match your computers configuration (RAM / CPU)
8383 self.number_of_processes = cpu_count() * process_multiplier
8484
85 - self.wp_dump_location = 'http://download.wikimedia.org'
 85+ self.wp_dump_location = 'http://dumps.wikimedia.org'
8686 self.xml_namespace = 'http://www.mediawiki.org/xml/export-0.4/'
8787 self.ascii_extensions = ['txt', 'csv', 'xml', 'sql', 'json']
8888 self.windows_register = {'7z.exe': 'Software\\7-Zip', }
Index: trunk/tools/editor_trends/utils/http_utils.py
@@ -28,6 +28,7 @@
2929 import configuration
3030 settings = configuration.Settings()
3131 import file_utils
 32+import text_utils
3233 import log
3334
3435
@@ -73,10 +74,10 @@
7475 else:
7576 print 'Added chunk to download: %s' % f
7677 task_queue.put(f)
77 - if x < settings.number_of_processes:
78 - settings.number_of_processes = x
79 - for x in xrange(settings.number_of_processes):
80 - task_queue.put(None)
 78+# if x < settings.number_of_processes:
 79+# settings.number_of_processes = x
 80+ for x in xrange(settings.number_of_processes):
 81+ task_queue.put(None)
8182 return task_queue
8283
8384
@@ -106,25 +107,28 @@
107108 def determine_modified_date(domain, path, filename):
108109 res = get_headers(domain, path, filename)
109110 print res.__dict__
110 - if res != None and res.status == 200:
111 - return int(res.getheader('last-modified', -1))
 111+ if res != None and (res.status == 200 or res.status == 301):
 112+ return res.getheader('last-modified', -1)
112113 else:
113114 return - 1
114115
115116
116117 def determine_remote_filesize(domain, path, filename):
117118 res = get_headers(domain, path, filename)
118 - if res != None and res.status == 200:
 119+ if res != None or res.status == 200:
119120 return int(res.getheader('content-length', -1))
120121 else:
121122 return - 1
122123
123124
124125 def debug():
125 - domain = 'http://download.wikimedia.org'
 126+ domain = 'http://dumps.wikimedia.org'
126127 path = '/enwikinews/20100315/'
127128 filename = 'enwikinews-20100315-all-titles-in-ns0.gz'
128 - determine_modified_date(domain, path, filename)
 129+ mod_date = determine_modified_date(domain, path, filename)
 130+ print mod_date
 131+ mod_date = text_utils.convert_timestamp_to_datetime_naive(mod_date, '%a, %d %b %Y %H:%M:%S %Z')
 132+ print mod_date
129133 #check_remote_path_exists(domain, path, filename)
130134 #read_directory_contents(domain, path)
131135 # download_wp_dump('http://download.wikimedia.org/enwiki/latest',

Status & tagging log