Index: trunk/extensions/SwiftMedia/wmf/proxy-server.conf.example |
— | — | @@ -0,0 +1,30 @@ |
| 2 | +## |
| 3 | +## you must add a filter:rewrite section to the swift proxy-server.conf |
| 4 | +## this file documents the options available |
| 5 | +## |
| 6 | + |
| 7 | +[filter:rewrite] |
| 8 | + |
| 9 | +# the auth system turns our login and key into an account / token pair. |
| 10 | +# the account remains valid forever, but the token times out. |
| 11 | +url = http://127.0.0.1:8080/auth/v1.0 |
| 12 | +account = AUTH_205b4c23-6716-4a3b-91b2-5da36ce1d120 |
| 13 | +login = mw:thumb |
| 14 | +key = testing |
| 15 | + |
| 16 | +# the name of the scaler cluster. |
| 17 | +thumbhost = upload.wikimedia.org |
| 18 | + |
| 19 | +# upload doesn"t like our User-agent (Python-urllib/2.6), otherwise we could call it using urllib2.urlopen() |
| 20 | +user_agent = Mozilla/5.0 |
| 21 | + |
| 22 | +# uncomment this if we want to write the 404 handler"s output into Swift. |
| 23 | +writethumb=yes |
| 24 | + |
| 25 | +# Should we shard containers? Note that this config and mediawiki's config must agree or bad things will happen. |
| 26 | +# valid choices: all, some, none |
| 27 | +shard_containers = some |
| 28 | +# if we shard some containers, provide an unquoted comma-separated list of containers to shard |
| 29 | +# note that these are container names, not URL paths. |
| 30 | +shard_container_list = wikimedia-commons-thumb,wikipedia-en-thumb |
| 31 | + |
Index: trunk/extensions/SwiftMedia/wmf/rewrite.py |
— | — | @@ -59,7 +59,6 @@ |
60 | 60 | try: |
61 | 61 | self.copyconn.close() #06 or #04 if it fails. |
62 | 62 | except wmf.client.ClientException, err: |
63 | | - self.app.logger.warn("PUT Status: %d" % err.http_status) |
64 | 63 | if err.http_status == 401: |
65 | 64 | # not worth retrying the write. Thumb will get saved |
66 | 65 | # the next time. |
— | — | @@ -100,6 +99,13 @@ |
101 | 100 | self.writethumb = 'writethumb' in conf |
102 | 101 | self.user_agent = conf['user_agent'].strip() |
103 | 102 | self.bind_port = conf['bind_port'].strip() |
| 103 | + self.shard_containers = conf['shard_containers'].strip() #all, some, none |
| 104 | + if (self.shard_containers == 'some'): |
| 105 | + # if we're supposed to shard some containers, get a cleaned list of the containers to shard |
| 106 | + def striplist(l): |
| 107 | + return([x.strip() for x in l]) |
| 108 | + self.shard_container_list = striplist(conf['shard_container_list'].split(',')) |
| 109 | + |
104 | 110 | #self.logger = get_logger(conf) |
105 | 111 | |
106 | 112 | def handle404(self, reqorig, url, container, obj): |
— | — | @@ -155,44 +161,55 @@ |
156 | 162 | if req.method == 'PUT': |
157 | 163 | return self.app(env, start_response) |
158 | 164 | |
159 | | - # if it already has AUTH, presume that it's good. #07 |
160 | | - if req.path.startswith('/auth') or req.path.find('AUTH') >= 0: |
| 165 | + # double (or triple, etc.) slashes in the URL should be ignored; collapse them. fixes bug 32864 |
| 166 | + while(req.path_info != req.path_info.replace('//', '/')): |
| 167 | + req.path_info = req.path_info.replace('//', '/') |
| 168 | + |
| 169 | + # if it already has AUTH, presume that it's good. #07. fixes bug 33620 |
| 170 | + hasauth = re.search('/AUTH_[0-9a-fA-F]{32}/', req.path) |
| 171 | + if req.path.startswith('/auth') or hasauth: |
161 | 172 | return self.app(env, start_response) |
162 | 173 | |
163 | 174 | # keep a copy of the original request so we can ask the scalers for it |
164 | 175 | reqorig = req.copy() |
| 176 | + |
165 | 177 | # match these two URL forms (source files and thumbnails): |
166 | | - # http://upload.wikimedia.org/<site>/<lang>/.* |
167 | | - # http://upload.wikimedia.org/<site>/<lang>/thumb/.* |
| 178 | + # http://upload.wikimedia.org/<proj>/<lang>/.* |
| 179 | + # http://upload.wikimedia.org/<proj>/<lang>/thumb/.* |
168 | 180 | # example: |
169 | 181 | # http://upload.wikimedia.org/wikipedia/commons/a/aa/000_Finlanda_harta.PNG |
170 | 182 | # http://upload.wikimedia.org/wikipedia/commons/thumb/a/aa/000_Finlanda_harta.PNG/75px-000_Finlanda_harta.PNG |
171 | 183 | # http://upload.wikimedia.org/wikipedia/commons/thumb/archive/b/b6/20101108115418!Gilbert_Stuart_Williamstown_Portrait_of_George_Washington.jpg/100px-Gilbert_Stuart_Williamstown_Portrait_of_George_Washington.jpg |
172 | | - match = re.match(r'/(?P<proj>.*?)/(?P<lang>.*?)/(?P<thumb>thumb/)?(?P<archive>(temp|archive)/)?(?P<shard>./../)?(?P<path>.*)', req.path) |
| 184 | + match = re.match(r'/(?P<proj>[^/]*?)/(?P<lang>[^/]*?)/(?P<thumb>thumb/)?(?P<archive>(temp|archive)/)?(?P<shard>[0-9a-f]/[0-9a-f]{2}/)?(?P<path>.*)', req.path) |
173 | 185 | if match: |
174 | 186 | # Our target URL is as follows (example): |
175 | 187 | # https://alsted.wikimedia.org:8080/v1/AUTH_6790933748e741268babd69804c6298b/wikipedia-en-25/Machinesmith.png |
176 | 188 | # http://msfe/v1/AUTH_6790933748e741268babd69804c6298b/wikipedia-commons-aa/000_Finlanda_harta.PNG |
177 | 189 | # http://mfse/v1/AUTH_6790933748e741268babd69804c6298b/wikipedia-commons-thumb-aa/000_Finlanda_harta.PNG/75px-000_Finlanda_harta.PNG |
178 | 190 | |
179 | | - # quote slashes in the container name |
| 191 | + # turn slashes in the container name into hyphens |
180 | 192 | container = "%s-%s" % (match.group('proj'), match.group('lang')) #02 |
181 | 193 | thumb = match.group('thumb') |
| 194 | + arch = match.group('archive') |
182 | 195 | shard = match.group('shard') |
183 | 196 | obj = match.group('path') |
184 | | - arch = match.group('archive') |
185 | 197 | # include the thumb in the container. |
186 | 198 | if thumb: #03 |
187 | 199 | container += "-thumb" |
188 | | - if shard: |
189 | | - #add only the 2-digit shard to the container name |
190 | | - container += "-%s" % shard[2:4] |
| 200 | + |
| 201 | + # only pull out shard if we're supposed to shard this container |
| 202 | + if ( (self.shard_containers == 'all') or \ |
| 203 | + ((self.shard_containers == 'some') and (container in self.shard_container_list))): |
| 204 | + if shard: |
| 205 | + #add only the 2-digit shard to the container name |
| 206 | + container += "-%s" % shard[2:4] |
191 | 207 | if arch: |
192 | 208 | # for urls that go /wiki/thumb/archive/a/ab/path, the container is wiki-thumb-ab and the obj is archive/path |
| 209 | + # aka pull the shard into the container if necessary but the string 'archive' or 'temp' goes into the object. |
193 | 210 | obj = "%s%s" % (arch, obj) |
194 | 211 | |
195 | 212 | if not obj: |
196 | | - # don't let them list the container (it's CRAZY huge) #08 |
| 213 | + # don't let them list the contents of the container (it's CRAZY huge) #08 |
197 | 214 | resp = webob.exc.HTTPForbidden('No container listing') |
198 | 215 | return resp(env, start_response) |
199 | 216 | |
— | — | @@ -203,6 +220,7 @@ |
204 | 221 | url = req.url[:] |
205 | 222 | # Create a path to our object's name. |
206 | 223 | req.path_info = "/v1/%s/%s/%s" % (self.account, container, urllib2.unquote(obj)) |
| 224 | + #self.logger.warn("new path is %s" % req.path_info) |
207 | 225 | |
208 | 226 | controller = ObjectController() |
209 | 227 | # do_start_response just remembers what it got called with, |