Index: trunk/tools/editor_trends/utils/inventory.py |
— | — | @@ -0,0 +1,127 @@ |
| 2 | +#!/usr/bin/python
|
| 3 | +# -*- coding: utf-8 -*-
|
| 4 | +'''
|
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
|
| 6 | +This program is free software; you can redistribute it and/or
|
| 7 | +modify it under the terms of the GNU General Public License version 2
|
| 8 | +as published by the Free Software Foundation.
|
| 9 | +This program is distributed in the hope that it will be useful,
|
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
| 12 | +See the GNU General Public License for more details, at
|
| 13 | +http://www.fsf.org/licenses/gpl.html
|
| 14 | +'''
|
| 15 | +
|
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
|
| 17 | +__author__email = 'dvanliere at gmail dot com'
|
| 18 | +__date__ = '2011-01-21'
|
| 19 | +__version__ = '0.1'
|
| 20 | +
|
| 21 | +import re
|
| 22 | +import sys
|
| 23 | +from threading import Thread
|
| 24 | +from HTMLParser import HTMLParser
|
| 25 | +sys.path.append('..')
|
| 26 | +
|
| 27 | +import configuration
|
| 28 | +settings = configuration.Settings()
|
| 29 | +
|
| 30 | +from database import db
|
| 31 | +import dump_downloader
|
| 32 | +import wikiprojects
|
| 33 | +
|
class AnchorParser(HTMLParser):
    '''
    Minimal HTML parser for directory listings: the href of every anchor
    tag that is fed to the parser is collected, in document order, in
    self.directories.
    '''
    def __init__(self):
        HTMLParser.__init__(self)
        self.directories = []

    def handle_starttag(self, tag, attrs):
        # Only anchor tags are of interest; every other tag is ignored.
        if tag != 'a':
            return
        hrefs = [target for name, target in attrs if name == 'href']
        self.directories.extend(hrefs)
| 49 | +
|
class Dumper(Thread):
    '''
    A simple threaded http parser that determines for one Wikimedia project,
    and for every language that project supports, when dumps are available.
    The result is kept in self.data (keyed by language) and stored in MongoDB
    so that it can be fed into the Django Wikilytics application: people then
    know for sure that a particular month / year combination is available
    instead of getting errors that a particular dump does not exist.
    '''
    def __init__(self, project, properties):
        '''
        project: key into properties.projects identifying the project.
        properties: object exposing `projects` and
        `project_supports_language` (launcher() passes a wikiprojects.Wiki
        instance).
        '''
        Thread.__init__(self)
        self.project = project
        self.props = properties
        self.data = {}

    def store_available_dumps(self):
        '''
        Save the collected dump dates for this project to the
        'available_dumps' collection of the 'wikilytics' database.
        '''
        mongo = db.init_mongo_db('wikilytics')
        coll = mongo['available_dumps']
        coll.save({'project': self.project, 'dumps': self.data})

    def run(self):
        '''
        For every language of the project, check whether the remote dump
        path exists, read its directory listing, turn it into year / month
        data and finally store everything that was found.
        '''
        project = self.props.projects[self.project]
        langs = self.props.project_supports_language(project)

        for lang in langs:
            path = '%s%s' % (lang, project)
            res = dump_downloader.check_remote_path_exists(
                settings.wp_dump_location, path, None)
            # None is compared by identity, not equality, so a response
            # object with custom comparison methods cannot confuse the test.
            if res is not None and res.status in (200, 301):
                print('Constructing list of available dumps for %s' % path)
                directories = dump_downloader.read_directory_contents(
                    settings.wp_dump_location, path)
                dates = determine_available_dumps(directories)
                self.data.setdefault(lang, dates)

        self.store_available_dumps()
| 85 | +
|
| 86 | +
|
def read_directory_contents(domain, path):
    '''
    Fetch the HTML directory listing found at domain / path and return the
    href values of all anchors in it, in document order.
    '''
    parser = AnchorParser()
    # The bare name read_data_from_http_connection is neither defined nor
    # imported in this module, so calling it raised NameError.
    # NOTE(review): assumes the helper is provided by dump_downloader --
    # confirm against that module.
    data = dump_downloader.read_data_from_http_connection(domain, path)
    parser.feed(data)
    return parser.directories
| 92 | +
|
| 93 | +
|
def determine_available_dumps(directories):
    '''
    Translate directory names of the form YYYYMM... into the months for
    which a dump exists.

    directories: iterable of directory names taken from a dump listing.

    Returns a dict that maps each year (as a string) to a sorted list of
    month numbers, e.g. {'2010': [1, 3]}. The entries '../' and 'latest/'
    are ignored; names whose first six characters cannot be parsed as a
    year and a month are printed and skipped.
    '''
    dates = {}
    for directory in directories:
        if directory in ('../', 'latest/'):
            continue
        try:
            year = int(directory[:4])
            month = int(directory[4:6])
        except ValueError:
            # Not a dated directory: report it and move on. Without the
            # continue, the statement below would record the year / month
            # of the previous iteration again, or fail with an unbound
            # variable when the very first entry is malformed.
            print(directory)
            continue
        dates.setdefault(year, set()).add(month)
    # Sort the months so that the result does not depend on set ordering.
    return dict((str(year), sorted(months))
                for year, months in dates.items())
| 110 | +
|
| 111 | +
|
def launcher():
    '''
    Start one Dumper thread for every project listed by wikiprojects.Wiki,
    except 'wiki', then wait for each thread in turn and print the dumps it
    found.
    '''
    properties = wikiprojects.Wiki(settings)
    workers = []
    for name in properties.projects:
        if name != 'wiki':
            worker = Dumper(name, properties)
            workers.append(worker)
            worker.start()

    for worker in workers:
        worker.join()
        print('Found dumps for %s: %s' % (worker.project, worker.data))
| 125 | +
|
| 126 | +
|
# Build the inventory of available dumps when this file is run as a script.
if __name__ == '__main__':
    launcher()