r80866 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r80865‎ | r80866 | r80867 >
Date:16:51, 24 January 2011
Author:diederik
Status:deferred
Tags:
Comment:
Determines dump availability for each project / language combination.
Modified paths:
  • /trunk/tools/editor_trends/utils/inventory.py (added) (history)

Diff [purge]

Index: trunk/tools/editor_trends/utils/inventory.py
@@ -0,0 +1,127 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
# Module metadata.
__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
__author__email = 'dvanliere at gmail dot com'
__date__ = '2011-01-21'
__version__ = '0.1'

import re
import sys
from threading import Thread
from HTMLParser import HTMLParser
# Append '..' (resolved against the current working directory) to the module
# search path before the imports below run; presumably the project modules
# imported next live one directory up -- TODO confirm.
sys.path.append('..')

import configuration
# Module-wide settings object; this module reads wp_dump_location from it.
settings = configuration.Settings()

from database import db
import dump_downloader
import wikiprojects
 33+
class AnchorParser(HTMLParser):
    '''
    HTML parser that collects the href value of every <a> start tag it is fed.

    The collected values are appended, in document order, to the
    ``directories`` list. Used on HTML directory listings to obtain the
    linked entries.
    '''
    def __init__(self):
        HTMLParser.__init__(self)
        # href values seen so far, in the order they were encountered.
        self.directories = []

    def handle_starttag(self, tag, attrs):
        '''Record the href attribute(s) of anchor tags; ignore other tags.'''
        if tag != 'a':
            return
        self.directories.extend(
            value for key, value in attrs if key == 'href')
 49+
class Dumper(Thread):
    '''
    A simple threaded http parser that determines for the different wikimedia
    projects when dumps are available. This data is stored in MongoDB and can
    be fed into the Django Wikilytics application so that people know for sure
    that a particular month / year combination is available instead of getting
    errors that a particular dump does not exist.

    One instance handles one project: ``project`` is the key into
    ``properties.projects`` and ``properties`` is the object that provides
    ``projects`` and ``project_supports_language``.
    '''
    def __init__(self, project, properties):
        Thread.__init__(self)
        self.project = project
        self.props = properties
        # Filled in by run(): one entry per language whose remote path
        # exists, holding the result of determine_available_dumps().
        self.data = {}

    def store_available_dumps(self):
        '''
        Save the collected data as a {'project': ..., 'dumps': ...} document
        in the 'available_dumps' collection of the 'wikilytics' database.
        '''
        mongo = db.init_mongo_db('wikilytics')
        coll = mongo['available_dumps']

        coll.save({'project': self.project, 'dumps': self.data})

    def run(self):
        '''
        Probe the dump location for every language this project supports,
        collect the available dump dates per language and store the result.
        '''
        project = self.props.projects[self.project]
        langs = self.props.project_supports_language(project)

        for lang in langs:
            path = '%s%s' % (lang, project)
            res = dump_downloader.check_remote_path_exists(
                settings.wp_dump_location, path, None)
            # Skip languages whose path could not be checked (no response)
            # or did not answer with 200 / 301. Identity test instead of
            # '!= None' so the check cannot be affected by a custom __ne__.
            if res is None or res.status not in (200, 301):
                continue
            print('Constructing list of available dumps for %s' % path)
            directories = dump_downloader.read_directory_contents(
                settings.wp_dump_location, path)
            dates = determine_available_dumps(directories)
            self.data.setdefault(lang, dates)

        self.store_available_dumps()
 85+
 86+
def read_directory_contents(domain, path):
    '''
    Return the href values found in the HTML fetched for domain / path.

    The document is obtained from read_data_from_http_connection(domain, path)
    and parsed with AnchorParser.

    NOTE(review): read_data_from_http_connection is neither defined nor
    imported in the visible part of this module, so calling this function
    looks like it would raise NameError. Dumper.run uses
    dump_downloader.read_directory_contents instead -- confirm whether this
    copy is still needed.
    '''
    anchor_parser = AnchorParser()
    anchor_parser.feed(read_data_from_http_connection(domain, path))
    return anchor_parser.directories
 92+
 93+
def determine_available_dumps(directories):
    '''
    Reduce a list of dump directory names to the months available per year.

    Each name is expected to start with YYYYMM (for example '20110115/').
    The entries '../' and 'latest/' are ignored. A name whose first six
    characters are not numeric is printed and skipped.

    Returns a dict that maps the year as a string to a sorted list of the
    month numbers (ints) found for that year.
    '''
    dates = {}
    for directory in directories:
        if directory in ('../', 'latest/'):
            continue
        try:
            year = int(directory[:4])
            month = int(directory[4:6])
        except ValueError:
            print(directory)
            # Skip this entry: falling through would record the year / month
            # left over from the previous iteration (or raise NameError when
            # the very first entry is malformed).
            continue
        dates.setdefault(year, set()).add(month)
    # Keys are strings so the result can be stored as a document; months are
    # sorted so the output does not depend on set iteration order.
    return dict((str(year), sorted(months)) for year, months in dates.items())
 110+
 111+
def launcher():
    '''
    Start one Dumper thread for every project except 'wiki', wait for each
    of them to finish and print the dumps that were found per project.
    '''
    properties = wikiprojects.Wiki(settings)
    workers = []
    for name in properties.projects:
        if name != 'wiki':
            worker = Dumper(name, properties)
            workers.append(worker)
            worker.start()

    # Join in start order; each thread's result is reported once it is done.
    for worker in workers:
        worker.join()
        print('Found dumps for %s: %s' % (worker.project, worker.data))
 125+
 126+
# Run the inventory only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    launcher()

Status & tagging log