r2003 pywikipedia - Code Review archive

Repository: pywikipedia
Revision: r2002 | r2003 | r2004
Date: 22:45, 21 July 2005
Author: wikipedian
Status: old
Tags:
Comment: new generator to read XML dump files, as a replacement for sqldump.py
Modified paths:
  • /trunk/pywikipedia/wikipedia.py (modified)
  • /trunk/pywikipedia/xmlreader.py (added)

Diff

Index: trunk/pywikipedia/xmlreader.py
@@ -0,0 +1,131 @@
+# -*- coding: utf-8 -*-
+
+"""
+Each XmlEntry object represents a page, as read from an XML source
+
+The MediaWikiXmlHandler can be used for the XML given by Special:Export
+as well as for XML dumps.
+
+The XmlDump class reads a pages_current XML dump (like the ones offered on
+http://download.wikimedia.org/wikipedia/de/) and offers a generator over
+XmlEntry objects which can be used by other bots.
+"""
+import threading, time
+import xml.sax
+import wikipedia
+
+class XmlEntry:
+    """
+    Represents a page.
+    """
+    def __init__(self, title, text, timestamp):
+        # TODO: there are more tags we can read.
+        self.title = title
+        self.text = text
+        self.timestamp = timestamp
+
+class MediaWikiXmlHandler(xml.sax.handler.ContentHandler):
+    def setCallback(self, callback):
+        self.callback = callback
+
+    def startElement(self, name, attrs):
+        self.destination = None
+        if name == 'page':
+            self.text=u''
+            self.title=u''
+            self.timestamp=u''
+        elif name == 'text':
+            self.destination = 'text'
+            self.text=u''
+        elif name == 'title':
+            self.destination = 'title'
+            self.title=u''
+        elif name == 'timestamp':
+            self.destination = 'timestamp'
+            self.timestamp=u''
+
+    def endElement(self, name):
+        if name == 'revision':
+            # All done for this.
+            text = self.text
+            # Remove trailing newlines and spaces
+            while text and text[-1] in '\n ':
+                text = text[:-1]
+            # Replace newline by cr/nl
+            text = u'\r\n'.join(text.split('\n'))
+            # Decode the timestamp
+            timestamp = (self.timestamp[0:4]+
+                         self.timestamp[5:7]+
+                         self.timestamp[8:10]+
+                         self.timestamp[11:13]+
+                         self.timestamp[14:16]+
+                         self.timestamp[17:19])
+            self.title = self.title.strip()
+            # Report back to the caller
+            entry = XmlEntry(self.title, text, timestamp)
+            self.callback(entry)
+
+    def characters(self, data):
+        if self.destination == 'text':
+            self.text += data
+        elif self.destination == 'title':
+            self.title += data
+        elif self.destination == 'timestamp':
+            self.timestamp += data
+
+class XmlParserThread(threading.Thread):
+    """
+    This XML parser will run as a single thread. This allows the XmlDump
+    generator to yield pages before the parser has finished reading the
+    entire dump.
+
+    There surely are more elegant ways to do this.
+    """
+    def __init__(self, filename, handler):
+        threading.Thread.__init__(self)
+        self.filename = filename
+        self.handler = handler
+
+    def run(self):
+        xml.sax.parse(self.filename, self.handler)
+
+class XmlDump(object):
+    """
+    Represents an XML dump file. Reads the local file at initialization,
+    parses it, and offers access to the resulting XmlEntries via a generator.
+    """
+    def __init__(self, filename):
+        self.filename = filename
+        self.finished = False
+        self.handler = MediaWikiXmlHandler()
+        self.handler.setCallback(self.oneDone)
+        self.parserThread = XmlParserThread(self.filename, self.handler)
+        # thread dies when program terminates
+        self.parserThread.setDaemon(True)
+        # this temporary variable will contain an XmlEntry given by the parser
+        # until it has been yielded by the generator.
+        self.lastEntry = None
+
+    def oneDone(self, entry):
+        self.lastEntry = entry
+        # wait until this class has yielded the page. Otherwise the parser
+        # thread would give another page before we had time to yield the
+        # current one.
+        while self.lastEntry:
+            time.sleep(0.001)
+
+    def __call__(self):
+        '''
+        Generator which parses the XML dump file and yields XmlEntry
+        objects, one per page. Stops when the end of the file is
+        reached.
+        '''
+        wikipedia.output(u'Reading XML dump')
+        self.parserThread.start()
+        while self.parserThread.isAlive():
+            if self.lastEntry:
+                yield self.lastEntry
+                self.lastEntry = None
+            else:
+                # wait 1 ms for the parser thread to deliver the next entry
+                time.sleep(0.001)
Property changes on: trunk/pywikipedia/xmlreader.py
___________________________________________________________________
Added: svn:eol-style
   + native
Added: svn:keywords
   + Author Date Id Revision
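
The new XmlDump class is meant to be used as a page generator by other bots. A minimal usage sketch follows, assuming a locally downloaded pages_current dump; the filename dewiki-pages_current.xml is only illustrative and not part of this revision:

# Hypothetical example, not part of this commit: iterate over a local dump
# with the XmlDump generator added above.
import xmlreader

dump = xmlreader.XmlDump('dewiki-pages_current.xml')  # filename is an assumption
for entry in dump():  # calling the XmlDump object returns a generator of XmlEntry objects
    # Each XmlEntry carries the page title, its text (with \r\n line endings)
    # and a compacted timestamp (YYYYMMDDHHMMSS), as built by MediaWikiXmlHandler.
    print entry.title.encode('utf-8')

The same MediaWikiXmlHandler also accepts the XML produced by Special:Export, which is how GetAll in wikipedia.py uses it below via xml.sax.parseString.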
Index: trunk/pywikipedia/wikipedia.py
@@ -93,12 +93,13 @@
 import difflib
 import re, urllib, codecs, sys
 import xml.sax, xml.sax.handler
+import htmlentitydefs
+
+import config, mediawiki_messages, login
+import xmlreader
 import warnings
 import datetime
 
-import config, mediawiki_messages, login
-import htmlentitydefs
-
 import locale
 # we'll set the locale to system default. This will ensure correct string
 # handling for non-latin characters on Python 2.3.x. For Python 2.4.x it's no
@@ -767,54 +768,7 @@
         txt = 'redirect'
     return re.compile(r'\#'+txt+':? *\[\[(.*?)(\]|\|)', re.I)
 
-# Shortcut get to get multiple pages at once
-class WikimediaXmlHandler(xml.sax.handler.ContentHandler):
-    def setCallback(self, callback):
-        self.callback = callback
-
-    def startElement(self, name, attrs):
-        self.destination = None
-        if name == 'page':
-            self.text=u''
-            self.title=u''
-            self.timestamp=u''
-        elif name == 'text':
-            self.destination = 'text'
-            self.text=u''
-        elif name == 'title':
-            self.destination = 'title'
-            self.title=u''
-        elif name == 'timestamp':
-            self.destination = 'timestamp'
-            self.timestamp=u''
-
-    def endElement(self, name):
-        if name == 'revision':
-            # All done for this.
-            text = self.text
-            # Remove trailing newlines and spaces
-            while text and text[-1] in '\n ':
-                text = text[:-1]
-            # Replace newline by cr/nl
-            text = u'\r\n'.join(text.split('\n'))
-            # Decode the timestamp
-            timestamp = (self.timestamp[0:4]+
-                         self.timestamp[5:7]+
-                         self.timestamp[8:10]+
-                         self.timestamp[11:13]+
-                         self.timestamp[14:16]+
-                         self.timestamp[17:19])
-            # Report back to the caller
-            self.callback(self.title.strip(), timestamp, text)
 
-    def characters(self, data):
-        if self.destination == 'text':
-            self.text += data
-        elif self.destination == 'title':
-            self.title += data
-        elif self.destination == 'timestamp':
-            self.timestamp += data
-
 class GetAll(object):
     def __init__(self, site, pages, throttle):
         """First argument is Site object.
@@ -846,7 +800,7 @@
                 break
         if not data:
             return
-        handler = WikimediaXmlHandler()
+        handler = xmlreader.MediaWikiXmlHandler()
         handler.setCallback(self.oneDone)
         try:
             xml.sax.parseString(data, handler)
@@ -871,7 +825,10 @@
                 for x in 'Xx':
                     pl._contents = pl._contents.replace(c2+x,c2+x+x)
 
-    def oneDone(self, title, timestamp, text):
+    def oneDone(self, entry):
+        title = entry.title
+        timestamp = entry.timestamp
+        text = entry.text
         pl = Page(self.site, title)
         for pl2 in self.pages:
             if Page(self.site, pl2.sectionFreeTitle()) == pl:
@@ -926,7 +883,7 @@
         # find nothing, we will retry the normal way with an unadapted form.
         pagenames = u'\r\n'.join([x.sectionFreeTitle() for x in self.pages])
         if type(pagenames) != type(u''):
-            print 'Warning: wikipedia.WikipediaXMLHandler.getData() got non-unicode page names. Please report this.'
+            print 'Warning: xmlreader.WikipediaXMLHandler.getData() got non-unicode page names. Please report this.'
             print pagenames
         # convert Unicode string to the encoding used on that wiki
         pagenames = pagenames.encode(self.site.encoding())

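Reviewer note on the thread handshake: XmlParserThread plus the oneDone()/lastEntry busy-wait lets XmlDump yield pages while the SAX parser is still reading, at the cost of polling with time.sleep(0.001) on both sides. The docstring itself says "There surely are more elegant ways to do this"; one such alternative, shown here only as a sketch and not what this revision implements, is a bounded Queue handoff between the parser thread and the generator:

# Hypothetical alternative to the lastEntry handshake (not in this commit):
# the SAX thread pushes entries into a bounded Queue and the generator pulls
# them out; Queue.Queue does the blocking instead of the sleep loops.
import threading, Queue
import xml.sax

def entry_generator(filename, handler_factory, sentinel=None):
    queue = Queue.Queue(maxsize=1)      # bounded, so the parser waits for the consumer
    handler = handler_factory()         # e.g. xmlreader.MediaWikiXmlHandler
    handler.setCallback(queue.put)      # producer side blocks until the entry is taken

    def parse():
        xml.sax.parse(filename, handler)
        queue.put(sentinel)             # signal the end of the dump

    thread = threading.Thread(target=parse)
    thread.setDaemon(True)              # thread dies when program terminates
    thread.start()
    while True:
        entry = queue.get()             # consumer side blocks until an entry arrives
        if entry is sentinel:
            return
        yield entry

With xmlreader.MediaWikiXmlHandler passed as handler_factory this would yield the same XmlEntry objects as XmlDump(), without the millisecond polling on either side.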