Index: trunk/pywikipedia/xmlreader.py |
— | — | @@ -0,0 +1,131 @@ |
| 2 | +# -*- coding: utf-8 -*- |
| 3 | + |
| 4 | +""" |
| 5 | +Each XmlEntry object represents a page, as read from an XML source |
| 6 | + |
| 7 | +The MediaWikiXmlHandler can be used for the XML given by Special:Export |
| 8 | +as well as for XML dumps. |
| 9 | + |
| 10 | +The XmlDump class reads a pages_current XML dump (like the ones offered on |
| 11 | +http://download.wikimedia.org/wikipedia/de/) and offers a generator over |
| 12 | +XmlEntry objects which can be used by other bots. |
| 13 | +""" |
| 14 | +import threading, time |
| 15 | +import xml.sax |
| 16 | +import wikipedia |
| 17 | + |
class XmlEntry(object):
    """
    Represents a single page read from an XML source.

    Attributes:
        title     - the page title (stripped of surrounding whitespace
                    by MediaWikiXmlHandler)
        text      - the wiki text of the page's revision
        timestamp - the revision timestamp; as produced by
                    MediaWikiXmlHandler this is a compact
                    yyyymmddhhmmss string
    """
    # NOTE: made a new-style class for consistency with XmlDump(object)
    # in this module.
    def __init__(self, title, text, timestamp):
        # TODO: there are more tags we can read.
        self.title = title
        self.text = text
        self.timestamp = timestamp
| 27 | + |
class MediaWikiXmlHandler(xml.sax.handler.ContentHandler):
    """
    SAX handler for the XML given by Special:Export and by XML dumps.

    For every closing </revision> tag it builds an XmlEntry from the
    collected title, text and timestamp, and hands it to the callback
    registered with setCallback().
    """
    def setCallback(self, callback):
        # callback: a function of one argument; it is called with an
        # XmlEntry for each revision found in the XML.
        self.callback = callback

    def startElement(self, name, attrs):
        self.destination = None
        if name == 'page':
            # Reset the accumulators for a new page.
            self.text = u''
            self.title = u''
            self.timestamp = u''
        elif name == 'text':
            self.destination = 'text'
            self.text = u''
        elif name == 'title':
            self.destination = 'title'
            self.title = u''
        elif name == 'timestamp':
            self.destination = 'timestamp'
            self.timestamp = u''

    def endElement(self, name):
        if name == 'revision':
            # All done for this revision.
            # Remove trailing newlines and spaces (idiomatic equivalent
            # of the char-by-char strip loop).
            text = self.text.rstrip('\n ')
            # Replace newline by cr/nl.
            text = text.replace('\n', u'\r\n')
            # Drop the separator characters from the timestamp, e.g.
            # 2004-01-02T03:04:05Z -> 20040102030405.
            timestamp = (self.timestamp[0:4] +
                         self.timestamp[5:7] +
                         self.timestamp[8:10] +
                         self.timestamp[11:13] +
                         self.timestamp[14:16] +
                         self.timestamp[17:19])
            self.title = self.title.strip()
            # Report back to the caller.
            entry = XmlEntry(self.title, text, timestamp)
            self.callback(entry)

    def characters(self, data):
        # The parser may deliver an element's content in several chunks,
        # so always append.
        if self.destination == 'text':
            self.text += data
        elif self.destination == 'title':
            self.title += data
        elif self.destination == 'timestamp':
            self.timestamp += data
| 76 | + |
class XmlParserThread(threading.Thread):
    """
    Parses an XML file in a thread of its own.

    Running the SAX parser on a separate thread allows the XmlDump
    generator to yield pages before the parser has finished reading
    the entire dump.

    There surely are more elegant ways to do this.
    """
    def __init__(self, filename, handler):
        super(XmlParserThread, self).__init__()
        self.handler = handler
        self.filename = filename

    def run(self):
        # Feed the whole file through SAX; the handler's callback does
        # the actual per-page work.
        xml.sax.parse(self.filename, self.handler)
| 92 | + |
class XmlDump(object):
    """
    Represents an XML dump file. Reads the local file at initialization,
    parses it, and offers access to the resulting XmlEntries via a generator.

    The file is parsed on a background XmlParserThread; the generator
    obtained by calling the instance hands over one XmlEntry at a time
    while parsing is still in progress.
    """
    def __init__(self, filename):
        # filename: path of the XML dump file to read.
        self.filename = filename
        # NOTE(review): 'finished' is set here but never updated or read
        # within this class - confirm whether callers rely on it.
        self.finished = False
        self.handler = MediaWikiXmlHandler()
        self.handler.setCallback(self.oneDone)
        self.parserThread = XmlParserThread(self.filename, self.handler)
        # thread dies when program terminates
        self.parserThread.setDaemon(True)
        # this temporary variable will contain an XmlEntry given by the parser
        # until it has been yielded by the generator.
        self.lastEntry = None

    def oneDone(self, entry):
        # Called from the parser thread for each page found in the dump.
        self.lastEntry = entry
        # wait until this class has yielded the page. Otherwise the parser
        # thread would give another page before we had time to yield the
        # current one.
        while self.lastEntry:
            time.sleep(0.001)

    def __call__(self):
        '''
        Generator which reads the XML dump on a background thread and
        yields one XmlEntry object per page. Stops when the end of the
        dump is reached.
        '''
        wikipedia.output(u'Reading XML dump')
        self.parserThread.start()
        while self.parserThread.isAlive():
            if self.lastEntry:
                yield self.lastEntry
                # Clearing lastEntry releases oneDone(), letting the
                # parser thread continue with the next page.
                self.lastEntry = None
            else:
                # wait 1 ms for the parser thread to deliver a page
                time.sleep(0.001)
Property changes on: trunk/pywikipedia/xmlreader.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 133 | + native |
Added: svn:keywords |
2 | 134 | + Author Date Id Revision |
Index: trunk/pywikipedia/wikipedia.py |
— | — | @@ -93,12 +93,13 @@ |
94 | 94 | import difflib |
95 | 95 | import re, urllib, codecs, sys |
96 | 96 | import xml.sax, xml.sax.handler |
| 97 | +import htmlentitydefs |
| 98 | + |
| 99 | +import config, mediawiki_messages, login |
| 100 | +import xmlreader |
97 | 101 | import warnings |
98 | 102 | import datetime |
99 | 103 | |
100 | | -import config, mediawiki_messages, login |
101 | | -import htmlentitydefs |
102 | | - |
103 | 104 | import locale |
104 | 105 | # we'll set the locale to system default. This will ensure correct string |
105 | 106 | # handling for non-latin characters on Python 2.3.x. For Python 2.4.x it's no |
— | — | @@ -767,54 +768,7 @@ |
768 | 769 | txt = 'redirect' |
769 | 770 | return re.compile(r'\#'+txt+':? *\[\[(.*?)(\]|\|)', re.I) |
770 | 771 | |
771 | | -# Shortcut get to get multiple pages at once |
772 | | -class WikimediaXmlHandler(xml.sax.handler.ContentHandler): |
773 | | - def setCallback(self, callback): |
774 | | - self.callback = callback |
775 | | - |
776 | | - def startElement(self, name, attrs): |
777 | | - self.destination = None |
778 | | - if name == 'page': |
779 | | - self.text=u'' |
780 | | - self.title=u'' |
781 | | - self.timestamp=u'' |
782 | | - elif name == 'text': |
783 | | - self.destination = 'text' |
784 | | - self.text=u'' |
785 | | - elif name == 'title': |
786 | | - self.destination = 'title' |
787 | | - self.title=u'' |
788 | | - elif name == 'timestamp': |
789 | | - self.destination = 'timestamp' |
790 | | - self.timestamp=u'' |
791 | | - |
792 | | - def endElement(self, name): |
793 | | - if name == 'revision': |
794 | | - # All done for this. |
795 | | - text = self.text |
796 | | - # Remove trailing newlines and spaces |
797 | | - while text and text[-1] in '\n ': |
798 | | - text = text[:-1] |
799 | | - # Replace newline by cr/nl |
800 | | - text = u'\r\n'.join(text.split('\n')) |
801 | | - # Decode the timestamp |
802 | | - timestamp = (self.timestamp[0:4]+ |
803 | | - self.timestamp[5:7]+ |
804 | | - self.timestamp[8:10]+ |
805 | | - self.timestamp[11:13]+ |
806 | | - self.timestamp[14:16]+ |
807 | | - self.timestamp[17:19]) |
808 | | - # Report back to the caller |
809 | | - self.callback(self.title.strip(), timestamp, text) |
810 | 772 | |
811 | | - def characters(self, data): |
812 | | - if self.destination == 'text': |
813 | | - self.text += data |
814 | | - elif self.destination == 'title': |
815 | | - self.title += data |
816 | | - elif self.destination == 'timestamp': |
817 | | - self.timestamp += data |
818 | | - |
819 | 773 | class GetAll(object): |
820 | 774 | def __init__(self, site, pages, throttle): |
821 | 775 | """First argument is Site object. |
— | — | @@ -846,7 +800,7 @@ |
847 | 801 | break |
848 | 802 | if not data: |
849 | 803 | return |
850 | | - handler = WikimediaXmlHandler() |
| 804 | + handler = xmlreader.MediaWikiXmlHandler() |
851 | 805 | handler.setCallback(self.oneDone) |
852 | 806 | try: |
853 | 807 | xml.sax.parseString(data, handler) |
— | — | @@ -871,7 +825,10 @@ |
872 | 826 | for x in 'Xx': |
873 | 827 | pl._contents = pl._contents.replace(c2+x,c2+x+x) |
874 | 828 | |
875 | | - def oneDone(self, title, timestamp, text): |
| 829 | + def oneDone(self, entry): |
| 830 | + title = entry.title |
| 831 | + timestamp = entry.timestamp |
| 832 | + text = entry.text |
876 | 833 | pl = Page(self.site, title) |
877 | 834 | for pl2 in self.pages: |
878 | 835 | if Page(self.site, pl2.sectionFreeTitle()) == pl: |
— | — | @@ -926,7 +883,7 @@ |
927 | 884 | # find nothing, we will retry the normal way with an unadapted form. |
928 | 885 | pagenames = u'\r\n'.join([x.sectionFreeTitle() for x in self.pages]) |
929 | 886 | if type(pagenames) != type(u''): |
930 | | - print 'Warning: wikipedia.WikipediaXMLHandler.getData() got non-unicode page names. Please report this.' |
| 887 | + print 'Warning: xmlreader.WikipediaXMLHandler.getData() got non-unicode page names. Please report this.' |
931 | 888 | print pagenames |
932 | 889 | # convert Unicode string to the encoding used on that wiki |
933 | 890 | pagenames = pagenames.encode(self.site.encoding()) |