r2003 pywikipedia - Code Review archive

Repository: pywikipedia
Revision: r2002 | r2003 | r2004
Date: 22:45, 21 July 2005
Author: wikipedian
Status: old
Tags:
Comment: new generator to read XML dump files, as a replacement for sqldump.py
Modified paths:
  • /trunk/pywikipedia/wikipedia.py (modified)
  • /trunk/pywikipedia/xmlreader.py (added)

Diff

Index: trunk/pywikipedia/xmlreader.py
@@ -0,0 +1,131 @@
+# -*- coding: utf-8 -*-
+
+"""
+Each XmlEntry object represents a page, as read from an XML source
+
+The MediaWikiXmlHandler can be used for the XML given by Special:Export
+as well as for XML dumps.
+
+The XmlDump class reads a pages_current XML dump (like the ones offered on
+http://download.wikimedia.org/wikipedia/de/) and offers a generator over
+XmlEntry objects which can be used by other bots.
+"""
+import threading, time
+import xml.sax
+import wikipedia
+
+class XmlEntry:
+    """
+    Represents a page.
+    """
+    def __init__(self, title, text, timestamp):
+        # TODO: there are more tags we can read.
+        self.title = title
+        self.text = text
+        self.timestamp = timestamp
+
+class MediaWikiXmlHandler(xml.sax.handler.ContentHandler):
+    def setCallback(self, callback):
+        self.callback = callback
+
+    def startElement(self, name, attrs):
+        self.destination = None
+        if name == 'page':
+            self.text=u''
+            self.title=u''
+            self.timestamp=u''
+        elif name == 'text':
+            self.destination = 'text'
+            self.text=u''
+        elif name == 'title':
+            self.destination = 'title'
+            self.title=u''
+        elif name == 'timestamp':
+            self.destination = 'timestamp'
+            self.timestamp=u''
+
+    def endElement(self, name):
+        if name == 'revision':
+            # All done for this.
+            text = self.text
+            # Remove trailing newlines and spaces
+            while text and text[-1] in '\n ':
+                text = text[:-1]
+            # Replace newline by cr/nl
+            text = u'\r\n'.join(text.split('\n'))
+            # Decode the timestamp
+            timestamp = (self.timestamp[0:4]+
+                         self.timestamp[5:7]+
+                         self.timestamp[8:10]+
+                         self.timestamp[11:13]+
+                         self.timestamp[14:16]+
+                         self.timestamp[17:19])
+            self.title = self.title.strip()
+            # Report back to the caller
+            entry = XmlEntry(self.title, text, timestamp)
+            self.callback(entry)
+
+    def characters(self, data):
+        if self.destination == 'text':
+            self.text += data
+        elif self.destination == 'title':
+            self.title += data
+        elif self.destination == 'timestamp':
+            self.timestamp += data
+
+class XmlParserThread(threading.Thread):
+    """
+    This XML parser will run as a single thread. This allows the XmlDump
+    generator to yield pages before the parser has finished reading the
+    entire dump.
+
+    There surely are more elegant ways to do this.
+    """
+    def __init__(self, filename, handler):
+        threading.Thread.__init__(self)
+        self.filename = filename
+        self.handler = handler
+
+    def run(self):
+        xml.sax.parse(self.filename, self.handler)
+
+class XmlDump(object):
+    """
+    Represents an XML dump file. Reads the local file at initialization,
+    parses it, and offers access to the resulting XmlEntries via a generator.
+    """
+    def __init__(self, filename):
+        self.filename = filename
+        self.finished = False
+        self.handler = MediaWikiXmlHandler()
+        self.handler.setCallback(self.oneDone)
+        self.parserThread = XmlParserThread(self.filename, self.handler)
+        # thread dies when program terminates
+        self.parserThread.setDaemon(True)
+        # this temporary variable will contain an XmlEntry given by the parser
+        # until it has been yielded by the generator.
+        self.lastEntry = None
+
+    def oneDone(self, entry):
+        self.lastEntry = entry
+        # wait until this class has yielded the page. Otherwise the parser
+        # thread would give another page before we had time to yield the
+        # current one.
+        while self.lastEntry:
+            time.sleep(0.001)
+
+    def __call__(self):
+        '''
+        Generator which parses the XML dump file and yields XmlEntry
+        objects, one per page. Stops when the end of the file is
+        reached.
+        '''
+        wikipedia.output(u'Reading XML dump')
+        self.parserThread.start()
+        while self.parserThread.isAlive():
+            if self.lastEntry:
+                yield self.lastEntry
+                self.lastEntry = None
+            else:
+                # wait 1 ms for the parser thread to deliver the next entry
+                time.sleep(0.001)
Property changes on: trunk/pywikipedia/xmlreader.py
___________________________________________________________________
Added: svn:eol-style
   + native
Added: svn:keywords
   + Author Date Id Revision
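
The new XmlDump class is meant to be used as a page generator by other bots. A minimal usage sketch follows, assuming a locally downloaded pages_current dump; the filename dewiki-pages_current.xml is only illustrative and not part of this revision:

# Hypothetical example, not part of this commit: iterate over a local dump
# with the XmlDump generator added above.
import xmlreader

dump = xmlreader.XmlDump('dewiki-pages_current.xml')  # filename is an assumption
for entry in dump():  # calling the XmlDump object returns a generator of XmlEntry objects
    # Each XmlEntry carries the page title, its text (with \r\n line endings)
    # and a compacted timestamp (YYYYMMDDHHMMSS), as built by MediaWikiXmlHandler.
    print entry.title.encode('utf-8')

The same MediaWikiXmlHandler also accepts the XML produced by Special:Export, which is how GetAll in wikipedia.py uses it below via xml.sax.parseString.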
Index: trunk/pywikipedia/wikipedia.py
@@ -93,12 +93,13 @@
 import difflib
 import re, urllib, codecs, sys
 import xml.sax, xml.sax.handler
+import htmlentitydefs
+
+import config, mediawiki_messages, login
+import xmlreader
 import warnings
 import datetime
 
-import config, mediawiki_messages, login
-import htmlentitydefs
-
 import locale
 # we'll set the locale to system default. This will ensure correct string
 # handling for non-latin characters on Python 2.3.x. For Python 2.4.x it's no
@@ -767,54 +768,7 @@
         txt = 'redirect'
     return re.compile(r'\#'+txt+':? *\[\[(.*?)(\]|\|)', re.I)
 
-# Shortcut get to get multiple pages at once
-class WikimediaXmlHandler(xml.sax.handler.ContentHandler):
-    def setCallback(self, callback):
-        self.callback = callback
-
-    def startElement(self, name, attrs):
-        self.destination = None
-        if name == 'page':
-            self.text=u''
-            self.title=u''
-            self.timestamp=u''
-        elif name == 'text':
-            self.destination = 'text'
-            self.text=u''
-        elif name == 'title':
-            self.destination = 'title'
-            self.title=u''
-        elif name == 'timestamp':
-            self.destination = 'timestamp'
-            self.timestamp=u''
-
-    def endElement(self, name):
-        if name == 'revision':
-            # All done for this.
-            text = self.text
-            # Remove trailing newlines and spaces
-            while text and text[-1] in '\n ':
-                text = text[:-1]
-            # Replace newline by cr/nl
-            text = u'\r\n'.join(text.split('\n'))
-            # Decode the timestamp
-            timestamp = (self.timestamp[0:4]+
-                         self.timestamp[5:7]+
-                         self.timestamp[8:10]+
-                         self.timestamp[11:13]+
-                         self.timestamp[14:16]+
-                         self.timestamp[17:19])
-            # Report back to the caller
-            self.callback(self.title.strip(), timestamp, text)
 
-    def characters(self, data):
-        if self.destination == 'text':
-            self.text += data
-        elif self.destination == 'title':
-            self.title += data
-        elif self.destination == 'timestamp':
-            self.timestamp += data
-
 class GetAll(object):
     def __init__(self, site, pages, throttle):
         """First argument is Site object.
@@ -846,7 +800,7 @@
                 break
         if not data:
             return
-        handler = WikimediaXmlHandler()
+        handler = xmlreader.MediaWikiXmlHandler()
         handler.setCallback(self.oneDone)
         try:
             xml.sax.parseString(data, handler)
@@ -871,7 +825,10 @@
                 for x in 'Xx':
                     pl._contents = pl._contents.replace(c2+x,c2+x+x)
 
-    def oneDone(self, title, timestamp, text):
+    def oneDone(self, entry):
+        title = entry.title
+        timestamp = entry.timestamp
+        text = entry.text
         pl = Page(self.site, title)
         for pl2 in self.pages:
             if Page(self.site, pl2.sectionFreeTitle()) == pl:
@@ -926,7 +883,7 @@
         # find nothing, we will retry the normal way with an unadapted form.
         pagenames = u'\r\n'.join([x.sectionFreeTitle() for x in self.pages])
         if type(pagenames) != type(u''):
-            print 'Warning: wikipedia.WikipediaXMLHandler.getData() got non-unicode page names. Please report this.'
+            print 'Warning: xmlreader.WikipediaXMLHandler.getData() got non-unicode page names. Please report this.'
             print pagenames
         # convert Unicode string to the encoding used on that wiki
         pagenames = pagenames.encode(self.site.encoding())

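Reviewer note on the thread handshake: XmlParserThread plus the oneDone()/lastEntry busy-wait lets XmlDump yield pages while the SAX parser is still reading, at the cost of polling with time.sleep(0.001) on both sides. The docstring itself says "There surely are more elegant ways to do this"; one such alternative, shown here only as a sketch and not what this revision implements, is a bounded Queue handoff between the parser thread and the generator:

# Hypothetical alternative to the lastEntry handshake (not in this commit):
# the SAX thread pushes entries into a bounded Queue and the generator pulls
# them out; Queue.Queue does the blocking instead of the sleep loops.
import threading, Queue
import xml.sax

def entry_generator(filename, handler_factory, sentinel=None):
    queue = Queue.Queue(maxsize=1)      # bounded, so the parser waits for the consumer
    handler = handler_factory()         # e.g. xmlreader.MediaWikiXmlHandler
    handler.setCallback(queue.put)      # producer side blocks until the entry is taken

    def parse():
        xml.sax.parse(filename, handler)
        queue.put(sentinel)             # signal the end of the dump

    thread = threading.Thread(target=parse)
    thread.setDaemon(True)              # thread dies when program terminates
    thread.start()
    while True:
        entry = queue.get()             # consumer side blocks until an entry arrives
        if entry is sentinel:
            return
        yield entry

With xmlreader.MediaWikiXmlHandler passed as handler_factory this would yield the same XmlEntry objects as XmlDump(), without the millisecond polling on either side.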