r109753 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r109752 | r109753 | r109754 >
Date:20:28, 22 January 2012
Author:adamw
Status:deferred
Tags:
Comment:
implement text length, is_redirect, page id, anonymous contributor
correct timestamps (maybe I got lucky), parse namespace from title
Modified paths:
  • /trunk/extensions/Offline/mwimport.py (modified) (history)

Diff [purge]

Index: trunk/extensions/Offline/mwimport.py
@@ -6,11 +6,11 @@
77 import os.path
88 import sys
99 import random
10 -import time
 10+from datetime import datetime
1111 from lxml import etree
1212 from collections import namedtuple
 13+from optparse import OptionParser
1314
14 -
1515 Page = namedtuple('Page', [
1616 'id',
1717 'namespace',
@@ -51,18 +51,10 @@
5252 self.xmlns = xmlns
5353
5454 def text(self, tag, default='NULL'):
55 - content = self.element.findtext(self.xmlns+tag)
56 - if content:
57 - return content
58 - else:
59 - return default
 55+ return self.element.findtext(self.xmlns+tag, default)
6056
6157 def attr(self, tag, default='NULL'):
62 - attr = self.element.get(self.xmlns+tag)
63 - if attr:
64 - return attr
65 - else:
66 - return default
 58+ return self.element.get(self.xmlns+tag, default)
6759
6860 def child(self, tag):
6961 children = self.element.iterchildren(tag=self.xmlns+tag)
@@ -79,7 +71,7 @@
8072 def __init__(self):
8173 pass
8274
83 -class Parser(object):
 75+class DumpParser(object):
8476 def __init__(self, input=None, output_base=None):
8577 if not input or input == "-":
8678 self.input = sys.stdin
@@ -97,7 +89,7 @@
9890 #XXX problem, we would have to listen to tag start events:
9991 #if tag == 'mediawiki':
10092 # self.xmlns = Element(element).attr('xmlns')
101 - method = getattr(Parser, tag, None)
 93+ method = getattr(DumpParser, tag, None)
10294 if method:
10395 article = method(self, element=Element(element, xmlns=self.xmlns))
10496 self.output.write_article(article)
@@ -109,17 +101,30 @@
110102 self.article.page_id=element.text('id')
111103 self.revision(element.child('revision'))
112104
 105+ title = element.text('title')
 106+ if ':' in title:
 107+ namespace, title = title.split(':', 1)
 108+ else:
 109+ namespace = 'Main'
 110+
 111+ if re.match("#redirect", self.article.text.text, re.I):
 112+ redirect = 1
 113+ else:
 114+ redirect = 0
 115+
 116+ touched = datetime.now().strftime("%Y%m%d%H%M%S") # mysql timestamp
 117+
113118 self.article.page = Page(
114119 id=self.article.page_id,
115 - namespace='Main',
116 - title=element.text('title'),
 120+ namespace=namespace,
 121+ title=title,
117122 restrictions=element.text('restrictions', 0),
118123 counter=0,
119 - is_redirect=0, #XXX
 124+ is_redirect=redirect, #XXX
120125 is_new=0,
121126 random=random.randint(0, 4000000000),
122 - touched=time.strftime('%Y-%m-%d %H:%M:%S'), # mysql datetime
123 - latest=0,
 127+ touched=touched,
 128+ latest=self.article.revision.id,
124129 len=self.article.text_len
125130 )
126131 return self.article
@@ -130,18 +135,21 @@
131136 self.contributor(element.child('contributor'))
132137 self.comment(element.child('comment'))
133138
 139+ parsed_time = datetime.strptime(element.text('timestamp'), "%Y-%m-%dT%H:%M:%SZ")
 140+ timestamp = parsed_time.strftime("%Y%m%d%H%M%S") # mysql timestamp
 141+
134142 self.article.revision = Revision(
135143 id=self.article.revision_id,
136144 page=self.article.page_id,
 145+ text_id=self.article.text.id,
137146 comment=self.article.comment,
138147 user=self.article.contrib_id,
139148 user_text=self.article.contrib_user,
140 - text_id=self.article.text.id,
141 - timestamp=element.text('timestamp'),
 149+ timestamp=timestamp,
142150 minor_edit=element.text('minor', 0),
143151 deleted=0,
144152 len=self.article.text_len,
145 - parent_id=0
 153+ parent_id=self.article.page_id
146154 )
147155
148156 def text(self, element):
@@ -152,12 +160,12 @@
153161 flags='utf-8'
154162 )
155163 self.article.text_len = 0
156 - if self.article.text_len:
 164+ if self.article.text.text:
157165 self.article.text_len = len(self.article.text.text)
158166
159167 def contributor(self, element):
160 - self.article.contrib_user = element.text('username')
161 - self.article.contrib_id = element.text('id')
 168+ self.article.contrib_user = element.text('username', element.text('ip'))
 169+ self.article.contrib_id = element.text('id', 0)
162170
163171 def comment(self, element):
164172 if element and not element.attr('deleted'):
@@ -215,5 +223,8 @@
216224
217225
218226 if __name__ == "__main__":
219 - p = Parser()
 227+ #op = OptionParser()
 228+ #op.add_option(
 229+
 230+ p = DumpParser()
220231 p.parse()

Status & tagging log