Index: trunk/extensions/Offline/mwimport.py |
— | — | @@ -6,11 +6,11 @@ |
7 | 7 | import os.path |
8 | 8 | import sys |
9 | 9 | import random |
10 | | -import time |
| 10 | +from datetime import datetime |
11 | 11 | from lxml import etree |
12 | 12 | from collections import namedtuple |
| 13 | +from optparse import OptionParser |
13 | 14 | |
14 | | - |
15 | 15 | Page = namedtuple('Page', [ |
16 | 16 | 'id', |
17 | 17 | 'namespace', |
— | — | @@ -51,18 +51,10 @@ |
52 | 52 | self.xmlns = xmlns |
53 | 53 | |
54 | 54 | def text(self, tag, default='NULL'): |
55 | | - content = self.element.findtext(self.xmlns+tag) |
56 | | - if content: |
57 | | - return content |
58 | | - else: |
59 | | - return default |
| 55 | + return self.element.findtext(self.xmlns+tag, default) |
60 | 56 | |
61 | 57 | def attr(self, tag, default='NULL'): |
62 | | - attr = self.element.get(self.xmlns+tag) |
63 | | - if attr: |
64 | | - return attr |
65 | | - else: |
66 | | - return default |
| 58 | + return self.element.get(self.xmlns+tag, default) |
67 | 59 | |
68 | 60 | def child(self, tag): |
69 | 61 | children = self.element.iterchildren(tag=self.xmlns+tag) |
— | — | @@ -79,7 +71,7 @@ |
80 | 72 | def __init__(self): |
81 | 73 | pass |
82 | 74 | |
83 | | -class Parser(object): |
| 75 | +class DumpParser(object): |
84 | 76 | def __init__(self, input=None, output_base=None): |
85 | 77 | if not input or input == "-": |
86 | 78 | self.input = sys.stdin |
— | — | @@ -97,7 +89,7 @@ |
98 | 90 | #XXX problem, we would have to listen to tag start events: |
99 | 91 | #if tag == 'mediawiki': |
100 | 92 | # self.xmlns = Element(element).attr('xmlns') |
101 | | - method = getattr(Parser, tag, None) |
| 93 | + method = getattr(DumpParser, tag, None) |
102 | 94 | if method: |
103 | 95 | article = method(self, element=Element(element, xmlns=self.xmlns)) |
104 | 96 | self.output.write_article(article) |
— | — | @@ -109,17 +101,30 @@ |
110 | 102 | self.article.page_id=element.text('id') |
111 | 103 | self.revision(element.child('revision')) |
112 | 104 | |
| 105 | + title = element.text('title') |
| 106 | + if ':' in title: |
| 107 | + namespace, title = title.split(':', 1) |
| 108 | + else: |
| 109 | + namespace = 'Main' |
| 110 | + |
| 111 | + if re.match("#redirect", self.article.text.text, re.I): |
| 112 | + redirect = 1 |
| 113 | + else: |
| 114 | + redirect = 0 |
| 115 | + |
| 116 | + touched = datetime.now().strftime("%Y%m%d%H%M%S") # mysql timestamp |
| 117 | + |
113 | 118 | self.article.page = Page( |
114 | 119 | id=self.article.page_id, |
115 | | - namespace='Main', |
116 | | - title=element.text('title'), |
| 120 | + namespace=namespace, |
| 121 | + title=title, |
117 | 122 | restrictions=element.text('restrictions', 0), |
118 | 123 | counter=0, |
119 | | - is_redirect=0, #XXX |
| 124 | + is_redirect=redirect, #XXX |
120 | 125 | is_new=0, |
121 | 126 | random=random.randint(0, 4000000000), |
122 | | - touched=time.strftime('%Y-%m-%d %H:%M:%S'), # mysql datetime |
123 | | - latest=0, |
| 127 | + touched=touched, |
| 128 | + latest=self.article.revision.id, |
124 | 129 | len=self.article.text_len |
125 | 130 | ) |
126 | 131 | return self.article |
— | — | @@ -130,18 +135,21 @@ |
131 | 136 | self.contributor(element.child('contributor')) |
132 | 137 | self.comment(element.child('comment')) |
133 | 138 | |
| 139 | + parsed_time = datetime.strptime(element.text('timestamp'), "%Y-%m-%dT%H:%M:%SZ") |
| 140 | + timestamp = parsed_time.strftime("%Y%m%d%H%M%S") # mysql timestamp |
| 141 | + |
134 | 142 | self.article.revision = Revision( |
135 | 143 | id=self.article.revision_id, |
136 | 144 | page=self.article.page_id, |
| 145 | + text_id=self.article.text.id, |
137 | 146 | comment=self.article.comment, |
138 | 147 | user=self.article.contrib_id, |
139 | 148 | user_text=self.article.contrib_user, |
140 | | - text_id=self.article.text.id, |
141 | | - timestamp=element.text('timestamp'), |
| 149 | + timestamp=timestamp, |
142 | 150 | minor_edit=element.text('minor', 0), |
143 | 151 | deleted=0, |
144 | 152 | len=self.article.text_len, |
145 | | - parent_id=0 |
| 153 | + parent_id=self.article.page_id |
146 | 154 | ) |
147 | 155 | |
148 | 156 | def text(self, element): |
— | — | @@ -152,12 +160,12 @@ |
153 | 161 | flags='utf-8' |
154 | 162 | ) |
155 | 163 | self.article.text_len = 0 |
156 | | - if self.article.text_len: |
| 164 | + if self.article.text.text: |
157 | 165 | self.article.text_len = len(self.article.text.text) |
158 | 166 | |
159 | 167 | def contributor(self, element): |
160 | | - self.article.contrib_user = element.text('username') |
161 | | - self.article.contrib_id = element.text('id') |
| 168 | + self.article.contrib_user = element.text('username', element.text('ip')) |
| 169 | + self.article.contrib_id = element.text('id', 0) |
162 | 170 | |
163 | 171 | def comment(self, element): |
164 | 172 | if element and not element.attr('deleted'): |
— | — | @@ -215,5 +223,8 @@ |
216 | 224 | |
217 | 225 | |
218 | 226 | if __name__ == "__main__": |
219 | | - p = Parser() |
| 227 | + #op = OptionParser() |
| 228 | + #op.add_option( |
| 229 | + |
| 230 | + p = DumpParser() |
220 | 231 | p.parse() |