r78583 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r78582 | r78583 | r78584 >
Date: 22:19, 18 December 2010
Author: reedy
Status: deferred
Tags:
Comment:
Followup r78582, svn:eol-style native
Modified paths:
  • /trunk/tools/editor_trends/etl/extracter.py (modified) (history)

Diff

Index: trunk/tools/editor_trends/etl/extracter.py
@@ -1,283 +1,283 @@
-#!/usr/bin/python
-# -*- coding: utf-8 -*-
-'''
-Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License version 2
-as published by the Free Software Foundation.
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-See the GNU General Public License for more details, at
-http://www.fsf.org/licenses/gpl.html
-'''
-
-__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
-__author__email = 'dvanliere at gmail dot com'
-__date__ = '2010-12-13'
-__version__ = '0.1'
-
-import sys
-import re
-import json
-import os
-import xml.etree.cElementTree as cElementTree
-
-sys.path.append('..')
-import configuration
-settings = configuration.Settings()
-
-import wikitree.parser
-from bots import bots
-from utils import utils
-
-try:
-    import psyco
-    psyco.full()
-except ImportError:
-    pass
-
-
-RE_NUMERIC_CHARACTER = re.compile('&#(\d+);')
-
-
-def remove_numeric_character_references(text):
-    return re.sub(RE_NUMERIC_CHARACTER, lenient_deccharref, text).encode('utf-8')
-
-
-def lenient_deccharref(m):
-    try:
-        return unichr(int(m.group(1)))
-    except ValueError:
-        '''
-        There are a few articles that raise a Value Error here, the reason is
-        that I am using a narrow Python build (UCS2) instead of a wide build
-        (UCS4). The quick fix is to return an empty string...
-        Real solution is to rebuild Python with UCS4 support.....
-        '''
-        return ''
-
-
-def remove_namespace(element, namespace):
-    '''Remove namespace from the XML document.'''
-    ns = u'{%s}' % namespace
-    nsl = len(ns)
-    for elem in element.getiterator():
-        if elem.tag.startswith(ns):
-            elem.tag = elem.tag[nsl:]
-    return element
-
-
-def load_namespace(language):
-    file = '%s_ns.json' % language
-    fh = utils.create_txt_filehandle(settings.namespace_location, file, 'r', settings.encoding)
-    ns = json.load(fh)
-    fh.close()
-    ns = ns['query']['namespaces']
-    return ns
-
-
-def build_namespaces_locale(namespaces, include=['0']):
-    '''
-    @include is a list of namespace keys that should not be ignored, the default
-    setting is to ignore all namespaces except the main namespace.
-    '''
-    ns = []
-    for namespace in namespaces:
-        if namespace not in include:
-            value = namespaces[namespace].get(u'*', None)
-            ns.append(value)
-    return ns
-
-
-def parse_comments(revisions, function):
-    for revision in revisions:
-        comment = revision.find('{%s}comment' % settings.xml_namespace)
-        #timestamp = revision.find('{%s}timestamp' % settings.xml_namespace).text
-        if comment != None and comment.text != None:
-            comment.text = function(comment.text)
-    return revisions
-
-
-def is_article_main_namespace(elem, namespace):
-    '''
-    checks whether the article belongs to the main namespace
-    '''
-    title = elem.text
-    for ns in namespace:
-        if title.startswith(ns):
-            return False
-    return True
-
-def validate_hostname(address):
-    '''
-    This is not a foolproof solution at all. The problem is that it's really hard
-    to determine whether a string is a hostname or not **reliably**. This is a
-    very fast rule of thumb. Will lead to false positives, but that's life :)
-    '''
-    parts = address.split(".")
-    if len(parts) > 2:
-        return True
-    else:
-        return False
-
-
-def validate_ip(address):
-    parts = address.split(".")
-    if len(parts) != 4:
-        return False
-    parts = parts[:3]
-    for item in parts:
-        try:
-            if not 0 <= int(item) <= 255:
-                return False
-        except ValueError:
-            return False
-    return True
-
-
-def determine_username_is_bot(contributor, **kwargs):
-    '''
-    #contributor is an xml element containing the id of the contributor
-    @bots should have a dict with all the bot ids and bot names
-    @Return False if username id is not in bot dict id or True if username id
-    is a bot id.
-    '''
-    bots = kwargs.get('bots')
-    username = contributor.find('username')
-    if username == None:
-        return 0
-    else:
-        if username in bots:
-            return 1
-        else:
-            return 0
-
-
-def extract_username(contributor, **kwargs):
-    contributor = contributor.find('username')
-    if contributor != None:
-        return contributor.text
-    else:
-        return None
-
-
-def extract_contributor_id(contributor, **kwargs):
-    '''
-    @contributor is the xml contributor node containing a number of attributes
-    Currently, we are only interested in registered contributors, hence we
-    ignore anonymous editors.
-    '''
-    if contributor.get('deleted'):
-        return None # ASK: Not sure if this is the best way to code deleted contributors.
-    elem = contributor.find('id')
-    if elem != None:
-        return {'id':elem.text}
-    else:
-        elem = contributor.find('ip')
-        if elem != None and elem.text != None and validate_ip(elem.text) == False and validate_hostname(elem.text) == False:
-            return {'username':elem.text, 'id': elem.text}
-        else:
-            return None
-
-
-def output_editor_information(revisions, page, bots):
-    '''
-    @elem is an XML element containing 1 revision from a page
-    @output is where to store the data, a filehandle
-    @**kwargs contains extra information
-
-    the variable tags determines which attributes are being parsed, the values in
-    this dictionary are the functions used to extract the data.
-    '''
-    headers = ['id', 'date', 'article', 'username']
-    tags = {'contributor': {'id': extract_contributor_id,
-                            'bot': determine_username_is_bot,
-                            'username': extract_username,
-                            },
-            'timestamp': {'date': wikitree.parser.extract_text},
-            }
-    vars = {}
-    flat = []
-
-    for x, revision in enumerate(revisions):
-        #print len(revision.getchildren())
-        vars[x] = {}
-        vars[x]['article'] = page
-        for tag in tags:
-            el = revision.find('%s' % tag)
-            if el == None:
-                #print cElementTree.tostring(revision, settings.encoding)
-                del vars[x]
-                break
-            for function in tags[tag].keys():
-                f = tags[tag][function]
-                value = f(el, bots=bots)
-                if type(value) == type({}):
-                    for kw in value:
-                        vars[x][kw] = value[kw]
-                else:
-                    vars[x][function] = value
-
-    '''
-    This loop determines for each observation whether it should be stored or not.
-    '''
-    for x in vars:
-        if vars[x]['bot'] == 1 or vars[x]['id'] == None or vars[x]['username'] == None:
-            continue
-        else:
-            f = []
-            for head in headers:
-                f.append(vars[x][head])
-            flat.append(f)
-
-    return flat
-
-
-def parse_dumpfile(project, language_code, namespaces=['0']):
-    bot_ids = bots.retrieve_bots(language_code)
-    ns = load_namespace(language_code)
-    ns = build_namespaces_locale(ns, namespaces)
-
-    location = os.path.join(settings.input_location, language_code, project)
-    fh = utils.create_txt_filehandle(location, 'enwiki-latest-stub-meta-history.xml', 'r', settings.encoding)
-    for page in wikitree.parser.read_input(fh):
-        title = page.find('title')
-        if is_article_main_namespace(title, ns):
-            #cElementTree.dump(page)
-            article_id = page.find('id').text
-            revisions = page.findall('revision')
-            revisions = parse_comments(revisions, remove_numeric_character_references)
-            output = output_editor_information(revisions, article_id, bot_ids)
-            write_output(output, project, language_code)
-        page.clear()
-    fh.close()
-
-
-def write_output(output, project, language_code):
-    location = os.path.join(settings.input_location, language_code, project, 'txt')
-    for o in output:
-        file = '%s.csv' % hash(o[0])
-        try:
-            fh = utils.create_txt_filehandle(location, file, 'a', settings.encoding)
-            utils.write_list_to_csv(o, fh)
-            fh.close()
-        except Exception, error:
-            print error
-
-
-def hash(id):
-    '''
-    A very simple hash function based on modulo. The except clause has been
-    addde because there are instances where the username is stored in userid
-    tag and hence that's a string and not an integer.
-    '''
-    try:
-        return int(id) % 500
-    except:
-        return sum([ord(i) for i in id]) % 500
-
-if __name__ == '__main__':
-    project = 'wiki'
-    language_code = 'en'
-    parse_dumpfile(project, language_code)
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+'''
+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License version 2
+as published by the Free Software Foundation.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+See the GNU General Public License for more details, at
+http://www.fsf.org/licenses/gpl.html
+'''
+
+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
+__author__email = 'dvanliere at gmail dot com'
+__date__ = '2010-12-13'
+__version__ = '0.1'
+
+import sys
+import re
+import json
+import os
+import xml.etree.cElementTree as cElementTree
+
+sys.path.append('..')
+import configuration
+settings = configuration.Settings()
+
+import wikitree.parser
+from bots import bots
+from utils import utils
+
+try:
+    import psyco
+    psyco.full()
+except ImportError:
+    pass
+
+
+RE_NUMERIC_CHARACTER = re.compile('&#(\d+);')
+
+
+def remove_numeric_character_references(text):
+    return re.sub(RE_NUMERIC_CHARACTER, lenient_deccharref, text).encode('utf-8')
+
+
+def lenient_deccharref(m):
+    try:
+        return unichr(int(m.group(1)))
+    except ValueError:
+        '''
+        There are a few articles that raise a Value Error here, the reason is
+        that I am using a narrow Python build (UCS2) instead of a wide build
+        (UCS4). The quick fix is to return an empty string...
+        Real solution is to rebuild Python with UCS4 support.....
+        '''
+        return ''
+
+
+def remove_namespace(element, namespace):
+    '''Remove namespace from the XML document.'''
+    ns = u'{%s}' % namespace
+    nsl = len(ns)
+    for elem in element.getiterator():
+        if elem.tag.startswith(ns):
+            elem.tag = elem.tag[nsl:]
+    return element
+
+
+def load_namespace(language):
+    file = '%s_ns.json' % language
+    fh = utils.create_txt_filehandle(settings.namespace_location, file, 'r', settings.encoding)
+    ns = json.load(fh)
+    fh.close()
+    ns = ns['query']['namespaces']
+    return ns
+
+
+def build_namespaces_locale(namespaces, include=['0']):
+    '''
+    @include is a list of namespace keys that should not be ignored, the default
+    setting is to ignore all namespaces except the main namespace.
+    '''
+    ns = []
+    for namespace in namespaces:
+        if namespace not in include:
+            value = namespaces[namespace].get(u'*', None)
+            ns.append(value)
+    return ns
+
+
+def parse_comments(revisions, function):
+    for revision in revisions:
+        comment = revision.find('{%s}comment' % settings.xml_namespace)
+        #timestamp = revision.find('{%s}timestamp' % settings.xml_namespace).text
+        if comment != None and comment.text != None:
+            comment.text = function(comment.text)
+    return revisions
+
+
+def is_article_main_namespace(elem, namespace):
+    '''
+    checks whether the article belongs to the main namespace
+    '''
+    title = elem.text
+    for ns in namespace:
+        if title.startswith(ns):
+            return False
+    return True
+
+def validate_hostname(address):
+    '''
+    This is not a foolproof solution at all. The problem is that it's really hard
+    to determine whether a string is a hostname or not **reliably**. This is a
+    very fast rule of thumb. Will lead to false positives, but that's life :)
+    '''
+    parts = address.split(".")
+    if len(parts) > 2:
+        return True
+    else:
+        return False
+
+
+def validate_ip(address):
+    parts = address.split(".")
+    if len(parts) != 4:
+        return False
+    parts = parts[:3]
+    for item in parts:
+        try:
+            if not 0 <= int(item) <= 255:
+                return False
+        except ValueError:
+            return False
+    return True
+
+
+def determine_username_is_bot(contributor, **kwargs):
+    '''
+    #contributor is an xml element containing the id of the contributor
+    @bots should have a dict with all the bot ids and bot names
+    @Return False if username id is not in bot dict id or True if username id
+    is a bot id.
+    '''
+    bots = kwargs.get('bots')
+    username = contributor.find('username')
+    if username == None:
+        return 0
+    else:
+        if username in bots:
+            return 1
+        else:
+            return 0
+
+
+def extract_username(contributor, **kwargs):
+    contributor = contributor.find('username')
+    if contributor != None:
+        return contributor.text
+    else:
+        return None
+
+
+def extract_contributor_id(contributor, **kwargs):
+    '''
+    @contributor is the xml contributor node containing a number of attributes
+    Currently, we are only interested in registered contributors, hence we
+    ignore anonymous editors.
+    '''
+    if contributor.get('deleted'):
+        return None # ASK: Not sure if this is the best way to code deleted contributors.
+    elem = contributor.find('id')
+    if elem != None:
+        return {'id':elem.text}
+    else:
+        elem = contributor.find('ip')
+        if elem != None and elem.text != None and validate_ip(elem.text) == False and validate_hostname(elem.text) == False:
+            return {'username':elem.text, 'id': elem.text}
+        else:
+            return None
+
+
+def output_editor_information(revisions, page, bots):
+    '''
+    @elem is an XML element containing 1 revision from a page
+    @output is where to store the data, a filehandle
+    @**kwargs contains extra information
+
+    the variable tags determines which attributes are being parsed, the values in
+    this dictionary are the functions used to extract the data.
+    '''
+    headers = ['id', 'date', 'article', 'username']
+    tags = {'contributor': {'id': extract_contributor_id,
+                            'bot': determine_username_is_bot,
+                            'username': extract_username,
+                            },
+            'timestamp': {'date': wikitree.parser.extract_text},
+            }
+    vars = {}
+    flat = []
+
+    for x, revision in enumerate(revisions):
+        #print len(revision.getchildren())
+        vars[x] = {}
+        vars[x]['article'] = page
+        for tag in tags:
+            el = revision.find('%s' % tag)
+            if el == None:
+                #print cElementTree.tostring(revision, settings.encoding)
+                del vars[x]
+                break
+            for function in tags[tag].keys():
+                f = tags[tag][function]
+                value = f(el, bots=bots)
+                if type(value) == type({}):
+                    for kw in value:
+                        vars[x][kw] = value[kw]
+                else:
+                    vars[x][function] = value
+
+    '''
+    This loop determines for each observation whether it should be stored or not.
+    '''
+    for x in vars:
+        if vars[x]['bot'] == 1 or vars[x]['id'] == None or vars[x]['username'] == None:
+            continue
+        else:
+            f = []
+            for head in headers:
+                f.append(vars[x][head])
+            flat.append(f)
+
+    return flat
+
+
+def parse_dumpfile(project, language_code, namespaces=['0']):
+    bot_ids = bots.retrieve_bots(language_code)
+    ns = load_namespace(language_code)
+    ns = build_namespaces_locale(ns, namespaces)
+
+    location = os.path.join(settings.input_location, language_code, project)
+    fh = utils.create_txt_filehandle(location, 'enwiki-latest-stub-meta-history.xml', 'r', settings.encoding)
+    for page in wikitree.parser.read_input(fh):
+        title = page.find('title')
+        if is_article_main_namespace(title, ns):
+            #cElementTree.dump(page)
+            article_id = page.find('id').text
+            revisions = page.findall('revision')
+            revisions = parse_comments(revisions, remove_numeric_character_references)
+            output = output_editor_information(revisions, article_id, bot_ids)
+            write_output(output, project, language_code)
+        page.clear()
+    fh.close()
+
+
+def write_output(output, project, language_code):
+    location = os.path.join(settings.input_location, language_code, project, 'txt')
+    for o in output:
+        file = '%s.csv' % hash(o[0])
+        try:
+            fh = utils.create_txt_filehandle(location, file, 'a', settings.encoding)
+            utils.write_list_to_csv(o, fh)
+            fh.close()
+        except Exception, error:
+            print error


+def hash(id):
+    '''
+    A very simple hash function based on modulo. The except clause has been
+    addde because there are instances where the username is stored in userid
+    tag and hence that's a string and not an integer.
+    '''
+    try:
+        return int(id) % 500
+    except:
+        return sum([ord(i) for i in id]) % 500
+
+if __name__ == '__main__':
+    project = 'wiki'
+    language_code = 'en'
+    parse_dumpfile(project, language_code)
Property changes on: trunk/tools/editor_trends/etl/extracter.py
___________________________________________________________________
Added: svn:eol-style
   + native
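
The svn:eol-style property added above tells Subversion to store the file with normalized line endings and to check it out using the line-ending style native to the client platform; because every line's ending is normalized, the whole file appears removed and re-added in the diff even though the code itself is unchanged. For reference, a property change like this is normally applied with the svn command-line client; the snippet below is only a sketch of that workflow (the use of Python's subprocess module and the commit message shown are illustrative assumptions, not part of this revision):

import subprocess

# Illustrative sketch: apply svn:eol-style native to the file touched by this
# revision and commit the property change. Assumes it runs at the root of a
# Subversion working copy with the 'svn' client available on the PATH.
path = 'trunk/tools/editor_trends/etl/extracter.py'
subprocess.check_call(['svn', 'propset', 'svn:eol-style', 'native', path])
subprocess.check_call(['svn', 'commit', '-m', 'svn:eol-style native', path])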

Past revisions this follows up on

Revision | Commit summary | Author | Date
r78582 | Thanks to Nimish for giving me a number of suggestions to reduce the processi... | diederik | 22:10, 18 December 2010

Status & tagging log