Index: trunk/tools/editor_trends/etl/extracter.py |
— | — | @@ -1,283 +1,283 @@ |
2 | | -#!/usr/bin/python
|
3 | | -# -*- coding: utf-8 -*-
|
4 | | -'''
|
5 | | -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
|
6 | | -This program is free software; you can redistribute it and/or
|
7 | | -modify it under the terms of the GNU General Public License version 2
|
8 | | -as published by the Free Software Foundation.
|
9 | | -This program is distributed in the hope that it will be useful,
|
10 | | -but WITHOUT ANY WARRANTY; without even the implied warranty of
|
11 | | -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
12 | | -See the GNU General Public License for more details, at
|
13 | | -http://www.fsf.org/licenses/gpl.html
|
14 | | -'''
|
15 | | -
|
16 | | -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
|
17 | | -__author__email = 'dvanliere at gmail dot com'
|
18 | | -__date__ = '2010-12-13'
|
19 | | -__version__ = '0.1'
|
20 | | -
|
21 | | -import sys
|
22 | | -import re
|
23 | | -import json
|
24 | | -import os
|
25 | | -import xml.etree.cElementTree as cElementTree
|
26 | | -
|
27 | | -sys.path.append('..')
|
28 | | -import configuration
|
29 | | -settings = configuration.Settings()
|
30 | | -
|
31 | | -import wikitree.parser
|
32 | | -from bots import bots
|
33 | | -from utils import utils
|
34 | | -
|
35 | | -try:
|
36 | | - import psyco
|
37 | | - psyco.full()
|
38 | | -except ImportError:
|
39 | | - pass
|
40 | | -
|
41 | | -
|
42 | | -RE_NUMERIC_CHARACTER = re.compile('&#(\d+);')
|
43 | | -
|
44 | | -
|
45 | | -def remove_numeric_character_references(text):
|
46 | | - return re.sub(RE_NUMERIC_CHARACTER, lenient_deccharref, text).encode('utf-8')
|
47 | | -
|
48 | | -
|
49 | | -def lenient_deccharref(m):
|
50 | | - try:
|
51 | | - return unichr(int(m.group(1)))
|
52 | | - except ValueError:
|
53 | | - '''
|
54 | | - There are a few articles that raise a Value Error here, the reason is
|
55 | | - that I am using a narrow Python build (UCS2) instead of a wide build
|
56 | | - (UCS4). The quick fix is to return an empty string...
|
57 | | - Real solution is to rebuild Python with UCS4 support.....
|
58 | | - '''
|
59 | | - return ''
|
60 | | -
|
61 | | -
|
62 | | -def remove_namespace(element, namespace):
|
63 | | - '''Remove namespace from the XML document.'''
|
64 | | - ns = u'{%s}' % namespace
|
65 | | - nsl = len(ns)
|
66 | | - for elem in element.getiterator():
|
67 | | - if elem.tag.startswith(ns):
|
68 | | - elem.tag = elem.tag[nsl:]
|
69 | | - return element
|
70 | | -
|
71 | | -
|
72 | | -def load_namespace(language):
|
73 | | - file = '%s_ns.json' % language
|
74 | | - fh = utils.create_txt_filehandle(settings.namespace_location, file, 'r', settings.encoding)
|
75 | | - ns = json.load(fh)
|
76 | | - fh.close()
|
77 | | - ns = ns['query']['namespaces']
|
78 | | - return ns
|
79 | | -
|
80 | | -
|
81 | | -def build_namespaces_locale(namespaces, include=['0']):
|
82 | | - '''
|
83 | | - @include is a list of namespace keys that should not be ignored, the default
|
84 | | - setting is to ignore all namespaces except the main namespace.
|
85 | | - '''
|
86 | | - ns = []
|
87 | | - for namespace in namespaces:
|
88 | | - if namespace not in include:
|
89 | | - value = namespaces[namespace].get(u'*', None)
|
90 | | - ns.append(value)
|
91 | | - return ns
|
92 | | -
|
93 | | -
|
94 | | -def parse_comments(revisions, function):
|
95 | | - for revision in revisions:
|
96 | | - comment = revision.find('{%s}comment' % settings.xml_namespace)
|
97 | | - #timestamp = revision.find('{%s}timestamp' % settings.xml_namespace).text
|
98 | | - if comment != None and comment.text != None:
|
99 | | - comment.text = function(comment.text)
|
100 | | - return revisions
|
101 | | -
|
102 | | -
|
103 | | -def is_article_main_namespace(elem, namespace):
|
104 | | - '''
|
105 | | - checks whether the article belongs to the main namespace
|
106 | | - '''
|
107 | | - title = elem.text
|
108 | | - for ns in namespace:
|
109 | | - if title.startswith(ns):
|
110 | | - return False
|
111 | | - return True
|
112 | | -
|
113 | | -def validate_hostname(address):
|
114 | | - '''
|
115 | | - This is not a foolproof solution at all. The problem is that it's really hard
|
116 | | - to determine whether a string is a hostname or not **reliably**. This is a
|
117 | | - very fast rule of thumb. Will lead to false positives, but that's life :)
|
118 | | - '''
|
119 | | - parts = address.split(".")
|
120 | | - if len(parts) > 2:
|
121 | | - return True
|
122 | | - else:
|
123 | | - return False
|
124 | | -
|
125 | | -
|
126 | | -def validate_ip(address):
|
127 | | - parts = address.split(".")
|
128 | | - if len(parts) != 4:
|
129 | | - return False
|
130 | | - parts = parts[:3]
|
131 | | - for item in parts:
|
132 | | - try:
|
133 | | - if not 0 <= int(item) <= 255:
|
134 | | - return False
|
135 | | - except ValueError:
|
136 | | - return False
|
137 | | - return True
|
138 | | -
|
139 | | -
|
140 | | -def determine_username_is_bot(contributor, **kwargs):
|
141 | | - '''
|
142 | | - #contributor is an xml element containing the id of the contributor
|
143 | | - @bots should have a dict with all the bot ids and bot names
|
144 | | - @Return False if username id is not in bot dict id or True if username id
|
145 | | - is a bot id.
|
146 | | - '''
|
147 | | - bots = kwargs.get('bots')
|
148 | | - username = contributor.find('username')
|
149 | | - if username == None:
|
150 | | - return 0
|
151 | | - else:
|
152 | | - if username in bots:
|
153 | | - return 1
|
154 | | - else:
|
155 | | - return 0
|
156 | | -
|
157 | | -
|
158 | | -def extract_username(contributor, **kwargs):
|
159 | | - contributor = contributor.find('username')
|
160 | | - if contributor != None:
|
161 | | - return contributor.text
|
162 | | - else:
|
163 | | - return None
|
164 | | -
|
165 | | -
|
166 | | -def extract_contributor_id(contributor, **kwargs):
|
167 | | - '''
|
168 | | - @contributor is the xml contributor node containing a number of attributes
|
169 | | - Currently, we are only interested in registered contributors, hence we
|
170 | | - ignore anonymous editors.
|
171 | | - '''
|
172 | | - if contributor.get('deleted'):
|
173 | | - return None # ASK: Not sure if this is the best way to code deleted contributors.
|
174 | | - elem = contributor.find('id')
|
175 | | - if elem != None:
|
176 | | - return {'id':elem.text}
|
177 | | - else:
|
178 | | - elem = contributor.find('ip')
|
179 | | - if elem != None and elem.text != None and validate_ip(elem.text) == False and validate_hostname(elem.text) == False:
|
180 | | - return {'username':elem.text, 'id': elem.text}
|
181 | | - else:
|
182 | | - return None
|
183 | | -
|
184 | | -
|
185 | | -def output_editor_information(revisions, page, bots):
|
186 | | - '''
|
187 | | - @elem is an XML element containing 1 revision from a page
|
188 | | - @output is where to store the data, a filehandle
|
189 | | - @**kwargs contains extra information
|
190 | | -
|
191 | | - the variable tags determines which attributes are being parsed, the values in
|
192 | | - this dictionary are the functions used to extract the data.
|
193 | | - '''
|
194 | | - headers = ['id', 'date', 'article', 'username']
|
195 | | - tags = {'contributor': {'id': extract_contributor_id,
|
196 | | - 'bot': determine_username_is_bot,
|
197 | | - 'username': extract_username,
|
198 | | - },
|
199 | | - 'timestamp': {'date': wikitree.parser.extract_text},
|
200 | | - }
|
201 | | - vars = {}
|
202 | | - flat = []
|
203 | | -
|
204 | | - for x, revision in enumerate(revisions):
|
205 | | - #print len(revision.getchildren())
|
206 | | - vars[x] = {}
|
207 | | - vars[x]['article'] = page
|
208 | | - for tag in tags:
|
209 | | - el = revision.find('%s' % tag)
|
210 | | - if el == None:
|
211 | | - #print cElementTree.tostring(revision, settings.encoding)
|
212 | | - del vars[x]
|
213 | | - break
|
214 | | - for function in tags[tag].keys():
|
215 | | - f = tags[tag][function]
|
216 | | - value = f(el, bots=bots)
|
217 | | - if type(value) == type({}):
|
218 | | - for kw in value:
|
219 | | - vars[x][kw] = value[kw]
|
220 | | - else:
|
221 | | - vars[x][function] = value
|
222 | | -
|
223 | | - '''
|
224 | | - This loop determines for each observation whether it should be stored or not.
|
225 | | - '''
|
226 | | - for x in vars:
|
227 | | - if vars[x]['bot'] == 1 or vars[x]['id'] == None or vars[x]['username'] == None:
|
228 | | - continue
|
229 | | - else:
|
230 | | - f = []
|
231 | | - for head in headers:
|
232 | | - f.append(vars[x][head])
|
233 | | - flat.append(f)
|
234 | | -
|
235 | | - return flat
|
236 | | -
|
237 | | -
|
238 | | -def parse_dumpfile(project, language_code, namespaces=['0']):
|
239 | | - bot_ids = bots.retrieve_bots(language_code)
|
240 | | - ns = load_namespace(language_code)
|
241 | | - ns = build_namespaces_locale(ns, namespaces)
|
242 | | -
|
243 | | - location = os.path.join(settings.input_location, language_code, project)
|
244 | | - fh = utils.create_txt_filehandle(location, 'enwiki-latest-stub-meta-history.xml', 'r', settings.encoding)
|
245 | | - for page in wikitree.parser.read_input(fh):
|
246 | | - title = page.find('title')
|
247 | | - if is_article_main_namespace(title, ns):
|
248 | | - #cElementTree.dump(page)
|
249 | | - article_id = page.find('id').text
|
250 | | - revisions = page.findall('revision')
|
251 | | - revisions = parse_comments(revisions, remove_numeric_character_references)
|
252 | | - output = output_editor_information(revisions, article_id, bot_ids)
|
253 | | - write_output(output, project, language_code)
|
254 | | - page.clear()
|
255 | | - fh.close()
|
256 | | -
|
257 | | -
|
258 | | -def write_output(output, project, language_code):
|
259 | | - location = os.path.join(settings.input_location, language_code, project, 'txt')
|
260 | | - for o in output:
|
261 | | - file = '%s.csv' % hash(o[0])
|
262 | | - try:
|
263 | | - fh = utils.create_txt_filehandle(location, file, 'a', settings.encoding)
|
264 | | - utils.write_list_to_csv(o, fh)
|
265 | | - fh.close()
|
266 | | - except Exception, error:
|
267 | | - print error
|
268 | | -
|
269 | | -
|
270 | | -def hash(id):
|
271 | | - '''
|
272 | | - A very simple hash function based on modulo. The except clause has been
|
273 | | - addde because there are instances where the username is stored in userid
|
274 | | - tag and hence that's a string and not an integer.
|
275 | | - '''
|
276 | | - try:
|
277 | | - return int(id) % 500
|
278 | | - except:
|
279 | | - return sum([ord(i) for i in id]) % 500
|
280 | | -
|
281 | | -if __name__ == '__main__':
|
282 | | - project = 'wiki'
|
283 | | - language_code = 'en'
|
284 | | - parse_dumpfile(project, language_code)
|
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__author__email = 'dvanliere at gmail dot com' |
| 18 | +__date__ = '2010-12-13' |
| 19 | +__version__ = '0.1' |
| 20 | + |
| 21 | +import sys |
| 22 | +import re |
| 23 | +import json |
| 24 | +import os |
| 25 | +import xml.etree.cElementTree as cElementTree |
| 26 | + |
| 27 | +sys.path.append('..') |
| 28 | +import configuration |
| 29 | +settings = configuration.Settings() |
| 30 | + |
| 31 | +import wikitree.parser |
| 32 | +from bots import bots |
| 33 | +from utils import utils |
| 34 | + |
| 35 | +try: |
| 36 | + import psyco |
| 37 | + psyco.full() |
| 38 | +except ImportError: |
| 39 | + pass |
| 40 | + |
| 41 | + |
| 42 | +RE_NUMERIC_CHARACTER = re.compile('&#(\d+);') |
| 43 | + |
| 44 | + |
def remove_numeric_character_references(text):
    """Replace every decimal character reference (e.g. '&#1055;') in *text*
    with the character it encodes and return the result UTF-8 encoded."""
    return RE_NUMERIC_CHARACTER.sub(lenient_deccharref, text).encode('utf-8')
| 47 | + |
| 48 | + |
def lenient_deccharref(m):
    """Decode one decimal character-reference match to its character."""
    digits = m.group(1)
    try:
        return unichr(int(digits))
    except ValueError:
        # Narrow (UCS2) Python builds raise ValueError for code points
        # above 0xFFFF; dropping the character is the quick workaround.
        # The real fix is rebuilding Python with UCS4 support.
        return ''
| 60 | + |
| 61 | + |
def remove_namespace(element, namespace):
    """Strip the '{namespace}' prefix from every tag in the XML tree.

    @element: root XML element; mutated in place.
    @namespace: namespace URI to remove.
    Returns the (mutated) element for convenience.
    """
    ns = u'{%s}' % namespace
    nsl = len(ns)
    # Element.iter() replaces getiterator(), which is deprecated and
    # was removed in Python 3.9.
    for elem in element.iter():
        if elem.tag.startswith(ns):
            elem.tag = elem.tag[nsl:]
    return element
| 70 | + |
| 71 | + |
def load_namespace(language):
    """Load the namespace map for *language* from its '<lang>_ns.json' cache.

    @language: language code used to build the cache filename.
    Returns the 'namespaces' dict from the cached MediaWiki API response.
    """
    filename = '%s_ns.json' % language  # avoid shadowing the builtin 'file'
    fh = utils.create_txt_filehandle(settings.namespace_location, filename,
                                     'r', settings.encoding)
    try:
        ns = json.load(fh)
    finally:
        # Close the handle even when the JSON is malformed.
        fh.close()
    return ns['query']['namespaces']
| 79 | + |
| 80 | + |
def build_namespaces_locale(namespaces, include=None):
    """Return the localized names of the namespaces to be filtered out.

    @namespaces: dict mapping namespace keys to API namespace records.
    @include: list of namespace keys that should NOT be ignored; defaults
    to ['0'] so everything except the main namespace is returned.
    (None default avoids the shared-mutable-default-argument pitfall;
    passing ['0'] explicitly behaves identically.)
    """
    if include is None:
        include = ['0']
    # A missing u'*' entry deliberately yields None, as before.
    return [namespaces[key].get(u'*', None)
            for key in namespaces if key not in include]
| 92 | + |
| 93 | + |
def parse_comments(revisions, function):
    """Apply *function* to every revision's comment text, in place.

    Revisions without a comment element (or with empty text) are skipped.
    Returns the same list of revisions for chaining.
    """
    comment_tag = '{%s}comment' % settings.xml_namespace
    for revision in revisions:
        comment = revision.find(comment_tag)
        if comment is not None and comment.text is not None:
            comment.text = function(comment.text)
    return revisions
| 101 | + |
| 102 | + |
def is_article_main_namespace(elem, namespace):
    """Return True when the title element belongs to the main namespace.

    @elem: XML title element whose text is the article title.
    @namespace: iterable of localized namespace prefixes (e.g. 'Talk:').
    A title starting with any such prefix is not a main-namespace article.
    """
    title = elem.text
    return not any(title.startswith(prefix) for prefix in namespace)
| 112 | + |
def validate_hostname(address):
    """Rule-of-thumb check for whether *address* looks like a hostname.

    Not foolproof — reliably telling hostnames from other strings is hard.
    Anything with at least two dots counts, so false positives happen.
    """
    # Equivalent to len(address.split('.')) > 2.
    return address.count('.') >= 2
| 124 | + |
| 125 | + |
def validate_ip(address):
    """Return True when *address* is a valid dotted-quad IPv4 address.

    Fix: the original checked only the first three octets (parts[:3]),
    so strings like '1.2.3.999' or '1.2.3.abc' were accepted; all four
    octets are now validated.
    """
    parts = address.split('.')
    if len(parts) != 4:
        return False
    for part in parts:
        try:
            if not 0 <= int(part) <= 255:
                return False
        except ValueError:
            # Non-numeric octet.
            return False
    return True
| 138 | + |
| 139 | + |
def determine_username_is_bot(contributor, **kwargs):
    """Return 1 when the contributor's username is a known bot, else 0.

    @contributor: XML contributor element.
    @kwargs['bots']: dict keyed by bot ids/names.
    Fix: the original tested the XML *Element* object itself for
    membership in the bots dict ('if username in bots'), which can never
    match a string key; the element's text is what must be looked up.
    """
    bots = kwargs.get('bots')
    username = contributor.find('username')
    if username is None or username.text is None:
        return 0
    if username.text in bots:
        return 1
    return 0
| 156 | + |
| 157 | + |
def extract_username(contributor, **kwargs):
    """Return the contributor's username text, or None when absent."""
    node = contributor.find('username')
    return node.text if node is not None else None
| 164 | + |
| 165 | + |
def extract_contributor_id(contributor, **kwargs):
    """Extract a registered contributor's identity from its XML node.

    @contributor: XML contributor node.
    Returns {'id': ...} for contributors with an id element; for ip
    elements whose text is neither a valid IP nor hostname-like (i.e. a
    username stored in the ip tag), returns {'username': ..., 'id': ...}.
    Returns None for deleted contributors and anonymous editors.
    """
    if contributor.get('deleted'):
        # ASK: Not sure if this is the best way to code deleted contributors.
        return None
    node = contributor.find('id')
    if node is not None:
        return {'id': node.text}
    node = contributor.find('ip')
    if node is None or node.text is None:
        return None
    if validate_ip(node.text) or validate_hostname(node.text):
        # Genuine anonymous editor — ignore.
        return None
    return {'username': node.text, 'id': node.text}
| 183 | + |
| 184 | + |
def output_editor_information(revisions, page, bots):
    """Collect one observation per usable revision of a page.

    @revisions: iterable of revision XML elements.
    @page: article id, stored with every observation.
    @bots: dict of known bots, forwarded to the extraction functions.

    The tags mapping pairs each child tag with the extraction functions
    used on it; dict-valued results are merged into the observation,
    scalar results stored under the function's key. Revisions missing
    any required tag are dropped, as are bot edits and observations
    with a missing id or username. Returns a list of
    [id, date, article, username] rows.
    """
    headers = ['id', 'date', 'article', 'username']
    tags = {'contributor': {'id': extract_contributor_id,
                            'bot': determine_username_is_bot,
                            'username': extract_username,
                            },
            'timestamp': {'date': wikitree.parser.extract_text},
            }

    observations = []
    for revision in revisions:
        obs = {'article': page}
        complete = True
        for tag in tags:
            el = revision.find(tag)
            if el is None:
                # Required tag missing: discard this revision entirely.
                complete = False
                break
            for key in tags[tag].keys():
                extract = tags[tag][key]
                value = extract(el, bots=bots)
                if isinstance(value, dict):
                    obs.update(value)
                else:
                    obs[key] = value
        if complete:
            observations.append(obs)

    # Decide per observation whether it should be stored: skip bots and
    # observations lacking an id or username.
    flat = []
    for obs in observations:
        if obs['bot'] == 1 or obs['id'] is None or obs['username'] is None:
            continue
        flat.append([obs[head] for head in headers])
    return flat
| 236 | + |
| 237 | + |
def parse_dumpfile(project, language_code, namespaces=None):
    """Parse a stub-meta-history dump and write editor observations to CSV.

    @project: project suffix (e.g. 'wiki').
    @language_code: language code (e.g. 'en').
    @namespaces: namespace keys to keep; defaults to ['0'] (main only).
    (None default avoids the mutable-default-argument pitfall.)
    """
    if namespaces is None:
        namespaces = ['0']
    bot_ids = bots.retrieve_bots(language_code)
    ns = load_namespace(language_code)
    ns = build_namespaces_locale(ns, namespaces)

    location = os.path.join(settings.input_location, language_code, project)
    # Fix: derive the dump filename from language/project instead of
    # hard-coding the English Wikipedia dump; 'en' + 'wiki' still yields
    # the original 'enwiki-latest-stub-meta-history.xml'.
    filename = '%s%s-latest-stub-meta-history.xml' % (language_code, project)
    fh = utils.create_txt_filehandle(location, filename, 'r', settings.encoding)
    try:
        for page in wikitree.parser.read_input(fh):
            title = page.find('title')
            if is_article_main_namespace(title, ns):
                article_id = page.find('id').text
                revisions = page.findall('revision')
                revisions = parse_comments(revisions, remove_numeric_character_references)
                output = output_editor_information(revisions, article_id, bot_ids)
                write_output(output, project, language_code)
            # Free the memory held by the processed page element.
            page.clear()
    finally:
        fh.close()
| 256 | + |
| 257 | + |
def write_output(output, project, language_code):
    """Append each observation to a CSV bucket chosen by hashing its id.

    @output: list of observation rows; row[0] is the editor id.
    Errors on one observation are reported and the rest still written
    (best-effort, as before). Fixes: 'except E, err' (Python-2-only
    syntax) -> 'except E as err'; the handle is now closed even when the
    CSV write fails; builtin name 'file' no longer shadowed.
    """
    location = os.path.join(settings.input_location, language_code, project, 'txt')
    for observation in output:
        filename = '%s.csv' % hash(observation[0])
        try:
            fh = utils.create_txt_filehandle(location, filename, 'a', settings.encoding)
            try:
                utils.write_list_to_csv(observation, fh)
            finally:
                fh.close()
        except Exception as error:
            # Best-effort: report and continue with the next observation.
            print(error)
| 268 | + |
| 269 | + |
def hash(id):
    """Map an editor id to one of 500 buckets by simple modulo.

    The fallback exists because some usernames are stored in the userid
    tag, so *id* may be a non-numeric string; those are hashed by
    character sum. NOTE: deliberately shadows the builtin hash() — kept
    for backward compatibility with existing callers.
    Fix: bare 'except:' narrowed to the conversion errors int() raises.
    """
    try:
        return int(id) % 500
    except (ValueError, TypeError):
        return sum(ord(char) for char in id) % 500
| 280 | + |
if __name__ == '__main__':
    # Default run: English Wikipedia.
    parse_dumpfile('wiki', 'en')
Property changes on: trunk/tools/editor_trends/etl/extracter.py |
___________________________________________________________________ |
Added: svn:eol-style |
285 | 285 | + native |