Index: trunk/tools/editor_trends/etl/extracter.py |
— | — | @@ -1,283 +1,283 @@ |
2 | | -#!/usr/bin/python
|
3 | | -# -*- coding: utf-8 -*-
|
4 | | -'''
|
5 | | -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
|
6 | | -This program is free software; you can redistribute it and/or
|
7 | | -modify it under the terms of the GNU General Public License version 2
|
8 | | -as published by the Free Software Foundation.
|
9 | | -This program is distributed in the hope that it will be useful,
|
10 | | -but WITHOUT ANY WARRANTY; without even the implied warranty of
|
11 | | -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
12 | | -See the GNU General Public License for more details, at
|
13 | | -http://www.fsf.org/licenses/gpl.html
|
14 | | -'''
|
15 | | -
|
16 | | -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
|
17 | | -__author__email = 'dvanliere at gmail dot com'
|
18 | | -__date__ = '2010-12-13'
|
19 | | -__version__ = '0.1'
|
20 | | -
|
21 | | -import sys
|
22 | | -import re
|
23 | | -import json
|
24 | | -import os
|
25 | | -import xml.etree.cElementTree as cElementTree
|
26 | | -
|
27 | | -sys.path.append('..')
|
28 | | -import configuration
|
29 | | -settings = configuration.Settings()
|
30 | | -
|
31 | | -import wikitree.parser
|
32 | | -from bots import bots
|
33 | | -from utils import utils
|
34 | | -
|
35 | | -try:
|
36 | | - import psyco
|
37 | | - psyco.full()
|
38 | | -except ImportError:
|
39 | | - pass
|
40 | | -
|
41 | | -
|
42 | | -RE_NUMERIC_CHARACTER = re.compile('&#(\d+);')
|
43 | | -
|
44 | | -
|
45 | | -def remove_numeric_character_references(text):
|
46 | | - return re.sub(RE_NUMERIC_CHARACTER, lenient_deccharref, text).encode('utf-8')
|
47 | | -
|
48 | | -
|
49 | | -def lenient_deccharref(m):
|
50 | | - try:
|
51 | | - return unichr(int(m.group(1)))
|
52 | | - except ValueError:
|
53 | | - '''
|
54 | | - There are a few articles that raise a Value Error here, the reason is
|
55 | | - that I am using a narrow Python build (UCS2) instead of a wide build
|
56 | | - (UCS4). The quick fix is to return an empty string...
|
57 | | - Real solution is to rebuild Python with UCS4 support.....
|
58 | | - '''
|
59 | | - return ''
|
60 | | -
|
61 | | -
|
62 | | -def remove_namespace(element, namespace):
|
63 | | - '''Remove namespace from the XML document.'''
|
64 | | - ns = u'{%s}' % namespace
|
65 | | - nsl = len(ns)
|
66 | | - for elem in element.getiterator():
|
67 | | - if elem.tag.startswith(ns):
|
68 | | - elem.tag = elem.tag[nsl:]
|
69 | | - return element
|
70 | | -
|
71 | | -
|
72 | | -def load_namespace(language):
|
73 | | - file = '%s_ns.json' % language
|
74 | | - fh = utils.create_txt_filehandle(settings.namespace_location, file, 'r', settings.encoding)
|
75 | | - ns = json.load(fh)
|
76 | | - fh.close()
|
77 | | - ns = ns['query']['namespaces']
|
78 | | - return ns
|
79 | | -
|
80 | | -
|
81 | | -def build_namespaces_locale(namespaces, include=['0']):
|
82 | | - '''
|
83 | | - @include is a list of namespace keys that should not be ignored, the default
|
84 | | - setting is to ignore all namespaces except the main namespace.
|
85 | | - '''
|
86 | | - ns = []
|
87 | | - for namespace in namespaces:
|
88 | | - if namespace not in include:
|
89 | | - value = namespaces[namespace].get(u'*', None)
|
90 | | - ns.append(value)
|
91 | | - return ns
|
92 | | -
|
93 | | -
|
94 | | -def parse_comments(revisions, function):
|
95 | | - for revision in revisions:
|
96 | | - comment = revision.find('{%s}comment' % settings.xml_namespace)
|
97 | | - #timestamp = revision.find('{%s}timestamp' % settings.xml_namespace).text
|
98 | | - if comment != None and comment.text != None:
|
99 | | - comment.text = function(comment.text)
|
100 | | - return revisions
|
101 | | -
|
102 | | -
|
103 | | -def is_article_main_namespace(elem, namespace):
|
104 | | - '''
|
105 | | - checks whether the article belongs to the main namespace
|
106 | | - '''
|
107 | | - title = elem.text
|
108 | | - for ns in namespace:
|
109 | | - if title.startswith(ns):
|
110 | | - return False
|
111 | | - return True
|
112 | | -
|
113 | | -def validate_hostname(address):
|
114 | | - '''
|
115 | | - This is not a foolproof solution at all. The problem is that it's really hard
|
116 | | - to determine whether a string is a hostname or not **reliably**. This is a
|
117 | | - very fast rule of thumb. Will lead to false positives, but that's life :)
|
118 | | - '''
|
119 | | - parts = address.split(".")
|
120 | | - if len(parts) > 2:
|
121 | | - return True
|
122 | | - else:
|
123 | | - return False
|
124 | | -
|
125 | | -
|
126 | | -def validate_ip(address):
|
127 | | - parts = address.split(".")
|
128 | | - if len(parts) != 4:
|
129 | | - return False
|
130 | | - parts = parts[:3]
|
131 | | - for item in parts:
|
132 | | - try:
|
133 | | - if not 0 <= int(item) <= 255:
|
134 | | - return False
|
135 | | - except ValueError:
|
136 | | - return False
|
137 | | - return True
|
138 | | -
|
139 | | -
|
140 | | -def determine_username_is_bot(contributor, **kwargs):
|
141 | | - '''
|
142 | | - #contributor is an xml element containing the id of the contributor
|
143 | | - @bots should have a dict with all the bot ids and bot names
|
144 | | - @Return False if username id is not in bot dict id or True if username id
|
145 | | - is a bot id.
|
146 | | - '''
|
147 | | - bots = kwargs.get('bots')
|
148 | | - username = contributor.find('username')
|
149 | | - if username == None:
|
150 | | - return 0
|
151 | | - else:
|
152 | | - if username in bots:
|
153 | | - return 1
|
154 | | - else:
|
155 | | - return 0
|
156 | | -
|
157 | | -
|
158 | | -def extract_username(contributor, **kwargs):
|
159 | | - contributor = contributor.find('username')
|
160 | | - if contributor != None:
|
161 | | - return contributor.text
|
162 | | - else:
|
163 | | - return None
|
164 | | -
|
165 | | -
|
166 | | -def extract_contributor_id(contributor, **kwargs):
|
167 | | - '''
|
168 | | - @contributor is the xml contributor node containing a number of attributes
|
169 | | - Currently, we are only interested in registered contributors, hence we
|
170 | | - ignore anonymous editors.
|
171 | | - '''
|
172 | | - if contributor.get('deleted'):
|
173 | | - return None # ASK: Not sure if this is the best way to code deleted contributors.
|
174 | | - elem = contributor.find('id')
|
175 | | - if elem != None:
|
176 | | - return {'id':elem.text}
|
177 | | - else:
|
178 | | - elem = contributor.find('ip')
|
179 | | - if elem != None and elem.text != None and validate_ip(elem.text) == False and validate_hostname(elem.text) == False:
|
180 | | - return {'username':elem.text, 'id': elem.text}
|
181 | | - else:
|
182 | | - return None
|
183 | | -
|
184 | | -
|
185 | | -def output_editor_information(revisions, page, bots):
|
186 | | - '''
|
187 | | - @elem is an XML element containing 1 revision from a page
|
188 | | - @output is where to store the data, a filehandle
|
189 | | - @**kwargs contains extra information
|
190 | | -
|
191 | | - the variable tags determines which attributes are being parsed, the values in
|
192 | | - this dictionary are the functions used to extract the data.
|
193 | | - '''
|
194 | | - headers = ['id', 'date', 'article', 'username']
|
195 | | - tags = {'contributor': {'id': extract_contributor_id,
|
196 | | - 'bot': determine_username_is_bot,
|
197 | | - 'username': extract_username,
|
198 | | - },
|
199 | | - 'timestamp': {'date': wikitree.parser.extract_text},
|
200 | | - }
|
201 | | - vars = {}
|
202 | | - flat = []
|
203 | | -
|
204 | | - for x, revision in enumerate(revisions):
|
205 | | - #print len(revision.getchildren())
|
206 | | - vars[x] = {}
|
207 | | - vars[x]['article'] = page
|
208 | | - for tag in tags:
|
209 | | - el = revision.find('%s' % tag)
|
210 | | - if el == None:
|
211 | | - #print cElementTree.tostring(revision, settings.encoding)
|
212 | | - del vars[x]
|
213 | | - break
|
214 | | - for function in tags[tag].keys():
|
215 | | - f = tags[tag][function]
|
216 | | - value = f(el, bots=bots)
|
217 | | - if type(value) == type({}):
|
218 | | - for kw in value:
|
219 | | - vars[x][kw] = value[kw]
|
220 | | - else:
|
221 | | - vars[x][function] = value
|
222 | | -
|
223 | | - '''
|
224 | | - This loop determines for each observation whether it should be stored or not.
|
225 | | - '''
|
226 | | - for x in vars:
|
227 | | - if vars[x]['bot'] == 1 or vars[x]['id'] == None or vars[x]['username'] == None:
|
228 | | - continue
|
229 | | - else:
|
230 | | - f = []
|
231 | | - for head in headers:
|
232 | | - f.append(vars[x][head])
|
233 | | - flat.append(f)
|
234 | | -
|
235 | | - return flat
|
236 | | -
|
237 | | -
|
238 | | -def parse_dumpfile(project, language_code, namespaces=['0']):
|
239 | | - bot_ids = bots.retrieve_bots(language_code)
|
240 | | - ns = load_namespace(language_code)
|
241 | | - ns = build_namespaces_locale(ns, namespaces)
|
242 | | -
|
243 | | - location = os.path.join(settings.input_location, language_code, project)
|
244 | | - fh = utils.create_txt_filehandle(location, 'enwiki-latest-stub-meta-history.xml', 'r', settings.encoding)
|
245 | | - for page in wikitree.parser.read_input(fh):
|
246 | | - title = page.find('title')
|
247 | | - if is_article_main_namespace(title, ns):
|
248 | | - #cElementTree.dump(page)
|
249 | | - article_id = page.find('id').text
|
250 | | - revisions = page.findall('revision')
|
251 | | - revisions = parse_comments(revisions, remove_numeric_character_references)
|
252 | | - output = output_editor_information(revisions, article_id, bot_ids)
|
253 | | - write_output(output, project, language_code)
|
254 | | - page.clear()
|
255 | | - fh.close()
|
256 | | -
|
257 | | -
|
258 | | -def write_output(output, project, language_code):
|
259 | | - location = os.path.join(settings.input_location, language_code, project, 'txt')
|
260 | | - for o in output:
|
261 | | - file = '%s.csv' % hash(o[0])
|
262 | | - try:
|
263 | | - fh = utils.create_txt_filehandle(location, file, 'a', settings.encoding)
|
264 | | - utils.write_list_to_csv(o, fh)
|
265 | | - fh.close()
|
266 | | - except Exception, error:
|
267 | | - print error
|
268 | | -
|
269 | | -
|
270 | | -def hash(id):
|
271 | | - '''
|
272 | | - A very simple hash function based on modulo. The except clause has been
|
273 | | - addde because there are instances where the username is stored in userid
|
274 | | - tag and hence that's a string and not an integer.
|
275 | | - '''
|
276 | | - try:
|
277 | | - return int(id) % 500
|
278 | | - except:
|
279 | | - return sum([ord(i) for i in id]) % 500
|
280 | | -
|
281 | | -if __name__ == '__main__':
|
282 | | - project = 'wiki'
|
283 | | - language_code = 'en'
|
284 | | - parse_dumpfile(project, language_code)
|
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__author__email = 'dvanliere at gmail dot com' |
| 18 | +__date__ = '2010-12-13' |
| 19 | +__version__ = '0.1' |
| 20 | + |
| 21 | +import sys |
| 22 | +import re |
| 23 | +import json |
| 24 | +import os |
| 25 | +import xml.etree.cElementTree as cElementTree |
| 26 | + |
| 27 | +sys.path.append('..') |
| 28 | +import configuration |
| 29 | +settings = configuration.Settings() |
| 30 | + |
| 31 | +import wikitree.parser |
| 32 | +from bots import bots |
| 33 | +from utils import utils |
| 34 | + |
| 35 | +try: |
| 36 | + import psyco |
| 37 | + psyco.full() |
| 38 | +except ImportError: |
| 39 | + pass |
| 40 | + |
| 41 | + |
| 42 | +RE_NUMERIC_CHARACTER = re.compile('&#(\d+);') |
| 43 | + |
| 44 | + |
def remove_numeric_character_references(text):
    """Replace every decimal character reference (e.g. '&#1055;') in *text*
    with the character it encodes and return the result UTF-8 encoded."""
    return RE_NUMERIC_CHARACTER.sub(lenient_deccharref, text).encode('utf-8')
| 47 | + |
| 48 | + |
def lenient_deccharref(m):
    """Decode one decimal character-reference match to its character."""
    digits = m.group(1)
    try:
        return unichr(int(digits))
    except ValueError:
        # Narrow (UCS2) Python builds raise ValueError for code points
        # above 0xFFFF; dropping the character is the quick workaround.
        # The real fix is rebuilding Python with UCS4 support.
        return ''
| 60 | + |
| 61 | + |
def remove_namespace(element, namespace):
    """Strip the '{namespace}' prefix from every tag in the XML tree.

    @element: root XML element; mutated in place.
    @namespace: namespace URI to remove.
    Returns the (mutated) element for convenience.
    """
    ns = u'{%s}' % namespace
    nsl = len(ns)
    # Element.iter() replaces getiterator(), which is deprecated and
    # was removed in Python 3.9.
    for elem in element.iter():
        if elem.tag.startswith(ns):
            elem.tag = elem.tag[nsl:]
    return element
| 70 | + |
| 71 | + |
def load_namespace(language):
    """Load the namespace map for *language* from its '<lang>_ns.json' cache.

    @language: language code used to build the cache filename.
    Returns the 'namespaces' dict from the cached MediaWiki API response.
    """
    filename = '%s_ns.json' % language  # avoid shadowing the builtin 'file'
    fh = utils.create_txt_filehandle(settings.namespace_location, filename,
                                     'r', settings.encoding)
    try:
        ns = json.load(fh)
    finally:
        # Close the handle even when the JSON is malformed.
        fh.close()
    return ns['query']['namespaces']
| 79 | + |
| 80 | + |
def build_namespaces_locale(namespaces, include=None):
    """Return the localized names of the namespaces to be filtered out.

    @namespaces: dict mapping namespace keys to API namespace records.
    @include: list of namespace keys that should NOT be ignored; defaults
    to ['0'] so everything except the main namespace is returned.
    (None default avoids the shared-mutable-default-argument pitfall;
    passing ['0'] explicitly behaves identically.)
    """
    if include is None:
        include = ['0']
    # A missing u'*' entry deliberately yields None, as before.
    return [namespaces[key].get(u'*', None)
            for key in namespaces if key not in include]
| 92 | + |
| 93 | + |
def parse_comments(revisions, function):
    """Apply *function* to every revision's comment text, in place.

    Revisions without a comment element (or with empty text) are skipped.
    Returns the same list of revisions for chaining.
    """
    comment_tag = '{%s}comment' % settings.xml_namespace
    for revision in revisions:
        comment = revision.find(comment_tag)
        if comment is not None and comment.text is not None:
            comment.text = function(comment.text)
    return revisions
| 101 | + |
| 102 | + |
def is_article_main_namespace(elem, namespace):
    """Return True when the title element belongs to the main namespace.

    @elem: XML title element whose text is the article title.
    @namespace: iterable of localized namespace prefixes (e.g. 'Talk:').
    A title starting with any such prefix is not a main-namespace article.
    """
    title = elem.text
    return not any(title.startswith(prefix) for prefix in namespace)
| 112 | + |
def validate_hostname(address):
    """Rule-of-thumb check for whether *address* looks like a hostname.

    Not foolproof — reliably telling hostnames from other strings is hard.
    Anything with at least two dots counts, so false positives happen.
    """
    # Equivalent to len(address.split('.')) > 2.
    return address.count('.') >= 2
| 124 | + |
| 125 | + |
def validate_ip(address):
    """Return True when *address* is a valid dotted-quad IPv4 address.

    Fix: the original checked only the first three octets (parts[:3]),
    so strings like '1.2.3.999' or '1.2.3.abc' were accepted; all four
    octets are now validated.
    """
    parts = address.split('.')
    if len(parts) != 4:
        return False
    for part in parts:
        try:
            if not 0 <= int(part) <= 255:
                return False
        except ValueError:
            # Non-numeric octet.
            return False
    return True
| 138 | + |
| 139 | + |
def determine_username_is_bot(contributor, **kwargs):
    """Return 1 when the contributor's username is a known bot, else 0.

    @contributor: XML contributor element.
    @kwargs['bots']: dict keyed by bot ids/names.
    Fix: the original tested the XML *Element* object itself for
    membership in the bots dict ('if username in bots'), which can never
    match a string key; the element's text is what must be looked up.
    """
    bots = kwargs.get('bots')
    username = contributor.find('username')
    if username is None or username.text is None:
        return 0
    if username.text in bots:
        return 1
    return 0
| 156 | + |
| 157 | + |
def extract_username(contributor, **kwargs):
    """Return the contributor's username text, or None when absent."""
    node = contributor.find('username')
    return node.text if node is not None else None
| 164 | + |
| 165 | + |
def extract_contributor_id(contributor, **kwargs):
    """Extract a registered contributor's identity from its XML node.

    @contributor: XML contributor node.
    Returns {'id': ...} for contributors with an id element; for ip
    elements whose text is neither a valid IP nor hostname-like (i.e. a
    username stored in the ip tag), returns {'username': ..., 'id': ...}.
    Returns None for deleted contributors and anonymous editors.
    """
    if contributor.get('deleted'):
        # ASK: Not sure if this is the best way to code deleted contributors.
        return None
    node = contributor.find('id')
    if node is not None:
        return {'id': node.text}
    node = contributor.find('ip')
    if node is None or node.text is None:
        return None
    if validate_ip(node.text) or validate_hostname(node.text):
        # Genuine anonymous editor — ignore.
        return None
    return {'username': node.text, 'id': node.text}
| 183 | + |
| 184 | + |
def output_editor_information(revisions, page, bots):
    """Collect one observation per usable revision of a page.

    @revisions: iterable of revision XML elements.
    @page: article id, stored with every observation.
    @bots: dict of known bots, forwarded to the extraction functions.

    The tags mapping pairs each child tag with the extraction functions
    used on it; dict-valued results are merged into the observation,
    scalar results stored under the function's key. Revisions missing
    any required tag are dropped, as are bot edits and observations
    with a missing id or username. Returns a list of
    [id, date, article, username] rows.
    """
    headers = ['id', 'date', 'article', 'username']
    tags = {'contributor': {'id': extract_contributor_id,
                            'bot': determine_username_is_bot,
                            'username': extract_username,
                            },
            'timestamp': {'date': wikitree.parser.extract_text},
            }

    observations = []
    for revision in revisions:
        obs = {'article': page}
        complete = True
        for tag in tags:
            el = revision.find(tag)
            if el is None:
                # Required tag missing: discard this revision entirely.
                complete = False
                break
            for key in tags[tag].keys():
                extract = tags[tag][key]
                value = extract(el, bots=bots)
                if isinstance(value, dict):
                    obs.update(value)
                else:
                    obs[key] = value
        if complete:
            observations.append(obs)

    # Decide per observation whether it should be stored: skip bots and
    # observations lacking an id or username.
    flat = []
    for obs in observations:
        if obs['bot'] == 1 or obs['id'] is None or obs['username'] is None:
            continue
        flat.append([obs[head] for head in headers])
    return flat
| 236 | + |
| 237 | + |
def parse_dumpfile(project, language_code, namespaces=None):
    """Parse a stub-meta-history dump and write editor observations to CSV.

    @project: project suffix (e.g. 'wiki').
    @language_code: language code (e.g. 'en').
    @namespaces: namespace keys to keep; defaults to ['0'] (main only).
    (None default avoids the mutable-default-argument pitfall.)
    """
    if namespaces is None:
        namespaces = ['0']
    bot_ids = bots.retrieve_bots(language_code)
    ns = load_namespace(language_code)
    ns = build_namespaces_locale(ns, namespaces)

    location = os.path.join(settings.input_location, language_code, project)
    # Fix: derive the dump filename from language/project instead of
    # hard-coding the English Wikipedia dump; 'en' + 'wiki' still yields
    # the original 'enwiki-latest-stub-meta-history.xml'.
    filename = '%s%s-latest-stub-meta-history.xml' % (language_code, project)
    fh = utils.create_txt_filehandle(location, filename, 'r', settings.encoding)
    try:
        for page in wikitree.parser.read_input(fh):
            title = page.find('title')
            if is_article_main_namespace(title, ns):
                article_id = page.find('id').text
                revisions = page.findall('revision')
                revisions = parse_comments(revisions, remove_numeric_character_references)
                output = output_editor_information(revisions, article_id, bot_ids)
                write_output(output, project, language_code)
            # Free the memory held by the processed page element.
            page.clear()
    finally:
        fh.close()
| 256 | + |
| 257 | + |
def write_output(output, project, language_code):
    """Append each observation to a CSV bucket chosen by hashing its id.

    @output: list of observation rows; row[0] is the editor id.
    Errors on one observation are reported and the rest still written
    (best-effort, as before). Fixes: 'except E, err' (Python-2-only
    syntax) -> 'except E as err'; the handle is now closed even when the
    CSV write fails; builtin name 'file' no longer shadowed.
    """
    location = os.path.join(settings.input_location, language_code, project, 'txt')
    for observation in output:
        filename = '%s.csv' % hash(observation[0])
        try:
            fh = utils.create_txt_filehandle(location, filename, 'a', settings.encoding)
            try:
                utils.write_list_to_csv(observation, fh)
            finally:
                fh.close()
        except Exception as error:
            # Best-effort: report and continue with the next observation.
            print(error)
| 268 | + |
| 269 | + |
def hash(id):
    """Map an editor id to one of 500 buckets by simple modulo.

    The fallback exists because some usernames are stored in the userid
    tag, so *id* may be a non-numeric string; those are hashed by
    character sum. NOTE: deliberately shadows the builtin hash() — kept
    for backward compatibility with existing callers.
    Fix: bare 'except:' narrowed to the conversion errors int() raises.
    """
    try:
        return int(id) % 500
    except (ValueError, TypeError):
        return sum(ord(char) for char in id) % 500
| 280 | + |
if __name__ == '__main__':
    # Default run: English Wikipedia.
    parse_dumpfile('wiki', 'en')
Property changes on: trunk/tools/editor_trends/etl/extracter.py |
___________________________________________________________________ |
Added: svn:eol-style |
285 | 285 | + native |