r81316 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r81315 | r81316 | r81317 >
Date:	19:57, 1 February 2011
Author:	diederik
Status:	deferred
Tags:
Comment:	The extract phase is now also storing the namespace identifier.
Modified paths:	/trunk/tools/editor_trends/classes/runtime_settings.py (modified) (history) /trunk/tools/editor_trends/etl/extracter.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/classes/runtime_settings.py
—	—	@@ -216,4 +216,4 @@
217	217	if namespaces != None:
218	218	return namespaces.split(',')
219	219	else:
220		~~- return namespaces~~
	220	+ return ['0'] #Assume that the mainspace is of interest
Index: trunk/tools/editor_trends/etl/extracter.py
—	—	@@ -66,11 +66,12 @@
67	67	@include is a list of namespace keys that should not be ignored, the default
68	68	setting is to ignore all namespaces except the main namespace.
69	69	'''
70		~~- ns = []~~
	70	+ ns = {}
71	71	for key, value in namespaces.iteritems():
72		~~- if key not in include:~~
	72	+ if key in include:
73	73	#value = namespaces[namespace].get(u'*', None)
74		~~- ns.append(value)~~
	74	+ #ns.append(value)
	75	+ ns[key] = value
75	76	return ns
76	77
77	78
—	—	@@ -82,19 +83,28 @@
83	84	return revisions
84	85
85	86
86		~~-def verify_article_belongs_namespace(elem, namespaces):~~
	87	+def parse_article(elem, namespaces):
87	88	'''
88		~~- @namespaces is a list of namespaces that should be ignored, hence if the~~
89		~~- title of article starts with the namespace then return False else return~~
90		~~- True~~
	89	+ @namespaces is a list of valid namespaces that should be included in the analysis
	90	+ if the article should be ignored then this function returns false, else it returns
	91	+ the namespace identifier and namespace name.
91	92	'''
92	93	title = elem.text
93	94	if title == None:
94	95	return False
95		~~- for namespace in namespaces:~~
96		~~- if title.startswith(namespace):~~
	96	+ ns = title.split(':')
	97	+ if len(ns) ==1 and '0' in namespaces:
	98	+ return {'id': 0, 'name': 'main namespace'}
	99	+ else:
	100	+ if ns[0] in namespaces:
	101	+ return {'id': ns[0], 'name': ns[1]}
	102	+ else:
97	103	return False
98		~~- return True~~
	104	+
	105	+# for namespace in namespaces:
	106	+# if title.startswith(namespace):
	107	+# return False
	108	+# return True
99	109
100	110
101	111	def validate_hostname(address):
—	—	@@ -263,12 +273,15 @@
264	274	for page, article_size in wikitree.parser.read_input(fh1):
265	275	title = page.find('title')
266	276	total += 1
267		~~- if verify_article_belongs_namespace(title, ns):~~
	277	+ namespace = parse_article(title, ns)
	278	+ if namespace != False:
	279	+ #if verify_article_belongs_namespace(title, ns):
268	280	article_id = page.find('id').text
269	281	title = page.find('title').text
270	282	revisions = page.findall('revision')
271	283	revisions = parse_comments(revisions, remove_numeric_character_references)
272	284	output = output_editor_information(revisions, article_id, bot_ids)
	285	+ output = [o.append(namespace['id'] for o in output)]
273	286	write_output(output, filehandles, lock)
274	287	file_utils.write_list_to_csv([article_id, title], fh2)
275	288	processed += 1

Status & tagging log

19:58, 1 February 2011 Reedy (talk | contribs) changed the status of r81316 [removed: new added: deferred]