r89750 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r89749‎ | r89750 | r89751 >
Date:20:51, 8 June 2011
Author:diederik
Status:deferred
Tags:
Comment:
Added support for detecting speedy deletions in the XML dump.
Modified paths:
  • /trunk/tools/editor_trends/etl/variables.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/etl/variables.py
@@ -19,8 +19,11 @@
2020 __version__ = '0.1'
2121
2222 import hashlib
 23+import re
2324 from xml.etree.cElementTree import dump
2425
 26+RE_DEL_ARTICLE = re.compile('/GA[\d]{1,2}')
 27+RE_SPEEDY_DELETION = re.compile('\{\{db\-[a-z\d]*\}\}') #http://en.wikipedia.org/wiki/Wikipedia:Criteria_for_speedy_deletion
2528
2629 def validate_hostname(address):
2730 '''
@@ -68,6 +71,14 @@
6972 return title.text
7073
7174
 75+def detect_speedy_deletion(revision_text):
 76+ spds = re.findall(RE_SPEEDY_DELETION, revision_text)
 77+ templates = {}
 78+ for spd in spds:
 79+ templates[spd] = 1
 80+ return templates
 81+
 82+
7283 def parse_title_meta_data(title, ns, namespaces):
7384 '''
7485 This function categorizes an article to assist the Wikimedia Taxonomy
@@ -75,6 +86,7 @@
7687 http://meta.wikimedia.org/wiki/Contribution_Taxonomy_Project/Research_Questions
7788 '''
7889 title_meta = {}
 90+ re_ga = re.compile('/GA[\d]')
7991 if not ns:
8092 return title_meta
8193 namespace = '%s:' % namespaces[ns]
@@ -83,10 +95,11 @@
8496 title_meta['ns'] = ns
8597 if title.startswith('List of'):
8698 title_meta['category'] = 'List'
 99+ elif ns == 1:
 100+ if re.search(RE_DEL_ARTICLE, title.find):
 101+ title_meta['category'] = 'Good Article'
87102 elif ns == 4 or ns == 5:
88 - if title.find('Articles for deletion') > -1:
89 - title_meta['category'] = 'Deletion'
90 - elif title.find('Arbitration') > -1:
 103+ if title.find('Arbitration') > -1:
91104 title_meta['category'] = 'Arbitration'
92105 elif title.find('Good Article') > -1:
93106 title_meta['category'] = 'Good Article'
@@ -108,6 +121,8 @@
109122 title_meta['category'] = 'Featured Portal'
110123 elif title.find('Featured topic candidates') > -1:
111124 title_meta['category'] = 'Featured Topic'
 125+ elif title.find('Articles for deletion') > -1 and title.find('Articles for deletion/Log/') > -1:
 126+ title_meta['category'] = 'Deletion'
112127
113128 #print title_meta
114129 return title_meta