Index: trunk/tools/editor_trends/analyses/file_size_reduction.py |
— | — | @@ -69,6 +69,7 @@ |
70 | 70 | def calculate_filesize_overhead(location, filename): |
71 | 71 | counter = None |
72 | 72 | ds = DumpStatistics() |
| 73 | + filename = os.path.join(location, filename) |
73 | 74 | context = cElementTree.iterparse(filename, events=('start', 'end')) |
74 | 75 | context = iter(context) |
75 | 76 | event, root = context.next() #get the root element of the XML doc |
— | — | @@ -80,20 +81,20 @@ |
81 | 82 | root.clear() # when done parsing a section clear the tree to release memory |
82 | 83 | except SyntaxError: |
83 | 84 | pass |
84 | | - utils.store_object(ds, settings.binary_location, 'ds') |
| 85 | + utils.store_object(ds, settings.binary_location, 'ds') |
85 | 86 | xml_size = ds.total_size_xml() |
86 | 87 | text_size = ds.total_size_text() |
87 | 88 | print text_size, xml_size |
88 | 89 | print ds.tags |
89 | | - |
90 | 90 | |
| 91 | + |
91 | 92 | def output_dumpstatistics(): |
92 | 93 | ds = utils.load_object(settings.binary_location, 'ds.bin') |
93 | | - |
| 94 | + |
94 | 95 | for key in ds.tags: |
95 | 96 | print '%s\t%s' % (key, ds.tags[key]) |
96 | | - |
| 97 | + |
97 | 98 | if __name__ == '__main__': |
| 99 | + input = os.path.join(settings.input_location, 'en', 'wiki') |
| 100 | + calculate_filesize_overhead(input, 'enwiki-latest-stub-meta-history.xml') |
98 | 101 | output_dumpstatistics() |
99 | | - #calculate_filesize_overhead(settings.input_location, settings.input_filename) |
100 | | - |
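Note on the hunk above: calculate_filesize_overhead() streams the whole dump through cElementTree.iterparse and clears the root element after each parsed section, so memory use stays flat even for a multi-gigabyte stub dump; the new os.path.join line means the caller now passes a directory and a bare filename separately, which is what the new __main__ block does. A minimal sketch of the iterparse idiom used here ('dump.xml' is a placeholder path, not a file from this changeset):

    import xml.etree.cElementTree as cElementTree

    context = iter(cElementTree.iterparse('dump.xml', events=('start', 'end')))
    event, root = context.next()      # Python 2: grab the root element first
    for event, elem in context:
        if event == 'end':
            # ...tally tag and text sizes for elem here...
            root.clear()              # release the section that was just parsed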
Index: trunk/tools/editor_trends/manage.py |
— | — | @@ -182,9 +182,10 @@ |
183 | 183 | filename = kwargs.get('filename') |
184 | 184 | extension = kwargs.get('extension') |
185 | 185 | location = kwargs.get('location') |
| 186 | + full_project = kwargs.get('full_project') |
186 | 187 | pbar = get_value(args, 'progress') |
187 | 188 | domain = settings.wp_dump_location |
188 | | - path = '/%s/latest/' % project |
| 189 | + path = '/%s/latest/' % full_project |
189 | 190 | extension = utils.determine_file_extension(filename) |
190 | 191 | filemode = utils.determine_file_mode(extension) |
191 | 192 | dump_downloader.download_wiki_file(domain, path, filename, location, filemode, pbar) |
— | — | @@ -322,7 +323,7 @@ |
323 | 324 | ignore = ignore + ',extract' |
324 | 325 | |
325 | 326 | functions = ordered_dict.OrderedDict(((dump_downloader_launcher, 'download'), |
326 | | - (chunker_launcher, 'split'), |
| 327 | + #(chunker_launcher, 'split'), |
327 | 328 | (extract_launcher, 'extract'), |
328 | 329 | (sort_launcher, 'sort'), |
329 | 330 | (store_launcher, 'store'), |
— | — | @@ -407,10 +408,9 @@ |
408 | 409 | parser_download = subparsers.add_parser('download', help='The download sub command allows you to download a Wikipedia dump file.') |
409 | 410 | parser_download.set_defaults(func=dump_downloader_launcher) |
410 | 411 | |
411 | | - parser_split = subparsers.add_parser('split', help='The split sub command splits the downloaded file in smaller chunks to parallelize extracting information.') |
| 412 | + #parser_split = subparsers.add_parser('split', help='The split sub command splits the downloaded file in smaller chunks to parallelize extracting information.') |
| 413 | + #parser_split.set_defaults(func=chunker_launcher) |
412 | 414 | |
413 | | - parser_split.set_defaults(func=chunker_launcher) |
414 | | - |
415 | 415 | parser_create = subparsers.add_parser('extract', help='The store sub command parsers the XML chunk files, extracts the information and stores it in a MongoDB.') |
416 | 416 | parser_create.set_defaults(func=extract_launcher) |
417 | 417 | |
Index: trunk/tools/editor_trends/wikitree/xml.py |
— | — | @@ -1,55 +0,0 @@ |
2 | | -#!/usr/bin/python |
3 | | -# -*- coding: utf-8 -*- |
4 | | -''' |
5 | | -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
6 | | -This program is free software; you can redistribute it and/or |
7 | | -modify it under the terms of the GNU General Public License version 2 |
8 | | -as published by the Free Software Foundation. |
9 | | -This program is distributed in the hope that it will be useful, |
10 | | -but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | | -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
12 | | -See the GNU General Public License for more details, at |
13 | | -http://www.fsf.org/licenses/gpl.html |
14 | | -''' |
15 | | - |
16 | | -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
17 | | -__author__email = 'dvanliere at gmail dot com' |
18 | | -__date__ = '2010-10-21' |
19 | | -__version__ = '0.1' |
20 | | - |
21 | | -from utils import utils |
22 | | -import configuration |
23 | | -settings = configuration.Settings() |
24 | | - |
25 | | - |
26 | | -def convert_html_entities(text): |
27 | | - return utils.unescape(text) |
28 | | - |
29 | | - |
30 | | -def extract_text(elem, **kwargs): |
31 | | - if elem != None and elem.text != None: |
32 | | - #try: |
33 | | - return elem.text #.decode(settings.encoding) |
34 | | - #except UnicodeDecodeError: |
35 | | - #return None |
36 | | - |
37 | | - |
38 | | -def retrieve_xml_node(xml_nodes, name): |
39 | | - for xml_node in xml_nodes: |
40 | | - if xml_node.tag == name: |
41 | | - return xml_node |
42 | | - return None #maybe this should be replaced with an NotFoundError |
43 | | - |
44 | | - |
45 | | -def read_input(file): |
46 | | - lines = [] |
47 | | - for line in file: |
48 | | - lines.append(line) |
49 | | - if line.find('</page>') > -1: |
50 | | - yield lines |
51 | | - ''' |
52 | | - #This looks counter intuitive but Python continues with this call |
53 | | - after it has finished the yield statement |
54 | | - ''' |
55 | | - lines = [] |
56 | | - file.close() |
Index: trunk/tools/editor_trends/wikitree/parser.py |
— | — | @@ -0,0 +1,70 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__author__email = 'dvanliere at gmail dot com' |
| 18 | +__date__ = '2010-10-21' |
| 19 | +__version__ = '0.1' |
| 20 | + |
| 21 | +import xml.etree.cElementTree as cElementTree |
| 22 | + |
| 23 | +import configuration |
| 24 | +settings = configuration.Settings() |
| 25 | +from utils import utils |
| 26 | + |
| 27 | +def convert_html_entities(text): |
| 28 | + return utils.unescape(text) |
| 29 | + |
| 30 | + |
| 31 | +def extract_text(elem, **kwargs): |
| 32 | + if elem != None and elem.text != None: |
| 33 | + #try: |
| 34 | + return elem.text #.decode(settings.encoding) |
| 35 | + #except UnicodeDecodeError: |
| 36 | + #return None |
| 37 | + |
| 38 | + |
| 39 | +def retrieve_xml_node(xml_nodes, name): |
| 40 | + for xml_node in xml_nodes: |
| 41 | + if xml_node.tag == name: |
| 42 | + return xml_node |
| 43 | + return None #maybe this should be replaced with an NotFoundError |
| 44 | + |
| 45 | +def determine_element(line): |
| 46 | + pos = line.find(' ') |
| 47 | + elem = line[:pos] + '>' |
| 48 | + |
| 49 | + |
| 50 | +def read_input(file): |
| 51 | + lines = [] |
| 52 | + start_parsing = False |
| 53 | + for line in file: |
| 54 | + if line == '\n': |
| 55 | + continue |
| 56 | + if start_parsing == False and line.find('<page>') > -1: |
| 57 | + start_parsing = True |
| 58 | + if start_parsing: |
| 59 | + lines.append(line.strip()) |
| 60 | + if line.find('</page>') > -1: |
| 61 | + #print lines |
| 62 | + lines = '\n'.join(lines) |
| 63 | + lines = lines.encode(settings.encoding) |
| 64 | + xml_string = cElementTree.XML(lines) |
| 65 | + yield xml_string |
| 66 | + ''' |
| 67 | + #This looks counter intuitive but Python continues with this call |
| 68 | + after it has finished the yield statement |
| 69 | + ''' |
| 70 | + lines = [] |
| 71 | + file.close() |
Property changes on: trunk/tools/editor_trends/wikitree/parser.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 72 | + native |
Added: svn:mime-type |
2 | 73 | + text/plain |
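wikitree/parser.py supersedes wikitree/xml.py; the substantive change is that read_input() no longer yields raw line lists but buffers each <page>...</page> block, skips the <mediawiki>/<siteinfo> preamble, and yields one parsed cElementTree element per page. Because each block is parsed in isolation it carries no XML namespace, which is why callers can use unqualified find('title') / findall('revision'). A condensed sketch of the generator with a hypothetical caller (the file handle and encoding are illustrative, not taken from the diff):

    import codecs
    import xml.etree.cElementTree as cElementTree

    def read_input(fh):
        lines = []
        start_parsing = False
        for line in fh:
            if line == '\n':
                continue
            if not start_parsing and '<page>' in line:
                start_parsing = True                  # skip everything before the first <page>
            if start_parsing:
                lines.append(line.strip())
                if '</page>' in line:
                    # parse the buffered block and hand back one element per page
                    yield cElementTree.XML('\n'.join(lines).encode('utf-8'))
                    lines = []

    fh = codecs.open('enwiki-latest-stub-meta-history.xml', 'r', 'utf-8')
    for page in read_input(fh):
        print page.find('title').text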
Index: trunk/tools/editor_trends/etl/exporter.py |
— | — | @@ -213,16 +213,12 @@ |
214 | 214 | break |
215 | 215 | obs = editors.find_one({'editor': id}, {'first_edit': 1, 'final_edit': 1}) |
216 | 216 | |
217 | | - #for editor in tasks: |
218 | | - # obs = tasks[editor] |
219 | 217 | first_edit = obs['first_edit'] |
220 | 218 | last_edit = obs['final_edit'] |
221 | 219 | editor_dt = relativedelta(last_edit, first_edit) |
222 | 220 | editor_dt = (editor_dt.years * 12) + editor_dt.months |
223 | 221 | edits = [] |
224 | 222 | for year in xrange(2001, datetime.datetime.now().year + 1): |
225 | | - #if year == 2009 and editor == '2': |
226 | | - # print 'debug' |
227 | 223 | if first_edit.year > year or last_edit.year < year: |
228 | 224 | continue |
229 | 225 | window_end = datetime.datetime(year, 12, 31) |
— | — | @@ -251,11 +247,67 @@ |
252 | 248 | |
253 | 249 | |
254 | 250 | |
255 | | -def generate_cohort_dataset(tasks, dbname, collection, **kwargs): |
| 251 | +def generate_cohort_dataset_forward(tasks, dbname, collection, **kwargs): |
256 | 252 | mongo = db.init_mongo_db(dbname) |
257 | 253 | editors = mongo[collection + '_dataset'] |
258 | 254 | windows = create_windows() |
259 | 255 | data = shaper.create_datacontainer('dict') |
| 256 | + final_year = datetime.datetime.now().year + 1 |
| 257 | + m1 = [1, 2, 3, 4, 5, 6] |
| 258 | + m2 = [7, 8, 9, 10, 11, 12] |
| 259 | + frames = [m1, m2] |
| 260 | + while True: |
| 261 | + id = tasks.get(block=False) |
| 262 | + if id == None: |
| 263 | + break |
| 264 | + obs = editors.find_one({'editor': id}, {'new_wikipedian': 1, 'monthly_edits': 1, 'final_edit':1}) |
| 265 | + new_wikipedian = obs['new_wikipedian'] |
| 266 | + last_edit = obs['final_edit'] |
| 267 | + start_year = new_wikipedian.year |
| 268 | + last_year = last_edit.year + 1 |
| 269 | + if new_wikipedian.month != 1: |
| 270 | + continue |
| 271 | + for year in xrange(start_year, last_year): |
| 272 | + if year not in data[start_year]: |
| 273 | + data[start_year][year] = {} |
| 274 | + for x, frame in enumerate(frames): |
| 275 | + if x not in data[start_year][year]: |
| 276 | + data[start_year][year][x] = 0 |
| 277 | + if 'n' not in data[start_year][year]: |
| 278 | + data[start_year][year]['n'] = 0 |
| 279 | + |
| 280 | + active = sum([obs['monthly_edits'][str(year)][str(m)] for m in frame]) |
| 281 | + data[start_year][year]['n'] += 1 |
| 282 | + if active > 0: |
| 283 | + data[start_year][year][x] += 1 |
| 284 | + filename = '%s_cohort_forward.csv' % dbname |
| 285 | + fh = utils.create_txt_filehandle(settings.dataset_location, filename, 'w', settings.encoding) |
| 286 | + frames.append('n') |
| 287 | + headers = ["%s_%s" % (year, frame[0]) for year in xrange(2001, final_year) for frame in enumerate(frames)] |
| 288 | + headers.insert(0, '\t') |
| 289 | + utils.write_list_to_csv(headers, fh) |
| 290 | + |
| 291 | + for obs_year in data: |
| 292 | + obs = '%s\t' % obs_year |
| 293 | + for year in xrange(2001, final_year): |
| 294 | + values = data[obs_year].get(year, None) |
| 295 | + if values != None: |
| 296 | + for value in values: |
| 297 | + obs = '%s\t%s\t' % (obs, values[value]) |
| 298 | + else: |
| 299 | + obs = '%s\t.\t.\t.\t' % obs |
| 300 | + |
| 301 | + obs = '%s\n' % obs |
| 302 | + fh.write(obs) |
| 303 | + fh.close() |
| 304 | + |
| 305 | + |
| 306 | + |
| 307 | +def generate_cohort_dataset_backward(tasks, dbname, collection, **kwargs): |
| 308 | + mongo = db.init_mongo_db(dbname) |
| 309 | + editors = mongo[collection + '_dataset'] |
| 310 | + windows = create_windows() |
| 311 | + data = shaper.create_datacontainer('dict') |
260 | 312 | data = shaper.add_windows_to_datacontainer(data, windows) |
261 | 313 | |
262 | 314 | while True: |
— | — | @@ -360,10 +412,12 @@ |
361 | 413 | '13':{'first_edit': datetime.datetime(2007, 2, 1), 'final_edit': datetime.datetime(2009, 4, 30)}, |
362 | 414 | } |
363 | 415 | generate_cohort_dataset(editors, dbname, collection) |
| 416 | + |
| 417 | + |
364 | 418 | if __name__ == '__main__': |
365 | 419 | dbname = 'enwiki' |
366 | 420 | collection = 'editors' |
367 | 421 | #debug(dbname, collection) |
368 | | - dataset_launcher(dbname, collection, generate_cohort_dataset_howie) |
| 422 | + dataset_launcher(dbname, collection, generate_cohort_dataset_forward) |
369 | 423 | #dataset_launcher(dbname, collection, generate_long_editor_dataset) |
370 | 424 | #dataset_launcher(dbname, collection, generate_wide_editor_dataset) |
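generate_cohort_dataset_forward() follows each cohort of editors who became a new wikipedian in January of a given year and, for every later year, counts how many cohort members were active in the first half (months 1-6) and the second half (months 7-12), writing the result to <dbname>_cohort_forward.csv. A small self-contained distillation of the per-editor activity test; active_in_frames() is a hypothetical helper, not a function in the diff, and monthly_edits has the shape stored per editor in MongoDB (keyed by year and month as strings):

    def active_in_frames(monthly_edits, year):
        '''Was the editor active in months 1-6 and in months 7-12 of year?'''
        m1 = [1, 2, 3, 4, 5, 6]
        m2 = [7, 8, 9, 10, 11, 12]
        return tuple(sum(monthly_edits[str(year)][str(m)] for m in frame) > 0
                     for frame in (m1, m2))

    edits = {'2005': dict((str(m), 0) for m in range(1, 13))}
    edits['2005']['3'] = 12
    print active_in_frames(edits, 2005)   # (True, False)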
Index: trunk/tools/editor_trends/etl/extract.py |
— | — | @@ -39,7 +39,7 @@ |
40 | 40 | from database import db_settings |
41 | 41 | from database import db |
42 | 42 | from database import cache |
43 | | -from wikitree import xml |
| 43 | +import wikitree.parser |
44 | 44 | from bots import bots |
45 | 45 | from etl import models |
46 | 46 | #from utils import process_constructor as pc |
Index: trunk/tools/editor_trends/etl/chunker.py |
— | — | @@ -34,7 +34,7 @@ |
35 | 35 | |
36 | 36 | from utils import utils |
37 | 37 | import extract |
38 | | -from wikitree import xml |
| 38 | +import wikitree.parser |
39 | 39 | from bots import bots |
40 | 40 | |
41 | 41 | |
Index: trunk/tools/editor_trends/etl/extracter.py |
— | — | @@ -0,0 +1,283 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__author__email = 'dvanliere at gmail dot com' |
| 18 | +__date__ = '2010-12-13' |
| 19 | +__version__ = '0.1' |
| 20 | + |
| 21 | +import sys |
| 22 | +import re |
| 23 | +import json |
| 24 | +import os |
| 25 | +import xml.etree.cElementTree as cElementTree |
| 26 | + |
| 27 | +sys.path.append('..') |
| 28 | +import configuration |
| 29 | +settings = configuration.Settings() |
| 30 | + |
| 31 | +import wikitree.parser |
| 32 | +from bots import bots |
| 33 | +from utils import utils |
| 34 | + |
| 35 | +try: |
| 36 | +    import psyco |
| 37 | +    psyco.full() |
| 38 | +except ImportError: |
| 39 | +    pass |
| 40 | + |
| 41 | + |
| 42 | +RE_NUMERIC_CHARACTER = re.compile('&#(\d+);') |
| 43 | + |
| 44 | + |
| 45 | +def remove_numeric_character_references(text): |
| 46 | +    return re.sub(RE_NUMERIC_CHARACTER, lenient_deccharref, text).encode('utf-8') |
| 47 | + |
| 48 | + |
| 49 | +def lenient_deccharref(m): |
| 50 | +    try: |
| 51 | +        return unichr(int(m.group(1))) |
| 52 | +    except ValueError: |
| 53 | +        ''' |
| 54 | +        There are a few articles that raise a Value Error here, the reason is |
| 55 | +        that I am using a narrow Python build (UCS2) instead of a wide build |
| 56 | +        (UCS4). The quick fix is to return an empty string... |
| 57 | +        Real solution is to rebuild Python with UCS4 support..... |
| 58 | +        ''' |
| 59 | +        return '' |
| 60 | + |
| 61 | + |
| 62 | +def remove_namespace(element, namespace): |
| 63 | +    '''Remove namespace from the XML document.''' |
| 64 | +    ns = u'{%s}' % namespace |
| 65 | +    nsl = len(ns) |
| 66 | +    for elem in element.getiterator(): |
| 67 | +        if elem.tag.startswith(ns): |
| 68 | +            elem.tag = elem.tag[nsl:] |
| 69 | +    return element |
| 70 | + |
| 71 | + |
| 72 | +def load_namespace(language): |
| 73 | +    file = '%s_ns.json' % language |
| 74 | +    fh = utils.create_txt_filehandle(settings.namespace_location, file, 'r', settings.encoding) |
| 75 | +    ns = json.load(fh) |
| 76 | +    fh.close() |
| 77 | +    ns = ns['query']['namespaces'] |
| 78 | +    return ns |
| 79 | + |
| 80 | + |
| 81 | +def build_namespaces_locale(namespaces, include=['0']): |
| 82 | +    ''' |
| 83 | +    @include is a list of namespace keys that should not be ignored, the default |
| 84 | +    setting is to ignore all namespaces except the main namespace. |
| 85 | +    ''' |
| 86 | +    ns = [] |
| 87 | +    for namespace in namespaces: |
| 88 | +        if namespace not in include: |
| 89 | +            value = namespaces[namespace].get(u'*', None) |
| 90 | +            ns.append(value) |
| 91 | +    return ns |
| 92 | + |
| 93 | + |
| 94 | +def parse_comments(revisions, function): |
| 95 | +    for revision in revisions: |
| 96 | +        comment = revision.find('{%s}comment' % settings.xml_namespace) |
| 97 | +        #timestamp = revision.find('{%s}timestamp' % settings.xml_namespace).text |
| 98 | +        if comment != None and comment.text != None: |
| 99 | +            comment.text = function(comment.text) |
| 100 | +    return revisions |
| 101 | + |
| 102 | + |
| 103 | +def is_article_main_namespace(elem, namespace): |
| 104 | +    ''' |
| 105 | +    checks whether the article belongs to the main namespace |
| 106 | +    ''' |
| 107 | +    title = elem.text |
| 108 | +    for ns in namespace: |
| 109 | +        if title.startswith(ns): |
| 110 | +            return False |
| 111 | +    return True |
| 112 | + |
| 113 | +def validate_hostname(address): |
| 114 | +    ''' |
| 115 | +    This is not a foolproof solution at all. The problem is that it's really hard |
| 116 | +    to determine whether a string is a hostname or not **reliably**. This is a |
| 117 | +    very fast rule of thumb. Will lead to false positives, but that's life :) |
| 118 | +    ''' |
| 119 | +    parts = address.split(".") |
| 120 | +    if len(parts) > 2: |
| 121 | +        return True |
| 122 | +    else: |
| 123 | +        return False |
| 124 | + |
| 125 | + |
| 126 | +def validate_ip(address): |
| 127 | +    parts = address.split(".") |
| 128 | +    if len(parts) != 4: |
| 129 | +        return False |
| 130 | +    parts = parts[:3] |
| 131 | +    for item in parts: |
| 132 | +        try: |
| 133 | +            if not 0 <= int(item) <= 255: |
| 134 | +                return False |
| 135 | +        except ValueError: |
| 136 | +            return False |
| 137 | +    return True |
| 138 | + |
| 139 | + |
| 140 | +def determine_username_is_bot(contributor, **kwargs): |
| 141 | +    ''' |
| 142 | +    #contributor is an xml element containing the id of the contributor |
| 143 | +    @bots should have a dict with all the bot ids and bot names |
| 144 | +    @Return False if username id is not in bot dict id or True if username id |
| 145 | +    is a bot id. |
| 146 | +    ''' |
| 147 | +    bots = kwargs.get('bots') |
| 148 | +    username = contributor.find('username') |
| 149 | +    if username == None: |
| 150 | +        return 0 |
| 151 | +    else: |
| 152 | +        if username in bots: |
| 153 | +            return 1 |
| 154 | +        else: |
| 155 | +            return 0 |
| 156 | + |
| 157 | + |
| 158 | +def extract_username(contributor, **kwargs): |
| 159 | +    contributor = contributor.find('username') |
| 160 | +    if contributor != None: |
| 161 | +        return contributor.text |
| 162 | +    else: |
| 163 | +        return None |
| 164 | + |
| 165 | + |
| 166 | +def extract_contributor_id(contributor, **kwargs): |
| 167 | +    ''' |
| 168 | +    @contributor is the xml contributor node containing a number of attributes |
| 169 | +    Currently, we are only interested in registered contributors, hence we |
| 170 | +    ignore anonymous editors. |
| 171 | +    ''' |
| 172 | +    if contributor.get('deleted'): |
| 173 | +        return None # ASK: Not sure if this is the best way to code deleted contributors. |
| 174 | +    elem = contributor.find('id') |
| 175 | +    if elem != None: |
| 176 | +        return {'id':elem.text} |
| 177 | +    else: |
| 178 | +        elem = contributor.find('ip') |
| 179 | +        if elem != None and elem.text != None and validate_ip(elem.text) == False and validate_hostname(elem.text) == False: |
| 180 | +            return {'username':elem.text, 'id': elem.text} |
| 181 | +        else: |
| 182 | +            return None |
| 183 | + |
| 184 | + |
| 185 | +def output_editor_information(revisions, page, bots): |
| 186 | +    ''' |
| 187 | +    @elem is an XML element containing 1 revision from a page |
| 188 | +    @output is where to store the data, a filehandle |
| 189 | +    @**kwargs contains extra information |
| 190 | + |
| 191 | +    the variable tags determines which attributes are being parsed, the values in |
| 192 | +    this dictionary are the functions used to extract the data. |
| 193 | +    ''' |
| 194 | +    headers = ['id', 'date', 'article', 'username'] |
| 195 | +    tags = {'contributor': {'id': extract_contributor_id, |
| 196 | +                            'bot': determine_username_is_bot, |
| 197 | +                            'username': extract_username, |
| 198 | +                            }, |
| 199 | +            'timestamp': {'date': wikitree.parser.extract_text}, |
| 200 | +            } |
| 201 | +    vars = {} |
| 202 | +    flat = [] |
| 203 | + |
| 204 | +    for x, revision in enumerate(revisions): |
| 205 | +        #print len(revision.getchildren()) |
| 206 | +        vars[x] = {} |
| 207 | +        vars[x]['article'] = page |
| 208 | +        for tag in tags: |
| 209 | +            el = revision.find('%s' % tag) |
| 210 | +            if el == None: |
| 211 | +                #print cElementTree.tostring(revision, settings.encoding) |
| 212 | +                del vars[x] |
| 213 | +                break |
| 214 | +            for function in tags[tag].keys(): |
| 215 | +                f = tags[tag][function] |
| 216 | +                value = f(el, bots=bots) |
| 217 | +                if type(value) == type({}): |
| 218 | +                    for kw in value: |
| 219 | +                        vars[x][kw] = value[kw] |
| 220 | +                else: |
| 221 | +                    vars[x][function] = value |
| 222 | + |
| 223 | +    ''' |
| 224 | +    This loop determines for each observation whether it should be stored or not. |
| 225 | +    ''' |
| 226 | +    for x in vars: |
| 227 | +        if vars[x]['bot'] == 1 or vars[x]['id'] == None or vars[x]['username'] == None: |
| 228 | +            continue |
| 229 | +        else: |
| 230 | +            f = [] |
| 231 | +            for head in headers: |
| 232 | +                f.append(vars[x][head]) |
| 233 | +            flat.append(f) |
| 234 | + |
| 235 | +    return flat |
| 236 | + |
| 237 | + |
| 238 | +def parse_dumpfile(project, language_code, namespaces=['0']): |
| 239 | +    bot_ids = bots.retrieve_bots(language_code) |
| 240 | +    ns = load_namespace(language_code) |
| 241 | +    ns = build_namespaces_locale(ns, namespaces) |
| 242 | + |
| 243 | +    location = os.path.join(settings.input_location, language_code, project) |
| 244 | +    fh = utils.create_txt_filehandle(location, 'enwiki-latest-stub-meta-history.xml', 'r', settings.encoding) |
| 245 | +    for page in wikitree.parser.read_input(fh): |
| 246 | +        title = page.find('title') |
| 247 | +        if is_article_main_namespace(title, ns): |
| 248 | +            #cElementTree.dump(page) |
| 249 | +            article_id = page.find('id').text |
| 250 | +            revisions = page.findall('revision') |
| 251 | +            revisions = parse_comments(revisions, remove_numeric_character_references) |
| 252 | +            output = output_editor_information(revisions, article_id, bot_ids) |
| 253 | +            write_output(output, project, language_code) |
| 254 | +        page.clear() |
| 255 | +    fh.close() |
| 256 | + |
| 257 | + |
| 258 | +def write_output(output, project, language_code): |
| 259 | +    location = os.path.join(settings.input_location, language_code, project, 'txt') |
| 260 | +    for o in output: |
| 261 | +        file = '%s.csv' % hash(o[0]) |
| 262 | +        try: |
| 263 | +            fh = utils.create_txt_filehandle(location, file, 'a', settings.encoding) |
| 264 | +            utils.write_list_to_csv(o, fh) |
| 265 | +            fh.close() |
| 266 | +        except Exception, error: |
| 267 | +            print error |
| 268 | + |
| 269 | + |
| 270 | +def hash(id): |
| 271 | +    ''' |
| 272 | +    A very simple hash function based on modulo. The except clause has been |
| 273 | +    addde because there are instances where the username is stored in userid |
| 274 | +    tag and hence that's a string and not an integer. |
| 275 | +    ''' |
| 276 | +    try: |
| 277 | +        return int(id) % 500 |
| 278 | +    except: |
| 279 | +        return sum([ord(i) for i in id]) % 500 |
| 280 | + |
| 281 | +if __name__ == '__main__': |
| 282 | +    project = 'wiki' |
| 283 | +    language_code = 'en' |
| 284 | +    parse_dumpfile(project, language_code) |
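The new etl/extracter.py reads pages straight from the stub dump via wikitree.parser.read_input() rather than from pre-split chunks: it filters to the main namespace, strips numeric character references from edit comments, drops bot and anonymous edits, and appends one CSV row per surviving revision. write_output() shards those rows over at most 500 files keyed on the editor id, so every edit by the same editor lands in the same file for the later sort and store steps. The bucketing logic in isolation (renamed here to bucket() to avoid shadowing the builtin hash; behaviour as in the diff):

    def bucket(id):
        '''Map an editor id to one of 500 output buckets; usernames that end up
        in the id field fall back to a character sum instead of int().'''
        try:
            return int(id) % 500
        except (ValueError, TypeError):
            return sum(ord(c) for c in id) % 500

    bucket('12345')    # -> 345, so the row is appended to 345.csv
    bucket('SomeBot')  # -> a stable value in 0..499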
Index: trunk/tools/editor_trends/etl/models.py |
— | — | @@ -27,7 +27,7 @@ |
28 | 28 | |
29 | 29 | from utils import models |
30 | 30 | from utils import utils |
31 | | -from wikitree import xml |
| 31 | +import wikitree |
32 | 32 | |
33 | 33 | class TXTFile(object): |
34 | 34 | |
Index: trunk/tools/editor_trends/configuration.py |
— | — | @@ -57,7 +57,7 @@ |
58 | 58 | #Change this to match your computers configuration (RAM / CPU) |
59 | 59 | self.minimum_python_version = (2, 6) |
60 | 60 | self.wp_dump_location = 'http://download.wikimedia.org' |
61 | | - self.xml_namespace = 'http://www.mediawiki.org/xml/export-0.3/' |
| 61 | + self.xml_namespace = 'http://www.mediawiki.org/xml/export-0.4/' |
62 | 62 | self.ascii_extensions = ['txt', 'csv', 'xml', 'sql', 'json'] |
63 | 63 | self.windows_register = {'7z.exe': 'Software\\7-Zip', } |
64 | 64 | #Extensions of ascii files, this is used to determine the filemode to use |
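The xml_namespace constant moves from the export-0.3 to the export-0.4 schema. ElementTree qualifies every tag with the namespace URI declared in the document, so lookups built from this constant (as in parse_comments() in etl/extracter.py) only match when the constant tracks the schema version of the dumps actually being parsed; a stale value makes such find() calls quietly return None. Example of a qualified lookup using the new value:

    xml_namespace = 'http://www.mediawiki.org/xml/export-0.4/'
    comment = revision.find('{%s}comment' % xml_namespace)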
Index: trunk/tools/editor_trends/bots/bots.py |
— | — | @@ -28,7 +28,7 @@ |
29 | 29 | |
30 | 30 | import configuration |
31 | 31 | settings = configuration.Settings() |
32 | | -from wikitree import xml |
| 32 | +import wikitree |
33 | 33 | from database import db |
34 | 34 | from utils import utils |
35 | 35 | #from etl import extract |