Index: trunk/tools/editor_trends/analyses/adhoc/match_talkpage_article.py |
— | — | @@ -1,72 +0,0 @@ |
2 | | -#!/usr/bin/python |
3 | | -# -*- coding: utf-8 -*- |
4 | | -''' |
5 | | -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
6 | | -This program is free software; you can redistribute it and/or |
7 | | -modify it under the terms of the GNU General Public License version 2 |
8 | | -as published by the Free Software Foundation. |
9 | | -This program is distributed in the hope that it will be useful, |
10 | | -but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | | -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
12 | | -See the GNU General Public License for more details, at |
13 | | -http://www.fsf.org/licenses/gpl.html |
14 | | -''' |
15 | | - |
16 | | -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
17 | | -__email__ = 'dvanliere at gmail dot com' |
18 | | -__date__ = '2011-01-07' |
19 | | -__version__ = '0.1' |
20 | | - |
21 | | -import sys |
22 | | -import os |
23 | | -if '..' not in sys.path: |
24 | | - sys.path.append('..') |
25 | | - |
26 | | -from classes import settings |
27 | | -settings = settings.Settings() |
28 | | -from etl import extracter |
29 | | -from utils import file_utils |
30 | | -import wikitree |
31 | | - |
32 | | -try: |
33 | | - import psyco |
34 | | - psyco.full() |
35 | | -except ImportError: |
36 | | - pass |
37 | | - |
38 | | -class Article: |
39 | | - def __init__(self, title, id, talk_id=None): |
40 | | - self.title = title |
41 | | - self.id = id |
42 | | - self.talk_id = talk_id |
43 | | - |
44 | | - |
45 | | -def parse_dumpfile(project, language_code, namespaces=['0', '1']): |
46 | | - articles = {} |
47 | | - ns = extracter.load_namespace(language_code) |
48 | | - non_valid_namespaces = extracter.build_namespaces_locale(ns, namespaces) |
49 | | - |
50 | | - |
51 | | - location = os.path.join(settings.input_location, language_code, project) |
52 | | - fh = file_utils.create_txt_filehandle(location, |
53 | | - '%s%s-latest-stub-meta-history.xml' % (language_code, project), |
54 | | - 'r', 'utf-8') |
55 | | - |
56 | | - for page, article_size in wikitree.parser.read_input(fh): |
57 | | - title = page.find('title') |
58 | | - if extracter.verify_article_belongs_namespace(title, non_valid_namespaces): |
59 | | - article_id = page.find('id').text |
60 | | - title = title.text |
61 | | - if title.startswith(ns['1'].get('canonical')): |
62 | | - namespace = 'Talk' |
63 | | - article = articles.get(article_id, Article(None, None, article_id)) |
64 | | - article.talk_id = article_id |
65 | | - else: |
66 | | - namespace = 'Main' |
67 | | - article = articles.get(article_id, Article(title, article_id)) |
68 | | - articles[article_id] = article |
69 | | - |
70 | | - file_utils.store_object(articles, settings.binary_location, 'talk2article.bin') |
71 | | - |
72 | | -if __name__ == '__main__': |
73 | | - parse_dumpfile('wiki', 'en') |
Index: trunk/tools/editor_trends/etl/shaper.py |
— | — | @@ -1,72 +0,0 @@ |
2 | | -#!/usr/bin/python |
3 | | -# -*- coding: utf-8 -*- |
4 | | -''' |
5 | | -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
6 | | -This program is free software; you can redistribute it and/or |
7 | | -modify it under the terms of the GNU General Public License version 2 |
8 | | -as published by the Free Software Foundation. |
9 | | -This program is distributed in the hope that it will be useful, |
10 | | -but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | | -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
12 | | -See the GNU General Public License for more details, at |
13 | | -http://www.fsf.org/licenses/gpl.html |
14 | | -''' |
15 | | - |
16 | | - |
17 | | -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
18 | | -__email__ = 'dvanliere at gmail dot com' |
19 | | -__date__ = '2010-11-24' |
20 | | -__version__ = '0.1' |
21 | | - |
22 | | -import datetime |
23 | | -import math |
24 | | - |
25 | | - |
26 | | -def add_datatype(datatype=0.0): |
27 | | - if datatype == 'dict': |
28 | | - d = dict() |
29 | | - elif datatype == 'list': |
30 | | - d = list() |
31 | | - elif datatype == 'set': |
32 | | - d = set() |
33 | | - else: |
34 | | - d = 0.0 |
35 | | - return d |
36 | | - |
37 | | - |
38 | | -def create_datacontainer(first_year, final_year, datatype='dict'): |
39 | | - ''' |
40 | | - This function initializes an empty dictionary with as key the year (starting |
41 | | - 2001 and running through) and as value @datatype, in most cases this will |
42 | | - be zero so the dictionary will act as a running tally for a variable but |
43 | | - @datatype can also a list, [], or a dictionary, {}, or a set, set(). |
44 | | - ''' |
45 | | - data = {} |
46 | | - for x in xrange(first_year, final_year): |
47 | | - data[str(x)] = add_datatype(datatype) |
48 | | - return data |
49 | | - |
50 | | - |
51 | | -def add_windows_to_datacontainer(datacontainer, windows): |
52 | | - for dc in datacontainer: |
53 | | - for w in windows: |
54 | | - datacontainer[dc][w] = add_datatype() |
55 | | - |
56 | | - return datacontainer |
57 | | - |
58 | | - |
59 | | -def add_months_to_datacontainer(datacontainer, datatype): |
60 | | - for dc in datacontainer: |
61 | | - datacontainer[dc] = {} |
62 | | - for x in xrange(1, 13): |
63 | | - datacontainer[dc][x] = add_datatype(datatype) |
64 | | - |
65 | | - return datacontainer |
66 | | - |
67 | | - |
68 | | -def add_years_to_datacontainer(first_year, final_year, datacontainer, datatype): |
69 | | - for dc in datacontainer: |
70 | | - datacontainer[dc] = {} |
71 | | - for x in range(first_year, final_year): |
72 | | - datacontainer[dc][x] = datatype |
73 | | - return datacontainer |
Index: trunk/tools/editor_trends/etl/enricher.py |
— | — | @@ -86,13 +86,14 @@ |
87 | 87 | } |
88 | 88 | |
89 | 89 | class Statistics: |
90 | | - def __init__(self): |
| 90 | + def __init__(self, process_id): |
| 91 | + self.process_id = process_id |
91 | 92 | self.count_articles = 0 |
92 | 93 | self.count_revisions = 0 |
93 | 94 | |
94 | 95 | def summary(self): |
95 | | - print 'Number of articles: %s' % self.count_articles |
96 | | - print 'Number of revisions: %s' % self.count_revisions |
| 96 | + print 'Worker %s: Number of articles: %s' % (self.process_id, self.count_articles) |
| 97 | + print 'Worker %s: Number of revisions: %s' % (self.process_id, self.count_revisions) |
97 | 98 | |
98 | 99 | class Dummy: |
99 | 100 | pass |
— | — | @@ -108,20 +109,20 @@ |
109 | 110 | |
110 | 111 | |
111 | 112 | class Buffer: |
112 | | - def __init__(self, storage, processs_id, rts=None, filehandles=None, locks=None): |
| 113 | + def __init__(self, storage, process_id, rts=None, filehandles=None, locks=None): |
113 | 114 | assert storage == 'cassandra' or storage == 'mongo' or storage == 'csv', \ |
114 | 115 | 'Valid storage options are cassandra and mongo.' |
115 | 116 | self.storage = storage |
116 | 117 | self.revisions = {} |
117 | 118 | self.comments = {} |
118 | 119 | self.titles = {} |
119 | | - self.processs_id = processs_id |
| 120 | + self.process_id = process_id |
120 | 121 | self.keyspace_name = 'enwiki' |
121 | 122 | self.keys = ['revision_id', 'article_id', 'id', 'username', 'namespace', |
122 | 123 | 'title', 'timestamp', 'hash', 'revert', 'bot', 'cur_size', |
123 | 124 | 'delta'] |
124 | 125 | self.setup_storage() |
125 | | - self.stats = Statistics() |
| 126 | + self.stats = Statistics(self.process_id) |
126 | 127 | if storage == 'csv' and locks != None: |
127 | 128 | self.rts = rts |
128 | 129 | self.lock1 = locks[0] #lock for generic data |
— | — | @@ -579,8 +580,8 @@ |
580 | 581 | 5: 'Wikipedia Talk', |
581 | 582 | 1: 'Talk', |
582 | 583 | 2: 'User', |
583 | | - 4: 'Wikipedia' |
584 | | - } |
| 584 | + 4: 'Wikipedia'} |
| 585 | + |
585 | 586 | title = parse_title(article['title']) |
586 | 587 | namespaces = article['namespaces'] |
587 | 588 | namespace = determine_namespace(title, namespaces, include_ns, EXCLUDE_NAMESPACE) |
— | — | @@ -641,30 +642,36 @@ |
642 | 643 | article = {} |
643 | 644 | article['revisions'] = [] |
644 | 645 | id = False |
645 | | - for event, elem in context: |
646 | | - if event == 'end' and elem.tag.endswith('siteinfo'): |
647 | | - xml_namespace = determine_xml_namespace(elem) |
648 | | - namespaces = create_namespace_dict(elem, xml_namespace) |
649 | | - article['namespaces'] = namespaces |
650 | | - elif event == 'end' and elem.tag.endswith('title'): |
651 | | - article['title'] = elem |
652 | | - elif event == 'end' and elem.tag.endswith('revision'): |
653 | | - article['revisions'].append(elem) |
654 | | - elif event == 'end' and elem.tag.endswith('id') and id == False: |
655 | | - article['id'] = elem |
656 | | - id = True |
657 | | - elif event == 'end' and elem.tag.endswith('page'): |
658 | | - yield article, xml_namespace |
659 | | - elem.clear() |
660 | | - article = {} |
661 | | - article['revisions'] = [] |
662 | | - article['namespaces'] = namespaces |
663 | | - id = False |
664 | | - elif rts.kaggle == True and event == 'end': |
665 | | - print 'I am cleaning up' |
666 | | - elem.clear() |
667 | 646 | |
| 647 | + try: |
| 648 | + for event, elem in context: |
| 649 | + if event == 'end' and elem.tag.endswith('siteinfo'): |
| 650 | + xml_namespace = determine_xml_namespace(elem) |
| 651 | + namespaces = create_namespace_dict(elem, xml_namespace) |
| 652 | + article['namespaces'] = namespaces |
| 653 | + elif event == 'end' and elem.tag.endswith('title'): |
| 654 | + article['title'] = elem |
| 655 | + elif event == 'end' and elem.tag.endswith('revision'): |
| 656 | + article['revisions'].append(elem) |
| 657 | + elif event == 'end' and elem.tag.endswith('id') and id == False: |
| 658 | + article['id'] = elem |
| 659 | + id = True |
| 660 | + elif event == 'end' and elem.tag.endswith('page'): |
| 661 | + yield article, xml_namespace |
| 662 | + elem.clear() |
| 663 | + article = {} |
| 664 | + article['revisions'] = [] |
| 665 | + article['namespaces'] = namespaces |
| 666 | + id = False |
| 667 | + elif rts.kaggle == True and event == 'end': |
| 668 | + print 'I am cleaning up' |
| 669 | + elem.clear() |
| 670 | + except SyntaxError, error: |
| 671 | + print 'Encountered invalid XML tag. Error message: %s' % error |
| 672 | + dump(elem) |
| 673 | + sys.exit(-1) |
668 | 674 | |
| 675 | + |
669 | 676 | def stream_raw_xml(input_queue, storage, process_id, function, dataset, locks, rts): |
670 | 677 | bots = detector.retrieve_bots('en') |
671 | 678 | path = os.path.join(rts.location, 'txt') |
— | — | @@ -708,7 +715,8 @@ |
709 | 716 | fh.close() |
710 | 717 | |
711 | 718 | t1 = datetime.datetime.now() |
712 | | - print 'Processing of %s took %s' % (filename, (t1 - t0)) |
| 719 | + print 'Worker %s: Processing of %s took %s' % (process_id, filename, (t1 - t0)) |
| 720 | + print 'There are %s files left in the queue' % (input_queue.qsize()) |
713 | 721 | t0 = t1 |
714 | 722 | |
715 | 723 | if dataset == 'training': |
— | — | @@ -725,7 +733,7 @@ |
726 | 734 | filename = 'counts_%s.bin' % filename |
727 | 735 | file_utils.store_object(counts, location, filename) |
728 | 736 | |
729 | | - print 'Finished parsing bz2 archives' |
| 737 | + print 'Finished parsing Wikipedia dump files.' |
730 | 738 | |
731 | 739 | |
732 | 740 | def setup(storage, rts=None): |
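For illustration, a minimal standalone sketch of the guarded iterparse pattern this hunk introduces in enricher.py: the whole event loop is wrapped in a try/except so that a malformed tag aborts the file with a diagnostic instead of failing silently halfway through. The function name iter_pages and the file handle fh are hypothetical; the real code also tracks namespaces, ids and the kaggle flag.

    import sys
    import xml.etree.cElementTree as cElementTree

    def iter_pages(fh):
        '''Yield every <page> element from a dump file handle, one at a time.'''
        context = iter(cElementTree.iterparse(fh, events=('end',)))
        try:
            for event, elem in context:
                if event == 'end' and elem.tag.endswith('page'):
                    yield elem
                    elem.clear()   # free memory before moving to the next page
        except SyntaxError, error:
            # cElementTree reports malformed XML as a SyntaxError (ParseError is a
            # subclass of SyntaxError in Python 2.7), matching the patch's except clause.
            print 'Encountered invalid XML tag. Error message: %s' % error
            sys.exit(-1)

Because the except wraps the whole loop rather than a single parse call, one bad page ends processing of that file; the patch prefers a hard stop (sys.exit(-1)) so a corrupt archive is noticed immediately.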
Index: trunk/tools/editor_trends/etl/transformer.py |
— | — | @@ -28,9 +28,10 @@ |
29 | 29 | from database import db |
30 | 30 | from utils import file_utils |
31 | 31 | from utils import messages |
| 32 | +from utils import data_converter |
32 | 33 | from classes import consumers |
33 | | -import shaper |
34 | 34 | |
| 35 | + |
35 | 36 | try: |
36 | 37 | import psyco |
37 | 38 | psyco.full() |
— | — | @@ -84,7 +85,7 @@ |
85 | 86 | edit_count = determine_number_edits(edits, first_year, final_year) |
86 | 87 | |
87 | 88 | totals = {} |
88 | | - counts = shaper.create_datacontainer(first_year, final_year) |
| 89 | + counts = data_converter.create_datacontainer(first_year, final_year) |
89 | 90 | totals = calculate_totals(totals, counts, character_count, 'character_count') |
90 | 91 | totals = calculate_totals(totals, counts, revert_count, 'revert_count') |
91 | 92 | totals = calculate_totals(totals, counts, article_count, 'article_count') |
— | — | @@ -95,12 +96,14 @@ |
96 | 97 | new_wikipedian = edits[cutoff]['date'] |
97 | 98 | else: |
98 | 99 | new_wikipedian = False |
| 100 | + cum_edit_count = len(edits) |
99 | 101 | first_edit = edits[0]['date'] |
100 | 102 | final_edit = edits[-1]['date'] |
101 | 103 | |
102 | 104 | self.output_db.insert({'editor': self.id, |
103 | 105 | 'username': username, |
104 | 106 | 'new_wikipedian': new_wikipedian, |
| 107 | + 'cum_edit_count': cum_edit_count, |
105 | 108 | 'final_edit': final_edit, |
106 | 109 | 'first_edit': first_edit, |
107 | 110 | 'last_edit_by_year': last_edit_by_year, |
— | — | @@ -148,8 +151,8 @@ |
149 | 152 | |
150 | 153 | |
151 | 154 | def determine_number_edits(edits, first_year, final_year): |
152 | | - dc = shaper.create_datacontainer(first_year, final_year) |
153 | | - dc = shaper.add_months_to_datacontainer(dc, 'dict') |
| 155 | + dc = data_converter.create_datacontainer(first_year, final_year) |
| 156 | + dc = data_converter.add_months_to_datacontainer(dc, 'dict') |
154 | 157 | for edit in edits: |
155 | 158 | ns = edit['ns'] |
156 | 159 | year, month = str(edit['date'].year), edit['date'].month |
— | — | @@ -161,8 +164,8 @@ |
162 | 165 | |
163 | 166 | |
164 | 167 | def determine_articles_workedon(edits, first_year, final_year): |
165 | | - dc = shaper.create_datacontainer(first_year, final_year) |
166 | | - dc = shaper.add_months_to_datacontainer(dc, 'dict') |
| 168 | + dc = data_converter.create_datacontainer(first_year, final_year) |
| 169 | + dc = data_converter.add_months_to_datacontainer(dc, 'dict') |
167 | 170 | for year in edits: |
168 | 171 | for edit in edits[year]: |
169 | 172 | month = edit['date'].month |
— | — | @@ -179,8 +182,8 @@ |
180 | 183 | |
181 | 184 | |
182 | 185 | def determine_namespaces_workedon(edits, first_year, final_year): |
183 | | - dc = shaper.create_datacontainer(first_year, final_year) |
184 | | - dc = shaper.add_months_to_datacontainer(dc, 'set') |
| 186 | + dc = data_converter.create_datacontainer(first_year, final_year) |
| 187 | + dc = data_converter.add_months_to_datacontainer(dc, 'set') |
185 | 188 | for year in edits: |
186 | 189 | for edit in edits[year]: |
187 | 190 | month = edit['date'].month |
— | — | @@ -194,8 +197,8 @@ |
195 | 198 | |
196 | 199 | |
197 | 200 | def determine_number_reverts(edits, first_year, final_year): |
198 | | - dc = shaper.create_datacontainer(first_year, final_year) |
199 | | - dc = shaper.add_months_to_datacontainer(dc, 'dict') |
| 201 | + dc = data_converter.create_datacontainer(first_year, final_year) |
| 202 | + dc = data_converter.add_months_to_datacontainer(dc, 'dict') |
200 | 203 | for year in edits: |
201 | 204 | for edit in edits[year]: |
202 | 205 | month = edit['date'].month |
— | — | @@ -213,8 +216,8 @@ |
214 | 217 | This function counts the number of characters added and remove by year |
215 | 218 | by month by namespace for a particular editor. |
216 | 219 | ''' |
217 | | - dc = shaper.create_datacontainer(first_year, final_year) |
218 | | - dc = shaper.add_months_to_datacontainer(dc, 'dict') |
| 220 | + dc = data_converter.create_datacontainer(first_year, final_year) |
| 221 | + dc = data_converter.add_months_to_datacontainer(dc, 'dict') |
219 | 222 | for year in edits: |
220 | 223 | for edit in edits[year]: |
221 | 224 | month = edit['date'].month |
— | — | @@ -240,7 +243,7 @@ |
241 | 244 | |
242 | 245 | |
243 | 246 | def determine_last_edit_by_year(edits, first_year, final_year): |
244 | | - dc = shaper.create_datacontainer(first_year, final_year, 0) |
| 247 | + dc = data_converter.create_datacontainer(first_year, final_year, 0) |
245 | 248 | for year in edits: |
246 | 249 | for edit in edits[year]: |
247 | 250 | date = str(edit['date'].year) |
— | — | @@ -257,8 +260,8 @@ |
258 | 261 | This function counts the number of unique articles by year edited by a |
259 | 262 | particular editor. |
260 | 263 | ''' |
261 | | - dc = shaper.create_datacontainer(first_year, final_year) |
262 | | - dc = shaper.add_months_to_datacontainer(dc, 'dict') |
| 264 | + dc = data_converter.create_datacontainer(first_year, final_year) |
| 265 | + dc = data_converter.add_months_to_datacontainer(dc, 'dict') |
263 | 266 | for year in articles_edited: |
264 | 267 | for month in articles_edited[year]: |
265 | 268 | for ns in articles_edited[year][month]: |
Index: trunk/tools/editor_trends/classes/runtime_settings.py |
— | — | @@ -115,10 +115,13 @@ |
116 | 116 | if [True for kw in keywords if kw.find('=') > -1] != []: |
117 | 117 | for kw in keywords: |
118 | 118 | key, value = kw.split('=') |
119 | | - try: |
120 | | - value = int(value) |
121 | | - except ValueError: |
122 | | - pass |
| 119 | + if value.find(';') > -1: |
| 120 | + value = value.split(';') |
| 121 | + else: |
| 122 | + try: |
| 123 | + value = int(value) |
| 124 | + except ValueError: |
| 125 | + pass |
123 | 126 | d[key] = value |
124 | 127 | return d |
125 | 128 | |
Index: trunk/tools/editor_trends/classes/analytics.py |
— | — | @@ -28,7 +28,7 @@ |
29 | 29 | |
30 | 30 | class Replicator: |
31 | 31 | def __init__(self, plugin, time_unit, cutoff=None, cum_cutoff=None, **kwargs): |
32 | | - #this is an ugly hack to prevent a circular import problem |
| 32 | + #TODO this is an ugly hack to prevent a circular import problem |
33 | 33 | #this needs a better fix. |
34 | 34 | import manage |
35 | 35 | |
Index: trunk/tools/editor_trends/classes/bots.py |
— | — | @@ -27,7 +27,7 @@ |
28 | 28 | from classes import settings |
29 | 29 | settings = settings.Settings() |
30 | 30 | |
31 | | -from etl import shaper |
| 31 | +from utils import data_converter |
32 | 32 | from utils import file_utils |
33 | 33 | |
34 | 34 | |
— | — | @@ -36,7 +36,7 @@ |
37 | 37 | def __init__(self, name, **kwargs): |
38 | 38 | self.name = name |
39 | 39 | self.projects = [] |
40 | | - self.time = shaper.create_datacontainer(datatype='list') |
| 40 | + self.time = data_converter.create_datacontainer(datatype='list') |
41 | 41 | self.verified = True |
42 | 42 | for kw in kwargs: |
43 | 43 | setattr(self, kw, kwargs[kw]) |
— | — | @@ -45,7 +45,7 @@ |
46 | 46 | return self.name |
47 | 47 | |
48 | 48 | def hours_active(self): |
49 | | - self.clock = shaper.create_clock() |
| 49 | + self.clock = data_converter.create_clock() |
50 | 50 | years = self.time.keys() |
51 | 51 | for year in years: |
52 | 52 | for obs in self.time[year]: |
Index: trunk/tools/editor_trends/utils/data_converter.py |
— | — | @@ -19,22 +19,77 @@ |
20 | 20 | __version__ = '0.1' |
21 | 21 | |
22 | 22 | import datetime |
| 23 | +import datetime |
| 24 | +import math |
23 | 25 | |
| 26 | + |
| 27 | +def add_datatype(datatype=0.0): |
| 28 | + if datatype == 'dict': |
| 29 | + d = dict() |
| 30 | + elif datatype == 'list': |
| 31 | + d = list() |
| 32 | + elif datatype == 'set': |
| 33 | + d = set() |
| 34 | + else: |
| 35 | + d = 0.0 |
| 36 | + return d |
| 37 | + |
| 38 | + |
| 39 | +def create_datacontainer(first_year, final_year, datatype='dict'): |
| 40 | + ''' |
| 41 | + This function initializes a dictionary keyed by year (running from |
| 42 | + first_year up to, but not including, final_year) with @datatype as value. |
| 43 | + In most cases this will be 0.0, so the dictionary acts as a running tally |
| 44 | + for a variable, but @datatype can also be a list, a dict or a set. |
| 45 | + ''' |
| 46 | + data = {} |
| 47 | + for x in xrange(first_year, final_year): |
| 48 | + data[str(x)] = add_datatype(datatype) |
| 49 | + return data |
| 50 | + |
| 51 | + |
| 52 | +def add_windows_to_datacontainer(datacontainer, windows): |
| 53 | + for dc in datacontainer: |
| 54 | + for w in windows: |
| 55 | + datacontainer[dc][w] = add_datatype() |
| 56 | + |
| 57 | + return datacontainer |
| 58 | + |
| 59 | + |
| 60 | +def add_months_to_datacontainer(datacontainer, datatype): |
| 61 | + for dc in datacontainer: |
| 62 | + datacontainer[dc] = {} |
| 63 | + for x in xrange(1, 13): |
| 64 | + datacontainer[dc][x] = add_datatype(datatype) |
| 65 | + |
| 66 | + return datacontainer |
| 67 | + |
| 68 | + |
| 69 | +def add_years_to_datacontainer(first_year, final_year, datacontainer, datatype): |
| 70 | + for dc in datacontainer: |
| 71 | + datacontainer[dc] = {} |
| 72 | + for x in range(first_year, final_year): |
| 73 | + datacontainer[dc][x] = datatype |
| 74 | + return datacontainer |
| 75 | + |
| 76 | + |
24 | 77 | def create_windows(var, break_down_first_year=True): |
25 | 78 | ''' |
26 | 79 | This function creates a list of months. If break_down_first_year = True then |
27 | 80 | the first year will be split in 3, 6, 9 months as well. |
28 | 81 | ''' |
29 | | - years = (var.max_year - var.min_year) +1 |
| 82 | + years = (var.max_year - var.min_year) + 1 |
30 | 83 | windows = [y * 12 for y in xrange(1, years)] |
31 | 84 | if break_down_first_year: |
32 | 85 | windows = [3, 6, 9] + windows |
33 | 86 | return windows |
34 | 87 | |
| 88 | + |
35 | 89 | def convert_seconds_to_date(secs): |
36 | 90 | #return time.gmtime(secs) |
37 | 91 | return datetime.datetime.fromtimestamp(secs) |
38 | 92 | |
| 93 | + |
39 | 94 | def convert_dataset_to_lists(ds, caller): |
40 | 95 | assert ds.format == 'long' or ds.format == 'wide', 'Format should either be long or wide.' |
41 | 96 | data = [] |
— | — | @@ -88,15 +143,6 @@ |
89 | 144 | return headers |
90 | 145 | |
91 | 146 | |
92 | | -#def make_data_rectangular(data, all_keys): |
93 | | -# for i, d in enumerate(data): |
94 | | -# for key in all_keys: |
95 | | -# if key not in d: |
96 | | -# d[key] = 0 |
97 | | -# data[i] = d |
98 | | -# return data |
99 | | - |
100 | | - |
101 | 147 | def get_all_props(var): |
102 | 148 | all_keys = [] |
103 | 149 | for obs in var.obs.values(): |
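A small usage sketch of the datacontainer helpers that now live in utils/data_converter.py (moved here from etl/shaper.py): create_datacontainer builds a dict keyed by year as a string, with final_year exclusive, and add_months_to_datacontainer nests a month level (1..12) under each year. The year and namespace values below are arbitrary example inputs, and the import assumes the script runs from the editor_trends package root like the other tools.

    from utils import data_converter

    counts = data_converter.create_datacontainer(2008, 2011)
    # {'2008': {}, '2009': {}, '2010': {}} -- string keys, final_year exclusive
    counts = data_converter.add_months_to_datacontainer(counts, 'dict')
    # every year now maps months 1..12 to an empty dict

    counts['2009'][3]['0'] = counts['2009'][3].get('0', 0) + 1   # tally one ns-0 edit
    print counts['2009'][3]
    # {'0': 1}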
Index: trunk/tools/editor_trends/database/cache.py |
— | — | @@ -31,7 +31,7 @@ |
32 | 32 | |
33 | 33 | import db |
34 | 34 | from utils import file_utils |
35 | | -from etl import shaper |
| 35 | +from utils import data_converter |
36 | 36 | |
37 | 37 | class EditorCache(object): |
38 | 38 | def __init__(self, collection): |
— | — | @@ -65,7 +65,7 @@ |
66 | 66 | if key not in self.editors: |
67 | 67 | self.editors[key] = {} |
68 | 68 | self.editors[key]['obs'] = 0 |
69 | | - self.editors[key]['edits'] = shaper.create_datacontainer(2001, self.final_year, 'list') |
| 69 | + self.editors[key]['edits'] = data_converter.create_datacontainer(2001, self.final_year, 'list') |
70 | 70 | self.editors[key]['username'] = value.pop('username') |
71 | 71 | else: |
72 | 72 | value.pop('username') |
Index: trunk/tools/editor_trends/code-snippets/wikitree/parser.py |
— | — | @@ -0,0 +1,173 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__email__ = 'dvanliere at gmail dot com' |
| 18 | +__date__ = '2010-10-21' |
| 19 | +__version__ = '0.1' |
| 20 | + |
| 21 | +import re |
| 22 | +import cStringIO |
| 23 | +import codecs |
| 24 | +import xml.etree.cElementTree as cElementTree |
| 25 | +from lxml import etree |
| 26 | +import sys |
| 27 | +import gzip |
| 28 | + |
| 29 | +if '..' not in sys.path: |
| 30 | + sys.path.append('..') |
| 31 | + |
| 32 | +from classes import settings |
| 33 | +settings = settings.Settings() |
| 34 | +from utils import file_utils |
| 35 | + |
| 36 | +def convert_html_entities(text): |
| 37 | + return file_utils.unescape(text) |
| 38 | + |
| 39 | + |
| 40 | +def extract_text(elem, **kwargs): |
| 41 | + if elem != None and elem.text != None: |
| 42 | + return u'%s' % elem.text |
| 43 | + else: |
| 44 | + return None |
| 45 | + |
| 46 | + |
| 47 | +def remove_xml_namespace(element, xml_namespace): |
| 48 | + '''Remove namespace from the XML document.''' |
| 49 | + ns = u'{%s}' % xml_namespace |
| 50 | + nsl = len(ns) |
| 51 | + for elem in element.getiterator(): |
| 52 | + if elem.tag.startswith(ns): |
| 53 | + elem.tag = elem.tag[nsl:] |
| 54 | + return element |
| 55 | + |
| 56 | + |
| 57 | +def determine_element(line): |
| 58 | + pos = line.find(' ') |
| 59 | + elem = line[:pos] + '>' |
| 60 | + return elem |
| 61 | + |
| 62 | +def create_namespace_dict(namespaces): |
| 63 | + d = {} |
| 64 | + print 'Constructing namespace dictionary' |
| 65 | + for ns in namespaces: |
| 66 | + key = ns.get('key') |
| 67 | + d[key] = extract_text(ns) |
| 68 | + text = ns.text if ns.text != None else '' |
| 69 | + try: |
| 70 | + print key, text.encode('utf-8') |
| 71 | + except UnicodeEncodeError: |
| 72 | + print key |
| 73 | + return d |
| 74 | + |
| 75 | + |
| 76 | +def extract_meta_information(fh): |
| 77 | + ''' |
| 78 | + The purpose of this function is: |
| 79 | + 1) Determine the version of the mediawiki dump file. Default is 0.4. |
| 80 | + 2) Create a dictionary with the namespaces |
| 81 | + ''' |
| 82 | + buffer = cStringIO.StringIO() |
| 83 | + wrapper = codecs.getwriter('utf-8')(buffer) |
| 84 | + wrapper.write("<?xml version='1.0' encoding='UTF-8' ?>\n") |
| 85 | + re_version = re.compile('\"\d\.\d\"') |
| 86 | + for x, raw_data in enumerate(fh): |
| 87 | + raw_data = ''.join(raw_data.strip()) |
| 88 | + if x == 0: |
| 89 | + version = re.findall(re_version, raw_data)[0] |
| 90 | + version = version.replace('"', '') |
| 91 | + wrapper.write(raw_data) |
| 92 | + if raw_data.find('</siteinfo>') > -1: |
| 93 | + wrapper.write('</mediawiki>') |
| 94 | + article = wrapper.getvalue() |
| 95 | + elem = cElementTree.XML(article) |
| 96 | + break |
| 97 | + xml_namespace = settings.xml_namespace.replace('0.4', version) |
| 98 | + elem = remove_xml_namespace(elem, xml_namespace) |
| 99 | + siteinfo = elem.find('siteinfo') |
| 100 | + namespaces = siteinfo.find('namespaces') |
| 101 | + namespaces = create_namespace_dict(namespaces) |
| 102 | + return namespaces, xml_namespace |
| 103 | + |
| 104 | + |
| 105 | +def read_input(fh): |
| 106 | + context = cElementTree.iterparse(fh, events=('end',)) |
| 107 | + context = iter(context) |
| 108 | + |
| 109 | + article = {} |
| 110 | + article['revisions'] = [] |
| 111 | + id = False |
| 112 | + namespace = '{http://www.mediawiki.org/xml/export-0.4/}' |
| 113 | + |
| 114 | + for event, elem in context: |
| 115 | + if event == 'end' and elem.tag == '%s%s' % (namespace, 'title'): |
| 116 | + article['title'] = elem |
| 117 | + elif event == 'end' and elem.tag == '%s%s' % (namespace, 'revision'): |
| 118 | + article['revisions'].append(elem) |
| 119 | + elif event == 'end' and elem.tag == '%s%s' % (namespace, 'id') and id == False: |
| 120 | + article['id'] = elem |
| 121 | + id = True |
| 122 | + elif event == 'end' and elem.tag == '%s%s' % (namespace, 'page'): |
| 123 | + yield article, 0 |
| 124 | + elem.clear() |
| 125 | + article = {} |
| 126 | + article['revisions'] = [] |
| 127 | + id = False |
| 128 | + elif event == 'end': |
| 129 | + elem.clear() |
| 130 | + |
| 131 | +#def read_input(fh): |
| 132 | +# buffer = cStringIO.StringIO() |
| 133 | +# wrapper = codecs.getwriter('utf-8')(buffer) |
| 134 | +# wrapper.write("<?xml version='1.0' encoding='UTF-8' ?>\n") |
| 135 | +# start_parsing = False |
| 136 | +# |
| 137 | +# for raw_data in fh: |
| 138 | +# if raw_data == '\n': |
| 139 | +# continue |
| 140 | +# if start_parsing == False and raw_data.find('<page>') > -1: |
| 141 | +# start_parsing = True |
| 142 | +# if start_parsing: |
| 143 | +# raw_data = ''.join(raw_data.strip()) |
| 144 | +# wrapper.write(raw_data) |
| 145 | +# if raw_data.find('</page>') > -1: |
| 146 | +# article = wrapper.getvalue() |
| 147 | +# size = len(article) |
| 148 | +# #article.encode('utf-8') |
| 149 | +# article = cElementTree.XML(article) |
| 150 | +# yield article, size |
| 151 | +# ''' |
| 152 | +# #This looks counter intuitive but Python continues with this |
| 153 | +# call after it has finished the yield statement |
| 154 | +# ''' |
| 155 | +# buffer = cStringIO.StringIO() |
| 156 | +# wrapper = codecs.getwriter('utf-8')(buffer) |
| 157 | +# wrapper.write("<?xml version='1.0' encoding='UTF-8' ?>\n") |
| 158 | +# fh.close() |
| 159 | + |
| 160 | + |
| 161 | +def debug(): |
| 162 | + #fh = codecs.open('c:\\wikimedia\\en\\wiki\dewiki-latest-stub-meta-history.xml', 'r', 'utf-8') |
| 163 | + filename = 'c:\\wikimedia\\en\\wiki\\enwiki-latest-stub-meta-history10.xml.gz' |
| 164 | + fh = gzip.GzipFile(filename, 'rb') |
| 165 | + |
| 166 | + for raw_data in fh: |
| 167 | + print raw_data |
| 168 | + |
| 169 | + |
| 170 | + fh.close() |
| 171 | + |
| 172 | + |
| 173 | +if __name__ == '__main__': |
| 174 | + debug() |
Property changes on: trunk/tools/editor_trends/code-snippets/wikitree/parser.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 175 | + native |
Index: trunk/tools/editor_trends/code-snippets/wikitree/__init__.py |
Property changes on: trunk/tools/editor_trends/code-snippets/wikitree/__init__.py |
___________________________________________________________________ |
Added: svn:eol-style |
2 | 176 | + native |
Property changes on: trunk/tools/editor_trends/code-snippets/wikitree |
___________________________________________________________________ |
Added: svn:ignore |
3 | 177 | + wikistats |
zips |
notes.txt |
*.pyc |
*.xml |
*.db |
*.bin |
*.zip |
*.csv |
datasets |
errors |
.settings |
.project |
.pydevproject |
wiki.cfg |
fabric.py |
fabfile.py |
deployment |
data |
Index: trunk/tools/editor_trends/code-snippets/match_talkpage_article.py |
— | — | @@ -0,0 +1,72 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__email__ = 'dvanliere at gmail dot com' |
| 18 | +__date__ = '2011-01-07' |
| 19 | +__version__ = '0.1' |
| 20 | + |
| 21 | +import sys |
| 22 | +import os |
| 23 | +if '..' not in sys.path: |
| 24 | + sys.path.append('..') |
| 25 | + |
| 26 | +from classes import settings |
| 27 | +settings = settings.Settings() |
| 28 | +from etl import extracter |
| 29 | +from utils import file_utils |
| 30 | +import wikitree |
| 31 | + |
| 32 | +try: |
| 33 | + import psyco |
| 34 | + psyco.full() |
| 35 | +except ImportError: |
| 36 | + pass |
| 37 | + |
| 38 | +class Article: |
| 39 | + def __init__(self, title, id, talk_id=None): |
| 40 | + self.title = title |
| 41 | + self.id = id |
| 42 | + self.talk_id = talk_id |
| 43 | + |
| 44 | + |
| 45 | +def parse_dumpfile(project, language_code, namespaces=['0', '1']): |
| 46 | + articles = {} |
| 47 | + ns = extracter.load_namespace(language_code) |
| 48 | + non_valid_namespaces = extracter.build_namespaces_locale(ns, namespaces) |
| 49 | + |
| 50 | + |
| 51 | + location = os.path.join(settings.input_location, language_code, project) |
| 52 | + fh = file_utils.create_txt_filehandle(location, |
| 53 | + '%s%s-latest-stub-meta-history.xml' % (language_code, project), |
| 54 | + 'r', 'utf-8') |
| 55 | + |
| 56 | + for page, article_size in wikitree.parser.read_input(fh): |
| 57 | + title = page.find('title') |
| 58 | + if extracter.verify_article_belongs_namespace(title, non_valid_namespaces): |
| 59 | + article_id = page.find('id').text |
| 60 | + title = title.text |
| 61 | + if title.startswith(ns['1'].get('canonical')): |
| 62 | + namespace = 'Talk' |
| 63 | + article = articles.get(article_id, Article(None, None, article_id)) |
| 64 | + article.talk_id = article_id |
| 65 | + else: |
| 66 | + namespace = 'Main' |
| 67 | + article = articles.get(article_id, Article(title, article_id)) |
| 68 | + articles[article_id] = article |
| 69 | + |
| 70 | + file_utils.store_object(articles, settings.binary_location, 'talk2article.bin') |
| 71 | + |
| 72 | +if __name__ == '__main__': |
| 73 | + parse_dumpfile('wiki', 'en') |
Property changes on: trunk/tools/editor_trends/code-snippets/match_talkpage_article.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 74 | + native |
Index: trunk/tools/editor_trends/code-snippets/shaper.py |
— | — | @@ -0,0 +1,72 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | + |
| 17 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 18 | +__email__ = 'dvanliere at gmail dot com' |
| 19 | +__date__ = '2010-11-24' |
| 20 | +__version__ = '0.1' |
| 21 | + |
| 22 | +import datetime |
| 23 | +import math |
| 24 | + |
| 25 | + |
| 26 | +def add_datatype(datatype=0.0): |
| 27 | + if datatype == 'dict': |
| 28 | + d = dict() |
| 29 | + elif datatype == 'list': |
| 30 | + d = list() |
| 31 | + elif datatype == 'set': |
| 32 | + d = set() |
| 33 | + else: |
| 34 | + d = 0.0 |
| 35 | + return d |
| 36 | + |
| 37 | + |
| 38 | +def create_datacontainer(first_year, final_year, datatype='dict'): |
| 39 | + ''' |
| 40 | + This function initializes a dictionary keyed by year (running from |
| 41 | + first_year up to, but not including, final_year) with @datatype as value. |
| 42 | + In most cases this will be 0.0, so the dictionary acts as a running tally |
| 43 | + for a variable, but @datatype can also be a list, a dict or a set. |
| 44 | + ''' |
| 45 | + data = {} |
| 46 | + for x in xrange(first_year, final_year): |
| 47 | + data[str(x)] = add_datatype(datatype) |
| 48 | + return data |
| 49 | + |
| 50 | + |
| 51 | +def add_windows_to_datacontainer(datacontainer, windows): |
| 52 | + for dc in datacontainer: |
| 53 | + for w in windows: |
| 54 | + datacontainer[dc][w] = add_datatype() |
| 55 | + |
| 56 | + return datacontainer |
| 57 | + |
| 58 | + |
| 59 | +def add_months_to_datacontainer(datacontainer, datatype): |
| 60 | + for dc in datacontainer: |
| 61 | + datacontainer[dc] = {} |
| 62 | + for x in xrange(1, 13): |
| 63 | + datacontainer[dc][x] = add_datatype(datatype) |
| 64 | + |
| 65 | + return datacontainer |
| 66 | + |
| 67 | + |
| 68 | +def add_years_to_datacontainer(first_year, final_year, datacontainer, datatype): |
| 69 | + for dc in datacontainer: |
| 70 | + datacontainer[dc] = {} |
| 71 | + for x in range(first_year, final_year): |
| 72 | + datacontainer[dc][x] = datatype |
| 73 | + return datacontainer |
Property changes on: trunk/tools/editor_trends/code-snippets/shaper.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 74 | + native |