Index: trunk/tools/editor_trends/manage.py |
— | — | @@ -141,7 +141,7 @@ |
142 | 142 | config['language_code'] = language_code |
143 | 143 | config['language'] = get_value(args, 'language') |
144 | 144 | config['location'] = os.path.join(location, language_code, project) |
145 | | - config['chunks'] = os.path.join(config['location'], 'chunks') |
| 145 | + #config['chunks'] = os.path.join(config['location'], 'chunks') |
146 | 146 | config['txt'] = os.path.join(config['location'], 'txt') |
147 | 147 | config['sorted'] = os.path.join(config['location'], 'sorted') |
148 | 148 | config['dbready'] = os.path.join(config['location'], 'dbready') |
— | — | @@ -150,7 +150,7 @@ |
151 | 151 | config['filename'] = generate_wikidump_filename(language_code, project, args) |
152 | 152 | config['collection'] = get_value(args, 'collection') |
153 | 153 | config['namespaces'] = get_namespaces(args) |
154 | | - config['directories'] = [config['location'], config['chunks'], config['txt'], config['sorted'], config['dbready']] |
| 154 | + config['directories'] = [config['location'], config['txt'], config['sorted'], config['dbready']] |
155 | 155 | |
156 | 156 | message = 'Settings as generated from the configuration module.' |
157 | 157 | write_message_to_log(logger, args, message, None, **config) |
— | — | @@ -164,9 +164,9 @@ |
165 | 165 | language = kwargs.pop('language') |
166 | 166 | language_code = kwargs.pop('language_code') |
167 | 167 | config = {} |
168 | | - config['Project'] = '\t\t%s' % settings.projects.get(kwargs.pop('project'), 'wiki').title() |
169 | | - config['Language'] = '\t%s / %s' % (language_map[language_code], language) #.decode(settings.encoding) |
170 | | - config['Input directory'] = '\t%s' % kwargs.get('location') |
| 168 | + config['Project'] = '%s' % settings.projects.get(kwargs.pop('project'), 'wiki').title() |
| 169 | + config['Language'] = '%s / %s' % (language_map[language_code], language) #.decode(settings.encoding) |
| 170 | + config['Input directory'] = '%s' % kwargs.get('location') |
171 | 171 | config['Output directory'] = '%s and subdirectories' % kwargs.get('location') |
172 | 172 | |
173 | 173 | message = 'Final settings after parsing command line arguments:' |
— | — | @@ -246,7 +246,7 @@ |
247 | 247 | final_output = os.path.join(location, 'dbready') |
248 | 248 | write_message_to_log(logger, args, location=location, input=input, output=output, final_output=final_output) |
249 | 249 | loader.mergesort_launcher(input, output) |
250 | | - loader.mergesort_external_launcher(output, final_output) |
| 250 | + #loader.mergesort_external_launcher(output, final_output) |
251 | 251 | timer.elapsed() |
252 | 252 | |
253 | 253 | |
— | — | @@ -254,14 +254,14 @@ |
255 | 255 | print 'Start storing data in MongoDB' |
256 | 256 | timer = Timer() |
257 | 257 | location = kwargs.pop('location') |
258 | | - input = os.path.join(location, 'dbready') |
| 258 | + input = os.path.join(location, 'sorted') |
259 | 259 | project = kwargs.pop('full_project') |
260 | 260 | collection = kwargs.pop('collection') |
261 | 261 | |
262 | 262 | db.cleanup_database(project, logger) |
263 | 263 | |
264 | 264 | write_message_to_log(logger, args, verb='Storing', location=location, input=input, project=project, collection=collection) |
265 | | - num_editors = loader.store_editors(input, project, collection) |
| 265 | + store.launcher(input, project, collection) |
266 | 266 | cnt_editors = db.count_records(project, collection) |
267 | 267 | #assert num_editors == cnt_editors |
268 | 268 | timer.elapsed() |
— | — | @@ -408,8 +408,8 @@ |
409 | 409 | parser_download = subparsers.add_parser('download', help='The download sub command allows you to download a Wikipedia dump file.') |
410 | 410 | parser_download.set_defaults(func=dump_downloader_launcher) |
411 | 411 | |
412 | | - #parser_split = subparsers.add_parser('split', help='The split sub command splits the downloaded file in smaller chunks to parallelize extracting information.') |
413 | | - #parser_split.set_defaults(func=chunker_launcher) |
| 412 | + parser_split = subparsers.add_parser('split', help='The split sub command splits the downloaded file in smaller chunks to parallelize extracting information.') |
| 413 | + parser_split.set_defaults(func=chunker_launcher) |
414 | 414 | |
415 | 415 | parser_create = subparsers.add_parser('extract', help='The store sub command parsers the XML chunk files, extracts the information and stores it in a MongoDB.') |
416 | 416 | parser_create.set_defaults(func=extract_launcher) |
— | — | @@ -460,9 +460,9 @@ |
461 | 461 | help='A list of namespaces to include for analysis.', |
462 | 462 | default='0') |
463 | 463 | |
464 | | - parser.add_argument('-fo', '--format', action='store', |
465 | | - help='Indicate which format the chunks should be stored. Valid options are xml and txt.', |
466 | | - default='txt') |
| 464 | + #parser.add_argument('-fo', '--format', action='store', |
| 465 | + # help='Indicate which format the chunks should be stored. Valid options are xml and txt.', |
| 466 | + # default='txt') |
467 | 467 | |
468 | 468 | parser.add_argument('-f', '--file', action='store', |
469 | 469 | choices=file_choices, |
Index: trunk/tools/editor_trends/etl/store.py |
— | — | @@ -1,91 +1,91 @@ |
2 | | -#!/usr/bin/python
|
3 | | -# -*- coding: utf-8 -*-
|
4 | | -'''
|
5 | | -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
|
6 | | -This program is free software; you can redistribute it and/or
|
7 | | -modify it under the terms of the GNU General Public License version 2
|
8 | | -as published by the Free Software Foundation.
|
9 | | -This program is distributed in the hope that it will be useful,
|
10 | | -but WITHOUT ANY WARRANTY; without even the implied warranty of
|
11 | | -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
12 | | -See the GNU General Public License for more details, at
|
13 | | -http://www.fsf.org/licenses/gpl.html
|
14 | | -'''
|
15 | | -
|
16 | | -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
|
17 | | -__author__email = 'dvanliere at gmail dot com'
|
18 | | -__date__ = '2011-01-04'
|
19 | | -__version__ = '0.1'
|
20 | | -
|
21 | | -
|
22 | | -import multiprocessing
|
23 | | -import sys
|
24 | | -
|
25 | | -sys.path.append('..')
|
26 | | -import configuration
|
27 | | -settings = configuration.Settings()
|
28 | | -from utils import utils
|
29 | | -from database import cache
|
30 | | -from database import db
|
31 | | -
|
32 | | -
|
33 | | -def store_editors(tasks, dbname, collection, input):
|
34 | | - editor_cache = cache.EditorCache(collection)
|
35 | | - prev_contributor = -1
|
36 | | - edits = 0
|
37 | | - while True:
|
38 | | - file = tasks.get(block=False)
|
39 | | - if file == None:
|
40 | | - break
|
41 | | - fh = utils.create_txt_filehandle(input, file, 'r', settings.encoding)
|
42 | | - for line in utils.readline(fh):
|
43 | | - if len(line) == 0:
|
44 | | - continue
|
45 | | - contributor = line[0]
|
46 | | - #print 'Parsing %s' % contributor
|
47 | | - if prev_contributor != contributor:
|
48 | | - if edits > 9:
|
49 | | - editor_cache.add(prev_contributor, 'NEXT')
|
50 | | - print 'Stored %s' % prev_contributor
|
51 | | - else:
|
52 | | - editor_cache.clear(prev_contributor)
|
53 | | - edits = 0
|
54 | | - edits += 1
|
55 | | - date = utils.convert_timestamp_to_datetime_utc(line[1]) #+ datetime.timedelta(days=1)
|
56 | | - article_id = int(line[2])
|
57 | | - username = line[3].encode(settings.encoding)
|
58 | | - value = {'date': date, 'article': article_id, 'username': username}
|
59 | | - editor_cache.add(contributor, value)
|
60 | | - prev_contributor = contributor
|
61 | | - fh.close()
|
62 | | - print editor_cache.n
|
63 | | - #return editor_cache.n
|
64 | | -
|
65 | | -
|
66 | | -def launcher(input, dbname, collection):
|
67 | | - mongo = db.init_mongo_db(dbname)
|
68 | | - collection = mongo[collection]
|
69 | | - collection.ensure_index('editor')
|
70 | | - collection.create_index('editor')
|
71 | | - files = utils.retrieve_file_list(input, 'csv')
|
72 | | - print files
|
73 | | - print input
|
74 | | - tasks = multiprocessing.JoinableQueue()
|
75 | | - consumers = [multiprocessing.Process(target=store_editors, args=(tasks, dbname, collection, input)) for i in xrange(settings.number_of_processes)]
|
76 | | - for file in files:
|
77 | | - tasks.put(file)
|
78 | | -
|
79 | | - for x in xrange(settings.number_of_processes):
|
80 | | - tasks.put(None)
|
81 | | -
|
82 | | - for w in consumers:
|
83 | | - w.start()
|
84 | | -
|
85 | | - tasks.join()
|
86 | | -
|
87 | | - #filename = utils.retrieve_file_list(input, 'txt', mask=None)
|
88 | | - #if len(filename) > 1:
|
89 | | - # filename = [f for f in filename if f.find('final') > -1]
|
90 | | - #filename = ''.join(filename)
|
91 | | -
|
92 | | -
|
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__author__email = 'dvanliere at gmail dot com' |
| 18 | +__date__ = '2011-01-04' |
| 19 | +__version__ = '0.1' |
| 20 | + |
| 21 | + |
| 22 | +import multiprocessing |
| 23 | +import sys |
| 24 | + |
| 25 | +sys.path.append('..') |
| 26 | +import configuration |
| 27 | +settings = configuration.Settings() |
| 28 | +from utils import utils |
| 29 | +from database import cache |
| 30 | +from database import db |
| 31 | + |
| 32 | + |
def store_editors(tasks, dbname, collection, input):
    '''
    Worker: take filenames from a task queue and feed every edit found in
    those files into an EditorCache.

    tasks      -- joinable queue of filenames; a None entry tells this worker
                  to stop.
    dbname     -- not used in this function; kept so the signature matches
                  the arguments handed over by launcher.
    collection -- passed unchanged to cache.EditorCache.
    input      -- directory that holds the files named on the queue.

    Each non-empty row is read as (contributor, timestamp, article id,
    username). Consecutive rows of one contributor are added to the cache;
    when the contributor changes, the previous one is closed with 'NEXT' if
    more than 9 edits were seen and cleared from the cache otherwise.

    Returns None; editor_cache.n is printed when the worker stops.
    '''
    editor_cache = cache.EditorCache(collection)
    no_contributor = -1
    prev_contributor = no_contributor
    edits = 0

    def close_contributor(contributor, edit_count):
        # Before the first row there is no previous contributor to close.
        if contributor == no_contributor:
            return
        if edit_count > 9:
            editor_cache.add(contributor, 'NEXT')
            print('Stored %s' % contributor)
        else:
            editor_cache.clear(contributor)

    while True:
        # Blocking get: a non-blocking get raises Empty when the worker starts
        # before the queue's feeder thread has delivered the first item. The
        # None sentinel still guarantees termination.
        filename = tasks.get()
        try:
            if filename is None:
                break
            fh = utils.create_txt_filehandle(input, filename, 'r', settings.encoding)
            try:
                for line in utils.readline(fh):
                    if not line:
                        continue
                    contributor = line[0]
                    if prev_contributor != contributor:
                        close_contributor(prev_contributor, edits)
                        edits = 0
                    edits += 1
                    date = utils.convert_timestamp_to_datetime_utc(line[1])
                    article_id = int(line[2])
                    username = line[3].encode(settings.encoding)
                    value = {'date': date, 'article': article_id, 'username': username}
                    editor_cache.add(contributor, value)
                    prev_contributor = contributor
            finally:
                fh.close()
        finally:
            # Every get(), including the one for the sentinel, must be matched
            # by task_done(); otherwise JoinableQueue.join() never returns.
            tasks.task_done()

    # The final contributor has no following row to trigger the check above.
    # NOTE(review): assumes add(..., 'NEXT') is the cache's flush signal, as
    # in the loop -- confirm EditorCache does not already flush at shutdown.
    close_contributor(prev_contributor, edits)
    print(editor_cache.n)
| 64 | + |
| 65 | + |
def launcher(input, dbname, collection):
    '''
    Store the csv files found in a directory by fanning them out over
    settings.number_of_processes worker processes running store_editors.

    input      -- directory that is searched for csv files.
    dbname     -- name passed to db.init_mongo_db.
    collection -- name of the collection inside that database.

    Returns None once every worker process has exited.
    '''
    mongo = db.init_mongo_db(dbname)
    editors = mongo[collection]
    # One create_index call is sufficient; the original also called
    # ensure_index on the same key immediately before it.
    editors.create_index('editor')
    files = utils.retrieve_file_list(input, 'csv')
    print(files)
    print(input)

    tasks = multiprocessing.JoinableQueue()
    workers = settings.number_of_processes
    consumers = [multiprocessing.Process(target=store_editors,
                                         args=(tasks, dbname, editors, input))
                 for _ in range(workers)]
    for filename in files:
        tasks.put(filename)

    # One None sentinel per worker so each of them leaves its loop.
    for _ in range(workers):
        tasks.put(None)

    for consumer in consumers:
        consumer.start()

    # Wait on the processes rather than on tasks.join(): the queue join only
    # returns if every item was acknowledged with task_done(), and it blocks
    # forever when a worker dies before acknowledging its items.
    for consumer in consumers:
        consumer.join()
| 86 | + |
| 87 | + #filename = utils.retrieve_file_list(input, 'txt', mask=None) |
| 88 | + #if len(filename) > 1: |
| 89 | + # filename = [f for f in filename if f.find('final') > -1] |
| 90 | + #filename = ''.join(filename) |
| 91 | + |
| 92 | + |
Property changes on: trunk/tools/editor_trends/etl/store.py |
___________________________________________________________________ |
Added: svn:eol-style |
93 | 93 | + native |
Index: trunk/tools/editor_trends/etl/xml2pig.py |
— | — | @@ -1,30 +1,30 @@ |
2 | | -#!/usr/bin/python
|
3 | | -# -*- coding: utf-8 -*-
|
4 | | -'''
|
5 | | -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
|
6 | | -This program is free software; you can redistribute it and/or
|
7 | | -modify it under the terms of the GNU General Public License version 2
|
8 | | -as published by the Free Software Foundation.
|
9 | | -This program is distributed in the hope that it will be useful,
|
10 | | -but WITHOUT ANY WARRANTY; without even the implied warranty of
|
11 | | -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
12 | | -See the GNU General Public License for more details, at
|
13 | | -http://www.fsf.org/licenses/gpl.html
|
14 | | -'''
|
15 | | -
|
16 | | -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
|
17 | | -__author__email = 'dvanliere at gmail dot com'
|
18 | | -__date__ = '2010-11-15'
|
19 | | -__version__ = '0.1'
|
20 | | -
|
21 | | -import sys
|
22 | | -sys.path.append('..')
|
23 | | -
|
24 | | -import os
|
25 | | -import xml.etree.cElementTree as cElementTree
|
26 | | -
|
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__author__email = 'dvanliere at gmail dot com' |
| 18 | +__date__ = '2010-11-15' |
| 19 | +__version__ = '0.1' |
| 20 | + |
| 21 | +import sys |
| 22 | +sys.path.append('..') |
| 23 | + |
| 24 | +import os |
| 25 | +import xml.etree.cElementTree as cElementTree |
| 26 | + |
27 | 27 | import configuration |
28 | | -settings = configuration.Settings()
|
29 | | -import split_settings.input_filename
|
30 | | -
|
31 | | -
|
| 28 | +settings = configuration.Settings() |
| 29 | +import split_settings.input_filename |
| 30 | + |
| 31 | + |
Property changes on: trunk/tools/editor_trends/etl/xml2pig.py |
___________________________________________________________________ |
Added: svn:eol-style |
32 | 32 | + native |
Index: trunk/tools/editor_trends/statistics/stata/confidence_intervals.do |
— | — | @@ -1,9 +1,9 @@ |
2 | | -insheet using "C:\Users\diederik.vanliere\workspace\editor_trends\datasets\long_dataset.tsv"
|
3 | | -gen date2 = date(_time, "20YMD")
|
4 | | -drop _time
|
5 | | -ren date2 _time
|
6 | | -format _time %td
|
7 | | -tsset _time
|
8 | | -
|
9 | | -generate ub = monthly_edits_avg + (2* monthly_edits_sd)
|
10 | | -generate lb = monthly_edits_avg - (2* monthly_edits_sd)
|
* Load long_dataset.tsv from an absolute, machine-specific Windows path.
insheet using "C:\Users\diederik.vanliere\workspace\editor_trends\datasets\long_dataset.tsv"
* Convert _time into a Stata date via a temporary variable.
* NOTE(review): the "20YMD" mask presumably reads two-digit years as 20xx -- confirm against the Stata date() documentation.
gen date2 = date(_time, "20YMD")
* Replace the original _time with the converted values under the same name.
drop _time
ren date2 _time
* Display _time with the daily date format.
format _time %td
* Declare _time as the time variable of the dataset.
tsset _time

* Upper and lower bands: monthly average plus/minus two standard deviations.
generate ub = monthly_edits_avg + (2* monthly_edits_sd)
generate lb = monthly_edits_avg - (2* monthly_edits_sd)
Property changes on: trunk/tools/editor_trends/statistics/stata/confidence_intervals.do |
___________________________________________________________________ |
Added: svn:eol-style |
11 | 11 | + native |
Property changes on: trunk/tools/editor_trends/README.1ST |
___________________________________________________________________ |
Deleted: native |
12 | 12 | - svn:eol-style=native |
Property changes on: trunk/tools/editor_trends/data/csv/training_bots.csv |
___________________________________________________________________ |
Added: svn:eol-style |
13 | 13 | + native |
Property changes on: trunk/tools/editor_trends/data/csv/bots_predictionset.csv |
___________________________________________________________________ |
Added: svn:eol-style |
14 | 14 | + native |
Index: trunk/tools/editor_trends/code-snippets/utc.py |
— | — | @@ -1,39 +1,39 @@ |
2 | | -import datetime
|
3 | | -
|
4 | | -import time
|
5 | | -import sys
|
6 | | -
|
7 | | -sys.path.append('..')
|
8 | | -import configuration
|
9 | | -settings = configuration.Settings()
|
10 | | -
|
11 | | -timestamp = '2009-02-18T20:47:12Z'
|
12 | | -
|
13 | | -def convert_timestamp_to_date(timestamp):
|
14 | | - return datetime.datetime.strptime(timestamp[:10], settings.date_format)
|
15 | | -
|
16 | | -
|
17 | | -def convert_timestamp_to_datetime(timestamp):
|
18 | | - return datetime.datetime.strptime(timestamp, settings.timestamp_format)
|
19 | | -
|
20 | | -def astimezone(self, tz):
|
21 | | - if self.tzinfo is tz:
|
22 | | - return self
|
23 | | - # Convert self to UTC, and attach the new time zone object.
|
24 | | - utc = (self - self.utcoffset()).replace(tzinfo=tz)
|
25 | | - # Convert from UTC to tz's local time.
|
26 | | - return tz.fromutc(utc)
|
27 | | -
|
28 | | -def convert_timestamp_to_datetime_utc(timestamp):
|
29 | | - return time.gmtime(time.mktime(time.strptime(timestamp, settings.timestamp_format)))
|
30 | | -
|
31 | | -
|
32 | | -t = convert_timestamp_to_datetime_utc(timestamp)
|
33 | | -d =datetime.datetime.fromtimestamp(time.mktime(t))
|
34 | | -tz = datetime.tzinfo('utc')
|
35 | | -d1 = convert_timestamp_to_datetime(timestamp)
|
36 | | -d2 = d1.replace(tzinfo=tz)
|
37 | | -print tz
|
38 | | -
|
39 | | -print t
|
| 2 | +import datetime |
| 3 | + |
| 4 | +import time |
| 5 | +import sys |
| 6 | + |
| 7 | +sys.path.append('..') |
| 8 | +import configuration |
| 9 | +settings = configuration.Settings() |
| 10 | + |
| 11 | +timestamp = '2009-02-18T20:47:12Z' |
| 12 | + |
def convert_timestamp_to_date(timestamp):
    '''
    Parse the first ten characters of timestamp with settings.date_format
    and return the resulting datetime.datetime.
    '''
    day_part = timestamp[:10]
    return datetime.datetime.strptime(day_part, settings.date_format)
| 15 | + |
| 16 | + |
def convert_timestamp_to_datetime(timestamp):
    '''
    Parse the complete timestamp string with settings.timestamp_format and
    return the resulting datetime.datetime.
    '''
    parse = datetime.datetime.strptime
    return parse(timestamp, settings.timestamp_format)
| 19 | + |
def astimezone(self, tz):
    '''
    Return self expressed in the time zone tz.

    self -- a datetime whose utcoffset() returns a timedelta.
    tz   -- tzinfo object providing fromutc().

    When self already carries tz (same object) it is returned unchanged.
    '''
    if self.tzinfo is not tz:
        # Shift to UTC first, then label the value with the target zone so
        # that tz.fromutc() can move it to tz's local time.
        offset = self.utcoffset()
        in_utc = self - offset
        return tz.fromutc(in_utc.replace(tzinfo=tz))
    return self
| 27 | + |
def convert_timestamp_to_datetime_utc(timestamp):
    '''
    Parse timestamp with settings.timestamp_format and return it as a UTC
    time.struct_time.

    The parsed fields are treated as UTC (the sample timestamp in this file
    ends in 'Z'). The previous implementation used time.mktime, which reads
    the fields as local time and therefore shifted the result by the
    machine's UTC offset.
    NOTE(review): assumes every input is a UTC timestamp -- confirm
    settings.timestamp_format does not carry an offset.
    '''
    # Local import so the fix does not depend on the file's import block.
    import calendar
    parsed = time.strptime(timestamp, settings.timestamp_format)
    return time.gmtime(calendar.timegm(parsed))
| 30 | + |
| 31 | + |
# Scratch section: run the helpers above on the sample timestamp and print
# the results.
t = convert_timestamp_to_datetime_utc(timestamp)
# time.mktime reads t as local time; fromtimestamp returns a naive local
# datetime.
d =datetime.datetime.fromtimestamp(time.mktime(t))
# NOTE(review): datetime.tzinfo is the abstract base class; this instance
# presumably serves only as a placeholder -- confirm before relying on it.
tz = datetime.tzinfo('utc')
d1 = convert_timestamp_to_datetime(timestamp)
# d2 is built but not used in the remaining lines.
d2 = d1.replace(tzinfo=tz)
print tz

print t
print d
\ No newline at end of file |
Property changes on: trunk/tools/editor_trends/code-snippets/utc.py |
___________________________________________________________________ |
Added: svn:eol-style |
41 | 41 | + native |
Index: trunk/tools/editor_trends/code-snippets/test.py |
— | — | @@ -1,6 +1,6 @@ |
2 | 2 | import configuration |
3 | | -settings = configuration.Settings()
|
4 | | -
|
5 | | -from tests.mongodb import store
|
6 | | -
|
7 | | -store.test_date() |
\ No newline at end of file |
# Build the configuration object; it is not referenced again in this script.
settings = configuration.Settings()

from tests.mongodb import store

# Call test_date() from the tests.mongodb.store module.
store.test_date()
Property changes on: trunk/tools/editor_trends/code-snippets/test.py |
___________________________________________________________________ |
Added: svn:eol-style |
8 | 8 | + native |
Index: trunk/tools/editor_trends/code-snippets/count_editors.py |
— | — | @@ -1,32 +1,32 @@ |
2 | | -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
|
3 | | -__author__email = 'dvanliere at gmail dot com'
|
4 | | -__date__ = '2010-11-17'
|
5 | | -__version__ = '0.1'
|
6 | | -
|
7 | | -import sys
|
8 | | -import os
|
9 | | -sys.path.append('..')
|
10 | | -
|
11 | | -import configuration
|
12 | | -settings = configuration.Settings()
|
13 | | -
|
14 | | -from utils import utils
|
15 | | -
|
16 | | -def main():
|
17 | | - input = os.path.join(settings.input_location, 'en', 'wiki', 'sorted')
|
18 | | - files = utils.retrieve_file_list(input, 'txt', mask='merged_final')
|
19 | | - editors = {}
|
20 | | - for file in files:
|
21 | | - fh = utils.create_txt_filehandle(input, file, 'r', settings.encoding)
|
22 | | - for line in fh:
|
23 | | - author = line.split('\t')[0]
|
24 | | - if author not in editors:
|
25 | | - editors[author] = 0
|
26 | | - editors[author] += 1
|
27 | | - fh.close()
|
28 | | - utils.store_object(editors, settings.binary_location, 'editors_count4.bin')
|
29 | | - print 'Number of editors: %s' % len(editors)
|
30 | | -
|
31 | | -
|
32 | | -if __name__ == '__main__':
|
33 | | - main()
|
| 2 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 3 | +__author__email = 'dvanliere at gmail dot com' |
| 4 | +__date__ = '2010-11-17' |
| 5 | +__version__ = '0.1' |
| 6 | + |
| 7 | +import sys |
| 8 | +import os |
| 9 | +sys.path.append('..') |
| 10 | + |
| 11 | +import configuration |
| 12 | +settings = configuration.Settings() |
| 13 | + |
| 14 | +from utils import utils |
| 15 | + |
def main():
    '''
    Count the lines per author in the files matching 'merged_final' under
    <settings.input_location>/en/wiki/sorted.

    The first tab-separated field of every line is taken as the author. The
    resulting {author: line count} dictionary is stored as
    'editors_count4.bin' in settings.binary_location and the number of
    distinct authors is printed.
    '''
    source_dir = os.path.join(settings.input_location, 'en', 'wiki', 'sorted')
    filenames = utils.retrieve_file_list(source_dir, 'txt', mask='merged_final')
    editors = {}
    for filename in filenames:
        fh = utils.create_txt_filehandle(source_dir, filename, 'r', settings.encoding)
        try:
            for line in fh:
                author = line.split('\t')[0]
                editors[author] = editors.get(author, 0) + 1
        finally:
            # Release the handle even when reading a line fails.
            fh.close()
    utils.store_object(editors, settings.binary_location, 'editors_count4.bin')
    print('Number of editors: %s' % len(editors))


if __name__ == '__main__':
    main()
Property changes on: trunk/tools/editor_trends/code-snippets/count_editors.py |
___________________________________________________________________ |
Added: svn:eol-style |
34 | 34 | + native |
Index: trunk/tools/editor_trends/code-snippets/event.py |
— | — | @@ -1,33 +1,33 @@ |
2 | | -import multiprocessing
|
3 | | -import time
|
4 | | -
|
5 | | -def wait_for_event(e):
|
6 | | - """Wait for the event to be set before doing anything"""
|
7 | | - print 'wait_for_event: starting'
|
8 | | - e.wait()
|
9 | | - print 'wait_for_event: e.is_set()->', e.is_set()
|
10 | | -
|
11 | | -def wait_for_event_timeout(e, t):
|
12 | | - """Wait t seconds and then timeout"""
|
13 | | - print 'wait_for_event_timeout: starting'
|
14 | | - e.wait(t)
|
15 | | - e.set()
|
16 | | - print 'wait_for_event_timeout: e.is_set()->', e.is_set()
|
17 | | -
|
18 | | -
|
19 | | -if __name__ == '__main__':
|
20 | | - e = multiprocessing.Event()
|
21 | | - w1 = multiprocessing.Process(name='block',
|
22 | | - target=wait_for_event,
|
23 | | - args=(e,))
|
24 | | - w1.start()
|
25 | | -
|
26 | | - w2 = multiprocessing.Process(name='non-block',
|
27 | | - target=wait_for_event_timeout,
|
28 | | - args=(e, 2))
|
29 | | - w2.start()
|
30 | | -
|
31 | | - print 'main: waiting before calling Event.set()'
|
32 | | - time.sleep(3)
|
33 | | - #e.set()
|
34 | | - print 'main: event is set'
|
| 2 | +import multiprocessing |
| 3 | +import time |
| 4 | + |
def wait_for_event(e):
    """Block until the event e is set, printing a line before and after.

    e -- event object providing wait() and is_set().
    """
    print('wait_for_event: starting')
    e.wait()
    # Single-argument print: same text as the old two-item print statement,
    # and valid on both Python 2 and Python 3.
    print('wait_for_event: e.is_set()-> %s' % e.is_set())
| 10 | + |
def wait_for_event_timeout(e, t):
    """Wait at most t seconds for the event e, then set it.

    e -- event object providing wait(), set() and is_set().
    t -- timeout in seconds handed to e.wait().

    The event is set unconditionally after the wait, so anything else
    waiting on e is released once this function has run.
    """
    print('wait_for_event_timeout: starting')
    e.wait(t)
    e.set()
    # Single-argument print: same text as the old two-item print statement,
    # and valid on both Python 2 and Python 3.
    print('wait_for_event_timeout: e.is_set()-> %s' % e.is_set())
| 17 | + |
| 18 | + |
if __name__ == '__main__':
    # One shared event and two child processes: 'block' waits on the event
    # without a timeout, 'non-block' waits two seconds and then sets it.
    e = multiprocessing.Event()
    w1 = multiprocessing.Process(name='block',
                                 target=wait_for_event,
                                 args=(e,))
    w1.start()

    w2 = multiprocessing.Process(name='non-block',
                                 target=wait_for_event_timeout,
                                 args=(e, 2))
    w2.start()

    print('main: waiting before calling Event.set()')
    time.sleep(3)
    # The parent does not call e.set() itself; wait_for_event_timeout sets
    # the event after its two-second wait.
    print('main: event is set')
Property changes on: trunk/tools/editor_trends/code-snippets/event.py |
___________________________________________________________________ |
Added: svn:eol-style |
35 | 35 | + native |