r76463 MediaWiki - Code Review archive

Repository:MediaWiki
Revision: < r76462 | r76463 | r76464 >
Date:19:31, 10 November 2010
Author:reedy
Status:ok
Tags:
Comment:
set svn:eol-style native
Modified paths:
  • /trunk/tools/editor_trends/optimize_editors.py (modified) (history)
  • /trunk/tools/editor_trends/run.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/run.py
@@ -1,37 +1,37 @@
2 -import os
3 -
4 -import settings
5 -#from utils import namespace_downloader as nd
6 -#nd.launch_downloader()
7 -
8 -
9 -#def which(program):
10 -# import os
11 -# def is_exe(fpath):
12 -# return os.path.exists(fpath) and os.access(fpath, os.X_OK)
13 -#
14 -# fpath, fname = os.path.split(program)
15 -# if fpath:
16 -# if is_exe(program):
17 -# return program
18 -# else:
19 -# for path in os.environ["PATH"].split(os.pathsep):
20 -# exe_file = os.path.join(path, program)
21 -# if is_exe(exe_file):
22 -# return exe_file
23 -#
24 -# return None
25 -#
26 -#
27 -#result = which('7z.exe')
28 -#print result
29 -
30 -#from database import launcher
31 -#launcher.launcher()
32 -from utils import sort
33 -input = os.path.join(settings.XML_FILE_LOCATION, 'en', 'wiki', 'txt')
34 -output = os.path.join(settings.XML_FILE_LOCATION, 'en', 'wiki', 'sorted')
35 -dbname = 'enwiki'
36 -#sort.debug_mergesort_feeder(input, output)
37 -#sort.mergesort_launcher(input, output)
 2+import os
 3+
 4+import settings
 5+#from utils import namespace_downloader as nd
 6+#nd.launch_downloader()
 7+
 8+
 9+#def which(program):
 10+# import os
 11+# def is_exe(fpath):
 12+# return os.path.exists(fpath) and os.access(fpath, os.X_OK)
 13+#
 14+# fpath, fname = os.path.split(program)
 15+# if fpath:
 16+# if is_exe(program):
 17+# return program
 18+# else:
 19+# for path in os.environ["PATH"].split(os.pathsep):
 20+# exe_file = os.path.join(path, program)
 21+# if is_exe(exe_file):
 22+# return exe_file
 23+#
 24+# return None
 25+#
 26+#
 27+#result = which('7z.exe')
 28+#print result
 29+
 30+#from database import launcher
 31+#launcher.launcher()
 32+from utils import sort
 33+input = os.path.join(settings.XML_FILE_LOCATION, 'en', 'wiki', 'txt')
 34+output = os.path.join(settings.XML_FILE_LOCATION, 'en', 'wiki', 'sorted')
 35+dbname = 'enwiki'
 36+#sort.debug_mergesort_feeder(input, output)
 37+#sort.mergesort_launcher(input, output)
38 38 sort.mergesort_external_launcher(dbname, output, output)
\ No newline at end of file
Property changes on: trunk/tools/editor_trends/run.py
___________________________________________________________________
Added: svn:eol-style
39 39 + native
Index: trunk/tools/editor_trends/optimize_editors.py
@@ -1,145 +1,145 @@
2 -#!/usr/bin/python
3 -# -*- coding: utf-8 -*-
4 -'''
5 -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
6 -This program is free software; you can redistribute it and/or
7 -modify it under the terms of the GNU General Public License version 2
8 -as published by the Free Software Foundation.
9 -This program is distributed in the hope that it will be useful,
10 -but WITHOUT ANY WARRANTY; without even the implied warranty of
11 -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 -See the GNU General Public License for more details, at
13 -http://www.fsf.org/licenses/gpl.html
14 -'''
15 -
16 -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
17 -__author__email = 'dvanliere at gmail dot com'
18 -__date__ = '2010-11-02'
19 -__version__ = '0.1'
20 -
21 -from multiprocessing import Queue
22 -from Queue import Empty
23 -from operator import itemgetter
24 -import datetime
25 -
26 -import settings
27 -from database import db
28 -from utils import process_constructor as pc
29 -from utils import utils
30 -import construct_datasets
31 -
32 -
33 -def create_datacontainer(init_value=0):
34 - '''
35 - This function initializes an empty dictionary with as key the year (starting
36 - 2001 and running through) and as value @init_value, in most cases this will
37 - be zero so the dictionary will act as a running tally for a variable but
38 - @init_value can also a list, [], or a dictionary, {}, or a set, set().
39 - '''
40 - data = {}
41 - year = datetime.datetime.now().year + 1
42 - for x in xrange(2001, year):
43 - data[str(x)] = init_value
44 - return data
45 -
46 -
47 -def determine_edits_by_year(dates):
48 - '''
49 - This function counts the number of edits by year made by a particular editor.
50 - '''
51 - edits = create_datacontainer()
52 - for date in dates:
53 - year = str(date['date'].year)
54 - edits[year] += 1
55 - return edits
56 -
57 -
58 -def determine_articles_by_year(dates):
59 - '''
60 - This function counts the number of unique articles by year edited by a
61 - particular editor.
62 - '''
63 - articles = create_datacontainer(set())
64 - for date in dates:
65 - year = str(date['date'].year)
66 - articles[year].add(date['article'])
67 - for article in articles:
68 - articles[article] = len(article)
69 - return articles
70 -
71 -
72 -def sort_edits(edits):
73 - edits = utils.merge_list(edits)
74 - return sorted(edits, key=itemgetter('date'))
75 -
76 -
77 -def optimize_editors(input_queue, result_queue, pbar, **kwargs):
78 - dbname = kwargs.pop('dbname')
79 - mongo = db.init_mongo_db(dbname)
80 - input = mongo['editors']
81 - output = mongo['dataset']
82 - output.ensure_index('editor')
83 - output.ensure_index('year_joined')
84 - definition = kwargs.pop('definition')
85 - while True:
86 - try:
87 - id = input_queue.get(block=False)
88 - editor = input.find_one({'editor': id})
89 - edits = editor['edits']
90 - edits = sort_edits(edits)
91 - edit_count = len(edits)
92 - new_wikipedian = edits[9]['date']
93 - first_edit = edits[0]['date']
94 - final_edit = edits[-1]['date']
95 - edits_by_year = determine_edits_by_year(edits)
96 - articles_by_year = determine_articles_by_year(edits)
97 - edits = edits[:10]
98 -
99 - output.insert({'editor': id, 'edits': edits,
100 - 'edits_by_year': edits_by_year,
101 - 'new_wikipedian': new_wikipedian,
102 - 'edit_count': edit_count,
103 - 'final_edit': final_edit,
104 - 'first_edit': first_edit,
105 - 'articles_by_year': articles_by_year})
106 - print 'Items left: %s' % input_queue.qsize()
107 - except Empty:
108 - break
109 -
110 -
111 -def run_optimize_editors(dbname):
112 - ids = construct_datasets.retrieve_editor_ids_mongo(dbname, 'editors')
113 - kwargs = {'definition': 'traditional',
114 - 'pbar': True,
115 - 'dbname': 'enwiki',
116 - 'nr_input_processors': 1,
117 - 'nr_output_processors': 0,
118 - }
119 - print len(ids)
120 - ids = list(ids)
121 - chunks = utils.split_list(ids, settings.NUMBER_OF_PROCESSES)
122 -# chunks = {}
123 -# parts = int(round(float(len(ids)) / 1, 0))
124 -# a = 0
125 -# for x in xrange(settings.NUMBER_OF_PROCESSES):
126 -# b = a + parts
127 -# chunks[x] = ids[a:b]
128 -# a = (x + 1) * parts
129 -# if a >= len(ids):
130 -# break
131 -
132 - pc.build_scaffolding(pc.load_queue, optimize_editors, chunks, False, False, **kwargs)
133 -
134 -
135 -def debug_optimize_editors(dbname):
136 - ids = construct_datasets.retrieve_editor_ids_mongo(dbname, 'editors')
137 - q = pc.load_queue(ids)
138 - kwargs = {'definition': 'traditional',
139 - 'dbname': dbname
140 - }
141 - optimize_editors(q, False, True, kwargs)
142 -
143 -
144 -if __name__ == '__main__':
145 - #debug_optimize_editors('test')
146 - run_optimize_editors('enwiki')
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__author__email = 'dvanliere at gmail dot com'
 18+__date__ = '2010-11-02'
 19+__version__ = '0.1'
 20+
 21+from multiprocessing import Queue
 22+from Queue import Empty
 23+from operator import itemgetter
 24+import datetime
 25+
 26+import settings
 27+from database import db
 28+from utils import process_constructor as pc
 29+from utils import utils
 30+import construct_datasets
 31+
 32+
 33+def create_datacontainer(init_value=0):
 34+ '''
 35+ This function initializes an empty dictionary with as key the year (starting
 36+ 2001 and running through) and as value @init_value, in most cases this will
 37+ be zero so the dictionary will act as a running tally for a variable but
 38+ @init_value can also a list, [], or a dictionary, {}, or a set, set().
 39+ '''
 40+ data = {}
 41+ year = datetime.datetime.now().year + 1
 42+ for x in xrange(2001, year):
 43+ data[str(x)] = init_value
 44+ return data
 45+
 46+
 47+def determine_edits_by_year(dates):
 48+ '''
 49+ This function counts the number of edits by year made by a particular editor.
 50+ '''
 51+ edits = create_datacontainer()
 52+ for date in dates:
 53+ year = str(date['date'].year)
 54+ edits[year] += 1
 55+ return edits
 56+
 57+
 58+def determine_articles_by_year(dates):
 59+ '''
 60+ This function counts the number of unique articles by year edited by a
 61+ particular editor.
 62+ '''
 63+ articles = create_datacontainer(set())
 64+ for date in dates:
 65+ year = str(date['date'].year)
 66+ articles[year].add(date['article'])
 67+ for article in articles:
 68+ articles[article] = len(article)
 69+ return articles
 70+
 71+
 72+def sort_edits(edits):
 73+ edits = utils.merge_list(edits)
 74+ return sorted(edits, key=itemgetter('date'))
 75+
 76+
 77+def optimize_editors(input_queue, result_queue, pbar, **kwargs):
 78+ dbname = kwargs.pop('dbname')
 79+ mongo = db.init_mongo_db(dbname)
 80+ input = mongo['editors']
 81+ output = mongo['dataset']
 82+ output.ensure_index('editor')
 83+ output.ensure_index('year_joined')
 84+ definition = kwargs.pop('definition')
 85+ while True:
 86+ try:
 87+ id = input_queue.get(block=False)
 88+ editor = input.find_one({'editor': id})
 89+ edits = editor['edits']
 90+ edits = sort_edits(edits)
 91+ edit_count = len(edits)
 92+ new_wikipedian = edits[9]['date']
 93+ first_edit = edits[0]['date']
 94+ final_edit = edits[-1]['date']
 95+ edits_by_year = determine_edits_by_year(edits)
 96+ articles_by_year = determine_articles_by_year(edits)
 97+ edits = edits[:10]
 98+
 99+ output.insert({'editor': id, 'edits': edits,
 100+ 'edits_by_year': edits_by_year,
 101+ 'new_wikipedian': new_wikipedian,
 102+ 'edit_count': edit_count,
 103+ 'final_edit': final_edit,
 104+ 'first_edit': first_edit,
 105+ 'articles_by_year': articles_by_year})
 106+ print 'Items left: %s' % input_queue.qsize()
 107+ except Empty:
 108+ break
 109+
 110+
 111+def run_optimize_editors(dbname):
 112+ ids = construct_datasets.retrieve_editor_ids_mongo(dbname, 'editors')
 113+ kwargs = {'definition': 'traditional',
 114+ 'pbar': True,
 115+ 'dbname': 'enwiki',
 116+ 'nr_input_processors': 1,
 117+ 'nr_output_processors': 0,
 118+ }
 119+ print len(ids)
 120+ ids = list(ids)
 121+ chunks = utils.split_list(ids, settings.NUMBER_OF_PROCESSES)
 122+# chunks = {}
 123+# parts = int(round(float(len(ids)) / 1, 0))
 124+# a = 0
 125+# for x in xrange(settings.NUMBER_OF_PROCESSES):
 126+# b = a + parts
 127+# chunks[x] = ids[a:b]
 128+# a = (x + 1) * parts
 129+# if a >= len(ids):
 130+# break
 131+
 132+ pc.build_scaffolding(pc.load_queue, optimize_editors, chunks, False, False, **kwargs)
 133+
 134+
 135+def debug_optimize_editors(dbname):
 136+ ids = construct_datasets.retrieve_editor_ids_mongo(dbname, 'editors')
 137+ q = pc.load_queue(ids)
 138+ kwargs = {'definition': 'traditional',
 139+ 'dbname': dbname
 140+ }
 141+ optimize_editors(q, False, True, kwargs)
 142+
 143+
 144+if __name__ == '__main__':
 145+ #debug_optimize_editors('test')
 146+ run_optimize_editors('enwiki')
Property changes on: trunk/tools/editor_trends/optimize_editors.py
___________________________________________________________________
Added: svn:eol-style
147 147 + native

Status & tagging log