r76463 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r76462 | r76463 | r76464 >
Date:	19:31, 10 November 2010
Author:	reedy
Status:	ok
Tags:
Comment:	set svn:eol-style native
Modified paths:	/trunk/tools/editor_trends/optimize_editors.py (modified) (history) /trunk/tools/editor_trends/run.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/run.py
—	—	@@ -1,37 +1,37 @@
2		~~-import os~~
3		-
4		~~-import settings~~
5		~~-#from utils import namespace_downloader as nd~~
6		~~-#nd.launch_downloader()~~
7		-
8		-
9		~~-#def which(program):~~
10		~~-# import os~~
11		~~-# def is_exe(fpath):~~
12		~~-# return os.path.exists(fpath) and os.access(fpath, os.X_OK)~~
13		-#
14		~~-# fpath, fname = os.path.split(program)~~
15		~~-# if fpath:~~
16		~~-# if is_exe(program):~~
17		~~-# return program~~
18		~~-# else:~~
19		~~-# for path in os.environ["PATH"].split(os.pathsep):~~
20		~~-# exe_file = os.path.join(path, program)~~
21		~~-# if is_exe(exe_file):~~
22		~~-# return exe_file~~
23		-#
24		~~-# return None~~
25		-#
26		-#
27		~~-#result = which('7z.exe')~~
28		~~-#print result~~
29		-
30		~~-#from database import launcher~~
31		~~-#launcher.launcher()~~
32		~~-from utils import sort~~
33		~~-input = os.path.join(settings.XML_FILE_LOCATION, 'en', 'wiki', 'txt')~~
34		~~-output = os.path.join(settings.XML_FILE_LOCATION, 'en', 'wiki', 'sorted')~~
35		~~-dbname = 'enwiki'~~
36		~~-#sort.debug_mergesort_feeder(input, output)~~
37		~~-#sort.mergesort_launcher(input, output)~~
	2	+import os
	3	+
	4	+import settings
	5	+#from utils import namespace_downloader as nd
	6	+#nd.launch_downloader()
	7	+
	8	+
	9	+#def which(program):
	10	+# import os
	11	+# def is_exe(fpath):
	12	+# return os.path.exists(fpath) and os.access(fpath, os.X_OK)
	13	+#
	14	+# fpath, fname = os.path.split(program)
	15	+# if fpath:
	16	+# if is_exe(program):
	17	+# return program
	18	+# else:
	19	+# for path in os.environ["PATH"].split(os.pathsep):
	20	+# exe_file = os.path.join(path, program)
	21	+# if is_exe(exe_file):
	22	+# return exe_file
	23	+#
	24	+# return None
	25	+#
	26	+#
	27	+#result = which('7z.exe')
	28	+#print result
	29	+
	30	+#from database import launcher
	31	+#launcher.launcher()
	32	+from utils import sort
	33	+input = os.path.join(settings.XML_FILE_LOCATION, 'en', 'wiki', 'txt')
	34	+output = os.path.join(settings.XML_FILE_LOCATION, 'en', 'wiki', 'sorted')
	35	+dbname = 'enwiki'
	36	+#sort.debug_mergesort_feeder(input, output)
	37	+#sort.mergesort_launcher(input, output)
38	38	sort.mergesort_external_launcher(dbname, output, output)
\ No newline at end of file
Property changes on: trunk/tools/editor_trends/run.py
___________________________________________________________________
Added: svn:eol-style
39	39	+ native
Index: trunk/tools/editor_trends/optimize_editors.py
—	—	@@ -1,145 +1,145 @@
2		~~-#!/usr/bin/python~~
3		~~-# -- coding: utf-8 --~~
4		~~-'''~~
5		~~-Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)~~
6		~~-This program is free software; you can redistribute it and/or~~
7		~~-modify it under the terms of the GNU General Public License version 2~~
8		~~-as published by the Free Software Foundation.~~
9		~~-This program is distributed in the hope that it will be useful,~~
10		~~-but WITHOUT ANY WARRANTY; without even the implied warranty of~~
11		~~-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.~~
12		~~-See the GNU General Public License for more details, at~~
13		~~-http://www.fsf.org/licenses/gpl.html~~
14		~~-'''~~
15		-
16		~~-__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])~~
17		~~-__author__email = 'dvanliere at gmail dot com'~~
18		~~-__date__ = '2010-11-02'~~
19		~~-__version__ = '0.1'~~
20		-
21		~~-from multiprocessing import Queue~~
22		~~-from Queue import Empty~~
23		~~-from operator import itemgetter~~
24		~~-import datetime~~
25		-
26		~~-import settings~~
27		~~-from database import db~~
28		~~-from utils import process_constructor as pc~~
29		~~-from utils import utils~~
30		~~-import construct_datasets~~
31		-
32		-
33		~~-def create_datacontainer(init_value=0):~~
34		~~- '''~~
35		~~- This function initializes an empty dictionary with as key the year (starting~~
36		~~- 2001 and running through) and as value @init_value, in most cases this will~~
37		~~- be zero so the dictionary will act as a running tally for a variable but~~
38		~~- @init_value can also a list, [], or a dictionary, {}, or a set, set().~~
39		~~- '''~~
40		~~- data = {}~~
41		~~- year = datetime.datetime.now().year + 1~~
42		~~- for x in xrange(2001, year):~~
43		~~- data[str(x)] = init_value~~
44		~~- return data~~
45		-
46		-
47		~~-def determine_edits_by_year(dates):~~
48		~~- '''~~
49		~~- This function counts the number of edits by year made by a particular editor.~~
50		~~- '''~~
51		~~- edits = create_datacontainer()~~
52		~~- for date in dates:~~
53		~~- year = str(date['date'].year)~~
54		~~- edits[year] += 1~~
55		~~- return edits~~
56		-
57		-
58		~~-def determine_articles_by_year(dates):~~
59		~~- '''~~
60		~~- This function counts the number of unique articles by year edited by a~~
61		~~- particular editor.~~
62		~~- '''~~
63		~~- articles = create_datacontainer(set())~~
64		~~- for date in dates:~~
65		~~- year = str(date['date'].year)~~
66		~~- articles[year].add(date['article'])~~
67		~~- for article in articles:~~
68		~~- articles[article] = len(article)~~
69		~~- return articles~~
70		-
71		-
72		~~-def sort_edits(edits):~~
73		~~- edits = utils.merge_list(edits)~~
74		~~- return sorted(edits, key=itemgetter('date'))~~
75		-
76		-
77		~~-def optimize_editors(input_queue, result_queue, pbar, **kwargs):~~
78		~~- dbname = kwargs.pop('dbname')~~
79		~~- mongo = db.init_mongo_db(dbname)~~
80		~~- input = mongo['editors']~~
81		~~- output = mongo['dataset']~~
82		~~- output.ensure_index('editor')~~
83		~~- output.ensure_index('year_joined')~~
84		~~- definition = kwargs.pop('definition')~~
85		~~- while True:~~
86		~~- try:~~
87		~~- id = input_queue.get(block=False)~~
88		~~- editor = input.find_one({'editor': id})~~
89		~~- edits = editor['edits']~~
90		~~- edits = sort_edits(edits)~~
91		~~- edit_count = len(edits)~~
92		~~- new_wikipedian = edits[9]['date']~~
93		~~- first_edit = edits[0]['date']~~
94		~~- final_edit = edits[-1]['date']~~
95		~~- edits_by_year = determine_edits_by_year(edits)~~
96		~~- articles_by_year = determine_articles_by_year(edits)~~
97		~~- edits = edits[:10]~~
98		-
99		~~- output.insert({'editor': id, 'edits': edits,~~
100		~~- 'edits_by_year': edits_by_year,~~
101		~~- 'new_wikipedian': new_wikipedian,~~
102		~~- 'edit_count': edit_count,~~
103		~~- 'final_edit': final_edit,~~
104		~~- 'first_edit': first_edit,~~
105		~~- 'articles_by_year': articles_by_year})~~
106		~~- print 'Items left: %s' % input_queue.qsize()~~
107		~~- except Empty:~~
108		~~- break~~
109		-
110		-
111		~~-def run_optimize_editors(dbname):~~
112		~~- ids = construct_datasets.retrieve_editor_ids_mongo(dbname, 'editors')~~
113		~~- kwargs = {'definition': 'traditional',~~
114		~~- 'pbar': True,~~
115		~~- 'dbname': 'enwiki',~~
116		~~- 'nr_input_processors': 1,~~
117		~~- 'nr_output_processors': 0,~~
118		~~- }~~
119		~~- print len(ids)~~
120		~~- ids = list(ids)~~
121		~~- chunks = utils.split_list(ids, settings.NUMBER_OF_PROCESSES)~~
122		~~-# chunks = {}~~
123		~~-# parts = int(round(float(len(ids)) / 1, 0))~~
124		~~-# a = 0~~
125		~~-# for x in xrange(settings.NUMBER_OF_PROCESSES):~~
126		~~-# b = a + parts~~
127		~~-# chunks[x] = ids[a:b]~~
128		~~-# a = (x + 1) * parts~~
129		~~-# if a >= len(ids):~~
130		~~-# break~~
131		-
132		~~- pc.build_scaffolding(pc.load_queue, optimize_editors, chunks, False, False, **kwargs)~~
133		-
134		-
135		~~-def debug_optimize_editors(dbname):~~
136		~~- ids = construct_datasets.retrieve_editor_ids_mongo(dbname, 'editors')~~
137		~~- q = pc.load_queue(ids)~~
138		~~- kwargs = {'definition': 'traditional',~~
139		~~- 'dbname': dbname~~
140		~~- }~~
141		~~- optimize_editors(q, False, True, kwargs)~~
142		-
143		-
144		~~-if __name__ == '__main__':~~
145		~~- #debug_optimize_editors('test')~~
146		~~- run_optimize_editors('enwiki')~~
	2	+#!/usr/bin/python
	3	+# -- coding: utf-8 --
	4	+'''
	5	+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
	6	+This program is free software; you can redistribute it and/or
	7	+modify it under the terms of the GNU General Public License version 2
	8	+as published by the Free Software Foundation.
	9	+This program is distributed in the hope that it will be useful,
	10	+but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
	12	+See the GNU General Public License for more details, at
	13	+http://www.fsf.org/licenses/gpl.html
	14	+'''
	15	+
	16	+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
	17	+__author__email = 'dvanliere at gmail dot com'
	18	+__date__ = '2010-11-02'
	19	+__version__ = '0.1'
	20	+
	21	+from multiprocessing import Queue
	22	+from Queue import Empty
	23	+from operator import itemgetter
	24	+import datetime
	25	+
	26	+import settings
	27	+from database import db
	28	+from utils import process_constructor as pc
	29	+from utils import utils
	30	+import construct_datasets
	31	+
	32	+
	33	+def create_datacontainer(init_value=0):
	34	+ '''
	35	+ This function initializes an empty dictionary with as key the year (starting
	36	+ 2001 and running through) and as value @init_value, in most cases this will
	37	+ be zero so the dictionary will act as a running tally for a variable but
	38	+ @init_value can also a list, [], or a dictionary, {}, or a set, set().
	39	+ '''
	40	+ data = {}
	41	+ year = datetime.datetime.now().year + 1
	42	+ for x in xrange(2001, year):
	43	+ data[str(x)] = init_value
	44	+ return data
	45	+
	46	+
	47	+def determine_edits_by_year(dates):
	48	+ '''
	49	+ This function counts the number of edits by year made by a particular editor.
	50	+ '''
	51	+ edits = create_datacontainer()
	52	+ for date in dates:
	53	+ year = str(date['date'].year)
	54	+ edits[year] += 1
	55	+ return edits
	56	+
	57	+
	58	+def determine_articles_by_year(dates):
	59	+ '''
	60	+ This function counts the number of unique articles by year edited by a
	61	+ particular editor.
	62	+ '''
	63	+ articles = create_datacontainer(set())
	64	+ for date in dates:
	65	+ year = str(date['date'].year)
	66	+ articles[year].add(date['article'])
	67	+ for article in articles:
	68	+ articles[article] = len(article)
	69	+ return articles
	70	+
	71	+
	72	+def sort_edits(edits):
	73	+ edits = utils.merge_list(edits)
	74	+ return sorted(edits, key=itemgetter('date'))
	75	+
	76	+
	77	+def optimize_editors(input_queue, result_queue, pbar, **kwargs):
	78	+ dbname = kwargs.pop('dbname')
	79	+ mongo = db.init_mongo_db(dbname)
	80	+ input = mongo['editors']
	81	+ output = mongo['dataset']
	82	+ output.ensure_index('editor')
	83	+ output.ensure_index('year_joined')
	84	+ definition = kwargs.pop('definition')
	85	+ while True:
	86	+ try:
	87	+ id = input_queue.get(block=False)
	88	+ editor = input.find_one({'editor': id})
	89	+ edits = editor['edits']
	90	+ edits = sort_edits(edits)
	91	+ edit_count = len(edits)
	92	+ new_wikipedian = edits[9]['date']
	93	+ first_edit = edits[0]['date']
	94	+ final_edit = edits[-1]['date']
	95	+ edits_by_year = determine_edits_by_year(edits)
	96	+ articles_by_year = determine_articles_by_year(edits)
	97	+ edits = edits[:10]
	98	+
	99	+ output.insert({'editor': id, 'edits': edits,
	100	+ 'edits_by_year': edits_by_year,
	101	+ 'new_wikipedian': new_wikipedian,
	102	+ 'edit_count': edit_count,
	103	+ 'final_edit': final_edit,
	104	+ 'first_edit': first_edit,
	105	+ 'articles_by_year': articles_by_year})
	106	+ print 'Items left: %s' % input_queue.qsize()
	107	+ except Empty:
	108	+ break
	109	+
	110	+
	111	+def run_optimize_editors(dbname):
	112	+ ids = construct_datasets.retrieve_editor_ids_mongo(dbname, 'editors')
	113	+ kwargs = {'definition': 'traditional',
	114	+ 'pbar': True,
	115	+ 'dbname': 'enwiki',
	116	+ 'nr_input_processors': 1,
	117	+ 'nr_output_processors': 0,
	118	+ }
	119	+ print len(ids)
	120	+ ids = list(ids)
	121	+ chunks = utils.split_list(ids, settings.NUMBER_OF_PROCESSES)
	122	+# chunks = {}
	123	+# parts = int(round(float(len(ids)) / 1, 0))
	124	+# a = 0
	125	+# for x in xrange(settings.NUMBER_OF_PROCESSES):
	126	+# b = a + parts
	127	+# chunks[x] = ids[a:b]
	128	+# a = (x + 1) * parts
	129	+# if a >= len(ids):
	130	+# break
	131	+
	132	+ pc.build_scaffolding(pc.load_queue, optimize_editors, chunks, False, False, **kwargs)
	133	+
	134	+
	135	+def debug_optimize_editors(dbname):
	136	+ ids = construct_datasets.retrieve_editor_ids_mongo(dbname, 'editors')
	137	+ q = pc.load_queue(ids)
	138	+ kwargs = {'definition': 'traditional',
	139	+ 'dbname': dbname
	140	+ }
	141	+ optimize_editors(q, False, True, kwargs)
	142	+
	143	+
	144	+if __name__ == '__main__':
	145	+ #debug_optimize_editors('test')
	146	+ run_optimize_editors('enwiki')
Property changes on: trunk/tools/editor_trends/optimize_editors.py
___________________________________________________________________
Added: svn:eol-style
147	147	+ native

Status & tagging log

12:29, 15 November 2010 😂 (talk | contribs) changed the status of r76463 [removed: new added: ok]