Index: trunk/tools/editor_trends/manage.py |
— | — | @@ -41,15 +41,15 @@ |
42 | 42 | return getattr(args, key, None) |
43 | 43 | |
44 | 44 | |
45 | | -def config_launcher(args, location, filename, project, language_code): |
| 45 | +def config_launcher(args, location, filename, project, full_project, language_code): |
46 | 46 | config.load_configuration(args) |
47 | 47 | |
48 | 48 | |
49 | 49 | def determine_default_language(): |
50 | 50 | language_code = locale.getdefaultlocale()[0] |
51 | 51 | return language_code.split('_')[0] |
52 | | - |
53 | | - |
| 52 | + |
| 53 | + |
54 | 54 | def retrieve_projectname(args): |
55 | 55 | language_code = retrieve_language(args) |
56 | 56 | if language_code == None: |
— | — | @@ -90,33 +90,39 @@ |
91 | 91 | language_code = retrieve_language(args) |
92 | 92 | locations['language_code'] = language_code |
93 | 93 | locations['location'] = os.path.join(location, language_code, project) |
94 | | - locations['project'] = retrieve_projectname(args) |
| 94 | + locations['project'] = project |
| 95 | + locations['full_project'] = retrieve_projectname(args) |
95 | 96 | locations['filename'] = generate_wikidump_filename(project, args) |
96 | 97 | return locations |
97 | 98 | |
98 | 99 | |
99 | | -def show_settings(args, location, filename, project, language_code): |
| 100 | +def prepare_file_locations(location): |
| 101 | + result = utils.check_file_exists(location, '') |
| 102 | + if not result: |
| 103 | + utils.create_directory(location) |
| 104 | + |
| 105 | + |
| 106 | +def show_settings(args, location, filename, project, full_project, language_code): |
100 | 107 | project = settings.WIKIMEDIA_PROJECTS.get(project, 'wiki') |
101 | 108 | project = project.title() |
102 | 109 | language_map = utils.invert_dict(languages.MAPPING) |
103 | 110 | print 'Project: %s' % (project) |
104 | 111 | print 'Language: %s' % language_map[language_code].decode('utf-8') |
105 | 112 | print 'Input directory: %s' % location |
106 | | - print 'Output directory: TODO' |
107 | | - |
| 113 | + print 'Output directory: %s and subdirectories' % location |
108 | 114 | |
109 | | -def dump_downloader_launcher(args, location, filename, project, language_code): |
| 115 | + |
| 116 | +def dump_downloader_launcher(args, location, filename, project, full_project, language_code): |
110 | 117 | print 'dump downloader' |
111 | 118 | pbar = get_value(args, 'progress') |
112 | 119 | domain = settings.WP_DUMP_LOCATION |
113 | 120 | path = '/%s/latest/' % project |
114 | 121 | extension = utils.determine_file_extension(filename) |
115 | 122 | filemode = utils.determine_file_mode(extension) |
116 | | - |
117 | 123 | dump_downloader.download_wiki_file(domain, path, filename, location, filemode, pbar) |
118 | 124 | |
119 | 125 | |
120 | | -def split_xml_file_launcher(args, location, filename, project, language_code): |
| 126 | +def split_xml_file_launcher(args, location, filename, project, full_project, language_code): |
121 | 127 | print 'split_xml_file_launcher' |
122 | 128 | ext = utils.determine_file_extension(filename) |
123 | 129 | if ext in settings.COMPRESSION_EXTENSIONS: |
— | — | @@ -136,7 +142,7 @@ |
137 | 143 | path = config.detect_installed_program('7zip') |
138 | 144 | source = os.path.join(location, file) |
139 | 145 | p = None |
140 | | - |
| 146 | + |
141 | 147 | if settings.OS == 'Windows': |
142 | 148 | p = subprocess.Popen(['%s%s' % (path, '7z.exe'), 'e', '-o%s\\' % location, '%s' % (source,)], shell=True).wait() |
143 | 149 | elif settings.OS == 'Linux': |
— | — | @@ -148,18 +154,22 @@ |
149 | 155 | return p |
150 | 156 | |
151 | 157 | |
152 | | -def mongodb_script_launcher(args, location, filename, project, language_code): |
| 158 | +def mongodb_script_launcher(args, location, filename, project, full_project, language_code): |
153 | 159 | print 'mongodb_script_launcher' |
154 | 160 | map_wiki_editors.run_parse_editors(project, language_code, location) |
155 | 161 | |
156 | 162 | |
157 | | -def dataset_launcher(args, project): |
| 163 | +def sort_launcher(args, location, filename, project, full_project, language_code): |
| 164 | + raise NotImplementedError |
| 165 | + |
| 166 | + |
| 167 | +def dataset_launcher(args, location, filename, project, full_project, language_code): |
158 | 168 | print 'dataset launcher' |
159 | 169 | optimize_editors.run_optimize_editors(full_project) |
160 | 170 | construct_datasets.generate_editor_dataset_launcher(full_project) |
161 | 171 | |
162 | 172 | |
163 | | -def all_launcher(args, location, filename, project, language_code): |
| 173 | +def all_launcher(args, location, filename, project, full_project, language_code): |
164 | 174 | print 'all_launcher' |
165 | 175 | dump_downloader_launcher(args, location, filename, project, full_project, language_code) |
166 | 176 | split_xml_file_launcher(args, location, filename, project, full_project, language_code) |
— | — | @@ -173,7 +183,7 @@ |
174 | 184 | return tuple(choices) |
175 | 185 | |
176 | 186 | |
177 | | -def show_languages(args, location, filename, project, language_code): |
| 187 | +def show_languages(args, location, filename, project, full_project, language_code): |
178 | 188 | first = get_value(args, 'startswith') |
179 | 189 | if first != None: |
180 | 190 | first = first.title() |
— | — | @@ -193,18 +203,18 @@ |
194 | 204 | |
195 | 205 | |
196 | 206 | def detect_python_version(): |
197 | | - version = ''.join(sys.version_info[0:2]) |
| 207 | + version = sys.version_info[0:2] |
198 | 208 | if version < settings.MINIMUM_PYTHON_VERSION: |
199 | | - raise 'Please upgrade to Python 2.6 or higher (but not Python 3.x).' |
| 209 | + raise Exception('Please upgrade to Python 2.6 or higher (but not Python 3.x).') |
200 | 210 | |
201 | 211 | def about(): |
202 | 212 | print 'Editor Trends Software is (c) 2010 by the Wikimedia Foundation.' |
203 | 213 | print 'Written by Diederik van Liere (dvanliere@gmail.com).' |
204 | 214 | print 'This software comes with ABSOLUTELY NO WARRANTY. This is free software, and you are welcome to distribute it under certain conditions.' |
205 | 215 | print 'See the README.1ST file for more information.' |
206 | | - print '' |
207 | | - |
| 216 | + print '\n' |
208 | 217 | |
| 218 | + |
209 | 219 | def main(): |
210 | 220 | default_language = determine_default_language() |
211 | 221 | file_choices = ('stub-meta-history.xml.gz', |
— | — | @@ -216,8 +226,8 @@ |
217 | 227 | subparsers = parser.add_subparsers(help='sub-command help') |
218 | 228 | |
219 | 229 | parser_languages = subparsers.add_parser('show_languages', help='Overview of all valid languages.') |
220 | | - parser_languages.add_argument('-s', '--startswith', |
221 | | - action='store', |
| 230 | + parser_languages.add_argument('-s', '--startswith', |
| 231 | + action='store', |
222 | 232 | help='Enter the first letter of a language to see which languages are available.') |
223 | 233 | parser_languages.set_defaults(func=show_languages) |
224 | 234 | |
— | — | @@ -230,12 +240,15 @@ |
231 | 241 | parser_split = subparsers.add_parser('split', help='The split sub command splits the downloaded file into smaller chunks to parallelize information extraction.') |
232 | 242 | parser_split.set_defaults(func=split_xml_file_launcher) |
233 | 243 | |
| 244 | + parser_sort = subparsers.add_parser('sort', help='By presorting the data, significant reductions in processing time are achieved.') |
| 245 | + parser_sort.set_defaults(func=sort_launcher) |
| 246 | + |
234 | 247 | parser_create = subparsers.add_parser('store', help='The store sub command parses the XML chunk files, extracts the information and stores it in MongoDB.') |
235 | 248 | parser_create.set_defaults(func=mongodb_script_launcher) |
236 | 249 | |
237 | 250 | parser_dataset = subparsers.add_parser('dataset', help='Create a dataset from the MongoDB and write it to a csv file.') |
238 | 251 | parser_dataset.set_defaults(func=dataset_launcher) |
239 | | - |
| 252 | + |
240 | 253 | parser_all = subparsers.add_parser('all', help='The all sub command runs the download, split, store and dataset commands.\n\nWARNING: THIS COULD TAKE DAYS DEPENDING ON THE CONFIGURATION OF YOUR MACHINE AND THE SIZE OF THE WIKIMEDIA DUMP FILE.') |
241 | 254 | parser_all.set_defaults(func=all_launcher) |
242 | 255 | |
— | — | @@ -265,6 +278,7 @@ |
266 | 279 | args = parser.parse_args() |
267 | 280 | config.load_configuration(args) |
268 | 281 | locations = determine_file_locations(args) |
| 282 | + prepare_file_locations(locations['location']) |
269 | 283 | about() |
270 | 284 | show_settings(args, **locations) |
271 | 285 | args.func(args, **locations) |
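Note on the detect_python_version change above: Python compares tuples element-wise, which is what makes the new (2, 6) constant in settings.py reliable where the old float was not. A minimal sketch of the corrected check:

    import sys

    MINIMUM_PYTHON_VERSION = (2, 6)  # mirrors settings.MINIMUM_PYTHON_VERSION

    def detect_python_version():
        # sys.version_info[0:2] is e.g. (2, 6); tuple comparison is
        # element-wise, so (2, 5) < (2, 6) and (2, 10) > (2, 6), whereas
        # the old float constant got 2.10 wrong (2.10 < 2.6 as floats).
        if sys.version_info[0:2] < MINIMUM_PYTHON_VERSION:
            raise Exception('Please upgrade to Python 2.6 or higher (but not Python 3.x).')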
Index: trunk/tools/editor_trends/optimize_editors.py |
— | — | @@ -29,6 +29,13 @@ |
30 | 30 | import construct_datasets |
31 | 31 | |
32 | 32 | |
| 33 | +try: |
| 34 | + import psyco |
| 35 | + psyco.full() |
| 36 | +except ImportError: |
| 37 | + pass |
| 38 | + |
| 39 | + |
33 | 40 | def create_datacontainer(init_value=0): |
34 | 41 | ''' |
35 | 42 | This function initializes an empty dictionary with as key the year (starting |
— | — | @@ -43,6 +50,29 @@ |
44 | 51 | return data |
45 | 52 | |
46 | 53 | |
| 54 | +def add_months_to_datacontainer(datacontainer): |
| 55 | + for dc in datacontainer: |
| 56 | + datacontainer[dc] = {} |
| 57 | + for x in xrange(1, 13): |
| 58 | + datacontainer[dc][str(x)] = 0 |
| 59 | + return datacontainer |
| 60 | + |
| 61 | + |
| 62 | +def determine_edits_by_month(edits): |
| 63 | + datacontainer = create_datacontainer(init_value=0) |
| 64 | + datacontainer = add_months_to_datacontainer(datacontainer) |
| 65 | + for year in edits: |
| 66 | + months = set() |
| 67 | + for edit in edits[year]: |
| 68 | + m = str(edit['date'].month) |
| 69 | + if m not in months: |
| 70 | + datacontainer[year][m] = 1 |
| 71 | + months.add(m) |
| 72 | + if len(months) == 12: |
| 73 | + break |
| 74 | + return datacontainer |
| 75 | + |
| 76 | + |
47 | 77 | def determine_edits_by_year(dates): |
48 | 78 | ''' |
49 | 79 | This function counts the number of edits by year made by a particular editor. |
— | — | @@ -64,19 +94,19 @@ |
65 | 95 | year = str(date['date'].year) |
66 | 96 | articles[year].add(date['article']) |
67 | 97 | for article in articles: |
68 | | - articles[article] = len(article) |
| 98 | + articles[article] = len(articles[article]) |
69 | 99 | return articles |
70 | 100 | |
71 | 101 | |
72 | 102 | def sort_edits(edits): |
73 | 103 | edits = utils.merge_list(edits) |
74 | 104 | return sorted(edits, key=itemgetter('date')) |
75 | | - |
76 | 105 | |
| 106 | + |
77 | 107 | def optimize_editors(input_queue, result_queue, pbar, **kwargs): |
78 | 108 | dbname = kwargs.pop('dbname') |
79 | 109 | mongo = db.init_mongo_db(dbname) |
80 | | - input = mongo['editors'] |
| 110 | + input = mongo['test'] |
81 | 111 | output = mongo['dataset'] |
82 | 112 | output.ensure_index('editor') |
83 | 113 | output.ensure_index('year_joined') |
— | — | @@ -85,7 +115,10 @@ |
86 | 116 | try: |
87 | 117 | id = input_queue.get(block=False) |
88 | 118 | editor = input.find_one({'editor': id}) |
| 119 | + if editor is None: |
| 120 | + continue |
89 | 121 | edits = editor['edits'] |
| 122 | + monthly_edits = determine_edits_by_month(edits) |
90 | 123 | edits = sort_edits(edits) |
91 | 124 | edit_count = len(edits) |
92 | 125 | new_wikipedian = edits[9]['date'] |
— | — | @@ -93,6 +126,7 @@ |
94 | 127 | final_edit = edits[-1]['date'] |
95 | 128 | edits_by_year = determine_edits_by_year(edits) |
96 | 129 | articles_by_year = determine_articles_by_year(edits) |
| 130 | + |
97 | 131 | edits = edits[:10] |
98 | 132 | |
99 | 133 | output.insert({'editor': id, 'edits': edits, |
— | — | @@ -101,7 +135,8 @@ |
102 | 136 | 'edit_count': edit_count, |
103 | 137 | 'final_edit': final_edit, |
104 | 138 | 'first_edit': first_edit, |
105 | | - 'articles_by_year': articles_by_year}) |
| 139 | + 'articles_by_year': articles_by_year, |
| 140 | + 'monthly_edits': monthly_edits}) |
106 | 141 | print 'Items left: %s' % input_queue.qsize() |
107 | 142 | except Empty: |
108 | 143 | break |
— | — | @@ -114,20 +149,11 @@ |
115 | 150 | 'dbname': 'enwiki', |
116 | 151 | 'nr_input_processors': 1, |
117 | 152 | 'nr_output_processors': 0, |
| 153 | + 'poison_pill': False |
118 | 154 | } |
119 | 155 | print len(ids) |
120 | 156 | ids = list(ids) |
121 | | - chunks = utils.split_list(ids, settings.NUMBER_OF_PROCESSES) |
122 | | -# chunks = {} |
123 | | -# parts = int(round(float(len(ids)) / 1, 0)) |
124 | | -# a = 0 |
125 | | -# for x in xrange(settings.NUMBER_OF_PROCESSES): |
126 | | -# b = a + parts |
127 | | -# chunks[x] = ids[a:b] |
128 | | -# a = (x + 1) * parts |
129 | | -# if a >= len(ids): |
130 | | -# break |
131 | | - |
| 157 | + chunks = {0: ids} |
132 | 158 | pc.build_scaffolding(pc.load_queue, optimize_editors, chunks, False, False, **kwargs) |
133 | 159 | |
134 | 160 | |
— | — | @@ -142,4 +168,4 @@ |
143 | 169 | |
144 | 170 | if __name__ == '__main__': |
145 | 171 | #debug_optimize_editors('test') |
146 | | - run_optimize_editors('enwiki') |
| 172 | + run_optimize_editors('enwiki') |
\ No newline at end of file |
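determine_edits_by_month above collapses an editor's history into per-year month flags: a month becomes 1 as soon as one edit is seen in it, and the inner loop stops early once all twelve months of a year are covered. A usage sketch with a hypothetical edits structure shaped like the MongoDB 'edits' field:

    import datetime

    edits = {'2009': [{'date': datetime.datetime(2009, 1, 5)},
                      {'date': datetime.datetime(2009, 1, 9)},  # same month, flagged once
                      {'date': datetime.datetime(2009, 4, 2)}]}
    flags = determine_edits_by_month(edits)
    # flags['2009'] -> {'1': 1, '2': 0, '3': 0, '4': 1, ..., '12': 0}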
Index: trunk/tools/editor_trends/settings.py |
— | — | @@ -54,7 +54,7 @@ |
55 | 55 | IGNORE_DIRS = ['wikistats', 'zips'] |
56 | 56 | ROOT = '/' if OS != 'Windows' else 'c:\\' |
57 | 57 | |
58 | | -MINIMUM_PYTHON_VERSION = 2.6 |
| 58 | +MINIMUM_PYTHON_VERSION = (2, 6) |
59 | 59 | |
60 | 60 | dirs = [name for name in os.listdir(WORKING_DIRECTORY) if |
61 | 61 | os.path.isdir(os.path.join(WORKING_DIRECTORY, name))] |
— | — | @@ -111,7 +111,7 @@ |
112 | 112 | MAX_XML_FILE_SIZE = 67108864 |
113 | 113 | |
114 | 114 | if OS == 'Windows' and ARCH == 'i386': |
115 | | - MAX_FILES_OPEN = win32file._getmaxstdio() |
| 115 | + MAX_FILES_OPEN = win32file._getmaxstdio() |
116 | 116 | elif OS != 'Windows': |
117 | 117 | MAX_FILES_OPEN = resource.getrlimit(resource.RLIMIT_NOFILE) |
118 | 118 | else: |
Index: trunk/tools/editor_trends/run.py |
— | — | @@ -1,5 +1,48 @@ |
2 | 2 | import os |
| 3 | +import settings |
| 4 | +#from utils import namespace_downloader as nd |
| 5 | +#nd.launch_downloader() |
3 | 6 | |
| 7 | + |
| 8 | +#def which(program): |
| 9 | +# import os |
| 10 | +# def is_exe(fpath): |
| 11 | +# return os.path.exists(fpath) and os.access(fpath, os.X_OK) |
| 12 | +# |
| 13 | +# fpath, fname = os.path.split(program) |
| 14 | +# if fpath: |
| 15 | +# if is_exe(program): |
| 16 | +# return program |
| 17 | +# else: |
| 18 | +# for path in os.environ["PATH"].split(os.pathsep): |
| 19 | +# exe_file = os.path.join(path, program) |
| 20 | +# if is_exe(exe_file): |
| 21 | +# return exe_file |
| 22 | +# |
| 23 | +# return None |
| 24 | +# |
| 25 | +# |
| 26 | +#result = which('7z.exe') |
| 27 | +#print result |
| 28 | + |
| 29 | +#from database import launcher |
| 30 | +#launcher.launcher() |
| 31 | +from utils import sort |
| 32 | +input = os.path.join(settings.XML_FILE_LOCATION, 'en', 'wiki', 'txt') |
| 33 | +output = os.path.join(settings.XML_FILE_LOCATION, 'en', 'wiki', 'sorted') |
| 34 | +dbname = 'enwiki' |
| 35 | +#sort.debug_mergesort_feeder(input, output) |
| 36 | +#sort.mergesort_launcher(input, output) |
| 37 | +#sort.mergesort_external_launcher(dbname, output, output) |
| 38 | + |
| 39 | + |
| 40 | + |
| 41 | + |
| 42 | + |
| 43 | +from analyses import cohort_charts |
| 44 | +cohort_charts.prepare_cohort_dataset() |
| 45 | +import os |
| 46 | + |
4 | 47 | import settings |
5 | 48 | #from utils import namespace_downloader as nd |
6 | 49 | #nd.launch_downloader() |
Index: trunk/tools/editor_trends/config.py |
— | — | @@ -24,10 +24,13 @@ |
25 | 25 | |
26 | 26 | import settings |
27 | 27 | from utils import utils |
| 28 | +try: |
| 29 | + from _winreg import * |
| 30 | +except ImportError: |
| 31 | + pass |
28 | 32 | |
29 | | - |
30 | 33 | def detect_windows_program(program): |
31 | | - from _winreg import * |
| 34 | + |
32 | 35 | entry = settings.WINDOWS_REGISTER[program] |
33 | 36 | try: |
34 | 37 | key = OpenKey(HKEY_CURRENT_USER, entry, 0, KEY_READ) |
Index: trunk/tools/editor_trends/utils/utils.py |
— | — | @@ -171,7 +171,7 @@ |
172 | 172 | return 'wb' |
173 | 173 | |
174 | 174 | |
175 | | -def write_list_to_csv(data, fh, recursive=False): |
| 175 | +def write_list_to_csv(data, fh, recursive=False, newline=True): |
176 | 176 | ''' |
177 | 177 | @data is a list which can contain other lists that will be written as a |
178 | 178 | single line to a textfile |
— | — | @@ -188,27 +188,32 @@ |
189 | 189 | if tab: |
190 | 190 | fh.write('\t') |
191 | 191 | if type(d) == type([]): |
192 | | - recursive = write_list_to_csv(d, fh, True) |
| 192 | + recursive = write_list_to_csv(d, fh, recursive=True, newline=False) |
| 193 | + elif type(d) == type({}): |
| 194 | + tab = write_dict_to_csv(d, fh, write_key=False, newline=newline) |
193 | 195 | else: |
194 | 196 | fh.write('%s' % d) |
195 | 197 | tab = True |
196 | 198 | if recursive: |
197 | 199 | tab = False |
198 | 200 | return True |
199 | | - fh.write('\n') |
| 201 | + if newline: |
| 202 | + fh.write('\n') |
200 | 203 | |
201 | 204 | |
202 | | -def write_dict_to_csv(data, fh): |
| 205 | +def write_dict_to_csv(data, fh, write_key=True, newline=True): |
203 | 206 | keys = data.keys() |
204 | 207 | for key in keys: |
205 | | - fh.write('%s' % key) |
206 | | - for obs in data[key]: |
207 | | - if getattr(obs, '__iter__', False): |
208 | | - for o in obs: |
209 | | - fh.write('\t%s' % o) |
210 | | - else: |
211 | | - fh.write('\t%s' % (obs)) |
| 208 | + if write_key: |
| 209 | + fh.write('%s' % key) |
| 210 | + if getattr(data[key], '__iter__', False): |
| 211 | + for d in data[key]: |
| 212 | + fh.write('\t%s' % d) |
| 213 | + else: |
| 214 | + fh.write('%s\t' % (data[key])) |
| 215 | + if newline: |
212 | 216 | fh.write('\n') |
| 217 | + return False  # this prevents the calling function from writing \t |
213 | 218 | |
214 | 219 | |
215 | 220 | def create_txt_filehandle(location, name, mode, encoding): |
— | — | @@ -326,6 +331,7 @@ |
327 | 332 | files.append('.'.join(file)) |
328 | 333 | return files |
329 | 334 | |
| 335 | + |
330 | 336 | def merge_list(datalist): |
331 | 337 | merged = [] |
332 | 338 | for d in datalist: |
— | — | @@ -333,6 +339,7 @@ |
334 | 340 | merged.append(x) |
335 | 341 | return merged |
336 | 342 | |
| 343 | + |
337 | 344 | def split_list(datalist, maxval): |
338 | 345 | chunks = {} |
339 | 346 | a = 0 |
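The reworked write_list_to_csv/write_dict_to_csv pair flattens nested lists and dicts onto a single tab-separated line per top-level call, using the newline and write_key flags to suppress the line break and the keys for nested values. A sketch of the intended behaviour, assuming the caller owns the file handle:

    fh = open('example.csv', 'w')
    # nested list: written inline (recursive=True, newline=False)
    write_list_to_csv(['editor_a', [1, 2, 3]], fh)
    # nested dict: values only, since write_key=False is passed down
    write_list_to_csv(['editor_b', {'2009': 10, '2010': 4}], fh)
    fh.close()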
Index: trunk/tools/editor_trends/utils/dump_downloader.py |
— | — | @@ -29,21 +29,25 @@ |
30 | 30 | |
31 | 31 | |
32 | 32 | |
33 | | -def determine_remote_filesize(url, filename): |
| 33 | +def determine_remote_filesize(domain, filename): |
34 | 34 | ''' |
35 | | - @url is the full path of the file to be downloaded |
| 35 | + @domain is the domain of the server hosting the file |
36 | 36 | @filename is the name of the file to be downloaded |
37 | 37 | ''' |
38 | | - if url.startswith('http://'): |
39 | | - url = url[7:] |
40 | | - conn = httplib.HTTPConnection(url, 80) |
41 | | - conn.request('HEAD', filename) |
42 | | - res = conn.getresponse() |
43 | | - conn.close() |
44 | | - if res.status == 200: |
45 | | - return int(res.getheader('content-length', -1)) |
46 | | - else: |
47 | | - return - 1 |
| 38 | + try: |
| 39 | + if domain.startswith('http://'): |
| 40 | + domain = domain[7:] |
| 41 | + conn = httplib.HTTPConnection(domain) |
| 42 | + conn.request('HEAD', filename) |
| 43 | + res = conn.getresponse() |
| 44 | + conn.close() |
| 45 | + if res.status == 200: |
| 46 | + return int(res.getheader('content-length', -1)) |
| 47 | + else: |
| 48 | + return - 1 |
| 49 | + except httplib.socket.error: |
| 50 | + raise httplib.NotConnected('It seems that %s is temporarily unavailable, please try again later.' % domain) |
48 | 52 | |
49 | 53 | |
50 | 54 | def download_wiki_file(domain, path, filename, location, filemode, pbar): |
— | — | @@ -57,17 +61,12 @@ |
58 | 62 | @pbar is an instance of progressbar.ProgressBar() |
59 | 63 | ''' |
60 | 64 | chunk = 4096 |
61 | | - result = utils.check_file_exists(location, '') |
62 | | - if result == False: |
63 | | - utils.create_directory(os.path.join(location)) |
| 65 | + filesize = determine_remote_filesize(domain, path + filename) |
64 | 66 | if filemode == 'w': |
65 | 67 | fh = utils.create_txt_filehandle(location, filename, filemode, settings.ENCODING) |
66 | 68 | else: |
67 | 69 | fh = utils.create_binary_filehandle(location, filename, 'wb') |
68 | 70 | |
69 | | - filesize = determine_remote_filesize(domain, path + filename) |
70 | | - |
71 | | - |
72 | 71 | if filesize != -1 and pbar: |
73 | 72 | widgets = ['%s: ' % filename, progressbar.Percentage(), ' ', |
74 | 73 | progressbar.Bar(marker=progressbar.RotatingMarker()),' ', |
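determine_remote_filesize now expects the bare domain plus a request path; it returns -1 when the size cannot be determined and raises httplib.NotConnected when the host is unreachable. A usage sketch with hypothetical dump coordinates:

    domain = 'http://download.wikimedia.org'
    path = '/enwiki/latest/'
    filename = 'enwiki-latest-stub-meta-history.xml.gz'
    filesize = determine_remote_filesize(domain, path + filename)
    if filesize != -1:
        print 'Remote file is %s bytes' % filesize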
Index: trunk/tools/editor_trends/utils/models.py |
— | — | @@ -30,13 +30,14 @@ |
31 | 31 | for kw in kwargs: |
32 | 32 | setattr(self, kw, kwargs[kw]) |
33 | 33 | |
34 | | - def run(self): |
| 34 | + def start(self): |
35 | 35 | proc_name = self.name |
36 | 36 | kwargs = {} |
37 | 37 | IGNORE = ['input_queue', 'result_queue', 'target'] |
38 | 38 | for kw in self.__dict__: |
39 | 39 | if kw not in IGNORE and not kw.startswith('_'): |
40 | 40 | kwargs[kw] = getattr(self, kw) |
| 41 | + self._popen = True |
41 | 42 | self.target(self.input_queue, self.result_queue, **kwargs) |
42 | 43 | |
43 | 44 | |
— | — | @@ -50,11 +51,12 @@ |
51 | 52 | setattr(self, kw, kwargs[kw]) |
52 | 53 | |
53 | 54 | |
54 | | - def run(self): |
| 55 | + def start(self): |
55 | 56 | proc_name = self.name |
56 | 57 | kwargs = {} |
57 | 58 | IGNORE = ['result_queue', 'target'] |
58 | 59 | for kw in self.__dict__: |
59 | 60 | if kw not in IGNORE and not kw.startswith('_'): |
60 | 61 | kwargs[kw] = getattr(self, kw) |
| 62 | + self._popen = True |
61 | 63 | self.target(self.result_queue, **kwargs) |
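Renaming run to start makes these Process subclasses run their target synchronously: start() executes the target in the calling process instead of forking a child, and self._popen = True keeps multiprocessing's bookkeeping from treating the process as never started. A sketch of the resulting call pattern (the class and worker names here are assumed for illustration):

    p = ProcessInputQueue(target=worker, input_queue=tasks,
                          result_queue=results, name='worker-0')
    p.start()  # blocks until worker(tasks, results, **kwargs) returns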
Index: trunk/tools/editor_trends/utils/process_constructor.py |
— | — | @@ -81,7 +81,7 @@ |
82 | 82 | **kwargs) for i in xrange(nr_input_processors)] |
83 | 83 | |
84 | 84 | for input_process in input_processes: |
85 | | - input_process.run() |
| 85 | + input_process.start() |
86 | 86 | pids = [p.pid for p in input_processes] |
87 | 87 | kwargs['pids'] = pids |
88 | 88 | |
Index: trunk/tools/editor_trends/utils/sort.py |
— | — | @@ -105,6 +105,7 @@ |
106 | 106 | mongo = db.init_mongo_db(dbname) |
107 | 107 | collection = mongo['test'] |
108 | 108 | mongo.collection.ensure_index('editor') |
| 109 | + mongo.collection.create_index('editor') |
109 | 110 | editor_cache = cache.EditorCache(collection) |
110 | 111 | prev_contributor = -1 |
111 | 112 | x = 0 |
— | — | @@ -113,12 +114,15 @@ |
114 | 115 | for line in readline(fh): |
115 | 116 | if len(line) == 0: |
116 | 117 | continue |
117 | | - contributor = int(line[0]) |
| 118 | + contributor = int(line[0]) |
| 119 | + if contributor == 5767932: |
| 120 | + print 'debug' |
118 | 121 | if prev_contributor != contributor: |
119 | 122 | if edits >= 10: |
120 | 123 | result = editor_cache.add(prev_contributor, 'NEXT') |
121 | 124 | if result: |
122 | | - editors.add(contributor) |
| 125 | + editors.add(prev_contributor) |
| 126 | + result = None |
123 | 127 | x += 1 |
124 | 128 | print 'Stored %s editors' % x |
125 | 129 | else: |
Index: trunk/tools/editor_trends/construct_datasets.py |
— | — | @@ -19,6 +19,8 @@ |
20 | 20 | |
21 | 21 | from multiprocessing import Queue |
22 | 22 | from Queue import Empty |
| 23 | +import datetime |
| 24 | +from dateutil.relativedelta import relativedelta |
23 | 25 | |
24 | 26 | import progressbar |
25 | 27 | |
— | — | @@ -67,7 +69,18 @@ |
68 | 70 | obs[var] = edits |
69 | 71 | return obs |
70 | 72 | |
| 73 | +def write_longitudinal_data(id, edits, fh): |
| 74 | + years = edits.keys() |
| 75 | + years.sort() |
| 76 | + for year in years: |
| 77 | + months = edits[year].keys() |
| 78 | + months = [int(m) for m in months] |
| 79 | + months.sort() |
| 80 | + for m in months: |
| 81 | + date = datetime.date(int(year), int(m), 1) |
| 82 | + fh.write('%s\t%s\t%s\n' % (id, date, edits[year][str(m)])) |
71 | 83 | |
| 84 | + |
72 | 85 | def expand_headers(headers, vars_to_expand, obs): |
73 | 86 | for var in vars_to_expand: |
74 | 87 | l = len(obs[var]) |
— | — | @@ -77,29 +90,105 @@ |
78 | 91 | suffix = 2001 + i |
79 | 92 | elif var.endswith('edits'): |
80 | 93 | suffix = 1 + i |
81 | | - headers.insert(pos+i, '%s_%s' % (var, suffix)) |
| 94 | + headers.insert(pos + i, '%s_%s' % (var, suffix)) |
82 | 95 | headers.remove(var) |
83 | 96 | return headers |
84 | | - |
85 | | - |
86 | | -def generate_editor_dataset(input_queue, data_queue, pbar, **kwargs): |
| 97 | + |
| 98 | + |
| 99 | +def generate_long_editor_dataset(input_queue, data_queue, pbar, **kwargs): |
87 | 100 | debug = kwargs.pop('debug') |
88 | 101 | dbname = kwargs.pop('dbname') |
89 | 102 | mongo = db.init_mongo_db(dbname) |
90 | 103 | editors = mongo['dataset'] |
91 | | - name = dbname + '_editors.csv' |
| 104 | + name = dbname + '_long_editors.csv' |
92 | 105 | fh = utils.create_txt_filehandle(settings.DATASETS_FILE_LOCATION, name, 'a', settings.ENCODING) |
93 | 106 | x = 0 |
94 | | - vars_to_expand = ['edits', 'edits_by_year'] |
| 107 | + vars_to_expand = [] |
95 | 108 | while True: |
96 | 109 | try: |
| 110 | + id = input_queue.get(block=False) |
| 111 | + obs = editors.find_one({'editor': id}, {'monthly_edits': 1}) |
| 112 | + if x == 0: |
| 113 | + headers = obs.keys() |
| 114 | + headers.sort() |
| 115 | + headers = expand_headers(headers, vars_to_expand, obs) |
| 116 | + utils.write_list_to_csv(headers, fh) |
| 117 | + write_longitudinal_data(id, obs['monthly_edits'], fh) |
| 118 | + #utils.write_list_to_csv(data, fh) |
| 119 | + x += 1 |
| 120 | + except Empty: |
| 121 | + break |
| 122 | + |
| 123 | + |
| 124 | +def generate_cohort_analysis(input_queue, data_queue, pbar, **kwargs): |
| 125 | + dbname = kwargs.get('dbname') |
| 126 | + pbar = kwargs.get('pbar') |
| 127 | + mongo = db.init_mongo_db(dbname) |
| 128 | + editors = mongo['dataset'] |
| 129 | + year = datetime.datetime.now().year + 1 |
| 130 | + begin = year - 2001 |
| 131 | + p = [3, 6, 9] |
| 132 | + periods = [y * 12 for y in xrange(1, begin)] |
| 133 | + periods = p + periods |
| 134 | + data = {} |
| 135 | + while True: |
| 136 | + try: |
| 137 | + id = input_queue.get(block=False) |
| 138 | + obs = editors.find_one({'editor': id}, {'first_edit': 1, 'final_edit': 1}) |
| 139 | + first_edit = obs['first_edit'] |
| 140 | + last_edit = obs['final_edit'] |
| 141 | + for y in xrange(2001, year): |
| 144 | + if y not in data: |
| 145 | + data[y] = {} |
| 146 | + data[y]['n'] = 0 |
| 147 | + window_end = datetime.datetime(y, 12, 31) |
| 148 | + if window_end > datetime.datetime.now(): |
| 149 | + now = datetime.datetime.now() |
| 150 | + m = now.month - 1 #Dump files are always lagging at least one month.... |
| 151 | + d = now.day |
| 152 | + window_end = datetime.datetime(y, m, d) |
| 153 | + edits = [] |
| 154 | + for period in periods: |
| 155 | + if period not in data[y]: |
| 156 | + data[y][period] = 0 |
| 157 | + window_start = datetime.datetime(y, 12, 31) - relativedelta(months=period) |
| 158 | + if window_start < datetime.datetime(2001, 1, 1): |
| 159 | + window_start = datetime.datetime(2001, 1, 1) |
| 160 | + if date_falls_in_window(window_start, window_end, first_edit, last_edit): |
| 161 | + edits.append(period) |
| 162 | + if edits != []: |
| 163 | + p = min(edits) |
| 164 | + data[y]['n'] += 1 |
| 165 | + data[y][p] += 1 |
| 166 | + #pbar.update(+1) |
| 167 | + except Empty: |
| 168 | + break |
| 169 | + utils.store_object(data, settings.BINARY_OBJECT_FILE_LOCATION, 'cohort_data') |
| 170 | + |
| 171 | +def date_falls_in_window(window_start, window_end, first_edit, last_edit): |
| 172 | + return window_start <= first_edit <= window_end |
| 176 | + |
| 177 | + |
| 178 | +def generate_wide_editor_dataset(input_queue, data_queue, pbar, **kwargs): |
| 179 | + dbname = kwargs.pop('dbname') |
| 180 | + mongo = db.init_mongo_db(dbname) |
| 181 | + editors = mongo['dataset'] |
| 182 | + name = dbname + '_wide_editors.csv' |
| 183 | + fh = utils.create_txt_filehandle(settings.DATASETS_FILE_LOCATION, name, 'a', settings.ENCODING) |
| 184 | + x = 0 |
| 185 | + vars_to_expand = ['edits', 'edits_by_year', 'articles_by_year'] |
| 186 | + while True: |
| 187 | + try: |
97 | 188 | if debug: |
98 | 189 | id = u'99797' |
99 | 190 | else: |
100 | 191 | id = input_queue.get(block=False) |
101 | | - |
102 | 192 | print input_queue.qsize() |
103 | | - |
104 | 193 | obs = editors.find_one({'editor': id}) |
105 | 194 | obs = expand_observations(obs, vars_to_expand) |
106 | 195 | if x == 0: |
— | — | @@ -107,14 +196,12 @@ |
108 | 197 | headers.sort() |
109 | 198 | headers = expand_headers(headers, vars_to_expand, obs) |
110 | 199 | utils.write_list_to_csv(headers, fh) |
111 | | - fh.write('\n') |
112 | 200 | data = [] |
113 | 201 | keys = obs.keys() |
114 | 202 | keys.sort() |
115 | 203 | for key in keys: |
116 | 204 | data.append(obs[key]) |
117 | 205 | utils.write_list_to_csv(data, fh) |
118 | | - fh.write('\n') |
119 | 206 | |
120 | 207 | x += 1 |
121 | 208 | except Empty: |
— | — | @@ -141,20 +228,13 @@ |
142 | 229 | 'nr_output_processors': 1, |
143 | 230 | 'debug': False, |
144 | 231 | 'dbname': dbname, |
| 232 | + 'poison_pill': False, |
| 233 | + 'pbar': True |
145 | 234 | } |
146 | 235 | ids = retrieve_editor_ids_mongo(dbname, 'editors') |
147 | | - chunks = utils.split_list(ids, settings.NUMBER_OF_PROCESSES) |
148 | | -# chunks = {} |
149 | | -# parts = int(round(float(len(ids)) / 1, 0)) |
150 | | -# a = 0 |
151 | | -# for x in xrange(settings.NUMBER_OF_PROCESSES): |
152 | | -# b = a + parts |
153 | | -# chunks[x] = ids[a:b] |
154 | | -# a = (x + 1) * parts |
155 | | -# if a >= len(ids): |
156 | | -# break |
157 | | -# |
158 | | - pc.build_scaffolding(pc.load_queue, generate_editor_dataset, chunks, False, False, **kwargs) |
| 236 | + ids = list(ids) |
| 237 | + chunks = {0: ids} |
| 238 | + pc.build_scaffolding(pc.load_queue, generate_cohort_analysis, chunks, False, False, **kwargs) |
159 | 239 | |
160 | 240 | |
161 | 241 | def generate_editor_dataset_debug(dbname): |
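Each cohort window in generate_cohort_analysis ends on 31 December of year y and opens period months earlier; date_falls_in_window then decides whether the editor's first edit lands inside it. A worked example with hypothetical dates:

    import datetime
    from dateutil.relativedelta import relativedelta

    window_end = datetime.datetime(2009, 12, 31)
    window_start = window_end - relativedelta(months=6)  # 2009-06-30
    first_edit = datetime.datetime(2009, 8, 15)
    last_edit = datetime.datetime(2010, 2, 1)
    date_falls_in_window(window_start, window_end, first_edit, last_edit)
    # True: this editor counts towards the 6-month bucket of the 2009 cohort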
Index: trunk/tools/editor_trends/database/cache.py |
— | — | @@ -63,16 +63,17 @@ |
64 | 64 | def current_cache_size(self): |
65 | 65 | return sum([self.editors[k].get('obs', 0) for k in self.editors]) |
66 | 66 | |
| 67 | + def clear(self, key): |
| 68 | + if key in self.editors: |
| 69 | + del self.editors[key] |
| 70 | + |
67 | 71 | def add(self, key, value): |
68 | 72 | if value == 'NEXT': |
69 | | - for editor in self.treshold_editors: |
70 | | - self.insert (editor, self.editors[editor]['edits']) |
71 | | - self.n -= self.editors[editor]['obs'] |
72 | | - self.number_editors -= 1 |
73 | | - del self.editors[editor] |
74 | | - if key in self.editors: |
75 | | - del self.editors[key] |
76 | | - self.treshold_editors = set() |
| 73 | + result = self.insert(key, self.editors[key]['edits']) |
| 74 | + self.n -= self.editors[key]['obs'] |
| 75 | + self.number_editors -= 1 |
| 76 | + del self.editors[key] |
| 77 | + return result |
77 | 78 | else: |
78 | 79 | self.cumulative_n += 1 |
79 | 80 | self.n += 1 |
— | — | @@ -88,8 +89,8 @@ |
89 | 90 | self.editors[key]['edits'][year].append(value) |
90 | 91 | self.editors[key]['obs'] += 1 |
91 | 92 | |
92 | | - if self.editors[key]['obs'] == self.treshold: |
93 | | - self.treshold_editors.add(key) |
| 93 | + #if self.editors[key]['obs'] == self.treshold: |
| 94 | + # self.treshold_editors.add(key) |
94 | 95 | |
95 | 96 | def add_years(self, key): |
96 | 97 | now = datetime.datetime.now().year + 1 |
— | — | @@ -103,8 +104,9 @@ |
104 | 105 | def insert(self, editor, values): |
105 | 106 | try: |
106 | 107 | self.collection.insert({'editor': editor, 'edits': values}) |
| 108 | + return True |
107 | 109 | except: |
108 | | - pass |
| 110 | + return False |
109 | 111 | |
110 | 112 | def store(self): |
111 | 113 | utils.store_object(self, settings.BINARY_OBJECT_FILE_LOCATION, self.__repr__()) |
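With this change, 'NEXT' flushes exactly one editor: add(key, 'NEXT') inserts the cached edits for key into MongoDB, adjusts the counters, evicts the entry, and returns the True/False result of insert(), which the loop in sort.py checks before adding prev_contributor to its editors set. A usage sketch (collection follows the surrounding code; the edit value shape is assumed):

    import datetime

    cache = EditorCache(collection)
    cache.add('12345', {'date': datetime.datetime(2009, 1, 5), 'article': 42})
    result = cache.add('12345', 'NEXT')  # flush editor 12345 to MongoDB
    if result:
        editors.add('12345')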