r109713 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r109712‎ \| r109713 \| r109714 >
Date:	01:27, 22 January 2012
Author:	rfaulk
Status:	deferred
Tags:	sopa
Comment:	reduced to one DB query for increased speed fixed some issues in output formatting
Modified paths:	/trunk/tools/wsor/scripts/zip_to_congress.py (modified) (history)

Diff [purge]

Index: trunk/tools/wsor/scripts/zip_to_congress.py
—	—	@@ -10,15 +10,15 @@
11	11
12	12
13	13	""" Import python base modules """
14		~~-import sys, re, datetime, logging, csv~~
	14	+import sys, logging, csv
15	15	import settings
16	16
17	17	""" Modify the classpath to include local projects """
18	18	sys.path.append(settings.__project_home__)
19	19
20	20	""" Import Analytics modules """
21		~~-from WSOR.scripts.classes.WSORSlaveDataLoader import VandalLoader~~
22	21	import classes.DataLoader as DL
	22	+import classes.Helper as Hlp
23	23
24	24	"""
25	25	Execution body of main
—	—	@@ -30,56 +30,92 @@
31	31	logging.basicConfig(level=logging.DEBUG, stream=LOGGING_STREAM, format='%(asctime)s %(levelname)-8s %(message)s', datefmt='%b-%d %H:%M:%S')
32	32
33	33	# Use a dataloader to extract zipcode lookup counts from db1008
34		~~- dl = DL.DatLoader()~~
35		~~- sql = 'select count(*) from SOPA_zip_codes where zip_code = %s'~~
	34	+ dl = DL.DataLoader()
	35	+ sql = "select date_format(request_time,'%Y%m%d%H'), zip_code, count(*) from SOPA_zip_codes " + \
	36	+ "where zip_code regexp '[0-9]{5}' and request_time >= '20120118050000' and request_time < '20120119050000' group by 1, 2 order by 2, 1"
36	37
37	38	# CSV with zips + congressional districts
38	39	# Generate counts for congressional districts
39	40
40		~~- zip_to_cd = csv.reader(open('zip_to_cd.csv', 'rb'), delimiter=' ', quotechar='\|')~~
41		~~- cd_counts = csv.writer(open('cd_counts.csv', 'wb'), delimiter=' ', quotechar='\|', quoting=csv.QUOTE_MINIMAL)~~
42		-
	41	+ zip_to_cd = csv.reader(open('zipProportions.tsv', 'rb'), delimiter=' ', quotechar='\|')
	42	+ # cd_counts_csv = csv.writer(open('cd_counts.csv', 'wb'), delimiter=' ', quotechar='\|', quoting=csv.QUOTE_MINIMAL)
	43	+ cd_counts_tsv = open('cd_counts.tsv', 'wb')
	44	+
43	45	zip_index = 0
44	46	cd_index = 1
45	47	weight_index = 2
46	48
47	49	# Stores congressional district look-up counts
48		~~- cd_counts = dict()~~
49		-
	50	+ cd_counts = Hlp.AutoVivification()
	51	+ zip_counts = Hlp.AutoVivification()
	52	+
	53	+ # Populate zip counts from table
	54	+ logging.info('Generating zipcode results from table ...')
	55	+ results = dl.execute_SQL(sql)
	56	+ for row in results:
	57	+ zip_hash = row[1]
	58	+ timestamp = row[0]
	59	+ try:
	60	+ zip_counts[zip_hash][timestamp] = zip_counts[zip_hash][timestamp] + int(row[2])
	61	+ except:
	62	+ zip_counts[zip_hash][timestamp] = int(row[2])
	63	+
50	64	# Populate dict with congressional district counts
51		~~- for row in zip_to_cd:~~
	65	+ for row_cd in zip_to_cd:
52	66
53		~~- cd_hash = row[cd_index]~~
54		~~- zip_hash = row[zip_index]~~
55		-
	67	+ cd_hash = row_cd[cd_index]
	68	+ zip_hash = row_cd[zip_index]
	69	+ district_weight = row_cd[weight_index]
	70	+
56	71	cd_hash = convert_congressional_district_name(cd_hash)
57	72
58		~~- # Retrieve ZIPcode count from DB~~
59		~~- result = dl.execute_SQL(sql % zip_hash)[0][0]~~
60		~~- count = int(float(result) * float(row[weight_index]))~~
	73	+ # logging.info('%s - %s - %s' % (cd_hash, zip_hash, district_weight))
61	74
62		~~- try:~~
63		~~- cd_counts[cd_hash] = cd_counts[cd_hash] + count~~
	75	+ # Generate the count corresponding to the zip code and the weight
64	76
65		~~- except:~~
66		~~- cd_counts[cd_hash] = count~~
	77	+ for timestamp in zip_counts[zip_hash]:
67	78
	79	+ # extract count and weight
	80	+ count = zip_counts[zip_hash][timestamp]
	81	+ count = int(float(count) * float(district_weight))
	82	+
	83	+ try:
	84	+ cd_counts[cd_hash][timestamp] = cd_counts[cd_hash] + count
	85	+ except:
	86	+ cd_counts[cd_hash][timestamp] = count
	87	+
68	88	# Process ZIP/Congress record
69		-
	89	+
70	90	# Write congressional district name and counts to csv
71		~~- for elem in cd_counts:~~
72		~~- cd_counts.writerow([elem,cd_counts[elem]])~~
	91	+ cd_keys = cd_counts.keys()
	92	+ for cd_ind in range(len(cd_keys)):
	93	+ cd_hash = cd_keys[cd_ind]
	94	+ ts_counts = cd_counts[cd_hash]
	95	+ ts_keys = ts_counts.keys()
73	96
	97	+ for ts_ind in range(len(ts_keys)):
	98	+ timestamp = ts_keys[ts_ind]
	99	+ count = ts_counts[timestamp]
	100	+
	101	+ cd_counts_tsv.write(cd_hash + '\t' + timestamp + '00\t' + str(count) + '\n')
	102	+ # cd_counts_tsv.writerow([cd_hash, timestamp, count])
	103	+
	104	+ cd_counts_tsv.close()
	105	+
74	106	return 0
75	107
76	108	"""
77	109	Convert congressional district name from format 'XX-##' to 'XX_##'
78	110	"""
79		~~-def convert_congressional_district_name(name_str):~~
80		~~- parts = name_str.split('-')~~
81		~~- return parts[0] + '_' + parts[1]~~
	111	+def convert_congressional_district_name(name_str):
	112	+
	113	+ try:
	114	+ parts = name_str.split('-')
	115	+ return parts[0] + '_' + parts[1]
	116	+
	117	+ except:
	118	+ return name_str
82	119
83		-
84	120	"""
85	121	Call main, exit when execution is complete
86	122

Status & tagging log

15:18, 23 January 2012 Reedy (talk | contribs) changed the status of r109713 [removed: new added: deferred]
12:32, 22 January 2012 Siebrand (talk | contribs) changed the tags for r109713 [added: sopa]