r109713 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r109712‎ | r109713 | r109714 >
Date:01:27, 22 January 2012
Author:rfaulk
Status:deferred
Tags:sopa 
Comment:
reduced to one DB query for increased speed
fixed some issues in output formatting
Modified paths:
  • /trunk/tools/wsor/scripts/zip_to_congress.py (modified) (history)

Diff [purge]

Index: trunk/tools/wsor/scripts/zip_to_congress.py
@@ -10,15 +10,15 @@
1111
1212
1313 """ Import python base modules """
14 -import sys, re, datetime, logging, csv
 14+import sys, logging, csv
1515 import settings
1616
1717 """ Modify the classpath to include local projects """
1818 sys.path.append(settings.__project_home__)
1919
2020 """ Import Analytics modules """
21 -from WSOR.scripts.classes.WSORSlaveDataLoader import VandalLoader
2221 import classes.DataLoader as DL
 22+import classes.Helper as Hlp
2323
2424 """
2525 Execution body of main
@@ -30,56 +30,92 @@
3131 logging.basicConfig(level=logging.DEBUG, stream=LOGGING_STREAM, format='%(asctime)s %(levelname)-8s %(message)s', datefmt='%b-%d %H:%M:%S')
3232
3333 # Use a dataloader to extract zipcode lookup counts from db1008
34 - dl = DL.DatLoader()
35 - sql = 'select count(*) from SOPA_zip_codes where zip_code = %s'
 34+ dl = DL.DataLoader()
 35+ sql = "select date_format(request_time,'%Y%m%d%H'), zip_code, count(*) from SOPA_zip_codes " + \
 36+ "where zip_code regexp '[0-9]{5}' and request_time >= '20120118050000' and request_time < '20120119050000' group by 1, 2 order by 2, 1"
3637
3738 # CSV with zips + congressional districts
3839 # Generate counts for congressional districts
3940
40 - zip_to_cd = csv.reader(open('zip_to_cd.csv', 'rb'), delimiter=' ', quotechar='|')
41 - cd_counts = csv.writer(open('cd_counts.csv', 'wb'), delimiter=' ', quotechar='|', quoting=csv.QUOTE_MINIMAL)
42 -
 41+ zip_to_cd = csv.reader(open('zipProportions.tsv', 'rb'), delimiter=' ', quotechar='|')
 42+ # cd_counts_csv = csv.writer(open('cd_counts.csv', 'wb'), delimiter=' ', quotechar='|', quoting=csv.QUOTE_MINIMAL)
 43+ cd_counts_tsv = open('cd_counts.tsv', 'wb')
 44+
4345 zip_index = 0
4446 cd_index = 1
4547 weight_index = 2
4648
4749 # Stores congressional district look-up counts
48 - cd_counts = dict()
49 -
 50+ cd_counts = Hlp.AutoVivification()
 51+ zip_counts = Hlp.AutoVivification()
 52+
 53+ # Populate zip counts from table
 54+ logging.info('Generating zipcode results from table ...')
 55+ results = dl.execute_SQL(sql)
 56+ for row in results:
 57+ zip_hash = row[1]
 58+ timestamp = row[0]
 59+ try:
 60+ zip_counts[zip_hash][timestamp] = zip_counts[zip_hash][timestamp] + int(row[2])
 61+ except:
 62+ zip_counts[zip_hash][timestamp] = int(row[2])
 63+
5064 # Populate dict with congressional district counts
51 - for row in zip_to_cd:
 65+ for row_cd in zip_to_cd:
5266
53 - cd_hash = row[cd_index]
54 - zip_hash = row[zip_index]
55 -
 67+ cd_hash = row_cd[cd_index]
 68+ zip_hash = row_cd[zip_index]
 69+ district_weight = row_cd[weight_index]
 70+
5671 cd_hash = convert_congressional_district_name(cd_hash)
5772
58 - # Retrieve ZIPcode count from DB
59 - result = dl.execute_SQL(sql % zip_hash)[0][0]
60 - count = int(float(result) * float(row[weight_index]))
 73+ # logging.info('%s - %s - %s' % (cd_hash, zip_hash, district_weight))
6174
62 - try:
63 - cd_counts[cd_hash] = cd_counts[cd_hash] + count
 75+ # Generate the count corresponding to the zip code and the weight
6476
65 - except:
66 - cd_counts[cd_hash] = count
 77+ for timestamp in zip_counts[zip_hash]:
6778
 79+ # extract count and weight
 80+ count = zip_counts[zip_hash][timestamp]
 81+ count = int(float(count) * float(district_weight))
 82+
 83+ try:
 84+ cd_counts[cd_hash][timestamp] = cd_counts[cd_hash] + count
 85+ except:
 86+ cd_counts[cd_hash][timestamp] = count
 87+
6888 # Process ZIP/Congress record
69 -
 89+
7090 # Write congressional district name and counts to csv
71 - for elem in cd_counts:
72 - cd_counts.writerow([elem,cd_counts[elem]])
 91+ cd_keys = cd_counts.keys()
 92+ for cd_ind in range(len(cd_keys)):
 93+ cd_hash = cd_keys[cd_ind]
 94+ ts_counts = cd_counts[cd_hash]
 95+ ts_keys = ts_counts.keys()
7396
 97+ for ts_ind in range(len(ts_keys)):
 98+ timestamp = ts_keys[ts_ind]
 99+ count = ts_counts[timestamp]
 100+
 101+ cd_counts_tsv.write(cd_hash + '\t' + timestamp + '00\t' + str(count) + '\n')
 102+ # cd_counts_tsv.writerow([cd_hash, timestamp, count])
 103+
 104+ cd_counts_tsv.close()
 105+
74106 return 0
75107
76108 """
77109 Convert congressional district name from format 'XX-##' to 'XX_##'
78110 """
79 -def convert_congressional_district_name(name_str):
80 - parts = name_str.split('-')
81 - return parts[0] + '_' + parts[1]
 111+def convert_congressional_district_name(name_str):
 112+
 113+ try:
 114+ parts = name_str.split('-')
 115+ return parts[0] + '_' + parts[1]
 116+
 117+ except:
 118+ return name_str
82119
83 -
84120 """
85121 Call main, exit when execution is complete
86122

Status & tagging log