Index: trunk/tools/wsor/scripts/zip_to_congress.py |
— | — | @@ -10,15 +10,15 @@ |
11 | 11 | |
12 | 12 | |
13 | 13 | """ Import python base modules """ |
14 | | -import sys, re, datetime, logging, csv |
| 14 | +import sys, logging, csv |
15 | 15 | import settings |
16 | 16 | |
17 | 17 | """ Modify the classpath to include local projects """ |
18 | 18 | sys.path.append(settings.__project_home__) |
19 | 19 | |
20 | 20 | """ Import Analytics modules """ |
21 | | -from WSOR.scripts.classes.WSORSlaveDataLoader import VandalLoader |
22 | 21 | import classes.DataLoader as DL |
| 22 | +import classes.Helper as Hlp |
23 | 23 | |
24 | 24 | """ |
25 | 25 | Execution body of main |
— | — | @@ -30,56 +30,92 @@ |
31 | 31 | logging.basicConfig(level=logging.DEBUG, stream=LOGGING_STREAM, format='%(asctime)s %(levelname)-8s %(message)s', datefmt='%b-%d %H:%M:%S') |
32 | 32 | |
33 | 33 | # Use a dataloader to extract zipcode lookup counts from db1008 |
34 | | - dl = DL.DatLoader() |
35 | | - sql = 'select count(*) from SOPA_zip_codes where zip_code = %s' |
| 34 | + dl = DL.DataLoader() |
| 35 | + sql = "select date_format(request_time,'%Y%m%d%H'), zip_code, count(*) from SOPA_zip_codes " + \ |
| 36 | + "where zip_code regexp '[0-9]{5}' and request_time >= '20120118050000' and request_time < '20120119050000' group by 1, 2 order by 2, 1" |
36 | 37 | |
37 | 38 | # CSV with zips + congressional districts |
38 | 39 | # Generate counts for congressional districts |
39 | 40 | |
40 | | - zip_to_cd = csv.reader(open('zip_to_cd.csv', 'rb'), delimiter=' ', quotechar='|') |
41 | | - cd_counts = csv.writer(open('cd_counts.csv', 'wb'), delimiter=' ', quotechar='|', quoting=csv.QUOTE_MINIMAL) |
42 | | - |
| 41 | + zip_to_cd = csv.reader(open('zipProportions.tsv', 'rb'), delimiter=' ', quotechar='|') |
| 42 | + # cd_counts_csv = csv.writer(open('cd_counts.csv', 'wb'), delimiter=' ', quotechar='|', quoting=csv.QUOTE_MINIMAL) |
| 43 | + cd_counts_tsv = open('cd_counts.tsv', 'wb') |
| 44 | + |
43 | 45 | zip_index = 0 |
44 | 46 | cd_index = 1 |
45 | 47 | weight_index = 2 |
46 | 48 | |
47 | 49 | # Stores congressional district look-up counts |
48 | | - cd_counts = dict() |
49 | | - |
| 50 | + cd_counts = Hlp.AutoVivification() |
| 51 | + zip_counts = Hlp.AutoVivification() |
| 52 | + |
| 53 | + # Populate zip counts from table |
| 54 | + logging.info('Generating zipcode results from table ...') |
| 55 | + results = dl.execute_SQL(sql) |
| 56 | + for row in results: |
| 57 | + zip_hash = row[1] |
| 58 | + timestamp = row[0] |
| 59 | + try: |
| 60 | + zip_counts[zip_hash][timestamp] = zip_counts[zip_hash][timestamp] + int(row[2]) |
| 61 | + except: |
| 62 | + zip_counts[zip_hash][timestamp] = int(row[2]) |
| 63 | + |
50 | 64 | # Populate dict with congressional district counts |
51 | | - for row in zip_to_cd: |
| 65 | + for row_cd in zip_to_cd: |
52 | 66 | |
53 | | - cd_hash = row[cd_index] |
54 | | - zip_hash = row[zip_index] |
55 | | - |
| 67 | + cd_hash = row_cd[cd_index] |
| 68 | + zip_hash = row_cd[zip_index] |
| 69 | + district_weight = row_cd[weight_index] |
| 70 | + |
56 | 71 | cd_hash = convert_congressional_district_name(cd_hash) |
57 | 72 | |
58 | | - # Retrieve ZIPcode count from DB |
59 | | - result = dl.execute_SQL(sql % zip_hash)[0][0] |
60 | | - count = int(float(result) * float(row[weight_index])) |
| 73 | + # logging.info('%s - %s - %s' % (cd_hash, zip_hash, district_weight)) |
61 | 74 | |
62 | | - try: |
63 | | - cd_counts[cd_hash] = cd_counts[cd_hash] + count |
| 75 | + # Generate the count corresponding to the zip code and the weight |
64 | 76 | |
65 | | - except: |
66 | | - cd_counts[cd_hash] = count |
| 77 | + for timestamp in zip_counts[zip_hash]: |
67 | 78 | |
| 79 | + # extract count and weight |
| 80 | + count = zip_counts[zip_hash][timestamp] |
| 81 | + count = int(float(count) * float(district_weight)) |
| 82 | + |
| 83 | + try: |
| 84 | + cd_counts[cd_hash][timestamp] = cd_counts[cd_hash] + count |
| 85 | + except: |
| 86 | + cd_counts[cd_hash][timestamp] = count |
| 87 | + |
68 | 88 | # Process ZIP/Congress record |
69 | | - |
| 89 | + |
70 | 90 | # Write congressional district name and counts to csv |
71 | | - for elem in cd_counts: |
72 | | - cd_counts.writerow([elem,cd_counts[elem]]) |
| 91 | + cd_keys = cd_counts.keys() |
| 92 | + for cd_ind in range(len(cd_keys)): |
| 93 | + cd_hash = cd_keys[cd_ind] |
| 94 | + ts_counts = cd_counts[cd_hash] |
| 95 | + ts_keys = ts_counts.keys() |
73 | 96 | |
| 97 | + for ts_ind in range(len(ts_keys)): |
| 98 | + timestamp = ts_keys[ts_ind] |
| 99 | + count = ts_counts[timestamp] |
| 100 | + |
| 101 | + cd_counts_tsv.write(cd_hash + '\t' + timestamp + '00\t' + str(count) + '\n') |
| 102 | + # cd_counts_tsv.writerow([cd_hash, timestamp, count]) |
| 103 | + |
| 104 | + cd_counts_tsv.close() |
| 105 | + |
74 | 106 | return 0 |
75 | 107 | |
76 | 108 | """ |
77 | 109 | Convert congressional district name from format 'XX-##' to 'XX_##' |
78 | 110 | """ |
79 | | -def convert_congressional_district_name(name_str): |
80 | | - parts = name_str.split('-') |
81 | | - return parts[0] + '_' + parts[1] |
| 111 | +def convert_congressional_district_name(name_str): |
| 112 | + |
| 113 | + try: |
| 114 | + parts = name_str.split('-') |
| 115 | + return parts[0] + '_' + parts[1] |
| 116 | + |
| 117 | + except: |
| 118 | + return name_str |
82 | 119 | |
83 | | - |
84 | 120 | """ |
85 | 121 | Call main, exit when execution is complete |
86 | 122 | |