Index: trunk/fundraiser-statistics/fundraiser-scripts/mine_landing_pages.py |
— | — | @@ -17,6 +17,7 @@ |
18 | 18 | import sys |
19 | 19 | import urlparse as up |
20 | 20 | import httpagentparser |
| 21 | +import math |
21 | 22 | |
22 | 23 | import cgi # web queries |
23 | 24 | import re # regular expression matching |
— | — | @@ -45,18 +46,33 @@ |
46 | 47 | queryIndex = 4; |
47 | 48 | pathIndex = 2; |
48 | 49 | |
| 50 | + """ SQL Statements """ |
49 | 51 | |
50 | | - # Clear the records for hour ahead of adding |
51 | | - #timestamp_raw_start = year + month + day + hour + min + '00' |
52 | | - #timestamp_raw_end = year + month + day + hour + min + '00' |
53 | | - #clear_recs_query = 'delete from landing_page where request_time >= \'' + +'\' and request_time < \' \''; |
54 | | - |
55 | | - # SQL Statements |
56 | | - |
57 | 52 | insertStmt_lp = 'INSERT INTO landing_page (utm_source, utm_campaign, utm_medium, landing_page,' + \ |
58 | 53 | 'page_url, referrer_url, browser, lang, country, project, ip, request_time) values ' |
59 | 54 | |
60 | | - |
| 55 | + """ Clear the records for hour ahead of adding """ |
| 56 | + time_stamps = mh.get_timestamps(logFileName) |
| 57 | + |
| 58 | + start = time_stamps[0] |
| 59 | + end = time_stamps[1] |
| 60 | + |
| 61 | + # Ensure that the range is correct; otherwise abort - critical that outside records are not deleted |
| 62 | + time_diff = mh.get_timestamps_diff(start, end) |
| 63 | + |
| 64 | + if math.fabs(time_diff) <= 1.0: |
| 65 | + deleteStmnt = 'delete from landing_page where request_time >= \'' + start + '\' and request_time < \'' + end + '\';' |
| 66 | + |
| 67 | + try: |
| 68 | + # cur.execute(deleteStmnt) |
| 69 | + print >> sys.stdout, "Executed delete from landing page: " + deleteStmnt |
| 70 | + except: |
| 71 | + print >> sys.stderr, "Could not execute delete:\n" + deleteStmnt + "\nResuming insert ..." |
| 72 | + pass |
| 73 | + else: |
| 74 | + print >> sys.stdout, "Could not execute delete statement, DIFF too large\ndiff = " + str(time_diff) + "\ntime_start = " + start + "\ntime_end = " + end + "\nResuming insert ..." |
| 75 | + |
| 76 | + |
61 | 77 | # PROCESS LOG FILE |
62 | 78 | # ================ |
63 | 79 | line = logFile.readline() |
Index: trunk/fundraiser-statistics/fundraiser-scripts/mine_impression_request.py |
— | — | @@ -16,6 +16,7 @@ |
17 | 17 | import MySQLdb |
18 | 18 | import sys |
19 | 19 | import urlparse as up |
| 20 | +import math |
20 | 21 | |
21 | 22 | import cgi |
22 | 23 | import re |
— | — | @@ -43,6 +44,28 @@ |
44 | 45 | hr_change = 0 |
45 | 46 | clamp = 0 |
46 | 47 | |
| 48 | + """ Clear the records for hour ahead of adding """ |
| 49 | + time_stamps = mh.get_timestamps(logFileName) |
| 50 | + |
| 51 | + start = time_stamps[0] |
| 52 | + end = time_stamps[1] |
| 53 | + |
| 54 | + # Ensure that the range is correct; otherwise abort - critical that outside records are not deleted |
| 55 | + time_diff = mh.get_timestamps_diff(start, end) |
| 56 | + |
| 57 | + if math.fabs(time_diff) <= 1.0: |
| 58 | + deleteStmnt = 'delete from impression where on_minute >= \'' + start + '\' and on_minute < \'' + end + '\';' |
| 59 | + |
| 60 | + try: |
| 61 | + # cur.execute(deleteStmnt) |
| 62 | + print >> sys.stdout, "Executed delete from landing page: " + deleteStmnt |
| 63 | + except: |
| 64 | + print >> sys.stderr, "Could not execute delete:\n" + deleteStmnt + "\nResuming insert ..." |
| 65 | + pass |
| 66 | + else: |
| 67 | + print >> sys.stdout, "Could not execute delete statement, DIFF too large\ndiff = " + str(time_diff) + "\ntime_start = " + start + "\ntime_end = " + end + "\nResuming insert ..." |
| 68 | + |
| 69 | + |
47 | 70 | # PROCESS LOG FILE |
48 | 71 | # ================ |
49 | 72 | |
Index: trunk/fundraiser-statistics/fundraiser-scripts/miner_help.py |
— | — | @@ -17,7 +17,8 @@ |
18 | 18 | import calendar as cal |
19 | 19 | import math |
20 | 20 | |
21 | | -# Determines the following hour based on the precise date to the hour |
| 21 | + |
| 22 | +""" Determines the following hour based on the precise date to the hour """ |
22 | 23 | def getNextHour(year, month, day, hour): |
23 | 24 | |
24 | 25 | lastDayofMonth = cal.monthrange(year,month)[1] |
— | — | @@ -37,7 +38,35 @@ |
38 | 39 | |
39 | 40 | return [next_year, next_month, next_day, next_hour] |
40 | 41 | |
def getPrevHour(year, month, day, hour):
    """Determine the previous hour based on the precise date to the hour.

    Args:
        year, month, day: integer calendar date.
        hour: integer hour of the day (0-23).

    Returns:
        A list ``[year, month, day, hour]`` describing the hour that
        immediately precedes the one given.
    """
    # Common case: the previous hour falls on the same day.
    if hour != 0:
        return [year, month, day, hour - 1]

    # Midnight, but not the first of the month: only the day changes.
    if day != 1:
        return [year, month, day - 1, 23]

    # Midnight on the first of the month: roll back to the last day of the
    # previous month, which is in the previous year when month is January.
    if month == 1:
        prev_year = year - 1
        prev_month = 12
    else:
        prev_year = year
        prev_month = month - 1

    # Look the month length up in the year it belongs to.
    last_day_of_prev_month = cal.monthrange(prev_year, prev_month)[1]

    return [prev_year, prev_month, last_day_of_prev_month, 23]
| 69 | + |
| 70 | + |
42 | 71 | class AutoVivification(dict): |
43 | 72 | """Implementation of perl's autovivification feature.""" |
44 | 73 | def __getitem__(self, item): |
— | — | @@ -70,4 +99,69 @@ |
71 | 100 | |
72 | 101 | |
def mod_list(lst, modulus):
    """Return a new list holding each element of ``lst`` reduced modulo ``modulus``."""
    remainders = []
    for value in lst:
        remainders.append(value % modulus)
    return remainders
| 104 | + |
def get_timestamps(logFileName):
    """Extract the start and end timestamps of a log from its filename.

    The filename is split on '-'; fields 1, 2 and 3 are read as year, month
    and day, and field 4 must begin with a two-digit 12-hour-clock hour
    followed by 'AM' or 'PM' (e.g. ``name-2011-03-01-12AM...``).

    Args:
        logFileName: name of the log file to parse.

    Returns:
        A list ``[log_start, log_end]`` of timestamp strings of the form
        YYYYMMDDHHMMSS.  ``log_end`` is minute 55 of the hour named in the
        file and ``log_start`` is exactly one hour earlier.

    Raises:
        IndexError / ValueError: if the filename does not contain the
            expected fields or they do not form a valid date and hour.
    """
    import datetime

    fname_parts = logFileName.split('-')

    year = int(fname_parts[1])
    month = int(fname_parts[2])
    day = int(fname_parts[3])
    hour = int(fname_parts[4][0:2])

    # Is this an afternoon log?
    afternoon = fname_parts[4][2:4] == 'PM'

    # Convert from the 12-hour clock: 1PM-11PM shift by twelve, 12AM is hour 0.
    if afternoon and hour < 12:
        hour += 12
    elif not afternoon and hour == 12:
        hour = 0

    # Building real datetimes validates the parsed date and lets the standard
    # library handle day, month and year rollover for the preceding hour.
    window_end = datetime.datetime(year, month, day, hour, 55)
    window_start = window_end - datetime.timedelta(hours=1)

    def to_stamp(moment):
        # Zero-pad every field except the year; minutes and seconds are fixed.
        return '%d%02d%02d%02d5500' % (moment.year, moment.month,
                                      moment.day, moment.hour)

    return [to_stamp(window_start), to_stamp(window_end)]
| 145 | + |
| 146 | + |
def get_timestamps_diff(timestamp_start, timestamp_end):
    """Compute the difference in hours between two timestamps.

    Args:
        timestamp_start: timestamp string whose first twelve characters are
            YYYYMMDDHHMM; anything after the minutes is ignored.
        timestamp_end: timestamp string in the same format.

    Returns:
        The signed difference ``timestamp_end - timestamp_start`` in hours,
        as a float.  The value is negative when the end precedes the start.

    Raises:
        ValueError: if either string does not hold a valid date and time.
    """
    import datetime

    def to_datetime(stamp):
        # Slice the fixed-width fields: year, month, day, hour, minute.
        return datetime.datetime(year=int(stamp[0:4]),
                                 month=int(stamp[4:6]),
                                 day=int(stamp[6:8]),
                                 hour=int(stamp[8:10]),
                                 minute=int(stamp[10:12]),
                                 second=0)

    delta = to_datetime(timestamp_end) - to_datetime(timestamp_start)

    # timedelta.seconds alone drops whole days (and wraps for negative
    # deltas), so a 25-hour range would look like 1 hour.  Fold the days in
    # so callers that bound the range by this value are not fooled.
    total_seconds = delta.days * 86400 + delta.seconds

    return total_seconds / 3600.0
\ No newline at end of file |