r79027 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r79026‎ | r79027 | r79028 >
Date:20:48, 26 December 2010
Author:rfaulk
Status:deferred
Tags:
Comment:
Adding functionality to miner scripts such that they safely clear records ahead of insertion.
Modified paths:
  • /trunk/fundraiser-statistics/fundraiser-scripts/mine_impression_request.py (modified) (history)
  • /trunk/fundraiser-statistics/fundraiser-scripts/mine_landing_pages.py (modified) (history)
  • /trunk/fundraiser-statistics/fundraiser-scripts/miner_help.py (modified) (history)

Diff [purge]

Index: trunk/fundraiser-statistics/fundraiser-scripts/mine_landing_pages.py
@@ -17,6 +17,7 @@
1818 import sys
1919 import urlparse as up
2020 import httpagentparser
 21+import math
2122
2223 import cgi # web queries
2324 import re # regular expression matching
@@ -45,18 +46,33 @@
4647 queryIndex = 4;
4748 pathIndex = 2;
4849
 50+ """ SQL Statements """
4951
50 - # Clear the records for hour ahead of adding
51 - #timestamp_raw_start = year + month + day + hour + min + '00'
52 - #timestamp_raw_end = year + month + day + hour + min + '00'
53 - #clear_recs_query = 'delete from landing_page where request_time >= \'' + +'\' and request_time < \' \'';
54 -
55 - # SQL Statements
56 -
5752 insertStmt_lp = 'INSERT INTO landing_page (utm_source, utm_campaign, utm_medium, landing_page,' + \
5853 'page_url, referrer_url, browser, lang, country, project, ip, request_time) values '
5954
60 -
 55+ """ Clear the records for hour ahead of adding """
 56+ time_stamps = mh.get_timestamps(logFileName)
 57+
 58+ start = time_stamps[0]
 59+ end = time_stamps[1]
 60+
 61+ # Ensure that the range is correct; otherwise abort - critical that outside records are not deleted
 62+ time_diff = mh.get_timestamps_diff(start, end)
 63+
 64+ if math.fabs(time_diff) <= 1.0:
 65+ deleteStmnt = 'delete from landing_page where request_time >= \'' + start + '\' and request_time < \'' + end + '\';'
 66+
 67+ try:
 68+ # cur.execute(deleteStmnt)
 69+ print >> sys.stdout, "Executed delete from landing page: " + deleteStmnt
 70+ except:
 71+ print >> sys.stderr, "Could not execute delete:\n" + deleteStmnt + "\nResuming insert ..."
 72+ pass
 73+ else:
 74+ print >> sys.stdout, "Could not execute delete statement, DIFF too large\ndiff = " + str(time_diff) + "\ntime_start = " + start + "\ntime_end = " + end + "\nResuming insert ..."
 75+
 76+
6177 # PROCESS LOG FILE
6278 # ================
6379 line = logFile.readline()
Index: trunk/fundraiser-statistics/fundraiser-scripts/mine_impression_request.py
@@ -16,6 +16,7 @@
1717 import MySQLdb
1818 import sys
1919 import urlparse as up
 20+import math
2021
2122 import cgi
2223 import re
@@ -43,6 +44,28 @@
4445 hr_change = 0
4546 clamp = 0
4647
 48+ """ Clear the records for hour ahead of adding """
 49+ time_stamps = mh.get_timestamps(logFileName)
 50+
 51+ start = time_stamps[0]
 52+ end = time_stamps[1]
 53+
 54+ # Ensure that the range is correct; otherwise abort - critical that outside records are not deleted
 55+ time_diff = mh.get_timestamps_diff(start, end)
 56+
 57+ if math.fabs(time_diff) <= 1.0:
 58+ deleteStmnt = 'delete from impression where on_minute >= \'' + start + '\' and on_minute < \'' + end + '\';'
 59+
 60+ try:
 61+ # cur.execute(deleteStmnt)
 62+ print >> sys.stdout, "Executed delete from landing page: " + deleteStmnt
 63+ except:
 64+ print >> sys.stderr, "Could not execute delete:\n" + deleteStmnt + "\nResuming insert ..."
 65+ pass
 66+ else:
 67+ print >> sys.stdout, "Could not execute delete statement, DIFF too large\ndiff = " + str(time_diff) + "\ntime_start = " + start + "\ntime_end = " + end + "\nResuming insert ..."
 68+
 69+
4770 # PROCESS LOG FILE
4871 # ================
4972
Index: trunk/fundraiser-statistics/fundraiser-scripts/miner_help.py
@@ -17,7 +17,8 @@
1818 import calendar as cal
1919 import math
2020
21 -# Determines the following hour based on the precise date to the hour
 21+
 22+""" Determines the following hour based on the precise date to the hour """
2223 def getNextHour(year, month, day, hour):
2324
2425 lastDayofMonth = cal.monthrange(year,month)[1]
@@ -37,7 +38,35 @@
3839
3940 return [next_year, next_month, next_day, next_hour]
4041
def getPrevHour(year, month, day, hour):
    """Determine the hour immediately preceding the given date and hour.

    Args:
        year, month, day, hour: integers naming a calendar hour
            (month 1-12, hour 0-23).

    Returns:
        A list ``[prev_year, prev_month, prev_day, prev_hour]``.  For any
        hour other than 0 only the hour changes; stepping back from hour 0
        moves to hour 23 of the previous day, which may in turn fall in the
        previous month or year.
    """
    prev_year = year
    prev_month = month
    prev_day = day
    prev_hour = hour - 1

    if prev_hour == -1:
        prev_hour = 23
        if day == 1:
            # Crossing a month boundary (and a year boundary from January).
            if month == 1:
                prev_year = year - 1
                prev_month = 12
            else:
                prev_month = month - 1
            # The length of the previous month must be looked up in the
            # year that month belongs to.
            prev_day = cal.monthrange(prev_year, prev_month)[1]
        else:
            prev_day = day - 1

    return [prev_year, prev_month, prev_day, prev_hour]
 69+
 70+
4271 class AutoVivification(dict):
4372 """Implementation of perl's autovivification feature."""
4473 def __getitem__(self, item):
@@ -70,4 +99,69 @@
71100
72101
73102 def mod_list(lst, modulus):
74 - return [x % modulus for x in lst]
\ No newline at end of file
 103+ return [x % modulus for x in lst]
 104+
def get_timestamps(logFileName):
    """Extract the start and end timestamps of a log from its file name.

    The name is split on '-'.  Fields 1 to 4 are read as year, month, day
    and a two-digit 12-hour clock hour immediately followed by 'AM' or
    'PM'; field 0 and anything after the AM/PM marker are ignored.

    Args:
        logFileName: file name string shaped like
            ``<prefix>-YYYY-MM-DD-HH{AM|PM}...``.

    Returns:
        ``[log_start, log_end]`` as ``YYYYMMDDHH5500`` strings.  ``log_end``
        is minute 55 of the hour named in the file, ``log_start`` is minute
        55 of the hour before it, so the pair spans exactly one hour.

    Raises:
        IndexError: the name has fewer than five '-'-separated fields.
        ValueError: a field is not numeric, or the fields do not form a
            real calendar date and hour.
    """
    # Local import so the function does not depend on how the rest of the
    # module happens to expose datetime.
    import datetime

    fname_parts = logFileName.split('-')

    year = int(fname_parts[1])
    month = int(fname_parts[2])
    day = int(fname_parts[3])
    hour = int(fname_parts[4][0:2])

    # Is this an afternoon log?
    afternoon = (fname_parts[4][2:4] == 'PM')

    # Convert the 12-hour clock to 24-hour: 1PM-11PM shift by 12, 12AM is 0.
    if afternoon and hour < 12:
        hour = hour + 12
    elif not afternoon and hour == 12:
        hour = 0

    # Let datetime handle the day/month/year rollover of the previous hour.
    log_hour = datetime.datetime(year, month, day, hour)
    prev_hour = log_hour - datetime.timedelta(hours=1)

    stamp = '%d%02d%02d%02d5500'
    log_start = stamp % (prev_hour.year, prev_hour.month,
                         prev_hour.day, prev_hour.hour)
    log_end = stamp % (log_hour.year, log_hour.month,
                       log_hour.day, log_hour.hour)

    return [log_start, log_end]
 145+
 146+
def get_timestamps_diff(timestamp_start, timestamp_end):
    """Compute the signed difference, in hours, between two timestamps.

    Args:
        timestamp_start: string whose first 12 characters are
            ``YYYYMMDDHHMM``; any characters after the minutes are ignored.
        timestamp_end: string in the same format.

    Returns:
        ``timestamp_end - timestamp_start`` as a float number of hours.
        The value is negative when the end precedes the start and includes
        whole days, so a caller checking that the range is at most one
        hour is not fooled by a multi-day or reversed range.

    Raises:
        ValueError: a field is not numeric or the fields do not form a
            real calendar date and time.
    """
    import datetime

    def _parse(stamp):
        # Seconds are deliberately fixed at zero; only minute precision is used.
        return datetime.datetime(year=int(stamp[0:4]),
                                 month=int(stamp[4:6]),
                                 day=int(stamp[6:8]),
                                 hour=int(stamp[8:10]),
                                 minute=int(stamp[10:12]),
                                 second=0)

    delta = _parse(timestamp_end) - _parse(timestamp_start)

    # timedelta normalises to (days, seconds) with 0 <= seconds < 86400, so
    # `seconds` alone drops whole days and turns -1h into +23h.  Fold the
    # days back in to get the true signed span.
    total_seconds = delta.days * 86400 + delta.seconds

    return total_seconds / 3600.0
\ No newline at end of file

Status & tagging log