r79777 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r79776‎ | r79777 | r79778 >
Date:00:10, 7 January 2011
Author:rfaulk
Status:deferred
Tags:
Comment:
Added ip localization to mining scripts in cases where country data is missing.
Modified paths:
  • /trunk/fundraiser-statistics/fundraiser-scripts/mine_landing_pages.py (modified) (history)
  • /trunk/fundraiser-statistics/fundraiser-scripts/miner_help.py (modified) (history)
  • /trunk/fundraiser-statistics/standalone-queries/create_tables_faulkner.sql (added) (history)

Diff [purge]

Index: trunk/fundraiser-statistics/standalone-queries/create_tables_faulkner.sql
@@ -0,0 +1,57 @@
 2+
 3+
 4+-- create tables in FAULKNER for recording test metric
 5+
 6+use faulkner;
 7+
 8+drop table if exists impression;
 9+drop table if exists landing_page;
 10+drop table if exists ip_country;
 11+
 12+-- drop table if exists treatment;
 13+
 14+
 15+create table impression (
 16+ -- id int(10) unsigned NOT NULL auto_increment,
 17+ run_id int(10) unsigned default NULL,
 18+ utm_source varchar(128) default NULL,
 19+ referrer varchar(128) default NULL,
 20+ country varchar(128) default NULL,
 21+ counts int(10) unsigned default NULL,
 22+ on_minute timestamp,
 23+ -- primary key (id),
 24+ constraint imp_run_fk foreign key (run_id) references log_run (id) on delete cascade
 25+);
 26+
 27+create table landing_page (
 28+ id int(10) unsigned NOT NULL auto_increment,
 29+ -- run_id int(10) unsigned default NULL,
 30+ utm_source varchar(128) default NULL,
 31+ utm_campaign varchar(128) default NULL,
 32+ utm_medium varchar(128) default NULL,
 33+ landing_page varchar(128) default NULL,
 34+ page_url varchar(1000) default NULL,
 35+ referrer_url varchar(1000) default NULL,
 36+ browser varchar(50) default NULL,
 37+ lang varchar(20) default NULL, -- CHARACTER(2) NULL,
 38+ country varchar(20) default NULL, -- CHARACTER(2) NULL,
 39+ project varchar(128) default NULL,
 40+ ip varchar(20) default NULL,
 41+ request_time timestamp,
 42+ primary key (id)
 43+ -- constraint lp_tr_fk foreign key (test_run_id) references test_run (id) on delete cascade
 44+);
 45+
 46+
 47+create table ip_country (
 48+
 49+ ip_from varchar(50) default NULL,
 50+ ip_to varchar(50) default NULL,
 51+ registry varchar(50) default NULL,
 52+ assigned varchar(50) default NULL,
 53+ country_ISO_1 varchar(50) default NULL,
 54+ country_ISO_2 varchar(50) default NULL,
 55+ country_name varchar(50) default NULL,
 56+ primary key (ip_from)
 57+);
 58+
Index: trunk/fundraiser-statistics/fundraiser-scripts/mine_landing_pages.py
@@ -200,27 +200,38 @@
201201
202202 if include_request:
203203
204 - # Address cases where the query string contains the landing page
 204+ # Address cases where the query string contains the landing page - ...wikimediafoundation.org/w/index.php?...
205205 if index_str_flag:
206206 try:
 207+ # URLs of the form ...?title=<lp_name>
207208 lp_country = query_fields['title'][0].split('/')
208209 landing_page = lp_country[0]
209210
210 - if len(lp_country) == 3:
211 - country = lp_country[2]
212 - else:
213 - country = lp_country[1]
 211+ # URLs of the form ...?county_code=<iso_code>
 212+ try:
 213+ country = query_fields['country_code'][0]
214214
215 - if country == country.lower():
216 - country = ''
 215+ # URLs of the form ...?title=<lp_name>/<lang>/<iso_code>
 216+ except:
 217+ if len(lp_country) == 3:
 218+ country = lp_country[2]
 219+ else:
 220+ country = lp_country[1]
 221+
217222 except:
218223 landing_page = 'NONE'
219 - country = 'NONE'
220 - else:
 224+ country = mh.localize_IP(ip_add) + ' !'
 225+
 226+ else: # ...wikimediafoundation.org/wiki/...
 227+
221228 landing_path = parsed_landing_url[pathIndex].split('/')
222229 landing_page = landing_path[2];
 230+
 231+ # URLs of the form ...?county_code=<iso_code>
223232 try:
224233 country = query_fields['country_code'][0]
 234+
 235+ # URLs of the form ...<path>/ <lp_name>/<lang>/<iso_code>
225236 except:
226237 try:
227238 if len(landing_path) == 5:
@@ -228,9 +239,15 @@
229240 # source_lang = landing_path[3]
230241 else:
231242 country = landing_path[3]
 243+
232244 except:
233 - country = 'NONE'
 245+ country = mh.localize_IP(ip_add) + ' !'
234246
 247+ # If country is confused with the language use the ip
 248+ if country == country.lower():
 249+ country = mh.localize_IP(ip_add) + ' !'
 250+
 251+
235252 # ensure fields exist
236253 try:
237254 utm_source = query_fields['utm_source'][0]
Index: trunk/fundraiser-statistics/fundraiser-scripts/miner_help.py
@@ -14,10 +14,15 @@
1515 # Helper script for mining
1616 # ===================
1717
 18+
 19+import sys
 20+import math
1821 import calendar as cal
19 -import math
 22+import csv
 23+import MySQLdb
2024
2125
 26+
2227 """ Determines the following hour based on the precise date to the hour """
2328 def getNextHour(year, month, day, hour):
2429
@@ -179,4 +184,93 @@
180185 print "miner_help::convert_list_dict: Invalid type, must be a list or a dictionary."
181186 return 0;
182187
183 - return new_collection
\ No newline at end of file
 188+ return new_collection
 189+
 190+""" Given an IP localizes the country """
 191+def localize_IP(ip_string):
 192+
 193+ """ Get db object / Create cursor """
 194+ # db = MySQLdb.connect(host='127.0.0.1', user='rfaulk', db='faulkner', port=3307)
 195+ db = MySQLdb.connect(host='storage3.pmtpa.wmnet', user='rfaulk', db='faulkner')
 196+ cur = db.cursor()
 197+
 198+
 199+ # compute ip number
 200+ ip_fields = ip_string.split('.')
 201+ w = int(ip_fields[0])
 202+ x = int(ip_fields[1])
 203+ y = int(ip_fields[2])
 204+ z = int(ip_fields[3])
 205+
 206+ ip_num = 16777216 * w + 65536 * x + 256 * y + z;
 207+
 208+ sql_stmnt = 'select country_ISO_1 from ip_country where ' + str(ip_num) + ' >= ip_from and ' + str(ip_num) + ' <= ip_to'
 209+
 210+ try:
 211+ cur.execute(sql_stmnt)
 212+ row = cur.fetchone()
 213+ except:
 214+ db.rollback()
 215+ sys.exit("Could execute: " + sql_stmnt)
 216+
 217+ country = row[0]
 218+
 219+ # Commit to the db
 220+ db.commit()
 221+
 222+ # Close connection
 223+ cur.close()
 224+ db.close()
 225+
 226+ return country
 227+
 228+""" Load data into the IP localization table to associate IPs with countries """
 229+def load_IP_localization_table():
 230+
 231+ """ Get db object / Create cursor """
 232+ # db = MySQLdb.connect(host='127.0.0.1', user='rfaulk', db='faulkner', port=3307)
 233+ db = MySQLdb.connect(host='storage3.pmtpa.wmnet', user='rfaulk', db='faulkner')
 234+ cur = db.cursor()
 235+
 236+ # Parse CSV file
 237+ ipReader = csv.reader(open('./csv/IpToCountry.csv', 'rb'))
 238+ insert_stmnt = 'INSERT INTO ip_country VALUES '
 239+ # (ip_from,ip_to,registry,assigned,country_ISO_1,country_ISO_2,country_name)
 240+ header = 1
 241+ for row in ipReader:
 242+ # skip the csv comments
 243+ if row[0][0] != '#':
 244+ header = 0
 245+
 246+ if not(header):
 247+
 248+ for i in range(len(row)):
 249+ pieces = row[i].split('\'')
 250+
 251+ if len(pieces) > 1:
 252+ new_str = pieces[0]
 253+
 254+ # remove single quotes from fields
 255+ for j in range(1,len(pieces)):
 256+ new_str = new_str + ' ' + pieces[j]
 257+
 258+ row[i] = new_str
 259+
 260+ vals = '\',\''.join(row)
 261+ sql_stmnt = insert_stmnt + '(\'' + vals + '\')'
 262+
 263+ print vals
 264+ #cur.execute(sql_stmnt)
 265+ try:
 266+ cur.execute(sql_stmnt)
 267+ except:
 268+ db.rollback()
 269+ sys.exit("Could not insert: " + sql_stmnt)
 270+
 271+
 272+ # Commit to the db
 273+ db.commit()
 274+
 275+ # Close connection
 276+ cur.close()
 277+ db.close()

Status & tagging log