r26324 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: r26323 | r26324 | r26325
Date: 21:36, 2 October 2007
Author: gmaxwell
Status: old
Tags:
Comment:

Change the collector to aggregate internally to reduce database/disk traffic.

This changeset introduces a dependency on pyjudy because Python dicts were causing enormous memory usage.

If the collector were recoded in C, it might actually perform well enough to be useful on 1:1 data; doing so wouldn't be a big task.
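
As a rough illustration of the aggregation approach (this is not the committed code: a plain Python dict stands in for pyjudy.JudySLInt, and the database insert is stubbed out with a print):

sampleHits = 100   # each sampled log line stands for this many real hits
aggThresh = 3600   # flush once the window spans this many seconds

counts = {}
windowStart = None

def recordHit(page, timestamp):
    """Accumulate sampled hits in memory, flushing one row per page per window."""
    global counts, windowStart
    if windowStart is None:
        windowStart = timestamp
    if timestamp - windowStart >= aggThresh:
        for key, count in counts.items():
            # In the committed code this is an INSERT into hit_counter
            # (hc_tsstart, hc_tsend, hc_site, hc_page, hc_count).
            print("%s %s %s %s" % (windowStart, timestamp, key, count))
        counts = {}
        windowStart = timestamp
    counts[page] = counts.get(page, 0) + sampleHits

This trades one row per request for one row per (page, window), which is what cuts the database/disk traffic.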
Modified paths:
  • /trunk/tools/counter/counter.py (modified)
  • /trunk/tools/counter/display/index.php (modified)
  • /trunk/tools/counter/hitcounter.sql (modified)

Diff

Index: trunk/tools/counter/hitcounter.sql
@@ -1,9 +1,12 @@
 CREATE TABLE hit_counter (
-    hc_ts TIMESTAMP,
+    hc_tsstart TIMESTAMP,
+    hc_tsend TIMESTAMP,

     hc_site VARCHAR(255) BINARY,
     hc_page VARCHAR(255) BINARY,

-    KEY (hc_ts, hc_site, hc_page),
-    KEY (hc_site, hc_page, hc_ts)
+    hc_count int8,
+
+    KEY (hc_tsend, hc_site, hc_page),
+    KEY (hc_site, hc_page, hc_tsend)
 ) CHARSET=binary;
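
With this schema, each row now covers one page over one aggregation window rather than one request. For example (values invented purely for illustration), a row with hc_tsstart = '2007-10-02 20:00:00', hc_tsend = '2007-10-02 21:00:00', hc_site = 'en.wikipedia.org', hc_page = 'Main Page', hc_count = 300 would record roughly 300 scaled-up sampled hits to that page during that hour.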
Index: trunk/tools/counter/display/index.php
@@ -36,7 +36,7 @@
         return debugError('Failed to select database.');
     }

-    $res = mysql_query('SELECT hc_site, hc_page, COUNT(*) AS hc_count FROM hit_counter GROUP BY hc_site, hc_page ORDER BY hc_site, hc_page');
+    $res = mysql_query('SELECT hc_site, hc_page, sum(hc_count) AS hc_count FROM hit_counter GROUP BY hc_site, hc_page ORDER BY hc_site, hc_page');
     if ($res === false) {
         return debugError('Query failed.');
     }
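
The display change mirrors the collector change: totals per page are now the sum of pre-aggregated per-window counts rather than a count of one row per hit. A rough, illustrative Python equivalent of the new query (connection parameters are placeholders, not taken from this repository):

import MySQLdb

conn = MySQLdb.connect(host="localhost", user="counter", passwd="secret", db="hitcounter")
cur = conn.cursor()
cur.execute("SELECT hc_site, hc_page, SUM(hc_count) AS hc_count FROM hit_counter "
            "GROUP BY hc_site, hc_page ORDER BY hc_site, hc_page")
for site, page, count in cur.fetchall():
    print("%s %s %s" % (site, page, count))
conn.close()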
Index: trunk/tools/counter/counter.py
@@ -1,3 +1,19 @@
+#!/usr/bin/python
+
+#Page view counter
+# Reads squid logs (https://wikitech.leuksman.com/view/Squid_log_format)
+# Normalizes page name, aggregates them for a configurable time window, shoves the
+# aggregates into a database.
+# Usage: ./counter.py [list of allowed pages] < logfile
+# Be sure sampleHits is set correctly
+
+#Notes:
+# * Requires pyjudy (http://www.dalkescientific.com/Python/PyJudy.html)
+#   (python dicts and sets use too much darn memory)
+# * The final incomplete aggregation window is discarded.
+# * Fixed aggregation windows that align to time of day may be more useful than the current
+#   behavior.
+
 import MySQLdb
 import re
 import sys
@@ -2,4 +18,11 @@
 import urllib
+import time
+import pyjudy

+sampleHits = 100 # Number of hits to record per sample
+aggThresh = 3600 # Number of sample seconds needed to trigger a data export
+
 globalConnection = None
+aggCounter = pyjudy.JudySLInt()
+aggRange = (sys.maxint,0)

@@ -11,13 +34,16 @@
         # or common skin files
         if line.find(" GET http://upload.wikimedia.org/") == -1 \
            and line.find(".org/skins-1.5/") == -1:
-            page = extractPage(line)
+            page,timestamp = extractPage(line)
             if page and (targetPages == None or page in targetPages):
-                recordHit(page)
+                recordHit(page,timestamp)
     closeConnection()

 def extractPage(line):
-    url = extractUrl(line)
+    # Extract the page name from the URL.
+    # A check should probably be placed here to toss requests with
+    # page names larger than the maximum length.
+    url,timestamp = extractUrl(line)
     if url and \
        "?" not in url and \
        url[0:7] == "http://":
@@ -25,33 +51,49 @@
         if len(bits) == 3 and bits[1] == "wiki":
             host = bits[0]
             page = normalizePage(bits[2])
-            return host + ":" + page
+            return (host + ":" + page, timestamp)
     return None

 def extractUrl(line):
     # https://wikitech.leuksman.com/view/Squid_log_format
     # $hostname %sn %ts.%03tu %tr %>a %Ss/%03Hs %<st %rm %ru %Sh/%<A %mt %{Referer}>h %{X-Forwarded-For}>h %{User-Agent}>h
     # ...
+    # 3. Seconds (and milliseconds) since epoch
+    # ...
     # 9. URL
     bits = line.split(" ", 10)
-    if len(bits) > 9 and bits[8] == "GET":
-        return bits[9]
+    if len(bits) > 8 and bits[7] == "GET":
+        return (bits[8],int(round(float(bits[2]))))
     else:
         return None

 def normalizePage(page):
     return urllib.unquote(page).replace("_", " ")

-def recordHit(page):
-    (site, pagename) = page.split(":", 1)
-    conn = getConnection()
-    # fixme: format timestamp from the log line
-    conn.cursor().execute(
-        "INSERT INTO hit_counter (hc_ts, hc_site, hc_page) " +
-        "VALUES (CURRENT_TIMESTAMP(), %s, %s)",
-        (site, pagename))
-    conn.commit()
+def recordHit(page,timestamp):
+    global aggCounter
+    global aggRange
+    global aggThresh

+    if (max(timestamp,aggRange[1])-aggRange[0] >= aggThresh):
+        for item in aggCounter.items():
+            (site, pagename) = item[0].split(":", 1)
+            conn = getConnection()
+            conn.cursor().execute(
+                "INSERT INTO hit_counter (hc_tsstart, hc_tsend, hc_site, hc_page, hc_count) VALUES (%s, %s, %s, %s, %s)",
+                (time.strftime("%Y-%m-%d %H:%M:%S",time.gmtime(aggRange[0])),time.strftime("%Y-%m-%d %H:%M:%S",time.gmtime(aggRange[1])),site, pagename, item[1]))
+            conn.commit()
+        aggRange=(aggRange[1],aggRange[1])
+        aggCounter.FreeArray()
+
+    if page in aggCounter:
+        aggCounter[page] += sampleHits
+    else:
+        aggCounter[page] = sampleHits
+    aggRange=(min(timestamp,aggRange[0]),max(timestamp,aggRange[1]))
+
+
+
 def getConnection():
     global globalConnection
     if not globalConnection:
@@ -67,17 +109,19 @@
         globalConnection.close()
         globalConnection = None

-def listFromFile(filename):
+def setFromFile(filename):
     """Read list of lines from a file"""
     infile = open(filename)
-    out = [line.strip() for line in infile if line.strip() != ""]
+    out = pyjudy.JudySLInt()
+    for line in infile:
+        if line.strip()!="":
+            out.Ins(line.strip(),1)
     infile.close()
-    out.sort()
     return out

 if __name__ == "__main__":
     if len(sys.argv) > 1:
-        targetPages = listFromFile(sys.argv[1])
+        targetPages = setFromFile(sys.argv[1])
         runLoop(sys.stdin, targetPages)
     else:
         runLoop(sys.stdin)
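
To make the field indexing in the new extractUrl concrete, here is an illustrative walk-through; the sample line below is invented to match the documented squid log format, not taken from real logs:

# Invented sample line following the documented format
sample = ("sq18.wikimedia.org 1289 1191360000.123 45 203.0.113.7 TCP_MISS/200 "
          "1234 GET http://en.wikipedia.org/wiki/Main_Page NONE/- text/html - - Mozilla/5.0")
bits = sample.split(" ", 10)
assert bits[7] == "GET"                   # field 8: request method (%rm)
url = bits[8]                             # field 9: request URL (%ru)
timestamp = int(round(float(bits[2])))    # field 3: seconds (and ms) since the epoch (%ts.%03tu)
print("%s %s" % (url, timestamp))

These are the fields that the changeset's new bits[7]/bits[8]/bits[2] indices pick out.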
