Index: trunk/tools/counter/hitcounter.sql |
— | — | @@ -1,9 +1,12 @@ |
2 | 2 | CREATE TABLE hit_counter ( |
3 | | - hc_ts TIMESTAMP, |
| 3 | + hc_tsstart TIMESTAMP, |
| 4 | + hc_tsend TIMESTAMP, |
4 | 5 | |
5 | 6 | hc_site VARCHAR(255) BINARY, |
6 | 7 | hc_page VARCHAR(255) BINARY, |
7 | 8 | |
8 | | - KEY (hc_ts, hc_site, hc_page), |
9 | | - KEY (hc_site, hc_page, hc_ts) |
| 9 | + hc_count int8, |
| 10 | + |
| 11 | + KEY (hc_tsend, hc_site, hc_page), |
| 12 | + KEY (hc_site, hc_page, hc_tsend) |
10 | 13 | ) CHARSET=binary; |
Index: trunk/tools/counter/display/index.php |
— | — | @@ -36,7 +36,7 @@ |
37 | 37 | return debugError('Failed to select database.'); |
38 | 38 | } |
39 | 39 | |
40 | | - $res = mysql_query('SELECT hc_site, hc_page, COUNT(*) AS hc_count FROM hit_counter GROUP BY hc_site, hc_page ORDER BY hc_site, hc_page'); |
| 40 | + $res = mysql_query('SELECT hc_site, hc_page, sum(hc_count) AS hc_count FROM hit_counter GROUP BY hc_site, hc_page ORDER BY hc_site, hc_page'); |
41 | 41 | if ($res === false) { |
42 | 42 | return debugError('Query failed.'); |
43 | 43 | } |
Index: trunk/tools/counter/counter.py |
— | — | @@ -1,3 +1,19 @@ |
| 2 | +#!/usr/bin/python |
| 3 | + |
| 4 | +#Page view counter |
| 5 | +# Reads squid logs (https://wikitech.leuksman.com/view/Squid_log_format) |
| 6 | +# Normalizes page name, aggregates them for a configurable time window, shoves the |
| 7 | +# aggregates into a database. |
| 8 | +# Usage: ./counter.py [list of allowed pages] < logfile |
| 9 | +# Be sure sampleHits is set correctly |
| 10 | + |
| 11 | +#Notes: |
| 12 | +# * Requires pyjudy (http://www.dalkescientific.com/Python/PyJudy.html) |
| 13 | +# (python dicts and sets use too much darn memory) |
| 14 | +# * The final incomplete aggregation window is discarded. |
| 15 | +# * Fixed aggregation windows that align to time of day may be more useful than the current |
| 16 | +# behavior. |
| 17 | + |
2 | 18 | import MySQLdb |
3 | 19 | import re |
4 | 20 | import sys |
— | — | @@ -2,4 +18,11 @@ |
3 | 19 | import urllib |
| 20 | +import time |
| 21 | +import pyjudy |
4 | 22 | |
| 23 | +sampleHits = 100 # Hits represented by each sampled log line (inverse of the sampling rate) |
| 24 | +aggThresh = 3600 # Aggregation window length in seconds; once spanned, aggregates are flushed to the database |
| 25 | + |
5 | 26 | globalConnection = None |
| 27 | +aggCounter = pyjudy.JudySLInt() |
| 28 | +aggRange = (sys.maxint,0) |
6 | 29 | |
— | — | @@ -11,13 +34,16 @@ |
12 | 35 | # or common skin files |
13 | 36 | if line.find(" GET http://upload.wikimedia.org/") == -1 \ |
14 | 37 | and line.find(".org/skins-1.5/") == -1: |
15 | | - page = extractPage(line) |
| 38 | + page,timestamp = extractPage(line) or (None,None) |
16 | 39 | if page and (targetPages == None or page in targetPages): |
17 | | - recordHit(page) |
| 40 | + recordHit(page,timestamp) |
18 | 41 | closeConnection() |
19 | 42 | |
20 | 43 | def extractPage(line): |
21 | | - url = extractUrl(line) |
| 44 | + # Extract the page name from the URL. |
| 45 | + # A check should probably be placed here to toss requests with |
| 46 | + # page names larger than the maximum length. |
| 47 | + url,timestamp = extractUrl(line) or (None,None) |
22 | 48 | if url and \ |
23 | 49 | "?" not in url and \ |
24 | 50 | url[0:7] == "http://": |
— | — | @@ -25,33 +51,49 @@ |
26 | 52 | if len(bits) == 3 and bits[1] == "wiki": |
27 | 53 | host = bits[0] |
28 | 54 | page = normalizePage(bits[2]) |
29 | | - return host + ":" + page |
| 55 | + return (host + ":" + page, timestamp) |
30 | 56 | return None |
31 | 57 | |
32 | 58 | def extractUrl(line): |
33 | 59 | # https://wikitech.leuksman.com/view/Squid_log_format |
34 | 60 | # $hostname %sn %ts.%03tu %tr %>a %Ss/%03Hs %<st %rm %ru %Sh/%<A %mt %{Referer}>h %{X-Forwarded-For}>h %{User-Agent}>h |
35 | 61 | # ... |
| 62 | + # 3. Seconds (and milliseconds) since epoch |
| 63 | + # ... |
36 | 64 | # 9. URL |
37 | 65 | bits = line.split(" ", 10) |
38 | | - if len(bits) > 9 and bits[8] == "GET": |
39 | | - return bits[9] |
| 66 | + if len(bits) > 8 and bits[7] == "GET": |
| 67 | + return (bits[8],int(round(float(bits[2])))) |
40 | 68 | else: |
41 | 69 | return None |
42 | 70 | |
43 | 71 | def normalizePage(page): |
44 | 72 | return urllib.unquote(page).replace("_", " ") |
45 | 73 | |
46 | | -def recordHit(page): |
47 | | - (site, pagename) = page.split(":", 1) |
48 | | - conn = getConnection() |
49 | | - # fixme: format timestamp from the log line |
50 | | - conn.cursor().execute( |
51 | | - "INSERT INTO hit_counter (hc_ts, hc_site, hc_page) " + |
52 | | - "VALUES (CURRENT_TIMESTAMP(), %s, %s)", |
53 | | - (site, pagename)) |
54 | | - conn.commit() |
| 74 | +def recordHit(page,timestamp): |
| 75 | + global aggCounter |
| 76 | + global aggRange |
| 77 | + global aggThresh |
55 | 78 | |
| 79 | + if (max(timestamp,aggRange[1])-aggRange[0] >= aggThresh): |
| 80 | + for item in aggCounter.items(): |
| 81 | + (site, pagename) = item[0].split(":", 1) |
| 82 | + conn = getConnection() |
| 83 | + conn.cursor().execute( |
| 84 | + "INSERT INTO hit_counter (hc_tsstart, hc_tsend, hc_site, hc_page, hc_count) VALUES (%s, %s, %s, %s, %s)", |
| 85 | + (time.strftime("%Y-%m-%d %H:%M:%S",time.gmtime(aggRange[0])),time.strftime("%Y-%m-%d %H:%M:%S",time.gmtime(aggRange[1])),site, pagename, item[1])) |
| 86 | + conn.commit() |
| 87 | + aggRange=(aggRange[1],aggRange[1]) |
| 88 | + aggCounter.FreeArray() |
| 89 | + |
| 90 | + if page in aggCounter: |
| 91 | + aggCounter[page] += sampleHits |
| 92 | + else: |
| 93 | + aggCounter[page] = sampleHits |
| 94 | + aggRange=(min(timestamp,aggRange[0]),max(timestamp,aggRange[1])) |
| 95 | + |
| 96 | + |
| 97 | + |
56 | 98 | def getConnection(): |
57 | 99 | global globalConnection |
58 | 100 | if not globalConnection: |
— | — | @@ -67,17 +109,19 @@ |
68 | 110 | globalConnection.close() |
69 | 111 | globalConnection = None |
70 | 112 | |
71 | | -def listFromFile(filename): |
| 113 | +def setFromFile(filename): |
72 | 114 | """Read list of lines from a file""" |
73 | 115 | infile = open(filename) |
74 | | - out = [line.strip() for line in infile if line.strip() != ""] |
| 116 | + out = pyjudy.JudySLInt() |
| 117 | + for line in infile: |
| 118 | + if line.strip()!="": |
| 119 | + out.Ins(line.strip(),1) |
75 | 120 | infile.close() |
76 | | - out.sort() |
77 | 121 | return out |
78 | 122 | |
79 | 123 | if __name__ == "__main__": |
80 | 124 | if len(sys.argv) > 1: |
81 | | - targetPages = listFromFile(sys.argv[1]) |
| 125 | + targetPages = setFromFile(sys.argv[1]) |
82 | 126 | runLoop(sys.stdin, targetPages) |
83 | 127 | else: |
84 | 128 | runLoop(sys.stdin) |