r26324 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: r26323 | r26324 | r26325
Date: 21:36, 2 October 2007
Author: gmaxwell
Status: old
Tags:
Comment:

Change the collector to aggregate internally to reduce database/disk traffic.

This changeset introduces a dependency on pyjudy because Python dicts were causing enormous memory usage.

If the collector were recoded in C, it might actually perform well enough to be useful on 1:1 data; doing so wouldn't be a big task.
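
As a rough illustration of the aggregation approach (this is not the committed code: a plain Python dict stands in for pyjudy.JudySLInt, and the database insert is stubbed out with a print):

sampleHits = 100   # each sampled log line stands for this many real hits
aggThresh = 3600   # flush once the window spans this many seconds

counts = {}
windowStart = None

def recordHit(page, timestamp):
    """Accumulate sampled hits in memory, flushing one row per page per window."""
    global counts, windowStart
    if windowStart is None:
        windowStart = timestamp
    if timestamp - windowStart >= aggThresh:
        for key, count in counts.items():
            # In the committed code this is an INSERT into hit_counter
            # (hc_tsstart, hc_tsend, hc_site, hc_page, hc_count).
            print("%s %s %s %s" % (windowStart, timestamp, key, count))
        counts = {}
        windowStart = timestamp
    counts[page] = counts.get(page, 0) + sampleHits

This trades one row per request for one row per (page, window), which is what cuts the database/disk traffic.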
Modified paths:
  • /trunk/tools/counter/counter.py (modified)
  • /trunk/tools/counter/display/index.php (modified)
  • /trunk/tools/counter/hitcounter.sql (modified)

Diff

Index: trunk/tools/counter/hitcounter.sql
@@ -1,9 +1,12 @@
 CREATE TABLE hit_counter (
-    hc_ts TIMESTAMP,
+    hc_tsstart TIMESTAMP,
+    hc_tsend TIMESTAMP,

     hc_site VARCHAR(255) BINARY,
     hc_page VARCHAR(255) BINARY,

-    KEY (hc_ts, hc_site, hc_page),
-    KEY (hc_site, hc_page, hc_ts)
+    hc_count int8,
+
+    KEY (hc_tsend, hc_site, hc_page),
+    KEY (hc_site, hc_page, hc_tsend)
 ) CHARSET=binary;
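
With this schema, each row now covers one page over one aggregation window rather than one request. For example (values invented purely for illustration), a row with hc_tsstart = '2007-10-02 20:00:00', hc_tsend = '2007-10-02 21:00:00', hc_site = 'en.wikipedia.org', hc_page = 'Main Page', hc_count = 300 would record roughly 300 scaled-up sampled hits to that page during that hour.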
Index: trunk/tools/counter/display/index.php
@@ -36,7 +36,7 @@
         return debugError('Failed to select database.');
     }

-    $res = mysql_query('SELECT hc_site, hc_page, COUNT(*) AS hc_count FROM hit_counter GROUP BY hc_site, hc_page ORDER BY hc_site, hc_page');
+    $res = mysql_query('SELECT hc_site, hc_page, sum(hc_count) AS hc_count FROM hit_counter GROUP BY hc_site, hc_page ORDER BY hc_site, hc_page');
     if ($res === false) {
         return debugError('Query failed.');
     }
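
The display change mirrors the collector change: totals per page are now the sum of pre-aggregated per-window counts rather than a count of one row per hit. A rough, illustrative Python equivalent of the new query (connection parameters are placeholders, not taken from this repository):

import MySQLdb

conn = MySQLdb.connect(host="localhost", user="counter", passwd="secret", db="hitcounter")
cur = conn.cursor()
cur.execute("SELECT hc_site, hc_page, SUM(hc_count) AS hc_count FROM hit_counter "
            "GROUP BY hc_site, hc_page ORDER BY hc_site, hc_page")
for site, page, count in cur.fetchall():
    print("%s %s %s" % (site, page, count))
conn.close()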
Index: trunk/tools/counter/counter.py
@@ -1,3 +1,19 @@
+#!/usr/bin/python
+
+#Page view counter
+# Reads squid logs (https://wikitech.leuksman.com/view/Squid_log_format)
+# Normalizes page name, aggregates them for a configurable time window, shoves the
+# aggregates into a database.
+# Usage: ./counter.py [list of allowed pages] < logfile
+# Be sure sampleHits is set correctly
+
+#Notes:
+# * Requires pyjudy (http://www.dalkescientific.com/Python/PyJudy.html)
+#   (python dicts and sets use too much darn memory)
+# * The final incomplete aggregation window is discarded.
+# * Fixed aggregation windows that align to time of day may be more useful than the current
+#   behavior.
+
 import MySQLdb
 import re
 import sys
@@ -2,4 +18,11 @@
 import urllib
+import time
+import pyjudy

+sampleHits = 100 # Number of hits to record per sample
+aggThresh = 3600 # Number of sample seconds needed to trigger a data export
+
 globalConnection = None
+aggCounter = pyjudy.JudySLInt()
+aggRange = (sys.maxint,0)

@@ -11,13 +34,16 @@
         # or common skin files
         if line.find(" GET http://upload.wikimedia.org/") == -1 \
            and line.find(".org/skins-1.5/") == -1:
-            page = extractPage(line)
+            page,timestamp = extractPage(line)
             if page and (targetPages == None or page in targetPages):
-                recordHit(page)
+                recordHit(page,timestamp)
     closeConnection()

 def extractPage(line):
-    url = extractUrl(line)
+    # Extract the page name from the URL.
+    # A check should probably be placed here to toss requests with
+    # page names larger than the maximum length.
+    url,timestamp = extractUrl(line)
     if url and \
        "?" not in url and \
        url[0:7] == "http://":
@@ -25,33 +51,49 @@
         if len(bits) == 3 and bits[1] == "wiki":
             host = bits[0]
             page = normalizePage(bits[2])
-            return host + ":" + page
+            return (host + ":" + page, timestamp)
     return None

 def extractUrl(line):
     # https://wikitech.leuksman.com/view/Squid_log_format
     # $hostname %sn %ts.%03tu %tr %>a %Ss/%03Hs %<st %rm %ru %Sh/%<A %mt %{Referer}>h %{X-Forwarded-For}>h %{User-Agent}>h
     # ...
+    # 3. Seconds (and milliseconds) since epoch
+    # ...
     # 9. URL
     bits = line.split(" ", 10)
-    if len(bits) > 9 and bits[8] == "GET":
-        return bits[9]
+    if len(bits) > 8 and bits[7] == "GET":
+        return (bits[8],int(round(float(bits[2]))))
     else:
         return None

 def normalizePage(page):
     return urllib.unquote(page).replace("_", " ")

-def recordHit(page):
-    (site, pagename) = page.split(":", 1)
-    conn = getConnection()
-    # fixme: format timestamp from the log line
-    conn.cursor().execute(
-        "INSERT INTO hit_counter (hc_ts, hc_site, hc_page) " +
-        "VALUES (CURRENT_TIMESTAMP(), %s, %s)",
-        (site, pagename))
-    conn.commit()
+def recordHit(page,timestamp):
+    global aggCounter
+    global aggRange
+    global aggThresh

+    if (max(timestamp,aggRange[1])-aggRange[0] >= aggThresh):
+        for item in aggCounter.items():
+            (site, pagename) = item[0].split(":", 1)
+            conn = getConnection()
+            conn.cursor().execute(
+                "INSERT INTO hit_counter (hc_tsstart, hc_tsend, hc_site, hc_page, hc_count) VALUES (%s, %s, %s, %s, %s)",
+                (time.strftime("%Y-%m-%d %H:%M:%S",time.gmtime(aggRange[0])),time.strftime("%Y-%m-%d %H:%M:%S",time.gmtime(aggRange[1])),site, pagename, item[1]))
+            conn.commit()
+        aggRange=(aggRange[1],aggRange[1])
+        aggCounter.FreeArray()
+
+    if page in aggCounter:
+        aggCounter[page] += sampleHits
+    else:
+        aggCounter[page] = sampleHits
+    aggRange=(min(timestamp,aggRange[0]),max(timestamp,aggRange[1]))
+
+
+
 def getConnection():
     global globalConnection
     if not globalConnection:
@@ -67,17 +109,19 @@
         globalConnection.close()
         globalConnection = None

-def listFromFile(filename):
+def setFromFile(filename):
     """Read list of lines from a file"""
     infile = open(filename)
-    out = [line.strip() for line in infile if line.strip() != ""]
+    out = pyjudy.JudySLInt()
+    for line in infile:
+        if line.strip()!="":
+            out.Ins(line.strip(),1)
     infile.close()
-    out.sort()
     return out

 if __name__ == "__main__":
     if len(sys.argv) > 1:
-        targetPages = listFromFile(sys.argv[1])
+        targetPages = setFromFile(sys.argv[1])
         runLoop(sys.stdin, targetPages)
     else:
         runLoop(sys.stdin)
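
To make the field indexing in the new extractUrl concrete, here is an illustrative walk-through; the sample line below is invented to match the documented squid log format, not taken from real logs:

# Invented sample line following the documented format
sample = ("sq18.wikimedia.org 1289 1191360000.123 45 203.0.113.7 TCP_MISS/200 "
          "1234 GET http://en.wikipedia.org/wiki/Main_Page NONE/- text/html - - Mozilla/5.0")
bits = sample.split(" ", 10)
assert bits[7] == "GET"                   # field 8: request method (%rm)
url = bits[8]                             # field 9: request URL (%ru)
timestamp = int(round(float(bits[2])))    # field 3: seconds (and ms) since the epoch (%ts.%03tu)
print("%s %s" % (url, timestamp))

These are the fields that the changeset's new bits[7]/bits[8]/bits[2] indices pick out.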
