r34988 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r34987 | r34988 | r34989 >
Date:	18:06, 17 May 2008
Author:	midom
Status:	old
Tags:
Comment:	add squidlogs->collector filter, to sit between udp2log and log2udp
Modified paths:	/trunk/webstatscollector/Makefile (modified) (history) /trunk/webstatscollector/filter.c (added) (history)

Diff [purge]

Index: trunk/webstatscollector/filter.c
—	—	@@ -0,0 +1,155 @@
	2	+#include <string.h>
	3	+#include <stdio.h>
	4	+#include <stdbool.h>
	5	+
	6	+/*
	7	+
	8	+#!/usr/bin/python
	9	+
	10	+import re
	11	+import sys
	12	+
	13	+dupes = re.compile('^(145\.97\.39\.|66\.230\.200\.|211\.115\.107\.|91\.198\.174\.)')
	14	+urlre = re.compile('^http://([^\.]+)\.([^\.]+).org/wiki/([^?]+)')
	15	+
	16	+projects={"wikipedia":"","wiktionary":".d","wikinews":".n","wikimedia":".m","wikibooks":".b","wikisource":".s","mediawiki":".w","wikiversity":".v","wikiquote":".q" }
	17	+
	18	+for line in sys.stdin:
	19	+ ip,undef,bytes,undef,url=line.split()[4:9]
	20	+ if dupes.match(ip): continue
	21	+ stuff=urlre.match(url)
	22	+ if stuff == None: continue
	23	+ language,project,title = stuff.groups()
	24	+ if project=="wikimedia" and language not in ["commons","meta","incubator","species"]: continue
	25	+ try: print language + projects[project] + " 1 " + bytes + " " + title
	26	+ except: continue
	27	+
	28	+*/
	29	+
	30	+#define LINESIZE 4096
	31	+char *_sep, *_lasttok, *_firsttok;
	32	+#define TOKENIZE(x,y) _lasttok=NULL; _sep=y; _firsttok=strtok_r(x,y,&_lasttok);
	33	+#define FIELD strtok_r(NULL,_sep,&_lasttok)
	34	+#define TAIL _lasttok
	35	+#define HEAD _firsttok
	36	+
	37	+char *wmwhitelist[] = {"commons","meta","incubator","species"};
	38	+bool check_wikimedia(char *language) {
	39	+ char **p=wmwhitelist;
	40	+ for(;*p;p++) {
	41	+ if(!strcmp(*p,language))
	42	+ return true;
	43	+ }
	44	+ return false;
	45	+}
	46	+
	47	+/* IP addresses from which duplicate requests originate */
	48	+
	49	+char *dupes[] = {"145.97.39.","66.230.200.",
	50	+ "208.80.152.","208.80.153.",
	51	+ "208.80.154.","208.80.155.",
	52	+ "211.115.107.","91.198.174.",
	53	+ NULL};
	54	+
	55	+bool check_ip(char *ip) {
	56	+ char **prefix=dupes;
	57	+ for (;*prefix;prefix++) {
	58	+ if(!strncmp(*prefix,ip,strlen(*prefix)))
	59	+ return false;
	60	+ }
	61	+ return true;
	62	+}
	63	+
	64	+const struct project {
	65	+ char *full;
	66	+ char *suffix;
	67	+ bool (*filter)(char *);
	68	+} projects[] = {
	69	+ {"wikipedia","",NULL},
	70	+ {"wiktionary",".d",NULL},
	71	+ {"wikinews",".n",NULL},
	72	+ {"wikimedia",".m",check_wikimedia},
	73	+ {"wikibooks",".b",NULL},
	74	+ {"wikisource",".s",NULL},
	75	+ {"mediawiki",".w",NULL},
	76	+ {"wikiversity",".v",NULL},
	77	+ {"wikiquote",".q",NULL},
	78	+ NULL
	79	+ }, *project;
	80	+
	81	+struct info {
	82	+ char *ip;
	83	+ char *size;
	84	+ char *language;
	85	+ char *project;
	86	+ char *title;
	87	+ char *suffix;
	88	+} info;
	89	+
	90	+bool parse_url(char *url, struct info *in) {
	91	+ if (!url)
	92	+ return false;
	93	+ char *host, *lang, *project, *dir;
	94	+
	95	+ TOKENIZE(url,"/"); /* http: */
	96	+ host=FIELD;
	97	+ dir=FIELD;
	98	+ if (!dir)
	99	+ return false;
	100	+ if (strcmp(dir,"wiki"))
	101	+ return false; /* no /wiki/ part :( */
	102	+ in->title=TAIL;
	103	+ TOKENIZE(in->title,"?");
	104	+
	105	+ TOKENIZE(host,".");
	106	+ in->language=HEAD;
	107	+ in->project=FIELD;
	108	+ if(strcmp(TAIL,"org"))
	109	+ return false;
	110	+ if (in->language && in->project)
	111	+ return true;
	112	+ else
	113	+ return false;
	114	+}
	115	+
	116	+bool check_project(struct info *in) {
	117	+ struct project *pr=projects;
	118	+ for(;pr->full;pr++) {
	119	+ if(!strcmp(in->project,pr->full)) {
	120	+ in->suffix=pr->suffix;
	121	+ /* Project found, check if filter needed */
	122	+ if (pr->filter)
	123	+ return pr->filter(in->language);
	124	+ else
	125	+ return true;
	126	+ }
	127	+ }
	128	+ return false;
	129	+}
	130	+
	131	+int main(int ac, char **av) {
	132	+ char line[LINESIZE];
	133	+
	134	+ char *undef,*ip,*url, *size;
	135	+ while(fgets(line,LINESIZE-1,stdin)) {
	136	+ bzero(&info,sizeof(info));
	137	+ /* Tokenize the log line */
	138	+ TOKENIZE(line," "); /* server */
	139	+ FIELD; /* id? */
	140	+ FIELD; /* timestamp */
	141	+ FIELD; /* ??? */
	142	+ info.ip= FIELD; /* IP address! */
	143	+ FIELD; /* status */
	144	+ info.size= FIELD; /* object size */
	145	+ FIELD;
	146	+ url= FIELD;
	147	+ if (!parse_url(url,&info))
	148	+ continue;
	149	+ if (!check_ip(info.ip))
	150	+ continue;
	151	+ if (!check_project(&info))
	152	+ continue;
	153	+ printf("%s%s 1 %s %s\n",info.language, info.suffix, info.size, info.title);
	154	+ }
	155	+}
	156	+
Index: trunk/webstatscollector/Makefile
—	—	@@ -12,10 +12,13 @@
13	13	#LDFLAGS+=-ldb
14	14	CFLAGS+=-Wall -g
15	15
16		~~-all: collector~~
	16	+all: collector filter
17	17
18	18	collector: collector.h collector.c export.c
19	19
	20	+filter: filter.c
	21	+ cc -o filter filter.c
	22	+
20	23	#export: collector.h export.c
21	24
22	25	clean:

Status & tagging log

15:26, 12 September 2011 Meno25 (talk | contribs) changed the status of r34988 [removed: ok added: old]