r34988 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r34987‎ | r34988 | r34989 >
Date:18:06, 17 May 2008
Author:midom
Status:old
Tags:
Comment:
add squidlogs->collector filter, to sit between udp2log and log2udp
Modified paths:
  • /trunk/webstatscollector/Makefile (modified) (history)
  • /trunk/webstatscollector/filter.c (added) (history)

Diff [purge]

Index: trunk/webstatscollector/filter.c
@@ -0,0 +1,155 @@
 2+#include <string.h>
 3+#include <stdio.h>
 4+#include <stdbool.h>
 5+
 6+/*
 7+
 8+#!/usr/bin/python
 9+
 10+import re
 11+import sys
 12+
 13+dupes = re.compile('^(145\.97\.39\.|66\.230\.200\.|211\.115\.107\.|91\.198\.174\.)')
 14+urlre = re.compile('^http://([^\.]+)\.([^\.]+).org/wiki/([^?]+)')
 15+
 16+projects={"wikipedia":"","wiktionary":".d","wikinews":".n","wikimedia":".m","wikibooks":".b","wikisource":".s","mediawiki":".w","wikiversity":".v","wikiquote":".q" }
 17+
 18+for line in sys.stdin:
 19+ ip,undef,bytes,undef,url=line.split()[4:9]
 20+ if dupes.match(ip): continue
 21+ stuff=urlre.match(url)
 22+ if stuff == None: continue
 23+ language,project,title = stuff.groups()
 24+ if project=="wikimedia" and language not in ["commons","meta","incubator","species"]: continue
 25+ try: print language + projects[project] + " 1 " + bytes + " " + title
 26+ except: continue
 27+
 28+*/
 29+
/* Size of the input line buffer, in bytes. */
#define LINESIZE 4096

/*
 * Shared state for the tokenizer macros below:
 *   _sep      - delimiter set passed to the last TOKENIZE
 *   _lasttok  - strtok_r() save pointer
 *   _firsttok - first token returned by the last TOKENIZE
 * There is a single set of state, so starting a new TOKENIZE abandons
 * the previous one.
 */
char *_sep, *_lasttok, *_firsttok;

/*
 * TOKENIZE(x,y): start splitting the writable string x on the delimiter
 * set y. x is modified in place by strtok_r(). Wrapped in do/while(0) so
 * the macro behaves as one statement (safe under an unbraced if/else), and
 * y is evaluated exactly once.
 */
#define TOKENIZE(x,y) do { \
		_lasttok = NULL; \
		_sep = (y); \
		_firsttok = strtok_r((x), _sep, &_lasttok); \
	} while (0)
/* FIELD: next token of the current TOKENIZE, or NULL when exhausted. */
#define FIELD strtok_r(NULL,_sep,&_lasttok)
/*
 * TAIL: the raw strtok_r() save pointer. NOTE(review): callers treat it as
 * "the unparsed remainder"; POSIX does not specify its value, so this
 * depends on the C library's strtok_r() implementation.
 */
#define TAIL _lasttok
/* HEAD: first token produced by the most recent TOKENIZE. */
#define HEAD _firsttok
 36+
/*
 * Languages (subdomains) of the wikimedia project that are counted.
 * The list is NULL-terminated; check_wikimedia() relies on the sentinel.
 */
const char *wmwhitelist[] = {"commons","meta","incubator","species",NULL};

/*
 * Returns true when language is one of the whitelisted wikimedia
 * subdomains, false otherwise. language must be non-NULL.
 * The parameter stays a plain char * so the function can be stored in a
 * bool (*)(char *) filter slot.
 */
bool check_wikimedia(char *language) {
	const char **entry;

	for (entry = wmwhitelist; *entry; entry++) {
		if (strcmp(*entry, language) == 0)
			return true;
	}
	return false;
}
 46+
/* IP address prefixes from which duplicate requests originate
   (NULL-terminated list) */

const char *dupes[] = {"145.97.39.","66.230.200.",
	"208.80.152.","208.80.153.",
	"208.80.154.","208.80.155.",
	"211.115.107.","91.198.174.",
	NULL};

/*
 * Returns false when ip starts with one of the prefixes in dupes (the
 * request is a duplicate and should be skipped), true otherwise.
 * ip must be non-NULL; it is only read.
 */
bool check_ip(const char *ip) {
	const char **prefix;

	for (prefix = dupes; *prefix; prefix++) {
		if (strncmp(*prefix, ip, strlen(*prefix)) == 0)
			return false;
	}
	return true;
}
 63+
 64+const struct project {
 65+ char *full;
 66+ char *suffix;
 67+ bool (*filter)(char *);
 68+} projects[] = {
 69+ {"wikipedia","",NULL},
 70+ {"wiktionary",".d",NULL},
 71+ {"wikinews",".n",NULL},
 72+ {"wikimedia",".m",check_wikimedia},
 73+ {"wikibooks",".b",NULL},
 74+ {"wikisource",".s",NULL},
 75+ {"mediawiki",".w",NULL},
 76+ {"wikiversity",".v",NULL},
 77+ {"wikiquote",".q",NULL},
 78+ NULL
 79+ }, *project;
 80+
/*
 * Fields extracted from one log line. The string members point into the
 * buffers handed to the parser; nothing is copied.
 */
struct info {
	char *ip;
	char *size;
	char *language;	/* first label of the URL host */
	char *project;	/* second label of the URL host */
	char *title;	/* path after /wiki/, cut at the first '?' */
	char *suffix;
} info;

/*
 * Split off the next token of *cursor in place.
 * Skips leading delimiters, NUL-terminates the token, and leaves *cursor
 * at the unparsed remainder (an empty string when nothing is left).
 * Returns the token, or NULL when only delimiters (or nothing) remain.
 */
static char *url_next_token(char **cursor, const char *delims) {
	char *tok = *cursor + strspn(*cursor, delims);
	char *end;

	if (*tok == '\0') {
		*cursor = tok;
		return NULL;
	}
	end = tok + strcspn(tok, delims);
	if (*end != '\0')
		*end++ = '\0';
	*cursor = end;
	return tok;
}

/*
 * Parse a URL of the form <scheme>//<language>.<project>.org/wiki/<title>.
 *
 * url is modified in place (separators are overwritten with NULs). On
 * success in->language, in->project and in->title point into url and true
 * is returned. The title ends at the first '?', so query strings are
 * dropped.
 *
 * Returns false when url is NULL, the first path component is not "wiki",
 * the title is empty, or the host is not exactly <language>.<project>.org.
 * On a false return the fields of *in may have been partially written.
 */
bool parse_url(char *url, struct info *in) {
	char *cursor, *host, *dir;

	if (!url)
		return false;

	cursor = url;
	url_next_token(&cursor, "/");		/* http: */
	host = url_next_token(&cursor, "/");
	dir = url_next_token(&cursor, "/");
	if (!dir || strcmp(dir, "wiki") != 0)
		return false;			/* no /wiki/ part :( */

	/* Everything after "wiki/" is the title; cut the query string. */
	in->title = cursor;
	in->title[strcspn(in->title, "?")] = '\0';
	if (in->title[0] == '\0')
		return false;

	/* host is non-NULL here, because dir was found after it. */
	cursor = host;
	in->language = url_next_token(&cursor, ".");
	in->project = url_next_token(&cursor, ".");
	if (!in->language || !in->project)
		return false;

	/* Whatever follows "<language>.<project>." must be exactly "org". */
	return strcmp(cursor, "org") == 0;
}
 115+
 116+bool check_project(struct info *in) {
 117+ struct project *pr=projects;
 118+ for(;pr->full;pr++) {
 119+ if(!strcmp(in->project,pr->full)) {
 120+ in->suffix=pr->suffix;
 121+ /* Project found, check if filter needed */
 122+ if (pr->filter)
 123+ return pr->filter(in->language);
 124+ else
 125+ return true;
 126+ }
 127+ }
 128+ return false;
 129+}
 130+
 131+int main(int ac, char **av) {
 132+ char line[LINESIZE];
 133+
 134+ char *undef,*ip,*url, *size;
 135+ while(fgets(line,LINESIZE-1,stdin)) {
 136+ bzero(&info,sizeof(info));
 137+ /* Tokenize the log line */
 138+ TOKENIZE(line," "); /* server */
 139+ FIELD; /* id? */
 140+ FIELD; /* timestamp */
 141+ FIELD; /* ??? */
 142+ info.ip= FIELD; /* IP address! */
 143+ FIELD; /* status */
 144+ info.size= FIELD; /* object size */
 145+ FIELD;
 146+ url= FIELD;
 147+ if (!parse_url(url,&info))
 148+ continue;
 149+ if (!check_ip(info.ip))
 150+ continue;
 151+ if (!check_project(&info))
 152+ continue;
 153+ printf("%s%s 1 %s %s\n",info.language, info.suffix, info.size, info.title);
 154+ }
 155+}
 156+
Index: trunk/webstatscollector/Makefile
@@ -12,10 +12,13 @@
1313 #LDFLAGS+=-ldb
1414 CFLAGS+=-Wall -g
1515
16 -all: collector
 16+all: collector filter
1717
1818 collector: collector.h collector.c export.c
1919
 20+filter: filter.c
 21+ cc -o filter filter.c
 22+
2023 #export: collector.h export.c
2124
2225 clean:

Status & tagging log