Index: trunk/webstatscollector/filter.c |
— | — | @@ -0,0 +1,155 @@ |
| 2 | +#include <string.h> |
| 3 | +#include <stdio.h> |
| 4 | +#include <stdbool.h> |
| 5 | + |
| 6 | +/* |
| 7 | + |
| 8 | +#!/usr/bin/python |
| 9 | + |
| 10 | +import re |
| 11 | +import sys |
| 12 | + |
| 13 | +dupes = re.compile('^(145\.97\.39\.|66\.230\.200\.|211\.115\.107\.|91\.198\.174\.)') |
| 14 | +urlre = re.compile('^http://([^\.]+)\.([^\.]+).org/wiki/([^?]+)') |
| 15 | + |
| 16 | +projects={"wikipedia":"","wiktionary":".d","wikinews":".n","wikimedia":".m","wikibooks":".b","wikisource":".s","mediawiki":".w","wikiversity":".v","wikiquote":".q" } |
| 17 | + |
| 18 | +for line in sys.stdin: |
| 19 | + ip,undef,bytes,undef,url=line.split()[4:9] |
| 20 | + if dupes.match(ip): continue |
| 21 | + stuff=urlre.match(url) |
| 22 | + if stuff == None: continue |
| 23 | + language,project,title = stuff.groups() |
| 24 | + if project=="wikimedia" and language not in ["commons","meta","incubator","species"]: continue |
| 25 | + try: print language + projects[project] + " 1 " + bytes + " " + title |
| 26 | + except: continue |
| 27 | + |
| 28 | +*/ |
| 29 | + |
| 30 | +#define LINESIZE 4096 |
| 31 | +char *_sep, *_lasttok, *_firsttok; |
| 32 | +#define TOKENIZE(x,y) _lasttok=NULL; _sep=y; _firsttok=strtok_r(x,y,&_lasttok); |
| 33 | +#define FIELD strtok_r(NULL,_sep,&_lasttok) |
| 34 | +#define TAIL _lasttok |
| 35 | +#define HEAD _firsttok |
| 36 | + |
/*
 * Subdomains of wikimedia.org whose hits should be counted.
 * BUGFIX: the list must end in NULL -- check_wikimedia's loop stops on a
 * NULL entry, and without the sentinel it read past the end of the
 * array (undefined behavior) for every non-whitelisted language.
 */
char *wmwhitelist[] = {"commons","meta","incubator","species",NULL};

/*
 * Return true when `language` (the first host label of a *.wikimedia.org
 * URL) is one of the whitelisted subdomains, false otherwise.
 */
bool check_wikimedia(char *language) {
	char **p=wmwhitelist;
	for(;*p;p++) {
		if(!strcmp(*p,language))
			return true;
	}
	return false;
}
| 46 | + |
/* IP addresses from which duplicate requests originate */

char *dupes[] = {"145.97.39.","66.230.200.",
		"208.80.152.","208.80.153.",
		"208.80.154.","208.80.155.",
		"211.115.107.","91.198.174.",
		NULL};

/*
 * Return true when `ip` should be counted, i.e. it does not start with
 * any of the known duplicate-traffic prefixes above.
 */
bool check_ip(char *ip) {
	char **cand;
	for (cand = dupes; *cand != NULL; cand++) {
		size_t plen = strlen(*cand);
		if (strncmp(ip, *cand, plen) == 0)
			return false;	/* known duplicate source: drop */
	}
	return true;
}
| 63 | + |
| 64 | +const struct project { |
| 65 | + char *full; |
| 66 | + char *suffix; |
| 67 | + bool (*filter)(char *); |
| 68 | +} projects[] = { |
| 69 | + {"wikipedia","",NULL}, |
| 70 | + {"wiktionary",".d",NULL}, |
| 71 | + {"wikinews",".n",NULL}, |
| 72 | + {"wikimedia",".m",check_wikimedia}, |
| 73 | + {"wikibooks",".b",NULL}, |
| 74 | + {"wikisource",".s",NULL}, |
| 75 | + {"mediawiki",".w",NULL}, |
| 76 | + {"wikiversity",".v",NULL}, |
| 77 | + {"wikiquote",".q",NULL}, |
| 78 | + NULL |
| 79 | + }, *project; |
| 80 | + |
/*
 * Parsed fields of one log line.  All members point into the line/url
 * buffers that the tokenizer mutates in place -- nothing here is owned
 * or allocated, and the pointers are only valid until the next line is
 * read.
 */
struct info {
	char *ip;	/* client IP address (5th whitespace field of the log line) */
	char *size;	/* object size, kept as text (7th field) */
	char *language;	/* first host label, e.g. "en" */
	char *project;	/* second host label, e.g. "wikipedia" */
	char *title;	/* path after /wiki/, with any ?query stripped */
	char *suffix;	/* per-project output suffix, filled in by check_project */
} info;
| 89 | + |
/*
 * Split `url` -- expected shape http://<language>.<project>.org/wiki/<title>[?query]
 * -- into its pieces and store them in *in.  The URL buffer is modified
 * in place (separators are overwritten with NULs); the stored pointers
 * alias it.  Returns false when the URL does not match that shape.
 */
bool parse_url(char *url, struct info *in) {
	if (!url)
		return false;
	char *host, *lang, *project, *dir;

	TOKENIZE(url,"/"); /* "//" collapses: first token is "http:", next the host */
	host=FIELD;
	dir=FIELD;
	if (!dir)
		return false;
	if (strcmp(dir,"wiki"))
		return false; /* no /wiki/ part :( */
	in->title=TAIL;		/* rest of the path, still including any ?query */
	TOKENIZE(in->title,"?");	/* truncate the title at the first '?' (in place) */

	TOKENIZE(host,".");	/* split host into language.project.tld */
	in->language=HEAD;
	in->project=FIELD;
	/* NOTE(review): assumes TAIL is non-NULL here even when the host had
	 * fewer than three labels (glibc leaves the save pointer at the string
	 * end, yielding "") -- confirm on other libc implementations. */
	if(strcmp(TAIL,"org"))
		return false;	/* only <lang>.<project>.org hosts are counted */
	if (in->language && in->project)
		return true;
	else
		return false;
}
| 115 | + |
| 116 | +bool check_project(struct info *in) { |
| 117 | + struct project *pr=projects; |
| 118 | + for(;pr->full;pr++) { |
| 119 | + if(!strcmp(in->project,pr->full)) { |
| 120 | + in->suffix=pr->suffix; |
| 121 | + /* Project found, check if filter needed */ |
| 122 | + if (pr->filter) |
| 123 | + return pr->filter(in->language); |
| 124 | + else |
| 125 | + return true; |
| 126 | + } |
| 127 | + } |
| 128 | + return false; |
| 129 | +} |
| 130 | + |
| 131 | +int main(int ac, char **av) { |
| 132 | + char line[LINESIZE]; |
| 133 | + |
| 134 | + char *undef,*ip,*url, *size; |
| 135 | + while(fgets(line,LINESIZE-1,stdin)) { |
| 136 | + bzero(&info,sizeof(info)); |
| 137 | + /* Tokenize the log line */ |
| 138 | + TOKENIZE(line," "); /* server */ |
| 139 | + FIELD; /* id? */ |
| 140 | + FIELD; /* timestamp */ |
| 141 | + FIELD; /* ??? */ |
| 142 | + info.ip= FIELD; /* IP address! */ |
| 143 | + FIELD; /* status */ |
| 144 | + info.size= FIELD; /* object size */ |
| 145 | + FIELD; |
| 146 | + url= FIELD; |
| 147 | + if (!parse_url(url,&info)) |
| 148 | + continue; |
| 149 | + if (!check_ip(info.ip)) |
| 150 | + continue; |
| 151 | + if (!check_project(&info)) |
| 152 | + continue; |
| 153 | + printf("%s%s 1 %s %s\n",info.language, info.suffix, info.size, info.title); |
| 154 | + } |
| 155 | +} |
| 156 | + |
Index: trunk/webstatscollector/Makefile |
— | — | @@ -12,10 +12,13 @@ |
13 | 13 | #LDFLAGS+=-ldb |
14 | 14 | CFLAGS+=-Wall -g |
15 | 15 | |
16 | | -all: collector |
| 16 | +all: collector filter |
17 | 17 | |
18 | 18 | collector: collector.h collector.c export.c |
19 | 19 | |
| 20 | +filter: filter.c |
| 21 | + cc -o filter filter.c |
| 22 | + |
20 | 23 | #export: collector.h export.c |
21 | 24 | |
22 | 25 | clean: |