Index: branches/ariel/xmldumps-backup/mwbzutils/findpageidinbz2xml.c |
— | — | @@ -1,4 +1,5 @@ |
2 | 2 | #include <unistd.h> |
| 3 | +#include <getopt.h> |
3 | 4 | #include <stdio.h> |
4 | 5 | #include <string.h> |
5 | 6 | #include <sys/types.h> |
— | — | @@ -9,9 +10,9 @@ |
10 | 11 | #include <sys/types.h> |
11 | 12 | #include <regex.h> |
12 | 13 | #include <inttypes.h> |
| 14 | +#include <zlib.h> |
13 | 15 | #include "mwbzutils.h" |
14 | 16 | |
15 | | - |
16 | 17 | /* |
17 | 18 | find the first bz2 block marker in the file, |
18 | 19 | from its current position, |
— | — | @@ -23,6 +24,13 @@ |
24 | 25 | int init_and_read_first_buffer_bz2_file(bz_info_t *bfile, int fin) { |
25 | 26 | int res; |
26 | 27 | |
| 28 | + bfile->bufin_size = BUFINSIZE; |
| 29 | + bfile->marker = init_marker(); |
| 30 | + bfile->bytes_read = 0; |
| 31 | + bfile->bytes_written = 0; |
| 32 | + bfile->eof = 0; |
| 33 | + bfile->file_size = get_file_size(fin); |
| 34 | + |
27 | 35 | bfile->initialized++; |
28 | 36 | |
29 | 37 | res = find_next_bz2_block_marker(fin, bfile, FORWARD); |
— | — | @@ -32,35 +40,244 @@ |
33 | 41 | setup_first_buffer_to_decompress(fin, bfile); |
34 | 42 | return(0); |
35 | 43 | } |
| 44 | + else { |
 | 45 | + fprintf(stderr,"failed to find the next bz2 block marker\n"); |
| 46 | + return(-1); |
| 47 | + } |
| 48 | +} |
| 49 | + |
| 50 | +extern char * geturl(char *hostname, int port, char *url); |
| 51 | + |
| 52 | +char *get_hostname_from_xml_header(int fin) { |
| 53 | + int res; |
| 54 | + regmatch_t *match_base_expr; |
| 55 | + regex_t compiled_base_expr; |
| 56 | + /* <base>http://el.wiktionary.org/wiki/...</base> */ |
| 57 | + /* <base>http://trouble.localdomain/wiki/ */ |
| 58 | + char *base_expr = "<base>http://([^/]+)/"; |
| 59 | + int length=5000; /* output buffer size */ |
| 60 | + |
| 61 | + buf_info_t *b; |
| 62 | + bz_info_t bfile; |
| 63 | + |
| 64 | + int hostname_length = 0; |
| 65 | + |
| 66 | + off_t old_position, seek_result; |
| 67 | + static char hostname[256]; |
| 68 | + |
| 69 | + bfile.initialized = 0; |
| 70 | + |
| 71 | + res = regcomp(&compiled_base_expr, base_expr, REG_EXTENDED); |
| 72 | + match_base_expr = (regmatch_t *)malloc(sizeof(regmatch_t)*2); |
| 73 | + |
| 74 | + b = init_buffer(length); |
| 75 | + bfile.bytes_read = 0; |
| 76 | + |
| 77 | + bfile.position = (off_t)0; |
| 78 | + old_position = lseek(fin,(off_t)0,SEEK_CUR); |
| 79 | + seek_result = lseek(fin,(off_t)0,SEEK_SET); |
| 80 | + |
| 81 | + while ((get_buffer_of_uncompressed_data(b, fin, &bfile, FORWARD)>=0) && (! bfile.eof)) { |
| 82 | + /* so someday the header might grow enough that <base> isn't in the first 1000 characters but we'll ignore that for now */ |
| 83 | + if (bfile.bytes_read && b->bytes_avail > 1000) { |
| 84 | + /* get project name and language name from the file header |
| 85 | + format: |
| 86 | + <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/ http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5" xml:lang="el"> |
| 87 | + <siteinfo> |
| 88 | + <sitename>Βικιλεξικό</sitename> |
| 89 | + <base>http://el.wiktionary.org/wiki/...</base> |
| 90 | + */ |
| 91 | + if (regexec(&compiled_base_expr, (char *)b->next_to_read, 2, match_base_expr, 0 ) == 0) { |
| 92 | + if (match_base_expr[1].rm_so >=0) { |
| 93 | + hostname_length = match_base_expr[1].rm_eo - match_base_expr[1].rm_so; |
 | 94 | + if (hostname_length >= sizeof(hostname)) { |
| 95 | + fprintf(stderr,"very long hostname, giving up\n"); |
| 96 | + break; |
| 97 | + } |
| 98 | + else { |
| 99 | + memcpy(hostname,(char *)b->next_to_read + match_base_expr[1].rm_so, hostname_length); |
| 100 | + hostname[hostname_length] = '\0'; |
| 101 | + b->next_to_read = b->end; |
| 102 | + b->bytes_avail = 0; |
| 103 | + b->next_to_fill = b->buffer; /* empty */ |
| 104 | + bfile.strm.next_out = (char *)b->next_to_fill; |
| 105 | + bfile.strm.avail_out = b->end - b->next_to_fill; |
| 106 | + res = BZ2_bzDecompressEnd ( &(bfile.strm) ); |
| 107 | + seek_result = lseek(fin,old_position,SEEK_SET); |
| 108 | + free_buffer(b); |
| 109 | + return(hostname); |
| 110 | + } |
| 111 | + } |
| 112 | + } |
| 113 | + else { |
| 114 | + break; |
| 115 | + } |
| 116 | + } |
| 117 | + } |
| 118 | + res = BZ2_bzDecompressEnd ( &(bfile.strm) ); |
| 119 | + seek_result = lseek(fin,old_position,SEEK_SET); |
| 120 | + free_buffer(b); |
| 121 | + return(NULL); |
| 122 | +} |
| 123 | + |
| 124 | +int has_xml_tag(char *line, char *tag) { |
| 125 | + return(! strncmp(line,tag,strlen(tag))); |
| 126 | +} |
| 127 | + |
 | 128 | +/* assumes the open tag, close tag and value are all on the same line */ |
| 129 | +long int get_xml_elt_value(char *line, char *tag) { |
| 130 | + return(atol(line+strlen(tag))); |
| 131 | +} |
| 132 | + |
| 133 | +/* returns pageid, or -1 on error. this requires the name of a stub file |
| 134 | + which contains all page ids and revisions ids in our standard xml format. |
| 135 | + It scans through the entire file looking for the page id which corresponds |
| 136 | + to the revision id. This can take up to 5 minutes for the larger |
| 137 | + stub history files; clearly we don't want to do this unless we |
| 138 | + have no other option. |
 | 139 | + we need this in the case where the page text is huge (e.g. en wp pageid 5137507, |
 | 140 | + which has a cumulative text length across all revisions of > 163 GB). |
| 141 | + This can take over two hours to uncompress and scan through looking for |
| 142 | + the next page id, so we cheat */ |
| 143 | +long int get_page_id_from_rev_id_via_stub(long int rev_id, char *stubfile) { |
 | 144 | + gzFile gz; |
| 145 | + int page_id = -1; |
| 146 | + char buf[8192]; |
| 147 | + char *bufp; |
| 148 | + enum States{WantPage,WantPageID,WantRevOrPage,WantRevID}; |
| 149 | + int state; |
| 150 | + long int temp_rev_id; |
| 151 | + |
| 152 | + gz = gzopen(stubfile,"r"); |
| 153 | + state = WantPage; |
| 154 | + while ((bufp = gzgets(gz,buf,8191)) != NULL) { |
| 155 | + while (*bufp == ' ') bufp++; |
| 156 | + if (state == WantPage) { |
| 157 | + if (has_xml_tag(bufp,"<page>")) { |
| 158 | + state = WantPageID; |
| 159 | + } |
| 160 | + } |
| 161 | + else if (state == WantPageID) { |
| 162 | + if (has_xml_tag(bufp,"<id>")) { |
| 163 | + page_id = get_xml_elt_value(bufp,"<id>"); |
| 164 | + state = WantRevOrPage; |
| 165 | + } |
| 166 | + } |
| 167 | + else if (state == WantRevOrPage) { |
| 168 | + if (has_xml_tag(bufp,"<revision>")) { |
| 169 | + state = WantRevID; |
| 170 | + } |
| 171 | + else if (has_xml_tag(bufp,"<page>")) { |
| 172 | + state = WantPageID; |
| 173 | + } |
| 174 | + } |
| 175 | + else if (state == WantRevID) { |
| 176 | + if (has_xml_tag(bufp,"<id>")) { |
| 177 | + temp_rev_id = get_xml_elt_value(bufp,"<id>"); |
| 178 | + if (temp_rev_id == rev_id) { |
| 179 | + return(page_id); |
| 180 | + } |
| 181 | + /* this permits multiple revs in the page */ |
| 182 | + state = WantRevOrPage; |
| 183 | + } |
| 184 | + } |
| 185 | + } |
36 | 186 | return(-1); |
37 | 187 | } |
38 | 188 | |
| 189 | +/* returns pageid, or -1 on error. this requires network access, |
| 190 | + it does an api call to the appropriate server for the appropriate project |
 | 191 | + we need this in the case where the page text is huge (e.g. en wp pageid 5137507, |
 | 192 | + which has a cumulative text length across all revisions of > 163 GB). |
| 193 | + This can take over two hours to uncompress and scan through looking for |
| 194 | + the next page id, so we cheat */ |
| 195 | +int get_page_id_from_rev_id_via_api(long int rev_id, int fin) { |
| 196 | + /* char hostname[80]; */ |
| 197 | + char *hostname; |
| 198 | + char url[80]; |
| 199 | + char *buffer; |
| 200 | + long int page_id = -1; |
| 201 | + char *api_call = "/w/api.php?action=query&format=xml&revids="; |
| 202 | + regmatch_t *match_page_id_expr; |
| 203 | + regex_t compiled_page_id_expr; |
| 204 | + char *page_id_expr = "<pages><page pageid=\"([0-9]+)\""; |
| 205 | + int res; |
| 206 | + |
| 207 | + hostname = get_hostname_from_xml_header(fin); |
| 208 | + if (!hostname) { |
| 209 | + return(-1); |
| 210 | + } |
| 211 | + |
| 212 | + /* |
| 213 | + if (strlen(lang) + strlen(project) + strlen(".org") > sizeof(hostname)-2) { |
| 214 | + fprintf(stderr,"language code plus project name is huuuge string, giving up\n"); |
| 215 | + return(-1); |
| 216 | + } |
| 217 | + sprintf(hostname,"%s.%s.org",lang,project); |
| 218 | + */ |
| 219 | + sprintf(url,"%s%ld",api_call,rev_id); |
| 220 | + |
| 221 | + buffer = geturl(hostname, 80, url); |
| 222 | + if (buffer == NULL) { |
| 223 | + return(-1); |
| 224 | + } |
| 225 | + else { |
| 226 | + /* dig the page id out of the buffer |
| 227 | + format: |
| 228 | + <?xml version="1.0"?><api><query><pages><page pageid="6215" ns="0" title="hystérique" /></pages></query></api> |
| 229 | + */ |
| 230 | + match_page_id_expr = (regmatch_t *)malloc(sizeof(regmatch_t)*2); |
| 231 | + res = regcomp(&compiled_page_id_expr, page_id_expr, REG_EXTENDED); |
| 232 | + |
| 233 | + if (regexec(&compiled_page_id_expr, buffer, 2, match_page_id_expr, 0 ) == 0) { |
| 234 | + if (match_page_id_expr[1].rm_so >=0) { |
| 235 | + page_id = atol(buffer + match_page_id_expr[1].rm_so); |
| 236 | + } |
| 237 | + } |
| 238 | + return(page_id); |
| 239 | + } |
| 240 | +} |
| 241 | + |
39 | 242 | /* |
40 | 243 | get the first page id after position in file |
41 | 244 | if a pageid is found, the structure pinfo will be updated accordingly |
 | 245 | + use_api nonzero means that we will fall back to asking the api about a page |
| 246 | + that contains a given rev_id, in case we wind up with a huge page which |
| 247 | + has piles of revisions and we aren't seeing a page tag in a reasonable |
| 248 | + period of time. |
42 | 249 | returns: |
43 | 250 | 1 if a pageid found, |
44 | 251 | 0 if no pageid found, |
45 | 252 | -1 on error |
46 | 253 | */ |
47 | | -int get_first_page_id_after_offset(int fin, off_t position, page_info_t *pinfo) { |
| 254 | +int get_first_page_id_after_offset(int fin, off_t position, page_info_t *pinfo, int use_api, int use_stub, char *stubfilename) { |
48 | 255 | int res; |
49 | | - regmatch_t *match_page, *match_page_id; |
50 | | - regex_t compiled_page, compiled_page_id; |
| 256 | + regmatch_t *match_page, *match_page_id, *match_rev, *match_rev_id; |
| 257 | + regex_t compiled_page, compiled_page_id, compiled_rev, compiled_rev_id; |
51 | 258 | int length=5000; /* output buffer size */ |
52 | 259 | char *page = "<page>"; |
53 | 260 | char *page_id = "<page>\n[ ]+<title>[^<]+</title>\n[ ]+<id>([0-9]+)</id>\n"; |
| 261 | + char *rev = "<revision>"; |
| 262 | + char *rev_id_expr = "<revision>\n[ ]+<id>([0-9]+)</id>\n"; |
54 | 263 | |
55 | 264 | buf_info_t *b; |
56 | 265 | bz_info_t bfile; |
| 266 | + long int rev_id=0; |
| 267 | + long int page_id_found=0; |
57 | 268 | |
| 269 | + int buffer_count = 0; |
| 270 | + |
58 | 271 | bfile.initialized = 0; |
59 | 272 | |
60 | 273 | res = regcomp(&compiled_page, page, REG_EXTENDED); |
61 | 274 | res = regcomp(&compiled_page_id, page_id, REG_EXTENDED); |
| 275 | + res = regcomp(&compiled_rev, rev, REG_EXTENDED); |
| 276 | + res = regcomp(&compiled_rev_id, rev_id_expr, REG_EXTENDED); |
62 | 277 | |
63 | 278 | match_page = (regmatch_t *)malloc(sizeof(regmatch_t)*1); |
64 | 279 | match_page_id = (regmatch_t *)malloc(sizeof(regmatch_t)*2); |
| 280 | + match_rev = (regmatch_t *)malloc(sizeof(regmatch_t)*1); |
| 281 | + match_rev_id = (regmatch_t *)malloc(sizeof(regmatch_t)*2); |
65 | 282 | |
66 | 283 | b = init_buffer(length); |
67 | 284 | |
— | — | @@ -76,7 +293,8 @@ |
77 | 294 | } |
78 | 295 | |
79 | 296 | while (!get_buffer_of_uncompressed_data(b, fin, &bfile, FORWARD) && (! bfile.eof)) { |
80 | | - if (bfile.bytes_read) { |
| 297 | + buffer_count++; |
| 298 | + if (bfile.bytes_written) { |
81 | 299 | while (regexec(&compiled_page_id, (char *)b->next_to_read, 2, match_page_id, 0 ) == 0) { |
82 | 300 | if (match_page_id[1].rm_so >=0) { |
83 | 301 | /* write page_id to stderr */ |
— | — | @@ -101,6 +319,39 @@ |
102 | 320 | exit(-1); |
103 | 321 | } |
104 | 322 | } |
| 323 | + |
| 324 | + if (use_api || use_stub) { |
| 325 | + if (!rev_id) { |
| 326 | + if (regexec(&compiled_rev_id, (char *)b->next_to_read, 2, match_rev_id, 0 ) == 0) { |
| 327 | + if (match_rev_id[1].rm_so >=0) { |
 | 328 | + rev_id = atol((char *)(b->next_to_read+match_rev_id[1].rm_so)); |
| 329 | + } |
| 330 | + } |
| 331 | + } |
| 332 | + |
 | 333 | + /* this needs to be called if we don't find a page within some number of tries or buffers read, |
 | 334 | + and instead need to retrieve a page id from a revision id in the text. |
 | 335 | + where does the figure below come from? assume we get at least a 2:1 compression ratio, |
 | 336 | + and text revs are at most 10mb plus a little; then if we read this many buffers we should have |
 | 337 | + at least one rev id in there. 20 million / BUFINSIZE (5000 or whatever it is) gives 4000 buffers, |
 | 338 | + which hopefully doesn't take forever to read. |
 | 339 | + */ |
| 340 | + /* if (buffer_count>(20000000/BUFINSIZE) && rev_id) { */ |
| 341 | + if (buffer_count>3 && rev_id) { |
| 342 | + if (use_api) { |
| 343 | + page_id_found = get_page_id_from_rev_id_via_api(rev_id, fin); |
| 344 | + } |
| 345 | + else { /* use_stub */ |
| 346 | + page_id_found = get_page_id_from_rev_id_via_stub(rev_id, stubfilename); |
| 347 | + } |
| 348 | + pinfo->page_id = page_id_found +1; /* want the page after this offset, not the one we're in */ |
| 349 | + pinfo->position = bfile.block_start; |
| 350 | + pinfo->bits_shifted = bfile.bits_shifted; |
| 351 | + return(1); |
| 352 | + } |
| 353 | + } |
| 354 | + /* FIXME this is probably wrong */ |
| 355 | + |
105 | 356 | if (regexec(&compiled_page, (char *)b->next_to_read, 1, match_page, 0 ) == 0) { |
106 | 357 | /* write everything up to but not including the page tag to stdout */ |
107 | 358 | /* |
— | — | @@ -110,14 +361,23 @@ |
111 | 362 | bfile.strm.next_out = (char *)b->next_to_fill; |
112 | 363 | bfile.strm.avail_out = b->end - b->next_to_fill; |
113 | 364 | } |
| 365 | + else if ((use_api || use_stub) && (regexec(&compiled_rev, (char *)b->next_to_read, 1, match_rev, 0 ) == 0)) { |
| 366 | + /* write everything up to but not including the rev tag to stdout */ |
| 367 | + /* |
| 368 | + fwrite(b->next_to_read,match_page[0].rm_eo - 6,1,stdout); |
| 369 | + */ |
| 370 | + move_bytes_to_buffer_start(b, b->next_to_read + match_rev[0].rm_so, b->bytes_avail - match_rev[0].rm_so); |
| 371 | + bfile.strm.next_out = (char *)b->next_to_fill; |
| 372 | + bfile.strm.avail_out = b->end - b->next_to_fill; |
| 373 | + } |
114 | 374 | else { |
115 | | - /* could have the first part of the page tag... so copy up enough bytes to cover that case */ |
116 | | - if (b->bytes_avail> 5) { |
117 | | - /* write everything that didn't match, but leave 5 bytes, to stdout */ |
| 375 | + /* could have the first part of the page or the rev tag... so copy up enough bytes to cover that case */ |
| 376 | + if (b->bytes_avail> 10) { |
| 377 | + /* write everything that didn't match, but leave 10 bytes, to stdout */ |
118 | 378 | /* |
119 | | - fwrite(b->next_to_read,b->bytes_avail - 5,1,stdout); |
| 379 | + fwrite(b->next_to_read,b->bytes_avail - 10,1,stdout); |
120 | 380 | */ |
121 | | - move_bytes_to_buffer_start(b, b->next_to_read + b->bytes_avail - 5, 5); |
| 381 | + move_bytes_to_buffer_start(b, b->next_to_read + b->bytes_avail - 10, 10); |
122 | 382 | bfile.strm.next_out = (char *)b->next_to_fill; |
123 | 383 | bfile.strm.avail_out = b->end - b->next_to_fill; |
124 | 384 | } |
— | — | @@ -128,7 +388,7 @@ |
129 | 389 | b->next_to_fill = b->buffer; /* empty */ |
130 | 390 | } |
131 | 391 | else { |
132 | | - /* there were only 5 or less bytes so just save em don't write em to stdout */ |
| 392 | + /* there were only 10 or less bytes so just save em don't write em to stdout */ |
133 | 393 | move_bytes_to_buffer_start(b, b->next_to_read, b->bytes_avail); |
134 | 394 | bfile.strm.next_out = (char *)b->next_to_fill; |
135 | 395 | bfile.strm.avail_out = b->end - b->next_to_fill; |
— | — | @@ -161,7 +421,7 @@ |
162 | 422 | |
163 | 423 | return value from guess, or -1 on error. |
164 | 424 | */ |
165 | | -int do_iteration(iter_info_t *iinfo, int fin, page_info_t *pinfo) { |
| 425 | +int do_iteration(iter_info_t *iinfo, int fin, page_info_t *pinfo, int use_api, int use_stub, char *stubfilename) { |
166 | 426 | int res; |
167 | 427 | off_t new_position; |
168 | 428 | off_t interval; |
— | — | @@ -194,7 +454,7 @@ |
195 | 455 | new_position = iinfo->last_position - interval; |
196 | 456 | } |
197 | 457 | } |
198 | | - res = get_first_page_id_after_offset(fin, new_position, pinfo); |
| 458 | + res = get_first_page_id_after_offset(fin, new_position, pinfo, use_api, use_stub, stubfilename); |
199 | 459 | if (res >0) { |
200 | 460 | /* caller wants the new value */ |
201 | 461 | iinfo->last_value = pinfo->page_id; |
— | — | @@ -217,6 +477,14 @@ |
218 | 478 | } |
219 | 479 | } |
220 | 480 | |
| 481 | +void usage(char *whoami, char *message) { |
| 482 | + if (message) { |
 | 483 | + fprintf(stderr,"%s",message); |
| 484 | + } |
| 485 | + fprintf(stderr,"usage: %s --filename file --pageid id [--useapi]\n", whoami); |
| 486 | + exit(1); |
| 487 | +} |
| 488 | + |
221 | 489 | /* |
222 | 490 | given a bzipped and possibly truncated file, and a page id, |
223 | 491 | hunt for the page id in the file; this assume that the |
— | — | @@ -226,35 +494,71 @@ |
227 | 495 | writes the offset of the relevant block (from beginning of file) |
228 | 496 | and the first pageid found in that block, to stdout |
229 | 497 | |
| 498 | + it may use the api to find page ids from rev ids if use_api is specified |
| 499 | + it may use a stub file to find page ids from rev ids if stubfile is specified |
 | 500 | + it will only do these if it has been reading for a while without |
 | 501 | + finding a page tag (some pages have > 500K revisions and a heck of |
| 502 | + a lot of text) |
 | 503 | + if both use_api and stubfile are specified, we will use the api; it's faster |
| 504 | + |
230 | 505 | format of output: |
231 | 506 | position:xxxxx pageid:nnn |
232 | 507 | |
233 | 508 | returns: 0 on success, -1 on error |
234 | 509 | */ |
235 | 510 | int main(int argc, char **argv) { |
236 | | - int fin, res, page_id; |
| 511 | + int fin, res, page_id=0; |
237 | 512 | off_t position, interval, file_size; |
238 | 513 | page_info_t pinfo; |
239 | 514 | iter_info_t iinfo; |
| 515 | + char *filename = NULL; |
| 516 | + int optindex=0; |
| 517 | + int use_api = 0; |
| 518 | + int use_stub = 0; |
| 519 | + int optc; |
| 520 | + char *stubfile=NULL; |
240 | 521 | |
241 | | - if (argc != 3) { |
242 | | - fprintf(stderr,"usage: %s infile id\n", argv[0]); |
243 | | - exit(-1); |
| 522 | + struct option optvalues[] = { |
| 523 | + {"filename", 1, 0, 'f'}, |
| 524 | + {"pageid", 1, 0, 'p'}, |
| 525 | + {"useapi", 0, 0, 'a'}, |
| 526 | + {"stubfile", 1, 0, 's'}, |
| 527 | + {NULL, 0, NULL, 0} |
| 528 | + }; |
| 529 | + |
| 530 | + while (1) { |
 | 531 | + optc=getopt_long_only(argc,argv,"f:p:as:", optvalues, &optindex); |
| 532 | + if (optc=='f') { |
| 533 | + filename=optarg; |
| 534 | + } |
| 535 | + else if (optc=='p') { |
| 536 | + if (!(isdigit(optarg[0]))) usage(argv[0],NULL); |
| 537 | + page_id=atoi(optarg); |
| 538 | + } |
| 539 | + else if (optc=='a') |
| 540 | + use_api=1; |
| 541 | + else if (optc=='s') { |
| 542 | + use_stub=1; |
| 543 | + stubfile = optarg; |
| 544 | + } |
| 545 | + else if (optc==-1) break; |
| 546 | + else usage(argv[0],"unknown option or other error\n"); |
244 | 547 | } |
245 | 548 | |
246 | | - fin = open (argv[1], O_RDONLY); |
247 | | - if (fin < 0) { |
248 | | - fprintf(stderr,"failed to open file %s for read\n", argv[1]); |
249 | | - exit(-1); |
| 549 | + if (! filename || ! page_id) { |
| 550 | + usage(argv[0],NULL); |
250 | 551 | } |
251 | 552 | |
252 | | - page_id = atoi(argv[2]); |
253 | 553 | if (page_id <1) { |
254 | | - fprintf(stderr,"please specify a page_id >= 1.\n"); |
255 | | - fprintf(stderr,"usage: %s infile page_id\n", argv[0]); |
256 | | - exit(-1); |
| 554 | + usage(argv[0], "please specify a page_id >= 1.\n"); |
257 | 555 | } |
258 | 556 | |
| 557 | + fin = open (filename, O_RDONLY); |
| 558 | + if (fin < 0) { |
| 559 | + fprintf(stderr,"failed to open file %s for read\n", argv[1]); |
| 560 | + exit(1); |
| 561 | + } |
| 562 | + |
259 | 563 | file_size = get_file_size(fin); |
260 | 564 | |
261 | 565 | interval = file_size; |
— | — | @@ -264,11 +568,10 @@ |
265 | 569 | pinfo.page_id = -1; |
266 | 570 | |
267 | 571 | iinfo.left_end = (off_t)0; |
268 | | - file_size = get_file_size(fin); |
269 | 572 | iinfo.right_end = file_size; |
270 | 573 | iinfo.value_wanted = page_id; |
271 | 574 | |
272 | | - res = get_first_page_id_after_offset(fin, (off_t)0, &pinfo); |
| 575 | + res = get_first_page_id_after_offset(fin, (off_t)0, &pinfo, use_api, use_stub, stubfile); |
273 | 576 | if (res > 0) { |
274 | 577 | iinfo.last_value = pinfo.page_id; |
275 | 578 | iinfo.last_position = (off_t)0; |
— | — | @@ -283,7 +586,7 @@ |
284 | 587 | } |
285 | 588 | |
286 | 589 | while (1) { |
287 | | - res = do_iteration(&iinfo, fin, &pinfo); |
| 590 | + res = do_iteration(&iinfo, fin, &pinfo, use_api, use_stub, stubfile); |
288 | 591 | /* things to check: bad return? interval is 0 bytes long? */ |
289 | 592 | if (iinfo.left_end == iinfo.right_end) { |
290 | 593 | fprintf(stdout,"position:%"PRId64" page_id:%d\n",pinfo.position, pinfo.page_id); |
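
A quick usage sketch for the reworked command line (option names per the getopt table and usage() above; the filenames here are made up): `findpageidinbz2xml --filename elwiki-pages-meta-history.xml.bz2 --pageid 1000 --useapi`, or pass `--stubfile elwiki-stub-meta-history.xml.gz` instead of `--useapi` to resolve rev ids against a local stub dump. On success the tool still prints a single line of the form `position:<byte offset> page_id:<id>` to stdout.
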
Index: branches/ariel/xmldumps-backup/mwbzutils/httptiny.c |
— | — | @@ -0,0 +1,193 @@ |
| 2 | +#include <sys/socket.h> |
| 3 | +#include <stdio.h> |
| 4 | +#include <stdlib.h> |
| 5 | +#include <netinet/in.h> |
| 6 | +#include <sys/time.h> |
| 7 | +#include <errno.h> |
| 8 | +#include <netdb.h> |
| 9 | +#include <netinet/in.h> |
| 10 | +#include <arpa/inet.h> |
| 11 | +#include <sys/ioctl.h> |
| 12 | +#include <string.h> |
| 13 | +#include <unistd.h> |
| 14 | + |
| 15 | +int usage(char *whoami); |
| 16 | +int doconnect(int *sd,struct timeval *timeout,struct sockaddr_in *sa_us); |
| 17 | +int dowrite(int sd,char *message,int length); |
| 18 | +int doread(int sd, char *buf, int length, struct timeval *timeout); |
| 19 | + |
| 20 | +extern char *optarg; |
| 21 | +extern int optind; |
| 22 | + |
| 23 | +char *whoami; |
| 24 | + |
| 25 | +#define agentinfo "geturl-tiny/0.3 (Linux x86_64)" |
| 26 | + |
 | 27 | +/* expects to get text back; will only serve up the first BUFSIZ bytes (typically 8192), that's |
| 28 | + plenty for what we want, which is tiny api call results */ |
| 29 | +char * geturl(char *hostname, int port, char *url) { |
| 30 | + int sd; |
| 31 | + struct sockaddr_in sa_us; |
| 32 | + struct timeval timeout; |
| 33 | + int result; |
| 34 | + struct hostent *hostinfo=NULL; |
| 35 | + char *message=NULL; |
| 36 | + static char buf[BUFSIZ]; |
| 37 | + |
| 38 | + if ((hostinfo=gethostbyname(hostname)) == NULL ) { |
| 39 | + fprintf(stderr,"%s: host lookup failed\n",whoami); |
| 40 | + return(NULL); |
| 41 | + } |
| 42 | + |
| 43 | + /* set up socket and connect */ |
| 44 | + sa_us.sin_family=AF_INET; |
| 45 | + memcpy(&sa_us.sin_addr,hostinfo->h_addr_list[0],hostinfo->h_length); |
| 46 | + sa_us.sin_port=htons(port); |
| 47 | + timeout.tv_sec=30; |
| 48 | + timeout.tv_usec=0; |
| 49 | + doconnect(&sd,&timeout,&sa_us); |
| 50 | + |
| 51 | + /* set up message and send it */ |
| 52 | + if ((message=malloc(strlen(url)+25)) == NULL) { |
| 53 | + fprintf(stderr,"%s: out of memory\n",whoami); |
| 54 | + return(NULL); |
| 55 | + } |
| 56 | + sprintf(message,"GET %s HTTP/1.0\n",url); |
| 57 | + dowrite(sd,message,strlen(message)); |
| 58 | + free(message); |
| 59 | + sprintf(buf,"Host: %s\n",hostname); |
| 60 | + dowrite(sd,buf,strlen(buf)); |
| 61 | + sprintf(buf,"User-Agent: %s\n\n",agentinfo); |
| 62 | + dowrite(sd,buf,strlen(buf)); |
| 63 | + /* read reply */ |
| 64 | + errno=0; |
| 65 | + buf[0]='\0'; |
 | 66 | + result=doread(sd,buf,sizeof(buf)-1,&timeout); /* leave room for doread's trailing '\0' */ |
| 67 | + if (result == -1) { |
| 68 | + fprintf(stderr,"%s: read error\n",whoami); |
| 69 | + close(sd); |
| 70 | + return(NULL); |
| 71 | + } |
| 72 | + close(sd); |
| 73 | + return(buf); |
| 74 | +} |
| 75 | + |
| 76 | +/* fixme need to check content length and only retrieve that amount */ |
| 77 | +int doread(int sd, char *buf, int length, struct timeval *timeout) |
| 78 | +{ |
| 79 | + int result; |
| 80 | + fd_set fds; |
| 81 | + int count = 0; |
| 82 | + |
| 83 | + FD_ZERO(&fds); |
| 84 | + FD_SET(sd,&fds); |
| 85 | + |
| 86 | + result = -1; |
| 87 | + while (count < length) { |
| 88 | + result = select(FD_SETSIZE,&fds,NULL,NULL,timeout); |
| 89 | + if (result <= 0) { |
| 90 | + perror("read error of some sort (0)"); |
| 91 | + |
| 92 | + } |
| 93 | + else { |
| 94 | + result=recv(sd,buf+count,length-count,0); |
| 95 | + if (result == -1) { |
| 96 | + perror("read error of some sort (1)"); |
| 97 | + if (errno==EWOULDBLOCK) { |
| 98 | + FD_ZERO(&fds); |
| 99 | + FD_SET(sd,&fds); |
| 100 | + if (select(FD_SETSIZE,&fds,NULL,NULL,timeout) != 1) { |
| 101 | + fprintf(stderr,"%s: timeout %d secs trying to read\n", |
| 102 | + whoami,(int)timeout->tv_sec); |
| 103 | + if (select(FD_SETSIZE,&fds,NULL,NULL,timeout) != 1) { |
| 104 | + fprintf(stderr,"%s: -2- timeout %d secs trying to read\n", |
| 105 | + whoami,(int)timeout->tv_sec); |
| 106 | + } |
| 107 | + return(-1); |
| 108 | + } |
| 109 | + else result=recv(sd,buf+count,length-count,0); |
| 110 | + } |
| 111 | + else { |
| 112 | + fprintf(stderr,"%s: can't read from socket\n",whoami); |
| 113 | + perror(whoami); |
| 114 | + return(-1); |
| 115 | + } |
| 116 | + } |
| 117 | + else if (result == 0) { |
| 118 | + break; |
| 119 | + } |
| 120 | + else { |
| 121 | + count += result; |
| 122 | + buf[count] = '\0'; |
| 123 | + } |
| 124 | + } |
| 125 | + } |
| 126 | + return(result); |
| 127 | +} |
| 128 | + |
| 129 | +int dowrite(int sd,char *message,int length) |
| 130 | +{ |
| 131 | + int result; |
| 132 | + |
| 133 | + while (1) { |
| 134 | + |
| 135 | + result=send(sd,message,(unsigned int) length,0); |
| 136 | + if (result == -1) { |
| 137 | + perror("some error, let's see it"); |
| 138 | + if (errno!=EAGAIN) { |
| 139 | + fprintf(stderr,"%s: write to server failed\n",whoami); |
| 140 | + perror(whoami); |
| 141 | + exit(1); |
| 142 | + } |
| 143 | + } |
| 144 | + else break; |
| 145 | + } |
| 146 | + return(result); |
| 147 | +} |
| 148 | + |
| 149 | +int doconnect(int *sd,struct timeval *timeout,struct sockaddr_in *sa_us) |
| 150 | +{ |
| 151 | + int val; |
| 152 | + fd_set fds; |
| 153 | + |
| 154 | + if ((*sd = socket(AF_INET,SOCK_STREAM,0)) == -1) { |
| 155 | + fprintf(stderr, "%s: could not get socket\n",whoami); |
| 156 | + perror(whoami); |
| 157 | + exit(1); |
| 158 | + } |
| 159 | + /* |
| 160 | + val=1; |
| 161 | + if (ioctl(*sd, FIONBIO, &val) == -1) { |
| 162 | + fprintf(stderr,"%s: could not make connection \ |
| 163 | + non-blocking\n",whoami); |
| 164 | + perror(whoami); |
| 165 | + exit(1); |
| 166 | + } |
| 167 | + */ |
| 168 | + if (connect(*sd,(struct sockaddr *) sa_us,sizeof(*sa_us)) == -1) { |
| 169 | + if (errno != EINPROGRESS) { |
| 170 | + fprintf(stderr,"%s: could not connect\n", whoami); |
| 171 | + perror(whoami); |
| 172 | + exit(1); |
| 173 | + } |
| 174 | + else { |
| 175 | + FD_ZERO(&fds); |
| 176 | + FD_SET(*sd,&fds); |
| 177 | + if (select(FD_SETSIZE,NULL,&fds,NULL,timeout) != 1) { |
| 178 | + fprintf(stderr,"%s: timeout %d secs trying to connect\n", |
| 179 | + whoami,(int)timeout->tv_sec); |
| 180 | + exit(1); |
| 181 | + } |
| 182 | + else if ((connect(*sd,(struct sockaddr *) sa_us,sizeof(*sa_us))== -1) |
| 183 | + && ( errno != EISCONN)) { |
| 184 | + /* shouldn't in theory but.. */ |
| 185 | + fprintf(stderr, "%s: connect failed\n",whoami); |
| 186 | + perror(whoami); |
| 187 | + exit(1); |
| 188 | + } |
| 189 | + } |
| 190 | + } |
| 191 | + errno=0; |
| 192 | + return(0); |
| 193 | +} |
| 194 | + |
Property changes on: branches/ariel/xmldumps-backup/mwbzutils/httptiny.c |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 195 | + native |
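
For orientation, here is a minimal sketch of how the new geturl() helper is meant to be driven (the hostname and revision id below are made up; in findpageidinbz2xml.c the hostname really comes from the dump's `<base>` header and the rev id from the compressed stream). geturl() hands back the raw HTTP response, status line and headers included, capped at BUFSIZ bytes, which is why the real caller simply regexes the pageid out of the whole buffer:

```c
#include <stdio.h>

/* provided by httptiny.c */
extern char *geturl(char *hostname, int port, char *url);

int main(void) {
    char url[80];
    long int rev_id = 12345;   /* hypothetical revision id */

    /* same api call string that get_page_id_from_rev_id_via_api() builds */
    snprintf(url, sizeof(url), "/w/api.php?action=query&format=xml&revids=%ld", rev_id);

    char *reply = geturl("el.wiktionary.org", 80, url);
    if (reply == NULL) {
        fprintf(stderr, "api request failed\n");
        return 1;
    }
    fputs(reply, stdout);   /* raw response; the interesting bit is <page pageid="..."> */
    return 0;
}
```

This sketch only needs to be linked against httptiny.o; the real caller in findpageidinbz2xml.c additionally pulls in the bz2 and zlib pieces, per the Makefile change below.
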
Index: branches/ariel/xmldumps-backup/mwbzutils/mwbzutils.h |
— | — | @@ -110,6 +110,8 @@ |
111 | 111 | |
112 | 112 | buf_info_t *init_buffer(int size); |
113 | 113 | |
| 114 | +void free_buffer(buf_info_t *b); |
| 115 | + |
114 | 116 | int buffer_is_empty(buf_info_t *b); |
115 | 117 | |
116 | 118 | int buffer_is_full(buf_info_t *b); |
Index: branches/ariel/xmldumps-backup/mwbzutils/mwbzlib.c |
— | — | @@ -141,6 +141,7 @@ |
142 | 142 | } |
143 | 143 | /* must be after 4 byte file header, and we add a leftmost byte to the buffer |
144 | 144 | of data read in case some bits have been shifted into it */ |
| 145 | + /* fprintf(stderr,"position is %"PRId64" and file size is %"PRId64"\n",bfile->position, bfile->file_size); */ |
145 | 146 | while (bfile->position <= bfile->file_size - 6 && bfile->position >= 0 && bfile->bits_shifted < 0) { |
146 | 147 | bfile->bits_shifted = check_buffer_for_bz2_block_marker(bfile); |
147 | 148 | if (bfile->bits_shifted < 0) { |
— | — | @@ -387,6 +388,16 @@ |
388 | 389 | return(b); |
389 | 390 | } |
390 | 391 | |
| 392 | +/* free pieces of buf_info_t */ |
| 393 | +void free_buffer(buf_info_t *b) { |
| 394 | + if (b) { |
| 395 | + if (b->buffer) { |
| 396 | + free(b->buffer); |
| 397 | + } |
| 398 | + } |
| 399 | + return; |
| 400 | +} |
| 401 | + |
391 | 402 | /* check if buffer (used for decompressed data output) is empty, |
392 | 403 | returns 1 if so and 0 if not */ |
393 | 404 | int buffer_is_empty(buf_info_t *b) { |
— | — | @@ -476,7 +487,7 @@ |
477 | 488 | if (buffer_is_full(b)) { |
478 | 489 | return(0); |
479 | 490 | } |
480 | | - |
| 491 | + |
481 | 492 | if (buffer_is_empty(b)) { |
482 | 493 | b->next_to_fill = b->buffer; |
483 | 494 | } |
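
One more small sketch, of the allocate/release pairing that the new free_buffer() completes (the 5000-byte size just mirrors the `length` used in findpageidinbz2xml.c, and the function name is hypothetical). Note that as written free_buffer() releases only the internal data buffer, not the buf_info_t struct itself:

```c
#include "mwbzutils.h"

/* hypothetical caller showing the intended init_buffer()/free_buffer() pairing */
static void buffer_roundtrip(void) {
    buf_info_t *b = init_buffer(5000);
    if (b == NULL)
        return;
    /* ... hand b to get_buffer_of_uncompressed_data() as the existing callers do ... */
    free_buffer(b);   /* frees b->buffer only, per the implementation above */
}
```
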
Index: branches/ariel/xmldumps-backup/mwbzutils/Makefile |
— | — | @@ -34,8 +34,8 @@ |
35 | 35 | dumplastbz2block: $(OBJSBZ) mwbzlib.o dumplastbz2block.o |
36 | 36 | $(CC) $(CFLAGS) $(LDFLAGS) -o dumplastbz2block dumplastbz2block.o mwbzlib.o $(OBJSBZ) -lbz2 |
37 | 37 | |
38 | | -findpageidinbz2xml: $(OBJSBZ) mwbzlib.o findpageidinbz2xml.o |
39 | | - $(CC) $(CFLAGS) $(LDFLAGS) -o findpageidinbz2xml findpageidinbz2xml.o mwbzlib.o $(OBJSBZ) -lbz2 |
| 38 | +findpageidinbz2xml: $(OBJSBZ) mwbzlib.o httptiny.o findpageidinbz2xml.o |
| 39 | + $(CC) $(CFLAGS) $(LDFLAGS) -o findpageidinbz2xml findpageidinbz2xml.o httptiny.o mwbzlib.o $(OBJSBZ) -lbz2 -lz |
40 | 40 | |
41 | 41 | checkforbz2footer: $(OBJSBZ) mwbzlib.o checkforbz2footer.o |
42 | 42 | $(CC) $(CFLAGS) $(LDFLAGS) -o checkforbz2footer checkforbz2footer.o mwbzlib.o $(OBJSBZ) -lbz2 |
— | — | @@ -62,6 +62,8 @@ |
63 | 63 | $(CC) $(CFLAGS) -c bzlibfuncs.c |
64 | 64 | mwbzlib.o: mwbzlib.c bzlib.h bzlib_private.h mwbzutils.h |
65 | 65 | $(CC) $(CFLAGS) -c mwbzlib.c |
| 66 | +httptiny.o: httptiny.c |
| 67 | + $(CC) $(CFLAGS) -c httptiny.c |
66 | 68 | dumplastbz2block.o: dumplastbz2block.c |
67 | 69 | $(CC) $(CFLAGS) -c dumplastbz2block.c |
68 | 70 | findpageidinbz2xml.o: findpageidinbz2xml.c |
— | — | @@ -73,7 +75,7 @@ |
74 | 76 | |
75 | 77 | distclean: clean |
76 | 78 | |
77 | | -DISTNAME=mwbzutils-0.0.1 |
| 79 | +DISTNAME=mwbzutils-0.0.2 |
78 | 80 | dist: |
79 | 81 | rm -f $(DISTNAME) |
80 | 82 | ln -s -f . $(DISTNAME) |
— | — | @@ -82,6 +84,7 @@ |
83 | 85 | $(DISTNAME)/findpageidinbz2xml.c \ |
84 | 86 | $(DISTNAME)/checkforbz2footer.c \ |
85 | 87 | $(DISTNAME)/dumpbz2filefromoffset.c \ |
| 88 | + $(DISTNAME)/httptiny.c \ |
86 | 89 | $(DISTNAME)/mwbzlib.c \ |
87 | 90 | $(DISTNAME)/mwbzutils.h \ |
88 | 91 | $(DISTNAME)/bzlibfuncs.c \ |