r92144 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r92143 | r92144 | r92145 >
Date:08:35, 14 July 2011
Author:ariel
Status:deferred
Tags:
Comment:
version bump; for finding pageid in xml file, workaround for pages with giant cumulative rev text (*cough en pedia pageid 3976790), uses api (relatively fast) with fallback to stub file (much slower but not nearly as slow as a straight decompress and read)
Modified paths:
  • /branches/ariel/xmldumps-backup/mwbzutils/Makefile (modified) (history)
  • /branches/ariel/xmldumps-backup/mwbzutils/findpageidinbz2xml.c (modified) (history)
  • /branches/ariel/xmldumps-backup/mwbzutils/httptiny.c (added) (history)
  • /branches/ariel/xmldumps-backup/mwbzutils/mwbzlib.c (modified) (history)
  • /branches/ariel/xmldumps-backup/mwbzutils/mwbzutils.h (modified) (history)

Diff [purge]

Index: branches/ariel/xmldumps-backup/mwbzutils/findpageidinbz2xml.c
@@ -1,4 +1,5 @@
22 #include <unistd.h>
 3+#include <getopt.h>
34 #include <stdio.h>
45 #include <string.h>
56 #include <sys/types.h>
@@ -9,9 +10,9 @@
1011 #include <sys/types.h>
1112 #include <regex.h>
1213 #include <inttypes.h>
 14+#include <zlib.h>
1315 #include "mwbzutils.h"
1416
15 -
1617 /*
1718 find the first bz2 block marker in the file,
1819 from its current position,
@@ -23,6 +24,13 @@
2425 int init_and_read_first_buffer_bz2_file(bz_info_t *bfile, int fin) {
2526 int res;
2627
 28+ bfile->bufin_size = BUFINSIZE;
 29+ bfile->marker = init_marker();
 30+ bfile->bytes_read = 0;
 31+ bfile->bytes_written = 0;
 32+ bfile->eof = 0;
 33+ bfile->file_size = get_file_size(fin);
 34+
2735 bfile->initialized++;
2836
2937 res = find_next_bz2_block_marker(fin, bfile, FORWARD);
@@ -32,35 +40,244 @@
3341 setup_first_buffer_to_decompress(fin, bfile);
3442 return(0);
3543 }
 44+ else {
 45+ fprintf(stderr,"failed to find the next frigging block marker\n");
 46+ return(-1);
 47+ }
 48+}
 49+
 50+extern char * geturl(char *hostname, int port, char *url);
 51+
 52+char *get_hostname_from_xml_header(int fin) {
 53+ int res;
 54+ regmatch_t *match_base_expr;
 55+ regex_t compiled_base_expr;
 56+ /* <base>http://el.wiktionary.org/wiki/...</base> */
 57+ /* <base>http://trouble.localdomain/wiki/ */
 58+ char *base_expr = "<base>http://([^/]+)/";
 59+ int length=5000; /* output buffer size */
 60+
 61+ buf_info_t *b;
 62+ bz_info_t bfile;
 63+
 64+ int hostname_length = 0;
 65+
 66+ off_t old_position, seek_result;
 67+ static char hostname[256];
 68+
 69+ bfile.initialized = 0;
 70+
 71+ res = regcomp(&compiled_base_expr, base_expr, REG_EXTENDED);
 72+ match_base_expr = (regmatch_t *)malloc(sizeof(regmatch_t)*2);
 73+
 74+ b = init_buffer(length);
 75+ bfile.bytes_read = 0;
 76+
 77+ bfile.position = (off_t)0;
 78+ old_position = lseek(fin,(off_t)0,SEEK_CUR);
 79+ seek_result = lseek(fin,(off_t)0,SEEK_SET);
 80+
 81+ while ((get_buffer_of_uncompressed_data(b, fin, &bfile, FORWARD)>=0) && (! bfile.eof)) {
 82+ /* so someday the header might grow enough that <base> isn't in the first 1000 characters but we'll ignore that for now */
 83+ if (bfile.bytes_read && b->bytes_avail > 1000) {
 84+ /* get project name and language name from the file header
 85+ format:
 86+ <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/ http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5" xml:lang="el">
 87+ <siteinfo>
 88+ <sitename>Βικιλεξικό</sitename>
 89+ <base>http://el.wiktionary.org/wiki/...</base>
 90+ */
 91+ if (regexec(&compiled_base_expr, (char *)b->next_to_read, 2, match_base_expr, 0 ) == 0) {
 92+ if (match_base_expr[1].rm_so >=0) {
 93+ hostname_length = match_base_expr[1].rm_eo - match_base_expr[1].rm_so;
 94+ if (hostname_length > sizeof(hostname)) {
 95+ fprintf(stderr,"very long hostname, giving up\n");
 96+ break;
 97+ }
 98+ else {
 99+ memcpy(hostname,(char *)b->next_to_read + match_base_expr[1].rm_so, hostname_length);
 100+ hostname[hostname_length] = '\0';
 101+ b->next_to_read = b->end;
 102+ b->bytes_avail = 0;
 103+ b->next_to_fill = b->buffer; /* empty */
 104+ bfile.strm.next_out = (char *)b->next_to_fill;
 105+ bfile.strm.avail_out = b->end - b->next_to_fill;
 106+ res = BZ2_bzDecompressEnd ( &(bfile.strm) );
 107+ seek_result = lseek(fin,old_position,SEEK_SET);
 108+ free_buffer(b);
 109+ return(hostname);
 110+ }
 111+ }
 112+ }
 113+ else {
 114+ break;
 115+ }
 116+ }
 117+ }
 118+ res = BZ2_bzDecompressEnd ( &(bfile.strm) );
 119+ seek_result = lseek(fin,old_position,SEEK_SET);
 120+ free_buffer(b);
 121+ return(NULL);
 122+}
 123+
 124+int has_xml_tag(char *line, char *tag) {
 125+ return(! strncmp(line,tag,strlen(tag)));
 126+}
 127+
 128+ /* assumes the open tag, close tag and value are all on the same line */
 129+long int get_xml_elt_value(char *line, char *tag) {
 130+ return(atol(line+strlen(tag)));
 131+}
 132+
 133+/* returns pageid, or -1 on error. this requires the name of a stub file
 134+ which contains all page ids and revisions ids in our standard xml format.
 135+ It scans through the entire file looking for the page id which corresponds
 136+ to the revision id. This can take up to 5 minutes for the larger
 137+ stub history files; clearly we don't want to do this unless we
 138+ have no other option.
 139+ we need this in the case where the page text is huge (eg en wp pageid 5137507
 140+ which has a cumulative text length across all revisions of > 163 GB.
 141+ This can take over two hours to uncompress and scan through looking for
 142+ the next page id, so we cheat */
 143+long int get_page_id_from_rev_id_via_stub(long int rev_id, char *stubfile) {
 144+ gzFile *gz;
 145+ int page_id = -1;
 146+ char buf[8192];
 147+ char *bufp;
 148+ enum States{WantPage,WantPageID,WantRevOrPage,WantRevID};
 149+ int state;
 150+ long int temp_rev_id;
 151+
 152+ gz = gzopen(stubfile,"r");
 153+ state = WantPage;
 154+ while ((bufp = gzgets(gz,buf,8191)) != NULL) {
 155+ while (*bufp == ' ') bufp++;
 156+ if (state == WantPage) {
 157+ if (has_xml_tag(bufp,"<page>")) {
 158+ state = WantPageID;
 159+ }
 160+ }
 161+ else if (state == WantPageID) {
 162+ if (has_xml_tag(bufp,"<id>")) {
 163+ page_id = get_xml_elt_value(bufp,"<id>");
 164+ state = WantRevOrPage;
 165+ }
 166+ }
 167+ else if (state == WantRevOrPage) {
 168+ if (has_xml_tag(bufp,"<revision>")) {
 169+ state = WantRevID;
 170+ }
 171+ else if (has_xml_tag(bufp,"<page>")) {
 172+ state = WantPageID;
 173+ }
 174+ }
 175+ else if (state == WantRevID) {
 176+ if (has_xml_tag(bufp,"<id>")) {
 177+ temp_rev_id = get_xml_elt_value(bufp,"<id>");
 178+ if (temp_rev_id == rev_id) {
 179+ return(page_id);
 180+ }
 181+ /* this permits multiple revs in the page */
 182+ state = WantRevOrPage;
 183+ }
 184+ }
 185+ }
36186 return(-1);
37187 }
38188
 189+/* returns pageid, or -1 on error. this requires network access,
 190+ it does an api call to the appropriate server for the appropriate project
 191+ we need this in the case where the page text is huge (eg en wp pageid 5137507
 192+ which has a cumulative text length across all revisions of > 163 GB.
 193+ This can take over two hours to uncompress and scan through looking for
 194+ the next page id, so we cheat */
 195+int get_page_id_from_rev_id_via_api(long int rev_id, int fin) {
 196+ /* char hostname[80]; */
 197+ char *hostname;
 198+ char url[80];
 199+ char *buffer;
 200+ long int page_id = -1;
 201+ char *api_call = "/w/api.php?action=query&format=xml&revids=";
 202+ regmatch_t *match_page_id_expr;
 203+ regex_t compiled_page_id_expr;
 204+ char *page_id_expr = "<pages><page pageid=\"([0-9]+)\"";
 205+ int res;
 206+
 207+ hostname = get_hostname_from_xml_header(fin);
 208+ if (!hostname) {
 209+ return(-1);
 210+ }
 211+
 212+ /*
 213+ if (strlen(lang) + strlen(project) + strlen(".org") > sizeof(hostname)-2) {
 214+ fprintf(stderr,"language code plus project name is huuuge string, giving up\n");
 215+ return(-1);
 216+ }
 217+ sprintf(hostname,"%s.%s.org",lang,project);
 218+ */
 219+ sprintf(url,"%s%ld",api_call,rev_id);
 220+
 221+ buffer = geturl(hostname, 80, url);
 222+ if (buffer == NULL) {
 223+ return(-1);
 224+ }
 225+ else {
 226+ /* dig the page id out of the buffer
 227+ format:
 228+ <?xml version="1.0"?><api><query><pages><page pageid="6215" ns="0" title="hystérique" /></pages></query></api>
 229+ */
 230+ match_page_id_expr = (regmatch_t *)malloc(sizeof(regmatch_t)*2);
 231+ res = regcomp(&compiled_page_id_expr, page_id_expr, REG_EXTENDED);
 232+
 233+ if (regexec(&compiled_page_id_expr, buffer, 2, match_page_id_expr, 0 ) == 0) {
 234+ if (match_page_id_expr[1].rm_so >=0) {
 235+ page_id = atol(buffer + match_page_id_expr[1].rm_so);
 236+ }
 237+ }
 238+ return(page_id);
 239+ }
 240+}
 241+
39242 /*
40243 get the first page id after position in file
41244 if a pageid is found, the structure pinfo will be updated accordingly
 245+ use_api nonzero means that we will fallback to ask the api about a page
 246+ that contains a given rev_id, in case we wind up with a huge page which
 247+ has piles of revisions and we aren't seeing a page tag in a reasonable
 248+ period of time.
42249 returns:
43250 1 if a pageid found,
44251 0 if no pageid found,
45252 -1 on error
46253 */
47 -int get_first_page_id_after_offset(int fin, off_t position, page_info_t *pinfo) {
 254+int get_first_page_id_after_offset(int fin, off_t position, page_info_t *pinfo, int use_api, int use_stub, char *stubfilename) {
48255 int res;
49 - regmatch_t *match_page, *match_page_id;
50 - regex_t compiled_page, compiled_page_id;
 256+ regmatch_t *match_page, *match_page_id, *match_rev, *match_rev_id;
 257+ regex_t compiled_page, compiled_page_id, compiled_rev, compiled_rev_id;
51258 int length=5000; /* output buffer size */
52259 char *page = "<page>";
53260 char *page_id = "<page>\n[ ]+<title>[^<]+</title>\n[ ]+<id>([0-9]+)</id>\n";
 261+ char *rev = "<revision>";
 262+ char *rev_id_expr = "<revision>\n[ ]+<id>([0-9]+)</id>\n";
54263
55264 buf_info_t *b;
56265 bz_info_t bfile;
 266+ long int rev_id=0;
 267+ long int page_id_found=0;
57268
 269+ int buffer_count = 0;
 270+
58271 bfile.initialized = 0;
59272
60273 res = regcomp(&compiled_page, page, REG_EXTENDED);
61274 res = regcomp(&compiled_page_id, page_id, REG_EXTENDED);
 275+ res = regcomp(&compiled_rev, rev, REG_EXTENDED);
 276+ res = regcomp(&compiled_rev_id, rev_id_expr, REG_EXTENDED);
62277
63278 match_page = (regmatch_t *)malloc(sizeof(regmatch_t)*1);
64279 match_page_id = (regmatch_t *)malloc(sizeof(regmatch_t)*2);
 280+ match_rev = (regmatch_t *)malloc(sizeof(regmatch_t)*1);
 281+ match_rev_id = (regmatch_t *)malloc(sizeof(regmatch_t)*2);
65282
66283 b = init_buffer(length);
67284
@@ -76,7 +293,8 @@
77294 }
78295
79296 while (!get_buffer_of_uncompressed_data(b, fin, &bfile, FORWARD) && (! bfile.eof)) {
80 - if (bfile.bytes_read) {
 297+ buffer_count++;
 298+ if (bfile.bytes_written) {
81299 while (regexec(&compiled_page_id, (char *)b->next_to_read, 2, match_page_id, 0 ) == 0) {
82300 if (match_page_id[1].rm_so >=0) {
83301 /* write page_id to stderr */
@@ -101,6 +319,39 @@
102320 exit(-1);
103321 }
104322 }
 323+
 324+ if (use_api || use_stub) {
 325+ if (!rev_id) {
 326+ if (regexec(&compiled_rev_id, (char *)b->next_to_read, 2, match_rev_id, 0 ) == 0) {
 327+ if (match_rev_id[1].rm_so >=0) {
 328+ rev_id = atoi((char *)(b->next_to_read+match_rev_id[1].rm_so));
 329+ }
 330+ }
 331+ }
 332+
 333+ /* this needs to be called if we don't find a page by X tries, or Y buffers read,
 334+ and we need to retrieve a page id from a revision id in the text instead
 335+ where does this obscure figure come from? assume we get at least 2-1 compression ratio,
 336+ text revs are at most 10mb plus a little, then if we read this many buffers we should have
 337+ at least one rev id in there. 20 million / 5000 or whatever it is, is 4000 buffers full of crap
 338+ hopefully that doesn't take forever.
 339+ */
 340+ /* if (buffer_count>(20000000/BUFINSIZE) && rev_id) { */
 341+ if (buffer_count>3 && rev_id) {
 342+ if (use_api) {
 343+ page_id_found = get_page_id_from_rev_id_via_api(rev_id, fin);
 344+ }
 345+ else { /* use_stub */
 346+ page_id_found = get_page_id_from_rev_id_via_stub(rev_id, stubfilename);
 347+ }
 348+ pinfo->page_id = page_id_found +1; /* want the page after this offset, not the one we're in */
 349+ pinfo->position = bfile.block_start;
 350+ pinfo->bits_shifted = bfile.bits_shifted;
 351+ return(1);
 352+ }
 353+ }
 354+ /* FIXME this is probably wrong */
 355+
105356 if (regexec(&compiled_page, (char *)b->next_to_read, 1, match_page, 0 ) == 0) {
106357 /* write everything up to but not including the page tag to stdout */
107358 /*
@@ -110,14 +361,23 @@
111362 bfile.strm.next_out = (char *)b->next_to_fill;
112363 bfile.strm.avail_out = b->end - b->next_to_fill;
113364 }
 365+ else if ((use_api || use_stub) && (regexec(&compiled_rev, (char *)b->next_to_read, 1, match_rev, 0 ) == 0)) {
 366+ /* write everything up to but not including the rev tag to stdout */
 367+ /*
 368+ fwrite(b->next_to_read,match_page[0].rm_eo - 6,1,stdout);
 369+ */
 370+ move_bytes_to_buffer_start(b, b->next_to_read + match_rev[0].rm_so, b->bytes_avail - match_rev[0].rm_so);
 371+ bfile.strm.next_out = (char *)b->next_to_fill;
 372+ bfile.strm.avail_out = b->end - b->next_to_fill;
 373+ }
114374 else {
115 - /* could have the first part of the page tag... so copy up enough bytes to cover that case */
116 - if (b->bytes_avail> 5) {
117 - /* write everything that didn't match, but leave 5 bytes, to stdout */
 375+ /* could have the first part of the page or the rev tag... so copy up enough bytes to cover that case */
 376+ if (b->bytes_avail> 10) {
 377+ /* write everything that didn't match, but leave 10 bytes, to stdout */
118378 /*
119 - fwrite(b->next_to_read,b->bytes_avail - 5,1,stdout);
 379+ fwrite(b->next_to_read,b->bytes_avail - 10,1,stdout);
120380 */
121 - move_bytes_to_buffer_start(b, b->next_to_read + b->bytes_avail - 5, 5);
 381+ move_bytes_to_buffer_start(b, b->next_to_read + b->bytes_avail - 10, 10);
122382 bfile.strm.next_out = (char *)b->next_to_fill;
123383 bfile.strm.avail_out = b->end - b->next_to_fill;
124384 }
@@ -128,7 +388,7 @@
129389 b->next_to_fill = b->buffer; /* empty */
130390 }
131391 else {
132 - /* there were only 5 or less bytes so just save em don't write em to stdout */
 392+ /* there were only 10 or less bytes so just save em don't write em to stdout */
133393 move_bytes_to_buffer_start(b, b->next_to_read, b->bytes_avail);
134394 bfile.strm.next_out = (char *)b->next_to_fill;
135395 bfile.strm.avail_out = b->end - b->next_to_fill;
@@ -161,7 +421,7 @@
162422
163423 return value from guess, or -1 on error.
164424 */
165 -int do_iteration(iter_info_t *iinfo, int fin, page_info_t *pinfo) {
 425+int do_iteration(iter_info_t *iinfo, int fin, page_info_t *pinfo, int use_api, int use_stub, char *stubfilename) {
166426 int res;
167427 off_t new_position;
168428 off_t interval;
@@ -194,7 +454,7 @@
195455 new_position = iinfo->last_position - interval;
196456 }
197457 }
198 - res = get_first_page_id_after_offset(fin, new_position, pinfo);
 458+ res = get_first_page_id_after_offset(fin, new_position, pinfo, use_api, use_stub, stubfilename);
199459 if (res >0) {
200460 /* caller wants the new value */
201461 iinfo->last_value = pinfo->page_id;
@@ -217,6 +477,14 @@
218478 }
219479 }
220480
 481+void usage(char *whoami, char *message) {
 482+ if (message) {
 483+ fprintf(stderr,message);
 484+ }
 485+ fprintf(stderr,"usage: %s --filename file --pageid id [--useapi]\n", whoami);
 486+ exit(1);
 487+}
 488+
221489 /*
222490 given a bzipped and possibly truncated file, and a page id,
223491 hunt for the page id in the file; this assume that the
@@ -226,35 +494,71 @@
227495 writes the offset of the relevant block (from beginning of file)
228496 and the first pageid found in that block, to stdout
229497
 498+ it may use the api to find page ids from rev ids if use_api is specified
 499+ it may use a stub file to find page ids from rev ids if stubfile is specified
 500+ it will only do these if it has been reading for a while without
 501+ finding a page tag (some pages have > 500K revisions and a heck of
 502+ a lot of text)
 503+ if both use_api and stubfile are specified, we will use_api, it's faster
 504+
230505 format of output:
231506 position:xxxxx pageid:nnn
232507
233508 returns: 0 on success, -1 on error
234509 */
235510 int main(int argc, char **argv) {
236 - int fin, res, page_id;
 511+ int fin, res, page_id=0;
237512 off_t position, interval, file_size;
238513 page_info_t pinfo;
239514 iter_info_t iinfo;
 515+ char *filename = NULL;
 516+ int optindex=0;
 517+ int use_api = 0;
 518+ int use_stub = 0;
 519+ int optc;
 520+ char *stubfile=NULL;
240521
241 - if (argc != 3) {
242 - fprintf(stderr,"usage: %s infile id\n", argv[0]);
243 - exit(-1);
 522+ struct option optvalues[] = {
 523+ {"filename", 1, 0, 'f'},
 524+ {"pageid", 1, 0, 'p'},
 525+ {"useapi", 0, 0, 'a'},
 526+ {"stubfile", 1, 0, 's'},
 527+ {NULL, 0, NULL, 0}
 528+ };
 529+
 530+ while (1) {
 531+ optc=getopt_long_only(argc,argv,"filename:pageid:useapi:stubfile", optvalues, &optindex);
 532+ if (optc=='f') {
 533+ filename=optarg;
 534+ }
 535+ else if (optc=='p') {
 536+ if (!(isdigit(optarg[0]))) usage(argv[0],NULL);
 537+ page_id=atoi(optarg);
 538+ }
 539+ else if (optc=='a')
 540+ use_api=1;
 541+ else if (optc=='s') {
 542+ use_stub=1;
 543+ stubfile = optarg;
 544+ }
 545+ else if (optc==-1) break;
 546+ else usage(argv[0],"unknown option or other error\n");
244547 }
245548
246 - fin = open (argv[1], O_RDONLY);
247 - if (fin < 0) {
248 - fprintf(stderr,"failed to open file %s for read\n", argv[1]);
249 - exit(-1);
 549+ if (! filename || ! page_id) {
 550+ usage(argv[0],NULL);
250551 }
251552
252 - page_id = atoi(argv[2]);
253553 if (page_id <1) {
254 - fprintf(stderr,"please specify a page_id >= 1.\n");
255 - fprintf(stderr,"usage: %s infile page_id\n", argv[0]);
256 - exit(-1);
 554+ usage(argv[0], "please specify a page_id >= 1.\n");
257555 }
258556
 557+ fin = open (filename, O_RDONLY);
 558+ if (fin < 0) {
 559+ fprintf(stderr,"failed to open file %s for read\n", argv[1]);
 560+ exit(1);
 561+ }
 562+
259563 file_size = get_file_size(fin);
260564
261565 interval = file_size;
@@ -264,11 +568,10 @@
265569 pinfo.page_id = -1;
266570
267571 iinfo.left_end = (off_t)0;
268 - file_size = get_file_size(fin);
269572 iinfo.right_end = file_size;
270573 iinfo.value_wanted = page_id;
271574
272 - res = get_first_page_id_after_offset(fin, (off_t)0, &pinfo);
 575+ res = get_first_page_id_after_offset(fin, (off_t)0, &pinfo, use_api, use_stub, stubfile);
273576 if (res > 0) {
274577 iinfo.last_value = pinfo.page_id;
275578 iinfo.last_position = (off_t)0;
@@ -283,7 +586,7 @@
284587 }
285588
286589 while (1) {
287 - res = do_iteration(&iinfo, fin, &pinfo);
 590+ res = do_iteration(&iinfo, fin, &pinfo, use_api, use_stub, stubfile);
288591 /* things to check: bad return? interval is 0 bytes long? */
289592 if (iinfo.left_end == iinfo.right_end) {
290593 fprintf(stdout,"position:%"PRId64" page_id:%d\n",pinfo.position, pinfo.page_id);
Index: branches/ariel/xmldumps-backup/mwbzutils/httptiny.c
@@ -0,0 +1,193 @@
 2+#include <sys/socket.h>
 3+#include <stdio.h>
 4+#include <stdlib.h>
 5+#include <netinet/in.h>
 6+#include <sys/time.h>
 7+#include <errno.h>
 8+#include <netdb.h>
 9+#include <netinet/in.h>
 10+#include <arpa/inet.h>
 11+#include <sys/ioctl.h>
 12+#include <string.h>
 13+#include <unistd.h>
 14+
 15+int usage(char *whoami);
 16+int doconnect(int *sd,struct timeval *timeout,struct sockaddr_in *sa_us);
 17+int dowrite(int sd,char *message,int length);
 18+int doread(int sd, char *buf, int length, struct timeval *timeout);
 19+
 20+extern char *optarg;
 21+extern int optind;
 22+
 23+char *whoami;
 24+
 25+#define agentinfo "geturl-tiny/0.3 (Linux x86_64)"
 26+
 27+/* expects us to get text back, will only serve up the first BUFSIZ bytes = 8192, that's
 28+ plenty for what we want, which is tiny api call results */
 29+char * geturl(char *hostname, int port, char *url) {
 30+ int sd;
 31+ struct sockaddr_in sa_us;
 32+ struct timeval timeout;
 33+ int result;
 34+ struct hostent *hostinfo=NULL;
 35+ char *message=NULL;
 36+ static char buf[BUFSIZ];
 37+
 38+ if ((hostinfo=gethostbyname(hostname)) == NULL ) {
 39+ fprintf(stderr,"%s: host lookup failed\n",whoami);
 40+ return(NULL);
 41+ }
 42+
 43+ /* set up socket and connect */
 44+ sa_us.sin_family=AF_INET;
 45+ memcpy(&sa_us.sin_addr,hostinfo->h_addr_list[0],hostinfo->h_length);
 46+ sa_us.sin_port=htons(port);
 47+ timeout.tv_sec=30;
 48+ timeout.tv_usec=0;
 49+ doconnect(&sd,&timeout,&sa_us);
 50+
 51+ /* set up message and send it */
 52+ if ((message=malloc(strlen(url)+25)) == NULL) {
 53+ fprintf(stderr,"%s: out of memory\n",whoami);
 54+ return(NULL);
 55+ }
 56+ sprintf(message,"GET %s HTTP/1.0\n",url);
 57+ dowrite(sd,message,strlen(message));
 58+ free(message);
 59+ sprintf(buf,"Host: %s\n",hostname);
 60+ dowrite(sd,buf,strlen(buf));
 61+ sprintf(buf,"User-Agent: %s\n\n",agentinfo);
 62+ dowrite(sd,buf,strlen(buf));
 63+ /* read reply */
 64+ errno=0;
 65+ buf[0]='\0';
 66+ result=doread(sd,buf,sizeof(buf),&timeout);
 67+ if (result == -1) {
 68+ fprintf(stderr,"%s: read error\n",whoami);
 69+ close(sd);
 70+ return(NULL);
 71+ }
 72+ close(sd);
 73+ return(buf);
 74+}
 75+
 76+/* fixme need to check content length and only retrieve that amount */
 77+int doread(int sd, char *buf, int length, struct timeval *timeout)
 78+{
 79+ int result;
 80+ fd_set fds;
 81+ int count = 0;
 82+
 83+ FD_ZERO(&fds);
 84+ FD_SET(sd,&fds);
 85+
 86+ result = -1;
 87+ while (count < length) {
 88+ result = select(FD_SETSIZE,&fds,NULL,NULL,timeout);
 89+ if (result <= 0) {
 90+ perror("read error of some sort (0)");
 91+
 92+ }
 93+ else {
 94+ result=recv(sd,buf+count,length-count,0);
 95+ if (result == -1) {
 96+ perror("read error of some sort (1)");
 97+ if (errno==EWOULDBLOCK) {
 98+ FD_ZERO(&fds);
 99+ FD_SET(sd,&fds);
 100+ if (select(FD_SETSIZE,&fds,NULL,NULL,timeout) != 1) {
 101+ fprintf(stderr,"%s: timeout %d secs trying to read\n",
 102+ whoami,(int)timeout->tv_sec);
 103+ if (select(FD_SETSIZE,&fds,NULL,NULL,timeout) != 1) {
 104+ fprintf(stderr,"%s: -2- timeout %d secs trying to read\n",
 105+ whoami,(int)timeout->tv_sec);
 106+ }
 107+ return(-1);
 108+ }
 109+ else result=recv(sd,buf+count,length-count,0);
 110+ }
 111+ else {
 112+ fprintf(stderr,"%s: can't read from socket\n",whoami);
 113+ perror(whoami);
 114+ return(-1);
 115+ }
 116+ }
 117+ else if (result == 0) {
 118+ break;
 119+ }
 120+ else {
 121+ count += result;
 122+ buf[count] = '\0';
 123+ }
 124+ }
 125+ }
 126+ return(result);
 127+}
 128+
 129+int dowrite(int sd,char *message,int length)
 130+{
 131+ int result;
 132+
 133+ while (1) {
 134+
 135+ result=send(sd,message,(unsigned int) length,0);
 136+ if (result == -1) {
 137+ perror("some error, let's see it");
 138+ if (errno!=EAGAIN) {
 139+ fprintf(stderr,"%s: write to server failed\n",whoami);
 140+ perror(whoami);
 141+ exit(1);
 142+ }
 143+ }
 144+ else break;
 145+ }
 146+ return(result);
 147+}
 148+
 149+int doconnect(int *sd,struct timeval *timeout,struct sockaddr_in *sa_us)
 150+{
 151+ int val;
 152+ fd_set fds;
 153+
 154+ if ((*sd = socket(AF_INET,SOCK_STREAM,0)) == -1) {
 155+ fprintf(stderr, "%s: could not get socket\n",whoami);
 156+ perror(whoami);
 157+ exit(1);
 158+ }
 159+ /*
 160+ val=1;
 161+ if (ioctl(*sd, FIONBIO, &val) == -1) {
 162+ fprintf(stderr,"%s: could not make connection \
 163+ non-blocking\n",whoami);
 164+ perror(whoami);
 165+ exit(1);
 166+ }
 167+ */
 168+ if (connect(*sd,(struct sockaddr *) sa_us,sizeof(*sa_us)) == -1) {
 169+ if (errno != EINPROGRESS) {
 170+ fprintf(stderr,"%s: could not connect\n", whoami);
 171+ perror(whoami);
 172+ exit(1);
 173+ }
 174+ else {
 175+ FD_ZERO(&fds);
 176+ FD_SET(*sd,&fds);
 177+ if (select(FD_SETSIZE,NULL,&fds,NULL,timeout) != 1) {
 178+ fprintf(stderr,"%s: timeout %d secs trying to connect\n",
 179+ whoami,(int)timeout->tv_sec);
 180+ exit(1);
 181+ }
 182+ else if ((connect(*sd,(struct sockaddr *) sa_us,sizeof(*sa_us))== -1)
 183+ && ( errno != EISCONN)) {
 184+ /* shouldn't in theory but.. */
 185+ fprintf(stderr, "%s: connect failed\n",whoami);
 186+ perror(whoami);
 187+ exit(1);
 188+ }
 189+ }
 190+ }
 191+ errno=0;
 192+ return(0);
 193+}
 194+
Property changes on: branches/ariel/xmldumps-backup/mwbzutils/httptiny.c
___________________________________________________________________
Added: svn:eol-style
1195 + native
Index: branches/ariel/xmldumps-backup/mwbzutils/mwbzutils.h
@@ -110,6 +110,8 @@
111111
112112 buf_info_t *init_buffer(int size);
113113
 114+void free_buffer(buf_info_t *b);
 115+
114116 int buffer_is_empty(buf_info_t *b);
115117
116118 int buffer_is_full(buf_info_t *b);
Index: branches/ariel/xmldumps-backup/mwbzutils/mwbzlib.c
@@ -141,6 +141,7 @@
142142 }
143143 /* must be after 4 byte file header, and we add a leftmost byte to the buffer
144144 of data read in case some bits have been shifted into it */
 145+ /* fprintf(stderr,"position is %"PRId64" and file size is %"PRId64"\n",bfile->position, bfile->file_size); */
145146 while (bfile->position <= bfile->file_size - 6 && bfile->position >= 0 && bfile->bits_shifted < 0) {
146147 bfile->bits_shifted = check_buffer_for_bz2_block_marker(bfile);
147148 if (bfile->bits_shifted < 0) {
@@ -387,6 +388,16 @@
388389 return(b);
389390 }
390391
 392+/* free pieces of buf_info_t */
 393+void free_buffer(buf_info_t *b) {
 394+ if (b) {
 395+ if (b->buffer) {
 396+ free(b->buffer);
 397+ }
 398+ }
 399+ return;
 400+}
 401+
391402 /* check if buffer (used for decompressed data output) is empty,
392403 returns 1 if so and 0 if not */
393404 int buffer_is_empty(buf_info_t *b) {
@@ -476,7 +487,7 @@
477488 if (buffer_is_full(b)) {
478489 return(0);
479490 }
480 -
 491+
481492 if (buffer_is_empty(b)) {
482493 b->next_to_fill = b->buffer;
483494 }
Index: branches/ariel/xmldumps-backup/mwbzutils/Makefile
@@ -34,8 +34,8 @@
3535 dumplastbz2block: $(OBJSBZ) mwbzlib.o dumplastbz2block.o
3636 $(CC) $(CFLAGS) $(LDFLAGS) -o dumplastbz2block dumplastbz2block.o mwbzlib.o $(OBJSBZ) -lbz2
3737
38 -findpageidinbz2xml: $(OBJSBZ) mwbzlib.o findpageidinbz2xml.o
39 - $(CC) $(CFLAGS) $(LDFLAGS) -o findpageidinbz2xml findpageidinbz2xml.o mwbzlib.o $(OBJSBZ) -lbz2
 38+findpageidinbz2xml: $(OBJSBZ) mwbzlib.o httptiny.o findpageidinbz2xml.o
 39+ $(CC) $(CFLAGS) $(LDFLAGS) -o findpageidinbz2xml findpageidinbz2xml.o httptiny.o mwbzlib.o $(OBJSBZ) -lbz2 -lz
4040
4141 checkforbz2footer: $(OBJSBZ) mwbzlib.o checkforbz2footer.o
4242 $(CC) $(CFLAGS) $(LDFLAGS) -o checkforbz2footer checkforbz2footer.o mwbzlib.o $(OBJSBZ) -lbz2
@@ -62,6 +62,8 @@
6363 $(CC) $(CFLAGS) -c bzlibfuncs.c
6464 mwbzlib.o: mwbzlib.c bzlib.h bzlib_private.h mwbzutils.h
6565 $(CC) $(CFLAGS) -c mwbzlib.c
 66+httptiny.o: httptiny.c
 67+ $(CC) $(CFLAGS) -c httptiny.c
6668 dumplastbz2block.o: dumplastbz2block.c
6769 $(CC) $(CFLAGS) -c dumplastbz2block.c
6870 findpageidinbz2xml.o: findpageidinbz2xml.c
@@ -73,7 +75,7 @@
7476
7577 distclean: clean
7678
77 -DISTNAME=mwbzutils-0.0.1
 79+DISTNAME=mwbzutils-0.0.2
7880 dist:
7981 rm -f $(DISTNAME)
8082 ln -s -f . $(DISTNAME)
@@ -82,6 +84,7 @@
8385 $(DISTNAME)/findpageidinbz2xml.c \
8486 $(DISTNAME)/checkforbz2footer.c \
8587 $(DISTNAME)/dumpbz2filefromoffset.c \
 88+ $(DISTNAME)/httptiny.c \
8689 $(DISTNAME)/mwbzlib.c \
8790 $(DISTNAME)/mwbzutils.h \
8891 $(DISTNAME)/bzlibfuncs.c \

Status & tagging log