Index: branches/ariel/xmldumps-backup/mwbzutils/findpageidinbz2xml.c |
— | — | @@ -1,4 +1,5 @@ |
2 | 2 | #include <unistd.h> |
| 3 | +#include <getopt.h> |
3 | 4 | #include <stdio.h> |
4 | 5 | #include <string.h> |
5 | 6 | #include <sys/types.h> |
— | — | @@ -9,9 +10,9 @@ |
10 | 11 | #include <sys/types.h> |
11 | 12 | #include <regex.h> |
12 | 13 | #include <inttypes.h> |
| 14 | +#include <zlib.h> |
13 | 15 | #include "mwbzutils.h" |
14 | 16 | |
15 | | - |
16 | 17 | /* |
17 | 18 | find the first bz2 block marker in the file, |
18 | 19 | from its current position, |
— | — | @@ -23,6 +24,13 @@ |
24 | 25 | int init_and_read_first_buffer_bz2_file(bz_info_t *bfile, int fin) { |
25 | 26 | int res; |
26 | 27 | |
| 28 | + bfile->bufin_size = BUFINSIZE; |
| 29 | + bfile->marker = init_marker(); |
| 30 | + bfile->bytes_read = 0; |
| 31 | + bfile->bytes_written = 0; |
| 32 | + bfile->eof = 0; |
| 33 | + bfile->file_size = get_file_size(fin); |
| 34 | + |
27 | 35 | bfile->initialized++; |
28 | 36 | |
29 | 37 | res = find_next_bz2_block_marker(fin, bfile, FORWARD); |
— | — | @@ -32,35 +40,244 @@ |
33 | 41 | setup_first_buffer_to_decompress(fin, bfile); |
34 | 42 | return(0); |
35 | 43 | } |
| 44 | + else { |
 | 45 | + fprintf(stderr,"failed to find the next bz2 block marker\n"); |
| 46 | + return(-1); |
| 47 | + } |
| 48 | +} |
| 49 | + |
| 50 | +extern char * geturl(char *hostname, int port, char *url); |
| 51 | + |
| 52 | +char *get_hostname_from_xml_header(int fin) { |
| 53 | + int res; |
| 54 | + regmatch_t *match_base_expr; |
| 55 | + regex_t compiled_base_expr; |
| 56 | + /* <base>http://el.wiktionary.org/wiki/...</base> */ |
| 57 | + /* <base>http://trouble.localdomain/wiki/ */ |
| 58 | + char *base_expr = "<base>http://([^/]+)/"; |
| 59 | + int length=5000; /* output buffer size */ |
| 60 | + |
| 61 | + buf_info_t *b; |
| 62 | + bz_info_t bfile; |
| 63 | + |
| 64 | + int hostname_length = 0; |
| 65 | + |
| 66 | + off_t old_position, seek_result; |
| 67 | + static char hostname[256]; |
| 68 | + |
| 69 | + bfile.initialized = 0; |
| 70 | + |
| 71 | + res = regcomp(&compiled_base_expr, base_expr, REG_EXTENDED); |
| 72 | + match_base_expr = (regmatch_t *)malloc(sizeof(regmatch_t)*2); |
| 73 | + |
| 74 | + b = init_buffer(length); |
| 75 | + bfile.bytes_read = 0; |
| 76 | + |
| 77 | + bfile.position = (off_t)0; |
| 78 | + old_position = lseek(fin,(off_t)0,SEEK_CUR); |
| 79 | + seek_result = lseek(fin,(off_t)0,SEEK_SET); |
| 80 | + |
| 81 | + while ((get_buffer_of_uncompressed_data(b, fin, &bfile, FORWARD)>=0) && (! bfile.eof)) { |
| 82 | + /* so someday the header might grow enough that <base> isn't in the first 1000 characters but we'll ignore that for now */ |
| 83 | + if (bfile.bytes_read && b->bytes_avail > 1000) { |
| 84 | + /* get project name and language name from the file header |
| 85 | + format: |
| 86 | + <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/ http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5" xml:lang="el"> |
| 87 | + <siteinfo> |
| 88 | + <sitename>Βικιλεξικό</sitename> |
| 89 | + <base>http://el.wiktionary.org/wiki/...</base> |
| 90 | + */ |
| 91 | + if (regexec(&compiled_base_expr, (char *)b->next_to_read, 2, match_base_expr, 0 ) == 0) { |
| 92 | + if (match_base_expr[1].rm_so >=0) { |
| 93 | + hostname_length = match_base_expr[1].rm_eo - match_base_expr[1].rm_so; |
 | 94 | + if (hostname_length >= sizeof(hostname)) { |
| 95 | + fprintf(stderr,"very long hostname, giving up\n"); |
| 96 | + break; |
| 97 | + } |
| 98 | + else { |
| 99 | + memcpy(hostname,(char *)b->next_to_read + match_base_expr[1].rm_so, hostname_length); |
| 100 | + hostname[hostname_length] = '\0'; |
| 101 | + b->next_to_read = b->end; |
| 102 | + b->bytes_avail = 0; |
| 103 | + b->next_to_fill = b->buffer; /* empty */ |
| 104 | + bfile.strm.next_out = (char *)b->next_to_fill; |
| 105 | + bfile.strm.avail_out = b->end - b->next_to_fill; |
| 106 | + res = BZ2_bzDecompressEnd ( &(bfile.strm) ); |
| 107 | + seek_result = lseek(fin,old_position,SEEK_SET); |
| 108 | + free_buffer(b); |
| 109 | + return(hostname); |
| 110 | + } |
| 111 | + } |
| 112 | + } |
| 113 | + else { |
| 114 | + break; |
| 115 | + } |
| 116 | + } |
| 117 | + } |
| 118 | + res = BZ2_bzDecompressEnd ( &(bfile.strm) ); |
| 119 | + seek_result = lseek(fin,old_position,SEEK_SET); |
| 120 | + free_buffer(b); |
| 121 | + return(NULL); |
| 122 | +} |
| 123 | + |
| 124 | +int has_xml_tag(char *line, char *tag) { |
| 125 | + return(! strncmp(line,tag,strlen(tag))); |
| 126 | +} |
| 127 | + |
 | 128 | +/* assumes the open tag, close tag and value are all on the same line */ |
| 129 | +long int get_xml_elt_value(char *line, char *tag) { |
| 130 | + return(atol(line+strlen(tag))); |
| 131 | +} |
| 132 | + |
| 133 | +/* returns pageid, or -1 on error. this requires the name of a stub file |
| 134 | + which contains all page ids and revisions ids in our standard xml format. |
| 135 | + It scans through the entire file looking for the page id which corresponds |
| 136 | + to the revision id. This can take up to 5 minutes for the larger |
| 137 | + stub history files; clearly we don't want to do this unless we |
| 138 | + have no other option. |
 | 139 | + we need this in the case where the page text is huge (e.g. en wp pageid 5137507, |
 | 140 | + which has a cumulative text length across all revisions of > 163 GB). |
| 141 | + This can take over two hours to uncompress and scan through looking for |
| 142 | + the next page id, so we cheat */ |
| 143 | +long int get_page_id_from_rev_id_via_stub(long int rev_id, char *stubfile) { |
 | 144 | + gzFile gz; |
| 145 | + int page_id = -1; |
| 146 | + char buf[8192]; |
| 147 | + char *bufp; |
| 148 | + enum States{WantPage,WantPageID,WantRevOrPage,WantRevID}; |
| 149 | + int state; |
| 150 | + long int temp_rev_id; |
| 151 | + |
| 152 | + gz = gzopen(stubfile,"r"); |
| 153 | + state = WantPage; |
| 154 | + while ((bufp = gzgets(gz,buf,8191)) != NULL) { |
| 155 | + while (*bufp == ' ') bufp++; |
| 156 | + if (state == WantPage) { |
| 157 | + if (has_xml_tag(bufp,"<page>")) { |
| 158 | + state = WantPageID; |
| 159 | + } |
| 160 | + } |
| 161 | + else if (state == WantPageID) { |
| 162 | + if (has_xml_tag(bufp,"<id>")) { |
| 163 | + page_id = get_xml_elt_value(bufp,"<id>"); |
| 164 | + state = WantRevOrPage; |
| 165 | + } |
| 166 | + } |
| 167 | + else if (state == WantRevOrPage) { |
| 168 | + if (has_xml_tag(bufp,"<revision>")) { |
| 169 | + state = WantRevID; |
| 170 | + } |
| 171 | + else if (has_xml_tag(bufp,"<page>")) { |
| 172 | + state = WantPageID; |
| 173 | + } |
| 174 | + } |
| 175 | + else if (state == WantRevID) { |
| 176 | + if (has_xml_tag(bufp,"<id>")) { |
| 177 | + temp_rev_id = get_xml_elt_value(bufp,"<id>"); |
| 178 | + if (temp_rev_id == rev_id) { |
| 179 | + return(page_id); |
| 180 | + } |
| 181 | + /* this permits multiple revs in the page */ |
| 182 | + state = WantRevOrPage; |
| 183 | + } |
| 184 | + } |
| 185 | + } |
36 | 186 | return(-1); |
37 | 187 | } |
38 | 188 | |
| 189 | +/* returns pageid, or -1 on error. this requires network access, |
| 190 | + it does an api call to the appropriate server for the appropriate project |
 | 191 | + we need this in the case where the page text is huge (e.g. en wp pageid 5137507, |
 | 192 | + which has a cumulative text length across all revisions of > 163 GB). |
| 193 | + This can take over two hours to uncompress and scan through looking for |
| 194 | + the next page id, so we cheat */ |
| 195 | +int get_page_id_from_rev_id_via_api(long int rev_id, int fin) { |
| 196 | + /* char hostname[80]; */ |
| 197 | + char *hostname; |
| 198 | + char url[80]; |
| 199 | + char *buffer; |
| 200 | + long int page_id = -1; |
| 201 | + char *api_call = "/w/api.php?action=query&format=xml&revids="; |
| 202 | + regmatch_t *match_page_id_expr; |
| 203 | + regex_t compiled_page_id_expr; |
| 204 | + char *page_id_expr = "<pages><page pageid=\"([0-9]+)\""; |
| 205 | + int res; |
| 206 | + |
| 207 | + hostname = get_hostname_from_xml_header(fin); |
| 208 | + if (!hostname) { |
| 209 | + return(-1); |
| 210 | + } |
| 211 | + |
| 212 | + /* |
| 213 | + if (strlen(lang) + strlen(project) + strlen(".org") > sizeof(hostname)-2) { |
| 214 | + fprintf(stderr,"language code plus project name is huuuge string, giving up\n"); |
| 215 | + return(-1); |
| 216 | + } |
| 217 | + sprintf(hostname,"%s.%s.org",lang,project); |
| 218 | + */ |
| 219 | + sprintf(url,"%s%ld",api_call,rev_id); |
| 220 | + |
| 221 | + buffer = geturl(hostname, 80, url); |
| 222 | + if (buffer == NULL) { |
| 223 | + return(-1); |
| 224 | + } |
| 225 | + else { |
| 226 | + /* dig the page id out of the buffer |
| 227 | + format: |
| 228 | + <?xml version="1.0"?><api><query><pages><page pageid="6215" ns="0" title="hystérique" /></pages></query></api> |
| 229 | + */ |
| 230 | + match_page_id_expr = (regmatch_t *)malloc(sizeof(regmatch_t)*2); |
| 231 | + res = regcomp(&compiled_page_id_expr, page_id_expr, REG_EXTENDED); |
| 232 | + |
| 233 | + if (regexec(&compiled_page_id_expr, buffer, 2, match_page_id_expr, 0 ) == 0) { |
| 234 | + if (match_page_id_expr[1].rm_so >=0) { |
| 235 | + page_id = atol(buffer + match_page_id_expr[1].rm_so); |
| 236 | + } |
| 237 | + } |
| 238 | + return(page_id); |
| 239 | + } |
| 240 | +} |
| 241 | + |
39 | 242 | /* |
40 | 243 | get the first page id after position in file |
41 | 244 | if a pageid is found, the structure pinfo will be updated accordingly |
 | 245 | + use_api nonzero means that we will fall back to asking the api about a page |
| 246 | + that contains a given rev_id, in case we wind up with a huge page which |
| 247 | + has piles of revisions and we aren't seeing a page tag in a reasonable |
| 248 | + period of time. |
42 | 249 | returns: |
43 | 250 | 1 if a pageid found, |
44 | 251 | 0 if no pageid found, |
45 | 252 | -1 on error |
46 | 253 | */ |
47 | | -int get_first_page_id_after_offset(int fin, off_t position, page_info_t *pinfo) { |
| 254 | +int get_first_page_id_after_offset(int fin, off_t position, page_info_t *pinfo, int use_api, int use_stub, char *stubfilename) { |
48 | 255 | int res; |
49 | | - regmatch_t *match_page, *match_page_id; |
50 | | - regex_t compiled_page, compiled_page_id; |
| 256 | + regmatch_t *match_page, *match_page_id, *match_rev, *match_rev_id; |
| 257 | + regex_t compiled_page, compiled_page_id, compiled_rev, compiled_rev_id; |
51 | 258 | int length=5000; /* output buffer size */ |
52 | 259 | char *page = "<page>"; |
53 | 260 | char *page_id = "<page>\n[ ]+<title>[^<]+</title>\n[ ]+<id>([0-9]+)</id>\n"; |
| 261 | + char *rev = "<revision>"; |
| 262 | + char *rev_id_expr = "<revision>\n[ ]+<id>([0-9]+)</id>\n"; |
54 | 263 | |
55 | 264 | buf_info_t *b; |
56 | 265 | bz_info_t bfile; |
| 266 | + long int rev_id=0; |
| 267 | + long int page_id_found=0; |
57 | 268 | |
| 269 | + int buffer_count = 0; |
| 270 | + |
58 | 271 | bfile.initialized = 0; |
59 | 272 | |
60 | 273 | res = regcomp(&compiled_page, page, REG_EXTENDED); |
61 | 274 | res = regcomp(&compiled_page_id, page_id, REG_EXTENDED); |
| 275 | + res = regcomp(&compiled_rev, rev, REG_EXTENDED); |
| 276 | + res = regcomp(&compiled_rev_id, rev_id_expr, REG_EXTENDED); |
62 | 277 | |
63 | 278 | match_page = (regmatch_t *)malloc(sizeof(regmatch_t)*1); |
64 | 279 | match_page_id = (regmatch_t *)malloc(sizeof(regmatch_t)*2); |
| 280 | + match_rev = (regmatch_t *)malloc(sizeof(regmatch_t)*1); |
| 281 | + match_rev_id = (regmatch_t *)malloc(sizeof(regmatch_t)*2); |
65 | 282 | |
66 | 283 | b = init_buffer(length); |
67 | 284 | |
— | — | @@ -76,7 +293,8 @@ |
77 | 294 | } |
78 | 295 | |
79 | 296 | while (!get_buffer_of_uncompressed_data(b, fin, &bfile, FORWARD) && (! bfile.eof)) { |
80 | | - if (bfile.bytes_read) { |
| 297 | + buffer_count++; |
| 298 | + if (bfile.bytes_written) { |
81 | 299 | while (regexec(&compiled_page_id, (char *)b->next_to_read, 2, match_page_id, 0 ) == 0) { |
82 | 300 | if (match_page_id[1].rm_so >=0) { |
83 | 301 | /* write page_id to stderr */ |
— | — | @@ -101,6 +319,39 @@ |
102 | 320 | exit(-1); |
103 | 321 | } |
104 | 322 | } |
| 323 | + |
| 324 | + if (use_api || use_stub) { |
| 325 | + if (!rev_id) { |
| 326 | + if (regexec(&compiled_rev_id, (char *)b->next_to_read, 2, match_rev_id, 0 ) == 0) { |
| 327 | + if (match_rev_id[1].rm_so >=0) { |
 | 328 | + rev_id = atol((char *)(b->next_to_read+match_rev_id[1].rm_so)); |
| 329 | + } |
| 330 | + } |
| 331 | + } |
| 332 | + |
 | 333 | + /* this needs to be called if we don't find a page within some number of tries or buffers read, |
 | 334 | + and instead need to retrieve a page id from a revision id in the text. |
 | 335 | + where does the figure below come from? assume we get at least a 2:1 compression ratio, |
 | 336 | + and text revs are at most 10mb plus a little; then if we read this many buffers we should have |
 | 337 | + at least one rev id in there. 20 million / BUFINSIZE (5000 or whatever it is) gives 4000 buffers, |
 | 338 | + which hopefully doesn't take forever to read. |
 | 339 | + */ |
| 340 | + /* if (buffer_count>(20000000/BUFINSIZE) && rev_id) { */ |
| 341 | + if (buffer_count>3 && rev_id) { |
| 342 | + if (use_api) { |
| 343 | + page_id_found = get_page_id_from_rev_id_via_api(rev_id, fin); |
| 344 | + } |
| 345 | + else { /* use_stub */ |
| 346 | + page_id_found = get_page_id_from_rev_id_via_stub(rev_id, stubfilename); |
| 347 | + } |
| 348 | + pinfo->page_id = page_id_found +1; /* want the page after this offset, not the one we're in */ |
| 349 | + pinfo->position = bfile.block_start; |
| 350 | + pinfo->bits_shifted = bfile.bits_shifted; |
| 351 | + return(1); |
| 352 | + } |
| 353 | + } |
| 354 | + /* FIXME this is probably wrong */ |
| 355 | + |
105 | 356 | if (regexec(&compiled_page, (char *)b->next_to_read, 1, match_page, 0 ) == 0) { |
106 | 357 | /* write everything up to but not including the page tag to stdout */ |
107 | 358 | /* |
— | — | @@ -110,14 +361,23 @@ |
111 | 362 | bfile.strm.next_out = (char *)b->next_to_fill; |
112 | 363 | bfile.strm.avail_out = b->end - b->next_to_fill; |
113 | 364 | } |
| 365 | + else if ((use_api || use_stub) && (regexec(&compiled_rev, (char *)b->next_to_read, 1, match_rev, 0 ) == 0)) { |
| 366 | + /* write everything up to but not including the rev tag to stdout */ |
| 367 | + /* |
| 368 | + fwrite(b->next_to_read,match_page[0].rm_eo - 6,1,stdout); |
| 369 | + */ |
| 370 | + move_bytes_to_buffer_start(b, b->next_to_read + match_rev[0].rm_so, b->bytes_avail - match_rev[0].rm_so); |
| 371 | + bfile.strm.next_out = (char *)b->next_to_fill; |
| 372 | + bfile.strm.avail_out = b->end - b->next_to_fill; |
| 373 | + } |
114 | 374 | else { |
115 | | - /* could have the first part of the page tag... so copy up enough bytes to cover that case */ |
116 | | - if (b->bytes_avail> 5) { |
117 | | - /* write everything that didn't match, but leave 5 bytes, to stdout */ |
| 375 | + /* could have the first part of the page or the rev tag... so copy up enough bytes to cover that case */ |
| 376 | + if (b->bytes_avail> 10) { |
| 377 | + /* write everything that didn't match, but leave 10 bytes, to stdout */ |
118 | 378 | /* |
119 | | - fwrite(b->next_to_read,b->bytes_avail - 5,1,stdout); |
| 379 | + fwrite(b->next_to_read,b->bytes_avail - 10,1,stdout); |
120 | 380 | */ |
121 | | - move_bytes_to_buffer_start(b, b->next_to_read + b->bytes_avail - 5, 5); |
| 381 | + move_bytes_to_buffer_start(b, b->next_to_read + b->bytes_avail - 10, 10); |
122 | 382 | bfile.strm.next_out = (char *)b->next_to_fill; |
123 | 383 | bfile.strm.avail_out = b->end - b->next_to_fill; |
124 | 384 | } |
— | — | @@ -128,7 +388,7 @@ |
129 | 389 | b->next_to_fill = b->buffer; /* empty */ |
130 | 390 | } |
131 | 391 | else { |
132 | | - /* there were only 5 or less bytes so just save em don't write em to stdout */ |
| 392 | + /* there were only 10 or less bytes so just save em don't write em to stdout */ |
133 | 393 | move_bytes_to_buffer_start(b, b->next_to_read, b->bytes_avail); |
134 | 394 | bfile.strm.next_out = (char *)b->next_to_fill; |
135 | 395 | bfile.strm.avail_out = b->end - b->next_to_fill; |
— | — | @@ -161,7 +421,7 @@ |
162 | 422 | |
163 | 423 | return value from guess, or -1 on error. |
164 | 424 | */ |
165 | | -int do_iteration(iter_info_t *iinfo, int fin, page_info_t *pinfo) { |
| 425 | +int do_iteration(iter_info_t *iinfo, int fin, page_info_t *pinfo, int use_api, int use_stub, char *stubfilename) { |
166 | 426 | int res; |
167 | 427 | off_t new_position; |
168 | 428 | off_t interval; |
— | — | @@ -194,7 +454,7 @@ |
195 | 455 | new_position = iinfo->last_position - interval; |
196 | 456 | } |
197 | 457 | } |
198 | | - res = get_first_page_id_after_offset(fin, new_position, pinfo); |
| 458 | + res = get_first_page_id_after_offset(fin, new_position, pinfo, use_api, use_stub, stubfilename); |
199 | 459 | if (res >0) { |
200 | 460 | /* caller wants the new value */ |
201 | 461 | iinfo->last_value = pinfo->page_id; |
— | — | @@ -217,6 +477,14 @@ |
218 | 478 | } |
219 | 479 | } |
220 | 480 | |
| 481 | +void usage(char *whoami, char *message) { |
| 482 | + if (message) { |
 | 483 | + fprintf(stderr,"%s",message); |
| 484 | + } |
| 485 | + fprintf(stderr,"usage: %s --filename file --pageid id [--useapi]\n", whoami); |
| 486 | + exit(1); |
| 487 | +} |
| 488 | + |
221 | 489 | /* |
222 | 490 | given a bzipped and possibly truncated file, and a page id, |
223 | 491 | hunt for the page id in the file; this assume that the |
— | — | @@ -226,35 +494,71 @@ |
227 | 495 | writes the offset of the relevant block (from beginning of file) |
228 | 496 | and the first pageid found in that block, to stdout |
229 | 497 | |
| 498 | + it may use the api to find page ids from rev ids if use_api is specified |
| 499 | + it may use a stub file to find page ids from rev ids if stubfile is specified |
 | 500 | + it will only do these if it has been reading for a while without |
 | 501 | + finding a page tag (some pages have > 500K revisions and a heck of |
| 502 | + a lot of text) |
 | 503 | + if both use_api and stubfile are specified, we will use the api; it's faster |
| 504 | + |
230 | 505 | format of output: |
231 | 506 | position:xxxxx pageid:nnn |
232 | 507 | |
233 | 508 | returns: 0 on success, -1 on error |
234 | 509 | */ |
235 | 510 | int main(int argc, char **argv) { |
236 | | - int fin, res, page_id; |
| 511 | + int fin, res, page_id=0; |
237 | 512 | off_t position, interval, file_size; |
238 | 513 | page_info_t pinfo; |
239 | 514 | iter_info_t iinfo; |
| 515 | + char *filename = NULL; |
| 516 | + int optindex=0; |
| 517 | + int use_api = 0; |
| 518 | + int use_stub = 0; |
| 519 | + int optc; |
| 520 | + char *stubfile=NULL; |
240 | 521 | |
241 | | - if (argc != 3) { |
242 | | - fprintf(stderr,"usage: %s infile id\n", argv[0]); |
243 | | - exit(-1); |
| 522 | + struct option optvalues[] = { |
| 523 | + {"filename", 1, 0, 'f'}, |
| 524 | + {"pageid", 1, 0, 'p'}, |
| 525 | + {"useapi", 0, 0, 'a'}, |
| 526 | + {"stubfile", 1, 0, 's'}, |
| 527 | + {NULL, 0, NULL, 0} |
| 528 | + }; |
| 529 | + |
| 530 | + while (1) { |
 | 531 | + optc=getopt_long_only(argc,argv,"f:p:as:", optvalues, &optindex); |
| 532 | + if (optc=='f') { |
| 533 | + filename=optarg; |
| 534 | + } |
| 535 | + else if (optc=='p') { |
| 536 | + if (!(isdigit(optarg[0]))) usage(argv[0],NULL); |
| 537 | + page_id=atoi(optarg); |
| 538 | + } |
| 539 | + else if (optc=='a') |
| 540 | + use_api=1; |
| 541 | + else if (optc=='s') { |
| 542 | + use_stub=1; |
| 543 | + stubfile = optarg; |
| 544 | + } |
| 545 | + else if (optc==-1) break; |
| 546 | + else usage(argv[0],"unknown option or other error\n"); |
244 | 547 | } |
245 | 548 | |
246 | | - fin = open (argv[1], O_RDONLY); |
247 | | - if (fin < 0) { |
248 | | - fprintf(stderr,"failed to open file %s for read\n", argv[1]); |
249 | | - exit(-1); |
| 549 | + if (! filename || ! page_id) { |
| 550 | + usage(argv[0],NULL); |
250 | 551 | } |
251 | 552 | |
252 | | - page_id = atoi(argv[2]); |
253 | 553 | if (page_id <1) { |
254 | | - fprintf(stderr,"please specify a page_id >= 1.\n"); |
255 | | - fprintf(stderr,"usage: %s infile page_id\n", argv[0]); |
256 | | - exit(-1); |
| 554 | + usage(argv[0], "please specify a page_id >= 1.\n"); |
257 | 555 | } |
258 | 556 | |
| 557 | + fin = open (filename, O_RDONLY); |
| 558 | + if (fin < 0) { |
| 559 | + fprintf(stderr,"failed to open file %s for read\n", argv[1]); |
| 560 | + exit(1); |
| 561 | + } |
| 562 | + |
259 | 563 | file_size = get_file_size(fin); |
260 | 564 | |
261 | 565 | interval = file_size; |
— | — | @@ -264,11 +568,10 @@ |
265 | 569 | pinfo.page_id = -1; |
266 | 570 | |
267 | 571 | iinfo.left_end = (off_t)0; |
268 | | - file_size = get_file_size(fin); |
269 | 572 | iinfo.right_end = file_size; |
270 | 573 | iinfo.value_wanted = page_id; |
271 | 574 | |
272 | | - res = get_first_page_id_after_offset(fin, (off_t)0, &pinfo); |
| 575 | + res = get_first_page_id_after_offset(fin, (off_t)0, &pinfo, use_api, use_stub, stubfile); |
273 | 576 | if (res > 0) { |
274 | 577 | iinfo.last_value = pinfo.page_id; |
275 | 578 | iinfo.last_position = (off_t)0; |
— | — | @@ -283,7 +586,7 @@ |
284 | 587 | } |
285 | 588 | |
286 | 589 | while (1) { |
287 | | - res = do_iteration(&iinfo, fin, &pinfo); |
| 590 | + res = do_iteration(&iinfo, fin, &pinfo, use_api, use_stub, stubfile); |
288 | 591 | /* things to check: bad return? interval is 0 bytes long? */ |
289 | 592 | if (iinfo.left_end == iinfo.right_end) { |
290 | 593 | fprintf(stdout,"position:%"PRId64" page_id:%d\n",pinfo.position, pinfo.page_id); |
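
A quick usage sketch for the reworked command line (option names per the getopt table and usage() above; the filenames here are made up): `findpageidinbz2xml --filename elwiki-pages-meta-history.xml.bz2 --pageid 1000 --useapi`, or pass `--stubfile elwiki-stub-meta-history.xml.gz` instead of `--useapi` to resolve rev ids against a local stub dump. On success the tool still prints a single line of the form `position:<byte offset> page_id:<id>` to stdout.
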
Index: branches/ariel/xmldumps-backup/mwbzutils/httptiny.c |
— | — | @@ -0,0 +1,193 @@ |
| 2 | +#include <sys/socket.h> |
| 3 | +#include <stdio.h> |
| 4 | +#include <stdlib.h> |
| 5 | +#include <netinet/in.h> |
| 6 | +#include <sys/time.h> |
| 7 | +#include <errno.h> |
| 8 | +#include <netdb.h> |
| 9 | +#include <netinet/in.h> |
| 10 | +#include <arpa/inet.h> |
| 11 | +#include <sys/ioctl.h> |
| 12 | +#include <string.h> |
| 13 | +#include <unistd.h> |
| 14 | + |
| 15 | +int usage(char *whoami); |
| 16 | +int doconnect(int *sd,struct timeval *timeout,struct sockaddr_in *sa_us); |
| 17 | +int dowrite(int sd,char *message,int length); |
| 18 | +int doread(int sd, char *buf, int length, struct timeval *timeout); |
| 19 | + |
| 20 | +extern char *optarg; |
| 21 | +extern int optind; |
| 22 | + |
| 23 | +char *whoami; |
| 24 | + |
| 25 | +#define agentinfo "geturl-tiny/0.3 (Linux x86_64)" |
| 26 | + |
 | 27 | +/* expects to get text back; will only serve up the first BUFSIZ bytes (typically 8192), that's |
| 28 | + plenty for what we want, which is tiny api call results */ |
| 29 | +char * geturl(char *hostname, int port, char *url) { |
| 30 | + int sd; |
| 31 | + struct sockaddr_in sa_us; |
| 32 | + struct timeval timeout; |
| 33 | + int result; |
| 34 | + struct hostent *hostinfo=NULL; |
| 35 | + char *message=NULL; |
| 36 | + static char buf[BUFSIZ]; |
| 37 | + |
| 38 | + if ((hostinfo=gethostbyname(hostname)) == NULL ) { |
| 39 | + fprintf(stderr,"%s: host lookup failed\n",whoami); |
| 40 | + return(NULL); |
| 41 | + } |
| 42 | + |
| 43 | + /* set up socket and connect */ |
| 44 | + sa_us.sin_family=AF_INET; |
| 45 | + memcpy(&sa_us.sin_addr,hostinfo->h_addr_list[0],hostinfo->h_length); |
| 46 | + sa_us.sin_port=htons(port); |
| 47 | + timeout.tv_sec=30; |
| 48 | + timeout.tv_usec=0; |
| 49 | + doconnect(&sd,&timeout,&sa_us); |
| 50 | + |
| 51 | + /* set up message and send it */ |
| 52 | + if ((message=malloc(strlen(url)+25)) == NULL) { |
| 53 | + fprintf(stderr,"%s: out of memory\n",whoami); |
| 54 | + return(NULL); |
| 55 | + } |
| 56 | + sprintf(message,"GET %s HTTP/1.0\n",url); |
| 57 | + dowrite(sd,message,strlen(message)); |
| 58 | + free(message); |
| 59 | + sprintf(buf,"Host: %s\n",hostname); |
| 60 | + dowrite(sd,buf,strlen(buf)); |
| 61 | + sprintf(buf,"User-Agent: %s\n\n",agentinfo); |
| 62 | + dowrite(sd,buf,strlen(buf)); |
| 63 | + /* read reply */ |
| 64 | + errno=0; |
| 65 | + buf[0]='\0'; |
 | 66 | + result=doread(sd,buf,sizeof(buf)-1,&timeout); /* leave room for doread's trailing '\0' */ |
| 67 | + if (result == -1) { |
| 68 | + fprintf(stderr,"%s: read error\n",whoami); |
| 69 | + close(sd); |
| 70 | + return(NULL); |
| 71 | + } |
| 72 | + close(sd); |
| 73 | + return(buf); |
| 74 | +} |
| 75 | + |
| 76 | +/* fixme need to check content length and only retrieve that amount */ |
| 77 | +int doread(int sd, char *buf, int length, struct timeval *timeout) |
| 78 | +{ |
| 79 | + int result; |
| 80 | + fd_set fds; |
| 81 | + int count = 0; |
| 82 | + |
| 83 | + FD_ZERO(&fds); |
| 84 | + FD_SET(sd,&fds); |
| 85 | + |
| 86 | + result = -1; |
| 87 | + while (count < length) { |
| 88 | + result = select(FD_SETSIZE,&fds,NULL,NULL,timeout); |
| 89 | + if (result <= 0) { |
| 90 | + perror("read error of some sort (0)"); |
| 91 | + |
| 92 | + } |
| 93 | + else { |
| 94 | + result=recv(sd,buf+count,length-count,0); |
| 95 | + if (result == -1) { |
| 96 | + perror("read error of some sort (1)"); |
| 97 | + if (errno==EWOULDBLOCK) { |
| 98 | + FD_ZERO(&fds); |
| 99 | + FD_SET(sd,&fds); |
| 100 | + if (select(FD_SETSIZE,&fds,NULL,NULL,timeout) != 1) { |
| 101 | + fprintf(stderr,"%s: timeout %d secs trying to read\n", |
| 102 | + whoami,(int)timeout->tv_sec); |
| 103 | + if (select(FD_SETSIZE,&fds,NULL,NULL,timeout) != 1) { |
| 104 | + fprintf(stderr,"%s: -2- timeout %d secs trying to read\n", |
| 105 | + whoami,(int)timeout->tv_sec); |
| 106 | + } |
| 107 | + return(-1); |
| 108 | + } |
| 109 | + else result=recv(sd,buf+count,length-count,0); |
| 110 | + } |
| 111 | + else { |
| 112 | + fprintf(stderr,"%s: can't read from socket\n",whoami); |
| 113 | + perror(whoami); |
| 114 | + return(-1); |
| 115 | + } |
| 116 | + } |
| 117 | + else if (result == 0) { |
| 118 | + break; |
| 119 | + } |
| 120 | + else { |
| 121 | + count += result; |
| 122 | + buf[count] = '\0'; |
| 123 | + } |
| 124 | + } |
| 125 | + } |
| 126 | + return(result); |
| 127 | +} |
| 128 | + |
| 129 | +int dowrite(int sd,char *message,int length) |
| 130 | +{ |
| 131 | + int result; |
| 132 | + |
| 133 | + while (1) { |
| 134 | + |
| 135 | + result=send(sd,message,(unsigned int) length,0); |
| 136 | + if (result == -1) { |
| 137 | + perror("some error, let's see it"); |
| 138 | + if (errno!=EAGAIN) { |
| 139 | + fprintf(stderr,"%s: write to server failed\n",whoami); |
| 140 | + perror(whoami); |
| 141 | + exit(1); |
| 142 | + } |
| 143 | + } |
| 144 | + else break; |
| 145 | + } |
| 146 | + return(result); |
| 147 | +} |
| 148 | + |
| 149 | +int doconnect(int *sd,struct timeval *timeout,struct sockaddr_in *sa_us) |
| 150 | +{ |
| 151 | + int val; |
| 152 | + fd_set fds; |
| 153 | + |
| 154 | + if ((*sd = socket(AF_INET,SOCK_STREAM,0)) == -1) { |
| 155 | + fprintf(stderr, "%s: could not get socket\n",whoami); |
| 156 | + perror(whoami); |
| 157 | + exit(1); |
| 158 | + } |
| 159 | + /* |
| 160 | + val=1; |
| 161 | + if (ioctl(*sd, FIONBIO, &val) == -1) { |
| 162 | + fprintf(stderr,"%s: could not make connection \ |
| 163 | + non-blocking\n",whoami); |
| 164 | + perror(whoami); |
| 165 | + exit(1); |
| 166 | + } |
| 167 | + */ |
| 168 | + if (connect(*sd,(struct sockaddr *) sa_us,sizeof(*sa_us)) == -1) { |
| 169 | + if (errno != EINPROGRESS) { |
| 170 | + fprintf(stderr,"%s: could not connect\n", whoami); |
| 171 | + perror(whoami); |
| 172 | + exit(1); |
| 173 | + } |
| 174 | + else { |
| 175 | + FD_ZERO(&fds); |
| 176 | + FD_SET(*sd,&fds); |
| 177 | + if (select(FD_SETSIZE,NULL,&fds,NULL,timeout) != 1) { |
| 178 | + fprintf(stderr,"%s: timeout %d secs trying to connect\n", |
| 179 | + whoami,(int)timeout->tv_sec); |
| 180 | + exit(1); |
| 181 | + } |
| 182 | + else if ((connect(*sd,(struct sockaddr *) sa_us,sizeof(*sa_us))== -1) |
| 183 | + && ( errno != EISCONN)) { |
| 184 | + /* shouldn't in theory but.. */ |
| 185 | + fprintf(stderr, "%s: connect failed\n",whoami); |
| 186 | + perror(whoami); |
| 187 | + exit(1); |
| 188 | + } |
| 189 | + } |
| 190 | + } |
| 191 | + errno=0; |
| 192 | + return(0); |
| 193 | +} |
| 194 | + |
Property changes on: branches/ariel/xmldumps-backup/mwbzutils/httptiny.c |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 195 | + native |
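
For orientation, here is a minimal sketch of how the new geturl() helper is meant to be driven (the hostname and revision id below are made up; in findpageidinbz2xml.c the hostname really comes from the dump's `<base>` header and the rev id from the compressed stream). geturl() hands back the raw HTTP response, status line and headers included, capped at BUFSIZ bytes, which is why the real caller simply regexes the pageid out of the whole buffer:

```c
#include <stdio.h>

/* provided by httptiny.c */
extern char *geturl(char *hostname, int port, char *url);

int main(void) {
    char url[80];
    long int rev_id = 12345;   /* hypothetical revision id */

    /* same api call string that get_page_id_from_rev_id_via_api() builds */
    snprintf(url, sizeof(url), "/w/api.php?action=query&format=xml&revids=%ld", rev_id);

    char *reply = geturl("el.wiktionary.org", 80, url);
    if (reply == NULL) {
        fprintf(stderr, "api request failed\n");
        return 1;
    }
    fputs(reply, stdout);   /* raw response; the interesting bit is <page pageid="..."> */
    return 0;
}
```

This sketch only needs to be linked against httptiny.o; the real caller in findpageidinbz2xml.c additionally pulls in the bz2 and zlib pieces, per the Makefile change below.
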
Index: branches/ariel/xmldumps-backup/mwbzutils/mwbzutils.h |
— | — | @@ -110,6 +110,8 @@ |
111 | 111 | |
112 | 112 | buf_info_t *init_buffer(int size); |
113 | 113 | |
| 114 | +void free_buffer(buf_info_t *b); |
| 115 | + |
114 | 116 | int buffer_is_empty(buf_info_t *b); |
115 | 117 | |
116 | 118 | int buffer_is_full(buf_info_t *b); |
Index: branches/ariel/xmldumps-backup/mwbzutils/mwbzlib.c |
— | — | @@ -141,6 +141,7 @@ |
142 | 142 | } |
143 | 143 | /* must be after 4 byte file header, and we add a leftmost byte to the buffer |
144 | 144 | of data read in case some bits have been shifted into it */ |
| 145 | + /* fprintf(stderr,"position is %"PRId64" and file size is %"PRId64"\n",bfile->position, bfile->file_size); */ |
145 | 146 | while (bfile->position <= bfile->file_size - 6 && bfile->position >= 0 && bfile->bits_shifted < 0) { |
146 | 147 | bfile->bits_shifted = check_buffer_for_bz2_block_marker(bfile); |
147 | 148 | if (bfile->bits_shifted < 0) { |
— | — | @@ -387,6 +388,16 @@ |
388 | 389 | return(b); |
389 | 390 | } |
390 | 391 | |
| 392 | +/* free pieces of buf_info_t */ |
| 393 | +void free_buffer(buf_info_t *b) { |
| 394 | + if (b) { |
| 395 | + if (b->buffer) { |
| 396 | + free(b->buffer); |
| 397 | + } |
| 398 | + } |
| 399 | + return; |
| 400 | +} |
| 401 | + |
391 | 402 | /* check if buffer (used for decompressed data output) is empty, |
392 | 403 | returns 1 if so and 0 if not */ |
393 | 404 | int buffer_is_empty(buf_info_t *b) { |
— | — | @@ -476,7 +487,7 @@ |
477 | 488 | if (buffer_is_full(b)) { |
478 | 489 | return(0); |
479 | 490 | } |
480 | | - |
| 491 | + |
481 | 492 | if (buffer_is_empty(b)) { |
482 | 493 | b->next_to_fill = b->buffer; |
483 | 494 | } |
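
One more small sketch, of the allocate/release pairing that the new free_buffer() completes (the 5000-byte size just mirrors the `length` used in findpageidinbz2xml.c, and the function name is hypothetical). Note that as written free_buffer() releases only the internal data buffer, not the buf_info_t struct itself:

```c
#include "mwbzutils.h"

/* hypothetical caller showing the intended init_buffer()/free_buffer() pairing */
static void buffer_roundtrip(void) {
    buf_info_t *b = init_buffer(5000);
    if (b == NULL)
        return;
    /* ... hand b to get_buffer_of_uncompressed_data() as the existing callers do ... */
    free_buffer(b);   /* frees b->buffer only, per the implementation above */
}
```
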
Index: branches/ariel/xmldumps-backup/mwbzutils/Makefile |
— | — | @@ -34,8 +34,8 @@ |
35 | 35 | dumplastbz2block: $(OBJSBZ) mwbzlib.o dumplastbz2block.o |
36 | 36 | $(CC) $(CFLAGS) $(LDFLAGS) -o dumplastbz2block dumplastbz2block.o mwbzlib.o $(OBJSBZ) -lbz2 |
37 | 37 | |
38 | | -findpageidinbz2xml: $(OBJSBZ) mwbzlib.o findpageidinbz2xml.o |
39 | | - $(CC) $(CFLAGS) $(LDFLAGS) -o findpageidinbz2xml findpageidinbz2xml.o mwbzlib.o $(OBJSBZ) -lbz2 |
| 38 | +findpageidinbz2xml: $(OBJSBZ) mwbzlib.o httptiny.o findpageidinbz2xml.o |
| 39 | + $(CC) $(CFLAGS) $(LDFLAGS) -o findpageidinbz2xml findpageidinbz2xml.o httptiny.o mwbzlib.o $(OBJSBZ) -lbz2 -lz |
40 | 40 | |
41 | 41 | checkforbz2footer: $(OBJSBZ) mwbzlib.o checkforbz2footer.o |
42 | 42 | $(CC) $(CFLAGS) $(LDFLAGS) -o checkforbz2footer checkforbz2footer.o mwbzlib.o $(OBJSBZ) -lbz2 |
— | — | @@ -62,6 +62,8 @@ |
63 | 63 | $(CC) $(CFLAGS) -c bzlibfuncs.c |
64 | 64 | mwbzlib.o: mwbzlib.c bzlib.h bzlib_private.h mwbzutils.h |
65 | 65 | $(CC) $(CFLAGS) -c mwbzlib.c |
| 66 | +httptiny.o: httptiny.c |
| 67 | + $(CC) $(CFLAGS) -c httptiny.c |
66 | 68 | dumplastbz2block.o: dumplastbz2block.c |
67 | 69 | $(CC) $(CFLAGS) -c dumplastbz2block.c |
68 | 70 | findpageidinbz2xml.o: findpageidinbz2xml.c |
— | — | @@ -73,7 +75,7 @@ |
74 | 76 | |
75 | 77 | distclean: clean |
76 | 78 | |
77 | | -DISTNAME=mwbzutils-0.0.1 |
| 79 | +DISTNAME=mwbzutils-0.0.2 |
78 | 80 | dist: |
79 | 81 | rm -f $(DISTNAME) |
80 | 82 | ln -s -f . $(DISTNAME) |
— | — | @@ -82,6 +84,7 @@ |
83 | 85 | $(DISTNAME)/findpageidinbz2xml.c \ |
84 | 86 | $(DISTNAME)/checkforbz2footer.c \ |
85 | 87 | $(DISTNAME)/dumpbz2filefromoffset.c \ |
| 88 | + $(DISTNAME)/httptiny.c \ |
86 | 89 | $(DISTNAME)/mwbzlib.c \ |
87 | 90 | $(DISTNAME)/mwbzutils.h \ |
88 | 91 | $(DISTNAME)/bzlibfuncs.c \ |