Index: branches/ariel/xmldumps-backup/mwbzutils/findpageidinbz2xml.c |
— | — | @@ -226,12 +226,12 @@ |
227 | 227 | format: |
228 | 228 | <?xml version="1.0"?><api><query><pages><page pageid="6215" ns="0" title="hystérique" /></pages></query></api> |
229 | 229 | */ |
230 | | - match_page_id_expr = (regmatch_t *)malloc(sizeof(regmatch_t)*2); |
| 230 | + match_page_id_expr = (regmatch_t *)malloc(sizeof(regmatch_t)*3); |
231 | 231 | res = regcomp(&compiled_page_id_expr, page_id_expr, REG_EXTENDED); |
232 | 232 | |
233 | | - if (regexec(&compiled_page_id_expr, buffer, 2, match_page_id_expr, 0 ) == 0) { |
234 | | - if (match_page_id_expr[1].rm_so >=0) { |
235 | | - page_id = atol(buffer + match_page_id_expr[1].rm_so); |
| 233 | + if (regexec(&compiled_page_id_expr, buffer, 3, match_page_id_expr, 0 ) == 0) { |
| 234 | + if (match_page_id_expr[2].rm_so >=0) { |
| 235 | + page_id = atol(buffer + match_page_id_expr[2].rm_so); |
236 | 236 | } |
237 | 237 | } |
238 | 238 | return(page_id); |
— | — | @@ -250,13 +250,13 @@ |
251 | 251 | 0 if no pageid found, |
252 | 252 | -1 on error |
253 | 253 | */ |
254 | | -int get_first_page_id_after_offset(int fin, off_t position, page_info_t *pinfo, int use_api, int use_stub, char *stubfilename) { |
| 254 | +int get_first_page_id_after_offset(int fin, off_t position, page_info_t *pinfo, int use_api, int use_stub, char *stubfilename, int verbose) { |
255 | 255 | int res; |
256 | 256 | regmatch_t *match_page, *match_page_id, *match_rev, *match_rev_id; |
257 | 257 | regex_t compiled_page, compiled_page_id, compiled_rev, compiled_rev_id; |
258 | 258 | int length=5000; /* output buffer size */ |
259 | 259 | char *page = "<page>"; |
260 | | - char *page_id = "<page>\n[ ]+<title>[^<]+</title>\n[ ]+<id>([0-9]+)</id>\n"; |
| 260 | + char *page_id = "<page>\n[ ]+<title>[^<]+</title>\n([ ]+<ns>[0-9]+</ns>\n)?[ ]+<id>([0-9]+)</id>\n"; |
261 | 261 | char *rev = "<revision>"; |
262 | 262 | char *rev_id_expr = "<revision>\n[ ]+<id>([0-9]+)</id>\n"; |
263 | 263 | |
— | — | @@ -275,7 +275,7 @@ |
276 | 276 | res = regcomp(&compiled_rev_id, rev_id_expr, REG_EXTENDED); |
277 | 277 | |
278 | 278 | match_page = (regmatch_t *)malloc(sizeof(regmatch_t)*1); |
279 | | - match_page_id = (regmatch_t *)malloc(sizeof(regmatch_t)*2); |
| 279 | + match_page_id = (regmatch_t *)malloc(sizeof(regmatch_t)*3); |
280 | 280 | match_rev = (regmatch_t *)malloc(sizeof(regmatch_t)*1); |
281 | 281 | match_rev_id = (regmatch_t *)malloc(sizeof(regmatch_t)*2); |
282 | 282 | |
— | — | @@ -288,21 +288,23 @@ |
289 | 289 | bfile.bytes_read = 0; |
290 | 290 | |
291 | 291 | if (find_first_bz2_block_from_offset(&bfile, fin, position, FORWARD) <= (off_t)0) { |
292 | | - /* fprintf(stderr,"failed to find block in bz2file (1)\n"); */ |
| 292 | + if (verbose) fprintf(stderr,"failed to find block in bz2file after offset %"PRId64" (1)\n", position); |
293 | 293 | return(-1); |
294 | 294 | } |
295 | 295 | |
| 296 | + if (verbose) fprintf(stderr,"found first block in bz2file after offset %"PRId64"\n", position); |
| 297 | + |
296 | 298 | while (!get_buffer_of_uncompressed_data(b, fin, &bfile, FORWARD) && (! bfile.eof)) { |
297 | 299 | buffer_count++; |
| 300 | + if (verbose >=2) fprintf(stderr,"buffers read: %d\n", buffer_count); |
298 | 301 | if (bfile.bytes_written) { |
299 | | - while (regexec(&compiled_page_id, (char *)b->next_to_read, 2, match_page_id, 0 ) == 0) { |
300 | | - if (match_page_id[1].rm_so >=0) { |
301 | | - /* write page_id to stderr */ |
302 | | - /* |
303 | | - fwrite(b->next_to_read+match_page_id[1].rm_so, sizeof(unsigned char), match_page_id[1].rm_eo - match_page_id[1].rm_so, stderr); |
| 302 | + while (regexec(&compiled_page_id, (char *)b->next_to_read, 3, match_page_id, 0 ) == 0) { |
| 303 | + if (match_page_id[2].rm_so >=0) { |
| 304 | + if (verbose){ |
| 305 | + fwrite(b->next_to_read+match_page_id[2].rm_so, sizeof(unsigned char), match_page_id[2].rm_eo - match_page_id[2].rm_so, stderr); |
304 | 306 | fwrite("\n",1,1,stderr); |
305 | | - */ |
306 | | - pinfo->page_id = atoi((char *)(b->next_to_read+match_page_id[1].rm_so)); |
| 307 | + } |
| 308 | + pinfo->page_id = atoi((char *)(b->next_to_read+match_page_id[2].rm_so)); |
307 | 309 | pinfo->position = bfile.block_start; |
308 | 310 | pinfo->bits_shifted = bfile.bits_shifted; |
309 | 311 | return(1); |
— | — | @@ -337,6 +339,7 @@ |
338 | 340 | hopefully that doesn't take forever. |
339 | 341 | */ |
340 | 342 | if (buffer_count>(20000000/BUFINSIZE) && rev_id) { |
| 343 | + if (verbose) fprintf(stderr, "passed cutoff for using api\n"); |
341 | 344 | if (use_api) { |
342 | 345 | page_id_found = get_page_id_from_rev_id_via_api(rev_id, fin); |
343 | 346 | } |
— | — | @@ -420,7 +423,7 @@ |
421 | 424 | |
422 | 425 | return value from guess, or -1 on error. |
423 | 426 | */ |
424 | | -int do_iteration(iter_info_t *iinfo, int fin, page_info_t *pinfo, int use_api, int use_stub, char *stubfilename) { |
| 427 | +int do_iteration(iter_info_t *iinfo, int fin, page_info_t *pinfo, int use_api, int use_stub, char *stubfilename, int verbose) { |
425 | 428 | int res; |
426 | 429 | off_t new_position; |
427 | 430 | off_t interval; |
— | — | @@ -434,7 +437,8 @@ |
435 | 438 | if (interval == (off_t)0) { |
436 | 439 | interval = (off_t)1; |
437 | 440 | } |
438 | | - /* fprintf(stderr,"interval size is %"PRId64", left end %"PRId64", right end %"PRId64", last val %d\n",interval, iinfo->left_end, iinfo->right_end, iinfo->last_value); */ |
| 441 | + if (verbose) |
| 442 | + fprintf(stderr,"interval size is %"PRId64", left end %"PRId64", right end %"PRId64", last val %d\n",interval, iinfo->left_end, iinfo->right_end, iinfo->last_value); |
439 | 443 | /* if we're this close, we'll check this value and be done with it */ |
440 | 444 | if (iinfo->right_end -iinfo->left_end < (off_t)2) { |
441 | 445 | new_position = iinfo->left_end; |
— | — | @@ -442,18 +446,18 @@ |
443 | 447 | } |
444 | 448 | else { |
445 | 449 | if (iinfo->last_value < iinfo->value_wanted) { |
446 | | - /* fprintf(stderr,"resetting left end\n"); */ |
| 450 | + if (verbose >=2) fprintf(stderr,"resetting left end\n"); |
447 | 451 | iinfo->left_end = iinfo->last_position; |
448 | 452 | new_position = iinfo->last_position + interval; |
449 | 453 | } |
450 | 454 | /* iinfo->last_value > iinfo->value_wanted */ |
451 | 455 | else { |
452 | | - /* fprintf(stderr,"resetting right end\n"); */ |
| 456 | + if (verbose >=2) fprintf(stderr,"resetting right end\n"); |
453 | 457 | iinfo->right_end = iinfo->last_position; |
454 | 458 | new_position = iinfo->last_position - interval; |
455 | 459 | } |
456 | 460 | } |
457 | | - res = get_first_page_id_after_offset(fin, new_position, pinfo, use_api, use_stub, stubfilename); |
| 461 | + res = get_first_page_id_after_offset(fin, new_position, pinfo, use_api, use_stub, stubfilename, verbose); |
458 | 462 | if (res >0) { |
459 | 463 | /* caller wants the new value */ |
460 | 464 | iinfo->last_value = pinfo->page_id; |
— | — | @@ -470,17 +474,18 @@ |
471 | 475 | } |
472 | 476 | /* in theory we were moving towards beginning of file, should not have issues, so bail here */ |
473 | 477 | else { |
474 | | - /* fprintf(stderr,"something very broken, giving up\n"); */ |
| 478 | + if (verbose) fprintf(stderr,"something very broken, giving up\n"); |
475 | 479 | return(-1); |
476 | 480 | } |
477 | 481 | } |
478 | 482 | } |
479 | 483 | |
| 484 | + |
480 | 485 | void usage(char *whoami, char *message) { |
481 | 486 | if (message) { |
482 | 487 | fprintf(stderr,message); |
483 | 488 | } |
484 | | - fprintf(stderr,"usage: %s --filename file --pageid id [--useapi]\n", whoami); |
| 489 | + fprintf(stderr,"usage: %s --filename file --pageid id [--stubfile] [--useapi] [--verbose]\n", whoami); |
485 | 490 | exit(1); |
486 | 491 | } |
487 | 492 | |
— | — | @@ -514,6 +519,7 @@ |
515 | 520 | int optindex=0; |
516 | 521 | int use_api = 0; |
517 | 522 | int use_stub = 0; |
| 523 | + int verbose = 0; |
518 | 524 | int optc; |
519 | 525 | char *stubfile=NULL; |
520 | 526 | |
— | — | @@ -521,12 +527,13 @@ |
522 | 528 | {"filename", 1, 0, 'f'}, |
523 | 529 | {"pageid", 1, 0, 'p'}, |
524 | 530 | {"useapi", 0, 0, 'a'}, |
| 531 | + {"verbose", 0, 0, 'v'}, |
525 | 532 | {"stubfile", 1, 0, 's'}, |
526 | 533 | {NULL, 0, NULL, 0} |
527 | 534 | }; |
528 | 535 | |
529 | 536 | while (1) { |
530 | | - optc=getopt_long_only(argc,argv,"filename:pageid:useapi:stubfile", optvalues, &optindex); |
| 537 | + optc=getopt_long_only(argc,argv,"filename:pageid:useapi:stubfile:verbose", optvalues, &optindex); |
531 | 538 | if (optc=='f') { |
532 | 539 | filename=optarg; |
533 | 540 | } |
— | — | @@ -540,6 +547,8 @@ |
541 | 548 | use_stub=1; |
542 | 549 | stubfile = optarg; |
543 | 550 | } |
| 551 | + else if (optc=='v') |
| 552 | + verbose++; |
544 | 553 | else if (optc==-1) break; |
545 | 554 | else usage(argv[0],"unknown option or other error\n"); |
546 | 555 | } |
— | — | @@ -570,7 +579,7 @@ |
571 | 580 | iinfo.right_end = file_size; |
572 | 581 | iinfo.value_wanted = page_id; |
573 | 582 | |
574 | | - res = get_first_page_id_after_offset(fin, (off_t)0, &pinfo, use_api, use_stub, stubfile); |
| 583 | + res = get_first_page_id_after_offset(fin, (off_t)0, &pinfo, use_api, use_stub, stubfile, verbose); |
575 | 584 | if (res > 0) { |
576 | 585 | iinfo.last_value = pinfo.page_id; |
577 | 586 | iinfo.last_position = (off_t)0; |
— | — | @@ -580,12 +589,13 @@ |
581 | 590 | exit(1); |
582 | 591 | } |
583 | 592 | if (pinfo.page_id == page_id) { |
584 | | - fprintf(stdout,"position:%"PRId64" page_id:%d\n",pinfo.position, pinfo.page_id); |
585 | | - exit(0); |
| 593 | + if (verbose) fprintf(stderr,"found the page id right away, no iterations needed.\n"); |
| 594 | + fprintf(stdout,"position:%"PRId64" page_id:%d\n",pinfo.position, pinfo.page_id); |
| 595 | + exit(0); |
586 | 596 | } |
587 | 597 | |
588 | 598 | while (1) { |
589 | | - res = do_iteration(&iinfo, fin, &pinfo, use_api, use_stub, stubfile); |
| 599 | + res = do_iteration(&iinfo, fin, &pinfo, use_api, use_stub, stubfile, verbose); |
590 | 600 | /* things to check: bad return? interval is 0 bytes long? */ |
591 | 601 | if (iinfo.left_end == iinfo.right_end) { |
592 | 602 | fprintf(stdout,"position:%"PRId64" page_id:%d\n",pinfo.position, pinfo.page_id); |