r107841 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r107840‎ | r107841 | r107842 >
Date:17:24, 2 January 2012
Author:ariel
Status:deferred
Tags:
Comment:
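add verbose option; also accept an optional <ns> element between <title> and <id> when matching page ids (the id capture group moves from 1 to 2), and list --stubfile and --verbose in the usage message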
Modified paths:
  • /branches/ariel/xmldumps-backup/mwbzutils/findpageidinbz2xml.c (modified) (history)

Diff [purge]

Index: branches/ariel/xmldumps-backup/mwbzutils/findpageidinbz2xml.c
@@ -226,12 +226,12 @@
227227 format:
228228 <?xml version="1.0"?><api><query><pages><page pageid="6215" ns="0" title="hystérique" /></pages></query></api>
229229 */
230 - match_page_id_expr = (regmatch_t *)malloc(sizeof(regmatch_t)*2);
 230+ match_page_id_expr = (regmatch_t *)malloc(sizeof(regmatch_t)*3);
231231 res = regcomp(&compiled_page_id_expr, page_id_expr, REG_EXTENDED);
232232
233 - if (regexec(&compiled_page_id_expr, buffer, 2, match_page_id_expr, 0 ) == 0) {
234 - if (match_page_id_expr[1].rm_so >=0) {
235 - page_id = atol(buffer + match_page_id_expr[1].rm_so);
 233+ if (regexec(&compiled_page_id_expr, buffer, 3, match_page_id_expr, 0 ) == 0) {
 234+ if (match_page_id_expr[2].rm_so >=0) {
 235+ page_id = atol(buffer + match_page_id_expr[2].rm_so);
236236 }
237237 }
238238 return(page_id);
@@ -250,13 +250,13 @@
251251 0 if no pageid found,
252252 -1 on error
253253 */
254 -int get_first_page_id_after_offset(int fin, off_t position, page_info_t *pinfo, int use_api, int use_stub, char *stubfilename) {
 254+int get_first_page_id_after_offset(int fin, off_t position, page_info_t *pinfo, int use_api, int use_stub, char *stubfilename, int verbose) {
255255 int res;
256256 regmatch_t *match_page, *match_page_id, *match_rev, *match_rev_id;
257257 regex_t compiled_page, compiled_page_id, compiled_rev, compiled_rev_id;
258258 int length=5000; /* output buffer size */
259259 char *page = "<page>";
260 - char *page_id = "<page>\n[ ]+<title>[^<]+</title>\n[ ]+<id>([0-9]+)</id>\n";
 260+ char *page_id = "<page>\n[ ]+<title>[^<]+</title>\n([ ]+<ns>[0-9]+</ns>\n)?[ ]+<id>([0-9]+)</id>\n";
261261 char *rev = "<revision>";
262262 char *rev_id_expr = "<revision>\n[ ]+<id>([0-9]+)</id>\n";
263263
@@ -275,7 +275,7 @@
276276 res = regcomp(&compiled_rev_id, rev_id_expr, REG_EXTENDED);
277277
278278 match_page = (regmatch_t *)malloc(sizeof(regmatch_t)*1);
279 - match_page_id = (regmatch_t *)malloc(sizeof(regmatch_t)*2);
 279+ match_page_id = (regmatch_t *)malloc(sizeof(regmatch_t)*3);
280280 match_rev = (regmatch_t *)malloc(sizeof(regmatch_t)*1);
281281 match_rev_id = (regmatch_t *)malloc(sizeof(regmatch_t)*2);
282282
@@ -288,21 +288,23 @@
289289 bfile.bytes_read = 0;
290290
291291 if (find_first_bz2_block_from_offset(&bfile, fin, position, FORWARD) <= (off_t)0) {
292 - /* fprintf(stderr,"failed to find block in bz2file (1)\n"); */
 292+ if (verbose) fprintf(stderr,"failed to find block in bz2file after offset %"PRId64" (1)\n", position);
293293 return(-1);
294294 }
295295
 296+ if (verbose) fprintf(stderr,"found first block in bz2file after offset %"PRId64"\n", position);
 297+
296298 while (!get_buffer_of_uncompressed_data(b, fin, &bfile, FORWARD) && (! bfile.eof)) {
297299 buffer_count++;
 300+ if (verbose >=2) fprintf(stderr,"buffers read: %d\n", buffer_count);
298301 if (bfile.bytes_written) {
299 - while (regexec(&compiled_page_id, (char *)b->next_to_read, 2, match_page_id, 0 ) == 0) {
300 - if (match_page_id[1].rm_so >=0) {
301 - /* write page_id to stderr */
302 - /*
303 - fwrite(b->next_to_read+match_page_id[1].rm_so, sizeof(unsigned char), match_page_id[1].rm_eo - match_page_id[1].rm_so, stderr);
 302+ while (regexec(&compiled_page_id, (char *)b->next_to_read, 3, match_page_id, 0 ) == 0) {
 303+ if (match_page_id[2].rm_so >=0) {
 304+ if (verbose){
 305+ fwrite(b->next_to_read+match_page_id[2].rm_so, sizeof(unsigned char), match_page_id[2].rm_eo - match_page_id[2].rm_so, stderr);
304306 fwrite("\n",1,1,stderr);
305 - */
306 - pinfo->page_id = atoi((char *)(b->next_to_read+match_page_id[1].rm_so));
 307+ }
 308+ pinfo->page_id = atoi((char *)(b->next_to_read+match_page_id[2].rm_so));
307309 pinfo->position = bfile.block_start;
308310 pinfo->bits_shifted = bfile.bits_shifted;
309311 return(1);
@@ -337,6 +339,7 @@
338340 hopefully that doesn't take forever.
339341 */
340342 if (buffer_count>(20000000/BUFINSIZE) && rev_id) {
 343+ if (verbose) fprintf(stderr, "passed cutoff for using api\n");
341344 if (use_api) {
342345 page_id_found = get_page_id_from_rev_id_via_api(rev_id, fin);
343346 }
@@ -420,7 +423,7 @@
421424
422425 return value from guess, or -1 on error.
423426 */
424 -int do_iteration(iter_info_t *iinfo, int fin, page_info_t *pinfo, int use_api, int use_stub, char *stubfilename) {
 427+int do_iteration(iter_info_t *iinfo, int fin, page_info_t *pinfo, int use_api, int use_stub, char *stubfilename, int verbose) {
425428 int res;
426429 off_t new_position;
427430 off_t interval;
@@ -434,7 +437,8 @@
435438 if (interval == (off_t)0) {
436439 interval = (off_t)1;
437440 }
438 - /* fprintf(stderr,"interval size is %"PRId64", left end %"PRId64", right end %"PRId64", last val %d\n",interval, iinfo->left_end, iinfo->right_end, iinfo->last_value); */
 441+ if (verbose)
 442+ fprintf(stderr,"interval size is %"PRId64", left end %"PRId64", right end %"PRId64", last val %d\n",interval, iinfo->left_end, iinfo->right_end, iinfo->last_value);
439443 /* if we're this close, we'll check this value and be done with it */
440444 if (iinfo->right_end -iinfo->left_end < (off_t)2) {
441445 new_position = iinfo->left_end;
@@ -442,18 +446,18 @@
443447 }
444448 else {
445449 if (iinfo->last_value < iinfo->value_wanted) {
446 - /* fprintf(stderr,"resetting left end\n"); */
 450+ if (verbose >=2) fprintf(stderr,"resetting left end\n");
447451 iinfo->left_end = iinfo->last_position;
448452 new_position = iinfo->last_position + interval;
449453 }
450454 /* iinfo->last_value > iinfo->value_wanted */
451455 else {
452 - /* fprintf(stderr,"resetting right end\n"); */
 456+ if (verbose >=2) fprintf(stderr,"resetting right end\n");
453457 iinfo->right_end = iinfo->last_position;
454458 new_position = iinfo->last_position - interval;
455459 }
456460 }
457 - res = get_first_page_id_after_offset(fin, new_position, pinfo, use_api, use_stub, stubfilename);
 461+ res = get_first_page_id_after_offset(fin, new_position, pinfo, use_api, use_stub, stubfilename, verbose);
458462 if (res >0) {
459463 /* caller wants the new value */
460464 iinfo->last_value = pinfo->page_id;
@@ -470,17 +474,18 @@
471475 }
472476 /* in theory we were moving towards beginning of file, should not have issues, so bail here */
473477 else {
474 - /* fprintf(stderr,"something very broken, giving up\n"); */
 478+ if (verbose) fprintf(stderr,"something very broken, giving up\n");
475479 return(-1);
476480 }
477481 }
478482 }
479483
 484+
480485 void usage(char *whoami, char *message) {
481486 if (message) {
482487 fprintf(stderr,message);
483488 }
484 - fprintf(stderr,"usage: %s --filename file --pageid id [--useapi]\n", whoami);
 489+ fprintf(stderr,"usage: %s --filename file --pageid id [--stubfile] [--useapi] [--verbose]\n", whoami);
485490 exit(1);
486491 }
487492
@@ -514,6 +519,7 @@
515520 int optindex=0;
516521 int use_api = 0;
517522 int use_stub = 0;
 523+ int verbose = 0;
518524 int optc;
519525 char *stubfile=NULL;
520526
@@ -521,12 +527,13 @@
522528 {"filename", 1, 0, 'f'},
523529 {"pageid", 1, 0, 'p'},
524530 {"useapi", 0, 0, 'a'},
 531+ {"verbose", 0, 0, 'v'},
525532 {"stubfile", 1, 0, 's'},
526533 {NULL, 0, NULL, 0}
527534 };
528535
529536 while (1) {
530 - optc=getopt_long_only(argc,argv,"filename:pageid:useapi:stubfile", optvalues, &optindex);
 537+ optc=getopt_long_only(argc,argv,"filename:pageid:useapi:stubfile:verbose", optvalues, &optindex);
531538 if (optc=='f') {
532539 filename=optarg;
533540 }
@@ -540,6 +547,8 @@
541548 use_stub=1;
542549 stubfile = optarg;
543550 }
 551+ else if (optc=='v')
 552+ verbose++;
544553 else if (optc==-1) break;
545554 else usage(argv[0],"unknown option or other error\n");
546555 }
@@ -570,7 +579,7 @@
571580 iinfo.right_end = file_size;
572581 iinfo.value_wanted = page_id;
573582
574 - res = get_first_page_id_after_offset(fin, (off_t)0, &pinfo, use_api, use_stub, stubfile);
 583+ res = get_first_page_id_after_offset(fin, (off_t)0, &pinfo, use_api, use_stub, stubfile, verbose);
575584 if (res > 0) {
576585 iinfo.last_value = pinfo.page_id;
577586 iinfo.last_position = (off_t)0;
@@ -580,12 +589,13 @@
581590 exit(1);
582591 }
583592 if (pinfo.page_id == page_id) {
584 - fprintf(stdout,"position:%"PRId64" page_id:%d\n",pinfo.position, pinfo.page_id);
585 - exit(0);
 593+ if (verbose) fprintf(stderr,"found the page id right away, no iterations needed.\n");
 594+ fprintf(stdout,"position:%"PRId64" page_id:%d\n",pinfo.position, pinfo.page_id);
 595+ exit(0);
586596 }
587597
588598 while (1) {
589 - res = do_iteration(&iinfo, fin, &pinfo, use_api, use_stub, stubfile);
 599+ res = do_iteration(&iinfo, fin, &pinfo, use_api, use_stub, stubfile, verbose);
590600 /* things to check: bad return? interval is 0 bytes long? */
591601 if (iinfo.left_end == iinfo.right_end) {
592602 fprintf(stdout,"position:%"PRId64" page_id:%d\n",pinfo.position, pinfo.page_id);

Status & tagging log