r91454 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r91453‎ | r91454 | r91455 >
Date:13:38, 5 July 2011
Author:ariel
Status:deferred
Tags:
Comment:
locate offset of bz2block in bz2 file with specified pageid (or slightly before it)
Modified paths:
  • /branches/ariel/xmldumps-backup/findpageidinbz2xml.c (added) (history)
  • /branches/ariel/xmldumps-backup/findpageidinbz2xml.h (added) (history)

Diff [purge]

Index: branches/ariel/xmldumps-backup/findpageidinbz2xml.c
@@ -0,0 +1,842 @@
 2+#include <unistd.h>
 3+#include <stdio.h>
 4+#include <string.h>
 5+#include <sys/types.h>
 6+#include <sys/stat.h>
 7+#include <fcntl.h>
 8+#include <stdlib.h>
 9+#include <errno.h>
 10+#include <sys/types.h>
 11+#include <regex.h>
 12+#include "bzlib.h"
 13+#include "findpageidinbz2xml.h"
 14+
 15+/* return n ones either at left or right end */
 16+int bitmask(int numbits, int end) {
 17+ if (end == MASKRIGHT) {
 18+ return((1<<numbits)-1);
 19+ }
 20+ else {
 21+ return(((1<<numbits)-1) << (8-numbits));
 22+ }
 23+}
 24+
 25+void shift_bytes_left(unsigned char *buffer, int buflen, int numbits) {
 26+ int i;
 27+
 28+ if (numbits == 0) {
 29+ return;
 30+ }
 31+
 32+ for (i=0; i<buflen; i++) {
 33+ /* left 1 */
 34+ buffer[i] = (unsigned char) ((int) (buffer[i]) << numbits);
 35+
 36+ /* grab leftmost from next byte */
 37+ if (i < buflen-1) {
 38+ buffer[i] = ( unsigned char ) ( (unsigned int) buffer[i] | ( ( ((unsigned int) buffer[i+1]) & bitmask(numbits,MASKLEFT) ) >> (8-numbits) ) );
 39+ }
 40+ }
 41+}
 42+
 43+void shift_bytes_right(unsigned char *buffer, int buflen, int numbits) {
 44+ int i;
 45+
 46+ for (i=buflen-1; i>=0; i--) {
 47+ /* right 1 */
 48+ buffer[i] = (unsigned char) ((int) (buffer[i]) >> numbits);
 49+
 50+ /* grab rightmost from prev byte */
 51+ if (i > 0) {
 52+ buffer[i] = ( unsigned char ) ((unsigned int) buffer[i] | ( ((unsigned int) (buffer[i-1])<<(8-numbits)) & bitmask(numbits,MASKLEFT)));
 53+ }
 54+ }
 55+}
 56+
 57+unsigned char ** init_marker() {
 58+ unsigned char **marker = malloc(8*sizeof(unsigned char *));
 59+ int i;
 60+
 61+ /* set up block marker plus its various right-shifted incarnations */
 62+ for (i = 0; i< 8; i++) {
 63+ marker[i] = malloc(sizeof(unsigned char)*7);
 64+ }
 65+ marker[0][0]= (unsigned char) 0x31;
 66+ marker[0][1]= (unsigned char) 0x41;
 67+ marker[0][2]= (unsigned char) 0x59;
 68+ marker[0][3]= (unsigned char) 0x26;
 69+ marker[0][4]= (unsigned char) 0x53;
 70+ marker[0][5]= (unsigned char) 0x59;
 71+ marker[0][6]= (unsigned char) 0x00;
 72+ for (i = 1; i< 8; i++) {
 73+ memcpy((char *)(marker[i]), (char *)(marker[i-1]),7);
 74+ shift_bytes_right(marker[i],7,1);
 75+ }
 76+ return(marker);
 77+}
 78+
 79+/* buff1 is some random bytes, buff2 is some random bytes which we expect to start with the contents of buff1,
 80+ both buffers are bit-shifted to the right "bitsrightshifted". this function compares the two and returns 1 if buff2
 81+ matches and 0 otherwise. */
 82+int bytes_compare(unsigned char *buff1, unsigned char *buff2, int numbytes, int bitsrightshifted) {
 83+ int i;
 84+
 85+ if (bitsrightshifted == 0) {
 86+ for (i = 0; i< numbytes; i++) {
 87+ if (buff1[i] != buff2[i]) {
 88+ return(1);
 89+ }
 90+ }
 91+ return(0);
 92+ }
 93+ else {
 94+ for (i = 1; i< numbytes-2; i++) {
 95+ if (buff1[i] != buff2[i]) {
 96+ return(1);
 97+ }
 98+ }
 99+ /* do leftmost byte */
 100+ if ((buff1[0] & bitmask(8-bitsrightshifted,MASKRIGHT)) != (buff2[0] & bitmask(8-bitsrightshifted,MASKRIGHT)) ) {
 101+ return(1);
 102+ }
 103+ /* do rightmost byte */
 104+ if ((buff1[numbytes-1] & bitmask(bitsrightshifted,MASKLEFT)) != (buff2[numbytes-1] & bitmask(bitsrightshifted,MASKLEFT)) ) {
 105+ return(1);
 106+ }
 107+ return(0);
 108+ }
 109+}
 110+
 111+
 112+/* return -1 if no match
 113+ return number of bits rightshifted otherwise */
 114+int check_buffer_for_bz2_block_marker(bz_info_t *bfile) {
 115+ int result, i;
 116+
 117+ result = bytes_compare(bfile->marker[0],bfile->marker_buffer+1,6,0);
 118+ if (!result) {
 119+ return(0);
 120+ }
 121+ for (i=1; i<8; i++) {
 122+ result = bytes_compare(bfile->marker[i],bfile->marker_buffer,7,i);
 123+ if (!result) {
 124+ return(i);
 125+ }
 126+ }
 127+ return(-1);
 128+}
 129+
 130+
 131+/* return: 1 if found, 0 if not, -1 on error */
 132+int find_next_bz2_block_marker(int fin, bz_info_t *bfile) {
 133+ int result;
 134+
 135+ bfile->bits_shifted = -1;
 136+ result = read(fin, bfile->marker_buffer, 7);
 137+ if (result < 0) {
 138+ /* fprintf(stderr,"read of file failed\n"); */
 139+ return(-1);
 140+ }
 141+ /* must be after 4 byte file header, and we add a leftmost byte to the buffer
 142+ of data read in case some bits have been shifted into it */
 143+ while (bfile->position <= bfile->file_size - 6 && bfile->bits_shifted < 0) {
 144+ bfile->bits_shifted = check_buffer_for_bz2_block_marker(bfile);
 145+ if (bfile->bits_shifted < 0) {
 146+ bfile->position++;
 147+ result = lseek(fin, (bfile->position), SEEK_SET);
 148+ if (result == -1) {
 149+ fprintf(stderr,"lseek of file to %ld failed (2)\n",(long int) bfile->position);
 150+ return(-1);
 151+ }
 152+ result = read(fin, bfile->marker_buffer, 7);
 153+ if (result < 7) {
 154+ /* fprintf(stderr,"read of file failed\n"); */
 155+ return(-1);
 156+ }
 157+ }
 158+ else {
 159+ bfile->block_start = bfile->position;
 160+ return(1);
 161+ }
 162+ }
 163+ return(0);
 164+}
 165+
 166+/*
 167+ initializes the bz2 strm structure,
 168+ calls the BZ2 decompression library initializer
 169+
 170+ returns:
 171+ BZ_OK on success
 172+ various BZ_ errors on failure (see bzlib.h)
 173+*/
 174+int init_decompress(bz_info_t *bfile) {
 175+ int bz_verbosity = 0;
 176+ int bz_small = 0;
 177+ int ret;
 178+
 179+ bfile->strm.bzalloc = NULL;
 180+ bfile->strm.bzfree = NULL;
 181+ bfile->strm.opaque = NULL;
 182+
 183+ ret = BZ2_bzDecompressInit ( &(bfile->strm), bz_verbosity, bz_small );
 184+ if (ret != BZ_OK) {
 185+ fprintf(stderr,"uncompress failed, err %d\n", ret);
 186+ exit(-1);
 187+ }
 188+ return(ret);
 189+}
 190+
 191+/*
 192+ reads the first 4 bytes from a bz2 file (should be
 193+ "BZh" followed by the block size indicator, typically "9")
 194+ and passes them into the BZ2 decompression library.
 195+ This must be done before decompression of any block of the
 196+ file is attempted.
 197+
 198+ returns:
 199+ BZ_OK if successful,
 200+ various BZ_ errors on failure (see bzlib.h)
 201+*/
 202+int decompress_header(int fin, bz_info_t *bfile) {
 203+ int ret, res;
 204+
 205+ res = lseek(fin,0,SEEK_SET);
 206+ if (res < 0) {
 207+ fprintf(stderr,"lseek of file to 0 failed (3)\n");
 208+ exit(-1);
 209+ }
 210+ bfile->bytes_read = read(fin, bfile->header_buffer, 4);
 211+ if (bfile->bytes_read < 4) {
 212+ fprintf(stderr,"failed to read 4 bytes of header, exiting\n");
 213+ exit(-1);
 214+ }
 215+ bfile->strm.next_in = (char *)bfile->header_buffer;
 216+ bfile->strm.avail_in = 4;
 217+
 218+ ret = BZ2_bzDecompress ( &(bfile->strm) );
 219+ if (BZ_OK != ret && BZ_STREAM_END != ret) {
 220+ fprintf(stderr,"Corrupt bzip2 header, exiting\n");
 221+ exit(-1);
 222+ }
 223+ return(ret);
 224+}
 225+
 226+/*
 227+ seek to appropriate offset as specified in bfile,
 228+ read compressed data into buffer indicated by bfile,
 229+ update the bfile structure accordingly,
 230+ save the overflow byte (bit-shifted data = suck)
 231+ this is for the *first* buffer of data in a stream,
 232+ for subsequent buffers use fill_buffer_to_decompress()
 233+
 234+ this will set bfile->eof on eof. no other indicator
 235+ will be provided.
 236+
 237+ returns:
 238+ 0 on success
 239+ -1 on error
 240+*/
 241+int setup_first_buffer_to_decompress(int fin, bz_info_t *bfile) {
 242+ int res;
 243+
 244+ if (bfile->bits_shifted == 0) {
 245+ res = lseek(fin,bfile->position+1,SEEK_SET);
 246+ if (res < 0) {
 247+ fprintf(stderr,"lseek of file to %ld failed (4)\n",(long int) bfile->position+1);
 248+ return(-1);
 249+ }
 250+ }
 251+ else {
 252+ res = lseek(fin,bfile->position,SEEK_SET);
 253+ if (res < 0) {
 254+ fprintf(stderr,"lseek of file to %ld failed (5)\n",(long int) bfile->position);
 255+ return(-1);
 256+ }
 257+ }
 258+ bfile->bytes_read = read(fin, bfile->bufin, bfile->bufin_size);
 259+ if (bfile->bytes_read > 0) {
 260+ bfile->overflow = bfile->bufin[bfile->bytes_read-1];
 261+ shift_bytes_left(bfile->bufin, bfile->bytes_read, bfile->bits_shifted);
 262+
 263+ bfile->strm.next_in = (char *)(bfile->bufin);
 264+ bfile->strm.avail_in = bfile->bytes_read-1;
 265+ }
 266+ if (bfile->bytes_read <=0) {
 267+ bfile->eof++;
 268+ }
 269+ return(0);
 270+}
 271+
 272+/*
 273+ read compressed data into buffer indicated by bfile,
 274+ from current position of file,
 275+ stuffing the overflow byte in first.
 276+ update the bfile structure accordingly
 277+ save the new overflow byte (bit-shifted data = suck)
 278+ this function is for decompression of buffers *after
 279+ the first one*. for the first one use
 280+ setup_first_buffer_to_decompress()
 281+
 282+ this will set bfile->eof on eof. no other indicator
 283+ will be provided.
 284+
 285+ returns:
 286+ 0 on success
 287+ hmm, it really does not do anything about errors :-D
 288+*/
 289+int fill_buffer_to_decompress(int fin, bz_info_t *bfile, int ret) {
 290+ if (bfile->strm.avail_in == 0) {
 291+ bfile->strm.next_in = (char *)(bfile->bufin);
 292+ bfile->bufin[0] = bfile->overflow;
 293+ bfile->bytes_read = read(fin, bfile->bufin+1, bfile->bufin_size-1);
 294+ if (bfile->bytes_read > 0) {
 295+ bfile->overflow = bfile->bufin[bfile->bytes_read];
 296+ shift_bytes_left(bfile->bufin,bfile->bytes_read+1,bfile->bits_shifted);
 297+ bfile->strm.avail_in = bfile->bytes_read;
 298+ bfile->position+=bfile->bytes_read;
 299+ }
 300+ else {
 301+ bfile->strm.avail_in = 1; /* the overflow byte */
 302+ bfile->eof++;
 303+ }
 304+ }
 305+ return(0);
 306+}
 307+
 308+/* size of buffer is bytes usable. there will be a null byte at the end
 309+
 310+ what we do with the buffer:
 311+ - read from front of buffer to end,
 312+ - fill from point where prev read did not fill buffer, or from where
 313+ move of data at end of buffer to beginning left room,
 314+ - mark a string of bytes (starting from what's available to read) as "read"
 315+
 316+*/
 317+buf_info_t *init_buffer(int size) {
 318+ buf_info_t *b;
 319+
 320+ b = (buf_info_t *)malloc(sizeof(buf_info_t));
 321+ b->buffer = malloc(sizeof(unsigned char)*(size+1));
 322+ b->buffer[size]='\0';
 323+ b->end = b->buffer + size;
 324+ b->next_to_read = b->end; /* nothing available */
 325+ b->bytes_avail = 0; /* bytes to read, nothing available */
 326+ b->next_to_fill = b->buffer; /* empty */
 327+ b->next_to_fill[0] = '\0';
 328+ return(b);
 329+}
 330+
 331+/* check if buffer (used for decompressed data output) is empty,
 332+ returns 1 if so and 0 if not */
 333+int buffer_is_empty(buf_info_t *b) {
 334+ if (b->bytes_avail == 0) {
 335+ return(1);
 336+ }
 337+ else {
 338+ return(0);
 339+ }
 340+}
 341+
 342+/* check if buffer (used for decompressed data output) is full,
 343+
 344+ returns 1 if so and 0 if not
 345+ I'm not liking this function so well, fixme */
 346+int buffer_is_full(buf_info_t *b) {
 347+ if (b->next_to_fill == b->end) {
 348+ return(1);
 349+ }
 350+ else {
 351+ return(0);
 352+ }
 353+}
 354+
 355+/* FIXME do this right. whatever. */
 356+int get_file_size(int fin) {
 357+ int res;
 358+
 359+ res = lseek(fin, 0, SEEK_END);
 360+ if (res < 0) {
 361+ fprintf(stderr,"lseek of file to 0 failed (6)\n");
 362+ exit(-1);
 363+ }
 364+ return(res);
 365+}
 366+
 367+
 368+/*
 369+ look for the first bz2 block in the file after specified offset
 370+ it tests that the block is valid by doing partial decompression.
 371+ this function will update the bfile structure:
 372+ bfile->position will contain the current position of the file (? will it?)
 373+ bfile->bits_shifted will contain the number of bits that the block is rightshifted
 374+ bfile->block_start will contain the offset from start of file to the block
 375+ returns:
 376+ position of next byte in file to be read, on success
 377+ -1 if no marker or other error
 378+*/
 379+int find_first_bz2_block_after_offset(bz_info_t *bfile, int fin, int position) {
 380+ int res;
 381+
 382+ bfile->bufin_size = BUFINSIZE;
 383+ bfile->marker = init_marker();
 384+ bfile->position = position;
 385+ bfile->block_start = -1;
 386+ bfile->bytes_read = 0;
 387+ bfile->bytes_written = 0;
 388+ bfile->eof = 0;
 389+ bfile->bits_shifted = -1;
 390+
 391+ bfile->file_size = get_file_size(fin);
 392+
 393+ while (bfile->bits_shifted < 0) {
 394+ if (bfile->position > bfile->file_size) {
 395+ return(-1);
 396+ }
 397+ res = lseek(fin, bfile->position, SEEK_SET);
 398+ if (res < 0) {
 399+ fprintf(stderr,"lseek of file to %ld failed (7)\n",(long int) bfile->position);
 400+ exit(-1);
 401+ }
 402+ res = find_next_bz2_block_marker(fin, bfile);
 403+ if (res == 1) {
 404+ init_decompress(bfile);
 405+ decompress_header(fin, bfile);
 406+ res = setup_first_buffer_to_decompress(fin, bfile);
 407+ if (res == -1) {
 408+ fprintf(stderr,"couldn't get first buffer of data to uncompress\n");
 409+ exit(-1);
 410+ }
 411+ bfile->strm.next_out = (char *)bfile->bufout;
 412+ bfile->strm.avail_out = bfile->bufout_size;
 413+ res = BZ2_bzDecompress ( &(bfile->strm) );
 414+ /* this means we (probably) have a genuine marker */
 415+ if (BZ_OK == res || BZ_STREAM_END == res) {
 416+ res = BZ2_bzDecompressEnd ( &(bfile->strm) );
 417+ bfile->bytes_read = 0;
 418+ bfile->bytes_written = 0;
 419+ bfile->eof = 0;
 420+ /* leave the file at the right position */
 421+ res = lseek(fin, bfile->block_start, SEEK_SET);
 422+ if (res < 0) {
 423+ fprintf(stderr,"lseek of file to %ld failed (7)\n",(long int) bfile->position);
 424+ exit(-1);
 425+ }
 426+ return(0);
 427+ }
 428+ /* right bytes, but there by chance, skip and try again */
 429+ else {
 430+ bfile->position+=6;
 431+ bfile->bits_shifted = -1;
 432+ bfile->block_start = -1;
 433+ }
 434+ }
 435+ else {
 436+ return(-1);
 437+ }
 438+ }
 439+ return(-1);
 440+}
 441+
 442+/*
 443+ find the first bz2 block marker in the file,
 444+ from its current position,
 445+ then set up for decompression from that point
 446+ returns:
 447+ 0 on success
 448+ -1 if no marker or other error
 449+*/
 450+int init_bz2_file(bz_info_t *bfile, int fin) {
 451+ int res;
 452+
 453+ bfile->initialized++;
 454+
 455+ res = find_next_bz2_block_marker(fin, bfile);
 456+ if (res ==1) {
 457+ init_decompress(bfile);
 458+ decompress_header(fin, bfile);
 459+ setup_first_buffer_to_decompress(fin, bfile);
 460+ return(0);
 461+ }
 462+ return(-1);
 463+}
 464+
 465+/* return -1 if error */
 466+int decompress_data(bz_info_t *bfile, int fin, unsigned char *bufferout, int bufout_size) {
 467+ int ret;
 468+
 469+ bfile->bufout = bufferout;
 470+ bfile->bufout_size = bufout_size;
 471+ bfile->bytes_written = 0;
 472+
 473+ if (! bfile->initialized) {
 474+ if (init_bz2_file(bfile, fin) == -1) {
 475+ /* fprintf(stderr,"failed to find block in bz2file (2)\n"); */
 476+ return(-1);
 477+ };
 478+ bfile->strm.next_out = (char *)bfile->bufout;
 479+ bfile->strm.avail_out = bfile->bufout_size;
 480+ }
 481+
 482+ ret = BZ_OK;
 483+ while (BZ_OK == ret && bfile->bytes_written == 0) {
 484+ ret = BZ2_bzDecompress ( &(bfile->strm) );
 485+ if (BZ_OK == ret || BZ_STREAM_END == ret) {
 486+ bfile->bytes_written = (unsigned char *)(bfile->strm.next_out) - bfile->bufout;
 487+ }
 488+ else {
 489+ /* fprintf(stderr,"error from BZ decompress %d\n",ret); */
 490+ return(-1);
 491+ }
 492+ fill_buffer_to_decompress(fin, bfile, ret);
 493+ /*
 494+ if (bfile->eof && (BZ_OK == ret || BZ_STREAM_END == ret) ) {
 495+ fprintf(stderr,"eof reached\n");
 496+ }
 497+ */
 498+ }
 499+ return(0);
 500+}
 501+
 502+
 503+/*
 504+ fill output buffer in b with uncompressed data from bfile
 505+ if this is the first call to the function for this file,
 506+ the file header will be read, and the first buffer of
 507+ uncompressed data will be prepared. bfile->position
 508+ should be set to the offset (from the beginning of file) from
 509+ which to find the first bz2 block.
 510+
 511+ returns:
 512+ on success, number of bytes read (may be 0)
 513+ -1 on error
 514+*/
 515+int get_buffer_of_uncompressed_data(buf_info_t *b, int fin, bz_info_t *bfile) {
 516+ int res;
 517+
 518+ if (buffer_is_full(b)) {
 519+ return(0);
 520+ }
 521+
 522+ if (buffer_is_empty(b)) {
 523+ b->next_to_fill = b->buffer;
 524+ }
 525+
 526+ res = decompress_data(bfile, fin, b->next_to_fill, b->end - b->next_to_fill);
 527+ if (res == -1) {
 528+ return(res);
 529+ }
 530+ if (bfile->bytes_written < 0) {
 531+ /* fprintf(stderr,"read of file failed\n"); */
 532+ return(-1);
 533+ }
 534+ else {
 535+ /* really?? FIXME check this */
 536+ if (buffer_is_empty(b)) {
 537+ b->next_to_read = b->next_to_fill; /* where we just read */
 538+ }
 539+ b->bytes_avail += bfile->bytes_written;
 540+ b->next_to_fill += bfile->bytes_written;
 541+ b->next_to_fill[0] = '\0';
 542+ return(0);
 543+ }
 544+}
 545+
 546+void dumpbuf_info_t(buf_info_t *b) {
 547+ fprintf(stdout, "\n");
 548+ fprintf(stdout, "b->buffer: %ld\n", (long int) b->buffer);
 549+ fprintf(stdout, "b->end: %ld\n", (long int) b->end);
 550+ fprintf(stdout, "b->next_to_read: %ld\n", (long int) b->next_to_read);
 551+ fprintf(stdout, "b->next_to_fill: %ld\n", (long int) b->next_to_fill);
 552+ fprintf(stdout, "b->bytes_avail: %ld\n", (long int) b->bytes_avail);
 553+}
 554+
 555+
 556+/*
 557+ copy text from end of buffer to the beginning, that we want to keep
 558+ around for further processing (i.e. further regex matches)
 559+ returns number of bytes copied
 560+*/
 561+int move_bytes_to_buffer_start(buf_info_t *b, unsigned char *from_where, int maxbytes) {
 562+ int i, tocopy;
 563+
 564+ if (from_where >= b->end) {
 565+ return(0);
 566+ }
 567+ else {
 568+ tocopy = b->end - from_where;
 569+ if (maxbytes && (tocopy > maxbytes)) {
 570+ tocopy = maxbytes;
 571+ }
 572+ for (i = 0; i < tocopy; i++) {
 573+ b->buffer[i] = from_where[i];
 574+ }
 575+ b->next_to_fill = b->buffer + tocopy;
 576+ b->next_to_fill[0] = '\0';
 577+ b->next_to_read = b->buffer;
 578+ b->bytes_avail = tocopy;
 579+ return(tocopy);
 580+ }
 581+}
 582+
 583+/*
 584+ get the first page id after position in file
 585+ if a pageid is found, the structure pinfo will be updated accordingly
 586+ returns:
 587+ 1 if a pageid found,
 588+ 0 if no pageid found,
 589+ -1 on error
 590+*/
 591+int get_first_page_id_after_offset(int fin, int position, page_info_t *pinfo) {
 592+ int res;
 593+ regmatch_t *match_page, *match_page_id;
 594+ regex_t compiled_page, compiled_page_id;
 595+ int length=5000; /* output buffer size */
 596+ char *page = "<page>";
 597+ char *page_id = "<page>\n[ ]+<title>[^<]+</title>\n[ ]+<id>([0-9]+)</id>\n";
 598+
 599+ buf_info_t *b;
 600+ bz_info_t bfile;
 601+
 602+ bfile.initialized = 0;
 603+
 604+ res = regcomp(&compiled_page, page, REG_EXTENDED);
 605+ res = regcomp(&compiled_page_id, page_id, REG_EXTENDED);
 606+
 607+ match_page = (regmatch_t *)malloc(sizeof(regmatch_t)*1);
 608+ match_page_id = (regmatch_t *)malloc(sizeof(regmatch_t)*2);
 609+
 610+ b = init_buffer(length);
 611+
 612+ pinfo->bits_shifted = -1;
 613+ pinfo->position = -1;
 614+ pinfo->page_id = -1;
 615+
 616+ bfile.bytes_read = 0;
 617+
 618+ if (find_first_bz2_block_after_offset(&bfile, fin, position) == -1) {
 619+ /* fprintf(stderr,"failed to find block in bz2file (1)\n"); */
 620+ return(-1);
 621+ }
 622+
 623+ while (!get_buffer_of_uncompressed_data(b, fin, &bfile) && (! bfile.eof)) {
 624+ if (bfile.bytes_read) {
 625+ while (regexec(&compiled_page_id, (char *)b->next_to_read, 2, match_page_id, 0 ) == 0) {
 626+ if (match_page_id[1].rm_so >=0) {
 627+ /* write page_id to stderr */
 628+ /*
 629+ fwrite(b->next_to_read+match_page_id[1].rm_so, sizeof(unsigned char), match_page_id[1].rm_eo - match_page_id[1].rm_so, stderr);
 630+ fwrite("\n",1,1,stderr);
 631+ */
 632+ pinfo->page_id = atoi((char *)(b->next_to_read+match_page_id[1].rm_so));
 633+ pinfo->position = bfile.block_start;
 634+ pinfo->bits_shifted = bfile.bits_shifted;
 635+ return(1);
 636+ /* write up to and including page id tag to stdout */
 637+ /*
 638+ fwrite(b->next_to_read,match_page_id[0].rm_eo,1,stdout);
 639+ b->next_to_read = b->next_to_read+match_page_id[0].rm_eo;
 640+ b->bytes_avail -= match_page_id[0].rm_eo;
 641+ */
 642+ }
 643+ else {
 644+ /* should never happen */
 645+ fprintf(stderr,"regex gone bad...\n");
 646+ exit(-1);
 647+ }
 648+ }
 649+ if (regexec(&compiled_page, (char *)b->next_to_read, 1, match_page, 0 ) == 0) {
 650+ /* write everything up to but not including the page tag to stdout */
 651+ /*
 652+ fwrite(b->next_to_read,match_page[0].rm_eo - 6,1,stdout);
 653+ */
 654+ move_bytes_to_buffer_start(b, b->next_to_read + match_page[0].rm_so, b->bytes_avail - match_page[0].rm_so);
 655+ bfile.strm.next_out = (char *)b->next_to_fill;
 656+ bfile.strm.avail_out = b->end - b->next_to_fill;
 657+ }
 658+ else {
 659+ /* could have the first part of the page tag... so copy up enough bytes to cover that case */
 660+ if (b->bytes_avail> 5) {
 661+ /* write everything that didn't match, but leave 5 bytes, to stdout */
 662+ /*
 663+ fwrite(b->next_to_read,b->bytes_avail - 5,1,stdout);
 664+ */
 665+ move_bytes_to_buffer_start(b, b->next_to_read + b->bytes_avail - 5, 5);
 666+ bfile.strm.next_out = (char *)b->next_to_fill;
 667+ bfile.strm.avail_out = b->end - b->next_to_fill;
 668+ }
 669+ else {
 670+ if (buffer_is_empty(b)) {
 671+ bfile.strm.next_out = (char *)b->buffer;
 672+ bfile.strm.avail_out = bfile.bufout_size;
 673+ b->next_to_fill = b->buffer; /* empty */
 674+ }
 675+ else {
 676+ /* there were only 5 or less bytes so just save em don't write em to stdout */
 677+ move_bytes_to_buffer_start(b, b->next_to_read, b->bytes_avail);
 678+ bfile.strm.next_out = (char *)b->next_to_fill;
 679+ bfile.strm.avail_out = b->end - b->next_to_fill;
 680+ }
 681+ }
 682+ }
 683+ }
 684+ }
 685+ /*
 686+ if (b->bytes_avail) {
 687+ fwrite(b->next_to_read,b->bytes_avail,1,stdout);
 688+ }
 689+ */
 690+ return(0);
 691+}
 692+
 693+/* search for pageid in a bz2 file, given start and end offsets
 694+ to search for
 695+ we guess by the most boring method possible (shrink the
 696+ interval according to the value found on the last guess,
 697+ try midpoint of the new interval)
 698+ multiple calls of this will get the job done.
 699+ interval has left end = right end if search is complete.
 700+ this function may return the previous guess and simply
 701+ shrink the interval.
 702+ note that a "match" means either that the pageid we find
 703+ is smaller than the one the caller wants, or is equal.
 704+ why? because then we can use the output for prefetch
 705+ for xml dumps and be sure a specific page range is covered :-P
 706+
 707+ return value from guess, or -1 on error.
 708+ */
 709+int do_iteration(iter_info_t *iinfo, int fin, page_info_t *pinfo) {
 710+ int res;
 711+ int new_position;
 712+ int interval;
 713+
 714+ /*
 715+ last_position is somewhere in the interval, perhaps at an end
 716+ last_value is the value we had at that position
 717+ */
 718+
 719+ interval = (iinfo->right_end - iinfo->left_end)/2;
 720+ if (interval == 0) {
 721+ interval = 1;
 722+ }
 723+ /* fprintf(stderr,"interval size is %ld, left end %ld, right end %ld, last val %d\n",interval, iinfo->left_end, iinfo->right_end, iinfo->last_value); */
 724+ /* if we're this close, we'll check this value and be done with it */
 725+ if (iinfo->right_end -iinfo->left_end < 2) {
 726+ new_position = iinfo->left_end;
 727+ iinfo->right_end = iinfo->left_end;
 728+ }
 729+ else {
 730+ if (iinfo->last_value < iinfo->value_wanted) {
 731+ /* fprintf(stderr,"resetting left end\n"); */
 732+ iinfo->left_end = iinfo->last_position;
 733+ new_position = iinfo->last_position + interval;
 734+ }
 735+ /* iinfo->last_value > iinfo->value_wanted */
 736+ else {
 737+ /* fprintf(stderr,"resetting right end\n"); */
 738+ iinfo->right_end = iinfo->last_position;
 739+ new_position = iinfo->last_position - interval;
 740+ }
 741+ }
 742+ res = get_first_page_id_after_offset(fin, new_position, pinfo);
 743+ if (res >0) {
 744+ /* caller wants the new value */
 745+ iinfo->last_value = pinfo->page_id;
 746+ iinfo->last_position = new_position;
 747+ return(pinfo->page_id);
 748+ }
 749+ else {
 750+ /* here is the tough case, if we didn't find anything then we are prolly too close to the end, truncation or
 751+ there's just no block here.
 752+ set the right end, keep the last value and position and let the caller retry with the new interval */
 753+ if (iinfo->last_value < iinfo->value_wanted) { /* we were moving towards eof */
 754+ iinfo->right_end = new_position;
 755+ return(iinfo->last_value);
 756+ }
 757+ /* in theory we were moving towards beginning of file, should not have issues, so bail here */
 758+ else {
 759+ /* fprintf(stderr,"something very broken, giving up\n"); */
 760+ return(-1);
 761+ }
 762+ }
 763+}
 764+
 765+/*
 766+ given a bzipped and possibly truncated file, and a page id,
 767+ hunt for the page id in the file; this assume that the
 768+ bz2 header is intact and that page ids are steadily increasing
 769+ throughout the file.
 770+
 771+ writes the offset of the relevant block (from beginning of file)
 772+ and the first pageid found in that block, to stdout
 773+
 774+ format of output:
 775+ position:xxxxx pageid:nnn
 776+
 777+ returns: 0 on success, -1 on error
 778+*/
 779+int main(int argc, char **argv) {
 780+ int fin, position, res, interval, page_id, oldmarker, file_size;
 781+ page_info_t pinfo;
 782+ iter_info_t iinfo;
 783+
 784+ if (argc != 3) {
 785+ fprintf(stderr,"usage: %s infile id\n", argv[0]);
 786+ exit(-1);
 787+ }
 788+
 789+ fin = open (argv[1], O_RDONLY);
 790+ if (fin < 0) {
 791+ fprintf(stderr,"failed to open file %s for read\n", argv[1]);
 792+ exit(-1);
 793+ }
 794+
 795+ page_id = atoi(argv[2]);
 796+ if (page_id <1) {
 797+ fprintf(stderr,"please specify a page_id >= 1.\n");
 798+ fprintf(stderr,"usage: %s infile page_id\n", argv[0]);
 799+ exit(-1);
 800+ }
 801+
 802+ file_size = get_file_size(fin);
 803+
 804+ interval = file_size;
 805+ position = 0;
 806+ oldmarker = -1;
 807+ pinfo.bits_shifted = -1;
 808+ pinfo.position = -1;
 809+ pinfo.page_id = -1;
 810+
 811+ iinfo.left_end = 0;
 812+ file_size = get_file_size(fin);
 813+ iinfo.right_end = file_size;
 814+ iinfo.value_wanted = page_id;
 815+
 816+ res = get_first_page_id_after_offset(fin, 0, &pinfo);
 817+ if (res > 0) {
 818+ iinfo.last_value = pinfo.page_id;
 819+ iinfo.last_position = 0;
 820+ }
 821+ else {
 822+ fprintf(stderr,"failed to get anything useful from the beginning of the file even, bailing.\n");
 823+ exit(1);
 824+ }
 825+ if (pinfo.page_id == page_id) {
 826+ fprintf(stdout,"position:%d page_id:%d\n",pinfo.position, pinfo.page_id);
 827+ exit(0);
 828+ }
 829+
 830+ while (1) {
 831+ res = do_iteration(&iinfo, fin, &pinfo);
 832+ /* things to check: bad return? interval is 0 bytes long? */
 833+ if (iinfo.left_end == iinfo.right_end) {
 834+ fprintf(stdout,"position:%d page_id:%d\n",pinfo.position, pinfo.page_id);
 835+ exit(0);
 836+ }
 837+ else if (res < 0) {
 838+ fprintf(stderr,"broken and quitting\n");
 839+ exit(-1);
 840+ }
 841+ }
 842+ exit(0);
 843+}
Property changes on: branches/ariel/xmldumps-backup/findpageidinbz2xml.c
___________________________________________________________________
Added: svn:eol-style
1844 + native
Index: branches/ariel/xmldumps-backup/findpageidinbz2xml.h
@@ -0,0 +1,81 @@
 2+#ifndef _FINDPAGEID_H
 3+#define _FINDPAGEID_H
 4+
 5+typedef struct {
 6+ int page_id; /* first id in the block */
 7+ int bits_shifted; /* block is right shifted this many bits */
 8+ int position; /* position in file of block */
 9+} page_info_t;
 10+
 11+#define BUFINSIZE 5000
 12+
 13+/*
 14+ keeps all information about a bzipped file
 15+ plus input/output buffers for decompression
 16+*/
 17+typedef struct {
 18+ unsigned char bufin[BUFINSIZE]; /* compressed data read from file */
 19+ unsigned char *bufout; /* uncompressed data, must be allocated by caller */
 20+ unsigned char marker_buffer[7]; /* data to test for bz2 block marker */
 21+ unsigned char header_buffer[4]; /* first 4 bytes of file (bzip2 header) */
 22+
 23+ int bufin_size; /* size of input buffer for compressed data */
 24+ int bufout_size; /* size of output buffer for decompressed data, may vary at each call */
 25+
 26+ int initialized; /* whether bz2file has been initialized (header processed, seek to
 27+ some bz2 block in the file and input buffer filled) */
 28+ int block_start; /* position of bz2 block in file from which we started to read (we
 29+ read a sequence of bz2 blocks from a given position, this is
 30+ the offset to the first one) */
 31+
 32+ bz_stream strm; /* stream structure for libbz2 */
 33+ unsigned char overflow; /* since decompressed bytes may not be bit aligned, we keep the last byte
 34+ read around so we can grab the lower end bits off the end for
 35+ sticking in front of the next pile of compressed bytes we read */
 36+
 37+ int bits_shifted; /* number of bits that the compressed data has been right shifted
 38+ in the file (if the number is 0, the block marker and subsequent
 39+ data is byte-aligned) */
 40+ unsigned char **marker; /* bzip2 start of block marker, plus bit-shifted versions of it for
 41+ locating the marker in a stream of compressed data */
 42+
 43+ int position; /* current offset into file from start of file */
 44+
 45+ int bytes_read; /* number of bytes of compressed data read from file (per read) */
 46+ int bytes_written; /* number of bytes of decompressed data written into output buffer (per decompress) */
 47+ int eof; /* nonzero if eof reached */
 48+ int file_size; /* length of file, so we don't search past it for blocks */
 49+} bz_info_t;
 50+
 51+#define MASKLEFT 0
 52+#define MASKRIGHT 1
 53+
 54+/*
 55+ this output buffer is used to collect decompressed output.
 56+ this is not a circular buffer; when it is full the user is
 57+ responsible for emptying it completely or partially and moving
 58+ to the beginning any unused bytes.
 59+
 60+*/
 61+typedef struct {
 62+ unsigned char *buffer; /* output storage, allocated by the caller */
 63+ unsigned char *next_to_read; /* pointer to the next byte in the buffer with data to be read */
 64+ unsigned char *next_to_fill; /* pointer to the next byte in the buffer which is empty and can receive data */
 65+ int bytes_avail; /* number of bytes available for reading */
 66+ unsigned char *end; /* points to byte after end of buffer */
 67+} buf_info_t;
 68+
 69+/*
 70+ used for each iteration of narrowing down the location in a bzipped2 file of
 71+ a desired pageid, by finding first compressed block after a guessed
 72+ position and checking the first pageid (if any) contained in it.
 73+*/
 74+typedef struct {
 75+ int left_end; /* left end of interval to search (bytes from start of file) */
 76+ int right_end; /* right end of interval to search */
 77+ int value_wanted; /* pageid desired */
 78+ int last_value; /* pageid we found in last iteration */
 79+ int last_position; /* position in file for last iteration */
 80+} iter_info_t;
 81+
 82+#endif
Property changes on: branches/ariel/xmldumps-backup/findpageidinbz2xml.h
___________________________________________________________________
Added: svn:eol-style
183 + native

Status & tagging log