Index: branches/ariel/xmldumps-backup/findpageidinbz2xml.c |
— | — | @@ -0,0 +1,842 @@ |
| 2 | +#include <unistd.h> |
| 3 | +#include <stdio.h> |
| 4 | +#include <string.h> |
| 5 | +#include <sys/types.h> |
| 6 | +#include <sys/stat.h> |
| 7 | +#include <fcntl.h> |
| 8 | +#include <stdlib.h> |
| 9 | +#include <errno.h> |
| 10 | +#include <sys/types.h> |
| 11 | +#include <regex.h> |
| 12 | +#include "bzlib.h" |
| 13 | +#include "findpageidinbz2xml.h" |
| 14 | + |
| 15 | +/* return n ones either at left or right end */ |
| 16 | +int bitmask(int numbits, int end) { |
| 17 | + if (end == MASKRIGHT) { |
| 18 | + return((1<<numbits)-1); |
| 19 | + } |
| 20 | + else { |
| 21 | + return(((1<<numbits)-1) << (8-numbits)); |
| 22 | + } |
| 23 | +} |
| 24 | + |
| 25 | +void shift_bytes_left(unsigned char *buffer, int buflen, int numbits) { |
| 26 | + int i; |
| 27 | + |
| 28 | + if (numbits == 0) { |
| 29 | + return; |
| 30 | + } |
| 31 | + |
| 32 | + for (i=0; i<buflen; i++) { |
| 33 | + /* left 1 */ |
| 34 | + buffer[i] = (unsigned char) ((int) (buffer[i]) << numbits); |
| 35 | + |
| 36 | + /* grab leftmost from next byte */ |
| 37 | + if (i < buflen-1) { |
| 38 | + buffer[i] = ( unsigned char ) ( (unsigned int) buffer[i] | ( ( ((unsigned int) buffer[i+1]) & bitmask(numbits,MASKLEFT) ) >> (8-numbits) ) ); |
| 39 | + } |
| 40 | + } |
| 41 | +} |
| 42 | + |
| 43 | +void shift_bytes_right(unsigned char *buffer, int buflen, int numbits) { |
| 44 | + int i; |
| 45 | + |
| 46 | + for (i=buflen-1; i>=0; i--) { |
| 47 | + /* right 1 */ |
| 48 | + buffer[i] = (unsigned char) ((int) (buffer[i]) >> numbits); |
| 49 | + |
| 50 | + /* grab rightmost from prev byte */ |
| 51 | + if (i > 0) { |
| 52 | + buffer[i] = ( unsigned char ) ((unsigned int) buffer[i] | ( ((unsigned int) (buffer[i-1])<<(8-numbits)) & bitmask(numbits,MASKLEFT))); |
| 53 | + } |
| 54 | + } |
| 55 | +} |
| 56 | + |
| 57 | +unsigned char ** init_marker() { |
| 58 | + unsigned char **marker = malloc(8*sizeof(unsigned char *)); |
| 59 | + int i; |
| 60 | + |
| 61 | + /* set up block marker plus its various right-shifted incarnations */ |
| 62 | + for (i = 0; i< 8; i++) { |
| 63 | + marker[i] = malloc(sizeof(unsigned char)*7); |
| 64 | + } |
| 65 | + marker[0][0]= (unsigned char) 0x31; |
| 66 | + marker[0][1]= (unsigned char) 0x41; |
| 67 | + marker[0][2]= (unsigned char) 0x59; |
| 68 | + marker[0][3]= (unsigned char) 0x26; |
| 69 | + marker[0][4]= (unsigned char) 0x53; |
| 70 | + marker[0][5]= (unsigned char) 0x59; |
| 71 | + marker[0][6]= (unsigned char) 0x00; |
| 72 | + for (i = 1; i< 8; i++) { |
| 73 | + memcpy((char *)(marker[i]), (char *)(marker[i-1]),7); |
| 74 | + shift_bytes_right(marker[i],7,1); |
| 75 | + } |
| 76 | + return(marker); |
| 77 | +} |
| 78 | + |
| 79 | +/* buff1 is some random bytes, buff2 is some random bytes which we expect to start with the contents of buff1, |
| 80 | + both buffers are bit-shifted to the right "bitsrightshifted". this function compares the two and returns 1 if buff2 |
| 81 | + matches and 0 otherwise. */ |
| 82 | +int bytes_compare(unsigned char *buff1, unsigned char *buff2, int numbytes, int bitsrightshifted) { |
| 83 | + int i; |
| 84 | + |
| 85 | + if (bitsrightshifted == 0) { |
| 86 | + for (i = 0; i< numbytes; i++) { |
| 87 | + if (buff1[i] != buff2[i]) { |
| 88 | + return(1); |
| 89 | + } |
| 90 | + } |
| 91 | + return(0); |
| 92 | + } |
| 93 | + else { |
| 94 | + for (i = 1; i< numbytes-2; i++) { |
| 95 | + if (buff1[i] != buff2[i]) { |
| 96 | + return(1); |
| 97 | + } |
| 98 | + } |
| 99 | + /* do leftmost byte */ |
| 100 | + if ((buff1[0] & bitmask(8-bitsrightshifted,MASKRIGHT)) != (buff2[0] & bitmask(8-bitsrightshifted,MASKRIGHT)) ) { |
| 101 | + return(1); |
| 102 | + } |
| 103 | + /* do rightmost byte */ |
| 104 | + if ((buff1[numbytes-1] & bitmask(bitsrightshifted,MASKLEFT)) != (buff2[numbytes-1] & bitmask(bitsrightshifted,MASKLEFT)) ) { |
| 105 | + return(1); |
| 106 | + } |
| 107 | + return(0); |
| 108 | + } |
| 109 | +} |
| 110 | + |
| 111 | + |
| 112 | +/* return -1 if no match |
| 113 | + return number of bits rightshifted otherwise */ |
| 114 | +int check_buffer_for_bz2_block_marker(bz_info_t *bfile) { |
| 115 | + int result, i; |
| 116 | + |
| 117 | + result = bytes_compare(bfile->marker[0],bfile->marker_buffer+1,6,0); |
| 118 | + if (!result) { |
| 119 | + return(0); |
| 120 | + } |
| 121 | + for (i=1; i<8; i++) { |
| 122 | + result = bytes_compare(bfile->marker[i],bfile->marker_buffer,7,i); |
| 123 | + if (!result) { |
| 124 | + return(i); |
| 125 | + } |
| 126 | + } |
| 127 | + return(-1); |
| 128 | +} |
| 129 | + |
| 130 | + |
| 131 | +/* return: 1 if found, 0 if not, -1 on error */ |
| 132 | +int find_next_bz2_block_marker(int fin, bz_info_t *bfile) { |
| 133 | + int result; |
| 134 | + |
| 135 | + bfile->bits_shifted = -1; |
| 136 | + result = read(fin, bfile->marker_buffer, 7); |
| 137 | + if (result < 0) { |
| 138 | + /* fprintf(stderr,"read of file failed\n"); */ |
| 139 | + return(-1); |
| 140 | + } |
| 141 | + /* must be after 4 byte file header, and we add a leftmost byte to the buffer |
| 142 | + of data read in case some bits have been shifted into it */ |
| 143 | + while (bfile->position <= bfile->file_size - 6 && bfile->bits_shifted < 0) { |
| 144 | + bfile->bits_shifted = check_buffer_for_bz2_block_marker(bfile); |
| 145 | + if (bfile->bits_shifted < 0) { |
| 146 | + bfile->position++; |
| 147 | + result = lseek(fin, (bfile->position), SEEK_SET); |
| 148 | + if (result == -1) { |
| 149 | + fprintf(stderr,"lseek of file to %ld failed (2)\n",(long int) bfile->position); |
| 150 | + return(-1); |
| 151 | + } |
| 152 | + result = read(fin, bfile->marker_buffer, 7); |
| 153 | + if (result < 7) { |
| 154 | + /* fprintf(stderr,"read of file failed\n"); */ |
| 155 | + return(-1); |
| 156 | + } |
| 157 | + } |
| 158 | + else { |
| 159 | + bfile->block_start = bfile->position; |
| 160 | + return(1); |
| 161 | + } |
| 162 | + } |
| 163 | + return(0); |
| 164 | +} |
| 165 | + |
| 166 | +/* |
| 167 | + initializes the bz2 strm structure, |
| 168 | + calls the BZ2 decompression library initializer |
| 169 | + |
| 170 | + returns: |
| 171 | + BZ_OK on success |
| 172 | + various BZ_ errors on failure (see bzlib.h) |
| 173 | +*/ |
| 174 | +int init_decompress(bz_info_t *bfile) { |
| 175 | + int bz_verbosity = 0; |
| 176 | + int bz_small = 0; |
| 177 | + int ret; |
| 178 | + |
| 179 | + bfile->strm.bzalloc = NULL; |
| 180 | + bfile->strm.bzfree = NULL; |
| 181 | + bfile->strm.opaque = NULL; |
| 182 | + |
| 183 | + ret = BZ2_bzDecompressInit ( &(bfile->strm), bz_verbosity, bz_small ); |
| 184 | + if (ret != BZ_OK) { |
| 185 | + fprintf(stderr,"uncompress failed, err %d\n", ret); |
| 186 | + exit(-1); |
| 187 | + } |
| 188 | + return(ret); |
| 189 | +} |
| 190 | + |
| 191 | +/* |
| 192 | + reads the first 4 bytes from a bz2 file (should be |
| 193 | + "BZh" followed by the block size indicator, typically "9") |
| 194 | + and passes them into the BZ2 decompression library. |
| 195 | + This must be done before decompression of any block of the |
| 196 | + file is attempted. |
| 197 | + |
| 198 | + returns: |
| 199 | + BZ_OK if successful, |
| 200 | + various BZ_ errors on failure (see bzlib.h) |
| 201 | +*/ |
| 202 | +int decompress_header(int fin, bz_info_t *bfile) { |
| 203 | + int ret, res; |
| 204 | + |
| 205 | + res = lseek(fin,0,SEEK_SET); |
| 206 | + if (res < 0) { |
| 207 | + fprintf(stderr,"lseek of file to 0 failed (3)\n"); |
| 208 | + exit(-1); |
| 209 | + } |
| 210 | + bfile->bytes_read = read(fin, bfile->header_buffer, 4); |
| 211 | + if (bfile->bytes_read < 4) { |
| 212 | + fprintf(stderr,"failed to read 4 bytes of header, exiting\n"); |
| 213 | + exit(-1); |
| 214 | + } |
| 215 | + bfile->strm.next_in = (char *)bfile->header_buffer; |
| 216 | + bfile->strm.avail_in = 4; |
| 217 | + |
| 218 | + ret = BZ2_bzDecompress ( &(bfile->strm) ); |
| 219 | + if (BZ_OK != ret && BZ_STREAM_END != ret) { |
| 220 | + fprintf(stderr,"Corrupt bzip2 header, exiting\n"); |
| 221 | + exit(-1); |
| 222 | + } |
| 223 | + return(ret); |
| 224 | +} |
| 225 | + |
| 226 | +/* |
| 227 | + seek to appropriate offset as specified in bfile, |
| 228 | + read compressed data into buffer indicated by bfile, |
| 229 | + update the bfile structure accordingly, |
| 230 | + save the overflow byte (bit-shifted data = suck) |
| 231 | + this is for the *first* buffer of data in a stream, |
| 232 | + for subsequent buffers use fill_buffer_to_decompress() |
| 233 | + |
| 234 | + this will set bfile->eof on eof. no other indicator |
| 235 | + will be provided. |
| 236 | + |
| 237 | + returns: |
| 238 | + 0 on success |
| 239 | + -1 on error |
| 240 | +*/ |
| 241 | +int setup_first_buffer_to_decompress(int fin, bz_info_t *bfile) { |
| 242 | + int res; |
| 243 | + |
| 244 | + if (bfile->bits_shifted == 0) { |
| 245 | + res = lseek(fin,bfile->position+1,SEEK_SET); |
| 246 | + if (res < 0) { |
| 247 | + fprintf(stderr,"lseek of file to %ld failed (4)\n",(long int) bfile->position+1); |
| 248 | + return(-1); |
| 249 | + } |
| 250 | + } |
| 251 | + else { |
| 252 | + res = lseek(fin,bfile->position,SEEK_SET); |
| 253 | + if (res < 0) { |
| 254 | + fprintf(stderr,"lseek of file to %ld failed (5)\n",(long int) bfile->position); |
| 255 | + return(-1); |
| 256 | + } |
| 257 | + } |
| 258 | + bfile->bytes_read = read(fin, bfile->bufin, bfile->bufin_size); |
| 259 | + if (bfile->bytes_read > 0) { |
| 260 | + bfile->overflow = bfile->bufin[bfile->bytes_read-1]; |
| 261 | + shift_bytes_left(bfile->bufin, bfile->bytes_read, bfile->bits_shifted); |
| 262 | + |
| 263 | + bfile->strm.next_in = (char *)(bfile->bufin); |
| 264 | + bfile->strm.avail_in = bfile->bytes_read-1; |
| 265 | + } |
| 266 | + if (bfile->bytes_read <=0) { |
| 267 | + bfile->eof++; |
| 268 | + } |
| 269 | + return(0); |
| 270 | +} |
| 271 | + |
| 272 | +/* |
| 273 | + read compressed data into buffer indicated by bfile, |
| 274 | + from current position of file, |
| 275 | + stuffing the overflow byte in first. |
| 276 | + update the bfile structure accordingly |
| 277 | + save the new overflow byte (bit-shifted data = suck) |
| 278 | + this function is for decompression of buffers *after |
| 279 | + the first one*. for the first one use |
| 280 | + setup_first_buffer_to_decompress() |
| 281 | + |
| 282 | + this will set bfile->eof on eof. no other indicator |
| 283 | + will be provided. |
| 284 | + |
| 285 | + returns: |
| 286 | + 0 on success |
| 287 | + hmm, it really does not do anything about errors :-D |
| 288 | +*/ |
| 289 | +int fill_buffer_to_decompress(int fin, bz_info_t *bfile, int ret) { |
| 290 | + if (bfile->strm.avail_in == 0) { |
| 291 | + bfile->strm.next_in = (char *)(bfile->bufin); |
| 292 | + bfile->bufin[0] = bfile->overflow; |
| 293 | + bfile->bytes_read = read(fin, bfile->bufin+1, bfile->bufin_size-1); |
| 294 | + if (bfile->bytes_read > 0) { |
| 295 | + bfile->overflow = bfile->bufin[bfile->bytes_read]; |
| 296 | + shift_bytes_left(bfile->bufin,bfile->bytes_read+1,bfile->bits_shifted); |
| 297 | + bfile->strm.avail_in = bfile->bytes_read; |
| 298 | + bfile->position+=bfile->bytes_read; |
| 299 | + } |
| 300 | + else { |
| 301 | + bfile->strm.avail_in = 1; /* the overflow byte */ |
| 302 | + bfile->eof++; |
| 303 | + } |
| 304 | + } |
| 305 | + return(0); |
| 306 | +} |
| 307 | + |
| 308 | +/* size of buffer is bytes usable. there will be a null byte at the end |
| 309 | + |
| 310 | + what we do with the buffer: |
| 311 | + - read from front of buffer to end, |
| 312 | + - fill from point where prev read did not fill buffer, or from where |
| 313 | + move of data at end of buffer to beginning left room, |
| 314 | + - mark a string of bytes (starting from what's available to read) as "read" |
| 315 | + |
| 316 | +*/ |
| 317 | +buf_info_t *init_buffer(int size) { |
| 318 | + buf_info_t *b; |
| 319 | + |
| 320 | + b = (buf_info_t *)malloc(sizeof(buf_info_t)); |
| 321 | + b->buffer = malloc(sizeof(unsigned char)*(size+1)); |
| 322 | + b->buffer[size]='\0'; |
| 323 | + b->end = b->buffer + size; |
| 324 | + b->next_to_read = b->end; /* nothing available */ |
| 325 | + b->bytes_avail = 0; /* bytes to read, nothing available */ |
| 326 | + b->next_to_fill = b->buffer; /* empty */ |
| 327 | + b->next_to_fill[0] = '\0'; |
| 328 | + return(b); |
| 329 | +} |
| 330 | + |
| 331 | +/* check if buffer (used for decompressed data output) is empty, |
| 332 | + returns 1 if so and 0 if not */ |
| 333 | +int buffer_is_empty(buf_info_t *b) { |
| 334 | + if (b->bytes_avail == 0) { |
| 335 | + return(1); |
| 336 | + } |
| 337 | + else { |
| 338 | + return(0); |
| 339 | + } |
| 340 | +} |
| 341 | + |
| 342 | +/* check if buffer (used for decompressed data output) is full, |
| 343 | + |
| 344 | + returns 1 if so and 0 if not |
| 345 | + I'm not liking this function so well, fixme */ |
| 346 | +int buffer_is_full(buf_info_t *b) { |
| 347 | + if (b->next_to_fill == b->end) { |
| 348 | + return(1); |
| 349 | + } |
| 350 | + else { |
| 351 | + return(0); |
| 352 | + } |
| 353 | +} |
| 354 | + |
| 355 | +/* FIXME do this right. whatever. */ |
| 356 | +int get_file_size(int fin) { |
| 357 | + int res; |
| 358 | + |
| 359 | + res = lseek(fin, 0, SEEK_END); |
| 360 | + if (res < 0) { |
| 361 | + fprintf(stderr,"lseek of file to 0 failed (6)\n"); |
| 362 | + exit(-1); |
| 363 | + } |
| 364 | + return(res); |
| 365 | +} |
| 366 | + |
| 367 | + |
| 368 | +/* |
| 369 | + look for the first bz2 block in the file after specified offset |
| 370 | + it tests that the block is valid by doing partial decompression. |
| 371 | + this function will update the bfile structure: |
| 372 | + bfile->position will contain the current position of the file (? will it?) |
| 373 | + bfile->bits_shifted will contain the number of bits that the block is rightshifted |
| 374 | + bfile->block_start will contain the offset from start of file to the block |
| 375 | + returns: |
| 376 | + position of next byte in file to be read, on success |
| 377 | + -1 if no marker or other error |
| 378 | +*/ |
| 379 | +int find_first_bz2_block_after_offset(bz_info_t *bfile, int fin, int position) { |
| 380 | + int res; |
| 381 | + |
| 382 | + bfile->bufin_size = BUFINSIZE; |
| 383 | + bfile->marker = init_marker(); |
| 384 | + bfile->position = position; |
| 385 | + bfile->block_start = -1; |
| 386 | + bfile->bytes_read = 0; |
| 387 | + bfile->bytes_written = 0; |
| 388 | + bfile->eof = 0; |
| 389 | + bfile->bits_shifted = -1; |
| 390 | + |
| 391 | + bfile->file_size = get_file_size(fin); |
| 392 | + |
| 393 | + while (bfile->bits_shifted < 0) { |
| 394 | + if (bfile->position > bfile->file_size) { |
| 395 | + return(-1); |
| 396 | + } |
| 397 | + res = lseek(fin, bfile->position, SEEK_SET); |
| 398 | + if (res < 0) { |
| 399 | + fprintf(stderr,"lseek of file to %ld failed (7)\n",(long int) bfile->position); |
| 400 | + exit(-1); |
| 401 | + } |
| 402 | + res = find_next_bz2_block_marker(fin, bfile); |
| 403 | + if (res == 1) { |
| 404 | + init_decompress(bfile); |
| 405 | + decompress_header(fin, bfile); |
| 406 | + res = setup_first_buffer_to_decompress(fin, bfile); |
| 407 | + if (res == -1) { |
| 408 | + fprintf(stderr,"couldn't get first buffer of data to uncompress\n"); |
| 409 | + exit(-1); |
| 410 | + } |
| 411 | + bfile->strm.next_out = (char *)bfile->bufout; |
| 412 | + bfile->strm.avail_out = bfile->bufout_size; |
| 413 | + res = BZ2_bzDecompress ( &(bfile->strm) ); |
| 414 | + /* this means we (probably) have a genuine marker */ |
| 415 | + if (BZ_OK == res || BZ_STREAM_END == res) { |
| 416 | + res = BZ2_bzDecompressEnd ( &(bfile->strm) ); |
| 417 | + bfile->bytes_read = 0; |
| 418 | + bfile->bytes_written = 0; |
| 419 | + bfile->eof = 0; |
| 420 | + /* leave the file at the right position */ |
| 421 | + res = lseek(fin, bfile->block_start, SEEK_SET); |
| 422 | + if (res < 0) { |
| 423 | + fprintf(stderr,"lseek of file to %ld failed (7)\n",(long int) bfile->position); |
| 424 | + exit(-1); |
| 425 | + } |
| 426 | + return(0); |
| 427 | + } |
| 428 | + /* right bytes, but there by chance, skip and try again */ |
| 429 | + else { |
| 430 | + bfile->position+=6; |
| 431 | + bfile->bits_shifted = -1; |
| 432 | + bfile->block_start = -1; |
| 433 | + } |
| 434 | + } |
| 435 | + else { |
| 436 | + return(-1); |
| 437 | + } |
| 438 | + } |
| 439 | + return(-1); |
| 440 | +} |
| 441 | + |
| 442 | +/* |
| 443 | + find the first bz2 block marker in the file, |
| 444 | + from its current position, |
| 445 | + then set up for decompression from that point |
| 446 | + returns: |
| 447 | + 0 on success |
| 448 | + -1 if no marker or other error |
| 449 | +*/ |
| 450 | +int init_bz2_file(bz_info_t *bfile, int fin) { |
| 451 | + int res; |
| 452 | + |
| 453 | + bfile->initialized++; |
| 454 | + |
| 455 | + res = find_next_bz2_block_marker(fin, bfile); |
| 456 | + if (res ==1) { |
| 457 | + init_decompress(bfile); |
| 458 | + decompress_header(fin, bfile); |
| 459 | + setup_first_buffer_to_decompress(fin, bfile); |
| 460 | + return(0); |
| 461 | + } |
| 462 | + return(-1); |
| 463 | +} |
| 464 | + |
| 465 | +/* return -1 if error */ |
| 466 | +int decompress_data(bz_info_t *bfile, int fin, unsigned char *bufferout, int bufout_size) { |
| 467 | + int ret; |
| 468 | + |
| 469 | + bfile->bufout = bufferout; |
| 470 | + bfile->bufout_size = bufout_size; |
| 471 | + bfile->bytes_written = 0; |
| 472 | + |
| 473 | + if (! bfile->initialized) { |
| 474 | + if (init_bz2_file(bfile, fin) == -1) { |
| 475 | + /* fprintf(stderr,"failed to find block in bz2file (2)\n"); */ |
| 476 | + return(-1); |
| 477 | + }; |
| 478 | + bfile->strm.next_out = (char *)bfile->bufout; |
| 479 | + bfile->strm.avail_out = bfile->bufout_size; |
| 480 | + } |
| 481 | + |
| 482 | + ret = BZ_OK; |
| 483 | + while (BZ_OK == ret && bfile->bytes_written == 0) { |
| 484 | + ret = BZ2_bzDecompress ( &(bfile->strm) ); |
| 485 | + if (BZ_OK == ret || BZ_STREAM_END == ret) { |
| 486 | + bfile->bytes_written = (unsigned char *)(bfile->strm.next_out) - bfile->bufout; |
| 487 | + } |
| 488 | + else { |
| 489 | + /* fprintf(stderr,"error from BZ decompress %d\n",ret); */ |
| 490 | + return(-1); |
| 491 | + } |
| 492 | + fill_buffer_to_decompress(fin, bfile, ret); |
| 493 | + /* |
| 494 | + if (bfile->eof && (BZ_OK == ret || BZ_STREAM_END == ret) ) { |
| 495 | + fprintf(stderr,"eof reached\n"); |
| 496 | + } |
| 497 | + */ |
| 498 | + } |
| 499 | + return(0); |
| 500 | +} |
| 501 | + |
| 502 | + |
| 503 | +/* |
| 504 | + fill output buffer in b with uncompressed data from bfile |
| 505 | + if this is the first call to the function for this file, |
| 506 | + the file header will be read, and the first buffer of |
| 507 | + uncompressed data will be prepared. bfile->position |
| 508 | + should be set to the offset (from the beginning of file) from |
| 509 | + which to find the first bz2 block. |
| 510 | + |
| 511 | + returns: |
| 512 | + on success, number of bytes read (may be 0) |
| 513 | + -1 on error |
| 514 | +*/ |
| 515 | +int get_buffer_of_uncompressed_data(buf_info_t *b, int fin, bz_info_t *bfile) { |
| 516 | + int res; |
| 517 | + |
| 518 | + if (buffer_is_full(b)) { |
| 519 | + return(0); |
| 520 | + } |
| 521 | + |
| 522 | + if (buffer_is_empty(b)) { |
| 523 | + b->next_to_fill = b->buffer; |
| 524 | + } |
| 525 | + |
| 526 | + res = decompress_data(bfile, fin, b->next_to_fill, b->end - b->next_to_fill); |
| 527 | + if (res == -1) { |
| 528 | + return(res); |
| 529 | + } |
| 530 | + if (bfile->bytes_written < 0) { |
| 531 | + /* fprintf(stderr,"read of file failed\n"); */ |
| 532 | + return(-1); |
| 533 | + } |
| 534 | + else { |
| 535 | + /* really?? FIXME check this */ |
| 536 | + if (buffer_is_empty(b)) { |
| 537 | + b->next_to_read = b->next_to_fill; /* where we just read */ |
| 538 | + } |
| 539 | + b->bytes_avail += bfile->bytes_written; |
| 540 | + b->next_to_fill += bfile->bytes_written; |
| 541 | + b->next_to_fill[0] = '\0'; |
| 542 | + return(0); |
| 543 | + } |
| 544 | +} |
| 545 | + |
| 546 | +void dumpbuf_info_t(buf_info_t *b) { |
| 547 | + fprintf(stdout, "\n"); |
| 548 | + fprintf(stdout, "b->buffer: %ld\n", (long int) b->buffer); |
| 549 | + fprintf(stdout, "b->end: %ld\n", (long int) b->end); |
| 550 | + fprintf(stdout, "b->next_to_read: %ld\n", (long int) b->next_to_read); |
| 551 | + fprintf(stdout, "b->next_to_fill: %ld\n", (long int) b->next_to_fill); |
| 552 | + fprintf(stdout, "b->bytes_avail: %ld\n", (long int) b->bytes_avail); |
| 553 | +} |
| 554 | + |
| 555 | + |
| 556 | +/* |
| 557 | + copy text from end of buffer to the beginning, that we want to keep |
| 558 | + around for further processing (i.e. further regex matches) |
| 559 | + returns number of bytes copied |
| 560 | +*/ |
| 561 | +int move_bytes_to_buffer_start(buf_info_t *b, unsigned char *from_where, int maxbytes) { |
| 562 | + int i, tocopy; |
| 563 | + |
| 564 | + if (from_where >= b->end) { |
| 565 | + return(0); |
| 566 | + } |
| 567 | + else { |
| 568 | + tocopy = b->end - from_where; |
| 569 | + if (maxbytes && (tocopy > maxbytes)) { |
| 570 | + tocopy = maxbytes; |
| 571 | + } |
| 572 | + for (i = 0; i < tocopy; i++) { |
| 573 | + b->buffer[i] = from_where[i]; |
| 574 | + } |
| 575 | + b->next_to_fill = b->buffer + tocopy; |
| 576 | + b->next_to_fill[0] = '\0'; |
| 577 | + b->next_to_read = b->buffer; |
| 578 | + b->bytes_avail = tocopy; |
| 579 | + return(tocopy); |
| 580 | + } |
| 581 | +} |
| 582 | + |
| 583 | +/* |
| 584 | + get the first page id after position in file |
| 585 | + if a pageid is found, the structure pinfo will be updated accordingly |
| 586 | + returns: |
| 587 | + 1 if a pageid found, |
| 588 | + 0 if no pageid found, |
| 589 | + -1 on error |
| 590 | +*/ |
| 591 | +int get_first_page_id_after_offset(int fin, int position, page_info_t *pinfo) { |
| 592 | + int res; |
| 593 | + regmatch_t *match_page, *match_page_id; |
| 594 | + regex_t compiled_page, compiled_page_id; |
| 595 | + int length=5000; /* output buffer size */ |
| 596 | + char *page = "<page>"; |
| 597 | + char *page_id = "<page>\n[ ]+<title>[^<]+</title>\n[ ]+<id>([0-9]+)</id>\n"; |
| 598 | + |
| 599 | + buf_info_t *b; |
| 600 | + bz_info_t bfile; |
| 601 | + |
| 602 | + bfile.initialized = 0; |
| 603 | + |
| 604 | + res = regcomp(&compiled_page, page, REG_EXTENDED); |
| 605 | + res = regcomp(&compiled_page_id, page_id, REG_EXTENDED); |
| 606 | + |
| 607 | + match_page = (regmatch_t *)malloc(sizeof(regmatch_t)*1); |
| 608 | + match_page_id = (regmatch_t *)malloc(sizeof(regmatch_t)*2); |
| 609 | + |
| 610 | + b = init_buffer(length); |
| 611 | + |
| 612 | + pinfo->bits_shifted = -1; |
| 613 | + pinfo->position = -1; |
| 614 | + pinfo->page_id = -1; |
| 615 | + |
| 616 | + bfile.bytes_read = 0; |
| 617 | + |
| 618 | + if (find_first_bz2_block_after_offset(&bfile, fin, position) == -1) { |
| 619 | + /* fprintf(stderr,"failed to find block in bz2file (1)\n"); */ |
| 620 | + return(-1); |
| 621 | + } |
| 622 | + |
| 623 | + while (!get_buffer_of_uncompressed_data(b, fin, &bfile) && (! bfile.eof)) { |
| 624 | + if (bfile.bytes_read) { |
| 625 | + while (regexec(&compiled_page_id, (char *)b->next_to_read, 2, match_page_id, 0 ) == 0) { |
| 626 | + if (match_page_id[1].rm_so >=0) { |
| 627 | + /* write page_id to stderr */ |
| 628 | + /* |
| 629 | + fwrite(b->next_to_read+match_page_id[1].rm_so, sizeof(unsigned char), match_page_id[1].rm_eo - match_page_id[1].rm_so, stderr); |
| 630 | + fwrite("\n",1,1,stderr); |
| 631 | + */ |
| 632 | + pinfo->page_id = atoi((char *)(b->next_to_read+match_page_id[1].rm_so)); |
| 633 | + pinfo->position = bfile.block_start; |
| 634 | + pinfo->bits_shifted = bfile.bits_shifted; |
| 635 | + return(1); |
| 636 | + /* write up to and including page id tag to stdout */ |
| 637 | + /* |
| 638 | + fwrite(b->next_to_read,match_page_id[0].rm_eo,1,stdout); |
| 639 | + b->next_to_read = b->next_to_read+match_page_id[0].rm_eo; |
| 640 | + b->bytes_avail -= match_page_id[0].rm_eo; |
| 641 | + */ |
| 642 | + } |
| 643 | + else { |
| 644 | + /* should never happen */ |
| 645 | + fprintf(stderr,"regex gone bad...\n"); |
| 646 | + exit(-1); |
| 647 | + } |
| 648 | + } |
| 649 | + if (regexec(&compiled_page, (char *)b->next_to_read, 1, match_page, 0 ) == 0) { |
| 650 | + /* write everything up to but not including the page tag to stdout */ |
| 651 | + /* |
| 652 | + fwrite(b->next_to_read,match_page[0].rm_eo - 6,1,stdout); |
| 653 | + */ |
| 654 | + move_bytes_to_buffer_start(b, b->next_to_read + match_page[0].rm_so, b->bytes_avail - match_page[0].rm_so); |
| 655 | + bfile.strm.next_out = (char *)b->next_to_fill; |
| 656 | + bfile.strm.avail_out = b->end - b->next_to_fill; |
| 657 | + } |
| 658 | + else { |
| 659 | + /* could have the first part of the page tag... so copy up enough bytes to cover that case */ |
| 660 | + if (b->bytes_avail> 5) { |
| 661 | + /* write everything that didn't match, but leave 5 bytes, to stdout */ |
| 662 | + /* |
| 663 | + fwrite(b->next_to_read,b->bytes_avail - 5,1,stdout); |
| 664 | + */ |
| 665 | + move_bytes_to_buffer_start(b, b->next_to_read + b->bytes_avail - 5, 5); |
| 666 | + bfile.strm.next_out = (char *)b->next_to_fill; |
| 667 | + bfile.strm.avail_out = b->end - b->next_to_fill; |
| 668 | + } |
| 669 | + else { |
| 670 | + if (buffer_is_empty(b)) { |
| 671 | + bfile.strm.next_out = (char *)b->buffer; |
| 672 | + bfile.strm.avail_out = bfile.bufout_size; |
| 673 | + b->next_to_fill = b->buffer; /* empty */ |
| 674 | + } |
| 675 | + else { |
| 676 | + /* there were only 5 or less bytes so just save em don't write em to stdout */ |
| 677 | + move_bytes_to_buffer_start(b, b->next_to_read, b->bytes_avail); |
| 678 | + bfile.strm.next_out = (char *)b->next_to_fill; |
| 679 | + bfile.strm.avail_out = b->end - b->next_to_fill; |
| 680 | + } |
| 681 | + } |
| 682 | + } |
| 683 | + } |
| 684 | + } |
| 685 | + /* |
| 686 | + if (b->bytes_avail) { |
| 687 | + fwrite(b->next_to_read,b->bytes_avail,1,stdout); |
| 688 | + } |
| 689 | + */ |
| 690 | + return(0); |
| 691 | +} |
| 692 | + |
| 693 | +/* search for pageid in a bz2 file, given start and end offsets |
| 694 | + to search for |
| 695 | + we guess by the most boring method possible (shrink the |
| 696 | + interval according to the value found on the last guess, |
| 697 | + try midpoint of the new interval) |
| 698 | + multiple calls of this will get the job done. |
| 699 | + interval has left end = right end if search is complete. |
| 700 | + this function may return the previous guess and simply |
| 701 | + shrink the interval. |
| 702 | + note that a "match" means either that the pageid we find |
| 703 | + is smaller than the one the caller wants, or is equal. |
| 704 | + why? because then we can use the output for prefetch |
| 705 | + for xml dumps and be sure a specific page range is covered :-P |
| 706 | + |
| 707 | + return value from guess, or -1 on error. |
| 708 | + */ |
| 709 | +int do_iteration(iter_info_t *iinfo, int fin, page_info_t *pinfo) { |
| 710 | + int res; |
| 711 | + int new_position; |
| 712 | + int interval; |
| 713 | + |
| 714 | + /* |
| 715 | + last_position is somewhere in the interval, perhaps at an end |
| 716 | + last_value is the value we had at that position |
| 717 | + */ |
| 718 | + |
| 719 | + interval = (iinfo->right_end - iinfo->left_end)/2; |
| 720 | + if (interval == 0) { |
| 721 | + interval = 1; |
| 722 | + } |
| 723 | + /* fprintf(stderr,"interval size is %ld, left end %ld, right end %ld, last val %d\n",interval, iinfo->left_end, iinfo->right_end, iinfo->last_value); */ |
| 724 | + /* if we're this close, we'll check this value and be done with it */ |
| 725 | + if (iinfo->right_end -iinfo->left_end < 2) { |
| 726 | + new_position = iinfo->left_end; |
| 727 | + iinfo->right_end = iinfo->left_end; |
| 728 | + } |
| 729 | + else { |
| 730 | + if (iinfo->last_value < iinfo->value_wanted) { |
| 731 | + /* fprintf(stderr,"resetting left end\n"); */ |
| 732 | + iinfo->left_end = iinfo->last_position; |
| 733 | + new_position = iinfo->last_position + interval; |
| 734 | + } |
| 735 | + /* iinfo->last_value > iinfo->value_wanted */ |
| 736 | + else { |
| 737 | + /* fprintf(stderr,"resetting right end\n"); */ |
| 738 | + iinfo->right_end = iinfo->last_position; |
| 739 | + new_position = iinfo->last_position - interval; |
| 740 | + } |
| 741 | + } |
| 742 | + res = get_first_page_id_after_offset(fin, new_position, pinfo); |
| 743 | + if (res >0) { |
| 744 | + /* caller wants the new value */ |
| 745 | + iinfo->last_value = pinfo->page_id; |
| 746 | + iinfo->last_position = new_position; |
| 747 | + return(pinfo->page_id); |
| 748 | + } |
| 749 | + else { |
| 750 | + /* here is the tough case, if we didn't find anything then we are prolly too close to the end, truncation or |
| 751 | + there's just no block here. |
| 752 | + set the right end, keep the last value and position and let the caller retry with the new interval */ |
| 753 | + if (iinfo->last_value < iinfo->value_wanted) { /* we were moving towards eof */ |
| 754 | + iinfo->right_end = new_position; |
| 755 | + return(iinfo->last_value); |
| 756 | + } |
| 757 | + /* in theory we were moving towards beginning of file, should not have issues, so bail here */ |
| 758 | + else { |
| 759 | + /* fprintf(stderr,"something very broken, giving up\n"); */ |
| 760 | + return(-1); |
| 761 | + } |
| 762 | + } |
| 763 | +} |
| 764 | + |
| 765 | +/* |
| 766 | + given a bzipped and possibly truncated file, and a page id, |
| 767 | + hunt for the page id in the file; this assume that the |
| 768 | + bz2 header is intact and that page ids are steadily increasing |
| 769 | + throughout the file. |
| 770 | + |
| 771 | + writes the offset of the relevant block (from beginning of file) |
| 772 | + and the first pageid found in that block, to stdout |
| 773 | + |
| 774 | + format of output: |
| 775 | + position:xxxxx pageid:nnn |
| 776 | + |
| 777 | + returns: 0 on success, -1 on error |
| 778 | +*/ |
| 779 | +int main(int argc, char **argv) { |
| 780 | + int fin, position, res, interval, page_id, oldmarker, file_size; |
| 781 | + page_info_t pinfo; |
| 782 | + iter_info_t iinfo; |
| 783 | + |
| 784 | + if (argc != 3) { |
| 785 | + fprintf(stderr,"usage: %s infile id\n", argv[0]); |
| 786 | + exit(-1); |
| 787 | + } |
| 788 | + |
| 789 | + fin = open (argv[1], O_RDONLY); |
| 790 | + if (fin < 0) { |
| 791 | + fprintf(stderr,"failed to open file %s for read\n", argv[1]); |
| 792 | + exit(-1); |
| 793 | + } |
| 794 | + |
| 795 | + page_id = atoi(argv[2]); |
| 796 | + if (page_id <1) { |
| 797 | + fprintf(stderr,"please specify a page_id >= 1.\n"); |
| 798 | + fprintf(stderr,"usage: %s infile page_id\n", argv[0]); |
| 799 | + exit(-1); |
| 800 | + } |
| 801 | + |
| 802 | + file_size = get_file_size(fin); |
| 803 | + |
| 804 | + interval = file_size; |
| 805 | + position = 0; |
| 806 | + oldmarker = -1; |
| 807 | + pinfo.bits_shifted = -1; |
| 808 | + pinfo.position = -1; |
| 809 | + pinfo.page_id = -1; |
| 810 | + |
| 811 | + iinfo.left_end = 0; |
| 812 | + file_size = get_file_size(fin); |
| 813 | + iinfo.right_end = file_size; |
| 814 | + iinfo.value_wanted = page_id; |
| 815 | + |
| 816 | + res = get_first_page_id_after_offset(fin, 0, &pinfo); |
| 817 | + if (res > 0) { |
| 818 | + iinfo.last_value = pinfo.page_id; |
| 819 | + iinfo.last_position = 0; |
| 820 | + } |
| 821 | + else { |
| 822 | + fprintf(stderr,"failed to get anything useful from the beginning of the file even, bailing.\n"); |
| 823 | + exit(1); |
| 824 | + } |
| 825 | + if (pinfo.page_id == page_id) { |
| 826 | + fprintf(stdout,"position:%d page_id:%d\n",pinfo.position, pinfo.page_id); |
| 827 | + exit(0); |
| 828 | + } |
| 829 | + |
| 830 | + while (1) { |
| 831 | + res = do_iteration(&iinfo, fin, &pinfo); |
| 832 | + /* things to check: bad return? interval is 0 bytes long? */ |
| 833 | + if (iinfo.left_end == iinfo.right_end) { |
| 834 | + fprintf(stdout,"position:%d page_id:%d\n",pinfo.position, pinfo.page_id); |
| 835 | + exit(0); |
| 836 | + } |
| 837 | + else if (res < 0) { |
| 838 | + fprintf(stderr,"broken and quitting\n"); |
| 839 | + exit(-1); |
| 840 | + } |
| 841 | + } |
| 842 | + exit(0); |
| 843 | +} |
Property changes on: branches/ariel/xmldumps-backup/findpageidinbz2xml.c |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 844 | + native |
Index: branches/ariel/xmldumps-backup/findpageidinbz2xml.h |
— | — | @@ -0,0 +1,81 @@ |
| 2 | +#ifndef _FINDPAGEID_H |
| 3 | +#define _FINDPAGEID_H |
| 4 | + |
| 5 | +typedef struct { |
| 6 | + int page_id; /* first id in the block */ |
| 7 | + int bits_shifted; /* block is right shifted this many bits */ |
| 8 | + int position; /* position in file of block */ |
| 9 | +} page_info_t; |
| 10 | + |
| 11 | +#define BUFINSIZE 5000 |
| 12 | + |
| 13 | +/* |
| 14 | + keeps all information about a bzipped file |
| 15 | + plus input/output buffers for decompression |
| 16 | +*/ |
| 17 | +typedef struct { |
| 18 | + unsigned char bufin[BUFINSIZE]; /* compressed data read from file */ |
| 19 | + unsigned char *bufout; /* uncompressed data, must be allocated by caller */ |
| 20 | + unsigned char marker_buffer[7]; /* data to test for bz2 block marker */ |
| 21 | + unsigned char header_buffer[4]; /* first 4 bytes of file (bzip2 header) */ |
| 22 | + |
| 23 | + int bufin_size; /* size of input buffer for compressed data */ |
| 24 | + int bufout_size; /* size of output buffer for decompressed data, may vary at each call */ |
| 25 | + |
| 26 | + int initialized; /* whether bz2file has been initialized (header processed, seek to |
| 27 | + some bz2 block in the file and input buffer filled) */ |
| 28 | + int block_start; /* position of bz2 block in file from which we started to read (we |
| 29 | + read a sequence of bz2 blocks from a given position, this is |
| 30 | + the offset to the first one) */ |
| 31 | + |
| 32 | + bz_stream strm; /* stream structure for libbz2 */ |
| 33 | + unsigned char overflow; /* since decompressed bytes may not be bit aligned, we keep the last byte |
| 34 | + read around so we can grab the lower end bits off the end for |
| 35 | + sticking in front of the next pile of compressed bytes we read */ |
| 36 | + |
| 37 | + int bits_shifted; /* number of bits that the compressed data has been right shifted |
| 38 | + in the file (if the number is 0, the block marker and subsequent |
| 39 | + data is byte-aligned) */ |
| 40 | + unsigned char **marker; /* bzip2 start of block marker, plus bit-shifted versions of it for |
| 41 | + locating the marker in a stream of compressed data */ |
| 42 | + |
| 43 | + int position; /* current offset into file from start of file */ |
| 44 | + |
| 45 | + int bytes_read; /* number of bytes of compressed data read from file (per read) */ |
| 46 | + int bytes_written; /* number of bytes of decompressed data written into output buffer (per decompress) */ |
| 47 | + int eof; /* nonzero if eof reached */ |
| 48 | + int file_size; /* length of file, so we don't search past it for blocks */ |
| 49 | +} bz_info_t; |
| 50 | + |
| 51 | +#define MASKLEFT 0 |
| 52 | +#define MASKRIGHT 1 |
| 53 | + |
| 54 | +/* |
| 55 | + this output buffer is used to collect decompressed output. |
| 56 | + this is not a circular buffer; when it is full the user is |
| 57 | + responsible for emptying it completely or partially and moving |
| 58 | + to the beginning any unused bytes. |
| 59 | + |
| 60 | +*/ |
| 61 | +typedef struct { |
| 62 | + unsigned char *buffer; /* output storage, allocated by the caller */ |
| 63 | + unsigned char *next_to_read; /* pointer to the next byte in the buffer with data to be read */ |
| 64 | + unsigned char *next_to_fill; /* pointer to the next byte in the buffer which is empty and can receive data */ |
| 65 | + int bytes_avail; /* number of bytes available for reading */ |
| 66 | + unsigned char *end; /* points to byte after end of buffer */ |
| 67 | +} buf_info_t; |
| 68 | + |
| 69 | +/* |
| 70 | + used for each iteration of narrowing down the location in a bzipped2 file of |
| 71 | + a desired pageid, by finding first compressed block after a guessed |
| 72 | + position and checking the first pageid (if any) contained in it. |
| 73 | +*/ |
| 74 | +typedef struct { |
| 75 | + int left_end; /* left end of interval to search (bytes from start of file) */ |
| 76 | + int right_end; /* right end of interval to search */ |
| 77 | + int value_wanted; /* pageid desired */ |
| 78 | + int last_value; /* pageid we found in last iteration */ |
| 79 | + int last_position; /* position in file for last iteration */ |
| 80 | +} iter_info_t; |
| 81 | + |
| 82 | +#endif |
Property changes on: branches/ariel/xmldumps-backup/findpageidinbz2xml.h |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 83 | + native |