Index: branches/ariel/xmldumps-backup/dumplastbz2block.c |
— | — | @@ -0,0 +1,463 @@ |
| 2 | +#include <unistd.h> |
| 3 | +#include <stdio.h> |
| 4 | +#include <string.h> |
| 5 | +#include <sys/types.h> |
| 6 | +#include <sys/stat.h> |
| 7 | +#include <fcntl.h> |
| 8 | +#include <stdlib.h> |
| 9 | +#include <errno.h> |
| 10 | +#include "bzlib.h" |
| 11 | + |
/*
   Find the last bz2 block marker in a file
   and dump whatever can be decompressed after
   that point.  The header of the file must
   be intact in order for any output to be produced.
   This will produce output for truncated files as well,
   as long as there is "enough" data after the block
   marker.

   Arguments: the name of the file to check, presumably
   a bzipped file.
   Outputs: the decompressed data at the end of the file,
   written to stdout.
   Exits with 0 if decompression of some data can be done,
   and -1 on error.  (The original description also listed
   an exit status of 1 if decompression fails; as written,
   no code path exits with 1 -- failure to find any block
   that decompresses ends in one of the -1 error exits.)
*/
| 27 | + |
| 28 | +#define BUFSIZE 121072 |
| 29 | +typedef struct { |
| 30 | + unsigned char bufin[BUFSIZE]; |
| 31 | + unsigned char bufout[BUFSIZE]; |
| 32 | + int bufsize; |
| 33 | + bz_stream strm; |
| 34 | + unsigned char overflow; |
| 35 | + int bitsshifted; |
| 36 | + int position; |
| 37 | +} bzinfo; |
| 38 | + |
/*
   Read the last 11 bytes of the file into buffer.

   buffer: must have room for at least 11 bytes.
   fin:    open file descriptor to read from.

   Returns 0.  Exits the program with -1 if the seek or the read fails
   (the seek fails, for instance, when the file is shorter than 11 bytes).

   If the file delivers fewer than 11 bytes before end of file, the
   remainder of buffer is zero-filled so the caller never compares
   against uninitialised memory.
*/
int read_footer(unsigned char *buffer, int fin) {
    const size_t wanted = 11;
    size_t have = 0;
    ssize_t got;
    off_t where;

    /* lseek returns off_t; keeping it in an int would turn offsets of
       2GB and beyond into bogus negative "failures" */
    where = lseek(fin, (off_t) -11, SEEK_END);
    if (where == (off_t) -1) {
        fprintf(stderr,"lseek of file failed\n");
        exit(-1);
    }

    /* read may legitimately return fewer bytes than asked for */
    while (have < wanted) {
        got = read(fin, buffer + have, wanted - have);
        if (got < 0) {
            fprintf(stderr,"read of file failed\n");
            exit(-1);
        }
        if (got == 0) {
            break;
        }
        have += (size_t) got;
    }
    if (have < wanted) {
        memset(buffer + have, 0, wanted - have);
    }
    return(0);
}
| 54 | + |
#define LEFT 0
#define RIGHT 1

/*
   Build a mask of numbits consecutive one bits within a byte.

   numbits: how many one bits are wanted (callers in this file pass 0-8).
   end:     RIGHT puts the ones in the low-order bits; any other value
            (normally LEFT) puts them in the high-order bits of the byte.

   Example: bitmask(3,RIGHT) is 0x07, bitmask(3,LEFT) is 0xE0.
*/
int bitmask(int numbits, int end) {
    int ones = (1 << numbits) - 1;

    return (end == RIGHT) ? ones : (ones << (8 - numbits));
}
| 67 | + |
| 68 | +void shiftbytesleft(unsigned char *buffer, int buflen, int numbits) { |
| 69 | + int i; |
| 70 | + |
| 71 | + if (numbits == 0) { |
| 72 | + return; |
| 73 | + } |
| 74 | + |
| 75 | + for (i=0; i<buflen; i++) { |
| 76 | + /* left 1 */ |
| 77 | + buffer[i] = (unsigned char) ((int) (buffer[i]) << numbits); |
| 78 | + |
| 79 | + /* grab leftmost from next byte */ |
| 80 | + if (i < buflen-1) { |
| 81 | + buffer[i] = ( unsigned char ) ( (unsigned int) buffer[i] | ( ( ((unsigned int) buffer[i+1]) & bitmask(numbits,LEFT) ) >> (8-numbits) ) ); |
| 82 | + } |
| 83 | + } |
| 84 | +} |
| 85 | + |
| 86 | + |
| 87 | +void shiftbytesright(unsigned char *buffer, int buflen, int numbits) { |
| 88 | + int i; |
| 89 | + |
| 90 | + for (i=buflen-1; i>=0; i--) { |
| 91 | + /* right 1 */ |
| 92 | + buffer[i] = (unsigned char) ((int) (buffer[i]) >> numbits); |
| 93 | + |
| 94 | + /* grab rightmost from prev byte */ |
| 95 | + if (i > 0) { |
| 96 | + buffer[i] = ( unsigned char ) ((unsigned int) buffer[i] | ( ((unsigned int) (buffer[i-1])<<(8-numbits)) & bitmask(numbits,LEFT))); |
| 97 | + } |
| 98 | + } |
| 99 | +} |
| 100 | + |
/*
   Build the table of block marker patterns used when scanning the file.

   Returns an array of 8 pointers to 7-byte buffers.  Entry 0 holds the
   six block marker bytes 31 41 59 26 53 59 followed by a zero byte;
   entry i holds the same seven bytes shifted right by i bits, since a
   marker may begin at any bit offset within a byte.

   The table is heap-allocated and owned by the caller.  Exits the
   program with -1 if memory cannot be allocated.
*/
unsigned char ** init_marker() {
    static const unsigned char magic[7] = { 0x31, 0x41, 0x59, 0x26, 0x53, 0x59, 0x00 };
    unsigned char **marker;
    int i;

    marker = malloc(8 * sizeof *marker);
    if (marker == NULL) {
        fprintf(stderr,"failed to allocate memory for block marker table\n");
        exit(-1);
    }

    /* set up block marker plus its various right-shifted incarnations */
    for (i = 0; i < 8; i++) {
        marker[i] = malloc(sizeof magic);
        if (marker[i] == NULL) {
            fprintf(stderr,"failed to allocate memory for block marker table\n");
            exit(-1);
        }
        if (i == 0) {
            memcpy(marker[0], magic, sizeof magic);
        }
        else {
            /* each entry is the previous one moved one more bit to the right */
            memcpy(marker[i], marker[i-1], sizeof magic);
            shiftbytesright(marker[i], 7, 1);
        }
    }
    return(marker);
}
| 122 | + |
/*
   Build the table of footer patterns used to test whether the file
   ends with a bz2 footer.

   Returns an array of 8 pointers to 7-byte buffers.  Entry 0 holds the
   six footer bytes 17 72 45 38 50 90 followed by a zero byte; entry i
   holds the same seven bytes shifted right by i bits, since the footer
   is not byte-aligned and may begin at any bit offset within a byte.

   The table is heap-allocated and owned by the caller.  Exits the
   program with -1 if memory cannot be allocated.
*/
unsigned char ** init_footer() {
    static const unsigned char magic[7] = { 0x17, 0x72, 0x45, 0x38, 0x50, 0x90, 0x00 };
    unsigned char **footer;
    int i;

    footer = malloc(8 * sizeof *footer);
    if (footer == NULL) {
        fprintf(stderr,"failed to allocate memory for footer table\n");
        exit(-1);
    }

    /* set up footer plus its various right-shifted incarnations */
    for (i = 0; i < 8; i++) {
        footer[i] = malloc(sizeof magic);
        if (footer[i] == NULL) {
            fprintf(stderr,"failed to allocate memory for footer table\n");
            exit(-1);
        }
        if (i == 0) {
            memcpy(footer[0], magic, sizeof magic);
        }
        else {
            /* each entry is the previous one moved one more bit to the right */
            memcpy(footer[i], footer[i-1], sizeof magic);
            shiftbytesright(footer[i], 7, 1);
        }
    }
    return(footer);
}
| 145 | + |
| 146 | + |
/*
   Compare a pattern against bytes taken from the file.

   buff1:            the expected pattern, already shifted right by
                     bitsrightshifted bits
   buff2:            the bytes to test, which we expect to start with the
                     contents of buff1 at the same bit offset
   numbytes:         number of bytes to look at in each buffer
   bitsrightshifted: bit offset of the pattern (0-7)

   With a shift of 0 all numbytes bytes must be identical.  With a
   non-zero shift the pattern only partly covers the first and last
   bytes, so just the low (8 - shift) bits of byte 0 and the high
   shift bits of byte numbytes-1 are compared; every byte in between
   must be identical.

   Returns 0 if the buffers match and 1 if they differ (the same sense
   as memcmp: zero means equal).
*/
int bytescompare(unsigned char *buff1, unsigned char *buff2, int numbytes, int bitsrightshifted) {
    unsigned int firstmask, lastmask;
    int i;

    if (bitsrightshifted == 0) {
        for (i = 0; i < numbytes; i++) {
            if (buff1[i] != buff2[i]) {
                return(1);
            }
        }
        return(0);
    }

    /* interior bytes 1 .. numbytes-2 are wholly covered by the pattern;
       the loop must run up to and including numbytes-2 or a byte of
       the pattern goes unchecked */
    for (i = 1; i < numbytes-1; i++) {
        if (buff1[i] != buff2[i]) {
            return(1);
        }
    }

    /* leftmost byte: only the low (8 - shift) bits belong to the pattern */
    firstmask = 0xFFu >> bitsrightshifted;
    if ((buff1[0] & firstmask) != (buff2[0] & firstmask)) {
        return(1);
    }

    /* rightmost byte: only the high shift bits belong to the pattern */
    lastmask = (0xFFu << (8-bitsrightshifted)) & 0xFFu;
    if ((buff1[numbytes-1] & lastmask) != (buff2[numbytes-1] & lastmask)) {
        return(1);
    }
    return(0);
}
| 178 | + |
/*
   Test whether the file ends with a bz2 footer.

   fin:    open file descriptor of the file to test
   footer: table of the footer pattern at each of the 8 possible bit
           shifts, as produced by init_footer()

   The last 11 bytes of the file are fetched with read_footer().  An
   unshifted footer is looked for starting at the second of those bytes;
   a footer shifted right by 1-7 bits is looked for starting at the first.

   Returns the number of bits the footer is shifted right (0-7), or -1
   if no footer pattern matches.
*/
int checkfileforfooter(int fin, unsigned char **footer) {
    unsigned char tail[11];
    int shift;

    read_footer(tail,fin);

    /* byte-aligned case: six footer bytes starting at tail[1] */
    if (bytescompare(footer[0],tail+1,6,0) == 0) {
        return(0);
    }

    /* bit-shifted cases: pattern spills over seven bytes starting at tail[0] */
    shift = 1;
    while (shift < 8) {
        if (bytescompare(footer[shift],tail,7,shift) == 0) {
            return(shift);
        }
        shift++;
    }
    return(-1);
}
| 200 | + |
/*
   Test whether a 7-byte window contains a block marker.

   buffer: the 7 bytes to examine
   marker: table of the block marker pattern at each of the 8 possible
           bit shifts, as produced by init_marker()

   An unshifted marker is looked for starting at buffer[1]; a marker
   shifted right by 1-7 bits is looked for starting at buffer[0].

   Returns the number of bits the marker is shifted right (0-7), or -1
   if no marker pattern matches.
*/
int checkbufferforblockmarker(unsigned char *buffer, unsigned char **marker) {
    int shift;

    /* byte-aligned case: six marker bytes starting at buffer[1] */
    if (bytescompare(marker[0],buffer+1,6,0) == 0) {
        return(0);
    }

    /* bit-shifted cases: pattern spills over seven bytes starting at buffer[0] */
    shift = 1;
    while (shift < 8) {
        if (bytescompare(marker[shift],buffer,7,shift) == 0) {
            return(shift);
        }
        shift++;
    }
    return(-1);
}
| 218 | + |
/*
   Set the first length bytes of buf to zero.
   A length of zero or less leaves buf untouched.
*/
void clearbuffer(unsigned char *buf, int length) {
    if (length > 0) {
        memset(buf, 0, (size_t) length);
    }
}
| 227 | + |
/*
   Scan backwards through the file, one byte at a time, until a block
   marker is found in the 7-byte window held in buffer.

   fin:      open file descriptor of the file being scanned
   start_at: in/out; distance in bytes from the end of the file of the
             current window.  Incremented for every step backwards.
   position: in/out; file offset of the current window, updated from
             lseek on every step.
   marker:   table of shifted marker patterns from init_marker()
   buffer:   in/out; the current 7-byte window.  On entry it holds the
             bytes to test first; it is refilled on every step.

   Returns the number of bits the marker is shifted right (0-7) when
   one is found, leaving *position at the offset of the matching window.
   Returns -1 when the scan gets to within the first 3 bytes of the
   file without a match.  Exits the program with -1 if a seek or read
   fails.

   NOTE(review): the offset is carried in an int, so this presumably
   cannot handle offsets beyond INT_MAX -- confirm before using on
   very large files.
*/
int findnextmarker(int fin, int *start_at, int *position, unsigned char **marker, unsigned char *buffer ) {
    int shift = -1;

    /* must be after 4 byte file header, and we add a leftmost byte to the buffer
       of data read in case some bits have been shifted into it */
    while (*position >= 3) {
        shift = checkbufferforblockmarker(buffer, marker);
        if (shift >= 0) {
            break;
        }

        /* no marker here: slide the window one byte towards the start of the file */
        (*start_at)++;
        *position = lseek(fin, -1*(*start_at), SEEK_END);
        if (*position < 0) {
            fprintf(stderr,"lseek of file failed\n");
            exit(-1);
        }
        if (read(fin, buffer, 7) < 0) {
            fprintf(stderr,"read of file failed\n");
            exit(-1);
        }
    }
    return(shift);
}
| 260 | + |
| 261 | +int init_decompress(bzinfo *bfile) { |
| 262 | + int bz_verbosity = 0; |
| 263 | + int bz_small = 0; |
| 264 | + int ret; |
| 265 | + |
| 266 | + bfile->strm.bzalloc = NULL; |
| 267 | + bfile->strm.bzfree = NULL; |
| 268 | + bfile->strm.opaque = NULL; |
| 269 | + |
| 270 | + ret = BZ2_bzDecompressInit ( &(bfile->strm), bz_verbosity, bz_small ); |
| 271 | + if (ret != BZ_OK) { |
| 272 | + fprintf(stderr,"uncompress failed, err %d\n", ret); |
| 273 | + exit(-1); |
| 274 | + } |
| 275 | + return(ret); |
| 276 | +} |
| 277 | + |
| 278 | +int decompress_header(int fin, bzinfo *bfile) { |
| 279 | + int bytesread, ret; |
| 280 | + unsigned char header[4]; |
| 281 | + |
| 282 | + lseek(fin,0,SEEK_SET); |
| 283 | + bytesread = read(fin, header, 4); |
| 284 | + if (bytesread < 4) { |
| 285 | + fprintf(stderr,"failed to read 4 bytes of header, exiting\n"); |
| 286 | + exit(-1); |
| 287 | + } |
| 288 | + bfile->strm.next_in = (char *)header; |
| 289 | + bfile->strm.avail_in = 4; |
| 290 | + |
| 291 | + bfile->strm.next_out = (char *)(bfile->bufout); |
| 292 | + bfile->strm.avail_out = bfile->bufsize; |
| 293 | + ret = BZ2_bzDecompress ( &(bfile->strm) ); |
| 294 | + if (BZ_OK != ret && BZ_STREAM_END != ret) { |
| 295 | + fprintf(stderr,"Corrupt bzip2 header, exiting\n"); |
| 296 | + exit(-1); |
| 297 | + } |
| 298 | + return(ret); |
| 299 | +} |
| 300 | + |
| 301 | +int setup_first_buffer(int fin, bzinfo *bfile) { |
| 302 | + int bytesread, eof=0; |
| 303 | + |
| 304 | + if (bfile->bitsshifted == 0) { |
| 305 | + lseek(fin,bfile->position+1,SEEK_SET); |
| 306 | + } |
| 307 | + else { |
| 308 | + lseek(fin,bfile->position,SEEK_SET); |
| 309 | + } |
| 310 | + bytesread = read(fin, bfile->bufin, bfile->bufsize); |
| 311 | + if (bytesread > 0) { |
| 312 | + bfile->overflow = bfile->bufin[bytesread-1]; |
| 313 | + shiftbytesleft(bfile->bufin,bytesread,bfile->bitsshifted); |
| 314 | + |
| 315 | + bfile->strm.next_in = (char *)(bfile->bufin); |
| 316 | + bfile->strm.avail_in = bytesread-1; |
| 317 | + |
| 318 | + bfile->strm.next_out = (char *)(bfile->bufout); |
| 319 | + bfile->strm.avail_out = bfile->bufsize; |
| 320 | + } |
| 321 | + if (bytesread <=0) { |
| 322 | + eof++; |
| 323 | + } |
| 324 | + return(eof); |
| 325 | +} |
| 326 | + |
| 327 | +int do_last_byte(bzinfo *bfile) { |
| 328 | + int ret=BZ_OK; |
| 329 | + int written; |
| 330 | + |
| 331 | + if (bfile->strm.avail_in == 0) { |
| 332 | + bfile->strm.next_in = (char *)(bfile->bufin); |
| 333 | + bfile->bufin[0] = bfile->overflow; |
| 334 | + shiftbytesleft(bfile->bufin,1,bfile->bitsshifted); |
| 335 | + bfile->strm.avail_in = 1; |
| 336 | + bfile->strm.next_out = (char *)(bfile->bufout); |
| 337 | + bfile->strm.avail_out = bfile->bufsize; |
| 338 | + ret = BZ2_bzDecompress ( &(bfile->strm) ); |
| 339 | + if (BZ_OK == ret || BZ_STREAM_END == ret) { |
| 340 | + written = fwrite(bfile->bufout, sizeof(unsigned char), (unsigned char *)bfile->strm.next_out - bfile->bufout, stdout); |
| 341 | + } |
| 342 | + } |
| 343 | + return(ret); |
| 344 | +} |
| 345 | + |
| 346 | +int read_next_buffer(int fin, bzinfo *bfile, int ret) { |
| 347 | + int bytesread, eof=0; |
| 348 | + |
| 349 | + /* fprintf(stderr," got return from decompress of %d\n", ret); */ |
| 350 | + |
| 351 | + if (bfile->strm.avail_in == 0) { |
| 352 | + bfile->strm.next_in = (char *)(bfile->bufin); |
| 353 | + bfile->bufin[0] = bfile->overflow; |
| 354 | + bytesread = read(fin, bfile->bufin+1, bfile->bufsize-1); |
| 355 | + if (bytesread > 0) { |
| 356 | + bfile->overflow = bfile->bufin[bytesread]; |
| 357 | + shiftbytesleft(bfile->bufin,bytesread+1,bfile->bitsshifted); |
| 358 | + bfile->strm.avail_in = bytesread; |
| 359 | + } |
| 360 | + else { |
| 361 | + eof++; |
| 362 | + bfile->strm.avail_in = 0; |
| 363 | + } |
| 364 | + } |
| 365 | + bfile->strm.next_out = (char *)(bfile->bufout); |
| 366 | + bfile->strm.avail_out = bfile->bufsize; |
| 367 | + |
| 368 | + return(eof); |
| 369 | +} |
| 370 | + |
| 371 | + |
| 372 | +int main(int argc, char **argv) { |
| 373 | + |
| 374 | + bzinfo bfile; |
| 375 | + |
| 376 | + int fin; |
| 377 | + int result, ret; |
| 378 | + unsigned char buffer[8]; |
| 379 | + |
| 380 | + unsigned char **footer; |
| 381 | + unsigned char **marker; |
| 382 | + |
| 383 | + int written=0; |
| 384 | + int start_at; |
| 385 | + |
| 386 | + int eof = 0; |
| 387 | + |
| 388 | + if (argc != 2) { |
| 389 | + fprintf(stderr,"usage: %s infile\n", argv[0]); |
| 390 | + exit(-1); |
| 391 | + } |
| 392 | + |
| 393 | + marker = init_marker(); |
| 394 | + footer = init_footer(); |
| 395 | + |
| 396 | + fin = open (argv[1], O_RDONLY); |
| 397 | + if (fin < 0) { |
| 398 | + fprintf(stderr,"failed to open file %s for read\n", argv[1]); |
| 399 | + exit(-1); |
| 400 | + } |
| 401 | + |
| 402 | + bfile.bufsize = BUFSIZE; |
| 403 | + |
| 404 | + result = checkfileforfooter(fin, footer); |
| 405 | + if (result == -1) { |
| 406 | + start_at = 0; |
| 407 | + } |
| 408 | + else { |
| 409 | + start_at = 11; /* size of footer, perhaps with 1 byte extra */ |
| 410 | + } |
| 411 | + start_at +=6; /* size of marker */ |
| 412 | + bfile.position = lseek(fin, -1*start_at, SEEK_END); |
| 413 | + if (bfile.position < 0) { |
| 414 | + fprintf(stderr,"lseek of file failed\n"); |
| 415 | + exit(-1); |
| 416 | + } |
| 417 | + result = read(fin, buffer, 7); |
| 418 | + if (result < 0) { |
| 419 | + fprintf(stderr,"read of file failed\n"); |
| 420 | + exit(-1); |
| 421 | + } |
| 422 | + |
| 423 | + while (1) { |
| 424 | + |
| 425 | + bfile.bitsshifted = findnextmarker(fin, &start_at, &bfile.position, marker, buffer); |
| 426 | + if (bfile.bitsshifted >= 0) { |
| 427 | + /* fprintf(stderr, "found marker at pos %d and shifted %d, start_at is %d\n", bfile.position, bfile.bitsshifted, start_at); */ |
| 428 | + ret = init_decompress(&bfile); |
| 429 | + |
| 430 | + /* pass in the header */ |
| 431 | + ret = decompress_header(fin,&bfile); |
| 432 | + |
| 433 | + eof = setup_first_buffer(fin, &bfile); |
| 434 | + |
| 435 | + while (BZ_OK == ret && !eof) { |
| 436 | + ret = BZ2_bzDecompress ( &(bfile.strm) ); |
| 437 | + if (BZ_OK == ret || BZ_STREAM_END == ret) { |
| 438 | + written += fwrite(bfile.bufout, sizeof(unsigned char), (unsigned char *)(bfile.strm.next_out) - bfile.bufout, stdout); |
| 439 | + } |
| 440 | + eof = read_next_buffer(fin, &bfile, ret); |
| 441 | + } |
| 442 | + if (BZ_OK == ret || BZ_STREAM_END == ret ) { |
| 443 | + /* so we read no bytes, process the last byte we held */ |
| 444 | + do_last_byte(&bfile); |
| 445 | + } |
| 446 | + if (written == 0) { |
| 447 | + /* truncated block or other corruption, try going back one */ |
| 448 | + start_at +=5; |
| 449 | + clearbuffer(buffer,sizeof(buffer)); |
| 450 | + continue; |
| 451 | + } |
| 452 | + else { |
| 453 | + break; |
| 454 | + } |
| 455 | + } |
| 456 | + else { |
| 457 | + fprintf(stderr,"no block marker in this file.\n"); |
| 458 | + exit(-1); |
| 459 | + } |
| 460 | + } |
| 461 | + close(fin); |
| 462 | + exit(0); |
| 463 | +} |
| 464 | + |
Property changes on: branches/ariel/xmldumps-backup/dumplastbz2block.c |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 465 | + native |