r91271 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r91270‎ | r91271 | r91272 >
Date:16:45, 1 July 2011
Author:ariel
Status:deferred
Tags:
Comment:
dumps the last salvageable block of a truncated (or intact) bz2 file to stdout
Modified paths:
  • /branches/ariel/xmldumps-backup/dumplastbz2block.c (added) (history)

Diff [purge]

Index: branches/ariel/xmldumps-backup/dumplastbz2block.c
@@ -0,0 +1,463 @@
 2+#include <unistd.h>
 3+#include <stdio.h>
 4+#include <string.h>
 5+#include <sys/types.h>
 6+#include <sys/stat.h>
 7+#include <fcntl.h>
 8+#include <stdlib.h>
 9+#include <errno.h>
 10+#include "bzlib.h"
 11+
 12+/*
 13+ Find the last bz2 block marker in a file
 14+ and dump whatever can be decompressed after
 15+ that point. The header of the file must
 16+ be intact in order for any output to be produced.
 17+ This will produce output for truncated files as well,
 18+ as long as there is "enough" data after the block
 19+ marker.
 20+
 21+ Arguments: the name of the file to check, presumably
 22+ a bzipped file.
 23+ Outputs: the decompressed data at the end of the file.
 24+ Exits with 0 if some data could be decompressed and written out,
 25+ and with -1 on any error, including when no usable block marker is found.
 26+*/
 27+
 28+#define BUFSIZE 121072
 29+typedef struct {
 30+ unsigned char bufin[BUFSIZE];
 31+ unsigned char bufout[BUFSIZE];
 32+ int bufsize;
 33+ bz_stream strm;
 34+ unsigned char overflow;
 35+ int bitsshifted;
 36+ int position;
 37+} bzinfo;
 38+
/*
 * Read the final 11 bytes of the file open on descriptor fin into buffer.
 *
 * buffer must have room for at least 11 bytes.
 * Returns 0 on success. On any failure (seek error, read error, or fewer
 * than 11 bytes obtained) a message is printed to stderr and the process
 * exits with -1.
 */
int read_footer(unsigned char *buffer, int fin) {
    off_t where;
    ssize_t got;

    /* keep the result in an off_t: the offset of a large file does not fit
       in an int and would otherwise look like a failure after truncation */
    where = lseek(fin, -11, SEEK_END);
    if (where == (off_t) -1) {
        fprintf(stderr,"lseek of file failed\n");
        exit(-1);
    }
    got = read(fin, buffer, 11);
    if (got != 11) {
        /* error, or a short read that would leave part of buffer unset */
        fprintf(stderr,"read of file failed\n");
        exit(-1);
    }
    return(0);
}
 54+
/* which end of a byte a mask should cover */
#define LEFT 0
#define RIGHT 1

/*
 * Build a mask of numbits consecutive one bits within a byte.
 *
 * end == RIGHT puts the ones in the low-order bits; any other value puts
 * them in the high-order bits of an 8-bit value.
 * numbits is expected to be in the range 0..8.
 */
int bitmask(int numbits, int end) {
    int ones = (1 << numbits) - 1;

    return (end == RIGHT) ? ones : (ones << (8 - numbits));
}
 67+
 68+void shiftbytesleft(unsigned char *buffer, int buflen, int numbits) {
 69+ int i;
 70+
 71+ if (numbits == 0) {
 72+ return;
 73+ }
 74+
 75+ for (i=0; i<buflen; i++) {
 76+ /* left 1 */
 77+ buffer[i] = (unsigned char) ((int) (buffer[i]) << numbits);
 78+
 79+ /* grab leftmost from next byte */
 80+ if (i < buflen-1) {
 81+ buffer[i] = ( unsigned char ) ( (unsigned int) buffer[i] | ( ( ((unsigned int) buffer[i+1]) & bitmask(numbits,LEFT) ) >> (8-numbits) ) );
 82+ }
 83+ }
 84+}
 85+
 86+
 87+void shiftbytesright(unsigned char *buffer, int buflen, int numbits) {
 88+ int i;
 89+
 90+ for (i=buflen-1; i>=0; i--) {
 91+ /* right 1 */
 92+ buffer[i] = (unsigned char) ((int) (buffer[i]) >> numbits);
 93+
 94+ /* grab rightmost from prev byte */
 95+ if (i > 0) {
 96+ buffer[i] = ( unsigned char ) ((unsigned int) buffer[i] | ( ((unsigned int) (buffer[i-1])<<(8-numbits)) & bitmask(numbits,LEFT)));
 97+ }
 98+ }
 99+}
 100+
/*
 * Build the table of block-marker patterns used when scanning the file.
 *
 * Returns an array of 8 pointers to 7-byte patterns: entry 0 is the 6-byte
 * block marker 0x314159265359 followed by a zero byte, and entry i is that
 * pattern shifted right by i bits (bzip2 blocks are not byte aligned, so
 * the marker can start at any bit offset).
 *
 * The table is heap allocated and never freed by this program. On
 * allocation failure a message is printed to stderr and the process exits
 * with -1.
 */
unsigned char ** init_marker() {
    static const unsigned char magic[7] = { 0x31, 0x41, 0x59, 0x26, 0x53, 0x59, 0x00 };
    unsigned char **marker;
    int i;

    marker = malloc(8 * sizeof *marker);
    if (marker == NULL) {
        fprintf(stderr,"failed to allocate memory for block marker table\n");
        exit(-1);
    }

    for (i = 0; i < 8; i++) {
        marker[i] = malloc(sizeof magic);
        if (marker[i] == NULL) {
            fprintf(stderr,"failed to allocate memory for block marker table\n");
            exit(-1);
        }
        if (i == 0) {
            memcpy(marker[0], magic, sizeof magic);
        }
        else {
            /* each entry is the previous one moved one more bit to the right */
            memcpy(marker[i], marker[i-1], sizeof magic);
            shiftbytesright(marker[i], 7, 1);
        }
    }
    return(marker);
}
 122+
/*
 * Build the table of end-of-stream (footer) patterns used to recognise an
 * intact bzip2 file.
 *
 * Returns an array of 8 pointers to 7-byte patterns: entry 0 is the 6-byte
 * footer magic 0x177245385090 followed by a zero byte, and entry i is that
 * pattern shifted right by i bits. All eight variants are needed because
 * the footer follows the last block directly, without padding to a byte
 * boundary.
 *
 * The table is heap allocated and never freed by this program. On
 * allocation failure a message is printed to stderr and the process exits
 * with -1.
 */
unsigned char ** init_footer() {
    static const unsigned char magic[7] = { 0x17, 0x72, 0x45, 0x38, 0x50, 0x90, 0x00 };
    unsigned char **footer;
    int i;

    footer = malloc(8 * sizeof *footer);
    if (footer == NULL) {
        fprintf(stderr,"failed to allocate memory for footer table\n");
        exit(-1);
    }

    for (i = 0; i < 8; i++) {
        footer[i] = malloc(sizeof magic);
        if (footer[i] == NULL) {
            fprintf(stderr,"failed to allocate memory for footer table\n");
            exit(-1);
        }
        if (i == 0) {
            memcpy(footer[0], magic, sizeof magic);
        }
        else {
            /* each entry is the previous one moved one more bit to the right */
            memcpy(footer[i], footer[i-1], sizeof magic);
            shiftbytesright(footer[i], 7, 1);
        }
    }
    return(footer);
}
 145+
 146+
/*
 * Compare a bit pattern (buff1) against file data (buff2).
 *
 * Both buffers hold numbytes bytes. When bitsrightshifted is 0 every byte is
 * compared in full. Otherwise the pattern is taken to start bitsrightshifted
 * bits into the first byte and to end bitsrightshifted bits into the last
 * byte, so:
 *   - only the low (8 - bitsrightshifted) bits of the first byte count,
 *   - every byte in between is compared in full,
 *   - only the high bitsrightshifted bits of the last byte count.
 * bitsrightshifted is expected to be in the range 0..7.
 *
 * Returns 0 if the buffers match and 1 if they differ (like a boolean
 * "differs" flag; callers test for !result).
 */
int bytescompare(unsigned char *buff1, unsigned char *buff2, int numbytes, int bitsrightshifted) {
    int i;
    unsigned int firstmask, lastmask;

    if (bitsrightshifted == 0) {
        for (i = 0; i < numbytes; i++) {
            if (buff1[i] != buff2[i]) {
                return(1);
            }
        }
        return(0);
    }

    /* interior bytes: indices 1 .. numbytes-2 inclusive are entirely pattern */
    for (i = 1; i < numbytes-1; i++) {
        if (buff1[i] != buff2[i]) {
            return(1);
        }
    }

    /* leftmost byte: its high bits belong to whatever precedes the pattern */
    firstmask = 0xFFu >> bitsrightshifted;
    if ((buff1[0] & firstmask) != (buff2[0] & firstmask)) {
        return(1);
    }

    /* rightmost byte: its low bits belong to whatever follows the pattern */
    lastmask = (0xFFu << (8 - bitsrightshifted)) & 0xFFu;
    if ((buff1[numbytes-1] & lastmask) != (buff2[numbytes-1] & lastmask)) {
        return(1);
    }
    return(0);
}
 178+
/*
 * Check whether the file open on fin ends with a bzip2 footer.
 *
 * The last 11 bytes of the file are read and compared against each entry
 * of the footer table (as built by init_footer): the unshifted pattern is
 * looked for starting at the second byte, the shifted patterns starting at
 * the first byte.
 *
 * Returns the number of bits (0-7) by which the matching footer is shifted
 * to the right, or -1 if none of the patterns match.
 */
int checkfileforfooter(int fin, unsigned char **footer) {
    unsigned char tail[11];
    int shift;

    read_footer(tail,fin);

    /* byte-aligned footer */
    if (bytescompare(footer[0],tail+1,6,0) == 0) {
        return(0);
    }

    /* footer starting 1..7 bits into the first byte */
    for (shift = 1; shift < 8; shift++) {
        if (bytescompare(footer[shift],tail,7,shift) == 0) {
            return(shift);
        }
    }
    return(-1);
}
 200+
/*
 * Check whether the 7 bytes at buffer contain a bzip2 block marker.
 *
 * The unshifted marker is looked for in bytes 1..6 of buffer; the markers
 * shifted right by 1..7 bits are looked for in bytes 0..6. marker is the
 * table built by init_marker.
 *
 * Returns the number of bits (0-7) by which the matching marker is shifted
 * to the right, or -1 if none of the patterns match.
 */
int checkbufferforblockmarker(unsigned char *buffer, unsigned char **marker) {
    int shift;

    /* byte-aligned marker */
    if (bytescompare(marker[0],buffer+1,6,0) == 0) {
        return(0);
    }

    /* marker starting 1..7 bits into the first byte */
    for (shift = 1; shift < 8; shift++) {
        if (bytescompare(marker[shift],buffer,7,shift) == 0) {
            return(shift);
        }
    }
    return(-1);
}
 218+
/*
 * Set the first length bytes of buf to zero.
 * A NULL buf or a non-positive length is a no-op.
 */
void clearbuffer(unsigned char *buf, int length) {
    if (buf != NULL && length > 0) {
        memset(buf, 0, (size_t) length);
    }
}
 227+
/*
 * Scan backwards through the file, one byte at a time, for a block marker.
 *
 * fin       descriptor of the file being scanned
 * start_at  in/out: distance in bytes from the end of the file to the start
 *           of the 7-byte window currently held in buffer; incremented for
 *           every step backwards
 * position  in/out: absolute file offset of that window, as returned by lseek
 * marker    pattern table built by init_marker
 * buffer    in/out: the 7-byte window; on entry it must already hold the
 *           bytes at *position (or zeros, to force a step backwards first)
 *
 * The window in buffer is tested first; while it does not match and
 * *position is still at least 3 (i.e. past the 4-byte file header, allowing
 * for the extra leading byte in the window), the window is moved one byte
 * towards the start of the file and re-read.
 *
 * Returns the number of bits (0-7) by which the marker found is shifted to
 * the right, or -1 if the scan reached the header without finding one.
 * On seek/read failure, or if the offset cannot be represented in *position,
 * a message is printed to stderr and the process exits with -1.
 */
int findnextmarker(int fin, int *start_at, int *position, unsigned char **marker, unsigned char *buffer ) {
    int bitsshifted = -1;
    off_t where;
    ssize_t got;

    while (*position >= 3) {
        bitsshifted = checkbufferforblockmarker(buffer, marker);
        if (bitsshifted >= 0) {
            break;
        }

        (*start_at)++;
        where = lseek(fin, -1*(off_t)(*start_at), SEEK_END);
        if (where == (off_t) -1) {
            fprintf(stderr,"lseek of file failed\n");
            exit(-1);
        }
        /* ~0U >> 1 is the largest int; anything beyond it would be silently
           truncated when stored in *position and send later seeks astray */
        if (where > (off_t) (~0U >> 1)) {
            fprintf(stderr,"file too large: offset does not fit in an int\n");
            exit(-1);
        }
        *position = (int) where;

        got = read(fin, buffer, 7);
        if (got < 0) {
            fprintf(stderr,"read of file failed\n");
            exit(-1);
        }
        if (got < 7) {
            /* near the end of the file: do not let stale bytes from the
               previous window take part in the next comparison */
            memset(buffer + got, 0, (size_t) (7 - got));
        }
    }
    return(bitsshifted);
}
 260+
 261+int init_decompress(bzinfo *bfile) {
 262+ int bz_verbosity = 0;
 263+ int bz_small = 0;
 264+ int ret;
 265+
 266+ bfile->strm.bzalloc = NULL;
 267+ bfile->strm.bzfree = NULL;
 268+ bfile->strm.opaque = NULL;
 269+
 270+ ret = BZ2_bzDecompressInit ( &(bfile->strm), bz_verbosity, bz_small );
 271+ if (ret != BZ_OK) {
 272+ fprintf(stderr,"uncompress failed, err %d\n", ret);
 273+ exit(-1);
 274+ }
 275+ return(ret);
 276+}
 277+
 278+int decompress_header(int fin, bzinfo *bfile) {
 279+ int bytesread, ret;
 280+ unsigned char header[4];
 281+
 282+ lseek(fin,0,SEEK_SET);
 283+ bytesread = read(fin, header, 4);
 284+ if (bytesread < 4) {
 285+ fprintf(stderr,"failed to read 4 bytes of header, exiting\n");
 286+ exit(-1);
 287+ }
 288+ bfile->strm.next_in = (char *)header;
 289+ bfile->strm.avail_in = 4;
 290+
 291+ bfile->strm.next_out = (char *)(bfile->bufout);
 292+ bfile->strm.avail_out = bfile->bufsize;
 293+ ret = BZ2_bzDecompress ( &(bfile->strm) );
 294+ if (BZ_OK != ret && BZ_STREAM_END != ret) {
 295+ fprintf(stderr,"Corrupt bzip2 header, exiting\n");
 296+ exit(-1);
 297+ }
 298+ return(ret);
 299+}
 300+
 301+int setup_first_buffer(int fin, bzinfo *bfile) {
 302+ int bytesread, eof=0;
 303+
 304+ if (bfile->bitsshifted == 0) {
 305+ lseek(fin,bfile->position+1,SEEK_SET);
 306+ }
 307+ else {
 308+ lseek(fin,bfile->position,SEEK_SET);
 309+ }
 310+ bytesread = read(fin, bfile->bufin, bfile->bufsize);
 311+ if (bytesread > 0) {
 312+ bfile->overflow = bfile->bufin[bytesread-1];
 313+ shiftbytesleft(bfile->bufin,bytesread,bfile->bitsshifted);
 314+
 315+ bfile->strm.next_in = (char *)(bfile->bufin);
 316+ bfile->strm.avail_in = bytesread-1;
 317+
 318+ bfile->strm.next_out = (char *)(bfile->bufout);
 319+ bfile->strm.avail_out = bfile->bufsize;
 320+ }
 321+ if (bytesread <=0) {
 322+ eof++;
 323+ }
 324+ return(eof);
 325+}
 326+
 327+int do_last_byte(bzinfo *bfile) {
 328+ int ret=BZ_OK;
 329+ int written;
 330+
 331+ if (bfile->strm.avail_in == 0) {
 332+ bfile->strm.next_in = (char *)(bfile->bufin);
 333+ bfile->bufin[0] = bfile->overflow;
 334+ shiftbytesleft(bfile->bufin,1,bfile->bitsshifted);
 335+ bfile->strm.avail_in = 1;
 336+ bfile->strm.next_out = (char *)(bfile->bufout);
 337+ bfile->strm.avail_out = bfile->bufsize;
 338+ ret = BZ2_bzDecompress ( &(bfile->strm) );
 339+ if (BZ_OK == ret || BZ_STREAM_END == ret) {
 340+ written = fwrite(bfile->bufout, sizeof(unsigned char), (unsigned char *)bfile->strm.next_out - bfile->bufout, stdout);
 341+ }
 342+ }
 343+ return(ret);
 344+}
 345+
 346+int read_next_buffer(int fin, bzinfo *bfile, int ret) {
 347+ int bytesread, eof=0;
 348+
 349+ /* fprintf(stderr," got return from decompress of %d\n", ret); */
 350+
 351+ if (bfile->strm.avail_in == 0) {
 352+ bfile->strm.next_in = (char *)(bfile->bufin);
 353+ bfile->bufin[0] = bfile->overflow;
 354+ bytesread = read(fin, bfile->bufin+1, bfile->bufsize-1);
 355+ if (bytesread > 0) {
 356+ bfile->overflow = bfile->bufin[bytesread];
 357+ shiftbytesleft(bfile->bufin,bytesread+1,bfile->bitsshifted);
 358+ bfile->strm.avail_in = bytesread;
 359+ }
 360+ else {
 361+ eof++;
 362+ bfile->strm.avail_in = 0;
 363+ }
 364+ }
 365+ bfile->strm.next_out = (char *)(bfile->bufout);
 366+ bfile->strm.avail_out = bfile->bufsize;
 367+
 368+ return(eof);
 369+}
 370+
 371+
 372+int main(int argc, char **argv) {
 373+
 374+ bzinfo bfile;
 375+
 376+ int fin;
 377+ int result, ret;
 378+ unsigned char buffer[8];
 379+
 380+ unsigned char **footer;
 381+ unsigned char **marker;
 382+
 383+ int written=0;
 384+ int start_at;
 385+
 386+ int eof = 0;
 387+
 388+ if (argc != 2) {
 389+ fprintf(stderr,"usage: %s infile\n", argv[0]);
 390+ exit(-1);
 391+ }
 392+
 393+ marker = init_marker();
 394+ footer = init_footer();
 395+
 396+ fin = open (argv[1], O_RDONLY);
 397+ if (fin < 0) {
 398+ fprintf(stderr,"failed to open file %s for read\n", argv[1]);
 399+ exit(-1);
 400+ }
 401+
 402+ bfile.bufsize = BUFSIZE;
 403+
 404+ result = checkfileforfooter(fin, footer);
 405+ if (result == -1) {
 406+ start_at = 0;
 407+ }
 408+ else {
 409+ start_at = 11; /* size of footer, perhaps with 1 byte extra */
 410+ }
 411+ start_at +=6; /* size of marker */
 412+ bfile.position = lseek(fin, -1*start_at, SEEK_END);
 413+ if (bfile.position < 0) {
 414+ fprintf(stderr,"lseek of file failed\n");
 415+ exit(-1);
 416+ }
 417+ result = read(fin, buffer, 7);
 418+ if (result < 0) {
 419+ fprintf(stderr,"read of file failed\n");
 420+ exit(-1);
 421+ }
 422+
 423+ while (1) {
 424+
 425+ bfile.bitsshifted = findnextmarker(fin, &start_at, &bfile.position, marker, buffer);
 426+ if (bfile.bitsshifted >= 0) {
 427+ /* fprintf(stderr, "found marker at pos %d and shifted %d, start_at is %d\n", bfile.position, bfile.bitsshifted, start_at); */
 428+ ret = init_decompress(&bfile);
 429+
 430+ /* pass in the header */
 431+ ret = decompress_header(fin,&bfile);
 432+
 433+ eof = setup_first_buffer(fin, &bfile);
 434+
 435+ while (BZ_OK == ret && !eof) {
 436+ ret = BZ2_bzDecompress ( &(bfile.strm) );
 437+ if (BZ_OK == ret || BZ_STREAM_END == ret) {
 438+ written += fwrite(bfile.bufout, sizeof(unsigned char), (unsigned char *)(bfile.strm.next_out) - bfile.bufout, stdout);
 439+ }
 440+ eof = read_next_buffer(fin, &bfile, ret);
 441+ }
 442+ if (BZ_OK == ret || BZ_STREAM_END == ret ) {
 443+ /* so we read no bytes, process the last byte we held */
 444+ do_last_byte(&bfile);
 445+ }
 446+ if (written == 0) {
 447+ /* truncated block or other corruption, try going back one */
 448+ start_at +=5;
 449+ clearbuffer(buffer,sizeof(buffer));
 450+ continue;
 451+ }
 452+ else {
 453+ break;
 454+ }
 455+ }
 456+ else {
 457+ fprintf(stderr,"no block marker in this file.\n");
 458+ exit(-1);
 459+ }
 460+ }
 461+ close(fin);
 462+ exit(0);
 463+}
 464+
Property changes on: branches/ariel/xmldumps-backup/dumplastbz2block.c
___________________________________________________________________
Added: svn:eol-style
1465 + native

Status & tagging log