r91638 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r91637‎ | r91638 | r91639 >
Date:12:21, 7 July 2011
Author:ariel
Status:deferred
Tags:
Comment:
Move common functions into a separate file;
use a slightly modified version of BZ2_bzDecompress
from the bz2lib which skips the cumulative crc check;
add makefile, license information, readme for when I forget
what these files do
Modified paths:
  • /branches/ariel/xmldumps-backup/mwbzutils/CHANGES (added) (history)
  • /branches/ariel/xmldumps-backup/mwbzutils/COPYING (added) (history)
  • /branches/ariel/xmldumps-backup/mwbzutils/LICENSE_BZ (added) (history)
  • /branches/ariel/xmldumps-backup/mwbzutils/Makefile (added) (history)
  • /branches/ariel/xmldumps-backup/mwbzutils/README (added) (history)
  • /branches/ariel/xmldumps-backup/mwbzutils/bzlib_private.h (added) (history)
  • /branches/ariel/xmldumps-backup/mwbzutils/bzlibfuncs.c (added) (history)
  • /branches/ariel/xmldumps-backup/mwbzutils/checkforbz2footer.c (modified) (history)
  • /branches/ariel/xmldumps-backup/mwbzutils/dumpbz2filefromoffset.c (modified) (history)
  • /branches/ariel/xmldumps-backup/mwbzutils/dumplastbz2block.c (modified) (history)
  • /branches/ariel/xmldumps-backup/mwbzutils/findpageidinbz2xml.c (modified) (history)
  • /branches/ariel/xmldumps-backup/mwbzutils/findpageidinbz2xml.h (deleted) (history)
  • /branches/ariel/xmldumps-backup/mwbzutils/mwbzlib.c (added) (history)
  • /branches/ariel/xmldumps-backup/mwbzutils/mwbzutils.h (added) (history)

Diff [purge]

Index: branches/ariel/xmldumps-backup/mwbzutils/findpageidinbz2xml.h
@@ -1,81 +0,0 @@
2 -#ifndef _FINDPAGEID_H
3 -#define _FINDPAGEID_H
4 -
5 -typedef struct {
6 - int page_id; /* first id in the block */
7 - int bits_shifted; /* block is right shifted this many bits */
8 - int position; /* position in file of block */
9 -} page_info_t;
10 -
11 -#define BUFINSIZE 5000
12 -
13 -/*
14 - keeps all information about a bzipped file
15 - plus input/output buffers for decompression
16 -*/
17 -typedef struct {
18 - unsigned char bufin[BUFINSIZE]; /* compressed data read from file */
19 - unsigned char *bufout; /* uncompressed data, must be allocated by caller */
20 - unsigned char marker_buffer[7]; /* data to test for bz2 block marker */
21 - unsigned char header_buffer[4]; /* first 4 bytes of file (bzip2 header) */
22 -
23 - int bufin_size; /* size of input buffer for compressed data */
24 - int bufout_size; /* size of output buffer for decompressed data, may vary at each call */
25 -
26 - int initialized; /* whether bz2file has been initialized (header processed, seek to
27 - some bz2 block in the file and input buffer filled) */
28 - int block_start; /* position of bz2 block in file from which we started to read (we
29 - read a sequence of bz2 blocks from a given position, this is
30 - the offset to the first one) */
31 -
32 - bz_stream strm; /* stream structure for libbz2 */
33 - unsigned char overflow; /* since decompressed bytes may not be bit aligned, we keep the last byte
34 - read around so we can grab the lower end bits off the end for
35 - sticking in front of the next pile of compressed bytes we read */
36 -
37 - int bits_shifted; /* number of bits that the compressed data has been right shifted
38 - in the file (if the number is 0, the block marker and subsequent
39 - data is byte-aligned) */
40 - unsigned char **marker; /* bzip2 start of block marker, plus bit-shifted versions of it for
41 - locating the marker in a stream of compressed data */
42 -
43 - int position; /* current offset into file from start of file */
44 -
45 - int bytes_read; /* number of bytes of compressed data read from file (per read) */
46 - int bytes_written; /* number of bytes of decompressed data written into output buffer (per decompress) */
47 - int eof; /* nonzero if eof reached */
48 - int file_size; /* length of file, so we don't search past it for blocks */
49 -} bz_info_t;
50 -
51 -#define MASKLEFT 0
52 -#define MASKRIGHT 1
53 -
54 -/*
55 - this output buffer is used to collect decompressed output.
56 - this is not a circular buffer; when it is full the user is
57 - responsible for emptying it completely or partially and moving
58 - to the beginning any unused bytes.
59 -
60 -*/
61 -typedef struct {
62 - unsigned char *buffer; /* output storage, allocated by the caller */
63 - unsigned char *next_to_read; /* pointer to the next byte in the buffer with data to be read */
64 - unsigned char *next_to_fill; /* pointer to the next byte in the buffer which is empty and can receive data */
65 - int bytes_avail; /* number of bytes available for reading */
66 - unsigned char *end; /* points to byte after end of buffer */
67 -} buf_info_t;
68 -
69 -/*
70 - used for each iteration of narrowing down the location in a bzipped2 file of
71 - a desired pageid, by finding first compressed block after a guessed
72 - position and checking the first pageid (if any) contained in it.
73 -*/
74 -typedef struct {
75 - int left_end; /* left end of interval to search (bytes from start of file) */
76 - int right_end; /* right end of interval to search */
77 - int value_wanted; /* pageid desired */
78 - int last_value; /* pageid we found in last iteration */
79 - int last_position; /* position in file for last iteration */
80 -} iter_info_t;
81 -
82 -#endif
Index: branches/ariel/xmldumps-backup/mwbzutils/dumpbz2filefromoffset.c
@@ -8,519 +8,9 @@
99 #include <errno.h>
1010 #include <sys/types.h>
1111 #include <regex.h>
12 -#include "bzlib.h"
13 -#include "findpageidinbz2xml.h"
 12+#include "mwbzutils.h"
1413
15 -
16 -/* return n ones either at left or right end */
17 -int bit_mask(int numbits, int end) {
18 - if (end == MASKRIGHT) {
19 - return((1<<numbits)-1);
20 - }
21 - else {
22 - return(((1<<numbits)-1) << (8-numbits));
23 - }
24 -}
25 -
26 -void shift_bytes_left(unsigned char *buffer, int buflen, int numbits) {
27 - int i;
28 -
29 - if (numbits == 0) {
30 - return;
31 - }
32 -
33 - for (i=0; i<buflen; i++) {
34 - /* left 1 */
35 - buffer[i] = (unsigned char) ((int) (buffer[i]) << numbits);
36 -
37 - /* grab leftmost from next byte */
38 - if (i < buflen-1) {
39 - buffer[i] = ( unsigned char ) ( (unsigned int) buffer[i] | ( ( ((unsigned int) buffer[i+1]) & bit_mask(numbits,MASKLEFT) ) >> (8-numbits) ) );
40 - }
41 - }
42 -}
43 -
44 -
45 -void shift_bytes_right(unsigned char *buffer, int buflen, int numbits) {
46 - int i;
47 -
48 - for (i=buflen-1; i>=0; i--) {
49 - /* right 1 */
50 - buffer[i] = (unsigned char) ((int) (buffer[i]) >> numbits);
51 -
52 - /* grab rightmost from prev byte */
53 - if (i > 0) {
54 - buffer[i] = ( unsigned char ) ((unsigned int) buffer[i] | ( ((unsigned int) (buffer[i-1])<<(8-numbits)) & bit_mask(numbits,MASKLEFT)));
55 - }
56 - }
57 -}
58 -
59 -unsigned char ** init_marker() {
60 - unsigned char **marker = malloc(8*sizeof(unsigned char *));
61 - int i;
62 -
63 - /* set up block marker plus its various right-shifted incarnations */
64 - for (i = 0; i< 8; i++) {
65 - marker[i] = malloc(sizeof(unsigned char)*7);
66 - }
67 - marker[0][0]= (unsigned char) 0x31;
68 - marker[0][1]= (unsigned char) 0x41;
69 - marker[0][2]= (unsigned char) 0x59;
70 - marker[0][3]= (unsigned char) 0x26;
71 - marker[0][4]= (unsigned char) 0x53;
72 - marker[0][5]= (unsigned char) 0x59;
73 - marker[0][6]= (unsigned char) 0x00;
74 - for (i = 1; i< 8; i++) {
75 - memcpy((char *)(marker[i]), (char *)(marker[i-1]),7);
76 - shift_bytes_right(marker[i],7,1);
77 - }
78 - return(marker);
79 -}
80 -
81 -/* buff1 is some random bytes, buff2 is some random bytes which we expect to start with the contents of buff1,
82 - both buffers are bit-shifted to the right "bitsrightshifted". this function compares the two and returns 0 if buff2
83 - matches and 1 otherwise. */
84 -int bytes_compare(unsigned char *buff1, unsigned char *buff2, int numbytes, int bitsrightshifted) {
85 - int i;
86 -
87 - if (bitsrightshifted == 0) {
88 - for (i = 0; i< numbytes; i++) {
89 - if (buff1[i] != buff2[i]) {
90 - return(1);
91 - }
92 - }
93 - return(0);
94 - }
95 - else {
96 - for (i = 1; i< numbytes-2; i++) {
97 - if (buff1[i] != buff2[i]) {
98 - return(1);
99 - }
100 - }
101 - /* do leftmost byte */
102 - if ((buff1[0] & bit_mask(8-bitsrightshifted,MASKRIGHT)) != (buff2[0] & bit_mask(8-bitsrightshifted,MASKRIGHT)) ) {
103 - return(1);
104 - }
105 - /* do rightmost byte */
106 - if ((buff1[numbytes-1] & bit_mask(bitsrightshifted,MASKLEFT)) != (buff2[numbytes-1] & bit_mask(bitsrightshifted,MASKLEFT)) ) {
107 - return(1);
108 - }
109 - return(0);
110 - }
111 -}
112 -
113 -/* return -1 if no match
114 - return number of bits rightshifted otherwise */
115 -int check_buffer_for_bz2_block_marker(bz_info_t *bfile) {
116 - int result, i;
117 -
118 - result = bytes_compare(bfile->marker[0],bfile->marker_buffer+1,6,0);
119 - if (!result) {
120 - return(0);
121 - }
122 - for (i=1; i<8; i++) {
123 - result = bytes_compare(bfile->marker[i],bfile->marker_buffer,7,i);
124 - if (!result) {
125 - return(i);
126 - }
127 - }
128 - return(-1);
129 -}
130 -
131 -/* return: 1 if found, 0 if not, -1 on error */
132 -int find_next_bz2_block_marker(int fin, bz_info_t *bfile) {
133 - int result;
134 -
135 - bfile->bits_shifted = -1;
136 - result = read(fin, bfile->marker_buffer, 7);
137 - if (result == -1) {
138 - fprintf(stderr,"read of file failed\n");
139 - exit(-1);
140 - }
141 - /* must be after 4 byte file header, and we add a leftmost byte to the buffer
142 - of data read in case some bits have been shifted into it */
143 - while (bfile->position <= bfile->file_size - 6 && bfile->bits_shifted < 0) {
144 - bfile->bits_shifted = check_buffer_for_bz2_block_marker(bfile);
145 - if (bfile->bits_shifted < 0) {
146 - bfile->position++;
147 - result = lseek(fin, (bfile->position), SEEK_SET);
148 - if (result == -1) {
149 - fprintf(stderr,"lseek of file to %ld failed (2)\n",(long int) bfile->position);
150 - exit(-1);
151 - }
152 - result = read(fin, bfile->marker_buffer, 7);
153 - if (result < 7) {
154 - /* fprintf(stderr,"read of file failed\n"); */
155 - exit(-1);
156 - }
157 - }
158 - else {
159 - bfile->block_start = bfile->position;
160 - return(1);
161 - }
162 - }
163 - return(0);
164 -}
165 -
166 -/*
167 - initializes the bz2 strm structure,
168 - calls the BZ2 decompression library initializer
169 -
170 - returns:
171 - BZ_OK on success
172 - various BZ_ errors on failure (see bzlib.h)
173 -*/
174 -int init_decompress(bz_info_t *bfile) {
175 - int bz_verbosity = 0;
176 - int bz_small = 0;
177 - int ret;
178 -
179 - bfile->strm.bzalloc = NULL;
180 - bfile->strm.bzfree = NULL;
181 - bfile->strm.opaque = NULL;
182 -
183 - ret = BZ2_bzDecompressInit ( &(bfile->strm), bz_verbosity, bz_small );
184 - if (ret != BZ_OK) {
185 - fprintf(stderr,"uncompress failed, err %d\n", ret);
186 - exit(-1);
187 - }
188 - return(ret);
189 -}
190 -
191 -/*
192 - reads the first 4 bytes from a bz2 file (should be
193 - "BZh" followed by the block size indicator, typically "9")
194 - and passes them into the BZ2 decompression library.
195 - This must be done before decompression of any block of the
196 - file is attempted.
197 -
198 - returns:
199 - BZ_OK if successful,
200 - various BZ_ errors on failure (see bzlib.h)
201 -*/
202 -int decompress_header(int fin, bz_info_t *bfile) {
203 - int ret, res;
204 -
205 - res = lseek(fin,0,SEEK_SET);
206 - if (res == -1) {
207 - fprintf(stderr,"lseek of file to 0 failed (3)\n");
208 - }
209 - bfile->bytes_read = read(fin, bfile->header_buffer, 4);
210 - if (bfile->bytes_read < 4) {
211 - fprintf(stderr,"failed to read 4 bytes of header, exiting\n");
212 - exit(-1);
213 - }
214 - bfile->strm.next_in = (char *)bfile->header_buffer;
215 - bfile->strm.avail_in = 4;
216 -
217 - ret = BZ2_bzDecompress ( &(bfile->strm) );
218 - if (BZ_OK != ret && BZ_STREAM_END != ret) {
219 - fprintf(stderr,"Corrupt bzip2 header, exiting\n");
220 - exit(-1);
221 - }
222 - return(ret);
223 -}
224 -
225 -/*
226 - seek to appropriate offset as specified in bfile,
227 - read compressed data into buffer indicated by bfile,
228 - update the bfile structure accordingly,
229 - save the overflow byte (bit-shifted data = suck)
230 - this is for the *first* buffer of data in a stream,
231 - for subsequent buffers use fill_buffer_to_decompress()
232 -
233 - this will set bfile->eof on eof. no other indicator
234 - will be provided.
235 -
236 - returns:
237 - 0 on success
238 - -1 on error
239 -*/
240 -int setup_first_buffer_to_decompress(int fin, bz_info_t *bfile) {
241 - int res;
242 -
243 - if (bfile->bits_shifted == 0) {
244 - res = lseek(fin,bfile->position+1,SEEK_SET);
245 - if (res == -1) {
246 - fprintf(stderr,"lseek of file to %ld failed (4)\n",(long int) bfile->position+1);
247 - return(-1);
248 - }
249 - }
250 - else {
251 - res = lseek(fin,bfile->position,SEEK_SET);
252 - if (res == -1) {
253 - fprintf(stderr,"lseek of file to %ld failed (5)\n",(long int) bfile->position);
254 - return(-1);
255 - }
256 - }
257 - bfile->bytes_read = read(fin, bfile->bufin, bfile->bufin_size);
258 - if (bfile->bytes_read > 0) {
259 - bfile->overflow = bfile->bufin[bfile->bytes_read-1];
260 - shift_bytes_left(bfile->bufin, bfile->bytes_read, bfile->bits_shifted);
261 -
262 - bfile->strm.next_in = (char *)(bfile->bufin);
263 - bfile->strm.avail_in = bfile->bytes_read-1;
264 - }
265 - if (bfile->bytes_read <=0) {
266 - bfile->eof++;
267 - }
268 - return(0);
269 -}
270 -
271 -/*
272 - read compressed data into buffer indicated by bfile,
273 - from current position of file,
274 - stuffing the overflow byte in first.
275 - update the bfile structure accordingly
276 - save the new overflow byte (bit-shifted data = suck)
277 - this function is for decompression of buffers *after
278 - the first one*. for the first one use
279 - setup_first_buffer_to_decompress()
280 -
281 - this will set bfile->eof on eof. no other indicator
282 - will be provided.
283 -
284 - returns:
285 - 0 on success
286 - hmm, it really does not do anything about errors :-D
287 -*/
288 -int fill_buffer_to_decompress(int fin, bz_info_t *bfile, int ret) {
289 - if (bfile->strm.avail_in == 0) {
290 - bfile->strm.next_in = (char *)(bfile->bufin);
291 - bfile->bufin[0] = bfile->overflow;
292 - bfile->bytes_read = read(fin, bfile->bufin+1, bfile->bufin_size-1);
293 - if (bfile->bytes_read > 0) {
294 - bfile->position+=bfile->bytes_read;
295 - bfile->overflow = bfile->bufin[bfile->bytes_read];
296 - shift_bytes_left(bfile->bufin,bfile->bytes_read+1,bfile->bits_shifted);
297 - bfile->strm.avail_in = bfile->bytes_read;
298 - }
299 - else {
300 - bfile->strm.avail_in = 1; /* the overflow byte */
301 - bfile->eof++;
302 - }
303 - }
304 - return(0);
305 -}
306 -
307 -/* size of buffer is bytes usable. there will be a null byte at the end
308 -
309 - what we do with the buffer:
310 - - read from front of buffer to end,
311 - - fill from point where prev read did not fill buffer, or from where
312 - move of data at end of buffer to beginning left room,
313 - - mark a string of bytes (starting from what's available to read) as "read"
314 -
315 -*/
316 -buf_info_t *init_buffer(int size) {
317 - buf_info_t *b;
318 -
319 - b = (buf_info_t *)malloc(sizeof(buf_info_t));
320 - b->buffer = malloc(sizeof(unsigned char)*(size+1));
321 - b->buffer[size]='\0';
322 - b->end = b->buffer + size;
323 - b->next_to_read = b->end; /* nothing available */
324 - b->bytes_avail = 0; /* bytes to read, nothing available */
325 - b->next_to_fill = b->buffer; /* empty */
326 - b->next_to_fill[0] = '\0';
327 - return(b);
328 -}
329 -
330 -/* check if buffer (used for decompressed data output) is empty,
331 - returns 1 if so and 0 if not */
332 -int buffer_is_empty(buf_info_t *b) {
333 - if (b->bytes_avail == 0) {
334 - return(1);
335 - }
336 - else {
337 - return(0);
338 - }
339 -}
340 -
341 -/* check if buffer (used for decompressed data output) is full,
342 -
343 - returns 1 if so and 0 if not
344 - I'm not liking this function so well, fixme */
345 -int buffer_is_full(buf_info_t *b) {
346 - if (b->next_to_fill == b->end) {
347 - return(1);
348 - }
349 - else {
350 - return(0);
351 - }
352 -}
353 -
354 -/* FIXME do this right. whatever. */
355 -int get_file_size(int fin) {
356 - int res;
357 -
358 - res = lseek(fin, 0, SEEK_END);
359 - if (res == -1) {
360 - fprintf(stderr,"lseek of file to 0 failed (6)\n");
361 - exit(-1);
362 - }
363 - return(res);
364 -}
365 -
366 -
36714 /*
368 - set up the marker, seek to right place, get first
369 - buffer of compressed data for processing
370 - bfile->position must be set to desired offset first by caller.
371 - returns:
372 - -1 if no marker or other error, position of next read if ok
373 -*/
374 -int init_bz2_file(bz_info_t *bfile, int fin) {
375 - int res;
376 -
377 - bfile->bufin_size = BUFINSIZE;
378 - bfile->marker = init_marker();
379 - bfile->bytes_read = 0;
380 - bfile->bytes_written = 0;
381 - bfile->eof = 0;
382 -
383 - bfile->initialized++;
384 -
385 - bfile->file_size = get_file_size(fin);
386 - if (bfile->position > bfile->file_size) {
387 - fprintf(stderr,"asked for position past end of file\n");
388 - exit(-1);
389 - }
390 - res = lseek(fin, bfile->position, SEEK_SET);
391 - if (res == -1) {
392 - fprintf(stderr,"lseek of file to %ld failed (7)\n",(long int) bfile->position);
393 - exit(-1);
394 - }
395 -
396 - find_next_bz2_block_marker(fin, bfile);
397 - if (bfile->bits_shifted >= 0) {
398 - /* fprintf(stderr,"marker bits shifted by is %d\n",bfile->bits_shifted); */
399 - init_decompress(bfile);
400 - decompress_header(fin, bfile);
401 - setup_first_buffer_to_decompress(fin, bfile);
402 - return(0);
403 - }
404 - return(-1);
405 -}
406 -
407 -/* get the next buffer of uncompressed stuff */
408 -int decompress_data(bz_info_t *bfile, int fin, unsigned char *bufferout, int bufout_size) {
409 - int ret;
410 -
411 - bfile->bufout = bufferout;
412 - bfile->bufout_size = bufout_size;
413 - bfile->bytes_written = 0;
414 -
415 - if (! bfile->initialized) {
416 - if (init_bz2_file(bfile, fin) == -1) {
417 - fprintf(stderr,"failed to initialize bz2file\n");
418 - return(-1);
419 - };
420 - bfile->strm.next_out = (char *)bfile->bufout;
421 - bfile->strm.avail_out = bfile->bufout_size;
422 - }
423 -
424 - ret = BZ_OK;
425 - while (BZ_OK == ret && bfile->bytes_written == 0) {
426 - ret = BZ2_bzDecompress ( &(bfile->strm) );
427 - if (BZ_OK == ret || BZ_STREAM_END == ret) {
428 - bfile->bytes_written = (unsigned char *)(bfile->strm.next_out) - bfile->bufout;
429 - }
430 - else {
431 - fprintf(stderr,"error from BZ decompress %d\n",ret);
432 - return(-1);
433 - }
434 - fill_buffer_to_decompress(fin, bfile, ret);
435 - /*
436 - if (bfile->eof && (BZ_OK == ret || BZ_STREAM_END == ret) ) {
437 - fprintf(stderr,"eof reached\n");
438 - }
439 - */
440 - }
441 - return(0);
442 -}
443 -
444 -/*
445 - fill output buffer in b with uncompressed data from bfile
446 - if this is the first call to the function for this file,
447 - the file header will be read, and the first buffer of
448 - uncompressed data will be prepared. bfile->position
449 - should be set to the offset (from the beginning of file) from
450 - which to find the first bz2 block.
451 -
452 - returns:
453 - on success, number of bytes read (may be 0)
454 - -1 on error
455 -*/
456 -int get_buffer_of_uncompressed_data(buf_info_t *b, int fin, bz_info_t *bfile) {
457 - int res;
458 -
459 - if (buffer_is_full(b)) {
460 - fprintf(stdout,"DEBUG buffer full\n");
461 - return(0);
462 - }
463 -
464 - if (buffer_is_empty(b)) {
465 - b->next_to_fill = b->buffer;
466 - }
467 -
468 - res = decompress_data(bfile, fin, b->next_to_fill, b->end - b->next_to_fill);
469 - if (res <0 ) {
470 - return(res);
471 - }
472 - if (bfile->bytes_written < 0) {
473 - fprintf(stderr,"read of file failed\n");
474 - return(-1);
475 - }
476 - else {
477 - /* really?? FIXME check this */
478 - if (buffer_is_empty(b)) {
479 - b->next_to_read = b->next_to_fill; /* where we just read */
480 - }
481 - b->bytes_avail += bfile->bytes_written;
482 - b->next_to_fill += bfile->bytes_written;
483 - b->next_to_fill[0] = '\0';
484 - return(0);
485 - }
486 -}
487 -
488 -void dumpbuf_info_t(buf_info_t *b) {
489 - fprintf(stdout, "\n");
490 - fprintf(stdout, "b->buffer: %ld\n", (long int) b->buffer);
491 - fprintf(stdout, "b->end: %ld\n", (long int) b->end);
492 - fprintf(stdout, "b->next_to_read: %ld\n", (long int) b->next_to_read);
493 - fprintf(stdout, "b->next_to_fill: %ld\n", (long int) b->next_to_fill);
494 - fprintf(stdout, "b->bytes_avail: %ld\n", (long int) b->bytes_avail);
495 -}
496 -
497 -/*
498 - copy text from end of buffer to the beginning, that we want to keep
499 - around for further processing (i.e. further regex matches)
500 - returns number of bytes copied
501 -*/
502 -int move_bytes_to_buffer_start(buf_info_t *b, unsigned char *fromwhere, int maxbytes) {
503 - int i, tocopy;
504 -
505 - if (fromwhere >= b->end) {
506 - return(0);
507 - }
508 - else {
509 - tocopy = b->end - fromwhere;
510 - if (maxbytes && (tocopy > maxbytes)) {
511 - tocopy = maxbytes;
512 - }
513 - for (i = 0; i < tocopy; i++) {
514 - b->buffer[i] = fromwhere[i];
515 - }
516 - b->next_to_fill = b->buffer + tocopy;
517 - b->next_to_fill[0] = '\0';
518 - b->next_to_read = b->buffer;
519 - b->bytes_avail = tocopy;
520 - return(tocopy);
521 - }
522 -}
523 -
524 -/*
52515 dump the <mediawiki> header (up through
52616 </siteinfo> close tag) found at the
52717 beginning of xml dump files.
@@ -550,7 +40,7 @@
55141 bfile.bytes_read = 0;
55242 bfile.position = 0;
55343
554 - while ((get_buffer_of_uncompressed_data(b, fin, &bfile)>=0) && (! bfile.eof) && (!done)) {
 44+ while ((get_buffer_of_uncompressed_data(b, fin, &bfile, FORWARD)>=0) && (! bfile.eof) && (!done)) {
55545 /* fixme either we don't check the return code right or we don't notice no bytes read or we don't clear the bytes read */
55646 if (bfile.bytes_read) {
55747 if (firstpage) {
@@ -656,7 +146,7 @@
657147 bfile.bytes_read = 0;
658148 bfile.position = position;
659149
660 - while ((get_buffer_of_uncompressed_data(b, fin, &bfile)>=0) && (! bfile.eof)) {
 150+ while ((get_buffer_of_uncompressed_data(b, fin, &bfile, FORWARD)>=0) && (! bfile.eof)) {
661151 /* fixme either we don't check the return code right or we don't notice no bytes read or we don't clear the bytes read */
662152 if (bfile.bytes_read) {
663153 if (firstpage) {
Index: branches/ariel/xmldumps-backup/mwbzutils/findpageidinbz2xml.c
@@ -8,436 +8,9 @@
99 #include <errno.h>
1010 #include <sys/types.h>
1111 #include <regex.h>
12 -#include "bzlib.h"
13 -#include "findpageidinbz2xml.h"
 12+#include "mwbzutils.h"
1413
15 -/* return n ones either at left or right end */
16 -int bitmask(int numbits, int end) {
17 - if (end == MASKRIGHT) {
18 - return((1<<numbits)-1);
19 - }
20 - else {
21 - return(((1<<numbits)-1) << (8-numbits));
22 - }
23 -}
2414
25 -void shift_bytes_left(unsigned char *buffer, int buflen, int numbits) {
26 - int i;
27 -
28 - if (numbits == 0) {
29 - return;
30 - }
31 -
32 - for (i=0; i<buflen; i++) {
33 - /* left 1 */
34 - buffer[i] = (unsigned char) ((int) (buffer[i]) << numbits);
35 -
36 - /* grab leftmost from next byte */
37 - if (i < buflen-1) {
38 - buffer[i] = ( unsigned char ) ( (unsigned int) buffer[i] | ( ( ((unsigned int) buffer[i+1]) & bitmask(numbits,MASKLEFT) ) >> (8-numbits) ) );
39 - }
40 - }
41 -}
42 -
43 -void shift_bytes_right(unsigned char *buffer, int buflen, int numbits) {
44 - int i;
45 -
46 - for (i=buflen-1; i>=0; i--) {
47 - /* right 1 */
48 - buffer[i] = (unsigned char) ((int) (buffer[i]) >> numbits);
49 -
50 - /* grab rightmost from prev byte */
51 - if (i > 0) {
52 - buffer[i] = ( unsigned char ) ((unsigned int) buffer[i] | ( ((unsigned int) (buffer[i-1])<<(8-numbits)) & bitmask(numbits,MASKLEFT)));
53 - }
54 - }
55 -}
56 -
57 -unsigned char ** init_marker() {
58 - unsigned char **marker = malloc(8*sizeof(unsigned char *));
59 - int i;
60 -
61 - /* set up block marker plus its various right-shifted incarnations */
62 - for (i = 0; i< 8; i++) {
63 - marker[i] = malloc(sizeof(unsigned char)*7);
64 - }
65 - marker[0][0]= (unsigned char) 0x31;
66 - marker[0][1]= (unsigned char) 0x41;
67 - marker[0][2]= (unsigned char) 0x59;
68 - marker[0][3]= (unsigned char) 0x26;
69 - marker[0][4]= (unsigned char) 0x53;
70 - marker[0][5]= (unsigned char) 0x59;
71 - marker[0][6]= (unsigned char) 0x00;
72 - for (i = 1; i< 8; i++) {
73 - memcpy((char *)(marker[i]), (char *)(marker[i-1]),7);
74 - shift_bytes_right(marker[i],7,1);
75 - }
76 - return(marker);
77 -}
78 -
79 -/* buff1 is some random bytes, buff2 is some random bytes which we expect to start with the contents of buff1,
80 - both buffers are bit-shifted to the right "bitsrightshifted". this function compares the two and returns 0 if buff2
81 - matches and 1 otherwise. */
82 -int bytes_compare(unsigned char *buff1, unsigned char *buff2, int numbytes, int bitsrightshifted) {
83 - int i;
84 -
85 - if (bitsrightshifted == 0) {
86 - for (i = 0; i< numbytes; i++) {
87 - if (buff1[i] != buff2[i]) {
88 - return(1);
89 - }
90 - }
91 - return(0);
92 - }
93 - else {
94 - for (i = 1; i< numbytes-2; i++) {
95 - if (buff1[i] != buff2[i]) {
96 - return(1);
97 - }
98 - }
99 - /* do leftmost byte */
100 - if ((buff1[0] & bitmask(8-bitsrightshifted,MASKRIGHT)) != (buff2[0] & bitmask(8-bitsrightshifted,MASKRIGHT)) ) {
101 - return(1);
102 - }
103 - /* do rightmost byte */
104 - if ((buff1[numbytes-1] & bitmask(bitsrightshifted,MASKLEFT)) != (buff2[numbytes-1] & bitmask(bitsrightshifted,MASKLEFT)) ) {
105 - return(1);
106 - }
107 - return(0);
108 - }
109 -}
110 -
111 -
112 -/* return -1 if no match
113 - return number of bits rightshifted otherwise */
114 -int check_buffer_for_bz2_block_marker(bz_info_t *bfile) {
115 - int result, i;
116 -
117 - result = bytes_compare(bfile->marker[0],bfile->marker_buffer+1,6,0);
118 - if (!result) {
119 - return(0);
120 - }
121 - for (i=1; i<8; i++) {
122 - result = bytes_compare(bfile->marker[i],bfile->marker_buffer,7,i);
123 - if (!result) {
124 - return(i);
125 - }
126 - }
127 - return(-1);
128 -}
129 -
130 -
131 -/* return: 1 if found, 0 if not, -1 on error */
132 -int find_next_bz2_block_marker(int fin, bz_info_t *bfile) {
133 - int result;
134 -
135 - bfile->bits_shifted = -1;
136 - result = read(fin, bfile->marker_buffer, 7);
137 - if (result == -1) {
138 - /* fprintf(stderr,"read of file failed\n"); */
139 - return(-1);
140 - }
141 - /* must be after 4 byte file header, and we add a leftmost byte to the buffer
142 - of data read in case some bits have been shifted into it */
143 - while (bfile->position <= bfile->file_size - 6 && bfile->bits_shifted < 0) {
144 - bfile->bits_shifted = check_buffer_for_bz2_block_marker(bfile);
145 - if (bfile->bits_shifted < 0) {
146 - bfile->position++;
147 - result = lseek(fin, (bfile->position), SEEK_SET);
148 - if (result == -1) {
149 - fprintf(stderr,"lseek of file to %ld failed (2)\n",(long int) bfile->position);
150 - return(-1);
151 - }
152 - result = read(fin, bfile->marker_buffer, 7);
153 - if (result < 7) {
154 - /* fprintf(stderr,"read of file failed\n"); */
155 - return(-1);
156 - }
157 - }
158 - else {
159 - bfile->block_start = bfile->position;
160 - return(1);
161 - }
162 - }
163 - return(0);
164 -}
165 -
166 -/*
167 - initializes the bz2 strm structure,
168 - calls the BZ2 decompression library initializer
169 -
170 - returns:
171 - BZ_OK on success
172 - various BZ_ errors on failure (see bzlib.h)
173 -*/
174 -int init_decompress(bz_info_t *bfile) {
175 - int bz_verbosity = 0;
176 - int bz_small = 0;
177 - int ret;
178 -
179 - bfile->strm.bzalloc = NULL;
180 - bfile->strm.bzfree = NULL;
181 - bfile->strm.opaque = NULL;
182 -
183 - ret = BZ2_bzDecompressInit ( &(bfile->strm), bz_verbosity, bz_small );
184 - if (ret != BZ_OK) {
185 - fprintf(stderr,"uncompress failed, err %d\n", ret);
186 - exit(-1);
187 - }
188 - return(ret);
189 -}
190 -
191 -/*
192 - reads the first 4 bytes from a bz2 file (should be
193 - "BZh" followed by the block size indicator, typically "9")
194 - and passes them into the BZ2 decompression library.
195 - This must be done before decompression of any block of the
196 - file is attempted.
197 -
198 - returns:
199 - BZ_OK if successful,
200 - various BZ_ errors on failure (see bzlib.h)
201 -*/
202 -int decompress_header(int fin, bz_info_t *bfile) {
203 - int ret, res;
204 -
205 - res = lseek(fin,0,SEEK_SET);
206 - if (res == -1) {
207 - fprintf(stderr,"lseek of file to 0 failed (3)\n");
208 - exit(-1);
209 - }
210 - bfile->bytes_read = read(fin, bfile->header_buffer, 4);
211 - if (bfile->bytes_read < 4) {
212 - fprintf(stderr,"failed to read 4 bytes of header, exiting\n");
213 - exit(-1);
214 - }
215 - bfile->strm.next_in = (char *)bfile->header_buffer;
216 - bfile->strm.avail_in = 4;
217 -
218 - ret = BZ2_bzDecompress ( &(bfile->strm) );
219 - if (BZ_OK != ret && BZ_STREAM_END != ret) {
220 - fprintf(stderr,"Corrupt bzip2 header, exiting\n");
221 - exit(-1);
222 - }
223 - return(ret);
224 -}
225 -
226 -/*
227 - seek to appropriate offset as specified in bfile,
228 - read compressed data into buffer indicated by bfile,
229 - update the bfile structure accordingly,
230 - save the overflow byte (bit-shifted data = suck)
231 - this is for the *first* buffer of data in a stream,
232 - for subsequent buffers use fill_buffer_to_decompress()
233 -
234 - this will set bfile->eof on eof. no other indicator
235 - will be provided.
236 -
237 - returns:
238 - 0 on success
239 - -1 on error
240 -*/
241 -int setup_first_buffer_to_decompress(int fin, bz_info_t *bfile) {
242 - int res;
243 -
244 - if (bfile->bits_shifted == 0) {
245 - res = lseek(fin,bfile->position+1,SEEK_SET);
246 - if (res == -1) {
247 - fprintf(stderr,"lseek of file to %ld failed (4)\n",(long int) bfile->position+1);
248 - return(-1);
249 - }
250 - }
251 - else {
252 - res = lseek(fin,bfile->position,SEEK_SET);
253 - if (res == -1) {
254 - fprintf(stderr,"lseek of file to %ld failed (5)\n",(long int) bfile->position);
255 - return(-1);
256 - }
257 - }
258 - bfile->bytes_read = read(fin, bfile->bufin, bfile->bufin_size);
259 - if (bfile->bytes_read > 0) {
260 - bfile->overflow = bfile->bufin[bfile->bytes_read-1];
261 - shift_bytes_left(bfile->bufin, bfile->bytes_read, bfile->bits_shifted);
262 -
263 - bfile->strm.next_in = (char *)(bfile->bufin);
264 - bfile->strm.avail_in = bfile->bytes_read-1;
265 - }
266 - if (bfile->bytes_read <=0) {
267 - bfile->eof++;
268 - }
269 - return(0);
270 -}
271 -
272 -/*
273 - read compressed data into buffer indicated by bfile,
274 - from current position of file,
275 - stuffing the overflow byte in first.
276 - update the bfile structure accordingly
277 - save the new overflow byte (bit-shifted data = suck)
278 - this function is for decompression of buffers *after
279 - the first one*. for the first one use
280 - setup_first_buffer_to_decompress()
281 -
282 - this will set bfile->eof on eof. no other indicator
283 - will be provided.
284 -
285 - returns:
286 - 0 on success
287 - hmm, it really does not do anything about errors :-D
288 -*/
289 -int fill_buffer_to_decompress(int fin, bz_info_t *bfile, int ret) {
290 - if (bfile->strm.avail_in == 0) {
291 - bfile->strm.next_in = (char *)(bfile->bufin);
292 - bfile->bufin[0] = bfile->overflow;
293 - bfile->bytes_read = read(fin, bfile->bufin+1, bfile->bufin_size-1);
294 - if (bfile->bytes_read > 0) {
295 - bfile->overflow = bfile->bufin[bfile->bytes_read];
296 - shift_bytes_left(bfile->bufin,bfile->bytes_read+1,bfile->bits_shifted);
297 - bfile->strm.avail_in = bfile->bytes_read;
298 - bfile->position+=bfile->bytes_read;
299 - }
300 - else {
301 - bfile->strm.avail_in = 1; /* the overflow byte */
302 - bfile->eof++;
303 - }
304 - }
305 - return(0);
306 -}
307 -
308 -/* size of buffer is bytes usable. there will be a null byte at the end
309 -
310 - what we do with the buffer:
311 - - read from front of buffer to end,
312 - - fill from point where prev read did not fill buffer, or from where
313 - move of data at end of buffer to beginning left room,
314 - - mark a string of bytes (starting from what's available to read) as "read"
315 -
316 -*/
317 -buf_info_t *init_buffer(int size) {
318 - buf_info_t *b;
319 -
320 - b = (buf_info_t *)malloc(sizeof(buf_info_t));
321 - b->buffer = malloc(sizeof(unsigned char)*(size+1));
322 - b->buffer[size]='\0';
323 - b->end = b->buffer + size;
324 - b->next_to_read = b->end; /* nothing available */
325 - b->bytes_avail = 0; /* bytes to read, nothing available */
326 - b->next_to_fill = b->buffer; /* empty */
327 - b->next_to_fill[0] = '\0';
328 - return(b);
329 -}
330 -
331 -/* check if buffer (used for decompressed data output) is empty,
332 - returns 1 if so and 0 if not */
333 -int buffer_is_empty(buf_info_t *b) {
334 - if (b->bytes_avail == 0) {
335 - return(1);
336 - }
337 - else {
338 - return(0);
339 - }
340 -}
341 -
342 -/* check if buffer (used for decompressed data output) is full,
343 -
344 - returns 1 if so and 0 if not
345 - I'm not liking this function so well, fixme */
346 -int buffer_is_full(buf_info_t *b) {
347 - if (b->next_to_fill == b->end) {
348 - return(1);
349 - }
350 - else {
351 - return(0);
352 - }
353 -}
354 -
355 -/* FIXME do this right. whatever. */
356 -int get_file_size(int fin) {
357 - int res;
358 -
359 - res = lseek(fin, 0, SEEK_END);
360 - if (res == -1) {
361 - fprintf(stderr,"lseek of file to 0 failed (6)\n");
362 - exit(-1);
363 - }
364 - return(res);
365 -}
366 -
367 -
368 -/*
369 - look for the first bz2 block in the file after specified offset
370 - it tests that the block is valid by doing partial decompression.
371 - this function will update the bfile structure:
372 - bfile->position will contain the current position of the file (? will it?)
373 - bfile->bits_shifted will contain the number of bits that the block is rightshifted
374 - bfile->block_start will contain the offset from start of file to the block
375 - returns:
376 - 0 on success (the file is left positioned at bfile->block_start)
377 - -1 if no marker or other error
378 -*/
379 -int find_first_bz2_block_after_offset(bz_info_t *bfile, int fin, int position) {
380 - int res;
381 -
382 - bfile->bufin_size = BUFINSIZE;
383 - bfile->marker = init_marker();
384 - bfile->position = position;
385 - bfile->block_start = -1;
386 - bfile->bytes_read = 0;
387 - bfile->bytes_written = 0;
388 - bfile->eof = 0;
389 - bfile->bits_shifted = -1;
390 -
391 - bfile->file_size = get_file_size(fin);
392 -
393 - while (bfile->bits_shifted < 0) {
394 - if (bfile->position > bfile->file_size) {
395 - return(-1);
396 - }
397 - res = lseek(fin, bfile->position, SEEK_SET);
398 - if (res == -1) {
399 - fprintf(stderr,"lseek of file to %ld failed (7)\n",(long int) bfile->position);
400 - exit(-1);
401 - }
402 - res = find_next_bz2_block_marker(fin, bfile);
403 - if (res == 1) {
404 - init_decompress(bfile);
405 - decompress_header(fin, bfile);
406 - res = setup_first_buffer_to_decompress(fin, bfile);
407 - if (res == -1) {
408 - fprintf(stderr,"couldn't get first buffer of data to uncompress\n");
409 - exit(-1);
410 - }
411 - bfile->strm.next_out = (char *)bfile->bufout;
412 - bfile->strm.avail_out = bfile->bufout_size;
413 - res = BZ2_bzDecompress ( &(bfile->strm) );
414 - /* this means we (probably) have a genuine marker */
415 - if (BZ_OK == res || BZ_STREAM_END == res) {
416 - res = BZ2_bzDecompressEnd ( &(bfile->strm) );
417 - bfile->bytes_read = 0;
418 - bfile->bytes_written = 0;
419 - bfile->eof = 0;
420 - /* leave the file at the right position */
421 - res = lseek(fin, bfile->block_start, SEEK_SET);
422 - if (res == -1) {
423 - fprintf(stderr,"lseek of file to %ld failed (7)\n",(long int) bfile->position);
424 - exit(-1);
425 - }
426 - return(0);
427 - }
428 - /* right bytes, but there by chance, skip and try again */
429 - else {
430 - bfile->position+=6;
431 - bfile->bits_shifted = -1;
432 - bfile->block_start = -1;
433 - }
434 - }
435 - else {
436 - return(-1);
437 - }
438 - }
439 - return(-1);
440 -}
441 -
44215 /*
44316 find the first bz2 block marker in the file,
44417 from its current position,
@@ -446,12 +19,12 @@
44720 0 on success
44821 -1 if no marker or other error
44922 */
450 -int init_bz2_file(bz_info_t *bfile, int fin) {
 23+int init_and_read_first_buffer_bz2_file(bz_info_t *bfile, int fin) {
45124 int res;
45225
45326 bfile->initialized++;
45427
455 - res = find_next_bz2_block_marker(fin, bfile);
 28+ res = find_next_bz2_block_marker(fin, bfile, FORWARD);
45629 if (res ==1) {
45730 init_decompress(bfile);
45831 decompress_header(fin, bfile);
@@ -461,125 +34,7 @@
46235 return(-1);
46336 }
46437
465 -/* return -1 if error */
466 -int decompress_data(bz_info_t *bfile, int fin, unsigned char *bufferout, int bufout_size) {
467 - int ret;
468 -
469 - bfile->bufout = bufferout;
470 - bfile->bufout_size = bufout_size;
471 - bfile->bytes_written = 0;
472 -
473 - if (! bfile->initialized) {
474 - if (init_bz2_file(bfile, fin) == -1) {
475 - /* fprintf(stderr,"failed to find block in bz2file (2)\n"); */
476 - return(-1);
477 - };
478 - bfile->strm.next_out = (char *)bfile->bufout;
479 - bfile->strm.avail_out = bfile->bufout_size;
480 - }
481 -
482 - ret = BZ_OK;
483 - while (BZ_OK == ret && bfile->bytes_written == 0) {
484 - ret = BZ2_bzDecompress ( &(bfile->strm) );
485 - if (BZ_OK == ret || BZ_STREAM_END == ret) {
486 - bfile->bytes_written = (unsigned char *)(bfile->strm.next_out) - bfile->bufout;
487 - }
488 - else {
489 - /* fprintf(stderr,"error from BZ decompress %d\n",ret); */
490 - return(-1);
491 - }
492 - fill_buffer_to_decompress(fin, bfile, ret);
493 - /*
494 - if (bfile->eof && (BZ_OK == ret || BZ_STREAM_END == ret) ) {
495 - fprintf(stderr,"eof reached\n");
496 - }
497 - */
498 - }
499 - return(0);
500 -}
501 -
502 -
50338 /*
504 - fill output buffer in b with uncompressed data from bfile
505 - if this is the first call to the function for this file,
506 - the file header will be read, and the first buffer of
507 - uncompressed data will be prepared. bfile->position
508 - should be set to the offset (from the beginning of file) from
509 - which to find the first bz2 block.
510 -
511 - returns:
512 - on success, number of bytes read (may be 0)
513 - -1 on error
514 -*/
515 -int get_buffer_of_uncompressed_data(buf_info_t *b, int fin, bz_info_t *bfile) {
516 - int res;
517 -
518 - if (buffer_is_full(b)) {
519 - return(0);
520 - }
521 -
522 - if (buffer_is_empty(b)) {
523 - b->next_to_fill = b->buffer;
524 - }
525 -
526 - res = decompress_data(bfile, fin, b->next_to_fill, b->end - b->next_to_fill);
527 - if (res == -1) {
528 - return(res);
529 - }
530 - if (bfile->bytes_written < 0) {
531 - /* fprintf(stderr,"read of file failed\n"); */
532 - return(-1);
533 - }
534 - else {
535 - /* really?? FIXME check this */
536 - if (buffer_is_empty(b)) {
537 - b->next_to_read = b->next_to_fill; /* where we just read */
538 - }
539 - b->bytes_avail += bfile->bytes_written;
540 - b->next_to_fill += bfile->bytes_written;
541 - b->next_to_fill[0] = '\0';
542 - return(0);
543 - }
544 -}
545 -
546 -void dumpbuf_info_t(buf_info_t *b) {
547 - fprintf(stdout, "\n");
548 - fprintf(stdout, "b->buffer: %ld\n", (long int) b->buffer);
549 - fprintf(stdout, "b->end: %ld\n", (long int) b->end);
550 - fprintf(stdout, "b->next_to_read: %ld\n", (long int) b->next_to_read);
551 - fprintf(stdout, "b->next_to_fill: %ld\n", (long int) b->next_to_fill);
552 - fprintf(stdout, "b->bytes_avail: %ld\n", (long int) b->bytes_avail);
553 -}
554 -
555 -
556 -/*
557 - copy text from end of buffer to the beginning, that we want to keep
558 - around for further processing (i.e. further regex matches)
559 - returns number of bytes copied
560 -*/
561 -int move_bytes_to_buffer_start(buf_info_t *b, unsigned char *from_where, int maxbytes) {
562 - int i, tocopy;
563 -
564 - if (from_where >= b->end) {
565 - return(0);
566 - }
567 - else {
568 - tocopy = b->end - from_where;
569 - if (maxbytes && (tocopy > maxbytes)) {
570 - tocopy = maxbytes;
571 - }
572 - for (i = 0; i < tocopy; i++) {
573 - b->buffer[i] = from_where[i];
574 - }
575 - b->next_to_fill = b->buffer + tocopy;
576 - b->next_to_fill[0] = '\0';
577 - b->next_to_read = b->buffer;
578 - b->bytes_avail = tocopy;
579 - return(tocopy);
580 - }
581 -}
582 -
583 -/*
58439 get the first page id after position in file
58540 if a pageid is found, the structure pinfo will be updated accordingly
58641 returns:
@@ -614,12 +69,12 @@
61570
61671 bfile.bytes_read = 0;
61772
618 - if (find_first_bz2_block_after_offset(&bfile, fin, position) == -1) {
 73+ if (find_first_bz2_block_from_offset(&bfile, fin, position, FORWARD) <= 0) {
61974 /* fprintf(stderr,"failed to find block in bz2file (1)\n"); */
62075 return(-1);
62176 }
62277
623 - while (!get_buffer_of_uncompressed_data(b, fin, &bfile) && (! bfile.eof)) {
 78+ while (!get_buffer_of_uncompressed_data(b, fin, &bfile, FORWARD) && (! bfile.eof)) {
62479 if (bfile.bytes_read) {
62580 while (regexec(&compiled_page_id, (char *)b->next_to_read, 2, match_page_id, 0 ) == 0) {
62681 if (match_page_id[1].rm_so >=0) {
Index: branches/ariel/xmldumps-backup/mwbzutils/bzlib_private.h
@@ -0,0 +1,509 @@
 2+
 3+/*-------------------------------------------------------------*/
 4+/*--- Private header file for the library. ---*/
 5+/*--- bzlib_private.h ---*/
 6+/*-------------------------------------------------------------*/
 7+
 8+/* ------------------------------------------------------------------
 9+ This file is part of bzip2/libbzip2, a program and library for
 10+ lossless, block-sorting data compression.
 11+
 12+ bzip2/libbzip2 version 1.0.6 of 6 September 2010
 13+ Copyright (C) 1996-2010 Julian Seward <jseward@bzip.org>
 14+
 15+ Please read the WARNING, DISCLAIMER and PATENTS sections in the
 16+ README file.
 17+
 18+ This program is released under the terms of the license contained
 19+ in the file LICENSE.
 20+ ------------------------------------------------------------------ */
 21+
 22+
 23+#ifndef _BZLIB_PRIVATE_H
 24+#define _BZLIB_PRIVATE_H
 25+
 26+#include <stdlib.h>
 27+
 28+#ifndef BZ_NO_STDIO
 29+#include <stdio.h>
 30+#include <ctype.h>
 31+#include <string.h>
 32+#endif
 33+
 34+#include "bzlib.h"
 35+
 36+
 37+
 38+/*-- General stuff. --*/
 39+
 40+#define BZ_VERSION "1.0.6, 6-Sept-2010"
 41+
 42+typedef char Char;
 43+typedef unsigned char Bool;
 44+typedef unsigned char UChar;
 45+typedef int Int32;
 46+typedef unsigned int UInt32;
 47+typedef short Int16;
 48+typedef unsigned short UInt16;
 49+
 50+#define True ((Bool)1)
 51+#define False ((Bool)0)
 52+
 53+#ifndef __GNUC__
 54+#define __inline__ /* */
 55+#endif
 56+
 57+#ifndef BZ_NO_STDIO
 58+
 59+extern void BZ2_bz__AssertH__fail ( int errcode );
 60+#define AssertH(cond,errcode) \
 61+ { if (!(cond)) BZ2_bz__AssertH__fail ( errcode ); }
 62+
 63+#if BZ_DEBUG
 64+#define AssertD(cond,msg) \
 65+ { if (!(cond)) { \
 66+ fprintf ( stderr, \
 67+ "\n\nlibbzip2(debug build): internal error\n\t%s\n", msg );\
 68+ exit(1); \
 69+ }}
 70+#else
 71+#define AssertD(cond,msg) /* */
 72+#endif
 73+
 74+#define VPrintf0(zf) \
 75+ fprintf(stderr,zf)
 76+#define VPrintf1(zf,za1) \
 77+ fprintf(stderr,zf,za1)
 78+#define VPrintf2(zf,za1,za2) \
 79+ fprintf(stderr,zf,za1,za2)
 80+#define VPrintf3(zf,za1,za2,za3) \
 81+ fprintf(stderr,zf,za1,za2,za3)
 82+#define VPrintf4(zf,za1,za2,za3,za4) \
 83+ fprintf(stderr,zf,za1,za2,za3,za4)
 84+#define VPrintf5(zf,za1,za2,za3,za4,za5) \
 85+ fprintf(stderr,zf,za1,za2,za3,za4,za5)
 86+
 87+#else
 88+
 89+extern void bz_internal_error ( int errcode );
 90+#define AssertH(cond,errcode) \
 91+ { if (!(cond)) bz_internal_error ( errcode ); }
 92+#define AssertD(cond,msg) do { } while (0)
 93+#define VPrintf0(zf) do { } while (0)
 94+#define VPrintf1(zf,za1) do { } while (0)
 95+#define VPrintf2(zf,za1,za2) do { } while (0)
 96+#define VPrintf3(zf,za1,za2,za3) do { } while (0)
 97+#define VPrintf4(zf,za1,za2,za3,za4) do { } while (0)
 98+#define VPrintf5(zf,za1,za2,za3,za4,za5) do { } while (0)
 99+
 100+#endif
 101+
 102+
 103+#define BZALLOC(nnn) (strm->bzalloc)(strm->opaque,(nnn),1)
 104+#define BZFREE(ppp) (strm->bzfree)(strm->opaque,(ppp))
 105+
 106+
 107+/*-- Header bytes. --*/
 108+
 109+#define BZ_HDR_B 0x42 /* 'B' */
 110+#define BZ_HDR_Z 0x5a /* 'Z' */
 111+#define BZ_HDR_h 0x68 /* 'h' */
 112+#define BZ_HDR_0 0x30 /* '0' */
 113+
 114+/*-- Constants for the back end. --*/
 115+
 116+#define BZ_MAX_ALPHA_SIZE 258
 117+#define BZ_MAX_CODE_LEN 23
 118+
 119+#define BZ_RUNA 0
 120+#define BZ_RUNB 1
 121+
 122+#define BZ_N_GROUPS 6
 123+#define BZ_G_SIZE 50
 124+#define BZ_N_ITERS 4
 125+
 126+#define BZ_MAX_SELECTORS (2 + (900000 / BZ_G_SIZE))
 127+
 128+
 129+
 130+/*-- Stuff for randomising repetitive blocks. --*/
 131+
 132+extern Int32 BZ2_rNums[512];
 133+
 134+#define BZ_RAND_DECLS \
 135+ Int32 rNToGo; \
 136+ Int32 rTPos \
 137+
 138+#define BZ_RAND_INIT_MASK \
 139+ s->rNToGo = 0; \
 140+ s->rTPos = 0 \
 141+
 142+#define BZ_RAND_MASK ((s->rNToGo == 1) ? 1 : 0)
 143+
 144+#define BZ_RAND_UPD_MASK \
 145+ if (s->rNToGo == 0) { \
 146+ s->rNToGo = BZ2_rNums[s->rTPos]; \
 147+ s->rTPos++; \
 148+ if (s->rTPos == 512) s->rTPos = 0; \
 149+ } \
 150+ s->rNToGo--;
 151+
 152+
 153+
 154+/*-- Stuff for doing CRCs. --*/
 155+
 156+extern UInt32 BZ2_crc32Table[256];
 157+
 158+#define BZ_INITIALISE_CRC(crcVar) \
 159+{ \
 160+ crcVar = 0xffffffffL; \
 161+}
 162+
 163+#define BZ_FINALISE_CRC(crcVar) \
 164+{ \
 165+ crcVar = ~(crcVar); \
 166+}
 167+
 168+#define BZ_UPDATE_CRC(crcVar,cha) \
 169+{ \
 170+ crcVar = (crcVar << 8) ^ \
 171+ BZ2_crc32Table[(crcVar >> 24) ^ \
 172+ ((UChar)cha)]; \
 173+}
 174+
 175+
 176+
 177+/*-- States and modes for compression. --*/
 178+
 179+#define BZ_M_IDLE 1
 180+#define BZ_M_RUNNING 2
 181+#define BZ_M_FLUSHING 3
 182+#define BZ_M_FINISHING 4
 183+
 184+#define BZ_S_OUTPUT 1
 185+#define BZ_S_INPUT 2
 186+
 187+#define BZ_N_RADIX 2
 188+#define BZ_N_QSORT 12
 189+#define BZ_N_SHELL 18
 190+#define BZ_N_OVERSHOOT (BZ_N_RADIX + BZ_N_QSORT + BZ_N_SHELL + 2)
 191+
 192+
 193+
 194+
 195+/*-- Structure holding all the compression-side stuff. --*/
 196+
 197+typedef
 198+ struct {
 199+ /* pointer back to the struct bz_stream */
 200+ bz_stream* strm;
 201+
 202+ /* mode this stream is in, and whether inputting */
 203+ /* or outputting data */
 204+ Int32 mode;
 205+ Int32 state;
 206+
 207+ /* remembers avail_in when flush/finish requested */
 208+ UInt32 avail_in_expect;
 209+
 210+ /* for doing the block sorting */
 211+ UInt32* arr1;
 212+ UInt32* arr2;
 213+ UInt32* ftab;
 214+ Int32 origPtr;
 215+
 216+ /* aliases for arr1 and arr2 */
 217+ UInt32* ptr;
 218+ UChar* block;
 219+ UInt16* mtfv;
 220+ UChar* zbits;
 221+
 222+ /* for deciding when to use the fallback sorting algorithm */
 223+ Int32 workFactor;
 224+
 225+ /* run-length-encoding of the input */
 226+ UInt32 state_in_ch;
 227+ Int32 state_in_len;
 228+ BZ_RAND_DECLS;
 229+
 230+ /* input and output limits and current posns */
 231+ Int32 nblock;
 232+ Int32 nblockMAX;
 233+ Int32 numZ;
 234+ Int32 state_out_pos;
 235+
 236+ /* map of bytes used in block */
 237+ Int32 nInUse;
 238+ Bool inUse[256];
 239+ UChar unseqToSeq[256];
 240+
 241+ /* the buffer for bit stream creation */
 242+ UInt32 bsBuff;
 243+ Int32 bsLive;
 244+
 245+ /* block and combined CRCs */
 246+ UInt32 blockCRC;
 247+ UInt32 combinedCRC;
 248+
 249+ /* misc administratium */
 250+ Int32 verbosity;
 251+ Int32 blockNo;
 252+ Int32 blockSize100k;
 253+
 254+ /* stuff for coding the MTF values */
 255+ Int32 nMTF;
 256+ Int32 mtfFreq [BZ_MAX_ALPHA_SIZE];
 257+ UChar selector [BZ_MAX_SELECTORS];
 258+ UChar selectorMtf[BZ_MAX_SELECTORS];
 259+
 260+ UChar len [BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
 261+ Int32 code [BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
 262+ Int32 rfreq [BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
 263+ /* second dimension: only 3 needed; 4 makes index calculations faster */
 264+ UInt32 len_pack[BZ_MAX_ALPHA_SIZE][4];
 265+
 266+ }
 267+ EState;
 268+
 269+
 270+
 271+/*-- externs for compression. --*/
 272+
 273+extern void
 274+BZ2_blockSort ( EState* );
 275+
 276+extern void
 277+BZ2_compressBlock ( EState*, Bool );
 278+
 279+extern void
 280+BZ2_bsInitWrite ( EState* );
 281+
 282+extern void
 283+BZ2_hbAssignCodes ( Int32*, UChar*, Int32, Int32, Int32 );
 284+
 285+extern void
 286+BZ2_hbMakeCodeLengths ( UChar*, Int32*, Int32, Int32 );
 287+
 288+
 289+
 290+/*-- states for decompression. --*/
 291+
 292+#define BZ_X_IDLE 1
 293+#define BZ_X_OUTPUT 2
 294+
 295+#define BZ_X_MAGIC_1 10
 296+#define BZ_X_MAGIC_2 11
 297+#define BZ_X_MAGIC_3 12
 298+#define BZ_X_MAGIC_4 13
 299+#define BZ_X_BLKHDR_1 14
 300+#define BZ_X_BLKHDR_2 15
 301+#define BZ_X_BLKHDR_3 16
 302+#define BZ_X_BLKHDR_4 17
 303+#define BZ_X_BLKHDR_5 18
 304+#define BZ_X_BLKHDR_6 19
 305+#define BZ_X_BCRC_1 20
 306+#define BZ_X_BCRC_2 21
 307+#define BZ_X_BCRC_3 22
 308+#define BZ_X_BCRC_4 23
 309+#define BZ_X_RANDBIT 24
 310+#define BZ_X_ORIGPTR_1 25
 311+#define BZ_X_ORIGPTR_2 26
 312+#define BZ_X_ORIGPTR_3 27
 313+#define BZ_X_MAPPING_1 28
 314+#define BZ_X_MAPPING_2 29
 315+#define BZ_X_SELECTOR_1 30
 316+#define BZ_X_SELECTOR_2 31
 317+#define BZ_X_SELECTOR_3 32
 318+#define BZ_X_CODING_1 33
 319+#define BZ_X_CODING_2 34
 320+#define BZ_X_CODING_3 35
 321+#define BZ_X_MTF_1 36
 322+#define BZ_X_MTF_2 37
 323+#define BZ_X_MTF_3 38
 324+#define BZ_X_MTF_4 39
 325+#define BZ_X_MTF_5 40
 326+#define BZ_X_MTF_6 41
 327+#define BZ_X_ENDHDR_2 42
 328+#define BZ_X_ENDHDR_3 43
 329+#define BZ_X_ENDHDR_4 44
 330+#define BZ_X_ENDHDR_5 45
 331+#define BZ_X_ENDHDR_6 46
 332+#define BZ_X_CCRC_1 47
 333+#define BZ_X_CCRC_2 48
 334+#define BZ_X_CCRC_3 49
 335+#define BZ_X_CCRC_4 50
 336+
 337+
 338+
 339+/*-- Constants for the fast MTF decoder. --*/
 340+
 341+#define MTFA_SIZE 4096
 342+#define MTFL_SIZE 16
 343+
 344+
 345+
 346+/*-- Structure holding all the decompression-side stuff. --*/
 347+
 348+typedef
 349+ struct {
 350+ /* pointer back to the struct bz_stream */
 351+ bz_stream* strm;
 352+
 353+ /* state indicator for this stream */
 354+ Int32 state;
 355+
 356+ /* for doing the final run-length decoding */
 357+ UChar state_out_ch;
 358+ Int32 state_out_len;
 359+ Bool blockRandomised;
 360+ BZ_RAND_DECLS;
 361+
 362+ /* the buffer for bit stream reading */
 363+ UInt32 bsBuff;
 364+ Int32 bsLive;
 365+
 366+ /* misc administratium */
 367+ Int32 blockSize100k;
 368+ Bool smallDecompress;
 369+ Int32 currBlockNo;
 370+ Int32 verbosity;
 371+
 372+ /* for undoing the Burrows-Wheeler transform */
 373+ Int32 origPtr;
 374+ UInt32 tPos;
 375+ Int32 k0;
 376+ Int32 unzftab[256];
 377+ Int32 nblock_used;
 378+ Int32 cftab[257];
 379+ Int32 cftabCopy[257];
 380+
 381+ /* for undoing the Burrows-Wheeler transform (FAST) */
 382+ UInt32 *tt;
 383+
 384+ /* for undoing the Burrows-Wheeler transform (SMALL) */
 385+ UInt16 *ll16;
 386+ UChar *ll4;
 387+
 388+ /* stored and calculated CRCs */
 389+ UInt32 storedBlockCRC;
 390+ UInt32 storedCombinedCRC;
 391+ UInt32 calculatedBlockCRC;
 392+ UInt32 calculatedCombinedCRC;
 393+
 394+ /* map of bytes used in block */
 395+ Int32 nInUse;
 396+ Bool inUse[256];
 397+ Bool inUse16[16];
 398+ UChar seqToUnseq[256];
 399+
 400+ /* for decoding the MTF values */
 401+ UChar mtfa [MTFA_SIZE];
 402+ Int32 mtfbase[256 / MTFL_SIZE];
 403+ UChar selector [BZ_MAX_SELECTORS];
 404+ UChar selectorMtf[BZ_MAX_SELECTORS];
 405+ UChar len [BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
 406+
 407+ Int32 limit [BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
 408+ Int32 base [BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
 409+ Int32 perm [BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
 410+ Int32 minLens[BZ_N_GROUPS];
 411+
 412+ /* save area for scalars in the main decompress code */
 413+ Int32 save_i;
 414+ Int32 save_j;
 415+ Int32 save_t;
 416+ Int32 save_alphaSize;
 417+ Int32 save_nGroups;
 418+ Int32 save_nSelectors;
 419+ Int32 save_EOB;
 420+ Int32 save_groupNo;
 421+ Int32 save_groupPos;
 422+ Int32 save_nextSym;
 423+ Int32 save_nblockMAX;
 424+ Int32 save_nblock;
 425+ Int32 save_es;
 426+ Int32 save_N;
 427+ Int32 save_curr;
 428+ Int32 save_zt;
 429+ Int32 save_zn;
 430+ Int32 save_zvec;
 431+ Int32 save_zj;
 432+ Int32 save_gSel;
 433+ Int32 save_gMinlen;
 434+ Int32* save_gLimit;
 435+ Int32* save_gBase;
 436+ Int32* save_gPerm;
 437+
 438+ }
 439+ DState;
 440+
 441+
 442+
 443+/*-- Macros for decompression. --*/
 444+
 445+#define BZ_GET_FAST(cccc) \
 446+ /* c_tPos is unsigned, hence test < 0 is pointless. */ \
 447+ if (s->tPos >= (UInt32)100000 * (UInt32)s->blockSize100k) return True; \
 448+ s->tPos = s->tt[s->tPos]; \
 449+ cccc = (UChar)(s->tPos & 0xff); \
 450+ s->tPos >>= 8;
 451+
 452+#define BZ_GET_FAST_C(cccc) \
 453+ /* c_tPos is unsigned, hence test < 0 is pointless. */ \
 454+ if (c_tPos >= (UInt32)100000 * (UInt32)ro_blockSize100k) return True; \
 455+ c_tPos = c_tt[c_tPos]; \
 456+ cccc = (UChar)(c_tPos & 0xff); \
 457+ c_tPos >>= 8;
 458+
 459+#define SET_LL4(i,n) \
 460+ { if (((i) & 0x1) == 0) \
 461+ s->ll4[(i) >> 1] = (s->ll4[(i) >> 1] & 0xf0) | (n); else \
 462+ s->ll4[(i) >> 1] = (s->ll4[(i) >> 1] & 0x0f) | ((n) << 4); \
 463+ }
 464+
 465+#define GET_LL4(i) \
 466+ ((((UInt32)(s->ll4[(i) >> 1])) >> (((i) << 2) & 0x4)) & 0xF)
 467+
 468+#define SET_LL(i,n) \
 469+ { s->ll16[i] = (UInt16)(n & 0x0000ffff); \
 470+ SET_LL4(i, n >> 16); \
 471+ }
 472+
 473+#define GET_LL(i) \
 474+ (((UInt32)s->ll16[i]) | (GET_LL4(i) << 16))
 475+
 476+#define BZ_GET_SMALL(cccc) \
 477+ /* c_tPos is unsigned, hence test < 0 is pointless. */ \
 478+ if (s->tPos >= (UInt32)100000 * (UInt32)s->blockSize100k) return True; \
 479+ cccc = BZ2_indexIntoF ( s->tPos, s->cftab ); \
 480+ s->tPos = GET_LL(s->tPos);
 481+
 482+
 483+/*-- externs for decompression. --*/
 484+
 485+extern Int32
 486+BZ2_indexIntoF ( Int32, Int32* );
 487+
 488+extern Int32
 489+BZ2_decompress ( DState* );
 490+
 491+extern void
 492+BZ2_hbCreateDecodeTables ( Int32*, Int32*, Int32*, UChar*,
 493+ Int32, Int32, Int32 );
 494+
 495+
 496+#endif
 497+
 498+
 499+/*-- BZ_NO_STDIO seems to make NULL disappear on some platforms. --*/
 500+
 501+#ifdef BZ_NO_STDIO
 502+#ifndef NULL
 503+#define NULL 0
 504+#endif
 505+#endif
 506+
 507+
 508+/*-------------------------------------------------------------*/
 509+/*--- end bzlib_private.h ---*/
 510+/*-------------------------------------------------------------*/
Property changes on: branches/ariel/xmldumps-backup/mwbzutils/bzlib_private.h
___________________________________________________________________
Added: svn:eol-style
1511 + native
Index: branches/ariel/xmldumps-backup/mwbzutils/bzlibfuncs.c
@@ -0,0 +1,218 @@
 2+#include <unistd.h>
 3+#include <stdio.h>
 4+#include <string.h>
 5+#include <sys/types.h>
 6+#include <sys/stat.h>
 7+#include <fcntl.h>
 8+#include <stdlib.h>
 9+#include <errno.h>
 10+#include <sys/types.h>
 11+#include <regex.h>
 12+#include "bzlib_private.h"
 13+#include "bzlib.h"
 14+
 15+/*---------------------------------------------------*/
 16+/* Return True iff data corruption is discovered.
 17+ Returns False if there is no problem.
 18+*/
 19+Bool unRLE_obuf_to_output_FAST ( DState* s )
 20+{
 21+ UChar k1;
 22+
 23+ if (s->blockRandomised) {
 24+
 25+ while (True) {
 26+ /* try to finish existing run */
 27+ while (True) {
 28+ if (s->strm->avail_out == 0) return False;
 29+ if (s->state_out_len == 0) break;
 30+ *( (UChar*)(s->strm->next_out) ) = s->state_out_ch;
 31+ BZ_UPDATE_CRC ( s->calculatedBlockCRC, s->state_out_ch );
 32+ s->state_out_len--;
 33+ s->strm->next_out++;
 34+ s->strm->avail_out--;
 35+ s->strm->total_out_lo32++;
 36+ if (s->strm->total_out_lo32 == 0) s->strm->total_out_hi32++;
 37+ }
 38+
 39+ /* can a new run be started? */
 40+ if (s->nblock_used == s->save_nblock+1) return False;
 41+
 42+ /* Only caused by corrupt data stream? */
 43+ if (s->nblock_used > s->save_nblock+1)
 44+ return True;
 45+
 46+ s->state_out_len = 1;
 47+ s->state_out_ch = s->k0;
 48+ BZ_GET_FAST(k1); BZ_RAND_UPD_MASK;
 49+ k1 ^= BZ_RAND_MASK; s->nblock_used++;
 50+ if (s->nblock_used == s->save_nblock+1) continue;
 51+ if (k1 != s->k0) { s->k0 = k1; continue; };
 52+
 53+ s->state_out_len = 2;
 54+ BZ_GET_FAST(k1); BZ_RAND_UPD_MASK;
 55+ k1 ^= BZ_RAND_MASK; s->nblock_used++;
 56+ if (s->nblock_used == s->save_nblock+1) continue;
 57+ if (k1 != s->k0) { s->k0 = k1; continue; };
 58+
 59+ s->state_out_len = 3;
 60+ BZ_GET_FAST(k1); BZ_RAND_UPD_MASK;
 61+ k1 ^= BZ_RAND_MASK; s->nblock_used++;
 62+ if (s->nblock_used == s->save_nblock+1) continue;
 63+ if (k1 != s->k0) { s->k0 = k1; continue; };
 64+
 65+ BZ_GET_FAST(k1); BZ_RAND_UPD_MASK;
 66+ k1 ^= BZ_RAND_MASK; s->nblock_used++;
 67+ s->state_out_len = ((Int32)k1) + 4;
 68+ BZ_GET_FAST(s->k0); BZ_RAND_UPD_MASK;
 69+ s->k0 ^= BZ_RAND_MASK; s->nblock_used++;
 70+ }
 71+
 72+ } else {
 73+
 74+ /* restore */
 75+ UInt32 c_calculatedBlockCRC = s->calculatedBlockCRC;
 76+ UChar c_state_out_ch = s->state_out_ch;
 77+ Int32 c_state_out_len = s->state_out_len;
 78+ Int32 c_nblock_used = s->nblock_used;
 79+ Int32 c_k0 = s->k0;
 80+ UInt32* c_tt = s->tt;
 81+ UInt32 c_tPos = s->tPos;
 82+ char* cs_next_out = s->strm->next_out;
 83+ unsigned int cs_avail_out = s->strm->avail_out;
 84+ Int32 ro_blockSize100k = s->blockSize100k;
 85+ /* end restore */
 86+
 87+ UInt32 avail_out_INIT = cs_avail_out;
 88+ Int32 s_save_nblockPP = s->save_nblock+1;
 89+ unsigned int total_out_lo32_old;
 90+
 91+ while (True) {
 92+
 93+ /* try to finish existing run */
 94+ if (c_state_out_len > 0) {
 95+ while (True) {
 96+ if (cs_avail_out == 0) goto return_notr;
 97+ if (c_state_out_len == 1) break;
 98+ *( (UChar*)(cs_next_out) ) = c_state_out_ch;
 99+ BZ_UPDATE_CRC ( c_calculatedBlockCRC, c_state_out_ch );
 100+ c_state_out_len--;
 101+ cs_next_out++;
 102+ cs_avail_out--;
 103+ }
 104+ s_state_out_len_eq_one:
 105+ {
 106+ if (cs_avail_out == 0) {
 107+ c_state_out_len = 1; goto return_notr;
 108+ };
 109+ *( (UChar*)(cs_next_out) ) = c_state_out_ch;
 110+ BZ_UPDATE_CRC ( c_calculatedBlockCRC, c_state_out_ch );
 111+ cs_next_out++;
 112+ cs_avail_out--;
 113+ }
 114+ }
 115+ /* Only caused by corrupt data stream? */
 116+ if (c_nblock_used > s_save_nblockPP)
 117+ return True;
 118+
 119+ /* can a new run be started? */
 120+ if (c_nblock_used == s_save_nblockPP) {
 121+ c_state_out_len = 0; goto return_notr;
 122+ };
 123+ c_state_out_ch = c_k0;
 124+ BZ_GET_FAST_C(k1); c_nblock_used++;
 125+ if (k1 != c_k0) {
 126+ c_k0 = k1; goto s_state_out_len_eq_one;
 127+ };
 128+ if (c_nblock_used == s_save_nblockPP)
 129+ goto s_state_out_len_eq_one;
 130+
 131+ c_state_out_len = 2;
 132+ BZ_GET_FAST_C(k1); c_nblock_used++;
 133+ if (c_nblock_used == s_save_nblockPP) continue;
 134+ if (k1 != c_k0) { c_k0 = k1; continue; };
 135+
 136+ c_state_out_len = 3;
 137+ BZ_GET_FAST_C(k1); c_nblock_used++;
 138+ if (c_nblock_used == s_save_nblockPP) continue;
 139+ if (k1 != c_k0) { c_k0 = k1; continue; };
 140+
 141+ BZ_GET_FAST_C(k1); c_nblock_used++;
 142+ c_state_out_len = ((Int32)k1) + 4;
 143+ BZ_GET_FAST_C(c_k0); c_nblock_used++;
 144+ }
 145+
 146+ return_notr:
 147+ total_out_lo32_old = s->strm->total_out_lo32;
 148+ s->strm->total_out_lo32 += (avail_out_INIT - cs_avail_out);
 149+ if (s->strm->total_out_lo32 < total_out_lo32_old)
 150+ s->strm->total_out_hi32++;
 151+
 152+ /* save */
 153+ s->calculatedBlockCRC = c_calculatedBlockCRC;
 154+ s->state_out_ch = c_state_out_ch;
 155+ s->state_out_len = c_state_out_len;
 156+ s->nblock_used = c_nblock_used;
 157+ s->k0 = c_k0;
 158+ s->tt = c_tt;
 159+ s->tPos = c_tPos;
 160+ s->strm->next_out = cs_next_out;
 161+ s->strm->avail_out = cs_avail_out;
 162+ /* end save */
 163+ }
 164+ return False;
 165+}
 166+
 167+int BZ_API(BZ2_bzDecompress_mine) ( bz_stream *strm )
 168+{
 169+ Bool corrupt;
 170+ DState* s;
 171+ if (strm == NULL) return BZ_PARAM_ERROR;
 172+ s = strm->state;
 173+ if (s == NULL) return BZ_PARAM_ERROR;
 174+ if (s->strm != strm) return BZ_PARAM_ERROR;
 175+
 176+ while (True) {
 177+ if (s->state == BZ_X_IDLE) return BZ_SEQUENCE_ERROR;
 178+ if (s->state == BZ_X_OUTPUT) {
 179+ /* if (s->smallDecompress)
 180+ corrupt = unRLE_obuf_to_output_SMALL ( s ); else
 181+ corrupt = unRLE_obuf_to_output_FAST ( s ); */
 182+
 183+ corrupt = unRLE_obuf_to_output_FAST ( s );
 184+ if (corrupt) return BZ_DATA_ERROR;
 185+ if (s->nblock_used == s->save_nblock+1 && s->state_out_len == 0) {
 186+ BZ_FINALISE_CRC ( s->calculatedBlockCRC );
 187+ if (s->verbosity >= 3)
 188+ VPrintf2 ( " {0x%08x, 0x%08x}", s->storedBlockCRC,
 189+ s->calculatedBlockCRC );
 190+ if (s->verbosity >= 2) VPrintf0 ( "]" );
 191+ if (s->calculatedBlockCRC != s->storedBlockCRC)
 192+ return BZ_DATA_ERROR;
 193+ s->calculatedCombinedCRC
 194+ = (s->calculatedCombinedCRC << 1) |
 195+ (s->calculatedCombinedCRC >> 31);
 196+ s->calculatedCombinedCRC ^= s->calculatedBlockCRC;
 197+ s->state = BZ_X_BLKHDR_1;
 198+ } else {
 199+ return BZ_OK;
 200+ }
 201+ }
 202+ if (s->state >= BZ_X_MAGIC_1) {
 203+ Int32 r = BZ2_decompress ( s );
 204+ if (r == BZ_STREAM_END) {
 205+ if (s->verbosity >= 3)
 206+ VPrintf2 ( "\n combined CRCs: stored = 0x%08x, computed = 0x%08x",
 207+ s->storedCombinedCRC, s->calculatedCombinedCRC );
 208+ /* if (s->calculatedCombinedCRC != s->storedCombinedCRC)
 209+ return BZ_DATA_ERROR; */
 210+ return r;
 211+ }
 212+ if (s->state != BZ_X_OUTPUT) return r;
 213+ }
 214+ }
 215+
 216+ AssertH ( 0, 6001 );
 217+
 218+ return 0; /*NOTREACHED*/
 219+}
Property changes on: branches/ariel/xmldumps-backup/mwbzutils/bzlibfuncs.c
___________________________________________________________________
Added: svn:eol-style
1220 + native
Index: branches/ariel/xmldumps-backup/mwbzutils/checkforbz2footer.c
@@ -6,6 +6,7 @@
77 #include <fcntl.h>
88 #include <stdlib.h>
99 #include <errno.h>
 10+#include "mwbzutils.h"
1011
1112 /*
1213 Check to see whether a file ends with a bz2 footer or not
@@ -22,123 +23,11 @@
2324 */
2425
2526
26 -int read_footer(unsigned char *buffer, int fin) {
27 - int res;
28 -
29 - res = lseek(fin, -11, SEEK_END);
30 - if (res == -1) {
31 - fprintf(stderr,"lseek of file failed\n");
32 - exit(-1);
33 - }
34 - res = read(fin, buffer, 11);
35 - if (res == -1) {
36 - fprintf(stderr,"read of file failed\n");
37 - exit(-1);
38 - }
39 - return(0);
40 -}
41 -
42 -#define LEFT 0
43 -#define RIGHT 1
44 -
45 -/* return n ones either at left or right end */
46 -int bitmask(int numbits, int end) {
47 - if (end == RIGHT) {
48 - return((1<<numbits)-1);
49 - }
50 - else {
51 - return(((1<<numbits)-1) << (8-numbits));
52 - }
53 -}
54 -
55 -void shiftbytesright(unsigned char *buffer, int buflen, int numbits) {
56 - int i;
57 -
58 - for (i=buflen-1; i>=0; i--) {
59 - /* right 1 */
60 - buffer[i] = (unsigned char) ((int) (buffer[i]) >> numbits);
61 -
62 - /* grab rightmost from prev byte */
63 - if (i > 0) {
64 - buffer[i] = ( unsigned char ) ((unsigned int) buffer[i] | ( ((unsigned int) (buffer[i-1])<<(8-numbits)) & bitmask(1,LEFT)));
65 - }
66 - }
67 -}
68 -
69 -/* buff1 is some random bytes, buff2 is some random bytes which we expect to start with the contents of buff1,
70 - both buffers are bit-shifted to the right "bitsrightshifted". this function compares the two and returns 0 if buff2
71 - matches and 1 otherwise. */
72 -int bytescompare(unsigned char *buff1, unsigned char *buff2, int numbytes, int bitsrightshifted) {
73 - int i;
74 -
75 - if (bitsrightshifted == 0) {
76 - for (i = 0; i< numbytes; i++) {
77 - if (buff1[i] != buff2[i]) {
78 - return(1);
79 - }
80 - }
81 - return(0);
82 - }
83 - else {
84 - for (i = 1; i< numbytes-2; i++) {
85 - if (buff1[i] != buff2[i]) {
86 - return(1);
87 - }
88 - }
89 - /* do leftmost byte */
90 - if ((buff1[0] & bitmask(8-bitsrightshifted,RIGHT)) != (buff2[0] & bitmask(8-bitsrightshifted,RIGHT)) ) {
91 - return(1);
92 - }
93 - /* do rightmost byte */
94 - if ((buff1[numbytes-1] & bitmask(bitsrightshifted,LEFT)) != (buff2[numbytes-1] & bitmask(bitsrightshifted,LEFT)) ) {
95 - return(1);
96 - }
97 - return(0);
98 - }
99 -}
100 -
101 -int checkfileforfooter(int fin) {
102 - unsigned char buffer[11];
103 - int result, i;
104 - unsigned char **footer = malloc(8*sizeof(unsigned char *));
105 -
106 - /* set up footer plus its various right-shifted incarnations */
107 - /* dude why couldn't you have 0 padded each bzip2 block? seriously ... */
108 - for (i = 0; i< 8; i++) {
109 - footer[i] = malloc(sizeof(unsigned char)*7);
110 - }
111 - footer[0][0]= (unsigned char) 0x17;
112 - footer[0][1]= (unsigned char) 0x72;
113 - footer[0][2]= (unsigned char) 0x45;
114 - footer[0][3]= (unsigned char) 0x38;
115 - footer[0][4]= (unsigned char) 0x50;
116 - footer[0][5]= (unsigned char) 0x90;
117 - footer[0][6]= (unsigned char) 0x00;
118 - for (i = 1; i< 8; i++) {
119 - memcpy((char *)(footer[i]), (char *)(footer[i-1]),7);
120 - shiftbytesright(footer[i],7,1);
121 - }
122 -
123 - read_footer(buffer,fin);
124 -
125 - result = bytescompare(footer[0],buffer+1,6,0);
126 - if (!result) {
127 - return(0);
128 - }
129 -
130 - for (i=1; i<8; i++) {
131 - result = bytescompare(footer[i],buffer,7,i);
132 - if (!result) {
133 - return(0);
134 - }
135 - }
136 - return(1);
137 -}
138 -
13927 int main(int argc, char **argv) {
14028
14129 int fin;
14230 int result;
 31+ bz_info_t bfile;
14332
14433 if (argc != 2) {
14534 fprintf(stderr,"usage: %s infile\n", argv[0]);
@@ -149,7 +38,9 @@
15039 fprintf(stderr,"failed to open file %s for read\n", argv[1]);
15140 exit(-1);
15241 }
153 - result = checkfileforfooter(fin);
 42+
 43+ bfile.footer = init_footer();
 44+ result = check_file_for_footer(fin, &bfile);
15445 close(fin);
15546 exit(result);
15647 }
Index: branches/ariel/xmldumps-backup/mwbzutils/dumplastbz2block.c
@@ -6,8 +6,9 @@
77 #include <fcntl.h>
88 #include <stdlib.h>
99 #include <errno.h>
10 -#include "bzlib.h"
 10+#include "mwbzutils.h"
1111
 12+
1213 /*
1314 Find the last bz2 block marker in a file
1415 and dump whatever can be decompressed after
@@ -24,439 +25,73 @@
2526 1 if decompression fails, and -1 on error.
2627 */
2728
28 -#define BUFSIZE 121072
29 -typedef struct {
30 - unsigned char bufin[BUFSIZE];
31 - unsigned char bufout[BUFSIZE];
32 - int bufsize;
33 - bz_stream strm;
34 - unsigned char overflow;
35 - int bitsshifted;
36 - int position;
37 -} bzinfo;
38 -
39 -int read_footer(unsigned char *buffer, int fin) {
40 - int res;
41 -
42 - res = lseek(fin, -11, SEEK_END);
43 - if (res == -1) {
44 - fprintf(stderr,"lseek of file failed\n");
45 - exit(-1);
46 - }
47 - res = read(fin, buffer, 11);
48 - if (res == -1) {
49 - fprintf(stderr,"read of file failed\n");
50 - exit(-1);
51 - }
52 - return(0);
53 -}
54 -
55 -#define LEFT 0
56 -#define RIGHT 1
57 -
58 -/* return n ones either at left or right end */
59 -int bitmask(int numbits, int end) {
60 - if (end == RIGHT) {
61 - return((1<<numbits)-1);
62 - }
63 - else {
64 - return(((1<<numbits)-1) << (8-numbits));
65 - }
66 -}
67 -
68 -void shiftbytesleft(unsigned char *buffer, int buflen, int numbits) {
69 - int i;
70 -
71 - if (numbits == 0) {
72 - return;
73 - }
74 -
75 - for (i=0; i<buflen; i++) {
76 - /* left 1 */
77 - buffer[i] = (unsigned char) ((int) (buffer[i]) << numbits);
78 -
79 - /* grab leftmost from next byte */
80 - if (i < buflen-1) {
81 - buffer[i] = ( unsigned char ) ( (unsigned int) buffer[i] | ( ( ((unsigned int) buffer[i+1]) & bitmask(numbits,LEFT) ) >> (8-numbits) ) );
82 - }
83 - }
84 -}
85 -
86 -
87 -void shiftbytesright(unsigned char *buffer, int buflen, int numbits) {
88 - int i;
89 -
90 - for (i=buflen-1; i>=0; i--) {
91 - /* right 1 */
92 - buffer[i] = (unsigned char) ((int) (buffer[i]) >> numbits);
93 -
94 - /* grab rightmost from prev byte */
95 - if (i > 0) {
96 - buffer[i] = ( unsigned char ) ((unsigned int) buffer[i] | ( ((unsigned int) (buffer[i-1])<<(8-numbits)) & bitmask(numbits,LEFT)));
97 - }
98 - }
99 -}
100 -
101 -unsigned char ** init_marker() {
102 - unsigned char **marker = malloc(8*sizeof(unsigned char *));
103 - int i;
104 -
105 - /* set up block marker plus its various right-shifted incarnations */
106 - for (i = 0; i< 8; i++) {
107 - marker[i] = malloc(sizeof(unsigned char)*7);
108 - }
109 - marker[0][0]= (unsigned char) 0x31;
110 - marker[0][1]= (unsigned char) 0x41;
111 - marker[0][2]= (unsigned char) 0x59;
112 - marker[0][3]= (unsigned char) 0x26;
113 - marker[0][4]= (unsigned char) 0x53;
114 - marker[0][5]= (unsigned char) 0x59;
115 - marker[0][6]= (unsigned char) 0x00;
116 - for (i = 1; i< 8; i++) {
117 - memcpy((char *)(marker[i]), (char *)(marker[i-1]),7);
118 - shiftbytesright(marker[i],7,1);
119 - }
120 - return(marker);
121 -}
122 -
123 -unsigned char ** init_footer() {
124 - unsigned char **footer = malloc(8*sizeof(unsigned char *));
125 - int i;
126 -
127 - /* set up footer plus its various right-shifted incarnations */
128 - /* dude why couldn't you have 0 padded each bzip2 block? seriously ... */
129 - for (i = 0; i< 8; i++) {
130 - footer[i] = malloc(sizeof(unsigned char)*7);
131 - }
132 - footer[0][0]= (unsigned char) 0x17;
133 - footer[0][1]= (unsigned char) 0x72;
134 - footer[0][2]= (unsigned char) 0x45;
135 - footer[0][3]= (unsigned char) 0x38;
136 - footer[0][4]= (unsigned char) 0x50;
137 - footer[0][5]= (unsigned char) 0x90;
138 - footer[0][6]= (unsigned char) 0x00;
139 - for (i = 1; i< 8; i++) {
140 - memcpy((char *)(footer[i]), (char *)(footer[i-1]),7);
141 - shiftbytesright(footer[i],7,1);
142 - }
143 - return(footer);
144 -}
145 -
146 -
147 -/* buff1 is some random bytes, buff2 is some random bytes which we expect to start with the contents of buff1,
148 - both buffers are bit-shifted to the right "bitsrightshifted". this function compares the two and returns 1 if buff2
149 - matches and 0 otherwise. */
150 -int bytescompare(unsigned char *buff1, unsigned char *buff2, int numbytes, int bitsrightshifted) {
151 - int i;
152 -
153 - if (bitsrightshifted == 0) {
154 - for (i = 0; i< numbytes; i++) {
155 - if (buff1[i] != buff2[i]) {
156 - return(1);
157 - }
158 - }
159 - return(0);
160 - }
161 - else {
162 - for (i = 1; i< numbytes-2; i++) {
163 - if (buff1[i] != buff2[i]) {
164 - return(1);
165 - }
166 - }
167 - /* do leftmost byte */
168 - if ((buff1[0] & bitmask(8-bitsrightshifted,RIGHT)) != (buff2[0] & bitmask(8-bitsrightshifted,RIGHT)) ) {
169 - return(1);
170 - }
171 - /* do rightmost byte */
172 - if ((buff1[numbytes-1] & bitmask(bitsrightshifted,LEFT)) != (buff2[numbytes-1] & bitmask(bitsrightshifted,LEFT)) ) {
173 - return(1);
174 - }
175 - return(0);
176 - }
177 -}
178 -
179 -/* return -1 if no match
180 - return number of bits rightshifted otherwise */
181 -int checkfileforfooter(int fin, unsigned char **footer) {
182 - unsigned char buffer[11];
183 - int result, i;
184 -
185 - read_footer(buffer,fin);
186 -
187 - result = bytescompare(footer[0],buffer+1,6,0);
188 - if (!result) {
189 - return(0);
190 - }
191 -
192 - for (i=1; i<8; i++) {
193 - result = bytescompare(footer[i],buffer,7,i);
194 - if (!result) {
195 - return(i);
196 - }
197 - }
198 - return(-1);
199 -}
200 -
201 -/* return -1 if no match
202 - return number of bits rightshifted otherwise */
203 -int checkbufferforblockmarker(unsigned char *buffer, unsigned char **marker) {
204 - int result, i;
205 -
206 - result = bytescompare(marker[0],buffer+1,6,0);
207 - if (!result) {
208 - return(0);
209 - }
210 - for (i=1; i<8; i++) {
211 - result = bytescompare(marker[i],buffer,7,i);
212 - if (!result) {
213 - return(i);
214 - }
215 - }
216 - return(-1);
217 -}
218 -
219 -void clearbuffer(unsigned char *buf, int length) {
220 - int i;
221 -
222 - for (i=0; i<length; i++) {
223 - buf[i]=0;
224 - }
225 - return;
226 -}
227 -
228 -int findnextmarker(int fin, int *start_at, int *position, unsigned char **marker, unsigned char *buffer ) {
229 - int bitsshifted = -1;
230 - int result;
231 -
232 - /* must be after 4 byte file header, and we add a leftmost byte to the buffer
233 - of data read in case some bits have been shifted into it */
234 - while (*position >= 3 && bitsshifted < 0) {
235 - bitsshifted = checkbufferforblockmarker(buffer, marker);
236 - if (bitsshifted < 0) {
237 - (*start_at)++;
238 - /*
239 - if (*start_at % 10000 == 0) {
240 - fprintf(stderr, "starting at %d, position %d\n", *start_at, *position);
241 - }
242 - */
243 - *position = lseek(fin, -1*(*start_at), SEEK_END);
244 - if (*position == -1) {
245 - fprintf(stderr,"lseek of file failed\n");
246 - exit(-1);
247 - }
248 - result = read(fin, buffer, 7);
249 - if (result == -1) {
250 - fprintf(stderr,"read of file failed\n");
251 - exit(-1);
252 - }
253 - }
254 - else {
255 - return(bitsshifted);
256 - }
257 - }
258 - return(bitsshifted);
259 -}
260 -
261 -int init_decompress(bzinfo *bfile) {
262 - int bz_verbosity = 0;
263 - int bz_small = 0;
264 - int ret;
265 -
266 - bfile->strm.bzalloc = NULL;
267 - bfile->strm.bzfree = NULL;
268 - bfile->strm.opaque = NULL;
269 -
270 - ret = BZ2_bzDecompressInit ( &(bfile->strm), bz_verbosity, bz_small );
271 - if (ret != BZ_OK) {
272 - fprintf(stderr,"uncompress failed, err %d\n", ret);
273 - exit(-1);
274 - }
275 - return(ret);
276 -}
277 -
278 -int decompress_header(int fin, bzinfo *bfile) {
279 - int bytesread, ret;
280 - unsigned char header[4];
281 -
282 - lseek(fin,0,SEEK_SET);
283 - bytesread = read(fin, header, 4);
284 - if (bytesread < 4) {
285 - fprintf(stderr,"failed to read 4 bytes of header, exiting\n");
286 - exit(-1);
287 - }
288 - bfile->strm.next_in = (char *)header;
289 - bfile->strm.avail_in = 4;
290 -
291 - bfile->strm.next_out = (char *)(bfile->bufout);
292 - bfile->strm.avail_out = bfile->bufsize;
293 - ret = BZ2_bzDecompress ( &(bfile->strm) );
294 - if (BZ_OK != ret && BZ_STREAM_END != ret) {
295 - fprintf(stderr,"Corrupt bzip2 header, exiting\n");
296 - exit(-1);
297 - }
298 - return(ret);
299 -}
300 -
301 -int setup_first_buffer(int fin, bzinfo *bfile) {
302 - int bytesread, eof=0;
303 -
304 - if (bfile->bitsshifted == 0) {
305 - lseek(fin,bfile->position+1,SEEK_SET);
306 - }
307 - else {
308 - lseek(fin,bfile->position,SEEK_SET);
309 - }
310 - bytesread = read(fin, bfile->bufin, bfile->bufsize);
311 - if (bytesread > 0) {
312 - bfile->overflow = bfile->bufin[bytesread-1];
313 - shiftbytesleft(bfile->bufin,bytesread,bfile->bitsshifted);
314 -
315 - bfile->strm.next_in = (char *)(bfile->bufin);
316 - bfile->strm.avail_in = bytesread-1;
317 -
318 - bfile->strm.next_out = (char *)(bfile->bufout);
319 - bfile->strm.avail_out = bfile->bufsize;
320 - }
321 - if (bytesread <=0) {
322 - eof++;
323 - }
324 - return(eof);
325 -}
326 -
327 -int do_last_byte(bzinfo *bfile) {
328 - int ret=BZ_OK;
329 - int written;
330 -
331 - if (bfile->strm.avail_in == 0) {
332 - bfile->strm.next_in = (char *)(bfile->bufin);
333 - bfile->bufin[0] = bfile->overflow;
334 - shiftbytesleft(bfile->bufin,1,bfile->bitsshifted);
335 - bfile->strm.avail_in = 1;
336 - bfile->strm.next_out = (char *)(bfile->bufout);
337 - bfile->strm.avail_out = bfile->bufsize;
338 - ret = BZ2_bzDecompress ( &(bfile->strm) );
339 - if (BZ_OK == ret || BZ_STREAM_END == ret) {
340 - written = fwrite(bfile->bufout, sizeof(unsigned char), (unsigned char *)bfile->strm.next_out - bfile->bufout, stdout);
341 - }
342 - }
343 - return(ret);
344 -}
345 -
346 -int read_next_buffer(int fin, bzinfo *bfile, int ret) {
347 - int bytesread, eof=0;
348 -
349 - /* fprintf(stderr," got return from decompress of %d\n", ret); */
350 -
351 - if (bfile->strm.avail_in == 0) {
352 - bfile->strm.next_in = (char *)(bfile->bufin);
353 - bfile->bufin[0] = bfile->overflow;
354 - bytesread = read(fin, bfile->bufin+1, bfile->bufsize-1);
355 - if (bytesread > 0) {
356 - bfile->overflow = bfile->bufin[bytesread];
357 - shiftbytesleft(bfile->bufin,bytesread+1,bfile->bitsshifted);
358 - bfile->strm.avail_in = bytesread;
359 - }
360 - else {
361 - eof++;
362 - bfile->strm.avail_in = 0;
363 - }
364 - }
365 - bfile->strm.next_out = (char *)(bfile->bufout);
366 - bfile->strm.avail_out = bfile->bufsize;
367 -
368 - return(eof);
369 -}
370 -
371 -
37229 int main(int argc, char **argv) {
37330
374 - bzinfo bfile;
 31+ bz_info_t bfile;
37532
37633 int fin;
377 - int result, ret;
378 - unsigned char buffer[8];
 34+ int result;
 35+ buf_info_t *b;
37936
380 - unsigned char **footer;
381 - unsigned char **marker;
 37+ int firstblock = 1;
 38+ int length = 5000; /* output buffer size */
38239
383 - int written=0;
384 - int start_at;
385 -
386 - int eof = 0;
387 -
38840 if (argc != 2) {
38941 fprintf(stderr,"usage: %s infile\n", argv[0]);
39042 exit(-1);
39143 }
39244
393 - marker = init_marker();
394 - footer = init_footer();
395 -
39645 fin = open (argv[1], O_RDONLY);
39746 if (fin < 0) {
39847 fprintf(stderr,"failed to open file %s for read\n", argv[1]);
39948 exit(-1);
40049 }
40150
402 - bfile.bufsize = BUFSIZE;
403 -
404 - result = checkfileforfooter(fin, footer);
 51+ bfile.file_size = get_file_size(fin);
 52+ bfile.footer = init_footer();
 53+ result = check_file_for_footer(fin, &bfile);
40554 if (result == -1) {
406 - start_at = 0;
 55+ bfile.position = bfile.file_size;
40756 }
40857 else {
409 - start_at = 11; /* size of footer, perhaps with 1 byte extra */
 58+ bfile.position = bfile.file_size - 11; /* size of footer, perhaps with 1 byte extra */
41059 }
411 - start_at +=6; /* size of marker */
412 - bfile.position = lseek(fin, -1*start_at, SEEK_END);
413 - if (bfile.position == -1) {
414 - fprintf(stderr,"lseek of file failed\n");
415 - exit(-1);
416 - }
417 - result = read(fin, buffer, 7);
418 - if (result == -1) {
419 - fprintf(stderr,"read of file failed\n");
420 - exit(-1);
421 - }
 60+ bfile.position -=6; /* size of marker */
 61+ bfile.initialized = 0;
 62+ b = init_buffer(length);
 63+ bfile.bytes_read = 0;
42264
423 - while (1) {
 65+ /* init_bz2_file(&bfile, fin, BACKWARD); */
 66+ firstblock = 1;
42467
425 - bfile.bitsshifted = findnextmarker(fin, &start_at, &bfile.position, marker, buffer);
426 - if (bfile.bitsshifted >= 0) {
427 - /* fprintf(stderr, "found marker at pos %d and shifted %d, start_at is %d\n", bfile.position, bfile.bitsshifted, start_at); */
428 - ret = init_decompress(&bfile);
429 -
430 - /* pass in the header */
431 - ret = decompress_header(fin,&bfile);
432 -
433 - eof = setup_first_buffer(fin, &bfile);
434 -
435 - while (BZ_OK == ret && !eof) {
436 - ret = BZ2_bzDecompress ( &(bfile.strm) );
437 - if (BZ_OK == ret || BZ_STREAM_END == ret) {
438 - written += fwrite(bfile.bufout, sizeof(unsigned char), (unsigned char *)(bfile.strm.next_out) - bfile.bufout, stdout);
439 - }
440 - eof = read_next_buffer(fin, &bfile, ret);
441 - }
442 - if (BZ_OK == ret || BZ_STREAM_END == ret ) {
443 - /* so we read no bytes, process the last byte we held */
444 - do_last_byte(&bfile);
445 - }
446 - if (written == 0) {
447 - /* truncated block or other corruption, try going back one */
448 - start_at +=5;
449 - clearbuffer(buffer,sizeof(buffer));
450 - continue;
451 - }
452 - else {
453 - break;
454 - }
 68+ if (find_first_bz2_block_from_offset(&bfile, fin, bfile.position, BACKWARD) <= 0) {
 69+ fprintf(stderr,"failed to find block in bz2file\n");
 70+ exit(-1);
 71+ }
 72+ while ((get_buffer_of_uncompressed_data(b, fin, &bfile, FORWARD)>=0) && (! bfile.eof) && (! bfile.position ==0)) {
 73+ if (bfile.bytes_read) {
 74+ fwrite(b->next_to_read,b->bytes_avail,1,stdout);
 75+ b->next_to_read = b->end;
 76+ b->bytes_avail = 0;
 77+ b->next_to_fill = b->buffer; /* empty */
 78+ bfile.strm.next_out = (char *)b->next_to_fill;
 79+ bfile.strm.avail_out = b->end - b->next_to_fill;
 80+ firstblock = 0;
45581 }
45682 else {
457 - fprintf(stderr,"no block marker in this file.\n");
 83+ /* should never happen */
 84+ fprintf(stderr,"there was a block but now it's gone, giving up\n");
45885 exit(-1);
45986 }
46087 }
 88+ if (b->bytes_avail) {
 89+ fwrite(b->next_to_read,b->bytes_avail,1,stdout);
 90+ b->next_to_read = b->end;
 91+ b->bytes_avail = 0;
 92+ b->next_to_fill = b->buffer; /* empty */
 93+ bfile.strm.next_out = (char *)b->next_to_fill;
 94+ bfile.strm.avail_out = b->end - b->next_to_fill;
 95+ }
46196 close(fin);
46297 exit(0);
46398 }
Index: branches/ariel/xmldumps-backup/mwbzutils/mwbzutils.h
@@ -0,0 +1,139 @@
 2+#ifndef _MWBZUTILS_H
 3+#define _MWBZUTILS_H
 4+
 5+#include "bzlib_private.h"
 6+int BZ_API(BZ2_bzDecompress_mine) ( bz_stream *strm );
 7+
 8+typedef struct {
 9+ int page_id; /* first page id in the block */
 10+ int bits_shifted; /* block is right shifted this many bits */
 11+ int position; /* position in file of block */
 12+} page_info_t;
 13+
 14+#define BUFINSIZE 5000
 15+
 16+/*
 17+ keeps all information about a bzip2-compressed file
 18+ plus input/output buffers for decompression
 19+*/
 20+typedef struct {
 21+ unsigned char bufin[BUFINSIZE]; /* compressed data read from file */
 22+ unsigned char *bufout; /* uncompressed data, must be allocated by caller */
 23+ unsigned char marker_buffer[7]; /* data to test for bz2 block marker */
 24+ unsigned char header_buffer[4]; /* first 4 bytes of file (bzip2 header) */
 25+
 26+ int bufin_size; /* size of input buffer for compressed data */
 27+ int bufout_size; /* size of output buffer for decompressed data, may vary at each call */
 28+
 29+ int initialized; /* whether bz2file has been initialized (header processed, seek to
 30+ some bz2 block in the file and input buffer filled) */
 31+ int block_start; /* position of bz2 block in file from which we started to read (we
 32+ read a sequence of bz2 blocks from a given position, this is
 33+ the offset to the first one) */
 34+
 35+ bz_stream strm; /* stream structure for libbz2 */
 36+ unsigned char overflow; /* since compressed data may not be byte aligned, we keep the last byte
 37+ read around so we can grab the lower end bits off the end for
 38+ sticking in front of the next pile of compressed bytes we read */
 39+
 40+ int bits_shifted; /* number of bits that the compressed data has been right shifted
 41+ in the file (if the number is 0, the block marker and subsequent
 42+ data is byte-aligned) */
 43+ unsigned char **marker; /* bzip2 start of block marker, plus bit-shifted versions of it for
 44+ locating the marker in a stream of compressed data */
 45+ unsigned char **footer; /* bzip2 end of stream footer, plus bit-shifted versions of it for
 46+ locating the footer in a stream of compressed data */
 47+
 48+ int position; /* current offset into file from start of file */
 49+
 50+ int bytes_read; /* number of bytes of compressed data read from file (per read) */
 51+ int bytes_written; /* number of bytes of decompressed data written into output buffer (per decompress) */
 52+ int eof; /* nonzero if eof reached */
 53+ int file_size; /* length of file, so we don't search past it for blocks */
 54+} bz_info_t;
 55+
 56+#define MASKLEFT 0
 57+#define MASKRIGHT 1
 58+
 59+/*
 60+ this output buffer is used to collect decompressed output.
 61+ this is not a circular buffer; when it is full the user is
 62+ responsible for emptying it completely or partially and moving
 63+ to the beginning any unused bytes.
 64+
 65+*/
 66+typedef struct {
 67+ unsigned char *buffer; /* output storage; init_buffer() allocates size+1 bytes for it */
 68+ unsigned char *next_to_read; /* pointer to the next byte in the buffer with data to be read */
 69+ unsigned char *next_to_fill; /* pointer to the next byte in the buffer which is empty and can receive data */
 70+ int bytes_avail; /* number of bytes available for reading */
 71+ unsigned char *end; /* points to byte after the last usable byte; init_buffer() sets it to buffer + size */
 72+} buf_info_t;
 73+
 74+/*
 75+ used for each iteration of narrowing down the location in a bzip2 file of
 76+ a desired pageid, by finding the first compressed block after a guessed
 77+ position and checking the first pageid (if any) contained in it.
 78+*/
 79+typedef struct {
 80+ int left_end; /* left end of interval to search (bytes from start of file) */
 81+ int right_end; /* right end of interval to search */
 82+ int value_wanted; /* pageid desired */
 83+ int last_value; /* pageid we found in last iteration */
 84+ int last_position; /* position in file for last iteration */
 85+} iter_info_t;
 86+
 87+int bit_mask(int numbits, int end);
 88+
 89+void shift_bytes_left(unsigned char *buffer, int buflen, int numbits);
 90+
 91+void shift_bytes_right(unsigned char *buffer, int buflen, int numbits);
 92+
 93+unsigned char ** init_marker();
 94+
 95+int bytes_compare(unsigned char *buff1, unsigned char *buff2, int numbytes, int bitsrightshifted);
 96+
 97+int check_buffer_for_bz2_block_marker(bz_info_t *bfile);
 98+
 99+#define FORWARD 1
 100+#define BACKWARD 2
 101+
 102+int find_next_bz2_block_marker(int fin, bz_info_t *bfile, int direction);
 103+
 104+int init_decompress(bz_info_t *bfile);
 105+
 106+int decompress_header(int fin, bz_info_t *bfile);
 107+
 108+int setup_first_buffer_to_decompress(int fin, bz_info_t *bfile);
 109+
 110+int fill_buffer_to_decompress(int fin, bz_info_t *bfile, int ret);
 111+
 112+buf_info_t *init_buffer(int size);
 113+
 114+int buffer_is_empty(buf_info_t *b);
 115+
 116+int buffer_is_full(buf_info_t *b);
 117+
 118+int get_file_size(int fin);
 119+
 120+int init_bz2_file(bz_info_t *bfile, int fin, int direction);
 121+
 122+int get_and_decompress_data(bz_info_t *bfile, int fin, unsigned char *bufferout, int bufout_size, int direction);
 123+
 124+int get_buffer_of_uncompressed_data(buf_info_t *b, int fin, bz_info_t *bfile, int direction);
 125+
 126+void dump_buf_info(buf_info_t *b);
 127+
 128+int move_bytes_to_buffer_start(buf_info_t *b, unsigned char *fromwhere, int maxbytes);
 129+
 130+unsigned char ** init_footer();
 131+
 132+int read_footer(unsigned char *buffer, int fin);
 133+
 134+int check_file_for_footer(int fin, bz_info_t *bfile);
 135+
 136+void clear_buffer(unsigned char *buf, int length);
 137+
 138+int find_first_bz2_block_from_offset(bz_info_t *bfile, int fin, int position, int direction);
 139+
 140+#endif
Property changes on: branches/ariel/xmldumps-backup/mwbzutils/mwbzutils.h
___________________________________________________________________
Added: svn:eol-style
1141 + native
Index: branches/ariel/xmldumps-backup/mwbzutils/mwbzlib.c
@@ -0,0 +1,687 @@
 2+#include <unistd.h>
 3+#include <stdio.h>
 4+#include <string.h>
 5+#include <sys/types.h>
 6+#include <sys/stat.h>
 7+#include <fcntl.h>
 8+#include <stdlib.h>
 9+#include <errno.h>
 10+#include <sys/types.h>
 11+#include <regex.h>
 12+#include "bzlib.h"
 13+#include "mwbzutils.h"
 14+
 15+
 16+
 17+/* return n ones either at left or right end */
 18+int bit_mask(int numbits, int end) {
 19+ if (end == MASKRIGHT) {
 20+ return((1<<numbits)-1);
 21+ }
 22+ else {
 23+ return(((1<<numbits)-1) << (8-numbits));
 24+ }
 25+}
 26+
/*
  Shift the buflen bytes in buffer left by numbits bits, in place, treating
  them as one long bit string: the bits pushed out of the top of each byte
  are dropped and the top numbits bits of the following byte are pulled
  into the bottom. The last byte has no successor, so its low bits become 0.

  Does nothing when numbits is 0.
*/
void shift_bytes_left(unsigned char *buffer, int buflen, int numbits) {
  int last = buflen - 1;
  int pos;

  if (numbits == 0) {
    return;
  }

  for (pos = 0; pos < buflen; pos++) {
    /* keep only the 8 bits that remain in this byte after the shift */
    unsigned int shifted = ((unsigned int) buffer[pos] << numbits) & 0xffu;

    if (pos < last) {
      /* top numbits bits of the next (still unshifted) byte */
      shifted |= (unsigned int) buffer[pos + 1] >> (8 - numbits);
    }
    buffer[pos] = (unsigned char) shifted;
  }
}
 44+
 45+
/*
  Shift the buflen bytes in buffer right by numbits bits, in place, treating
  them as one long bit string: the low numbits bits of each byte move into
  the top of the following byte. The first byte has no predecessor, so its
  top bits become 0; the bits shifted out of the last byte are dropped.

  The buffer is walked from the last byte to the first so that each byte's
  predecessor is still unshifted when its low bits are taken.
*/
void shift_bytes_right(unsigned char *buffer, int buflen, int numbits) {
  int pos = buflen;

  while (--pos >= 0) {
    unsigned int shifted = (unsigned int) buffer[pos] >> numbits;

    if (pos > 0) {
      /* low numbits bits of the previous byte, moved to the top of this one */
      shifted |= ((unsigned int) buffer[pos - 1] << (8 - numbits)) & 0xffu;
    }
    buffer[pos] = (unsigned char) shifted;
  }
}
 59+
/*
  Allocate and fill the table of bzip2 start-of-block markers used when
  scanning compressed data.

  Entry 0 holds the six marker bytes 0x31 0x41 0x59 0x26 0x53 0x59 followed
  by one zero byte. Entry i (1..7) is a copy of entry i-1 moved right by one
  more bit with shift_bytes_right(), i.e. the marker right-shifted by i bits
  and spread over seven bytes.

  returns:
    pointer to an array of 8 seven-byte buffers; nothing here frees it,
    so the caller owns it
    NULL if memory could not be allocated (a message is written to stderr)
*/
unsigned char ** init_marker() {
  static const unsigned char block_magic[7] = { 0x31, 0x41, 0x59, 0x26, 0x53, 0x59, 0x00 };
  unsigned char **marker;
  int i;

  marker = malloc(8*sizeof(unsigned char *));
  if (marker == NULL) {
    fprintf(stderr,"failed to allocate memory for bz2 block marker\n");
    return(NULL);
  }

  /* set up block marker plus its various right-shifted incarnations */
  for (i = 0; i < 8; i++) {
    marker[i] = malloc(sizeof(unsigned char)*7);
    if (marker[i] == NULL) {
      fprintf(stderr,"failed to allocate memory for bz2 block marker\n");
      /* release whatever was allocated before the failure */
      while (i > 0) {
        free(marker[--i]);
      }
      free(marker);
      return(NULL);
    }
  }

  memcpy(marker[0], block_magic, 7);
  for (i = 1; i < 8; i++) {
    memcpy(marker[i], marker[i-1], 7);
    shift_bytes_right(marker[i], 7, 1);
  }
  return(marker);
}
 81+
/*
  Compare a pattern (buff1) with data (buff2), both numbytes long, where
  both are taken to be right-shifted by bitsrightshifted bits.

  bitsrightshifted == 0: all numbytes bytes must be equal.

  bitsrightshifted != 0: only the bits that belong to the shifted pattern
  are compared:
    - first byte: the low (8 - bitsrightshifted) bits
    - bytes 1 .. numbytes-2: all bits
    - last byte: the high bitsrightshifted bits

  returns:
    0 if buff2 matches buff1
    1 otherwise
*/
int bytes_compare(unsigned char *buff1, unsigned char *buff2, int numbytes, int bitsrightshifted) {
  int i;
  int first_mask;
  int last_mask;

  if (bitsrightshifted == 0) {
    for (i = 0; i < numbytes; i++) {
      if (buff1[i] != buff2[i]) {
        return(1);
      }
    }
    return(0);
  }

  /* same values as bit_mask(8-bitsrightshifted,MASKRIGHT) and
     bit_mask(bitsrightshifted,MASKLEFT) */
  first_mask = 0xff >> bitsrightshifted;
  last_mask = (0xff << (8 - bitsrightshifted)) & 0xff;

  /* leftmost byte: only its low bits carry pattern data */
  if ((buff1[0] & first_mask) != (buff2[0] & first_mask)) {
    return(1);
  }
  /* every byte strictly between the first and the last is fully
     determined by the pattern, including byte numbytes-2 */
  for (i = 1; i < numbytes-1; i++) {
    if (buff1[i] != buff2[i]) {
      return(1);
    }
  }
  /* rightmost byte: only its high bits carry pattern data */
  if ((buff1[numbytes-1] & last_mask) != (buff2[numbytes-1] & last_mask)) {
    return(1);
  }
  return(0);
}
 113+
 114+/* return -1 if no match
 115+ return number of bits rightshifted otherwise */
 116+int check_buffer_for_bz2_block_marker(bz_info_t *bfile) {
 117+ int result, i;
 118+
 119+ result = bytes_compare(bfile->marker[0],bfile->marker_buffer+1,6,0);
 120+ if (!result) {
 121+ return(0);
 122+ }
 123+ for (i=1; i<8; i++) {
 124+ result = bytes_compare(bfile->marker[i],bfile->marker_buffer,7,i);
 125+ if (!result) {
 126+ return(i);
 127+ }
 128+ }
 129+ return(-1);
 130+}
 131+
 132+/* return: 1 if found, 0 if not, -1 on error */
 133+int find_next_bz2_block_marker(int fin, bz_info_t *bfile, int direction) {
 134+ int result;
 135+
 136+ bfile->bits_shifted = -1;
 137+ result = read(fin, bfile->marker_buffer, 7);
 138+ if (result == -1) {
 139+ fprintf(stderr,"read of file failed\n");
 140+ return(-1);
 141+ }
 142+ /* must be after 4 byte file header, and we add a leftmost byte to the buffer
 143+ of data read in case some bits have been shifted into it */
 144+ while (bfile->position <= bfile->file_size - 6 && bfile->position >= 0 && bfile->bits_shifted < 0) {
 145+ bfile->bits_shifted = check_buffer_for_bz2_block_marker(bfile);
 146+ if (bfile->bits_shifted < 0) {
 147+ if (direction == FORWARD) {
 148+ bfile->position++;
 149+ }
 150+ else {
 151+ bfile->position--;
 152+ }
 153+ result = lseek(fin, (bfile->position), SEEK_SET);
 154+ if (result == -1) {
 155+ fprintf(stderr,"lseek of file to %ld failed (2)\n",(long int) bfile->position);
 156+ return(-1);
 157+ }
 158+ result = read(fin, bfile->marker_buffer, 7);
 159+ if (result < 7) {
 160+ /* fprintf(stderr,"read of file failed\n"); */
 161+ return(-1);
 162+ }
 163+ }
 164+ else {
 165+ bfile->block_start = bfile->position;
 166+ return(1);
 167+ }
 168+ }
 169+ return(0);
 170+}
 171+
 172+/*
 173+ initializes the bz2 strm structure,
 174+ calls the BZ2 decompression library initializer
 175+
 176+ returns:
 177+ BZ_OK on success
 178+ various BZ_ errors on failure (see bzlib.h)
 179+*/
 180+int init_decompress(bz_info_t *bfile) {
 181+ int bz_verbosity = 0;
 182+ int bz_small = 0;
 183+ int ret;
 184+
 185+ bfile->strm.bzalloc = NULL;
 186+ bfile->strm.bzfree = NULL;
 187+ bfile->strm.opaque = NULL;
 188+
 189+ ret = BZ2_bzDecompressInit ( &(bfile->strm), bz_verbosity, bz_small );
 190+ if (ret != BZ_OK) {
 191+ fprintf(stderr,"uncompress failed, err %d\n", ret);
 192+ return(ret);
 193+ }
 194+ return(ret);
 195+}
 196+
/*
  FIXME do this right. whatever.

  Report the size of the open file fin by seeking to its end with
  lseek(fin, 0, SEEK_END); the file offset is left at the end of the file.
  The lseek() result is stored in an int before it is returned.

  returns:
    the offset reported by lseek() on success
    -1 if lseek() fails (a message is written to stderr)
*/
int get_file_size(int fin) {
  int end_offset = lseek(fin, 0, SEEK_END);

  if (end_offset != -1) {
    return(end_offset);
  }
  fprintf(stderr,"lseek of file to 0 failed (6)\n");
  return(-1);
}
 208+
 209+/*
 210+ reads the first 4 bytes from a bz2 file (should be
 211+ "BZh" followed by the block size indicator, typically "9")
 212+ and passes them into the BZ2 decompression library.
 213+ This must be done before decompression of any block of the
 214+ file is attempted.
 215+
 216+ returns:
 217+ BZ_OK if successful,
 218+ various BZ_ errors or -1 on failure (see bzlib.h)
 219+*/
 220+int decompress_header(int fin, bz_info_t *bfile) {
 221+ int ret, res;
 222+
 223+ res = lseek(fin,0,SEEK_SET);
 224+ if (res == -1) {
 225+ fprintf(stderr,"lseek of file to 0 failed (3)\n");
 226+ return(-1);
 227+ }
 228+ bfile->bytes_read = read(fin, bfile->header_buffer, 4);
 229+ if (bfile->bytes_read < 4) {
 230+ fprintf(stderr,"failed to read 4 bytes of header\n");
 231+ return(-1);
 232+ }
 233+ bfile->strm.next_in = (char *)bfile->header_buffer;
 234+ bfile->strm.avail_in = 4;
 235+
 236+ ret = BZ2_bzDecompress_mine ( &(bfile->strm) );
 237+ if (BZ_OK != ret && BZ_STREAM_END != ret) {
 238+ fprintf(stderr,"Corrupt bzip2 header\n");
 239+ return(-1);
 240+ }
 241+ return(ret);
 242+}
 243+
 244+/*
 245+ seek to appropriate offset as specified in bfile,
 246+ read compressed data into buffer indicated by bfile,
 247+ update the bfile structure accordingly,
 248+ save the overflow byte (bit-shifted data = suck)
 249+ this is for the *first* buffer of data in a stream,
 250+ for subsequent buffers use fill_buffer_to_decompress()
 251+
 252+ this will set bfile->eof on eof. no other indicator
 253+ will be provided.
 254+
 255+ returns:
 256+ 0 on success
 257+ -1 on error
 258+*/
 259+int setup_first_buffer_to_decompress(int fin, bz_info_t *bfile) {
 260+ int res;
 261+
 262+ if (bfile->bits_shifted == 0) {
 263+ res = lseek(fin,bfile->position+1,SEEK_SET);
 264+ if (res == -1) {
 265+ fprintf(stderr,"lseek of file to %ld failed (4)\n",(long int) bfile->position+1);
 266+ return(-1);
 267+ }
 268+ }
 269+ else {
 270+ res = lseek(fin,bfile->position,SEEK_SET);
 271+ if (res == -1) {
 272+ fprintf(stderr,"lseek of file to %ld failed (5)\n",(long int) bfile->position);
 273+ return(-1);
 274+ }
 275+ }
 276+ bfile->bytes_read = read(fin, bfile->bufin, bfile->bufin_size);
 277+ if (bfile->bytes_read > 0) {
 278+ bfile->overflow = bfile->bufin[bfile->bytes_read-1];
 279+ shift_bytes_left(bfile->bufin, bfile->bytes_read, bfile->bits_shifted);
 280+
 281+ bfile->strm.next_in = (char *)(bfile->bufin);
 282+ bfile->strm.avail_in = bfile->bytes_read-1;
 283+ }
 284+ if (bfile->bytes_read <=0) {
 285+ bfile->eof++;
 286+ }
 287+ return(0);
 288+}
 289+
 290+/*
 291+ set up the marker, seek to right place, get first
 292+ buffer of compressed data for processing
 293+ bfile->position must be set to desired offset first by caller.
 294+ returns:
 295+ -1 if no marker or other error, position of next read if ok
 296+*/
 297+int init_bz2_file(bz_info_t *bfile, int fin, int direction) {
 298+ int res;
 299+
 300+ bfile->bufin_size = BUFINSIZE;
 301+ bfile->marker = init_marker();
 302+ bfile->bytes_read = 0;
 303+ bfile->bytes_written = 0;
 304+ bfile->eof = 0;
 305+
 306+ bfile->initialized++;
 307+
 308+ bfile->file_size = get_file_size(fin);
 309+ if (bfile->position > bfile->file_size) {
 310+ fprintf(stderr,"asked for position past end of file\n");
 311+ return(-1);
 312+ }
 313+ res = lseek(fin, bfile->position, SEEK_SET);
 314+ if (res == -1) {
 315+ fprintf(stderr,"lseek of file to %ld failed (7)\n",(long int) bfile->position);
 316+ return(-1);
 317+ }
 318+
 319+ find_next_bz2_block_marker(fin, bfile, direction);
 320+ if (bfile->bits_shifted >= 0) {
 321+ /* fprintf(stderr,"marker bits shifted by is %d\n",bfile->bits_shifted); */
 322+ init_decompress(bfile);
 323+ decompress_header(fin, bfile);
 324+ setup_first_buffer_to_decompress(fin, bfile);
 325+ return(0);
 326+ }
 327+ return(-1);
 328+}
 329+
 330+
 331+/*
 332+ read compressed data into buffer indicated by bfile,
 333+ from current position of file,
 334+ stuffing the overflow byte in first.
 335+ update the bfile structure accordingly
 336+ save the new overflow byte (bit-shifted data = suck)
 337+ this function is for decompression of buffers *after
 338+ the first one*. for the first one use
 339+ setup_first_buffer_to_decompress()
 340+
 341+ this will set bfile->eof on eof. no other indicator
 342+ will be provided.
 343+
 344+ returns:
 345+ 0 on success
 346+ hmm, it really does not do anything about errors :-D
 347+*/
 348+int fill_buffer_to_decompress(int fin, bz_info_t *bfile, int ret) {
 349+ if (bfile->strm.avail_in == 0) {
 350+ bfile->strm.next_in = (char *)(bfile->bufin);
 351+ bfile->bufin[0] = bfile->overflow;
 352+ bfile->bytes_read = read(fin, bfile->bufin+1, bfile->bufin_size-1);
 353+ if (bfile->bytes_read > 0) {
 354+ bfile->position+=bfile->bytes_read;
 355+ bfile->overflow = bfile->bufin[bfile->bytes_read];
 356+ shift_bytes_left(bfile->bufin,bfile->bytes_read+1,bfile->bits_shifted);
 357+ bfile->strm.avail_in = bfile->bytes_read;
 358+ }
 359+ else { /* bfile->bytes_read <= 0 */
 360+ bfile->strm.avail_in = 1; /* the overflow byte */
 361+ bfile->eof++;
 362+ }
 363+ }
 364+ return(0);
 365+}
 366+
 367+/* size of buffer is bytes usable. there will be a null byte at the end
 368+
 369+ what we do with the buffer:
 370+ - read from front of buffer to end,
 371+ - fill from point where prev read did not fill buffer, or from where
 372+ move of data at end of buffer to beginning left room,
 373+ - mark a string of bytes (starting from what's available to read) as "read"
 374+
 375+*/
 376+buf_info_t *init_buffer(int size) {
 377+ buf_info_t *b;
 378+
 379+ b = (buf_info_t *)malloc(sizeof(buf_info_t));
 380+ b->buffer = malloc(sizeof(unsigned char)*(size+1));
 381+ b->buffer[size]='\0';
 382+ b->end = b->buffer + size;
 383+ b->next_to_read = b->end; /* nothing available */
 384+ b->bytes_avail = 0; /* bytes to read, nothing available */
 385+ b->next_to_fill = b->buffer; /* empty */
 386+ b->next_to_fill[0] = '\0';
 387+ return(b);
 388+}
 389+
 390+/* check if buffer (used for decompressed data output) is empty,
 391+ returns 1 if so and 0 if not */
 392+int buffer_is_empty(buf_info_t *b) {
 393+ if (b->bytes_avail == 0) {
 394+ return(1);
 395+ }
 396+ else {
 397+ return(0);
 398+ }
 399+}
 400+
 401+/* check if buffer (used for decompressed data output) is full,
 402+
 403+ returns 1 if so and 0 if not
 404+ I'm not liking this function so well, fixme */
 405+int buffer_is_full(buf_info_t *b) {
 406+ if (b->next_to_fill == b->end) {
 407+ return(1);
 408+ }
 409+ else {
 410+ return(0);
 411+ }
 412+}
 413+
 414+
 415+/* get the next buffer of uncompressed stuff */
 416+int get_and_decompress_data(bz_info_t *bfile, int fin, unsigned char *bufferout, int bufout_size, int direction) {
 417+ int ret;
 418+
 419+ bfile->bufout = bufferout;
 420+ bfile->bufout_size = bufout_size;
 421+ bfile->bytes_written = 0;
 422+
 423+ if (! bfile->initialized) {
 424+ if (init_bz2_file(bfile, fin, direction) == -1) {
 425+ fprintf(stderr,"failed to initialize bz2file\n");
 426+ return(-1);
 427+ };
 428+ bfile->strm.next_out = (char *)bfile->bufout;
 429+ bfile->strm.avail_out = bfile->bufout_size;
 430+ }
 431+
 432+ ret = BZ_OK;
 433+ while (BZ_OK == ret && bfile->bytes_written == 0) {
 434+ ret = BZ2_bzDecompress_mine ( &(bfile->strm) );
 435+ /* FIXME testing only, does stuff actually get written or not? */
 436+ /* if (BZ_OK == ret || BZ_STREAM_END == ret || BZ_DATA_ERROR == ret) { */
 437+ if (BZ_OK == ret || BZ_STREAM_END == ret) {
 438+ bfile->bytes_written = (unsigned char *)(bfile->strm.next_out) - bfile->bufout;
 439+ }
 440+ else {
 441+ fprintf(stderr,"error from BZ decompress %d (1)\n",ret);
 442+ return(-1);
 443+ }
 444+ fill_buffer_to_decompress(fin, bfile, ret);
 445+ /*
 446+ if (bfile->eof && (BZ_OK == ret || BZ_STREAM_END == ret) ) {
 447+ fprintf(stderr,"eof reached\n");
 448+ }
 449+ */
 450+ }
 451+ if (ret == BZ_STREAM_END) {
 452+ bfile->eof++;
 453+ /* should we actually change the file position?
 454+ bfile->position = bfile->filesize;
 455+ lseek(fin,0,SEEK_END);
 456+ */
 457+ }
 458+ return(0);
 459+}
 460+
 461+/*
 462+ fill output buffer in b with uncompressed data from bfile
 463+ if this is the first call to the function for this file,
 464+ the file header will be read, and the first buffer of
 465+ uncompressed data will be prepared. bfile->position
 466+ should be set to the offset (from the beginning of file) from
 467+ which to find the first bz2 block.
 468+
 469+ returns:
 470+ on success, number of bytes read (may be 0)
 471+ -1 on error
 472+*/
 473+int get_buffer_of_uncompressed_data(buf_info_t *b, int fin, bz_info_t *bfile, int direction) {
 474+ int res;
 475+
 476+ if (buffer_is_full(b)) {
 477+ return(0);
 478+ }
 479+
 480+ if (buffer_is_empty(b)) {
 481+ b->next_to_fill = b->buffer;
 482+ }
 483+ res = get_and_decompress_data(bfile, fin, b->next_to_fill, b->end - b->next_to_fill, direction);
 484+ if (res <0 ) {
 485+ return(res);
 486+ }
 487+ if (bfile->bytes_written < 0) {
 488+ fprintf(stderr,"read of file failed\n");
 489+ return(-1);
 490+ }
 491+ else {
 492+ /* really?? FIXME check this */
 493+ if (buffer_is_empty(b)) {
 494+ b->next_to_read = b->next_to_fill; /* where we just read */
 495+ }
 496+ b->bytes_avail += bfile->bytes_written;
 497+ b->next_to_fill += bfile->bytes_written;
 498+ b->next_to_fill[0] = '\0';
 499+ return(0);
 500+ }
 501+}
 502+
 503+void dumpbuf_info_t(buf_info_t *b) {
 504+ fprintf(stdout, "\n");
 505+ fprintf(stdout, "b->buffer: %ld\n", (long int) b->buffer);
 506+ fprintf(stdout, "b->end: %ld\n", (long int) b->end);
 507+ fprintf(stdout, "b->next_to_read: %ld\n", (long int) b->next_to_read);
 508+ fprintf(stdout, "b->next_to_fill: %ld\n", (long int) b->next_to_fill);
 509+ fprintf(stdout, "b->bytes_avail: %ld\n", (long int) b->bytes_avail);
 510+}
 511+
 512+/*
 513+ copy text from end of buffer to the beginning, that we want to keep
 514+ around for further processing (i.e. further regex matches)
 515+ returns number of bytes copied
 516+*/
 517+int move_bytes_to_buffer_start(buf_info_t *b, unsigned char *fromwhere, int maxbytes) {
 518+ int i, tocopy;
 519+
 520+ if (fromwhere >= b->end) {
 521+ return(0);
 522+ }
 523+ else {
 524+ tocopy = b->end - fromwhere;
 525+ if (maxbytes && (tocopy > maxbytes)) {
 526+ tocopy = maxbytes;
 527+ }
 528+ for (i = 0; i < tocopy; i++) {
 529+ b->buffer[i] = fromwhere[i];
 530+ }
 531+ b->next_to_fill = b->buffer + tocopy;
 532+ b->next_to_fill[0] = '\0';
 533+ b->next_to_read = b->buffer;
 534+ b->bytes_avail = tocopy;
 535+ return(tocopy);
 536+ }
 537+}
 538+
/*
  build the table of footer patterns used to recognize the end of a
  bz2 file.

  entry 0 holds the six marker bytes 17 72 45 38 50 90 followed by a
  zero byte; entry i (1..7) is a copy of entry i-1 passed through
  shift_bytes_right(...,7,1), so presumably entry i is the marker
  right-shifted by i bits (the zero byte gives the shifted-out bits
  somewhere to go).  the variants are needed because the end of a
  bzip2 stream is not byte aligned.

  returns:
    array of 8 pointers to 7-byte patterns on success (caller owns
    the array and every entry)
    NULL if memory could not be allocated
*/
unsigned char ** init_footer() {
  enum { FOOTER_VARIANTS = 8, FOOTER_LEN = 7 };
  static const unsigned char unshifted[FOOTER_LEN] = {
    0x17, 0x72, 0x45, 0x38, 0x50, 0x90, 0x00
  };
  unsigned char **footer;
  int i;

  footer = malloc(FOOTER_VARIANTS*sizeof(unsigned char *));
  if (footer == NULL) {
    return(NULL);
  }

  /* set up footer plus its various right-shifted incarnations */
  for (i = 0; i < FOOTER_VARIANTS; i++) {
    footer[i] = malloc(sizeof(unsigned char)*FOOTER_LEN);
    if (footer[i] == NULL) {
      /* undo the allocations made so far */
      while (i > 0) {
        free(footer[--i]);
      }
      free(footer);
      return(NULL);
    }
  }

  memcpy(footer[0], unshifted, FOOTER_LEN);
  for (i = 1; i < FOOTER_VARIANTS; i++) {
    memcpy((char *)(footer[i]), (char *)(footer[i-1]), FOOTER_LEN);
    shift_bytes_right(footer[i],FOOTER_LEN,1);
  }
  return(footer);
}
 561+
/*
  read the last 11 bytes of the file into buffer.

  buffer must have room for at least 11 bytes.  the file position is
  left at end of file on success.

  returns:
    0 on success (all 11 bytes were read)
    -1 if the seek failed (which includes files shorter than 11
       bytes) or fewer than 11 bytes could be read; a message is
       written to stderr
*/
int read_footer(unsigned char *buffer, int fin) {
  enum { FOOTER_READ_LEN = 11 };
  off_t where;
  ssize_t got;

  /* keep the result as off_t: squeezing it into an int can turn a
     valid offset in a large file into a bogus -1 */
  where = lseek(fin, -FOOTER_READ_LEN, SEEK_END);
  if (where == (off_t)-1) {
    fprintf(stderr,"lseek of file failed\n");
    return(-1);
  }

  got = read(fin, buffer, FOOTER_READ_LEN);
  if (got != FOOTER_READ_LEN) {
    /* error or short read: buffer contents cannot be trusted */
    fprintf(stderr,"read of file failed\n");
    return(-1);
  }
  return(0);
}
 577+
 578+/*
 579+ return -1 if no match return number of bits rightshifted otherwise
 580+*/
 581+int check_file_for_footer(int fin, bz_info_t *bfile) {
 582+ unsigned char buffer[11];
 583+ int result, i;
 584+
 585+ read_footer(buffer,fin);
 586+
 587+ result = bytes_compare(bfile->footer[0],buffer+1,6,0);
 588+ if (!result) {
 589+ return(0);
 590+ }
 591+
 592+ for (i=1; i<8; i++) {
 593+ result = bytes_compare(bfile->footer[i],buffer,7,i);
 594+ if (!result) {
 595+ return(i);
 596+ }
 597+ }
 598+ return(-1);
 599+}
 600+
/*
  set the first length bytes of buf to zero.
  a length of zero or less leaves buf untouched.
*/
void clear_buffer(unsigned char *buf, int length) {
  if (length > 0) {
    memset(buf, 0, (size_t)length);
  }
}
 609+
 610+/*
 611+ look for the first bz2 block in the file before/after specified offset
 612+ it tests that the block is valid by doing partial decompression.
 613+ this function will update the bfile structure:
 614+ bfile->position will contain the current position of the file (? will it?)
 615+ bfile->bits_shifted will contain the number of bits that the block is rightshifted
 616+ bfile->block_start will contain the offset from start of file to the block
 617+ (this value will always be positive, the value given in the argument "direction"
 618+ determines whether the block starts before or after the initial file position).
 619+
 620+ returns:
 621+ position of next byte in file to be read, on success
 622+ 0 if no marker
 623+ -1 on error
 624+*/
 625+int find_first_bz2_block_from_offset(bz_info_t *bfile, int fin, int position, int direction) {
 626+ int res;
 627+
 628+ bfile->bufin_size = BUFINSIZE;
 629+ bfile->marker = init_marker();
 630+ bfile->position = position;
 631+ bfile->block_start = -1;
 632+ bfile->bytes_read = 0;
 633+ bfile->bytes_written = 0;
 634+ bfile->eof = 0;
 635+ bfile->bits_shifted = -1;
 636+
 637+ bfile->file_size = get_file_size(fin);
 638+
 639+ while (bfile->bits_shifted < 0) {
 640+ if (bfile->position > bfile->file_size) {
 641+ return(0);
 642+ }
 643+ res = lseek(fin, bfile->position, SEEK_SET);
 644+ if (res < 0) {
 645+ fprintf(stderr,"lseek of file to %ld failed (7)\n",(long int) bfile->position);
 646+ return(-1);
 647+ }
 648+ res = find_next_bz2_block_marker(fin, bfile,direction);
 649+ if (res == 1) {
 650+ init_decompress(bfile);
 651+ decompress_header(fin, bfile);
 652+ res = setup_first_buffer_to_decompress(fin, bfile);
 653+ if (res == -1) {
 654+ fprintf(stderr,"couldn't get first buffer of data to uncompress\n");
 655+ return(-1);
 656+ }
 657+ bfile->strm.next_out = (char *)bfile->bufout;
 658+ bfile->strm.avail_out = bfile->bufout_size;
 659+ res = BZ2_bzDecompress_mine ( &(bfile->strm) );
 660+ /* this means we (probably) have a genuine marker */
 661+ if (BZ_OK == res || BZ_STREAM_END == res) {
 662+ res = BZ2_bzDecompressEnd ( &(bfile->strm) );
 663+ bfile->bytes_read = 0;
 664+ bfile->bytes_written = 0;
 665+ bfile->eof = 0;
 666+ /* leave the file at the right position */
 667+ res = lseek(fin, bfile->block_start, SEEK_SET);
 668+ if (res < 0) {
 669+ fprintf(stderr,"lseek of file to %ld failed (7)\n",(long int) bfile->position);
 670+ return(-1);
 671+ }
 672+ bfile->position = res;
 673+ return(bfile->position);
 674+ }
 675+ /* right bytes, but there by chance, skip and try again */
 676+ else {
 677+ bfile->position+=6;
 678+ bfile->bits_shifted = -1;
 679+ bfile->block_start = -1;
 680+ }
 681+ }
 682+ else {
 683+ return(0);
 684+ }
 685+ }
 686+ return(-1);
 687+}
 688+
Property changes on: branches/ariel/xmldumps-backup/mwbzutils/mwbzlib.c
___________________________________________________________________
Added: svn:eol-style
1689 + native
Index: branches/ariel/xmldumps-backup/mwbzutils/CHANGES
Index: branches/ariel/xmldumps-backup/mwbzutils/COPYING
@@ -0,0 +1,342 @@
 2+== GNU GENERAL PUBLIC LICENSE ==
 3+
 4+Version 2, June 1991
 5+
 6+Copyright (C) 1989, 1991 Free Software Foundation, Inc.
 7+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
 8+Everyone is permitted to copy and distribute verbatim copies
 9+of this license document, but changing it is not allowed.
 10+
 11+=== Preamble ===
 12+
 13+The licenses for most software are designed to take away your
 14+freedom to share and change it. By contrast, the GNU General Public
 15+License is intended to guarantee your freedom to share and change free
 16+software--to make sure the software is free for all its users. This
 17+General Public License applies to most of the Free Software
 18+Foundation's software and to any other program whose authors commit to
 19+using it. (Some other Free Software Foundation software is covered by
 20+the GNU Library General Public License instead.) You can apply it to
 21+your programs, too.
 22+
 23+When we speak of free software, we are referring to freedom, not
 24+price. Our General Public Licenses are designed to make sure that you
 25+have the freedom to distribute copies of free software (and charge for
 26+this service if you wish), that you receive source code or can get it
 27+if you want it, that you can change the software or use pieces of it
 28+in new free programs; and that you know you can do these things.
 29+
 30+To protect your rights, we need to make restrictions that forbid
 31+anyone to deny you these rights or to ask you to surrender the rights.
 32+These restrictions translate to certain responsibilities for you if you
 33+distribute copies of the software, or if you modify it.
 34+
 35+For example, if you distribute copies of such a program, whether
 36+gratis or for a fee, you must give the recipients all the rights that
 37+you have. You must make sure that they, too, receive or can get the
 38+source code. And you must show them these terms so they know their
 39+rights.
 40+
 41+We protect your rights with two steps: (1) copyright the software, and
 42+(2) offer you this license which gives you legal permission to copy,
 43+distribute and/or modify the software.
 44+
 45+Also, for each author's protection and ours, we want to make certain
 46+that everyone understands that there is no warranty for this free
 47+software. If the software is modified by someone else and passed on, we
 48+want its recipients to know that what they have is not the original, so
 49+that any problems introduced by others will not reflect on the original
 50+authors' reputations.
 51+
 52+Finally, any free program is threatened constantly by software
 53+patents. We wish to avoid the danger that redistributors of a free
 54+program will individually obtain patent licenses, in effect making the
 55+program proprietary. To prevent this, we have made it clear that any
 56+patent must be licensed for everyone's free use or not licensed at all.
 57+
 58+The precise terms and conditions for copying, distribution and
 59+modification follow.
 60+
 61+== TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION ==
 62+
 63+'''0.''' This License applies to any program or other work which contains
 64+a notice placed by the copyright holder saying it may be distributed
 65+under the terms of this General Public License. The "Program", below,
 66+refers to any such program or work, and a "work based on the Program"
 67+means either the Program or any derivative work under copyright law:
 68+that is to say, a work containing the Program or a portion of it,
 69+either verbatim or with modifications and/or translated into another
 70+language. (Hereinafter, translation is included without limitation in
 71+the term "modification".) Each licensee is addressed as "you".
 72+
 73+Activities other than copying, distribution and modification are not
 74+covered by this License; they are outside its scope. The act of
 75+running the Program is not restricted, and the output from the Program
 76+is covered only if its contents constitute a work based on the
 77+Program (independent of having been made by running the Program).
 78+Whether that is true depends on what the Program does.
 79+
 80+'''1.''' You may copy and distribute verbatim copies of the Program's
 81+source code as you receive it, in any medium, provided that you
 82+conspicuously and appropriately publish on each copy an appropriate
 83+copyright notice and disclaimer of warranty; keep intact all the
 84+notices that refer to this License and to the absence of any warranty;
 85+and give any other recipients of the Program a copy of this License
 86+along with the Program.
 87+
 88+You may charge a fee for the physical act of transferring a copy, and
 89+you may at your option offer warranty protection in exchange for a fee.
 90+
 91+'''2.''' You may modify your copy or copies of the Program or any portion
 92+of it, thus forming a work based on the Program, and copy and
 93+distribute such modifications or work under the terms of Section 1
 94+above, provided that you also meet all of these conditions:
 95+
 96+ '''a)''' You must cause the modified files to carry prominent notices
 97+ stating that you changed the files and the date of any change.
 98+
 99+ '''b)''' You must cause any work that you distribute or publish, that in
 100+ whole or in part contains or is derived from the Program or any
 101+ part thereof, to be licensed as a whole at no charge to all third
 102+ parties under the terms of this License.
 103+
 104+ '''c)''' If the modified program normally reads commands interactively
 105+ when run, you must cause it, when started running for such
 106+ interactive use in the most ordinary way, to print or display an
 107+ announcement including an appropriate copyright notice and a
 108+ notice that there is no warranty (or else, saying that you provide
 109+ a warranty) and that users may redistribute the program under
 110+ these conditions, and telling the user how to view a copy of this
 111+ License. (Exception: if the Program itself is interactive but
 112+ does not normally print such an announcement, your work based on
 113+ the Program is not required to print an announcement.)
 114+
 115+These requirements apply to the modified work as a whole. If
 116+identifiable sections of that work are not derived from the Program,
 117+and can be reasonably considered independent and separate works in
 118+themselves, then this License, and its terms, do not apply to those
 119+sections when you distribute them as separate works. But when you
 120+distribute the same sections as part of a whole which is a work based
 121+on the Program, the distribution of the whole must be on the terms of
 122+this License, whose permissions for other licensees extend to the
 123+entire whole, and thus to each and every part regardless of who wrote it.
 124+
 125+Thus, it is not the intent of this section to claim rights or contest
 126+your rights to work written entirely by you; rather, the intent is to
 127+exercise the right to control the distribution of derivative or
 128+collective works based on the Program.
 129+
 130+In addition, mere aggregation of another work not based on the Program
 131+with the Program (or with a work based on the Program) on a volume of
 132+a storage or distribution medium does not bring the other work under
 133+the scope of this License.
 134+
 135+'''3.''' You may copy and distribute the Program (or a work based on it,
 136+under Section 2) in object code or executable form under the terms of
 137+Sections 1 and 2 above provided that you also do one of the following:
 138+
 139+ '''a)''' Accompany it with the complete corresponding machine-readable
 140+ source code, which must be distributed under the terms of Sections
 141+ 1 and 2 above on a medium customarily used for software interchange; or,
 142+
 143+ '''b)''' Accompany it with a written offer, valid for at least three
 144+ years, to give any third party, for a charge no more than your
 145+ cost of physically performing source distribution, a complete
 146+ machine-readable copy of the corresponding source code, to be
 147+ distributed under the terms of Sections 1 and 2 above on a medium
 148+ customarily used for software interchange; or,
 149+
 150+ '''c)''' Accompany it with the information you received as to the offer
 151+ to distribute corresponding source code. (This alternative is
 152+ allowed only for noncommercial distribution and only if you
 153+ received the program in object code or executable form with such
 154+ an offer, in accord with Subsection b above.)
 155+
 156+The source code for a work means the preferred form of the work for
 157+making modifications to it. For an executable work, complete source
 158+code means all the source code for all modules it contains, plus any
 159+associated interface definition files, plus the scripts used to
 160+control compilation and installation of the executable. However, as a
 161+special exception, the source code distributed need not include
 162+anything that is normally distributed (in either source or binary
 163+form) with the major components (compiler, kernel, and so on) of the
 164+operating system on which the executable runs, unless that component
 165+itself accompanies the executable.
 166+
 167+If distribution of executable or object code is made by offering
 168+access to copy from a designated place, then offering equivalent
 169+access to copy the source code from the same place counts as
 170+distribution of the source code, even though third parties are not
 171+compelled to copy the source along with the object code.
 172+
 173+'''4.''' You may not copy, modify, sublicense, or distribute the Program
 174+except as expressly provided under this License. Any attempt
 175+otherwise to copy, modify, sublicense or distribute the Program is
 176+void, and will automatically terminate your rights under this License.
 177+However, parties who have received copies, or rights, from you under
 178+this License will not have their licenses terminated so long as such
 179+parties remain in full compliance.
 180+
 181+'''5.''' You are not required to accept this License, since you have not
 182+signed it. However, nothing else grants you permission to modify or
 183+distribute the Program or its derivative works. These actions are
 184+prohibited by law if you do not accept this License. Therefore, by
 185+modifying or distributing the Program (or any work based on the
 186+Program), you indicate your acceptance of this License to do so, and
 187+all its terms and conditions for copying, distributing or modifying
 188+the Program or works based on it.
 189+
 190+'''6.''' Each time you redistribute the Program (or any work based on the
 191+Program), the recipient automatically receives a license from the
 192+original licensor to copy, distribute or modify the Program subject to
 193+these terms and conditions. You may not impose any further
 194+restrictions on the recipients' exercise of the rights granted herein.
 195+You are not responsible for enforcing compliance by third parties to
 196+this License.
 197+
 198+'''7.''' If, as a consequence of a court judgment or allegation of patent
 199+infringement or for any other reason (not limited to patent issues),
 200+conditions are imposed on you (whether by court order, agreement or
 201+otherwise) that contradict the conditions of this License, they do not
 202+excuse you from the conditions of this License. If you cannot
 203+distribute so as to satisfy simultaneously your obligations under this
 204+License and any other pertinent obligations, then as a consequence you
 205+may not distribute the Program at all. For example, if a patent
 206+license would not permit royalty-free redistribution of the Program by
 207+all those who receive copies directly or indirectly through you, then
 208+the only way you could satisfy both it and this License would be to
 209+refrain entirely from distribution of the Program.
 210+
 211+If any portion of this section is held invalid or unenforceable under
 212+any particular circumstance, the balance of the section is intended to
 213+apply and the section as a whole is intended to apply in other
 214+circumstances.
 215+
 216+It is not the purpose of this section to induce you to infringe any
 217+patents or other property right claims or to contest validity of any
 218+such claims; this section has the sole purpose of protecting the
 219+integrity of the free software distribution system, which is
 220+implemented by public license practices. Many people have made
 221+generous contributions to the wide range of software distributed
 222+through that system in reliance on consistent application of that
 223+system; it is up to the author/donor to decide if he or she is willing
 224+to distribute software through any other system and a licensee cannot
 225+impose that choice.
 226+
 227+This section is intended to make thoroughly clear what is believed to
 228+be a consequence of the rest of this License.
 229+
 230+'''8.''' If the distribution and/or use of the Program is restricted in
 231+certain countries either by patents or by copyrighted interfaces, the
 232+original copyright holder who places the Program under this License
 233+may add an explicit geographical distribution limitation excluding
 234+those countries, so that distribution is permitted only in or among
 235+countries not thus excluded. In such case, this License incorporates
 236+the limitation as if written in the body of this License.
 237+
 238+'''9.''' The Free Software Foundation may publish revised and/or new versions
 239+of the General Public License from time to time. Such new versions will
 240+be similar in spirit to the present version, but may differ in detail to
 241+address new problems or concerns.
 242+
 243+Each version is given a distinguishing version number. If the Program
 244+specifies a version number of this License which applies to it and "any
 245+later version", you have the option of following the terms and conditions
 246+either of that version or of any later version published by the Free
 247+Software Foundation. If the Program does not specify a version number of
 248+this License, you may choose any version ever published by the Free Software
 249+Foundation.
 250+
 251+'''10.''' If you wish to incorporate parts of the Program into other free
 252+programs whose distribution conditions are different, write to the author
 253+to ask for permission. For software which is copyrighted by the Free
 254+Software Foundation, write to the Free Software Foundation; we sometimes
 255+make exceptions for this. Our decision will be guided by the two goals
 256+of preserving the free status of all derivatives of our free software and
 257+of promoting the sharing and reuse of software generally.
 258+
 259+=== NO WARRANTY ===
 260+
 261+'''11.''' BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
 262+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
 263+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
 264+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
 265+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 266+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
 267+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
 268+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
 269+REPAIR OR CORRECTION.
 270+
 271+'''12.''' IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
 272+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
 273+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
 274+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
 275+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
 276+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
 277+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
 278+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
 279+POSSIBILITY OF SUCH DAMAGES.
 280+
 281+ '''END OF TERMS AND CONDITIONS'''
 282+
 283+== How to Apply These Terms to Your New Programs ==
 284+
 285+If you develop a new program, and you want it to be of the greatest
 286+possible use to the public, the best way to achieve this is to make it
 287+free software which everyone can redistribute and change under these terms.
 288+
 289+To do so, attach the following notices to the program. It is safest
 290+to attach them to the start of each source file to most effectively
 291+convey the exclusion of warranty; and each file should have at least
 292+the "copyright" line and a pointer to where the full notice is found.
 293+
 294+ <one line to give the program's name and a brief idea of what it does.>
 295+
 296+ Copyright (C) <year> <name of author>
 297+
 298+ This program is free software; you can redistribute it and/or modify
 299+ it under the terms of the GNU General Public License as published by
 300+ the Free Software Foundation; either version 2 of the License, or
 301+ (at your option) any later version.
 302+
 303+ This program is distributed in the hope that it will be useful,
 304+ but WITHOUT ANY WARRANTY; without even the implied warranty of
 305+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 306+ GNU General Public License for more details.
 307+
 308+ You should have received a copy of the GNU General Public License
 309+ along with this program; if not, write to the Free Software
 310+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
 311+
 312+
 313+Also add information on how to contact you by electronic and paper mail.
 314+
 315+If the program is interactive, make it output a short notice like this
 316+when it starts in an interactive mode:
 317+
 318+ Gnomovision version 69, Copyright (C) year name of author
 319+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
 320+ This is free software, and you are welcome to redistribute it
 321+ under certain conditions; type `show c' for details.
 322+
 323+The hypothetical commands `show w' and `show c' should show the appropriate
 324+parts of the General Public License. Of course, the commands you use may
 325+be called something other than `show w' and `show c'; they could even be
 326+mouse-clicks or menu items--whatever suits your program.
 327+
 328+You should also get your employer (if you work as a programmer) or your
 329+school, if any, to sign a "copyright disclaimer" for the program, if
 330+necessary. Here is a sample; alter the names:
 331+
 332+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program
 333+ `Gnomovision' (which makes passes at compilers) written by James Hacker.
 334+
 335+ <signature of Ty Coon>, 1 April 1989
 336+
 337+ Ty Coon, President of Vice
 338+
 339+This General Public License does not permit incorporating your program into
 340+proprietary programs. If your program is a subroutine library, you may
 341+consider it more useful to permit linking proprietary applications with the
 342+library. If this is what you want to do, use the GNU Library General
 343+Public License instead of this License.
Index: branches/ariel/xmldumps-backup/mwbzutils/LICENSE_BZ
@@ -0,0 +1,42 @@
 2+
 3+--------------------------------------------------------------------------
 4+
 5+This program, "bzip2", the associated library "libbzip2", and all
 6+documentation, are copyright (C) 1996-2010 Julian R Seward. All
 7+rights reserved.
 8+
 9+Redistribution and use in source and binary forms, with or without
 10+modification, are permitted provided that the following conditions
 11+are met:
 12+
 13+1. Redistributions of source code must retain the above copyright
 14+ notice, this list of conditions and the following disclaimer.
 15+
 16+2. The origin of this software must not be misrepresented; you must
 17+ not claim that you wrote the original software. If you use this
 18+ software in a product, an acknowledgment in the product
 19+ documentation would be appreciated but is not required.
 20+
 21+3. Altered source versions must be plainly marked as such, and must
 22+ not be misrepresented as being the original software.
 23+
 24+4. The name of the author may not be used to endorse or promote
 25+ products derived from this software without specific prior written
 26+ permission.
 27+
 28+THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 29+OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 30+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 31+ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 32+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 33+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
 34+GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 35+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 36+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 37+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 38+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 39+
 40+Julian Seward, jseward@bzip.org
 41+bzip2/libbzip2 version 1.0.6 of 6 September 2010
 42+
 43+--------------------------------------------------------------------------
Index: branches/ariel/xmldumps-backup/mwbzutils/Makefile
@@ -0,0 +1,95 @@
 2+# ------------------------------------------------------------------
 3+# This Makefile builds binaries which rely on two source files
 4+# from libbzip2 version 1.0.6. (See bzlibfuncs.c and
 5+# bzlib_private.h; the first is slightly modified while the
 6+# second is unchanged from the library version.)
 7+#
 8+# The copyright for those two files is as follows:
 9+#
 10+# bzip2/libbzip2 version 1.0.6 of 6 September 2010
 11+# Copyright (C) 1996-2010 Julian Seward <jseward@bzip.org>
 12+#
 13+# Those files are released under the terms of the license contained
 14+# in the file LICENSE_BZ.
 15+#
 16+# All other files are released under the GPL, copyright (C) Ariel T. Glenn
 17+# 2010-2010: see the file COPYING for details.
 18+# ------------------------------------------------------------------
 19+# Toolchain and flags; BIGFILES requests 64-bit file offsets (the dump files can be many GB)
 20+CC=gcc
 21+LDFLAGS=
 22+BIGFILES=-D_FILE_OFFSET_BITS=64
 23+CFLAGS=-Wall -Winline -O2 -g $(BIGFILES)
 24+PREFIX=/usr/local
 25+# Shell used by make to run the recipes below
 26+SHELL=/bin/sh
 27+# Object built from the modified libbzip2 source; it is linked into every tool below
 28+OBJSBZ= bzlibfuncs.o
 29+# Default goal (first rule in this Makefile): build all four utilities
 30+all: checkforbz2footer \
 31+ dumpbz2filefromoffset \
 32+ dumplastbz2block \
 33+ findpageidinbz2xml
 34+
 35+dumplastbz2block: $(OBJSBZ) mwbzlib.o dumplastbz2block.o
 36+ $(CC) $(CFLAGS) $(LDFLAGS) -o dumplastbz2block dumplastbz2block.o mwbzlib.o $(OBJSBZ) -lbz2
 37+
 38+findpageidinbz2xml: $(OBJSBZ) mwbzlib.o findpageidinbz2xml.o
 39+ $(CC) $(CFLAGS) $(LDFLAGS) -o findpageidinbz2xml findpageidinbz2xml.o mwbzlib.o $(OBJSBZ) -lbz2
 40+
 41+checkforbz2footer: $(OBJSBZ) mwbzlib.o checkforbz2footer.o
 42+ $(CC) $(CFLAGS) $(LDFLAGS) -o checkforbz2footer checkforbz2footer.o mwbzlib.o $(OBJSBZ) -lbz2
 43+
 44+dumpbz2filefromoffset: $(OBJSBZ) mwbzlib.o dumpbz2filefromoffset.o
 45+ $(CC) $(CFLAGS) $(LDFLAGS) -o dumpbz2filefromoffset dumpbz2filefromoffset.o mwbzlib.o $(OBJSBZ) -lbz2
 46+
 47+install: dumplastbz2block findpageidinbz2xml checkforbz2footer dumpbz2filefromoffset
 48+ if ( test ! -d $(PREFIX)/bin ) ; then mkdir -p $(PREFIX)/bin ; fi
 49+ if ( test ! -d $(PREFIX)/include ) ; then mkdir -p $(PREFIX)/include ; fi
 50+ cp -f bzip2 $(PREFIX)/bin/dumplastbz2block
 51+ cp -f bzip2 $(PREFIX)/bin/findpageidinbz2xml
 52+ cp -f bzip2 $(PREFIX)/bin/checkforbz2footer
 53+ cp -f bzip2 $(PREFIX)/bin/dumpbz2filefromoffset
 54+ chmod a+x $(PREFIX)/bin/dumplastbz2block
 55+ chmod a+x $(PREFIX)/bin/findpageidinbz2xml
 56+ chmod a+x $(PREFIX)/bin/checkforbz2footer
 57+ chmod a+x $(PREFIX)/bin/dumpbz2filefromoffset
 58+
 59+clean:
 60+ rm -f *.o *.a dumplastbz2block findpageidinbz2xml \
 61+ checkforbz2footer dumpbz2filefromoffset
 62+
 63+bzlibfuncs.o: bzlibfuncs.c
 64+ $(CC) $(CFLAGS) -c bzlibfuncs.c
 65+mwbzlib.o: mwbzlib.c
 66+ $(CC) $(CFLAGS) -c mwbzlib.c
 67+dumplastbz2block.o: dumplastbz2block.c
 68+ $(CC) $(CFLAGS) -c dumplastbz2block.c
 69+findpageidinbz2xml.o: findpageidinbz2xml.c
 70+ $(CC) $(CFLAGS) -c findpageidinbz2xml.c
 71+checkforbz2footer.o: checkforbz2footer.c
 72+ $(CC) $(CFLAGS) -c checkforbz2footer.c
 73+dumpbz2filefromoffset.o: dumpbz2filefromoffset.c
 74+ $(CC) $(CFLAGS) -c dumpbz2filefromoffset.c
 75+# distclean has no recipe of its own; it only triggers clean
 76+distclean: clean
 77+
 78+DISTNAME=mwbzutils-0.0.1
 79+dist: rm -f $(DISTNAME)
 80+ ln -s -f . $(DISTNAME)
 81+ tar cvf $(DISTNAME).tar \
 82+ $(DISTNAME)/dumplastbz2block.c \
 83+ $(DISTNAME)/findpageidinbz2xml.c \
 84+ $(DISTNAME)/checkforbz2footer.c \
 85+ $(DISTNAME)/dumpbz2filefromoffset.c \
 86+ $(DISTNAME)/mwbzlib.c \
 87+ $(DISTNAME)/mwbzutils.h \
 88+ $(DISTNAME)/bzlibfuncs.c \
 89+ $(DISTNAME)/bzlib_private.h \
 90+ $(DISTNAME)/Makefile \
 91+ $(DISTNAME)/LICENSE_BZ \
 92+ $(DISTNAME)/COPYING \
 93+ $(DISTNAME)/README \
 94+ $(DISTNAME)/CHANGES
 95+ gzip -v $(DISTNAME).tar
 96+
Property changes on: branches/ariel/xmldumps-backup/mwbzutils/Makefile
___________________________________________________________________
Added: svn:eol-style
197 + native
Index: branches/ariel/xmldumps-backup/mwbzutils/README
@@ -0,0 +1,58 @@
 2+What is this?
 3+
 4+It is a tiny suite of utilities that hapless WMF employees use to massage the
 5+XML dump files so that we can produce them on a more regular basis.
 6+
 7+More specifically, they allow us to do various things with bz2 files
 8+quickly instead of requiring a serial read/decompress of the file. Some
 9+of these files range from 2 to 30 GB in size, so serial access is too slow.
 10+
 11+The files bzlibfuncs.c and bzlib_private.h are taken from bzip2/libbzip2
 12+version 1.0.6 of 6 September 2010 (Copyright (C) 1996-2010 Julian Seward
 13+<jseward@bzip.org>) and as such their copyright license is in the file
 14+LICENSE_BZ; all other files in the package are released under the GPL,
 15+see the file COPYING for details.
 16+
 17+Utilities:
 18+
 19+checkforbz2footer - Tests to see if the bz2 file specified on the command line
 20+ has a bz2 footer (if it does it is likely to be intact).
 21+ Exits with 0 if found, 1 otherwise.
 22+dumpbz2filefromoffset - Uncompresses the file from the first bz2 block found after
 23+ the specified offset, and dumps the results to stdout.
 24+ This will first look for and dump the <mediawiki> header,
 25+ up to and including the </siteinfo> tag; then it will
 26+ find the first <page> tag in the first bz2 block after
 27+ the specified offset and dump the contents from that point
 28+ on.
 29+dumplastbz2block - Finds the last bz2 block marker in a file and dumps whatever
 30+ can be decompressed after that point; the header of the file
 31+ must be intact in order for any output to be produced. This
 32+ will produce output for truncated files as well, as long as
 33+ there is "enough" data after the bz2 block marker.
 34+ Exits with 0 if decompression of some data can be done,
 35+ 1 if decompression fails, and -1 on error.
 36+
 37+findpageidinbz2xml - Given a bzipped and possibly truncated file, and a page id,
 38+ hunt for the page id in the file; this assumes that the
 39+ bz2 header is intact and that page ids are steadily increasing
 40+ throughout the file. It writes the offset of the relevant block
 41+ (from beginning of file) and the first pageid found in that block,
 42+ to stdout. Format of output:
 43+ position:xxxxx pageid:nnn
 44+ It exits with 0 on success, -1 on error.
 45+
 46+Library routines:
 47+
 48+mwbzlib.c - various utility functions (bitmasks, shifting and comparing bytes,
 49+ setting up bz2 files for decompression, etc)
 50+
 51+External library routines:
 52+
 53+bzlibfuncs.c - the BZ2_bzDecompress() routine, modified so that it does not do
 54+ a check of the cumulative CRC (since we read from an arbitrary
 55+ point in most of these files, we won't have a cumulative CRC
 56+ that makes any sense). It's a one line fix but it requires
 57+ unRLE_obuf_to_output_FAST() which is marked static in the original
 58+ library, so that's in here too.
 59+

Status & tagging log