r91638 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r91637‎ \| r91638 \| r91639 >
Date:	12:21, 7 July 2011
Author:	ariel
Status:	deferred
Tags:
Comment:	Move common functions into a separate file; use a slightly modified version of BZ2_bzDecompress from the bz2lib which skips the cumulative crc check; add makefile, license information, readme for when I forget what these files do
Modified paths:	/branches/ariel/xmldumps-backup/mwbzutils/CHANGES (added) (history) /branches/ariel/xmldumps-backup/mwbzutils/COPYING (added) (history) /branches/ariel/xmldumps-backup/mwbzutils/LICENSE_BZ (added) (history) /branches/ariel/xmldumps-backup/mwbzutils/Makefile (added) (history) /branches/ariel/xmldumps-backup/mwbzutils/README (added) (history) /branches/ariel/xmldumps-backup/mwbzutils/bzlib_private.h (added) (history) /branches/ariel/xmldumps-backup/mwbzutils/bzlibfuncs.c (added) (history) /branches/ariel/xmldumps-backup/mwbzutils/checkforbz2footer.c (modified) (history) /branches/ariel/xmldumps-backup/mwbzutils/dumpbz2filefromoffset.c (modified) (history) /branches/ariel/xmldumps-backup/mwbzutils/dumplastbz2block.c (modified) (history) /branches/ariel/xmldumps-backup/mwbzutils/findpageidinbz2xml.c (modified) (history) /branches/ariel/xmldumps-backup/mwbzutils/findpageidinbz2xml.h (deleted) (history) /branches/ariel/xmldumps-backup/mwbzutils/mwbzlib.c (added) (history) /branches/ariel/xmldumps-backup/mwbzutils/mwbzutils.h (added) (history)

Diff [purge]

Index: branches/ariel/xmldumps-backup/mwbzutils/findpageidinbz2xml.h
—	—	@@ -1,81 +0,0 @@
2		~~-#ifndef _FINDPAGEID_H~~
3		~~-#define _FINDPAGEID_H~~
4		-
5		~~-typedef struct {~~
6		~~- int page_id; /* first id in the block */~~
7		~~- int bits_shifted; /* block is right shifted this many bits */~~
8		~~- int position; /* position in file of block */~~
9		~~-} page_info_t;~~
10		-
11		~~-#define BUFINSIZE 5000~~
12		-
13		-/*
14		~~- keeps all information about a bzipped file~~
15		~~- plus input/output buffers for decompression~~
16		~~-*/~~
17		~~-typedef struct {~~
18		~~- unsigned char bufin[BUFINSIZE]; /* compressed data read from file */~~
19		~~- unsigned char *bufout; /* uncompressed data, must be allocated by caller */~~
20		~~- unsigned char marker_buffer[7]; /* data to test for bz2 block marker */~~
21		~~- unsigned char header_buffer[4]; /* first 4 bytes of file (bzip2 header) */~~
22		-
23		~~- int bufin_size; /* size of input buffer for compressed data */~~
24		~~- int bufout_size; /* size of output buffer for decompressed data, may vary at each call */~~
25		-
26		~~- int initialized; /* whether bz2file has been initialized (header processed, seek to~~
27		~~- some bz2 block in the file and input buffer filled) */~~
28		~~- int block_start; /* position of bz2 block in file from which we started to read (we~~
29		~~- read a sequence of bz2 blocks from a given position, this is~~
30		~~- the offset to the first one) */~~
31		-
32		~~- bz_stream strm; /* stream structure for libbz2 */~~
33		~~- unsigned char overflow; /* since decompressed bytes may not be bit aligned, we keep the last byte~~
34		~~- read around so we can grab the lower end bits off the end for~~
35		~~- sticking in front of the next pile of compressed bytes we read */~~
36		-
37		~~- int bits_shifted; /* number of bits that the compressed data has been right shifted~~
38		~~- in the file (if the number is 0, the block marker and subsequent~~
39		~~- data is byte-aligned) */~~
40		~~- unsigned char **marker; /* bzip2 start of block marker, plus bit-shifted versions of it for~~
41		~~- locating the marker in a stream of compressed data */~~
42		-
43		~~- int position; /* current offset into file from start of file */~~
44		-
45		~~- int bytes_read; /* number of bytes of compressed data read from file (per read) */~~
46		~~- int bytes_written; /* number of bytes of decompressed data written into output buffer (per decompress) */~~
47		~~- int eof; /* nonzero if eof reached */~~
48		~~- int file_size; /* length of file, so we don't search past it for blocks */~~
49		~~-} bz_info_t;~~
50		-
51		~~-#define MASKLEFT 0~~
52		~~-#define MASKRIGHT 1~~
53		-
54		-/*
55		~~- this output buffer is used to collect decompressed output.~~
56		~~- this is not a circular buffer; when it is full the user is~~
57		~~- responsible for emptying it completely or partially and moving~~
58		~~- to the beginning any unused bytes.~~
59		-
60		~~-*/~~
61		~~-typedef struct {~~
62		~~- unsigned char *buffer; /* output storage, allocated by the caller */~~
63		~~- unsigned char *next_to_read; /* pointer to the next byte in the buffer with data to be read */~~
64		~~- unsigned char *next_to_fill; /* pointer to the next byte in the buffer which is empty and can receive data */~~
65		~~- int bytes_avail; /* number of bytes available for reading */~~
66		~~- unsigned char *end; /* points to byte after end of buffer */~~
67		~~-} buf_info_t;~~
68		-
69		-/*
70		~~- used for each iteration of narrowing down the location in a bzipped2 file of~~
71		~~- a desired pageid, by finding first compressed block after a guessed~~
72		~~- position and checking the first pageid (if any) contained in it.~~
73		~~-*/~~
74		~~-typedef struct {~~
75		~~- int left_end; /* left end of interval to search (bytes from start of file) */~~
76		~~- int right_end; /* right end of interval to search */~~
77		~~- int value_wanted; /* pageid desired */~~
78		~~- int last_value; /* pageid we found in last iteration */~~
79		~~- int last_position; /* position in file for last iteration */~~
80		~~-} iter_info_t;~~
81		-
82		~~-#endif~~
Index: branches/ariel/xmldumps-backup/mwbzutils/dumpbz2filefromoffset.c
—	—	@@ -8,519 +8,9 @@
9	9	#include <errno.h>
10	10	#include <sys/types.h>
11	11	#include <regex.h>
12		~~-#include "bzlib.h"~~
13		~~-#include "findpageidinbz2xml.h"~~
	12	+#include "mwbzutils.h"
14	13
15		-
16		~~-/* return n ones either at left or right end */~~
17		~~-int bit_mask(int numbits, int end) {~~
18		~~- if (end == MASKRIGHT) {~~
19		~~- return((1<<numbits)-1);~~
20		~~- }~~
21		~~- else {~~
22		~~- return(((1<<numbits)-1) << (8-numbits));~~
23		~~- }~~
24		-}
25		-
26		~~-void shift_bytes_left(unsigned char *buffer, int buflen, int numbits) {~~
27		~~- int i;~~
28		-
29		~~- if (numbits == 0) {~~
30		~~- return;~~
31		~~- }~~
32		-
33		~~- for (i=0; i<buflen; i++) {~~
34		~~- /* left 1 */~~
35		~~- buffer[i] = (unsigned char) ((int) (buffer[i]) << numbits);~~
36		-
37		~~- /* grab leftmost from next byte */~~
38		~~- if (i < buflen-1) {~~
39		~~- buffer[i] = ( unsigned char ) ( (unsigned int) buffer[i] \| ( ( ((unsigned int) buffer[i+1]) & bit_mask(numbits,MASKLEFT) ) >> (8-numbits) ) );~~
40		~~- }~~
41		~~- }~~
42		-}
43		-
44		-
45		~~-void shift_bytes_right(unsigned char *buffer, int buflen, int numbits) {~~
46		~~- int i;~~
47		-
48		~~- for (i=buflen-1; i>=0; i--) {~~
49		~~- /* right 1 */~~
50		~~- buffer[i] = (unsigned char) ((int) (buffer[i]) >> numbits);~~
51		-
52		~~- /* grab rightmost from prev byte */~~
53		~~- if (i > 0) {~~
54		~~- buffer[i] = ( unsigned char ) ((unsigned int) buffer[i] \| ( ((unsigned int) (buffer[i-1])<<(8-numbits)) & bit_mask(numbits,MASKLEFT)));~~
55		~~- }~~
56		~~- }~~
57		-}
58		-
59		~~-unsigned char ** init_marker() {~~
60		~~- unsigned char **marker = malloc(8*sizeof(unsigned char *));~~
61		~~- int i;~~
62		-
63		~~- /* set up block marker plus its various right-shifted incarnations */~~
64		~~- for (i = 0; i< 8; i++) {~~
65		~~- marker[i] = malloc(sizeof(unsigned char)*7);~~
66		~~- }~~
67		~~- marker[0][0]= (unsigned char) 0x31;~~
68		~~- marker[0][1]= (unsigned char) 0x41;~~
69		~~- marker[0][2]= (unsigned char) 0x59;~~
70		~~- marker[0][3]= (unsigned char) 0x26;~~
71		~~- marker[0][4]= (unsigned char) 0x53;~~
72		~~- marker[0][5]= (unsigned char) 0x59;~~
73		~~- marker[0][6]= (unsigned char) 0x00;~~
74		~~- for (i = 1; i< 8; i++) {~~
75		~~- memcpy((char *)(marker[i]), (char *)(marker[i-1]),7);~~
76		~~- shift_bytes_right(marker[i],7,1);~~
77		~~- }~~
78		~~- return(marker);~~
79		-}
80		-
81		~~-/* buff1 is some random bytes, buff2 is some random bytes which we expect to start with the contents of buff1,~~
82		~~- both buffers are bit-shifted to the right "bitsrightshifted". this function compares the two and returns 1 if buff2~~
83		~~- matches and 0 otherwise. */~~
84		~~-int bytes_compare(unsigned char *buff1, unsigned char *buff2, int numbytes, int bitsrightshifted) {~~
85		~~- int i;~~
86		-
87		~~- if (bitsrightshifted == 0) {~~
88		~~- for (i = 0; i< numbytes; i++) {~~
89		~~- if (buff1[i] != buff2[i]) {~~
90		~~- return(1);~~
91		~~- }~~
92		~~- }~~
93		~~- return(0);~~
94		~~- }~~
95		~~- else {~~
96		~~- for (i = 1; i< numbytes-2; i++) {~~
97		~~- if (buff1[i] != buff2[i]) {~~
98		~~- return(1);~~
99		~~- }~~
100		~~- }~~
101		~~- /* do leftmost byte */~~
102		~~- if ((buff1[0] & bit_mask(8-bitsrightshifted,MASKRIGHT)) != (buff2[0] & bit_mask(8-bitsrightshifted,MASKRIGHT)) ) {~~
103		~~- return(1);~~
104		~~- }~~
105		~~- /* do rightmost byte */~~
106		~~- if ((buff1[numbytes-1] & bit_mask(bitsrightshifted,MASKLEFT)) != (buff2[numbytes-1] & bit_mask(bitsrightshifted,MASKLEFT)) ) {~~
107		~~- return(1);~~
108		~~- }~~
109		~~- return(0);~~
110		~~- }~~
111		-}
112		-
113		~~-/* return -1 if no match~~
114		~~- return number of bits rightshifted otherwise */~~
115		~~-int check_buffer_for_bz2_block_marker(bz_info_t *bfile) {~~
116		~~- int result, i;~~
117		-
118		~~- result = bytes_compare(bfile->marker[0],bfile->marker_buffer+1,6,0);~~
119		~~- if (!result) {~~
120		~~- return(0);~~
121		~~- }~~
122		~~- for (i=1; i<8; i++) {~~
123		~~- result = bytes_compare(bfile->marker[i],bfile->marker_buffer,7,i);~~
124		~~- if (!result) {~~
125		~~- return(i);~~
126		~~- }~~
127		~~- }~~
128		~~- return(-1);~~
129		-}
130		-
131		~~-/* return: 1 if found, 0 if not, -1 on error */~~
132		~~-int find_next_bz2_block_marker(int fin, bz_info_t *bfile) {~~
133		~~- int result;~~
134		-
135		~~- bfile->bits_shifted = -1;~~
136		~~- result = read(fin, bfile->marker_buffer, 7);~~
137		~~- if (result == -1) {~~
138		~~- fprintf(stderr,"read of file failed\n");~~
139		~~- exit(-1);~~
140		~~- }~~
141		~~- /* must be after 4 byte file header, and we add a leftmost byte to the buffer~~
142		~~- of data read in case some bits have been shifted into it */~~
143		~~- while (bfile->position <= bfile->file_size - 6 && bfile->bits_shifted < 0) {~~
144		~~- bfile->bits_shifted = check_buffer_for_bz2_block_marker(bfile);~~
145		~~- if (bfile->bits_shifted < 0) {~~
146		~~- bfile->position++;~~
147		~~- result = lseek(fin, (bfile->position), SEEK_SET);~~
148		~~- if (result == -1) {~~
149		~~- fprintf(stderr,"lseek of file to %ld failed (2)\n",(long int) bfile->position);~~
150		~~- exit(-1);~~
151		~~- }~~
152		~~- result = read(fin, bfile->marker_buffer, 7);~~
153		~~- if (result < 7) {~~
154		~~- /* fprintf(stderr,"read of file failed\n"); */~~
155		~~- exit(-1);~~
156		~~- }~~
157		~~- }~~
158		~~- else {~~
159		~~- bfile->block_start = bfile->position;~~
160		~~- return(1);~~
161		~~- }~~
162		~~- }~~
163		~~- return(0);~~
164		-}
165		-
166		-/*
167		~~- initializes the bz2 strm structure,~~
168		~~- calls the BZ2 decompression library initializer~~
169		-
170		~~- returns:~~
171		~~- BZ_OK on success~~
172		~~- various BZ_ errors on failure (see bzlib.h)~~
173		~~-*/~~
174		~~-int init_decompress(bz_info_t *bfile) {~~
175		~~- int bz_verbosity = 0;~~
176		~~- int bz_small = 0;~~
177		~~- int ret;~~
178		-
179		~~- bfile->strm.bzalloc = NULL;~~
180		~~- bfile->strm.bzfree = NULL;~~
181		~~- bfile->strm.opaque = NULL;~~
182		-
183		~~- ret = BZ2_bzDecompressInit ( &(bfile->strm), bz_verbosity, bz_small );~~
184		~~- if (ret != BZ_OK) {~~
185		~~- fprintf(stderr,"uncompress failed, err %d\n", ret);~~
186		~~- exit(-1);~~
187		~~- }~~
188		~~- return(ret);~~
189		-}
190		-
191		-/*
192		~~- reads the first 4 bytes from a bz2 file (should be~~
193		~~- "BZh" followed by the block size indicator, typically "9")~~
194		~~- and passes them into the BZ2 decompression library.~~
195		~~- This must be done before decompression of any block of the~~
196		~~- file is attempted.~~
197		-
198		~~- returns:~~
199		~~- BZ_OK if successful,~~
200		~~- various BZ_ errors on failure (see bzlib.h)~~
201		~~-*/~~
202		~~-int decompress_header(int fin, bz_info_t *bfile) {~~
203		~~- int ret, res;~~
204		-
205		~~- res = lseek(fin,0,SEEK_SET);~~
206		~~- if (res == -1) {~~
207		~~- fprintf(stderr,"lseek of file to 0 failed (3)\n");~~
208		~~- }~~
209		~~- bfile->bytes_read = read(fin, bfile->header_buffer, 4);~~
210		~~- if (bfile->bytes_read < 4) {~~
211		~~- fprintf(stderr,"failed to read 4 bytes of header, exiting\n");~~
212		~~- exit(-1);~~
213		~~- }~~
214		~~- bfile->strm.next_in = (char *)bfile->header_buffer;~~
215		~~- bfile->strm.avail_in = 4;~~
216		-
217		~~- ret = BZ2_bzDecompress ( &(bfile->strm) );~~
218		~~- if (BZ_OK != ret && BZ_STREAM_END != ret) {~~
219		~~- fprintf(stderr,"Corrupt bzip2 header, exiting\n");~~
220		~~- exit(-1);~~
221		~~- }~~
222		~~- return(ret);~~
223		-}
224		-
225		-/*
226		~~- seek to appropriate offset as specified in bfile,~~
227		~~- read compressed data into buffer indicated by bfile,~~
228		~~- update the bfile structure accordingly,~~
229		~~- save the overflow byte (bit-shifted data = suck)~~
230		~~- this is for the first buffer of data in a stream,~~
231		~~- for subsequent buffers use fill_buffer_to_decompress()~~
232		-
233		~~- this will set bfile->eof on eof. no other indicator~~
234		~~- will be provided.~~
235		-
236		~~- returns:~~
237		~~- 0 on success~~
238		~~- -1 on error~~
239		~~-*/~~
240		~~-int setup_first_buffer_to_decompress(int fin, bz_info_t *bfile) {~~
241		~~- int res;~~
242		-
243		~~- if (bfile->bits_shifted == 0) {~~
244		~~- res = lseek(fin,bfile->position+1,SEEK_SET);~~
245		~~- if (res == -1) {~~
246		~~- fprintf(stderr,"lseek of file to %ld failed (4)\n",(long int) bfile->position+1);~~
247		~~- return(-1);~~
248		~~- }~~
249		~~- }~~
250		~~- else {~~
251		~~- res = lseek(fin,bfile->position,SEEK_SET);~~
252		~~- if (res == -1) {~~
253		~~- fprintf(stderr,"lseek of file to %ld failed (5)\n",(long int) bfile->position);~~
254		~~- return(-1);~~
255		~~- }~~
256		~~- }~~
257		~~- bfile->bytes_read = read(fin, bfile->bufin, bfile->bufin_size);~~
258		~~- if (bfile->bytes_read > 0) {~~
259		~~- bfile->overflow = bfile->bufin[bfile->bytes_read-1];~~
260		~~- shift_bytes_left(bfile->bufin, bfile->bytes_read, bfile->bits_shifted);~~
261		-
262		~~- bfile->strm.next_in = (char *)(bfile->bufin);~~
263		~~- bfile->strm.avail_in = bfile->bytes_read-1;~~
264		~~- }~~
265		~~- if (bfile->bytes_read <=0) {~~
266		~~- bfile->eof++;~~
267		~~- }~~
268		~~- return(0);~~
269		-}
270		-
271		-/*
272		~~- read compressed data into buffer indicated by bfile,~~
273		~~- from current position of file,~~
274		~~- stuffing the overflow byte in first.~~
275		~~- update the bfile structure accordingly~~
276		~~- save the new overflow byte (bit-shifted data = suck)~~
277		~~- this function is for decompression of buffers *after~~
278		~~- the first one*. for the first one use~~
279		~~- setup_first_buffer_to_decompress()~~
280		-
281		~~- this will set bfile->eof on eof. no other indicator~~
282		~~- will be provided.~~
283		-
284		~~- returns:~~
285		~~- 0 on success~~
286		~~- hmm, it really does not do anything about errors :-D~~
287		~~-*/~~
288		~~-int fill_buffer_to_decompress(int fin, bz_info_t *bfile, int ret) {~~
289		~~- if (bfile->strm.avail_in == 0) {~~
290		~~- bfile->strm.next_in = (char *)(bfile->bufin);~~
291		~~- bfile->bufin[0] = bfile->overflow;~~
292		~~- bfile->bytes_read = read(fin, bfile->bufin+1, bfile->bufin_size-1);~~
293		~~- if (bfile->bytes_read > 0) {~~
294		~~- bfile->position+=bfile->bytes_read;~~
295		~~- bfile->overflow = bfile->bufin[bfile->bytes_read];~~
296		~~- shift_bytes_left(bfile->bufin,bfile->bytes_read+1,bfile->bits_shifted);~~
297		~~- bfile->strm.avail_in = bfile->bytes_read;~~
298		~~- }~~
299		~~- else {~~
300		~~- bfile->strm.avail_in = 1; /* the overflow byte */~~
301		~~- bfile->eof++;~~
302		~~- }~~
303		~~- }~~
304		~~- return(0);~~
305		-}
306		-
307		~~-/* size of buffer is bytes usable. there will be a null byte at the end~~
308		-
309		~~- what we do with the buffer:~~
310		~~- - read from front of buffer to end,~~
311		~~- - fill from point where prev read did not fill buffer, or from where~~
312		~~- move of data at end of buffer to beginning left room,~~
313		~~- - mark a string of bytes (starting from what's available to read) as "read"~~
314		-
315		~~-*/~~
316		~~-buf_info_t *init_buffer(int size) {~~
317		~~- buf_info_t *b;~~
318		-
319		~~- b = (buf_info_t *)malloc(sizeof(buf_info_t));~~
320		~~- b->buffer = malloc(sizeof(unsigned char)*(size+1));~~
321		~~- b->buffer[size]='\0';~~
322		~~- b->end = b->buffer + size;~~
323		~~- b->next_to_read = b->end; /* nothing available */~~
324		~~- b->bytes_avail = 0; /* bytes to read, nothing available */~~
325		~~- b->next_to_fill = b->buffer; /* empty */~~
326		~~- b->next_to_fill[0] = '\0';~~
327		~~- return(b);~~
328		-}
329		-
330		~~-/* check if buffer (used for decompressed data output) is empty,~~
331		~~- returns 1 if so and 0 if not */~~
332		~~-int buffer_is_empty(buf_info_t *b) {~~
333		~~- if (b->bytes_avail == 0) {~~
334		~~- return(1);~~
335		~~- }~~
336		~~- else {~~
337		~~- return(0);~~
338		~~- }~~
339		-}
340		-
341		~~-/* check if buffer (used for decompressed data output) is full,~~
342		-
343		~~- returns 1 if so and 0 if not~~
344		~~- I'm not liking this function so well, fixme */~~
345		~~-int buffer_is_full(buf_info_t *b) {~~
346		~~- if (b->next_to_fill == b->end) {~~
347		~~- return(1);~~
348		~~- }~~
349		~~- else {~~
350		~~- return(0);~~
351		~~- }~~
352		-}
353		-
354		~~-/* FIXME do this right. whatever. */~~
355		~~-int get_file_size(int fin) {~~
356		~~- int res;~~
357		-
358		~~- res = lseek(fin, 0, SEEK_END);~~
359		~~- if (res == -1) {~~
360		~~- fprintf(stderr,"lseek of file to 0 failed (6)\n");~~
361		~~- exit(-1);~~
362		~~- }~~
363		~~- return(res);~~
364		-}
365		-
366		-
367	14	/*
368		~~- set up the marker, seek to right place, get first~~
369		~~- buffer of compressed data for processing~~
370		~~- bfile->position must be set to desired offset first by caller.~~
371		~~- returns:~~
372		~~- -1 if no marker or other error, position of next read if ok~~
373		~~-*/~~
374		~~-int init_bz2_file(bz_info_t *bfile, int fin) {~~
375		~~- int res;~~
376		-
377		~~- bfile->bufin_size = BUFINSIZE;~~
378		~~- bfile->marker = init_marker();~~
379		~~- bfile->bytes_read = 0;~~
380		~~- bfile->bytes_written = 0;~~
381		~~- bfile->eof = 0;~~
382		-
383		~~- bfile->initialized++;~~
384		-
385		~~- bfile->file_size = get_file_size(fin);~~
386		~~- if (bfile->position > bfile->file_size) {~~
387		~~- fprintf(stderr,"asked for position past end of file\n");~~
388		~~- exit(-1);~~
389		~~- }~~
390		~~- res = lseek(fin, bfile->position, SEEK_SET);~~
391		~~- if (res == -1) {~~
392		~~- fprintf(stderr,"lseek of file to %ld failed (7)\n",(long int) bfile->position);~~
393		~~- exit(-1);~~
394		~~- }~~
395		-
396		~~- find_next_bz2_block_marker(fin, bfile);~~
397		~~- if (bfile->bits_shifted >= 0) {~~
398		~~- /* fprintf(stderr,"marker bits shifted by is %d\n",bfile->bits_shifted); */~~
399		~~- init_decompress(bfile);~~
400		~~- decompress_header(fin, bfile);~~
401		~~- setup_first_buffer_to_decompress(fin, bfile);~~
402		~~- return(0);~~
403		~~- }~~
404		~~- return(-1);~~
405		-}
406		-
407		~~-/* get the next buffer of uncompressed stuff */~~
408		~~-int decompress_data(bz_info_t *bfile, int fin, unsigned char *bufferout, int bufout_size) {~~
409		~~- int ret;~~
410		-
411		~~- bfile->bufout = bufferout;~~
412		~~- bfile->bufout_size = bufout_size;~~
413		~~- bfile->bytes_written = 0;~~
414		-
415		~~- if (! bfile->initialized) {~~
416		~~- if (init_bz2_file(bfile, fin) == -1) {~~
417		~~- fprintf(stderr,"failed to initialize bz2file\n");~~
418		~~- return(-1);~~
419		~~- };~~
420		~~- bfile->strm.next_out = (char *)bfile->bufout;~~
421		~~- bfile->strm.avail_out = bfile->bufout_size;~~
422		~~- }~~
423		-
424		~~- ret = BZ_OK;~~
425		~~- while (BZ_OK == ret && bfile->bytes_written == 0) {~~
426		~~- ret = BZ2_bzDecompress ( &(bfile->strm) );~~
427		~~- if (BZ_OK == ret \|\| BZ_STREAM_END == ret) {~~
428		~~- bfile->bytes_written = (unsigned char *)(bfile->strm.next_out) - bfile->bufout;~~
429		~~- }~~
430		~~- else {~~
431		~~- fprintf(stderr,"error from BZ decompress %d\n",ret);~~
432		~~- return(-1);~~
433		~~- }~~
434		~~- fill_buffer_to_decompress(fin, bfile, ret);~~
435		- /*
436		~~- if (bfile->eof && (BZ_OK == ret \|\| BZ_STREAM_END == ret) ) {~~
437		~~- fprintf(stderr,"eof reached\n");~~
438		~~- }~~
439		~~- */~~
440		~~- }~~
441		~~- return(0);~~
442		-}
443		-
444		-/*
445		~~- fill output buffer in b with uncompressed data from bfile~~
446		~~- if this is the first call to the function for this file,~~
447		~~- the file header will be read, and the first buffer of~~
448		~~- uncompressed data will be prepared. bfile->position~~
449		~~- should be set to the offset (from the beginning of file) from~~
450		~~- which to find the first bz2 block.~~
451		-
452		~~- returns:~~
453		~~- on success, number of bytes read (may be 0)~~
454		~~- -1 on error~~
455		~~-*/~~
456		~~-int get_buffer_of_uncompressed_data(buf_info_t *b, int fin, bz_info_t *bfile) {~~
457		~~- int res;~~
458		-
459		~~- if (buffer_is_full(b)) {~~
460		~~- fprintf(stdout,"DEBUG buffer full\n");~~
461		~~- return(0);~~
462		~~- }~~
463		-
464		~~- if (buffer_is_empty(b)) {~~
465		~~- b->next_to_fill = b->buffer;~~
466		~~- }~~
467		-
468		~~- res = decompress_data(bfile, fin, b->next_to_fill, b->end - b->next_to_fill);~~
469		~~- if (res <0 ) {~~
470		~~- return(res);~~
471		~~- }~~
472		~~- if (bfile->bytes_written < 0) {~~
473		~~- fprintf(stderr,"read of file failed\n");~~
474		~~- return(-1);~~
475		~~- }~~
476		~~- else {~~
477		~~- /* really?? FIXME check this */~~
478		~~- if (buffer_is_empty(b)) {~~
479		~~- b->next_to_read = b->next_to_fill; /* where we just read */~~
480		~~- }~~
481		~~- b->bytes_avail += bfile->bytes_written;~~
482		~~- b->next_to_fill += bfile->bytes_written;~~
483		~~- b->next_to_fill[0] = '\0';~~
484		~~- return(0);~~
485		~~- }~~
486		-}
487		-
488		~~-void dumpbuf_info_t(buf_info_t *b) {~~
489		~~- fprintf(stdout, "\n");~~
490		~~- fprintf(stdout, "b->buffer: %ld\n", (long int) b->buffer);~~
491		~~- fprintf(stdout, "b->end: %ld\n", (long int) b->end);~~
492		~~- fprintf(stdout, "b->next_to_read: %ld\n", (long int) b->next_to_read);~~
493		~~- fprintf(stdout, "b->next_to_fill: %ld\n", (long int) b->next_to_fill);~~
494		~~- fprintf(stdout, "b->bytes_avail: %ld\n", (long int) b->bytes_avail);~~
495		-}
496		-
497		-/*
498		~~- copy text from end of buffer to the beginning, that we want to keep~~
499		~~- around for further processing (i.e. further regex matches)~~
500		~~- returns number of bytes copied~~
501		~~-*/~~
502		~~-int move_bytes_to_buffer_start(buf_info_t *b, unsigned char *fromwhere, int maxbytes) {~~
503		~~- int i, tocopy;~~
504		-
505		~~- if (fromwhere >= b->end) {~~
506		~~- return(0);~~
507		~~- }~~
508		~~- else {~~
509		~~- tocopy = b->end - fromwhere;~~
510		~~- if (maxbytes && (tocopy > maxbytes)) {~~
511		~~- tocopy = maxbytes;~~
512		~~- }~~
513		~~- for (i = 0; i < tocopy; i++) {~~
514		~~- b->buffer[i] = fromwhere[i];~~
515		~~- }~~
516		~~- b->next_to_fill = b->buffer + tocopy;~~
517		~~- b->next_to_fill[0] = '\0';~~
518		~~- b->next_to_read = b->buffer;~~
519		~~- b->bytes_avail = tocopy;~~
520		~~- return(tocopy);~~
521		~~- }~~
522		-}
523		-
524		-/*
525	15	dump the <mediawiki> header (up through
526	16	</siteinfo> close tag) found at the
527	17	beginning of xml dump files.
—	—	@@ -550,7 +40,7 @@
551	41	bfile.bytes_read = 0;
552	42	bfile.position = 0;
553	43
554		~~- while ((get_buffer_of_uncompressed_data(b, fin, &bfile)>=0) && (! bfile.eof) && (!done)) {~~
	44	+ while ((get_buffer_of_uncompressed_data(b, fin, &bfile, FORWARD)>=0) && (! bfile.eof) && (!done)) {
555	45	/* fixme either we don't check the return code right or we don't notice no bytes read or we don't clear the bytes read */
556	46	if (bfile.bytes_read) {
557	47	if (firstpage) {
—	—	@@ -656,7 +146,7 @@
657	147	bfile.bytes_read = 0;
658	148	bfile.position = position;
659	149
660		~~- while ((get_buffer_of_uncompressed_data(b, fin, &bfile)>=0) && (! bfile.eof)) {~~
	150	+ while ((get_buffer_of_uncompressed_data(b, fin, &bfile, FORWARD)>=0) && (! bfile.eof)) {
661	151	/* fixme either we don't check the return code right or we don't notice no bytes read or we don't clear the bytes read */
662	152	if (bfile.bytes_read) {
663	153	if (firstpage) {
Index: branches/ariel/xmldumps-backup/mwbzutils/findpageidinbz2xml.c
—	—	@@ -8,436 +8,9 @@
9	9	#include <errno.h>
10	10	#include <sys/types.h>
11	11	#include <regex.h>
12		~~-#include "bzlib.h"~~
13		~~-#include "findpageidinbz2xml.h"~~
	12	+#include "mwbzutils.h"
14	13
15		~~-/* return n ones either at left or right end */~~
16		~~-int bitmask(int numbits, int end) {~~
17		~~- if (end == MASKRIGHT) {~~
18		~~- return((1<<numbits)-1);~~
19		~~- }~~
20		~~- else {~~
21		~~- return(((1<<numbits)-1) << (8-numbits));~~
22		~~- }~~
23		-}
24	14
25		~~-void shift_bytes_left(unsigned char *buffer, int buflen, int numbits) {~~
26		~~- int i;~~
27		-
28		~~- if (numbits == 0) {~~
29		~~- return;~~
30		~~- }~~
31		-
32		~~- for (i=0; i<buflen; i++) {~~
33		~~- /* left 1 */~~
34		~~- buffer[i] = (unsigned char) ((int) (buffer[i]) << numbits);~~
35		-
36		~~- /* grab leftmost from next byte */~~
37		~~- if (i < buflen-1) {~~
38		~~- buffer[i] = ( unsigned char ) ( (unsigned int) buffer[i] \| ( ( ((unsigned int) buffer[i+1]) & bitmask(numbits,MASKLEFT) ) >> (8-numbits) ) );~~
39		~~- }~~
40		~~- }~~
41		-}
42		-
43		~~-void shift_bytes_right(unsigned char *buffer, int buflen, int numbits) {~~
44		~~- int i;~~
45		-
46		~~- for (i=buflen-1; i>=0; i--) {~~
47		~~- /* right 1 */~~
48		~~- buffer[i] = (unsigned char) ((int) (buffer[i]) >> numbits);~~
49		-
50		~~- /* grab rightmost from prev byte */~~
51		~~- if (i > 0) {~~
52		~~- buffer[i] = ( unsigned char ) ((unsigned int) buffer[i] \| ( ((unsigned int) (buffer[i-1])<<(8-numbits)) & bitmask(numbits,MASKLEFT)));~~
53		~~- }~~
54		~~- }~~
55		-}
56		-
57		~~-unsigned char ** init_marker() {~~
58		~~- unsigned char **marker = malloc(8*sizeof(unsigned char *));~~
59		~~- int i;~~
60		-
61		~~- /* set up block marker plus its various right-shifted incarnations */~~
62		~~- for (i = 0; i< 8; i++) {~~
63		~~- marker[i] = malloc(sizeof(unsigned char)*7);~~
64		~~- }~~
65		~~- marker[0][0]= (unsigned char) 0x31;~~
66		~~- marker[0][1]= (unsigned char) 0x41;~~
67		~~- marker[0][2]= (unsigned char) 0x59;~~
68		~~- marker[0][3]= (unsigned char) 0x26;~~
69		~~- marker[0][4]= (unsigned char) 0x53;~~
70		~~- marker[0][5]= (unsigned char) 0x59;~~
71		~~- marker[0][6]= (unsigned char) 0x00;~~
72		~~- for (i = 1; i< 8; i++) {~~
73		~~- memcpy((char *)(marker[i]), (char *)(marker[i-1]),7);~~
74		~~- shift_bytes_right(marker[i],7,1);~~
75		~~- }~~
76		~~- return(marker);~~
77		-}
78		-
79		~~-/* buff1 is some random bytes, buff2 is some random bytes which we expect to start with the contents of buff1,~~
80		~~- both buffers are bit-shifted to the right "bitsrightshifted". this function compares the two and returns 1 if buff2~~
81		~~- matches and 0 otherwise. */~~
82		~~-int bytes_compare(unsigned char *buff1, unsigned char *buff2, int numbytes, int bitsrightshifted) {~~
83		~~- int i;~~
84		-
85		~~- if (bitsrightshifted == 0) {~~
86		~~- for (i = 0; i< numbytes; i++) {~~
87		~~- if (buff1[i] != buff2[i]) {~~
88		~~- return(1);~~
89		~~- }~~
90		~~- }~~
91		~~- return(0);~~
92		~~- }~~
93		~~- else {~~
94		~~- for (i = 1; i< numbytes-2; i++) {~~
95		~~- if (buff1[i] != buff2[i]) {~~
96		~~- return(1);~~
97		~~- }~~
98		~~- }~~
99		~~- /* do leftmost byte */~~
100		~~- if ((buff1[0] & bitmask(8-bitsrightshifted,MASKRIGHT)) != (buff2[0] & bitmask(8-bitsrightshifted,MASKRIGHT)) ) {~~
101		~~- return(1);~~
102		~~- }~~
103		~~- /* do rightmost byte */~~
104		~~- if ((buff1[numbytes-1] & bitmask(bitsrightshifted,MASKLEFT)) != (buff2[numbytes-1] & bitmask(bitsrightshifted,MASKLEFT)) ) {~~
105		~~- return(1);~~
106		~~- }~~
107		~~- return(0);~~
108		~~- }~~
109		-}
110		-
111		-
112		~~-/* return -1 if no match~~
113		~~- return number of bits rightshifted otherwise */~~
114		~~-int check_buffer_for_bz2_block_marker(bz_info_t *bfile) {~~
115		~~- int result, i;~~
116		-
117		~~- result = bytes_compare(bfile->marker[0],bfile->marker_buffer+1,6,0);~~
118		~~- if (!result) {~~
119		~~- return(0);~~
120		~~- }~~
121		~~- for (i=1; i<8; i++) {~~
122		~~- result = bytes_compare(bfile->marker[i],bfile->marker_buffer,7,i);~~
123		~~- if (!result) {~~
124		~~- return(i);~~
125		~~- }~~
126		~~- }~~
127		~~- return(-1);~~
128		-}
129		-
130		-
131		~~-/* return: 1 if found, 0 if not, -1 on error */~~
132		~~-int find_next_bz2_block_marker(int fin, bz_info_t *bfile) {~~
133		~~- int result;~~
134		-
135		~~- bfile->bits_shifted = -1;~~
136		~~- result = read(fin, bfile->marker_buffer, 7);~~
137		~~- if (result == -1) {~~
138		~~- /* fprintf(stderr,"read of file failed\n"); */~~
139		~~- return(-1);~~
140		~~- }~~
141		~~- /* must be after 4 byte file header, and we add a leftmost byte to the buffer~~
142		~~- of data read in case some bits have been shifted into it */~~
143		~~- while (bfile->position <= bfile->file_size - 6 && bfile->bits_shifted < 0) {~~
144		~~- bfile->bits_shifted = check_buffer_for_bz2_block_marker(bfile);~~
145		~~- if (bfile->bits_shifted < 0) {~~
146		~~- bfile->position++;~~
147		~~- result = lseek(fin, (bfile->position), SEEK_SET);~~
148		~~- if (result == -1) {~~
149		~~- fprintf(stderr,"lseek of file to %ld failed (2)\n",(long int) bfile->position);~~
150		~~- return(-1);~~
151		~~- }~~
152		~~- result = read(fin, bfile->marker_buffer, 7);~~
153		~~- if (result < 7) {~~
154		~~- /* fprintf(stderr,"read of file failed\n"); */~~
155		~~- return(-1);~~
156		~~- }~~
157		~~- }~~
158		~~- else {~~
159		~~- bfile->block_start = bfile->position;~~
160		~~- return(1);~~
161		~~- }~~
162		~~- }~~
163		~~- return(0);~~
164		-}
165		-
166		-/*
167		~~- initializes the bz2 strm structure,~~
168		~~- calls the BZ2 decompression library initializer~~
169		-
170		~~- returns:~~
171		~~- BZ_OK on success~~
172		~~- various BZ_ errors on failure (see bzlib.h)~~
173		~~-*/~~
174		~~-int init_decompress(bz_info_t *bfile) {~~
175		~~- int bz_verbosity = 0;~~
176		~~- int bz_small = 0;~~
177		~~- int ret;~~
178		-
179		~~- bfile->strm.bzalloc = NULL;~~
180		~~- bfile->strm.bzfree = NULL;~~
181		~~- bfile->strm.opaque = NULL;~~
182		-
183		~~- ret = BZ2_bzDecompressInit ( &(bfile->strm), bz_verbosity, bz_small );~~
184		~~- if (ret != BZ_OK) {~~
185		~~- fprintf(stderr,"uncompress failed, err %d\n", ret);~~
186		~~- exit(-1);~~
187		~~- }~~
188		~~- return(ret);~~
189		-}
190		-
191		-/*
192		~~- reads the first 4 bytes from a bz2 file (should be~~
193		~~- "BZh" followed by the block size indicator, typically "9")~~
194		~~- and passes them into the BZ2 decompression library.~~
195		~~- This must be done before decompression of any block of the~~
196		~~- file is attempted.~~
197		-
198		~~- returns:~~
199		~~- BZ_OK if successful,~~
200		~~- various BZ_ errors on failure (see bzlib.h)~~
201		~~-*/~~
202		~~-int decompress_header(int fin, bz_info_t *bfile) {~~
203		~~- int ret, res;~~
204		-
205		~~- res = lseek(fin,0,SEEK_SET);~~
206		~~- if (res == -1) {~~
207		~~- fprintf(stderr,"lseek of file to 0 failed (3)\n");~~
208		~~- exit(-1);~~
209		~~- }~~
210		~~- bfile->bytes_read = read(fin, bfile->header_buffer, 4);~~
211		~~- if (bfile->bytes_read < 4) {~~
212		~~- fprintf(stderr,"failed to read 4 bytes of header, exiting\n");~~
213		~~- exit(-1);~~
214		~~- }~~
215		~~- bfile->strm.next_in = (char *)bfile->header_buffer;~~
216		~~- bfile->strm.avail_in = 4;~~
217		-
218		~~- ret = BZ2_bzDecompress ( &(bfile->strm) );~~
219		~~- if (BZ_OK != ret && BZ_STREAM_END != ret) {~~
220		~~- fprintf(stderr,"Corrupt bzip2 header, exiting\n");~~
221		~~- exit(-1);~~
222		~~- }~~
223		~~- return(ret);~~
224		-}
225		-
226		-/*
227		~~- seek to appropriate offset as specified in bfile,~~
228		~~- read compressed data into buffer indicated by bfile,~~
229		~~- update the bfile structure accordingly,~~
230		~~- save the overflow byte (bit-shifted data = suck)~~
231		~~- this is for the first buffer of data in a stream,~~
232		~~- for subsequent buffers use fill_buffer_to_decompress()~~
233		-
234		~~- this will set bfile->eof on eof. no other indicator~~
235		~~- will be provided.~~
236		-
237		~~- returns:~~
238		~~- 0 on success~~
239		~~- -1 on error~~
240		~~-*/~~
241		~~-int setup_first_buffer_to_decompress(int fin, bz_info_t *bfile) {~~
242		~~- int res;~~
243		-
244		~~- if (bfile->bits_shifted == 0) {~~
245		~~- res = lseek(fin,bfile->position+1,SEEK_SET);~~
246		~~- if (res == -1) {~~
247		~~- fprintf(stderr,"lseek of file to %ld failed (4)\n",(long int) bfile->position+1);~~
248		~~- return(-1);~~
249		~~- }~~
250		~~- }~~
251		~~- else {~~
252		~~- res = lseek(fin,bfile->position,SEEK_SET);~~
253		~~- if (res == -1) {~~
254		~~- fprintf(stderr,"lseek of file to %ld failed (5)\n",(long int) bfile->position);~~
255		~~- return(-1);~~
256		~~- }~~
257		~~- }~~
258		~~- bfile->bytes_read = read(fin, bfile->bufin, bfile->bufin_size);~~
259		~~- if (bfile->bytes_read > 0) {~~
260		~~- bfile->overflow = bfile->bufin[bfile->bytes_read-1];~~
261		~~- shift_bytes_left(bfile->bufin, bfile->bytes_read, bfile->bits_shifted);~~
262		-
263		~~- bfile->strm.next_in = (char *)(bfile->bufin);~~
264		~~- bfile->strm.avail_in = bfile->bytes_read-1;~~
265		~~- }~~
266		~~- if (bfile->bytes_read <=0) {~~
267		~~- bfile->eof++;~~
268		~~- }~~
269		~~- return(0);~~
270		-}
271		-
272		-/*
273		~~- read compressed data into buffer indicated by bfile,~~
274		~~- from current position of file,~~
275		~~- stuffing the overflow byte in first.~~
276		~~- update the bfile structure accordingly~~
277		~~- save the new overflow byte (bit-shifted data = suck)~~
278		~~- this function is for decompression of buffers *after~~
279		~~- the first one*. for the first one use~~
280		~~- setup_first_buffer_to_decompress()~~
281		-
282		~~- this will set bfile->eof on eof. no other indicator~~
283		~~- will be provided.~~
284		-
285		~~- returns:~~
286		~~- 0 on success~~
287		~~- hmm, it really does not do anything about errors :-D~~
288		~~-*/~~
289		~~-int fill_buffer_to_decompress(int fin, bz_info_t *bfile, int ret) {~~
290		~~- if (bfile->strm.avail_in == 0) {~~
291		~~- bfile->strm.next_in = (char *)(bfile->bufin);~~
292		~~- bfile->bufin[0] = bfile->overflow;~~
293		~~- bfile->bytes_read = read(fin, bfile->bufin+1, bfile->bufin_size-1);~~
294		~~- if (bfile->bytes_read > 0) {~~
295		~~- bfile->overflow = bfile->bufin[bfile->bytes_read];~~
296		~~- shift_bytes_left(bfile->bufin,bfile->bytes_read+1,bfile->bits_shifted);~~
297		~~- bfile->strm.avail_in = bfile->bytes_read;~~
298		~~- bfile->position+=bfile->bytes_read;~~
299		~~- }~~
300		~~- else {~~
301		~~- bfile->strm.avail_in = 1; /* the overflow byte */~~
302		~~- bfile->eof++;~~
303		~~- }~~
304		~~- }~~
305		~~- return(0);~~
306		-}
307		-
308		~~-/* size of buffer is bytes usable. there will be a null byte at the end~~
309		-
310		~~- what we do with the buffer:~~
311		~~- - read from front of buffer to end,~~
312		~~- - fill from point where prev read did not fill buffer, or from where~~
313		~~- move of data at end of buffer to beginning left room,~~
314		~~- - mark a string of bytes (starting from what's available to read) as "read"~~
315		-
316		~~-*/~~
317		~~-buf_info_t *init_buffer(int size) {~~
318		~~- buf_info_t *b;~~
319		-
320		~~- b = (buf_info_t *)malloc(sizeof(buf_info_t));~~
321		~~- b->buffer = malloc(sizeof(unsigned char)*(size+1));~~
322		~~- b->buffer[size]='\0';~~
323		~~- b->end = b->buffer + size;~~
324		~~- b->next_to_read = b->end; /* nothing available */~~
325		~~- b->bytes_avail = 0; /* bytes to read, nothing available */~~
326		~~- b->next_to_fill = b->buffer; /* empty */~~
327		~~- b->next_to_fill[0] = '\0';~~
328		~~- return(b);~~
329		-}
330		-
331		~~-/* check if buffer (used for decompressed data output) is empty,~~
332		~~- returns 1 if so and 0 if not */~~
333		~~-int buffer_is_empty(buf_info_t *b) {~~
334		~~- if (b->bytes_avail == 0) {~~
335		~~- return(1);~~
336		~~- }~~
337		~~- else {~~
338		~~- return(0);~~
339		~~- }~~
340		-}
341		-
342		~~-/* check if buffer (used for decompressed data output) is full,~~
343		-
344		~~- returns 1 if so and 0 if not~~
345		~~- I'm not liking this function so well, fixme */~~
346		~~-int buffer_is_full(buf_info_t *b) {~~
347		~~- if (b->next_to_fill == b->end) {~~
348		~~- return(1);~~
349		~~- }~~
350		~~- else {~~
351		~~- return(0);~~
352		~~- }~~
353		-}
354		-
355		~~-/* FIXME do this right. whatever. */~~
356		~~-int get_file_size(int fin) {~~
357		~~- int res;~~
358		-
359		~~- res = lseek(fin, 0, SEEK_END);~~
360		~~- if (res == -1) {~~
361		~~- fprintf(stderr,"lseek of file to 0 failed (6)\n");~~
362		~~- exit(-1);~~
363		~~- }~~
364		~~- return(res);~~
365		-}
366		-
367		-
368		-/*
369		~~- look for the first bz2 block in the file after specified offset~~
370		~~- it tests that the block is valid by doing partial decompression.~~
371		~~- this function will update the bfile structure:~~
372		~~- bfile->position will contain the current position of the file (? will it?)~~
373		~~- bfile->bits_shifted will contain the number of bits that the block is rightshifted~~
374		~~- bfile->block_start will contain the offset from start of file to the block~~
375		~~- returns:~~
376		~~- position of next byte in file to be read, on success~~
377		~~- -1 if no marker or other error~~
378		~~-*/~~
379		~~-int find_first_bz2_block_after_offset(bz_info_t *bfile, int fin, int position) {~~
380		~~- int res;~~
381		-
382		~~- bfile->bufin_size = BUFINSIZE;~~
383		~~- bfile->marker = init_marker();~~
384		~~- bfile->position = position;~~
385		~~- bfile->block_start = -1;~~
386		~~- bfile->bytes_read = 0;~~
387		~~- bfile->bytes_written = 0;~~
388		~~- bfile->eof = 0;~~
389		~~- bfile->bits_shifted = -1;~~
390		-
391		~~- bfile->file_size = get_file_size(fin);~~
392		-
393		~~- while (bfile->bits_shifted < 0) {~~
394		~~- if (bfile->position > bfile->file_size) {~~
395		~~- return(-1);~~
396		~~- }~~
397		~~- res = lseek(fin, bfile->position, SEEK_SET);~~
398		~~- if (res == -1) {~~
399		~~- fprintf(stderr,"lseek of file to %ld failed (7)\n",(long int) bfile->position);~~
400		~~- exit(-1);~~
401		~~- }~~
402		~~- res = find_next_bz2_block_marker(fin, bfile);~~
403		~~- if (res == 1) {~~
404		~~- init_decompress(bfile);~~
405		~~- decompress_header(fin, bfile);~~
406		~~- res = setup_first_buffer_to_decompress(fin, bfile);~~
407		~~- if (res == -1) {~~
408		~~- fprintf(stderr,"couldn't get first buffer of data to uncompress\n");~~
409		~~- exit(-1);~~
410		~~- }~~
411		~~- bfile->strm.next_out = (char *)bfile->bufout;~~
412		~~- bfile->strm.avail_out = bfile->bufout_size;~~
413		~~- res = BZ2_bzDecompress ( &(bfile->strm) );~~
414		~~- /* this means we (probably) have a genuine marker */~~
415		~~- if (BZ_OK == res \|\| BZ_STREAM_END == res) {~~
416		~~- res = BZ2_bzDecompressEnd ( &(bfile->strm) );~~
417		~~- bfile->bytes_read = 0;~~
418		~~- bfile->bytes_written = 0;~~
419		~~- bfile->eof = 0;~~
420		~~- /* leave the file at the right position */~~
421		~~- res = lseek(fin, bfile->block_start, SEEK_SET);~~
422		~~- if (res == -1) {~~
423		~~- fprintf(stderr,"lseek of file to %ld failed (7)\n",(long int) bfile->position);~~
424		~~- exit(-1);~~
425		~~- }~~
426		~~- return(0);~~
427		~~- }~~
428		~~- /* right bytes, but there by chance, skip and try again */~~
429		~~- else {~~
430		~~- bfile->position+=6;~~
431		~~- bfile->bits_shifted = -1;~~
432		~~- bfile->block_start = -1;~~
433		~~- }~~
434		~~- }~~
435		~~- else {~~
436		~~- return(-1);~~
437		~~- }~~
438		~~- }~~
439		~~- return(-1);~~
440		-}
441		-
442	15	/*
443	16	find the first bz2 block marker in the file,
444	17	from its current position,
—	—	@@ -446,12 +19,12 @@
447	20	0 on success
448	21	-1 if no marker or other error
449	22	*/
450		~~-int init_bz2_file(bz_info_t *bfile, int fin) {~~
	23	+int init_and_read_first_buffer_bz2_file(bz_info_t *bfile, int fin) {
451	24	int res;
452	25
453	26	bfile->initialized++;
454	27
455		~~- res = find_next_bz2_block_marker(fin, bfile);~~
	28	+ res = find_next_bz2_block_marker(fin, bfile, FORWARD);
456	29	if (res ==1) {
457	30	init_decompress(bfile);
458	31	decompress_header(fin, bfile);
—	—	@@ -461,125 +34,7 @@
462	35	return(-1);
463	36	}
464	37
465		~~-/* return -1 if error */~~
466		~~-int decompress_data(bz_info_t bfile, int fin, unsigned char bufferout, int bufout_size) {~~
467		~~- int ret;~~
468		-
469		~~- bfile->bufout = bufferout;~~
470		~~- bfile->bufout_size = bufout_size;~~
471		~~- bfile->bytes_written = 0;~~
472		-
473		~~- if (! bfile->initialized) {~~
474		~~- if (init_bz2_file(bfile, fin) == -1) {~~
475		~~- /* fprintf(stderr,"failed to find block in bz2file (2)\n"); */~~
476		~~- return(-1);~~
477		~~- };~~
478		~~- bfile->strm.next_out = (char *)bfile->bufout;~~
479		~~- bfile->strm.avail_out = bfile->bufout_size;~~
480		~~- }~~
481		-
482		~~- ret = BZ_OK;~~
483		~~- while (BZ_OK == ret && bfile->bytes_written == 0) {~~
484		~~- ret = BZ2_bzDecompress ( &(bfile->strm) );~~
485		~~- if (BZ_OK == ret \|\| BZ_STREAM_END == ret) {~~
486		~~- bfile->bytes_written = (unsigned char *)(bfile->strm.next_out) - bfile->bufout;~~
487		~~- }~~
488		~~- else {~~
489		~~- /* fprintf(stderr,"error from BZ decompress %d\n",ret); */~~
490		~~- return(-1);~~
491		~~- }~~
492		~~- fill_buffer_to_decompress(fin, bfile, ret);~~
493		- /*
494		~~- if (bfile->eof && (BZ_OK == ret \|\| BZ_STREAM_END == ret) ) {~~
495		~~- fprintf(stderr,"eof reached\n");~~
496		~~- }~~
497		~~- */~~
498		~~- }~~
499		~~- return(0);~~
500		-}
501		-
502		-
503	38	/*
504		~~- fill output buffer in b with uncompressed data from bfile~~
505		~~- if this is the first call to the function for this file,~~
506		~~- the file header will be read, and the first buffer of~~
507		~~- uncompressed data will be prepared. bfile->position~~
508		~~- should be set to the offset (from the beginning of file) from~~
509		~~- which to find the first bz2 block.~~
510		-
511		~~- returns:~~
512		~~- on success, number of bytes read (may be 0)~~
513		~~- -1 on error~~
514		~~-*/~~
515		~~-int get_buffer_of_uncompressed_data(buf_info_t b, int fin, bz_info_t bfile) {~~
516		~~- int res;~~
517		-
518		~~- if (buffer_is_full(b)) {~~
519		~~- return(0);~~
520		~~- }~~
521		-
522		~~- if (buffer_is_empty(b)) {~~
523		~~- b->next_to_fill = b->buffer;~~
524		~~- }~~
525		-
526		~~- res = decompress_data(bfile, fin, b->next_to_fill, b->end - b->next_to_fill);~~
527		~~- if (res == -1) {~~
528		~~- return(res);~~
529		~~- }~~
530		~~- if (bfile->bytes_written < 0) {~~
531		~~- /* fprintf(stderr,"read of file failed\n"); */~~
532		~~- return(-1);~~
533		~~- }~~
534		~~- else {~~
535		~~- /* really?? FIXME check this */~~
536		~~- if (buffer_is_empty(b)) {~~
537		~~- b->next_to_read = b->next_to_fill; /* where we just read */~~
538		~~- }~~
539		~~- b->bytes_avail += bfile->bytes_written;~~
540		~~- b->next_to_fill += bfile->bytes_written;~~
541		~~- b->next_to_fill[0] = '\0';~~
542		~~- return(0);~~
543		~~- }~~
544		-}
545		-
546		~~-void dumpbuf_info_t(buf_info_t *b) {~~
547		~~- fprintf(stdout, "\n");~~
548		~~- fprintf(stdout, "b->buffer: %ld\n", (long int) b->buffer);~~
549		~~- fprintf(stdout, "b->end: %ld\n", (long int) b->end);~~
550		~~- fprintf(stdout, "b->next_to_read: %ld\n", (long int) b->next_to_read);~~
551		~~- fprintf(stdout, "b->next_to_fill: %ld\n", (long int) b->next_to_fill);~~
552		~~- fprintf(stdout, "b->bytes_avail: %ld\n", (long int) b->bytes_avail);~~
553		-}
554		-
555		-
556		-/*
557		~~- copy text from end of buffer to the beginning, that we want to keep~~
558		~~- around for further processing (i.e. further regex matches)~~
559		~~- returns number of bytes copied~~
560		~~-*/~~
561		~~-int move_bytes_to_buffer_start(buf_info_t b, unsigned char from_where, int maxbytes) {~~
562		~~- int i, tocopy;~~
563		-
564		~~- if (from_where >= b->end) {~~
565		~~- return(0);~~
566		~~- }~~
567		~~- else {~~
568		~~- tocopy = b->end - from_where;~~
569		~~- if (maxbytes && (tocopy > maxbytes)) {~~
570		~~- tocopy = maxbytes;~~
571		~~- }~~
572		~~- for (i = 0; i < tocopy; i++) {~~
573		~~- b->buffer[i] = from_where[i];~~
574		~~- }~~
575		~~- b->next_to_fill = b->buffer + tocopy;~~
576		~~- b->next_to_fill[0] = '\0';~~
577		~~- b->next_to_read = b->buffer;~~
578		~~- b->bytes_avail = tocopy;~~
579		~~- return(tocopy);~~
580		~~- }~~
581		-}
582		-
583		-/*
584	39	get the first page id after position in file
585	40	if a pageid is found, the structure pinfo will be updated accordingly
586	41	returns:
—	—	@@ -614,12 +69,12 @@
615	70
616	71	bfile.bytes_read = 0;
617	72
618		~~- if (find_first_bz2_block_after_offset(&bfile, fin, position) == -1) {~~
	73	+ if (find_first_bz2_block_from_offset(&bfile, fin, position, FORWARD) <= 0) {
619	74	/* fprintf(stderr,"failed to find block in bz2file (1)\n"); */
620	75	return(-1);
621	76	}
622	77
623		~~- while (!get_buffer_of_uncompressed_data(b, fin, &bfile) && (! bfile.eof)) {~~
	78	+ while (!get_buffer_of_uncompressed_data(b, fin, &bfile, FORWARD) && (! bfile.eof)) {
624	79	if (bfile.bytes_read) {
625	80	while (regexec(&compiled_page_id, (char *)b->next_to_read, 2, match_page_id, 0 ) == 0) {
626	81	if (match_page_id[1].rm_so >=0) {
Index: branches/ariel/xmldumps-backup/mwbzutils/bzlib_private.h
—	—	@@ -0,0 +1,509 @@
	2	+
	3	+/*-------------------------------------------------------------*/
	4	+/*--- Private header file for the library. ---*/
	5	+/*--- bzlib_private.h ---*/
	6	+/*-------------------------------------------------------------*/
	7	+
	8	+/* ------------------------------------------------------------------
	9	+ This file is part of bzip2/libbzip2, a program and library for
	10	+ lossless, block-sorting data compression.
	11	+
	12	+ bzip2/libbzip2 version 1.0.6 of 6 September 2010
	13	+ Copyright (C) 1996-2010 Julian Seward <jseward@bzip.org>
	14	+
	15	+ Please read the WARNING, DISCLAIMER and PATENTS sections in the
	16	+ README file.
	17	+
	18	+ This program is released under the terms of the license contained
	19	+ in the file LICENSE.
	20	+ ------------------------------------------------------------------ */
	21	+
	22	+
	23	+#ifndef _BZLIB_PRIVATE_H
	24	+#define _BZLIB_PRIVATE_H
	25	+
	26	+#include <stdlib.h>
	27	+
	28	+#ifndef BZ_NO_STDIO
	29	+#include <stdio.h>
	30	+#include <ctype.h>
	31	+#include <string.h>
	32	+#endif
	33	+
	34	+#include "bzlib.h"
	35	+
	36	+
	37	+
	38	+/*-- General stuff. --*/
	39	+
	40	+#define BZ_VERSION "1.0.6, 6-Sept-2010"
	41	+
	42	+typedef char Char;
	43	+typedef unsigned char Bool;
	44	+typedef unsigned char UChar;
	45	+typedef int Int32;
	46	+typedef unsigned int UInt32;
	47	+typedef short Int16;
	48	+typedef unsigned short UInt16;
	49	+
	50	+#define True ((Bool)1)
	51	+#define False ((Bool)0)
	52	+
	53	+#ifndef __GNUC__
	54	+#define __inline__ /* */
	55	+#endif
	56	+
	57	+#ifndef BZ_NO_STDIO
	58	+
	59	+extern void BZ2_bz__AssertH__fail ( int errcode );
	60	+#define AssertH(cond,errcode) \
	61	+ { if (!(cond)) BZ2_bz__AssertH__fail ( errcode ); }
	62	+
	63	+#if BZ_DEBUG
	64	+#define AssertD(cond,msg) \
	65	+ { if (!(cond)) { \
	66	+ fprintf ( stderr, \
	67	+ "\n\nlibbzip2(debug build): internal error\n\t%s\n", msg );\
	68	+ exit(1); \
	69	+ }}
	70	+#else
	71	+#define AssertD(cond,msg) /* */
	72	+#endif
	73	+
	74	+#define VPrintf0(zf) \
	75	+ fprintf(stderr,zf)
	76	+#define VPrintf1(zf,za1) \
	77	+ fprintf(stderr,zf,za1)
	78	+#define VPrintf2(zf,za1,za2) \
	79	+ fprintf(stderr,zf,za1,za2)
	80	+#define VPrintf3(zf,za1,za2,za3) \
	81	+ fprintf(stderr,zf,za1,za2,za3)
	82	+#define VPrintf4(zf,za1,za2,za3,za4) \
	83	+ fprintf(stderr,zf,za1,za2,za3,za4)
	84	+#define VPrintf5(zf,za1,za2,za3,za4,za5) \
	85	+ fprintf(stderr,zf,za1,za2,za3,za4,za5)
	86	+
	87	+#else
	88	+
	89	+extern void bz_internal_error ( int errcode );
	90	+#define AssertH(cond,errcode) \
	91	+ { if (!(cond)) bz_internal_error ( errcode ); }
	92	+#define AssertD(cond,msg) do { } while (0)
	93	+#define VPrintf0(zf) do { } while (0)
	94	+#define VPrintf1(zf,za1) do { } while (0)
	95	+#define VPrintf2(zf,za1,za2) do { } while (0)
	96	+#define VPrintf3(zf,za1,za2,za3) do { } while (0)
	97	+#define VPrintf4(zf,za1,za2,za3,za4) do { } while (0)
	98	+#define VPrintf5(zf,za1,za2,za3,za4,za5) do { } while (0)
	99	+
	100	+#endif
	101	+
	102	+
	103	+#define BZALLOC(nnn) (strm->bzalloc)(strm->opaque,(nnn),1)
	104	+#define BZFREE(ppp) (strm->bzfree)(strm->opaque,(ppp))
	105	+
	106	+
	107	+/*-- Header bytes. --*/
	108	+
	109	+#define BZ_HDR_B 0x42 /* 'B' */
	110	+#define BZ_HDR_Z 0x5a /* 'Z' */
	111	+#define BZ_HDR_h 0x68 /* 'h' */
	112	+#define BZ_HDR_0 0x30 /* '0' */
	113	+
	114	+/*-- Constants for the back end. --*/
	115	+
	116	+#define BZ_MAX_ALPHA_SIZE 258
	117	+#define BZ_MAX_CODE_LEN 23
	118	+
	119	+#define BZ_RUNA 0
	120	+#define BZ_RUNB 1
	121	+
	122	+#define BZ_N_GROUPS 6
	123	+#define BZ_G_SIZE 50
	124	+#define BZ_N_ITERS 4
	125	+
	126	+#define BZ_MAX_SELECTORS (2 + (900000 / BZ_G_SIZE))
	127	+
	128	+
	129	+
	130	+/*-- Stuff for randomising repetitive blocks. --*/
	131	+
	132	+extern Int32 BZ2_rNums[512];
	133	+
	134	+#define BZ_RAND_DECLS \
	135	+ Int32 rNToGo; \
	136	+ Int32 rTPos \
	137	+
	138	+#define BZ_RAND_INIT_MASK \
	139	+ s->rNToGo = 0; \
	140	+ s->rTPos = 0 \
	141	+
	142	+#define BZ_RAND_MASK ((s->rNToGo == 1) ? 1 : 0)
	143	+
	144	+#define BZ_RAND_UPD_MASK \
	145	+ if (s->rNToGo == 0) { \
	146	+ s->rNToGo = BZ2_rNums[s->rTPos]; \
	147	+ s->rTPos++; \
	148	+ if (s->rTPos == 512) s->rTPos = 0; \
	149	+ } \
	150	+ s->rNToGo--;
	151	+
	152	+
	153	+
	154	+/*-- Stuff for doing CRCs. --*/
	155	+
	156	+extern UInt32 BZ2_crc32Table[256];
	157	+
	158	+#define BZ_INITIALISE_CRC(crcVar) \
	159	+{ \
	160	+ crcVar = 0xffffffffL; \
	161	+}
	162	+
	163	+#define BZ_FINALISE_CRC(crcVar) \
	164	+{ \
	165	+ crcVar = ~(crcVar); \
	166	+}
	167	+
	168	+#define BZ_UPDATE_CRC(crcVar,cha) \
	169	+{ \
	170	+ crcVar = (crcVar << 8) ^ \
	171	+ BZ2_crc32Table[(crcVar >> 24) ^ \
	172	+ ((UChar)cha)]; \
	173	+}
	174	+
	175	+
	176	+
	177	+/*-- States and modes for compression. --*/
	178	+
	179	+#define BZ_M_IDLE 1
	180	+#define BZ_M_RUNNING 2
	181	+#define BZ_M_FLUSHING 3
	182	+#define BZ_M_FINISHING 4
	183	+
	184	+#define BZ_S_OUTPUT 1
	185	+#define BZ_S_INPUT 2
	186	+
	187	+#define BZ_N_RADIX 2
	188	+#define BZ_N_QSORT 12
	189	+#define BZ_N_SHELL 18
	190	+#define BZ_N_OVERSHOOT (BZ_N_RADIX + BZ_N_QSORT + BZ_N_SHELL + 2)
	191	+
	192	+
	193	+
	194	+
	195	+/*-- Structure holding all the compression-side stuff. --*/
	196	+
	197	+typedef
	198	+ struct {
	199	+ /* pointer back to the struct bz_stream */
	200	+ bz_stream* strm;
	201	+
	202	+ /* mode this stream is in, and whether inputting */
	203	+ /* or outputting data */
	204	+ Int32 mode;
	205	+ Int32 state;
	206	+
	207	+ /* remembers avail_in when flush/finish requested */
	208	+ UInt32 avail_in_expect;
	209	+
	210	+ /* for doing the block sorting */
	211	+ UInt32* arr1;
	212	+ UInt32* arr2;
	213	+ UInt32* ftab;
	214	+ Int32 origPtr;
	215	+
	216	+ /* aliases for arr1 and arr2 */
	217	+ UInt32* ptr;
	218	+ UChar* block;
	219	+ UInt16* mtfv;
	220	+ UChar* zbits;
	221	+
	222	+ /* for deciding when to use the fallback sorting algorithm */
	223	+ Int32 workFactor;
	224	+
	225	+ /* run-length-encoding of the input */
	226	+ UInt32 state_in_ch;
	227	+ Int32 state_in_len;
	228	+ BZ_RAND_DECLS;
	229	+
	230	+ /* input and output limits and current posns */
	231	+ Int32 nblock;
	232	+ Int32 nblockMAX;
	233	+ Int32 numZ;
	234	+ Int32 state_out_pos;
	235	+
	236	+ /* map of bytes used in block */
	237	+ Int32 nInUse;
	238	+ Bool inUse[256];
	239	+ UChar unseqToSeq[256];
	240	+
	241	+ /* the buffer for bit stream creation */
	242	+ UInt32 bsBuff;
	243	+ Int32 bsLive;
	244	+
	245	+ /* block and combined CRCs */
	246	+ UInt32 blockCRC;
	247	+ UInt32 combinedCRC;
	248	+
	249	+ /* misc administratium */
	250	+ Int32 verbosity;
	251	+ Int32 blockNo;
	252	+ Int32 blockSize100k;
	253	+
	254	+ /* stuff for coding the MTF values */
	255	+ Int32 nMTF;
	256	+ Int32 mtfFreq [BZ_MAX_ALPHA_SIZE];
	257	+ UChar selector [BZ_MAX_SELECTORS];
	258	+ UChar selectorMtf[BZ_MAX_SELECTORS];
	259	+
	260	+ UChar len [BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
	261	+ Int32 code [BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
	262	+ Int32 rfreq [BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
	263	+ /* second dimension: only 3 needed; 4 makes index calculations faster */
	264	+ UInt32 len_pack[BZ_MAX_ALPHA_SIZE][4];
	265	+
	266	+ }
	267	+ EState;
	268	+
	269	+
	270	+
	271	+/*-- externs for compression. --*/
	272	+
	273	+extern void
	274	+BZ2_blockSort ( EState* );
	275	+
	276	+extern void
	277	+BZ2_compressBlock ( EState*, Bool );
	278	+
	279	+extern void
	280	+BZ2_bsInitWrite ( EState* );
	281	+
	282	+extern void
	283	+BZ2_hbAssignCodes ( Int32, UChar, Int32, Int32, Int32 );
	284	+
	285	+extern void
	286	+BZ2_hbMakeCodeLengths ( UChar, Int32, Int32, Int32 );
	287	+
	288	+
	289	+
	290	+/*-- states for decompression. --*/
	291	+
	292	+#define BZ_X_IDLE 1
	293	+#define BZ_X_OUTPUT 2
	294	+
	295	+#define BZ_X_MAGIC_1 10
	296	+#define BZ_X_MAGIC_2 11
	297	+#define BZ_X_MAGIC_3 12
	298	+#define BZ_X_MAGIC_4 13
	299	+#define BZ_X_BLKHDR_1 14
	300	+#define BZ_X_BLKHDR_2 15
	301	+#define BZ_X_BLKHDR_3 16
	302	+#define BZ_X_BLKHDR_4 17
	303	+#define BZ_X_BLKHDR_5 18
	304	+#define BZ_X_BLKHDR_6 19
	305	+#define BZ_X_BCRC_1 20
	306	+#define BZ_X_BCRC_2 21
	307	+#define BZ_X_BCRC_3 22
	308	+#define BZ_X_BCRC_4 23
	309	+#define BZ_X_RANDBIT 24
	310	+#define BZ_X_ORIGPTR_1 25
	311	+#define BZ_X_ORIGPTR_2 26
	312	+#define BZ_X_ORIGPTR_3 27
	313	+#define BZ_X_MAPPING_1 28
	314	+#define BZ_X_MAPPING_2 29
	315	+#define BZ_X_SELECTOR_1 30
	316	+#define BZ_X_SELECTOR_2 31
	317	+#define BZ_X_SELECTOR_3 32
	318	+#define BZ_X_CODING_1 33
	319	+#define BZ_X_CODING_2 34
	320	+#define BZ_X_CODING_3 35
	321	+#define BZ_X_MTF_1 36
	322	+#define BZ_X_MTF_2 37
	323	+#define BZ_X_MTF_3 38
	324	+#define BZ_X_MTF_4 39
	325	+#define BZ_X_MTF_5 40
	326	+#define BZ_X_MTF_6 41
	327	+#define BZ_X_ENDHDR_2 42
	328	+#define BZ_X_ENDHDR_3 43
	329	+#define BZ_X_ENDHDR_4 44
	330	+#define BZ_X_ENDHDR_5 45
	331	+#define BZ_X_ENDHDR_6 46
	332	+#define BZ_X_CCRC_1 47
	333	+#define BZ_X_CCRC_2 48
	334	+#define BZ_X_CCRC_3 49
	335	+#define BZ_X_CCRC_4 50
	336	+
	337	+
	338	+
	339	+/*-- Constants for the fast MTF decoder. --*/
	340	+
	341	+#define MTFA_SIZE 4096
	342	+#define MTFL_SIZE 16
	343	+
	344	+
	345	+
	346	+/*-- Structure holding all the decompression-side stuff. --*/
	347	+
	348	+typedef
	349	+ struct {
	350	+ /* pointer back to the struct bz_stream */
	351	+ bz_stream* strm;
	352	+
	353	+ /* state indicator for this stream */
	354	+ Int32 state;
	355	+
	356	+ /* for doing the final run-length decoding */
	357	+ UChar state_out_ch;
	358	+ Int32 state_out_len;
	359	+ Bool blockRandomised;
	360	+ BZ_RAND_DECLS;
	361	+
	362	+ /* the buffer for bit stream reading */
	363	+ UInt32 bsBuff;
	364	+ Int32 bsLive;
	365	+
	366	+ /* misc administratium */
	367	+ Int32 blockSize100k;
	368	+ Bool smallDecompress;
	369	+ Int32 currBlockNo;
	370	+ Int32 verbosity;
	371	+
	372	+ /* for undoing the Burrows-Wheeler transform */
	373	+ Int32 origPtr;
	374	+ UInt32 tPos;
	375	+ Int32 k0;
	376	+ Int32 unzftab[256];
	377	+ Int32 nblock_used;
	378	+ Int32 cftab[257];
	379	+ Int32 cftabCopy[257];
	380	+
	381	+ /* for undoing the Burrows-Wheeler transform (FAST) */
	382	+ UInt32 *tt;
	383	+
	384	+ /* for undoing the Burrows-Wheeler transform (SMALL) */
	385	+ UInt16 *ll16;
	386	+ UChar *ll4;
	387	+
	388	+ /* stored and calculated CRCs */
	389	+ UInt32 storedBlockCRC;
	390	+ UInt32 storedCombinedCRC;
	391	+ UInt32 calculatedBlockCRC;
	392	+ UInt32 calculatedCombinedCRC;
	393	+
	394	+ /* map of bytes used in block */
	395	+ Int32 nInUse;
	396	+ Bool inUse[256];
	397	+ Bool inUse16[16];
	398	+ UChar seqToUnseq[256];
	399	+
	400	+ /* for decoding the MTF values */
	401	+ UChar mtfa [MTFA_SIZE];
	402	+ Int32 mtfbase[256 / MTFL_SIZE];
	403	+ UChar selector [BZ_MAX_SELECTORS];
	404	+ UChar selectorMtf[BZ_MAX_SELECTORS];
	405	+ UChar len [BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
	406	+
	407	+ Int32 limit [BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
	408	+ Int32 base [BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
	409	+ Int32 perm [BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
	410	+ Int32 minLens[BZ_N_GROUPS];
	411	+
	412	+ /* save area for scalars in the main decompress code */
	413	+ Int32 save_i;
	414	+ Int32 save_j;
	415	+ Int32 save_t;
	416	+ Int32 save_alphaSize;
	417	+ Int32 save_nGroups;
	418	+ Int32 save_nSelectors;
	419	+ Int32 save_EOB;
	420	+ Int32 save_groupNo;
	421	+ Int32 save_groupPos;
	422	+ Int32 save_nextSym;
	423	+ Int32 save_nblockMAX;
	424	+ Int32 save_nblock;
	425	+ Int32 save_es;
	426	+ Int32 save_N;
	427	+ Int32 save_curr;
	428	+ Int32 save_zt;
	429	+ Int32 save_zn;
	430	+ Int32 save_zvec;
	431	+ Int32 save_zj;
	432	+ Int32 save_gSel;
	433	+ Int32 save_gMinlen;
	434	+ Int32* save_gLimit;
	435	+ Int32* save_gBase;
	436	+ Int32* save_gPerm;
	437	+
	438	+ }
	439	+ DState;
	440	+
	441	+
	442	+
	443	+/*-- Macros for decompression. --*/
	444	+
	445	+#define BZ_GET_FAST(cccc) \
	446	+ /* c_tPos is unsigned, hence test < 0 is pointless. */ \
	447	+ if (s->tPos >= (UInt32)100000 * (UInt32)s->blockSize100k) return True; \
	448	+ s->tPos = s->tt[s->tPos]; \
	449	+ cccc = (UChar)(s->tPos & 0xff); \
	450	+ s->tPos >>= 8;
	451	+
	452	+#define BZ_GET_FAST_C(cccc) \
	453	+ /* c_tPos is unsigned, hence test < 0 is pointless. */ \
	454	+ if (c_tPos >= (UInt32)100000 * (UInt32)ro_blockSize100k) return True; \
	455	+ c_tPos = c_tt[c_tPos]; \
	456	+ cccc = (UChar)(c_tPos & 0xff); \
	457	+ c_tPos >>= 8;
	458	+
	459	+#define SET_LL4(i,n) \
	460	+ { if (((i) & 0x1) == 0) \
	461	+ s->ll4[(i) >> 1] = (s->ll4[(i) >> 1] & 0xf0) \| (n); else \
	462	+ s->ll4[(i) >> 1] = (s->ll4[(i) >> 1] & 0x0f) \| ((n) << 4); \
	463	+ }
	464	+
	465	+#define GET_LL4(i) \
	466	+ ((((UInt32)(s->ll4[(i) >> 1])) >> (((i) << 2) & 0x4)) & 0xF)
	467	+
	468	+#define SET_LL(i,n) \
	469	+ { s->ll16[i] = (UInt16)(n & 0x0000ffff); \
	470	+ SET_LL4(i, n >> 16); \
	471	+ }
	472	+
	473	+#define GET_LL(i) \
	474	+ (((UInt32)s->ll16[i]) \| (GET_LL4(i) << 16))
	475	+
	476	+#define BZ_GET_SMALL(cccc) \
	477	+ /* c_tPos is unsigned, hence test < 0 is pointless. */ \
	478	+ if (s->tPos >= (UInt32)100000 * (UInt32)s->blockSize100k) return True; \
	479	+ cccc = BZ2_indexIntoF ( s->tPos, s->cftab ); \
	480	+ s->tPos = GET_LL(s->tPos);
	481	+
	482	+
	483	+/*-- externs for decompression. --*/
	484	+
	485	+extern Int32
	486	+BZ2_indexIntoF ( Int32, Int32* );
	487	+
	488	+extern Int32
	489	+BZ2_decompress ( DState* );
	490	+
	491	+extern void
	492	+BZ2_hbCreateDecodeTables ( Int32, Int32, Int32, UChar,
	493	+ Int32, Int32, Int32 );
	494	+
	495	+
	496	+#endif
	497	+
	498	+
	499	+/*-- BZ_NO_STDIO seems to make NULL disappear on some platforms. --*/
	500	+
	501	+#ifdef BZ_NO_STDIO
	502	+#ifndef NULL
	503	+#define NULL 0
	504	+#endif
	505	+#endif
	506	+
	507	+
	508	+/*-------------------------------------------------------------*/
	509	+/*--- end bzlib_private.h ---*/
	510	+/*-------------------------------------------------------------*/
Property changes on: branches/ariel/xmldumps-backup/mwbzutils/bzlib_private.h
___________________________________________________________________
Added: svn:eol-style
1	511	+ native
Index: branches/ariel/xmldumps-backup/mwbzutils/bzlibfuncs.c
—	—	@@ -0,0 +1,218 @@
	2	+#include <unistd.h>
	3	+#include <stdio.h>
	4	+#include <string.h>
	5	+#include <sys/types.h>
	6	+#include <sys/stat.h>
	7	+#include <fcntl.h>
	8	+#include <stdlib.h>
	9	+#include <errno.h>
	10	+#include <sys/types.h>
	11	+#include <regex.h>
	12	+#include "bzlib_private.h"
	13	+#include "bzlib.h"
	14	+
	15	+/*---------------------------------------------------*/
	16	+/* Return True iff data corruption is discovered.
	17	+ Returns False if there is no problem.
	18	+*/
	19	+Bool unRLE_obuf_to_output_FAST ( DState* s )
	20	+{
	21	+ UChar k1;
	22	+
	23	+ if (s->blockRandomised) {
	24	+
	25	+ while (True) {
	26	+ /* try to finish existing run */
	27	+ while (True) {
	28	+ if (s->strm->avail_out == 0) return False;
	29	+ if (s->state_out_len == 0) break;
	30	+ ( (UChar)(s->strm->next_out) ) = s->state_out_ch;
	31	+ BZ_UPDATE_CRC ( s->calculatedBlockCRC, s->state_out_ch );
	32	+ s->state_out_len--;
	33	+ s->strm->next_out++;
	34	+ s->strm->avail_out--;
	35	+ s->strm->total_out_lo32++;
	36	+ if (s->strm->total_out_lo32 == 0) s->strm->total_out_hi32++;
	37	+ }
	38	+
	39	+ /* can a new run be started? */
	40	+ if (s->nblock_used == s->save_nblock+1) return False;
	41	+
	42	+ /* Only caused by corrupt data stream? */
	43	+ if (s->nblock_used > s->save_nblock+1)
	44	+ return True;
	45	+
	46	+ s->state_out_len = 1;
	47	+ s->state_out_ch = s->k0;
	48	+ BZ_GET_FAST(k1); BZ_RAND_UPD_MASK;
	49	+ k1 ^= BZ_RAND_MASK; s->nblock_used++;
	50	+ if (s->nblock_used == s->save_nblock+1) continue;
	51	+ if (k1 != s->k0) { s->k0 = k1; continue; };
	52	+
	53	+ s->state_out_len = 2;
	54	+ BZ_GET_FAST(k1); BZ_RAND_UPD_MASK;
	55	+ k1 ^= BZ_RAND_MASK; s->nblock_used++;
	56	+ if (s->nblock_used == s->save_nblock+1) continue;
	57	+ if (k1 != s->k0) { s->k0 = k1; continue; };
	58	+
	59	+ s->state_out_len = 3;
	60	+ BZ_GET_FAST(k1); BZ_RAND_UPD_MASK;
	61	+ k1 ^= BZ_RAND_MASK; s->nblock_used++;
	62	+ if (s->nblock_used == s->save_nblock+1) continue;
	63	+ if (k1 != s->k0) { s->k0 = k1; continue; };
	64	+
	65	+ BZ_GET_FAST(k1); BZ_RAND_UPD_MASK;
	66	+ k1 ^= BZ_RAND_MASK; s->nblock_used++;
	67	+ s->state_out_len = ((Int32)k1) + 4;
	68	+ BZ_GET_FAST(s->k0); BZ_RAND_UPD_MASK;
	69	+ s->k0 ^= BZ_RAND_MASK; s->nblock_used++;
	70	+ }
	71	+
	72	+ } else {
	73	+
	74	+ /* restore */
	75	+ UInt32 c_calculatedBlockCRC = s->calculatedBlockCRC;
	76	+ UChar c_state_out_ch = s->state_out_ch;
	77	+ Int32 c_state_out_len = s->state_out_len;
	78	+ Int32 c_nblock_used = s->nblock_used;
	79	+ Int32 c_k0 = s->k0;
	80	+ UInt32* c_tt = s->tt;
	81	+ UInt32 c_tPos = s->tPos;
	82	+ char* cs_next_out = s->strm->next_out;
	83	+ unsigned int cs_avail_out = s->strm->avail_out;
	84	+ Int32 ro_blockSize100k = s->blockSize100k;
	85	+ /* end restore */
	86	+
	87	+ UInt32 avail_out_INIT = cs_avail_out;
	88	+ Int32 s_save_nblockPP = s->save_nblock+1;
	89	+ unsigned int total_out_lo32_old;
	90	+
	91	+ while (True) {
	92	+
	93	+ /* try to finish existing run */
	94	+ if (c_state_out_len > 0) {
	95	+ while (True) {
	96	+ if (cs_avail_out == 0) goto return_notr;
	97	+ if (c_state_out_len == 1) break;
	98	+ ( (UChar)(cs_next_out) ) = c_state_out_ch;
	99	+ BZ_UPDATE_CRC ( c_calculatedBlockCRC, c_state_out_ch );
	100	+ c_state_out_len--;
	101	+ cs_next_out++;
	102	+ cs_avail_out--;
	103	+ }
	104	+ s_state_out_len_eq_one:
	105	+ {
	106	+ if (cs_avail_out == 0) {
	107	+ c_state_out_len = 1; goto return_notr;
	108	+ };
	109	+ ( (UChar)(cs_next_out) ) = c_state_out_ch;
	110	+ BZ_UPDATE_CRC ( c_calculatedBlockCRC, c_state_out_ch );
	111	+ cs_next_out++;
	112	+ cs_avail_out--;
	113	+ }
	114	+ }
	115	+ /* Only caused by corrupt data stream? */
	116	+ if (c_nblock_used > s_save_nblockPP)
	117	+ return True;
	118	+
	119	+ /* can a new run be started? */
	120	+ if (c_nblock_used == s_save_nblockPP) {
	121	+ c_state_out_len = 0; goto return_notr;
	122	+ };
	123	+ c_state_out_ch = c_k0;
	124	+ BZ_GET_FAST_C(k1); c_nblock_used++;
	125	+ if (k1 != c_k0) {
	126	+ c_k0 = k1; goto s_state_out_len_eq_one;
	127	+ };
	128	+ if (c_nblock_used == s_save_nblockPP)
	129	+ goto s_state_out_len_eq_one;
	130	+
	131	+ c_state_out_len = 2;
	132	+ BZ_GET_FAST_C(k1); c_nblock_used++;
	133	+ if (c_nblock_used == s_save_nblockPP) continue;
	134	+ if (k1 != c_k0) { c_k0 = k1; continue; };
	135	+
	136	+ c_state_out_len = 3;
	137	+ BZ_GET_FAST_C(k1); c_nblock_used++;
	138	+ if (c_nblock_used == s_save_nblockPP) continue;
	139	+ if (k1 != c_k0) { c_k0 = k1; continue; };
	140	+
	141	+ BZ_GET_FAST_C(k1); c_nblock_used++;
	142	+ c_state_out_len = ((Int32)k1) + 4;
	143	+ BZ_GET_FAST_C(c_k0); c_nblock_used++;
	144	+ }
	145	+
	146	+ return_notr:
	147	+ total_out_lo32_old = s->strm->total_out_lo32;
	148	+ s->strm->total_out_lo32 += (avail_out_INIT - cs_avail_out);
	149	+ if (s->strm->total_out_lo32 < total_out_lo32_old)
	150	+ s->strm->total_out_hi32++;
	151	+
	152	+ /* save */
	153	+ s->calculatedBlockCRC = c_calculatedBlockCRC;
	154	+ s->state_out_ch = c_state_out_ch;
	155	+ s->state_out_len = c_state_out_len;
	156	+ s->nblock_used = c_nblock_used;
	157	+ s->k0 = c_k0;
	158	+ s->tt = c_tt;
	159	+ s->tPos = c_tPos;
	160	+ s->strm->next_out = cs_next_out;
	161	+ s->strm->avail_out = cs_avail_out;
	162	+ /* end save */
	163	+ }
	164	+ return False;
	165	+}
	166	+
	167	+int BZ_API(BZ2_bzDecompress_mine) ( bz_stream *strm )
	168	+{
	169	+ Bool corrupt;
	170	+ DState* s;
	171	+ if (strm == NULL) return BZ_PARAM_ERROR;
	172	+ s = strm->state;
	173	+ if (s == NULL) return BZ_PARAM_ERROR;
	174	+ if (s->strm != strm) return BZ_PARAM_ERROR;
	175	+
	176	+ while (True) {
	177	+ if (s->state == BZ_X_IDLE) return BZ_SEQUENCE_ERROR;
	178	+ if (s->state == BZ_X_OUTPUT) {
	179	+ /* if (s->smallDecompress)
	180	+ corrupt = unRLE_obuf_to_output_SMALL ( s ); else
	181	+ corrupt = unRLE_obuf_to_output_FAST ( s ); */
	182	+
	183	+ corrupt = unRLE_obuf_to_output_FAST ( s );
	184	+ if (corrupt) return BZ_DATA_ERROR;
	185	+ if (s->nblock_used == s->save_nblock+1 && s->state_out_len == 0) {
	186	+ BZ_FINALISE_CRC ( s->calculatedBlockCRC );
	187	+ if (s->verbosity >= 3)
	188	+ VPrintf2 ( " {0x%08x, 0x%08x}", s->storedBlockCRC,
	189	+ s->calculatedBlockCRC );
	190	+ if (s->verbosity >= 2) VPrintf0 ( "]" );
	191	+ if (s->calculatedBlockCRC != s->storedBlockCRC)
	192	+ return BZ_DATA_ERROR;
	193	+ s->calculatedCombinedCRC
	194	+ = (s->calculatedCombinedCRC << 1) \|
	195	+ (s->calculatedCombinedCRC >> 31);
	196	+ s->calculatedCombinedCRC ^= s->calculatedBlockCRC;
	197	+ s->state = BZ_X_BLKHDR_1;
	198	+ } else {
	199	+ return BZ_OK;
	200	+ }
	201	+ }
	202	+ if (s->state >= BZ_X_MAGIC_1) {
	203	+ Int32 r = BZ2_decompress ( s );
	204	+ if (r == BZ_STREAM_END) {
	205	+ if (s->verbosity >= 3)
	206	+ VPrintf2 ( "\n combined CRCs: stored = 0x%08x, computed = 0x%08x",
	207	+ s->storedCombinedCRC, s->calculatedCombinedCRC );
	208	+ /* if (s->calculatedCombinedCRC != s->storedCombinedCRC)
	209	+ return BZ_DATA_ERROR; */
	210	+ return r;
	211	+ }
	212	+ if (s->state != BZ_X_OUTPUT) return r;
	213	+ }
	214	+ }
	215	+
	216	+ AssertH ( 0, 6001 );
	217	+
	218	+ return 0; /NOTREACHED/
	219	+}
Property changes on: branches/ariel/xmldumps-backup/mwbzutils/bzlibfuncs.c
___________________________________________________________________
Added: svn:eol-style
1	220	+ native
Index: branches/ariel/xmldumps-backup/mwbzutils/checkforbz2footer.c
—	—	@@ -6,6 +6,7 @@
7	7	#include <fcntl.h>
8	8	#include <stdlib.h>
9	9	#include <errno.h>
	10	+#include "mwbzutils.h"
10	11
11	12	/*
12	13	Check to see whether a file ends with a bz2 footer or not
—	—	@@ -22,123 +23,11 @@
23	24	*/
24	25
25	26
26		~~-int read_footer(unsigned char *buffer, int fin) {~~
27		~~- int res;~~
28		-
29		~~- res = lseek(fin, -11, SEEK_END);~~
30		~~- if (res == -1) {~~
31		~~- fprintf(stderr,"lseek of file failed\n");~~
32		~~- exit(-1);~~
33		~~- }~~
34		~~- res = read(fin, buffer, 11);~~
35		~~- if (res == -1) {~~
36		~~- fprintf(stderr,"read of file failed\n");~~
37		~~- exit(-1);~~
38		~~- }~~
39		~~- return(0);~~
40		-}
41		-
42		~~-#define LEFT 0~~
43		~~-#define RIGHT 1~~
44		-
45		~~-/* return n ones either at left or right end */~~
46		~~-int bitmask(int numbits, int end) {~~
47		~~- if (end == RIGHT) {~~
48		~~- return((1<<numbits)-1);~~
49		~~- }~~
50		~~- else {~~
51		~~- return(((1<<numbits)-1) << (8-numbits));~~
52		~~- }~~
53		-}
54		-
55		~~-void shiftbytesright(unsigned char *buffer, int buflen, int numbits) {~~
56		~~- int i;~~
57		-
58		~~- for (i=buflen-1; i>=0; i--) {~~
59		~~- /* right 1 */~~
60		~~- buffer[i] = (unsigned char) ((int) (buffer[i]) >> numbits);~~
61		-
62		~~- /* grab rightmost from prev byte */~~
63		~~- if (i > 0) {~~
64		~~- buffer[i] = ( unsigned char ) ((unsigned int) buffer[i] \| ( ((unsigned int) (buffer[i-1])<<(8-numbits)) & bitmask(1,LEFT)));~~
65		~~- }~~
66		~~- }~~
67		-}
68		-
69		~~-/* buff1 is some random bytes, buff2 is some random bytes which we expect to start with the contents of buff1,~~
70		~~- both buffers are bit-shifted to the right "bitsrightshifted". this function compares the two and returns 1 if buff2~~
71		~~- matches and 0 otherwise. */~~
72		~~-int bytescompare(unsigned char buff1, unsigned char buff2, int numbytes, int bitsrightshifted) {~~
73		~~- int i;~~
74		-
75		~~- if (bitsrightshifted == 0) {~~
76		~~- for (i = 0; i< numbytes; i++) {~~
77		~~- if (buff1[i] != buff2[i]) {~~
78		~~- return(1);~~
79		~~- }~~
80		~~- }~~
81		~~- return(0);~~
82		~~- }~~
83		~~- else {~~
84		~~- for (i = 1; i< numbytes-2; i++) {~~
85		~~- if (buff1[i] != buff2[i]) {~~
86		~~- return(1);~~
87		~~- }~~
88		~~- }~~
89		~~- /* do leftmost byte */~~
90		~~- if ((buff1[0] & bitmask(8-bitsrightshifted,RIGHT)) != (buff2[0] & bitmask(8-bitsrightshifted,RIGHT)) ) {~~
91		~~- return(1);~~
92		~~- }~~
93		~~- /* do rightmost byte */~~
94		~~- if ((buff1[numbytes-1] & bitmask(bitsrightshifted,LEFT)) != (buff2[numbytes-1] & bitmask(bitsrightshifted,LEFT)) ) {~~
95		~~- return(1);~~
96		~~- }~~
97		~~- return(0);~~
98		~~- }~~
99		-}
100		-
101		~~-int checkfileforfooter(int fin) {~~
102		~~- unsigned char buffer[11];~~
103		~~- int result, i;~~
104		~~- unsigned char *footer = malloc(8sizeof(unsigned char *));~~
105		-
106		~~- /* set up footer plus its various right-shifted incarnations */~~
107		~~- /* dude why couldn't you have 0 padded each bzip2 block? seriously ... */~~
108		~~- for (i = 0; i< 8; i++) {~~
109		~~- footer[i] = malloc(sizeof(unsigned char)*7);~~
110		~~- }~~
111		~~- footer[0][0]= (unsigned char) 0x17;~~
112		~~- footer[0][1]= (unsigned char) 0x72;~~
113		~~- footer[0][2]= (unsigned char) 0x45;~~
114		~~- footer[0][3]= (unsigned char) 0x38;~~
115		~~- footer[0][4]= (unsigned char) 0x50;~~
116		~~- footer[0][5]= (unsigned char) 0x90;~~
117		~~- footer[0][6]= (unsigned char) 0x00;~~
118		~~- for (i = 1; i< 8; i++) {~~
119		~~- memcpy((char )(footer[i]), (char )(footer[i-1]),7);~~
120		~~- shiftbytesright(footer[i],7,1);~~
121		~~- }~~
122		-
123		~~- read_footer(buffer,fin);~~
124		-
125		~~- result = bytescompare(footer[0],buffer+1,6,0);~~
126		~~- if (!result) {~~
127		~~- return(0);~~
128		~~- }~~
129		-
130		~~- for (i=1; i<8; i++) {~~
131		~~- result = bytescompare(footer[i],buffer,7,i);~~
132		~~- if (!result) {~~
133		~~- return(0);~~
134		~~- }~~
135		~~- }~~
136		~~- return(1);~~
137		-}
138		-
139	27	int main(int argc, char **argv) {
140	28
141	29	int fin;
142	30	int result;
	31	+ bz_info_t bfile;
143	32
144	33	if (argc != 2) {
145	34	fprintf(stderr,"usage: %s infile\n", argv[0]);
—	—	@@ -149,7 +38,9 @@
150	39	fprintf(stderr,"failed to open file %s for read\n", argv[1]);
151	40	exit(-1);
152	41	}
153		~~- result = checkfileforfooter(fin);~~
	42	+
	43	+ bfile.footer = init_footer();
	44	+ result = check_file_for_footer(fin, &bfile);
154	45	close(fin);
155	46	exit(result);
156	47	}
Index: branches/ariel/xmldumps-backup/mwbzutils/dumplastbz2block.c
—	—	@@ -6,8 +6,9 @@
7	7	#include <fcntl.h>
8	8	#include <stdlib.h>
9	9	#include <errno.h>
10		~~-#include "bzlib.h"~~
	10	+#include "mwbzutils.h"
11	11
	12	+
12	13	/*
13	14	Find the last bz2 block marker in a file
14	15	and dump whatever can be decompressed after
—	—	@@ -24,439 +25,73 @@
25	26	1 if decompression fails, and -1 on error.
26	27	*/
27	28
28		~~-#define BUFSIZE 121072~~
29		~~-typedef struct {~~
30		~~- unsigned char bufin[BUFSIZE];~~
31		~~- unsigned char bufout[BUFSIZE];~~
32		~~- int bufsize;~~
33		~~- bz_stream strm;~~
34		~~- unsigned char overflow;~~
35		~~- int bitsshifted;~~
36		~~- int position;~~
37		~~-} bzinfo;~~
38		-
39		~~-int read_footer(unsigned char *buffer, int fin) {~~
40		~~- int res;~~
41		-
42		~~- res = lseek(fin, -11, SEEK_END);~~
43		~~- if (res == -1) {~~
44		~~- fprintf(stderr,"lseek of file failed\n");~~
45		~~- exit(-1);~~
46		~~- }~~
47		~~- res = read(fin, buffer, 11);~~
48		~~- if (res == -1) {~~
49		~~- fprintf(stderr,"read of file failed\n");~~
50		~~- exit(-1);~~
51		~~- }~~
52		~~- return(0);~~
53		-}
54		-
55		~~-#define LEFT 0~~
56		~~-#define RIGHT 1~~
57		-
58		~~-/* return n ones either at left or right end */~~
59		~~-int bitmask(int numbits, int end) {~~
60		~~- if (end == RIGHT) {~~
61		~~- return((1<<numbits)-1);~~
62		~~- }~~
63		~~- else {~~
64		~~- return(((1<<numbits)-1) << (8-numbits));~~
65		~~- }~~
66		-}
67		-
68		~~-void shiftbytesleft(unsigned char *buffer, int buflen, int numbits) {~~
69		~~- int i;~~
70		-
71		~~- if (numbits == 0) {~~
72		~~- return;~~
73		~~- }~~
74		-
75		~~- for (i=0; i<buflen; i++) {~~
76		~~- /* left 1 */~~
77		~~- buffer[i] = (unsigned char) ((int) (buffer[i]) << numbits);~~
78		-
79		~~- /* grab leftmost from next byte */~~
80		~~- if (i < buflen-1) {~~
81		~~- buffer[i] = ( unsigned char ) ( (unsigned int) buffer[i] \| ( ( ((unsigned int) buffer[i+1]) & bitmask(numbits,LEFT) ) >> (8-numbits) ) );~~
82		~~- }~~
83		~~- }~~
84		-}
85		-
86		-
87		~~-void shiftbytesright(unsigned char *buffer, int buflen, int numbits) {~~
88		~~- int i;~~
89		-
90		~~- for (i=buflen-1; i>=0; i--) {~~
91		~~- /* right 1 */~~
92		~~- buffer[i] = (unsigned char) ((int) (buffer[i]) >> numbits);~~
93		-
94		~~- /* grab rightmost from prev byte */~~
95		~~- if (i > 0) {~~
96		~~- buffer[i] = ( unsigned char ) ((unsigned int) buffer[i] \| ( ((unsigned int) (buffer[i-1])<<(8-numbits)) & bitmask(numbits,LEFT)));~~
97		~~- }~~
98		~~- }~~
99		-}
100		-
101		~~-unsigned char ** init_marker() {~~
102		~~- unsigned char *marker = malloc(8sizeof(unsigned char *));~~
103		~~- int i;~~
104		-
105		~~- /* set up block marker plus its various right-shifted incarnations */~~
106		~~- for (i = 0; i< 8; i++) {~~
107		~~- marker[i] = malloc(sizeof(unsigned char)*7);~~
108		~~- }~~
109		~~- marker[0][0]= (unsigned char) 0x31;~~
110		~~- marker[0][1]= (unsigned char) 0x41;~~
111		~~- marker[0][2]= (unsigned char) 0x59;~~
112		~~- marker[0][3]= (unsigned char) 0x26;~~
113		~~- marker[0][4]= (unsigned char) 0x53;~~
114		~~- marker[0][5]= (unsigned char) 0x59;~~
115		~~- marker[0][6]= (unsigned char) 0x00;~~
116		~~- for (i = 1; i< 8; i++) {~~
117		~~- memcpy((char )(marker[i]), (char )(marker[i-1]),7);~~
118		~~- shiftbytesright(marker[i],7,1);~~
119		~~- }~~
120		~~- return(marker);~~
121		-}
122		-
123		~~-unsigned char ** init_footer() {~~
124		~~- unsigned char *footer = malloc(8sizeof(unsigned char *));~~
125		~~- int i;~~
126		-
127		~~- /* set up footer plus its various right-shifted incarnations */~~
128		~~- /* dude why couldn't you have 0 padded each bzip2 block? seriously ... */~~
129		~~- for (i = 0; i< 8; i++) {~~
130		~~- footer[i] = malloc(sizeof(unsigned char)*7);~~
131		~~- }~~
132		~~- footer[0][0]= (unsigned char) 0x17;~~
133		~~- footer[0][1]= (unsigned char) 0x72;~~
134		~~- footer[0][2]= (unsigned char) 0x45;~~
135		~~- footer[0][3]= (unsigned char) 0x38;~~
136		~~- footer[0][4]= (unsigned char) 0x50;~~
137		~~- footer[0][5]= (unsigned char) 0x90;~~
138		~~- footer[0][6]= (unsigned char) 0x00;~~
139		~~- for (i = 1; i< 8; i++) {~~
140		~~- memcpy((char )(footer[i]), (char )(footer[i-1]),7);~~
141		~~- shiftbytesright(footer[i],7,1);~~
142		~~- }~~
143		~~- return(footer);~~
144		-}
145		-
146		-
147		~~-/* buff1 is some random bytes, buff2 is some random bytes which we expect to start with the contents of buff1,~~
148		~~- both buffers are bit-shifted to the right "bitsrightshifted". this function compares the two and returns 1 if buff2~~
149		~~- matches and 0 otherwise. */~~
150		~~-int bytescompare(unsigned char buff1, unsigned char buff2, int numbytes, int bitsrightshifted) {~~
151		~~- int i;~~
152		-
153		~~- if (bitsrightshifted == 0) {~~
154		~~- for (i = 0; i< numbytes; i++) {~~
155		~~- if (buff1[i] != buff2[i]) {~~
156		~~- return(1);~~
157		~~- }~~
158		~~- }~~
159		~~- return(0);~~
160		~~- }~~
161		~~- else {~~
162		~~- for (i = 1; i< numbytes-2; i++) {~~
163		~~- if (buff1[i] != buff2[i]) {~~
164		~~- return(1);~~
165		~~- }~~
166		~~- }~~
167		~~- /* do leftmost byte */~~
168		~~- if ((buff1[0] & bitmask(8-bitsrightshifted,RIGHT)) != (buff2[0] & bitmask(8-bitsrightshifted,RIGHT)) ) {~~
169		~~- return(1);~~
170		~~- }~~
171		~~- /* do rightmost byte */~~
172		~~- if ((buff1[numbytes-1] & bitmask(bitsrightshifted,LEFT)) != (buff2[numbytes-1] & bitmask(bitsrightshifted,LEFT)) ) {~~
173		~~- return(1);~~
174		~~- }~~
175		~~- return(0);~~
176		~~- }~~
177		-}
178		-
179		~~-/* return -1 if no match~~
180		~~- return number of bits rightshifted otherwise */~~
181		~~-int checkfileforfooter(int fin, unsigned char **footer) {~~
182		~~- unsigned char buffer[11];~~
183		~~- int result, i;~~
184		-
185		~~- read_footer(buffer,fin);~~
186		-
187		~~- result = bytescompare(footer[0],buffer+1,6,0);~~
188		~~- if (!result) {~~
189		~~- return(0);~~
190		~~- }~~
191		-
192		~~- for (i=1; i<8; i++) {~~
193		~~- result = bytescompare(footer[i],buffer,7,i);~~
194		~~- if (!result) {~~
195		~~- return(i);~~
196		~~- }~~
197		~~- }~~
198		~~- return(-1);~~
199		-}
200		-
201		~~-/* return -1 if no match~~
202		~~- return number of bits rightshifted otherwise */~~
203		~~-int checkbufferforblockmarker(unsigned char buffer, unsigned char *marker) {~~
204		~~- int result, i;~~
205		-
206		~~- result = bytescompare(marker[0],buffer+1,6,0);~~
207		~~- if (!result) {~~
208		~~- return(0);~~
209		~~- }~~
210		~~- for (i=1; i<8; i++) {~~
211		~~- result = bytescompare(marker[i],buffer,7,i);~~
212		~~- if (!result) {~~
213		~~- return(i);~~
214		~~- }~~
215		~~- }~~
216		~~- return(-1);~~
217		-}
218		-
219		~~-void clearbuffer(unsigned char *buf, int length) {~~
220		~~- int i;~~
221		-
222		~~- for (i=0; i<length; i++) {~~
223		~~- buf[i]=0;~~
224		~~- }~~
225		~~- return;~~
226		-}
227		-
228		~~-int findnextmarker(int fin, int start_at, int position, unsigned char *marker, unsigned char buffer ) {~~
229		~~- int bitsshifted = -1;~~
230		~~- int result;~~
231		-
232		~~- /* must be after 4 byte file header, and we add a leftmost byte to the buffer~~
233		~~- of data read in case some bits have been shifted into it */~~
234		~~- while (*position >= 3 && bitsshifted < 0) {~~
235		~~- bitsshifted = checkbufferforblockmarker(buffer, marker);~~
236		~~- if (bitsshifted < 0) {~~
237		~~- (*start_at)++;~~
238		- /*
239		~~- if (*start_at % 10000 == 0) {~~
240		~~- fprintf(stderr, "starting at %d, position %d\n", start_at, position);~~
241		~~- }~~
242		~~- */~~
243		~~- position = lseek(fin, -1(*start_at), SEEK_END);~~
244		~~- if (*position == -1) {~~
245		~~- fprintf(stderr,"lseek of file failed\n");~~
246		~~- exit(-1);~~
247		~~- }~~
248		~~- result = read(fin, buffer, 7);~~
249		~~- if (result == -1) {~~
250		~~- fprintf(stderr,"read of file failed\n");~~
251		~~- exit(-1);~~
252		~~- }~~
253		~~- }~~
254		~~- else {~~
255		~~- return(bitsshifted);~~
256		~~- }~~
257		~~- }~~
258		~~- return(bitsshifted);~~
259		-}
260		-
261		~~-int init_decompress(bzinfo *bfile) {~~
262		~~- int bz_verbosity = 0;~~
263		~~- int bz_small = 0;~~
264		~~- int ret;~~
265		-
266		~~- bfile->strm.bzalloc = NULL;~~
267		~~- bfile->strm.bzfree = NULL;~~
268		~~- bfile->strm.opaque = NULL;~~
269		-
270		~~- ret = BZ2_bzDecompressInit ( &(bfile->strm), bz_verbosity, bz_small );~~
271		~~- if (ret != BZ_OK) {~~
272		~~- fprintf(stderr,"uncompress failed, err %d\n", ret);~~
273		~~- exit(-1);~~
274		~~- }~~
275		~~- return(ret);~~
276		-}
277		-
278		~~-int decompress_header(int fin, bzinfo *bfile) {~~
279		~~- int bytesread, ret;~~
280		~~- unsigned char header[4];~~
281		-
282		~~- lseek(fin,0,SEEK_SET);~~
283		~~- bytesread = read(fin, header, 4);~~
284		~~- if (bytesread < 4) {~~
285		~~- fprintf(stderr,"failed to read 4 bytes of header, exiting\n");~~
286		~~- exit(-1);~~
287		~~- }~~
288		~~- bfile->strm.next_in = (char *)header;~~
289		~~- bfile->strm.avail_in = 4;~~
290		-
291		~~- bfile->strm.next_out = (char *)(bfile->bufout);~~
292		~~- bfile->strm.avail_out = bfile->bufsize;~~
293		~~- ret = BZ2_bzDecompress ( &(bfile->strm) );~~
294		~~- if (BZ_OK != ret && BZ_STREAM_END != ret) {~~
295		~~- fprintf(stderr,"Corrupt bzip2 header, exiting\n");~~
296		~~- exit(-1);~~
297		~~- }~~
298		~~- return(ret);~~
299		-}
300		-
301		~~-int setup_first_buffer(int fin, bzinfo *bfile) {~~
302		~~- int bytesread, eof=0;~~
303		-
304		~~- if (bfile->bitsshifted == 0) {~~
305		~~- lseek(fin,bfile->position+1,SEEK_SET);~~
306		~~- }~~
307		~~- else {~~
308		~~- lseek(fin,bfile->position,SEEK_SET);~~
309		~~- }~~
310		~~- bytesread = read(fin, bfile->bufin, bfile->bufsize);~~
311		~~- if (bytesread > 0) {~~
312		~~- bfile->overflow = bfile->bufin[bytesread-1];~~
313		~~- shiftbytesleft(bfile->bufin,bytesread,bfile->bitsshifted);~~
314		-
315		~~- bfile->strm.next_in = (char *)(bfile->bufin);~~
316		~~- bfile->strm.avail_in = bytesread-1;~~
317		-
318		~~- bfile->strm.next_out = (char *)(bfile->bufout);~~
319		~~- bfile->strm.avail_out = bfile->bufsize;~~
320		~~- }~~
321		~~- if (bytesread <=0) {~~
322		~~- eof++;~~
323		~~- }~~
324		~~- return(eof);~~
325		-}
326		-
327		~~-int do_last_byte(bzinfo *bfile) {~~
328		~~- int ret=BZ_OK;~~
329		~~- int written;~~
330		-
331		~~- if (bfile->strm.avail_in == 0) {~~
332		~~- bfile->strm.next_in = (char *)(bfile->bufin);~~
333		~~- bfile->bufin[0] = bfile->overflow;~~
334		~~- shiftbytesleft(bfile->bufin,1,bfile->bitsshifted);~~
335		~~- bfile->strm.avail_in = 1;~~
336		~~- bfile->strm.next_out = (char *)(bfile->bufout);~~
337		~~- bfile->strm.avail_out = bfile->bufsize;~~
338		~~- ret = BZ2_bzDecompress ( &(bfile->strm) );~~
339		~~- if (BZ_OK == ret \|\| BZ_STREAM_END == ret) {~~
340		~~- written = fwrite(bfile->bufout, sizeof(unsigned char), (unsigned char *)bfile->strm.next_out - bfile->bufout, stdout);~~
341		~~- }~~
342		~~- }~~
343		~~- return(ret);~~
344		-}
345		-
346		~~-int read_next_buffer(int fin, bzinfo *bfile, int ret) {~~
347		~~- int bytesread, eof=0;~~
348		-
349		~~- /* fprintf(stderr," got return from decompress of %d\n", ret); */~~
350		-
351		~~- if (bfile->strm.avail_in == 0) {~~
352		~~- bfile->strm.next_in = (char *)(bfile->bufin);~~
353		~~- bfile->bufin[0] = bfile->overflow;~~
354		~~- bytesread = read(fin, bfile->bufin+1, bfile->bufsize-1);~~
355		~~- if (bytesread > 0) {~~
356		~~- bfile->overflow = bfile->bufin[bytesread];~~
357		~~- shiftbytesleft(bfile->bufin,bytesread+1,bfile->bitsshifted);~~
358		~~- bfile->strm.avail_in = bytesread;~~
359		~~- }~~
360		~~- else {~~
361		~~- eof++;~~
362		~~- bfile->strm.avail_in = 0;~~
363		~~- }~~
364		~~- }~~
365		~~- bfile->strm.next_out = (char *)(bfile->bufout);~~
366		~~- bfile->strm.avail_out = bfile->bufsize;~~
367		-
368		~~- return(eof);~~
369		-}
370		-
371		-
372	29	int main(int argc, char **argv) {
373	30
374		~~- bzinfo bfile;~~
	31	+ bz_info_t bfile;
375	32
376	33	int fin;
377		~~- int result, ret;~~
378		~~- unsigned char buffer[8];~~
	34	+ int result;
	35	+ buf_info_t *b;
379	36
380		~~- unsigned char **footer;~~
381		~~- unsigned char **marker;~~
	37	+ int firstblock = 1;
	38	+ int length = 5000; /* output buffer size */
382	39
383		~~- int written=0;~~
384		~~- int start_at;~~
385		-
386		~~- int eof = 0;~~
387		-
388	40	if (argc != 2) {
389	41	fprintf(stderr,"usage: %s infile\n", argv[0]);
390	42	exit(-1);
391	43	}
392	44
393		~~- marker = init_marker();~~
394		~~- footer = init_footer();~~
395		-
396	45	fin = open (argv[1], O_RDONLY);
397	46	if (fin < 0) {
398	47	fprintf(stderr,"failed to open file %s for read\n", argv[1]);
399	48	exit(-1);
400	49	}
401	50
402		~~- bfile.bufsize = BUFSIZE;~~
403		-
404		~~- result = checkfileforfooter(fin, footer);~~
	51	+ bfile.file_size = get_file_size(fin);
	52	+ bfile.footer = init_footer();
	53	+ result = check_file_for_footer(fin, &bfile);
405	54	if (result == -1) {
406		~~- start_at = 0;~~
	55	+ bfile.position = bfile.file_size;
407	56	}
408	57	else {
409		~~- start_at = 11; /* size of footer, perhaps with 1 byte extra */~~
	58	+ bfile.position = bfile.file_size - 11; /* size of footer, perhaps with 1 byte extra */
410	59	}
411		~~- start_at +=6; /* size of marker */~~
412		~~- bfile.position = lseek(fin, -1*start_at, SEEK_END);~~
413		~~- if (bfile.position == -1) {~~
414		~~- fprintf(stderr,"lseek of file failed\n");~~
415		~~- exit(-1);~~
416		~~- }~~
417		~~- result = read(fin, buffer, 7);~~
418		~~- if (result == -1) {~~
419		~~- fprintf(stderr,"read of file failed\n");~~
420		~~- exit(-1);~~
421		~~- }~~
	60	+ bfile.position -=6; /* size of marker */
	61	+ bfile.initialized = 0;
	62	+ b = init_buffer(length);
	63	+ bfile.bytes_read = 0;
422	64
423		~~- while (1) {~~
	65	+ /* init_bz2_file(&bfile, fin, BACKWARD); */
	66	+ firstblock = 1;
424	67
425		~~- bfile.bitsshifted = findnextmarker(fin, &start_at, &bfile.position, marker, buffer);~~
426		~~- if (bfile.bitsshifted >= 0) {~~
427		~~- /* fprintf(stderr, "found marker at pos %d and shifted %d, start_at is %d\n", bfile.position, bfile.bitsshifted, start_at); */~~
428		~~- ret = init_decompress(&bfile);~~
429		-
430		~~- /* pass in the header */~~
431		~~- ret = decompress_header(fin,&bfile);~~
432		-
433		~~- eof = setup_first_buffer(fin, &bfile);~~
434		-
435		~~- while (BZ_OK == ret && !eof) {~~
436		~~- ret = BZ2_bzDecompress ( &(bfile.strm) );~~
437		~~- if (BZ_OK == ret \|\| BZ_STREAM_END == ret) {~~
438		~~- written += fwrite(bfile.bufout, sizeof(unsigned char), (unsigned char *)(bfile.strm.next_out) - bfile.bufout, stdout);~~
439		~~- }~~
440		~~- eof = read_next_buffer(fin, &bfile, ret);~~
441		~~- }~~
442		~~- if (BZ_OK == ret \|\| BZ_STREAM_END == ret ) {~~
443		~~- /* so we read no bytes, process the last byte we held */~~
444		~~- do_last_byte(&bfile);~~
445		~~- }~~
446		~~- if (written == 0) {~~
447		~~- /* truncated block or other corruption, try going back one */~~
448		~~- start_at +=5;~~
449		~~- clearbuffer(buffer,sizeof(buffer));~~
450		~~- continue;~~
451		~~- }~~
452		~~- else {~~
453		~~- break;~~
454		~~- }~~
	68	+ if (find_first_bz2_block_from_offset(&bfile, fin, bfile.position, BACKWARD) <= 0) {
	69	+ fprintf(stderr,"failed to find block in bz2file\n");
	70	+ exit(-1);
	71	+ }
	72	+ while ((get_buffer_of_uncompressed_data(b, fin, &bfile, FORWARD)>=0) && (! bfile.eof) && (! bfile.position ==0)) {
	73	+ if (bfile.bytes_read) {
	74	+ fwrite(b->next_to_read,b->bytes_avail,1,stdout);
	75	+ b->next_to_read = b->end;
	76	+ b->bytes_avail = 0;
	77	+ b->next_to_fill = b->buffer; /* empty */
	78	+ bfile.strm.next_out = (char *)b->next_to_fill;
	79	+ bfile.strm.avail_out = b->end - b->next_to_fill;
	80	+ firstblock = 0;
455	81	}
456	82	else {
457		~~- fprintf(stderr,"no block marker in this file.\n");~~
	83	+ /* should never happen */
	84	+ fprintf(stderr,"there was a block but now it's gone, giving up\n");
458	85	exit(-1);
459	86	}
460	87	}
	88	+ if (b->bytes_avail) {
	89	+ fwrite(b->next_to_read,b->bytes_avail,1,stdout);
	90	+ b->next_to_read = b->end;
	91	+ b->bytes_avail = 0;
	92	+ b->next_to_fill = b->buffer; /* empty */
	93	+ bfile.strm.next_out = (char *)b->next_to_fill;
	94	+ bfile.strm.avail_out = b->end - b->next_to_fill;
	95	+ }
461	96	close(fin);
462	97	exit(0);
463	98	}
Index: branches/ariel/xmldumps-backup/mwbzutils/mwbzutils.h
—	—	@@ -0,0 +1,139 @@
	2	+#ifndef _MWBZUTILS_H
	3	+#define _MWBZUTILS_H
	4	+
	5	+#include "bzlib_private.h"
	6	+int BZ_API(BZ2_bzDecompress_mine) ( bz_stream *strm );
	7	+
	8	+typedef struct {
	9	+ int page_id; /* first id in the block */
	10	+ int bits_shifted; /* block is right shifted this many bits */
	11	+ int position; /* position in file of block */
	12	+} page_info_t;
	13	+
	14	+#define BUFINSIZE 5000
	15	+
	16	+/*
	17	+ keeps all information about a bzipped file
	18	+ plus input/output buffers for decompression
	19	+*/
	20	+typedef struct {
	21	+ unsigned char bufin[BUFINSIZE]; /* compressed data read from file */
	22	+ unsigned char bufout; / uncompressed data, must be allocated by caller */
	23	+ unsigned char marker_buffer[7]; /* data to test for bz2 block marker */
	24	+ unsigned char header_buffer[4]; /* first 4 bytes of file (bzip2 header) */
	25	+
	26	+ int bufin_size; /* size of input buffer for compressed data */
	27	+ int bufout_size; /* size of output buffer for decompressed data, may vary at each call */
	28	+
	29	+ int initialized; /* whether bz2file has been initialized (header processed, seek to
	30	+ some bz2 block in the file and input buffer filled) */
	31	+ int block_start; /* position of bz2 block in file from which we started to read (we
	32	+ read a sequence of bz2 blocks from a given position, this is
	33	+ the offset to the first one) */
	34	+
	35	+ bz_stream strm; /* stream structure for libbz2 */
	36	+ unsigned char overflow; /* since decompressed bytes may not be bit aligned, we keep the last byte
	37	+ read around so we can grab the lower end bits off the end for
	38	+ sticking in front of the next pile of compressed bytes we read */
	39	+
	40	+ int bits_shifted; /* number of bits that the compressed data has been right shifted
	41	+ in the file (if the number is 0, the block marker and subsequent
	42	+ data is byte-aligned) */
	43	+ unsigned char *marker; / bzip2 start of block marker, plus bit-shifted versions of it for
	44	+ locating the marker in a stream of compressed data */
	45	+ unsigned char *footer; / bzip2 end of stream footer, plus bit-shifted versions of it for
	46	+ locating the footer in a stream of compressed data */
	47	+
	48	+ int position; /* current offset into file from start of file */
	49	+
	50	+ int bytes_read; /* number of bytes of compressed data read from file (per read) */
	51	+ int bytes_written; /* number of bytes of decompressed data written into output buffer (per decompress) */
	52	+ int eof; /* nonzero if eof reached */
	53	+ int file_size; /* length of file, so we don't search past it for blocks */
	54	+} bz_info_t;
	55	+
	56	+#define MASKLEFT 0
	57	+#define MASKRIGHT 1
	58	+
	59	+/*
	60	+ this output buffer is used to collect decompressed output.
	61	+ this is not a circular buffer; when it is full the user is
	62	+ responsible for emptying it completely or partially and moving
	63	+ to the beginning any unused bytes.
	64	+
	65	+*/
	66	+typedef struct {
	67	+ unsigned char buffer; / output storage, allocated by the caller */
	68	+ unsigned char next_to_read; / pointer to the next byte in the buffer with data to be read */
	69	+ unsigned char next_to_fill; / pointer to the next byte in the buffer which is empty and can receive data */
	70	+ int bytes_avail; /* number of bytes available for reading */
	71	+ unsigned char end; / points to byte after end of buffer */
	72	+} buf_info_t;
	73	+
	74	+/*
	75	+ used for each iteration of narrowing down the location in a bzipped2 file of
	76	+ a desired pageid, by finding first compressed block after a guessed
	77	+ position and checking the first pageid (if any) contained in it.
	78	+*/
	79	+typedef struct {
	80	+ int left_end; /* left end of interval to search (bytes from start of file) */
	81	+ int right_end; /* right end of interval to search */
	82	+ int value_wanted; /* pageid desired */
	83	+ int last_value; /* pageid we found in last iteration */
	84	+ int last_position; /* position in file for last iteration */
	85	+} iter_info_t;
	86	+
	87	+int bit_mask(int numbits, int end);
	88	+
	89	+void shift_bytes_left(unsigned char *buffer, int buflen, int numbits);
	90	+
	91	+void shift_bytes_right(unsigned char *buffer, int buflen, int numbits);
	92	+
	93	+unsigned char ** init_marker();
	94	+
	95	+int bytes_compare(unsigned char buff1, unsigned char buff2, int numbytes, int bitsrightshifted);
	96	+
	97	+int check_buffer_for_bz2_block_marker(bz_info_t *bfile);
	98	+
	99	+#define FORWARD 1
	100	+#define BACKWARD 2
	101	+
	102	+int find_next_bz2_block_marker(int fin, bz_info_t *bfile, int direction);
	103	+
	104	+int init_decompress(bz_info_t *bfile);
	105	+
	106	+int decompress_header(int fin, bz_info_t *bfile);
	107	+
	108	+int setup_first_buffer_to_decompress(int fin, bz_info_t *bfile);
	109	+
	110	+int fill_buffer_to_decompress(int fin, bz_info_t *bfile, int ret);
	111	+
	112	+buf_info_t *init_buffer(int size);
	113	+
	114	+int buffer_is_empty(buf_info_t *b);
	115	+
	116	+int buffer_is_full(buf_info_t *b);
	117	+
	118	+int get_file_size(int fin);
	119	+
	120	+int init_bz2_file(bz_info_t *bfile, int fin, int direction);
	121	+
	122	+int get_and_decompress_data(bz_info_t bfile, int fin, unsigned char bufferout, int bufout_size, int direction);
	123	+
	124	+int get_buffer_of_uncompressed_data(buf_info_t b, int fin, bz_info_t bfile, int direction);
	125	+
	126	+void dump_buf_info(buf_info_t *b);
	127	+
	128	+int move_bytes_to_buffer_start(buf_info_t b, unsigned char fromwhere, int maxbytes);
	129	+
	130	+unsigned char ** init_footer();
	131	+
	132	+int read_footer(unsigned char *buffer, int fin);
	133	+
	134	+int check_file_for_footer(int fin, bz_info_t *bfile);
	135	+
	136	+void clear_buffer(unsigned char *buf, int length);
	137	+
	138	+int find_first_bz2_block_from_offset(bz_info_t *bfile, int fin, int position, int direction);
	139	+
	140	+#endif
Property changes on: branches/ariel/xmldumps-backup/mwbzutils/mwbzutils.h
___________________________________________________________________
Added: svn:eol-style
1	141	+ native
Index: branches/ariel/xmldumps-backup/mwbzutils/mwbzlib.c
—	—	@@ -0,0 +1,687 @@
	2	+#include <unistd.h>
	3	+#include <stdio.h>
	4	+#include <string.h>
	5	+#include <sys/types.h>
	6	+#include <sys/stat.h>
	7	+#include <fcntl.h>
	8	+#include <stdlib.h>
	9	+#include <errno.h>
	10	+#include <sys/types.h>
	11	+#include <regex.h>
	12	+#include "bzlib.h"
	13	+#include "mwbzutils.h"
	14	+
	15	+
	16	+
	17	+/* return n ones either at left or right end */
	18	+int bit_mask(int numbits, int end) {
	19	+ if (end == MASKRIGHT) {
	20	+ return((1<<numbits)-1);
	21	+ }
	22	+ else {
	23	+ return(((1<<numbits)-1) << (8-numbits));
	24	+ }
	25	+}
	26	+
	27	+void shift_bytes_left(unsigned char *buffer, int buflen, int numbits) {
	28	+ int i;
	29	+
	30	+ if (numbits == 0) {
	31	+ return;
	32	+ }
	33	+
	34	+ for (i=0; i<buflen; i++) {
	35	+ /* left 1 */
	36	+ buffer[i] = (unsigned char) ((int) (buffer[i]) << numbits);
	37	+
	38	+ /* grab leftmost from next byte */
	39	+ if (i < buflen-1) {
	40	+ buffer[i] = ( unsigned char ) ( (unsigned int) buffer[i] \| ( ( ((unsigned int) buffer[i+1]) & bit_mask(numbits,MASKLEFT) ) >> (8-numbits) ) );
	41	+ }
	42	+ }
	43	+}
	44	+
	45	+
	46	+void shift_bytes_right(unsigned char *buffer, int buflen, int numbits) {
	47	+ int i;
	48	+
	49	+ for (i=buflen-1; i>=0; i--) {
	50	+ /* right 1 */
	51	+ buffer[i] = (unsigned char) ((int) (buffer[i]) >> numbits);
	52	+
	53	+ /* grab rightmost from prev byte */
	54	+ if (i > 0) {
	55	+ buffer[i] = ( unsigned char ) ((unsigned int) buffer[i] \| ( ((unsigned int) (buffer[i-1])<<(8-numbits)) & bit_mask(numbits,MASKLEFT)));
	56	+ }
	57	+ }
	58	+}
	59	+
	60	+unsigned char ** init_marker() {
	61	+ unsigned char *marker = malloc(8sizeof(unsigned char *));
	62	+ int i;
	63	+
	64	+ /* set up block marker plus its various right-shifted incarnations */
	65	+ for (i = 0; i< 8; i++) {
	66	+ marker[i] = malloc(sizeof(unsigned char)*7);
	67	+ }
	68	+ marker[0][0]= (unsigned char) 0x31;
	69	+ marker[0][1]= (unsigned char) 0x41;
	70	+ marker[0][2]= (unsigned char) 0x59;
	71	+ marker[0][3]= (unsigned char) 0x26;
	72	+ marker[0][4]= (unsigned char) 0x53;
	73	+ marker[0][5]= (unsigned char) 0x59;
	74	+ marker[0][6]= (unsigned char) 0x00;
	75	+ for (i = 1; i< 8; i++) {
	76	+ memcpy((char )(marker[i]), (char )(marker[i-1]),7);
	77	+ shift_bytes_right(marker[i],7,1);
	78	+ }
	79	+ return(marker);
	80	+}
	81	+
	82	+/* buff1 is some random bytes, buff2 is some random bytes which we expect to start with the contents of buff1,
	83	+ both buffers are bit-shifted to the right "bitsrightshifted". this function compares the two and returns 1 if buff2
	84	+ matches and 0 otherwise. */
	85	+int bytes_compare(unsigned char buff1, unsigned char buff2, int numbytes, int bitsrightshifted) {
	86	+ int i;
	87	+
	88	+ if (bitsrightshifted == 0) {
	89	+ for (i = 0; i< numbytes; i++) {
	90	+ if (buff1[i] != buff2[i]) {
	91	+ return(1);
	92	+ }
	93	+ }
	94	+ return(0);
	95	+ }
	96	+ else {
	97	+ for (i = 1; i< numbytes-2; i++) {
	98	+ if (buff1[i] != buff2[i]) {
	99	+ return(1);
	100	+ }
	101	+ }
	102	+ /* do leftmost byte */
	103	+ if ((buff1[0] & bit_mask(8-bitsrightshifted,MASKRIGHT)) != (buff2[0] & bit_mask(8-bitsrightshifted,MASKRIGHT)) ) {
	104	+ return(1);
	105	+ }
	106	+ /* do rightmost byte */
	107	+ if ((buff1[numbytes-1] & bit_mask(bitsrightshifted,MASKLEFT)) != (buff2[numbytes-1] & bit_mask(bitsrightshifted,MASKLEFT)) ) {
	108	+ return(1);
	109	+ }
	110	+ return(0);
	111	+ }
	112	+}
	113	+
	114	+/* return -1 if no match
	115	+ return number of bits rightshifted otherwise */
	116	+int check_buffer_for_bz2_block_marker(bz_info_t *bfile) {
	117	+ int result, i;
	118	+
	119	+ result = bytes_compare(bfile->marker[0],bfile->marker_buffer+1,6,0);
	120	+ if (!result) {
	121	+ return(0);
	122	+ }
	123	+ for (i=1; i<8; i++) {
	124	+ result = bytes_compare(bfile->marker[i],bfile->marker_buffer,7,i);
	125	+ if (!result) {
	126	+ return(i);
	127	+ }
	128	+ }
	129	+ return(-1);
	130	+}
	131	+
	132	+/* return: 1 if found, 0 if not, -1 on error */
	133	+int find_next_bz2_block_marker(int fin, bz_info_t *bfile, int direction) {
	134	+ int result;
	135	+
	136	+ bfile->bits_shifted = -1;
	137	+ result = read(fin, bfile->marker_buffer, 7);
	138	+ if (result == -1) {
	139	+ fprintf(stderr,"read of file failed\n");
	140	+ return(-1);
	141	+ }
	142	+ /* must be after 4 byte file header, and we add a leftmost byte to the buffer
	143	+ of data read in case some bits have been shifted into it */
	144	+ while (bfile->position <= bfile->file_size - 6 && bfile->position >= 0 && bfile->bits_shifted < 0) {
	145	+ bfile->bits_shifted = check_buffer_for_bz2_block_marker(bfile);
	146	+ if (bfile->bits_shifted < 0) {
	147	+ if (direction == FORWARD) {
	148	+ bfile->position++;
	149	+ }
	150	+ else {
	151	+ bfile->position--;
	152	+ }
	153	+ result = lseek(fin, (bfile->position), SEEK_SET);
	154	+ if (result == -1) {
	155	+ fprintf(stderr,"lseek of file to %ld failed (2)\n",(long int) bfile->position);
	156	+ return(-1);
	157	+ }
	158	+ result = read(fin, bfile->marker_buffer, 7);
	159	+ if (result < 7) {
	160	+ /* fprintf(stderr,"read of file failed\n"); */
	161	+ return(-1);
	162	+ }
	163	+ }
	164	+ else {
	165	+ bfile->block_start = bfile->position;
	166	+ return(1);
	167	+ }
	168	+ }
	169	+ return(0);
	170	+}
	171	+
	172	+/*
	173	+ initializes the bz2 strm structure,
	174	+ calls the BZ2 decompression library initializer
	175	+
	176	+ returns:
	177	+ BZ_OK on success
	178	+ various BZ_ errors on failure (see bzlib.h)
	179	+*/
	180	+int init_decompress(bz_info_t *bfile) {
	181	+ int bz_verbosity = 0;
	182	+ int bz_small = 0;
	183	+ int ret;
	184	+
	185	+ bfile->strm.bzalloc = NULL;
	186	+ bfile->strm.bzfree = NULL;
	187	+ bfile->strm.opaque = NULL;
	188	+
	189	+ ret = BZ2_bzDecompressInit ( &(bfile->strm), bz_verbosity, bz_small );
	190	+ if (ret != BZ_OK) {
	191	+ fprintf(stderr,"uncompress failed, err %d\n", ret);
	192	+ return(ret);
	193	+ }
	194	+ return(ret);
	195	+}
	196	+
	197	+/* FIXME do this right. whatever. */
	198	+int get_file_size(int fin) {
	199	+ int res;
	200	+
	201	+ res = lseek(fin, 0, SEEK_END);
	202	+ if (res == -1) {
	203	+ fprintf(stderr,"lseek of file to 0 failed (6)\n");
	204	+ return(-1);
	205	+ }
	206	+ return(res);
	207	+}
	208	+
	209	+/*
	210	+ reads the first 4 bytes from a bz2 file (should be
	211	+ "BZh" followed by the block size indicator, typically "9")
	212	+ and passes them into the BZ2 decompression library.
	213	+ This must be done before decompression of any block of the
	214	+ file is attempted.
	215	+
	216	+ returns:
	217	+ BZ_OK if successful,
	218	+ various BZ_ errors or -1 on failure (see bzlib.h)
	219	+*/
	220	+int decompress_header(int fin, bz_info_t *bfile) {
	221	+ int ret, res;
	222	+
	223	+ res = lseek(fin,0,SEEK_SET);
	224	+ if (res == -1) {
	225	+ fprintf(stderr,"lseek of file to 0 failed (3)\n");
	226	+ return(-1);
	227	+ }
	228	+ bfile->bytes_read = read(fin, bfile->header_buffer, 4);
	229	+ if (bfile->bytes_read < 4) {
	230	+ fprintf(stderr,"failed to read 4 bytes of header\n");
	231	+ return(-1);
	232	+ }
	233	+ bfile->strm.next_in = (char *)bfile->header_buffer;
	234	+ bfile->strm.avail_in = 4;
	235	+
	236	+ ret = BZ2_bzDecompress_mine ( &(bfile->strm) );
	237	+ if (BZ_OK != ret && BZ_STREAM_END != ret) {
	238	+ fprintf(stderr,"Corrupt bzip2 header\n");
	239	+ return(-1);
	240	+ }
	241	+ return(ret);
	242	+}
	243	+
	244	+/*
	245	+ seek to appropriate offset as specified in bfile,
	246	+ read compressed data into buffer indicated by bfile,
	247	+ update the bfile structure accordingly,
	248	+ save the overflow byte (bit-shifted data = suck)
	249	+ this is for the first buffer of data in a stream,
	250	+ for subsequent buffers use fill_buffer_to_decompress()
	251	+
	252	+ this will set bfile->eof on eof. no other indicator
	253	+ will be provided.
	254	+
	255	+ returns:
	256	+ 0 on success
	257	+ -1 on error
	258	+*/
	259	+int setup_first_buffer_to_decompress(int fin, bz_info_t *bfile) {
	260	+ int res;
	261	+
	262	+ if (bfile->bits_shifted == 0) {
	263	+ res = lseek(fin,bfile->position+1,SEEK_SET);
	264	+ if (res == -1) {
	265	+ fprintf(stderr,"lseek of file to %ld failed (4)\n",(long int) bfile->position+1);
	266	+ return(-1);
	267	+ }
	268	+ }
	269	+ else {
	270	+ res = lseek(fin,bfile->position,SEEK_SET);
	271	+ if (res == -1) {
	272	+ fprintf(stderr,"lseek of file to %ld failed (5)\n",(long int) bfile->position);
	273	+ return(-1);
	274	+ }
	275	+ }
	276	+ bfile->bytes_read = read(fin, bfile->bufin, bfile->bufin_size);
	277	+ if (bfile->bytes_read > 0) {
	278	+ bfile->overflow = bfile->bufin[bfile->bytes_read-1];
	279	+ shift_bytes_left(bfile->bufin, bfile->bytes_read, bfile->bits_shifted);
	280	+
	281	+ bfile->strm.next_in = (char *)(bfile->bufin);
	282	+ bfile->strm.avail_in = bfile->bytes_read-1;
	283	+ }
	284	+ if (bfile->bytes_read <=0) {
	285	+ bfile->eof++;
	286	+ }
	287	+ return(0);
	288	+}
	289	+
	290	+/*
	291	+ set up the marker, seek to right place, get first
	292	+ buffer of compressed data for processing
	293	+ bfile->position must be set to desired offset first by caller.
	294	+ returns:
	295	+ -1 if no marker or other error, position of next read if ok
	296	+*/
	297	+int init_bz2_file(bz_info_t *bfile, int fin, int direction) {
	298	+ int res;
	299	+
	300	+ bfile->bufin_size = BUFINSIZE;
	301	+ bfile->marker = init_marker();
	302	+ bfile->bytes_read = 0;
	303	+ bfile->bytes_written = 0;
	304	+ bfile->eof = 0;
	305	+
	306	+ bfile->initialized++;
	307	+
	308	+ bfile->file_size = get_file_size(fin);
	309	+ if (bfile->position > bfile->file_size) {
	310	+ fprintf(stderr,"asked for position past end of file\n");
	311	+ return(-1);
	312	+ }
	313	+ res = lseek(fin, bfile->position, SEEK_SET);
	314	+ if (res == -1) {
	315	+ fprintf(stderr,"lseek of file to %ld failed (7)\n",(long int) bfile->position);
	316	+ return(-1);
	317	+ }
	318	+
	319	+ find_next_bz2_block_marker(fin, bfile, direction);
	320	+ if (bfile->bits_shifted >= 0) {
	321	+ /* fprintf(stderr,"marker bits shifted by is %d\n",bfile->bits_shifted); */
	322	+ init_decompress(bfile);
	323	+ decompress_header(fin, bfile);
	324	+ setup_first_buffer_to_decompress(fin, bfile);
	325	+ return(0);
	326	+ }
	327	+ return(-1);
	328	+}
	329	+
	330	+
	331	+/*
	332	+ read compressed data into buffer indicated by bfile,
	333	+ from current position of file,
	334	+ stuffing the overflow byte in first.
	335	+ update the bfile structure accordingly
	336	+ save the new overflow byte (bit-shifted data = suck)
	337	+ this function is for decompression of buffers *after
	338	+ the first one*. for the first one use
	339	+ setup_first_buffer_to_decompress()
	340	+
	341	+ this will set bfile->eof on eof. no other indicator
	342	+ will be provided.
	343	+
	344	+ returns:
	345	+ 0 on success
	346	+ hmm, it really does not do anything about errors :-D
	347	+*/
	348	+int fill_buffer_to_decompress(int fin, bz_info_t *bfile, int ret) {
	349	+ if (bfile->strm.avail_in == 0) {
	350	+ bfile->strm.next_in = (char *)(bfile->bufin);
	351	+ bfile->bufin[0] = bfile->overflow;
	352	+ bfile->bytes_read = read(fin, bfile->bufin+1, bfile->bufin_size-1);
	353	+ if (bfile->bytes_read > 0) {
	354	+ bfile->position+=bfile->bytes_read;
	355	+ bfile->overflow = bfile->bufin[bfile->bytes_read];
	356	+ shift_bytes_left(bfile->bufin,bfile->bytes_read+1,bfile->bits_shifted);
	357	+ bfile->strm.avail_in = bfile->bytes_read;
	358	+ }
	359	+ else { /* bfile->bytes_read <= 0 */
	360	+ bfile->strm.avail_in = 1; /* the overflow byte */
	361	+ bfile->eof++;
	362	+ }
	363	+ }
	364	+ return(0);
	365	+}
	366	+
	367	+/* size of buffer is bytes usable. there will be a null byte at the end
	368	+
	369	+ what we do with the buffer:
	370	+ - read from front of buffer to end,
	371	+ - fill from point where prev read did not fill buffer, or from where
	372	+ move of data at end of buffer to beginning left room,
	373	+ - mark a string of bytes (starting from what's available to read) as "read"
	374	+
	375	+*/
	376	+buf_info_t *init_buffer(int size) {
	377	+ buf_info_t *b;
	378	+
	379	+ b = (buf_info_t *)malloc(sizeof(buf_info_t));
	380	+ b->buffer = malloc(sizeof(unsigned char)*(size+1));
	381	+ b->buffer[size]='\0';
	382	+ b->end = b->buffer + size;
	383	+ b->next_to_read = b->end; /* nothing available */
	384	+ b->bytes_avail = 0; /* bytes to read, nothing available */
	385	+ b->next_to_fill = b->buffer; /* empty */
	386	+ b->next_to_fill[0] = '\0';
	387	+ return(b);
	388	+}
	389	+
	390	+/* check if buffer (used for decompressed data output) is empty,
	391	+ returns 1 if so and 0 if not */
	392	+int buffer_is_empty(buf_info_t *b) {
	393	+ if (b->bytes_avail == 0) {
	394	+ return(1);
	395	+ }
	396	+ else {
	397	+ return(0);
	398	+ }
	399	+}
	400	+
	401	+/* check if buffer (used for decompressed data output) is full,
	402	+
	403	+ returns 1 if so and 0 if not
	404	+ I'm not liking this function so well, fixme */
	405	+int buffer_is_full(buf_info_t *b) {
	406	+ if (b->next_to_fill == b->end) {
	407	+ return(1);
	408	+ }
	409	+ else {
	410	+ return(0);
	411	+ }
	412	+}
	413	+
	414	+
	415	+/* get the next buffer of uncompressed stuff */
	416	+int get_and_decompress_data(bz_info_t bfile, int fin, unsigned char bufferout, int bufout_size, int direction) {
	417	+ int ret;
	418	+
	419	+ bfile->bufout = bufferout;
	420	+ bfile->bufout_size = bufout_size;
	421	+ bfile->bytes_written = 0;
	422	+
	423	+ if (! bfile->initialized) {
	424	+ if (init_bz2_file(bfile, fin, direction) == -1) {
	425	+ fprintf(stderr,"failed to initialize bz2file\n");
	426	+ return(-1);
	427	+ };
	428	+ bfile->strm.next_out = (char *)bfile->bufout;
	429	+ bfile->strm.avail_out = bfile->bufout_size;
	430	+ }
	431	+
	432	+ ret = BZ_OK;
	433	+ while (BZ_OK == ret && bfile->bytes_written == 0) {
	434	+ ret = BZ2_bzDecompress_mine ( &(bfile->strm) );
	435	+ /* FIXME testing only, does stuff actually get written or not? */
	436	+ /* if (BZ_OK == ret \|\| BZ_STREAM_END == ret \|\| BZ_DATA_ERROR == ret) { */
	437	+ if (BZ_OK == ret \|\| BZ_STREAM_END == ret) {
	438	+ bfile->bytes_written = (unsigned char *)(bfile->strm.next_out) - bfile->bufout;
	439	+ }
	440	+ else {
	441	+ fprintf(stderr,"error from BZ decompress %d (1)\n",ret);
	442	+ return(-1);
	443	+ }
	444	+ fill_buffer_to_decompress(fin, bfile, ret);
	445	+ /*
	446	+ if (bfile->eof && (BZ_OK == ret \|\| BZ_STREAM_END == ret) ) {
	447	+ fprintf(stderr,"eof reached\n");
	448	+ }
	449	+ */
	450	+ }
	451	+ if (ret == BZ_STREAM_END) {
	452	+ bfile->eof++;
	453	+ /* should we actually change the file position?
	454	+ bfile->position = bfile->filesize;
	455	+ lseek(fin,0,SEEK_END);
	456	+ */
	457	+ }
	458	+ return(0);
	459	+}
	460	+
	461	+/*
	462	+ fill output buffer in b with uncompressed data from bfile
	463	+ if this is the first call to the function for this file,
	464	+ the file header will be read, and the first buffer of
	465	+ uncompressed data will be prepared. bfile->position
	466	+ should be set to the offset (from the beginning of file) from
	467	+ which to find the first bz2 block.
	468	+
	469	+ returns:
	470	+ on success, number of bytes read (may be 0)
	471	+ -1 on error
	472	+*/
	473	+int get_buffer_of_uncompressed_data(buf_info_t b, int fin, bz_info_t bfile, int direction) {
	474	+ int res;
	475	+
	476	+ if (buffer_is_full(b)) {
	477	+ return(0);
	478	+ }
	479	+
	480	+ if (buffer_is_empty(b)) {
	481	+ b->next_to_fill = b->buffer;
	482	+ }
	483	+ res = get_and_decompress_data(bfile, fin, b->next_to_fill, b->end - b->next_to_fill, direction);
	484	+ if (res <0 ) {
	485	+ return(res);
	486	+ }
	487	+ if (bfile->bytes_written < 0) {
	488	+ fprintf(stderr,"read of file failed\n");
	489	+ return(-1);
	490	+ }
	491	+ else {
	492	+ /* really?? FIXME check this */
	493	+ if (buffer_is_empty(b)) {
	494	+ b->next_to_read = b->next_to_fill; /* where we just read */
	495	+ }
	496	+ b->bytes_avail += bfile->bytes_written;
	497	+ b->next_to_fill += bfile->bytes_written;
	498	+ b->next_to_fill[0] = '\0';
	499	+ return(0);
	500	+ }
	501	+}
	502	+
	503	+void dumpbuf_info_t(buf_info_t *b) {
	504	+ fprintf(stdout, "\n");
	505	+ fprintf(stdout, "b->buffer: %ld\n", (long int) b->buffer);
	506	+ fprintf(stdout, "b->end: %ld\n", (long int) b->end);
	507	+ fprintf(stdout, "b->next_to_read: %ld\n", (long int) b->next_to_read);
	508	+ fprintf(stdout, "b->next_to_fill: %ld\n", (long int) b->next_to_fill);
	509	+ fprintf(stdout, "b->bytes_avail: %ld\n", (long int) b->bytes_avail);
	510	+}
	511	+
	512	+/*
	513	+ copy text from end of buffer to the beginning, that we want to keep
	514	+ around for further processing (i.e. further regex matches)
	515	+ returns number of bytes copied
	516	+*/
	517	+int move_bytes_to_buffer_start(buf_info_t b, unsigned char fromwhere, int maxbytes) {
	518	+ int i, tocopy;
	519	+
	520	+ if (fromwhere >= b->end) {
	521	+ return(0);
	522	+ }
	523	+ else {
	524	+ tocopy = b->end - fromwhere;
	525	+ if (maxbytes && (tocopy > maxbytes)) {
	526	+ tocopy = maxbytes;
	527	+ }
	528	+ for (i = 0; i < tocopy; i++) {
	529	+ b->buffer[i] = fromwhere[i];
	530	+ }
	531	+ b->next_to_fill = b->buffer + tocopy;
	532	+ b->next_to_fill[0] = '\0';
	533	+ b->next_to_read = b->buffer;
	534	+ b->bytes_avail = tocopy;
	535	+ return(tocopy);
	536	+ }
	537	+}
	538	+
	539	+unsigned char ** init_footer() {
	540	+ unsigned char *footer = malloc(8sizeof(unsigned char *));
	541	+ int i;
	542	+
	543	+ /* set up footer plus its various right-shifted incarnations */
	544	+ /* dude why couldn't you have 0 padded each bzip2 block? seriously ... */
	545	+ for (i = 0; i< 8; i++) {
	546	+ footer[i] = malloc(sizeof(unsigned char)*7);
	547	+ }
	548	+ footer[0][0]= (unsigned char) 0x17;
	549	+ footer[0][1]= (unsigned char) 0x72;
	550	+ footer[0][2]= (unsigned char) 0x45;
	551	+ footer[0][3]= (unsigned char) 0x38;
	552	+ footer[0][4]= (unsigned char) 0x50;
	553	+ footer[0][5]= (unsigned char) 0x90;
	554	+ footer[0][6]= (unsigned char) 0x00;
	555	+ for (i = 1; i< 8; i++) {
	556	+ memcpy((char )(footer[i]), (char )(footer[i-1]),7);
	557	+ shift_bytes_right(footer[i],7,1);
	558	+ }
	559	+ return(footer);
	560	+}
	561	+
	562	+int read_footer(unsigned char *buffer, int fin) {
	563	+ int res;
	564	+
	565	+ res = lseek(fin, -11, SEEK_END);
	566	+ if (res == -1) {
	567	+ fprintf(stderr,"lseek of file failed\n");
	568	+ return(-1);
	569	+ }
	570	+ res = read(fin, buffer, 11);
	571	+ if (res == -1) {
	572	+ fprintf(stderr,"read of file failed\n");
	573	+ return(-1);
	574	+ }
	575	+ return(0);
	576	+}
	577	+
	578	+/*
	579	+ return -1 if no match return number of bits rightshifted otherwise
	580	+*/
	581	+int check_file_for_footer(int fin, bz_info_t *bfile) {
	582	+ unsigned char buffer[11];
	583	+ int result, i;
	584	+
	585	+ read_footer(buffer,fin);
	586	+
	587	+ result = bytes_compare(bfile->footer[0],buffer+1,6,0);
	588	+ if (!result) {
	589	+ return(0);
	590	+ }
	591	+
	592	+ for (i=1; i<8; i++) {
	593	+ result = bytes_compare(bfile->footer[i],buffer,7,i);
	594	+ if (!result) {
	595	+ return(i);
	596	+ }
	597	+ }
	598	+ return(-1);
	599	+}
	600	+
	601	+void clear_buffer(unsigned char *buf, int length) {
	602	+ int i;
	603	+
	604	+ for (i=0; i<length; i++) {
	605	+ buf[i]=0;
	606	+ }
	607	+ return;
	608	+}
	609	+
	610	+/*
	611	+ look for the first bz2 block in the file before/after specified offset
	612	+ it tests that the block is valid by doing partial decompression.
	613	+ this function will update the bfile structure:
	614	+ bfile->position will contain the current position of the file (? will it?)
	615	+ bfile->bits_shifted will contain the number of bits that the block is rightshifted
	616	+ bfile->block_start will contain the offset from start of file to the block
	617	+ (this value will always be positive, the value given in the argument "direction"
	618	+ determines whether the block starts before or after the initial file position).
	619	+
	620	+ returns:
	621	+ position of next byte in file to be read, on success
	622	+ 0 if no marker
	623	+ -1 on error
	624	+*/
	625	+int find_first_bz2_block_from_offset(bz_info_t *bfile, int fin, int position, int direction) {
	626	+ int res;
	627	+
	628	+ bfile->bufin_size = BUFINSIZE;
	629	+ bfile->marker = init_marker();
	630	+ bfile->position = position;
	631	+ bfile->block_start = -1;
	632	+ bfile->bytes_read = 0;
	633	+ bfile->bytes_written = 0;
	634	+ bfile->eof = 0;
	635	+ bfile->bits_shifted = -1;
	636	+
	637	+ bfile->file_size = get_file_size(fin);
	638	+
	639	+ while (bfile->bits_shifted < 0) {
	640	+ if (bfile->position > bfile->file_size) {
	641	+ return(0);
	642	+ }
	643	+ res = lseek(fin, bfile->position, SEEK_SET);
	644	+ if (res < 0) {
	645	+ fprintf(stderr,"lseek of file to %ld failed (7)\n",(long int) bfile->position);
	646	+ return(-1);
	647	+ }
	648	+ res = find_next_bz2_block_marker(fin, bfile,direction);
	649	+ if (res == 1) {
	650	+ init_decompress(bfile);
	651	+ decompress_header(fin, bfile);
	652	+ res = setup_first_buffer_to_decompress(fin, bfile);
	653	+ if (res == -1) {
	654	+ fprintf(stderr,"couldn't get first buffer of data to uncompress\n");
	655	+ return(-1);
	656	+ }
	657	+ bfile->strm.next_out = (char *)bfile->bufout;
	658	+ bfile->strm.avail_out = bfile->bufout_size;
	659	+ res = BZ2_bzDecompress_mine ( &(bfile->strm) );
	660	+ /* this means we (probably) have a genuine marker */
	661	+ if (BZ_OK == res \|\| BZ_STREAM_END == res) {
	662	+ res = BZ2_bzDecompressEnd ( &(bfile->strm) );
	663	+ bfile->bytes_read = 0;
	664	+ bfile->bytes_written = 0;
	665	+ bfile->eof = 0;
	666	+ /* leave the file at the right position */
	667	+ res = lseek(fin, bfile->block_start, SEEK_SET);
	668	+ if (res < 0) {
	669	+ fprintf(stderr,"lseek of file to %ld failed (7)\n",(long int) bfile->position);
	670	+ return(-1);
	671	+ }
	672	+ bfile->position = res;
	673	+ return(bfile->position);
	674	+ }
	675	+ /* right bytes, but there by chance, skip and try again */
	676	+ else {
	677	+ bfile->position+=6;
	678	+ bfile->bits_shifted = -1;
	679	+ bfile->block_start = -1;
	680	+ }
	681	+ }
	682	+ else {
	683	+ return(0);
	684	+ }
	685	+ }
	686	+ return(-1);
	687	+}
	688	+
Property changes on: branches/ariel/xmldumps-backup/mwbzutils/mwbzlib.c
___________________________________________________________________
Added: svn:eol-style
1	689	+ native
Index: branches/ariel/xmldumps-backup/mwbzutils/CHANGES
Index: branches/ariel/xmldumps-backup/mwbzutils/COPYING
—	—	@@ -0,0 +1,342 @@
	2	+== GNU GENERAL PUBLIC LICENSE ==
	3	+
	4	+Version 2, June 1991
	5	+
	6	+Copyright (C) 1989, 1991 Free Software Foundation, Inc.
	7	+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
	8	+Everyone is permitted to copy and distribute verbatim copies
	9	+of this license document, but changing it is not allowed.
	10	+
	11	+=== Preamble ===
	12	+
	13	+The licenses for most software are designed to take away your
	14	+freedom to share and change it. By contrast, the GNU General Public
	15	+License is intended to guarantee your freedom to share and change free
	16	+software--to make sure the software is free for all its users. This
	17	+General Public License applies to most of the Free Software
	18	+Foundation's software and to any other program whose authors commit to
	19	+using it. (Some other Free Software Foundation software is covered by
	20	+the GNU Library General Public License instead.) You can apply it to
	21	+your programs, too.
	22	+
	23	+When we speak of free software, we are referring to freedom, not
	24	+price. Our General Public Licenses are designed to make sure that you
	25	+have the freedom to distribute copies of free software (and charge for
	26	+this service if you wish), that you receive source code or can get it
	27	+if you want it, that you can change the software or use pieces of it
	28	+in new free programs; and that you know you can do these things.
	29	+
	30	+To protect your rights, we need to make restrictions that forbid
	31	+anyone to deny you these rights or to ask you to surrender the rights.
	32	+These restrictions translate to certain responsibilities for you if you
	33	+distribute copies of the software, or if you modify it.
	34	+
	35	+For example, if you distribute copies of such a program, whether
	36	+gratis or for a fee, you must give the recipients all the rights that
	37	+you have. You must make sure that they, too, receive or can get the
	38	+source code. And you must show them these terms so they know their
	39	+rights.
	40	+
	41	+We protect your rights with two steps: (1) copyright the software, and
	42	+(2) offer you this license which gives you legal permission to copy,
	43	+distribute and/or modify the software.
	44	+
	45	+Also, for each author's protection and ours, we want to make certain
	46	+that everyone understands that there is no warranty for this free
	47	+software. If the software is modified by someone else and passed on, we
	48	+want its recipients to know that what they have is not the original, so
	49	+that any problems introduced by others will not reflect on the original
	50	+authors' reputations.
	51	+
	52	+Finally, any free program is threatened constantly by software
	53	+patents. We wish to avoid the danger that redistributors of a free
	54	+program will individually obtain patent licenses, in effect making the
	55	+program proprietary. To prevent this, we have made it clear that any
	56	+patent must be licensed for everyone's free use or not licensed at all.
	57	+
	58	+The precise terms and conditions for copying, distribution and
	59	+modification follow.
	60	+
	61	+== TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION ==
	62	+
	63	+'''0.''' This License applies to any program or other work which contains
	64	+a notice placed by the copyright holder saying it may be distributed
	65	+under the terms of this General Public License. The "Program", below,
	66	+refers to any such program or work, and a "work based on the Program"
	67	+means either the Program or any derivative work under copyright law:
	68	+that is to say, a work containing the Program or a portion of it,
	69	+either verbatim or with modifications and/or translated into another
	70	+language. (Hereinafter, translation is included without limitation in
	71	+the term "modification".) Each licensee is addressed as "you".
	72	+
	73	+Activities other than copying, distribution and modification are not
	74	+covered by this License; they are outside its scope. The act of
	75	+running the Program is not restricted, and the output from the Program
	76	+is covered only if its contents constitute a work based on the
	77	+Program (independent of having been made by running the Program).
	78	+Whether that is true depends on what the Program does.
	79	+
	80	+'''1.''' You may copy and distribute verbatim copies of the Program's
	81	+source code as you receive it, in any medium, provided that you
	82	+conspicuously and appropriately publish on each copy an appropriate
	83	+copyright notice and disclaimer of warranty; keep intact all the
	84	+notices that refer to this License and to the absence of any warranty;
	85	+and give any other recipients of the Program a copy of this License
	86	+along with the Program.
	87	+
	88	+You may charge a fee for the physical act of transferring a copy, and
	89	+you may at your option offer warranty protection in exchange for a fee.
	90	+
	91	+'''2.''' You may modify your copy or copies of the Program or any portion
	92	+of it, thus forming a work based on the Program, and copy and
	93	+distribute such modifications or work under the terms of Section 1
	94	+above, provided that you also meet all of these conditions:
	95	+
	96	+ '''a)''' You must cause the modified files to carry prominent notices
	97	+ stating that you changed the files and the date of any change.
	98	+
	99	+ '''b)''' You must cause any work that you distribute or publish, that in
	100	+ whole or in part contains or is derived from the Program or any
	101	+ part thereof, to be licensed as a whole at no charge to all third
	102	+ parties under the terms of this License.
	103	+
	104	+ '''c)''' If the modified program normally reads commands interactively
	105	+ when run, you must cause it, when started running for such
	106	+ interactive use in the most ordinary way, to print or display an
	107	+ announcement including an appropriate copyright notice and a
	108	+ notice that there is no warranty (or else, saying that you provide
	109	+ a warranty) and that users may redistribute the program under
	110	+ these conditions, and telling the user how to view a copy of this
	111	+ License. (Exception: if the Program itself is interactive but
	112	+ does not normally print such an announcement, your work based on
	113	+ the Program is not required to print an announcement.)
	114	+
	115	+These requirements apply to the modified work as a whole. If
	116	+identifiable sections of that work are not derived from the Program,
	117	+and can be reasonably considered independent and separate works in
	118	+themselves, then this License, and its terms, do not apply to those
	119	+sections when you distribute them as separate works. But when you
	120	+distribute the same sections as part of a whole which is a work based
	121	+on the Program, the distribution of the whole must be on the terms of
	122	+this License, whose permissions for other licensees extend to the
	123	+entire whole, and thus to each and every part regardless of who wrote it.
	124	+
	125	+Thus, it is not the intent of this section to claim rights or contest
	126	+your rights to work written entirely by you; rather, the intent is to
	127	+exercise the right to control the distribution of derivative or
	128	+collective works based on the Program.
	129	+
	130	+In addition, mere aggregation of another work not based on the Program
	131	+with the Program (or with a work based on the Program) on a volume of
	132	+a storage or distribution medium does not bring the other work under
	133	+the scope of this License.
	134	+
	135	+'''3.''' You may copy and distribute the Program (or a work based on it,
	136	+under Section 2) in object code or executable form under the terms of
	137	+Sections 1 and 2 above provided that you also do one of the following:
	138	+
	139	+ '''a)''' Accompany it with the complete corresponding machine-readable
	140	+ source code, which must be distributed under the terms of Sections
	141	+ 1 and 2 above on a medium customarily used for software interchange; or,
	142	+
	143	+ '''b)''' Accompany it with a written offer, valid for at least three
	144	+ years, to give any third party, for a charge no more than your
	145	+ cost of physically performing source distribution, a complete
	146	+ machine-readable copy of the corresponding source code, to be
	147	+ distributed under the terms of Sections 1 and 2 above on a medium
	148	+ customarily used for software interchange; or,
	149	+
	150	+ '''c)''' Accompany it with the information you received as to the offer
	151	+ to distribute corresponding source code. (This alternative is
	152	+ allowed only for noncommercial distribution and only if you
	153	+ received the program in object code or executable form with such
	154	+ an offer, in accord with Subsection b above.)
	155	+
	156	+The source code for a work means the preferred form of the work for
	157	+making modifications to it. For an executable work, complete source
	158	+code means all the source code for all modules it contains, plus any
	159	+associated interface definition files, plus the scripts used to
	160	+control compilation and installation of the executable. However, as a
	161	+special exception, the source code distributed need not include
	162	+anything that is normally distributed (in either source or binary
	163	+form) with the major components (compiler, kernel, and so on) of the
	164	+operating system on which the executable runs, unless that component
	165	+itself accompanies the executable.
	166	+
	167	+If distribution of executable or object code is made by offering
	168	+access to copy from a designated place, then offering equivalent
	169	+access to copy the source code from the same place counts as
	170	+distribution of the source code, even though third parties are not
	171	+compelled to copy the source along with the object code.
	172	+
	173	+'''4.''' You may not copy, modify, sublicense, or distribute the Program
	174	+except as expressly provided under this License. Any attempt
	175	+otherwise to copy, modify, sublicense or distribute the Program is
	176	+void, and will automatically terminate your rights under this License.
	177	+However, parties who have received copies, or rights, from you under
	178	+this License will not have their licenses terminated so long as such
	179	+parties remain in full compliance.
	180	+
	181	+'''5.''' You are not required to accept this License, since you have not
	182	+signed it. However, nothing else grants you permission to modify or
	183	+distribute the Program or its derivative works. These actions are
	184	+prohibited by law if you do not accept this License. Therefore, by
	185	+modifying or distributing the Program (or any work based on the
	186	+Program), you indicate your acceptance of this License to do so, and
	187	+all its terms and conditions for copying, distributing or modifying
	188	+the Program or works based on it.
	189	+
	190	+'''6.''' Each time you redistribute the Program (or any work based on the
	191	+Program), the recipient automatically receives a license from the
	192	+original licensor to copy, distribute or modify the Program subject to
	193	+these terms and conditions. You may not impose any further
	194	+restrictions on the recipients' exercise of the rights granted herein.
	195	+You are not responsible for enforcing compliance by third parties to
	196	+this License.
	197	+
	198	+'''7.''' If, as a consequence of a court judgment or allegation of patent
	199	+infringement or for any other reason (not limited to patent issues),
	200	+conditions are imposed on you (whether by court order, agreement or
	201	+otherwise) that contradict the conditions of this License, they do not
	202	+excuse you from the conditions of this License. If you cannot
	203	+distribute so as to satisfy simultaneously your obligations under this
	204	+License and any other pertinent obligations, then as a consequence you
	205	+may not distribute the Program at all. For example, if a patent
	206	+license would not permit royalty-free redistribution of the Program by
	207	+all those who receive copies directly or indirectly through you, then
	208	+the only way you could satisfy both it and this License would be to
	209	+refrain entirely from distribution of the Program.
	210	+
	211	+If any portion of this section is held invalid or unenforceable under
	212	+any particular circumstance, the balance of the section is intended to
	213	+apply and the section as a whole is intended to apply in other
	214	+circumstances.
	215	+
	216	+It is not the purpose of this section to induce you to infringe any
	217	+patents or other property right claims or to contest validity of any
	218	+such claims; this section has the sole purpose of protecting the
	219	+integrity of the free software distribution system, which is
	220	+implemented by public license practices. Many people have made
	221	+generous contributions to the wide range of software distributed
	222	+through that system in reliance on consistent application of that
	223	+system; it is up to the author/donor to decide if he or she is willing
	224	+to distribute software through any other system and a licensee cannot
	225	+impose that choice.
	226	+
	227	+This section is intended to make thoroughly clear what is believed to
	228	+be a consequence of the rest of this License.
	229	+
	230	+'''8.''' If the distribution and/or use of the Program is restricted in
	231	+certain countries either by patents or by copyrighted interfaces, the
	232	+original copyright holder who places the Program under this License
	233	+may add an explicit geographical distribution limitation excluding
	234	+those countries, so that distribution is permitted only in or among
	235	+countries not thus excluded. In such case, this License incorporates
	236	+the limitation as if written in the body of this License.
	237	+
	238	+'''9.''' The Free Software Foundation may publish revised and/or new versions
	239	+of the General Public License from time to time. Such new versions will
	240	+be similar in spirit to the present version, but may differ in detail to
	241	+address new problems or concerns.
	242	+
	243	+Each version is given a distinguishing version number. If the Program
	244	+specifies a version number of this License which applies to it and "any
	245	+later version", you have the option of following the terms and conditions
	246	+either of that version or of any later version published by the Free
	247	+Software Foundation. If the Program does not specify a version number of
	248	+this License, you may choose any version ever published by the Free Software
	249	+Foundation.
	250	+
	251	+'''10.''' If you wish to incorporate parts of the Program into other free
	252	+programs whose distribution conditions are different, write to the author
	253	+to ask for permission. For software which is copyrighted by the Free
	254	+Software Foundation, write to the Free Software Foundation; we sometimes
	255	+make exceptions for this. Our decision will be guided by the two goals
	256	+of preserving the free status of all derivatives of our free software and
	257	+of promoting the sharing and reuse of software generally.
	258	+
	259	+=== NO WARRANTY ===
	260	+
	261	+'''11.''' BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
	262	+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
	263	+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
	264	+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
	265	+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
	266	+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
	267	+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
	268	+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
	269	+REPAIR OR CORRECTION.
	270	+
	271	+'''12.''' IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
	272	+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
	273	+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
	274	+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
	275	+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
	276	+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
	277	+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
	278	+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
	279	+POSSIBILITY OF SUCH DAMAGES.
	280	+
	281	+ '''END OF TERMS AND CONDITIONS'''
	282	+
	283	+== How to Apply These Terms to Your New Programs ==
	284	+
	285	+If you develop a new program, and you want it to be of the greatest
	286	+possible use to the public, the best way to achieve this is to make it
	287	+free software which everyone can redistribute and change under these terms.
	288	+
	289	+To do so, attach the following notices to the program. It is safest
	290	+to attach them to the start of each source file to most effectively
	291	+convey the exclusion of warranty; and each file should have at least
	292	+the "copyright" line and a pointer to where the full notice is found.
	293	+
	294	+ <one line to give the program's name and a brief idea of what it does.>
	295	+
	296	+ Copyright (C) <year> <name of author>
	297	+
	298	+ This program is free software; you can redistribute it and/or modify
	299	+ it under the terms of the GNU General Public License as published by
	300	+ the Free Software Foundation; either version 2 of the License, or
	301	+ (at your option) any later version.
	302	+
	303	+ This program is distributed in the hope that it will be useful,
	304	+ but WITHOUT ANY WARRANTY; without even the implied warranty of
	305	+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	306	+ GNU General Public License for more details.
	307	+
	308	+ You should have received a copy of the GNU General Public License
	309	+ along with this program; if not, write to the Free Software
	310	+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
	311	+
	312	+
	313	+Also add information on how to contact you by electronic and paper mail.
	314	+
	315	+If the program is interactive, make it output a short notice like this
	316	+when it starts in an interactive mode:
	317	+
	318	+ Gnomovision version 69, Copyright (C) year name of author
	319	+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
	320	+ This is free software, and you are welcome to redistribute it
	321	+ under certain conditions; type `show c' for details.
	322	+
	323	+The hypothetical commands `show w' and `show c' should show the appropriate
	324	+parts of the General Public License. Of course, the commands you use may
	325	+be called something other than `show w' and `show c'; they could even be
	326	+mouse-clicks or menu items--whatever suits your program.
	327	+
	328	+You should also get your employer (if you work as a programmer) or your
	329	+school, if any, to sign a "copyright disclaimer" for the program, if
	330	+necessary. Here is a sample; alter the names:
	331	+
	332	+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program
	333	+ `Gnomovision' (which makes passes at compilers) written by James Hacker.
	334	+
	335	+ <signature of Ty Coon>, 1 April 1989
	336	+
	337	+ Ty Coon, President of Vice
	338	+
	339	+This General Public License does not permit incorporating your program into
	340	+proprietary programs. If your program is a subroutine library, you may
	341	+consider it more useful to permit linking proprietary applications with the
	342	+library. If this is what you want to do, use the GNU Library General
	343	+Public License instead of this License.
Index: branches/ariel/xmldumps-backup/mwbzutils/LICENSE_BZ
—	—	@@ -0,0 +1,42 @@
	2	+
	3	+--------------------------------------------------------------------------
	4	+
	5	+This program, "bzip2", the associated library "libbzip2", and all
	6	+documentation, are copyright (C) 1996-2010 Julian R Seward. All
	7	+rights reserved.
	8	+
	9	+Redistribution and use in source and binary forms, with or without
	10	+modification, are permitted provided that the following conditions
	11	+are met:
	12	+
	13	+1. Redistributions of source code must retain the above copyright
	14	+ notice, this list of conditions and the following disclaimer.
	15	+
	16	+2. The origin of this software must not be misrepresented; you must
	17	+ not claim that you wrote the original software. If you use this
	18	+ software in a product, an acknowledgment in the product
	19	+ documentation would be appreciated but is not required.
	20	+
	21	+3. Altered source versions must be plainly marked as such, and must
	22	+ not be misrepresented as being the original software.
	23	+
	24	+4. The name of the author may not be used to endorse or promote
	25	+ products derived from this software without specific prior written
	26	+ permission.
	27	+
	28	+THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
	29	+OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	30	+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	31	+ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
	32	+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	33	+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
	34	+GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
	35	+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
	36	+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
	37	+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
	38	+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	39	+
	40	+Julian Seward, jseward@bzip.org
	41	+bzip2/libbzip2 version 1.0.6 of 6 September 2010
	42	+
	43	+--------------------------------------------------------------------------
Index: branches/ariel/xmldumps-backup/mwbzutils/Makefile
—	—	@@ -0,0 +1,95 @@
	2	+# ------------------------------------------------------------------
	3	+# This Makefile builds binaries which rely on two source files
	4	+# from libbzip2 version 1.0.6. (See bzlibfuncs.c and
	5	+# bzlib_private.h; the first is slightly modified while the
	6	+# second is unchanged from the library version.)
	7	+#
	8	+# The copyright for those two files is as follows:
	9	+#
	10	+# bzip2/libbzip2 version 1.0.6 of 6 September 2010
	11	+# Copyright (C) 1996-2010 Julian Seward <jseward@bzip.org>
	12	+#
	13	+# Those files are released under the terms of the license contained
	14	+# in the file LICENSE_BZ.
	15	+#
	16	+# All other files are released under the GPL, copyright (C) Ariel T. Glenn
	17	+# 2010-2010: see the file COPYING for details.
	18	+# ------------------------------------------------------------------
	19	+
	20	+CC=gcc
	21	+LDFLAGS=
	22	+BIGFILES=-D_FILE_OFFSET_BITS=64
	23	+CFLAGS=-Wall -Winline -O2 -g $(BIGFILES)
	24	+PREFIX=/usr/local
	25	+
	26	+SHELL=/bin/sh
	27	+
	28	+OBJSBZ= bzlibfuncs.o
	29	+
	30	+all: checkforbz2footer \
	31	+ dumpbz2filefromoffset \
	32	+ dumplastbz2block \
	33	+ findpageidinbz2xml
	34	+
	35	+dumplastbz2block: $(OBJSBZ) mwbzlib.o dumplastbz2block.o
	36	+ $(CC) $(CFLAGS) $(LDFLAGS) -o dumplastbz2block dumplastbz2block.o mwbzlib.o $(OBJSBZ) -lbz2
	37	+
	38	+findpageidinbz2xml: $(OBJSBZ) mwbzlib.o findpageidinbz2xml.o
	39	+ $(CC) $(CFLAGS) $(LDFLAGS) -o findpageidinbz2xml findpageidinbz2xml.o mwbzlib.o $(OBJSBZ) -lbz2
	40	+
	41	+checkforbz2footer: $(OBJSBZ) mwbzlib.o checkforbz2footer.o
	42	+ $(CC) $(CFLAGS) $(LDFLAGS) -o checkforbz2footer checkforbz2footer.o mwbzlib.o $(OBJSBZ) -lbz2
	43	+
	44	+dumpbz2filefromoffset: $(OBJSBZ) mwbzlib.o dumpbz2filefromoffset.o
	45	+ $(CC) $(CFLAGS) $(LDFLAGS) -o dumpbz2filefromoffset dumpbz2filefromoffset.o mwbzlib.o $(OBJSBZ) -lbz2
	46	+
	47	+install: dumplastbz2block findpageidinbz2xml checkforbz2footer dumpbz2filefromoffset # install the four built utilities, not a bzip2 binary
	48	+ if ( test ! -d $(PREFIX)/bin ) ; then mkdir -p $(PREFIX)/bin ; fi
	49	+ if ( test ! -d $(PREFIX)/include ) ; then mkdir -p $(PREFIX)/include ; fi
	50	+ cp -f dumplastbz2block $(PREFIX)/bin/dumplastbz2block
	51	+ cp -f findpageidinbz2xml $(PREFIX)/bin/findpageidinbz2xml
	52	+ cp -f checkforbz2footer $(PREFIX)/bin/checkforbz2footer
	53	+ cp -f dumpbz2filefromoffset $(PREFIX)/bin/dumpbz2filefromoffset
	54	+ chmod a+x $(PREFIX)/bin/dumplastbz2block
	55	+ chmod a+x $(PREFIX)/bin/findpageidinbz2xml
	56	+ chmod a+x $(PREFIX)/bin/checkforbz2footer
	57	+ chmod a+x $(PREFIX)/bin/dumpbz2filefromoffset
	58	+
	59	+clean: # remove object files, archives and the built utilities
	60	+ rm -f *.o *.a dumplastbz2block findpageidinbz2xml \
	61	+ checkforbz2footer dumpbz2filefromoffset
	62	+
	63	+bzlibfuncs.o: bzlibfuncs.c
	64	+ $(CC) $(CFLAGS) -c bzlibfuncs.c
	65	+mwbzlib.o: mwbzlib.c
	66	+ $(CC) $(CFLAGS) -c mwbzlib.c
	67	+dumplastbz2block.o: dumplastbz2block.c
	68	+ $(CC) $(CFLAGS) -c dumplastbz2block.c
	69	+findpageidinbz2xml.o: findpageidinbz2xml.c
	70	+ $(CC) $(CFLAGS) -c findpageidinbz2xml.c
	71	+checkforbz2footer.o: checkforbz2footer.c
	72	+ $(CC) $(CFLAGS) -c checkforbz2footer.c
	73	+dumpbz2filefromoffset.o: dumpbz2filefromoffset.c
	74	+ $(CC) $(CFLAGS) -c dumpbz2filefromoffset.c
	75	+
	76	+distclean: clean
	77	+
	78	+DISTNAME=mwbzutils-0.0.1
	79	+dist: ; rm -f $(DISTNAME)
	80	+ ln -s -f . $(DISTNAME)
	81	+ tar cvf $(DISTNAME).tar \
	82	+ $(DISTNAME)/dumplastbz2block.c \
	83	+ $(DISTNAME)/findpageidinbz2xml.c \
	84	+ $(DISTNAME)/checkforbz2footer.c \
	85	+ $(DISTNAME)/dumpbz2filefromoffset.c \
	86	+ $(DISTNAME)/mwbzlib.c \
	87	+ $(DISTNAME)/mwbzutils.h \
	88	+ $(DISTNAME)/bzlibfuncs.c \
	89	+ $(DISTNAME)/bzlib_private.h \
	90	+ $(DISTNAME)/Makefile \
	91	+ $(DISTNAME)/LICENSE_BZ \
	92	+ $(DISTNAME)/COPYING \
	93	+ $(DISTNAME)/README \
	94	+ $(DISTNAME)/CHANGES
	95	+ gzip -v $(DISTNAME).tar
	96	+
Property changes on: branches/ariel/xmldumps-backup/mwbzutils/Makefile
___________________________________________________________________
Added: svn:eol-style
1	97	+ native
Index: branches/ariel/xmldumps-backup/mwbzutils/README
—	—	@@ -0,0 +1,58 @@
	2	+What is this?
	3	+
	4	+It is a tiny suite of utilities that hapless WMF employees use to massage the
	5	+XML dump files so that we can produce them on a more regular basis.
	6	+
	7	+More specifically, they allow us to do various things with bz2 files
	8	+quickly instead of requiring a serial read/decompress of the file. Some
	9	+of these files range from 2 to 30 GB in size, so serial access is too slow.
	10	+
	11	+The files bzlibfuncs.c and bzlib_private.h are taken from bzip2/libbzip2
	12	+version 1.0.6 of 6 September 2010 (Copyright (C) 1996-2010 Julian Seward
	13	+<jseward@bzip.org>) and as such their copyright license is in the file
	14	+LICENSE_BZ; all other files in the package are released under the GPL,
	15	+see the file COPYING for details.
	16	+
	17	+Utilities:
	18	+
	19	+checkforbz2footer - Tests to see if the bz2 file specified on the command line
	20	+ has a bz2 footer (if it does it is likely to be intact).
	21	+ Exits with 0 if found, 1 otherwise.
	22	+dumpbz2filefromoffset - Uncompresses the file from the first bz2 block found after
	23	+ the specified offset, and dumps the results to stdout.
	24	+ This will first look for and dump the <mediawiki> header,
	25	+ up to and including the </siteinfo> tag; then it will
	26	+ find the first <page> tag in the first bz2 block after
	27	+ the specified offset and dump the contents from that point
	28	+ on.
	29	+dumplastbz2block - Finds the last bz2 block marker in a file and dumps whatever
	30	+ can be decompressed after that point; the header of the file
	31	+ must be intact in order for any output to be produced. This
	32	+ will produce output for truncated files as well, as long as
	33	+ there is "enough" data after the bz2 block marker.
	34	+ Exits with 0 if decompression of some data can be done,
	35	+ 1 if decompression fails, and -1 on error.
	36	+
	37	+findpageidinbz2xml - Given a bzipped and possibly truncated file, and a page id,
	38	+ hunt for the page id in the file; this assumes that the
	39	+ bz2 header is intact and that page ids are steadily increasing
	40	+ throughout the file. It writes the offset of the relevant block
	41	+ (from beginning of file) and the first pageid found in that block,
	42	+ to stdout. Format of output:
	43	+ position:xxxxx pageid:nnn
	44	+ It exits with 0 on success, -1 on error.
	45	+
	46	+Library routines:
	47	+
	48	+mwbzlib.c - various utility functions (bitmasks, shifting and comparing bytes,
	49	+ setting up bz2 files for decompression, etc)
	50	+
	51	+External library routines:
	52	+
	53	+bzlibfuncs.c - the BZ2_bzDecompress() routine, modified so that it does not do
	54	+ a check of the cumulative CRC (since we read from an arbitrary
	55	+ point in most of these files, we won't have a cumulative CRC
	56	+ that makes any sense). It's a one line fix but it requires
	57	+ unRLE_obuf_to_output_FAST() which is marked static in the original
	58	+ library, so that's in here too.
	59	+

Status & tagging log

15:58, 7 July 2011 Reedy (talk | contribs) changed the status of r91638 [removed: new added: deferred]