r91637 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r91636‎ \| r91637 \| r91638 >
Date:	12:07, 7 July 2011
Author:	ariel
Status:	deferred
Tags:
Comment:	move bz2 related utils into subdirectory
Modified paths:	/branches/ariel/xmldumps-backup/checkforbz2footer.c (deleted) (history) /branches/ariel/xmldumps-backup/dumpbz2filefromoffset.c (deleted) (history) /branches/ariel/xmldumps-backup/dumplastbz2block.c (deleted) (history) /branches/ariel/xmldumps-backup/findpageidinbz2xml.c (deleted) (history) /branches/ariel/xmldumps-backup/findpageidinbz2xml.h (deleted) (history) /branches/ariel/xmldumps-backup/mwbzutils (added) (history) /branches/ariel/xmldumps-backup/mwbzutils/checkforbz2footer.c (added) (history) /branches/ariel/xmldumps-backup/mwbzutils/dumpbz2filefromoffset.c (added) (history) /branches/ariel/xmldumps-backup/mwbzutils/dumplastbz2block.c (added) (history) /branches/ariel/xmldumps-backup/mwbzutils/findpageidinbz2xml.c (added) (history) /branches/ariel/xmldumps-backup/mwbzutils/findpageidinbz2xml.h (added) (history)

Diff [purge]

Index: branches/ariel/xmldumps-backup/dumpbz2filefromoffset.c
—	—	@@ -1,766 +0,0 @@
2		~~-#include <unistd.h>~~
3		~~-#include <stdio.h>~~
4		~~-#include <string.h>~~
5		~~-#include <sys/types.h>~~
6		~~-#include <sys/stat.h>~~
7		~~-#include <fcntl.h>~~
8		~~-#include <stdlib.h>~~
9		~~-#include <errno.h>~~
10		~~-#include <sys/types.h>~~
11		~~-#include <regex.h>~~
12		~~-#include "bzlib.h"~~
13		~~-#include "findpageidinbz2xml.h"~~
14		-
15		-
16		~~-/* return n ones either at left or right end */~~
17		~~-int bit_mask(int numbits, int end) {~~
18		~~- if (end == MASKRIGHT) {~~
19		~~- return((1<<numbits)-1);~~
20		~~- }~~
21		~~- else {~~
22		~~- return(((1<<numbits)-1) << (8-numbits));~~
23		~~- }~~
24		-}
25		-
26		~~-void shift_bytes_left(unsigned char *buffer, int buflen, int numbits) {~~
27		~~- int i;~~
28		-
29		~~- if (numbits == 0) {~~
30		~~- return;~~
31		~~- }~~
32		-
33		~~- for (i=0; i<buflen; i++) {~~
34		~~- /* left 1 */~~
35		~~- buffer[i] = (unsigned char) ((int) (buffer[i]) << numbits);~~
36		-
37		~~- /* grab leftmost from next byte */~~
38		~~- if (i < buflen-1) {~~
39		~~- buffer[i] = ( unsigned char ) ( (unsigned int) buffer[i] \| ( ( ((unsigned int) buffer[i+1]) & bit_mask(numbits,MASKLEFT) ) >> (8-numbits) ) );~~
40		~~- }~~
41		~~- }~~
42		-}
43		-
44		-
45		~~-void shift_bytes_right(unsigned char *buffer, int buflen, int numbits) {~~
46		~~- int i;~~
47		-
48		~~- for (i=buflen-1; i>=0; i--) {~~
49		~~- /* right 1 */~~
50		~~- buffer[i] = (unsigned char) ((int) (buffer[i]) >> numbits);~~
51		-
52		~~- /* grab rightmost from prev byte */~~
53		~~- if (i > 0) {~~
54		~~- buffer[i] = ( unsigned char ) ((unsigned int) buffer[i] \| ( ((unsigned int) (buffer[i-1])<<(8-numbits)) & bit_mask(numbits,MASKLEFT)));~~
55		~~- }~~
56		~~- }~~
57		-}
58		-
59		~~-unsigned char ** init_marker() {~~
60		~~- unsigned char **marker = malloc(8*sizeof(unsigned char *));~~
61		~~- int i;~~
62		-
63		~~- /* set up block marker plus its various right-shifted incarnations */~~
64		~~- for (i = 0; i< 8; i++) {~~
65		~~- marker[i] = malloc(sizeof(unsigned char)*7);~~
66		~~- }~~
67		~~- marker[0][0]= (unsigned char) 0x31;~~
68		~~- marker[0][1]= (unsigned char) 0x41;~~
69		~~- marker[0][2]= (unsigned char) 0x59;~~
70		~~- marker[0][3]= (unsigned char) 0x26;~~
71		~~- marker[0][4]= (unsigned char) 0x53;~~
72		~~- marker[0][5]= (unsigned char) 0x59;~~
73		~~- marker[0][6]= (unsigned char) 0x00;~~
74		~~- for (i = 1; i< 8; i++) {~~
75		~~- memcpy((char *)(marker[i]), (char *)(marker[i-1]),7);~~
76		~~- shift_bytes_right(marker[i],7,1);~~
77		~~- }~~
78		~~- return(marker);~~
79		-}
80		-
81		~~-/* buff1 is some random bytes, buff2 is some random bytes which we expect to start with the contents of buff1,~~
82		~~- both buffers are bit-shifted to the right "bitsrightshifted". this function compares the two and returns 1 if buff2~~
83		~~- matches and 0 otherwise. */~~
84		~~-int bytes_compare(unsigned char *buff1, unsigned char *buff2, int numbytes, int bitsrightshifted) {~~
85		~~- int i;~~
86		-
87		~~- if (bitsrightshifted == 0) {~~
88		~~- for (i = 0; i< numbytes; i++) {~~
89		~~- if (buff1[i] != buff2[i]) {~~
90		~~- return(1);~~
91		~~- }~~
92		~~- }~~
93		~~- return(0);~~
94		~~- }~~
95		~~- else {~~
96		~~- for (i = 1; i< numbytes-2; i++) {~~
97		~~- if (buff1[i] != buff2[i]) {~~
98		~~- return(1);~~
99		~~- }~~
100		~~- }~~
101		~~- /* do leftmost byte */~~
102		~~- if ((buff1[0] & bit_mask(8-bitsrightshifted,MASKRIGHT)) != (buff2[0] & bit_mask(8-bitsrightshifted,MASKRIGHT)) ) {~~
103		~~- return(1);~~
104		~~- }~~
105		~~- /* do rightmost byte */~~
106		~~- if ((buff1[numbytes-1] & bit_mask(bitsrightshifted,MASKLEFT)) != (buff2[numbytes-1] & bit_mask(bitsrightshifted,MASKLEFT)) ) {~~
107		~~- return(1);~~
108		~~- }~~
109		~~- return(0);~~
110		~~- }~~
111		-}
112		-
113		~~-/* return -1 if no match~~
114		~~- return number of bits rightshifted otherwise */~~
115		~~-int check_buffer_for_bz2_block_marker(bz_info_t *bfile) {~~
116		~~- int result, i;~~
117		-
118		~~- result = bytes_compare(bfile->marker[0],bfile->marker_buffer+1,6,0);~~
119		~~- if (!result) {~~
120		~~- return(0);~~
121		~~- }~~
122		~~- for (i=1; i<8; i++) {~~
123		~~- result = bytes_compare(bfile->marker[i],bfile->marker_buffer,7,i);~~
124		~~- if (!result) {~~
125		~~- return(i);~~
126		~~- }~~
127		~~- }~~
128		~~- return(-1);~~
129		-}
130		-
131		~~-/* return: 1 if found, 0 if not, -1 on error */~~
132		~~-int find_next_bz2_block_marker(int fin, bz_info_t *bfile) {~~
133		~~- int result;~~
134		-
135		~~- bfile->bits_shifted = -1;~~
136		~~- result = read(fin, bfile->marker_buffer, 7);~~
137		~~- if (result == -1) {~~
138		~~- fprintf(stderr,"read of file failed\n");~~
139		~~- exit(-1);~~
140		~~- }~~
141		~~- /* must be after 4 byte file header, and we add a leftmost byte to the buffer~~
142		~~- of data read in case some bits have been shifted into it */~~
143		~~- while (bfile->position <= bfile->file_size - 6 && bfile->bits_shifted < 0) {~~
144		~~- bfile->bits_shifted = check_buffer_for_bz2_block_marker(bfile);~~
145		~~- if (bfile->bits_shifted < 0) {~~
146		~~- bfile->position++;~~
147		~~- result = lseek(fin, (bfile->position), SEEK_SET);~~
148		~~- if (result == -1) {~~
149		~~- fprintf(stderr,"lseek of file to %ld failed (2)\n",(long int) bfile->position);~~
150		~~- exit(-1);~~
151		~~- }~~
152		~~- result = read(fin, bfile->marker_buffer, 7);~~
153		~~- if (result < 7) {~~
154		~~- /* fprintf(stderr,"read of file failed\n"); */~~
155		~~- exit(-1);~~
156		~~- }~~
157		~~- }~~
158		~~- else {~~
159		~~- bfile->block_start = bfile->position;~~
160		~~- return(1);~~
161		~~- }~~
162		~~- }~~
163		~~- return(0);~~
164		-}
165		-
166		-/*
167		~~- initializes the bz2 strm structure,~~
168		~~- calls the BZ2 decompression library initializer~~
169		-
170		~~- returns:~~
171		~~- BZ_OK on success~~
172		~~- various BZ_ errors on failure (see bzlib.h)~~
173		~~-*/~~
174		~~-int init_decompress(bz_info_t *bfile) {~~
175		~~- int bz_verbosity = 0;~~
176		~~- int bz_small = 0;~~
177		~~- int ret;~~
178		-
179		~~- bfile->strm.bzalloc = NULL;~~
180		~~- bfile->strm.bzfree = NULL;~~
181		~~- bfile->strm.opaque = NULL;~~
182		-
183		~~- ret = BZ2_bzDecompressInit ( &(bfile->strm), bz_verbosity, bz_small );~~
184		~~- if (ret != BZ_OK) {~~
185		~~- fprintf(stderr,"uncompress failed, err %d\n", ret);~~
186		~~- exit(-1);~~
187		~~- }~~
188		~~- return(ret);~~
189		-}
190		-
191		-/*
192		~~- reads the first 4 bytes from a bz2 file (should be~~
193		~~- "BZh" followed by the block size indicator, typically "9")~~
194		~~- and passes them into the BZ2 decompression library.~~
195		~~- This must be done before decompression of any block of the~~
196		~~- file is attempted.~~
197		-
198		~~- returns:~~
199		~~- BZ_OK if successful,~~
200		~~- various BZ_ errors on failure (see bzlib.h)~~
201		~~-*/~~
202		~~-int decompress_header(int fin, bz_info_t *bfile) {~~
203		~~- int ret, res;~~
204		-
205		~~- res = lseek(fin,0,SEEK_SET);~~
206		~~- if (res == -1) {~~
207		~~- fprintf(stderr,"lseek of file to 0 failed (3)\n");~~
208		~~- }~~
209		~~- bfile->bytes_read = read(fin, bfile->header_buffer, 4);~~
210		~~- if (bfile->bytes_read < 4) {~~
211		~~- fprintf(stderr,"failed to read 4 bytes of header, exiting\n");~~
212		~~- exit(-1);~~
213		~~- }~~
214		~~- bfile->strm.next_in = (char *)bfile->header_buffer;~~
215		~~- bfile->strm.avail_in = 4;~~
216		-
217		~~- ret = BZ2_bzDecompress ( &(bfile->strm) );~~
218		~~- if (BZ_OK != ret && BZ_STREAM_END != ret) {~~
219		~~- fprintf(stderr,"Corrupt bzip2 header, exiting\n");~~
220		~~- exit(-1);~~
221		~~- }~~
222		~~- return(ret);~~
223		-}
224		-
225		-/*
226		~~- seek to appropriate offset as specified in bfile,~~
227		~~- read compressed data into buffer indicated by bfile,~~
228		~~- update the bfile structure accordingly,~~
229		~~- save the overflow byte (bit-shifted data = suck)~~
230		~~- this is for the first buffer of data in a stream,~~
231		~~- for subsequent buffers use fill_buffer_to_decompress()~~
232		-
233		~~- this will set bfile->eof on eof. no other indicator~~
234		~~- will be provided.~~
235		-
236		~~- returns:~~
237		~~- 0 on success~~
238		~~- -1 on error~~
239		~~-*/~~
240		~~-int setup_first_buffer_to_decompress(int fin, bz_info_t *bfile) {~~
241		~~- int res;~~
242		-
243		~~- if (bfile->bits_shifted == 0) {~~
244		~~- res = lseek(fin,bfile->position+1,SEEK_SET);~~
245		~~- if (res == -1) {~~
246		~~- fprintf(stderr,"lseek of file to %ld failed (4)\n",(long int) bfile->position+1);~~
247		~~- return(-1);~~
248		~~- }~~
249		~~- }~~
250		~~- else {~~
251		~~- res = lseek(fin,bfile->position,SEEK_SET);~~
252		~~- if (res == -1) {~~
253		~~- fprintf(stderr,"lseek of file to %ld failed (5)\n",(long int) bfile->position);~~
254		~~- return(-1);~~
255		~~- }~~
256		~~- }~~
257		~~- bfile->bytes_read = read(fin, bfile->bufin, bfile->bufin_size);~~
258		~~- if (bfile->bytes_read > 0) {~~
259		~~- bfile->overflow = bfile->bufin[bfile->bytes_read-1];~~
260		~~- shift_bytes_left(bfile->bufin, bfile->bytes_read, bfile->bits_shifted);~~
261		-
262		~~- bfile->strm.next_in = (char *)(bfile->bufin);~~
263		~~- bfile->strm.avail_in = bfile->bytes_read-1;~~
264		~~- }~~
265		~~- if (bfile->bytes_read <=0) {~~
266		~~- bfile->eof++;~~
267		~~- }~~
268		~~- return(0);~~
269		-}
270		-
271		-/*
272		~~- read compressed data into buffer indicated by bfile,~~
273		~~- from current position of file,~~
274		~~- stuffing the overflow byte in first.~~
275		~~- update the bfile structure accordingly~~
276		~~- save the new overflow byte (bit-shifted data = suck)~~
277		~~- this function is for decompression of buffers *after~~
278		~~- the first one*. for the first one use~~
279		~~- setup_first_buffer_to_decompress()~~
280		-
281		~~- this will set bfile->eof on eof. no other indicator~~
282		~~- will be provided.~~
283		-
284		~~- returns:~~
285		~~- 0 on success~~
286		~~- hmm, it really does not do anything about errors :-D~~
287		~~-*/~~
288		~~-int fill_buffer_to_decompress(int fin, bz_info_t *bfile, int ret) {~~
289		~~- if (bfile->strm.avail_in == 0) {~~
290		~~- bfile->strm.next_in = (char *)(bfile->bufin);~~
291		~~- bfile->bufin[0] = bfile->overflow;~~
292		~~- bfile->bytes_read = read(fin, bfile->bufin+1, bfile->bufin_size-1);~~
293		~~- if (bfile->bytes_read > 0) {~~
294		~~- bfile->position+=bfile->bytes_read;~~
295		~~- bfile->overflow = bfile->bufin[bfile->bytes_read];~~
296		~~- shift_bytes_left(bfile->bufin,bfile->bytes_read+1,bfile->bits_shifted);~~
297		~~- bfile->strm.avail_in = bfile->bytes_read;~~
298		~~- }~~
299		~~- else {~~
300		~~- bfile->strm.avail_in = 1; /* the overflow byte */~~
301		~~- bfile->eof++;~~
302		~~- }~~
303		~~- }~~
304		~~- return(0);~~
305		-}
306		-
307		~~-/* size of buffer is bytes usable. there will be a null byte at the end~~
308		-
309		~~- what we do with the buffer:~~
310		~~- - read from front of buffer to end,~~
311		~~- - fill from point where prev read did not fill buffer, or from where~~
312		~~- move of data at end of buffer to beginning left room,~~
313		~~- - mark a string of bytes (starting from what's available to read) as "read"~~
314		-
315		~~-*/~~
316		~~-buf_info_t *init_buffer(int size) {~~
317		~~- buf_info_t *b;~~
318		-
319		~~- b = (buf_info_t *)malloc(sizeof(buf_info_t));~~
320		~~- b->buffer = malloc(sizeof(unsigned char)*(size+1));~~
321		~~- b->buffer[size]='\0';~~
322		~~- b->end = b->buffer + size;~~
323		~~- b->next_to_read = b->end; /* nothing available */~~
324		~~- b->bytes_avail = 0; /* bytes to read, nothing available */~~
325		~~- b->next_to_fill = b->buffer; /* empty */~~
326		~~- b->next_to_fill[0] = '\0';~~
327		~~- return(b);~~
328		-}
329		-
330		~~-/* check if buffer (used for decompressed data output) is empty,~~
331		~~- returns 1 if so and 0 if not */~~
332		~~-int buffer_is_empty(buf_info_t *b) {~~
333		~~- if (b->bytes_avail == 0) {~~
334		~~- return(1);~~
335		~~- }~~
336		~~- else {~~
337		~~- return(0);~~
338		~~- }~~
339		-}
340		-
341		~~-/* check if buffer (used for decompressed data output) is full,~~
342		-
343		~~- returns 1 if so and 0 if not~~
344		~~- I'm not liking this function so well, fixme */~~
345		~~-int buffer_is_full(buf_info_t *b) {~~
346		~~- if (b->next_to_fill == b->end) {~~
347		~~- return(1);~~
348		~~- }~~
349		~~- else {~~
350		~~- return(0);~~
351		~~- }~~
352		-}
353		-
354		~~-/* FIXME do this right. whatever. */~~
355		~~-int get_file_size(int fin) {~~
356		~~- int res;~~
357		-
358		~~- res = lseek(fin, 0, SEEK_END);~~
359		~~- if (res == -1) {~~
360		~~- fprintf(stderr,"lseek of file to 0 failed (6)\n");~~
361		~~- exit(-1);~~
362		~~- }~~
363		~~- return(res);~~
364		-}
365		-
366		-
367		-/*
368		~~- set up the marker, seek to right place, get first~~
369		~~- buffer of compressed data for processing~~
370		~~- bfile->position must be set to desired offset first by caller.~~
371		~~- returns:~~
372		~~- -1 if no marker or other error, position of next read if ok~~
373		~~-*/~~
374		~~-int init_bz2_file(bz_info_t *bfile, int fin) {~~
375		~~- int res;~~
376		-
377		~~- bfile->bufin_size = BUFINSIZE;~~
378		~~- bfile->marker = init_marker();~~
379		~~- bfile->bytes_read = 0;~~
380		~~- bfile->bytes_written = 0;~~
381		~~- bfile->eof = 0;~~
382		-
383		~~- bfile->initialized++;~~
384		-
385		~~- bfile->file_size = get_file_size(fin);~~
386		~~- if (bfile->position > bfile->file_size) {~~
387		~~- fprintf(stderr,"asked for position past end of file\n");~~
388		~~- exit(-1);~~
389		~~- }~~
390		~~- res = lseek(fin, bfile->position, SEEK_SET);~~
391		~~- if (res == -1) {~~
392		~~- fprintf(stderr,"lseek of file to %ld failed (7)\n",(long int) bfile->position);~~
393		~~- exit(-1);~~
394		~~- }~~
395		-
396		~~- find_next_bz2_block_marker(fin, bfile);~~
397		~~- if (bfile->bits_shifted >= 0) {~~
398		~~- /* fprintf(stderr,"marker bits shifted by is %d\n",bfile->bits_shifted); */~~
399		~~- init_decompress(bfile);~~
400		~~- decompress_header(fin, bfile);~~
401		~~- setup_first_buffer_to_decompress(fin, bfile);~~
402		~~- return(0);~~
403		~~- }~~
404		~~- return(-1);~~
405		-}
406		-
407		~~-/* get the next buffer of uncompressed stuff */~~
408		~~-int decompress_data(bz_info_t *bfile, int fin, unsigned char *bufferout, int bufout_size) {~~
409		~~- int ret;~~
410		-
411		~~- bfile->bufout = bufferout;~~
412		~~- bfile->bufout_size = bufout_size;~~
413		~~- bfile->bytes_written = 0;~~
414		-
415		~~- if (! bfile->initialized) {~~
416		~~- if (init_bz2_file(bfile, fin) == -1) {~~
417		~~- fprintf(stderr,"failed to initialize bz2file\n");~~
418		~~- return(-1);~~
419		~~- };~~
420		~~- bfile->strm.next_out = (char *)bfile->bufout;~~
421		~~- bfile->strm.avail_out = bfile->bufout_size;~~
422		~~- }~~
423		-
424		~~- ret = BZ_OK;~~
425		~~- while (BZ_OK == ret && bfile->bytes_written == 0) {~~
426		~~- ret = BZ2_bzDecompress ( &(bfile->strm) );~~
427		~~- if (BZ_OK == ret \|\| BZ_STREAM_END == ret) {~~
428		~~- bfile->bytes_written = (unsigned char *)(bfile->strm.next_out) - bfile->bufout;~~
429		~~- }~~
430		~~- else {~~
431		~~- fprintf(stderr,"error from BZ decompress %d\n",ret);~~
432		~~- return(-1);~~
433		~~- }~~
434		~~- fill_buffer_to_decompress(fin, bfile, ret);~~
435		- /*
436		~~- if (bfile->eof && (BZ_OK == ret \|\| BZ_STREAM_END == ret) ) {~~
437		~~- fprintf(stderr,"eof reached\n");~~
438		~~- }~~
439		~~- */~~
440		~~- }~~
441		~~- return(0);~~
442		-}
443		-
444		-/*
445		~~- fill output buffer in b with uncompressed data from bfile~~
446		~~- if this is the first call to the function for this file,~~
447		~~- the file header will be read, and the first buffer of~~
448		~~- uncompressed data will be prepared. bfile->position~~
449		~~- should be set to the offset (from the beginning of file) from~~
450		~~- which to find the first bz2 block.~~
451		-
452		~~- returns:~~
453		~~- on success, number of bytes read (may be 0)~~
454		~~- -1 on error~~
455		~~-*/~~
456		~~-int get_buffer_of_uncompressed_data(buf_info_t *b, int fin, bz_info_t *bfile) {~~
457		~~- int res;~~
458		-
459		~~- if (buffer_is_full(b)) {~~
460		~~- fprintf(stdout,"DEBUG buffer full\n");~~
461		~~- return(0);~~
462		~~- }~~
463		-
464		~~- if (buffer_is_empty(b)) {~~
465		~~- b->next_to_fill = b->buffer;~~
466		~~- }~~
467		-
468		~~- res = decompress_data(bfile, fin, b->next_to_fill, b->end - b->next_to_fill);~~
469		~~- if (res <0 ) {~~
470		~~- return(res);~~
471		~~- }~~
472		~~- if (bfile->bytes_written < 0) {~~
473		~~- fprintf(stderr,"read of file failed\n");~~
474		~~- return(-1);~~
475		~~- }~~
476		~~- else {~~
477		~~- /* really?? FIXME check this */~~
478		~~- if (buffer_is_empty(b)) {~~
479		~~- b->next_to_read = b->next_to_fill; /* where we just read */~~
480		~~- }~~
481		~~- b->bytes_avail += bfile->bytes_written;~~
482		~~- b->next_to_fill += bfile->bytes_written;~~
483		~~- b->next_to_fill[0] = '\0';~~
484		~~- return(0);~~
485		~~- }~~
486		-}
487		-
488		~~-void dumpbuf_info_t(buf_info_t *b) {~~
489		~~- fprintf(stdout, "\n");~~
490		~~- fprintf(stdout, "b->buffer: %ld\n", (long int) b->buffer);~~
491		~~- fprintf(stdout, "b->end: %ld\n", (long int) b->end);~~
492		~~- fprintf(stdout, "b->next_to_read: %ld\n", (long int) b->next_to_read);~~
493		~~- fprintf(stdout, "b->next_to_fill: %ld\n", (long int) b->next_to_fill);~~
494		~~- fprintf(stdout, "b->bytes_avail: %ld\n", (long int) b->bytes_avail);~~
495		-}
496		-
497		-/*
498		~~- copy text from end of buffer to the beginning, that we want to keep~~
499		~~- around for further processing (i.e. further regex matches)~~
500		~~- returns number of bytes copied~~
501		~~-*/~~
502		~~-int move_bytes_to_buffer_start(buf_info_t *b, unsigned char *fromwhere, int maxbytes) {~~
503		~~- int i, tocopy;~~
504		-
505		~~- if (fromwhere >= b->end) {~~
506		~~- return(0);~~
507		~~- }~~
508		~~- else {~~
509		~~- tocopy = b->end - fromwhere;~~
510		~~- if (maxbytes && (tocopy > maxbytes)) {~~
511		~~- tocopy = maxbytes;~~
512		~~- }~~
513		~~- for (i = 0; i < tocopy; i++) {~~
514		~~- b->buffer[i] = fromwhere[i];~~
515		~~- }~~
516		~~- b->next_to_fill = b->buffer + tocopy;~~
517		~~- b->next_to_fill[0] = '\0';~~
518		~~- b->next_to_read = b->buffer;~~
519		~~- b->bytes_avail = tocopy;~~
520		~~- return(tocopy);~~
521		~~- }~~
522		-}
523		-
524		-/*
525		~~- dump the <meadiawiki> header (up through~~
526		~~- </siteinfo> close tag) found at the~~
527		~~- beginning of xml dump files.~~
528		~~- returns:~~
529		~~- 0 on success,~~
530		~~- -1 on error~~
531		~~-*/~~
532		~~-int dump_mw_header(int fin) {~~
533		~~- int res;~~
534		~~- regmatch_t *match_siteinfo;~~
535		~~- regex_t compiled_siteinfo;~~
536		~~- int length=5000; /* output buffer size */~~
537		~~- char *siteinfo = " </siteinfo>\n";~~
538		-
539		~~- buf_info_t *b;~~
540		~~- bz_info_t bfile;~~
541		-
542		~~- int firstpage = 1;~~
543		~~- int done = 0;~~
544		~~- bfile.initialized = 0;~~
545		-
546		~~- res = regcomp(&compiled_siteinfo, siteinfo, REG_EXTENDED);~~
547		-
548		~~- match_siteinfo = (regmatch_t *)malloc(sizeof(regmatch_t)*1);~~
549		-
550		~~- b = init_buffer(length);~~
551		~~- bfile.bytes_read = 0;~~
552		~~- bfile.position = 0;~~
553		-
554		~~- while ((get_buffer_of_uncompressed_data(b, fin, &bfile)>=0) && (! bfile.eof) && (!done)) {~~
555		~~- /* fixme either we don't check the return code right or we don't notice no bytes read or we don't clear the bytes read */~~
556		~~- if (bfile.bytes_read) {~~
557		~~- if (firstpage) {~~
558		~~- if (bfile.bytes_read >= 11 && !memcmp((char *)b->next_to_read,"<mediawiki ",11)) {~~
559		~~- /* good, write it and loop and not firstpage any more */~~
560		~~- if (b->bytes_avail) {~~
561		~~- if (regexec(&compiled_siteinfo, (char *)b->next_to_read, 2, match_siteinfo, 0 ) == 0) {~~
562		~~- fwrite(b->next_to_read,match_siteinfo[0].rm_eo, 1, stdout);~~
563		~~- b->next_to_read = b->end;~~
564		~~- b->bytes_avail = 0;~~
565		~~- b->next_to_fill = b->buffer; /* empty */~~
566		~~- bfile.strm.next_out = (char *)b->next_to_fill;~~
567		~~- bfile.strm.avail_out = b->end - b->next_to_fill;~~
568		~~- done++;~~
569		~~- }~~
570		~~- else {~~
571		~~- fwrite(b->next_to_read,b->bytes_avail,1,stdout);~~
572		~~- b->next_to_read = b->end;~~
573		~~- b->bytes_avail = 0;~~
574		~~- b->next_to_fill = b->buffer; /* empty */~~
575		~~- bfile.strm.next_out = (char *)b->next_to_fill;~~
576		~~- bfile.strm.avail_out = b->end - b->next_to_fill;~~
577		~~- }~~
578		~~- }~~
579		~~- }~~
580		~~- else {~~
581		~~- fprintf(stderr,"missing mediawiki header from bz2 xml file\n");~~
582		~~- return(-1);~~
583		~~- }~~
584		~~- firstpage = 0;~~
585		~~- }~~
586		~~- else { /* not firstpage */~~
587		~~- if (regexec(&compiled_siteinfo, (char *)b->next_to_read, 2, match_siteinfo, 0 ) == 0) {~~
588		~~- fwrite(b->next_to_read,match_siteinfo[0].rm_eo, 1, stdout);~~
589		~~- b->next_to_read = b->end;~~
590		~~- b->bytes_avail = 0;~~
591		~~- b->next_to_fill = b->buffer; /* empty */~~
592		~~- bfile.strm.next_out = (char *)b->next_to_fill;~~
593		~~- bfile.strm.avail_out = b->end - b->next_to_fill;~~
594		~~- done++;~~
595		~~- }~~
596		~~- else {~~
597		~~- /* could have the first part of the siteinfo tag... so copy up enough bytes to cover that case */~~
598		~~- if (b->bytes_avail> 12) {~~
599		~~- /* write everything that didn't match, but leave 12 bytes, to stdout */~~
600		~~- fwrite(b->next_to_read,b->bytes_avail - 12,1,stdout);~~
601		~~- move_bytes_to_buffer_start(b, b->next_to_read + b->bytes_avail - 12, 12);~~
602		~~- bfile.strm.next_out = (char *)b->next_to_fill;~~
603		~~- bfile.strm.avail_out = b->end - b->next_to_fill;~~
604		~~- }~~
605		~~- else {~~
606		~~- if (buffer_is_empty(b)) {~~
607		~~- bfile.strm.next_out = (char *)b->buffer;~~
608		~~- bfile.strm.avail_out = bfile.bufout_size;~~
609		~~- b->next_to_fill = b->buffer; /* empty */~~
610		~~- }~~
611		~~- else {~~
612		~~- /* there were only 12 or less bytes so just save em don't write em to stdout */~~
613		~~- move_bytes_to_buffer_start(b, b->next_to_read, b->bytes_avail);~~
614		~~- bfile.strm.next_out = (char *)b->next_to_fill;~~
615		~~- bfile.strm.avail_out = b->end - b->next_to_fill;~~
616		~~- }~~
617		~~- }~~
618		~~- }~~
619		~~- } /* end notfirstpage */~~
620		~~- }~~
621		~~- }~~
622		~~- if (!done) {~~
623		~~- fprintf(stderr,"incomplete or no mediawiki header found\n");~~
624		~~- return(-1);~~
625		~~- }~~
626		~~- else {~~
627		~~- return(0);~~
628		~~- }~~
629		-}
630		-
631		-/*
632		~~- find the first page id after position in file~~
633		~~- decompress and dump to stdout from that point on~~
634		~~- returns:~~
635		~~- 0 on success,~~
636		~~- -1 on error~~
637		~~-*/~~
638		~~-int dump_from_first_page_id_after_offset(int fin, int position) {~~
639		~~- int res;~~
640		~~- regmatch_t *match_page;~~
641		~~- regex_t compiled_page;~~
642		~~- int length=5000; /* output buffer size */~~
643		~~- char *page = " <page>";~~
644		-
645		~~- buf_info_t *b;~~
646		~~- bz_info_t bfile;~~
647		-
648		~~- int firstpage = 1;~~
649		-
650		~~- bfile.initialized = 0;~~
651		-
652		~~- res = regcomp(&compiled_page, page, REG_EXTENDED);~~
653		-
654		~~- match_page = (regmatch_t *)malloc(sizeof(regmatch_t)*1);~~
655		-
656		~~- b = init_buffer(length);~~
657		~~- bfile.bytes_read = 0;~~
658		~~- bfile.position = position;~~
659		-
660		~~- while ((get_buffer_of_uncompressed_data(b, fin, &bfile)>=0) && (! bfile.eof)) {~~
661		~~- /* fixme either we don't check the return code right or we don't notice no bytes read or we don't clear the bytes read */~~
662		~~- if (bfile.bytes_read) {~~
663		~~- if (firstpage) {~~
664		~~- if (regexec(&compiled_page, (char *)b->next_to_read, 2, match_page, 0 ) == 0) {~~
665		~~- fwrite(b->next_to_read+match_page[0].rm_so,b->next_to_fill - (b->next_to_read+match_page[0].rm_so), 1, stdout);~~
666		~~- b->next_to_read = b->end;~~
667		~~- b->bytes_avail = 0;~~
668		~~- b->next_to_fill = b->buffer; /* empty */~~
669		~~- bfile.strm.next_out = (char *)b->next_to_fill;~~
670		~~- bfile.strm.avail_out = b->end - b->next_to_fill;~~
671		~~- firstpage = 0;~~
672		~~- }~~
673		~~- else {~~
674		~~- /* could have the first part of the page tag... so copy up enough bytes to cover that case */~~
675		~~- if (b->bytes_avail> 7) {~~
676		~~- /* write everything that didn't match, but leave 7 bytes, to stdout */~~
677		~~- fwrite(b->next_to_read,b->bytes_avail - 7,1,stdout);~~
678		~~- move_bytes_to_buffer_start(b, b->next_to_read + b->bytes_avail - 7, 7);~~
679		~~- bfile.strm.next_out = (char *)b->next_to_fill;~~
680		~~- bfile.strm.avail_out = b->end - b->next_to_fill;~~
681		~~- }~~
682		~~- else {~~
683		~~- if (buffer_is_empty(b)) {~~
684		~~- bfile.strm.next_out = (char *)b->buffer;~~
685		~~- bfile.strm.avail_out = bfile.bufout_size;~~
686		~~- b->next_to_fill = b->buffer; /* empty */~~
687		~~- }~~
688		~~- else {~~
689		~~- /* there were only 7 or less bytes so just save em don't write em to stdout */~~
690		~~- move_bytes_to_buffer_start(b, b->next_to_read, b->bytes_avail);~~
691		~~- bfile.strm.next_out = (char *)b->next_to_fill;~~
692		~~- bfile.strm.avail_out = b->end - b->next_to_fill;~~
693		~~- }~~
694		~~- }~~
695		~~- }~~
696		~~- }~~
697		~~- else {~~
698		~~- if (b->bytes_avail) {~~
699		~~- fwrite(b->next_to_read,b->bytes_avail,1,stdout);~~
700		~~- b->next_to_read = b->end;~~
701		~~- b->bytes_avail = 0;~~
702		~~- b->next_to_fill = b->buffer; /* empty */~~
703		~~- bfile.strm.next_out = (char *)b->next_to_fill;~~
704		~~- bfile.strm.avail_out = b->end - b->next_to_fill;~~
705		~~- }~~
706		~~- }~~
707		~~- }~~
708		~~- }~~
709		~~- if (b->bytes_avail) {~~
710		~~- fwrite(b->next_to_read,b->bytes_avail,1,stdout);~~
711		~~- b->next_to_read = b->end;~~
712		~~- b->bytes_avail = 0;~~
713		~~- b->next_to_fill = b->buffer; /* empty */~~
714		~~- bfile.strm.next_out = (char *)b->next_to_fill;~~
715		~~- bfile.strm.avail_out = b->end - b->next_to_fill;~~
716		~~- }~~
717		~~- return(0);~~
718		-}
719		-
720		-/*
721		~~- find the first bz2 block after the specified offset,~~
722		~~- uncompress from that point on, write out the~~
723		~~- contents starting with the first <page> tag,~~
724		~~- prefacing first with the <mediawiki> header from~~
725		~~- the beginning of the file, up through </siteinfo>.~~
726		-
727		~~- note that we may lose some bytes from the very last~~
728		~~- block if the blocks are bit shifted, because the~~
729		~~- bzip crc at end of file will be wrong. (needs testing to~~
730		~~- find a workaround, simply not feeding in the crc doesn't~~
731		~~- suffice)~~
732		-
733		~~- for purposes of the XML dumps this is fine, since we use~~
734		~~- this tool to generate prefetch data starting from~~
735		~~- a given pageid, rather than needing to uncompress~~
736		~~- gigabytes of data to get to the point in the file~~
737		~~- we want.~~
738		-
739		~~- returns:~~
740		~~- BZ_OK on success, various BZ_ errors otherwise.~~
741		~~-*/~~
742		~~-int main(int argc, char **argv) {~~
743		~~- int fin, position, res;~~
744		-
745		~~- if (argc != 3) {~~
746		~~- fprintf(stderr,"usage: %s infile position\n", argv[0]);~~
747		~~- exit(-1);~~
748		~~- }~~
749		-
750		~~- fin = open (argv[1], O_RDONLY);~~
751		~~- if (fin < 0) {~~
752		~~- fprintf(stderr,"failed to open file %s for read\n", argv[1]);~~
753		~~- exit(-1);~~
754		~~- }~~
755		-
756		~~- position = atoi(argv[2]);~~
757		~~- if (position <0) {~~
758		~~- fprintf(stderr,"please specify a position >= 0.\n");~~
759		~~- fprintf(stderr,"usage: %s infile position\n", argv[0]);~~
760		~~- exit(-1);~~
761		~~- }~~
762		~~- /* input file, starting position in file, length of buffer for reading */~~
763		~~- res = dump_mw_header(fin);~~
764		-
765		~~- res = dump_from_first_page_id_after_offset(fin, position);~~
766		~~- exit(res);~~
767		-}
Index: branches/ariel/xmldumps-backup/findpageidinbz2xml.c
—	—	@@ -1,842 +0,0 @@
2		~~-#include <unistd.h>~~
3		~~-#include <stdio.h>~~
4		~~-#include <string.h>~~
5		~~-#include <sys/types.h>~~
6		~~-#include <sys/stat.h>~~
7		~~-#include <fcntl.h>~~
8		~~-#include <stdlib.h>~~
9		~~-#include <errno.h>~~
10		~~-#include <sys/types.h>~~
11		~~-#include <regex.h>~~
12		~~-#include "bzlib.h"~~
13		~~-#include "findpageidinbz2xml.h"~~
14		-
15		~~-/* return n ones either at left or right end */~~
16		~~-int bitmask(int numbits, int end) {~~
17		~~- if (end == MASKRIGHT) {~~
18		~~- return((1<<numbits)-1);~~
19		~~- }~~
20		~~- else {~~
21		~~- return(((1<<numbits)-1) << (8-numbits));~~
22		~~- }~~
23		-}
24		-
25		~~-void shift_bytes_left(unsigned char *buffer, int buflen, int numbits) {~~
26		~~- int i;~~
27		-
28		~~- if (numbits == 0) {~~
29		~~- return;~~
30		~~- }~~
31		-
32		~~- for (i=0; i<buflen; i++) {~~
33		~~- /* left 1 */~~
34		~~- buffer[i] = (unsigned char) ((int) (buffer[i]) << numbits);~~
35		-
36		~~- /* grab leftmost from next byte */~~
37		~~- if (i < buflen-1) {~~
38		~~- buffer[i] = ( unsigned char ) ( (unsigned int) buffer[i] \| ( ( ((unsigned int) buffer[i+1]) & bitmask(numbits,MASKLEFT) ) >> (8-numbits) ) );~~
39		~~- }~~
40		~~- }~~
41		-}
42		-
43		~~-void shift_bytes_right(unsigned char *buffer, int buflen, int numbits) {~~
44		~~- int i;~~
45		-
46		~~- for (i=buflen-1; i>=0; i--) {~~
47		~~- /* right 1 */~~
48		~~- buffer[i] = (unsigned char) ((int) (buffer[i]) >> numbits);~~
49		-
50		~~- /* grab rightmost from prev byte */~~
51		~~- if (i > 0) {~~
52		~~- buffer[i] = ( unsigned char ) ((unsigned int) buffer[i] \| ( ((unsigned int) (buffer[i-1])<<(8-numbits)) & bitmask(numbits,MASKLEFT)));~~
53		~~- }~~
54		~~- }~~
55		-}
56		-
57		~~-unsigned char ** init_marker() {~~
58		~~- unsigned char **marker = malloc(8*sizeof(unsigned char *));~~
59		~~- int i;~~
60		-
61		~~- /* set up block marker plus its various right-shifted incarnations */~~
62		~~- for (i = 0; i< 8; i++) {~~
63		~~- marker[i] = malloc(sizeof(unsigned char)*7);~~
64		~~- }~~
65		~~- marker[0][0]= (unsigned char) 0x31;~~
66		~~- marker[0][1]= (unsigned char) 0x41;~~
67		~~- marker[0][2]= (unsigned char) 0x59;~~
68		~~- marker[0][3]= (unsigned char) 0x26;~~
69		~~- marker[0][4]= (unsigned char) 0x53;~~
70		~~- marker[0][5]= (unsigned char) 0x59;~~
71		~~- marker[0][6]= (unsigned char) 0x00;~~
72		~~- for (i = 1; i< 8; i++) {~~
73		~~- memcpy((char *)(marker[i]), (char *)(marker[i-1]),7);~~
74		~~- shift_bytes_right(marker[i],7,1);~~
75		~~- }~~
76		~~- return(marker);~~
77		-}
78		-
79		~~-/* buff1 is some random bytes, buff2 is some random bytes which we expect to start with the contents of buff1,~~
80		~~- both buffers are bit-shifted to the right "bitsrightshifted". this function compares the two and returns 1 if buff2~~
81		~~- matches and 0 otherwise. */~~
82		~~-int bytes_compare(unsigned char *buff1, unsigned char *buff2, int numbytes, int bitsrightshifted) {~~
83		~~- int i;~~
84		-
85		~~- if (bitsrightshifted == 0) {~~
86		~~- for (i = 0; i< numbytes; i++) {~~
87		~~- if (buff1[i] != buff2[i]) {~~
88		~~- return(1);~~
89		~~- }~~
90		~~- }~~
91		~~- return(0);~~
92		~~- }~~
93		~~- else {~~
94		~~- for (i = 1; i< numbytes-2; i++) {~~
95		~~- if (buff1[i] != buff2[i]) {~~
96		~~- return(1);~~
97		~~- }~~
98		~~- }~~
99		~~- /* do leftmost byte */~~
100		~~- if ((buff1[0] & bitmask(8-bitsrightshifted,MASKRIGHT)) != (buff2[0] & bitmask(8-bitsrightshifted,MASKRIGHT)) ) {~~
101		~~- return(1);~~
102		~~- }~~
103		~~- /* do rightmost byte */~~
104		~~- if ((buff1[numbytes-1] & bitmask(bitsrightshifted,MASKLEFT)) != (buff2[numbytes-1] & bitmask(bitsrightshifted,MASKLEFT)) ) {~~
105		~~- return(1);~~
106		~~- }~~
107		~~- return(0);~~
108		~~- }~~
109		-}
110		-
111		-
112		~~-/* return -1 if no match~~
113		~~- return number of bits rightshifted otherwise */~~
114		~~-int check_buffer_for_bz2_block_marker(bz_info_t *bfile) {~~
115		~~- int result, i;~~
116		-
117		~~- result = bytes_compare(bfile->marker[0],bfile->marker_buffer+1,6,0);~~
118		~~- if (!result) {~~
119		~~- return(0);~~
120		~~- }~~
121		~~- for (i=1; i<8; i++) {~~
122		~~- result = bytes_compare(bfile->marker[i],bfile->marker_buffer,7,i);~~
123		~~- if (!result) {~~
124		~~- return(i);~~
125		~~- }~~
126		~~- }~~
127		~~- return(-1);~~
128		-}
129		-
130		-
131		~~-/* return: 1 if found, 0 if not, -1 on error */~~
132		~~-int find_next_bz2_block_marker(int fin, bz_info_t *bfile) {~~
133		~~- int result;~~
134		-
135		~~- bfile->bits_shifted = -1;~~
136		~~- result = read(fin, bfile->marker_buffer, 7);~~
137		~~- if (result == -1) {~~
138		~~- /* fprintf(stderr,"read of file failed\n"); */~~
139		~~- return(-1);~~
140		~~- }~~
141		~~- /* must be after 4 byte file header, and we add a leftmost byte to the buffer~~
142		~~- of data read in case some bits have been shifted into it */~~
143		~~- while (bfile->position <= bfile->file_size - 6 && bfile->bits_shifted < 0) {~~
144		~~- bfile->bits_shifted = check_buffer_for_bz2_block_marker(bfile);~~
145		~~- if (bfile->bits_shifted < 0) {~~
146		~~- bfile->position++;~~
147		~~- result = lseek(fin, (bfile->position), SEEK_SET);~~
148		~~- if (result == -1) {~~
149		~~- fprintf(stderr,"lseek of file to %ld failed (2)\n",(long int) bfile->position);~~
150		~~- return(-1);~~
151		~~- }~~
152		~~- result = read(fin, bfile->marker_buffer, 7);~~
153		~~- if (result < 7) {~~
154		~~- /* fprintf(stderr,"read of file failed\n"); */~~
155		~~- return(-1);~~
156		~~- }~~
157		~~- }~~
158		~~- else {~~
159		~~- bfile->block_start = bfile->position;~~
160		~~- return(1);~~
161		~~- }~~
162		~~- }~~
163		~~- return(0);~~
164		-}
165		-
166		-/*
167		~~- initializes the bz2 strm structure,~~
168		~~- calls the BZ2 decompression library initializer~~
169		-
170		~~- returns:~~
171		~~- BZ_OK on success~~
172		~~- various BZ_ errors on failure (see bzlib.h)~~
173		~~-*/~~
174		~~-int init_decompress(bz_info_t *bfile) {~~
175		~~- int bz_verbosity = 0;~~
176		~~- int bz_small = 0;~~
177		~~- int ret;~~
178		-
179		~~- bfile->strm.bzalloc = NULL;~~
180		~~- bfile->strm.bzfree = NULL;~~
181		~~- bfile->strm.opaque = NULL;~~
182		-
183		~~- ret = BZ2_bzDecompressInit ( &(bfile->strm), bz_verbosity, bz_small );~~
184		~~- if (ret != BZ_OK) {~~
185		~~- fprintf(stderr,"uncompress failed, err %d\n", ret);~~
186		~~- exit(-1);~~
187		~~- }~~
188		~~- return(ret);~~
189		-}
190		-
191		-/*
192		~~- reads the first 4 bytes from a bz2 file (should be~~
193		~~- "BZh" followed by the block size indicator, typically "9")~~
194		~~- and passes them into the BZ2 decompression library.~~
195		~~- This must be done before decompression of any block of the~~
196		~~- file is attempted.~~
197		-
198		~~- returns:~~
199		~~- BZ_OK if successful,~~
200		~~- various BZ_ errors on failure (see bzlib.h)~~
201		~~-*/~~
202		~~-int decompress_header(int fin, bz_info_t *bfile) {~~
203		~~- int ret, res;~~
204		-
205		~~- res = lseek(fin,0,SEEK_SET);~~
206		~~- if (res == -1) {~~
207		~~- fprintf(stderr,"lseek of file to 0 failed (3)\n");~~
208		~~- exit(-1);~~
209		~~- }~~
210		~~- bfile->bytes_read = read(fin, bfile->header_buffer, 4);~~
211		~~- if (bfile->bytes_read < 4) {~~
212		~~- fprintf(stderr,"failed to read 4 bytes of header, exiting\n");~~
213		~~- exit(-1);~~
214		~~- }~~
215		~~- bfile->strm.next_in = (char *)bfile->header_buffer;~~
216		~~- bfile->strm.avail_in = 4;~~
217		-
218		~~- ret = BZ2_bzDecompress ( &(bfile->strm) );~~
219		~~- if (BZ_OK != ret && BZ_STREAM_END != ret) {~~
220		~~- fprintf(stderr,"Corrupt bzip2 header, exiting\n");~~
221		~~- exit(-1);~~
222		~~- }~~
223		~~- return(ret);~~
224		-}
225		-
226		-/*
227		~~- seek to appropriate offset as specified in bfile,~~
228		~~- read compressed data into buffer indicated by bfile,~~
229		~~- update the bfile structure accordingly,~~
230		~~- save the overflow byte (bit-shifted data = suck)~~
231		~~- this is for the first buffer of data in a stream,~~
232		~~- for subsequent buffers use fill_buffer_to_decompress()~~
233		-
234		~~- this will set bfile->eof on eof. no other indicator~~
235		~~- will be provided.~~
236		-
237		~~- returns:~~
238		~~- 0 on success~~
239		~~- -1 on error~~
240		~~-*/~~
241		~~-int setup_first_buffer_to_decompress(int fin, bz_info_t *bfile) {~~
242		~~- int res;~~
243		-
244		~~- if (bfile->bits_shifted == 0) {~~
245		~~- res = lseek(fin,bfile->position+1,SEEK_SET);~~
246		~~- if (res == -1) {~~
247		~~- fprintf(stderr,"lseek of file to %ld failed (4)\n",(long int) bfile->position+1);~~
248		~~- return(-1);~~
249		~~- }~~
250		~~- }~~
251		~~- else {~~
252		~~- res = lseek(fin,bfile->position,SEEK_SET);~~
253		~~- if (res == -1) {~~
254		~~- fprintf(stderr,"lseek of file to %ld failed (5)\n",(long int) bfile->position);~~
255		~~- return(-1);~~
256		~~- }~~
257		~~- }~~
258		~~- bfile->bytes_read = read(fin, bfile->bufin, bfile->bufin_size);~~
259		~~- if (bfile->bytes_read > 0) {~~
260		~~- bfile->overflow = bfile->bufin[bfile->bytes_read-1];~~
261		~~- shift_bytes_left(bfile->bufin, bfile->bytes_read, bfile->bits_shifted);~~
262		-
263		~~- bfile->strm.next_in = (char *)(bfile->bufin);~~
264		~~- bfile->strm.avail_in = bfile->bytes_read-1;~~
265		~~- }~~
266		~~- if (bfile->bytes_read <=0) {~~
267		~~- bfile->eof++;~~
268		~~- }~~
269		~~- return(0);~~
270		-}
271		-
272		-/*
273		~~- read compressed data into buffer indicated by bfile,~~
274		~~- from current position of file,~~
275		~~- stuffing the overflow byte in first.~~
276		~~- update the bfile structure accordingly~~
277		~~- save the new overflow byte (bit-shifted data = suck)~~
278		~~- this function is for decompression of buffers *after~~
279		~~- the first one*. for the first one use~~
280		~~- setup_first_buffer_to_decompress()~~
281		-
282		~~- this will set bfile->eof on eof. no other indicator~~
283		~~- will be provided.~~
284		-
285		~~- returns:~~
286		~~- 0 on success~~
287		~~- hmm, it really does not do anything about errors :-D~~
288		~~-*/~~
289		~~-int fill_buffer_to_decompress(int fin, bz_info_t *bfile, int ret) {~~
290		~~- if (bfile->strm.avail_in == 0) {~~
291		~~- bfile->strm.next_in = (char *)(bfile->bufin);~~
292		~~- bfile->bufin[0] = bfile->overflow;~~
293		~~- bfile->bytes_read = read(fin, bfile->bufin+1, bfile->bufin_size-1);~~
294		~~- if (bfile->bytes_read > 0) {~~
295		~~- bfile->overflow = bfile->bufin[bfile->bytes_read];~~
296		~~- shift_bytes_left(bfile->bufin,bfile->bytes_read+1,bfile->bits_shifted);~~
297		~~- bfile->strm.avail_in = bfile->bytes_read;~~
298		~~- bfile->position+=bfile->bytes_read;~~
299		~~- }~~
300		~~- else {~~
301		~~- bfile->strm.avail_in = 1; /* the overflow byte */~~
302		~~- bfile->eof++;~~
303		~~- }~~
304		~~- }~~
305		~~- return(0);~~
306		-}
307		-
308		~~-/* size of buffer is bytes usable. there will be a null byte at the end~~
309		-
310		~~- what we do with the buffer:~~
311		~~- - read from front of buffer to end,~~
312		~~- - fill from point where prev read did not fill buffer, or from where~~
313		~~- move of data at end of buffer to beginning left room,~~
314		~~- - mark a string of bytes (starting from what's available to read) as "read"~~
315		-
316		~~-*/~~
317		~~-buf_info_t *init_buffer(int size) {~~
318		~~- buf_info_t *b;~~
319		-
320		~~- b = (buf_info_t *)malloc(sizeof(buf_info_t));~~
321		~~- b->buffer = malloc(sizeof(unsigned char)*(size+1));~~
322		~~- b->buffer[size]='\0';~~
323		~~- b->end = b->buffer + size;~~
324		~~- b->next_to_read = b->end; /* nothing available */~~
325		~~- b->bytes_avail = 0; /* bytes to read, nothing available */~~
326		~~- b->next_to_fill = b->buffer; /* empty */~~
327		~~- b->next_to_fill[0] = '\0';~~
328		~~- return(b);~~
329		-}
330		-
331		~~-/* check if buffer (used for decompressed data output) is empty,~~
332		~~- returns 1 if so and 0 if not */~~
333		~~-int buffer_is_empty(buf_info_t *b) {~~
334		~~- if (b->bytes_avail == 0) {~~
335		~~- return(1);~~
336		~~- }~~
337		~~- else {~~
338		~~- return(0);~~
339		~~- }~~
340		-}
341		-
342		~~-/* check if buffer (used for decompressed data output) is full,~~
343		-
344		~~- returns 1 if so and 0 if not~~
345		~~- I'm not liking this function so well, fixme */~~
346		~~-int buffer_is_full(buf_info_t *b) {~~
347		~~- if (b->next_to_fill == b->end) {~~
348		~~- return(1);~~
349		~~- }~~
350		~~- else {~~
351		~~- return(0);~~
352		~~- }~~
353		-}
354		-
355		~~-/* FIXME do this right. whatever. */~~
356		~~-int get_file_size(int fin) {~~
357		~~- int res;~~
358		-
359		~~- res = lseek(fin, 0, SEEK_END);~~
360		~~- if (res == -1) {~~
361		~~- fprintf(stderr,"lseek of file to 0 failed (6)\n");~~
362		~~- exit(-1);~~
363		~~- }~~
364		~~- return(res);~~
365		-}
366		-
367		-
368		-/*
369		~~- look for the first bz2 block in the file after specified offset~~
370		~~- it tests that the block is valid by doing partial decompression.~~
371		~~- this function will update the bfile structure:~~
372		~~- bfile->position will contain the current position of the file (? will it?)~~
373		~~- bfile->bits_shifted will contain the number of bits that the block is rightshifted~~
374		~~- bfile->block_start will contain the offset from start of file to the block~~
375		~~- returns:~~
376		~~- position of next byte in file to be read, on success~~
377		~~- -1 if no marker or other error~~
378		~~-*/~~
379		~~-int find_first_bz2_block_after_offset(bz_info_t *bfile, int fin, int position) {~~
380		~~- int res;~~
381		-
382		~~- bfile->bufin_size = BUFINSIZE;~~
383		~~- bfile->marker = init_marker();~~
384		~~- bfile->position = position;~~
385		~~- bfile->block_start = -1;~~
386		~~- bfile->bytes_read = 0;~~
387		~~- bfile->bytes_written = 0;~~
388		~~- bfile->eof = 0;~~
389		~~- bfile->bits_shifted = -1;~~
390		-
391		~~- bfile->file_size = get_file_size(fin);~~
392		-
393		~~- while (bfile->bits_shifted < 0) {~~
394		~~- if (bfile->position > bfile->file_size) {~~
395		~~- return(-1);~~
396		~~- }~~
397		~~- res = lseek(fin, bfile->position, SEEK_SET);~~
398		~~- if (res == -1) {~~
399		~~- fprintf(stderr,"lseek of file to %ld failed (7)\n",(long int) bfile->position);~~
400		~~- exit(-1);~~
401		~~- }~~
402		~~- res = find_next_bz2_block_marker(fin, bfile);~~
403		~~- if (res == 1) {~~
404		~~- init_decompress(bfile);~~
405		~~- decompress_header(fin, bfile);~~
406		~~- res = setup_first_buffer_to_decompress(fin, bfile);~~
407		~~- if (res == -1) {~~
408		~~- fprintf(stderr,"couldn't get first buffer of data to uncompress\n");~~
409		~~- exit(-1);~~
410		~~- }~~
411		~~- bfile->strm.next_out = (char *)bfile->bufout;~~
412		~~- bfile->strm.avail_out = bfile->bufout_size;~~
413		~~- res = BZ2_bzDecompress ( &(bfile->strm) );~~
414		~~- /* this means we (probably) have a genuine marker */~~
415		~~- if (BZ_OK == res \|\| BZ_STREAM_END == res) {~~
416		~~- res = BZ2_bzDecompressEnd ( &(bfile->strm) );~~
417		~~- bfile->bytes_read = 0;~~
418		~~- bfile->bytes_written = 0;~~
419		~~- bfile->eof = 0;~~
420		~~- /* leave the file at the right position */~~
421		~~- res = lseek(fin, bfile->block_start, SEEK_SET);~~
422		~~- if (res == -1) {~~
423		~~- fprintf(stderr,"lseek of file to %ld failed (7)\n",(long int) bfile->position);~~
424		~~- exit(-1);~~
425		~~- }~~
426		~~- return(0);~~
427		~~- }~~
428		~~- /* right bytes, but there by chance, skip and try again */~~
429		~~- else {~~
430		~~- bfile->position+=6;~~
431		~~- bfile->bits_shifted = -1;~~
432		~~- bfile->block_start = -1;~~
433		~~- }~~
434		~~- }~~
435		~~- else {~~
436		~~- return(-1);~~
437		~~- }~~
438		~~- }~~
439		~~- return(-1);~~
440		-}
441		-
442		-/*
443		~~- find the first bz2 block marker in the file,~~
444		~~- from its current position,~~
445		~~- then set up for decompression from that point~~
446		~~- returns:~~
447		~~- 0 on success~~
448		~~- -1 if no marker or other error~~
449		~~-*/~~
450		~~-int init_bz2_file(bz_info_t *bfile, int fin) {~~
451		~~- int res;~~
452		-
453		~~- bfile->initialized++;~~
454		-
455		~~- res = find_next_bz2_block_marker(fin, bfile);~~
456		~~- if (res ==1) {~~
457		~~- init_decompress(bfile);~~
458		~~- decompress_header(fin, bfile);~~
459		~~- setup_first_buffer_to_decompress(fin, bfile);~~
460		~~- return(0);~~
461		~~- }~~
462		~~- return(-1);~~
463		-}
464		-
465		~~-/* return -1 if error */~~
466		~~-int decompress_data(bz_info_t \*bfile, int fin, unsigned char \*bufferout, int bufout_size) {~~
467		~~- int ret;~~
468		-
469		~~- bfile->bufout = bufferout;~~
470		~~- bfile->bufout_size = bufout_size;~~
471		~~- bfile->bytes_written = 0;~~
472		-
473		~~- if (! bfile->initialized) {~~
474		~~- if (init_bz2_file(bfile, fin) == -1) {~~
475		~~- /* fprintf(stderr,"failed to find block in bz2file (2)\n"); */~~
476		~~- return(-1);~~
477		~~- };~~
478		~~- bfile->strm.next_out = (char *)bfile->bufout;~~
479		~~- bfile->strm.avail_out = bfile->bufout_size;~~
480		~~- }~~
481		-
482		~~- ret = BZ_OK;~~
483		~~- while (BZ_OK == ret && bfile->bytes_written == 0) {~~
484		~~- ret = BZ2_bzDecompress ( &(bfile->strm) );~~
485		~~- if (BZ_OK == ret \|\| BZ_STREAM_END == ret) {~~
486		~~- bfile->bytes_written = (unsigned char *)(bfile->strm.next_out) - bfile->bufout;~~
487		~~- }~~
488		~~- else {~~
489		~~- /* fprintf(stderr,"error from BZ decompress %d\n",ret); */~~
490		~~- return(-1);~~
491		~~- }~~
492		~~- fill_buffer_to_decompress(fin, bfile, ret);~~
493		- /*
494		~~- if (bfile->eof && (BZ_OK == ret \|\| BZ_STREAM_END == ret) ) {~~
495		~~- fprintf(stderr,"eof reached\n");~~
496		~~- }~~
497		~~- */~~
498		~~- }~~
499		~~- return(0);~~
500		-}
501		-
502		-
503		-/*
504		~~- fill output buffer in b with uncompressed data from bfile~~
505		~~- if this is the first call to the function for this file,~~
506		~~- the file header will be read, and the first buffer of~~
507		~~- uncompressed data will be prepared. bfile->position~~
508		~~- should be set to the offset (from the beginning of file) from~~
509		~~- which to find the first bz2 block.~~
510		-
511		~~- returns:~~
512		~~- on success, number of bytes read (may be 0)~~
513		~~- -1 on error~~
514		~~-*/~~
515		~~-int get_buffer_of_uncompressed_data(buf_info_t \*b, int fin, bz_info_t \*bfile) {~~
516		~~- int res;~~
517		-
518		~~- if (buffer_is_full(b)) {~~
519		~~- return(0);~~
520		~~- }~~
521		-
522		~~- if (buffer_is_empty(b)) {~~
523		~~- b->next_to_fill = b->buffer;~~
524		~~- }~~
525		-
526		~~- res = decompress_data(bfile, fin, b->next_to_fill, b->end - b->next_to_fill);~~
527		~~- if (res == -1) {~~
528		~~- return(res);~~
529		~~- }~~
530		~~- if (bfile->bytes_written < 0) {~~
531		~~- /* fprintf(stderr,"read of file failed\n"); */~~
532		~~- return(-1);~~
533		~~- }~~
534		~~- else {~~
535		~~- /* really?? FIXME check this */~~
536		~~- if (buffer_is_empty(b)) {~~
537		~~- b->next_to_read = b->next_to_fill; /* where we just read */~~
538		~~- }~~
539		~~- b->bytes_avail += bfile->bytes_written;~~
540		~~- b->next_to_fill += bfile->bytes_written;~~
541		~~- b->next_to_fill[0] = '\0';~~
542		~~- return(0);~~
543		~~- }~~
544		-}
545		-
546		~~-void dumpbuf_info_t(buf_info_t *b) {~~
547		~~- fprintf(stdout, "\n");~~
548		~~- fprintf(stdout, "b->buffer: %ld\n", (long int) b->buffer);~~
549		~~- fprintf(stdout, "b->end: %ld\n", (long int) b->end);~~
550		~~- fprintf(stdout, "b->next_to_read: %ld\n", (long int) b->next_to_read);~~
551		~~- fprintf(stdout, "b->next_to_fill: %ld\n", (long int) b->next_to_fill);~~
552		~~- fprintf(stdout, "b->bytes_avail: %ld\n", (long int) b->bytes_avail);~~
553		-}
554		-
555		-
556		-/*
557		~~- copy text from end of buffer to the beginning, that we want to keep~~
558		~~- around for further processing (i.e. further regex matches)~~
559		~~- returns number of bytes copied~~
560		~~-*/~~
561		~~-int move_bytes_to_buffer_start(buf_info_t \*b, unsigned char \*from_where, int maxbytes) {~~
562		~~- int i, tocopy;~~
563		-
564		~~- if (from_where >= b->end) {~~
565		~~- return(0);~~
566		~~- }~~
567		~~- else {~~
568		~~- tocopy = b->end - from_where;~~
569		~~- if (maxbytes && (tocopy > maxbytes)) {~~
570		~~- tocopy = maxbytes;~~
571		~~- }~~
572		~~- for (i = 0; i < tocopy; i++) {~~
573		~~- b->buffer[i] = from_where[i];~~
574		~~- }~~
575		~~- b->next_to_fill = b->buffer + tocopy;~~
576		~~- b->next_to_fill[0] = '\0';~~
577		~~- b->next_to_read = b->buffer;~~
578		~~- b->bytes_avail = tocopy;~~
579		~~- return(tocopy);~~
580		~~- }~~
581		-}
582		-
583		-/*
584		~~- get the first page id after position in file~~
585		~~- if a pageid is found, the structure pinfo will be updated accordingly~~
586		~~- returns:~~
587		~~- 1 if a pageid found,~~
588		~~- 0 if no pageid found,~~
589		~~- -1 on error~~
590		~~-*/~~
591		~~-int get_first_page_id_after_offset(int fin, int position, page_info_t *pinfo) {~~
592		~~- int res;~~
593		~~- regmatch_t \*match_page, \*match_page_id;~~
594		~~- regex_t compiled_page, compiled_page_id;~~
595		~~- int length=5000; /* output buffer size */~~
596		~~- char *page = "<page>";~~
597		~~- char *page_id = "<page>\n[ ]+<title>[^<]+</title>\n[ ]+<id>([0-9]+)</id>\n";~~
598		-
599		~~- buf_info_t *b;~~
600		~~- bz_info_t bfile;~~
601		-
602		~~- bfile.initialized = 0;~~
603		-
604		~~- res = regcomp(&compiled_page, page, REG_EXTENDED);~~
605		~~- res = regcomp(&compiled_page_id, page_id, REG_EXTENDED);~~
606		-
607		~~- match_page = (regmatch_t \*)malloc(sizeof(regmatch_t)\*1);~~
608		~~- match_page_id = (regmatch_t \*)malloc(sizeof(regmatch_t)\*2);~~
609		-
610		~~- b = init_buffer(length);~~
611		-
612		~~- pinfo->bits_shifted = -1;~~
613		~~- pinfo->position = -1;~~
614		~~- pinfo->page_id = -1;~~
615		-
616		~~- bfile.bytes_read = 0;~~
617		-
618		~~- if (find_first_bz2_block_after_offset(&bfile, fin, position) == -1) {~~
619		~~- /* fprintf(stderr,"failed to find block in bz2file (1)\n"); */~~
620		~~- return(-1);~~
621		~~- }~~
622		-
623		~~- while (!get_buffer_of_uncompressed_data(b, fin, &bfile) && (! bfile.eof)) {~~
624		~~- if (bfile.bytes_read) {~~
625		~~- while (regexec(&compiled_page_id, (char *)b->next_to_read, 2, match_page_id, 0 ) == 0) {~~
626		~~- if (match_page_id[1].rm_so >=0) {~~
627		~~- /* write page_id to stderr */~~
628		- /*
629		~~- fwrite(b->next_to_read+match_page_id[1].rm_so, sizeof(unsigned char), match_page_id[1].rm_eo - match_page_id[1].rm_so, stderr);~~
630		~~- fwrite("\n",1,1,stderr);~~
631		~~- */~~
632		~~- pinfo->page_id = atoi((char *)(b->next_to_read+match_page_id[1].rm_so));~~
633		~~- pinfo->position = bfile.block_start;~~
634		~~- pinfo->bits_shifted = bfile.bits_shifted;~~
635		~~- return(1);~~
636		~~- /* write up to and including page id tag to stdout */~~
637		- /*
638		~~- fwrite(b->next_to_read,match_page_id[0].rm_eo,1,stdout);~~
639		~~- b->next_to_read = b->next_to_read+match_page_id[0].rm_eo;~~
640		~~- b->bytes_avail -= match_page_id[0].rm_eo;~~
641		~~- */~~
642		~~- }~~
643		~~- else {~~
644		~~- /* should never happen */~~
645		~~- fprintf(stderr,"regex gone bad...\n");~~
646		~~- exit(-1);~~
647		~~- }~~
648		~~- }~~
649		~~- if (regexec(&compiled_page, (char *)b->next_to_read, 1, match_page, 0 ) == 0) {~~
650		~~- /* write everything up to but not including the page tag to stdout */~~
651		- /*
652		~~- fwrite(b->next_to_read,match_page[0].rm_eo - 6,1,stdout);~~
653		~~- */~~
654		~~- move_bytes_to_buffer_start(b, b->next_to_read + match_page[0].rm_so, b->bytes_avail - match_page[0].rm_so);~~
655		~~- bfile.strm.next_out = (char *)b->next_to_fill;~~
656		~~- bfile.strm.avail_out = b->end - b->next_to_fill;~~
657		~~- }~~
658		~~- else {~~
659		~~- /* could have the first part of the page tag... so copy up enough bytes to cover that case */~~
660		~~- if (b->bytes_avail> 5) {~~
661		~~- /* write everything that didn't match, but leave 5 bytes, to stdout */~~
662		- /*
663		~~- fwrite(b->next_to_read,b->bytes_avail - 5,1,stdout);~~
664		~~- */~~
665		~~- move_bytes_to_buffer_start(b, b->next_to_read + b->bytes_avail - 5, 5);~~
666		~~- bfile.strm.next_out = (char *)b->next_to_fill;~~
667		~~- bfile.strm.avail_out = b->end - b->next_to_fill;~~
668		~~- }~~
669		~~- else {~~
670		~~- if (buffer_is_empty(b)) {~~
671		~~- bfile.strm.next_out = (char *)b->buffer;~~
672		~~- bfile.strm.avail_out = bfile.bufout_size;~~
673		~~- b->next_to_fill = b->buffer; /* empty */~~
674		~~- }~~
675		~~- else {~~
676		~~- /* there were only 5 or less bytes so just save em don't write em to stdout */~~
677		~~- move_bytes_to_buffer_start(b, b->next_to_read, b->bytes_avail);~~
678		~~- bfile.strm.next_out = (char *)b->next_to_fill;~~
679		~~- bfile.strm.avail_out = b->end - b->next_to_fill;~~
680		~~- }~~
681		~~- }~~
682		~~- }~~
683		~~- }~~
684		~~- }~~
685		- /*
686		~~- if (b->bytes_avail) {~~
687		~~- fwrite(b->next_to_read,b->bytes_avail,1,stdout);~~
688		~~- }~~
689		~~- */~~
690		~~- return(0);~~
691		-}
692		-
693		~~-/* search for pageid in a bz2 file, given start and end offsets~~
694		~~- to search for~~
695		~~- we guess by the most boring method possible (shrink the~~
696		~~- interval according to the value found on the last guess,~~
697		~~- try midpoint of the new interval)~~
698		~~- multiple calls of this will get the job done.~~
699		~~- interval has left end = right end if search is complete.~~
700		~~- this function may return the previous guess and simply~~
701		~~- shrink the interval.~~
702		~~- note that a "match" means either that the pageid we find~~
703		~~- is smaller than the one the caller wants, or is equal.~~
704		~~- why? because then we can use the output for prefetch~~
705		~~- for xml dumps and be sure a specific page range is covered :-P~~
706		-
707		~~- return value from guess, or -1 on error.~~
708		~~- */~~
709		~~-int do_iteration(iter_info_t \*iinfo, int fin, page_info_t \*pinfo) {~~
710		~~- int res;~~
711		~~- int new_position;~~
712		~~- int interval;~~
713		-
714		- /*
715		~~- last_position is somewhere in the interval, perhaps at an end~~
716		~~- last_value is the value we had at that position~~
717		~~- */~~
718		-
719		~~- interval = (iinfo->right_end - iinfo->left_end)/2;~~
720		~~- if (interval == 0) {~~
721		~~- interval = 1;~~
722		~~- }~~
723		~~- /* fprintf(stderr,"interval size is %ld, left end %ld, right end %ld, last val %d\n",interval, iinfo->left_end, iinfo->right_end, iinfo->last_value); */~~
724		~~- /* if we're this close, we'll check this value and be done with it */~~
725		~~- if (iinfo->right_end -iinfo->left_end < 2) {~~
726		~~- new_position = iinfo->left_end;~~
727		~~- iinfo->right_end = iinfo->left_end;~~
728		~~- }~~
729		~~- else {~~
730		~~- if (iinfo->last_value < iinfo->value_wanted) {~~
731		~~- /* fprintf(stderr,"resetting left end\n"); */~~
732		~~- iinfo->left_end = iinfo->last_position;~~
733		~~- new_position = iinfo->last_position + interval;~~
734		~~- }~~
735		~~- /* iinfo->last_value > iinfo->value_wanted */~~
736		~~- else {~~
737		~~- /* fprintf(stderr,"resetting right end\n"); */~~
738		~~- iinfo->right_end = iinfo->last_position;~~
739		~~- new_position = iinfo->last_position - interval;~~
740		~~- }~~
741		~~- }~~
742		~~- res = get_first_page_id_after_offset(fin, new_position, pinfo);~~
743		~~- if (res >0) {~~
744		~~- /* caller wants the new value */~~
745		~~- iinfo->last_value = pinfo->page_id;~~
746		~~- iinfo->last_position = new_position;~~
747		~~- return(pinfo->page_id);~~
748		~~- }~~
749		~~- else {~~
750		~~- /* here is the tough case, if we didn't find anything then we are prolly too close to the end, truncation or~~
751		~~- there's just no block here.~~
752		~~- set the right end, keep the last value and position and let the caller retry with the new interval */~~
753		~~- if (iinfo->last_value < iinfo->value_wanted) { /* we were moving towards eof */~~
754		~~- iinfo->right_end = new_position;~~
755		~~- return(iinfo->last_value);~~
756		~~- }~~
757		~~- /* in theory we were moving towards beginning of file, should not have issues, so bail here */~~
758		~~- else {~~
759		~~- /* fprintf(stderr,"something very broken, giving up\n"); */~~
760		~~- return(-1);~~
761		~~- }~~
762		~~- }~~
763		-}
764		-
765		-/*
766		~~- given a bzipped and possibly truncated file, and a page id,~~
767		~~- hunt for the page id in the file; this assume that the~~
768		~~- bz2 header is intact and that page ids are steadily increasing~~
769		~~- throughout the file.~~
770		-
771		~~- writes the offset of the relevant block (from beginning of file)~~
772		~~- and the first pageid found in that block, to stdout~~
773		-
774		~~- format of output:~~
775		~~- position:xxxxx pageid:nnn~~
776		-
777		~~- returns: 0 on success, -1 on error~~
778		~~-*/~~
779		~~-int main(int argc, char **argv) {~~
780		~~- int fin, position, res, interval, page_id, oldmarker, file_size;~~
781		~~- page_info_t pinfo;~~
782		~~- iter_info_t iinfo;~~
783		-
784		~~- if (argc != 3) {~~
785		~~- fprintf(stderr,"usage: %s infile id\n", argv[0]);~~
786		~~- exit(-1);~~
787		~~- }~~
788		-
789		~~- fin = open (argv[1], O_RDONLY);~~
790		~~- if (fin < 0) {~~
791		~~- fprintf(stderr,"failed to open file %s for read\n", argv[1]);~~
792		~~- exit(-1);~~
793		~~- }~~
794		-
795		~~- page_id = atoi(argv[2]);~~
796		~~- if (page_id <1) {~~
797		~~- fprintf(stderr,"please specify a page_id >= 1.\n");~~
798		~~- fprintf(stderr,"usage: %s infile page_id\n", argv[0]);~~
799		~~- exit(-1);~~
800		~~- }~~
801		-
802		~~- file_size = get_file_size(fin);~~
803		-
804		~~- interval = file_size;~~
805		~~- position = 0;~~
806		~~- oldmarker = -1;~~
807		~~- pinfo.bits_shifted = -1;~~
808		~~- pinfo.position = -1;~~
809		~~- pinfo.page_id = -1;~~
810		-
811		~~- iinfo.left_end = 0;~~
812		~~- file_size = get_file_size(fin);~~
813		~~- iinfo.right_end = file_size;~~
814		~~- iinfo.value_wanted = page_id;~~
815		-
816		~~- res = get_first_page_id_after_offset(fin, 0, &pinfo);~~
817		~~- if (res > 0) {~~
818		~~- iinfo.last_value = pinfo.page_id;~~
819		~~- iinfo.last_position = 0;~~
820		~~- }~~
821		~~- else {~~
822		~~- fprintf(stderr,"failed to get anything useful from the beginning of the file even, bailing.\n");~~
823		~~- exit(1);~~
824		~~- }~~
825		~~- if (pinfo.page_id == page_id) {~~
826		~~- fprintf(stdout,"position:%d page_id:%d\n",pinfo.position, pinfo.page_id);~~
827		~~- exit(0);~~
828		~~- }~~
829		-
830		~~- while (1) {~~
831		~~- res = do_iteration(&iinfo, fin, &pinfo);~~
832		~~- /* things to check: bad return? interval is 0 bytes long? */~~
833		~~- if (iinfo.left_end == iinfo.right_end) {~~
834		~~- fprintf(stdout,"position:%d page_id:%d\n",pinfo.position, pinfo.page_id);~~
835		~~- exit(0);~~
836		~~- }~~
837		~~- else if (res < 0) {~~
838		~~- fprintf(stderr,"broken and quitting\n");~~
839		~~- exit(-1);~~
840		~~- }~~
841		~~- }~~
842		~~- exit(0);~~
843		-}
Index: branches/ariel/xmldumps-backup/checkforbz2footer.c
—	—	@@ -1,156 +0,0 @@
2		~~-#include <unistd.h>~~
3		~~-#include <stdio.h>~~
4		~~-#include <string.h>~~
5		~~-#include <sys/types.h>~~
6		~~-#include <sys/stat.h>~~
7		~~-#include <fcntl.h>~~
8		~~-#include <stdlib.h>~~
9		~~-#include <errno.h>~~
10		-
11		-/*
12		~~- Check to see whether a file ends with a bz2 footer or not~~
13		~~- (i.e. if it is truncated or corrupted).~~
14		~~- This is a crude but fast test for integrity; we don't~~
15		~~- check the CRC at the end of fthe stream, nor do we check the~~
16		~~- bit padding in the last byte of the file.~~
17		-
18		~~- Arguments: the name of the file to check, presumably~~
19		~~- a bzipped file.~~
20		~~- Outputs: none.~~
21		~~- Exits with 0 if the file contains the footer at the end,~~
22		~~- 1 if the file does not contain the footer, and -1 on error.~~
23		~~-*/~~
24		-
25		-
26		~~-int read_footer(unsigned char *buffer, int fin) {~~
27		~~- int res;~~
28		-
29		~~- res = lseek(fin, -11, SEEK_END);~~
30		~~- if (res == -1) {~~
31		~~- fprintf(stderr,"lseek of file failed\n");~~
32		~~- exit(-1);~~
33		~~- }~~
34		~~- res = read(fin, buffer, 11);~~
35		~~- if (res == -1) {~~
36		~~- fprintf(stderr,"read of file failed\n");~~
37		~~- exit(-1);~~
38		~~- }~~
39		~~- return(0);~~
40		-}
41		-
42		~~-#define LEFT 0~~
43		~~-#define RIGHT 1~~
44		-
45		~~-/* return n ones either at left or right end */~~
46		~~-int bitmask(int numbits, int end) {~~
47		~~- if (end == RIGHT) {~~
48		~~- return((1<<numbits)-1);~~
49		~~- }~~
50		~~- else {~~
51		~~- return(((1<<numbits)-1) << (8-numbits));~~
52		~~- }~~
53		-}
54		-
55		~~-void shiftbytesright(unsigned char *buffer, int buflen, int numbits) {~~
56		~~- int i;~~
57		-
58		~~- for (i=buflen-1; i>=0; i--) {~~
59		~~- /* right 1 */~~
60		~~- buffer[i] = (unsigned char) ((int) (buffer[i]) >> numbits);~~
61		-
62		~~- /* grab rightmost from prev byte */~~
63		~~- if (i > 0) {~~
64		~~- buffer[i] = ( unsigned char ) ((unsigned int) buffer[i] \| ( ((unsigned int) (buffer[i-1])<<(8-numbits)) & bitmask(1,LEFT)));~~
65		~~- }~~
66		~~- }~~
67		-}
68		-
69		~~-/* buff1 is some random bytes, buff2 is some random bytes which we expect to start with the contents of buff1,~~
70		~~- both buffers are bit-shifted to the right "bitsrightshifted". this function compares the two and returns 1 if buff2~~
71		~~- matches and 0 otherwise. */~~
72		~~-int bytescompare(unsigned char \*buff1, unsigned char \*buff2, int numbytes, int bitsrightshifted) {~~
73		~~- int i;~~
74		-
75		~~- if (bitsrightshifted == 0) {~~
76		~~- for (i = 0; i< numbytes; i++) {~~
77		~~- if (buff1[i] != buff2[i]) {~~
78		~~- return(1);~~
79		~~- }~~
80		~~- }~~
81		~~- return(0);~~
82		~~- }~~
83		~~- else {~~
84		~~- for (i = 1; i< numbytes-2; i++) {~~
85		~~- if (buff1[i] != buff2[i]) {~~
86		~~- return(1);~~
87		~~- }~~
88		~~- }~~
89		~~- /* do leftmost byte */~~
90		~~- if ((buff1[0] & bitmask(8-bitsrightshifted,RIGHT)) != (buff2[0] & bitmask(8-bitsrightshifted,RIGHT)) ) {~~
91		~~- return(1);~~
92		~~- }~~
93		~~- /* do rightmost byte */~~
94		~~- if ((buff1[numbytes-1] & bitmask(bitsrightshifted,LEFT)) != (buff2[numbytes-1] & bitmask(bitsrightshifted,LEFT)) ) {~~
95		~~- return(1);~~
96		~~- }~~
97		~~- return(0);~~
98		~~- }~~
99		-}
100		-
101		~~-int checkfileforfooter(int fin) {~~
102		~~- unsigned char buffer[11];~~
103		~~- int result, i;~~
104		~~- unsigned char \*\*footer = malloc(8\*sizeof(unsigned char \*));~~
105		-
106		~~- /* set up footer plus its various right-shifted incarnations */~~
107		~~- /* dude why couldn't you have 0 padded each bzip2 block? seriously ... */~~
108		~~- for (i = 0; i< 8; i++) {~~
109		~~- footer[i] = malloc(sizeof(unsigned char)*7);~~
110		~~- }~~
111		~~- footer[0][0]= (unsigned char) 0x17;~~
112		~~- footer[0][1]= (unsigned char) 0x72;~~
113		~~- footer[0][2]= (unsigned char) 0x45;~~
114		~~- footer[0][3]= (unsigned char) 0x38;~~
115		~~- footer[0][4]= (unsigned char) 0x50;~~
116		~~- footer[0][5]= (unsigned char) 0x90;~~
117		~~- footer[0][6]= (unsigned char) 0x00;~~
118		~~- for (i = 1; i< 8; i++) {~~
119		~~- memcpy((char \*)(footer[i]), (char \*)(footer[i-1]),7);~~
120		~~- shiftbytesright(footer[i],7,1);~~
121		~~- }~~
122		-
123		~~- read_footer(buffer,fin);~~
124		-
125		~~- result = bytescompare(footer[0],buffer+1,6,0);~~
126		~~- if (!result) {~~
127		~~- return(0);~~
128		~~- }~~
129		-
130		~~- for (i=1; i<8; i++) {~~
131		~~- result = bytescompare(footer[i],buffer,7,i);~~
132		~~- if (!result) {~~
133		~~- return(0);~~
134		~~- }~~
135		~~- }~~
136		~~- return(1);~~
137		-}
138		-
139		~~-int main(int argc, char **argv) {~~
140		-
141		~~- int fin;~~
142		~~- int result;~~
143		-
144		~~- if (argc != 2) {~~
145		~~- fprintf(stderr,"usage: %s infile\n", argv[0]);~~
146		~~- exit(-1);~~
147		~~- }~~
148		~~- fin = open (argv[1], O_RDONLY);~~
149		~~- if (fin < 0) {~~
150		~~- fprintf(stderr,"failed to open file %s for read\n", argv[1]);~~
151		~~- exit(-1);~~
152		~~- }~~
153		~~- result = checkfileforfooter(fin);~~
154		~~- close(fin);~~
155		~~- exit(result);~~
156		-}
157		-
Index: branches/ariel/xmldumps-backup/dumplastbz2block.c
—	—	@@ -1,463 +0,0 @@
2		~~-#include <unistd.h>~~
3		~~-#include <stdio.h>~~
4		~~-#include <string.h>~~
5		~~-#include <sys/types.h>~~
6		~~-#include <sys/stat.h>~~
7		~~-#include <fcntl.h>~~
8		~~-#include <stdlib.h>~~
9		~~-#include <errno.h>~~
10		~~-#include "bzlib.h"~~
11		-
12		-/*
13		~~- Find the last bz2 block marker in a file~~
14		~~- and dump whatever can be decompressed after~~
15		~~- that point. The header of the file must~~
16		~~- be intact in order for any output to be produced.~~
17		~~- This will produce output for truncated files as well,~~
18		~~- as long as there is "enough" data after the block~~
19		~~- marker.~~
20		-
21		~~- Arguments: the name of the file to check, presumably~~
22		~~- a bzipped file.~~
23		~~- Outputs: the decompressed data at the end of the file.~~
24		~~- Exits with 0 if decompression of some data can be done,~~
25		~~- 1 if decompression fails, and -1 on error.~~
26		~~-*/~~
27		-
28		~~-#define BUFSIZE 121072~~
29		~~-typedef struct {~~
30		~~- unsigned char bufin[BUFSIZE];~~
31		~~- unsigned char bufout[BUFSIZE];~~
32		~~- int bufsize;~~
33		~~- bz_stream strm;~~
34		~~- unsigned char overflow;~~
35		~~- int bitsshifted;~~
36		~~- int position;~~
37		~~-} bzinfo;~~
38		-
39		~~-int read_footer(unsigned char *buffer, int fin) {~~
40		~~- int res;~~
41		-
42		~~- res = lseek(fin, -11, SEEK_END);~~
43		~~- if (res == -1) {~~
44		~~- fprintf(stderr,"lseek of file failed\n");~~
45		~~- exit(-1);~~
46		~~- }~~
47		~~- res = read(fin, buffer, 11);~~
48		~~- if (res == -1) {~~
49		~~- fprintf(stderr,"read of file failed\n");~~
50		~~- exit(-1);~~
51		~~- }~~
52		~~- return(0);~~
53		-}
54		-
55		~~-#define LEFT 0~~
56		~~-#define RIGHT 1~~
57		-
58		~~-/* return n ones either at left or right end */~~
59		~~-int bitmask(int numbits, int end) {~~
60		~~- if (end == RIGHT) {~~
61		~~- return((1<<numbits)-1);~~
62		~~- }~~
63		~~- else {~~
64		~~- return(((1<<numbits)-1) << (8-numbits));~~
65		~~- }~~
66		-}
67		-
68		~~-void shiftbytesleft(unsigned char *buffer, int buflen, int numbits) {~~
69		~~- int i;~~
70		-
71		~~- if (numbits == 0) {~~
72		~~- return;~~
73		~~- }~~
74		-
75		~~- for (i=0; i<buflen; i++) {~~
76		~~- /* left 1 */~~
77		~~- buffer[i] = (unsigned char) ((int) (buffer[i]) << numbits);~~
78		-
79		~~- /* grab leftmost from next byte */~~
80		~~- if (i < buflen-1) {~~
81		~~- buffer[i] = ( unsigned char ) ( (unsigned int) buffer[i] | ( ( ((unsigned int) buffer[i+1]) & bitmask(numbits,LEFT) ) >> (8-numbits) ) );~~
82		~~- }~~
83		~~- }~~
84		-}
85		-
86		-
87		~~-void shiftbytesright(unsigned char *buffer, int buflen, int numbits) {~~
88		~~- int i;~~
89		-
90		~~- for (i=buflen-1; i>=0; i--) {~~
91		~~- /* right 1 */~~
92		~~- buffer[i] = (unsigned char) ((int) (buffer[i]) >> numbits);~~
93		-
94		~~- /* grab rightmost from prev byte */~~
95		~~- if (i > 0) {~~
96		~~- buffer[i] = ( unsigned char ) ((unsigned int) buffer[i] | ( ((unsigned int) (buffer[i-1])<<(8-numbits)) & bitmask(numbits,LEFT)));~~
97		~~- }~~
98		~~- }~~
99		-}
100		-
101		~~-unsigned char ** init_marker() {~~
102		~~- unsigned char **marker = malloc(8*sizeof(unsigned char *));~~
103		~~- int i;~~
104		-
105		~~- /* set up block marker plus its various right-shifted incarnations */~~
106		~~- for (i = 0; i< 8; i++) {~~
107		~~- marker[i] = malloc(sizeof(unsigned char)*7);~~
108		~~- }~~
109		~~- marker[0][0]= (unsigned char) 0x31;~~
110		~~- marker[0][1]= (unsigned char) 0x41;~~
111		~~- marker[0][2]= (unsigned char) 0x59;~~
112		~~- marker[0][3]= (unsigned char) 0x26;~~
113		~~- marker[0][4]= (unsigned char) 0x53;~~
114		~~- marker[0][5]= (unsigned char) 0x59;~~
115		~~- marker[0][6]= (unsigned char) 0x00;~~
116		~~- for (i = 1; i< 8; i++) {~~
117		~~- memcpy((char *)(marker[i]), (char *)(marker[i-1]),7);~~
118		~~- shiftbytesright(marker[i],7,1);~~
119		~~- }~~
120		~~- return(marker);~~
121		-}
122		-
123		~~-unsigned char ** init_footer() {~~
124		~~- unsigned char **footer = malloc(8*sizeof(unsigned char *));~~
125		~~- int i;~~
126		-
127		~~- /* set up footer plus its various right-shifted incarnations */~~
128		~~- /* dude why couldn't you have 0 padded each bzip2 block? seriously ... */~~
129		~~- for (i = 0; i< 8; i++) {~~
130		~~- footer[i] = malloc(sizeof(unsigned char)*7);~~
131		~~- }~~
132		~~- footer[0][0]= (unsigned char) 0x17;~~
133		~~- footer[0][1]= (unsigned char) 0x72;~~
134		~~- footer[0][2]= (unsigned char) 0x45;~~
135		~~- footer[0][3]= (unsigned char) 0x38;~~
136		~~- footer[0][4]= (unsigned char) 0x50;~~
137		~~- footer[0][5]= (unsigned char) 0x90;~~
138		~~- footer[0][6]= (unsigned char) 0x00;~~
139		~~- for (i = 1; i< 8; i++) {~~
140		~~- memcpy((char *)(footer[i]), (char *)(footer[i-1]),7);~~
141		~~- shiftbytesright(footer[i],7,1);~~
142		~~- }~~
143		~~- return(footer);~~
144		-}
145		-
146		-
147		~~-/* buff1 is some random bytes, buff2 is some random bytes which we expect to start with the contents of buff1,~~
148		~~- both buffers are bit-shifted to the right "bitsrightshifted". this function compares the two and returns 1 if buff2~~
149		~~- matches and 0 otherwise. */~~
150		~~-int bytescompare(unsigned char *buff1, unsigned char *buff2, int numbytes, int bitsrightshifted) {~~
151		~~- int i;~~
152		-
153		~~- if (bitsrightshifted == 0) {~~
154		~~- for (i = 0; i< numbytes; i++) {~~
155		~~- if (buff1[i] != buff2[i]) {~~
156		~~- return(1);~~
157		~~- }~~
158		~~- }~~
159		~~- return(0);~~
160		~~- }~~
161		~~- else {~~
162		~~- for (i = 1; i< numbytes-2; i++) {~~
163		~~- if (buff1[i] != buff2[i]) {~~
164		~~- return(1);~~
165		~~- }~~
166		~~- }~~
167		~~- /* do leftmost byte */~~
168		~~- if ((buff1[0] & bitmask(8-bitsrightshifted,RIGHT)) != (buff2[0] & bitmask(8-bitsrightshifted,RIGHT)) ) {~~
169		~~- return(1);~~
170		~~- }~~
171		~~- /* do rightmost byte */~~
172		~~- if ((buff1[numbytes-1] & bitmask(bitsrightshifted,LEFT)) != (buff2[numbytes-1] & bitmask(bitsrightshifted,LEFT)) ) {~~
173		~~- return(1);~~
174		~~- }~~
175		~~- return(0);~~
176		~~- }~~
177		-}
178		-
179		~~-/* return -1 if no match~~
180		~~- return number of bits rightshifted otherwise */~~
181		~~-int checkfileforfooter(int fin, unsigned char **footer) {~~
182		~~- unsigned char buffer[11];~~
183		~~- int result, i;~~
184		-
185		~~- read_footer(buffer,fin);~~
186		-
187		~~- result = bytescompare(footer[0],buffer+1,6,0);~~
188		~~- if (!result) {~~
189		~~- return(0);~~
190		~~- }~~
191		-
192		~~- for (i=1; i<8; i++) {~~
193		~~- result = bytescompare(footer[i],buffer,7,i);~~
194		~~- if (!result) {~~
195		~~- return(i);~~
196		~~- }~~
197		~~- }~~
198		~~- return(-1);~~
199		-}
200		-
201		~~-/* return -1 if no match~~
202		~~- return number of bits rightshifted otherwise */~~
203		~~-int checkbufferforblockmarker(unsigned char *buffer, unsigned char **marker) {~~
204		~~- int result, i;~~
205		-
206		~~- result = bytescompare(marker[0],buffer+1,6,0);~~
207		~~- if (!result) {~~
208		~~- return(0);~~
209		~~- }~~
210		~~- for (i=1; i<8; i++) {~~
211		~~- result = bytescompare(marker[i],buffer,7,i);~~
212		~~- if (!result) {~~
213		~~- return(i);~~
214		~~- }~~
215		~~- }~~
216		~~- return(-1);~~
217		-}
218		-
219		~~-void clearbuffer(unsigned char *buf, int length) {~~
220		~~- int i;~~
221		-
222		~~- for (i=0; i<length; i++) {~~
223		~~- buf[i]=0;~~
224		~~- }~~
225		~~- return;~~
226		-}
227		-
228		~~-int findnextmarker(int fin, int *start_at, int *position, unsigned char **marker, unsigned char *buffer ) {~~
229		~~- int bitsshifted = -1;~~
230		~~- int result;~~
231		-
232		~~- /* must be after 4 byte file header, and we add a leftmost byte to the buffer~~
233		~~- of data read in case some bits have been shifted into it */~~
234		~~- while (*position >= 3 && bitsshifted < 0) {~~
235		~~- bitsshifted = checkbufferforblockmarker(buffer, marker);~~
236		~~- if (bitsshifted < 0) {~~
237		~~- (*start_at)++;~~
238		- /*
239		~~- if (*start_at % 10000 == 0) {~~
240		~~- fprintf(stderr, "starting at %d, position %d\n", start_at, position);~~
241		~~- }~~
242		~~- */~~
243		~~- *position = lseek(fin, -1*(*start_at), SEEK_END);~~
244		~~- if (*position == -1) {~~
245		~~- fprintf(stderr,"lseek of file failed\n");~~
246		~~- exit(-1);~~
247		~~- }~~
248		~~- result = read(fin, buffer, 7);~~
249		~~- if (result == -1) {~~
250		~~- fprintf(stderr,"read of file failed\n");~~
251		~~- exit(-1);~~
252		~~- }~~
253		~~- }~~
254		~~- else {~~
255		~~- return(bitsshifted);~~
256		~~- }~~
257		~~- }~~
258		~~- return(bitsshifted);~~
259		-}
260		-
261		~~-int init_decompress(bzinfo *bfile) {~~
262		~~- int bz_verbosity = 0;~~
263		~~- int bz_small = 0;~~
264		~~- int ret;~~
265		-
266		~~- bfile->strm.bzalloc = NULL;~~
267		~~- bfile->strm.bzfree = NULL;~~
268		~~- bfile->strm.opaque = NULL;~~
269		-
270		~~- ret = BZ2_bzDecompressInit ( &(bfile->strm), bz_verbosity, bz_small );~~
271		~~- if (ret != BZ_OK) {~~
272		~~- fprintf(stderr,"uncompress failed, err %d\n", ret);~~
273		~~- exit(-1);~~
274		~~- }~~
275		~~- return(ret);~~
276		-}
277		-
278		~~-int decompress_header(int fin, bzinfo *bfile) {~~
279		~~- int bytesread, ret;~~
280		~~- unsigned char header[4];~~
281		-
282		~~- lseek(fin,0,SEEK_SET);~~
283		~~- bytesread = read(fin, header, 4);~~
284		~~- if (bytesread < 4) {~~
285		~~- fprintf(stderr,"failed to read 4 bytes of header, exiting\n");~~
286		~~- exit(-1);~~
287		~~- }~~
288		~~- bfile->strm.next_in = (char *)header;~~
289		~~- bfile->strm.avail_in = 4;~~
290		-
291		~~- bfile->strm.next_out = (char *)(bfile->bufout);~~
292		~~- bfile->strm.avail_out = bfile->bufsize;~~
293		~~- ret = BZ2_bzDecompress ( &(bfile->strm) );~~
294		~~- if (BZ_OK != ret && BZ_STREAM_END != ret) {~~
295		~~- fprintf(stderr,"Corrupt bzip2 header, exiting\n");~~
296		~~- exit(-1);~~
297		~~- }~~
298		~~- return(ret);~~
299		-}
300		-
301		~~-int setup_first_buffer(int fin, bzinfo *bfile) {~~
302		~~- int bytesread, eof=0;~~
303		-
304		~~- if (bfile->bitsshifted == 0) {~~
305		~~- lseek(fin,bfile->position+1,SEEK_SET);~~
306		~~- }~~
307		~~- else {~~
308		~~- lseek(fin,bfile->position,SEEK_SET);~~
309		~~- }~~
310		~~- bytesread = read(fin, bfile->bufin, bfile->bufsize);~~
311		~~- if (bytesread > 0) {~~
312		~~- bfile->overflow = bfile->bufin[bytesread-1];~~
313		~~- shiftbytesleft(bfile->bufin,bytesread,bfile->bitsshifted);~~
314		-
315		~~- bfile->strm.next_in = (char *)(bfile->bufin);~~
316		~~- bfile->strm.avail_in = bytesread-1;~~
317		-
318		~~- bfile->strm.next_out = (char *)(bfile->bufout);~~
319		~~- bfile->strm.avail_out = bfile->bufsize;~~
320		~~- }~~
321		~~- if (bytesread <=0) {~~
322		~~- eof++;~~
323		~~- }~~
324		~~- return(eof);~~
325		-}
326		-
327		~~-int do_last_byte(bzinfo *bfile) {~~
328		~~- int ret=BZ_OK;~~
329		~~- int written;~~
330		-
331		~~- if (bfile->strm.avail_in == 0) {~~
332		~~- bfile->strm.next_in = (char *)(bfile->bufin);~~
333		~~- bfile->bufin[0] = bfile->overflow;~~
334		~~- shiftbytesleft(bfile->bufin,1,bfile->bitsshifted);~~
335		~~- bfile->strm.avail_in = 1;~~
336		~~- bfile->strm.next_out = (char *)(bfile->bufout);~~
337		~~- bfile->strm.avail_out = bfile->bufsize;~~
338		~~- ret = BZ2_bzDecompress ( &(bfile->strm) );~~
339		~~- if (BZ_OK == ret || BZ_STREAM_END == ret) {~~
340		~~- written = fwrite(bfile->bufout, sizeof(unsigned char), (unsigned char *)bfile->strm.next_out - bfile->bufout, stdout);~~
341		~~- }~~
342		~~- }~~
343		~~- return(ret);~~
344		-}
345		-
346		~~-int read_next_buffer(int fin, bzinfo *bfile, int ret) {~~
347		~~- int bytesread, eof=0;~~
348		-
349		~~- /* fprintf(stderr," got return from decompress of %d\n", ret); */~~
350		-
351		~~- if (bfile->strm.avail_in == 0) {~~
352		~~- bfile->strm.next_in = (char *)(bfile->bufin);~~
353		~~- bfile->bufin[0] = bfile->overflow;~~
354		~~- bytesread = read(fin, bfile->bufin+1, bfile->bufsize-1);~~
355		~~- if (bytesread > 0) {~~
356		~~- bfile->overflow = bfile->bufin[bytesread];~~
357		~~- shiftbytesleft(bfile->bufin,bytesread+1,bfile->bitsshifted);~~
358		~~- bfile->strm.avail_in = bytesread;~~
359		~~- }~~
360		~~- else {~~
361		~~- eof++;~~
362		~~- bfile->strm.avail_in = 0;~~
363		~~- }~~
364		~~- }~~
365		~~- bfile->strm.next_out = (char *)(bfile->bufout);~~
366		~~- bfile->strm.avail_out = bfile->bufsize;~~
367		-
368		~~- return(eof);~~
369		-}
370		-
371		-
372		~~-int main(int argc, char **argv) {~~
373		-
374		~~- bzinfo bfile;~~
375		-
376		~~- int fin;~~
377		~~- int result, ret;~~
378		~~- unsigned char buffer[8];~~
379		-
380		~~- unsigned char **footer;~~
381		~~- unsigned char **marker;~~
382		-
383		~~- int written=0;~~
384		~~- int start_at;~~
385		-
386		~~- int eof = 0;~~
387		-
388		~~- if (argc != 2) {~~
389		~~- fprintf(stderr,"usage: %s infile\n", argv[0]);~~
390		~~- exit(-1);~~
391		~~- }~~
392		-
393		~~- marker = init_marker();~~
394		~~- footer = init_footer();~~
395		-
396		~~- fin = open (argv[1], O_RDONLY);~~
397		~~- if (fin < 0) {~~
398		~~- fprintf(stderr,"failed to open file %s for read\n", argv[1]);~~
399		~~- exit(-1);~~
400		~~- }~~
401		-
402		~~- bfile.bufsize = BUFSIZE;~~
403		-
404		~~- result = checkfileforfooter(fin, footer);~~
405		~~- if (result == -1) {~~
406		~~- start_at = 0;~~
407		~~- }~~
408		~~- else {~~
409		~~- start_at = 11; /* size of footer, perhaps with 1 byte extra */~~
410		~~- }~~
411		~~- start_at +=6; /* size of marker */~~
412		~~- bfile.position = lseek(fin, -1*start_at, SEEK_END);~~
413		~~- if (bfile.position == -1) {~~
414		~~- fprintf(stderr,"lseek of file failed\n");~~
415		~~- exit(-1);~~
416		~~- }~~
417		~~- result = read(fin, buffer, 7);~~
418		~~- if (result == -1) {~~
419		~~- fprintf(stderr,"read of file failed\n");~~
420		~~- exit(-1);~~
421		~~- }~~
422		-
423		~~- while (1) {~~
424		-
425		~~- bfile.bitsshifted = findnextmarker(fin, &start_at, &bfile.position, marker, buffer);~~
426		~~- if (bfile.bitsshifted >= 0) {~~
427		~~- /* fprintf(stderr, "found marker at pos %d and shifted %d, start_at is %d\n", bfile.position, bfile.bitsshifted, start_at); */~~
428		~~- ret = init_decompress(&bfile);~~
429		-
430		~~- /* pass in the header */~~
431		~~- ret = decompress_header(fin,&bfile);~~
432		-
433		~~- eof = setup_first_buffer(fin, &bfile);~~
434		-
435		~~- while (BZ_OK == ret && !eof) {~~
436		~~- ret = BZ2_bzDecompress ( &(bfile.strm) );~~
437		~~- if (BZ_OK == ret || BZ_STREAM_END == ret) {~~
438		~~- written += fwrite(bfile.bufout, sizeof(unsigned char), (unsigned char *)(bfile.strm.next_out) - bfile.bufout, stdout);~~
439		~~- }~~
440		~~- eof = read_next_buffer(fin, &bfile, ret);~~
441		~~- }~~
442		~~- if (BZ_OK == ret || BZ_STREAM_END == ret ) {~~
443		~~- /* so we read no bytes, process the last byte we held */~~
444		~~- do_last_byte(&bfile);~~
445		~~- }~~
446		~~- if (written == 0) {~~
447		~~- /* truncated block or other corruption, try going back one */~~
448		~~- start_at +=5;~~
449		~~- clearbuffer(buffer,sizeof(buffer));~~
450		~~- continue;~~
451		~~- }~~
452		~~- else {~~
453		~~- break;~~
454		~~- }~~
455		~~- }~~
456		~~- else {~~
457		~~- fprintf(stderr,"no block marker in this file.\n");~~
458		~~- exit(-1);~~
459		~~- }~~
460		~~- }~~
461		~~- close(fin);~~
462		~~- exit(0);~~
463		-}
464		-
Index: branches/ariel/xmldumps-backup/findpageidinbz2xml.h
—	—	@@ -1,81 +0,0 @@
2		~~-#ifndef _FINDPAGEID_H~~
3		~~-#define _FINDPAGEID_H~~
4		-
5		~~-typedef struct {~~
6		~~- int page_id; /* first id in the block */~~
7		~~- int bits_shifted; /* block is right shifted this many bits */~~
8		~~- int position; /* position in file of block */~~
9		~~-} page_info_t;~~
10		-
11		~~-#define BUFINSIZE 5000~~
12		-
13		-/*
14		~~- keeps all information about a bzipped file~~
15		~~- plus input/output buffers for decompression~~
16		~~-*/~~
17		~~-typedef struct {~~
18		~~- unsigned char bufin[BUFINSIZE]; /* compressed data read from file */~~
19		~~- unsigned char *bufout; /* uncompressed data, must be allocated by caller */~~
20		~~- unsigned char marker_buffer[7]; /* data to test for bz2 block marker */~~
21		~~- unsigned char header_buffer[4]; /* first 4 bytes of file (bzip2 header) */~~
22		-
23		~~- int bufin_size; /* size of input buffer for compressed data */~~
24		~~- int bufout_size; /* size of output buffer for decompressed data, may vary at each call */~~
25		-
26		~~- int initialized; /* whether bz2file has been initialized (header processed, seek to~~
27		~~- some bz2 block in the file and input buffer filled) */~~
28		~~- int block_start; /* position of bz2 block in file from which we started to read (we~~
29		~~- read a sequence of bz2 blocks from a given position, this is~~
30		~~- the offset to the first one) */~~
31		-
32		~~- bz_stream strm; /* stream structure for libbz2 */~~
33		~~- unsigned char overflow; /* since decompressed bytes may not be bit aligned, we keep the last byte~~
34		~~- read around so we can grab the lower end bits off the end for~~
35		~~- sticking in front of the next pile of compressed bytes we read */~~
36		-
37		~~- int bits_shifted; /* number of bits that the compressed data has been right shifted~~
38		~~- in the file (if the number is 0, the block marker and subsequent~~
39		~~- data is byte-aligned) */~~
40		~~- unsigned char **marker; /* bzip2 start of block marker, plus bit-shifted versions of it for~~
41		~~- locating the marker in a stream of compressed data */~~
42		-
43		~~- int position; /* current offset into file from start of file */~~
44		-
45		~~- int bytes_read; /* number of bytes of compressed data read from file (per read) */~~
46		~~- int bytes_written; /* number of bytes of decompressed data written into output buffer (per decompress) */~~
47		~~- int eof; /* nonzero if eof reached */~~
48		~~- int file_size; /* length of file, so we don't search past it for blocks */~~
49		~~-} bz_info_t;~~
50		-
51		~~-#define MASKLEFT 0~~
52		~~-#define MASKRIGHT 1~~
53		-
54		-/*
55		~~- this output buffer is used to collect decompressed output.~~
56		~~- this is not a circular buffer; when it is full the user is~~
57		~~- responsible for emptying it completely or partially and moving~~
58		~~- to the beginning any unused bytes.~~
59		-
60		~~-*/~~
61		~~-typedef struct {~~
62		~~- unsigned char *buffer; /* output storage, allocated by the caller */~~
63		~~- unsigned char *next_to_read; /* pointer to the next byte in the buffer with data to be read */~~
64		~~- unsigned char *next_to_fill; /* pointer to the next byte in the buffer which is empty and can receive data */~~
65		~~- int bytes_avail; /* number of bytes available for reading */~~
66		~~- unsigned char *end; /* points to byte after end of buffer */~~
67		~~-} buf_info_t;~~
68		-
69		-/*
70		~~- used for each iteration of narrowing down the location in a bzipped2 file of~~
71		~~- a desired pageid, by finding first compressed block after a guessed~~
72		~~- position and checking the first pageid (if any) contained in it.~~
73		~~-*/~~
74		~~-typedef struct {~~
75		~~- int left_end; /* left end of interval to search (bytes from start of file) */~~
76		~~- int right_end; /* right end of interval to search */~~
77		~~- int value_wanted; /* pageid desired */~~
78		~~- int last_value; /* pageid we found in last iteration */~~
79		~~- int last_position; /* position in file for last iteration */~~
80		~~-} iter_info_t;~~
81		-
82		~~-#endif~~
Index: branches/ariel/xmldumps-backup/mwbzutils/dumpbz2filefromoffset.c
—	—	@@ -0,0 +1,766 @@
	2	+#include <unistd.h>
	3	+#include <stdio.h>
	4	+#include <string.h>
	5	+#include <sys/types.h>
	6	+#include <sys/stat.h>
	7	+#include <fcntl.h>
	8	+#include <stdlib.h>
	9	+#include <errno.h>
	10	+#include <sys/types.h>
	11	+#include <regex.h>
	12	+#include "bzlib.h"
	13	+#include "findpageidinbz2xml.h"
	14	+
	15	+
	16	+/* return n ones either at left or right end */
	17	+int bit_mask(int numbits, int end) {
	18	+ if (end == MASKRIGHT) {
	19	+ return((1<<numbits)-1);
	20	+ }
	21	+ else {
	22	+ return(((1<<numbits)-1) << (8-numbits));
	23	+ }
	24	+}
	25	+
	26	+void shift_bytes_left(unsigned char *buffer, int buflen, int numbits) {
	27	+ int i;
	28	+
	29	+ if (numbits == 0) {
	30	+ return;
	31	+ }
	32	+
	33	+ for (i=0; i<buflen; i++) {
	34	+ /* left 1 */
	35	+ buffer[i] = (unsigned char) ((int) (buffer[i]) << numbits);
	36	+
	37	+ /* grab leftmost from next byte */
	38	+ if (i < buflen-1) {
	39	+ buffer[i] = ( unsigned char ) ( (unsigned int) buffer[i] | ( ( ((unsigned int) buffer[i+1]) & bit_mask(numbits,MASKLEFT) ) >> (8-numbits) ) );
	40	+ }
	41	+ }
	42	+}
	43	+
	44	+
	45	+void shift_bytes_right(unsigned char *buffer, int buflen, int numbits) {
	46	+ int i;
	47	+
	48	+ for (i=buflen-1; i>=0; i--) {
	49	+ /* right 1 */
	50	+ buffer[i] = (unsigned char) ((int) (buffer[i]) >> numbits);
	51	+
	52	+ /* grab rightmost from prev byte */
	53	+ if (i > 0) {
	54	+ buffer[i] = ( unsigned char ) ((unsigned int) buffer[i] | ( ((unsigned int) (buffer[i-1])<<(8-numbits)) & bit_mask(numbits,MASKLEFT)));
	55	+ }
	56	+ }
	57	+}
	58	+
	59	+unsigned char ** init_marker() {
	60	+ unsigned char **marker = malloc(8*sizeof(unsigned char *));
	61	+ int i;
	62	+
	63	+ /* set up block marker plus its various right-shifted incarnations */
	64	+ for (i = 0; i< 8; i++) {
	65	+ marker[i] = malloc(sizeof(unsigned char)*7);
	66	+ }
	67	+ marker[0][0]= (unsigned char) 0x31;
	68	+ marker[0][1]= (unsigned char) 0x41;
	69	+ marker[0][2]= (unsigned char) 0x59;
	70	+ marker[0][3]= (unsigned char) 0x26;
	71	+ marker[0][4]= (unsigned char) 0x53;
	72	+ marker[0][5]= (unsigned char) 0x59;
	73	+ marker[0][6]= (unsigned char) 0x00;
	74	+ for (i = 1; i< 8; i++) {
	75	+ memcpy((char *)(marker[i]), (char *)(marker[i-1]),7);
	76	+ shift_bytes_right(marker[i],7,1);
	77	+ }
	78	+ return(marker);
	79	+}
	80	+
	81	+/* buff1 is some random bytes, buff2 is some random bytes which we expect to start with the contents of buff1,
	82	+ both buffers are bit-shifted to the right "bitsrightshifted". this function compares the two and returns 1 if buff2
	83	+ matches and 0 otherwise. */
	84	+int bytes_compare(unsigned char *buff1, unsigned char *buff2, int numbytes, int bitsrightshifted) {
	85	+ int i;
	86	+
	87	+ if (bitsrightshifted == 0) {
	88	+ for (i = 0; i< numbytes; i++) {
	89	+ if (buff1[i] != buff2[i]) {
	90	+ return(1);
	91	+ }
	92	+ }
	93	+ return(0);
	94	+ }
	95	+ else {
	96	+ for (i = 1; i< numbytes-2; i++) {
	97	+ if (buff1[i] != buff2[i]) {
	98	+ return(1);
	99	+ }
	100	+ }
	101	+ /* do leftmost byte */
	102	+ if ((buff1[0] & bit_mask(8-bitsrightshifted,MASKRIGHT)) != (buff2[0] & bit_mask(8-bitsrightshifted,MASKRIGHT)) ) {
	103	+ return(1);
	104	+ }
	105	+ /* do rightmost byte */
	106	+ if ((buff1[numbytes-1] & bit_mask(bitsrightshifted,MASKLEFT)) != (buff2[numbytes-1] & bit_mask(bitsrightshifted,MASKLEFT)) ) {
	107	+ return(1);
	108	+ }
	109	+ return(0);
	110	+ }
	111	+}
	112	+
	113	+/* return -1 if no match
	114	+ return number of bits rightshifted otherwise */
	115	+int check_buffer_for_bz2_block_marker(bz_info_t *bfile) {
	116	+ int result, i;
	117	+
	118	+ result = bytes_compare(bfile->marker[0],bfile->marker_buffer+1,6,0);
	119	+ if (!result) {
	120	+ return(0);
	121	+ }
	122	+ for (i=1; i<8; i++) {
	123	+ result = bytes_compare(bfile->marker[i],bfile->marker_buffer,7,i);
	124	+ if (!result) {
	125	+ return(i);
	126	+ }
	127	+ }
	128	+ return(-1);
	129	+}
	130	+
	131	+/* return: 1 if found, 0 if not, -1 on error */
	132	+int find_next_bz2_block_marker(int fin, bz_info_t *bfile) {
	133	+ int result;
	134	+
	135	+ bfile->bits_shifted = -1;
	136	+ result = read(fin, bfile->marker_buffer, 7);
	137	+ if (result == -1) {
	138	+ fprintf(stderr,"read of file failed\n");
	139	+ exit(-1);
	140	+ }
	141	+ /* must be after 4 byte file header, and we add a leftmost byte to the buffer
	142	+ of data read in case some bits have been shifted into it */
	143	+ while (bfile->position <= bfile->file_size - 6 && bfile->bits_shifted < 0) {
	144	+ bfile->bits_shifted = check_buffer_for_bz2_block_marker(bfile);
	145	+ if (bfile->bits_shifted < 0) {
	146	+ bfile->position++;
	147	+ result = lseek(fin, (bfile->position), SEEK_SET);
	148	+ if (result == -1) {
	149	+ fprintf(stderr,"lseek of file to %ld failed (2)\n",(long int) bfile->position);
	150	+ exit(-1);
	151	+ }
	152	+ result = read(fin, bfile->marker_buffer, 7);
	153	+ if (result < 7) {
	154	+ /* fprintf(stderr,"read of file failed\n"); */
	155	+ exit(-1);
	156	+ }
	157	+ }
	158	+ else {
	159	+ bfile->block_start = bfile->position;
	160	+ return(1);
	161	+ }
	162	+ }
	163	+ return(0);
	164	+}
	165	+
	166	+/*
	167	+ initializes the bz2 strm structure,
	168	+ calls the BZ2 decompression library initializer
	169	+
	170	+ returns:
	171	+ BZ_OK on success
	172	+ various BZ_ errors on failure (see bzlib.h)
	173	+*/
	174	+int init_decompress(bz_info_t *bfile) {
	175	+ int bz_verbosity = 0;
	176	+ int bz_small = 0;
	177	+ int ret;
	178	+
	179	+ bfile->strm.bzalloc = NULL;
	180	+ bfile->strm.bzfree = NULL;
	181	+ bfile->strm.opaque = NULL;
	182	+
	183	+ ret = BZ2_bzDecompressInit ( &(bfile->strm), bz_verbosity, bz_small );
	184	+ if (ret != BZ_OK) {
	185	+ fprintf(stderr,"uncompress failed, err %d\n", ret);
	186	+ exit(-1);
	187	+ }
	188	+ return(ret);
	189	+}
	190	+
	191	+/*
	192	+ reads the first 4 bytes from a bz2 file (should be
	193	+ "BZh" followed by the block size indicator, typically "9")
	194	+ and passes them into the BZ2 decompression library.
	195	+ This must be done before decompression of any block of the
	196	+ file is attempted.
	197	+
	198	+ returns:
	199	+ BZ_OK if successful,
	200	+ various BZ_ errors on failure (see bzlib.h)
	201	+*/
	202	+int decompress_header(int fin, bz_info_t *bfile) {
	203	+ int ret, res;
	204	+
	205	+ res = lseek(fin,0,SEEK_SET);
	206	+ if (res == -1) {
	207	+ fprintf(stderr,"lseek of file to 0 failed (3)\n");
	208	+ }
	209	+ bfile->bytes_read = read(fin, bfile->header_buffer, 4);
	210	+ if (bfile->bytes_read < 4) {
	211	+ fprintf(stderr,"failed to read 4 bytes of header, exiting\n");
	212	+ exit(-1);
	213	+ }
	214	+ bfile->strm.next_in = (char *)bfile->header_buffer;
	215	+ bfile->strm.avail_in = 4;
	216	+
	217	+ ret = BZ2_bzDecompress ( &(bfile->strm) );
	218	+ if (BZ_OK != ret && BZ_STREAM_END != ret) {
	219	+ fprintf(stderr,"Corrupt bzip2 header, exiting\n");
	220	+ exit(-1);
	221	+ }
	222	+ return(ret);
	223	+}
	224	+
	225	+/*
	226	+ seek to appropriate offset as specified in bfile,
	227	+ read compressed data into buffer indicated by bfile,
	228	+ update the bfile structure accordingly,
	229	+ save the overflow byte (bit-shifted data = suck)
	230	+ this is for the first buffer of data in a stream,
	231	+ for subsequent buffers use fill_buffer_to_decompress()
	232	+
	233	+ this will set bfile->eof on eof. no other indicator
	234	+ will be provided.
	235	+
	236	+ returns:
	237	+ 0 on success
	238	+ -1 on error
	239	+*/
	240	+int setup_first_buffer_to_decompress(int fin, bz_info_t *bfile) {
	241	+ int res;
	242	+
	243	+ if (bfile->bits_shifted == 0) {
	244	+ res = lseek(fin,bfile->position+1,SEEK_SET);
	245	+ if (res == -1) {
	246	+ fprintf(stderr,"lseek of file to %ld failed (4)\n",(long int) bfile->position+1);
	247	+ return(-1);
	248	+ }
	249	+ }
	250	+ else {
	251	+ res = lseek(fin,bfile->position,SEEK_SET);
	252	+ if (res == -1) {
	253	+ fprintf(stderr,"lseek of file to %ld failed (5)\n",(long int) bfile->position);
	254	+ return(-1);
	255	+ }
	256	+ }
	257	+ bfile->bytes_read = read(fin, bfile->bufin, bfile->bufin_size);
	258	+ if (bfile->bytes_read > 0) {
	259	+ bfile->overflow = bfile->bufin[bfile->bytes_read-1];
	260	+ shift_bytes_left(bfile->bufin, bfile->bytes_read, bfile->bits_shifted);
	261	+
	262	+ bfile->strm.next_in = (char *)(bfile->bufin);
	263	+ bfile->strm.avail_in = bfile->bytes_read-1;
	264	+ }
	265	+ if (bfile->bytes_read <=0) {
	266	+ bfile->eof++;
	267	+ }
	268	+ return(0);
	269	+}
	270	+
	271	+/*
	272	+ read compressed data into buffer indicated by bfile,
	273	+ from current position of file,
	274	+ stuffing the overflow byte in first.
	275	+ update the bfile structure accordingly
	276	+ save the new overflow byte (bit-shifted data = suck)
	277	+ this function is for decompression of buffers *after
	278	+ the first one*. for the first one use
	279	+ setup_first_buffer_to_decompress()
	280	+
	281	+ this will set bfile->eof on eof. no other indicator
	282	+ will be provided.
	283	+
	284	+ returns:
	285	+ 0 on success
	286	+ hmm, it really does not do anything about errors :-D
	287	+*/
	288	+int fill_buffer_to_decompress(int fin, bz_info_t *bfile, int ret) {
	289	+ if (bfile->strm.avail_in == 0) {
	290	+ bfile->strm.next_in = (char *)(bfile->bufin);
	291	+ bfile->bufin[0] = bfile->overflow;
	292	+ bfile->bytes_read = read(fin, bfile->bufin+1, bfile->bufin_size-1);
	293	+ if (bfile->bytes_read > 0) {
	294	+ bfile->position+=bfile->bytes_read;
	295	+ bfile->overflow = bfile->bufin[bfile->bytes_read];
	296	+ shift_bytes_left(bfile->bufin,bfile->bytes_read+1,bfile->bits_shifted);
	297	+ bfile->strm.avail_in = bfile->bytes_read;
	298	+ }
	299	+ else {
	300	+ bfile->strm.avail_in = 1; /* the overflow byte */
	301	+ bfile->eof++;
	302	+ }
	303	+ }
	304	+ return(0);
	305	+}
	306	+
	307	+/* size of buffer is bytes usable. there will be a null byte at the end
	308	+
	309	+ what we do with the buffer:
	310	+ - read from front of buffer to end,
	311	+ - fill from point where prev read did not fill buffer, or from where
	312	+ move of data at end of buffer to beginning left room,
	313	+ - mark a string of bytes (starting from what's available to read) as "read"
	314	+
	315	+*/
	316	+buf_info_t *init_buffer(int size) {
	317	+ buf_info_t *b;
	318	+
	319	+ b = (buf_info_t *)malloc(sizeof(buf_info_t));
	320	+ b->buffer = malloc(sizeof(unsigned char)*(size+1));
	321	+ b->buffer[size]='\0';
	322	+ b->end = b->buffer + size;
	323	+ b->next_to_read = b->end; /* nothing available */
	324	+ b->bytes_avail = 0; /* bytes to read, nothing available */
	325	+ b->next_to_fill = b->buffer; /* empty */
	326	+ b->next_to_fill[0] = '\0';
	327	+ return(b);
	328	+}
	329	+
	330	+/* check if buffer (used for decompressed data output) is empty,
	331	+ returns 1 if so and 0 if not */
	332	+int buffer_is_empty(buf_info_t *b) {
	333	+ if (b->bytes_avail == 0) {
	334	+ return(1);
	335	+ }
	336	+ else {
	337	+ return(0);
	338	+ }
	339	+}
	340	+
	341	+/* check if buffer (used for decompressed data output) is full,
	342	+
	343	+ returns 1 if so and 0 if not
	344	+ I'm not liking this function so well, fixme */
	345	+int buffer_is_full(buf_info_t *b) {
	346	+ if (b->next_to_fill == b->end) {
	347	+ return(1);
	348	+ }
	349	+ else {
	350	+ return(0);
	351	+ }
	352	+}
	353	+
	354	+/* FIXME do this right. whatever. */
	355	+int get_file_size(int fin) {
	356	+ int res;
	357	+
	358	+ res = lseek(fin, 0, SEEK_END);
	359	+ if (res == -1) {
	360	+ fprintf(stderr,"lseek of file to 0 failed (6)\n");
	361	+ exit(-1);
	362	+ }
	363	+ return(res);
	364	+}
	365	+
	366	+
	367	+/*
	368	+ set up the marker, seek to right place, get first
	369	+ buffer of compressed data for processing
	370	+ bfile->position must be set to desired offset first by caller.
	371	+ returns:
	372	+ -1 if no marker or other error, position of next read if ok
	373	+*/
	374	+int init_bz2_file(bz_info_t *bfile, int fin) {
	375	+ int res;
	376	+
	377	+ bfile->bufin_size = BUFINSIZE;
	378	+ bfile->marker = init_marker();
	379	+ bfile->bytes_read = 0;
	380	+ bfile->bytes_written = 0;
	381	+ bfile->eof = 0;
	382	+
	383	+ bfile->initialized++;
	384	+
	385	+ bfile->file_size = get_file_size(fin);
	386	+ if (bfile->position > bfile->file_size) {
	387	+ fprintf(stderr,"asked for position past end of file\n");
	388	+ exit(-1);
	389	+ }
	390	+ res = lseek(fin, bfile->position, SEEK_SET);
	391	+ if (res == -1) {
	392	+ fprintf(stderr,"lseek of file to %ld failed (7)\n",(long int) bfile->position);
	393	+ exit(-1);
	394	+ }
	395	+
	396	+ find_next_bz2_block_marker(fin, bfile);
	397	+ if (bfile->bits_shifted >= 0) {
	398	+ /* fprintf(stderr,"marker bits shifted by is %d\n",bfile->bits_shifted); */
	399	+ init_decompress(bfile);
	400	+ decompress_header(fin, bfile);
	401	+ setup_first_buffer_to_decompress(fin, bfile);
	402	+ return(0);
	403	+ }
	404	+ return(-1);
	405	+}
	406	+
	407	+/* get the next buffer of uncompressed stuff */
	408	+int decompress_data(bz_info_t bfile, int fin, unsigned char bufferout, int bufout_size) {
	409	+ int ret;
	410	+
	411	+ bfile->bufout = bufferout;
	412	+ bfile->bufout_size = bufout_size;
	413	+ bfile->bytes_written = 0;
	414	+
	415	+ if (! bfile->initialized) {
	416	+ if (init_bz2_file(bfile, fin) == -1) {
	417	+ fprintf(stderr,"failed to initialize bz2file\n");
	418	+ return(-1);
	419	+ };
	420	+ bfile->strm.next_out = (char *)bfile->bufout;
	421	+ bfile->strm.avail_out = bfile->bufout_size;
	422	+ }
	423	+
	424	+ ret = BZ_OK;
	425	+ while (BZ_OK == ret && bfile->bytes_written == 0) {
	426	+ ret = BZ2_bzDecompress ( &(bfile->strm) );
	427	+ if (BZ_OK == ret \|\| BZ_STREAM_END == ret) {
	428	+ bfile->bytes_written = (unsigned char *)(bfile->strm.next_out) - bfile->bufout;
	429	+ }
	430	+ else {
	431	+ fprintf(stderr,"error from BZ decompress %d\n",ret);
	432	+ return(-1);
	433	+ }
	434	+ fill_buffer_to_decompress(fin, bfile, ret);
	435	+ /*
	436	+ if (bfile->eof && (BZ_OK == ret \|\| BZ_STREAM_END == ret) ) {
	437	+ fprintf(stderr,"eof reached\n");
	438	+ }
	439	+ */
	440	+ }
	441	+ return(0);
	442	+}
	443	+
	444	+/*
	445	+ fill output buffer in b with uncompressed data from bfile
	446	+ if this is the first call to the function for this file,
	447	+ the file header will be read, and the first buffer of
	448	+ uncompressed data will be prepared. bfile->position
	449	+ should be set to the offset (from the beginning of file) from
	450	+ which to find the first bz2 block.
	451	+
	452	+ returns:
	453	+ on success, number of bytes read (may be 0)
	454	+ -1 on error
	455	+*/
	456	+int get_buffer_of_uncompressed_data(buf_info_t b, int fin, bz_info_t bfile) {
	457	+ int res;
	458	+
	459	+ if (buffer_is_full(b)) {
	460	+ fprintf(stdout,"DEBUG buffer full\n");
	461	+ return(0);
	462	+ }
	463	+
	464	+ if (buffer_is_empty(b)) {
	465	+ b->next_to_fill = b->buffer;
	466	+ }
	467	+
	468	+ res = decompress_data(bfile, fin, b->next_to_fill, b->end - b->next_to_fill);
	469	+ if (res <0 ) {
	470	+ return(res);
	471	+ }
	472	+ if (bfile->bytes_written < 0) {
	473	+ fprintf(stderr,"read of file failed\n");
	474	+ return(-1);
	475	+ }
	476	+ else {
	477	+ /* really?? FIXME check this */
	478	+ if (buffer_is_empty(b)) {
	479	+ b->next_to_read = b->next_to_fill; /* where we just read */
	480	+ }
	481	+ b->bytes_avail += bfile->bytes_written;
	482	+ b->next_to_fill += bfile->bytes_written;
	483	+ b->next_to_fill[0] = '\0';
	484	+ return(0);
	485	+ }
	486	+}
	487	+
	488	+void dumpbuf_info_t(buf_info_t *b) {
	489	+ fprintf(stdout, "\n");
	490	+ fprintf(stdout, "b->buffer: %ld\n", (long int) b->buffer);
	491	+ fprintf(stdout, "b->end: %ld\n", (long int) b->end);
	492	+ fprintf(stdout, "b->next_to_read: %ld\n", (long int) b->next_to_read);
	493	+ fprintf(stdout, "b->next_to_fill: %ld\n", (long int) b->next_to_fill);
	494	+ fprintf(stdout, "b->bytes_avail: %ld\n", (long int) b->bytes_avail);
	495	+}
	496	+
	497	+/*
	498	+ copy text from end of buffer to the beginning, that we want to keep
	499	+ around for further processing (i.e. further regex matches)
	500	+ returns number of bytes copied
	501	+*/
	502	+int move_bytes_to_buffer_start(buf_info_t b, unsigned char fromwhere, int maxbytes) {
	503	+ int i, tocopy;
	504	+
	505	+ if (fromwhere >= b->end) {
	506	+ return(0);
	507	+ }
	508	+ else {
	509	+ tocopy = b->end - fromwhere;
	510	+ if (maxbytes && (tocopy > maxbytes)) {
	511	+ tocopy = maxbytes;
	512	+ }
	513	+ for (i = 0; i < tocopy; i++) {
	514	+ b->buffer[i] = fromwhere[i];
	515	+ }
	516	+ b->next_to_fill = b->buffer + tocopy;
	517	+ b->next_to_fill[0] = '\0';
	518	+ b->next_to_read = b->buffer;
	519	+ b->bytes_avail = tocopy;
	520	+ return(tocopy);
	521	+ }
	522	+}
	523	+
	524	+/*
	525	+ dump the <meadiawiki> header (up through
	526	+ </siteinfo> close tag) found at the
	527	+ beginning of xml dump files.
	528	+ returns:
	529	+ 0 on success,
	530	+ -1 on error
	531	+*/
	532	+int dump_mw_header(int fin) {
	533	+ int res;
	534	+ regmatch_t *match_siteinfo;
	535	+ regex_t compiled_siteinfo;
	536	+ int length=5000; /* output buffer size */
	537	+ char *siteinfo = " </siteinfo>\n";
	538	+
	539	+ buf_info_t *b;
	540	+ bz_info_t bfile;
	541	+
	542	+ int firstpage = 1;
	543	+ int done = 0;
	544	+ bfile.initialized = 0;
	545	+
	546	+ res = regcomp(&compiled_siteinfo, siteinfo, REG_EXTENDED);
	547	+
	548	+ match_siteinfo = (regmatch_t )malloc(sizeof(regmatch_t)1);
	549	+
	550	+ b = init_buffer(length);
	551	+ bfile.bytes_read = 0;
	552	+ bfile.position = 0;
	553	+
	554	+ while ((get_buffer_of_uncompressed_data(b, fin, &bfile)>=0) && (! bfile.eof) && (!done)) {
	555	+ /* fixme either we don't check the return code right or we don't notice no bytes read or we don't clear the bytes read */
	556	+ if (bfile.bytes_read) {
	557	+ if (firstpage) {
	558	+ if (bfile.bytes_read >= 11 && !memcmp((char *)b->next_to_read,"<mediawiki ",11)) {
	559	+ /* good, write it and loop and not firstpage any more */
	560	+ if (b->bytes_avail) {
	561	+ if (regexec(&compiled_siteinfo, (char *)b->next_to_read, 2, match_siteinfo, 0 ) == 0) {
	562	+ fwrite(b->next_to_read,match_siteinfo[0].rm_eo, 1, stdout);
	563	+ b->next_to_read = b->end;
	564	+ b->bytes_avail = 0;
	565	+ b->next_to_fill = b->buffer; /* empty */
	566	+ bfile.strm.next_out = (char *)b->next_to_fill;
	567	+ bfile.strm.avail_out = b->end - b->next_to_fill;
	568	+ done++;
	569	+ }
	570	+ else {
	571	+ fwrite(b->next_to_read,b->bytes_avail,1,stdout);
	572	+ b->next_to_read = b->end;
	573	+ b->bytes_avail = 0;
	574	+ b->next_to_fill = b->buffer; /* empty */
	575	+ bfile.strm.next_out = (char *)b->next_to_fill;
	576	+ bfile.strm.avail_out = b->end - b->next_to_fill;
	577	+ }
	578	+ }
	579	+ }
	580	+ else {
	581	+ fprintf(stderr,"missing mediawiki header from bz2 xml file\n");
	582	+ return(-1);
	583	+ }
	584	+ firstpage = 0;
	585	+ }
	586	+ else { /* not firstpage */
	587	+ if (regexec(&compiled_siteinfo, (char *)b->next_to_read, 2, match_siteinfo, 0 ) == 0) {
	588	+ fwrite(b->next_to_read,match_siteinfo[0].rm_eo, 1, stdout);
	589	+ b->next_to_read = b->end;
	590	+ b->bytes_avail = 0;
	591	+ b->next_to_fill = b->buffer; /* empty */
	592	+ bfile.strm.next_out = (char *)b->next_to_fill;
	593	+ bfile.strm.avail_out = b->end - b->next_to_fill;
	594	+ done++;
	595	+ }
	596	+ else {
	597	+ /* could have the first part of the siteinfo tag... so copy up enough bytes to cover that case */
	598	+ if (b->bytes_avail> 12) {
	599	+ /* write everything that didn't match, but leave 12 bytes, to stdout */
	600	+ fwrite(b->next_to_read,b->bytes_avail - 12,1,stdout);
	601	+ move_bytes_to_buffer_start(b, b->next_to_read + b->bytes_avail - 12, 12);
	602	+ bfile.strm.next_out = (char *)b->next_to_fill;
	603	+ bfile.strm.avail_out = b->end - b->next_to_fill;
	604	+ }
	605	+ else {
	606	+ if (buffer_is_empty(b)) {
	607	+ bfile.strm.next_out = (char *)b->buffer;
	608	+ bfile.strm.avail_out = bfile.bufout_size;
	609	+ b->next_to_fill = b->buffer; /* empty */
	610	+ }
	611	+ else {
	612	+ /* there were only 12 or less bytes so just save em don't write em to stdout */
	613	+ move_bytes_to_buffer_start(b, b->next_to_read, b->bytes_avail);
	614	+ bfile.strm.next_out = (char *)b->next_to_fill;
	615	+ bfile.strm.avail_out = b->end - b->next_to_fill;
	616	+ }
	617	+ }
	618	+ }
	619	+ } /* end notfirstpage */
	620	+ }
	621	+ }
	622	+ if (!done) {
	623	+ fprintf(stderr,"incomplete or no mediawiki header found\n");
	624	+ return(-1);
	625	+ }
	626	+ else {
	627	+ return(0);
	628	+ }
	629	+}
	630	+
	631	+/*
	632	+ find the first page id after position in file
	633	+ decompress and dump to stdout from that point on
	634	+ returns:
	635	+ 0 on success,
	636	+ -1 on error
	637	+*/
	638	+int dump_from_first_page_id_after_offset(int fin, int position) {
	639	+ int res;
	640	+ regmatch_t *match_page;
	641	+ regex_t compiled_page;
	642	+ int length=5000; /* output buffer size */
	643	+ char *page = " <page>";
	644	+
	645	+ buf_info_t *b;
	646	+ bz_info_t bfile;
	647	+
	648	+ int firstpage = 1;
	649	+
	650	+ bfile.initialized = 0;
	651	+
	652	+ res = regcomp(&compiled_page, page, REG_EXTENDED);
	653	+
	654	+ match_page = (regmatch_t )malloc(sizeof(regmatch_t)1);
	655	+
	656	+ b = init_buffer(length);
	657	+ bfile.bytes_read = 0;
	658	+ bfile.position = position;
	659	+
	660	+ while ((get_buffer_of_uncompressed_data(b, fin, &bfile)>=0) && (! bfile.eof)) {
	661	+ /* fixme either we don't check the return code right or we don't notice no bytes read or we don't clear the bytes read */
	662	+ if (bfile.bytes_read) {
	663	+ if (firstpage) {
	664	+ if (regexec(&compiled_page, (char *)b->next_to_read, 2, match_page, 0 ) == 0) {
	665	+ fwrite(b->next_to_read+match_page[0].rm_so,b->next_to_fill - (b->next_to_read+match_page[0].rm_so), 1, stdout);
	666	+ b->next_to_read = b->end;
	667	+ b->bytes_avail = 0;
	668	+ b->next_to_fill = b->buffer; /* empty */
	669	+ bfile.strm.next_out = (char *)b->next_to_fill;
	670	+ bfile.strm.avail_out = b->end - b->next_to_fill;
	671	+ firstpage = 0;
	672	+ }
	673	+ else {
	674	+ /* could have the first part of the page tag... so copy up enough bytes to cover that case */
	675	+ if (b->bytes_avail> 7) {
	676	+ /* write everything that didn't match, but leave 7 bytes, to stdout */
	677	+ fwrite(b->next_to_read,b->bytes_avail - 7,1,stdout);
	678	+ move_bytes_to_buffer_start(b, b->next_to_read + b->bytes_avail - 7, 7);
	679	+ bfile.strm.next_out = (char *)b->next_to_fill;
	680	+ bfile.strm.avail_out = b->end - b->next_to_fill;
	681	+ }
	682	+ else {
	683	+ if (buffer_is_empty(b)) {
	684	+ bfile.strm.next_out = (char *)b->buffer;
	685	+ bfile.strm.avail_out = bfile.bufout_size;
	686	+ b->next_to_fill = b->buffer; /* empty */
	687	+ }
	688	+ else {
	689	+ /* there were only 7 or less bytes so just save em don't write em to stdout */
	690	+ move_bytes_to_buffer_start(b, b->next_to_read, b->bytes_avail);
	691	+ bfile.strm.next_out = (char *)b->next_to_fill;
	692	+ bfile.strm.avail_out = b->end - b->next_to_fill;
	693	+ }
	694	+ }
	695	+ }
	696	+ }
	697	+ else {
	698	+ if (b->bytes_avail) {
	699	+ fwrite(b->next_to_read,b->bytes_avail,1,stdout);
	700	+ b->next_to_read = b->end;
	701	+ b->bytes_avail = 0;
	702	+ b->next_to_fill = b->buffer; /* empty */
	703	+ bfile.strm.next_out = (char *)b->next_to_fill;
	704	+ bfile.strm.avail_out = b->end - b->next_to_fill;
	705	+ }
	706	+ }
	707	+ }
	708	+ }
	709	+ if (b->bytes_avail) {
	710	+ fwrite(b->next_to_read,b->bytes_avail,1,stdout);
	711	+ b->next_to_read = b->end;
	712	+ b->bytes_avail = 0;
	713	+ b->next_to_fill = b->buffer; /* empty */
	714	+ bfile.strm.next_out = (char *)b->next_to_fill;
	715	+ bfile.strm.avail_out = b->end - b->next_to_fill;
	716	+ }
	717	+ return(0);
	718	+}
	719	+
	720	+/*
	721	+ find the first bz2 block after the specified offset,
	722	+ uncompress from that point on, write out the
	723	+ contents starting with the first <page> tag,
	724	+ prefacing first with the <mediawiki> header from
	725	+ the beginning of the file, up through </siteinfo>.
	726	+
	727	+ note that we may lose some bytes from the very last
	728	+ block if the blocks are bit shifted, because the
	729	+ bzip crc at end of file will be wrong. (needs testing to
	730	+ find a workaround, simply not feeding in the crc doesn't
	731	+ suffice)
	732	+
	733	+ for purposes of the XML dumps this is fine, since we use
	734	+ this tool to generate prefetch data starting from
	735	+ a given pageid, rather than needing to uncompress
	736	+ gigabytes of data to get to the point in the file
	737	+ we want.
	738	+
	739	+ returns:
	740	+ BZ_OK on success, various BZ_ errors otherwise.
	741	+*/
	742	+int main(int argc, char **argv) {
	743	+ int fin, position, res;
	744	+
	745	+ if (argc != 3) {
	746	+ fprintf(stderr,"usage: %s infile position\n", argv[0]);
	747	+ exit(-1);
	748	+ }
	749	+
	750	+ fin = open (argv[1], O_RDONLY);
	751	+ if (fin < 0) {
	752	+ fprintf(stderr,"failed to open file %s for read\n", argv[1]);
	753	+ exit(-1);
	754	+ }
	755	+
	756	+ position = atoi(argv[2]);
	757	+ if (position <0) {
	758	+ fprintf(stderr,"please specify a position >= 0.\n");
	759	+ fprintf(stderr,"usage: %s infile position\n", argv[0]);
	760	+ exit(-1);
	761	+ }
	762	+ /* input file, starting position in file, length of buffer for reading */
	763	+ res = dump_mw_header(fin);
	764	+
	765	+ res = dump_from_first_page_id_after_offset(fin, position);
	766	+ exit(res);
	767	+}
Property changes on: branches/ariel/xmldumps-backup/mwbzutils/dumpbz2filefromoffset.c
___________________________________________________________________
Added: svn:eol-style
1	768	+ native
Index: branches/ariel/xmldumps-backup/mwbzutils/findpageidinbz2xml.c
—	—	@@ -0,0 +1,842 @@
	2	+#include <unistd.h>
	3	+#include <stdio.h>
	4	+#include <string.h>
	5	+#include <sys/types.h>
	6	+#include <sys/stat.h>
	7	+#include <fcntl.h>
	8	+#include <stdlib.h>
	9	+#include <errno.h>
	10	+#include <sys/types.h>
	11	+#include <regex.h>
	12	+#include "bzlib.h"
	13	+#include "findpageidinbz2xml.h"
	14	+
	15	+/* return n ones either at left or right end */
	16	+int bitmask(int numbits, int end) {
	17	+ if (end == MASKRIGHT) {
	18	+ return((1<<numbits)-1);
	19	+ }
	20	+ else {
	21	+ return(((1<<numbits)-1) << (8-numbits));
	22	+ }
	23	+}
	24	+
	25	+void shift_bytes_left(unsigned char *buffer, int buflen, int numbits) {
	26	+ int i;
	27	+
	28	+ if (numbits == 0) {
	29	+ return;
	30	+ }
	31	+
	32	+ for (i=0; i<buflen; i++) {
	33	+ /* left 1 */
	34	+ buffer[i] = (unsigned char) ((int) (buffer[i]) << numbits);
	35	+
	36	+ /* grab leftmost from next byte */
	37	+ if (i < buflen-1) {
	38	+ buffer[i] = ( unsigned char ) ( (unsigned int) buffer[i] \| ( ( ((unsigned int) buffer[i+1]) & bitmask(numbits,MASKLEFT) ) >> (8-numbits) ) );
	39	+ }
	40	+ }
	41	+}
	42	+
	43	+void shift_bytes_right(unsigned char *buffer, int buflen, int numbits) {
	44	+ int i;
	45	+
	46	+ for (i=buflen-1; i>=0; i--) {
	47	+ /* right 1 */
	48	+ buffer[i] = (unsigned char) ((int) (buffer[i]) >> numbits);
	49	+
	50	+ /* grab rightmost from prev byte */
	51	+ if (i > 0) {
	52	+ buffer[i] = ( unsigned char ) ((unsigned int) buffer[i] \| ( ((unsigned int) (buffer[i-1])<<(8-numbits)) & bitmask(numbits,MASKLEFT)));
	53	+ }
	54	+ }
	55	+}
	56	+
	57	+unsigned char ** init_marker() {
	58	+ unsigned char *marker = malloc(8sizeof(unsigned char *));
	59	+ int i;
	60	+
	61	+ /* set up block marker plus its various right-shifted incarnations */
	62	+ for (i = 0; i< 8; i++) {
	63	+ marker[i] = malloc(sizeof(unsigned char)*7);
	64	+ }
	65	+ marker[0][0]= (unsigned char) 0x31;
	66	+ marker[0][1]= (unsigned char) 0x41;
	67	+ marker[0][2]= (unsigned char) 0x59;
	68	+ marker[0][3]= (unsigned char) 0x26;
	69	+ marker[0][4]= (unsigned char) 0x53;
	70	+ marker[0][5]= (unsigned char) 0x59;
	71	+ marker[0][6]= (unsigned char) 0x00;
	72	+ for (i = 1; i< 8; i++) {
	73	+ memcpy((char )(marker[i]), (char )(marker[i-1]),7);
	74	+ shift_bytes_right(marker[i],7,1);
	75	+ }
	76	+ return(marker);
	77	+}
	78	+
	79	+/* buff1 is some random bytes, buff2 is some random bytes which we expect to start with the contents of buff1,
	80	+ both buffers are bit-shifted to the right "bitsrightshifted". this function compares the two and returns 1 if buff2
	81	+ matches and 0 otherwise. */
	82	+int bytes_compare(unsigned char buff1, unsigned char buff2, int numbytes, int bitsrightshifted) {
	83	+ int i;
	84	+
	85	+ if (bitsrightshifted == 0) {
	86	+ for (i = 0; i< numbytes; i++) {
	87	+ if (buff1[i] != buff2[i]) {
	88	+ return(1);
	89	+ }
	90	+ }
	91	+ return(0);
	92	+ }
	93	+ else {
	94	+ for (i = 1; i< numbytes-2; i++) {
	95	+ if (buff1[i] != buff2[i]) {
	96	+ return(1);
	97	+ }
	98	+ }
	99	+ /* do leftmost byte */
	100	+ if ((buff1[0] & bitmask(8-bitsrightshifted,MASKRIGHT)) != (buff2[0] & bitmask(8-bitsrightshifted,MASKRIGHT)) ) {
	101	+ return(1);
	102	+ }
	103	+ /* do rightmost byte */
	104	+ if ((buff1[numbytes-1] & bitmask(bitsrightshifted,MASKLEFT)) != (buff2[numbytes-1] & bitmask(bitsrightshifted,MASKLEFT)) ) {
	105	+ return(1);
	106	+ }
	107	+ return(0);
	108	+ }
	109	+}
	110	+
	111	+
	112	+/* return -1 if no match
	113	+ return number of bits rightshifted otherwise */
	114	+int check_buffer_for_bz2_block_marker(bz_info_t *bfile) {
	115	+ int result, i;
	116	+
	117	+ result = bytes_compare(bfile->marker[0],bfile->marker_buffer+1,6,0);
	118	+ if (!result) {
	119	+ return(0);
	120	+ }
	121	+ for (i=1; i<8; i++) {
	122	+ result = bytes_compare(bfile->marker[i],bfile->marker_buffer,7,i);
	123	+ if (!result) {
	124	+ return(i);
	125	+ }
	126	+ }
	127	+ return(-1);
	128	+}
	129	+
	130	+
	131	+/* return: 1 if found, 0 if not, -1 on error */
	132	+int find_next_bz2_block_marker(int fin, bz_info_t *bfile) {
	133	+ int result;
	134	+
	135	+ bfile->bits_shifted = -1;
	136	+ result = read(fin, bfile->marker_buffer, 7);
	137	+ if (result == -1) {
	138	+ /* fprintf(stderr,"read of file failed\n"); */
	139	+ return(-1);
	140	+ }
	141	+ /* must be after 4 byte file header, and we add a leftmost byte to the buffer
	142	+ of data read in case some bits have been shifted into it */
	143	+ while (bfile->position <= bfile->file_size - 6 && bfile->bits_shifted < 0) {
	144	+ bfile->bits_shifted = check_buffer_for_bz2_block_marker(bfile);
	145	+ if (bfile->bits_shifted < 0) {
	146	+ bfile->position++;
	147	+ result = lseek(fin, (bfile->position), SEEK_SET);
	148	+ if (result == -1) {
	149	+ fprintf(stderr,"lseek of file to %ld failed (2)\n",(long int) bfile->position);
	150	+ return(-1);
	151	+ }
	152	+ result = read(fin, bfile->marker_buffer, 7);
	153	+ if (result < 7) {
	154	+ /* fprintf(stderr,"read of file failed\n"); */
	155	+ return(-1);
	156	+ }
	157	+ }
	158	+ else {
	159	+ bfile->block_start = bfile->position;
	160	+ return(1);
	161	+ }
	162	+ }
	163	+ return(0);
	164	+}
	165	+
	166	+/*
	167	+ initializes the bz2 strm structure,
	168	+ calls the BZ2 decompression library initializer
	169	+
	170	+ returns:
	171	+ BZ_OK on success
	172	+ various BZ_ errors on failure (see bzlib.h)
	173	+*/
	174	+int init_decompress(bz_info_t *bfile) {
	175	+ int bz_verbosity = 0;
	176	+ int bz_small = 0;
	177	+ int ret;
	178	+
	179	+ bfile->strm.bzalloc = NULL;
	180	+ bfile->strm.bzfree = NULL;
	181	+ bfile->strm.opaque = NULL;
	182	+
	183	+ ret = BZ2_bzDecompressInit ( &(bfile->strm), bz_verbosity, bz_small );
	184	+ if (ret != BZ_OK) {
	185	+ fprintf(stderr,"uncompress failed, err %d\n", ret);
	186	+ exit(-1);
	187	+ }
	188	+ return(ret);
	189	+}
	190	+
	191	+/*
	192	+ reads the first 4 bytes from a bz2 file (should be
	193	+ "BZh" followed by the block size indicator, typically "9")
	194	+ and passes them into the BZ2 decompression library.
	195	+ This must be done before decompression of any block of the
	196	+ file is attempted.
	197	+
	198	+ returns:
	199	+ BZ_OK if successful,
	200	+ various BZ_ errors on failure (see bzlib.h)
	201	+*/
	202	+int decompress_header(int fin, bz_info_t *bfile) {
	203	+ int ret, res;
	204	+
	205	+ res = lseek(fin,0,SEEK_SET);
	206	+ if (res == -1) {
	207	+ fprintf(stderr,"lseek of file to 0 failed (3)\n");
	208	+ exit(-1);
	209	+ }
	210	+ bfile->bytes_read = read(fin, bfile->header_buffer, 4);
	211	+ if (bfile->bytes_read < 4) {
	212	+ fprintf(stderr,"failed to read 4 bytes of header, exiting\n");
	213	+ exit(-1);
	214	+ }
	215	+ bfile->strm.next_in = (char *)bfile->header_buffer;
	216	+ bfile->strm.avail_in = 4;
	217	+
	218	+ ret = BZ2_bzDecompress ( &(bfile->strm) );
	219	+ if (BZ_OK != ret && BZ_STREAM_END != ret) {
	220	+ fprintf(stderr,"Corrupt bzip2 header, exiting\n");
	221	+ exit(-1);
	222	+ }
	223	+ return(ret);
	224	+}
	225	+
	226	+/*
	227	+ seek to appropriate offset as specified in bfile,
	228	+ read compressed data into buffer indicated by bfile,
	229	+ update the bfile structure accordingly,
	230	+ save the overflow byte (bit-shifted data = suck)
	231	+ this is for the first buffer of data in a stream,
	232	+ for subsequent buffers use fill_buffer_to_decompress()
	233	+
	234	+ this will set bfile->eof on eof. no other indicator
	235	+ will be provided.
	236	+
	237	+ returns:
	238	+ 0 on success
	239	+ -1 on error
	240	+*/
	241	+int setup_first_buffer_to_decompress(int fin, bz_info_t *bfile) {
	242	+ int res;
	243	+
	244	+ if (bfile->bits_shifted == 0) {
	245	+ res = lseek(fin,bfile->position+1,SEEK_SET);
	246	+ if (res == -1) {
	247	+ fprintf(stderr,"lseek of file to %ld failed (4)\n",(long int) bfile->position+1);
	248	+ return(-1);
	249	+ }
	250	+ }
	251	+ else {
	252	+ res = lseek(fin,bfile->position,SEEK_SET);
	253	+ if (res == -1) {
	254	+ fprintf(stderr,"lseek of file to %ld failed (5)\n",(long int) bfile->position);
	255	+ return(-1);
	256	+ }
	257	+ }
	258	+ bfile->bytes_read = read(fin, bfile->bufin, bfile->bufin_size);
	259	+ if (bfile->bytes_read > 0) {
	260	+ bfile->overflow = bfile->bufin[bfile->bytes_read-1];
	261	+ shift_bytes_left(bfile->bufin, bfile->bytes_read, bfile->bits_shifted);
	262	+
	263	+ bfile->strm.next_in = (char *)(bfile->bufin);
	264	+ bfile->strm.avail_in = bfile->bytes_read-1;
	265	+ }
	266	+ if (bfile->bytes_read <=0) {
	267	+ bfile->eof++;
	268	+ }
	269	+ return(0);
	270	+}
	271	+
	272	+/*
	273	+ read compressed data into buffer indicated by bfile,
	274	+ from current position of file,
	275	+ stuffing the overflow byte in first.
	276	+ update the bfile structure accordingly
	277	+ save the new overflow byte (bit-shifted data = suck)
	278	+ this function is for decompression of buffers *after
	279	+ the first one*. for the first one use
	280	+ setup_first_buffer_to_decompress()
	281	+
	282	+ this will set bfile->eof on eof. no other indicator
	283	+ will be provided.
	284	+
	285	+ returns:
	286	+ 0 on success
	287	+ hmm, it really does not do anything about errors :-D
	288	+*/
	289	+int fill_buffer_to_decompress(int fin, bz_info_t *bfile, int ret) {
	290	+ if (bfile->strm.avail_in == 0) {
	291	+ bfile->strm.next_in = (char *)(bfile->bufin);
	292	+ bfile->bufin[0] = bfile->overflow;
	293	+ bfile->bytes_read = read(fin, bfile->bufin+1, bfile->bufin_size-1);
	294	+ if (bfile->bytes_read > 0) {
	295	+ bfile->overflow = bfile->bufin[bfile->bytes_read];
	296	+ shift_bytes_left(bfile->bufin,bfile->bytes_read+1,bfile->bits_shifted);
	297	+ bfile->strm.avail_in = bfile->bytes_read;
	298	+ bfile->position+=bfile->bytes_read;
	299	+ }
	300	+ else {
	301	+ bfile->strm.avail_in = 1; /* the overflow byte */
	302	+ bfile->eof++;
	303	+ }
	304	+ }
	305	+ return(0);
	306	+}
	307	+
	308	+/* size of buffer is bytes usable. there will be a null byte at the end
	309	+
	310	+ what we do with the buffer:
	311	+ - read from front of buffer to end,
	312	+ - fill from point where prev read did not fill buffer, or from where
	313	+ move of data at end of buffer to beginning left room,
	314	+ - mark a string of bytes (starting from what's available to read) as "read"
	315	+
	316	+*/
	317	+buf_info_t *init_buffer(int size) {
	318	+ buf_info_t *b;
	319	+
	320	+ b = (buf_info_t *)malloc(sizeof(buf_info_t));
	321	+ b->buffer = malloc(sizeof(unsigned char)*(size+1));
	322	+ b->buffer[size]='\0';
	323	+ b->end = b->buffer + size;
	324	+ b->next_to_read = b->end; /* nothing available */
	325	+ b->bytes_avail = 0; /* bytes to read, nothing available */
	326	+ b->next_to_fill = b->buffer; /* empty */
	327	+ b->next_to_fill[0] = '\0';
	328	+ return(b);
	329	+}
	330	+
	331	+/* check if buffer (used for decompressed data output) is empty,
	332	+ returns 1 if so and 0 if not */
	333	+int buffer_is_empty(buf_info_t *b) {
	334	+ if (b->bytes_avail == 0) {
	335	+ return(1);
	336	+ }
	337	+ else {
	338	+ return(0);
	339	+ }
	340	+}
	341	+
	342	+/* check if buffer (used for decompressed data output) is full,
	343	+
	344	+ returns 1 if so and 0 if not
	345	+ I'm not liking this function so well, fixme */
	346	+int buffer_is_full(buf_info_t *b) {
	347	+ if (b->next_to_fill == b->end) {
	348	+ return(1);
	349	+ }
	350	+ else {
	351	+ return(0);
	352	+ }
	353	+}
	354	+
	355	+/* FIXME do this right. whatever. */
	356	+int get_file_size(int fin) {
	357	+ int res;
	358	+
	359	+ res = lseek(fin, 0, SEEK_END);
	360	+ if (res == -1) {
	361	+ fprintf(stderr,"lseek of file to 0 failed (6)\n");
	362	+ exit(-1);
	363	+ }
	364	+ return(res);
	365	+}
	366	+
	367	+
	368	+/*
	369	+ look for the first bz2 block in the file after specified offset
	370	+ it tests that the block is valid by doing partial decompression.
	371	+ this function will update the bfile structure:
	372	+ bfile->position will contain the current position of the file (? will it?)
	373	+ bfile->bits_shifted will contain the number of bits that the block is rightshifted
	374	+ bfile->block_start will contain the offset from start of file to the block
	375	+ returns:
	376	+ position of next byte in file to be read, on success
	377	+ -1 if no marker or other error
	378	+*/
	379	+int find_first_bz2_block_after_offset(bz_info_t *bfile, int fin, int position) {
	380	+ int res;
	381	+
	382	+ bfile->bufin_size = BUFINSIZE;
	383	+ bfile->marker = init_marker();
	384	+ bfile->position = position;
	385	+ bfile->block_start = -1;
	386	+ bfile->bytes_read = 0;
	387	+ bfile->bytes_written = 0;
	388	+ bfile->eof = 0;
	389	+ bfile->bits_shifted = -1;
	390	+
	391	+ bfile->file_size = get_file_size(fin);
	392	+
	393	+ while (bfile->bits_shifted < 0) {
	394	+ if (bfile->position > bfile->file_size) {
	395	+ return(-1);
	396	+ }
	397	+ res = lseek(fin, bfile->position, SEEK_SET);
	398	+ if (res == -1) {
	399	+ fprintf(stderr,"lseek of file to %ld failed (7)\n",(long int) bfile->position);
	400	+ exit(-1);
	401	+ }
	402	+ res = find_next_bz2_block_marker(fin, bfile);
	403	+ if (res == 1) {
	404	+ init_decompress(bfile);
	405	+ decompress_header(fin, bfile);
	406	+ res = setup_first_buffer_to_decompress(fin, bfile);
	407	+ if (res == -1) {
	408	+ fprintf(stderr,"couldn't get first buffer of data to uncompress\n");
	409	+ exit(-1);
	410	+ }
	411	+ bfile->strm.next_out = (char *)bfile->bufout;
	412	+ bfile->strm.avail_out = bfile->bufout_size;
	413	+ res = BZ2_bzDecompress ( &(bfile->strm) );
	414	+ /* this means we (probably) have a genuine marker */
	415	+ if (BZ_OK == res \|\| BZ_STREAM_END == res) {
	416	+ res = BZ2_bzDecompressEnd ( &(bfile->strm) );
	417	+ bfile->bytes_read = 0;
	418	+ bfile->bytes_written = 0;
	419	+ bfile->eof = 0;
	420	+ /* leave the file at the right position */
	421	+ res = lseek(fin, bfile->block_start, SEEK_SET);
	422	+ if (res == -1) {
	423	+ fprintf(stderr,"lseek of file to %ld failed (7)\n",(long int) bfile->position);
	424	+ exit(-1);
	425	+ }
	426	+ return(0);
	427	+ }
	428	+ /* right bytes, but there by chance, skip and try again */
	429	+ else {
	430	+ bfile->position+=6;
	431	+ bfile->bits_shifted = -1;
	432	+ bfile->block_start = -1;
	433	+ }
	434	+ }
	435	+ else {
	436	+ return(-1);
	437	+ }
	438	+ }
	439	+ return(-1);
	440	+}
	441	+
	442	+/*
	443	+ find the first bz2 block marker in the file,
	444	+ from its current position,
	445	+ then set up for decompression from that point
	446	+ returns:
	447	+ 0 on success
	448	+ -1 if no marker or other error
	449	+*/
	450	+int init_bz2_file(bz_info_t *bfile, int fin) {
	451	+ int res;
	452	+
	453	+ bfile->initialized++;
	454	+
	455	+ res = find_next_bz2_block_marker(fin, bfile);
	456	+ if (res ==1) {
	457	+ init_decompress(bfile);
	458	+ decompress_header(fin, bfile);
	459	+ setup_first_buffer_to_decompress(fin, bfile);
	460	+ return(0);
	461	+ }
	462	+ return(-1);
	463	+}
	464	+
	465	+/* return -1 if error */
	466	+int decompress_data(bz_info_t bfile, int fin, unsigned char bufferout, int bufout_size) {
	467	+ int ret;
	468	+
	469	+ bfile->bufout = bufferout;
	470	+ bfile->bufout_size = bufout_size;
	471	+ bfile->bytes_written = 0;
	472	+
	473	+ if (! bfile->initialized) {
	474	+ if (init_bz2_file(bfile, fin) == -1) {
	475	+ /* fprintf(stderr,"failed to find block in bz2file (2)\n"); */
	476	+ return(-1);
	477	+ };
	478	+ bfile->strm.next_out = (char *)bfile->bufout;
	479	+ bfile->strm.avail_out = bfile->bufout_size;
	480	+ }
	481	+
	482	+ ret = BZ_OK;
	483	+ while (BZ_OK == ret && bfile->bytes_written == 0) {
	484	+ ret = BZ2_bzDecompress ( &(bfile->strm) );
	485	+ if (BZ_OK == ret \|\| BZ_STREAM_END == ret) {
	486	+ bfile->bytes_written = (unsigned char *)(bfile->strm.next_out) - bfile->bufout;
	487	+ }
	488	+ else {
	489	+ /* fprintf(stderr,"error from BZ decompress %d\n",ret); */
	490	+ return(-1);
	491	+ }
	492	+ fill_buffer_to_decompress(fin, bfile, ret);
	493	+ /*
	494	+ if (bfile->eof && (BZ_OK == ret \|\| BZ_STREAM_END == ret) ) {
	495	+ fprintf(stderr,"eof reached\n");
	496	+ }
	497	+ */
	498	+ }
	499	+ return(0);
	500	+}
	501	+
	502	+
	503	+/*
	504	+ fill output buffer in b with uncompressed data from bfile
	505	+ if this is the first call to the function for this file,
	506	+ the file header will be read, and the first buffer of
	507	+ uncompressed data will be prepared. bfile->position
	508	+ should be set to the offset (from the beginning of file) from
	509	+ which to find the first bz2 block.
	510	+
	511	+ returns:
	512	+ on success, number of bytes read (may be 0)
	513	+ -1 on error
	514	+*/
	515	+int get_buffer_of_uncompressed_data(buf_info_t b, int fin, bz_info_t bfile) {
	516	+ int res;
	517	+
	518	+ if (buffer_is_full(b)) {
	519	+ return(0);
	520	+ }
	521	+
	522	+ if (buffer_is_empty(b)) {
	523	+ b->next_to_fill = b->buffer;
	524	+ }
	525	+
	526	+ res = decompress_data(bfile, fin, b->next_to_fill, b->end - b->next_to_fill);
	527	+ if (res == -1) {
	528	+ return(res);
	529	+ }
	530	+ if (bfile->bytes_written < 0) {
	531	+ /* fprintf(stderr,"read of file failed\n"); */
	532	+ return(-1);
	533	+ }
	534	+ else {
	535	+ /* really?? FIXME check this */
	536	+ if (buffer_is_empty(b)) {
	537	+ b->next_to_read = b->next_to_fill; /* where we just read */
	538	+ }
	539	+ b->bytes_avail += bfile->bytes_written;
	540	+ b->next_to_fill += bfile->bytes_written;
	541	+ b->next_to_fill[0] = '\0';
	542	+ return(0);
	543	+ }
	544	+}
	545	+
	546	+void dumpbuf_info_t(buf_info_t *b) {
	547	+ fprintf(stdout, "\n");
	548	+ fprintf(stdout, "b->buffer: %ld\n", (long int) b->buffer);
	549	+ fprintf(stdout, "b->end: %ld\n", (long int) b->end);
	550	+ fprintf(stdout, "b->next_to_read: %ld\n", (long int) b->next_to_read);
	551	+ fprintf(stdout, "b->next_to_fill: %ld\n", (long int) b->next_to_fill);
	552	+ fprintf(stdout, "b->bytes_avail: %ld\n", (long int) b->bytes_avail);
	553	+}
	554	+
	555	+
	556	+/*
	557	+ copy text from end of buffer to the beginning, that we want to keep
	558	+ around for further processing (i.e. further regex matches)
	559	+ returns number of bytes copied
	560	+*/
	561	+int move_bytes_to_buffer_start(buf_info_t *b, unsigned char *from_where, int maxbytes) { /* maxbytes == 0 means no limit */
	562	+ int i, tocopy;
	563	+
	564	+ if (from_where >= b->end) { /* nothing at or beyond the end of the buffer to keep */
	565	+ return(0);
	566	+ }
	567	+ else {
	568	+ tocopy = b->end - from_where; /* measured to the end of the buffer, not to next_to_fill; callers bound it with maxbytes */
	569	+ if (maxbytes && (tocopy > maxbytes)) {
	570	+ tocopy = maxbytes;
	571	+ }
	572	+ for (i = 0; i < tocopy; i++) { /* forward byte copy; overlap is harmless as long as from_where points into b->buffer, since the destination is then the lower address */
	573	+ b->buffer[i] = from_where[i];
	574	+ }
	575	+ b->next_to_fill = b->buffer + tocopy;
	576	+ b->next_to_fill[0] = '\0'; /* keep the kept bytes terminated for string/regex use */
	577	+ b->next_to_read = b->buffer;
	578	+ b->bytes_avail = tocopy;
	579	+ return(tocopy);
	580	+ }
	581	+}
	582	+
	583	+/*
	584	+ get the first page id after position in file
	585	+ if a pageid is found, the structure pinfo will be updated accordingly
	586	+ returns:
	587	+ 1 if a pageid found,
	588	+ 0 if no pageid found,
	589	+ -1 on error
	590	+*/
	591	+int get_first_page_id_after_offset(int fin, int position, page_info_t *pinfo) { /* NOTE(review): b and the decompression state in bfile are not released here; their cleanup routines are not visible from this function */
	592	+ int res;
	593	+ regmatch_t match_page[1], match_page_id[2]; /* whole match, plus the captured id for the second pattern; on the stack so nothing leaks */
	594	+ regex_t compiled_page, compiled_page_id;
	595	+ int length=5000; /* output buffer size */
	596	+ char *page = "<page>";
	597	+ char *page_id = "<page>\n[ ]+<title>[^<]+</title>\n[ ]+<id>([0-9]+)</id>\n";
	598	+
	599	+ buf_info_t *b;
	600	+ bz_info_t bfile;
	601	+
	602	+ bfile.initialized = 0;
	603	+ bfile.bytes_read = 0;
	604	+ pinfo->bits_shifted = -1; /* -1 everywhere means "nothing found", also on the error returns below */
	605	+ pinfo->position = -1;
	606	+ pinfo->page_id = -1;
	607	+
	608	+ res = regcomp(&compiled_page, page, REG_EXTENDED);
	609	+ if (res != 0) {
	610	+ return(-1);
	611	+ }
	612	+ res = regcomp(&compiled_page_id, page_id, REG_EXTENDED);
	613	+ if (res != 0) {
	614	+ regfree(&compiled_page);
	615	+ return(-1);
	616	+ }
	617	+ b = init_buffer(length);
	618	+ if (find_first_bz2_block_after_offset(&bfile, fin, position) == -1) {
	619	+ /* fprintf(stderr,"failed to find block in bz2file (1)\n"); */
	620	+ regfree(&compiled_page); regfree(&compiled_page_id); return(-1);
	621	+ }
	622	+
	623	+ while (!get_buffer_of_uncompressed_data(b, fin, &bfile) && (! bfile.eof)) { /* a decompression error also ends this loop, and is then reported as "not found" (0) */
	624	+ if (bfile.bytes_read) {
	625	+ while (regexec(&compiled_page_id, (char *)b->next_to_read, 2, match_page_id, 0 ) == 0) {
	626	+ if (match_page_id[1].rm_so >=0) {
	627	+ /* write page_id to stderr */
	628	+ /*
	629	+ fwrite(b->next_to_read+match_page_id[1].rm_so, sizeof(unsigned char), match_page_id[1].rm_eo - match_page_id[1].rm_so, stderr);
	630	+ fwrite("\n",1,1,stderr);
	631	+ */
	632	+ pinfo->page_id = atoi((char *)(b->next_to_read+match_page_id[1].rm_so)); /* atoi stops at the '<' that follows the digits */
	633	+ pinfo->position = bfile.block_start;
	634	+ pinfo->bits_shifted = bfile.bits_shifted;
	635	+ regfree(&compiled_page); regfree(&compiled_page_id); return(1);
	636	+ /* write up to and including page id tag to stdout */
	637	+ /*
	638	+ fwrite(b->next_to_read,match_page_id[0].rm_eo,1,stdout);
	639	+ b->next_to_read = b->next_to_read+match_page_id[0].rm_eo;
	640	+ b->bytes_avail -= match_page_id[0].rm_eo;
	641	+ */
	642	+ }
	643	+ else {
	644	+ /* should never happen */
	645	+ fprintf(stderr,"regex gone bad...\n");
	646	+ exit(-1);
	647	+ }
	648	+ }
	649	+ if (regexec(&compiled_page, (char *)b->next_to_read, 1, match_page, 0 ) == 0) { /* a <page> tag whose title/id lines have not all arrived yet: keep it and read more */
	650	+ /* write everything up to but not including the page tag to stdout */
	651	+ /*
	652	+ fwrite(b->next_to_read,match_page[0].rm_eo - 6,1,stdout);
	653	+ */
	654	+ move_bytes_to_buffer_start(b, b->next_to_read + match_page[0].rm_so, b->bytes_avail - match_page[0].rm_so);
	655	+ bfile.strm.next_out = (char *)b->next_to_fill;
	656	+ bfile.strm.avail_out = b->end - b->next_to_fill;
	657	+ }
	658	+ else {
	659	+ /* could have the first part of the page tag... so copy up enough bytes to cover that case */
	660	+ if (b->bytes_avail> 5) {
	661	+ /* write everything that didn't match, but leave 5 bytes, to stdout */
	662	+ /*
	663	+ fwrite(b->next_to_read,b->bytes_avail - 5,1,stdout);
	664	+ */
	665	+ move_bytes_to_buffer_start(b, b->next_to_read + b->bytes_avail - 5, 5);
	666	+ bfile.strm.next_out = (char *)b->next_to_fill;
	667	+ bfile.strm.avail_out = b->end - b->next_to_fill;
	668	+ }
	669	+ else {
	670	+ if (buffer_is_empty(b)) {
	671	+ bfile.strm.next_out = (char *)b->buffer;
	672	+ bfile.strm.avail_out = bfile.bufout_size;
	673	+ b->next_to_fill = b->buffer; /* empty */
	674	+ }
	675	+ else {
	676	+ /* there were only 5 or less bytes so just save em don't write em to stdout */
	677	+ move_bytes_to_buffer_start(b, b->next_to_read, b->bytes_avail);
	678	+ bfile.strm.next_out = (char *)b->next_to_fill;
	679	+ bfile.strm.avail_out = b->end - b->next_to_fill;
	680	+ }
	681	+ }
	682	+ }
	683	+ }
	684	+ }
	685	+ /*
	686	+ if (b->bytes_avail) {
	687	+ fwrite(b->next_to_read,b->bytes_avail,1,stdout);
	688	+ }
	689	+ */
	690	+ regfree(&compiled_page); regfree(&compiled_page_id); return(0);
	691	+}
	692	+
	693	+/* search for pageid in a bz2 file, given start and end offsets
	694	+ to search for
	695	+ we guess by the most boring method possible (shrink the
	696	+ interval according to the value found on the last guess,
	697	+ try midpoint of the new interval)
	698	+ multiple calls of this will get the job done.
	699	+ interval has left end = right end if search is complete.
	700	+ this function may return the previous guess and simply
	701	+ shrink the interval.
	702	+ note that a "match" means either that the pageid we find
	703	+ is smaller than the one the caller wants, or is equal.
	704	+ why? because then we can use the output for prefetch
	705	+ for xml dumps and be sure a specific page range is covered :-P
	706	+
	707	+ return value from guess, or -1 on error.
	708	+ */
	709	+int do_iteration(iter_info_t *iinfo, int fin, page_info_t *pinfo) { /* iinfo: search interval, updated in place; pinfo: receives the result of this probe */
	710	+ int res;
	711	+ int new_position;
	712	+ int interval;
	713	+
	714	+ /*
	715	+ last_position is somewhere in the interval, perhaps at an end
	716	+ last_value is the value we had at that position
	717	+ */
	718	+
	719	+ interval = (iinfo->right_end - iinfo->left_end)/2; /* step size: half the current interval, but never 0 */
	720	+ if (interval == 0) {
	721	+ interval = 1;
	722	+ }
	723	+ /* fprintf(stderr,"interval size is %ld, left end %ld, right end %ld, last val %d\n",interval, iinfo->left_end, iinfo->right_end, iinfo->last_value); */
	724	+ /* if we're this close, we'll check this value and be done with it */
	725	+ if (iinfo->right_end -iinfo->left_end < 2) {
	726	+ new_position = iinfo->left_end;
	727	+ iinfo->right_end = iinfo->left_end; /* collapsing the interval tells the caller the search is over */
	728	+ }
	729	+ else {
	730	+ if (iinfo->last_value < iinfo->value_wanted) {
	731	+ /* fprintf(stderr,"resetting left end\n"); */
	732	+ iinfo->left_end = iinfo->last_position;
	733	+ new_position = iinfo->last_position + interval;
	734	+ }
	735	+ /* iinfo->last_value > iinfo->value_wanted */
	736	+ else {
	737	+ /* fprintf(stderr,"resetting right end\n"); */
	738	+ iinfo->right_end = iinfo->last_position;
	739	+ new_position = iinfo->last_position - interval;
	740	+ }
	741	+ }
	742	+ res = get_first_page_id_after_offset(fin, new_position, pinfo);
	743	+ if (res >0) {
	744	+ /* caller wants the new value */
	745	+ iinfo->last_value = pinfo->page_id;
	746	+ iinfo->last_position = new_position;
	747	+ return(pinfo->page_id);
	748	+ }
	749	+ else {
	750	+ /* here is the tough case, if we didn't find anything then we are prolly too close to the end, truncation or
	751	+ there's just no block here.
	752	+ set the right end, keep the last value and position and let the caller retry with the new interval */
	753	+ if (iinfo->last_value < iinfo->value_wanted) { /* we were moving towards eof */
	754	+ iinfo->right_end = new_position;
	755	+ return(iinfo->last_value);
	756	+ }
	757	+ /* in theory we were moving towards beginning of file, should not have issues, so bail here */
	758	+ else {
	759	+ /* fprintf(stderr,"something very broken, giving up\n"); */
	760	+ return(-1);
	761	+ }
	762	+ }
	763	+}
	764	+
	765	+/*
	766	+ given a bzipped and possibly truncated file, and a page id,
	767	+ hunt for the page id in the file; this assumes that the
	768	+ bz2 header is intact and that page ids are steadily increasing
	769	+ throughout the file.
	770	+
	771	+ writes the offset of the relevant block (from beginning of file)
	772	+ and the first pageid found in that block, to stdout
	773	+
	774	+ format of output:
	775	+ position:xxxxx page_id:nnn
	776	+
	777	+ returns: 0 on success, -1 on error
	778	+*/
	779	+int main(int argc, char **argv) { /* argv[1]: bz2 file to search; argv[2]: page id wanted */
	780	+ int fin, res, page_id, file_size;
	781	+ page_info_t pinfo;
	782	+ iter_info_t iinfo;
	783	+
	784	+ if (argc != 3) {
	785	+ fprintf(stderr,"usage: %s infile id\n", argv[0]);
	786	+ exit(-1);
	787	+ }
	788	+
	789	+ fin = open (argv[1], O_RDONLY);
	790	+ if (fin < 0) {
	791	+ fprintf(stderr,"failed to open file %s for read\n", argv[1]);
	792	+ exit(-1);
	793	+ }
	794	+
	795	+ page_id = atoi(argv[2]); /* non-numeric input yields 0 and is rejected just below */
	796	+ if (page_id <1) {
	797	+ fprintf(stderr,"please specify a page_id >= 1.\n");
	798	+ fprintf(stderr,"usage: %s infile page_id\n", argv[0]);
	799	+ exit(-1);
	800	+ }
	801	+
	802	+ file_size = get_file_size(fin);
	803	+
	804	+ pinfo.bits_shifted = -1;
	805	+ pinfo.position = -1;
	806	+ pinfo.page_id = -1;
	807	+
	808	+ /*
	809	+ search the whole file to begin with; do_iteration() narrows
	810	+ [left_end, right_end] on every pass
	811	+ */
	812	+ iinfo.left_end = 0;
	813	+ iinfo.right_end = file_size;
	814	+ iinfo.value_wanted = page_id;
	815	+
	816	+ res = get_first_page_id_after_offset(fin, 0, &pinfo); /* the first probe seeds last_value/last_position */
	817	+ if (res > 0) {
	818	+ iinfo.last_value = pinfo.page_id;
	819	+ iinfo.last_position = 0;
	820	+ }
	821	+ else {
	822	+ fprintf(stderr,"failed to get anything useful from the beginning of the file even, bailing.\n");
	823	+ exit(1);
	824	+ }
	825	+ if (pinfo.page_id == page_id) {
	826	+ fprintf(stdout,"position:%d page_id:%d\n",pinfo.position, pinfo.page_id);
	827	+ exit(0);
	828	+ }
	829	+
	830	+ while (1) {
	831	+ res = do_iteration(&iinfo, fin, &pinfo);
	832	+ /* things to check: bad return? interval is 0 bytes long? */
	833	+ if (res < 0) { /* tested first: after a failed probe pinfo holds only -1 placeholders, which must not be printed as a result */
	834	+ fprintf(stderr,"broken and quitting\n");
	835	+ exit(-1);
	836	+ }
	837	+ else if (iinfo.left_end == iinfo.right_end) { /* interval has closed: the search is complete */
	838	+ fprintf(stdout,"position:%d page_id:%d\n",pinfo.position, pinfo.page_id);
	839	+ exit(0);
	840	+ }
	841	+ }
	842	+ exit(0);
	843	+}
Property changes on: branches/ariel/xmldumps-backup/mwbzutils/findpageidinbz2xml.c
___________________________________________________________________
Added: svn:eol-style
1	844	+ native
Index: branches/ariel/xmldumps-backup/mwbzutils/checkforbz2footer.c
—	—	@@ -0,0 +1,156 @@
	2	+#include <unistd.h>
	3	+#include <stdio.h>
	4	+#include <string.h>
	5	+#include <sys/types.h>
	6	+#include <sys/stat.h>
	7	+#include <fcntl.h>
	8	+#include <stdlib.h>
	9	+#include <errno.h>
	10	+
	11	+/*
	12	+ Check to see whether a file ends with a bz2 footer or not
	13	+ (i.e. if it is truncated or corrupted).
	14	+ This is a crude but fast test for integrity; we don't
	15	+ check the CRC at the end of the stream, nor do we check the
	16	+ bit padding in the last byte of the file.
	17	+
	18	+ Arguments: the name of the file to check, presumably
	19	+ a bzipped file.
	20	+ Outputs: none.
	21	+ Exits with 0 if the file contains the footer at the end,
	22	+ 1 if the file does not contain the footer, and -1 on error.
	23	+*/
	24	+
	25	+
	26	+int read_footer(unsigned char *buffer, int fin) { /* fills buffer (room for 11 bytes needed) with the last 11 bytes of fin; exits the program on failure */
	27	+ int res;
	28	+
	29	+ /* test lseek's result as an off_t: stored in an int, an offset in a large file can be truncated where off_t is wider than int */
	30	+ if (lseek(fin, -11, SEEK_END) == (off_t) -1) {
	31	+ fprintf(stderr,"lseek of file failed\n");
	32	+ exit(-1);
	33	+ }
	34	+ res = read(fin, buffer, 11);
	35	+ if (res != 11) { /* an error, or a short read that would leave part of buffer unset for the comparison */
	36	+ fprintf(stderr,"read of file failed\n");
	37	+ exit(-1);
	38	+ }
	39	+ return(0);
	40	+}
	41	+
	42	+#define LEFT 0
	43	+#define RIGHT 1
	44	+
	45	+/* return a mask of numbits one-bits: at the low-order end for RIGHT, otherwise moved up to the high-order end of an 8-bit byte; e.g. bitmask(3,RIGHT) is 0x07 and bitmask(3,LEFT) is 0xe0 */
	46	+int bitmask(int numbits, int end) {
	47	+ if (end == RIGHT) {
	48	+ return((1<<numbits)-1);
	49	+ }
	50	+ else {
	51	+ return(((1<<numbits)-1) << (8-numbits));
	52	+ }
	53	+}
	54	+
	55	+void shiftbytesright(unsigned char *buffer, int buflen, int numbits) { /* shift the buflen bytes of buffer right by numbits bits, in place, as one long bit string */
	56	+ int i;
	57	+
	58	+ for (i=buflen-1; i>=0; i--) { /* walk from the last byte down so buffer[i-1] is still unshifted when its low bits are borrowed */
	59	+ /* right numbits */
	60	+ buffer[i] = (unsigned char) ((int) (buffer[i]) >> numbits);
	61	+
	62	+ /* grab rightmost from prev byte */
	63	+ if (i > 0) {
	64	+ buffer[i] = ( unsigned char ) ((unsigned int) buffer[i] | ( ((unsigned int) (buffer[i-1])<<(8-numbits)) & bitmask(numbits,LEFT)));
	65	+ }
	66	+ }
	67	+}
	68	+
	69	+/* buff1 is some random bytes, buff2 is some random bytes which we expect to start with the contents of buff1,
	70	+ both buffers are bit-shifted to the right "bitsrightshifted". this function compares the two and returns 0 if buff2
	71	+ matches and 1 otherwise. */
	72	+int bytescompare(unsigned char *buff1, unsigned char *buff2, int numbytes, int bitsrightshifted) { /* returns 0 when the significant bits agree, 1 on the first difference */
	73	+ int i;
	74	+
	75	+ if (bitsrightshifted == 0) { /* byte-aligned: every byte must match exactly */
	76	+ for (i = 0; i< numbytes; i++) {
	77	+ if (buff1[i] != buff2[i]) {
	78	+ return(1);
	79	+ }
	80	+ }
	81	+ return(0);
	82	+ }
	83	+ else {
	84	+ for (i = 1; i< numbytes-1; i++) { /* all interior bytes, 1 .. numbytes-2, are whole; the old bound of numbytes-2 left the last of them unchecked */
	85	+ if (buff1[i] != buff2[i]) {
	86	+ return(1);
	87	+ }
	88	+ }
	89	+ /* do leftmost byte */
	90	+ if ((buff1[0] & bitmask(8-bitsrightshifted,RIGHT)) != (buff2[0] & bitmask(8-bitsrightshifted,RIGHT)) ) {
	91	+ return(1);
	92	+ }
	93	+ /* do rightmost byte */
	94	+ if ((buff1[numbytes-1] & bitmask(bitsrightshifted,LEFT)) != (buff2[numbytes-1] & bitmask(bitsrightshifted,LEFT)) ) {
	95	+ return(1);
	96	+ }
	97	+ return(0);
	98	+ }
	99	+}
	100	+
	101	+int checkfileforfooter(int fin) { /* returns 0 if the end of fin carries the bz2 footer magic at any bit alignment, 1 if not */
	102	+ unsigned char buffer[11];
	103	+ int result, i;
	104	+ unsigned char **footer = malloc(8*sizeof(unsigned char *)); /* NOTE(review): these allocations are neither checked nor freed; the only caller exits right afterwards */
	105	+
	106	+ /* set up footer plus its various right-shifted incarnations */
	107	+ /* dude why couldn't you have 0 padded each bzip2 block? seriously ... */
	108	+ for (i = 0; i< 8; i++) {
	109	+ footer[i] = malloc(sizeof(unsigned char)*7);
	110	+ }
	111	+ footer[0][0]= (unsigned char) 0x17;
	112	+ footer[0][1]= (unsigned char) 0x72;
	113	+ footer[0][2]= (unsigned char) 0x45;
	114	+ footer[0][3]= (unsigned char) 0x38;
	115	+ footer[0][4]= (unsigned char) 0x50;
	116	+ footer[0][5]= (unsigned char) 0x90;
	117	+ footer[0][6]= (unsigned char) 0x00; /* spare byte that receives the bits pushed out by the shifts below */
	118	+ for (i = 1; i< 8; i++) { /* footer[i] is footer[i-1] shifted one more bit, i.e. the magic shifted right by i bits */
	119	+ memcpy((char *)(footer[i]), (char *)(footer[i-1]),7);
	120	+ shiftbytesright(footer[i],7,1);
	121	+ }
	122	+
	123	+ read_footer(buffer,fin);
	124	+
	125	+ result = bytescompare(footer[0],buffer+1,6,0); /* byte-aligned case: the comparison starts one byte into the 11 bytes read */
	126	+ if (!result) {
	127	+ return(0);
	128	+ }
	129	+
	130	+ for (i=1; i<8; i++) {
	131	+ result = bytescompare(footer[i],buffer,7,i);
	132	+ if (!result) {
	133	+ return(0);
	134	+ }
	135	+ }
	136	+ return(1);
	137	+}
	138	+
	139	+int main(int argc, char **argv) { /* argv[1]: file to check; the exit status is the result of checkfileforfooter() */
	140	+
	141	+ int fin;
	142	+ int result;
	143	+
	144	+ if (argc != 2) {
	145	+ fprintf(stderr,"usage: %s infile\n", argv[0]);
	146	+ exit(-1);
	147	+ }
	148	+ fin = open (argv[1], O_RDONLY);
	149	+ if (fin < 0) {
	150	+ fprintf(stderr,"failed to open file %s for read\n", argv[1]);
	151	+ exit(-1);
	152	+ }
	153	+ result = checkfileforfooter(fin); /* 0: footer present, 1: footer missing */
	154	+ close(fin);
	155	+ exit(result);
	156	+}
	157	+
Property changes on: branches/ariel/xmldumps-backup/mwbzutils/checkforbz2footer.c
___________________________________________________________________
Added: svn:eol-style
1	158	+ native
Index: branches/ariel/xmldumps-backup/mwbzutils/dumplastbz2block.c
—	—	@@ -0,0 +1,463 @@
	2	+#include <unistd.h>
	3	+#include <stdio.h>
	4	+#include <string.h>
	5	+#include <sys/types.h>
	6	+#include <sys/stat.h>
	7	+#include <fcntl.h>
	8	+#include <stdlib.h>
	9	+#include <errno.h>
	10	+#include "bzlib.h"
	11	+
	12	+/*
	13	+ Find the last bz2 block marker in a file
	14	+ and dump whatever can be decompressed after
	15	+ that point. The header of the file must
	16	+ be intact in order for any output to be produced.
	17	+ This will produce output for truncated files as well,
	18	+ as long as there is "enough" data after the block
	19	+ marker.
	20	+
	21	+ Arguments: the name of the file to check, presumably
	22	+ a bzipped file.
	23	+ Outputs: the decompressed data at the end of the file.
	24	+ Exits with 0 if decompression of some data can be done,
	25	+ 1 if decompression fails, and -1 on error.
	26	+*/
	27	+
	28	+#define BUFSIZE 121072
	29	+typedef struct { /* decompression state for one input file, including both work buffers */
	30	+ unsigned char bufin[BUFSIZE]; /* compressed bytes read from the file, bit-shifted before being handed to libbz2 */
	31	+ unsigned char bufout[BUFSIZE]; /* decompressed output, written to stdout after each decompress call */
	32	+ int bufsize; /* usable size of bufin and bufout; main sets it to BUFSIZE */
	33	+ bz_stream strm;
	34	+ unsigned char overflow; /* last byte of the previous read, kept unshifted so its bits can be carried into the next chunk */
	35	+ int bitsshifted; /* how many bits the block marker found is shifted right from a byte boundary */
	36	+ int position; /* file offset at which the marker test buffer was read */
	37	+} bzinfo;
	38	+
	39	+int read_footer(unsigned char *buffer, int fin) { /* fills buffer (room for 11 bytes needed) with the last 11 bytes of fin; exits the program on failure */
	40	+ int res;
	41	+
	42	+ /* test lseek's result as an off_t: stored in an int, an offset in a large file can be truncated where off_t is wider than int */
	43	+ if (lseek(fin, -11, SEEK_END) == (off_t) -1) {
	44	+ fprintf(stderr,"lseek of file failed\n");
	45	+ exit(-1);
	46	+ }
	47	+ res = read(fin, buffer, 11);
	48	+ if (res != 11) { /* an error, or a short read that would leave part of buffer unset for the comparison */
	49	+ fprintf(stderr,"read of file failed\n");
	50	+ exit(-1);
	51	+ }
	52	+ return(0);
	53	+}
	54	+
	55	+#define LEFT 0
	56	+#define RIGHT 1
	57	+
	58	+/* return a mask of numbits one-bits: at the low-order end for RIGHT, otherwise moved up to the high-order end of an 8-bit byte; e.g. bitmask(3,RIGHT) is 0x07 and bitmask(3,LEFT) is 0xe0 */
	59	+int bitmask(int numbits, int end) {
	60	+ if (end == RIGHT) {
	61	+ return((1<<numbits)-1);
	62	+ }
	63	+ else {
	64	+ return(((1<<numbits)-1) << (8-numbits));
	65	+ }
	66	+}
	67	+
	68	+void shiftbytesleft(unsigned char *buffer, int buflen, int numbits) { /* shift the buflen bytes of buffer left by numbits bits, in place, as one long bit string */
	69	+ int i;
	70	+
	71	+ if (numbits == 0) {
	72	+ return;
	73	+ }
	74	+
	75	+ for (i=0; i<buflen; i++) { /* walk upwards so buffer[i+1] is still unshifted when its high bits are borrowed */
	76	+ /* left numbits */
	77	+ buffer[i] = (unsigned char) ((int) (buffer[i]) << numbits);
	78	+
	79	+ /* grab leftmost from next byte */
	80	+ if (i < buflen-1) {
	81	+ buffer[i] = ( unsigned char ) ( (unsigned int) buffer[i] | ( ( ((unsigned int) buffer[i+1]) & bitmask(numbits,LEFT) ) >> (8-numbits) ) );
	82	+ }
	83	+ }
	84	+}
	85	+
	86	+
	87	+void shiftbytesright(unsigned char *buffer, int buflen, int numbits) { /* shift the buflen bytes of buffer right by numbits bits, in place, as one long bit string */
	88	+ int i;
	89	+
	90	+ for (i=buflen-1; i>=0; i--) { /* walk from the last byte down so buffer[i-1] is still unshifted when its low bits are borrowed */
	91	+ /* right numbits */
	92	+ buffer[i] = (unsigned char) ((int) (buffer[i]) >> numbits);
	93	+
	94	+ /* grab rightmost from prev byte */
	95	+ if (i > 0) {
	96	+ buffer[i] = ( unsigned char ) ((unsigned int) buffer[i] | ( ((unsigned int) (buffer[i-1])<<(8-numbits)) & bitmask(numbits,LEFT)));
	97	+ }
	98	+ }
	99	+}
	100	+
	101	+unsigned char ** init_marker() { /* returns 8 heap-allocated 7-byte arrays: entry i is the block marker shifted right by i bits */
	102	+ unsigned char **marker = malloc(8*sizeof(unsigned char *)); /* NOTE(review): allocations here are not checked for failure, and the caller never frees them */
	103	+ int i;
	104	+
	105	+ /* set up block marker plus its various right-shifted incarnations */
	106	+ for (i = 0; i< 8; i++) {
	107	+ marker[i] = malloc(sizeof(unsigned char)*7);
	108	+ }
	109	+ marker[0][0]= (unsigned char) 0x31;
	110	+ marker[0][1]= (unsigned char) 0x41;
	111	+ marker[0][2]= (unsigned char) 0x59;
	112	+ marker[0][3]= (unsigned char) 0x26;
	113	+ marker[0][4]= (unsigned char) 0x53;
	114	+ marker[0][5]= (unsigned char) 0x59;
	115	+ marker[0][6]= (unsigned char) 0x00; /* spare byte that receives the bits pushed out by the shifts below */
	116	+ for (i = 1; i< 8; i++) { /* each entry is the previous one shifted by one more bit */
	117	+ memcpy((char *)(marker[i]), (char *)(marker[i-1]),7);
	118	+ shiftbytesright(marker[i],7,1);
	119	+ }
	120	+ return(marker);
	121	+}
	122	+
	123	+unsigned char ** init_footer() { /* returns 8 heap-allocated 7-byte arrays: entry i is the footer magic shifted right by i bits */
	124	+ unsigned char **footer = malloc(8*sizeof(unsigned char *)); /* NOTE(review): allocations here are not checked for failure, and the caller never frees them */
	125	+ int i;
	126	+
	127	+ /* set up footer plus its various right-shifted incarnations */
	128	+ /* dude why couldn't you have 0 padded each bzip2 block? seriously ... */
	129	+ for (i = 0; i< 8; i++) {
	130	+ footer[i] = malloc(sizeof(unsigned char)*7);
	131	+ }
	132	+ footer[0][0]= (unsigned char) 0x17;
	133	+ footer[0][1]= (unsigned char) 0x72;
	134	+ footer[0][2]= (unsigned char) 0x45;
	135	+ footer[0][3]= (unsigned char) 0x38;
	136	+ footer[0][4]= (unsigned char) 0x50;
	137	+ footer[0][5]= (unsigned char) 0x90;
	138	+ footer[0][6]= (unsigned char) 0x00; /* spare byte that receives the bits pushed out by the shifts below */
	139	+ for (i = 1; i< 8; i++) { /* each entry is the previous one shifted by one more bit */
	140	+ memcpy((char *)(footer[i]), (char *)(footer[i-1]),7);
	141	+ shiftbytesright(footer[i],7,1);
	142	+ }
	143	+ return(footer);
	144	+}
	145	+
	146	+
	147	+/* buff1 is some random bytes, buff2 is some random bytes which we expect to start with the contents of buff1,
	148	+ both buffers are bit-shifted to the right "bitsrightshifted". this function compares the two and returns 0 if buff2
	149	+ matches and 1 otherwise. */
	150	+int bytescompare(unsigned char *buff1, unsigned char *buff2, int numbytes, int bitsrightshifted) { /* returns 0 when the significant bits agree, 1 on the first difference */
	151	+ int i;
	152	+
	153	+ if (bitsrightshifted == 0) { /* byte-aligned: every byte must match exactly */
	154	+ for (i = 0; i< numbytes; i++) {
	155	+ if (buff1[i] != buff2[i]) {
	156	+ return(1);
	157	+ }
	158	+ }
	159	+ return(0);
	160	+ }
	161	+ else {
	162	+ for (i = 1; i< numbytes-1; i++) { /* all interior bytes, 1 .. numbytes-2, are whole; the old bound of numbytes-2 left the last of them unchecked */
	163	+ if (buff1[i] != buff2[i]) {
	164	+ return(1);
	165	+ }
	166	+ }
	167	+ /* do leftmost byte */
	168	+ if ((buff1[0] & bitmask(8-bitsrightshifted,RIGHT)) != (buff2[0] & bitmask(8-bitsrightshifted,RIGHT)) ) {
	169	+ return(1);
	170	+ }
	171	+ /* do rightmost byte */
	172	+ if ((buff1[numbytes-1] & bitmask(bitsrightshifted,LEFT)) != (buff2[numbytes-1] & bitmask(bitsrightshifted,LEFT)) ) {
	173	+ return(1);
	174	+ }
	175	+ return(0);
	176	+ }
	177	+}
	178	+
	179	+/* return -1 if no match
	180	+ return number of bits rightshifted otherwise */
	181	+int checkfileforfooter(int fin, unsigned char **footer) { /* footer: the 8 shifted copies of the footer magic, indexed by shift */
	182	+ unsigned char buffer[11];
	183	+ int result, i;
	184	+
	185	+ read_footer(buffer,fin); /* last 11 bytes of the file */
	186	+
	187	+ result = bytescompare(footer[0],buffer+1,6,0); /* byte-aligned case: the comparison starts one byte into the 11 bytes read */
	188	+ if (!result) {
	189	+ return(0);
	190	+ }
	191	+
	192	+ for (i=1; i<8; i++) { /* then each of the seven shifted alignments, starting at the first byte read */
	193	+ result = bytescompare(footer[i],buffer,7,i);
	194	+ if (!result) {
	195	+ return(i);
	196	+ }
	197	+ }
	198	+ return(-1);
	199	+}
	200	+
	201	+/* return -1 if no match
	202	+ return number of bits rightshifted otherwise */
	203	+int checkbufferforblockmarker(unsigned char *buffer, unsigned char **marker) { /* buffer: 7 bytes to test; marker: the 8 shifted copies of the block marker, indexed by shift */
	204	+ int result, i;
	205	+
	206	+ result = bytescompare(marker[0],buffer+1,6,0); /* byte-aligned case: the marker is looked for one byte into buffer */
	207	+ if (!result) {
	208	+ return(0);
	209	+ }
	210	+ for (i=1; i<8; i++) { /* then each of the seven shifted alignments, starting at buffer[0] */
	211	+ result = bytescompare(marker[i],buffer,7,i);
	212	+ if (!result) {
	213	+ return(i);
	214	+ }
	215	+ }
	216	+ return(-1);
	217	+}
	218	+
	219	+void clearbuffer(unsigned char *buf, int length) { /* set the first length bytes of buf to zero */
	220	+ int i;
	221	+
	222	+ for (i=0; i<length; i++) {
	223	+ buf[i]=0;
	224	+ }
	225	+ return;
	226	+}
	227	+
	228	+int findnextmarker(int fin, int *start_at, int *position, unsigned char **marker, unsigned char *buffer ) { /* searches backwards from the end of fin; *start_at (distance from end of file) and *position (offset from start) are updated as it goes; returns the marker's bit shift, or -1 if none was found */
	229	+ int bitsshifted = -1;
	230	+ int result;
	231	+
	232	+ /* must be after 4 byte file header, and we add a leftmost byte to the buffer
	233	+ of data read in case some bits have been shifted into it */
	234	+ while (*position >= 3 && bitsshifted < 0) {
	235	+ bitsshifted = checkbufferforblockmarker(buffer, marker); /* buffer holds the 7 bytes most recently read at *position */
	236	+ if (bitsshifted < 0) {
	237	+ (*start_at)++; /* no marker here: step one byte further back from the end of the file */
	238	+ /*
	239	+ if (*start_at % 10000 == 0) {
	240	+ fprintf(stderr, "starting at %d, position %d\n", start_at, position);
	241	+ }
	242	+ */
	243	+ *position = lseek(fin, -1*(*start_at), SEEK_END);
	244	+ if (*position == -1) {
	245	+ fprintf(stderr,"lseek of file failed\n");
	246	+ exit(-1);
	247	+ }
	248	+ result = read(fin, buffer, 7);
	249	+ if (result == -1) {
	250	+ fprintf(stderr,"read of file failed\n");
	251	+ exit(-1);
	252	+ }
	253	+ }
	254	+ else {
	255	+ return(bitsshifted);
	256	+ }
	257	+ }
	258	+ return(bitsshifted);
	259	+}
	260	+
	261	+int init_decompress(bzinfo *bfile) { /* set up bfile->strm for decompression; exits the program if libbz2 refuses */
	262	+ int bz_verbosity = 0;
	263	+ int bz_small = 0;
	264	+ int ret;
	265	+
	266	+ bfile->strm.bzalloc = NULL; /* NULL allocator hooks: leave memory management to libbz2's defaults */
	267	+ bfile->strm.bzfree = NULL;
	268	+ bfile->strm.opaque = NULL;
	269	+
	270	+ ret = BZ2_bzDecompressInit ( &(bfile->strm), bz_verbosity, bz_small );
	271	+ if (ret != BZ_OK) {
	272	+ fprintf(stderr,"uncompress failed, err %d\n", ret);
	273	+ exit(-1);
	274	+ }
	275	+ return(ret);
	276	+}
	277	+
	278	+int decompress_header(int fin, bzinfo *bfile) { /* feed the first 4 bytes of the file to the decoder before any block data; exits on a short read or a rejected header */
	279	+ int bytesread, ret;
	280	+ unsigned char header[4];
	281	+
	282	+ lseek(fin,0,SEEK_SET); /* NOTE(review): the result of this seek is not checked */
	283	+ bytesread = read(fin, header, 4);
	284	+ if (bytesread < 4) {
	285	+ fprintf(stderr,"failed to read 4 bytes of header, exiting\n");
	286	+ exit(-1);
	287	+ }
	288	+ bfile->strm.next_in = (char *)header;
	289	+ bfile->strm.avail_in = 4;
	290	+
	291	+ bfile->strm.next_out = (char *)(bfile->bufout);
	292	+ bfile->strm.avail_out = bfile->bufsize;
	293	+ ret = BZ2_bzDecompress ( &(bfile->strm) );
	294	+ if (BZ_OK != ret && BZ_STREAM_END != ret) {
	295	+ fprintf(stderr,"Corrupt bzip2 header, exiting\n");
	296	+ exit(-1);
	297	+ }
	298	+ return(ret);
	299	+}
	300	+
	301	+int setup_first_buffer(int fin, bzinfo *bfile) { /* read the first chunk starting at the block marker and point the stream at it; returns nonzero if nothing could be read */
	302	+ int bytesread, eof=0;
	303	+
	304	+ if (bfile->bitsshifted == 0) { /* a byte-aligned marker was matched one byte into the test buffer, so the block starts at position+1 */
	305	+ lseek(fin,bfile->position+1,SEEK_SET);
	306	+ }
	307	+ else {
	308	+ lseek(fin,bfile->position,SEEK_SET);
	309	+ }
	310	+ bytesread = read(fin, bfile->bufin, bfile->bufsize);
	311	+ if (bytesread > 0) {
	312	+ bfile->overflow = bfile->bufin[bytesread-1]; /* keep the unshifted last byte; it is fed in again ahead of the next chunk */
	313	+ shiftbytesleft(bfile->bufin,bytesread,bfile->bitsshifted);
	314	+
	315	+ bfile->strm.next_in = (char *)(bfile->bufin);
	316	+ bfile->strm.avail_in = bytesread-1; /* the last byte is withheld: after the shift it is still missing bits from the following byte */
	317	+
	318	+ bfile->strm.next_out = (char *)(bfile->bufout);
	319	+ bfile->strm.avail_out = bfile->bufsize;
	320	+ }
	321	+ if (bytesread <=0) {
	322	+ eof++;
	323	+ }
	324	+ return(eof);
	325	+}
	326	+
	327	+int do_last_byte(bzinfo *bfile) { /* once all input is consumed, feed the single withheld byte to the decoder and write out whatever it produces; returns the libbz2 status, BZ_OK if nothing was done */
	328	+ int ret=BZ_OK;
	329	+
	330	+
	331	+ if (bfile->strm.avail_in == 0) {
	332	+ bfile->strm.next_in = (char *)(bfile->bufin);
	333	+ bfile->bufin[0] = bfile->overflow;
	334	+ shiftbytesleft(bfile->bufin,1,bfile->bitsshifted);
	335	+ bfile->strm.avail_in = 1;
	336	+ bfile->strm.next_out = (char *)(bfile->bufout);
	337	+ bfile->strm.avail_out = bfile->bufsize;
	338	+ ret = BZ2_bzDecompress ( &(bfile->strm) );
	339	+ if (BZ_OK == ret || BZ_STREAM_END == ret) {
	340	+ fwrite(bfile->bufout, sizeof(unsigned char), (unsigned char *)bfile->strm.next_out - bfile->bufout, stdout); /* next_out has advanced past the bytes just produced */
	341	+ }
	342	+ }
	343	+ return(ret);
	344	+}
	345	+
	346	+int read_next_buffer(int fin, bzinfo *bfile, int ret) { /* refill the input buffer once the decoder has consumed it, and reset the output buffer; returns nonzero when no more input could be read. ret is not used */
	347	+ int bytesread, eof=0;
	348	+
	349	+ /* fprintf(stderr," got return from decompress of %d\n", ret); */
	350	+
	351	+ if (bfile->strm.avail_in == 0) {
	352	+ bfile->strm.next_in = (char *)(bfile->bufin);
	353	+ bfile->bufin[0] = bfile->overflow; /* the byte withheld from the previous chunk goes first */
	354	+ bytesread = read(fin, bfile->bufin+1, bfile->bufsize-1);
	355	+ if (bytesread > 0) {
	356	+ bfile->overflow = bfile->bufin[bytesread]; /* remember the new last byte, unshifted, for the next round */
	357	+ shiftbytesleft(bfile->bufin,bytesread+1,bfile->bitsshifted);
	358	+ bfile->strm.avail_in = bytesread; /* bytesread+1 bytes are in the buffer; the last one is withheld again */
	359	+ }
	360	+ else {
	361	+ eof++;
	362	+ bfile->strm.avail_in = 0;
	363	+ }
	364	+ }
	365	+ bfile->strm.next_out = (char *)(bfile->bufout); /* the caller has already written out the previous output */
	366	+ bfile->strm.avail_out = bfile->bufsize;
	367	+
	368	+ return(eof);
	369	+}
	370	+
	371	+
	372	+int main(int argc, char **argv) { /* argv[1]: bz2 file; dumps to stdout whatever decompresses from the last usable block marker onwards */
	373	+
	374	+ bzinfo bfile;
	375	+
	376	+ int fin;
	377	+ int result, ret;
	378	+ unsigned char buffer[8];
	379	+
	380	+ unsigned char **footer;
	381	+ unsigned char **marker;
	382	+
	383	+ int written=0;
	384	+ int start_at;
	385	+
	386	+ int eof = 0;
	387	+
	388	+ if (argc != 2) {
	389	+ fprintf(stderr,"usage: %s infile\n", argv[0]);
	390	+ exit(-1);
	391	+ }
	392	+
	393	+ marker = init_marker();
	394	+ footer = init_footer();
	395	+
	396	+ fin = open (argv[1], O_RDONLY);
	397	+ if (fin < 0) {
	398	+ fprintf(stderr,"failed to open file %s for read\n", argv[1]);
	399	+ exit(-1);
	400	+ }
	401	+
	402	+ bfile.bufsize = BUFSIZE;
	403	+
	404	+ result = checkfileforfooter(fin, footer); /* with a footer present, the backwards search begins in front of it */
	405	+ if (result == -1) {
	406	+ start_at = 0;
	407	+ }
	408	+ else {
	409	+ start_at = 11; /* size of footer, perhaps with 1 byte extra */
	410	+ }
	411	+ start_at +=6; /* size of marker */
	412	+ bfile.position = lseek(fin, -1*start_at, SEEK_END);
	413	+ if (bfile.position == -1) {
	414	+ fprintf(stderr,"lseek of file failed\n");
	415	+ exit(-1);
	416	+ }
	417	+ result = read(fin, buffer, 7);
	418	+ if (result == -1) {
	419	+ fprintf(stderr,"read of file failed\n");
	420	+ exit(-1);
	421	+ }
	422	+
	423	+ while (1) {
	424	+
	425	+ bfile.bitsshifted = findnextmarker(fin, &start_at, &bfile.position, marker, buffer);
	426	+ if (bfile.bitsshifted >= 0) {
	427	+ /* fprintf(stderr, "found marker at pos %d and shifted %d, start_at is %d\n", bfile.position, bfile.bitsshifted, start_at); */
	428	+ ret = init_decompress(&bfile);
	429	+
	430	+ /* pass in the header */
	431	+ ret = decompress_header(fin,&bfile);
	432	+
	433	+ eof = setup_first_buffer(fin, &bfile);
	434	+
	435	+ while (BZ_OK == ret && !eof) {
	436	+ ret = BZ2_bzDecompress ( &(bfile.strm) );
	437	+ if (BZ_OK == ret || BZ_STREAM_END == ret) {
	438	+ written += fwrite(bfile.bufout, sizeof(unsigned char), (unsigned char *)(bfile.strm.next_out) - bfile.bufout, stdout);
	439	+ }
	440	+ eof = read_next_buffer(fin, &bfile, ret);
	441	+ }
	442	+ if (BZ_OK == ret || BZ_STREAM_END == ret ) {
	443	+ /* so we read no bytes, process the last byte we held */
	444	+ do_last_byte(&bfile);
	445	+ }
	446	+ if (written != 0) {
	447	+ break;
	448	+ }
	449	+ /* truncated block or other corruption: release this attempt's decoder
	450	+ state (init_decompress sets up a fresh one) and try going back one */
	451	+ BZ2_bzDecompressEnd(&(bfile.strm));
	452	+ start_at +=5;
	453	+ clearbuffer(buffer,sizeof(buffer));
	454	+ continue;
	455	+ }
	456	+ else {
	457	+ fprintf(stderr,"no block marker in this file.\n");
	458	+ exit(-1);
	459	+ }
	460	+ }
	461	+ close(fin);
	462	+ exit(0);
	463	+}
	464	+
Property changes on: branches/ariel/xmldumps-backup/mwbzutils/dumplastbz2block.c
___________________________________________________________________
Added: svn:eol-style
1	465	+ native
Index: branches/ariel/xmldumps-backup/mwbzutils/findpageidinbz2xml.h
—	—	@@ -0,0 +1,81 @@
	2	+#ifndef _FINDPAGEID_H
	3	+#define _FINDPAGEID_H
	4	+
	5	+typedef struct { /* result of looking for the first page id after some file offset; every field is -1 when nothing was found */
	6	+ int page_id; /* first id in the block */
	7	+ int bits_shifted; /* block is right shifted this many bits */
	8	+ int position; /* position in file of block */
	9	+} page_info_t;
	10	+
	11	+#define BUFINSIZE 5000
	12	+
	13	+/*
	14	+ keeps all information about a bzipped file
	15	+ plus input/output buffers for decompression
	16	+*/
	17	+typedef struct {
	18	+ unsigned char bufin[BUFINSIZE]; /* compressed data read from file */
	19	+ unsigned char *bufout; /* uncompressed data, must be allocated by caller */
	20	+ unsigned char marker_buffer[7]; /* data to test for bz2 block marker */
	21	+ unsigned char header_buffer[4]; /* first 4 bytes of file (bzip2 header) */
	22	+
	23	+ int bufin_size; /* size of input buffer for compressed data */
	24	+ int bufout_size; /* size of output buffer for decompressed data, may vary at each call */
	25	+
	26	+ int initialized; /* whether bz2file has been initialized (header processed, seek to
	27	+ some bz2 block in the file and input buffer filled) */
	28	+ int block_start; /* position of bz2 block in file from which we started to read (we
	29	+ read a sequence of bz2 blocks from a given position, this is
	30	+ the offset to the first one) */
	31	+
	32	+ bz_stream strm; /* stream structure for libbz2 */
	33	+ unsigned char overflow; /* since decompressed bytes may not be bit aligned, we keep the last byte
	34	+ read around so we can grab the lower end bits off the end for
	35	+ sticking in front of the next pile of compressed bytes we read */
	36	+
	37	+ int bits_shifted; /* number of bits that the compressed data has been right shifted
	38	+ in the file (if the number is 0, the block marker and subsequent
	39	+ data is byte-aligned) */
	40	+ unsigned char **marker; /* bzip2 start of block marker, plus bit-shifted versions of it for
	41	+ locating the marker in a stream of compressed data */
	42	+
	43	+ int position; /* current offset into file from start of file */
	44	+
	45	+ int bytes_read; /* number of bytes of compressed data read from file (per read) */
	46	+ int bytes_written; /* number of bytes of decompressed data written into output buffer (per decompress) */
	47	+ int eof; /* nonzero if eof reached */
	48	+ int file_size; /* length of file, so we don't search past it for blocks */
	49	+} bz_info_t;
	50	+
	51	+#define MASKLEFT 0 /* "end" argument of bit_mask(): ones at the high-order end of a byte */
	52	+#define MASKRIGHT 1 /* "end" argument of bit_mask(): ones at the low-order end */
	53	+
	54	+/*
	55	+ this output buffer is used to collect decompressed output.
	56	+ this is not a circular buffer; when it is full the user is
	57	+ responsible for emptying it completely or partially and moving
	58	+ to the beginning any unused bytes.
	59	+
	60	+*/
	61	+typedef struct {
	62	+ unsigned char *buffer; /* output storage, allocated by the caller */
	63	+ unsigned char *next_to_read; /* pointer to the next byte in the buffer with data to be read */
	64	+ unsigned char *next_to_fill; /* pointer to the next byte in the buffer which is empty and can receive data */
	65	+ int bytes_avail; /* number of bytes available for reading */
	66	+ unsigned char *end; /* points to byte after end of buffer */
	67	+} buf_info_t;
	68	+
	69	+/*
	70	+ used for each iteration of narrowing down the location in a bzipped2 file of
	71	+ a desired pageid, by finding first compressed block after a guessed
	72	+ position and checking the first pageid (if any) contained in it.
	73	+*/
	74	+typedef struct { /* state carried between calls of do_iteration(); the search is finished once left_end equals right_end */
	75	+ int left_end; /* left end of interval to search (bytes from start of file) */
	76	+ int right_end; /* right end of interval to search */
	77	+ int value_wanted; /* pageid desired */
	78	+ int last_value; /* pageid we found in last iteration */
	79	+ int last_position; /* position in file for last iteration */
	80	+} iter_info_t;
	81	+
	82	+#endif
Property changes on: branches/ariel/xmldumps-backup/mwbzutils/findpageidinbz2xml.h
___________________________________________________________________
Added: svn:eol-style
1	83	+ native

Status & tagging log

15:58, 7 July 2011 Reedy (talk | contribs) changed the status of r91637 [removed: new added: deferred]