r107839 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r107838‎ | r107839 | r107840 >
Date:17:20, 2 January 2012
Author:ariel
Status:deferred
Tags:
Comment:
utility to compress an input stream into multiple bz2 streams on output, with index of pages and offsets
Modified paths:
  • /branches/ariel/xmldumps-backup/mwbzutils/Makefile (modified) (history)
  • /branches/ariel/xmldumps-backup/mwbzutils/README (modified) (history)
  • /branches/ariel/xmldumps-backup/mwbzutils/recompressxml.c (added) (history)

Diff [purge]

Index: branches/ariel/xmldumps-backup/mwbzutils/recompressxml.c
@@ -0,0 +1,360 @@
 2+#include <unistd.h>
 3+#include <stdio.h>
 4+#include <getopt.h>
 5+#include <string.h>
 6+#include <sys/types.h>
 7+#include <sys/stat.h>
 8+#include <fcntl.h>
 9+#include <stdlib.h>
 10+#include <errno.h>
 11+#include <sys/types.h>
 12+#include <regex.h>
 13+#include <ctype.h>
 14+#include "bzlib.h"
 15+
/* Buffer holding one chunk of XML text read from stdin, and the buffer
   that receives its bz2-compressed form before it is written to stdout. */
char inBuf[4096];
char outBuf[8192];

/* Same pair of buffers for the index file: one formatted index line, and
   its compressed form when the index is written as bz2. */
char inBuf_indx[4096];
char outBuf_indx[8192];

/* A line (leading spaces skipped) must equal this text to open a page. */
char *pageOpenTag = "<page>\n";

/* Extended regex for a page title line; group 1 captures the title text.
   The match array and compiled form are filled in by setupRegexps(). */
char *pageTitleExpr = "<title>(.+)</title>\n";
regmatch_t *matchPageTitleExpr;
regex_t compiledMatchPageTitleExpr;

/* Extended regex for an id line; group 1 captures the decimal id.
   The match array and compiled form are filled in by setupRegexps(). */
char *idExpr = "<id>([0-9]+)</id>\n";
regmatch_t *matchIdExpr;
regex_t compiledMatchIdExpr;

/* bz2 stream used for the index file when it is written compressed. */
bz_stream strm_indx;
 33+
 34+void setupIndexBz2Stream() {
 35+ int bz_verbosity = 0;
 36+ int bz_workFactor = 0;
 37+ int bz_blockSize100k = 9;
 38+
 39+ strm_indx.bzalloc = NULL;
 40+ strm_indx.bzfree = NULL;
 41+ strm_indx.opaque = NULL;
 42+
 43+ /* init bzip compression stuff */
 44+ BZ2_bzCompressInit(&(strm_indx), bz_blockSize100k, bz_verbosity, bz_workFactor);
 45+}
 46+
 47+void setupRegexps() {
 48+ matchPageTitleExpr = (regmatch_t *)malloc(sizeof(regmatch_t)*2);
 49+ regcomp(&compiledMatchPageTitleExpr, pageTitleExpr, REG_EXTENDED);
 50+ matchIdExpr = (regmatch_t *)malloc(sizeof(regmatch_t)*2);
 51+ regcomp(&compiledMatchIdExpr, idExpr, REG_EXTENDED);
 52+ return;
 53+}
 54+
 55+int startsPage(char *buf) {
 56+ while (*buf == ' ') buf++;
 57+
 58+ if (!strcmp(buf,pageOpenTag)) return 1;
 59+ else return 0;
 60+}
 61+
 62+char *hasPageTitle(char *buf) {
 63+ static char pageTitle[513];
 64+ int length = 0;
 65+
 66+ pageTitle[0]='\0';
 67+
 68+ while (*buf == ' ') buf++;
 69+
 70+ if (regexec(&compiledMatchPageTitleExpr, buf, 2, matchPageTitleExpr, 0 ) == 0) {
 71+ if (matchPageTitleExpr[1].rm_so >=0) {
 72+ length = matchPageTitleExpr[1].rm_eo - matchPageTitleExpr[1].rm_so;
 73+ if (length > 512) {
 74+ fprintf(stderr,"Page title length > 512 bytes... really? Bailing.\n");
 75+ exit(1);
 76+ }
 77+ strncpy(pageTitle,buf+matchPageTitleExpr[1].rm_so, length);
 78+ pageTitle[length] = '\0';
 79+ }
 80+ }
 81+ return(pageTitle);
 82+}
 83+
 84+int hasId(char *buf) {
 85+ int id = 0;
 86+
 87+ while (*buf == ' ') buf++;
 88+
 89+ if (regexec(&compiledMatchIdExpr, buf, 2, matchIdExpr, 0 ) == 0) {
 90+ if (matchIdExpr[1].rm_so >=0) {
 91+ id = atoi(buf+matchIdExpr[1].rm_so);
 92+ }
 93+ }
 94+ return(id);
 95+}
 96+
/*
 * Report whether a line closes the XML block currently being read.
 *
 * buf:    NUL-terminated input line; leading spaces are ignored.
 * header: non-zero while the siteinfo header is being processed, in
 *         which case only the siteinfo close tag counts.
 * Returns 1 when the line is exactly the relevant close tag followed
 * by a newline (page or mediawiki close tag when header is zero),
 * 0 otherwise.
 */
int endsXmlBlock(char *buf, int header) {
  char *line = buf + strspn(buf, " ");

  /* while reading the header only the end of siteinfo matters */
  if (header) {
    return strcmp(line, "</siteinfo>\n") == 0;
  }

  /* otherwise a block ends with a page or with the whole document */
  if (strcmp(line, "</page>\n") == 0) {
    return 1;
  }
  return strcmp(line, "</mediawiki>\n") == 0;
}
 114+
 115+int endBz2Stream(bz_stream *strm, char *outBuf, int bufSize, FILE *fd) {
 116+ int result;
 117+ int offset;
 118+
 119+ do {
 120+ strm->avail_in = 0;
 121+ result = BZ2_bzCompress ( strm, BZ_FINISH );
 122+ fwrite(outBuf,bufSize-strm->avail_out,1,fd);
 123+ strm->next_out = outBuf;
 124+ strm->avail_out = 8192;
 125+ } while (result != BZ_STREAM_END);
 126+ offset = strm->total_out_lo32;
 127+ BZ2_bzCompressEnd(strm);
 128+ return(offset);
 129+}
 130+
 131+int writeCompressedXmlBlock(int header, int count, int fileOffset, FILE *indexfd, int indexcompressed, int verbose) {
 132+
 133+ bz_stream strm;
 134+ int bz_verbosity = 0;
 135+ int bz_workFactor = 0;
 136+ int bz_blockSize100k = 9;
 137+ int wroteSomething = 0;
 138+ int blocksDone = 0;
 139+
 140+ strm.bzalloc = NULL;
 141+ strm.bzfree = NULL;
 142+ strm.opaque = NULL;
 143+
 144+ char *pageTitle = NULL;
 145+ int pageId = 0;
 146+ enum States{WantPage,WantPageTitle,WantPageId};
 147+ int state = WantPage;
 148+
 149+ /* init bzip compression stuff */
 150+ BZ2_bzCompressInit(&strm, bz_blockSize100k, bz_verbosity, bz_workFactor);
 151+
 152+ while (fgets(inBuf, sizeof(inBuf), stdin) != NULL) {
 153+ if (verbose > 1) {
 154+ fprintf(stderr,"input buffer is: ");
 155+ fprintf(stderr,"%s",inBuf);
 156+ }
 157+
 158+ wroteSomething = 1;
 159+ /* add the buffer content to stuff to be compressed */
 160+ strm.next_in = inBuf;
 161+ strm.avail_in = strlen(inBuf);
 162+ strm.next_out = outBuf;
 163+ strm.avail_out = 8192;
 164+
 165+ /* we are to build an index. */
 166+ if (indexfd) {
 167+ if (verbose > 2) {
 168+ fprintf(stderr,"doing index check\n");
 169+ }
 170+ if (state == WantPage) {
 171+ if (verbose > 2) {
 172+ fprintf(stderr,"checking for page tag\n");
 173+ }
 174+ if (startsPage(inBuf)) {
 175+ state = WantPageTitle;
 176+ }
 177+ }
 178+ else if (state == WantPageTitle) {
 179+ if (verbose > 1) {
 180+ fprintf(stderr,"checking for page title tag\n");
 181+ }
 182+ pageTitle = hasPageTitle(inBuf);
 183+ if (pageTitle[0]) {
 184+ state = WantPageId;
 185+ }
 186+ }
 187+ else if (state == WantPageId) {
 188+ if (verbose > 1) {
 189+ fprintf(stderr,"checking for page id tag\n");
 190+ }
 191+ pageId = hasId(inBuf);
 192+ if (pageId) {
 193+ state = WantPage;
 194+ }
 195+ if (indexcompressed) {
 196+ if (verbose) {
 197+ fprintf(stderr,"writing line to compressed index file\n");
 198+ }
 199+ sprintf(inBuf_indx,"%d:%d:%s\n",fileOffset,pageId,pageTitle);
 200+ strm_indx.next_in = inBuf_indx;
 201+ strm_indx.avail_in = strlen(inBuf_indx);
 202+ do {
 203+ if (verbose > 2) {
 204+ fprintf(stderr,"bytes left to read for index compression: %d\n",strm_indx.avail_in);
 205+ }
 206+ strm_indx.next_out = outBuf_indx;
 207+ strm_indx.avail_out = 8192;
 208+ BZ2_bzCompress ( &strm_indx, BZ_RUN );
 209+ fwrite(outBuf_indx,sizeof(outBuf_indx)-strm_indx.avail_out,1,indexfd);
 210+ } while (strm_indx.avail_in >0);
 211+ }
 212+ else {
 213+ if (verbose) {
 214+ fprintf(stderr,"writing line to index file\n");
 215+ }
 216+ fprintf(indexfd,"%d:%d:%s\n",fileOffset,pageId,pageTitle);
 217+ }
 218+ pageId = 0;
 219+ pageTitle = NULL;
 220+ }
 221+ }
 222+ do {
 223+ if (verbose > 2) {
 224+ fprintf(stderr,"bytes left to read for text compression: %d\n",strm.avail_in);
 225+ }
 226+ strm.next_out = outBuf;
 227+ strm.avail_out = 8192;
 228+ BZ2_bzCompress ( &strm, BZ_RUN );
 229+ fwrite(outBuf,sizeof(outBuf)-strm.avail_out,1,stdout);
 230+ } while (strm.avail_in > 0);
 231+ if (verbose > 1) fprintf(stderr,"avail_out is now: %d\n", strm.avail_out);
 232+
 233+ if (endsXmlBlock(inBuf, header)) {
 234+ /* special case: doing the siteinfo stuff at the beginning */
 235+ if (verbose) {
 236+ fprintf(stderr,"end of header found\n");
 237+ }
 238+ if (header) {
 239+ fileOffset += endBz2Stream(&strm, outBuf, sizeof(outBuf), stdout);
 240+ return(fileOffset);
 241+ }
 242+
 243+ blocksDone++;
 244+ if (blocksDone % count == 0) {
 245+ if (verbose) fprintf(stderr, "end of xml block found\n");
 246+ /* close down bzip stream, we are done with this block */
 247+ fileOffset += endBz2Stream(&strm, outBuf, sizeof(outBuf), stdout);
 248+ return(fileOffset);
 249+ }
 250+ }
 251+ }
 252+ if (verbose) fprintf(stderr,"eof reached\n");
 253+ if (wroteSomething) {
 254+ /* close down bzip stream, we are done with this block */
 255+ fileOffset += endBz2Stream(&strm, outBuf, sizeof(outBuf), stdout);
 256+ }
 257+ return(fileOffset);
 258+}
 259+
/*
 * Print an optional message followed by the help text to stderr, then
 * terminate the program with exit status -1.
 *
 * whoami:  program name shown in the usage line.
 * message: text printed first, or NULL for none.
 */
void usage(char *whoami, char *message) {
  static const char *const helpLines[] = {
    "Reads a stream of XML pages from stdin,\n",
    "and writes to stdout the bz2 compressed\n",
    "data, one bz2 stream per count pages.\n\n",
    "Options:\n",
    "pagesperstream: compress this many pages in each complete bz2stream before\n",
    " opening a new stream. The siteinfo header is written to a\n",
    " separate stream at the beginning of all output, and the closing\n",
    " mediawiki tag is written into a separate stream at the end.\n",
    "buildindex: generate a file containing an index of pages ids and titles\n",
    " per stream. Each line contains: offset-to-stream:pageid:pagetitle\n",
    " If filename ends in '.bz2' the file will be written in bz2 format.\n",
    "verbose: produce lots of debugging output to stderr. This option can be used\n",
    " multiple times to increase verbosity.\n"
  };
  size_t lineCount = sizeof(helpLines) / sizeof(helpLines[0]);
  size_t i;

  if (message != NULL) {
    fputs(message, stderr);
  }
  fprintf(stderr,"Usage: %s --pagesperstream n [--buildindex indexfilename] [--verbose]\n\n", whoami);
  for (i = 0; i < lineCount; i++) {
    fputs(helpLines[i], stderr);
  }
  exit(-1);
}
 280+
 281+int main(int argc, char **argv) {
 282+ int optindex=0;
 283+ int optc;
 284+ int offset = 0;
 285+
 286+ struct option optvalues[] = {
 287+ {"buildindex", 1, 0, 'b'},
 288+ {"pagesperstream", 1, 0, 'p'},
 289+ {"verbose", 0, 0, 'v'},
 290+ {NULL, 0, NULL, 0}
 291+ };
 292+
 293+ int count = 0;
 294+ int doIndex = 0;
 295+ char *indexFilename = NULL;
 296+ int verbose = 0;
 297+ FILE *indexfd = NULL;
 298+ int indexcompressed = 0;
 299+
 300+ while (1) {
 301+ optc=getopt_long_only(argc,argv,"pagesperstream:buildindex:verbose", optvalues, &optindex);
 302+ if (optc=='b') {
 303+ doIndex=1;
 304+ indexFilename = optarg;
 305+ }
 306+ else if (optc=='p') {
 307+ if (!(isdigit(optarg[0]))) usage(argv[0],NULL);
 308+ count=atoi(optarg);
 309+ }
 310+ else if (optc=='v')
 311+ verbose++;
 312+ else if (optc==-1) break;
 313+ else usage(argv[0],"unknown option or other error\n");
 314+ }
 315+
 316+ if (count <= 0) {
 317+ usage(argv[0],"bad or no argument given for count.\n");
 318+ }
 319+
 320+ if (indexFilename) {
 321+ if (verbose) {
 322+ fprintf(stderr,"setting up index file creation.\n");
 323+ }
 324+ indexfd = fopen(indexFilename, "w");
 325+ if (! indexfd) {
 326+ usage(argv[0],"failed to open index file for write.\n");
 327+ }
 328+ if (!strcmp(indexFilename+(strlen(indexFilename)-4),".bz2")) {
 329+ if (verbose) {
 330+ fprintf(stderr,"index file will be bz2 compressed.\n");
 331+ }
 332+ indexcompressed++;
 333+ setupIndexBz2Stream();
 334+ }
 335+ }
 336+
 337+ setupRegexps();
 338+
 339+ /* deal with the XML header */
 340+ offset = writeCompressedXmlBlock(1,count,0,indexfd,indexcompressed,verbose);
 341+
 342+ while (!feof(stdin)) {
 343+ offset = writeCompressedXmlBlock(0,count,offset,indexfd,indexcompressed,verbose);
 344+ }
 345+
 346+ if (indexFilename) {
 347+ if (indexcompressed) {
 348+ if (verbose) {
 349+ fprintf(stderr,"closing bz2 index file stream.\n");
 350+ }
 351+ endBz2Stream(&strm_indx, outBuf_indx, sizeof(outBuf_indx), indexfd);
 352+ }
 353+ if (verbose) {
 354+ fprintf(stderr,"closing index file.\n");
 355+ }
 356+ fclose(indexfd);
 357+ }
 358+
 359+ exit(0);
 360+
 361+}
Property changes on: branches/ariel/xmldumps-backup/mwbzutils/recompressxml.c
___________________________________________________________________
Added: svn:eol-style
1362 + native
Index: branches/ariel/xmldumps-backup/mwbzutils/README
@@ -42,6 +42,18 @@
4343 position:xxxxx pageid:nnn
4444 It exits with 0 on success, -1 on error.
4545
 46+recompressxml - Reads an xml stream of pages and writes multiple bz2 compressed
 47+ streams, concatenated, to stdout, with the specified number of
 48+ pages per stream. The mediawiki site info header is in its
 49+ own bz2 stream. Each stream can be extracted as a separate file
 50+ by an appropriate tool, checking for the byte-aligned string "BZh91AY&SY"
 51+ and a following <page> tag (after uncompressing the first chunk
 52+ of data after that string). Alternatively, a tool can seek to
 53+ the location of one of the streams in order to find a particular
 54+ page. An index of file-offset:page-id:page-title lines
 55+ is written to a specified file if desired; the index file will be
 56+ bz2 compressed if the filename given ends with .bz2.
 57+
4658 Library routines:
4759
4860 mwbz2lib.c - various utility functions (bitmasks, shifting and comparing bytes,
Index: branches/ariel/xmldumps-backup/mwbzutils/Makefile
@@ -29,7 +29,8 @@
3030 all: checkforbz2footer \
3131 dumpbz2filefromoffset \
3232 dumplastbz2block \
33 - findpageidinbz2xml
 33+ findpageidinbz2xml \
 34+ recompressxml
3435
3536 dumplastbz2block: $(OBJSBZ) mwbzlib.o dumplastbz2block.o
3637 $(CC) $(CFLAGS) $(LDFLAGS) -o dumplastbz2block dumplastbz2block.o mwbzlib.o $(OBJSBZ) -lbz2
@@ -43,20 +44,26 @@
4445 dumpbz2filefromoffset: $(OBJSBZ) mwbzlib.o dumpbz2filefromoffset.o
4546 $(CC) $(CFLAGS) $(LDFLAGS) -o dumpbz2filefromoffset dumpbz2filefromoffset.o mwbzlib.o $(OBJSBZ) -lbz2
4647
47 -install: dumplastbz2block findpageidinbz2xml checkforbz2footer dumpbz2filefromoffset
 48+recompressxml: $(OBJSBZ) recompressxml.o
 49+ $(CC) $(CFLAGS) $(LDFLAGS) -o recompressxml recompressxml.o -lbz2
 50+
 51+install: dumplastbz2block findpageidinbz2xml checkforbz2footer dumpbz2filefromoffset recompressxml
4852 if ( test ! -d $(PREFIX)/bin ) ; then mkdir -p $(PREFIX)/bin ; fi
4953 cp -f dumplastbz2block $(PREFIX)/bin/dumplastbz2block
5054 cp -f findpageidinbz2xml $(PREFIX)/bin/findpageidinbz2xml
5155 cp -f checkforbz2footer $(PREFIX)/bin/checkforbz2footer
5256 cp -f dumpbz2filefromoffset $(PREFIX)/bin/dumpbz2filefromoffset
 57+ cp -f recompressxml $(PREFIX)/bin/recompressxml
5358 chmod a+x $(PREFIX)/bin/dumplastbz2block
5459 chmod a+x $(PREFIX)/bin/findpageidinbz2xml
5560 chmod a+x $(PREFIX)/bin/checkforbz2footer
5661 chmod a+x $(PREFIX)/bin/dumpbz2filefromoffset
 62+ chmod a+x $(PREFIX)/bin/recompressxml
5763
5864 clean:
5965 rm -f *.o *.a dumplastbz2block findpageidinbz2xml \
60 - checkforbz2footer dumpbz2filefromoffset
 66+ checkforbz2footer dumpbz2filefromoffset \
 67+ recompressxml
6168
6269 bzlibfuncs.o: bzlibfuncs.c bzlib.h bzlib_private.h
6370 $(CC) $(CFLAGS) -c bzlibfuncs.c
@@ -72,6 +79,8 @@
7380 $(CC) $(CFLAGS) -c checkforbz2footer.c
7481 dumpbz2filefromoffset.o: dumpbz2filefromoffset.c
7582 $(CC) $(CFLAGS) -c dumpbz2filefromoffset.c
 83+recompressxml.o: recompressxml.c
 84+ $(CC) $(CFLAGS) -c recompressxml.c
7685
7786 distclean: clean
7887
@@ -80,6 +89,7 @@
8190 rm -f $(DISTNAME)
8291 ln -s -f . $(DISTNAME)
8392 tar cvf $(DISTNAME).tar \
 93+ $(DISTNAME)/recompressxml.c \
8494 $(DISTNAME)/dumplastbz2block.c \
8595 $(DISTNAME)/findpageidinbz2xml.c \
8696 $(DISTNAME)/checkforbz2footer.c \

Status & tagging log