r81438 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r81437‎ | r81438 | r81439 >
Date:03:20, 3 February 2011
Author:ariel
Status:deferred
Tags:
Comment:
filter for use in cleaning up half-written history xml files
Modified paths:
  • /branches/ariel/xmldumps-backup/writeuptopageid.c (added) (history)

Diff [purge]

Index: branches/ariel/xmldumps-backup/writeuptopageid.c
@@ -0,0 +1,141 @@
 #include <ctype.h>
 #include <errno.h>
 #include <limits.h>
 #include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 7+
 8+typedef enum { None, StartHeader, StartPage, AtPageID, WriteMem, Write, EndPage, AtLastPageID } States;
 9+
 10+/* assume the header is never going to be longer than 1000 x 80 4-byte characters... how many
 11+ namespaces will one project want? */
 12+#define MAXHEADERLEN 524289
 13+
 14+void usage(char *me) {
 15+ fprintf(stderr,"Usage: %s pageID\n",me);
 16+ fprintf(stderr,"Copies the contents of an XML file up to but not including\n");
 17+ fprintf(stderr,"the specified pageID. This program is used in processing XML\n");
 18+ fprintf(stderr,"dump files that were only partially written.\n");
 19+}
 20+
 21+/* note that even if we have only read a partial line
 22+ of text from the body of the page, (cause the text
 23+ is longer than our buffer), it's fine, since the
 24+ <> delimiters only mark xml, they can't appear
 25+ in the page text.
 26+
 27+ returns new state */
 28+States setState (char *line, States currentState, int endPageID) {
 29+ int pageID = 0;
 30+
 31+ if (!strncmp(line,"<mediawiki",10)) {
 32+ return(StartHeader);
 33+ }
 34+ else if (!strncmp(line,"<page>",6)) {
 35+ return(StartPage);
 36+ }
 37+ /* there are also user ids, revision ids, etc... pageid will be the first one */
 38+ else if (currentState == StartPage && (!strncmp(line, "<id>", 4))) {
 39+ /* dig the id out, format is <id>num</id> */
 40+ pageID = atoi(line+4);
 41+ if (pageID == endPageID) {
 42+ return(AtLastPageID);
 43+ }
 44+ else {
 45+ return(WriteMem);
 46+ }
 47+ }
 48+ else if (currentState == WriteMem) {
 49+ return(Write);
 50+ }
 51+ else if (!strncmp(line, "</page>", 6)) {
 52+ return(EndPage);
 53+ }
 54+ return(currentState);
 55+}
 56+
 57+/* returns 1 on success, 0 on error */
 58+int writeMemoryIfNeeded(char *mem, States state) {
 59+ int res = 0;
 60+
 61+ if (state == WriteMem) {
 62+ res = fwrite(mem,strlen(mem),1,stdout);
 63+ mem[0]='\0';
 64+ return(res);
 65+ }
 66+}
 67+
 68+/* returns 1 on success, 0 on error */
 69+int writeIfNeeded(char *line, States state) {
 70+ if (state == StartHeader || state == WriteMem || state == Write || state == EndPage) {
 71+ return(fwrite(line,strlen(line),1,stdout));
 72+ }
 73+}
 74+
 75+/* returns 1 on success, 0 on error */
 76+int saveInMemIfNeeded(char *mem, char *line, States state) {
 77+ if (state == StartPage) {
 78+ if (strlen(mem) + strlen(line) < MAXHEADERLEN) {
 79+ strcpy(mem + strlen(mem),line);
 80+ }
 81+ else {
 82+ /* we actually ran out of room, who knew */
 83+ return(0);
 84+ }
 85+ }
 86+ return(1);
 87+}
 88+
 89+int main(int argc,char **argv) {
 90+ long int pageID = 0;
 91+ char *nonNumeric = 0;
 92+ States state = None;
 93+ char *text;
 94+ char line[4097];
 95+ /* order of magnitude of 2K lines of 80 chrs each,
 96+ no header of either a page nor the mw header should
 97+ ever be longer than that. At least not for some good
 98+ length of time. */
 99+ char mem[MAXHEADERLEN];
 100+
 101+ if (argc != 2) {
 102+ usage(argv[0]);
 103+ exit(-1);
 104+ }
 105+
 106+ errno = 0;
 107+ pageID = strtol(argv[1], &nonNumeric, 10);
 108+ if (pageID == 0 ||
 109+ *nonNumeric != 0 ||
 110+ nonNumeric == (char *) &pageID ||
 111+ errno != 0) {
 112+ fprintf (stderr,"The value you entered for pageID must be a positive integer.\n");
 113+ usage(argv[0]);
 114+ exit(-1);
 115+ }
 116+
 117+ while (fgets(line, sizeof(line)-1, stdin) != NULL) {
 118+ text=line;
 119+ while (*text && isspace(*text))
 120+ text++;
 121+ state = setState(text, state, pageID);
 122+ if (!saveInMemIfNeeded(mem,line,state)) {
 123+ fprintf(stderr,"failed to save text in temp memory, bailing\n");
 124+ exit(-1);
 125+ };
 126+ if (!writeMemoryIfNeeded(mem,state)) {
 127+ fprintf(stderr,"failed to write text from memory, bailing\n");
 128+ exit(-1);
 129+ }
 130+ if (!writeIfNeeded(line,state)) {
 131+ fprintf(stderr,"failed to write text, bailing\n");
 132+ exit(-1);
 133+ }
 134+ if (state == AtLastPageID) {
 135+ /* we are done. */
 136+ break;
 137+ }
 138+ }
 139+ fwrite("</mediawiki>\n",13,1,stdout);
 140+ exit(0);
 141+}
 142+
Property changes on: branches/ariel/xmldumps-backup/writeuptopageid.c
___________________________________________________________________
Added: svn:eol-style
1143 + native

Status & tagging log