r100807 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r100806‎ | r100807 | r100808 >
Date:14:37, 26 October 2011
Author:platonides
Status:deferred (Comments)
Tags:tools 
Comment:
Tool to verify the correctness of php message files.
Ensures that the given files adhere to a safe subset of php.

The provided update-msgs.sh can be used to svn up files only
if they are message files, so it can run unattended.
Modified paths:
  • /trunk/tools/justMessages (added) (history)
  • /trunk/tools/justMessages/Makefile (added) (history)
  • /trunk/tools/justMessages/justMessages.l (added) (history)
  • /trunk/tools/justMessages/update-msgs.sh (added) (history)

Diff [purge]

Index: trunk/tools/justMessages/update-msgs.sh
@@ -0,0 +1,29 @@
#!/bin/sh
# Update a folder of php message files from svn, but only if every *.php
# file in the updated copy passes the justMessages check.
#
# Usage: update-msgs.sh [folder]
#   folder  svn working copy holding the message files
#           (defaults to ../ToolserverI18N/language/messages)
#
# The update is performed on a temporary copy; the real folder is only
# overwritten when svn and justMessages both succeeded (set -e aborts
# the script on the first failing command).
FOLDER="../ToolserverI18N/language/messages"
JUSTMESSAGES="justMessages"

if [ -n "$1" ]; then
	FOLDER="$1"
fi

# Prefer a justMessages found in PATH, otherwise fall back to the one
# living next to this script.
if ! command -v "$JUSTMESSAGES" > /dev/null 2>&1; then
	DIRNAME=$(dirname -- "$0")
	if [ -z "$DIRNAME" ]; then
		DIRNAME="."
	fi
	JUSTMESSAGES="$DIRNAME/$JUSTMESSAGES"
fi

set -e
LOCKFILE="$FOLDER/.svn/lock"
# With noclobber (set -C) the redirection fails if the lock file already
# exists; set -e then aborts before the trap below is installed, so a lock
# owned by somebody else is never removed by us.
( set -C; printf "" > "$LOCKFILE" ) 2> /dev/null

# Clean up both the lock and the temporary copy however the script ends.
TEMP=""
trap 'rm -f "$LOCKFILE"; if [ -n "$TEMP" ]; then rm -rf "$TEMP"; fi' EXIT
TEMP=$(mktemp -d)

rsync -a "$FOLDER"/ "$TEMP"
# The lock was copied along with the folder; drop it from the copy before
# running svn there.
rm "$TEMP/.svn/lock"
svn up "$TEMP"
# A non-zero exit status from justMessages makes find fail, which aborts
# the script before anything is copied back.
find "$TEMP" -name "*.php" -exec "$JUSTMESSAGES" {} +
# Put a lock file back into the copy so that syncing with --delete does not
# remove $LOCKFILE from the real folder.
touch "$TEMP/.svn/lock"
rsync -a --delete "$TEMP"/ "$FOLDER"
# $LOCKFILE and $TEMP are removed on exit
 30+
Property changes on: trunk/tools/justMessages/update-msgs.sh
___________________________________________________________________
Added: svn:eol-style
131 + native
Added: svn:executable
232 + *
Index: trunk/tools/justMessages/justMessages.l
@@ -0,0 +1,233 @@
 2+%{
 3+ #include <stddef.h>
 4+ #include <stdint.h>
 5+ #include <stdlib.h>
 6+ #include <assert.h>
 7+ #define YY_NO_INPUT
 8+ static int old_state;
 9+ static int debug = 0;
 10+ static int retcode = 0;
 11+ #define ERRORLEVEL(x) if ((x) > retcode) { retcode = (x); }
 12+ static char * filename;
 13+
 14+ static void init_variable(const char* yytext, size_t yyleng);
 15+ static void set_variable_offset(char* yytext, size_t yyleng);
 16+%}
 17+
 18+LNUM [0-9]+
 19+DNUM ([0-9]*"."[0-9]+)|([0-9]+"."[0-9]*)
 20+EXPONENT_DNUM (({LNUM}|{DNUM})[eE][+-]?{LNUM})
 21+HNUM "0x"[0-9a-fA-F]+
 22+LABEL [a-zA-Z_\x7f-\xff][a-zA-Z0-9_\x7f-\xff]*
 23+WHITESPACE [ \n\r\t]
 24+TABS_AND_SPACES [ \t]*
 25+TOKENS [;:,.\[\]()|^&+-/*=%!~$<>?@]
 26+ANY_CHAR [^]
 27+NEWLINE ("\r"|"\n"|"\r\n")
 28+NULL [Nn][Uu][Ll][Ll]
 29+ARRAY array
 30+
 31+ /* A string literal made with single quotes */
 32+STRING_SINGLE '([^'\\]|\\[\\'])*'
 33+ /* A string literal made by double quotes */
 34+STRING_DOUBLE \"([^"$\\]|\\[nrtvf\\$"0-9A-Fa-f]|${LNUM})*\"
 35+
 36+ /* Note we are rejecting HEREDOCs and NOWDOCs. The brackets shouldn't be necessary, but they are. */
 37+STRING ({STRING_SINGLE}|{STRING_DOUBLE})
 38+
 39+%s SCRIPTING LINE_COMMENT MULTILINE_COMMENT EQUALS RVALUE ARRAY ARRAY_ITEM_END ERROR
 40+%pointer
 41+
 42+%%
 43+
 44+<INITIAL>"<?php"([ \t]|{NEWLINE}) BEGIN(SCRIPTING);
 45+<INITIAL>\xEF\xBB\xBF { fprintf(stderr, "%s: File contains UTF-8 BOM\n", filename); ERRORLEVEL(3); }
 46+<INITIAL>([^<]|<[^?])+ { fprintf(stderr, "%s: Data before php tag\n", filename); ERRORLEVEL(3); }
 47+
 48+<SCRIPTING,EQUALS,ARRAY,ARRAY_ITEM_END>"#"|"//" { old_state = YY_START; BEGIN(LINE_COMMENT); }
 49+
 50+<SCRIPTING,EQUALS,ARRAY,ARRAY_ITEM_END>"/*" { old_state = YY_START; BEGIN(MULTILINE_COMMENT); }
 51+<MULTILINE_COMMENT>"\*/" BEGIN(old_state);
 52+
 53+<LINE_COMMENT>\n BEGIN(old_state); ;
 54+
 55+<LINE_COMMENT,MULTILINE_COMMENT>. ;
 56+
 57+ /* In script mode we accept variables, both alone and with an offset
 58+ * They have to be followed by an equal plus another variable ($foo = $bar = ""),
 59+ * a string literal or an array.
 60+ */
 61+<SCRIPTING>"$"{LABEL}"["{STRING}"]" { set_variable_offset(yytext, yyleng); BEGIN(EQUALS); }
 62+<SCRIPTING,RVALUE>"$"{LABEL} { init_variable(yytext, yyleng); BEGIN(EQUALS); }
 63+<EQUALS>"=" { BEGIN(RVALUE); }
 64+
 65+ /* At the right side we accept null, string literals (single- or double-quoted) and arrays */
 66+<RVALUE>{STRING}{TABS_AND_SPACES}";" { BEGIN(SCRIPTING); }
 67+<RVALUE>{NULL}{TABS_AND_SPACES}";" { BEGIN(SCRIPTING); }
 68+<RVALUE>{ARRAY}{TABS_AND_SPACES}"(" BEGIN(ARRAY);
 69+ /* End of array */
 70+<ARRAY,ARRAY_ITEM_END>")"{TABS_AND_SPACES}";" { BEGIN(SCRIPTING); }
 71+
 72+ /* Array with key => value */
 73+<ARRAY>{STRING}{WHITESPACE}*"=>"{WHITESPACE}*{STRING} BEGIN(ARRAY_ITEM_END);
 74+<ARRAY_ITEM_END>"," BEGIN(ARRAY); /* After a key=>value either a , or a ) ending the array. */
 75+
 76+{WHITESPACE}|{NEWLINE} ;
 77+<ERROR>.* { fprintf(stderr, "near %s\n", yytext); ERRORLEVEL(5); return 1; }
 78+. { fprintf(stderr, "%s: Error in state %d ", filename, YY_START); BEGIN(ERROR); unput(yytext[0]); if (0) goto find_rule; }
 79+
 80+%%
 81+
 82+
/*
 * One entry per php variable name seen by the scanner.
 * Entries form a singly linked list starting at variable_root and are kept
 * across input files; only their counters are reset between files.
 */
struct variable {
	/* Next entry of the list, NULL at the tail */
	struct variable *next;
	/* Set to 0 on creation and not read in this file; presumably reserved
	 * for the unimplemented -f specfile option (TODO confirm) */
	uint8_t min;
	/* Number of assignments to this variable seen in the current file */
	uint8_t count;
	/* NUL-terminated variable name as matched by the scanner (with the '$') */
	char name[];
};

/* Head of the list of known variables; new entries are pushed at the front */
static struct variable *variable_root = NULL;
 89+
 90+static struct variable *find_variable(const char* name) {
 91+ struct variable *var = variable_root;
 92+ while (var) {
 93+ if (!strcmp(var->name, name))
 94+ return var;
 95+ var = var->next;
 96+ }
 97+ return NULL;
 98+}
 99+
 100+/* Reset the count of each variable */
 101+static void reset_variables(void) {
 102+ struct variable *var = variable_root;
 103+ while (var) {
 104+ var->count = 0;
 105+ var = var->next;
 106+ }
 107+}
 108+
 109+static void init_variable(const char* yytext, size_t yyleng) {
 110+ struct variable *v;
 111+
 112+ if (debug) {
 113+ printf(" Variable %s\n", yytext);
 114+ }
 115+
 116+ v = find_variable(yytext);
 117+ if (!v) {
 118+ v = malloc( sizeof(struct variable) + yyleng);
 119+ v->next = variable_root;
 120+ v->min = 0;
 121+ v->count = 0;
 122+ strcpy(v->name, yytext);
 123+ variable_root = v;
 124+ if (debug) printf(" Registered variable %s\n", yytext);
 125+ }
 126+
 127+ v->count++;
 128+ if (v->count > 1) {
 129+ fprintf(stderr, " %s: %s is assigned several times\n", filename, yytext);
 130+ ERRORLEVEL(1);
 131+ }
 132+}
 133+
 134+static void set_variable_offset(char* yytext, size_t yyleng) {
 135+ struct variable *v;
 136+ char *p, tmp;
 137+
 138+
 139+ if (debug) {
 140+ printf(" Variable with offset %s\n", yytext);
 141+ }
 142+
 143+ p = strchr(yytext,'[');
 144+ assert(p);
 145+ tmp = *p; *p = '\0';
 146+
 147+ v = find_variable(yytext);
 148+ if (!v || !v->count) {
 149+ fprintf(stderr, " %s: %s is used uninitialized\n", filename, yytext);
 150+ ERRORLEVEL(2);
 151+ }
 152+ *p = tmp;
 153+}
 154+
/*
 * Print the command line synopsis, the option list and the meaning of the
 * exit codes to stdout. Always returns 0.
 */
static int usage() {
	/* Each entry is written with puts(), i.e. followed by a newline */
	static const char *const sections[] = {
		"justMessages [-f specfile] [-d] [file [file ... ]]",

		"\nOptions:\n"
		" -d\n"
		" --debug\n"
		" Show each variable as it is found.\n"
		"\n"
		" -f\n"
		" Provide a specfile listing which variables should appear on the files.\n"
		"\n"
		" -h\n"
		" --help\n"
		" Show this help.\n",

		"Error codes:\n"
		" 0: Everything went ok\n"
		" 1: Variables assigned twice\n"
		" 2: Minor issues which shouldn't affect a secure setup (register_globals/undefined vars)\n"
		" 3: Leading data (gets output to the client)\n"
		" 4: Missing file\n"
		" 5: Unknown php constructs. May lead to code execution.\n"
	};
	size_t i;

	for (i = 0; i < sizeof sections / sizeof sections[0]; i++) {
		puts(sections[i]);
	}
	return 0;
}
 179+
 180+int main(int argc, char** argv) {
 181+ while (argc > 1) {
 182+ argv++; argc--;
 183+ if (!strcmp(argv[0], "-h") || !strcmp(argv[0], "--help")) {
 184+ usage();
 185+ continue;
 186+ }
 187+ if (!strcmp(argv[0], "-d") || !strcmp(argv[0], "--debug")) {
 188+ debug = 1;
 189+ continue;
 190+ }
 191+ if (!strcmp(argv[0], "-f")) {
 192+ fprintf(stderr, "Unimplemented\n"); // TODO
 193+ argv++; argc--;
 194+ continue;
 195+ }
 196+
 197+ filename = argv[0];
 198+ if (debug) {
 199+ printf("Processing %s\n", filename);
 200+ }
 201+
 202+ yyin = fopen( filename, "r" );
 203+ if (!yyin) {
 204+ perror(argv[0]);
 205+ ERRORLEVEL(4);
 206+ continue;
 207+ }
 208+
 209+ if ( yylex() ) {
 210+ yyrestart(yyin); /* We need to reset it manually (automatically done on EOF) */
 211+ } else if (YY_START != SCRIPTING) {
 212+ fprintf(stderr, "%s: File ends in wrong state %d\n", filename, YY_START);
 213+ ERRORLEVEL(5);
 214+ }
 215+ /* yyrestart does not reset the start condition to INITIAL */
 216+ BEGIN(INITIAL);
 217+ reset_variables();
 218+
 219+ fclose(yyin);
 220+ }
 221+
 222+
 223+ return retcode;
 224+}
 225+
/*
 * Called by the scanner when it reaches the end of the input.
 * Returning 1 tells it that there is no further input to continue with.
 */
int yywrap() {
	const int no_more_input = 1;

	return no_more_input;
}
 229+
/*
 * Never meant to be called: it only references yy_flex_strlen so that the
 * compiler stops warning about that function being unused.
 */
int dummy() {
	const char *empty = "";

	return yy_flex_strlen(empty);
}
 234+
Property changes on: trunk/tools/justMessages/justMessages.l
___________________________________________________________________
Added: svn:eol-style
1235 + native
Index: trunk/tools/justMessages/Makefile
@@ -0,0 +1,10 @@
 2+CC=gcc
 3+LEX=lex
 4+CFLAGS=-Wall -O3 -fwhole-program
 5+
 6+justMessages: justMessages.c
 7+ $(CC) $(CFLAGS) -o $@ $^
 8+
 9+justMessages.c: justMessages.l
 10+ $(LEX) -t $^ > $@
 11+
Property changes on: trunk/tools/justMessages/Makefile
___________________________________________________________________
Added: svn:eol-style
112 + native
Property changes on: trunk/tools/justMessages
___________________________________________________________________
Added: svn:ignore
213 + justMessages
justMessages.c

Comments

#Comment by Nikerabbit (talk | contribs)   15:30, 26 October 2011

Wouldn't it be easier to just not use PHP as message storage format?

#Comment by Platonides (talk | contribs)   15:56, 26 October 2011

I suggested that back then, when TsIntuition was going to be coded. It has also appeared in some wikitech-l discussions. Still, it is what is being used*, so it's useful to validate that a not-much-trusted file is just messages and nothing more.

* I admit it has several good points: it is easy to read and write, fast to parse from php and supported by twn.

#Comment by Nikerabbit (talk | contribs)   16:00, 26 October 2011

Yeah, and it is also hard to generate, prone to syntax errors when editing manually and just unnecessary vulnerability surface. Speed isn't a concern anymore since translations are cached. I would happily switch to another format for mediawiki.

#Comment by Platonides (talk | contribs)   16:39, 26 October 2011

Have I maybe become too used to it? Why is it hard to generate? I think it's quite easy to pick up how it works, especially for a php developer.

Re: "prone to syntax errors when editing manually and just unnecessary vulnerability surface", that's what the above revision checks :)

#Comment by Nikerabbit (talk | contribs)   17:24, 26 October 2011

I meant generating/editing PHP source code programmatically. It's bearable by hand of course, otherwise we wouldn't develop MediaWiki :).

Status & tagging log