r30603 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r30602‎ | r30603 | r30604 >
Date:01:23, 6 February 2008
Author:brion
Status:old
Tags:
Comment:
More file type checks...
* Switch XML type detection/validity check from dipping for XML processing instructions, doctypes, or subtags to just trying to parse it and checking the root element's name and namespace. This lets us properly handle SVG files which specify a namespace but no doctype, as well as rejecting files that aren't well-formed. (See http://meta.wikimedia.org/wiki/SVG_validity_checks for some samples of bad files I encountered.) Non-XML files will abort parsing pretty quickly, so this shouldn't be a big burden on other types that didn't hit a magic check.
* Fix Unicode unix script checks (er.... is that even right? :D), remove the iconv dependency
Modified paths:
  • /trunk/phase3/includes/AutoLoader.php (modified) (history)
  • /trunk/phase3/includes/MimeMagic.php (modified) (history)
  • /trunk/phase3/includes/XmlTypeCheck.php (added) (history)

Diff [purge]

Index: trunk/phase3/includes/MimeMagic.php
@@ -455,71 +455,20 @@
456456 /*
457457 * look for XML formats (XHTML and SVG)
458458 */
459 - $xml_type = NULL;
460 - if ( substr( $head, 0, 5 ) == "<?xml" ) {
461 - $xml_type = "ASCII";
462 - } elseif ( substr( $head, 0, 8 ) == "\xef\xbb\xbf<?xml") {
463 - $xml_type = "UTF-8";
464 - } elseif ( substr( $head, 0, 12 ) == "\xfe\xff\x00<\x00?\x00x\x00m\x00l" ) {
465 - $xml_type = "UTF-16BE";
466 - } elseif ( substr( $head, 0, 12 ) == "\xff\xfe<\x00?\x00x\x00m\x00l\x00") {
467 - $xml_type = "UTF-16LE";
468 - } else {
469 - /*
470 - echo "WARNING: Undetected xml_type ...\n";
471 - for( $i = 0; $i < 10; $i++ ) {
472 - $c = ord( $head{$i} );
473 - if( $c < 32 || $c > 126 ) {
474 - printf( "\\x%02x", $c );
475 - } else {
476 - print $head{$i};
477 - }
478 - }
479 - echo "\n";
480 - */
481 - }
482 -
483 - if( $xml_type == 'UTF-16BE' || $xml_type == 'UTF-16LE' ) {
484 - // Quick and dirty fold down to ASCII!
485 - $pack = array( 'UTF-16BE' => 'n*', 'UTF-16LE' => 'v*' );
486 - $chars = unpack( $pack[$xml_type], substr( $head, 2 ) );
487 - $head = '';
488 - foreach( $chars as $codepoint ) {
489 - if( $codepoint < 128 ) {
490 - $head .= chr( $codepoint );
491 - } else {
492 - $head .= '?';
493 - }
494 - }
495 - }
496 -
497 - $match = array();
498 - $doctype = "";
499 - $tag = "";
500 -
501 - if ( preg_match( '%<!DOCTYPE\s+[\w-]+\s+PUBLIC\s+["'."'".'"](.*?)["'."'".'"].*>%siD',
502 - $head, $match ) ) {
503 - $doctype = $match[1];
504 - }
505 -
506 - if( $xml_type || $doctype ) {
507 - if ( preg_match( '%<(\w+)\b%si', $head, $match ) ) {
508 - $tag = $match[1];
509 - }
510 -
511 - #print "<br>ANALYSING $file: doctype= $doctype; tag= $tag<br>";
512 -
513 - if ( strpos( $doctype, "-//W3C//DTD SVG" ) === 0 ) {
514 - return "image/svg+xml";
515 - } elseif ( $tag === "svg" ) {
516 - return "image/svg+xml";
517 - } elseif ( strpos( $doctype, "-//W3C//DTD XHTML" ) === 0 ) {
518 - return "text/html";
519 - } elseif ( $tag === "html" ) {
520 - return "text/html";
 459+ $xml = new XmlTypeCheck( $file );
 460+ if( $xml->wellFormed ) {
 461+ $types = array(
 462+ 'http://www.w3.org/2000/svg:svg' => 'image/svg+xml',
 463+ 'svg' => 'image/svg+xml',
 464+ 'http://www.w3.org/1999/xhtml:html' => 'text/html', // application/xhtml+xml?
 465+ 'html' => 'text/html', // application/xhtml+xml?
 466+ );
 467+ if( isset( $types[$xml->rootElement] ) ) {
 468+ $mime = $types[$xml->rootElement];
 469+ return $mime;
521470 } else {
522471 /// Fixme -- this would be the place to allow additional XML type checks
523 - return "application/xml";
 472+ return 'application/xml';
524473 }
525474 }
526475
@@ -541,7 +490,17 @@
542491
543492 if ( $script_type ) {
544493 if ( $script_type !== "UTF-8" && $script_type !== "ASCII") {
545 - $head = iconv( $script_type, "ASCII//IGNORE", $head);
 494+ // Quick and dirty fold down to ASCII!
 495+ $pack = array( 'UTF-16BE' => 'n*', 'UTF-16LE' => 'v*' );
 496+ $chars = unpack( $pack[$script_type], substr( $head, 2 ) );
 497+ $head = '';
 498+ foreach( $chars as $codepoint ) {
 499+ if( $codepoint < 128 ) {
 500+ $head .= chr( $codepoint );
 501+ } else {
 502+ $head .= '?';
 503+ }
 504+ }
546505 }
547506
548507 $match = array();
Index: trunk/phase3/includes/AutoLoader.php
@@ -271,6 +271,7 @@
272272 'WikiErrorMsg' => 'includes/WikiError.php',
273273 'WikiXmlError' => 'includes/WikiError.php',
274274 'Xml' => 'includes/Xml.php',
 275+ 'XmlTypeCheck' => 'includes/XmlTypeCheck.php',
275276 'ZhClient' => 'includes/ZhClient.php',
276277 'memcached' => 'includes/memcached-client.php',
277278 'EmaillingJob' => 'includes/JobQueue.php',
Index: trunk/phase3/includes/XmlTypeCheck.php
@@ -0,0 +1,93 @@
 2+<?php
 3+
 4+class XmlTypeCheck {
 5+ /**
 6+ * Will be set to true or false to indicate whether the file is
 7+ * well-formed XML. Note that this doesn't check schema validity.
 8+ */
 9+ public $wellFormed = false;
 10+
 11+ /**
 12+ * Name of the document's root element, including any namespace
 13+ * as an expanded URL.
 14+ */
 15+ public $rootElement = '';
 16+
 17+ private $softNamespaces;
 18+ private $namespaces = array();
 19+
 20+ /**
 21+ * @param $file string filename
 22+ * @param $softNamespaces bool
 23+ * If set to true, use of undeclared XML namespaces will be ignored.
 24+ * This matches the behavior of rsvg, but more compliant consumers
 25+ * such as Firefox will reject such files.
 26+ * Leave off for the default, stricter checks.
 27+ */
 28+ function __construct( $file, $softNamespaces=false ) {
 29+ $this->softNamespaces = $softNamespaces;
 30+ $this->run( $file );
 31+ }
 32+
 33+ private function run( $fname ) {
 34+ if( $this->softNamespaces ) {
 35+ $parser = xml_parser_create( 'UTF-8' );
 36+ } else {
 37+ $parser = xml_parser_create_ns( 'UTF-8' );
 38+ }
 39+
 40+ // case folding violates XML standard, turn it off
 41+ xml_parser_set_option( $parser, XML_OPTION_CASE_FOLDING, false );
 42+
 43+ xml_set_element_handler( $parser, array( $this, 'elementOpen' ), false );
 44+
 45+ $file = fopen( $fname, "rb" );
 46+ do {
 47+ $chunk = fread( $file, 32768 );
 48+ $ret = xml_parse( $parser, $chunk, feof( $file ) );
 49+ if( $ret == 0 ) {
 50+ // XML isn't well-formed!
 51+ fclose( $file );
 52+ xml_parser_free( $parser );
 53+ return;
 54+ }
 55+ } while( !feof( $file ) );
 56+
 57+ $this->wellFormed = true;
 58+
 59+ fclose( $file );
 60+ xml_parser_free( $parser );
 61+ }
 62+
 63+ private function elementOpen( $parser, $name, $attribs ) {
 64+ if( $this->softNamespaces ) {
 65+ // Check namespaces manually, so expat doesn't throw
 66+ // errors on use of undeclared namespaces.
 67+ foreach( $attribs as $attrib => $val ) {
 68+ if( $attrib == 'xmlns' ) {
 69+ $this->namespaces[''] = $val;
 70+ } elseif( substr( $attrib, 0, strlen( 'xmlns:' ) ) == 'xmlns:' ) {
 71+ $this->namespaces[substr( $attrib, strlen( 'xmlns:' ) )] = $val;
 72+ }
 73+ }
 74+
 75+ if( strpos( $name, ':' ) === false ) {
 76+ $ns = '';
 77+ $subname = $name;
 78+ } else {
 79+ list( $ns, $subname ) = explode( ':', $name, 2 );
 80+ }
 81+
 82+ if( isset( $this->namespaces[$ns] ) ) {
 83+ $name = $this->namespaces[$ns] . ':' . $subname;
 84+ } else {
 85+ // Technically this is invalid for XML with Namespaces.
 86+ // But..... we'll just let it slide in soft mode.
 87+ }
 88+ }
 89+
 90+ // We only need the first open element
 91+ $this->rootElement = $name;
 92+ xml_set_element_handler( $parser, false, false );
 93+ }
 94+}
Property changes on: trunk/phase3/includes/XmlTypeCheck.php
___________________________________________________________________
Name: svn:eol-style
195 + native

Status & tagging log