r95155 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r95154‎ | r95155 | r95156 >
Date:17:16, 21 August 2011
Author:bawolff
Status:ok
Tags:
Comment:
follow-up r86169 - 2 minor issues found while writing unit tests

# Some really obscure Exif properties did not have the Exif byte order taken into account
and were being extracted with the bytes reversed (for example user comment when encoded as utf-16).
Not a major issue as these properties are very rare in practise, but certainly not a good thing.
( One would think php's exif support would take care of that, but no it does not...)
# Change the fallback encoding for Gif comments to be windows-1252 instead of iso 8859-1. More
to be consitent with jpg and iptc then anything else.
Modified paths:
  • /trunk/phase3/includes/media/BitmapMetadataHandler.php (modified) (history)
  • /trunk/phase3/includes/media/Exif.php (modified) (history)
  • /trunk/phase3/includes/media/GIFMetadataExtractor.php (modified) (history)
  • /trunk/phase3/includes/media/JpegMetadataExtractor.php (modified) (history)
  • /trunk/phase3/includes/media/Tiff.php (modified) (history)

Diff [purge]

Index: trunk/phase3/includes/media/BitmapMetadataHandler.php
@@ -40,16 +40,16 @@
4141
4242
4343 /**
44 - *
45 - * get exif info using exif class.
 44+ * Get exif info using exif class.
4645 * Basically what used to be in BitmapHandler::getMetadata().
4746 * Just calls stuff in the Exif class.
4847 *
4948 * @param $filename string
5049 */
51 - function getExif ( $filename ) {
52 - if ( file_exists( $filename ) ) {
53 - $exif = new Exif( $filename );
 50+ function getExif ( $filename, $byteOrder ) {
 51+ global $wgShowEXIF;
 52+ if ( file_exists( $filename ) && $wgShowEXIF ) {
 53+ $exif = new Exif( $filename, $byteOrder );
5454 $data = $exif->getFilteredData();
5555 if ( $data ) {
5656 $this->addMetadata( $data, 'exif' );
@@ -117,7 +117,6 @@
118118 static function Jpeg ( $filename ) {
119119 $showXMP = function_exists( 'xml_parser_create_ns' );
120120 $meta = new self();
121 - $meta->getExif( $filename );
122121
123122 $seg = JpegMetadataExtractor::segmentSplitter( $filename );
124123 if ( isset( $seg['COM'] ) && isset( $seg['COM'][0] ) ) {
@@ -141,6 +140,9 @@
142141 $meta->addMetadata( $array, $type );
143142 }
144143 }
 144+ if ( isset( $seg['byteOrder'] ) ) {
 145+ $meta->getExif( $filename, $seg['byteOrder'] );
 146+ }
145147 return $meta->getMetadataArray();
146148 }
147149
@@ -208,4 +210,60 @@
209211 return $baseArray;
210212 }
211213
 214+ /**
 215+ * This doesn't do much yet, but eventually I plan to add
 216+ * XMP support for Tiff. (PHP's exif support already extracts
 217+ * but needs some further processing because PHP's exif support
 218+ * is stupid...)
 219+ *
 220+ * @todo Add XMP support, so this function actually makes
 221+ * sense to put here.
 222+ *
 223+ * The various exceptions this throws are caught later.
 224+ * @param $filename String
 225+ * @return Array The metadata.
 226+ */
 227+ static public function Tiff ( $filename ) {
 228+ if ( file_exists( $filename ) ) {
 229+ $byteOrder = self::getTiffByteOrder( $filename );
 230+ if ( !$byteOrder ) {
 231+ throw new MWException( "Error determining byte order of $filename" );
 232+ }
 233+ $exif = new Exif( $filename, $byteOrder );
 234+ $data = $exif->getFilteredData();
 235+ if ( $data ) {
 236+ $data['MEDIAWIKI_EXIF_VERSION'] = Exif::version();
 237+ return $data;
 238+ } else {
 239+ throw new MWException( "Could not extract data from tiff file $filename" );
 240+ }
 241+ } else {
 242+ throw new MWException( "File doesn't exist - $filename" );
 243+ }
 244+ }
 245+ /**
 246+ * Read the first 2 bytes of a tiff file to figure out
 247+ * Little Endian or Big Endian. Needed for exif stuff.
 248+ *
 249+ * @param $filename String The filename
 250+ * @return String 'BE' or 'LE' or false
 251+ */
 252+ static function getTiffByteOrder( $filename ) {
 253+ $fh = fopen( $filename, 'rb' );
 254+ if ( !$fh ) return false;
 255+ $head = fread( $fh, 2 );
 256+ fclose( $fh );
 257+
 258+ switch( $head ) {
 259+ case 'II':
 260+ return 'LE'; // II for intel.
 261+ case 'MM':
 262+ return 'BE'; // MM for motorla.
 263+ default:
 264+ return false; // Something went wrong.
 265+
 266+ }
 267+ }
 268+
 269+
212270 }
Index: trunk/phase3/includes/media/JpegMetadataExtractor.php
@@ -28,7 +28,10 @@
2929
3030 $segmentCount = 0;
3131
32 - $segments = array( 'XMP_ext' => array(), 'COM' => array() );
 32+ $segments = array(
 33+ 'XMP_ext' => array(),
 34+ 'COM' => array(),
 35+ );
3336
3437 if ( !$filename ) {
3538 throw new MWException( "No filename specified for " . __METHOD__ );
@@ -82,23 +85,34 @@
8386 wfDebug( __METHOD__ . ' Ignoring JPEG comment as is garbage.' );
8487 }
8588
86 - } elseif ( $buffer === "\xE1" && $showXMP ) {
 89+ } elseif ( $buffer === "\xE1" ) {
8790 // APP1 section (Exif, XMP, and XMP extended)
8891 // only extract if XMP is enabled.
8992 $temp = self::jpegExtractMarker( $fh );
90 -
9193 // check what type of app segment this is.
92 - if ( substr( $temp, 0, 29 ) === "http://ns.adobe.com/xap/1.0/\x00" ) {
 94+ if ( substr( $temp, 0, 29 ) === "http://ns.adobe.com/xap/1.0/\x00" && $showXMP ) {
9395 $segments["XMP"] = substr( $temp, 29 );
94 - } elseif ( substr( $temp, 0, 35 ) === "http://ns.adobe.com/xmp/extension/\x00" ) {
 96+ } elseif ( substr( $temp, 0, 35 ) === "http://ns.adobe.com/xmp/extension/\x00" && $showXMP ) {
9597 $segments["XMP_ext"][] = substr( $temp, 35 );
96 - } elseif ( substr( $temp, 0, 29 ) === "XMP\x00://ns.adobe.com/xap/1.0/\x00" ) {
 98+ } elseif ( substr( $temp, 0, 29 ) === "XMP\x00://ns.adobe.com/xap/1.0/\x00" && $showXMP ) {
9799 // Some images (especially flickr images) seem to have this.
98100 // I really have no idea what the deal is with them, but
99101 // whatever...
100102 $segments["XMP"] = substr( $temp, 29 );
101103 wfDebug( __METHOD__ . ' Found XMP section with wrong app identifier '
102104 . "Using anyways.\n" );
 105+ } elseif ( substr( $temp, 0, 6 ) === "Exif\0\0" ) {
 106+ // Just need to find out what the byte order is.
 107+ // because php's exif plugin sucks...
 108+ // This is a II for little Endian, MM for big. Not a unicode BOM.
 109+ $byteOrderMarker = substr( $temp, 6, 2 );
 110+ if ( $byteOrderMarker === 'MM' ) {
 111+ $segments['byteOrder'] = 'BE';
 112+ } elseif ( $byteOrderMarker === 'II' ) {
 113+ $segments['byteOrder'] = 'LE';
 114+ } else {
 115+ wfDebug( __METHOD__ . ' Invalid byte ordering?!' );
 116+ }
103117 }
104118 } elseif ( $buffer === "\xED" ) {
105119 // APP13 - PSIR. IPTC and some photoshop stuff
Index: trunk/phase3/includes/media/Tiff.php
@@ -56,13 +56,20 @@
5757 */
5858 function getMetadata( $image, $filename ) {
5959 global $wgShowEXIF;
60 - if ( $wgShowEXIF && file_exists( $filename ) ) {
61 - $exif = new Exif( $filename );
62 - $data = $exif->getFilteredData();
63 - if ( $data ) {
64 - $data['MEDIAWIKI_EXIF_VERSION'] = Exif::version();
65 - return serialize( $data );
66 - } else {
 60+ if ( $wgShowEXIF ) {
 61+ try {
 62+ $meta = BitmapMetadataHandler::Tiff( $filename );
 63+ if ( !is_array( $meta ) ) {
 64+ // This should never happen, but doesn't hurt to be paranoid.
 65+ throw new MWException('Metadata array is not an array');
 66+ }
 67+ $meta['MEDIAWIKI_EXIF_VERSION'] = Exif::version();
 68+ return serialize( $meta );
 69+ }
 70+ catch ( MWException $e ) {
 71+ // BitmapMetadataHandler throws an exception in certain exceptional
 72+ // cases like if file does not exist.
 73+ wfDebug( __METHOD__ . ': ' . $e->getMessage() . "\n" );
6774 return ExifBitmapHandler::BROKEN_FILE;
6875 }
6976 } else {
Index: trunk/phase3/includes/media/Exif.php
@@ -90,6 +90,11 @@
9191 */
9292 var $log = false;
9393
 94+ /**
 95+ * The byte order of the file. Needed because php's
 96+ * extension doesn't fully process some obscure props.
 97+ */
 98+ private $byteOrder;
9499 //@}
95100
96101 /**
@@ -102,7 +107,7 @@
103108 * DigitalZoomRatio = 0/0 is rejected. need to determine if that's valid.
104109 * possibly should treat 0/0 = 0. need to read exif spec on that.
105110 */
106 - function __construct( $file ) {
 111+ function __construct( $file, $byteOrder = '' ) {
107112 /**
108113 * Page numbers here refer to pages in the EXIF 2.2 standard
109114 *
@@ -275,6 +280,16 @@
276281
277282 $this->file = $file;
278283 $this->basename = wfBaseName( $this->file );
 284+ if ( $byteOrder === 'BE' || $byteOrder === 'LE' ) {
 285+ $this->byteOrder = $byteOrder;
 286+ } else {
 287+ // Only give a warning for b/c, since originally we didn't
 288+ // require this. The number of things affected by this is
 289+ // rather small.
 290+ wfWarn( 'Exif class did not have byte order specified. '
 291+ . 'Some properties may be decoded incorrectly.' );
 292+ $this->byteOrder = 'BE'; // BE seems about twice as popular as LE in jpg's.
 293+ }
279294
280295 $this->debugFile( $this->basename, __FUNCTION__, true );
281296 if( function_exists( 'exif_read_data' ) ) {
@@ -394,7 +409,16 @@
395410 }
396411 $newVal .= ord( substr($val, $i, 1) );
397412 }
398 - $this->mFilteredExifData['GPSVersionID'] = $newVal;
 413+ if ( $this->byteOrder === 'LE' ) {
 414+ // Need to reverse the string
 415+ $newVal2 = '';
 416+ for ( $i = strlen( $newVal ) - 1; $i >= 0; $i-- ) {
 417+ $newVal2 .= substr( $newVal, $i, 1 );
 418+ }
 419+ $this->mFilteredExifData['GPSVersionID'] = $newVal2;
 420+ } else {
 421+ $this->mFilteredExifData['GPSVersionID'] = $newVal;
 422+ }
399423 unset( $this->mFilteredExifData['GPSVersion'] );
400424 }
401425
@@ -415,7 +439,6 @@
416440 unset($this->mFilteredExifData[$prop]);
417441 return;
418442 }
419 -
420443 $charCode = substr( $this->mFilteredExifData[$prop], 0, 8);
421444 $val = substr( $this->mFilteredExifData[$prop], 8);
422445
@@ -426,7 +449,7 @@
427450 $charset = "Shift-JIS";
428451 break;
429452 case "UNICODE\x00":
430 - $charset = "UTF-16";
 453+ $charset = "UTF-16" . $this->byteOrder;
431454 break;
432455 default: //ascii or undefined.
433456 $charset = "";
Index: trunk/phase3/includes/media/GIFMetadataExtractor.php
@@ -126,14 +126,14 @@
127127
128128 // The standard says this should be ASCII, however its unclear if
129129 // thats true in practise. Check to see if its valid utf-8, if so
130 - // assume its that, otherwise assume its iso-8859-1
 130+ // assume its that, otherwise assume its windows-1252 (iso-8859-1)
131131 $dataCopy = $data;
132132 // quickIsNFCVerify has the side effect of replacing any invalid characters
133133 UtfNormal::quickIsNFCVerify( $dataCopy );
134134
135135 if ( $dataCopy !== $data ) {
136136 wfSuppressWarnings();
137 - $data = iconv( 'ISO-8859-1', 'UTF-8', $data );
 137+ $data = iconv( 'windows-1252', 'UTF-8', $data );
138138 wfRestoreWarnings();
139139 }
140140

Follow-up revisions

RevisionCommit summaryAuthorDate
r95179Follow-up r95155 - Make PagedTiffHandler use the new method to make sure the ...bawolff21:07, 21 August 2011
r96088MFT to REL1_18...hashar10:53, 2 September 2011

Past revisions this follows-up on

RevisionCommit summaryAuthorDate
r86169Merge to trunk everything in img_metadata branch....bawolff01:23, 16 April 2011

Status & tagging log