r71148 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r71147‎ | r71148 | r71149 >
Date:20:39, 15 August 2010
Author:bawolff
Status:deferred (Comments)
Tags:
Comment:
Add support for extracting metadata in PNG iTXt, tEXt and zTXt chunks.
(semi follow-up to 70860 )
Modified paths:
  • /branches/img_metadata/phase3/includes/media/BitmapMetadataHandler.php (modified) (history)
  • /branches/img_metadata/phase3/includes/media/FormatMetadata.php (modified) (history)
  • /branches/img_metadata/phase3/includes/media/PNGMetadataExtractor.php (modified) (history)
  • /branches/img_metadata/phase3/languages/messages/MessagesEn.php (modified) (history)
  • /branches/img_metadata/phase3/maintenance/language/messageTypes.inc (modified) (history)
  • /branches/img_metadata/phase3/maintenance/language/messages.inc (modified) (history)

Diff [purge]

Index: branches/img_metadata/phase3/maintenance/language/messages.inc
@@ -2752,6 +2752,9 @@
27532753 'exif-morepermissionsurl',
27542754 'exif-attributionurl',
27552755 'exif-preferredattributionname',
 2756+ 'exif-pngfilecomment',
 2757+ 'exif-disclaimer',
 2758+ 'exif-contentwarning',
27562759 ),
27572760 'exif-values' => array(
27582761 'exif-make-value',
Index: branches/img_metadata/phase3/maintenance/language/messageTypes.inc
@@ -669,4 +669,7 @@
670670 'exif-copyrighted-false',
671671 'exif-rating-rejected',
672672 'exif-isospeedratings-overflow',
 673+ 'exif-pngfilecomment',
 674+ 'exif-disclaimer',
 675+ 'exif-contentwarning',
673676 );
Index: branches/img_metadata/phase3/includes/media/FormatMetadata.php
@@ -652,7 +652,10 @@
653653 case 'MorePermissionsUrl':
654654 case 'AttributionUrl':
655655 case 'PreferredAttributionName':
656 -
 656+ case 'PNGFileComment':
 657+ case 'Disclaimer':
 658+ case 'ContentWarning':
 659+
657660 $val = htmlspecialchars( $val );
658661 break;
659662
@@ -748,7 +751,7 @@
749752 $content = '';
750753
751754 $cLang = $wgContLang->getCode();
752 - $default = false;
 755+ $defaultItem = false;
753756 $defaultLang = false;
754757
755758 // If default is set, save it for later,
Index: branches/img_metadata/phase3/includes/media/BitmapMetadataHandler.php
@@ -13,7 +13,7 @@
1414 private $metadata = Array();
1515 private $metaPriority = Array(
1616 20 => Array( 'other' ),
17 - 40 => Array( 'file-comment' ),
 17+ 40 => Array( 'file-comment', 'native-png' ),
1818 60 => Array( 'iptc-good-hash', 'iptc-no-hash' ),
1919 70 => Array( 'xmp-deprected' ),
2020 80 => Array( 'xmp-general' ),
@@ -161,15 +161,17 @@
162162
163163 $meta = new self( $filename );
164164 $array = PNGMetadataExtractor::getMetadata( $filename );
165 - if ( isset( $array['xmp'] ) && $array['xmp'] !== '' && $showXMP ) {
 165+ if ( isset( $array['text']['xmp']['x-default'] ) && $array['text']['xmp']['x-default'] !== '' && $showXMP ) {
166166 $xmp = new XMPReader();
167 - $xmp->parse( $array['xmp'] );
 167+ $xmp->parse( $array['text']['xmp']['x-default'] );
168168 $xmpRes = $xmp->getResults();
169169 foreach ( $xmpRes as $type => $xmpSection ) {
170170 $meta->addMetadata( $xmpSection, $type );
171171 }
172172 }
173 - unset( $array['xmp'] );
 173+ unset( $array['text']['xmp'] );
 174+ $meta->addMetadata( $array['text'], 'native-png' );
 175+ unset( $array['text'] );
174176 $array['metadata'] = $meta->getMetadataArray();
175177 $array['metadata']['_MW_PNG_VERSION'] = '1';
176178 return $array;
Index: branches/img_metadata/phase3/includes/media/PNGMetadataExtractor.php
@@ -9,18 +9,39 @@
1010 class PNGMetadataExtractor {
1111 static $png_sig;
1212 static $CRC_size;
 13+ static $text_chunks;
1314
1415 static function getMetadata( $filename ) {
1516 self::$png_sig = pack( "C8", 137, 80, 78, 71, 13, 10, 26, 10 );
1617 self::$CRC_size = 4;
 18+ /* based on list at http://owl.phy.queensu.ca/~phil/exiftool/TagNames/PNG.html#TextualData
 19+ * and http://www.w3.org/TR/PNG/#11keywords
 20+ */
 21+ self::$text_chunks = array(
 22+ 'XML:com.adobe.xmp' => 'xmp',
 23+ 'Artist' => 'Artist', # this is unofficial, compared to Author, which is
 24+ 'Model' => 'Model',
 25+ 'Make' => 'Make',
 26+ 'Author' => 'Artist',
 27+ 'Comment' => 'PNGFileComment',
 28+ 'Description' => 'ImageDescription',
 29+ 'Title' => 'ObjectName',
 30+ 'Copyright' => 'Copyright',
 31+ 'Source' => 'Model', # Source as in original device used to make image
 32+ 'Software' => 'Software',
 33+ 'Disclaimer' => 'Disclaimer',
 34+ 'Warning' => 'ContentWarning',
 35+ 'URL' => 'Identifer', # Not sure if this is best mapping. Maybe WebStatement.
 36+ 'Label' => 'Label',
 37+ /* Other potentially useful things - Creation Time, Document */
 38+ );
1739
1840 $showXMP = function_exists( 'xml_parser_create_ns' );
1941
2042 $frameCount = 0;
2143 $loopCount = 1;
2244 $duration = 0.0;
23 - $xmp = '';
24 - $meta = array();
 45+ $text = array();
2546
2647 if (!$filename)
2748 throw new Exception( __METHOD__ . ": No file name specified" );
@@ -65,23 +86,136 @@
6687 if( $fctldur['delay_num'] ) {
6788 $duration += $fctldur['delay_num'] / $fctldur['delay_den'];
6889 }
69 - } elseif ( $chunk_type == "iTXt" && $showXMP ) {
70 - // At the moment this only does XMP iText chunks,
71 - // but in the future might extract other metadata chunks.
72 - if( $chunk_size <= 22 ) {
73 - // something weird, so skip
74 - fseek( $fh, $chunk_size, SEEK_CUR );
75 - continue;
 90+ } elseif ( $chunk_type == "iTXt" ) {
 91+ // Extracts iTXt chunks, uncompressing if neccesary.
 92+ $buf = fread( $fh, $chunk_size );
 93+ $items = array();
 94+ if ( preg_match(
 95+ '/^([^\x00]{1,79})\x00(\x00|\x01)\x00([^\x00]*)(.)[^\x00]*\x00(.*)$/Ds',
 96+ $buf, $items )
 97+ ) {
 98+ /* $items[1] = text chunk name, $items[2] = compressed flag,
 99+ * $items[3] = lang code (or ""), $items[4]= compression type.
 100+ * $items[5] = content
 101+ */
 102+
 103+ if ( !isset( self::$text_chunks[$items[1]] ) ) {
 104+ // Only extract textual chunks on our list.
 105+ fseek( $fh, self::$CRC_size, SEEK_CUR );
 106+ continue;
 107+ }
 108+
 109+ if ( $items[3] == '' ) {
 110+ // if no lang specified use x-default like in xmp.
 111+ $items[3] = 'x-default';
 112+ }
 113+
 114+ // if compressed
 115+ if ( $items[2] == "\x01" ) {
 116+ if ( function_exists( 'gzuncompress' ) && $items[4] === "\x00" ) {
 117+ wfSuppressWarnings();
 118+ $items[5] = gzuncompress( $items[5] );
 119+ wfRestoreWarnings();
 120+
 121+ if ( $items[5] === false ) {
 122+ //decompression failed
 123+ wfDebug( __METHOD__ . ' Error decompressing iTxt chunk - ' . $items[1] );
 124+ fseek( $fh, self::$CRC_size, SEEK_CUR );
 125+ continue;
 126+ }
 127+
 128+ } else {
 129+ wfDebug( __METHOD__ . ' Skipping compressed png iTXt chunk due to lack of zlib,'
 130+ . ' or potentially invalid compression method' );
 131+ fseek( $fh, self::$CRC_size, SEEK_CUR );
 132+ continue;
 133+ }
 134+ }
 135+ $finalKeyword = self::$text_chunks[ $items[1] ];
 136+ $text[ $finalKeyword ][ $items[3] ] = $items[5];
 137+ $text[ $finalKeyword ]['_type'] = 'lang';
 138+
 139+ } else {
 140+ //Error reading iTXt chunk
 141+ throw new Exception( __METHOD__ . ": Read error on iTXt chunk" );
 142+ return;
76143 }
77 - $itxtHeader = fread( $fh, 22 );
78 - if( !$itxtHeader ) { throw new Exception( __METHOD__ . ": Read error" ); return; }
79 - if( $itxtHeader !== "XML:com.adobe.xmp\x00\x00\x00\x00\x00" ) {
80 - // some other iTXt chunk.
81 - fseek( $fh, $chunk_size - 22, SEEK_CUR );
 144+
 145+ } elseif ( $chunk_type == 'tEXt' ) {
 146+ $buf = fread( $fh, $chunk_size );
 147+ $keyword = '';
 148+ $content = '';
 149+
 150+ list( $keyword, $content ) = explode( "\x00", $buf, 2 );
 151+ if ( $keyword === '' || $content === '' ) {
 152+ throw new Exception( __METHOD__ . ": Read error on tEXt chunk" );
 153+ return;
 154+ }
 155+ if ( !isset( self::$text_chunks[ $keyword ] ) ) {
 156+ // Don't recognize chunk, so skip.
 157+ fseek( $fh, self::$CRC_size, SEEK_CUR );
82158 continue;
83159 }
84 - $xmp = fread( $fh, $chunk_size - 22 );
85 - if( !$xmp ) { throw new Exception( __METHOD__ . ": Read error" ); return; }
 160+ $content = iconv( 'ISO-8859-1', 'UTF-8', $content);
 161+ if ( $content === false ) {
 162+ throw new Exception( __METHOD__ . ": Read error (error with iconv)" );
 163+ return;
 164+ }
 165+
 166+ $finalKeyword = self::$text_chunks[ $keyword ];
 167+ $text[ $finalKeyword ][ 'x-default' ] = $content;
 168+ $text[ $finalKeyword ]['_type'] = 'lang';
 169+
 170+ } elseif ( $chunk_type == 'zTXt' ) {
 171+ if ( function_exists( 'gzuncompress' ) ) {
 172+ $buf = fread( $fh, $chunk_size );
 173+ $keyword = '';
 174+ $postKeyword = '';
 175+
 176+ list( $keyword, $postKeyword ) = explode( "\x00", $buf, 2 );
 177+ if ( $keyword === '' || $postKeyword === '' ) {
 178+ throw new Exception( __METHOD__ . ": Read error on zTXt chunk" );
 179+ return;
 180+ }
 181+ if ( !isset( self::$text_chunks[ $keyword ] ) ) {
 182+ // Don't recognize chunk, so skip.
 183+ fseek( $fh, self::$CRC_size, SEEK_CUR );
 184+ continue;
 185+ }
 186+ $compression = substr( $postKeyword, 0, 1 );
 187+ $content = substr( $postKeyword, 1 );
 188+ if ( $compression !== "\x00" ) {
 189+ wfDebug( __METHOD__ . " Unrecognized compression method in zTXt ($keyword). Skipping." );
 190+ fseek( $fh, self::$CRC_size, SEEK_CUR );
 191+ continue;
 192+ }
 193+
 194+ wfSuppressWarnings();
 195+ $content = gzuncompress( $content );
 196+ wfRestoreWarnings();
 197+
 198+ if ( $content === false ) {
 199+ //decompression failed
 200+ wfDebug( __METHOD__ . ' Error decompressing zTXt chunk - ' . $keyword );
 201+ fseek( $fh, self::$CRC_size, SEEK_CUR );
 202+ continue;
 203+ }
 204+
 205+ $content = iconv( 'ISO-8859-1', 'UTF-8', $content);
 206+ if ( $content === false ) {
 207+ throw new Exception( __METHOD__ . ": Read error (error with iconv)" );
 208+ return;
 209+ }
 210+
 211+ $finalKeyword = self::$text_chunks[ $keyword ];
 212+ $text[ $finalKeyword ][ 'x-default' ] = $content;
 213+ $text[ $finalKeyword ]['_type'] = 'lang';
 214+
 215+ } else {
 216+ wfDebug( __METHOD__ . " Cannot decompress zTXt chunk due to lack of zlib. Skipping." );
 217+ fseek( $fh, $chunk_size, SEEK_CUR );
 218+ }
 219+
86220 } elseif ( $chunk_type == "IEND" ) {
87221 break;
88222 } else {
@@ -99,7 +233,7 @@
100234 'frameCount' => $frameCount,
101235 'loopCount' => $loopCount,
102236 'duration' => $duration,
103 - 'xmp' => $xmp,
 237+ 'text' => $text,
104238 );
105239
106240 }
Index: branches/img_metadata/phase3/languages/messages/MessagesEn.php
@@ -3821,6 +3821,9 @@
38223822 'exif-morepermissionsurl' => 'Alternative licensing information',
38233823 'exif-attributionurl' => 'When re-using this work, please link to',
38243824 'exif-preferredattributionname' => 'When re-using this work, please credit',
 3825+'exif-pngfilecomment' => 'PNG file comment',
 3826+'exif-disclaimer' => 'Disclaimer',
 3827+'exif-contentwarning' => 'Content warning',
38253828
38263829
38273830 # Make & model, can be wikified in order to link to the camera and model name

Follow-up revisions

Revision | Commit summary | Author | Date
r71155 | Follow up to r71148 per cr comments. Clarify some comments,... | bawolff | 09:44, 16 August 2010

Comments

#Comment by Nikerabbit (talk | contribs)   21:35, 15 August 2010

+'Artist' => 'Artist', # this is unofficial, compared to Author, which is ... is what?

+$content = iconv( 'ISO-8859-1', 'UTF-8', $content); Iconv might throw warnings, do they need to be suppressed?

#Comment by Bawolff (talk | contribs)   09:49, 16 August 2010

I don't think iconv throws warnings when converting from ISO-8859-1 to UTF-8, since all ISO-8859-1 code points have a corresponding code point in UTF-8, and if you include control characters, every possible 8-bit value is used. (I tried converting some random binary files and got no errors.) Nonetheless, it's better to be safe than sorry, so I suppressed errors per your suggestion in r71155.

Status & tagging log