r70788 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r70787‎ | r70788 | r70789 >
Date:00:54, 10 August 2010
Author:bawolff
Status:deferred
Tags:
Comment:
Limited support for XMPExtended blocks in jpeg files.

I say limited as I don't have any example files, so the only
testing this has gone through is a handcrafted file I made myself.
However, that's not very good, since if I had conceptual misunderstandings
while writing the code, I'd have the same misunderstandings when testing
it.
Modified paths:
  • /branches/img_metadata/phase3/includes/media/BitmapMetadataHandler.php (modified) (history)
  • /branches/img_metadata/phase3/includes/media/XMP.php (modified) (history)
  • /branches/img_metadata/phase3/includes/media/XMPInfo.php (modified) (history)

Diff [purge]

Index: branches/img_metadata/phase3/includes/media/XMP.php
@@ -30,6 +30,7 @@
3131
3232 private $xmlParser;
3333 private $charset = false;
 34+ private $extendedXMPOffset = 0;
3435
3536 protected $items;
3637
@@ -74,6 +75,20 @@
7576
7677 $this->items = XMPInfo::getItems();
7778
 79+ $this->resetXMLParser();
 80+
 81+ }
 82+ /**
 83+ * Main use is if a single item has multiple xmp documents describing it.
 84+ * For example in jpeg's with extendedXMP
 85+ */
 86+ private function resetXMLParser() {
 87+
 88+ if ($this->xmlParser) {
 89+ //is this needed?
 90+ xml_parser_free( $this->xmlParser );
 91+ }
 92+
7893 $this->xmlParser = xml_parser_create_ns( 'UTF-8', ' ' );
7994 xml_parser_set_option( $this->xmlParser, XML_OPTION_CASE_FOLDING, 0 );
8095 xml_parser_set_option( $this->xmlParser, XML_OPTION_SKIP_WHITE, 1 );
@@ -83,6 +98,8 @@
8499 array( $this, 'endElement' ) );
85100
86101 xml_set_character_data_handler( $this->xmlParser, array( $this, 'char' ) );
 102+
 103+
87104 }
88105
89106 /** Destroy the xml parser
@@ -99,6 +116,9 @@
100117 * FormatExif.
101118 */
102119 public function getResults() {
 120+ // xmp-special is for metadata that affects how stuff
 121+ // is extracted. For example xmpNote:HasExtendedXMP
 122+ unset( $this->results['xmp-special'] );
103123 return $this->results;
104124 }
105125
@@ -110,13 +130,17 @@
111131 * debug log, blanks result array and returns false.
112132 *
113133 * @param String: $content XMP data
 134+ * @param Boolean: $allOfIt If this is all the data (true) or if its split up (false). Default true
 135+ * @param Boolean: $reset - does xml parser need to be reset. Default false
114136 * @return Boolean success.
115 - * @todo charset detection (usually UTF-8, but UTF-16 or 32 is allowed).
116137 */
117 - public function parse( $content ) {
 138+ public function parse( $content, $allOfIt = true, $reset = false ) {
 139+ if ( $reset ) {
 140+ $this->resetXMLParser();
 141+ }
118142 try {
119143
120 - // detect encoding by looking for BOM
 144+ // detect encoding by looking for BOM which is supposed to be in processing instruction.
121145 // see page 12 of http://www.adobe.com/devnet/xmp/pdfs/XMPSpecificationPart3.pdf
122146 if ( !$this->charset ) {
123147 $bom = array();
@@ -147,6 +171,7 @@
148172 }
149173
150174 } else {
 175+ // standard specificly says, if no bom assume utf-8
151176 $this->charset = 'UTF-8';
152177 }
153178 }
@@ -155,7 +180,7 @@
156181 $content = iconv( $this->charset, 'UTF-8//IGNORE', $content );
157182 }
158183
159 - $ok = xml_parse( $this->xmlParser, $content, true );
 184+ $ok = xml_parse( $this->xmlParser, $content, $allOfIt );
160185 if ( !$ok ) {
161186 $error = xml_error_string( xml_get_error_code( $this->xmlParser ) );
162187 $where = 'line: ' . xml_get_current_line_number( $this->xmlParser )
@@ -174,6 +199,66 @@
175200 return true;
176201 }
177202
 203+ /** Entry point for XMPExtended blocks in jpeg files
 204+ *
 205+ * @todo In serious need of testing
 206+ * @see http://www.adobe.ge/devnet/xmp/pdfs/XMPSpecificationPart3.pdf XMP spec part 3 page 20
 207+ * @param String $content XMPExtended block minus the namespace signature
 208+ * @return Boolean If it succeded.
 209+ */
 210+ public function parseExtended( $content ) {
 211+ // FIXME: This is untested. Hard to find example files
 212+ // or programs that make such files..
 213+ $guid = substr( $content, 0, 32 );
 214+ if ( !isset( $this->results['xmp-special']['HasExtendedXMP'] )
 215+ || $this->results['xmp-special']['HasExtendedXMP'] !== $guid )
 216+ {
 217+ wfDebugLog('XMP', __METHOD__ . " Ignoring XMPExtended block due to wrong guid (guid= '$guid' )");
 218+ return;
 219+ }
 220+ $len = unpack( 'Nlength/Noffset', substr( $content, 32, 8 ) );
 221+
 222+ if (!$len || $len['length'] < 4 || $len['offset'] < 0 || $len['offset'] > $len['length'] ) {
 223+ wfDebugLog('XMP', __METHOD__ . 'Error reading extended XMP block, invalid length or offset.');
 224+ return false;
 225+ }
 226+
 227+
 228+ // we're not very robust here. we should accept it in the wrong order. To quote
 229+ // the xmp standard:
 230+ // "A JPEG writer should write the ExtendedXMP marker segments in order, immediately following the
 231+ // StandardXMP. However, the JPEG standard does not require preservation of marker segment order. A
 232+ // robust JPEG reader should tolerate the marker segments in any order."
 233+ //
 234+ // otoh the probability that an image will have more than 128k of metadata is rather low...
 235+ // so the probability that it will have > 128k, and be in the wrong order is very low...
 236+
 237+ if ( $len['offset'] !== $this->extendedXMPOffset ) {
 238+ wfDebugLog('XMP', __METHOD__ . 'Ignoring XMPExtended block due to wrong order. (Offset was '
 239+ . $len['offset'] . ' but expected ' . $this->extendedXMPOffset . ')');
 240+ return false;
 241+ }
 242+
 243+ if ( $len['offset'] === 0 ) {
 244+ // if we're starting the extended block, we've probably already
 245+ // done the XMPStandard block, so reset.
 246+ $this->resetXMLParser();
 247+ }
 248+
 249+ $this->extendedXMPOffset += $len['length'];
 250+
 251+ $actualContent = substr( $content, 40 );
 252+
 253+ if ( $this->extendedXMPOffset === strlen( $actualContent ) ) {
 254+ $atEnd = true;
 255+ } else {
 256+ $atEnd = false;
 257+ }
 258+
 259+ wfDebugLog('XMP', __METHOD__ . 'Parsing a XMPExtended block');
 260+ return $this->parse( $actualContent, $atEnd );
 261+ }
 262+
178263 /** Character data handler
179264 * Called whenever character data is found in the xmp document.
180265 *
@@ -657,6 +742,9 @@
658743 // In practise I have yet to see a file that
659744 // uses this element, however it is mentioned
660745 // on page 25 of part 1 of the xmp standard.
 746+ //
 747+ // also it seems as if exiv2 and exiftool do not support
 748+ // this either (That or I misunderstand the standard)
661749 wfDebugLog( 'XMP', __METHOD__ . ' Encoutered <rdf:type> which isn\'t currently supported' );
662750 }
663751
Index: branches/img_metadata/phase3/includes/media/BitmapMetadataHandler.php
@@ -88,7 +88,7 @@
8989 if ( substr( $temp, 0, 29 ) === "http://ns.adobe.com/xap/1.0/\x00" ) {
9090 $segments["XMP"] = substr( $temp, 29 );
9191 } elseif ( substr( $temp, 0, 35 ) === "http://ns.adobe.com/xmp/extension/\x00" ) {
92 - $segments["XMP_ext"][] = $temp;
 92+ $segments["XMP_ext"][] = substr( $temp, 35 );
9393 }
9494 } elseif ( $buffer === "\xED" ) {
9595 // APP13 - PSIR. IPTC and some photoshop stuff
@@ -316,8 +316,11 @@
317317 if ( isset( $seg['XMP'] ) ) {
318318 $xmp = new XMPReader();
319319 $xmp->parse( $seg['XMP'] );
320 - if ( isset( $seg['XMP_ext'] ) ) {
321 - /* FIXME!! */
 320+ foreach( $seg['XMP_ext'] as $xmpExt ) {
 321+ /* Support for extended xmp in jpeg files
 322+ * is not well tested and a bit fragile.
 323+ */
 324+ $xmp->parseExtended( $xmpExt );
322325
323326 }
324327 $res = $xmp->getResults();
Index: branches/img_metadata/phase3/includes/media/XMPInfo.php
@@ -201,5 +201,12 @@
202202 ),
203203
204204 ),
 205+ //Note, this property affects how jpeg metadata is extracted.
 206+ 'http://ns.adobe.com/xmp/note/' => array(
 207+ 'HasExtendedXMP' => array(
 208+ 'map_group' => 'special',
 209+ 'mode' => XMPReader::MODE_SIMPLE,
 210+ ),
 211+ ),
205212 );
206213 }

Status & tagging log