r100426 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r100425‎ | r100426 | r100427 >
Date:15:55, 21 October 2011
Author:vitalif
Status:deferred (Comments)
Tags:
Comment:
Add patch which fixes pdfinfo to respect page rotation when reporting the size.
Add if(!isset()) to configuration.
Render directly to thumbnails without using ImageMagick and without the need to specify DPI.
Link thumbnails to individual pages of PDF.
Add GhostScript options for better rendering.
Modified paths:
  • /branches/vitalif/extensions/PdfHandler/PdfHandler.image.php (modified) (history)
  • /branches/vitalif/extensions/PdfHandler/PdfHandler.php (modified) (history)
  • /branches/vitalif/extensions/PdfHandler/PdfHandler_body.php (modified) (history)
  • /branches/vitalif/extensions/PdfHandler/poppler-utils.diff (added) (history)

Diff [purge]

Index: branches/vitalif/extensions/PdfHandler/poppler-utils.diff
@@ -0,0 +1,24 @@
 2+Fixes pdfinfo to respect page rotation.
 3+
 4+--- utils/pdfinfo.cc 2010-12-27 23:44:28.000000000 +0300
 5+@@ -111,6 +111,8 @@ int main(int argc, char *argv[]) {
 6+ int exitCode;
 7+ int pg, i;
 8+ GBool multiPage;
 9++ int r;
 10++ double xchg;
 11+
 12+ exitCode = 99;
 13+
 14+@@ -232,6 +234,10 @@ int main(int argc, char *argv[]) {
 15+ for (pg = firstPage; pg <= lastPage; ++pg) {
 16+ w = doc->getPageCropWidth(pg);
 17+ h = doc->getPageCropHeight(pg);
 18++ r = doc->getPageRotate(pg);
 19++ if ((r != 0) && ((r % 180) != 0)) {
 20++ xchg = h; h = w; w = xchg;
 21++ }
 22+ if (multiPage) {
 23+ printf("Page %4d size: %g x %g pts", pg, w, h);
 24+ } else {
Index: branches/vitalif/extensions/PdfHandler/PdfHandler_body.php
@@ -4,6 +4,8 @@
55 *
66 * Inspired by djvuhandler from Tim Starling
77 * Modified and written by Xarax
 8+ * Modified by Vitaliy Filippov (2011):
 9+ * antialiasing, page links, multiple page thumbnails
810 *
911 * This program is free software; you can redistribute it and/or modify
1012 * it under the terms of the GNU General Public License as published by
@@ -21,15 +23,47 @@
2224 * http://www.gnu.org/copyleft/gpl.html
2325 */
2426
 27+class PdfThumbnailImage extends ThumbnailImage {
 28+
 29+ var $startpage, $endpage;
 30+
 31+ function __construct( $file, $url, $width, $height, $path, $page, $endpage ) {
 32+ parent::__construct( $file, $url, $width, $height, $path, $page );
 33+ $this->startpage = $page;
 34+ $this->endpage = $endpage;
 35+ }
 36+
 37+ function toHtml( $options = array() ) {
 38+ if( !empty( $options['file-link'] ) ) {
 39+ // Link the thumbnail to an individual PDF page
 40+ $options['custom-url-link'] = $this->file->getURL() . '#page=' . $this->page;
 41+ }
 42+ if( $this->endpage > $this->startpage ) {
 43+ // Multiple thumbnails requested - useful, for example,
 44+ // for embedding presentations on wiki pages
 45+ $html = '';
 46+ $urlpattern = $this->url;
 47+ for( $this->page = $this->startpage; $this->page <= $this->endpage; $this->page++ ) {
 48+ $this->url = str_replace( '$N', $this->page, $urlpattern );
 49+ $html .= parent::toHtml( $options )."\n";
 50+ }
 51+ $this->url = $urlpattern;
 52+ return $html;
 53+ }
 54+ return parent::toHtml( $options );
 55+ }
 56+
 57+}
 58+
2559 class PdfHandler extends ImageHandler {
2660
2761 function isEnabled() {
28 - global $wgPdfProcessor, $wgPdfPostProcessor, $wgPdfInfo;
 62+ global $wgPdfProcessor, $wgPdfInfo;
2963
30 - if ( !isset( $wgPdfProcessor ) || !isset( $wgPdfPostProcessor ) || !isset( $wgPdfInfo ) ) {
 64+ if ( empty( $wgPdfProcessor ) || empty( $wgPdfInfo ) ) {
3165 wfDebug( "PdfHandler is disabled, please set the following\n" );
3266 wfDebug( "variables in LocalSettings.php:\n" );
33 - wfDebug( "\$wgPdfProcessor, \$wgPdfPostProcessor, \$wgPdfInfo\n" );
 67+ wfDebug( "\$wgPdfProcessor, \$wgPdfInfo\n" );
3468 return false;
3569 }
3670 return true;
@@ -44,8 +78,10 @@
4579 }
4680
4781 function validateParam( $name, $value ) {
48 - if ( in_array( $name, array( 'width', 'height', 'page' ) ) ) {
 82+ if ( $name == 'width' || $name == 'height' ) {
4983 return ( $value <= 0 ) ? false : true;
 84+ } elseif ( $name == 'page' ) {
 85+ return preg_match( '/^\d*(-\d*)?$/s', $value );
5086 } else {
5187 return false;
5288 }
@@ -56,13 +92,16 @@
5793 if ( !isset( $params['width'] ) ) {
5894 return false;
5995 }
 96+ if ( strpos( $page, '-' ) !== false ) {
 97+ $page = '$N';
 98+ }
6099 return "page{$page}-{$params['width']}px";
61100 }
62101
63102 function parseParamString( $str ) {
64103 $m = false;
65104
66 - if ( preg_match( '/^page(\d+)-(\d+)px$/', $str, $m ) ) {
 105+ if ( preg_match( '/^page(\d+|\$N)-(\d+)px$/', $str, $m ) ) {
67106 return array( 'width' => $m[2], 'page' => $m[1] );
68107 }
69108
@@ -89,7 +128,7 @@
90129 }
91130
92131 function doTransform( $image, $dstPath, $dstUrl, $params, $flags = 0 ) {
93 - global $wgPdfProcessor, $wgPdfPostProcessor, $wgPdfHandlerDpi;
 132+ global $wgPdfProcessor, $wgPdfOutputDevice, $wgPdfDpiRatio;
94133
95134 $metadata = $image->getMetadata();
96135
@@ -97,43 +136,73 @@
98137 return $this->doThumbError( @$params['width'], @$params['height'], 'pdf_no_metadata' );
99138 }
100139
 140+ $n = $this->pageCount( $image );
 141+ $page = isset( $params['page'] ) ? $params['page'] : 1;
 142+ $endpage = false;
 143+ if ( strpos( $page, '-' ) !== false ) {
 144+ list( $page, $endpage ) = explode( '-', $page, 2 );
 145+ if ( $page === '' ) {
 146+ $page = 1;
 147+ }
 148+ if ( $endpage === '' ) {
 149+ $endpage = $n;
 150+ }
 151+ $dstUrl = str_replace( 'page%24N', 'page$N', $dstUrl );
 152+ } else {
 153+ $endpage = $page;
 154+ }
 155+ $params['page'] = $page;
 156+
101157 if ( !$this->normaliseParams( $image, $params ) ) {
102158 return new TransformParameterError( $params );
103159 }
104160
 161+ if ( $page > $n || $endpage > $n ) {
 162+ return $this->doTHumbError( $width, $height, 'pdf_page_error' );
 163+ }
 164+
105165 $width = $params['width'];
106166 $height = $params['height'];
107167 $srcPath = $image->getPath();
108 - $page = $params['page'];
109168
110 - if ( $page > $this->pageCount( $image ) ) {
111 - return $this->doTHumbError( $width, $height, 'pdf_page_error' );
112 - }
113 -
114169 if ( $flags & self::TRANSFORM_LATER ) {
115 - return new ThumbnailImage( $image, $dstUrl, $width,
116 - $height, $dstPath, $page );
 170+ return new PdfThumbnailImage( $image, $dstUrl, $width,
 171+ $height, $dstPath, $page, $endpage );
117172 }
118173
119174 if ( !wfMkdirParents( dirname( $dstPath ), null, __METHOD__ ) ) {
120175 return $this->doThumbError( $width, $height, 'thumbnail_dest_directory' );
121176 }
122177
123 - $cmd = '(' . wfEscapeShellArg( $wgPdfProcessor );
124 - $cmd .= " -sDEVICE=jpeg -sOutputFile=- -dFirstPage={$page} -dLastPage={$page}";
125 - $cmd .= " -r{$wgPdfHandlerDpi} -dBATCH -dNOPAUSE -q ". wfEscapeShellArg( $srcPath );
126 - $cmd .= " | " . wfEscapeShellArg( $wgPdfPostProcessor );
127 - $cmd .= " -depth 8 -resize {$width} - ";
128 - $cmd .= wfEscapeShellArg( $dstPath ) . ")";
129 - $cmd .= " 2>&1";
 178+ $dpi = $wgPdfDpiRatio * $this->getPageDPIForWidth( $image, $page, $width );
130179
 180+ // GhostScript fails (sometimes even crashes) if output filename length is > 255 bytes.
 181+ // Sometimes even when it's shorter. Workaround it by generating files in temporary directory.
 182+ $dst = tempnam( wfTempDir(), 'pdf-' );
 183+ if ( $endpage > $page ) {
 184+ $dst .= '%d';
 185+ }
 186+ $cmd = "$wgPdfProcessor -dUseCropBox -dTextAlphaBits=4 -dGraphicsAlphaBits=4".
 187+ " -sDEVICE=$wgPdfOutputDevice " . wfEscapeShellArg( "-sOutputFile=$dst" ) .
 188+ " -dFirstPage=$page -dLastPage=$endpage -r$dpi -dSAFER -dBATCH -dNOPAUSE -q " .
 189+ wfEscapeShellArg( $srcPath ) . " 2>&1";
 190+
131191 wfProfileIn( 'PdfHandler' );
132192 wfDebug( __METHOD__ . ": $cmd\n" );
133193 $retval = '';
134194 $err = wfShellExec( $cmd, $retval );
135195 wfProfileOut( 'PdfHandler' );
136196
137 - $removed = $this->removeBadFile( $dstPath, $retval );
 197+ // Move files from temporary directory to the destination
 198+ $removed = false;
 199+ for ( $i = $page; $i <= $endpage; $i++ ) {
 200+ $tmp = sprintf( $dst, $i );
 201+ $real = str_replace( '$N', $i, $dstPath );
 202+ if ( file_exists( $tmp ) ) {
 203+ rename( $tmp, $real );
 204+ }
 205+ $removed = $removed || $this->removeBadFile( $real, $retval );
 206+ }
138207
139208 if ( $retval != 0 || $removed ) {
140209 wfDebugLog( 'thumbnail',
@@ -141,7 +210,7 @@
142211 wfHostname(), $retval, trim( $err ), $cmd ) );
143212 return new MediaTransformError( 'thumbnail_error', $width, $height, $err );
144213 } else {
145 - return new ThumbnailImage( $image, $dstUrl, $width, $height, $dstPath, $page );
 214+ return new PdfThumbnailImage( $image, $dstUrl, $width, $height, $dstPath, $page, $endpage );
146215 }
147216 }
148217
@@ -162,19 +231,17 @@
163232 return $image->pdfMetaArray;
164233 }
165234
166 - $metadata = $image->getMetadata();
 235+ wfProfileIn( __METHOD__ );
 236+ $metadata = $this->isMetadataValid( $image, $image->getMetadata() );
 237+ wfProfileOut( __METHOD__ );
167238
168 - if ( !$this->isMetadataValid( $image, $metadata ) ) {
 239+ if ( !$metadata ) {
 240+ $image->purgeCache();
169241 wfDebug( "Pdf metadata is invalid or missing, should have been fixed in upgradeRow\n" );
170242 return false;
171243 }
172244
173 - wfProfileIn( __METHOD__ );
174 - wfSuppressWarnings();
175 - $image->pdfMetaArray = unserialize( $metadata );
176 - wfRestoreWarnings();
177 - wfProfileOut( __METHOD__ );
178 -
 245+ $image->pdfMetaArray = $metadata;
179246 return $image->pdfMetaArray;
180247 }
181248
@@ -198,7 +265,13 @@
199266 }
200267
201268 function isMetadataValid( $image, $metadata ) {
202 - return !empty( $metadata ) && $metadata != serialize( array() );
 269+ wfSuppressWarnings();
 270+ $metadata = unserialize( $metadata );
 271+ wfRestoreWarnings();
 272+ if ( !empty( $metadata ) && ( empty( $metadata['pages'] ) || empty( $metadata['pages'][0] ) ) ) {
 273+ return $metadata;
 274+ }
 275+ return NULL;
203276 }
204277
205278 function pageCount( $image ) {
@@ -206,7 +279,7 @@
207280 if ( !$data ) {
208281 return false;
209282 }
210 - return intval( $data['Pages'] );
 283+ return count( $data['pages'] );
211284 }
212285
213286 function getPageDimensions( $image, $page ) {
@@ -214,6 +287,12 @@
215288 return PdfImage::getPageSize( $data, $page );
216289 }
217290
 291+ function getPageDPIForWidth( $image, $page, $width ) {
 292+ $data = $this->getMetaArray( $image );
 293+ $pagesize = PdfImage::getPageSize( $data, $page );
 294+ return intval( $width * 72 / $pagesize['width'] );
 295+ }
 296+
218297 function getPageText( $image, $page ) {
219298 $data = $this->getMetaArray( $image, true );
220299 if ( !$data ) {
Index: branches/vitalif/extensions/PdfHandler/PdfHandler.php
@@ -4,7 +4,7 @@
55 *
66 * @file
77 * @ingroup Extensions
8 - * @author Martin Seidel (Xarax) <jodeldi@gmx.de>
 8+ * @author Martin Seidel (Xarax) <jodeldi@gmx.de>, Vitaliy Filippov (vitalif) <vitalif@yourcmc.ru>
99 * @copyright Copyright © 2007 Martin Seidel (Xarax) <jodeldi@gmx.de>
1010 *
1111 * This program is free software; you can redistribute it and/or modify
@@ -23,6 +23,13 @@
2424 * http://www.gnu.org/copyleft/gpl.html
2525 */
2626
 27+#########################################################################
 28+# WARNING: pdfinfo does not respect page rotation, so if you don't want #
 29+# to see ugly thumbnails with incorrect aspect ratio on landscape PDFs, #
 30+# you must build your own poppler-utils with "poppler-utils.diff" patch #
 31+# applied. #
 32+#########################################################################
 33+
2734 # Not a valid entry point, skip unless MEDIAWIKI is defined
2835 if ( !defined( 'MEDIAWIKI' ) ) {
2936 echo 'PdfHandler extension';
@@ -32,27 +39,32 @@
3340 $wgExtensionCredits['media'][] = array(
3441 'path' => __FILE__,
3542 'name' => 'PDF Handler',
36 - 'author' => array( 'Martin Seidel', 'Mike Połtyn'),
 43+ 'author' => array( 'Martin Seidel', 'Mike Połtyn', 'Vitaliy Filippov' ),
3744 'descriptionmsg' => 'pdf-desc',
3845 'url' => 'http://www.mediawiki.org/wiki/Extension:PdfHandler',
3946 );
4047
4148 // External program requirements...
42 -$wgPdfProcessor = 'gs';
43 -$wgPdfPostProcessor = 'convert';
44 -$wgPdfInfo = 'pdfinfo';
45 -$wgPdftoText = 'pdftotext';
 49+if ( !isset( $wgPdfProcessor ) ) $wgPdfProcessor = 'gs';
 50+if ( !isset( $wgPdfInfo ) ) $wgPdfInfo = 'pdfinfo';
 51+if ( !isset( $wgPdftoText ) ) $wgPdftoText = 'pdftotext';
4652
47 -$wgPdfOutputExtension = 'jpg';
48 -$wgPdfHandlerDpi = 150;
 53+// GhostScript's output device name and file extension
 54+if ( !isset( $wgPdfOutputDevice ) ) $wgPdfOutputDevice = 'jpeg';
 55+if ( !isset( $wgPdfOutputExtension ) ) $wgPdfOutputExtension = 'jpg';
4956
 57+// Now PdfHandler selects output DPI by itself, based on requested image size
 58+// If you want more quality, specify a power of 2 here.
 59+// Generated images will be downscaled by browser.
 60+if ( !isset( $wgPdfDpiRatio ) ) $wgPdfDpiRatio = 2;
 61+
5062 // This setting, if enabled, will put creating thumbnails into a job queue,
5163 // so they do not have to be created on-the-fly,
5264 // but rather inconspicuously during normal wiki browsing
53 -$wgPdfCreateThumbnailsInJobQueue = false;
 65+if ( !isset( $wgPdfCreateThumbnailsInJobQueue ) ) $wgPdfCreateThumbnailsInJobQueue = false;
5466
55 -// To upload new PDF files you'll need to do this too:
56 -// $wgFileExtensions[] = 'pdf';
 67+// Enable PDF upload by default. If you want to forbid PDF upload - do so in your LocalSettings.php
 68+$wgFileExtensions[] = 'pdf';
5769
5870 $dir = dirname( __FILE__ ) . '/';
5971 $wgExtensionMessagesFiles['PdfHandler'] = $dir . 'PdfHandler.i18n.php';
Index: branches/vitalif/extensions/PdfHandler/PdfHandler.image.php
@@ -48,8 +48,6 @@
4949 }
5050
5151 public static function getPageSize( $data, $page ) {
52 - global $wgPdfHandlerDpi;
53 -
5452 if( isset( $data['pages'][$page]['Page size'] ) ) {
5553 $o = $data['pages'][$page]['Page size'];
5654 } elseif( isset( $data['Page size'] ) ) {
@@ -62,9 +60,9 @@
6361 $size = explode( 'x', $o, 2 );
6462
6563 if ( $size ) {
66 - $width = intval( trim( $size[0] ) / 72 * $wgPdfHandlerDpi );
 64+ $width = intval( trim( $size[0] ) );
6765 $height = explode( ' ', trim( $size[1] ), 2 );
68 - $height = intval( trim( $height[0] ) / 72 * $wgPdfHandlerDpi );
 66+ $height = intval( trim( $height[0] ) );
6967
7068 return array(
7169 'width' => $width,

Follow-up revisions

RevisionCommit summaryAuthorDate
r101066Revert stuff vulnerable to register_globalsvitalif21:41, 27 October 2011

Comments

#Comment by Johnduhart (talk | contribs)   16:08, 21 October 2011
+if ( !isset( $wgPdfProcessor ) )

This opens up the extension to Register global attacks, please keep it the way it was :) (Users should be modifying settings after inclusion)

#Comment by Johnduhart (talk | contribs)   16:14, 21 October 2011

Oh and poppler-utils.diff needs eol-style=native. Be sure to setup auto props.

#Comment by Brion VIBBER (talk | contribs)   23:19, 21 October 2011

This seems to munge a bunch of stuff together which should be looked at separately.

  • multiple page thumbnails -- what does this even mean? any usability issues? safety? will this work with the thumbnail 404 handlers etc? do we want this at all?
  • rotation fixing -- probably nice but needs testing
  • direct rendering -- unsure whether this is good or not; why didn't we do it direct in the first place? any issues with file size, sharpness etc?
  • lots of register_globals vulnerabilities -- needs immediate revert
  • there appears to be a patch for a third-party utility, which it sounds kinda like it's required to make the rotation actually work? if so this is kind of a killer as it makes deployment MUCH harder
#Comment by P858snake (talk | contribs)   06:34, 10 December 2011

What FIXME stuff is left? the register_globals appear to have been done in a follow up and the rest of the points seem more of a "things we should look at" list compared a "list of broken things" list.

Status & tagging log