r50050 MediaWiki - Code Review archive

Repository:MediaWiki
Revision: < r50049 | r50050 | r50051 >
Date:19:33, 29 April 2009
Author:thomasv
Status:reverted (Comments)
Tags:
Comment:
extract djvu text (bug 18046); escape possible script with htmlspecialchars instead of sed
Modified paths:
  • /trunk/phase3/includes/DefaultSettings.php (modified) (history)
  • /trunk/phase3/includes/media/DjVu.php (modified) (history)

Diff [purge]

Index: trunk/phase3/includes/DefaultSettings.php
@@ -3535,6 +3535,13 @@
3536 3536 $wgDjvuRenderer = null;
3537 3537
3538 3538 /**
 3539+ * Path of the djvutxt DJVU text extraction utility
 3540+ * Enable this and $wgDjvuDump to enable text layer extraction from djvu files
 3541+ */
 3542+# $wgDjvuTxt = 'djvutxt';
 3543+$wgDjvuTxt = null;
 3544+
 3545+/**
3539 3546 * Path of the djvutoxml executable
3540 3547 * This works like djvudump except much, much slower as of version 3.5.
3541 3548 *
Index: trunk/phase3/includes/media/DjVu.php
@@ -52,6 +52,8 @@
53 53 $m = false;
54 54 if ( preg_match( '/^page(\d+)-(\d+)px$/', $str, $m ) ) {
55 55 return array( 'width' => $m[2], 'page' => $m[1] );
 56+ } else if ( preg_match( '/^page(\d+)-djvutxt$/', $str, $m ) ) {
 57+ return array( 'djvutxt' => 1, 'page' => $m[1] );
56 58 } else {
57 59 return false;
58 60 }
@@ -64,8 +66,21 @@
65 67 );
66 68 }
67 69
 70+ function normaliseParams( $image, &$params ) {
 71+ global $wgDjvuTxt;
 72+ if( $params['djvutxt'] && $wgDjvuTxt) {
 73+ if ( !isset( $params['page'] ) ) {
 74+ $params['page'] = 1;
 75+ }
 76+ $params['width'] = 0;
 77+ $params['height'] = 0;
 78+ return true;
 79+ }
 80+ else return parent::normaliseParams( $image, $params );
 81+ }
 82+
68 83 function doTransform( $image, $dstPath, $dstUrl, $params, $flags = 0 ) {
69 - global $wgDjvuRenderer, $wgDjvuPostProcessor;
 84+ global $wgDjvuRenderer, $wgDjvuPostProcessor, $wgDjvuTxt;
70 85
71 86 // Fetch XML and check it, to give a more informative error message than the one which
72 87 // normaliseParams will inevitably give.
@@ -94,18 +109,36 @@
95 110 return new MediaTransformError( 'thumbnail_error', $width, $height, wfMsg( 'thumbnail_dest_directory' ) );
96 111 }
97 112
98 - # Use a subshell (brackets) to aggregate stderr from both pipeline commands
99 - # before redirecting it to the overall stdout. This works in both Linux and Windows XP.
100 - $cmd = '(' . wfEscapeShellArg( $wgDjvuRenderer ) . " -format=ppm -page={$page} -size={$width}x{$height} " .
101 - wfEscapeShellArg( $srcPath );
102 - if ( $wgDjvuPostProcessor ) {
103 - $cmd .= " | {$wgDjvuPostProcessor}";
 113+ if( $params['djvutxt'] && $wgDjvuTxt ) {
 114+ # Extract djvu text
 115+ $cmd = wfEscapeShellArg( $wgDjvuTxt ) . " --page={$page} " . wfEscapeShellArg( $srcPath ) ;
 116+ wfProfileIn( 'djvutxt' );
 117+ wfDebug( __METHOD__.": $cmd\n" );
 118+ $err = wfShellExec( $cmd, $retval );
 119+ wfProfileOut( 'djvutxt' );
 120+ # Escape html characters
 121+ $txt = htmlspecialchars( $err );
 122+ # Write result to file
 123+ if($retval == 0) {
 124+ $f = fopen($dstPath, 'w');
 125+ fwrite($f, $txt);
 126+ fclose($f);
 127+ }
104 128 }
105 - $cmd .= ' > ' . wfEscapeShellArg($dstPath) . ') 2>&1';
106 - wfProfileIn( 'ddjvu' );
107 - wfDebug( __METHOD__.": $cmd\n" );
108 - $err = wfShellExec( $cmd, $retval );
109 - wfProfileOut( 'ddjvu' );
 129+ else {
 130+ # Use a subshell (brackets) to aggregate stderr from both pipeline commands
 131+ # before redirecting it to the overall stdout. This works in both Linux and Windows XP.
 132+ $cmd = '(' . wfEscapeShellArg( $wgDjvuRenderer ) . " -format=ppm -page={$page} -size={$width}x{$height} " .
 133+ wfEscapeShellArg( $srcPath );
 134+ if ( $wgDjvuPostProcessor ) {
 135+ $cmd .= " | {$wgDjvuPostProcessor}";
 136+ }
 137+ $cmd .= ' > ' . wfEscapeShellArg($dstPath) . ') 2>&1';
 138+ wfProfileIn( 'ddjvu' );
 139+ wfDebug( __METHOD__.": $cmd\n" );
 140+ $err = wfShellExec( $cmd, $retval );
 141+ wfProfileOut( 'ddjvu' );
 142+ }
110 143
111 144 $removed = $this->removeBadFile( $dstPath, $retval );
112 145 if ( $retval != 0 || $removed ) {

Follow-up revisions

Revision | Commit summary | Author | Date
r50051 | fetch djvu text (bug 18046); using parser hook instead of ajax, and Http::get... | thomasv | 19:37, 29 April 2009
r51399 | reverting r50050. djvu text should be stored in img_metadata (per Tim) | thomasv | 13:07, 3 June 2009

Past revisions this follows-up on

Revision | Commit summary | Author | Date
r49669 | extract text layer from djvu file (see bug 18046) | thomasv | 18:00, 20 April 2009
r50026 | Revert r49669, r49670 "extract text layer from djvu file (see bug 18046)"... | brion | 22:54, 28 April 2009

Comments

#Comment by ThomasV (talk | contribs)   13:08, 3 June 2009

reverted in r51399 (except the addition to DefaultSettings)

Status & tagging log