r50026 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r50025‎ | r50026 | r50027 >
Date:22:54, 28 April 2009
Author:brion
Status:ok (Comments)
Tags:
Comment:
Revert r49669, r49670 "extract text layer from djvu file (see bug 18046)"

I don't really like the use of sed here; it's an extra Unix dependency which seems unnecessary, when we could trivially do the escaping in the code.

The ajax fetch added in r49670 also looks like a big security hole:
* Manual use of curl should be avoided; it may not be installed, and 'localhost' may not do anything useful.
* Further this appears to be a general "fetch any foreign URL and pass the data through" which is a serious security hole.
Modified paths:
  • /trunk/extensions/ProofreadPage/ProofreadPage.php (modified) (history)
  • /trunk/extensions/ProofreadPage/proofread.js (modified) (history)
  • /trunk/phase3/includes/DefaultSettings.php (modified) (history)
  • /trunk/phase3/includes/media/DjVu.php (modified) (history)

Diff [purge]

Index: trunk/phase3/includes/media/DjVu.php
@@ -52,8 +52,6 @@
5353 $m = false;
5454 if ( preg_match( '/^page(\d+)-(\d+)px$/', $str, $m ) ) {
5555 return array( 'width' => $m[2], 'page' => $m[1] );
56 - } else if ( preg_match( '/^page(\d+)-djvutxt$/', $str, $m ) ) {
57 - return array( 'djvutxt' => 1, 'page' => $m[1] );
5856 } else {
5957 return false;
6058 }
@@ -66,22 +64,8 @@
6765 );
6866 }
6967
70 -
71 - function normaliseParams( $image, &$params ) {
72 - global $wgDjvuTxt;
73 - if( $wgDjvuTxt && isset($params['djvutxt']) && $params['djvutxt']) {
74 - if ( !isset( $params['page'] ) ) {
75 - $params['page'] = 1;
76 - }
77 - $params['width'] = 0;
78 - $params['height'] = 0;
79 - return true;
80 - }
81 - else return parent::normaliseParams( $image, $params );
82 - }
83 -
8468 function doTransform( $image, $dstPath, $dstUrl, $params, $flags = 0 ) {
85 - global $wgDjvuRenderer, $wgDjvuPostProcessor, $wgDjvuTxt, $wgSed;
 69+ global $wgDjvuRenderer, $wgDjvuPostProcessor;
8670
8771 // Fetch XML and check it, to give a more informative error message than the one which
8872 // normaliseParams will inevitably give.
@@ -112,22 +96,12 @@
11397
11498 # Use a subshell (brackets) to aggregate stderr from both pipeline commands
11599 # before redirecting it to the overall stdout. This works in both Linux and Windows XP.
116 -
117 - if( $params['djvutxt'] && $wgDjvuTxt && $wgSed ) {
118 - #Read text from djvu
119 - $cmd = '(' . wfEscapeShellArg( $wgDjvuTxt ) . " --page={$page} " . wfEscapeShellArg( $srcPath );
120 - #Escape < > & characters
121 - $cmd .= ' | ' . wfEscapeShellArg( $wgSed ) . ' "s/\&/\&amp;/g ; s/</\&lt;/g ; s/>/\&gt;/g ; s/\"/\&quot;/g "';
122 - $cmd .= ' > ' . wfEscapeShellArg($dstPath) . ') 2>&1';
 100+ $cmd = '(' . wfEscapeShellArg( $wgDjvuRenderer ) . " -format=ppm -page={$page} -size={$width}x{$height} " .
 101+ wfEscapeShellArg( $srcPath );
 102+ if ( $wgDjvuPostProcessor ) {
 103+ $cmd .= " | {$wgDjvuPostProcessor}";
123104 }
124 - else {
125 - $cmd = '(' . wfEscapeShellArg( $wgDjvuRenderer ) . " -format=ppm -page={$page} -size={$width}x{$height} " .
126 - wfEscapeShellArg( $srcPath );
127 - if ( $wgDjvuPostProcessor ) {
128 - $cmd .= " | {$wgDjvuPostProcessor}";
129 - }
130 - $cmd .= ' > ' . wfEscapeShellArg($dstPath) . ') 2>&1';
131 - }
 105+ $cmd .= ' > ' . wfEscapeShellArg($dstPath) . ') 2>&1';
132106 wfProfileIn( 'ddjvu' );
133107 wfDebug( __METHOD__.": $cmd\n" );
134108 $err = wfShellExec( $cmd, $retval );
Index: trunk/phase3/includes/DefaultSettings.php
@@ -1898,12 +1898,6 @@
18991899 $wgDiff = '/usr/bin/diff';
19001900
19011901 /**
1902 - * Path to the GNU sed utility. Required by $wgDjvuTxt.
1903 - */
1904 -#$wgSed = '/bin/sed';
1905 -$wgSed = null;
1906 -
1907 -/**
19081902 * We can also compress text stored in the 'text' table. If this is set on, new
19091903 * revisions will be compressed on page save if zlib support is available. Any
19101904 * compressed revisions will be decompressed on load regardless of this setting
@@ -3541,13 +3535,6 @@
35423536 $wgDjvuRenderer = null;
35433537
35443538 /**
3545 - * Path of the djvutxt DJVU text extraction utility
3546 - * Enable this and $wgSed, $wgDjvuDump to enable text layer extraction from djvu files
3547 - */
3548 -# $wgDjvuTxt = 'djvutxt';
3549 -$wgDjvuTxt = null;
3550 -
3551 -/**
35523539 * Path of the djvutoxml executable
35533540 * This works like djvudump except much, much slower as of version 3.5.
35543541 *
Index: trunk/extensions/ProofreadPage/ProofreadPage.php
@@ -31,8 +31,6 @@
3232 );
3333
3434 $wgExtensionFunctions[] = "pr_main";
35 -$wgAjaxExportList[] = "pr_fetch_djvutxt";
36 -
3735 function pr_main() {
3836 global $wgParser;
3937 $wgParser->setHook( "pagelist", "pr_renderPageList" );
@@ -40,29 +38,11 @@
4139 }
4240
4341
44 -/*
45 - * Fetch Djvu text with curl
46 - */
47 -function pr_fetch_djvutxt( $url ) {
4842
49 - if($url[0]=='/') $url = "http://localhost" . $url;
5043
51 - $ch = curl_init( $url );
52 - curl_setopt( $ch, CURLOPT_HEADER, false );
53 - curl_setopt( $ch, CURLOPT_RETURNTRANSFER, true);
54 - $text = curl_exec( $ch );
 44+# Bump the version number every time you change proofread.js
 45+$wgProofreadPageVersion = 18;
5546
56 - $errno = curl_errno( $ch );
57 - $httpCode = curl_getinfo( $ch, CURLINFO_HTTP_CODE );
58 - //$contentType = curl_getinfo( $ch, CURLINFO_CONTENT_TYPE );
59 -
60 - curl_close($ch);
61 - if($errno==0 && ( $httpCode==200 || $httpCode==404 ) ) {
62 - return $text;
63 - }
64 - return "";
65 -}
66 -
6747 /**
6848 *
6949 * Query the database to find if the current page is referred in an Index page.
Index: trunk/extensions/ProofreadPage/proofread.js
@@ -47,7 +47,6 @@
4848
4949
5050
51 -
5251 function pr_image_url(requested_width){
5352 var image_url;
5453
@@ -317,7 +316,7 @@
318317 table.style.cssText = "width:100%;";
319318
320319 //fill table
321 - if(self.proofreadpage_default_layout=='horizontal')
 320+ if(self.proofreadpage_default_layout=='horizontal')
322321 pr_fill_table(true);
323322 else
324323 pr_fill_table(false);
@@ -494,41 +493,18 @@
495494 addOnloadHook(pr_init_tabs);
496495
497496
498 -/*fetch djvu content with ajax*/
499 -function pr_fetch_djvutxt(){
500 - var text_url = proofreadPageThumbURL.replace('##WIDTH##px',"djvutxt").replace(".jpg",".txt");
501 - sajax_do_call( 'pr_fetch_djvutxt', [ text_url ], pr_init_textbox );
502 -}
503 -
504 -
505 -function pr_init_textbox(xmlhttp) {
506 - if (xmlhttp == null) return;
507 - if (xmlhttp.readyState == 4) {
508 - document.getElementById("wpTextbox1").value = xmlhttp.responseText;
509 - }
510 -}
511 -
512 -
513 -function pr_onload(){
514 -
515 - if(self.proofreadPageIsEdit){
516 - if(!document.getElementById("wpTextbox1") ) return;
 497+function pr_initzoom(){
 498+ if(document.getElementById("wpTextbox1")){
517499 if(self.pr_horiz)
518500 document.getElementById("wpTextbox1").style.cssText = "height:"+self.vertHeight+"px";
519501 else
520502 document.getElementById("wpTextbox1").style.cssText = "height:"+(self.TextBoxHeight-7)+"px";
521503 pr_zoom(0);
522 -
523 - //to enable, set proofreadpage_djvutxt=1
524 - if(self.proofreadpage_djvutxt) {
525 - if( document.getElementById("wpTextbox1").value == "" ) pr_fetch_djvutxt();
526 - }
527504 }
528505 }
529 -hookEvent("load", pr_onload );
 506+hookEvent("load", pr_initzoom );
530507
531508
532 -
533509 /*Quality buttons*/
534510
535511 function pr_add_quality(form,value){

Follow-up revisions

RevisionCommit summaryAuthorDate
r50050extract djvu text (bug 18046); escape possible script with htmlspecialchars i...thomasv19:33, 29 April 2009
r50051fetch djvu text (bug 18046); using parser hook instead of ajax, and Http::get...thomasv19:37, 29 April 2009

Past revisions this follows-up on

RevisionCommit summaryAuthorDate
r49669extract text layer from djvu file (see bug 18046)thomasv18:00, 20 April 2009
r49670fetch djvu textthomasv18:01, 20 April 2009

Comments

#Comment by ThomasV (talk | contribs)   19:40, 29 April 2009

ok, I did it again, without sed and without ajax : see r50050 and r50051.

Status & tagging log