r56413 MediaWiki - Code Review archive

Repository:MediaWiki
Revision: < r56412 | r56413 | r56414 >
Date:13:50, 16 September 2009
Author:thomasv
Status:ok
Tags:
Comment:
extract text layer from pdf
Modified paths:
  • /trunk/extensions/PdfHandler/PdfHandler.image.php (modified) (history)
  • /trunk/extensions/PdfHandler/PdfHandler.php (modified) (history)
  • /trunk/extensions/PdfHandler/PdfHandler_body.php (modified) (history)

Diff [purge]

Index: trunk/extensions/PdfHandler/PdfHandler_body.php
@@ -202,4 +202,19 @@
203203 $data = $this->getMetaArray( $image );
204204 return PdfImage::getPageSize( $data, $page );
205205 }
 206+
 207+ function getPageText( $image, $page ){
 208+ $data = $this->getMetaArray( $image, true );
 209+ if ( !$data ) {
 210+ return false;
 211+ }
 212+ if( ! isset( $data['text'] ) ) {
 213+ return false;
 214+ }
 215+ if( ! isset( $data['text'][$page-1] ) ) {
 216+ return false;
 217+ }
 218+ return $data['text'][$page-1];
 219+ }
 220+
206221 }
Index: trunk/extensions/PdfHandler/PdfHandler.php
@@ -39,6 +39,7 @@
4040 $wgPdfProcessor = 'gs';
4141 $wgPdfPostProcessor = 'convert';
4242 $wgPdfInfo = 'pdfinfo';
 43+$wgPdftoText = 'pdftotext';
4344
4445 $wgPdfOutputExtension = "jpg";
4546 $wgPdfHandlerDpi = 150;
Index: trunk/extensions/PdfHandler/PdfHandler.image.php
@@ -79,7 +79,7 @@
8080 }
8181
8282 public function retrieveMetaData() {
83 - global $wgPdfInfo;
 83+ global $wgPdfInfo, $wgPdftoText;
8484
8585 if ( $wgPdfInfo ) {
8686 wfProfileIn( 'pdfinfo' );
@@ -93,6 +93,25 @@
9494 } else {
9595 $data = null;
9696 }
 97+
 98+ # Read text layer
 99+ if ( isset( $wgPdftoText ) ) {
 100+ wfProfileIn( 'pdftotext' );
 101+ $cmd = wfEscapeShellArg( $wgPdftoText ) . ' '. wfEscapeShellArg( $this->mFilename ) . ' - ';
 102+ wfDebug( __METHOD__.": $cmd\n" );
 103+ $txt = wfShellExec( $cmd, $retval );
 104+ wfProfileOut( 'pdftotext' );
 105+ if( $retval == 0) {
 106+ # Get rid of invalid UTF-8, strip control characters
 107+ wfSuppressWarnings();
 108+ $txt = iconv( "UTF-8","UTF-8//IGNORE", $txt );
 109+ wfRestoreWarnings();
 110+ $txt = preg_replace( "/[\013\035\037]/", "", $txt );
 111+ $txt = htmlspecialchars($txt);
 112+ $pages = preg_split("/\f/s", $txt );
 113+ $data['text'] = $pages;
 114+ }
 115+ }
97116 return $data;
98117 }
99118

Follow-up revisions

Revision | Commit summary | Author | Date
r57266 | Cleanup for r56413 - PDF text extraction support:... | brion | 23:29, 1 October 2009

Status & tagging log