Index: trunk/extensions/PdfHandler/PdfHandler_body.php |
— | — | @@ -202,4 +202,19 @@ |
203 | 203 | $data = $this->getMetaArray( $image ); |
204 | 204 | return PdfImage::getPageSize( $data, $page ); |
205 | 205 | } |
| 206 | + |
| 207 | + /** |
| 208 | + * Return the stored text of one page, or false when it is not available. |
| 209 | + * @param $image passed through to getMetaArray() |
| 210 | + * @param $page page number; the text of page N is read from index N-1 |
| 211 | + */ |
| 212 | + function getPageText( $image, $page ){ |
| 213 | + $meta = $this->getMetaArray( $image, true ); |
| 214 | + if ( !$meta || !isset( $meta['text'] ) ) { |
| 215 | + return false; |
| 216 | + } |
| 217 | + $index = $page - 1; |
| 218 | + return isset( $meta['text'][$index] ) ? $meta['text'][$index] : false; |
| 219 | + } |
| 220 | + |
206 | 221 | } |
Index: trunk/extensions/PdfHandler/PdfHandler.php |
— | — | @@ -39,6 +39,7 @@ |
40 | 40 | $wgPdfProcessor = 'gs'; |
41 | 41 | $wgPdfPostProcessor = 'convert'; |
42 | 42 | $wgPdfInfo = 'pdfinfo'; |
| 43 | +$wgPdftoText = 'pdftotext'; /* command run as "<cmd> <file> -" to read the PDF text layer */ |
43 | 44 | |
44 | 45 | $wgPdfOutputExtension = "jpg"; |
45 | 46 | $wgPdfHandlerDpi = 150; |
Index: trunk/extensions/PdfHandler/PdfHandler.image.php |
— | — | @@ -79,7 +79,7 @@ |
80 | 80 | } |
81 | 81 | |
82 | 82 | public function retrieveMetaData() { |
83 | | - global $wgPdfInfo; |
| 83 | + global $wgPdfInfo, $wgPdftoText; |
84 | 84 | |
85 | 85 | if ( $wgPdfInfo ) { |
86 | 86 | wfProfileIn( 'pdfinfo' ); |
— | — | @@ -93,6 +93,25 @@ |
94 | 94 | } else { |
95 | 95 | $data = null; |
96 | 96 | } |
| 97 | + |
| 98 | + # Read the text layer with pdftotext. Test truthiness rather than isset(): |
| 99 | + # the variable is declared global, so a false/'' setting must skip this step. |
| 100 | + if ( $wgPdftoText ) { |
| 101 | + wfProfileIn( 'pdftotext' ); |
| 102 | + $cmd = wfEscapeShellArg( $wgPdftoText ) . ' '. wfEscapeShellArg( $this->mFilename ) . ' - '; |
| 103 | + wfDebug( __METHOD__.": $cmd\n" ); |
| 104 | + $txt = wfShellExec( $cmd, $retval ); |
| 105 | + wfProfileOut( 'pdftotext' ); |
| 106 | + if( $retval == 0) { |
| 107 | + # Drop invalid UTF-8 and stray control characters, escape for HTML, |
| 108 | + # then split on form feeds into a zero-based list stored under 'text' |
| 109 | + wfSuppressWarnings(); |
| 110 | + $txt = iconv( "UTF-8","UTF-8//IGNORE", $txt ); |
| 111 | + wfRestoreWarnings(); |
| 112 | + $txt = preg_replace( "/[\013\035\037]/", "", $txt ); |
| 113 | + $data['text'] = explode( "\f", htmlspecialchars($txt) ); |
| 114 | + } |
| 115 | + } |
97 | 116 | return $data; |
98 | 117 | } |
99 | 118 | |