Index: trunk/extensions/DataTransclusion/ImportMAB2.php |
— | — | @@ -18,17 +18,23 @@ |
19 | 19 | parent::__construct(); |
20 | 20 | |
21 | 21 | $this->addArg( "name", "name of a transclusion data source, as specified in \$wgDataTransclusionSources", true ); |
22 | | - $this->addArg( "dir", "directory containing MAB files", true ); |
| 22 | + $this->addArg( "file/dir", "directory containing MAB files, or a single MAB file, or - for stdin", true ); |
| 23 | + |
23 | 24 | $this->addArg( "blob_table", "database table for data blobs, without prefix", true ); |
24 | 25 | $this->addArg( "index_table", "database table for index entries, without prefix", true ); |
25 | 26 | |
26 | 27 | $this->addOption( "create", "create database tables if they do not exist", false, false ); |
27 | 28 | $this->addOption( "truncate", "truncate (empty) database tables", false, false ); |
28 | 29 | $this->addOption( "prefix", "database table prefix. May contain a period (\".\") to reference tables in another database. If not given, the wiki's table prefix will be used", false, true ); |
| 30 | + |
29 | 31 | $this->addOption( "recursive", "recurse into subdirectories while importing MAB files", false, false ); |
| 32 | + |
30 | 33 | $this->addOption( "noblob", "don't write blob data, import index fields only", false, false ); |
31 | 34 | $this->addOption( "limit", "max number of files to process", false, true ); |
32 | 35 | $this->addOption( "debug", "don't write to the database, dump to console instead", false, false ); |
| 36 | + |
| 37 | + $this->addOption( "multi-record", "read multiple records from a single file. Records may be separated by special lines matching --record-separator; if --record-separator is not given, all records are expected to start with field number 001.", false, false ); |
| 38 | + $this->addOption( "record-separator", "regular expression for lines separating records in a multi-record file. Implies --multi-record", false, true ); |
33 | 39 | } |
34 | 40 | |
35 | 41 | public function createTables( ) { |
— | — | @@ -72,6 +78,9 @@ |
73 | 79 | $recursive = $this->hasOption( 'recursive' ); |
74 | 80 | $limit = (int)$this->getOption( 'limit' ); |
75 | 81 | |
| 82 | + $this->recordSeparator = $this->getOption( 'record-separator' ); |
| 83 | + $this->multiRecord = $this->recordSeparator || $this->hasOption( 'multi-record' ); |
| 84 | + |
76 | 85 | $src = $this->mArgs[0]; |
77 | 86 | $dir = $this->mArgs[1]; |
78 | 87 | $this->blob_table = $this->mArgs[2]; |
— | — | @@ -111,7 +120,13 @@ |
112 | 121 | } |
113 | 122 | } |
114 | 123 | |
115 | | - $this->importDir( $dir, $recursive, $limit ); |
| 124 | + if ( $dir === '-' ) $dir = "php://stdin"; // "-" selects standard input; any other argument is used as given |
| 125 | + |
| 126 | + if ( is_dir( $dir ) ) { |
| 127 | + $this->importDir( $dir, $recursive, $limit ); |
| 128 | + } else { |
| 129 | + $this->importMabFile( $dir ); |
| 130 | + } |
116 | 131 | } |
117 | 132 | |
118 | 133 | public function importDir( $dir, $recursive = false, $limit = 0 ) { |
— | — | @@ -138,27 +153,12 @@ |
139 | 154 | continue; |
140 | 155 | } |
141 | 156 | |
142 | | - $rec = $this->readMabFile( $dir . $file ); |
| 157 | + $ok = $this->importMabFile( $dir . $file ); |
143 | 158 | |
144 | | - if ( !$rec ) { |
| 159 | + if ( !$ok ) { |
145 | 160 | $this->output( "error processing $file\n" ); |
146 | | - } else { |
147 | | - $ids = $this->getIds($rec); |
| 161 | + } |
148 | 162 | |
149 | | - if ( $ids ) { |
150 | | - if ( $this->debug ) { |
151 | | - var_export( $ids ); |
152 | | - if ( !$this->noblob ) var_export( $rec ); |
153 | | - print "------------------------------------\n"; |
154 | | - } else { |
155 | | - $this->output( "importing file $file\n" ); |
156 | | - $this->storeRecord($rec, $ids); |
157 | | - } |
158 | | - } else { |
159 | | - $this->output( "skipping file $file\n" ); |
160 | | - } |
161 | | - } |
162 | | - |
163 | 163 | if ( $limit > 0 ) { |
164 | 164 | $limit -= 1; |
165 | 165 | if ( $limit <= 0 ) break; |
— | — | @@ -224,28 +224,91 @@ |
225 | 225 | $db->insert( $this->index_table, $insert, __METHOD__, array( 'IGNORE' ) ); |
226 | 226 | } |
227 | 227 | |
228 | | - public function readMabFile( $file ) { |
229 | | - $rec = array(); |
| 228 | + public function importMabFile( $file ) { // imports every record read from $file; returns false if it cannot be opened, otherwise whether at least one non-empty record was read |
230 | 229 | $f = fopen( $file, 'r' ); |
231 | 230 | if ( !$f ) return false; |
232 | 231 | |
233 | | - while( ( $s = fgets( $f ) ) ) { |
234 | | - if ( preg_match( '/^(\d+[a-z]?)\s*([a-z])?=(.*$)/', $s, $m ) ) { |
235 | | - $k = $m[1]; |
236 | | - $t = $m[2]; |
237 | | - $v = $m[3]; |
| 232 | + if ( $this->debug ) { |
| 233 | + print "== $file =======================\n"; |
| 234 | + } else if ( $this->multiRecord ) { |
| 235 | + $this->output( "reading records from $file\n" ); |
| 236 | + } |
238 | 237 | |
239 | | - if ( isset( $rec[$k] ) ) { |
240 | | - if ( !is_array( $rec[$k] ) ) { |
241 | | - $rec[$k] = array( $rec[$k] ); |
| 238 | + $eof = false; |
| 239 | + $pushed = false; |
| 240 | + $found = false; // set once at least one non-empty record has been read |
| 241 | + while( !$eof ) { |
| 242 | + $rec = array(); |
| 243 | + // collect the fields of one record; the inner loop ends at EOF or at a record boundary |
| 244 | + while( !$eof ) { |
| 245 | + if ( $pushed ) { |
| 246 | + $s = $pushed; |
| 247 | + $pushed = false; |
| 248 | + } else { |
| 249 | + $s = fgets( $f ); |
| 250 | + } |
| 251 | + |
| 252 | + if ( $s === "" || $s === false ) { |
| 253 | + $eof = true; |
| 254 | + break; |
| 255 | + } |
| 256 | + // a separator line only closes a record that already has fields |
| 257 | + if ( $rec && $this->recordSeparator && preg_match( $this->recordSeparator, $s ) ) { |
| 258 | + break; // next record |
| 259 | + } |
| 260 | + |
| 261 | + if ( preg_match( '/^(\d+[a-z]?)\s*([a-z])?=(.*$)/', $s, $m ) ) { |
| 262 | + $k = $m[1]; |
| 263 | + $t = $m[2]; |
| 264 | + $v = $m[3]; |
| 265 | + |
| 266 | + if ( $rec && ($this->multiRecord && !$this->recordSeparator) && $k === "001" ) { |
| 267 | + $pushed = $s; |
| 268 | + # field 001 is expected to be the first line of every record, so keep this line for the next record |
| 269 | + break; // next record |
242 | 270 | } |
243 | 271 | |
244 | | - $rec[$k][] = $v; |
| 272 | + if ( isset( $rec[$k] ) ) { |
| 273 | + if ( !is_array( $rec[$k] ) ) { |
| 274 | + $rec[$k] = array( $rec[$k] ); |
| 275 | + } |
| 276 | + |
| 277 | + $rec[$k][] = $v; |
| 278 | + } else { |
| 279 | + $rec[$k] = $v; |
| 280 | + } |
| 281 | + } |
| 282 | + } |
| 283 | + // hand the collected record on: dump it (debug), store it, or skip it if getIds() yields nothing |
| 284 | + if ( $rec ) { |
| 285 | + $ids = $this->getIds($rec); |
| 286 | + $found = true; // a non-empty record was read, whether or not it ends up being stored |
| 287 | + if ( $ids ) { |
| 288 | + if ( $this->debug ) { |
| 289 | + var_export( $ids ); |
| 290 | + if ( !$this->noblob ) var_export( $rec ); |
| 291 | + print "------------------------------------\n"; |
| 292 | + } else { |
| 293 | + $id = false; |
| 294 | + foreach ( $this->source->keyFields as $idf ) { |
| 295 | + if ( !empty( $ids[ $idf ] ) ) { |
| 296 | + $id = "$idf:" . $ids[$idf][0]; |
| 297 | + } |
| 298 | + } |
| 299 | + |
| 300 | + $this->output( "importing record $id\n" ); |
| 301 | + $this->storeRecord($rec, $ids); |
| 302 | + } |
245 | 303 | } else { |
246 | | - $rec[$k] = $v; |
| 304 | + $this->output( "skipping record from file $file\n" ); |
| 305 | + if ( $this->debug ) { |
| 306 | + var_export( $rec ); |
| 307 | + print "------------------------------------\n"; |
| 308 | + } |
247 | 309 | } |
248 | 310 | } |
249 | 311 | } |
| 312 | + |
250 | 313 | fclose( $f ); |
251 | | - return $rec; |
| 314 | + return $found; |
252 | 315 | } |