r73113 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r73112‎ | r73113 | r73114 >
Date:12:39, 16 September 2010
Author:daniel
Status:deferred
Tags:
Comment:
multiple records per file, stdin
Modified paths:
  • /trunk/extensions/DataTransclusion/ImportMAB2.php (modified) (history)

Diff [purge]

Index: trunk/extensions/DataTransclusion/ImportMAB2.php
@@ -18,17 +18,23 @@
1919 parent::__construct();
2020
2121 $this->addArg( "name", "name of a transclusion data source, as specified in \$wgDataTransclusionSources", true );
22 - $this->addArg( "dir", "directory containing MAB files", true );
 22+ $this->addArg( "file/dir", "directory containing MAB files, or a single MAB file, or - for stdin", true );
 23+
2324 $this->addArg( "blob_table", "database table for data blobs, without prefix", true );
2425 $this->addArg( "index_table", "database table for index entries, without prefix", true );
2526
2627 $this->addOption( "create", "create database tables if they do not exist", false, false );
2728 $this->addOption( "truncate", "truncate (empty) database tables", false, false );
2829 $this->addOption( "prefix", "database table prefix. May contain a period (\".\") to reference tables in another database. If not given, the wiki's table prefix will be used", false, true );
 30+
2931 $this->addOption( "recursive", "recurse into subdirectories while importing MAB files", false, false );
 32+
3033 $this->addOption( "noblob", "don't write blob data, import index fields only", false, false );
3134 $this->addOption( "limit", "max number of files to process", false, true );
3235 $this->addOption( "debug", "don't write to the database, dump to console instead", false, false );
 36+
 37+ $this->addOption( "multi-record", "read multiple records from a single file. Records may be separated by special lines matching --record-separator; if --record-separator is not given, all records are expected to start with field number 001.", false, false );
 38+ $this->addOption( "record-separator", "regular expression for lines separating records in a multi-record file. Implies --multi-record", false, true );
3339 }
3440
3541 public function createTables( ) {
@@ -72,6 +78,9 @@
7379 $recursive = $this->hasOption( 'recursive' );
7480 $limit = (int)$this->getOption( 'limit' );
7581
 82+ $this->recordSeparator = $this->getOption( 'record-separator' );
 83+ $this->multiRecord = $this->recordSeparator || $this->hasOption( 'multi-record' );
 84+
7685 $src = $this->mArgs[0];
7786 $dir = $this->mArgs[1];
7887 $this->blob_table = $this->mArgs[2];
@@ -111,7 +120,13 @@
112121 }
113122 }
114123
115 - $this->importDir( $dir, $recursive, $limit );
 124+ $dir = "php://stdin";
 125+
 126+ if ( is_dir( $dir ) ) {
 127+ $this->importDir( $dir, $recursive, $limit );
 128+ } else {
 129+ $this->importMabFile( $dir );
 130+ }
116131 }
117132
118133 public function importDir( $dir, $recursive = false, $limit = 0 ) {
@@ -138,27 +153,12 @@
139154 continue;
140155 }
141156
142 - $rec = $this->readMabFile( $dir . $file );
 157+ $ok = $this->importMabFile( $dir . $file );
143158
144 - if ( !$rec ) {
 159+ if ( !$ok ) {
145160 $this->output( "error processing $file\n" );
146 - } else {
147 - $ids = $this->getIds($rec);
 161+ }
148162
149 - if ( $ids ) {
150 - if ( $this->debug ) {
151 - var_export( $ids );
152 - if ( !$this->noblob ) var_export( $rec );
153 - print "------------------------------------\n";
154 - } else {
155 - $this->output( "importing file $file\n" );
156 - $this->storeRecord($rec, $ids);
157 - }
158 - } else {
159 - $this->output( "skipping file $file\n" );
160 - }
161 - }
162 -
163163 if ( $limit > 0 ) {
164164 $limit -= 1;
165165 if ( $limit <= 0 ) break;
@@ -224,28 +224,91 @@
225225 $db->insert( $this->index_table, $insert, __METHOD__, array( 'IGNORE' ) );
226226 }
227227
228 - public function readMabFile( $file ) {
229 - $rec = array();
 228+ public function importMabFile( $file ) {
230229 $f = fopen( $file, 'r' );
231230 if ( !$f ) return false;
232231
233 - while( ( $s = fgets( $f ) ) ) {
234 - if ( preg_match( '/^(\d+[a-z]?)\s*([a-z])?=(.*$)/', $s, $m ) ) {
235 - $k = $m[1];
236 - $t = $m[2];
237 - $v = $m[3];
 232+ if ( $this->debug ) {
 233+ print "== $file =======================\n";
 234+ } else if ( $this->multiRecord ) {
 235+ $this->output( "reading records from $file\n" );
 236+ }
238237
239 - if ( isset( $rec[$k] ) ) {
240 - if ( !is_array( $rec[$k] ) ) {
241 - $rec[$k] = array( $rec[$k] );
 238+ $eof = false;
 239+ $pushed = false;
 240+
 241+ while( !$eof ) {
 242+ $rec = array();
 243+
 244+ while( !$eof ) {
 245+ if ( $pushed ) {
 246+ $s = $pushed;
 247+ $pushed = false;
 248+ } else {
 249+ $s = fgets( $f );
 250+ }
 251+
 252+ if ( $s === "" || $s === false ) {
 253+ $eof = true;
 254+ break;
 255+ }
 256+
 257+ if ( $rec && $this->recordSeparator && preg_match( $this->recordSeparator, $s ) ) {
 258+ break; // next record
 259+ }
 260+
 261+ if ( preg_match( '/^(\d+[a-z]?)\s*([a-z])?=(.*$)/', $s, $m ) ) {
 262+ $k = $m[1];
 263+ $t = $m[2];
 264+ $v = $m[3];
 265+
 266+ if ( $rec && ($this->multiRecord && !$this->recordSeparator) && $k === "001" ) {
 267+ $pushed = $s;
 268+ # we expect 001 to be the first thing in every record!
 269+ break; // next record
242270 }
243271
244 - $rec[$k][] = $v;
 272+ if ( isset( $rec[$k] ) ) {
 273+ if ( !is_array( $rec[$k] ) ) {
 274+ $rec[$k] = array( $rec[$k] );
 275+ }
 276+
 277+ $rec[$k][] = $v;
 278+ } else {
 279+ $rec[$k] = $v;
 280+ }
 281+ }
 282+ }
 283+
 284+ if ( $rec ) {
 285+ $ids = $this->getIds($rec);
 286+
 287+ if ( $ids ) {
 288+ if ( $this->debug ) {
 289+ var_export( $ids );
 290+ if ( !$this->noblob ) var_export( $rec );
 291+ print "------------------------------------\n";
 292+ } else {
 293+ $id = false;
 294+ foreach ( $this->source->keyFields as $idf ) {
 295+ if ( !empty( $ids[ $idf ] ) ) {
 296+ $id = "$idf:" . $ids[$idf][0];
 297+ }
 298+ }
 299+
 300+ $this->output( "importing record $id\n" );
 301+ $this->storeRecord($rec, $ids);
 302+ }
245303 } else {
246 - $rec[$k] = $v;
 304+ $this->output( "skipping record from file $file\n" );
 305+ if ( $this->debug ) {
 306+ var_export( $rec );
 307+ print "------------------------------------\n";
 308+ }
247309 }
248310 }
249311 }
 312+
250313 fclose( $f );
251314 return $rec;
252315 }

Status & tagging log