r72899 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r72898‎ | r72899 | r72900 >
Date:14:33, 13 September 2010
Author:daniel
Status:deferred
Tags:
Comment:
MAB support. Refactored value normalisation. Several breaking changes to config for DBDataTransclusionSource
Modified paths:
  • /trunk/extensions/DataTransclusion/DBDataTransclusionSource.php (modified) (history)
  • /trunk/extensions/DataTransclusion/DataTransclusion.php (modified) (history)
  • /trunk/extensions/DataTransclusion/DataTransclusionHandler.php (modified) (history)
  • /trunk/extensions/DataTransclusion/DataTransclusionSource.php (modified) (history)
  • /trunk/extensions/DataTransclusion/ImportMAB2.php (added) (history)
  • /trunk/extensions/DataTransclusion/MAB2RecordTransformer.php (modified) (history)
  • /trunk/extensions/DataTransclusion/ValueNormalizers.php (added) (history)

Diff [purge]

Index: trunk/extensions/DataTransclusion/DataTransclusionSource.php
@@ -31,7 +31,20 @@
3232 * to a key, to refine the output. Optional.
3333 * * $spec['fieldNames']: names of all fields present in each record.
3434 * Fields not listed here will not be available on the wiki,
35 - * even if they are returned by the data source. REQUIRED.
 35+ * even if they are returned by the data source. If not given, this defaults to
 36+ * $spec['keyFields'] + array_keys( $spec['fieldInfo'] ).
 37+ * * $spec['fieldInfo']: Associative array mapping logical field names to additional
 38+ * information for using and interpreting these fields. Different data sources
 39+ * may allow different hints for each field. The following hints are known per
 40+ * default:
 41+ * * $spec['fieldInfo'][$field]['type']: specifies the data types for the field:
 42+ * 'int' for integers, 'float' or 'decimal' for decimals, or 'string' for
 43+ * string fields. Serialization types 'json', 'wddx' and 'php' are also
 44+ * supported. Defaults to 'string'.
 45+ * * $spec['fieldInfo'][$field]['normalization']: normalization to be applied for
 46+ * this field, when used as a query key. This may be a callable, or an object
 47+ * that supports the function normalize(), or a regular expression for patterns
 48+ * to be removed from the value.
3649 * * $spec['cacheDuration']: the number of seconds a result from this source
3750 * may be cached for. If not set, results are assumed to be cacheable
3851 * indefinitely. This setting determines the expiry time of the parser
@@ -87,10 +100,24 @@
88101 $this->keyFields = self::splitList( $spec[ 'keyFields' ] );
89102 $this->optionNames = self::splitList( @$spec[ 'optionNames' ] );
90103
 104+ if ( isset( $spec[ 'fieldInfo' ] ) ) {
 105+ $this->fieldInfo = $spec[ 'fieldInfo' ];
 106+ } else {
 107+ $this->fieldInfo = null;
 108+ }
 109+
91110 if ( isset( $spec[ 'fieldNames' ] ) ) {
92111 $this->fieldNames = self::splitList( $spec[ 'fieldNames' ] );
 112+ } else if ( isset( $spec[ 'fieldInfo' ] ) ) {
 113+ $this->fieldNames = array_keys( $spec[ 'fieldInfo' ] );
93114 } else {
94115 $this->fieldNames = $this->keyFields;
 116+
 117+ if ( !empty( $this->fieldInfo ) ) {
 118+ $this->fieldNames = array_merge( $this->fieldNames, array_keys( $this->fieldInfo ) );
 119+ }
 120+
 121+ $this->fieldNames = array_unique( $this->fieldNames );
95122 }
96123
97124 if ( !empty( $spec[ 'cacheDuration' ] ) ) {
@@ -120,6 +147,51 @@
121148 $this->sourceInfo[ 'source-name' ] = $this->name; // force this one
122149 }
123150
 151+ public function normalize( $key, $value, $norm = null ) {
 152+ if ( $norm );
 153+ else if ( isset( $this->fieldInfo[ $key ]['normalization'] ) ) {
 154+ $norm = trim( $this->fieldInfo[ $key ]['normalization'] );
 155+ } else {
 156+ return $value;
 157+ }
 158+
 159+ if ( is_object( $norm ) ) {
 160+ return $norm->normalize( $value );
 161+ } else if ( is_callable( $norm ) || preg_match( '/^(\w[\w\d]*::)?(\w[\w\d]*)$/', $norm ) ) {
 162+ return call_user_func( $norm, $value );
 163+ } else if ( is_array( $norm ) ) {
 164+ return preg_replace( $norm[0], $norm[1], $value );
 165+ } else {
 166+ return preg_replace( $norm, '', $value );
 167+ }
 168+ }
 169+
 170+ public function convert( $key, $value, $format = null ) {
 171+ if ( $format );
 172+ else if ( isset( $this->fieldInfo[ $key ]['type'] ) ) {
 173+ $format = strtolower( trim( $this->fieldInfo[ $key ]['type'] ) );
 174+ } else {
 175+ return (string)$value;
 176+ }
 177+
 178+ if ( $format == 'int' ) {
 179+ return (int)$value;
 180+ } else if ( $format == 'decimal' || $format == 'float' ) {
 181+ return (float)$value;
 182+ } else if ( $format == 'json' || $format == 'js' ) {
 183+ return DataTransclusionSource::decodeJson( $value );
 184+ } else if ( $format == 'wddx' ) {
 185+ return DataTransclusionSource::decodeWddx( $value );
 186+ } else if ( $format == 'xml' ) {
 187+ return DataTransclusionSource::parseXml( $value ); #WARNING: returns DOM
 188+ } else if ( $format == 'php' || $format == 'pser' ) {
 189+ return DataTransclusionSource::decodeSerialized( $value );
 190+ } else {
 191+ return (string)$value;
 192+ }
 193+ }
 194+
 195+
124196 public function getName() {
125197 return $this->name;
126198 }
@@ -147,6 +219,9 @@
148220 public abstract function fetchRawRecord( $field, $value, $options = null );
149221
150222 public function fetchRecord( $field, $value, $options = null ) {
 223+ $value = $this->normalize( $field, $value );
 224+ $value = $this->convert( $field, $value );
 225+
151226 $rec = $this->fetchRawRecord( $field, $value, $options );
152227
153228 if ( $this->transformer ) {
@@ -155,6 +230,27 @@
156231
157232 return $rec;
158233 }
 234+
 235+ public static function decodeSerialized( $raw ) {
 236+ return unserialize( $raw );
 237+ }
 238+
 239+ public static function decodeJson( $raw ) {
 240+ $raw = preg_replace( '/^\s*(var\s)?\w([\w\d]*)\s+=\s*|\s*;\s*$/sim', '', $raw);
 241+ return FormatJson::decode( $raw, true );
 242+ }
 243+
 244+ public static function decodeWddx( $raw ) {
 245+ return wddx_unserialize( $raw );
 246+ }
 247+
 248+ public static function parseXml( $raw ) {
 249+ $dom = new DOMDocument();
 250+ $dom->loadXML( $raw );
 251+
 252+ #NOTE: returns a DOM, RecordTransformer must be aware!
 253+ return $dom->documentElement;
 254+ }
159255 }
160256
161257 /**
@@ -279,22 +375,4 @@
280376 public function fetchRawRecord( $field, $value, $options = null ) {
281377 return @$this->lookup[ $field ][ $value ];
282378 }
283 -
284 - public static function decodeJson( $raw ) {
285 - $raw = preg_replace( '/^\s*(var\s)?\w([\w\d]*)\s+=\s*|\s*;\s*$/sim', '', $raw);
286 - return FormatJson::decode( $raw, true );
287 - }
288 -
289 - public static function decodeWddx( $raw ) {
290 - return wddx_unserialize( $raw );
291 - }
292 -
293 - public static function parseXml( $raw ) {
294 - $dom = new DOMDocument();
295 - $dom->loadXML( $raw );
296 -
297 - #NOTE: returns a DOM, RecordTransformer must be aware!
298 - return $dom->documentElement;
299 - }
300 -
301379 }
Index: trunk/extensions/DataTransclusion/ValueNormalizers.php
@@ -0,0 +1,29 @@
 2+<?php
 3+/**
 4+ * Collection of normalization functions to be applied to data values.
 5+ *
 6+ * @file
 7+ * @ingroup Extensions
 8+ * @author Daniel Kinzler for Wikimedia Deutschland
 9+ * @copyright © 2010 Wikimedia Deutschland (Author: Daniel Kinzler)
 10+ * @licence GNU General Public Licence 2.0 or later
 11+ */
 12+
 13+if ( !defined( 'MEDIAWIKI' ) ) {
 14+ echo( "This file is an extension to the MediaWiki software and cannot be used standalone.\n" );
 15+ die( 1 );
 16+}
 17+
/**
 * Static helper functions that can be used as normalizations for data values.
 * The class is abstract because it only serves as a namespace for these
 * functions.
 */
abstract class ValueNormalizers {
	/**
	 * Strips whitespace from both ends of the value.
	 *
	 * @param $v string the value to normalize
	 * @return string the trimmed value
	 */
	public static function trim( $v ) {
		$trimmed = trim( $v );
		return $trimmed;
	}

	/**
	 * Removes every character that is not an ASCII letter or digit.
	 * Despite the name, this also removes whitespace and non-ASCII characters.
	 *
	 * @param $v string the value to normalize
	 * @return string the value reduced to the characters a-z, A-Z and 0-9
	 */
	public static function strip_punctuation( $v ) {
		return preg_replace( '/[^a-zA-Z0-9]+/', '', $v );
	}
}
 30+
Property changes on: trunk/extensions/DataTransclusion/ValueNormalizers.php
___________________________________________________________________
Added: svn:mergeinfo
Added: svn:eol-style
131 + native
Index: trunk/extensions/DataTransclusion/MAB2RecordTransformer.php
@@ -14,6 +14,7 @@
1515 die( 1 );
1616 }
1717
 18+global $mab_field_map; # auto-loaded from within a function, so top scope is not global!
1819 $mab_field_map = array(
1920 'title' => array( '081', '200b', '304', '310', '331', '335' ),
2021 'series' => array( '089', '090', '451', '545' ),
@@ -22,6 +23,7 @@
2324 'author' => array( ), # added later
2425 'editor' => array( ), # added later
2526 'institution' => array( '200', '204', ),
 27+ 'language' => array( '037', '037a', '037b', '037c', '037z' ),
2628 'annote' => array( '334', '434', '501' ),
2729 'note' => array( '359' ),
2830 'journal' => array( '376' ),
@@ -31,10 +33,15 @@
3234 'pages' => array( '433' ),
3335 'type' => array( '509' ),
3436 'copyright' => array( '531' ),
35 - 'ISBN' => array( '540', '540a', '540b' ),
36 - 'ISSN' => array( '542', ),
37 - 'LCC' => array( '544', ),
38 - 'DOI' => array( '552', ),
 37+ 'isbn' => array( '540', '540a', '540b' ),
 38+ 'issn' => array( '542', ),
 39+ 'lcc' => array( '544', '25l' ),
 40+ 'doi' => array( '552', ),
 41+ 'dnb' => array( '025a', ),
 42+ 'zdb' => array( '025z', ),
 43+ 'zka' => array( '025g', ),
 44+ 'hzb' => array( '025h', ),
 45+ 'id' => array( '001', ),
3946 'howpublished' => array( '590', '596', ),
4047 );
4148
@@ -48,15 +55,15 @@
4956 }
5057
5158 $mab_field_map['author'][] = '333';
 59+$mab_field_map['author'][] = '359';
 60+$mab_field_map['author'][] = '369';
5261
5362 /**
5463 * Implementations of RecordTransformer for processing data from the OpenLibrary web API.
5564 * No configuration options are needed.
5665 */
57 -class MABRecordTransformer extends RecordTransformer {
 66+class MAB2RecordTransformer extends RecordTransformer {
5867
59 -
60 -
6168 /**
6269 * Initializes the RecordTransformer from the given parameter array.
6370 * @param $spec associative array of options. See class-level documentation for details.
@@ -69,17 +76,32 @@
7077 $this->fieldPrefix = @$spec[ 'fieldPrefix' ];
7178 }
7279
 80+ public static function getMABFields( $logical ) {
 81+ global $mab_field_map;
 82+
 83+ if ( isset( $mab_field_map[ $logical ] ) ) {
 84+ return $mab_field_map[ $logical ];
 85+ } else {
 86+ return false;
 87+ }
 88+ }
 89+
7390 public function transform( $rec ) {
7491 global $mab_field_map;
7592
7693 $r = array();
7794
78 - foreach ($mab_field_map as $field => $items) {
 95+ foreach ( $mab_field_map as $field => $items ) {
7996 foreach ( $items as $item ) {
8097 if ( $this->fieldPrefix ) $item = fieldPrefix + $item;
8198
8299 if ( !empty( $rec[ $item ] ) ) {
83 - $r[ $field ][] = $rec[ $item ];
 100+ if ( is_array( $rec[ $item ] ) ) {
 101+ $r[ $field ] = array_merge( $r[ $field ], $rec[ $item ] );
 102+ } else {
 103+ $r[ $field ][] = $rec[ $item ];
 104+ }
 105+
84106 break;
85107 }
86108 }
@@ -87,9 +109,10 @@
88110
89111 foreach ($r as $f => $values) {
90112 if ( count($values) == 0 ) unset( $r[ $f ] );
91 - else if ( count($values) == 1 ) $r[ $f ] = $values[0];
 113+ else if ( count($values) == 1 ) $r[ $f ] = MAB2RecordTransformer::mangleValue( $values[0] );
92114 else {
93115 $values = array_unique( $values );
 116+ $values = array_map( 'MAB2RecordTransformer::mangleValue', $values );
94117 $r[ $f ] = join(', ', $values);
95118 }
96119 }
@@ -97,6 +120,15 @@
98121 return $r;
99122 }
100123
 124+ function mangleValue( $v ) {
 125+ $v = preg_replace( '/<<\[(.*?)\]>>/', '', $v );
 126+ $v = preg_replace( '/\[(.*?)\]/', '$1', $v );
 127+ $v = preg_replace( '/<<(.*?)>>/', '$1', $v );
 128+ $v = preg_replace( '/<(.*?)>/', '$1', $v );
 129+ $v = preg_replace( '/^[¤¬]/', '', $v );
 130+ return $v;
 131+ }
 132+
101133 /**
102134 * Extracts any error message from the $data from the data source. This is done
103135 * by calling resolvePath() on the $spec['errorPath'] provided to the constructor.
@@ -104,6 +136,13 @@
105137 * @param $rec a structured data response, as received from the data source
106138 */
107139 public function extractError( $data ) {
 140+ if ( !$this->dataPath ) {
 141+ $r = $this->extractRecord( $data );
 142+
 143+ if ( $r ) return false;
 144+ else return true;
 145+ }
 146+
108147 $err = $this->resolvePath( $data, $this->errorPath );
109148 $err = $this->asString( $err );
110149
@@ -117,6 +156,10 @@
118157 * @param $rec a structured data response, as received from the data source
119158 */
120159 public function extractRecord( $data ) {
 160+ if ( !$this->dataPath ) {
 161+ return $data;
 162+ }
 163+
121164 $rec = $this->resolvePath( $data, $this->dataPath );
122165
123166 return $rec;
Index: trunk/extensions/DataTransclusion/DBDataTransclusionSource.php
@@ -28,19 +28,17 @@
2929 * not supported reliably. REQUIRED.
3030 * * $spec['querySuffix']: additional clauses to be added after the WHERE clause.
3131 * Useful mostly to specify GROUP BY (or ORDER BY or LIMIT).
32 - * * $spec['keyTypes']: associative arrays specifying the data types for the key fields.
33 - * Array keys are the field names, the associated values specify the type
34 - * as 'int' for integers, 'float' or 'decimal' for decimals, or 'string'
35 - * for string fields.
36 - * * $spec['serializedFields']: associative array of fields that contain serialized data
37 - * structures. The keys in the array are the field names, the values are the
38 - * specify the data format: 'json', 'wddx' and 'php' for php serialized objects.
39 - * If deserialzation yields an array, the array will be merged with rest of the
40 - * record.
41 - * * $spec['keyFields']: like for DataTransclusionSource, this is list of fields
42 - * that can be used as the key for fetching a record. However, it's not required
43 - * for DBDataTransclusionSource: if not provided, array_keys( $spec['keyTypes'] )
44 - * will be used.
 32+ * * $spec['fieldNames']: like for DataTransclusionSource; However, it's not required
 33+ * for DBDataTransclusionSource: if not provided, array_keys( $spec['fieldInfo'] )
 34+ * will be used.
 35+ * * $spec['fieldInfo']: like for DataTransclusionSource; Some additional hints are
 36+ * supported for each field:
 37+ * * $spec['fieldInfo'][...]['dbfield']: the field's name in the database table,
 38+ * if different from the logical name.
 39+ * * $spec['fieldInfo'][...]['serialized']: format if the field contains a
 40+ * serialized structure as a blob. If deserialzation yields an array, it is
 41+ * merged with the data record. Supported formats are 'json', 'wddx' and
 42+ * 'php' for php serialized objects.
4543 *
4644 * For more information on options supported by DataTransclusionSource, see the class-level
4745 * documentation there.
@@ -52,59 +50,16 @@
5351 * @param $spec associative array of options. See class-level documentation for details.
5452 */
5553 function __construct( $spec ) {
56 - if ( !isset( $spec[ 'keyFields' ] ) && isset( $spec[ 'keyTypes' ] ) ) {
57 - $spec[ 'keyFields' ] = array_keys( $spec[ 'keyTypes' ] );
 54+ if ( !isset( $spec[ 'fieldNames' ] ) && isset( $spec[ 'fieldInfo' ] ) ) {
 55+ $spec[ 'fieldNames' ] = array_keys( $spec[ 'fieldInfo' ] );
5856 }
5957
6058 DataTransclusionSource::__construct( $spec );
6159
6260 $this->query = $spec[ 'query' ];
6361 $this->querySuffix = @$spec[ 'querySuffix' ];
64 - $this->serializedFields = DataTransclusionSource::splitList( @$spec[ 'serializedFields' ] );
65 -
66 - if ( isset( $spec[ 'keyTypes' ] ) ) {
67 - $this->keyTypes = $spec[ 'keyTypes' ];
68 - } else {
69 - $this->keyTypes = null;
70 - }
7162 }
7263
73 - public function convertKey( $key, $value ) {
74 - if ( !isset( $this->keyTypes[ $key ] ) ) {
75 - return (string)$value;
76 - }
77 -
78 - $t = strtolower( trim( $this->keyTypes[ $key ] ) );
79 -
80 - if ( $t == 'int' ) {
81 - return (int)$value;
82 - } else if ( $t == 'decimal' || $t == 'float' ) {
83 - return (float)$value;
84 - } else if ( $format == 'json' || $format == 'js' ) {
85 - return DataTransclusionSource::decodeJson( $raw );
86 - } else if ( $format == 'wddx' ) {
87 - return DataTransclusionSource::decodeWddx( $raw );
88 - } else if ( $format == 'xml' ) {
89 - return DataTransclusionSource::parseXml( $raw ); #WARNING: returns DOM
90 - } else if ( $format == 'php' || $format == 'pser' ) {
91 - return DataTransclusionSource::decodeSerialized( $raw );
92 - } else {
93 - return (string)$value;
94 - }
95 - }
96 -
97 - public function unserialize( $data, $format ) {
98 - if ( $format == 'json' || $format == 'js' ) {
99 - return DataTransclusionSource::decodeJson( $raw );
100 - } else if ( $format == 'wddx' ) {
101 - return DataTransclusionSource::decodeWddx( $raw );
102 - } else if ( $format == 'php' || $format == 'pser' ) {
103 - return DataTransclusionSource::decodeSerialized( $raw );
104 - }
105 -
106 - return $data;
107 - }
108 -
10964 public function getQuery( $field, $value, $db = null ) {
11065 if ( !$db ) {
11166 $db = wfGetDB( DB_SLAVE );
@@ -114,7 +69,9 @@
11570 return false; // redundant, but make extra sure we don't get anythign evil here
11671 }
11772
118 - $value = $this->convertKey( $field, $value );
 73+ if ( !empty( $this->fieldInfo ) && isset( $this->fieldInfo[$field]['dbfield'] ) ) {
 74+ $field = $this->fieldInfo[$field]['dbfield'];
 75+ }
11976
12077 if ( is_string( $value ) ) {
12178 $v = $db->addQuotes( $value );
@@ -159,16 +116,34 @@
160117
161118 $db->freeResult( $rs );
162119
163 - if ( $this->serializedFields ) {
164 - foreach ( $this->serializedFields as $f => $format ) {
165 - if ( empty( $rec[ $f ] ) ) continue;
 120+ foreach ( $rec as $k => $v ) {
 121+ if ( is_int( $k ) ) { # remove numeric keys, keep only assoc.
 122+ unset( $rec[ $k ] );
 123+ }
 124+ }
166125
167 - $data = $this->unserialize( $rec[ $f ], $format );
 126+ foreach ( $rec as $k => $v ) {
 127+ if ( isset( $this->fieldInfo[ $k ] ) ) {
 128+ $format = null; # auto format
 129+ $serialized = !empty( $this->fieldInfo[ $k ]['serialized'] );
168130
169 - if ( is_array( $data ) ) {
170 - $rec = array_merge( $data, $rec );
 131+ if ( $serialized && is_string( $this->fieldInfo[ $k ]['serialized'] ) ) {
 132+ $format = $this->fieldInfo[ $k ]['serialized']; # serialization format, else use ['type']
 133+ }
 134+
 135+ $data = $this->convert( $k , $rec[ $k ], $format ); # unserialize or convert
 136+
 137+ if ( $serialized && is_array( $data ) ) { # flatten serialized
 138+ # flatten
 139+ unset( $rec[ $k ] );
 140+
 141+ foreach ( $data as $m => $w ) { # don't use array_merge, it stinks.
 142+ $rec[ $m ] = $w;
 143+ }
 144+ } else {
 145+ $rec[ $k ] = $data;
 146+ }
171147 }
172 - }
173148 }
174149
175150 wfDebugLog( 'DataTransclusion', "loaded record for $field=$value from database\n" );
Index: trunk/extensions/DataTransclusion/DataTransclusionHandler.php
@@ -163,10 +163,14 @@
164164 }
165165
166166 // render the record into wiki text
167 - $t = Title::newFromText( $template, NS_TEMPLATE );
168 - if ( empty( $t ) ) {
169 - wfDebugLog( 'DataTransclusion', "illegal template name: $template\n" );
170 - return DataTransclusionHandler::errorMessage( 'datatransclusion-bad-template-name', $asHTML, $template );
 167+ if ( $template === "#dump" ) {
 168+ $t = null;
 169+ } else {
 170+ $t = Title::newFromText( $template, NS_TEMPLATE );
 171+ if ( empty( $t ) ) {
 172+ wfDebugLog( 'DataTransclusion', "illegal template name: $template\n" );
 173+ return DataTransclusionHandler::errorMessage( 'datatransclusion-bad-template-name', $asHTML, $template );
 174+ }
171175 }
172176
173177 $handler = new DataTransclusionHandler( $parser, $source, $t, $templateText );
@@ -201,6 +205,20 @@
202206 }
203207
204208 function render( $record ) {
 209+ if ( empty( $this->templateText ) && $this->template === null ) {
 210+ // magic record dump
 211+ $t = null;
 212+
 213+ $this->templateText = "\n{|";
 214+ $this->templateText .= "|--\n";
 215+ $this->templateText .= "! key !! value\n";
 216+ foreach ( $record as $k => $v ) {
 217+ $this->templateText .= "|--\n";
 218+ $this->templateText .= "| $k || {{{{$k}}}}\n";
 219+ }
 220+ $this->templateText .= "|}\n";
 221+ }
 222+
205223 if ( $this->templateText ) {
206224 // explicit template content set. Used for testing and debugging.
207225 if ( is_string( $this->templateText ) ) {
@@ -264,7 +282,7 @@
265283 if ( $args ) {
266284 // add arguments
267285 foreach ( $args as $f => $v ) {
268 - if ( is_array( $v ) || is_object( $v ) || is_resource( $v ) ) {
 286+ if ( is_int( $f ) || is_array( $v ) || is_object( $v ) || is_resource( $v ) ) {
269287 continue;
270288 }
271289
Index: trunk/extensions/DataTransclusion/ImportMAB2.php
@@ -0,0 +1,229 @@
 2+<?php
 3+/**
 4+ */
 5+
 6+if ( getenv( 'MW_INSTALL_PATH' ) ) {
 7+ $IP = getenv( 'MW_INSTALL_PATH' );
 8+} else {
 9+ $IP = dirname( __FILE__ ) . '/../..';
 10+
 11+ if ( !file_exists( "$IP/LocalSettings.php" ) ) {
 12+ $IP = dirname( __FILE__ ) . '/../../phase3';
 13+ }
 14+}
 15+require_once( "$IP/maintenance/Maintenance.php" );
 16+
/**
 * Maintenance script that reads MAB2 record files from a directory and stores
 * them in two database tables: a blob table holding each serialized record,
 * and an index table mapping (field, value) pairs to blob ids for every key
 * field of the given DBDataTransclusionSource.
 */
class ImportMAB2 extends Maintenance {
	/** If true, nothing is written to the database; records are dumped to the console. */
	public $debug = false;

	/** Name of the table receiving the serialized records, set up by execute(). */
	public $blob_table = null;

	/** Name of the table receiving the index entries, set up by execute(). */
	public $index_table = null;

	/** The data source whose key fields and normalizations drive the import. */
	public $source = null;

	/** Maps each key field of the source to its list of MAB field codes (or false). */
	public $id_map = array();

	/**
	 * Declares the command line arguments and options of the script.
	 */
	public function __construct( ) {
		parent::__construct();

		$this->addArg( "name", "name of a transclusion data source, as specified in \$wgDataTransclusionSources", true );
		$this->addArg( "dir", "directory containing MAB files", true );
		$this->addArg( "blob_table", "database table for data blobs, without prefix", true );
		$this->addArg( "index_table", "database table for index entries, without prefix", true );

		$this->addOption( "create", "create database tables if they do not exist", false, false );
		$this->addOption( "truncate", "truncate (empty) database tables", false, false );
		$this->addOption( "prefix", "database table prefix. May contain a period (\".\") to reference tables in another database. If not given, the wiki's table prefix will be used", false, true );
		$this->addOption( "limit", "max number of files to process", false, true );
		$this->addOption( "debug", "don't write to the database, dump to console instead", false, false );
	}

	/**
	 * Creates the blob and index tables named by $this->blob_table and
	 * $this->index_table, unless they already exist.
	 */
	public function createTables( ) {
		$db = wfGetDB( DB_MASTER );

		$this->output( "creating blob table {$this->blob_table}\n" );
		$sql = "CREATE TABLE IF NOT EXISTS " . $this->blob_table . " ( ";
		$sql .= " id INT(12) NOT NULL AUTO_INCREMENT, ";
		$sql .= " data BLOB NOT NULL, ";
		$sql .= " PRIMARY KEY (id) ";
		$sql .= ") ";
		$db->query( $sql, __METHOD__ );

		$this->output( "creating index table {$this->index_table}\n" );
		$sql = "CREATE TABLE IF NOT EXISTS " . $this->index_table . " ( ";
		$sql .= " field VARCHAR(255) NOT NULL, "; #FIXME: varchar vs varbinary!
		$sql .= " value VARCHAR(255) NOT NULL, "; #FIXME: varchar vs varbinary!
		$sql .= " data_id INT(12) NOT NULL, ";
		$sql .= " PRIMARY KEY (field, value, data_id) "; #NOTE: we don't require (field,value) to be unique!
		$sql .= ") ";
		$db->query( $sql, __METHOD__ );
	}

	/**
	 * Empties the blob and index tables.
	 */
	public function truncateTables( ) {
		$db = wfGetDB( DB_MASTER );

		$this->output( "truncating blob table {$this->blob_table}\n" );
		$sql = "TRUNCATE TABLE " . $this->blob_table;
		$db->query( $sql, __METHOD__ );

		$this->output( "truncating index table {$this->index_table}\n" );
		$sql = "TRUNCATE TABLE " . $this->index_table;
		$db->query( $sql, __METHOD__ );
	}

	/**
	 * Entry point: resolves the data source and table names, optionally
	 * creates and/or truncates the tables, then imports every file found in
	 * the given directory (up to the --limit, if one is given).
	 *
	 * @throws MWException if the data source is unknown or not a
	 *         DBDataTransclusionSource, or if the directory cannot be opened
	 */
	public function execute() {
		global $wgDataTransclusionSources;

		$this->debug = $this->hasOption( 'debug' );
		$limit = (int)$this->getOption( 'limit' );

		$src = $this->mArgs[0];
		$dir = $this->mArgs[1];
		$this->blob_table = $this->mArgs[2];
		$this->index_table = $this->mArgs[3];

		if ( !isset( $wgDataTransclusionSources[ $src ] ) ) {
			throw new MWException( "unknown transclusion data source '$src', not found in \$wgDataTransclusionSources" );
		}

		$this->source = DataTransclusionHandler::getDataSource( $src );

		if ( !( $this->source instanceof DBDataTransclusionSource ) ) {
			throw new MWException( "bad data source '$src': not compatible with DBDataTransclusionSource" );
		}

		if ( $this->hasOption( 'prefix' ) ) {
			$prefix = $this->getOption( "prefix" );
			$this->blob_table = $prefix . $this->blob_table;
			$this->index_table = $prefix . $this->index_table;
		} else {
			$db = wfGetDB( DB_MASTER ); # we'll need the master anyway later
			$this->blob_table = $db->tableName( $this->blob_table );
			$this->index_table = $db->tableName( $this->index_table );
		}

		if ( !$this->debug ) {
			// The two options are independent: --create does nothing for a
			// table that already exists, so --truncate must still be honoured.
			if ( $this->hasOption( 'create' ) ) {
				$this->createTables();
			}

			if ( $this->hasOption( 'truncate' ) ) {
				$this->truncateTables();
			}
		}

		$this->id_map = array();
		foreach ( $this->source->keyFields as $key ) {
			$this->id_map[ $key ] = MAB2RecordTransformer::getMABFields( $key );
			if ( !$this->id_map[ $key ] ) {
				$this->error( "unknown key field '$key', no MAB fields mapped." );
			}
		}

		$dir = "$dir/";

		$this->output( "scanning directory $dir\n" );
		$d = opendir( $dir );
		if ( !$d ) {
			throw new MWException( "failed to open directory '$dir'" );
		}

		// Compare against false explicitly: an entry named "0" is falsy and
		// would otherwise end the scan prematurely.
		while ( ( $file = readdir( $d ) ) !== false ) {
			if ( $file === "." || $file === ".." ) {
				continue;
			}

			$rec = $this->readMabFile( $dir . $file );

			if ( !$rec ) {
				$this->output( "error processing $file\n" );
			} else {
				$ids = $this->getIds( $rec );

				if ( $ids ) {
					if ( $this->debug ) {
						var_export( $ids );
						var_export( $rec );
						print "------------------------------------\n";
					} else {
						$this->output( "importing file $file\n" );
						$this->storeRecord( $rec, $ids );
					}
				} else {
					$this->output( "skipping file $file\n" );
				}
			}

			if ( $limit > 0 ) {
				$limit -= 1;
				if ( $limit <= 0 ) break;
			}
		}
		closedir( $d );
	}

	/**
	 * Collects the index values of a record: for every key field in
	 * $this->id_map, the values of all mapped MAB fields present in $rec,
	 * each passed through the source's normalize() and convert().
	 *
	 * @param $rec array record as returned by readMabFile()
	 * @return array map of key field name to list of index values; fields
	 *         without any value in $rec are omitted
	 */
	public function getIds( $rec ) {
		$ids = array();
		foreach ( $this->id_map as $field => $items ) {
			if ( !$items ) continue;

			foreach ( $items as $item ) {
				if ( !isset( $rec[ $item ] ) ) {
					continue;
				}

				if ( !isset( $ids[ $field ] ) ) {
					$ids[ $field ] = array();
				}

				// A MAB field holds either a single string or a list of
				// strings; the cast lets both cases share one code path.
				foreach ( (array)$rec[ $item ] as $v ) {
					$v = $this->source->normalize( $field, $v );
					$v = $this->source->convert( $field, $v );

					$ids[ $field ][] = $v;
				}
			}
		}

		return $ids;
	}

	/**
	 * Writes one record to the blob table and one row per index value to the
	 * index table, referencing the new blob row.
	 *
	 * @param $rec array the record to store; it is stored using serialize()
	 * @param $ids array index values as returned by getIds()
	 */
	public function storeRecord( $rec, $ids ) {
		$db = wfGetDB( DB_MASTER );

		$blob = array( 'data' => serialize( $rec ) );

		$db->insert( $this->blob_table, $blob, __METHOD__ );
		$id = $db->insertId();

		$rows = array();
		foreach ( $ids as $field => $values ) {
			foreach ( $values as $v ) {
				$rows[] = array(
					'field' => $field,
					'value' => $v,
					'data_id' => $id );
			}
		}

		if ( $rows ) {
			$db->insert( $this->index_table, $rows, __METHOD__ );
		}
	}

	/**
	 * Reads a MAB file into an array. Every line of the form
	 * "<digits><optional letter> <optional letter>=<value>" contributes
	 * <value> under the key "<digits><optional letter>"; other lines are
	 * ignored. A key that occurs more than once yields a list of values.
	 *
	 * @param $file string path of the file to read
	 * @return array|false the record (possibly empty), or false if the file
	 *         could not be opened
	 */
	public function readMabFile( $file ) {
		$rec = array();
		$f = fopen( $file, 'r' );
		if ( !$f ) return false;

		// Compare against false explicitly so that a final line consisting
		// of just "0" does not end the loop early.
		while ( ( $s = fgets( $f ) ) !== false ) {
			if ( !preg_match( '/^(\d+[a-z]?)\s*([a-z])?=(.*$)/', $s, $m ) ) {
				continue;
			}

			$k = $m[1];
			$v = $m[3];

			if ( !isset( $rec[$k] ) ) {
				$rec[$k] = $v;
				continue;
			}

			if ( !is_array( $rec[$k] ) ) {
				$rec[$k] = array( $rec[$k] );
			}

			$rec[$k][] = $v;
		}
		fclose( $f );
		return $rec;
	}
}
 228+
 229+$maintClass = "ImportMAB2";
 230+require_once( DO_MAINTENANCE );
Index: trunk/extensions/DataTransclusion/DataTransclusion.php
@@ -28,10 +28,12 @@
2929
3030 $wgAutoloadClasses['DataTransclusionRenderer'] = $dir . 'DataTransclusionRenderer.php';
3131 $wgAutoloadClasses['DataTransclusionHandler'] = $dir . 'DataTransclusionHandler.php';
 32+$wgAutoloadClasses['ValueNormalizers'] = $dir . 'ValueNormalizers.php';
3233 $wgAutoloadClasses['RecordTransformer'] = $dir . 'RecordTransformer.php';
3334 $wgAutoloadClasses['FlattenRecord'] = $dir . 'FlattenRecord.php';
3435 $wgAutoloadClasses['XPathFlattenRecord'] = $dir . 'XPathFlattenRecord.php';
3536 $wgAutoloadClasses['OpenLibraryRecordTransformer'] = $dir . 'OpenLibraryRecordTransformer.php';
 37+$wgAutoloadClasses['MAB2RecordTransformer'] = $dir . 'MAB2RecordTransformer.php';
3638 $wgAutoloadClasses['DataTransclusionSource'] = $dir . 'DataTransclusionSource.php';
3739 $wgAutoloadClasses['CachingDataTransclusionSource'] = $dir . 'DataTransclusionSource.php';
3840 $wgAutoloadClasses['FakeDataTransclusionSource'] = $dir . 'DataTransclusionSource.php';

Status & tagging log