r68832 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r68831‎ | r68832 | r68833 >
Date:14:50, 1 July 2010
Author:daniel
Status:deferred
Tags:
Comment:
better indexes for faster access
Modified paths:
  • /trunk/WikiWord/WikiWordWeb/src/main/maintenance/build-search-index.sh (modified) (history)
  • /trunk/WikiWord/WikiWordWeb/src/main/maintenance/resource-index.sql (added) (history)
  • /trunk/WikiWord/WikiWordWeb/src/main/maintenance/search-index-local.sql (deleted) (history)
  • /trunk/WikiWord/WikiWordWeb/src/main/maintenance/search-index.sql (modified) (history)
  • /trunk/WikiWord/WikiWordWeb/src/main/www/common/wwclient.php (modified) (history)
  • /trunk/WikiWord/WikiWordWeb/src/main/www/common/wwthesaurus.php (modified) (history)
  • /trunk/WikiWord/WikiWordWeb/src/main/www/wikipics/search.php (modified) (history)
  • /trunk/WikiWord/WikiWordWeb/src/main/www/wikiword/api.php (modified) (history)

Diff [purge]

Index: trunk/WikiWord/WikiWordWeb/src/main/maintenance/search-index-local.sql
@@ -1,16 +0,0 @@
2 -insert into {collection}_{thesaurus}_search_index (
3 - concept, concept_name, `type`,
4 - `lang`, `term`, `score`, `norm` )
5 -select O.global_concept, M.concept_name, C.type, "{lang}",
6 - REPLACE( LCASE( CAST(M.term_text as CHAR CHARACTER SET utf8) COLLATE utf8_general_ci ), "-", "" ),
7 - M.rule * M.freq, 1
8 -from {collection}_{lang}_meaning as M
9 -join {collection}_{thesaurus}_origin as O on O.lang = "{lang}" and O.local_concept = M.concept
10 -join {collection}_{thesaurus}_concept as C on C.id = O.global_concept
11 -where (M.rule not in (10, 30) OR M.freq > 1) and C.type > 0
12 -on duplicate key update
13 - score = if (score > values(score), score, values(score)),
14 - norm = if (norm < values(norm), score, values(norm));
15 -
\ No newline at end of file
Index: trunk/WikiWord/WikiWordWeb/src/main/maintenance/build-search-index.sh
@@ -4,12 +4,9 @@
55 db="$1"
66 collection="$2"
77 thesaurus="$3"
8 -languages="en de fr nl it es pt pl"
98
10 -echo "preparing search index"
 9+echo "building search index"
1110 replace '{collection}' "$collection" '{thesaurus}' "$thesaurus" < search-index.sql | mysql "$db"
1211
13 -for n in $languages; do
14 - echo "collection search index: $n"
15 - replace '{collection}' "$collection" '{thesaurus}' "$thesaurus" '{lang}' "$n" < search-index-local.sql | mysql "$db"
16 -done
 12+echo "building resource index"
 13+replace '{collection}' "$collection" '{thesaurus}' "$thesaurus" < resource-index.sql | mysql "$db"
Index: trunk/WikiWord/WikiWordWeb/src/main/maintenance/search-index.sql
@@ -11,3 +11,19 @@
1212 ) ENGINE=MyISAM DEFAULT CHARSET=utf8 COLLATE=utf8_bin;
1313
1414 truncate {collection}_{thesaurus}_search_index;
 15+
 16+-- collect definitions
 17+insert into {collection}_{thesaurus}_search_index (
 18+ concept, concept_name, `type`, `lang`, `term`, `score`, `norm` )
 19+select C.id, O.local_concept_name, C.type, M.lang,
 20+ REPLACE( LCASE( CAST(M.term_text as CHAR CHARACTER SET utf8) COLLATE utf8_general_ci ), "-", "" ),
 21+ M.rule * M.freq, 1
 22+from {collection}_{thesaurus}_meaning as M
 23+join {collection}_{thesaurus}_concept as C on C.id = M.concept
 24+join {collection}_{thesaurus}_origin as O on O.global_concept = M.concept and O.lang = M.lang -- FIXME: remove this once the global menaing table contains the local concept name
 25+where (M.rule not in (10, 30) OR M.freq > 1) and C.type > 0
 26+on duplicate key update
 27+ score = if (score > values(score), score, values(score)),
 28+ norm = if (norm < values(norm), score, values(norm));
 29+
 30+-- FIXME: normalization levels! 0=none, 1=case-and-dash (+translit?), 2=whitespace-and-punctuation, 4=soundex
\ No newline at end of file
Index: trunk/WikiWord/WikiWordWeb/src/main/maintenance/resource-index.sql
@@ -0,0 +1,14 @@
 2+create table if not exists {collection}_{thesaurus}_resource_index (
 3+ concept int(11) NOT NULL,
 4+ resources MEDIUMBLOB int(11) NOT NULL,
 5+ PRIMARY KEY ( concept ),
 6+ ) ENGINE=MyISAM DEFAULT CHARSET=utf8 COLLATE=utf8_bin;
 7+
 8+truncate {collection}_{thesaurus}_resource_index;
 9+
 10+-- collect ressources in all languages
 11+insert into {collection}_{thesaurus}_resource_index (
 12+ concept, resources
 13+select concept, group_concat(distinct concat(type, ":", lang, ":", local_resource_name) separator "|" ) as resources
 14+from {collection}_{thesaurus}_about as A
 15+where type > 0;
Index: trunk/WikiWord/WikiWordWeb/src/main/www/wikiword/api.php
@@ -10,16 +10,19 @@
1111 if ( $query ) {
1212 $lang = @$_REQUEST['lang'];
1313 $qlang = @$_REQUEST['qlang'];
 14+ $rclang = @$_REQUEST['rclang'];
1415 $format = @$_REQUEST['format'];
1516 if ( !$format ) $format = 'phps';
1617
1718 if ($lang) $lang = preg_replace('[^\\w\\d_]', '', $lang);
1819 if ($qlang) $qlang = preg_replace('[^\\w\\d_]', '', $qlang);
 20+
 21+ if ($rclang) $rclang = preg_replace('[^\\w\\d_]', '', $rclang);
1922
2023 if ($lang) {
21 - $lang = preg_split('![\\s,;|/:]\\s*!', $lang);
22 - if ( !$qlang ) $qlang = $lang[0];
23 - if (count($lang) == 1) $lang = $lang[0];
 24+ $lang = preg_split('![\\s,;|/:]\\s*!', $lang);
 25+ if ( !$qlang ) $qlang = $lang[0];
 26+ if (count($lang) == 1) $lang = $lang[0];
2427 }
2528
2629 $result = array( 'query' => $query );
@@ -40,29 +43,30 @@
4144
4245 if ( $qlang === null ) $result['error'] = array('code' => 150, 'message' => "missing parameter qlang");
4346 else if ( $term !== null ) {
44 - $result['concepts'] = $thesaurus->getConceptsForTerm($qlang, $term, $lang, $norm); #TODO: limit!
45 - if ( $result['concepts'] === false || $result['concepts'] === null ) {
46 - $result['error'] = array('code' => 210, 'message' => "failed to retrieve concepts for term $langt:$term");
47 - }
 47+ $result['concepts'] = $thesaurus->getConceptsForTerm($qlang, $term, $lang, $norm, $rclang); #TODO: limit!
 48+ if ( $result['concepts'] === false || $result['concepts'] === null ) {
 49+ $result['error'] = array('code' => 210, 'message' => "failed to retrieve concepts for term $langt:$term");
 50+ }
4851 } /*else if ( $page !== null ) {
4952 $result['concepts'] = $thesaurus->getConceptsForPage($lang, $page);
5053 if ( $result['concepts'] === false || $result['concepts'] === null ) {
5154 $result['error'] = array('code' => 250, 'message' => "failed to retrieve concepts for page $langt:$page");
5255 }
5356 } */else {
54 - $result['error'] = array('code' => 110, 'message' => "missing parameter term");
 57+ $result['error'] = array('code' => 110, 'message' => "missing parameter term");
5558 }
5659 } else if ($query == 'concept' || $query == 'info') {
5760 $gcid = @$_REQUEST['gcid'];
5861 if (!$gcid) $gcid = @$_REQUEST['id'];
5962
6063 if ( $gcid === null ) {
61 - $result['error'] = array('code' => 120, 'message' => "missing parameter gcid");
 64+ $result['error'] = array('code' => 120, 'message' => "missing parameter gcid");
6265 } else {
63 - $result['concept'] = $thesaurus->getConceptInfo($gcid, $lang); #TODO: limit!
64 - if ( $result['concept'] === false || $result['concept'] === null ) {
65 - $result['error'] = array('code' => 210, 'message' => "concept not found: $gcid");
66 - }
 66+ $fields = null;
 67+ $result['concept'] = $thesaurus->getConceptInfo($gcid, $lang, $fields, $rclang); #TODO: limit!
 68+ if ( $result['concept'] === false || $result['concept'] === null ) {
 69+ $result['error'] = array('code' => 210, 'message' => "concept not found: $gcid");
 70+ }
6771 }
6872 } else if ($query == 'properties') {
6973 $gcid = @$_REQUEST['gcid'];
Index: trunk/WikiWord/WikiWordWeb/src/main/www/wikipics/search.php
@@ -200,7 +200,7 @@
201201 $mode = NULL;
202202 $result = NULL;
203203
204 -$fallback_languages = array( "en", "commons" ); #TODO: make the user define this list
 204+$fallback_languages = array( "en" ); #TODO: make the user define this list
205205
206206 if ( $lang ) {
207207 $languages = explode( '|', $lang );
@@ -222,11 +222,11 @@
223223 try {
224224 if ($lang && $conceptId) {
225225 $mode = "concept";
226 - $result = $thesaurus->getConceptInfo($conceptId, $allLanguages);
 226+ $result = $thesaurus->getConceptInfo($conceptId, $lang, null, $allLanguages);
227227 if ( $result ) $result = array( $result ); //hack
228228 } else if ($lang && $term) {
229229 $mode = "term";
230 - $result = $thesaurus->getConceptsForTerm($lang, $term, $allLanguages, $norm, $limit);
 230+ $result = $thesaurus->getConceptsForTerm($lang, $term, $languages, $norm, $allLanguages, $limit);
231231 }
232232 } catch (Exception $e) {
233233 $error = $e->getMessage();
@@ -237,7 +237,8 @@
238238 if (!isset($scriptPath)) $scriptPath = "./";
239239 if (!isset($skinPath)) $skinPath = "$scriptPath/../skin/";
240240
241 -if ( $format == "atom" || $format == "xml" || $format == "opensearch" ) include("response.atom.php");
242 -else include("response.html.php");
 241+/*if ( $format == "atom" || $format == "xml" || $format == "opensearch" ) include("response.atom.php");
 242+else*/
 243+include("response.html.php");
243244
244245 $utils->close();
Index: trunk/WikiWord/WikiWordWeb/src/main/www/common/wwclient.php
@@ -94,11 +94,13 @@
9595 return $p['scores'];
9696 }*/
9797
98 - function getConceptInfo( $id, $lang = null ) {
 98+ function getConceptInfo( $id, $lang = null, $fields = null, $rclang = null ) {
9999 $param = array(
100100 'query' => 'info',
101101 'gcid' => $id,
102 - 'lang' => $lang
 102+ 'lang' => $lang,
 103+ 'fields' => fields,
 104+ 'rclang' => $rclang
103105 );
104106
105107 $rs = $this->query( $param );
@@ -132,12 +134,13 @@
133135 return $rs;
134136 }*/
135137
136 - function getConceptsForTerm( $qlang, $term, $languages, $norm = 1, $limit = 100 ) {
 138+ function getConceptsForTerm( $qlang, $term, $languages, $norm = 1, $rclang = null, $limit = 100 ) {
137139 if ( is_array( $languages ) ) $languages = implode('|', $languages);
138140
139141 $param = array(
140142 'query' => 'concepts',
141143 'qlang' => $qlang,
 144+ 'rclang' => $rclang,
142145 'lang' => $languages,
143146 'norm' => $norm,
144147 'term' => $term,
Index: trunk/WikiWord/WikiWordWeb/src/main/www/common/wwthesaurus.php
@@ -72,20 +72,26 @@
7373 return $s;
7474 }
7575
76 - function queryConceptsForTerm($qlang, $term, $languages, $norm = 1, $limit = 100) {
 76+ function queryConceptsForTerm($qlang, $term, $languages, $norm = 1, $rclang = null, $limit = 100) {
7777 global $wwTablePrefix, $wwThesaurusDataset, $wwLanguages;
7878
7979 if ( !$languages ) $languages = array_keys( $wwLanguages );
8080
8181 $term = $this->normalizeSearchString($term, $norm);
8282
83 - $sql = "SELECT I.*, S.score FROM {$wwTablePrefix}_{$wwThesaurusDataset}_concept_info as I"
84 - . " JOIN {$wwTablePrefix}_{$wwThesaurusDataset}_search_index as S ON I.concept = S.concept "
85 - . " WHERE term = " . $this->quote($term)
 83+ $sql = "SELECT I.*, S.score ";
 84+ if ( $rclang ) $sql .= ", R.resources "
 85+
 86+ $sql .= " FROM {$wwTablePrefix}_{$wwThesaurusDataset}_concept_info as I ";
 87+ $sql .= " JOIN {$wwTablePrefix}_{$wwThesaurusDataset}_search_index as S ON I.concept = S.concept ";
 88+ if ( $rclang ) $sql .= " JOIN {$wwTablePrefix}_{$wwThesaurusDataset}_resource_index as R ON R.concept = I.concept "
 89+
 90+ $sql .= " WHERE term = " . $this->quote($term)
8691 . " AND I.lang IN " . $this->quoteSet($languages)
8792 . " AND S.lang = " . $this->quote($qlang)
88 - . " AND S.norm <= " . (int)$norm
89 - . " ORDER BY S.score DESC, S.concept "
 93+ . " AND S.norm <= " . (int)$norm;
 94+
 95+ $sql .= " ORDER BY S.score DESC, S.concept "
9096 . " LIMIT " . (int)$limit;
9197
9298 #FIXME: query-lang vs. output-languages!
@@ -93,8 +99,8 @@
94100 return $this->query($sql);
95101 }
96102
97 - function getConceptsForTerm($qlang, $term, $languages, $norm = 1, $limit = 100) {
98 - $rs = $this->queryConceptsForTerm($qlang, $term, $languages);
 103+ function getConceptsForTerm($qlang, $term, $languages, $norm = 1, $rclang = null, $limit = 100) {
 104+ $rs = $this->queryConceptsForTerm($qlang, $term, $languages, $norm, $rclang, $limit);
99105 $list = WWUtils::slurpRows($rs);
100106 mysql_free_result($rs);
101107 return $this->buildConcepts($list);
@@ -243,6 +249,23 @@
244250 }
245251 */
246252
 253+ function spliceResources( $rc, &$into ) {
 254+ if (!$rc) return;
 255+
 256+ if (is_string($rc)) {
 257+ $rr = explode("|", $rc);
 258+
 259+ $rc = array();
 260+ foreach ($rr as $r) {
 261+ list($t, $lang, $n) = explode(":", $p, 3);
 262+ $rc[$lang][$n] = (int)$t;
 263+ }
 264+ }
 265+
 266+ if (!$into) $into = $rc;
 267+ else $into = array_merge( $into, $rc );
 268+ }
 269+
247270 function splitPages( $s ) {
248271 $pp = explode("|", $s);
249272
@@ -269,7 +292,7 @@
270293 }
271294
272295 /////////////////////////////////////////////////////////
273 - function getConceptInfo( $id, $lang = null, $fields = null ) {
 296+ function getConceptInfo( $id, $lang = null, $fields = null, $rclang = null ) {
274297 global $wwTablePrefix, $wwThesaurusDataset;
275298
276299 #TODO: concept cache!
@@ -279,9 +302,14 @@
280303
281304 #TODO: scores, concept-type, ...
282305
283 - $sql = "SELECT $fields FROM {$wwTablePrefix}_{$wwThesaurusDataset}_concept_info "
284 - . " WHERE concept = ".(int)$id;
 306+ $sql = "SELECT $fields ";
 307+ if ( $rclang ) $sql .= ", R.resources ";
 308+
 309+ $sql .= " FROM {$wwTablePrefix}_{$wwThesaurusDataset}_concept_info as I ";
 310+ if ( $rclang ) $sql .= " JOIN {$wwTablePrefix}_{$wwThesaurusDataset}_resource_index as R ON R.concept = I.concept ";
285311
 312+ $sql .= " WHERE concept = ".(int)$id;
 313+
286314 if ($lang) {
287315 if ( is_array($lang) ) $sql .= " AND lang IN " . $this->quoteSet($lang);
288316 else $sql .= " AND lang = " . $this->quote($lang);
@@ -352,8 +380,10 @@
353381
354382 if (@$row["name"] !== null) $concept["name"][$lang] = $row["name"];
355383 if (@$row["definition"] !== null) $concept["definition"][$lang] = $row["definition"];
356 - if (@$row["pages"] !== null) $concept["pages"][$lang] = $this->splitPages($row["pages"]);
357384
 385+ if (@$row["resources"]) $this->spliceResources($row["resources"], &$concept["pages"]) );
 386+ else if (@$row["pages"]) $concept["pages"][$lang] = $this->splitPages($row["pages"]);
 387+
358388 if (@$row["broader"] !== null) $broader[$lang] = $this->splitConcepts($row["broader"]);
359389 if (@$row["narrower"] !== null) $narrower[$lang] = $this->splitConcepts($row["narrower"]);
360390 if (@$row["similar"] !== null) $similar[$lang] = $this->splitConcepts($row["similar"]);

Status & tagging log