r62871 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r62870‎ | r62871 | r62872 >
Date:11:34, 23 February 2010
Author:daniel
Status:deferred
Tags:
Comment:
wikiword search index
Modified paths:
  • /trunk/WikiWord/WikiWord/src/main/php/build-search-index.sh (added) (history)
  • /trunk/WikiWord/WikiWord/src/main/php/concept-info.sql (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/php/search-index-local.sql (added) (history)
  • /trunk/WikiWord/WikiWord/src/main/php/search-index.sql (added) (history)
  • /trunk/WikiWord/WikiWord/src/main/php/wwthesaurus.php (modified) (history)

Diff [purge]

Index: trunk/WikiWord/WikiWord/src/main/php/build-search-index.sh
@@ -0,0 +1,15 @@
 2+#!/bin/bash
 3+set -e
 4+
 5+db="$1"
 6+collection="$2"
 7+thesaurus="$3"
 8+languages="en de fr nl it es pt pl"
 9+
 10+echo "preparing search index"
 11+replace '{collection}' "$collection" '{thesaurus}' "$thesaurus" < search-index.sql | mysql "$db"
 12+
 13+for n in $languages; do
 14+ echo "collection search index: $n"
 15+ replace '{collection}' "$collection" '{thesaurus}' "$thesaurus" '{lang}' "$n" < search-index-local.sql | mysql "$db"
 16+done
Property changes on: trunk/WikiWord/WikiWord/src/main/php/build-search-index.sh
___________________________________________________________________
Name: svn:mergeinfo
117 +
Index: trunk/WikiWord/WikiWord/src/main/php/search-index-local.sql
@@ -0,0 +1,16 @@
 2+-- collect definitions
 3+insert into {collection}_{thesaurus}_search_index (
 4+ concept, concept_name, `type`,
 5+ `lang`, `term`, `score`, `norm` )
 6+select O.global_concept, M.concept_name, C.type, "{lang}",
 7+ REPLACE( LCASE( CAST(M.term_text as CHAR CHARACTER SET utf8) COLLATE utf8_general_ci ), "-", "" ),
 8+ M.rule * M.freq, 1
 9+from {collection}_{lang}_meaning as M
 10+join {collection}_{thesaurus}_origin as O on O.lang = "{lang}" and O.local_concept = M.concept
 11+join {collection}_{thesaurus}_concept as C on C.id = O.global_concept
 12+where (M.rule not in (10, 30) OR M.freq > 1) and C.type > 0
 13+on duplicate key update
 14+ score = if (score > values(score), score, values(score)),
 15+ norm = if (norm < values(norm), score, values(norm));
 16+
 17+-- FIXME: normalization levels! 0=none, 1=case-and-dash (+translit?), 2=whitespace-and-punctuation, 4=soundex
\ No newline at end of file
Property changes on: trunk/WikiWord/WikiWord/src/main/php/search-index-local.sql
___________________________________________________________________
Name: svn:mergeinfo
118 +
Index: trunk/WikiWord/WikiWord/src/main/php/concept-info.sql
@@ -13,8 +13,8 @@
1414 PRIMARY KEY ( concept, lang )
1515 ) ENGINE=MyISAM DEFAULT CHARSET=utf8 COLLATE=utf8_bin;
1616
 17+truncate {collection}_{thesaurus}_concept_info;
 18+
1719 insert into {collection}_{thesaurus}_concept_info ( concept, lang, name )
1820 select global_concept, lang, local_concept_name
1921 from {collection}_{thesaurus}_origin;
20 -
21 -truncate {collection}_{thesaurus}_concept_info;
\ No newline at end of file
Index: trunk/WikiWord/WikiWord/src/main/php/wwthesaurus.php
@@ -62,18 +62,22 @@
6363
6464 class WWThesaurus extends WWUTils {
6565
66 - function queryConceptsForTerm($lang, $term, $limit = 100) {
 66+ function queryConceptsForTerm($lang, $term, $norm = 3, $limit = 100) { // NOTE(review): $norm was inserted before $limit, so a caller passing $limit as the 3rd positional argument now sets $norm instead - confirm all call sites
6767 global $wwTablePrefix, $wwThesaurusDataset;
6868
69 - $term = trim($term);
 69+ $term = $this->normalize($term, $norm); // replaces the old trim(); normalize() is defined outside this hunk
7070
71 - $sql = "SELECT O.global_concept as id, M.*, O.*, definition FROM {$wwTablePrefix}_{$lang}_meaning as M"
72 - . " LEFT JOIN {$wwTablePrefix}_{$lang}_definition as D ON M.concept = D.concept "
73 - . " JOIN {$wwTablePrefix}_{$wwThesaurusDataset}_origin as O ON O.lang = \"" . mysql_real_escape_string($lang) . "\" AND M.concept = O.local_concept "
74 - . " WHERE term_text = \"" . mysql_real_escape_string($term) . "\""
75 - . " ORDER BY freq DESC "
 71+ $sql = "SELECT I.* FROM {$wwTablePrefix}_{$wwThesaurusDataset}_concept_info as I"
 72+ . " JOIN {$wwTablePrefix}_{$wwThesaurusDataset}_search_index as S ON I.concept = S.concept and I.lang = S.lang"
 73+ . " WHERE term = " . $this->quote($term) // exact match of the normalized term against the index
 74+ . " AND I.lang = " . $this->quote($lang)
 75+ . " AND S.lang = " . $this->quote($lang)
 76+ . " AND S.norm <= " . (int)$norm // only index rows whose norm value does not exceed the requested $norm
 77+ . " ORDER BY S.score DESC " // highest-scoring index rows first
7678 . " LIMIT " . (int)$limit;
7779
 80+ #FIXME: query-lang vs. output-languages!
 81+
7882 return $this->query($sql); // hands back whatever $this->query() yields for the statement
7983 }
8084
@@ -81,9 +85,10 @@
8286 $rs = $this->queryConceptsForTerm($lang, $term);
8387 $list = WWUtils::slurpRows($rs);
8488 mysql_free_result($rs);
85 - return $list;
 89+ return $this->buildConcepts($rs);
8690 }
8791
 92+ /*
8893 function queryConceptsForPage($lang, $page, $limit = 100) {
8994 global $wwTablePrefix, $wwThesaurusDataset;
9095
@@ -115,7 +120,7 @@
116121
117122 function getLocalConcepts($id) { //NOTE: deprecated alias for backward compat
118123 return getPagesForConcept($id);
119 - }
 124+ } */
120125
121126 /*
122127 function queryLocalConceptInfo($lang, $id) {
@@ -142,7 +147,7 @@
143148 return $this->query($sql);
144149 }*/
145150
146 - function getConceptInfo( $id, $lang = null ) {
 151+ /*function getConceptInfo( $id, $lang = null ) {
147152 $result = $this->getConcept($id, $lang);
148153
149154 $result['broader'] = $this->getBroaderForConcept($id);
@@ -155,8 +160,9 @@
156161 }
157162
158163 return $result;
159 - }
 164+ }*/
160165
 166+ /*
161167 function unpickle($s, $lang, $hasId=true, $hasName=true, $hasConf=true) {
162168 $ss = explode("\x1E", $s);
163169 $items = array();
@@ -218,6 +224,7 @@
219225
220226 return $names;
221227 }
 228+ */
222229
223230 function splitPages( $s ) {
224231 $pp = explode("|", $s);
@@ -262,15 +269,39 @@
263270 }
264271
265272 $r = $this->getRows($sql);
 273+ if (!$r) return false;
266274
267 - $rs = $this->query($sql);
268 - if (!$rs) return false;
 275+ return $this->buildConcept($r);
 276+ }
269277
 278+ function buildConcepts($rows) {
 279+ $concepts = array();
 280+ $buff = array();
 281+ $id = null;
 282+ foreach($rows as $row) {
 283+ if ( $id !== null && $id != $row['concept'] ) {
 284+ if ($buff) {
 285+ $concepts[$id] = $this->buildConcept($buff);
 286+ $buff = array();
 287+ }
 288+
 289+ $id = null;
 290+ }
 291+
 292+ if ($id === null) $id = $row['concept'];
 293+ $buff[] = $row;
 294+ }
 295+
 296+ return $concepts;
 297+ }
 298+
 299+ function buildConcept($rows) {
270300 $concept = array();
271 - $concept["id"] = $id;
272301 $concept["languages"] = array();
273302
274 - while ($row = mysql_fetch_assoc($rs)) {
 303+ foreach ($rows as $row) {
 304+ if (!isset($concept["id"])) = $row["concept"];
 305+
275306 $lang = $row["lang"];
276307 $concept["languages"][] = $lang;
277308
@@ -296,8 +327,6 @@
297328 if (isset($concept["similar"]["*"])) $concept["similar"]["*"] = array_unique($concept["similar"]["*"]);
298329 if (isset($concept["broader"]["*"])) $concept["related"]["*"] = array_unique($concept["related"]["*"]);
299330
300 - mysql_free_result($rs);
301 -
302331 return $concept;
303332 }
304333
Index: trunk/WikiWord/WikiWord/src/main/php/search-index.sql
@@ -0,0 +1,13 @@
 2+create table if not exists {collection}_{thesaurus}_search_index (
 3+ concept int(11) NOT NULL,
 4+ concept_name varbinary(255) NOT NULL,
 5+ type int(11) NOT NULL,
 6+ `lang` varbinary(10) NOT NULL,
 7+ `term` varchar(255) character set utf8 collate utf8_general_ci NOT NULL,
 8+ `score` int NOT NULL,
 9+ `norm` int NOT NULL,
 10+ PRIMARY KEY ( lang, term, concept ),
 11+ KEY ( concept, lang )
 12+ ) ENGINE=MyISAM DEFAULT CHARSET=utf8 COLLATE=utf8_bin;
 13+
 14+truncate {collection}_{thesaurus}_search_index;
Property changes on: trunk/WikiWord/WikiWord/src/main/php/search-index.sql
___________________________________________________________________
Name: svn:mergeinfo
115 +

Status & tagging log