Index: trunk/WikiWord/WikiWord/src/main/php/build-search-index.sh |
— | — | @@ -0,0 +1,15 @@ |
| 2 | +#!/bin/bash |
| 3 | +set -eo pipefail |
| 4 | +# Rebuilds the search index: runs search-index.sql once, then search-index-local.sql once per language. |
| 5 | +db="${1:?usage: build-search-index.sh <db> <collection> <thesaurus>}" |
| 6 | +collection="${2:?usage: build-search-index.sh <db> <collection> <thesaurus>}" |
| 7 | +thesaurus="${3:?usage: build-search-index.sh <db> <collection> <thesaurus>}" |
| 8 | +languages="en de fr nl it es pt pl" |
| 9 | +# pipefail makes a failing `replace` (or an unreadable .sql file) abort the run instead of being masked by mysql's exit status |
| 10 | +echo "preparing search index" |
| 11 | +replace '{collection}' "$collection" '{thesaurus}' "$thesaurus" < search-index.sql | mysql "$db" |
| 12 | + |
| 13 | +for n in $languages; do |
| 14 | + echo "collection search index: $n" |
| 15 | + replace '{collection}' "$collection" '{thesaurus}' "$thesaurus" '{lang}' "$n" < search-index-local.sql | mysql "$db" |
| 16 | +done |
Property changes on: trunk/WikiWord/WikiWord/src/main/php/build-search-index.sh |
___________________________________________________________________ |
Name: svn:mergeinfo |
1 | 17 | + |
Index: trunk/WikiWord/WikiWord/src/main/php/search-index-local.sql |
— | — | @@ -0,0 +1,16 @@ |
| 2 | +-- collect the terms of one language into the search index, lower-cased and with "-" removed (stored as norm 1); on a duplicate key keep the highest score and the lowest norm |
| 3 | +insert into {collection}_{thesaurus}_search_index ( |
| 4 | + concept, concept_name, `type`, |
| 5 | + `lang`, `term`, `score`, `norm` ) |
| 6 | +select O.global_concept, M.concept_name, C.type, "{lang}", |
| 7 | + REPLACE( LCASE( CAST(M.term_text as CHAR CHARACTER SET utf8) COLLATE utf8_general_ci ), "-", "" ), |
| 8 | + M.rule * M.freq, 1 |
| 9 | +from {collection}_{lang}_meaning as M |
| 10 | +join {collection}_{thesaurus}_origin as O on O.lang = "{lang}" and O.local_concept = M.concept |
| 11 | +join {collection}_{thesaurus}_concept as C on C.id = O.global_concept |
| 12 | +where (M.rule not in (10, 30) OR M.freq > 1) and C.type > 0 |
| 13 | +on duplicate key update |
| 14 | + score = if (score > values(score), score, values(score)), |
| 15 | + norm = if (norm < values(norm), norm, values(norm)); |
| 16 | + |
| 17 | +-- FIXME: normalization levels! 0=none, 1=case-and-dash (+translit?), 2=whitespace-and-punctuation, 4=soundex |
\ No newline at end of file |
Property changes on: trunk/WikiWord/WikiWord/src/main/php/search-index-local.sql |
___________________________________________________________________ |
Name: svn:mergeinfo |
1 | 18 | + |
Index: trunk/WikiWord/WikiWord/src/main/php/concept-info.sql |
— | — | @@ -13,8 +13,8 @@ |
14 | 14 | PRIMARY KEY ( concept, lang ) |
15 | 15 | ) ENGINE=MyISAM DEFAULT CHARSET=utf8 COLLATE=utf8_bin; |
16 | 16 | |
| 17 | +truncate {collection}_{thesaurus}_concept_info; |
| 18 | + |
17 | 19 | insert into {collection}_{thesaurus}_concept_info ( concept, lang, name ) |
18 | 20 | select global_concept, lang, local_concept_name |
19 | 21 | from {collection}_{thesaurus}_origin; |
20 | | - |
21 | | -truncate {collection}_{thesaurus}_concept_info; |
\ No newline at end of file |
Index: trunk/WikiWord/WikiWord/src/main/php/wwthesaurus.php |
— | — | @@ -62,18 +62,22 @@ |
63 | 63 | |
64 | 64 | class WWThesaurus extends WWUTils { |
65 | 65 | |
66 | | - function queryConceptsForTerm($lang, $term, $limit = 100) { |
| 66 | + function queryConceptsForTerm($lang, $term, $norm = 3, $limit = 100) { |
67 | 67 | global $wwTablePrefix, $wwThesaurusDataset; |
68 | 68 | |
69 | | - $term = trim($term); |
| 69 | + $term = $this->normalize($term, $norm); |
70 | 70 | |
71 | | - $sql = "SELECT O.global_concept as id, M.*, O.*, definition FROM {$wwTablePrefix}_{$lang}_meaning as M" |
72 | | - . " LEFT JOIN {$wwTablePrefix}_{$lang}_definition as D ON M.concept = D.concept " |
73 | | - . " JOIN {$wwTablePrefix}_{$wwThesaurusDataset}_origin as O ON O.lang = \"" . mysql_real_escape_string($lang) . "\" AND M.concept = O.local_concept " |
74 | | - . " WHERE term_text = \"" . mysql_real_escape_string($term) . "\"" |
75 | | - . " ORDER BY freq DESC " |
| 71 | + $sql = "SELECT I.* FROM {$wwTablePrefix}_{$wwThesaurusDataset}_concept_info as I" |
| 72 | + . " JOIN {$wwTablePrefix}_{$wwThesaurusDataset}_search_index as S ON I.concept = S.concept and I.lang = S.lang" |
| 73 | + . " WHERE term = " . $this->quote($term) |
| 74 | + . " AND I.lang = " . $this->quote($lang) |
| 75 | + . " AND S.lang = " . $this->quote($lang) |
| 76 | + . " AND S.norm <= " . (int)$norm |
| 77 | + . " ORDER BY S.score DESC " |
76 | 78 | . " LIMIT " . (int)$limit; |
77 | 79 | |
| 80 | + #FIXME: query-lang vs. output-languages! (I.lang and S.lang are both pinned to $lang, so only concept_info rows in the query language come back) |
| 81 | + # Summary: looks up the normalized $term in the search index (entries with norm <= $norm) and returns the query() result, highest score first, at most $limit rows. NOTE(review): $norm was inserted before $limit, so callers passing $limit positionally now set $norm instead - confirm call sites. |
78 | 82 | return $this->query($sql); |
79 | 83 | } |
80 | 84 | |
— | — | @@ -81,9 +85,10 @@ |
82 | 86 | $rs = $this->queryConceptsForTerm($lang, $term); |
83 | 87 | $list = WWUtils::slurpRows($rs); |
84 | 88 | mysql_free_result($rs); |
85 | | - return $list; |
| 89 | + return $this->buildConcepts($list); // pass the slurped rows: $rs has already been freed above and is not a row list |
86 | 90 | } |
87 | 91 | |
| 92 | + /* |
88 | 93 | function queryConceptsForPage($lang, $page, $limit = 100) { |
89 | 94 | global $wwTablePrefix, $wwThesaurusDataset; |
90 | 95 | |
— | — | @@ -115,7 +120,7 @@ |
116 | 121 | |
117 | 122 | function getLocalConcepts($id) { //NOTE: deprecated alias for backward compat |
118 | 123 | return getPagesForConcept($id); |
119 | | - } |
| 124 | + } */ |
120 | 125 | |
121 | 126 | /* |
122 | 127 | function queryLocalConceptInfo($lang, $id) { |
— | — | @@ -142,7 +147,7 @@ |
143 | 148 | return $this->query($sql); |
144 | 149 | }*/ |
145 | 150 | |
146 | | - function getConceptInfo( $id, $lang = null ) { |
| 151 | + /*function getConceptInfo( $id, $lang = null ) { |
147 | 152 | $result = $this->getConcept($id, $lang); |
148 | 153 | |
149 | 154 | $result['broader'] = $this->getBroaderForConcept($id); |
— | — | @@ -155,8 +160,9 @@ |
156 | 161 | } |
157 | 162 | |
158 | 163 | return $result; |
159 | | - } |
| 164 | + }*/ |
160 | 165 | |
| 166 | + /* |
161 | 167 | function unpickle($s, $lang, $hasId=true, $hasName=true, $hasConf=true) { |
162 | 168 | $ss = explode("\x1E", $s); |
163 | 169 | $items = array(); |
— | — | @@ -218,6 +224,7 @@ |
219 | 225 | |
220 | 226 | return $names; |
221 | 227 | } |
| 228 | + */ |
222 | 229 | |
223 | 230 | function splitPages( $s ) { |
224 | 231 | $pp = explode("|", $s); |
— | — | @@ -262,15 +269,39 @@ |
263 | 270 | } |
264 | 271 | |
265 | 272 | $r = $this->getRows($sql); |
| 273 | + if (!$r) return false; |
266 | 274 | |
267 | | - $rs = $this->query($sql); |
268 | | - if (!$rs) return false; |
| 275 | + return $this->buildConcept($r); |
| 276 | + } |
269 | 277 | |
| 278 | + function buildConcepts($rows) { |
| 279 | + // Groups adjacent rows sharing the same 'concept' value; returns array( concept id => buildConcept(group) ). NOTE(review): a concept whose rows are not adjacent keeps only its last group - confirm the row order. |
| 280 | + $concepts = array(); |
| 281 | + $buff = array(); |
| 282 | + $id = null; |
| 283 | + foreach($rows as $row) { |
| 284 | + // the concept id changed: the buffered group is complete, so emit it |
| 285 | + if ( $id !== null && $id != $row['concept'] ) { |
| 286 | + $concepts[$id] = $this->buildConcept($buff); |
| 287 | + $buff = array(); |
| 288 | + } |
| 289 | + |
| 290 | + $id = $row['concept']; |
| 291 | + $buff[] = $row; |
| 292 | + } |
| 293 | + |
| 294 | + // emit the last group: no id change follows it, so the loop itself never does |
| 295 | + if ($buff) $concepts[$id] = $this->buildConcept($buff); |
| 296 | + return $concepts; |
| 297 | + } |
| 298 | + |
| 299 | + function buildConcept($rows) { |
270 | 300 | $concept = array(); |
271 | | - $concept["id"] = $id; |
272 | 301 | $concept["languages"] = array(); |
273 | 302 | |
274 | | - while ($row = mysql_fetch_assoc($rs)) { |
| 303 | + foreach ($rows as $row) { |
| 304 | + if (!isset($concept["id"])) $concept["id"] = $row["concept"]; // take the concept id from the first row; the assignment target was missing (parse error) |
| 305 | + |
275 | 306 | $lang = $row["lang"]; |
276 | 307 | $concept["languages"][] = $lang; |
277 | 308 | |
— | — | @@ -296,8 +327,6 @@ |
297 | 328 | if (isset($concept["similar"]["*"])) $concept["similar"]["*"] = array_unique($concept["similar"]["*"]); |
298 | 329 | if (isset($concept["related"]["*"])) $concept["related"]["*"] = array_unique($concept["related"]["*"]); // guard the key that is actually de-duplicated (was testing "broader") |
299 | 330 | |
300 | | - mysql_free_result($rs); |
301 | | - |
302 | 331 | return $concept; |
303 | 332 | } |
304 | 333 | |
Index: trunk/WikiWord/WikiWord/src/main/php/search-index.sql |
— | — | @@ -0,0 +1,13 @@ |
| 2 | +CREATE TABLE IF NOT EXISTS {collection}_{thesaurus}_search_index ( -- one row per (lang, term, concept) |
| 3 | + `concept` INT(11) NOT NULL, |
| 4 | + `concept_name` VARBINARY(255) NOT NULL, |
| 5 | + `type` INT(11) NOT NULL, |
| 6 | + `lang` VARBINARY(10) NOT NULL, |
| 7 | + `term` VARCHAR(255) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL, -- overrides the table's utf8_bin default for this column |
| 8 | + `score` INT NOT NULL, |
| 9 | + `norm` INT NOT NULL, |
| 10 | + PRIMARY KEY ( `lang`, `term`, `concept` ), |
| 11 | + KEY ( `concept`, `lang` ) |
| 12 | + ) ENGINE=MyISAM DEFAULT CHARSET=utf8 COLLATE=utf8_bin; |
| 13 | +-- empty the index so that every run of this script starts from a clean table |
| 14 | +TRUNCATE TABLE {collection}_{thesaurus}_search_index; |
Property changes on: trunk/WikiWord/WikiWord/src/main/php/search-index.sql |
___________________________________________________________________ |
Name: svn:mergeinfo |
1 | 15 | + |