Property changes on: branches/lucene-search-2.1/test/org/wikimedia/lsearch/spell/api |
___________________________________________________________________ |
Name: svn:ignore |
1 | 1 | + *.class |
Property changes on: branches/lucene-search-2.1/test/org/wikimedia/lsearch/spell |
___________________________________________________________________ |
Name: svn:ignore |
2 | 2 | + *.class |
Property changes on: branches/lucene-search-2.1/test/org/wikimedia/lsearch/test |
___________________________________________________________________ |
Name: svn:ignore |
3 | 3 | + *.class |
Property changes on: branches/lucene-search-2.1/test/org/wikimedia/lsearch/storage |
___________________________________________________________________ |
Name: svn:ignore |
4 | 4 | + *.class |
Property changes on: branches/lucene-search-2.1/test/org/wikimedia/lsearch/config |
___________________________________________________________________ |
Name: svn:ignore |
5 | 5 | + *.class |
Property changes on: branches/lucene-search-2.1/test/org/wikimedia/lsearch/search |
___________________________________________________________________ |
Name: svn:ignore |
6 | 6 | + *.class |
Property changes on: branches/lucene-search-2.1/test/org/wikimedia/lsearch/highlight |
___________________________________________________________________ |
Name: svn:ignore |
7 | 7 | + *.class |
Property changes on: branches/lucene-search-2.1/test/org/wikimedia/lsearch/index |
___________________________________________________________________ |
Name: svn:ignore |
8 | 8 | + *.class |
Index: branches/lucene-search-2.1/test/org/wikimedia/lsearch/analyzers/AnalysisTest.java |
— | — | @@ -41,7 +41,7 @@ |
42 | 42 | public class AnalysisTest extends WikiTestCase { |
43 | 43 | Analyzer a = null; |
44 | 44 | Configuration config = null; |
45 | | - |
| 45 | + |
46 | 46 | protected void setUp() throws Exception { |
47 | 47 | super.setUp(); |
48 | 48 | if(config == null){ |
— | — | @@ -49,7 +49,7 @@ |
50 | 50 | GlobalConfiguration.getInstance(); |
51 | 51 | } |
52 | 52 | } |
53 | | - |
| 53 | + |
54 | 54 | public void testCJKAnalyzer(){ |
55 | 55 | a = new CJKAnalyzer(); |
56 | 56 | assertEquals("[(いわ,0,2,type=double), (わさ,1,3,type=double), (さき,2,4,type=double), (ic,4,6,type=single), (カー,6,8,type=double), (ード,7,9,type=double)]",tokens("いわさきicカード")); |
— | — | @@ -69,35 +69,35 @@ |
70 | 70 | assertEquals("[(pokémons,0,8), (pokemons,0,8,posIncr=0), (pokemon,0,8,type=stemmed,posIncr=0)]",tokens("Pokémons")); |
71 | 71 | assertEquals("[(1990,0,4), (s,4,5), (iv,6,8)]",tokens("1990s IV")); |
72 | 72 | } |
73 | | - |
| 73 | + |
74 | 74 | public void testEnglishSearch(){ |
75 | 75 | a = Analyzers.getSearcherAnalyzer(IndexId.get("enwiki")); |
76 | 76 | commonEnglish(); |
77 | 77 | // acronyms don't get split |
78 | 78 | assertEquals("[(a.k.a,0,5), (aka,0,5,posIncr=0), (www,6,9), (google,10,16), (com,17,20)]",tokens("a.k.a www.google.com")); |
79 | 79 | } |
80 | | - |
| 80 | + |
81 | 81 | public void testEnglishIndex(){ |
82 | 82 | a = Analyzers.getIndexerAnalyzer(new FieldBuilder(IndexId.get("enwiki"))); |
83 | 83 | commonEnglish(); |
84 | 84 | // acronyms are always split |
85 | 85 | assertEquals("[(a.k.a,0,5), (aka,0,5,posIncr=0), (a,0,5,posIncr=0), (k,2,7,posIncr=0), (a,4,9,posIncr=0), (www,6,9), (google,10,16), (com,17,20)]",tokens("a.k.a www.google.com")); |
86 | 86 | } |
87 | | - |
| 87 | + |
88 | 88 | public void commonSerbian(){ |
89 | 89 | assertEquals("[(нешто,0,5), (nesto,0,5,type=alias,posIncr=0), (на,6,8), (na,6,8,type=alias,posIncr=0), (ћирилици,9,17), (cirilici,9,17,type=alias,posIncr=0)]",tokens("Нешто на ћирилици")); |
90 | 90 | } |
91 | | - |
| 91 | + |
92 | 92 | public void testSerbianSearch(){ |
93 | 93 | a = Analyzers.getSearcherAnalyzer(IndexId.get("srwiki")); |
94 | 94 | commonSerbian(); |
95 | 95 | } |
96 | | - |
| 96 | + |
97 | 97 | public void testSerbianIndex(){ |
98 | 98 | a = Analyzers.getIndexerAnalyzer(new FieldBuilder(IndexId.get("srwiki"))); |
99 | 99 | commonSerbian(); |
100 | 100 | } |
101 | | - |
| 101 | + |
102 | 102 | public String tokens(String text){ |
103 | 103 | try{ |
104 | 104 | return Arrays.toString(tokensFromAnalysis(a,text,"contents")); |
— | — | @@ -106,7 +106,7 @@ |
107 | 107 | return null; |
108 | 108 | } |
109 | 109 | } |
110 | | - |
| 110 | + |
111 | 111 | public static Token[] tokensFromAnalysis(Analyzer analyzer, String text, String field) throws IOException { |
112 | 112 | TokenStream stream = analyzer.tokenStream(field, text); |
113 | 113 | ArrayList tokenList = new ArrayList(); |
— | — | @@ -117,15 +117,15 @@ |
118 | 118 | } |
119 | 119 | return (Token[]) tokenList.toArray(new Token[0]); |
120 | 120 | } |
121 | | - |
122 | | - public static void displayTokens(Analyzer analyzer, String text) throws IOException { |
| 121 | + |
| 122 | + public static void displayTokens(Analyzer analyzer, String text) throws IOException { |
123 | 123 | Token[] tokens = tokensFromAnalysis(analyzer, text, "contents"); |
124 | 124 | System.out.println(text); |
125 | 125 | System.out.print(">> "); |
126 | 126 | print(tokens); |
127 | 127 | System.out.println(); |
128 | 128 | } |
129 | | - |
| 129 | + |
130 | 130 | protected static void print(Token[] tokens){ |
131 | 131 | for (int i = 0, j =0; i < tokens.length; i++, j++) { |
132 | 132 | Token token = tokens[i]; |
— | — | @@ -138,14 +138,14 @@ |
139 | 139 | System.out.println(); |
140 | 140 | j=0; |
141 | 141 | } |
142 | | - |
143 | | - } |
| 142 | + |
| 143 | + } |
144 | 144 | } |
145 | | - |
146 | | - public static void displayTokens2(Analyzer analyzer, String text) throws IOException { |
| 145 | + |
| 146 | + public static void displayTokens2(Analyzer analyzer, String text) throws IOException { |
147 | 147 | Token[] tokens = tokensFromAnalysis(analyzer, text, "contents"); |
148 | 148 | System.out.println(text); |
149 | | - System.out.print("contents >> "); |
| 149 | + System.out.print("contents >> "); |
150 | 150 | print(tokens); |
151 | 151 | System.out.println(); |
152 | 152 | tokens = tokensFromAnalysis(analyzer, text, "stemmed"); |
— | — | @@ -165,16 +165,16 @@ |
166 | 166 | System.out.println(); |
167 | 167 | } |
168 | 168 | } |
169 | | - |
| 169 | + |
170 | 170 | public static void main(String args[]) throws IOException, ParseException{ |
171 | 171 | Configuration.open(); |
172 | | - |
| 172 | + |
173 | 173 | //serializeTest(Analyzers.getHighlightAnalyzer(IndexId.get("enwiki"))); |
174 | 174 | //testAnalyzer(Analyzers.getHighlightAnalyzer(IndexId.get("enwiki")),"Aaliyah"); |
175 | | - |
| 175 | + |
176 | 176 | Analyzer aa = Analyzers.getSearcherAnalyzer(IndexId.get("wikilucene")); |
177 | 177 | displayTokens(aa,"boxes france"); |
178 | | - |
| 178 | + |
179 | 179 | HashSet<String> stopWords = new HashSet<String>(); |
180 | 180 | stopWords.add("the"); stopWords.add("of"); stopWords.add("is"); stopWords.add("in"); stopWords.add("and"); stopWords.add("he") ; |
181 | 181 | //Analyzer analyzer = Analyzers.getSpellCheckAnalyzer(IndexId.get("enwiki"),stopWords); |
— | — | @@ -186,7 +186,7 @@ |
187 | 187 | text = "a.k.a www.google.com Google's Pokémons links abacus something aries douglas adams boxes bands working s and Frame semantics (linguistics)"; |
188 | 188 | displayTokens(analyzer,text); |
189 | 189 | text = "a8n sli compatible compatibly Thomas c# c++ good-thomas Good-Thomas rats RATS Frame semantics (linguistics) 16th century sixteenth .fr web.fr other"; |
190 | | - displayTokens(analyzer,text); |
| 190 | + displayTokens(analyzer,text); |
191 | 191 | displayTokens(Analyzers.getSearcherAnalyzer(IndexId.get("zhwiki")),"末朝以來藩鎮割據and some plain english 和宦官亂政的現象 as well"); |
192 | 192 | displayTokens(analyzer,"Thomas Goode school"); |
193 | 193 | displayTokens(analyzer,"Agreement reply readily Gödel;"); |
— | — | @@ -200,7 +200,7 @@ |
201 | 201 | displayTokens(analyzer,"[[Image:Lawrence_Brainerd.jpg]], [[Image:Lawrence_Brainerd.jpg|thumb|300px|Lawrence Brainerd]]"); |
202 | 202 | displayTokens(analyzer,"{{Otheruses4|the Irish rock band|other uses|U2 (disambiguation)}}"); |
203 | 203 | displayTokens(analyzer,"{{Otheruses4|the Irish rock band|other uses|U2<ref>U2-ref</ref> (disambiguation)}} Let's see<ref>Seeing is...</ref> if template extraction works.\n==Some heading==\n And after that some text..\n\nAnd now? Not now. Then when? "); |
204 | | - |
| 204 | + |
205 | 205 | ArrayList<String> l = new ArrayList<String>(); |
206 | 206 | l.add("0:Douglas Adams|0:Someone"); |
207 | 207 | l.add("0:Someone"); |
— | — | @@ -208,27 +208,27 @@ |
209 | 209 | l.add(""); |
210 | 210 | l.add("0:Heu"); |
211 | 211 | displayTokens(new SplitAnalyzer(10,true),new StringList(l).toString()); |
212 | | - |
| 212 | + |
213 | 213 | analyzer = Analyzers.getSearcherAnalyzer(IndexId.get("viwiki")); |
214 | 214 | displayTokens(analyzer,"ä, ö, ü; Đ đViệt Nam Đ/đ ↔ D/d lastone"); |
215 | | - |
| 215 | + |
216 | 216 | analyzer = Analyzers.getSearcherAnalyzer(IndexId.get("dewiki")); |
217 | 217 | displayTokens(analyzer,"Gunzen ä, ö, ü; for instance, Ø ÓóÒò Goedel for Gödel; čakšire"); |
218 | | - |
| 218 | + |
219 | 219 | analyzer = Analyzers.getSearcherAnalyzer(IndexId.get("enwiki")); |
220 | 220 | displayTokens(analyzer," ä, ö, ü; for instance, Ø ÓóÒò Goedel for Gödel; čakšire"); |
221 | | - |
| 221 | + |
222 | 222 | analyzer = Analyzers.getSearcherAnalyzer(IndexId.get("srwiki")); |
223 | 223 | displayTokens(analyzer," ä, ö, ü; for instance, Ø ÓóÒò Goedel for Gödel; čakšire"); |
224 | | - |
| 224 | + |
225 | 225 | analyzer = Analyzers.getSearcherAnalyzer(IndexId.get("eswiki")); |
226 | 226 | displayTokens(analyzer,"lógico y matemático"); |
227 | | - |
| 227 | + |
228 | 228 | analyzer = Analyzers.getSearcherAnalyzer(IndexId.get("mlwiki")); |
229 | 229 | displayTokens(analyzer,"കൊറിയ,“കൊറിയ”"); |
230 | | - |
| 230 | + |
231 | 231 | printCodePoints("“കൊറിയ”"); |
232 | | - |
| 232 | + |
233 | 233 | QueryParser parser = new QueryParser("contents",new CJKAnalyzer()); |
234 | 234 | Query q = parser.parse("いわさきicカード プロサッカークラブをつくろう"); |
235 | 235 | System.out.println("Japanese in standard analyzer: "+q); |
— | — | @@ -236,7 +236,7 @@ |
237 | 237 | displayTokens(Analyzers.getHighlightAnalyzer(IndexId.get("jawiki"),false),"鈴木 孝治(すずき こうじ、1954年 - )『パンツぱんくろう』というタイトルは、阪本牙城の漫画『タンクタンクロー』が元ネタになっているといわれる。ただし、このアニメと『タンクタンクロー』に内容的な直接の関係は全く無い。"); |
238 | 238 | displayTokens(Analyzers.getSearcherAnalyzer(IndexId.get("jawiki")),"『パンツぱんくろう』というタjavaイトルはbalaton"); |
239 | 239 | displayTokens(Analyzers.getSearcherAnalyzer(IndexId.get("jawiki")),"パ ン"); |
240 | | - |
| 240 | + |
241 | 241 | ArrayList<Aggregate> items = new ArrayList<Aggregate>(); |
242 | 242 | analyzer = Analyzers.getSearcherAnalyzer(IndexId.get("enwiki")); |
243 | 243 | items.add(new Aggregate("douglas adams",10,IndexId.get("enwiki"),analyzer,"related",stopWords,Flags.ALTTITLE)); |
— | — | @@ -244,7 +244,7 @@ |
245 | 245 | items.add(new Aggregate("hurricane",3.22f,IndexId.get("enwiki"),analyzer,"related",stopWords,Flags.ANCHOR)); |
246 | 246 | items.add(new Aggregate("and some other stuff",3.2f,IndexId.get("enwiki"),analyzer,"related",stopWords,Flags.NONE)); |
247 | 247 | displayTokens(new AggregateAnalyzer(items),"AGGREGATE TEST"); |
248 | | - |
| 248 | + |
249 | 249 | // redirects? |
250 | 250 | FieldBuilder builder = new FieldBuilder(IndexId.get("enwiki")); |
251 | 251 | ArrayList<String> list = new ArrayList<String>(); |
— | — | @@ -264,20 +264,20 @@ |
265 | 265 | int p[] = MathFunc.partitionList(new double[] {0.52,0.12},5); |
266 | 266 | analyzer = (Analyzer) Analyzers.getIndexerAnalyzer("Agreement reply readily",builder,null,null,related,p,null,null,null)[0]; |
267 | 267 | displayTokens2(analyzer,""); |
268 | | - |
| 268 | + |
269 | 269 | analyzer = (Analyzer) Analyzers.getIndexerAnalyzer("Pascal's earliest work was in the natural and applied sciences where he made important contributions to the construction of mechanical calculators, the study of fluids, and clarified the concepts of pressure and vacuum by generalizing the work of Evangelista Torricelli. Pascal also wrote powerfully in defense of the scientific method.",builder,null,null,null,null,null,null,null)[0]; |
270 | 270 | displayTokens2(analyzer,""); |
271 | 271 | analyzer = (Analyzer) Analyzers.getIndexerAnalyzer("1,039/Smoothed Out Slappy Hours",new FieldBuilder(IndexId.get("itwiki")),null,null,null,null,null,null,null)[0]; |
272 | 272 | displayTokens2(analyzer,""); |
273 | 273 | displayTokens(Analyzers.getSearcherAnalyzer(IndexId.get("itwiki")),"1,039/Smoothed Out Slappy Hours"); |
274 | | - |
| 274 | + |
275 | 275 | ArrayList<Aggregate> items = new ArrayList<Aggregate>(); |
276 | 276 | items.add(new Aggregate("douglas adams",10,IndexId.get("enwiki"),false)); |
277 | 277 | items.add(new Aggregate("the selected works...",2.1f,IndexId.get("enwiki"),false)); |
278 | 278 | items.add(new Aggregate("hurricane",3.22f,IndexId.get("enwiki"),false)); |
279 | 279 | items.add(new Aggregate("and some other stuff",3.2f,IndexId.get("enwiki"),false)); |
280 | 280 | displayTokens(new AggregateAnalyzer(items),"AGGREGATE TEST"); */ |
281 | | - |
| 281 | + |
282 | 282 | IndexId wl = IndexId.get("wikilucene"); |
283 | 283 | Analyzer an = Analyzers.getSearcherAnalyzer(wl); |
284 | 284 | Aggregate a1 = new Aggregate("Redheugh Bridges",1,wl,an,"alttitle",Flags.ALTTITLE); |
— | — | @@ -285,24 +285,24 @@ |
286 | 286 | ArrayList<Aggregate> al = new ArrayList<Aggregate>(); |
287 | 287 | al.add(a1); al.add(a2); |
288 | 288 | displayTokens(new AggregateAnalyzer(al),"AGGREGATE TEST"); |
289 | | - |
| 289 | + |
290 | 290 | displayTokens(Analyzers.getSpellCheckAnalyzer(IndexId.get("enwiki"),new HashSet<String>()), |
291 | 291 | "Agreement boxes reply readily Gödel, Gödel; a/b"); |
292 | | - |
293 | | - |
| 292 | + |
| 293 | + |
294 | 294 | if(true) |
295 | 295 | return; |
296 | | - |
| 296 | + |
297 | 297 | //testAnalyzer(new EnglishAnalyzer()); |
298 | 298 | testAnalyzer(Analyzers.getSearcherAnalyzer(IndexId.get("enwiki"))); |
299 | 299 | testAnalyzer(Analyzers.getSearcherAnalyzer(IndexId.get("dewiki"))); |
300 | 300 | testAnalyzer(Analyzers.getSearcherAnalyzer(IndexId.get("frwiki"))); |
301 | 301 | testAnalyzer(Analyzers.getSearcherAnalyzer(IndexId.get("srwiki"))); |
302 | 302 | testAnalyzer(Analyzers.getSearcherAnalyzer(IndexId.get("eswiki"))); |
303 | | - |
304 | | - |
| 303 | + |
| 304 | + |
305 | 305 | } |
306 | | - |
| 306 | + |
307 | 307 | private static void printCodePoints(String string) { |
308 | 308 | char[] str = string.toCharArray(); |
309 | 309 | for(int i=0;i<str.length;i++){ |
— | — | @@ -323,7 +323,7 @@ |
324 | 324 | byte[] b = ExtToken.serialize(analyzer.tokenStream("",article.content)); |
325 | 325 | if(i == 0) |
326 | 326 | size += b.length; |
327 | | - else |
| 327 | + else |
328 | 328 | size2 += b.length; |
329 | 329 | tokensFromAnalysis(analyzer, article.content,"contents"); |
330 | 330 | } |
— | — | @@ -331,7 +331,7 @@ |
332 | 332 | long delta = System.currentTimeMillis() - start; |
333 | 333 | System.out.println(delta+"ms ["+delta/count+"ms/ar] elapsed for analyzer "+analyzer+", size="+size+", size2="+size2); |
334 | 334 | } |
335 | | - |
| 335 | + |
336 | 336 | public static void testAnalyzer(Analyzer analyzer) throws IOException{ |
337 | 337 | ArticlesParser ap = new ArticlesParser("./test-data/indexing-articles.test"); |
338 | 338 | ArrayList<TestArticle> articles = ap.getArticles(); |
— | — | @@ -347,7 +347,7 @@ |
348 | 348 | long delta = System.currentTimeMillis() - start; |
349 | 349 | System.out.println(delta+"ms ["+delta/count+"ms/ar] elapsed for analyzer "+analyzer); |
350 | 350 | } |
351 | | - |
| 351 | + |
352 | 352 | public static void testAnalyzer(Analyzer analyzer, String name) throws IOException{ |
353 | 353 | ArticlesParser ap = new ArticlesParser("./test-data/indexing-articles.test"); |
354 | 354 | ArrayList<TestArticle> articles = ap.getArticles(); |
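The tokensFromAnalysis() helper quoted in this file is the backbone of these tests: it drains an analyzer's TokenStream token by token, and expected strings such as [(1990,0,4), (s,4,5), (iv,6,8)] are just Arrays.toString() over the result, since Lucene 2.x Token.toString() prints the term, its offsets, and any non-default type or position increment. A minimal self-contained sketch of that pattern against the stock Lucene 2.3 API (the loop body is elided in the hunk above; stock Analyzer.tokenStream() takes a Reader, so a StringReader wraps the text here):

    import java.io.IOException;
    import java.io.StringReader;
    import java.util.ArrayList;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;

    public class TokenDumpSketch {
        /** Drain a TokenStream into an array, as AnalysisTest.tokensFromAnalysis() does. */
        public static Token[] tokensFromAnalysis(Analyzer analyzer, String text, String field)
                throws IOException {
            TokenStream stream = analyzer.tokenStream(field, new StringReader(text));
            ArrayList<Token> tokenList = new ArrayList<Token>();
            Token t;
            while ((t = stream.next()) != null) // pre-2.9 token-by-token API
                tokenList.add(t);
            return tokenList.toArray(new Token[0]);
        }
    }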
Index: branches/lucene-search-2.1/test/org/wikimedia/lsearch/analyzers/FastWikiTokenizerTest.java |
— | — | @@ -25,116 +25,116 @@ |
26 | 26 | |
27 | 27 | public class FastWikiTokenizerTest extends WikiTestCase { |
28 | 28 | IndexId iid; |
29 | | - TokenizerOptions options; |
30 | | - |
| 29 | + TokenizerOptions options; |
| 30 | + |
31 | 31 | public void testIndex(){ |
32 | 32 | this.iid = IndexId.get("enwiki"); |
33 | 33 | this.options = new TokenizerOptions.ContentOptions(false); |
34 | | - |
35 | | - assertEquals("1 [link] 1 [text]", |
| 34 | + |
| 35 | + assertEquals("1 [link] 1 [text]", |
36 | 36 | tokens("[[link text]]")); |
37 | | - |
38 | | - assertEquals("1 [anchor] 1 [text]", |
| 37 | + |
| 38 | + assertEquals("1 [anchor] 1 [text]", |
39 | 39 | tokens("[[some link|anchor text]]")); |
40 | | - |
41 | | - assertEquals("1 [italic] 2 [see]", |
| 40 | + |
| 41 | + assertEquals("1 [italic] 2 [see]", |
42 | 42 | tokens("''italic''<nowiki><!-- see --></nowiki><!-- nosee -->")); |
43 | | - |
44 | | - assertEquals("1 [http] 2 [en] 1 [wikipedia] 1 [org/] 0 [org] 1 [english] 1 [wikipedia]", |
| 43 | + |
| 44 | + assertEquals("1 [http] 2 [en] 1 [wikipedia] 1 [org/] 0 [org] 1 [english] 1 [wikipedia]", |
45 | 45 | tokens("[http://en.wikipedia.org/ english wikipedia]")); |
46 | | - |
47 | | - assertEquals("500 [image] 1 [argishti] 1 [monument] 1 [jpg] 1 [king] 1 [argishti] 1 [of] 1 [urartu]", |
| 46 | + |
| 47 | + assertEquals("500 [image] 1 [argishti] 1 [monument] 1 [jpg] 1 [king] 1 [argishti] 1 [of] 1 [urartu]", |
48 | 48 | tokens("[[Image:Argishti monument.JPG|thumb|King Argishti of Urartu]]")); |
49 | | - |
50 | | - assertEquals("500 [image] 1 [argishti] 1 [monument] 1 [jpg] 1 [king] 1 [argishti] 1 [of] 1 [urartu]", |
| 49 | + |
| 50 | + assertEquals("500 [image] 1 [argishti] 1 [monument] 1 [jpg] 1 [king] 1 [argishti] 1 [of] 1 [urartu]", |
51 | 51 | tokens("[[Image:Argishti monument.JPG|thumb|King [[link target|Argishti]] of Urartu]]")); |
52 | | - |
| 52 | + |
53 | 53 | assertEquals("500 [image] 1 [frizbi] 1 [jpg] 1 [frizbi] 1 [za] 1 [ultimate] 1 [28] 1 [cm] 1 [175] 1 [g]", |
54 | 54 | tokens("[[Image:frizbi.jpg|десно|мини|240п|Frizbi za ultimate, 28cm, 175g]]")); |
55 | | - |
56 | | - assertEquals("1 [image] 3 [argishti] 1 [monument] 1 [jpg] 1 [thumb] 1 [king] 1 [argishti] 1 [of] 1 [urartu]", |
| 55 | + |
| 56 | + assertEquals("1 [image] 3 [argishti] 1 [monument] 1 [jpg] 1 [thumb] 1 [king] 1 [argishti] 1 [of] 1 [urartu]", |
57 | 57 | tokens("[[Image:Argishti monument.JPG|thumb|King Argishti of Urartu")); |
58 | | - |
59 | | - assertEquals("1 [clinton] 1 [comets]", |
| 58 | + |
| 59 | + assertEquals("1 [clinton] 1 [comets]", |
60 | 60 | tokens("{| style=\"margin:0px 5px 10px 10px; border:1px solid #8888AA;\" align=right cellpadding=3 cellspacing=3 width=360\n|- align=\"center\" bgcolor=\"#dddddd\"\n|colspan=\"3\"| '''Clinton Comets'''")); |
61 | | - |
62 | | - assertEquals("2 [or] 1 [ا] 500 [lɒs] 1 [ˈændʒəˌlɪs] 0 [ˈaendʒəˌlɪs]", |
| 61 | + |
| 62 | + assertEquals("2 [or] 1 [ا] 500 [lɒs] 1 [ˈændʒəˌlɪs] 0 [ˈaendʒəˌlɪs]", |
63 | 63 | tokens("{{IPA|[lɒs ˈændʒəˌlɪs]}} < or < © ©ا")); |
64 | | - |
65 | | - assertEquals("500 [text1] 1 [text2] 1 [text3]", |
| 64 | + |
| 65 | + assertEquals("500 [text1] 1 [text2] 1 [text3]", |
66 | 66 | tokens("{{template|text1}} {{template|text2|text3}}")); |
67 | | - |
| 67 | + |
68 | 68 | assertEquals("", |
69 | 69 | tokens("[[sr:Naslov]]")); |
70 | | - |
71 | | - assertEquals("500 [some] 1 [category] 1 [name]", |
| 70 | + |
| 71 | + assertEquals("500 [some] 1 [category] 1 [name]", |
72 | 72 | tokens("[[Category:Some category name]]")); |
73 | | - |
74 | | - assertEquals("[Some category name]", |
| 73 | + |
| 74 | + assertEquals("[Some category name]", |
75 | 75 | categories("[[Category:Some category name]]")); |
76 | | - |
77 | | - assertEquals("500 [param1] 1 [param2] 1 [value2]", |
| 76 | + |
| 77 | + assertEquals("500 [param1] 1 [param2] 1 [value2]", |
78 | 78 | tokens("{{template|param1 = {{value1}}|param2 = value2}}")); |
79 | | - |
80 | | - assertEquals("500 [param1] 1 [value1] 1 [param2] 1 [value2]", |
| 79 | + |
| 80 | + assertEquals("500 [param1] 1 [value1] 1 [param2] 1 [value2]", |
81 | 81 | tokens("{{template|param1 = [[target|value1]]|param2 = value2}}")); |
82 | | - |
| 82 | + |
83 | 83 | assertEquals("1 [wikipedia] 1 [is] 1 [accurate] 2 [and] 1 [it's] 0 [its] 1 [not] 1 [a] 1 [lie] 20 [see] 1 [kurir]", |
84 | 84 | tokens("Wikipedia is accurate<ref>see Kurir</ref>, and it's not a lie.")); |
85 | | - |
| 85 | + |
86 | 86 | assertEquals("1 [this] 1 [column] 1 [is] 1 [100] 1 [points] 1 [wide] 1 [this] 1 [column] 1 [is] 1 [200] 1 [points] 1 [wide] 1 [this] 1 [column] 1 [is] 1 [300] 1 [points] 1 [wide] 1 [blah] 1 [blah] 1 [blah]", |
87 | 87 | tokens("{| border=\"1\" cellpadding=\"2\"\n|-\n|width=\"100pt\"|This column is 100 points wide\n|width=\"200pt\"|This column is 200 points wide\n|width=\"300pt\"|This column is 300 points wide\n|-\n|blah || blah || blah\n|}")); |
88 | | - |
| 88 | + |
89 | 89 | assertEquals("1 [first] 10 [second]", |
90 | 90 | tokens("first\n\nsecond")); |
91 | | - |
| 91 | + |
92 | 92 | assertEquals("1 [u2] 1 [heading1]", |
93 | 93 | tokens("u2 heading1")); |
94 | | - |
| 94 | + |
95 | 95 | assertEquals("1 [test] 1 [apostrophe's] 0 [apostrophes] 1 [and] 1 [other’s] 0 [others]", |
96 | 96 | tokens("Test apostrophe's and other\u2019s.")); |
97 | 97 | |
98 | | - |
| 98 | + |
99 | 99 | } |
100 | | - |
| 100 | + |
101 | 101 | public void testHighlight(){ |
102 | 102 | this.iid = IndexId.get("enwiki"); |
103 | 103 | this.options = new TokenizerOptions.Highlight(false); |
104 | | - |
105 | | - assertEquals("1 [' ' GLUE FIRST_SECTION] 1 ['link' TEXT FIRST_SECTION] 1 [' ' GLUE FIRST_SECTION] 1 ['text' TEXT FIRST_SECTION] 1 [' ' GLUE FIRST_SECTION]", |
| 104 | + |
| 105 | + assertEquals("1 [' ' GLUE FIRST_SECTION] 1 ['link' TEXT FIRST_SECTION] 1 [' ' GLUE FIRST_SECTION] 1 ['text' TEXT FIRST_SECTION] 1 [' ' GLUE FIRST_SECTION]", |
106 | 106 | tokens("[[link text]]")); |
107 | | - |
108 | | - assertEquals("1 [' ' GLUE BULLETINS] 10 ['bullet1' TEXT BULLETINS] 1 [' ' SENTENCE_BREAK BULLETINS] 1 ['bullet2' TEXT BULLETINS]", |
| 107 | + |
| 108 | + assertEquals("1 [' ' GLUE BULLETINS] 10 ['bullet1' TEXT BULLETINS] 1 [' ' SENTENCE_BREAK BULLETINS] 1 ['bullet2' TEXT BULLETINS]", |
109 | 109 | tokens("* bullet1\n* bullet2")); |
110 | | - |
111 | | - assertEquals("1 [' ' GLUE FIRST_SECTION] 1 ['http' TEXT FIRST_SECTION] 1 ['://' MINOR_BREAK FIRST_SECTION] 1 ['en' TEXT FIRST_SECTION] 1 ['.' SENTENCE_BREAK FIRST_SECTION] 1 ['wikipedia' TEXT FIRST_SECTION] 1 ['.' SENTENCE_BREAK FIRST_SECTION] 1 ['org/' TEXT FIRST_SECTION] 0 ['org' TEXT FIRST_SECTION] 1 ['wiki' TEXT FIRST_SECTION] 1 [' ' GLUE FIRST_SECTION] 1 ['english' TEXT FIRST_SECTION] 1 [' ' GLUE FIRST_SECTION] 1 ['wiki' TEXT FIRST_SECTION] 1 [' ' GLUE FIRST_SECTION]", |
| 110 | + |
| 111 | + assertEquals("1 [' ' GLUE FIRST_SECTION] 1 ['http' TEXT FIRST_SECTION] 1 ['://' MINOR_BREAK FIRST_SECTION] 1 ['en' TEXT FIRST_SECTION] 1 ['.' SENTENCE_BREAK FIRST_SECTION] 1 ['wikipedia' TEXT FIRST_SECTION] 1 ['.' SENTENCE_BREAK FIRST_SECTION] 1 ['org/' TEXT FIRST_SECTION] 0 ['org' TEXT FIRST_SECTION] 1 ['wiki' TEXT FIRST_SECTION] 1 [' ' GLUE FIRST_SECTION] 1 ['english' TEXT FIRST_SECTION] 1 [' ' GLUE FIRST_SECTION] 1 ['wiki' TEXT FIRST_SECTION] 1 [' ' GLUE FIRST_SECTION]", |
112 | 112 | tokens("[http://en.wikipedia.org/wiki english wiki]")); |
113 | | - |
114 | | - assertEquals("1 [' ' GLUE IMAGE_CAT_IW] 1 ['image' TEXT IMAGE_CAT_IW] 1 [':' MINOR_BREAK IMAGE_CAT_IW] 1 ['argishti' TEXT IMAGE_CAT_IW] 1 [' ' GLUE IMAGE_CAT_IW] 1 ['monument' TEXT IMAGE_CAT_IW] 1 ['.' SENTENCE_BREAK IMAGE_CAT_IW] 1 ['jpg' TEXT IMAGE_CAT_IW] 1 [' | ' GLUE IMAGE_CAT_IW] 1 ['king' TEXT IMAGE_CAT_IW] 1 [' ' GLUE IMAGE_CAT_IW] 1 ['argishti' TEXT IMAGE_CAT_IW] 1 [' ' GLUE IMAGE_CAT_IW] 1 ['of' TEXT IMAGE_CAT_IW] 1 [' ' GLUE IMAGE_CAT_IW] 1 ['urartu' TEXT IMAGE_CAT_IW] 1 [' ' GLUE IMAGE_CAT_IW] 1 [' ' SENTENCE_BREAK FIRST_SECTION] 1 ['main' TEXT FIRST_SECTION] 1 [' ' GLUE FIRST_SECTION] 1 ['text' TEXT FIRST_SECTION]", |
| 113 | + |
| 114 | + assertEquals("1 [' ' GLUE IMAGE_CAT_IW] 1 ['image' TEXT IMAGE_CAT_IW] 1 [':' MINOR_BREAK IMAGE_CAT_IW] 1 ['argishti' TEXT IMAGE_CAT_IW] 1 [' ' GLUE IMAGE_CAT_IW] 1 ['monument' TEXT IMAGE_CAT_IW] 1 ['.' SENTENCE_BREAK IMAGE_CAT_IW] 1 ['jpg' TEXT IMAGE_CAT_IW] 1 [' | ' GLUE IMAGE_CAT_IW] 1 ['king' TEXT IMAGE_CAT_IW] 1 [' ' GLUE IMAGE_CAT_IW] 1 ['argishti' TEXT IMAGE_CAT_IW] 1 [' ' GLUE IMAGE_CAT_IW] 1 ['of' TEXT IMAGE_CAT_IW] 1 [' ' GLUE IMAGE_CAT_IW] 1 ['urartu' TEXT IMAGE_CAT_IW] 1 [' ' GLUE IMAGE_CAT_IW] 1 [' ' SENTENCE_BREAK FIRST_SECTION] 1 ['main' TEXT FIRST_SECTION] 1 [' ' GLUE FIRST_SECTION] 1 ['text' TEXT FIRST_SECTION]", |
115 | 115 | tokens("[[Image:Argishti monument.JPG|thumb|King Argishti of Urartu]]\n\nMain text")); |
116 | | - |
| 116 | + |
117 | 117 | assertEquals("1 [' ' GLUE IMAGE_CAT_IW] 1 ['category' TEXT IMAGE_CAT_IW] 1 [':' MINOR_BREAK IMAGE_CAT_IW] 1 ['name' TEXT IMAGE_CAT_IW] 1 [' ' GLUE IMAGE_CAT_IW]", |
118 | 118 | tokens("[[Category:Name|sort key]]")); |
119 | | - |
120 | | - assertEquals("1 [' ' GLUE TEMPLATE] 1 ['param1' TEXT TEMPLATE] 1 [' ' GLUE TEMPLATE] 1 ['value1' TEXT TEMPLATE] 1 [' ' GLUE TEMPLATE] 1 [' | ' GLUE TEMPLATE] 1 ['param2' TEXT TEMPLATE] 1 [' ' GLUE TEMPLATE] 1 ['value2' TEXT TEMPLATE] 1 [' ' GLUE FIRST_SECTION]", |
| 119 | + |
| 120 | + assertEquals("1 [' ' GLUE TEMPLATE] 1 ['param1' TEXT TEMPLATE] 1 [' ' GLUE TEMPLATE] 1 ['value1' TEXT TEMPLATE] 1 [' ' GLUE TEMPLATE] 1 [' | ' GLUE TEMPLATE] 1 ['param2' TEXT TEMPLATE] 1 [' ' GLUE TEMPLATE] 1 ['value2' TEXT TEMPLATE] 1 [' ' GLUE FIRST_SECTION]", |
121 | 121 | tokens("{{template|param1 = [[value1]]|param2 = value2}}")); |
122 | | - |
123 | | - assertEquals("1 [' ' GLUE TEMPLATE] 1 ['param1' TEXT TEMPLATE] 1 [' | ' GLUE TEMPLATE] 1 ['param2' TEXT TEMPLATE] 1 [' ' GLUE TEMPLATE] 1 ['value2' TEXT TEMPLATE] 1 [' ' GLUE FIRST_SECTION]", |
| 122 | + |
| 123 | + assertEquals("1 [' ' GLUE TEMPLATE] 1 ['param1' TEXT TEMPLATE] 1 [' | ' GLUE TEMPLATE] 1 ['param2' TEXT TEMPLATE] 1 [' ' GLUE TEMPLATE] 1 ['value2' TEXT TEMPLATE] 1 [' ' GLUE FIRST_SECTION]", |
124 | 124 | tokens("{{template|param1 = {{value1}}|param2 = value2}}")); |
125 | 125 | |
126 | 126 | assertEquals("1 [' ' GLUE HEADING] 1 ['heading' TEXT HEADING] 1 [' ' GLUE HEADING] 1 ['1' TEXT HEADING] 1 [' ' GLUE HEADING] 1 [' ' GLUE NORMAL]", |
127 | 127 | tokens("== Heading 1 ==\n")); |
128 | | - |
| 128 | + |
129 | 129 | assertEquals("1 ['wikipedia' TEXT FIRST_SECTION] 1 [' ' GLUE FIRST_SECTION] 1 ['is' TEXT FIRST_SECTION] 1 [' ' GLUE FIRST_SECTION] 1 ['accurate' TEXT FIRST_SECTION] 1 [', ' MINOR_BREAK FIRST_SECTION] 1 ['and' TEXT FIRST_SECTION] 1 [' ' GLUE FIRST_SECTION] 1 ['it's' TEXT FIRST_SECTION] 0 ['its' TEXT FIRST_SECTION] 1 [' ' GLUE FIRST_SECTION] 1 ['not' TEXT FIRST_SECTION] 1 [' ' GLUE FIRST_SECTION] 1 ['a' TEXT FIRST_SECTION] 1 [' ' GLUE FIRST_SECTION] 1 ['lie' TEXT FIRST_SECTION] 1 ['.' SENTENCE_BREAK FIRST_SECTION] 20 [' ' GLUE REFERENCE] 1 ['see' TEXT REFERENCE] 1 [' ' GLUE REFERENCE] 1 ['kurir' TEXT REFERENCE] 1 [' ' GLUE REFERENCE]", |
130 | 130 | tokens("Wikipedia is accurate<ref>see Kurir</ref>, and it's not a lie.")); |
131 | | - |
| 131 | + |
132 | 132 | assertEquals("1 [' | ' GLUE TABLE] 1 ['this' TEXT TABLE] 1 [' ' GLUE TABLE] 1 ['column' TEXT TABLE] 1 [' ' GLUE TABLE] 1 ['is' TEXT TABLE] 1 [' ' GLUE TABLE] 1 ['100' TEXT TABLE] 1 [' ' GLUE TABLE] 1 ['points' TEXT TABLE] 1 [' ' GLUE TABLE] 1 ['wide' TEXT TABLE] 1 [' ' GLUE TABLE] 1 ['this' TEXT TABLE] 1 [' ' GLUE TABLE] 1 ['column' TEXT TABLE] 1 [' ' GLUE TABLE] 1 ['is' TEXT TABLE] 1 [' ' GLUE TABLE] 1 ['200' TEXT TABLE] 1 [' ' GLUE TABLE] 1 ['points' TEXT TABLE] 1 [' ' GLUE TABLE] 1 ['wide' TEXT TABLE] 1 [' ' GLUE TABLE] 1 ['this' TEXT TABLE] 1 [' ' GLUE TABLE] 1 ['column' TEXT TABLE] 1 [' ' GLUE TABLE] 1 ['is' TEXT TABLE] 1 [' ' GLUE TABLE] 1 ['300' TEXT TABLE] 1 [' ' GLUE TABLE] 1 ['points' TEXT TABLE] 1 [' ' GLUE TABLE] 1 ['wide' TEXT TABLE] 1 [' | ' SENTENCE_BREAK TABLE] 1 ['blah' TEXT TABLE] 1 [' | ' GLUE TABLE] 1 ['blah' TEXT TABLE] 1 [' | ' GLUE TABLE] 1 ['blah' TEXT TABLE] 1 [' | ' GLUE FIRST_SECTION]", |
133 | 133 | tokens("{| border=\"1\" cellpadding=\"2\"\n|-\n|width=\"100pt\"|This column is 100 points wide\n|width=\"200pt\"|This column is 200 points wide\n|width=\"300pt\"|This column is 300 points wide\n|-\n|blah || blah || blah\n|}")); |
134 | | - |
| 134 | + |
135 | 135 | } |
136 | | - |
137 | | - |
138 | | - |
| 136 | + |
| 137 | + |
| 138 | + |
139 | 139 | public String tokens(String text){ |
140 | 140 | StringBuilder sb = new StringBuilder(); |
141 | 141 | FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text,iid,options); |
— | — | @@ -149,7 +149,7 @@ |
150 | 150 | } |
151 | 151 | return sb.toString().trim(); |
152 | 152 | } |
153 | | - |
| 153 | + |
154 | 154 | public String categories(String text){ |
155 | 155 | FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text,iid,options); |
156 | 156 | parser.parse(); |
— | — | @@ -179,7 +179,7 @@ |
180 | 180 | System.out.print("INTERWIKI: "); |
181 | 181 | } |
182 | 182 | for(Entry<String,String> t : iw.entrySet()){ |
183 | | - System.out.print("["+t.getKey()+"] => ["+t.getValue()+"] "); |
| 183 | + System.out.print("["+t.getKey()+"] => ["+t.getValue()+"] "); |
184 | 184 | } |
185 | 185 | if(iw.size()!=0) System.out.println(); |
186 | 186 | |
— | — | @@ -188,7 +188,7 @@ |
189 | 189 | System.out.print("KEYWORDS: "); |
190 | 190 | } |
191 | 191 | for(String t : keywords){ |
192 | | - System.out.print("["+t+"] "); |
| 192 | + System.out.print("["+t+"] "); |
193 | 193 | } |
194 | 194 | if(keywords.size()!=0) System.out.println(); |
195 | 195 | |
— | — | @@ -198,7 +198,7 @@ |
199 | 199 | static void showTokens(String text){ |
200 | 200 | System.out.println("TEXT: "+text); |
201 | 201 | System.out.flush(); |
202 | | - displayTokensForParser(text); |
| 202 | + displayTokensForParser(text); |
203 | 203 | System.out.flush(); |
204 | 204 | } |
205 | 205 | |
— | — | @@ -231,7 +231,7 @@ |
232 | 232 | text = "|something|else| is| there| to | see"; |
233 | 233 | showTokens(text); |
234 | 234 | text = "-test 3.14 and U.S.A and more, .test more"; |
235 | | - showTokens(text); |
| 235 | + showTokens(text); |
236 | 236 | text = "{{IPA|[lɒs ˈændʒəˌlɪs]}} < or < © ©ا or ا "; |
237 | 237 | showTokens(text); |
238 | 238 | text = "| Unseen\n|-\n| \"Junior\"\n|\n| Goa'uld larva\n|} something"; |
— | — | @@ -318,14 +318,14 @@ |
319 | 319 | text = "[[First]] second third fourth and so on goes the ... [[last link]]"; |
320 | 320 | showTokens(text); |
321 | 321 | text = "{{Something| param = {{another}}[[First]] } }} }} }} [[first good]]s {{name| [[many]] many many tokens }} second third fourth and so on goes the ... [[good keyword]]"; |
322 | | - showTokens(text); |
323 | | - text = "{| style=\"float: right; clear: right; background-color: transparent\"\n|-\n|{{Infobox Military Conflict|\n|conflict=1982 Lebanon War <br>([[Israel-Lebanon conflict]])\n|image=[[Image:Map of Lebanon.png|300px]]\n|caption=Map of modern Lebanon\n|date=June - September 1982\n|place=Southern [[Lebanon]]\n|casus=Two main causes:\n*Terrorist raids on northern Israel by [[PLO]] [[guerrilla]] based in Lebanon\n*the [[Shlomo Argov|shooting of Israel's ambassador]] by the [[Abu Nidal Organization]]<ref>[http://www.usatoday.com/graphics/news/gra/gisrael2/flash.htm The Middle East conflict], ''[[USA Today]]'' (sourced guardian.co.uk, Facts on File, AP) \"Israel invades Lebanon in response to terrorist attacks by PLO guerrillas based there.\"</ref><ref>{{cite book\n|author = Mark C. Carnes, John A. Garraty\n|title = The American Nation\n|publisher = Pearson Education, Inc.\n|date = 2006\n|location = USA\n|pages = 903\n|id = ISBN 0-321-42606-1\n}}</ref><ref>{{cite book\n|author= ''[[Time (magazine)|Time]]''\n|title = The Year in Review\n|publisher = Time Books\n|date = 2006\n|location = 1271 Avenue of the Americs, New York, NY 10020\n|id = ISSN: 1097-5721\n}} \"For decades now, Arab terrorists operating out of southern Lebanon have staged raids and fired mortar shells into northern Israel, denying the Israelis peace of mind. In the early 1980s, the terrorists operating out of Lebanon were controlled by Yasser Arafat's Palestine Liberation Organization (P.L.O.). After Israel's ambassador to Britain, Shlomo Argov, was shot in cold blood and seriously wounded by the Palestinian terror group Abu Nidal in London in 1982, fed-up Israelis sent tanks and troops rolling into Lebanon to disperse the guerrillas.\" (pg. 44-45)</ref><ref>\"The Palestine Liberation Organization (PLO) had been launching guerrilla attacks against Israel since the 1960s (see Palestine Liberation Organization). After the PLO was driven from Jordan in 1971, the organization established bases in southern Lebanon, from which it continued to attack Israel. In 1981 heavy PLO rocket fire on Israeli settlements led Israel to conduct air strikes in Lebanon. The Israelis also destroyed Iraq's nuclear reactor at Daura near Baghdad."; |
324 | | - showTokens(text); |
| 322 | + showTokens(text); |
| 323 | + text = "{| style=\"float: right; clear: right; background-color: transparent\"\n|-\n|{{Infobox Military Conflict|\n|conflict=1982 Lebanon War <br>([[Israel-Lebanon conflict]])\n|image=[[Image:Map of Lebanon.png|300px]]\n|caption=Map of modern Lebanon\n|date=June - September 1982\n|place=Southern [[Lebanon]]\n|casus=Two main causes:\n*Terrorist raids on northern Israel by [[PLO]] [[guerrilla]] based in Lebanon\n*the [[Shlomo Argov|shooting of Israel's ambassador]] by the [[Abu Nidal Organization]]<ref>[http://www.usatoday.com/graphics/news/gra/gisrael2/flash.htm The Middle East conflict], ''[[USA Today]]'' (sourced guardian.co.uk, Facts on File, AP) \"Israel invades Lebanon in response to terrorist attacks by PLO guerrillas based there.\"</ref><ref>{{cite book\n|author = Mark C. Carnes, John A. Garraty\n|title = The American Nation\n|publisher = Pearson Education, Inc.\n|date = 2006\n|location = USA\n|pages = 903\n|id = ISBN 0-321-42606-1\n}}</ref><ref>{{cite book\n|author= ''[[Time (magazine)|Time]]''\n|title = The Year in Review\n|publisher = Time Books\n|date = 2006\n|location = 1271 Avenue of the Americs, New York, NY 10020\n|id = ISSN: 1097-5721\n}} \"For decades now, Arab terrorists operating out of southern Lebanon have staged raids and fired mortar shells into northern Israel, denying the Israelis peace of mind. In the early 1980s, the terrorists operating out of Lebanon were controlled by Yasser Arafat's Palestine Liberation Organization (P.L.O.). After Israel's ambassador to Britain, Shlomo Argov, was shot in cold blood and seriously wounded by the Palestinian terror group Abu Nidal in London in 1982, fed-up Israelis sent tanks and troops rolling into Lebanon to disperse the guerrillas.\" (pg. 44-45)</ref><ref>\"The Palestine Liberation Organization (PLO) had been launching guerrilla attacks against Israel since the 1960s (see Palestine Liberation Organization). After the PLO was driven from Jordan in 1971, the organization established bases in southern Lebanon, from which it continued to attack Israel. In 1981 heavy PLO rocket fire on Israeli settlements led Israel to conduct air strikes in Lebanon. The Israelis also destroyed Iraq's nuclear reactor at Daura near Baghdad."; |
| 324 | + showTokens(text); |
325 | 325 | |
326 | 326 | |
327 | 327 | |
328 | 328 | ArticlesParser ap1 = new ArticlesParser("./test-data/indexing-articles.test"); |
329 | | - ArrayList<TestArticle> articles1 = ap1.getArticles(); |
| 329 | + ArrayList<TestArticle> articles1 = ap1.getArticles(); |
330 | 330 | showTokens(articles1.get(articles1.size()-1).content); |
331 | 331 | |
332 | 332 | //if(true) |
— | — | @@ -378,7 +378,7 @@ |
379 | 379 | ObjectOutputStream out = new ObjectOutputStream(ba); |
380 | 380 | out.writeObject(tokens); |
381 | 381 | size += ba.size(); */ |
382 | | - //byte[] b = ExtToken.serialize(tokens); |
| 382 | + //byte[] b = ExtToken.serialize(tokens); |
383 | 383 | //size += b.length; |
384 | 384 | //ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(ba.toByteArray())); |
385 | 385 | //ArrayList<ExtToken> some = (ArrayList<ExtToken>) in.readObject(); |
— | — | @@ -388,7 +388,7 @@ |
389 | 389 | System.out.println("Parser elapsed: "+delta+"ms, per serialization: "+((double)delta/total)+"ms, size:"+size/total); |
390 | 390 | |
391 | 391 | } |
392 | | - |
| 392 | + |
393 | 393 | public void testVowels(){ |
394 | 394 | assertEquals("zdrv", FastWikiTokenizerEngine.deleteVowels("zdravo")); |
395 | 395 | assertEquals("v g mlrd", FastWikiTokenizerEngine.deleteVowels("eve ga milorad")); |
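In the expected strings above, each token is rendered as its position increment followed by the term in brackets, so "500 [image]" means the tokenizer left a 500-position gap before emitting "image"; those large gaps keep image, category, and template tokens away from adjacent body text in phrase matching. The loop inside the tokens() helper falls outside the quoted hunks; a hedged reconstruction, assuming parse() returns a list of Lucene Tokens (which categories() above suggests, since it calls parse() before reading its side effects):

    // Sketch of the tokens() helper's elided loop; parse() returning a
    // collection of Lucene Tokens is an inference, not quoted in this diff.
    public String tokens(String text) {
        StringBuilder sb = new StringBuilder();
        FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text, iid, options);
        for (Token t : parser.parse()) {
            sb.append(t.getPositionIncrement());       // the leading number, e.g. 500
            sb.append(" [").append(t.termText()).append("] ");
        }
        return sb.toString().trim();
    }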
Property changes on: branches/lucene-search-2.1/test/org/wikimedia/lsearch/analyzers |
___________________________________________________________________ |
Name: svn:ignore |
396 | 396 | + *.class |
Property changes on: branches/lucene-search-2.1/test/org/wikimedia/lsearch/beans |
___________________________________________________________________ |
Name: svn:ignore |
397 | 397 | + *.class |
Property changes on: branches/lucene-search-2.1/test/org/wikimedia/lsearch/util |
___________________________________________________________________ |
Name: svn:ignore |
398 | 398 | + *.class |
Property changes on: branches/lucene-search-2.1/test/org/wikimedia/lsearch/ranks |
___________________________________________________________________ |
Name: svn:ignore |
399 | 399 | + *.class |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/AggregateMetaField.java |
— | — | @@ -21,7 +21,7 @@ |
22 | 22 | |
23 | 23 | /** |
24 | 24 | * Local cache of aggregate field meta information |
25 | | - * |
| 25 | + * |
26 | 26 | * @author rainman |
27 | 27 | * |
28 | 28 | */ |
— | — | @@ -32,20 +32,20 @@ |
33 | 33 | protected static Object lock = new Object(); |
34 | 34 | /** directory -> fields */ |
35 | 35 | protected static WeakHashMap<Directory,Set<String>> cachingInProgress = new WeakHashMap<Directory,Set<String>>(); |
36 | | - |
| 36 | + |
37 | 37 | /** Check if there is a current background caching on a reader */ |
38 | 38 | public static boolean isBeingCached(IndexReader reader){ |
39 | 39 | synchronized(cachingInProgress){ |
40 | 40 | return cachingInProgress.containsKey(reader.directory()); |
41 | 41 | } |
42 | 42 | } |
43 | | - |
| 43 | + |
44 | 44 | public static void invalidateCache(IndexReader reader){ |
45 | 45 | synchronized (lock) { |
46 | 46 | cache.remove(reader.directory()); |
47 | 47 | } |
48 | 48 | } |
49 | | - |
| 49 | + |
50 | 50 | /** Get a meta cacher, return null if field is already cached or being cached */ |
51 | 51 | public static CacheBuilder getCacherBuilder(IndexReader reader, String field) throws IOException { |
52 | 52 | synchronized(lock){ |
— | — | @@ -61,10 +61,10 @@ |
62 | 62 | s = new AggregateMetaFieldSource(reader,field); |
63 | 63 | fields.put(field,s); |
64 | 64 | return s; |
65 | | - } |
| 65 | + } |
66 | 66 | } |
67 | 67 | } |
68 | | - |
| 68 | + |
69 | 69 | /** Get a cached meta source to use in queries */ |
70 | 70 | public static AggregateMetaFieldSource getCachedSource(IndexReader reader, String field) { |
71 | 71 | synchronized(lock){ |
— | — | @@ -75,10 +75,10 @@ |
76 | 76 | } |
77 | 77 | } |
78 | 78 | |
79 | | - |
| 79 | + |
80 | 80 | /** |
81 | | - * Cached meta aggregate info |
82 | | - * |
| 81 | + * Cached meta aggregate info |
| 82 | + * |
83 | 83 | * @author rainman |
84 | 84 | * |
85 | 85 | */ |
— | — | @@ -94,10 +94,10 @@ |
95 | 95 | protected String field; |
96 | 96 | protected boolean cachingFinished = false; |
97 | 97 | protected boolean isOptimized; |
98 | | - // temporary: |
| 98 | + // temporary: |
99 | 99 | protected int count = 0; |
100 | 100 | protected int maxdoc = 0; |
101 | | - |
| 101 | + |
102 | 102 | public void init() { |
103 | 103 | synchronized(cachingInProgress){ |
104 | 104 | Set<String> set = cachingInProgress.get(reader.directory()); |
— | — | @@ -113,7 +113,7 @@ |
114 | 114 | index = new int[maxdoc]; |
115 | 115 | length = new byte[maxdoc]; // estimate maxdoc values |
116 | 116 | lengthNoStopWords = new byte[maxdoc]; |
117 | | - lengthComplete = new byte[maxdoc]; |
| 117 | + lengthComplete = new byte[maxdoc]; |
118 | 118 | boost = new float[maxdoc]; |
119 | 119 | flags = new byte[maxdoc]; |
120 | 120 | namespaces = new byte[maxdoc]; |
— | — | @@ -133,10 +133,10 @@ |
134 | 134 | if(count >= length.length){ |
135 | 135 | length = extendBytes(length); |
136 | 136 | lengthNoStopWords = extendBytes(lengthNoStopWords); |
137 | | - lengthComplete = extendBytes(lengthComplete); |
| 137 | + lengthComplete = extendBytes(lengthComplete); |
138 | 138 | boost = extendFloats(boost); |
139 | 139 | flags = extendBytes(flags); |
140 | | - } |
| 140 | + } |
141 | 141 | length[count] = stored[j*8]; |
142 | 142 | if(length[count] == 0){ |
143 | 143 | log.debug("Broken length=0 for docid="+i+", at position "+j); |
— | — | @@ -147,14 +147,14 @@ |
148 | 148 | lengthComplete[count] = stored[j*8+6]; |
149 | 149 | flags[count] = stored[j*8+7]; |
150 | 150 | count++; |
151 | | - } |
| 151 | + } |
152 | 152 | } catch(Exception e){ |
153 | 153 | log.error("Exception during processing stored_field="+field+" on docid="+i+", with stored="+stored+" : "+e.getMessage(),e); |
154 | 154 | e.printStackTrace(); |
155 | 155 | throw new IOException(e.getMessage()); |
156 | 156 | } |
157 | 157 | } |
158 | | - |
| 158 | + |
159 | 159 | public void end(){ |
160 | 160 | if(count < length.length - 1){ |
161 | 161 | length = resizeBytes(length,count); |
— | — | @@ -164,7 +164,7 @@ |
165 | 165 | flags = resizeBytes(flags,count); |
166 | 166 | } |
167 | 167 | cachingFinished = true; |
168 | | - |
| 168 | + |
169 | 169 | synchronized(cachingInProgress){ |
170 | 170 | Set<String> set = cachingInProgress.get(reader.directory()); |
171 | 171 | set.remove(field); |
— | — | @@ -172,7 +172,7 @@ |
173 | 173 | cachingInProgress.remove(reader.directory()); |
174 | 174 | } |
175 | 175 | } |
176 | | - |
| 176 | + |
177 | 177 | protected byte[] extendBytes(byte[] array){ |
178 | 178 | return resizeBytes(array,array.length*2); |
179 | 179 | } |
— | — | @@ -183,13 +183,13 @@ |
184 | 184 | } |
185 | 185 | protected float[] extendFloats(float[] array){ |
186 | 186 | return resizeFloats(array,array.length*2); |
187 | | - } |
| 187 | + } |
188 | 188 | protected float[] resizeFloats(float[] array, int size){ |
189 | 189 | float[] t = new float[size]; |
190 | 190 | System.arraycopy(array,0,t,0,Math.min(array.length,size)); |
191 | 191 | return t; |
192 | 192 | } |
193 | | - |
| 193 | + |
194 | 194 | protected AggregateMetaFieldSource(IndexReader reader, String fieldBase) throws IOException{ |
195 | 195 | this.reader = reader; |
196 | 196 | this.field = fieldBase+"_meta"; |
— | — | @@ -203,13 +203,13 @@ |
204 | 204 | int end = (docid == index.length-1)? length.length : index[docid+1]; |
205 | 205 | if(position >= end-start){ |
206 | 206 | if(checkExists) // if true this is not an error |
207 | | - return -1; |
| 207 | + return -1; |
208 | 208 | else |
209 | 209 | throwException(docid,position,end-start-1); |
210 | 210 | } |
211 | 211 | return start+position; |
212 | 212 | } |
213 | | - |
| 213 | + |
214 | 214 | private void throwException(int docid, int position, int lastValid){ |
215 | 215 | try { |
216 | 216 | // first try to give more detailed error |
— | — | @@ -217,22 +217,22 @@ |
218 | 218 | } catch (IOException e) { |
219 | 219 | e.printStackTrace(); |
220 | 220 | throw new ArrayIndexOutOfBoundsException("Requested position "+position+" on field "+field+" unavailable"+" on "+reader.directory()); |
221 | | - } |
| 221 | + } |
222 | 222 | } |
223 | | - |
| 223 | + |
224 | 224 | protected byte[] getStored(int docid) throws CorruptIndexException, IOException{ |
225 | 225 | return reader.document(docid).getBinaryValue(field); |
226 | 226 | } |
227 | | - |
228 | | - /** Get length of nonalias tokens */ |
| 227 | + |
| 228 | + /** Get length of nonalias tokens */ |
229 | 229 | public int getLength(int docid, int position) throws CorruptIndexException, IOException{ |
230 | 230 | if(!cachingFinished) // still caching in background |
231 | 231 | return getStored(docid)[position*8]; |
232 | 232 | return length[getValueIndex(docid,position)]; |
233 | | - } |
234 | | - /** Get length without stop words */ |
| 233 | + } |
| 234 | + /** Get length without stop words */ |
235 | 235 | public int getLengthNoStopWords(int docid, int position) throws CorruptIndexException, IOException{ |
236 | | - if(!cachingFinished) |
| 236 | + if(!cachingFinished) |
237 | 237 | return getStored(docid)[position*8+1]; |
238 | 238 | return lengthNoStopWords[getValueIndex(docid,position)]; |
239 | 239 | } |
— | — | @@ -242,7 +242,7 @@ |
243 | 243 | return getStored(docid)[position*8+6]; |
244 | 244 | return lengthComplete[getValueIndex(docid,position)]; |
245 | 245 | } |
246 | | - |
| 246 | + |
247 | 247 | /** generic function to get boost value at some position, if checkExists=true won't die on error */ |
248 | 248 | private float getBoost(int docid, int position, boolean checkExists) throws CorruptIndexException, IOException{ |
249 | 249 | if(!cachingFinished){ |
— | — | @@ -261,25 +261,25 @@ |
262 | 262 | return 1; |
263 | 263 | return boost[inx]; |
264 | 264 | } |
265 | | - |
266 | | - /** Get boost for position */ |
| 265 | + |
| 266 | + /** Get boost for position */ |
267 | 267 | public float getBoost(int docid, int position) throws CorruptIndexException, IOException{ |
268 | 268 | return getBoost(docid,position,false); |
269 | 269 | } |
270 | | - |
| 270 | + |
271 | 271 | /** Get rank (boost at position 0) */ |
272 | 272 | public float getRank(int docid) throws CorruptIndexException, IOException{ |
273 | 273 | return getBoost(docid,0,true); |
274 | 274 | } |
275 | | - |
| 275 | + |
276 | 276 | /** Get namespace of the document */ |
277 | 277 | public int getNamespace(int docid) throws CorruptIndexException, IOException{ |
278 | 278 | if(!cachingFinished){ |
279 | 279 | return Integer.parseInt(reader.document(docid).get("namespace")); |
280 | | - } |
| 280 | + } |
281 | 281 | return namespaces[docid]; |
282 | 282 | } |
283 | | - |
| 283 | + |
284 | 284 | /** Get flag values for docid at position */ |
285 | 285 | public Flags getFlags(int docid, int position) throws CorruptIndexException, IOException{ |
286 | 286 | int ord = 0; |
— | — | @@ -290,8 +290,8 @@ |
291 | 291 | |
292 | 292 | return Flags.values()[ord]; |
293 | 293 | } |
294 | | - |
295 | | - |
| 294 | + |
| 295 | + |
296 | 296 | } |
297 | 297 | |
298 | 298 | } |
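Pieced together from the offsets visible in the hunks above (stored[j*8], j*8+1, j*8+6, j*8+7, and the per-slot arrays they feed), each aggregate position in the stored "<field>_meta" binary value occupies eight bytes: plain length, length without stop words, then what is presumably a four-byte boost, the complete length, and a flags byte. A hedged decoder for one such record; treating bytes 2-5 as a big-endian float is an assumption, since their decoding is not part of the quoted diff:

    /** Decode one 8-byte aggregate meta record. Offsets 0, 1, 6 and 7 are taken
     *  from AggregateMetaFieldSource above; the boost layout (bytes 2-5) is an
     *  assumption. */
    static void decodeMetaRecord(byte[] stored, int j) {
        int length            = stored[j * 8];       // non-alias token count
        int lengthNoStopWords = stored[j * 8 + 1];
        float boost = Float.intBitsToFloat(           // assumed big-endian float
              ((stored[j * 8 + 2] & 0xff) << 24)
            | ((stored[j * 8 + 3] & 0xff) << 16)
            | ((stored[j * 8 + 4] & 0xff) << 8)
            |  (stored[j * 8 + 5] & 0xff));
        int lengthComplete    = stored[j * 8 + 6];
        byte flags            = stored[j * 8 + 7];
        System.out.println(length + "," + lengthNoStopWords + "," + lengthComplete
            + " boost=" + boost + " flags=" + flags);
    }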
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/AggregateInfoImpl.java |
— | — | @@ -11,11 +11,11 @@ |
12 | 12 | import org.wikimedia.lsearch.analyzers.Aggregate.Flags; |
13 | 13 | import org.wikimedia.lsearch.search.AggregateMetaField.AggregateMetaFieldSource; |
14 | 14 | |
15 | | -/** |
| 15 | +/** |
16 | 16 | * Wrapper for aggregate fields info in the index. Include an instance |
17 | 17 | * of this class into CustomPhraseQuery to use the additional meta |
18 | | - * info (which is locally cached in AggregateMetaField). |
19 | | - * |
| 18 | + * info (which is locally cached in AggregateMetaField). |
| 19 | + * |
20 | 20 | * @author rainman |
21 | 21 | * |
22 | 22 | */ |
— | — | @@ -24,7 +24,7 @@ |
25 | 25 | protected transient AggregateMetaFieldSource src = null; |
26 | 26 | protected boolean hasRankingData = false; |
27 | 27 | protected String field = null; |
28 | | - |
| 28 | + |
29 | 29 | /** Call this while (local) scorer is constructed to init cached meta info */ |
30 | 30 | public void init(IndexReader reader, String field) throws IOException { |
31 | 31 | this.field = field; |
— | — | @@ -36,7 +36,7 @@ |
37 | 37 | protected int getSlot(int pos){ |
38 | 38 | return pos / AggregateAnalyzer.TOKEN_GAP; |
39 | 39 | } |
40 | | - |
| 40 | + |
41 | 41 | public int length(int docid, int pos) throws IOException { |
42 | 42 | try{ |
43 | 43 | return src.getLength(docid,getSlot(pos)); |
— | — | @@ -45,7 +45,7 @@ |
46 | 46 | throw e; |
47 | 47 | } |
48 | 48 | } |
49 | | - |
| 49 | + |
50 | 50 | public float boost(int docid, int pos) throws IOException { |
51 | 51 | try{ |
52 | 52 | return src.getBoost(docid,getSlot(pos)); |
— | — | @@ -63,7 +63,7 @@ |
64 | 64 | throw e; |
65 | 65 | } |
66 | 66 | } |
67 | | - |
| 67 | + |
68 | 68 | public int lengthComplete(int docid, int pos) throws IOException { |
69 | 69 | try{ |
70 | 70 | return src.getLengthComplete(docid,getSlot(pos)); |
— | — | @@ -72,14 +72,14 @@ |
73 | 73 | throw e; |
74 | 74 | } |
75 | 75 | } |
76 | | - |
| 76 | + |
77 | 77 | public float rank(int docid) throws IOException { |
78 | 78 | if(hasRankingData) |
79 | 79 | return src.getRank(docid); |
80 | | - else |
| 80 | + else |
81 | 81 | throw new RuntimeException("Trying to fetch ranking data on field "+field+" where it's not available."); |
82 | 82 | } |
83 | | - |
| 83 | + |
84 | 84 | public int namespace(int docid) throws IOException{ |
85 | 85 | return src.getNamespace(docid); |
86 | 86 | } |
— | — | @@ -87,7 +87,7 @@ |
88 | 88 | public boolean hasRankingData() { |
89 | 89 | return hasRankingData; |
90 | 90 | } |
91 | | - |
| 91 | + |
92 | 92 | public Flags flags(int docid, int pos) throws IOException { |
93 | 93 | try{ |
94 | 94 | return src.getFlags(docid,getSlot(pos)); |
— | — | @@ -96,14 +96,14 @@ |
97 | 97 | throw e; |
98 | 98 | } |
99 | 99 | } |
100 | | - |
| 100 | + |
101 | 101 | /** Provides ranking information */ |
102 | 102 | public static class RankInfo extends AggregateInfoImpl { |
103 | 103 | @Override |
104 | 104 | public void init(IndexReader reader, String field) throws IOException { |
105 | 105 | super.init(reader, "alttitle"); |
106 | 106 | } |
107 | | - |
| 107 | + |
108 | 108 | } |
109 | 109 | |
110 | 110 | } |
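All the accessors above funnel through getSlot(), which maps a raw term position back to the aggregate item it belongs to: items are indexed AggregateAnalyzer.TOKEN_GAP positions apart, so integer division recovers the slot used to index the cached length, boost, and flags arrays. A small illustration of the arithmetic; TOKEN_GAP's real value is defined in AggregateAnalyzer and does not appear in this diff, so 100 below is a placeholder:

    // Placeholder value; the real constant lives in AggregateAnalyzer.
    static final int TOKEN_GAP = 100;

    static int getSlot(int pos) {         // mirrors AggregateInfoImpl.getSlot()
        return pos / TOKEN_GAP;           // each aggregate item owns one gap
    }
    // With TOKEN_GAP = 100, a match at position 230 lands in slot 2, i.e. the
    // third aggregate item, whose meta record the cached source then supplies.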
Index: branches/lucene-search-2.1/build.xml |
— | — | @@ -13,9 +13,9 @@ |
14 | 14 | <property name="include.src" value="src/** sql/** build.xml scripts/* webinterface/* VERSION configure build update test/** udplogger/**"/> |
15 | 15 | <property name="include.bin" value="*.log4j *.txt config.inc template/** udplogger/**"/> |
16 | 16 | <property name="include.sh" value="configure build update lsearchd"/> |
17 | | - |
| 17 | + |
18 | 18 | <property file="${basedir}/hostname"/> |
19 | | - |
| 19 | + |
20 | 20 | <path id="classpath"> |
21 | 21 | <fileset dir="${lib}" includes="*.jar"/> |
22 | 22 | </path> |
— | — | @@ -28,10 +28,10 @@ |
29 | 29 | <fileset dir="${lib}"> |
30 | 30 | <include name="*.jar"/> |
31 | 31 | </fileset> |
32 | | - </classpath> |
| 32 | + </classpath> |
33 | 33 | </java> |
34 | | - </target> |
35 | | - |
| 34 | + </target> |
| 35 | + |
36 | 36 | <target name="makejar" depends="build"> |
37 | 37 | <jar destfile="${basedir}/${jar.name}"> |
38 | 38 | <manifest> |
— | — | @@ -40,42 +40,42 @@ |
41 | 41 | </manifest> |
42 | 42 | <zipfileset dir="${bin}" prefix=""> |
43 | 43 | <include name="org/**"/> |
44 | | - </zipfileset> |
| 44 | + </zipfileset> |
45 | 45 | </jar> |
46 | 46 | </target> |
47 | | - |
| 47 | + |
48 | 48 | <target name="alljar" depends="build" description="All-in-one jar"> |
49 | 49 | <jar jarfile="${jar.name}" compress="true"> |
50 | 50 | <manifest> |
51 | 51 | <attribute name="Main-Class" value="org.wikimedia.lsearch.config.StartupManager" /> |
52 | 52 | </manifest> |
53 | 53 | <fileset dir="bin" includes="org/**" /> |
54 | | - |
| 54 | + |
55 | 55 | <!-- pack libraries as well --> |
56 | 56 | <zipfileset src="lib/xmlrpc-common-3.0.jar" /> |
57 | 57 | <zipfileset src="lib/xmlrpc-client-3.0.jar" /> |
58 | 58 | <zipfileset src="lib/xmlrpc-server-3.0.jar" /> |
59 | 59 | <zipfileset src="lib/commons-logging-1.1.jar" /> |
60 | 60 | <zipfileset src="lib/ws-commons-util-1.0.1.jar" /> |
61 | | - <zipfileset src="lib/log4j-1.2.14.jar" /> |
| 61 | + <zipfileset src="lib/log4j-1.2.14.jar" /> |
62 | 62 | <zipfileset src="lib/lucene-core-2.3.jar" /> |
63 | | - <zipfileset src="lib/lucene-analyzers.jar" /> |
| 63 | + <zipfileset src="lib/lucene-analyzers.jar" /> |
64 | 64 | <zipfileset src="lib/snowball.jar" /> |
65 | 65 | <zipfileset src="lib/mwdumper.jar" /> |
66 | 66 | <zipfileset src="lib/mysql-connector-java-3.0.17-ga-bin.jar" /> |
67 | | - |
| 67 | + |
68 | 68 | <fileset dir="resources" includes="*/**" /> |
69 | 69 | </jar> |
70 | 70 | </target> |
71 | 71 | |
72 | | - |
| 72 | + |
73 | 73 | <target name="build" description="Compile classes"> |
74 | 74 | <mkdir dir="${bin}"/> |
75 | 75 | <javac srcdir="${src}/org/" debug="on" encoding="UTF-8" includes="**/*.java" destdir="${bin}/"> |
76 | 76 | <classpath refid="classpath"/> |
77 | 77 | </javac> |
78 | 78 | </target> |
79 | | - |
| 79 | + |
80 | 80 | <target name="pack" description="Make tar.gz distribution"> |
81 | 81 | <mkdir dir="${dist}"/> |
82 | 82 | <delete file="${dist}/${pack.name}.tar"/> |
— | — | @@ -87,7 +87,7 @@ |
88 | 88 | <gzip zipfile="${dist}/${pack.name}.tar.gz" src="${dist}/${pack.name}.tar"/> |
89 | 89 | <delete file="${dist}/${pack.name}.tar"/> |
90 | 90 | </target> |
91 | | - |
| 91 | + |
92 | 92 | <target name="pack-src" depends="alljar" description="Make tar.gz distribution of only core source files"> |
93 | 93 | <mkdir dir="${dist}"/> |
94 | 94 | <delete file="${dist}/${src.name}.tar"/> |
— | — | @@ -100,7 +100,7 @@ |
101 | 101 | <delete file="${dist}/${src.name}.tar"/> |
102 | 102 | </target> |
103 | 103 | |
104 | | - |
| 104 | + |
105 | 105 | <target name="binary" depends="alljar" description="Make binary tar.gz distribution"> |
106 | 106 | <mkdir dir="${bin}"/> |
107 | 107 | <delete file="${dist}/${binary.name}.tar"/> |
— | — | @@ -113,5 +113,5 @@ |
114 | 114 | <gzip zipfile="${dist}/${binary.name}.tar.gz" src="${dist}/${binary.name}.tar"/> |
115 | 115 | <delete file="${dist}/${binary.name}.tar"/> |
116 | 116 | </target> |
117 | | - |
| 117 | + |
118 | 118 | </project> |
Property changes on: branches/lucene-search-2.1 |
___________________________________________________________________ |
Name: svn:ignore |
119 | 119 | - lsearch.conf |
lsearch-global.conf |
lsearch.log4j |
rsyncd.conf |
*~ |
indexes |
120 | 120 | + bin |
dumps |
lsearch.log4j |
indexes |
lsearch-global.conf |
lsearch.conf |
LuceneSearch.jar |
config.inc |