Index: branches/lucene-search-2.1/test/org/wikimedia/lsearch/analyzers/FastWikiTokenizerTest.java |
— | — | @@ -90,6 +90,9 @@ |
91 | 91 | |
92 | 92 | assertEquals("1 [u2] 1 [heading1]", |
93 | 93 | tokens("u2 heading1")); |
| 94 | + |
| 95 | + assertEquals("1 [test] 1 [apostrophe's] 0 [apostrophes] 1 [and] 1 [other’s] 0 [others]", |
| 96 | + tokens("Test apostrophe's and other\u2019s.")); |
94 | 97 | |
95 | 98 | |
96 | 99 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java |
— | — | @@ -209,7 +209,7 @@ |
210 | 210 | addDecomposed = true; |
211 | 211 | continue; // skip |
212 | 212 | } |
213 | | - if(!Character.isUpperCase(buffer[i]) && buffer[i]!='.' && buffer[i]!='\'') |
| 213 | + if(!Character.isUpperCase(buffer[i]) && buffer[i]!='.' && !isApostrophe(buffer[i])) |
214 | 214 | allUpperCase = false; |
215 | 215 | if(i == 0 && !Character.isUpperCase(buffer[i])) |
216 | 216 | titleCase = false; |
— | — | @@ -276,7 +276,7 @@ |
277 | 277 | } |
278 | 278 | |
279 | 279 | // delete ' marks from words, add as aliases |
280 | | - if(cl=='\''){ |
| 280 | + if(isApostrophe(cl)){ |
281 | 281 | addToTokenAlias(""); |
282 | 282 | addToAlias = false; |
283 | 283 | } |
— | — | @@ -438,6 +438,10 @@ |
439 | 439 | pos = Position.HEADING; |
440 | 440 | return pos; |
441 | 441 | } |
| 442 | + |
| 443 | + private final boolean isApostrophe(char c){ // true if c is the ASCII apostrophe (U+0027) or the right single quotation mark (U+2019)
| 444 | + return c == '\'' || c == '\u2019';
| 445 | + }
442 | 446 | |
443 | 447 | /** tidy the glue buffer, and return the token */ |
444 | 448 | private final ExtToken makeGlueToken(){ |
— | — | @@ -484,7 +488,7 @@ |
485 | 489 | || lc == '\n' || lc == '\r' || lc == '=' || (lc==';' && last=='\n')) |
486 | 490 | continue; // forbidden chars |
487 | 491 | |
488 | | - if(lc == '\'' && (last == '\'' || (i<glueLength-1 && glueBuffer[i+1]=='\''))) |
| 492 | + if(isApostrophe(lc) && (isApostrophe(last) || (i<glueLength-1 && isApostrophe(glueBuffer[i+1])))) |
489 | 493 | continue; // more than one ' |
490 | 494 | |
491 | 495 | // always put spaces before/after | |
— | — | @@ -590,7 +594,7 @@ |
591 | 595 | try{ |
592 | 596 | // add new character to buffer |
593 | 597 | if(Character.isLetter(c) || (!numberToken && length>0 && Character.isLetterOrDigit(c)) |
594 | | - || (c == '\'' && cur>0 && Character.isLetter(text[cur-1]) && cur+1<textLength && Character.isLetter(text[cur+1]) ) |
| 598 | + || (isApostrophe(c) && cur>0 && Character.isLetter(text[cur-1]) && cur+1<textLength && Character.isLetter(text[cur+1]) ) |
595 | 599 | || (c == '.' && cur+1<textLength && Character.isLetter(text[cur+1]) && (length<2 || (length>=2 && buffer[length-1]!='.' && buffer[length-2]=='.'))) |
596 | 600 | || decomposer.isCombiningChar(c)){ |
597 | 601 | if(numberToken) // we were fetching a number |
— | — | @@ -932,6 +936,7 @@ |
933 | 937 | addLetter(); |
934 | 938 | continue; |
935 | 939 | case '\'': |
| 940 | + case '\u2019': |
936 | 941 | if(cur + 1 < textLength ){ |
937 | 942 | c1 = text[cur+1]; |
938 | 943 | if(Character.isLetter(c1) && length>0 && Character.isLetter(buffer[length-1])){ |