r57932 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r57931 | r57932 | r57933 >
Date:23:15, 19 October 2009
Author:rainman
Status:ok
Tags:
Comment:
Bug 21002 - add \u2019 (RIGHT SINGLE QUOTATION MARK) as an additional apostrophe character
Modified paths:
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java (modified) (history)
  • /branches/lucene-search-2.1/test/org/wikimedia/lsearch/analyzers/FastWikiTokenizerTest.java (modified) (history)

Diff [purge]

Index: branches/lucene-search-2.1/test/org/wikimedia/lsearch/analyzers/FastWikiTokenizerTest.java
@@ -90,6 +90,9 @@
9191
9292 assertEquals("1 [u2] 1 [heading1]",
9393 tokens("u2 heading1"));
 94+
 95+ assertEquals("1 [test] 1 [apostrophe's] 0 [apostrophes] 1 [and] 1 [other’s] 0 [others]",
 96+ tokens("Test apostrophe's and other\u2019s."));
9497
9598
9699 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java
@@ -209,7 +209,7 @@
210210 addDecomposed = true;
211211 continue; // skip
212212 }
213 - if(!Character.isUpperCase(buffer[i]) && buffer[i]!='.' && buffer[i]!='\'')
 213+ if(!Character.isUpperCase(buffer[i]) && buffer[i]!='.' && !isApostrophe(buffer[i]))
214214 allUpperCase = false;
215215 if(i == 0 && !Character.isUpperCase(buffer[i]))
216216 titleCase = false;
@@ -276,7 +276,7 @@
277277 }
278278
279279 // delete ' marks from words, add as aliases
280 - if(cl=='\''){
 280+ if(isApostrophe(cl)){
281281 addToTokenAlias("");
282282 addToAlias = false;
283283 }
@@ -438,6 +438,10 @@
439439 pos = Position.HEADING;
440440 return pos;
441441 }
 442+
 443+ private final boolean isApostrophe(char c){
 444+ return c == '\'' || c == '\u2019';
 445+ }
442446
443447 /** tidy the glue buffer, and return the token */
444448 private final ExtToken makeGlueToken(){
@@ -484,7 +488,7 @@
485489 || lc == '\n' || lc == '\r' || lc == '=' || (lc==';' && last=='\n'))
486490 continue; // forbidden chars
487491
488 - if(lc == '\'' && (last == '\'' || (i<glueLength-1 && glueBuffer[i+1]=='\'')))
 492+ if(isApostrophe(lc) && (isApostrophe(last) || (i<glueLength-1 && isApostrophe(glueBuffer[i+1]))))
489493 continue; // more than one '
490494
491495 // always put spaces before/after |
@@ -590,7 +594,7 @@
591595 try{
592596 // add new character to buffer
593597 if(Character.isLetter(c) || (!numberToken && length>0 && Character.isLetterOrDigit(c))
594 - || (c == '\'' && cur>0 && Character.isLetter(text[cur-1]) && cur+1<textLength && Character.isLetter(text[cur+1]) )
 598+ || (isApostrophe(c) && cur>0 && Character.isLetter(text[cur-1]) && cur+1<textLength && Character.isLetter(text[cur+1]) )
595599 || (c == '.' && cur+1<textLength && Character.isLetter(text[cur+1]) && (length<2 || (length>=2 && buffer[length-1]!='.' && buffer[length-2]=='.')))
596600 || decomposer.isCombiningChar(c)){
597601 if(numberToken) // we were fetching a number
@@ -932,6 +936,7 @@
933937 addLetter();
934938 continue;
935939 case '\'':
 940+ case '\u2019':
936941 if(cur + 1 < textLength ){
937942 c1 = text[cur+1];
938943 if(Character.isLetter(c1) && length>0 && Character.isLetter(buffer[length-1])){

Status & tagging log