r111293 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r111292‎ | r111293 | r111294 >
Date:14:00, 12 February 2012
Author:oren
Status:deferred
Tags:
Comment:
Upgraded most deprecated APIs, sped up the filter, and added some test cases.
Modified paths:
  • /trunk/lucene-search-3/src/main/java/org/wikimedia/lsearch/analyzers/AcronymFilter.java (modified) (history)
  • /trunk/lucene-search-3/src/test/java/org/wikimedia/lsearch/analyzers/AcronymFilterTest.java (added) (history)

Diff [purge]

Index: trunk/lucene-search-3/src/test/java/org/wikimedia/lsearch/analyzers/AcronymFilterTest.java
@@ -0,0 +1,90 @@
 2+package org.wikimedia.lsearch.analyzers;
 3+
 4+import static org.junit.Assert.assertFalse;
 5+import static org.junit.Assert.assertTrue;
 6+
 7+import org.apache.lucene.analysis.miscellaneous.EmptyTokenStream;
 8+import org.junit.Before;
 9+import org.junit.Test;
 10+
 11+/**
 12+ * The class <code>AcronymFilterTest</code> contains tests for the class
 13+ * {@link AcronymFilter}.
 14+ *
 15+ * @pattern JUnit Test Case
 16+ * @author oren
 17+ * @version $Revision$
 18+ */
 19+public class AcronymFilterTest {
 20+
 21+ AcronymFilter af;
 22+
 23+ @Before
 24+ public void Setup() {
 25+ af = new AcronymFilter(new EmptyTokenStream());
 26+
 27+ }
 28+
 29+ @Test
 30+ public void isAcronymTest() {
 31+
 32+ assertTrue(af.isAcronym(new char[] { 'a', '.', 'n', '.', 't' }));
 33+ }
 34+
 35+ @Test
 36+ public void testAcronymFilter() {
 37+
 38+ assertTrue(af.isAcronym(new char[] { 'a', '.', '1', '2', '1' }));
 39+
 40+ }
 41+
 42+ @Test
 43+ public void testAcronymFilter_2() {
 44+
 45+ assertTrue(af.isAcronym(new char[] { '.', 'b', 'c', 'd', 'a' }));
 46+
 47+ }
 48+
 49+ @Test
 50+ public void testAcronymFilter_3() {
 51+
 52+ AcronymFilter af = new AcronymFilter(new EmptyTokenStream());
 53+ assertTrue(af.isAcronym(new char[] { 'a', 'b', 'c', 'd', '.' }));
 54+
 55+ }
 56+
 57+ @Test
 58+ public void testAcronymFilter_4() {
 59+
 60+ assertFalse(af.isAcronym(new char[] { '1', '.', '2', '3', '4' }));
 61+
 62+ }
 63+
 64+ @Test
 65+ public void testAcronymFilter_5() {
 66+
 67+ assertFalse(af.isAcronym(new char[] { '1', '2', '2', '3', '4' }));
 68+
 69+ }
 70+
 71+ @Test
 72+ public void testAcronymFilter_6() {
 73+
 74+ assertFalse(af.isAcronym(new char[] { 'a', '1', '2', '3', 'a' }));
 75+
 76+ }
 77+
 78+ @Test
 79+ public void testAcronymFilter_7() {
 80+
 81+ assertFalse(af.isAcronym(new char[] { 'a', '1', '2', '3', 'a' }));
 82+
 83+ }
 84+
 85+ @Test
 86+ public void testAcronymFilter_8() {
 87+
 88+ assertFalse(af.isAcronym(new char[] { 'a', 'a', 'b', 'c', 'd' }));
 89+ }
 90+
 91+}
Property changes on: trunk/lucene-search-3/src/test/java/org/wikimedia/lsearch/analyzers/AcronymFilterTest.java
___________________________________________________________________
Added: svn:keywords
192 + LastChangedDate LastChangedRevision LastChangedBy Id
Index: trunk/lucene-search-3/src/main/java/org/wikimedia/lsearch/analyzers/AcronymFilter.java
@@ -7,35 +7,89 @@
88 import org.apache.lucene.analysis.TokenStream;
99
1010 /**
11 - * Removes dots from acronyms?
 11+ * Filters acronym tokens into tokens without internal dots.
 12+ *
1213 */
1314 public class AcronymFilter extends TokenFilter {
14 - Token buffered = null;
 15+
 16+ Token buffered = null; //TODO: document buffer behavior.
1517
1618 public AcronymFilter(TokenStream input) {
1719 super(input);
1820 }
19 -
 21+
 22+
2023 @Override
21 - public Token next(Token nextToken) throws IOException {
 24+ public Token next(Token reusableToken) throws IOException {
 25+
2226 if(buffered != null){
23 - nextToken = buffered;
 27+ reusableToken = buffered;
2428 buffered = null;
25 - return nextToken;
 29+ return reusableToken;
2630 }
27 - nextToken = input.next();
28 - if(nextToken == null)
 31+ reusableToken = input.next(reusableToken);
 32+ if(reusableToken == null)
2933 return null;
30 - if(nextToken.termText().contains(".") && !isNumber(nextToken.termText())){
31 - buffered = new Token(nextToken.termText().replace(".",""),nextToken.startOffset(),nextToken.endOffset(),nextToken.type());
 34+
 35+ if(isAcronym(reusableToken.termBuffer())){
 36+ buffered = new Token(filteredBuffer.toString(),reusableToken.startOffset(),reusableToken.endOffset(),reusableToken.type());
3237 buffered.setPositionIncrement(0);
3338 }
34 - return nextToken;
 39+ return reusableToken;
3540 }
3641
37 - protected boolean isNumber(String str){
38 - for(int i=0;i<str.length();i++){
39 - char c = str.charAt(i);
 42+ StringBuffer filteredBuffer = new StringBuffer();
 43+
 44+ /**
 45+ * Checks if a token is an acronym and generates its filtered (dot-free) version.
 46+ *
 47+ * @param buffer
 48+ * @param start
 49+ * @param end
 50+ * @return
 51+ */
 52+ protected boolean isAcronym(char[] buffer){
 53+
 54+ boolean isAlpha=false;
 55+ boolean hasDot=false;
 56+ //boolean isNumeric=false;
 57+
 58+ filteredBuffer.setLength(0);
 59+
 60+ char c=' ';
 61+
 62+ for (int offset = 0; offset < buffer.length; offset++) {
 63+ c = buffer[offset];
 64+
 65+ if (c == '.') {
 66+ hasDot = true;
 67+ } else {
 68+
 69+ // side effect - filter the dot
 70+ filteredBuffer.append(c);
 71+
 72+ if (!isAlpha && c >= '0' && c <= '9') {
 73+ //isNumeric = true;
 74+ } else {
 75+ isAlpha = true;
 76+ }
 77+
 78+ //process full string
 79+ }
 80+ }
 81+
 82+ return hasDot && isAlpha ;
 83+ }
 84+
 85+ protected boolean hasDot(char[] buffer){
 86+ for(char c: buffer){
 87+ if (c=='.') return true;
 88+ }
 89+ return false;
 90+ }
 91+
 92+ protected boolean isNumber(char[] buffer){
 93+ for(char c: buffer){
4094 if(! ((c >= '0' && c <='9') || (c=='.') ))
4195 return false;
4296 }

Status & tagging log