Index: trunk/lucene-search-3/src/main/java/org/wikimedia/lsearch/analyzers/AcronymFilter.java |
— | — | @@ -7,12 +7,23 @@ |
8 | 8 | import org.apache.lucene.analysis.TokenStream; |
9 | 9 | |
10 | 10 | /** |
11 | | - * Filters acronyms tokens to tokens without internal dots. |
| 11 | +/** |
| 12 | + * Filters acronym tokens, emitting a normalized undotted version alongside each. |
| 13 | + * |
| 14 | + * It takes N.A.S.A. and returns both N.A.S.A. and NASA. |
| 15 | + * The undotted token has 0 increment; it is buffered until the next invocation. |
12 | 16 | * |
| 17 | + * TODO: support the new filter interface |
| 18 | + * TODO: add learning mode - i.e. dumping ACRONYMS into a repository. |
| 19 | + * TODO: set token type to ACRONYM |
| 20 | + * TODO: if the token is ABCD. it is not an acronym -> remove dot buffer. |
| 21 | + * |
13 | 22 | */ |
14 | 23 | public class AcronymFilter extends TokenFilter { |
15 | 24 | |
16 | | - protected transient Token buffered = null; // TODO: document buffer behavior. |
| 25 | + //if an acronym is detected, the normalized version is stored in
| 26 | + //this buffer until the next call to next()
| 27 | + protected transient Token buffered = null; |
17 | 28 | |
18 | 29 | public AcronymFilter(final TokenStream input) { |
19 | 30 | super(input); |