r65639 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r65638‎ | r65639 | r65640 >
Date:21:52, 28 April 2010
Author:daniel
Status:deferred
Tags:
Comment:
better default word pattern for phrase detection
Modified paths:
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/LanguageConfiguration.java (modified) (history)

Diff [purge]

Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/LanguageConfiguration.java
@@ -98,7 +98,7 @@
9999 }
100100
101101 public void defaults() throws IOException {
102 - if (this.wordPattern==null) this.wordPattern = Pattern.compile("\\p{L}+|\\p{Nd}+");
 102+ if (this.wordPattern==null) this.wordPattern = Pattern.compile("[\\p{L}'']+(?:[\\p{Pc}\\p{Pd}][\\p{L}'']+)*|\\p{Nd}+(?:.\\p{Nd}+)?");
103103
104104 this.sentenceManglers.add( new RegularExpressionMangler("\\s+\\(.*?\\)", "", 0) ); //strip parentacized blocks
105105 this.sentenceManglers.add( new RegularExpressionMangler("^([^\\p{L}]*(\\r\\n|\\r|\\n))+[^\\p{L}0-9]*\\s*", "", 0) ); //strip leading cruft (lines without any characters)

Status & tagging log