r53942 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r53941‎ | r53942 | r53943 >
Date:14:27, 29 July 2009
Author:werdna
Status:deferred
Tags:
Comment:
Search support for LiquidThreads:
* Add the threading data, which is added to XML dumps in WikiExporter and parsed out in MWDumper, to the Lucene index.
* Handle the keywords "inthread:topPostID" and "ondiscussionpage:PageID" as conditions on the corresponding thread metadata.
TODO: Customise search result display for threads.
TODO: Add a search box :)
Modified paths:
  • /branches/lucene-search-2.1/lib/mwdumper.jar (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/Article.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/DumpImporter.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiIndexModifier.java (modified) (history)

Diff [purge]

Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/Article.java
@@ -66,6 +66,9 @@
6767 /** Rank of the redirect target */
6868 private int redirectRank = 0;
6969
 70+ /** Threading information */
 71+ public Hashtable<String,String> DiscussionThreadingInfo;
 72+
7073 public Article(){
7174 namespace="";
7275 title="";
@@ -75,6 +78,7 @@
7679 redirects=new ArrayList<Redirect>();
7780 related = new ArrayList<RelatedTitle>();
7881 anchors = new Hashtable<String,Integer>();
 82+ this.DiscussionThreadingInfo = new Hashtable<String,String>();
7983 }
8084
8185 public Article(long pageId, Title title, String text, String redirectTo, int references, int redirectTargetNamespace, int redirectRank) {
@@ -87,6 +91,7 @@
8892
8993 public Article(long pageId, int namespace, String titleText, String text, String redirectTo, int references, int redirectTargetNamespace, int redirectRank,
9094 ArrayList<Redirect> redirects, ArrayList<RelatedTitle> related, Hashtable<String,Integer> anchorRank, Date date) {
 95+ this();
9196 this.namespace = Integer.toString(namespace);
9297 this.title = titleText;
9398 this.contents = text;
@@ -101,6 +106,20 @@
102107 this.redirectRank = redirectRank;
103108 }
104109
 110+ public Article(long pageId, int namespace, String titleText, String text,
 111+ String redirectTo, int references, int redirectTargetNamespace,
 112+ int redirectRank, ArrayList<Redirect> redirects,
 113+ ArrayList<RelatedTitle> related,
 114+ Hashtable<String,Integer> anchorRank, Date date,
 115+ Hashtable<String,String> DiscussionThreadingInfo) {
 116+
 117+ this(pageId, namespace, titleText, text, redirectTo, references,
 118+ redirectTargetNamespace, redirectRank, redirects, related,
 119+ anchorRank, date);
 120+
 121+ this.DiscussionThreadingInfo = DiscussionThreadingInfo;
 122+ }
 123+
105124 public boolean isRedirect() {
106125 return redirectTo != null;
107126 }
@@ -288,8 +307,7 @@
289308
290309 public void setRedirectRank(int redirectRank) {
291310 this.redirectRank = redirectRank;
292 - }
293 -
 311+ }
294312
295313
296314 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/DumpImporter.java
@@ -108,8 +108,10 @@
109109 if(makeIndex && related != null)
110110 rel = related.getRelated(key);
111111 // make article
112 - Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,redirectTo,
113 - references,redirectTargetNamespace,0,redirects,rel,anchors,date);
 112+ Article article = new Article(page.Id,page.Title.Namespace,
 113+ page.Title.Text,revision.Text,redirectTo,references,
 114+ redirectTargetNamespace,0,redirects,rel,anchors,date,
 115+ page.DiscussionThreadingInfo);
114116 // index
115117 if(indexWriter != null)
116118 indexWriter.addArticle(article);
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiIndexModifier.java
@@ -781,6 +781,14 @@
782782 rtitle.setBoost(rankBoost);
783783 doc.add(rtitle);
784784
 785+ // Threading information
 786+ java.util.Enumeration e = article.DiscussionThreadingInfo.keys();
 787+ while (e.hasMoreElements()) {
 788+ String key = (String)e.nextElement();
 789+ String value = article.DiscussionThreadingInfo.get(key);
 790+ doc.add( new Field( key, value, Store.YES, Index.UN_TOKENIZED) );
 791+ }
 792+
785793 // extra info (for spellcheck indexes)
786794 if(extraInfo){
787795 addSpellCheckInfo(doc,article.getTitle(),tokenizer.getKeywords(),tokenizer.getHeadingText(),article.getRedirectKeywords(),iid,fields);
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java
@@ -424,6 +424,10 @@
425425 public HashSet<NamespaceFilter> getFieldNamespaces(String queryText){
426426 HashSet<String> fields = getFields(queryText);
427427 HashSet<NamespaceFilter> ret = new HashSet<NamespaceFilter>();
 428+ List ThreadingKeywords = new ArrayList();
 429+ ThreadingKeywords.add("inthread");
 430+ ThreadingKeywords.add("ondiscussionpage");
 431+
428432 for(String field : fields){
429433 field = field.toLowerCase();
430434 if(namespaceFilters.containsKey(field))
@@ -434,6 +438,8 @@
435439 ret.add(defaultNamespaceFilter);
436440 else if(field.startsWith("[")){
437441 ret.add(new NamespaceFilter(field.substring(1,field.length()-1)));
 442+ } else if (ThreadingKeywords.contains(field)) {
 443+ ret.add( new NamespaceFilter(90) );
438444 }
439445 }
440446
@@ -637,7 +643,13 @@
638644 else if(ch == ':'){
639645 // check if it's a valid field
640646 String f = new String(buffer,0,length);
641 - if(f.equals(namespaceAllKeyword) || f.equals("incategory") || f.equals("intitle") || namespaceFilters.containsKey(f) || namespacePolicy == NamespacePolicy.LEAVE){
 647+
 648+ List fieldOperators = getFieldOperators();
 649+
 650+ if( f.equals(namespaceAllKeyword)
 651+ || fieldOperators.contains(f)
 652+ || namespaceFilters.containsKey(f)
 653+ || namespacePolicy == NamespacePolicy.LEAVE){
642654 cur = lookup;
643655 return TokenType.FIELD;
644656 } else
@@ -649,6 +661,16 @@
650662 return TokenType.WORD;
651663 }
652664
 665+ private List getFieldOperators() {
 666+ List fieldOperators = new ArrayList();
 667+ fieldOperators.add("intitle");
 668+ fieldOperators.add("incategory");
 669+ fieldOperators.add("inthread");
 670+ fieldOperators.add("ondiscussionpage");
 671+
 672+ return fieldOperators;
 673+ }
 674+
653675 /**
654676 * Fetches prefixes like [0,1,2] (in [0,1,2]:query)
655677 *
@@ -722,25 +744,33 @@
723745 return makeTerm(token.termText());
724746 }
725747
726 - /** Make term form <code>buffer</code> */
 748+ /** Make term from <code>buffer</code> */
727749 private Term makeTerm(){
728750 return makeTerm(new String(buffer,0,length));
729751 }
730752
731753 /** Make a lucene term from string */
732754 private Term makeTerm(String t){
 755+ Hashtable<String,String> keywordFieldMapping = new Hashtable<String,String>();
 756+ keywordFieldMapping.put("inthread", "ThreadAncestor");
 757+ keywordFieldMapping.put("ondiscussionpage", "ThreadPage");
 758+
733759 if(currentField == null)
734760 return new Term(defaultField,builder.isExactCase()? t : t.toLowerCase());
735761 else if(defaultField.equals("contents") && isInTitle)
736762 return new Term("title",builder.isExactCase()? t : t.toLowerCase());
737 - else if(!"incategory".equals(currentField) &&
 763+ else if(currentField.equals("incategory")){
 764+ String norm = t.replace("_"," "); // bug 10822
 765+ return new Term("category",builder.isExactCase()? norm : norm.toLowerCase());
 766+ } else if( keywordFieldMapping.containsKey(currentField) ) {
 767+ String field = keywordFieldMapping.get(currentField);
 768+
 769+ return new Term(field, t);
 770+ } else if(!"incategory".equals(currentField) &&
738771 (namespacePolicy == NamespacePolicy.IGNORE ||
739772 namespacePolicy == NamespacePolicy.REWRITE))
740773 return new Term(defaultField,t);
741 - else if(currentField.equals("incategory")){
742 - String norm = t.replace("_"," "); // bug 10822
743 - return new Term("category",builder.isExactCase()? norm : norm.toLowerCase());
744 - } else
 774+ else
745775 return new Term(currentField,t);
746776 }
747777
Index: branches/lucene-search-2.1/lib/mwdumper.jar
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream

Status & tagging log