r6783 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r6782 | r6783 | r6784 >
Date:07:52, 23 December 2004
Author:kateturner
Status:old
Tags:
Comment:
change query format to put command (SEARCH, TITLEMATCH) before query;
TITLEMATCH command shows similar titles to search query;
various fixes for building the index.
Modified paths:
  • /trunk/lucene-search/src/org/wikimedia/lsearch/MWDaemon.java (modified) (history)
  • /trunk/lucene-search/src/org/wikimedia/lsearch/MWSearch.java (modified) (history)
  • /trunk/lucene-search/src/org/wikimedia/lsearch/SearchClientReader.java (modified) (history)

Diff [purge]

Index: trunk/lucene-search/src/org/wikimedia/lsearch/MWDaemon.java
@@ -61,6 +61,7 @@
6262 return;
6363 }
6464 indexPath = p.getProperty("mwsearch.indexpath");
 65+ SearchClientReader.init();
6566 System.out.println("Binding server to port " + port);
6667
6768 try {
Index: trunk/lucene-search/src/org/wikimedia/lsearch/SearchClientReader.java
@@ -55,6 +55,7 @@
5656 String searchterm;
5757 BufferedReader istrm;
5858 BufferedWriter ostrm;
 59+ String what;
5960
6061 static Searcher searcher = null;
6162 static Analyzer analyzer = null;
@@ -68,7 +69,7 @@
6970 //"\\(", "\\)"
7071 };
7172
72 - static {
 73+ public static void init() {
7374 try {
7475 analyzer = new StandardAnalyzer();
7576 parser = new QueryParser("contents", analyzer);
@@ -89,14 +90,24 @@
9091 try {
9192 istrm = new BufferedReader(new InputStreamReader(client.getInputStream()));
9293 ostrm = new BufferedWriter(new OutputStreamWriter(client.getOutputStream()));
 94+ what = istrm.readLine();
9395 rawsearchterm = istrm.readLine();
9496 rawsearchterm = URLDecoder.decode(rawsearchterm, "UTF-8");
95 - for (int i = 0; i < specialChars.length; ++i)
96 - rawsearchterm = rawsearchterm.replaceAll(specialChars[i],
97 - "\\" + specialChars[i]);
98 - searchterm = "title:(" + rawsearchterm + ")^4 OR contents:("
99 - + rawsearchterm + ")";
 97+ //for (int i = 0; i < specialChars.length; ++i)
 98+ // rawsearchterm = rawsearchterm.replaceAll(specialChars[i],
 99+ // "\\" + specialChars[i]);
 100+ String escaped = "";
 101+ for (int i = 0; i < rawsearchterm.length(); ++i)
 102+ escaped += "\\" + rawsearchterm.charAt(i);
 103+
 104+ if (what.equals("TITLEMATCH")) {
 105+ doTitleMatches(escaped);
 106+ return;
 107+ }
100108
 109+ searchterm = "title:(" + escaped + ")^4 OR contents:("
 110+ + escaped + ")";
 111+
101112 System.out.println("Query: " + searchterm);
102113 Query query = parser.parse(searchterm);
103114 System.out.println("Parsed: [" + query.toString() + "]");
@@ -110,7 +121,8 @@
111122 float score = hits.score(i);
112123 String namespace = doc.get("namespace");
113124 String title = doc.get("title");
114 - ostrm.write(score + " " + namespace + " " + title + "\n");
 125+ ostrm.write(score + " " + namespace + " " +
 126+ title.replaceAll(" ", "_") + "\n");
115127 ++i;
116128 }
117129 if (numhits == 0) {
@@ -121,6 +133,7 @@
122134 } catch (IOException e) {
123135 } catch (Exception e) {
124136 System.out.println("Unexpected exception: " + e.getMessage());
 137+ e.printStackTrace();
125138 } finally {
126139 try {
127140 istrm.close();
@@ -130,6 +143,45 @@
131144 }
132145 }
133146
 147+ void doTitleMatches(String term) {
 148+ try {
 149+ String terms[] = term.split(" +");
 150+ term = "";
 151+ for (int i = 0; i < terms.length; ++i) {
 152+ term += terms[i] + "~ ";
 153+ }
 154+ searchterm = "title:(" + term + ")";
 155+
 156+ System.out.println("Query: " + searchterm);
 157+ Query query = parser.parse(searchterm);
 158+ System.out.println("Parsed: [" + query.toString() + "]");
 159+ Hits hits = searcher.search(query);
 160+ int numhits = hits.length();
 161+ System.out.println(numhits + " hits");
 162+ int i = 0;
 163+ while (i < numhits && i < 10) {
 164+ Document doc = hits.doc(i);
 165+ float score = hits.score(i);
 166+ String namespace = doc.get("namespace");
 167+ String title = doc.get("title");
 168+ ostrm.write(score + " " + namespace + " " +
 169+ title.replaceAll(" ", "_") + "\n");
 170+ ++i;
 171+ }
 172+ ostrm.flush();
 173+ } catch (IOException e) {
 174+ } catch (Exception e) {
 175+ System.out.println("Unexpected exception: " + e.getMessage());
 176+ e.printStackTrace();
 177+ } finally {
 178+ try {
 179+ istrm.close();
 180+ ostrm.flush();
 181+ ostrm.close();
 182+ } catch (IOException e) {}
 183+ }
 184+ }
 185+
134186 String makeSpelFix(String query) {
135187 try {
136188 boolean anysuggest = false;
@@ -141,7 +193,7 @@
142194 String bestmatch = terms[i];
143195 double bestscore = -1;
144196 FuzzyTermEnum enum = new FuzzyTermEnum(reader,
145 - new Term("contents", terms[i]), 0.5f, 2);
 197+ new Term("contents", terms[i]), 0.5f, 1);
146198 while (enum.next()) {
147199 Term term = enum.term();
148200 int score = editDistance(terms[i], term.text(), terms[i].length(),
Index: trunk/lucene-search/src/org/wikimedia/lsearch/MWSearch.java
@@ -120,7 +120,7 @@
121121 long now = System.currentTimeMillis();
122122 long numArticles = 0;
123123
124 - String query = "SELECT old_namespace,old_title,old_text " +
 124+ String query = "SELECT page_namespace,page_title,old_text " +
125125 "FROM page, text WHERE old_id=page_latest AND page_is_redirect=0";
126126 PreparedStatement pstmt;
127127 try {
@@ -128,7 +128,7 @@
129129 ResultSet rs = pstmt.executeQuery();
130130 while (rs.next()) {
131131 String namespace = rs.getString(1);
132 - String title = rs.getString(2);
 132+ String title = rs.getString(2).replaceAll("_", " ");
133133 String content = rs.getString(3);
134134 if (!latin1) {
135135 try {
@@ -136,8 +136,10 @@
137137 content = new String(content.getBytes("ISO-8859-1"), "UTF-8");
138138 } catch (UnsupportedEncodingException e) {}
139139 }
140 - System.out.println("article " + namespace + ":" + title +
141 - content.length());
 140+ if (title.equals("Post-it")) {
 141+ System.out.println("namespace="+namespace+" title="+title+
 142+ "content=["+content+"]");
 143+ }
142144 Document d = new Document();
143145 d.add(Field.Text("namespace", namespace));
144146 d.add(Field.Text("title", title));
@@ -149,14 +151,20 @@
150152 + ":" + title + "]: " + e5.getMessage());
151153 return;
152154 }
153 - ++numArticles;
 155+ if ((++numArticles % 1000) == 0) {
 156+ System.out.println(numArticles + "...");
 157+ }
154158 }
 159+ writer.close();
155160 } catch (SQLException e) {
156161 System.out.println("Error: SQL error: " + e.getMessage());
157162 return;
158163 } catch (OutOfMemoryError em) {
159164 em.printStackTrace();
160165 return;
 166+ } catch (IOException e) {
 167+ System.out.println("Error: closing index: " + e.getMessage());
 168+ return;
161169 }
162170 double totaltime = (System.currentTimeMillis() - now) / 1000;
163171 System.out.println("Done, indexed " + numArticles + " articles in "
@@ -168,7 +176,6 @@
169177 i = text.indexOf("[[Image:");
170178 if (i == -1) i = text.indexOf("[[image:");
171179 int l = i;
172 - System.out.println("1");
173180 while (i > -1) {
174181 j = text.indexOf("[[", i + 2);
175182 k = text.indexOf("]]", i + 2);
@@ -186,7 +193,6 @@
187194 l = i;
188195 }
189196 }
190 - System.out.println("2");
191197
192198 while ((i = text.indexOf("<!--")) != -1) {
193199 if ((j = text.indexOf("-->", i)) == -1)
@@ -196,7 +202,6 @@
197203 else
198204 text = text.substring(0, i) + text.substring(j + 4);
199205 }
200 - System.out.println("3");
201206 text = text.replaceAll("\\{\\|(.*?)\\|\\}", "")
202207 .replaceAll("\\[\\[[A-Za-z_-]+:([^|]+?)\\]\\]", "")
203208 .replaceAll("\\[\\[([^|]+?)\\]\\]", "$1")
@@ -207,7 +212,6 @@
208213 .replaceAll("('''|</?[bB]>)", "")
209214 .replaceAll("''", "")
210215 .replaceAll("</?[uU]>", "");
211 - System.out.println("4");
212216 return text;
213217 }
214218 }

Status & tagging log