Index: trunk/lucene-search/src/org/wikimedia/lsearch/MWDaemon.java |
— | — | @@ -61,6 +61,7 @@ |
62 | 62 | return; |
63 | 63 | } |
64 | 64 | indexPath = p.getProperty("mwsearch.indexpath"); |
| 65 | + SearchClientReader.init(); |
65 | 66 | System.out.println("Binding server to port " + port); |
66 | 67 | |
67 | 68 | try { |
Index: trunk/lucene-search/src/org/wikimedia/lsearch/SearchClientReader.java |
— | — | @@ -55,6 +55,7 @@ |
56 | 56 | String searchterm; |
57 | 57 | BufferedReader istrm; |
58 | 58 | BufferedWriter ostrm; |
| 59 | + String what; |
59 | 60 | |
60 | 61 | static Searcher searcher = null; |
61 | 62 | static Analyzer analyzer = null; |
— | — | @@ -68,7 +69,7 @@ |
69 | 70 | //"\\(", "\\)" |
70 | 71 | }; |
71 | 72 | |
72 | | - static { |
| 73 | + public static void init() { |
73 | 74 | try { |
74 | 75 | analyzer = new StandardAnalyzer(); |
75 | 76 | parser = new QueryParser("contents", analyzer); |
— | — | @@ -89,14 +90,24 @@ |
90 | 91 | try { |
91 | 92 | istrm = new BufferedReader(new InputStreamReader(client.getInputStream())); |
92 | 93 | ostrm = new BufferedWriter(new OutputStreamWriter(client.getOutputStream())); |
| 94 | + what = istrm.readLine(); |
93 | 95 | rawsearchterm = istrm.readLine(); |
94 | 96 | rawsearchterm = URLDecoder.decode(rawsearchterm, "UTF-8"); |
95 | | - for (int i = 0; i < specialChars.length; ++i) |
96 | | - rawsearchterm = rawsearchterm.replaceAll(specialChars[i], |
97 | | - "\\" + specialChars[i]); |
98 | | - searchterm = "title:(" + rawsearchterm + ")^4 OR contents:(" |
99 | | - + rawsearchterm + ")"; |
| 97 | + //for (int i = 0; i < specialChars.length; ++i) |
| 98 | + // rawsearchterm = rawsearchterm.replaceAll(specialChars[i], |
| 99 | + // "\\" + specialChars[i]); |
| 100 | + String escaped = ""; |
| 101 | + for (int i = 0; i < rawsearchterm.length(); ++i) |
| 102 | + escaped += "\\" + rawsearchterm.charAt(i); |
| 103 | + |
| 104 | + if (what.equals("TITLEMATCH")) { |
| 105 | + doTitleMatches(escaped); |
| 106 | + return; |
| 107 | + } |
100 | 108 | |
| 109 | + searchterm = "title:(" + escaped + ")^4 OR contents:(" |
| 110 | + + escaped + ")"; |
| 111 | + |
101 | 112 | System.out.println("Query: " + searchterm); |
102 | 113 | Query query = parser.parse(searchterm); |
103 | 114 | System.out.println("Parsed: [" + query.toString() + "]"); |
— | — | @@ -110,7 +121,8 @@ |
111 | 122 | float score = hits.score(i); |
112 | 123 | String namespace = doc.get("namespace"); |
113 | 124 | String title = doc.get("title"); |
114 | | - ostrm.write(score + " " + namespace + " " + title + "\n"); |
| 125 | + ostrm.write(score + " " + namespace + " " + |
| 126 | + title.replaceAll(" ", "_") + "\n"); |
115 | 127 | ++i; |
116 | 128 | } |
117 | 129 | if (numhits == 0) { |
— | — | @@ -121,6 +133,7 @@ |
122 | 134 | } catch (IOException e) { |
123 | 135 | } catch (Exception e) { |
124 | 136 | System.out.println("Unexpected exception: " + e.getMessage()); |
| 137 | + e.printStackTrace(); |
125 | 138 | } finally { |
126 | 139 | try { |
127 | 140 | istrm.close(); |
— | — | @@ -130,6 +143,45 @@ |
131 | 144 | } |
132 | 145 | } |
133 | 146 | |
| 147 | + void doTitleMatches(String term) { /* Runs a title-only query built from term and writes at most 10 hits to ostrm, one per line. Returns nothing; results and errors are only visible through ostrm and stdout. */ |
| 148 | + try { |
| 149 | + String terms[] = term.split(" +"); /* break the input into words on runs of spaces. NOTE(review): the calling code shown in this diff prefixes every character with a backslash, spaces included, so each piece except the last presumably ends in a lone backslash that would escape the tilde appended below - confirm and fix at the call site if so. */ |
| 150 | + term = ""; /* the parameter is reused as the accumulator for the rebuilt query text */ |
| 151 | + for (int i = 0; i < terms.length; ++i) { |
| 152 | + term += terms[i] + "~ "; /* a trailing tilde requests a fuzzy match in Lucene query parser syntax */ |
| 153 | + } |
| 154 | + searchterm = "title:(" + term + ")"; /* restrict the whole query to the title field */ |
| 155 | + /* the query text, its parsed form and the hit count are echoed to stdout below */ |
| 156 | + System.out.println("Query: " + searchterm); |
| 157 | + Query query = parser.parse(searchterm); |
| 158 | + System.out.println("Parsed: [" + query.toString() + "]"); |
| 159 | + Hits hits = searcher.search(query); |
| 160 | + int numhits = hits.length(); |
| 161 | + System.out.println(numhits + " hits"); |
| 162 | + int i = 0; |
| 163 | + while (i < numhits && i < 10) { /* cap the response at the first 10 hits */ |
| 164 | + Document doc = hits.doc(i); |
| 165 | + float score = hits.score(i); |
| 166 | + String namespace = doc.get("namespace"); |
| 167 | + String title = doc.get("title"); |
| 168 | + ostrm.write(score + " " + namespace + " " + |
| 169 | + title.replaceAll(" ", "_") + "\n"); /* line format: score, namespace, title, separated by single spaces; spaces inside the title become underscores */ |
| 170 | + ++i; |
| 171 | + } |
| 172 | + ostrm.flush(); |
| 173 | + } catch (IOException e) { /* I/O failures while writing are silently dropped */ |
| 174 | + } catch (Exception e) { /* anything else, query parse errors included, is reported on stdout with a stack trace */ |
| 175 | + System.out.println("Unexpected exception: " + e.getMessage()); |
| 176 | + e.printStackTrace(); |
| 177 | + } finally { /* always closes both client streams, so they are unusable once this method returns */ |
| 178 | + try { |
| 179 | + istrm.close(); |
| 180 | + ostrm.flush(); |
| 181 | + ostrm.close(); |
| 182 | + } catch (IOException e) {} /* failures during close are ignored */ |
| 183 | + } |
| 184 | + } |
| 185 | + |
134 | 186 | String makeSpelFix(String query) { |
135 | 187 | try { |
136 | 188 | boolean anysuggest = false; |
— | — | @@ -141,7 +193,7 @@ |
142 | 194 | String bestmatch = terms[i]; |
143 | 195 | double bestscore = -1; |
144 | 196 | FuzzyTermEnum enum = new FuzzyTermEnum(reader, |
145 | | - new Term("contents", terms[i]), 0.5f, 2); |
| 197 | + new Term("contents", terms[i]), 0.5f, 1); |
146 | 198 | while (enum.next()) { |
147 | 199 | Term term = enum.term(); |
148 | 200 | int score = editDistance(terms[i], term.text(), terms[i].length(), |
Index: trunk/lucene-search/src/org/wikimedia/lsearch/MWSearch.java |
— | — | @@ -120,7 +120,7 @@ |
121 | 121 | long now = System.currentTimeMillis(); |
122 | 122 | long numArticles = 0; |
123 | 123 | |
124 | | - String query = "SELECT old_namespace,old_title,old_text " + |
| 124 | + String query = "SELECT page_namespace,page_title,old_text " + |
125 | 125 | "FROM page, text WHERE old_id=page_latest AND page_is_redirect=0"; |
126 | 126 | PreparedStatement pstmt; |
127 | 127 | try { |
— | — | @@ -128,7 +128,7 @@ |
129 | 129 | ResultSet rs = pstmt.executeQuery(); |
130 | 130 | while (rs.next()) { |
131 | 131 | String namespace = rs.getString(1); |
132 | | - String title = rs.getString(2); |
| 132 | + String title = rs.getString(2).replaceAll("_", " "); |
133 | 133 | String content = rs.getString(3); |
134 | 134 | if (!latin1) { |
135 | 135 | try { |
— | — | @@ -136,8 +136,10 @@ |
137 | 137 | content = new String(content.getBytes("ISO-8859-1"), "UTF-8"); |
138 | 138 | } catch (UnsupportedEncodingException e) {} |
139 | 139 | } |
140 | | - System.out.println("article " + namespace + ":" + title + |
141 | | - content.length()); |
| 140 | + if (title.equals("Post-it")) { |
| 141 | + System.out.println("namespace="+namespace+" title="+title+ |
| 142 | + "content=["+content+"]"); |
| 143 | + } |
142 | 144 | Document d = new Document(); |
143 | 145 | d.add(Field.Text("namespace", namespace)); |
144 | 146 | d.add(Field.Text("title", title)); |
— | — | @@ -149,14 +151,20 @@ |
150 | 152 | + ":" + title + "]: " + e5.getMessage()); |
151 | 153 | return; |
152 | 154 | } |
153 | | - ++numArticles; |
| 155 | + if ((++numArticles % 1000) == 0) { |
| 156 | + System.out.println(numArticles + "..."); |
| 157 | + } |
154 | 158 | } |
| 159 | + writer.close(); |
155 | 160 | } catch (SQLException e) { |
156 | 161 | System.out.println("Error: SQL error: " + e.getMessage()); |
157 | 162 | return; |
158 | 163 | } catch (OutOfMemoryError em) { |
159 | 164 | em.printStackTrace(); |
160 | 165 | return; |
| 166 | + } catch (IOException e) { |
| 167 | + System.out.println("Error: closing index: " + e.getMessage()); |
| 168 | + return; |
161 | 169 | } |
162 | 170 | double totaltime = (System.currentTimeMillis() - now) / 1000; |
163 | 171 | System.out.println("Done, indexed " + numArticles + " articles in " |
— | — | @@ -168,7 +176,6 @@ |
169 | 177 | i = text.indexOf("[[Image:"); |
170 | 178 | if (i == -1) i = text.indexOf("[[image:"); |
171 | 179 | int l = i; |
172 | | - System.out.println("1"); |
173 | 180 | while (i > -1) { |
174 | 181 | j = text.indexOf("[[", i + 2); |
175 | 182 | k = text.indexOf("]]", i + 2); |
— | — | @@ -186,7 +193,6 @@ |
187 | 194 | l = i; |
188 | 195 | } |
189 | 196 | } |
190 | | - System.out.println("2"); |
191 | 197 | |
192 | 198 | while ((i = text.indexOf("<!--")) != -1) { |
193 | 199 | if ((j = text.indexOf("-->", i)) == -1) |
— | — | @@ -196,7 +202,6 @@ |
197 | 203 | else |
198 | 204 | text = text.substring(0, i) + text.substring(j + 4); |
199 | 205 | } |
200 | | - System.out.println("3"); |
201 | 206 | text = text.replaceAll("\\{\\|(.*?)\\|\\}", "") |
202 | 207 | .replaceAll("\\[\\[[A-Za-z_-]+:([^|]+?)\\]\\]", "") |
203 | 208 | .replaceAll("\\[\\[([^|]+?)\\]\\]", "$1") |
— | — | @@ -207,7 +212,6 @@ |
208 | 213 | .replaceAll("('''|</?[bB]>)", "") |
209 | 214 | .replaceAll("''", "") |
210 | 215 | .replaceAll("</?[uU]>", ""); |
211 | | - System.out.println("4"); |
212 | 216 | return text; |
213 | 217 | } |
214 | 218 | } |