r18762 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r18761 | r18762 | r18763 >
Date:23:09, 1 January 2007
Author:magnusmanske
Status:old
Tags:
Comment:
Basic search functionality working
Modified paths:
  • /trunk/yawr/ZenoFile.cpp (modified) (history)
  • /trunk/yawr/ZenoFile.h (modified) (history)
  • /trunk/yawr/base.cpp (modified) (history)
  • /trunk/yawr/base.h (modified) (history)
  • /trunk/yawr/wxWebServer.cpp (modified) (history)
  • /trunk/yawr/wxWikiServer.cpp (modified) (history)
  • /trunk/yawr/wxWikiServerSearch.cpp (modified) (history)

Diff [purge]

Index: trunk/yawr/wxWikiServerSearch.cpp
@@ -10,9 +10,13 @@
1111 #include "base.h"
1212
1313 class TSearchWordTree ;
 14+class TSearchWordTreeTableLine ;
1415
1516 WX_DECLARE_OBJARRAY(TSearchWordTree*, ArrayOfTSearchWordTree);
 17+WX_DECLARE_OBJARRAY(TSearchWordTreeTableLine, ArrayOfTSearchWordTreeTableLine);
1618
 19+#define MAX_RETURN 20
 20+
1721 enum
1822 {
1923 TREE_NORMAL_LIST = 0 ,
@@ -23,6 +27,32 @@
2428
2529 // CAUTION : The whole search section has no user input sanity checks; e.g. unmatched () in the search string might bring it down
2630
 31+class TSearchWordTreeTableLine
 32+{
 33+ public :
 34+ unsigned long article_id ;
 35+ unsigned long word_pos ;
 36+
 37+ bool operator == ( const TSearchWordTreeTableLine &x )
 38+ {
 39+ return article_id == x.article_id && word_pos == x.word_pos ;
 40+ }
 41+ bool operator < ( const TSearchWordTreeTableLine &x )
 42+ {
 43+ return article_id < x.article_id || ( article_id == x.article_id && word_pos < x.word_pos ) ;
 44+ }
 45+} ;
 46+
 47+int CMPFUNCtable ( TSearchWordTreeTableLine **first, TSearchWordTreeTableLine **second)
 48+{
 49+ if ( (*first)->article_id < (*second)->article_id ) return -1 ;
 50+ if ( (*first)->article_id > (*second)->article_id ) return 1 ;
 51+ if ( (*first)->word_pos < (*second)->word_pos ) return -1 ;
 52+ if ( (*first)->word_pos > (*second)->word_pos ) return 1 ;
 53+ return 0 ;
 54+}
 55+
 56+
2757 class TSearchWordTree
2858 {
2959 public :
@@ -32,17 +62,28 @@
3363 int ScanKeyword ( int start , const wxArrayString &wordlist , TSearchWordTree &child ) ;
3464 wxString GetHTMLtree ( int depth = 0 ) ;
3565 bool IsUsingWildcards() ;
36 - wxString Process ( ZenoFile *index , int depth = 0 ) ;
 66+ wxArrayString Process ( ZenoFile *index , int depth = 0 ) ;
3767 bool StringHasWildcards ( wxString s ) ;
 68+ void CreateSingleWordTable ( wxString word , ZenoFile *index ) ;
 69+ void ProcessList () ;
 70+ void ProcessAND () ;
 71+ void ProcessOR () ;
 72+ TSearchWordTree *GetRoot() ;
 73+ void FilterTitleAgainstTable ( wxString word ) ;
 74+ void Explode ( wxString query , wxArrayString &words , bool is_parameter = true ) ;
 75+ bool DoesMatchTitle ( wxString word , wxString title ) ;
3876
3977 ArrayOfTSearchWordTree children ;
4078 TSearchWordTree *_parent ;
4179 int type ;
 80+ bool title_only , fuzzy ;
4281 wxString word ;
 82+ ArrayOfTSearchWordTreeTableLine table ;
4383 } ;
4484
4585 #include <wx/arrimpl.cpp> // this is a magic incantation which must be done!
4686 WX_DEFINE_OBJARRAY(ArrayOfTSearchWordTree);
 87+WX_DEFINE_OBJARRAY(ArrayOfTSearchWordTreeTableLine);
4788
4889
4990
@@ -51,8 +92,16 @@
5293 {
5394 _parent = parent ;
5495 type = TREE_NORMAL_LIST ;
 96+ title_only = false ;
 97+ fuzzy = false ;
5598 }
5699
 100+TSearchWordTree *TSearchWordTree::GetRoot()
 101+{
 102+ if ( _parent ) return _parent->GetRoot() ;
 103+ else return this ;
 104+}
 105+
57106 void TSearchWordTree::Parse ( wxArrayString words )
58107 {
59108 // Cleanup; unnecessary, but who knows?
@@ -131,7 +180,7 @@
132181 {
133182 int a ;
134183 wxString ret ;
135 - ret = word ;
 184+ if ( type == TREE_WORD ) ret = _T("{") + word + _T("}") ;
136185 for ( a = 0 ; a < children.GetCount() ; a++ )
137186 {
138187 if ( a > 0 )
@@ -150,12 +199,14 @@
151200 bool TSearchWordTree::StringHasWildcards ( wxString s )
152201 {
153202 int a ;
 203+ wxString against = _T("abcdefghijklmnopqrstuvwxyz") ;
154204 for ( a = 0 ; a < s.Length() ; a++ )
155205 {
156206 wxChar c = s.GetChar(a) ;
157 - if ( c >= 'a' && c <= 'z' ) continue ;
 207+ if ( against.Find ( c ) > -1 ) continue ;
 208+/* if ( c >= 'a' && c <= 'z' ) continue ;
158209 if ( c >= '0' && c <= '9' ) continue ;
159 - if ( c == '�' || c == '�' || c == '�' || c == '�' ) continue ;
 210+ if ( c == '�' || c == '�' || c == '�' || c == '�' ) continue ;*/
160211 return true ; // Something else, assumed wildcard
161212 }
162213 return false ;
@@ -171,55 +222,217 @@
172223 return false ;
173224 }
174225
175 -wxString TSearchWordTree::Process ( ZenoFile *index , int depth )
 226+wxArrayString TSearchWordTree::Process ( ZenoFile *index , int depth )
176227 {
177228 int a ;
 229+ wxArrayString ret ;
178230
179231 // Process children first
180232 for ( a = 0 ; a < children.GetCount() ; a++ )
181233 children[a]->Process ( index , depth+1 ) ;
182234
183 - if ( !word.IsEmpty() )
 235+ if ( type == TREE_WORD || !word.IsEmpty() ) // Search entry
184236 {
185 -
186 - }
 237+ CreateSingleWordTable ( word , index ) ;
 238+ ProcessList() ;
 239+ } else { // Group / AND / OR / NEAR / whatnot
 240+ switch ( type )
 241+ {
 242+ case TREE_NORMAL_LIST : ProcessList() ; break ;
 243+ case TREE_AND : ProcessAND() ; break ;
 244+ case TREE_OR : ProcessOR() ; break ;
 245+ default : return ret ;
 246+ }
 247+ }
 248+
 249+ // Return by root element
 250+ if ( depth > 0 ) return ret ;
 251+
 252+ wxArrayInt ids ;
 253+ for ( a = 0 ; ids.GetCount() < MAX_RETURN && a < table.GetCount() ; a++ )
 254+ {
 255+ if ( ids.IsEmpty() ) ids.Add ( table[a].article_id ) ;
 256+ else if ( ids.Last() != table[a].article_id ) ids.Add ( table[a].article_id ) ;
 257+ }
 258+
 259+ ZenoFile *main = ((MainApp*)wxTheApp)->frame->GetMainPointer() ;
 260+ for ( a = 0 ; a < ids.GetCount() ; a++ )
 261+ {
 262+ ZenoArticle art = main->ReadSingleArticle ( ids[a] ) ;
 263+ ret.Add ( art.title ) ;
 264+ }
 265+ return ret ;
187266 }
188267
 268+void TSearchWordTree::ProcessList ()
 269+{
 270+ ProcessOR () ;
 271+}
189272
190 -//________________________________________________________________________________________________________________________
 273+void TSearchWordTree::ProcessAND ()
 274+{
 275+ if ( children.GetCount() == 0 ) return ;
 276+ if ( children.GetCount() == 1 )
 277+ {
 278+ table = children[0]->table ;
 279+ children[0]->table.Clear() ;
 280+ return ;
 281+ }
 282+ int a , b , cp ;
 283+ for ( cp = 1 ; cp < children.GetCount() ; cp++ )
 284+ {
 285+ ArrayOfTSearchWordTreeTableLine *t1 , *t2 ;
 286+ t1 = &children[0]->table ;
 287+ t2 = &children[cp]->table ;
 288+ a = 0 ;
 289+ b = 0 ;
 290+ table.Clear() ;
 291+ while ( a < t1->GetCount() && b < t2->GetCount() )
 292+ {
 293+ if ( (*t1)[a].article_id < (*t2)[b].article_id ) { a++ ; continue ; }
 294+ if ( (*t1)[a].article_id > (*t2)[b].article_id ) { b++ ; continue ; }
 295+ if ( (*t1)[a] < (*t2)[b] ) { table.Add((*t1)[a]) ; a++ ; continue ; }
 296+ if ( (*t2)[b] < (*t1)[a] ) { table.Add((*t2)[b]) ; b++ ; continue ; }
 297+ table.Add((*t1)[a]) ; // Both equal ??
 298+ a++; b++;
 299+ }
 300+ t2->Clear() ;
 301+ *t1 = table ;
 302+ }
 303+ table = children[0]->table ;
 304+ children[0]->table.Clear() ;
 305+// table.Sort ( CMPFUNCtable ) ;
 306+}
191307
 308+void TSearchWordTree::ProcessOR ()
 309+{
 310+ if ( children.GetCount() == 0 ) return ; // No need to run this
 311+ int a ;
 312+ table.Clear() ;
 313+ for ( a = 0 ; a < children.GetCount() ; a++ )
 314+ {
 315+ WX_APPEND_ARRAY ( table , children[a]->table ) ;
 316+ children[a]->table.Clear() ;
 317+ }
 318+ if ( children.GetCount() > 1 ) table.Sort ( CMPFUNCtable ) ; // Otherwise, no sorting necessary
 319+}
192320
193 -wxString wxWikiServer::Search ( wxString query , wxString type )
 321+void TSearchWordTree::CreateSingleWordTable ( wxString word , ZenoFile *index )
194322 {
195 - // Break parse string indo words and ()
196 - query = query.Lower() ;
 323+ table.Clear () ;
 324+ if ( word.IsEmpty() ) return ;
 325+ int i = index->FindPageID ( _T("X/") + word ) ;
 326+ if ( i <= 0 ) return ;
 327+ ZenoArticle art = index->ReadSingleArticle ( i ) ;
 328+ if ( !art.ok ) return ;
 329+ char *data = index->GetBlob ( art.rFilePos , art.rFileLen ) ;
 330+ if ( !data ) return ;
 331+
 332+ // Table data is now in *data
 333+ unsigned long *x = (unsigned long*) data ;
 334+ unsigned long cnt = 0 ;
 335+ while ( cnt < art.rFileLen )
 336+ {
 337+ TSearchWordTreeTableLine line ;
 338+ line.article_id = *x++ ;
 339+ line.word_pos = *x++ ;
 340+ table.Add ( line ) ;
 341+ cnt += 8 ;
 342+ }
 343+
 344+ if ( GetRoot()->title_only ) FilterTitleAgainstTable ( word ) ;
 345+}
 346+
 347+void TSearchWordTree::FilterTitleAgainstTable ( wxString word )
 348+{
 349+ unsigned long number = 0 ;
 350+ bool ok = false ;
 351+ ArrayOfTSearchWordTreeTableLine table2 ;
 352+ ZenoFile *main = ((MainApp*)wxTheApp)->frame->GetMainPointer() ;
 353+ for ( int a = 0 ; a < table.GetCount() ; a++ )
 354+ {
 355+ if ( number == table[a].article_id )
 356+ {
 357+ if ( ok ) table2.Add ( table[a] ) ;
 358+ continue ;
 359+ }
 360+ // New article in list
 361+ number = table[a].article_id ;
 362+ ok = false ;
 363+ ZenoArticle art = main->ReadSingleArticle ( number ) ;
 364+ if ( !art.ok ) continue ;
 365+ wxString t = art.title ; // Needs to be qunicode-treated!!!!!!!!!!
 366+ if ( !DoesMatchTitle ( word , t ) ) continue ;
 367+ ok = true ;
 368+ table2.Add ( table[a] ) ;
 369+ }
 370+ table = table2 ;
 371+}
 372+
 373+void TSearchWordTree::Explode ( wxString query , wxArrayString &words , bool is_parameter )
 374+{
 375+ // Break parse string into words and ()
197376 wxString temp ;
198 - wxArrayString words ;
199377 while ( !query.IsEmpty() )
200378 {
201379 wxString l = query.Left ( 1 ) ;
202380 query = query.Mid ( 1 ) ;
203 - if ( l == _T(" ") || l == _T("(") || l == _T(")") )
 381+ bool doit = false ;
 382+ if ( is_parameter && ( l == _T(" ") || l == _T("(") || l == _T(")") ) ) doit = true ;
 383+// if ( !is_parameter &&
 384+ if ( doit )
204385 {
205 - if ( !temp.IsEmpty() ) words.Add ( temp ) ;
 386+ if ( !temp.IsEmpty() ) words.Add ( String2Q ( temp ) ) ;
206387 temp.Empty() ;
207 - if ( l == _T(" ") ) continue ;
208 - words.Add ( l ) ;
 388+ if ( is_parameter && l == _T(" ") ) continue ;
 389+ words.Add ( String2Q ( l ) ) ;
209390 } else temp += l ;
210391 }
211 - if ( !temp.IsEmpty() ) words.Add ( temp ) ;
212 - temp.Empty() ;
213 - // At this point, "query" is empty, and "words[]" contains words and "()"
 392+ if ( !temp.IsEmpty() ) words.Add ( String2Q ( temp ) ) ;
 393+}
 394+
 395+bool TSearchWordTree::DoesMatchTitle ( wxString word , wxString title )
 396+{
 397+ int a ;
 398+ wxString letters = _T("abcdefghijklmnopqrstuvwxyz") ;
 399+ letters += '�' ;
 400+ letters += '�' ;
 401+ title = title.Lower() ;
 402+ for ( a = 0 ; a < title.Length() ; a++ )
 403+ {
 404+ if ( -1 == letters.Find ( title[a] ) ) title[a] = ' ' ;
 405+ else title[a] = CharToQ ( title[a] ) ;
 406+ }
 407+ title = title.Lower() ;
 408+ title = _T(" ") + title + _T(" ") ;
 409+ word = _T(" ") + word + _T(" ") ;
 410+// wxMessageBox ( title , word ) ;
 411+ if ( -1 == title.Find ( word ) ) return false ;
 412+ return true ;
 413+}
 414+
 415+//________________________________________________________________________________________________________________________
 416+
 417+
 418+wxArrayString wxWikiServer::Search ( wxString query , wxString mode )
 419+{
 420+ // Fun pre-processing
 421+ query = query.Lower() ;
 422+ query.Replace ( _T("_") , _T(" ") ) ;
 423+ query.Replace ( _T("&") , _T(" and ") ) ;
 424+ query.Replace ( _T("|") , _T(" or ") ) ;
 425+ query.Replace ( _T(" und ") , _T(" and ") ) ; // German
 426+ query.Replace ( _T(" oder ") , _T(" or ") ) ; // German
 427+ query.Replace ( _T("%") , _T("{0;1}") ) ;
 428+ query.Replace ( _T("@") , _T("{1;}") ) ;
214429
 430+ wxArrayString words ;
215431 TSearchWordTree root ;
 432+ root.Explode ( query , words ) ;
 433+ root.title_only = mode == _T("titles") ;
216434 root.Parse ( words ) ;
217435 bool wildcards = root.IsUsingWildcards() ;
218436 // MISSING : Load index if wildcards==true
219437
220 - wxString ret ;
221 - ret = root.GetHTMLtree() ;
222 - ret += _T("<hr/>") ;
223 - ret += wildcards ? _T("Using wildcards<br/>") : _T("Not using wildcards<hr/>") ;
224 - ret += root.Process ( frame->GetIndexPointer() ) ;
225 - return ret ;
 438+ return root.Process ( frame->GetIndexPointer() ) ;
226439 }
Index: trunk/yawr/wxWikiServer.cpp
@@ -42,18 +42,39 @@
4343 query = uri.Unescape ( query ) ;
4444
4545 wxString mode ;
46 - if ( GetValue ( _T("e") ) != _T("") ) mode = _T("titles") ;
47 - if ( GetValue ( _T("ft") ) != _T("") ) mode = _T("fulltext") ;
48 -
49 - hr.SetRC(wxT("200 OK"));
50 - hr.AddHeader(wxT("Content-Type: text/html; charset=UTF8") );
51 - wxString html = Search ( query , mode ) ;
52 - hr.AddDataLine ( _T("SEARCH!") );
53 - hr.AddDataLine ( query + _T("<br/>") );
54 - hr.AddDataLine ( html );
 46+ bool fulltext = GetValue ( _T("ft") ) != _T("") ;
 47+ if ( !fulltext ) mode = _T("titles") ;
 48+ else mode = _T("fulltext") ;
 49+
 50+ wxArrayString titles = Search ( query , mode ) ;
 51+ wxString html = FormatList ( titles , 1 , 100 , fulltext ) ;
 52+ ReturnHTML ( _T("-/Suche") , html , hr ) ;
5553 }
5654 }
5755
 56+wxString wxWikiServer::FormatList ( const wxArrayString &titles , int from , int howmany , bool fulltext )
 57+{
 58+ wxString html ;
 59+// if ( fulltext )
 60+ {
 61+// } else {
 62+ html = _T("<table class=\"z_lemtab\">") ;
 63+ int a , b = 1 ;
 64+ for ( a = 0 ; a < howmany && titles.GetCount() > from+a ; a++ )
 65+ {
 66+ wxString nicetitle = titles[from+a].Mid(2) ;
 67+ nicetitle.Replace ( _T("_") , _T(" ") ) ;
 68+ wxURI uri ( titles[from+a] ) ;
 69+ wxString s = _T("<a href=\"/Wikipedia/") + uri.BuildURI() + _T("\">") + nicetitle + _T("</a>");
 70+ if ( a % 3 == 0 ) html += _T("<tr>") ;
 71+ html += _T("<td>") + s + _T("</td>") ;
 72+ if ( a % 3 == 2 ) html += _T("</tr>") ;
 73+ }
 74+ html += _T("</table>") ;
 75+ }
 76+ return html ;
 77+}
 78+
5879 bool busy = false ;
5980
6081 void wxWikiServer::HandleSimpleGetRequest(const wxString &page,HttpResponse &hr)
@@ -166,6 +187,15 @@
167188 ZenoArticle va_page = frame->GetPage ( orig_article , true ) ;
168189 text = va_page.GetText() ;
169190 } else text = art.GetText() ;
 191+ ReturnHTML ( orig_article , text , hr ) ;
 192+}
 193+
 194+void wxWikiServer::ReturnHTML ( wxString article , wxString text , HttpResponse &hr )
 195+{
 196+ wxString orig_article = article ;
 197+ wxString ns = article.BeforeFirst('/').Upper() ;
 198+ wxString title = article.AfterFirst ( '/' ) ;
 199+
170200 wxString nicetitle = title ;
171201 title.Replace ( _T(" ") , _T("_") ) ;
172202 nicetitle.Replace ( _T("_") , _T(" ") ) ;
Index: trunk/yawr/base.cpp
@@ -218,6 +218,7 @@
219219 wxString MainFrame::GetIP() { return _T("127.0.0.1") ; }
220220 wxString MainFrame::GetPort() { return port_line->GetValue() ; }
221221 ZenoFile *MainFrame::GetIndexPointer() { return &zf_index ; }
 222+ZenoFile *MainFrame::GetMainPointer() { return &zf_main ; }
222223
223224 ZenoArticle MainFrame::RandomArticle ( wxString begin )
224225 {
Index: trunk/yawr/ZenoFile.cpp
@@ -21,6 +21,19 @@
2222
2323 char *qunicode = NULL ;
2424
 25+wxChar CharToQ ( wxChar c )
 26+{
 27+ return qunicode[(int)c] ;
 28+}
 29+
 30+wxString String2Q ( wxString s )
 31+{
 32+ int a ;
 33+ for ( a = 0 ; a < s.Length() ; a++ ) s[a] = qunicode[(int)s[a]] ;
 34+ s = s.Lower() ;
 35+ return s ;
 36+}
 37+
2538 wxString ArrayToString ( const wxArrayInt &array )
2639 {
2740 wxString ret ;
Index: trunk/yawr/base.h
@@ -16,11 +16,13 @@
1717 public :
1818 virtual void HandleSimpleGetRequest(const wxString &page,HttpResponse &hr);
1919 virtual void ReturnHTML ( wxString article , ZenoArticle &art , HttpResponse &hr ) ;
 20+ virtual void ReturnHTML ( wxString article , wxString text , HttpResponse &hr ) ;
2021 virtual void ReturnPlainText ( wxString article , ZenoArticle &art , HttpResponse &hr ) ;
2122 virtual void ReturnCSS ( wxString article , ZenoArticle &art , HttpResponse &hr ) ;
2223 virtual void ReturnBinary ( wxString article , ZenoArticle &art , HttpResponse &hr , wxString content_type ) ;
2324 virtual void SpecialPage (const wxString &page,HttpResponse &hr);
24 - virtual wxString Search ( wxString query , wxString type ) ;
 25+ virtual wxArrayString Search ( wxString query , wxString mode ) ;
 26+ virtual wxString FormatList ( const wxArrayString &titles , int from = 1 , int howmany = 100 , bool fulltext = false ) ;
2527 MainFrame *frame ;
2628 bool va ;
2729 } ;
@@ -48,6 +50,7 @@
4951 wxString GetIP() ;
5052 wxString GetPort() ;
5153 ZenoArticle RandomArticle ( wxString begin ) ;
 54+ ZenoFile *GetMainPointer() ;
5255 ZenoFile *GetIndexPointer() ;
5356
5457 void Log ( wxString message , wxString function = _T("") ) ;
Index: trunk/yawr/ZenoFile.h
@@ -9,6 +9,9 @@
1010
1111 WX_DECLARE_OBJARRAY(ZenoArticle, ArrayOfZenoArticles);
1212
 13+wxChar CharToQ ( wxChar c ) ;
 14+wxString String2Q ( wxString s ) ;
 15+
1316 class ZenoArticle
1417 {
1518 public :
Index: trunk/yawr/wxWebServer.cpp
@@ -187,6 +187,7 @@
188188 page = page.BeforeFirst ( ' ' ) ;
189189 page = page.BeforeLast ( '?' ) ;
190190 }
 191+ page.Replace ( _T("%20") , _T("_") ) ;
191192 page = uri.Unescape ( page ) ;
192193 HandleSimpleGetRequest ( page , hr ) ;
193194 return ;

Follow-up revisions

Revision | Commit summary | Author | Date
r19801 | Add a mechanism to parserTests when run in --compare or --record mode, to giv... | nickj | 06:59, 6 February 2007