r64088 MediaWiki - Code Review archive

Repository:MediaWiki
Revision: < r64087 | r64088 | r64089 >
Date:19:50, 23 March 2010
Author:mah
Status:ok
Tags:
Comment:
* Implement normalization of fullwidth latin characters for all Languages, not just Japanese and Chinese.
* Tune Language::convertDoubleWidth() so that it is 8-10x faster. (See http://xrl.us/bg2mon)
Modified paths:
  • /trunk/phase3/languages/Language.php (modified) (history)
  • /trunk/phase3/languages/classes/LanguageJa.php (modified) (history)
  • /trunk/phase3/languages/classes/LanguageZh_hans.php (modified) (history)
  • /trunk/phase3/maintenance/tests/SearchDbTest.php (modified) (history)
  • /trunk/phase3/maintenance/tests/SearchEngineTest.php (modified) (history)

Diff [purge]

Index: trunk/phase3/maintenance/tests/SearchEngineTest.php
@@ -6,41 +6,43 @@
77 * @group Stub
88 */
99 class SearchEngineTest extends MediaWiki_Setup {
10 - var $db, $search;
11 - private $count = 0;
 10+ var $db, $search, $pageList;
1211
13 - function insertSearchData() {
14 - $this->insertPage("Main_Page", "This is a main page", 0);
15 - $this->insertPage('Main_Page', 'This is a talk page to the main page, see [[smithee]]', 1);
16 - $this->insertPage('Smithee', 'A smithee is one who smiths. See also [[Alan Smithee]]', 0);
17 - $this->insertPage('Smithee', 'This article sucks.', 1);
18 - $this->insertPage('Unrelated_page', 'Nothing in this page is about the S word.', 0);
19 - $this->insertPage('Another_page', 'This page also is unrelated.', 0);
20 - $this->insertPage('Help', 'Help me!', 4);
21 - $this->insertPage('Thppt', 'Blah blah', 0);
22 - $this->insertPage('Alan_Smithee', 'yum', 0);
23 - $this->insertPage('Pages', 'are food', 0);
24 - $this->insertPage('DblPageOne', 'ABCDEF', 0);
25 - $this->insertPage('DblPageTwo', 'ABCDE', 0);
26 - $this->insertPage('DblPageTwoLow', 'abcde', 0);
 12+ function pageExists( $title ) {
 13+ return false;
2714 }
2815
29 - function normalize( $text ) {
30 - return strtolower(preg_replace("/[^[:alnum:] ]/", " ", $text));
 16+ function insertSearchData() {
 17+ if( $this->pageExists( 'Not_Main_Page' ) ) {
 18+ return;
 19+ }
 20+ $this->insertPage("Not_Main_Page", "This is not a main page", 0);
 21+ $this->insertPage('Talk:Not_Main_Page', 'This is not a talk page to the main page, see [[smithee]]', 1);
 22+ $this->insertPage('Smithee', 'A smithee is one who smiths. See also [[Alan Smithee]]', 0);
 23+ $this->insertPage('Talk:Smithee', 'This article sucks.', 1);
 24+ $this->insertPage('Unrelated_page', 'Nothing in this page is about the S word.', 0);
 25+ $this->insertPage('Another_page', 'This page also is unrelated.', 0);
 26+ $this->insertPage('Help:Help', 'Help me!', 4);
 27+ $this->insertPage('Thppt', 'Blah blah', 0);
 28+ $this->insertPage('Alan_Smithee', 'yum', 0);
 29+ $this->insertPage('Pages', 'are\'food', 0);
 30+ $this->insertPage('HalfOneUp', 'AZ', 0);
 31+ $this->insertPage('FullOneUp', 'ＡＺ', 0);
 32+ $this->insertPage('HalfTwoLow', 'az', 0);
 33+ $this->insertPage('FullTwoLow', 'ａｚ', 0);
 34+ $this->insertPage('HalfNumbers', '1234567890', 0);
 35+ $this->insertPage('FullNumbers', '１２３４５６７８９０', 0);
 36+ $this->insertPage('DomainName', 'example.com', 0);
3137 }
3238
33 - function insertPage( $pageName, $text, $ns ) {
34 - $this->count++;
35 - $this->db->safeQuery( 'INSERT INTO ! (page_id,page_namespace,page_title,page_latest) VALUES (?,?,?,?)',
36 - $this->db->tableName( 'page' ), $this->count, $ns, $pageName, $this->count );
37 - $this->db->safeQuery( 'INSERT INTO ! (rev_id,rev_page) VALUES (?, ?)',
38 - $this->db->tableName( 'revision' ), $this->count, $this->count );
39 - $this->db->safeQuery( 'INSERT INTO ! (old_id,old_text) VALUES (?, ?)',
40 - $this->db->tableName( 'text' ), $this->count, $text );
41 - $this->db->safeQuery( 'INSERT INTO ! (si_page,si_title,si_text) VALUES (?, ?, ?)',
42 - $this->db->tableName( 'searchindex' ), $this->count,
43 - $this->normalize( $pageName ), $this->normalize( $text ) );
44 - }
 39+ function removeSearchData() {
 40+ return;
 41+ while( count($this->pageList) ) {
 42+ list( $title, $id ) = array_pop( $this->pageList );
 43+ $article = new Article( $title, $id );
 44+ $article->doDeleteArticle("Search Test");
 45+ }
 46+ }
4547
4648 function fetchIds( $results ) {
4749 $matches = array();
@@ -55,34 +57,98 @@
5658 return $matches;
5759 }
5860
59 - function testTextSearch() {
60 - if( is_null( $this->db ) ) {
61 - $this->markTestIncomplete( "Can't find a database to test with." );
62 - }
63 - $this->assertEquals(
64 - array( 'Smithee' ),
65 - $this->fetchIds( $this->search->searchText( 'smithee' ) ),
66 - "Plain search failed" );
 61+ // Modified version of WikiRevision::importOldRevision()
 62+ function insertPage( $pageName, $text, $ns ) {
 63+ $dbw = $this->db;
 64+ $title = Title::newFromText( $pageName );
 65+
 66+ $userId = 0;
 67+ $userText = 'WikiSysop';
 68+ $comment = 'Search Test';
 69+
 70+ // avoid memory leak...?
 71+ $linkCache = LinkCache::singleton();
 72+ $linkCache->clear();
 73+
 74+ $article = new Article( $title );
 75+ $pageId = $article->getId();
 76+ $created = false;
 77+ if( $pageId == 0 ) {
 78+ # must create the page...
 79+ $pageId = $article->insertOn( $dbw );
 80+ $created = true;
 81+ }
 82+
 83+ # FIXME: Use original rev_id optionally (better for backups)
 84+ # Insert the row
 85+ $revision = new Revision( array(
 86+ 'page' => $pageId,
 87+ 'text' => $text,
 88+ 'comment' => $comment,
 89+ 'user' => $userId,
 90+ 'user_text' => $userText,
 91+ 'timestamp' => 0,
 92+ 'minor_edit' => false,
 93+ ) );
 94+ $revId = $revision->insertOn( $dbw );
 95+ $changed = $article->updateIfNewerOn( $dbw, $revision );
 96+
 97+ $GLOBALS['wgTitle'] = $title;
 98+ if( $created ) {
 99+ Article::onArticleCreate( $title );
 100+ $article->createUpdates( $revision );
 101+ } elseif( $changed ) {
 102+ Article::onArticleEdit( $title );
 103+ $article->editUpdates(
 104+ $text, $comment, false, 0, $revId );
 105+ }
 106+
 107+ $su = new SearchUpdate($article->getId(), $pageName, $text);
 108+ $su->doUpdate();
 109+
 110+ $this->pageList[] = array( $title, $article->getId() );
 111+
 112+ return true;
 113+ }
 114+
 115+ function testFullWidth() {
 116+ $this->assertEquals(
 117+ array( 'FullOneUp', 'FullTwoLow', 'HalfOneUp', 'HalfTwoLow' ),
 118+ $this->fetchIds( $this->search->searchText( 'AZ' ) ),
 119+ "Search for normalized from Half-width Upper" );
 120+ $this->assertEquals(
 121+ array( 'FullOneUp', 'FullTwoLow', 'HalfOneUp', 'HalfTwoLow' ),
 122+ $this->fetchIds( $this->search->searchText( 'az' ) ),
 123+ "Search for normalized from Half-width Lower" );
 124+ $this->assertEquals(
 125+ array( 'FullOneUp', 'FullTwoLow', 'HalfOneUp', 'HalfTwoLow' ),
 126+ $this->fetchIds( $this->search->searchText( 'ＡＺ' ) ),
 127+ "Search for normalized from Full-width Upper" );
 128+ $this->assertEquals(
 129+ array( 'FullOneUp', 'FullTwoLow', 'HalfOneUp', 'HalfTwoLow' ),
 130+ $this->fetchIds( $this->search->searchText( 'ａｚ' ) ),
 131+ "Search for normalized from Full-width Lower" );
67132 }
68133
 134+ function testTextSearch() {
 135+ $this->assertEquals(
 136+ array( 'Smithee' ),
 137+ $this->fetchIds( $this->search->searchText( 'smithee' ) ),
 138+ "Plain search failed" );
 139+ }
 140+
69141 function testTextPowerSearch() {
70 - if( is_null( $this->db ) ) {
71 - $this->markTestIncomplete( "Can't find a database to test with." );
72 - }
73142 $this->search->setNamespaces( array( 0, 1, 4 ) );
74143 $this->assertEquals(
75144 array(
76145 'Smithee',
77 - 'Talk:Main Page',
 146+ 'Talk:Not Main Page',
78147 ),
79148 $this->fetchIds( $this->search->searchText( 'smithee' ) ),
80149 "Power search failed" );
81150 }
82151
83152 function testTitleSearch() {
84 - if( is_null( $this->db ) ) {
85 - $this->markTestIncomplete( "Can't find a database to test with." );
86 - }
87153 $this->assertEquals(
88154 array(
89155 'Alan Smithee',
@@ -93,9 +159,6 @@
94160 }
95161
96162 function testTextTitlePowerSearch() {
97 - if( is_null( $this->db ) ) {
98 - $this->markTestIncomplete( "Can't find a database to test with." );
99 - }
100163 $this->search->setNamespaces( array( 0, 1, 4 ) );
101164 $this->assertEquals(
102165 array(
@@ -108,6 +171,3 @@
109172 }
110173
111174 }
112 -
113 -
114 -
Index: trunk/phase3/maintenance/tests/SearchDbTest.php
@@ -6,23 +6,22 @@
77
88 function setUp() {
99 global $wgDBprefix, $wgDBtype;
 10+ $this->db = wfGetDB( DB_MASTER );
 11+ if( !$this->db ) {
 12+ $this->markTestIncomplete( "Can't find a database to test with." );
 13+ }
1014
11 - if($wgDBprefix === "parsertest_" ||
12 - ($wgDBtype === 'oracle' && $wgDBprefix === 'pt_')) {
13 - $this->markTestSkipped("This test can't (yet?) be run with the parser tests");
14 - }
15 -
1615 $GLOBALS['wgContLang'] = new Language;
17 - $this->db = $this->buildTestDatabase(
18 - array( 'page', 'revision', 'text', 'searchindex', 'user' ) );
19 - if( $this->db ) {
20 - $this->insertSearchData();
21 - }
22 - $searchType = preg_replace("/Database/", "Search", get_class($this->db));
 16+ $this->insertSearchData();
 17+
 18+ $this->insertSearchData();
 19+ $searchType = preg_replace("/Database/", "Search",
 20+ get_class($this->db));
2321 $this->search = new $searchType( $this->db );
2422 }
2523
2624 function tearDown() {
 25+ $this->removeSearchData();
2726 if( !is_null( $this->db ) ) {
2827 wfGetLB()->closeConnecton( $this->db );
2928 }
Index: trunk/phase3/languages/Language.php
@@ -1707,7 +1707,7 @@
17081708 * @return String
17091709 */
17101710 function normalizeForSearch( $string ) {
1711 - return $string;
 1711+ return self::convertDoubleWidth($string);
17121712 }
17131713
17141714 /**
@@ -1715,8 +1715,17 @@
17161716 * range: ff00-ff5f ~= 0020-007f
17171717 */
17181718 protected static function convertDoubleWidth( $string ) {
1719 - $string = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $string );
1720 - $string = preg_replace( '/\xef\xbd([\x80-\x9a])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $string );
 1719+ static $full = null;
 1720+ static $half = null;
 1721+
 1722+ if( $full === null ) {
 1723+ $fullWidth = "０１２３４５６７８９ＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚ";
 1724+ $halfWidth = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
 1725+ $full = str_split( $fullWidth, 3 );
 1726+ $half = str_split( $halfWidth );
 1727+ }
 1728+
 1729+ $string = str_replace( $full, $half, $string );
17211730 return $string;
17221731 }
17231732
Index: trunk/phase3/languages/classes/LanguageZh_hans.php
@@ -23,10 +23,9 @@
2424 wfProfileIn( __METHOD__ );
2525
2626 // Double-width roman characters
27 - $s = self::convertDoubleWidth( $string );
 27+ $s = parent::normalizeForSearch( $s );
2828 $s = trim( $s );
2929 $s = self::segmentByWord( $s );
30 - $s = parent::normalizeForSearch( $s );
3130
3231 wfProfileOut( __METHOD__ );
3332 return $s;
Index: trunk/phase3/languages/classes/LanguageJa.php
@@ -23,14 +23,6 @@
2424 return $s;
2525 }
2626
27 - function normalizeForSearch( $string ) {
28 - // Double-width roman characters
29 - $s = self::convertDoubleWidth( $string );
30 -
31 - # Do general case folding and UTF-8 armoring
32 - return parent::normalizeForSearch( $s );
33 - }
34 -
3527 # Italic is not appropriate for Japanese script
3628 # Unfortunately most browsers do not recognise this, and render <em> as italic
3729 function emphasize( $text ) {

Follow-up revisions

Revision | Commit summary | Author | Date
r76240 | Test for Language::convertDoubleWidth (followup r64088 and r63776) | hashar | 11:30, 7 November 2010

Status & tagging log