r111171 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r111170‎ | r111171 | r111172 >
Date:17:17, 10 February 2012
Author:maxsem
Status:ok
Tags:
Comment:
Started extracting wikitext-manipulating code into a separate, isolated class
Modified paths:
  • /trunk/extensions/MobileFrontend/DomManipulator.php (added) (history)
  • /trunk/extensions/MobileFrontend/MobileFrontend.body.php (modified) (history)
  • /trunk/extensions/MobileFrontend/MobileFrontend.php (modified) (history)

Diff [purge]

Index: trunk/extensions/MobileFrontend/DomManipulator.php
@@ -0,0 +1,225 @@
 2+<?php
 3+
 4+/**
 5+ * Converts HTML into a mobile-friendly version
 6+ */
 7+class DomManipulator {
 8+ /**
 9+ * @var DOMDocument
 10+ */
 11+ protected $doc;
 12+ protected $format;
 13+ protected $removeImages = false;
 14+ protected $idWhitelist = array();
 15+
 16+ private static $defaultItemsToRemove = array(
 17+ '#contentSub',
 18+ 'div.messagebox',
 19+ '#siteNotice',
 20+ '#siteSub',
 21+ '#jump-to-nav',
 22+ 'div.editsection',
 23+ 'div.infobox',
 24+ 'table.toc',
 25+ '#catlinks',
 26+ 'div.stub',
 27+ 'form',
 28+ 'div.sister-project',
 29+ 'script',
 30+ 'div.magnify',
 31+ '.editsection',
 32+ 'span.t',
 33+ 'sup[style*="help"]',
 34+ '.portal',
 35+ '#protected-icon',
 36+ '.printfooter',
 37+ '.boilerplate',
 38+ '#id-articulo-destacado',
 39+ '#coordinates',
 40+ '#top',
 41+ '.hiddenStructure',
 42+ '.noprint',
 43+ '.medialist',
 44+ '.mw-search-createlink',
 45+ '#ogg_player_1',
 46+ '.nomobile',
 47+ );
 48+
 49+ private $itemsToRemove = array();
 50+
 51+ public function __construct( $html, $format ) {
 52+ wfProfileIn( __METHOD__ );
 53+
 54+ $this->format = $format;
 55+
 56+ $html = mb_convert_encoding( $html, 'HTML-ENTITIES', "UTF-8" );
 57+ libxml_use_internal_errors( true );
 58+ $this->doc = new DOMDocument();
 59+ $this->doc->loadHTML( '<?xml encoding="UTF-8">' . $html );
 60+ libxml_use_internal_errors( false );
 61+ $this->doc->preserveWhiteSpace = false;
 62+ $this->doc->strictErrorChecking = false;
 63+ $this->doc->encoding = 'UTF-8';
 64+ }
 65+
 66+ /**
 67+ * @return DOMDocument: DOM to manipulate
 68+ */
 69+ public function getDoc() {
 70+ return $this->doc;
 71+ }
 72+
 73+ /**
 74+ * @return string: Output format
 75+ */
 76+ public function getFormat() {
 77+ return $this->format;
 78+ }
 79+
 80+ /**
 81+ * Sets whether images should be removed from output
 82+ * @param bool $flag
 83+ */
 84+ public function removeImages( $flag = true ) {
 85+ $this->removeImages = $flag;
 86+ }
 87+
 88+ /**
 89+ * Adds one or more selectors of content to remove
 90+ * @param Array|string $selectors: Selector(s) of stuff to remove
 91+ */
 92+ public function remove( $selectors ) {
 93+ $this->itemsToRemove = array_merge( $this->itemsToRemove, (array)$selectors );
 94+ }
 95+
 96+ /**
 97+ * @param Array|string $ids: Id(s) of content to keep
 98+ */
 99+ public function whitelistIds( $ids ) {
 100+ $this->idWhitelist = array_merge( $this->idWhitelist, array_flip( (array)$ids ) );
 101+ }
 102+
 103+ /**
 104+ * Removes content inappropriate for mobile devices
 105+ * @global array $wgMFRemovableClasses
 106+ * @param bool $removeDefaults: Whether to also remove the default selectors and $wgMFRemovableClasses
 107+ */
 108+ public function filterContent( $removeDefaults = true ) {
 109+ global $wgMFRemovableClasses;
 110+
 111+ wfProfileIn(__METHOD__ );
 112+ if ( $removeDefaults ) {
 113+ $this->itemsToRemove = array_merge( $this->itemsToRemove,
 114+ self::$defaultItemsToRemove, $wgMFRemovableClasses
 115+ );
 116+ }
 117+ $removals = $this->parseItemsToRemove();
 118+
 119+ // Remove tags
 120+
 121+ // You can't remove DOMNodes from a DOMNodeList as you're iterating
 122+ // over them in a foreach loop. It will seemingly leave the internal
 123+ // iterator on the foreach out of whack and results will be quite
 124+ // strange. Though, making a queue of items to remove seems to work.
 125+ // For example:
 126+
 127+ $domElemsToRemove = array();
 128+ foreach ( $removals['TAG'] as $tagToRemove ) {
 129+ $tagToRemoveNodes = $this->doc->getElementsByTagName( $tagToRemove );
 130+ foreach ( $tagToRemoveNodes as $tagToRemoveNode ) {
 131+ $tagToRemoveNodeIdAttributeValue = '';
 132+ if ( $tagToRemoveNode ) {
 133+ $tagToRemoveNodeIdAttribute = $tagToRemoveNode->getAttributeNode( 'id' );
 134+ if ( $tagToRemoveNodeIdAttribute ) {
 135+ $tagToRemoveNodeIdAttributeValue = $tagToRemoveNodeIdAttribute->value;
 136+ }
 137+ if ( !isset( $this->idWhitelist[$tagToRemoveNodeIdAttributeValue] ) ) {
 138+ $domElemsToRemove[] = $tagToRemoveNode;
 139+ }
 140+ }
 141+ }
 142+ }
 143+
 144+ foreach ( $domElemsToRemove as $domElement ) {
 145+ $domElement->parentNode->removeChild( $domElement );
 146+ }
 147+
 148+ // Elements with named IDs
 149+ foreach ( $removals['ID'] as $itemToRemove ) {
 150+ $itemToRemoveNode = $this->doc->getElementById( $itemToRemove );
 151+ if ( $itemToRemoveNode ) {
 152+ $itemToRemoveNode->parentNode->removeChild( $itemToRemoveNode );
 153+ }
 154+ }
 155+
 156+ // CSS Classes
 157+ $xpath = new DOMXpath( $this->doc );
 158+ foreach ( $removals['CLASS'] as $classToRemove ) {
 159+ $elements = $xpath->query( '//*[@class="' . $classToRemove . '"]' );
 160+
 161+ foreach ( $elements as $element ) {
 162+ $element->parentNode->removeChild( $element );
 163+ }
 164+ }
 165+
 166+ // Tags with CSS Classes
 167+ foreach ( $removals['TAG_CLASS'] as $classToRemove ) {
 168+ $parts = explode( '.', $classToRemove );
 169+
 170+ $elements = $xpath->query(
 171+ '//' . $parts[0] . '[@class="' . $parts[1] . '"]'
 172+ );
 173+
 174+ foreach ( $elements as $element ) {
 175+ $removedElement = $element->parentNode->removeChild( $element );
 176+ }
 177+ }
 178+
 179+ // Handle red links with action equal to edit
 180+ $redLinks = $xpath->query( '//a[@class="new"]' );
 181+ foreach ( $redLinks as $redLink ) {
 182+ // PHP Bug #36795 — Inappropriate "unterminated entity reference"
 183+ $spanNode = $this->doc->createElement( "span", str_replace( "&", "&amp;", $redLink->nodeValue ) );
 184+
 185+ if ( $redLink->hasAttributes() ) {
 186+ $attributes = $redLink->attributes;
 187+ foreach ( $attributes as $i => $attribute ) {
 188+ if ( $attribute->name != 'href' ) {
 189+ $spanNode->setAttribute( $attribute->name, $attribute->value );
 190+ }
 191+ }
 192+ }
 193+
 194+ $redLink->parentNode->replaceChild( $spanNode, $redLink );
 195+ }
 196+ wfProfileOut( __METHOD__ );
 197+ }
 198+
 199+ /**
 200+ * @return array
 201+ */
 202+ private function parseItemsToRemove() {
 203+ wfProfileIn( __METHOD__ );
 204+ $removals = array();
 205+
 206+ foreach ( $this->itemsToRemove as $itemToRemove ) {
 207+ $type = '';
 208+ $rawName = '';
 209+ CssDetection::detectIdCssOrTag( $itemToRemove, $type, $rawName );
 210+ $removals[$type][] = $rawName;
 211+ }
 212+
 213+ if ( $this->removeImages ) {
 214+ $removals['TAG'][] = "img";
 215+ $removals['TAG'][] = "audio";
 216+ $removals['TAG'][] = "video";
 217+ $removals['CLASS'][] = "thumb tright";
 218+ $removals['CLASS'][] = "thumb tleft";
 219+ $removals['CLASS'][] = "thumbcaption";
 220+ $removals['CLASS'][] = "gallery";
 221+ }
 222+
 223+ wfProfileOut( __METHOD__ );
 224+ return $removals;
 225+ }
 226+}
Property changes on: trunk/extensions/MobileFrontend/DomManipulator.php
___________________________________________________________________
Added: svn:eol-style
1227 + native
Index: trunk/extensions/MobileFrontend/MobileFrontend.body.php
@@ -3,10 +3,6 @@
44 class ExtMobileFrontend {
55 const VERSION = '0.6.1';
66
7 - /**
8 - * @var DOMDocument
9 - */
10 - private $doc;
117 public $contentFormat = '';
128 public $WMLSectionSeparator = '***************************************************************************';
139
@@ -99,39 +95,6 @@
10096 'mobile-frontend-sopa-notice',
10197 );
10298
103 - public $itemsToRemove = array(
104 - '#contentSub',
105 - 'div.messagebox',
106 - '#siteNotice',
107 - '#siteSub',
108 - '#jump-to-nav',
109 - 'div.editsection',
110 - 'div.infobox',
111 - 'table.toc',
112 - '#catlinks',
113 - 'div.stub',
114 - 'form',
115 - 'div.sister-project',
116 - 'script',
117 - 'div.magnify',
118 - '.editsection',
119 - 'span.t',
120 - 'sup[style*="help"]',
121 - '.portal',
122 - '#protected-icon',
123 - '.printfooter',
124 - '.boilerplate',
125 - '#id-articulo-destacado',
126 - '#coordinates',
127 - '#top',
128 - '.hiddenStructure',
129 - '.noprint',
130 - '.medialist',
131 - '.mw-search-createlink',
132 - '#ogg_player_1',
133 - '.nomobile',
134 - );
135 -
13699 /**
137100 * Work out the site and language name from a database name
138101 * @param $site string
@@ -1106,25 +1069,6 @@
11071070 }
11081071
11091072 /**
1110 - * @return array
1111 - */
1112 - private function parseItemsToRemove() {
1113 - global $wgMFRemovableClasses;
1114 - wfProfileIn( __METHOD__ );
1115 - $itemToRemoveRecords = array();
1116 -
1117 - foreach ( array_merge( $this->itemsToRemove, $wgMFRemovableClasses ) as $itemToRemove ) {
1118 - $type = '';
1119 - $rawName = '';
1120 - CssDetection::detectIdCssOrTag( $itemToRemove, $type, $rawName );
1121 - $itemToRemoveRecords[$type][] = $rawName;
1122 - }
1123 -
1124 - wfProfileOut( __METHOD__ );
1125 - return $itemToRemoveRecords;
1126 - }
1127 -
1128 - /**
11291073 * @param DOMDocument $mainPage
11301074 * @return string
11311075 */
@@ -1270,38 +1214,31 @@
12711215 public function DOMParse( $html ) {
12721216 global $wgScript;
12731217 wfProfileIn( __METHOD__ );
1274 - $html = mb_convert_encoding( $html, 'HTML-ENTITIES', "UTF-8" );
1275 - libxml_use_internal_errors( true );
1276 - $this->doc = new DOMDocument();
1277 - $this->doc->loadHTML( '<?xml encoding="UTF-8">' . $html );
1278 - libxml_use_internal_errors( false );
1279 - $this->doc->preserveWhiteSpace = false;
1280 - $this->doc->strictErrorChecking = false;
1281 - $this->doc->encoding = 'UTF-8';
12821218
1283 - $itemToRemoveRecords = $this->parseItemsToRemove();
 1219+ $manipulator = new DomManipulator( $html, self::$format );
 1220+ $doc = $manipulator->getDoc();
12841221
1285 - $zeroRatedBannerElement = $this->doc->getElementById( 'zero-rated-banner' );
 1222+ $zeroRatedBannerElement = $doc->getElementById( 'zero-rated-banner' );
12861223
12871224 if ( !$zeroRatedBannerElement ) {
1288 - $zeroRatedBannerElement = $this->doc->getElementById( 'zero-rated-banner-red' );
 1225+ $zeroRatedBannerElement = $doc->getElementById( 'zero-rated-banner-red' );
12891226 }
12901227
12911228 if ( $zeroRatedBannerElement ) {
1292 - self::$zeroRatedBanner = $this->doc->saveXML( $zeroRatedBannerElement, LIBXML_NOEMPTYTAG );
 1229+ self::$zeroRatedBanner = $doc->saveXML( $zeroRatedBannerElement, LIBXML_NOEMPTYTAG );
12931230 }
12941231
12951232 if ( self::$isBetaGroupMember ) {
1296 - $ptLogout = $this->doc->getElementById( 'pt-logout' );
 1233+ $ptLogout = $doc->getElementById( 'pt-logout' );
12971234
12981235 if ( $ptLogout ) {
12991236 $ptLogoutLink = $ptLogout->firstChild;
1300 - self::$logoutHtml = $this->doc->saveXML( $ptLogoutLink, LIBXML_NOEMPTYTAG );
 1237+ self::$logoutHtml = $doc->saveXML( $ptLogoutLink, LIBXML_NOEMPTYTAG );
13011238 }
1302 - $ptAnonLogin = $this->doc->getElementById( 'pt-anonlogin' );
 1239+ $ptAnonLogin = $doc->getElementById( 'pt-anonlogin' );
13031240
13041241 if ( !$ptAnonLogin ) {
1305 - $ptAnonLogin = $this->doc->getElementById( 'pt-login' );
 1242+ $ptAnonLogin = $doc->getElementById( 'pt-login' );
13061243 }
13071244
13081245 if ( $ptAnonLogin ) {
@@ -1320,124 +1257,38 @@
13211258 $ptAnonLoginLinkText->nodeValue = self::$messages['mobile-frontend-login'];
13221259 }
13231260 }
1324 - self::$loginHtml = $this->doc->saveXML( $ptAnonLoginLink, LIBXML_NOEMPTYTAG );
 1261+ self::$loginHtml = $doc->saveXML( $ptAnonLoginLink, LIBXML_NOEMPTYTAG );
13251262 }
13261263 }
13271264
13281265 if ( self::$title->isSpecial( 'Userlogin' ) && self::$isBetaGroupMember ) {
1329 - $userlogin = $this->doc->getElementById( 'userloginForm' );
 1266+ $userlogin = $doc->getElementById( 'userloginForm' );
13301267
13311268 if ( $userlogin && get_class( $userlogin ) === 'DOMElement' ) {
1332 - $firstHeading = $this->doc->getElementById( 'firstHeading' );
 1269+ $firstHeading = $doc->getElementById( 'firstHeading' );
13331270 if ( $firstHeading ) {
13341271 $firstHeading->nodeValue = '';
13351272 }
13361273 }
13371274 }
13381275
1339 - // Tags
 1276+ $manipulator->removeImages( self::$disableImages == 1 );
 1277+ $manipulator->whitelistIds( 'zero-language-search' );
 1278+ $manipulator->filterContent();
13401279
1341 - // You can't remove DOMNodes from a DOMNodeList as you're iterating
1342 - // over them in a foreach loop. It will seemingly leave the internal
1343 - // iterator on the foreach out of wack and results will be quite
1344 - // strange. Though, making a queue of items to remove seems to work.
1345 - // For example:
1346 -
1347 - if ( self::$disableImages == 1 ) {
1348 - $itemToRemoveRecords['TAG'][] = "img";
1349 - $itemToRemoveRecords['TAG'][] = "audio";
1350 - $itemToRemoveRecords['TAG'][] = "video";
1351 - $itemToRemoveRecords['CLASS'][] = "thumb tright";
1352 - $itemToRemoveRecords['CLASS'][] = "thumb tleft";
1353 - $itemToRemoveRecords['CLASS'][] = "thumbcaption";
1354 - $itemToRemoveRecords['CLASS'][] = "gallery";
1355 - }
1356 -
1357 - $tagToRemoveNodeIdAttributeValues = array( 'zero-language-search' );
1358 -
1359 - $domElemsToRemove = array();
1360 - foreach ( $itemToRemoveRecords['TAG'] as $tagToRemove ) {
1361 - $tagToRemoveNodes = $this->doc->getElementsByTagName( $tagToRemove );
1362 - foreach ( $tagToRemoveNodes as $tagToRemoveNode ) {
1363 - $tagToRemoveNodeIdAttributeValue = '';
1364 - if ( $tagToRemoveNode ) {
1365 - $tagToRemoveNodeIdAttribute = $tagToRemoveNode->getAttributeNode( 'id' );
1366 - if ( $tagToRemoveNodeIdAttribute ) {
1367 - $tagToRemoveNodeIdAttributeValue = $tagToRemoveNodeIdAttribute->value;
1368 - }
1369 - if ( !in_array( $tagToRemoveNodeIdAttributeValue, $tagToRemoveNodeIdAttributeValues ) ) {
1370 - $domElemsToRemove[] = $tagToRemoveNode;
1371 - }
1372 - }
1373 - }
1374 - }
1375 -
1376 - foreach ( $domElemsToRemove as $domElement ) {
1377 - $domElement->parentNode->removeChild( $domElement );
1378 - }
1379 -
1380 - // Elements with named IDs
1381 - foreach ( $itemToRemoveRecords['ID'] as $itemToRemove ) {
1382 - $itemToRemoveNode = $this->doc->getElementById( $itemToRemove );
1383 - if ( $itemToRemoveNode ) {
1384 - $itemToRemoveNode->parentNode->removeChild( $itemToRemoveNode );
1385 - }
1386 - }
1387 -
1388 - // CSS Classes
1389 - $xpath = new DOMXpath( $this->doc );
1390 - foreach ( $itemToRemoveRecords['CLASS'] as $classToRemove ) {
1391 - $elements = $xpath->query( '//*[@class="' . $classToRemove . '"]' );
1392 -
1393 - foreach ( $elements as $element ) {
1394 - $element->parentNode->removeChild( $element );
1395 - }
1396 - }
1397 -
1398 - // Tags with CSS Classes
1399 - foreach ( $itemToRemoveRecords['TAG_CLASS'] as $classToRemove ) {
1400 - $parts = explode( '.', $classToRemove );
1401 -
1402 - $elements = $xpath->query(
1403 - '//' . $parts[0] . '[@class="' . $parts[1] . '"]'
1404 - );
1405 -
1406 - foreach ( $elements as $element ) {
1407 - $removedElement = $element->parentNode->removeChild( $element );
1408 - }
1409 - }
1410 -
1411 - // Handle red links with action equal to edit
1412 - $redLinks = $xpath->query( '//a[@class="new"]' );
1413 - foreach ( $redLinks as $redLink ) {
1414 - // PHP Bug #36795 — Inappropriate "unterminated entity reference"
1415 - $spanNode = $this->doc->createElement( "span", str_replace( "&", "&amp;", $redLink->nodeValue ) );
1416 -
1417 - if ( $redLink->hasAttributes() ) {
1418 - $attributes = $redLink->attributes;
1419 - foreach ( $attributes as $i => $attribute ) {
1420 - if ( $attribute->name != 'href' ) {
1421 - $spanNode->setAttribute( $attribute->name, $attribute->value );
1422 - }
1423 - }
1424 - }
1425 -
1426 - $redLink->parentNode->replaceChild( $spanNode, $redLink );
1427 - }
1428 -
14291280 if ( self::$title->isSpecial( 'Userlogin' ) && self::$isBetaGroupMember ) {
14301281 if ( $userlogin && get_class( $userlogin ) === 'DOMElement' ) {
14311282 $login = $this->renderLogin();
1432 - $loginNode = $this->doc->importNode( $login, true );
 1283+ $loginNode = $doc->importNode( $login, true );
14331284 $userlogin->appendChild( $loginNode );
14341285 }
14351286 }
14361287
14371288 if ( self::$isMainPage ) {
1438 - $contentHtml = $this->DOMParseMainPage( $this->doc );
 1289+ $contentHtml = $this->DOMParseMainPage( $doc );
14391290 } else {
1440 - $content = $this->doc->getElementById( 'content' );
1441 - $contentHtml = $this->doc->saveXML( $content, LIBXML_NOEMPTYTAG );
 1291+ $content = $doc->getElementById( 'content' );
 1292+ $contentHtml = $doc->saveXML( $content, LIBXML_NOEMPTYTAG );
14421293 }
14431294
14441295 $title = htmlspecialchars( self::$title->getText() );
Index: trunk/extensions/MobileFrontend/MobileFrontend.php
@@ -45,6 +45,7 @@
4646 'ExtMobileFrontend' => 'MobileFrontend.body',
4747 'DeviceDetection' => 'DeviceDetection',
4848 'CssDetection' => 'CssDetection',
 49+ 'DomManipulator' => 'DomManipulator',
4950
5051 'MobileFrontendTemplate' => 'templates/MobileFrontendTemplate',
5152 'ApplicationTemplate' => 'templates/ApplicationTemplate',

Status & tagging log