r12482 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r12481‎ | r12482 | r12483 >
Date:17:37, 7 January 2006
Author:vibber
Status:old
Tags:
Comment:
In-progress dump plugin to generate abstracts XML for Yahoo!'s active abstracts thingy
Need to make some more adjustments in response to updated docs from them
Modified paths:
  • /trunk/extensions/ActiveAbstract (added) (history)
  • /trunk/extensions/ActiveAbstract/AbstractFilter.php (added) (history)

Diff [purge]

Index: trunk/extensions/ActiveAbstract/AbstractFilter.php
@@ -0,0 +1,171 @@
 2+<?php
 3+
 4+/**
 5+ * Generate XML feed for Yahoo's Active Abstracts project
 6+ * Plugin for dumpBackup.php; call as e.g.:
 7+ *
 8+ * php dumpBackup.php \
 9+ * --plugin=AbstractFilter:extensions/ActiveAbstract/AbstractFilter.php \
 10+ * --current \
 11+ * --output=gzip:/dumps/abstract.xml.gz \
 12+ * --filter=namespace:NS_MAIN \
 13+ * --filter=noredirect \
 14+ * --filter=abstract
 15+ */
 16+
 17+require_once 'includes/EditPage.php'; // hack; for section anchor code
 18+
 19+/**
 20+ * Tosses away the MediaWiki XML and generates new output
 21+ */
 22+class AbstractFilter {
 23+ /**
 24+ * Register the filter function with the dump manager
 25+ * @param BackupDumper $dumper
 26+ * @static
 27+ */
 28+ function register( &$dumper ) {
 29+ $dumper->registerFilter( 'abstract', 'AbstractFilter' );
 30+ $dumper->registerFilter( 'noredirect', 'NoredirectFilter' );
 31+ }
 32+
 33+ function AbstractFilter( &$sink ) {
 34+ $this->sink =& $sink;
 35+ }
 36+
 37+ function writeOpenStream( $string ) {
 38+ $this->sink->writeOpenStream( "<feed>\n" );
 39+ }
 40+
 41+ function writeCloseStream( $string ) {
 42+ $this->sink->writeCloseStream( "</feed>\n" );
 43+ }
 44+
 45+ function writeOpenPage( $page, $string ) {
 46+ global $wgSitename;
 47+ $this->title = Title::makeTitle( $page->page_namespace, $page->page_title );
 48+
 49+ $xml = "<doc>\n";
 50+ $xml .= wfElement( 'url', null, $this->title->getFullUrl() ) . "\n";
 51+ $xml .= wfElement( 'title', null, $wgSitename . ': ' . $this->title->getPrefixedText() ) . "\n";
 52+
 53+ // add abstract and links when we have revision data...
 54+ $this->revision = null;
 55+
 56+ $this->sink->writeOpenPage( $page, $xml );
 57+ }
 58+
 59+ function writeClosePage( $string ) {
 60+ $xml = '';
 61+ if( $this->revision ) {
 62+ $xml .= wfElement( 'abstract', null, $this->_abstract( $this->revision ) ) . "\n";
 63+ $xml .= "<links>\n";
 64+ foreach( $this->_links( $this->revision ) as $url ) {
 65+ $xml .= wfElement( 'link', null, $url ) . "\n";
 66+ }
 67+ $xml .= "</links>\n";
 68+ }
 69+ $xml .= "</doc>\n";
 70+ $this->sink->writeClosePage( $xml );
 71+ $this->title = null;
 72+ $this->revision = null;
 73+ }
 74+
 75+ function writeRevision( $rev, $string ) {
 76+ // Only use one revision's worth of data to output
 77+ $this->revision = $rev;
 78+ }
 79+
 80+ /**
 81+ * Extract an abstract from the page
 82+ * @params object $rev Database rows with revision data
 83+ * @return string
 84+ * @access private
 85+ */
 86+ function _abstract( $rev ) {
 87+ $text = Revision::getRevisionText( $rev ); // FIXME cache this
 88+
 89+ $stripped = $this->_stripMarkup( $text );
 90+ $extract = $this->_extractStart( $stripped );
 91+
 92+ return substr( $extract, 0, 1024 ); // not too long pls
 93+ }
 94+
 95+ /**
 96+ * Strip markup to show plaintext
 97+ * @param string $text
 98+ * @return string
 99+ * @access private
 100+ */
 101+ function _stripMarkup( $text ) {
 102+ $text = str_replace( "'''", "", $text );
 103+ $text = str_replace( "''", "", $text );
 104+ $text = preg_replace( '#<!--.*?-->#s', '', $text ); // HTML-style comments
 105+ $text = preg_replace( '#</?[a-z0-9]+.*?>#s', '', $text ); // HTML-style tags
 106+ $text = preg_replace( '#\\[[a-z]+:.*? (.*?)\\]#s', '$1', $text ); // URL links
 107+ $text = preg_replace( '#\\{\\{\\{.*?\\}\\}\\}#s', '', $text ); // template parameters
 108+ $text = preg_replace( '#\\{\\{.*?\\}\\}#s', '', $text ); // template calls
 109+ $text = preg_replace( '#\\[\\[([^|\\]]*\\|)?(.*?)\\]\\]#s', '$2', $text ); // links
 110+ $text = Sanitizer::decodeCharReferences( $text );
 111+ return trim( $text );
 112+ }
 113+
 114+ /**
 115+ * Extract the first two sentences, if detectable, from the text.
 116+ * @param string $text
 117+ * @return string
 118+ * @access private
 119+ */
 120+ function _extractStart( $text ) {
 121+ $endchars = array(
 122+ '.', '!', '?', // regular ASCII
 123+ '。', // full-width ideographic full-stop
 124+ '.', '!', '?', // double-width roman forms
 125+ '。', // half-width ideographic full stop
 126+ );
 127+
 128+ $endgroup = implode( '', array_map( 'preg_quote', $endchars ) );
 129+ $end = "[$endgroup]";
 130+ $sentence = ".*?$end+";
 131+ $firsttwo = "/^($sentence$sentence)/";
 132+
 133+ if( preg_match( $firsttwo, $text, $matches ) ) {
 134+ return $matches[1];
 135+ } else {
 136+ return $text;
 137+ }
 138+ }
 139+
 140+ /**
 141+ * Extract a list of TOC links
 142+ * @params object $rev Database rows with revision data
 143+ * @return array of URL strings
 144+ * @access private
 145+ * @fixme extract TOC items
 146+ */
 147+ function _links( $rev ) {
 148+ $text = Revision::getRevisionText( $rev );
 149+ $secs =
 150+ preg_split(
 151+ '/(^=+.+?=+|^<h[1-6].*?' . '>.*?<\/h[1-6].*?' . '>)(?!\S)/mi',
 152+ $text, -1,
 153+ PREG_SPLIT_DELIM_CAPTURE );
 154+
 155+ $headers = array();
 156+ for( $i = 1; $i < count( $secs ); $i += 2 ) {
 157+ $header = preg_replace( '/^=+\s*(.*?)\s*=+/', '$1', $secs[$i] );
 158+ $anchor = EditPage::sectionAnchor( $header );
 159+ $url = $this->title->getFullUrl() . $anchor;
 160+ $headers[] = $url;
 161+ }
 162+ return $headers;
 163+ }
 164+}
 165+
 166+class NoredirectFilter extends DumpFilter {
 167+ function pass( $page, $string ) {
 168+ return !$page->page_is_redirect;
 169+ }
 170+}
 171+
 172+?>
\ No newline at end of file
Property changes on: trunk/extensions/ActiveAbstract/AbstractFilter.php
___________________________________________________________________
Added: svn:keywords
1173 + Author Date Id Revision
Added: svn:eol-style
2174 + native

Status & tagging log