r22538 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r22537‎ | r22538 | r22539 >
Date:17:53, 29 May 2007
Author:rainman
Status:old
Tags:
Comment:
More work on lsearch metadataPrefix:
* added a list of redirects to a page
* more accurate page reference calculation

Using a temporary DTD for now; at some point it should be merged with
yurik's schema changes.
Modified paths:
  • /trunk/extensions/OAI/OAIRepo_body.php (modified) (history)
  • /trunk/extensions/OAI/lsearch-0.1.xsd (added) (history)

Diff [purge]

Index: trunk/extensions/OAI/OAIRepo_body.php
@@ -459,22 +459,17 @@
460460 }
461461 }
462462
463 - $writer = new OAIDumpWriter();
464 -
465463 # Fetch one extra row to check if we need a resumptionToken
466 - $resultSet = $this->fetchRows( $from, $until, $this->chunkSize() + 1, $resume, $metadataPrefix );
 464+ $resultSet = $this->fetchRows( $from, $until, $this->chunkSize() + 1, $resume );
467465 $count = min( $resultSet->numRows(), $this->chunkSize() );
468466 if( $count ) {
469467 echo "<$verb>\n";
 468+ // buffer everything up
 469+ $rows = array();
470470 $this->_lastSequence = null;
471471 for( $i = 0; $i < $count; $i++ ) {
472472 $row = $resultSet->fetchObject();
473 - $item = new WikiOAIRecord( $row, $writer );
474 - if( $withData ) {
475 - echo $item->renderRecord( $metadataPrefix, $this->timeGranularity() );
476 - } else {
477 - echo $item->renderHeader( $this->timeGranularity() );
478 - }
 473+ $rows[] = $row;
479474 $this->_lastSequence = $row->up_sequence;
480475 }
481476 if( $row = $resultSet->fetchObject() ) {
@@ -483,21 +478,34 @@
484479 $token = "$metadataPrefix:$row->up_sequence:$limit";
485480 else
486481 $token = "$metadataPrefix:$row->up_sequence";
 482+ }
 483+ $resultSet->free();
 484+ // init writer
 485+ $writer = $this->makeWriter($metadataPrefix,$rows);
 486+ // render
 487+ foreach( $rows as $row ) {
 488+ $item = new WikiOAIRecord( $row, $writer );
 489+ if( $withData ) {
 490+ echo $item->renderRecord( $metadataPrefix, $this->timeGranularity() );
 491+ } else {
 492+ echo $item->renderHeader( $this->timeGranularity() );
 493+ }
 494+ }
 495+ if( isset($token) ) {
487496 echo oaiTag( 'resumptionToken', array(), $token ) . "\n";
488497 }
489498 echo "</$verb>\n";
490499 } else {
491500 $this->addError( 'noRecordsMatch', 'No records available match the request.' );
492501 }
493 - $resultSet->free();
494502 }
495503
496504 function getRecord() {
497 - $writer = new OAIDumpWriter();
498505 $metadataPrefix = $this->validateMetadata( 'metadataPrefix' );
499506 if( !$this->errorCondition() ) {
500 - $row = $this->getRecordItem( $this->_request['identifier'], $metadataPrefix );
 507+ $row = $this->getRecordItem( $this->_request['identifier']);
501508 if( !$this->errorCondition() ) {
 509+ $writer = $this->makeWriter($metadataPrefix,array($row));
502510 $item = new WikiOAIRecord( $row, $writer );
503511 echo "<GetRecord>\n";
504512 echo $item->renderRecord( $metadataPrefix, $this->timeGranularity() );
@@ -506,10 +514,10 @@
507515 }
508516 }
509517
510 - function getRecordItem( $identifier, $metadataPrefix ) {
 518+ function getRecordItem( $identifier) {
511519 $pageid = $this->stripIdentifier( $identifier );
512520 if( $pageid ) {
513 - $resultSet = $this->fetchRecord( $pageid, $metadataPrefix );
 521+ $resultSet = $this->fetchRecord( $pageid);
514522 $row = $resultSet->fetchObject();
515523 $resultSet->free();
516524 if( $row ) {
@@ -556,16 +564,25 @@
557565 wfDebugDieBacktrace( 'Bogus result.' );
558566 }
559567 }
 568+
 569+ function makeWriter($metadataPrefix, $rows) {
 570+ if($metadataPrefix == 'lsearch'){
 571+ $res = $this->fetchReferenceData($rows);
 572+ $writer = new OAILSearchWriter($res);
 573+ $res->free();
 574+ return $writer;
 575+ } else
 576+ return new OAIDumpWriter;
 577+ }
560578
561579 function newSchema() {
562580 global $wgVersion;
563581 return version_compare( $wgVersion, '1.5alpha', 'ge' );
564582 }
565583
566 - function fetchRecord( $pageid, $type ) {
567 - extract( $this->_db->tableNames( 'updates', 'cur', 'page', 'revision', 'text', 'pagelinks' ) );
568 - if( $type == 'lsearch' ){
569 - $sql = "SELECT up_page,page_id,up_timestamp,up_action,up_sequence,
 584+ function fetchRecord( $pageid ) {
 585+ extract( $this->_db->tableNames( 'updates', 'page', 'revision', 'text' ) );
 586+ $sql = "SELECT up_page,page_id,up_timestamp,up_action,up_sequence,
570587 page_namespace,
571588 page_title,
572589 old_text,
@@ -577,29 +594,6 @@
578595 rev_user_text,
579596 rev_timestamp,
580597 page_restrictions,
581 - rev_minor_edit,
582 - COUNT(pl_from) as num_page_ref
583 - FROM $updates
584 - LEFT JOIN $page ON page_id=up_page
585 - LEFT JOIN $revision ON page_latest=rev_id
586 - LEFT JOIN $text ON rev_text_id=old_id
587 - LEFT JOIN $pagelinks ON page_namespace=pl_namespace AND page_title=pl_title
588 - WHERE up_page=" . IntVal( $pageid ) . "
589 - GROUP BY up_page LIMIT 1";
590 - } else{
591 - if( $this->newSchema() ) {
592 - $sql = "SELECT up_page,page_id,up_timestamp,up_action,up_sequence,
593 - page_namespace,
594 - page_title,
595 - old_text,
596 - old_flags,
597 - rev_id,
598 - rev_deleted,
599 - rev_comment,
600 - rev_user,
601 - rev_user_text,
602 - rev_timestamp,
603 - page_restrictions,
604598 rev_minor_edit
605599 FROM $updates,$page,$revision,$text
606600 WHERE up_page=" . IntVal( $pageid ) . '
@@ -607,34 +601,15 @@
608602 AND page_latest=rev_id
609603 AND rev_text_id=old_id
610604 LIMIT 1';
611 - } else { // FIXME: this will work only with dublin core?
612 - $sql = "SELECT page_id,up_timestamp,up_action,up_sequence,
613 - cur_namespace AS namespace,
614 - cur_title AS title,
615 - cur_text AS text,
616 - '' AS flags,
617 - cur_comment AS comment,
618 - cur_user AS user,
619 - cur_user_text AS user_text,
620 - cur_timestamp AS timestamp,
621 - cur_restrictions AS restrictions,
622 - cur_minor_edit AS minor_edit
623 - FROM $updates LEFT JOIN $cur ON cur_id=up_page
624 - WHERE up_page=" . IntVal( $pageid ) .
625 - ' LIMIT 1';
626 - }
627 - }
628605
629606 return $this->_db->resultObject( $this->_db->query( $sql ) );
630607 }
631608
632 - function fetchRows( $from, $until, $chunk, $token = null, $type ) {
633 - extract( $this->_db->tableNames( 'updates', 'cur', 'page', 'revision', 'text', 'pagelinks' ) );
 609+ function fetchRows( $from, $until, $chunk, $token = null ) {
 610+ extract( $this->_db->tableNames( 'updates', 'page', 'revision', 'text' ) );
634611 $chunk = IntVal( $chunk );
635612
636 - // lucene-search output: joins pagelinks table to get page ranks
637 - if( $type == "lsearch" ){
638 - $sql = "SELECT up_page,page_id,up_timestamp,up_action,up_sequence,
 613+ $sql = "SELECT up_page,page_id,up_timestamp,up_action,up_sequence,
639614 page_namespace,
640615 page_title,
641616 old_text,
@@ -646,47 +621,12 @@
647622 rev_user_text,
648623 rev_timestamp,
649624 page_restrictions,
650 - rev_minor_edit,
651 - COUNT(pl_from) as num_page_ref
652 - FROM $updates
653 - LEFT JOIN $page ON page_id=up_page
654 - LEFT JOIN $revision ON page_latest=rev_id
655 - LEFT JOIN $text ON rev_text_id=old_id
656 - LEFT JOIN $pagelinks ON page_namespace=pl_namespace AND page_title=pl_title";
657 - } else{
658 - if( $this->newSchema() ) {
659 - $sql = "SELECT up_page,page_id,up_timestamp,up_action,up_sequence,
660 - page_namespace,
661 - page_title,
662 - old_text,
663 - old_flags,
664 - rev_id,
665 - rev_deleted,
666 - rev_comment,
667 - rev_user,
668 - rev_user_text,
669 - rev_timestamp,
670 - page_restrictions,
671625 rev_minor_edit
672626 FROM $updates
673627 LEFT JOIN $page ON page_id=up_page
674628 LEFT JOIN $revision ON page_latest=rev_id
675629 LEFT JOIN $text ON rev_text_id=old_id ";
676 - } else { // FIXME: this will only work with dublin core?
677 - $sql = "SELECT page_id,up_timestamp,up_action,up_sequence,
678 - cur_namespace AS namespace,
679 - cur_title AS title,
680 - cur_text AS text,
681 - '' AS flags,
682 - cur_comment AS comment,
683 - cur_user AS user,
684 - cur_user_text AS user_text,
685 - cur_timestamp AS timestamp,
686 - cur_restrictions AS restrictions,
687 - cur_minor_edit AS minor_edit
688 - FROM $updates LEFT JOIN $cur ON cur_id=up_page ";
689 - }
690 - }
 630+
691631 $where = array();
692632 if( $token ) {
693633 $where[] = 'up_sequence >= ' . IntVal( $token );
@@ -703,12 +643,40 @@
704644 if( !empty( $where ) ) {
705645 $sql .= ' WHERE ' . implode( ' AND ', $where );
706646 }
707 - if($type == 'lsearch')
708 - $sql .= " GROUP BY up_page";
709647 $sql .= " ORDER BY $order LIMIT $chunk";
710648
711649 return $this->_db->resultObject( $this->_db->query( $sql ) );
712650 }
 651+
 652+ function fetchReferenceData( $rows ) {
 653+ $page_ids = array();
 654+ foreach($rows as $row){
 655+ $page_ids[] = $row->up_page;
 656+ }
 657+
 658+ if(count($page_ids) == 1)
 659+ $pages_where = " AND up_page = $page_ids[0] ";
 660+ else
 661+ $pages_where = " AND up_page IN (".implode(",",$page_ids).") ";
 662+
 663+ extract( $this->_db->tableNames( 'updates', 'page', 'revision', 'text', 'pagelinks' ) );
 664+ $sql = "SELECT up_page,up_sequence,
 665+ r.page_namespace AS page_namespace,
 666+ r.page_title AS page_title,
 667+ COUNT(pl.pl_from) AS num_page_ref
 668+ FROM updates
 669+ LEFT JOIN page AS p ON p.page_id=up_page
 670+ LEFT JOIN pagelinks AS pl ON p.page_namespace=pl.pl_namespace AND p.page_title=pl.pl_title
 671+ LEFT JOIN page AS ns ON pl.pl_from=ns.page_id
 672+ LEFT JOIN page AS r ON pl.pl_from=r.page_id AND r.page_is_redirect=1
 673+ LEFT JOIN pagelinks AS rpl ON r.page_namespace=rpl.pl_namespace AND r.page_title=rpl.pl_title
 674+ WHERE ns.page_namespace = p.page_namespace
 675+ $pages_where
 676+ GROUP BY up_page,r.page_id";
 677+
 678+ return $this->_db->resultObject( $this->_db->query( $sql ) );
 679+ }
 680+
713681
714682 function identifyInfo() {
715683 global $wgSitename;
@@ -737,8 +705,8 @@
738706 'namespace' => 'http://www.mediawiki.org/xml/export-0.3/',
739707 'schema' => 'http://www.mediawiki.org/xml/export-0.3.xsd' ) ,
740708 'lsearch' => array(
741 - 'namespace' => 'http://www.mediawiki.org/xml/export-0.3/',
742 - 'schema' => 'http://www.mediawiki.org/xml/export-0.3.xsd' ) );
 709+ 'namespace' => 'http://www.mediawiki.org/xml/lsearch-0.1/',
 710+ 'schema' => 'http://www.mediawiki.org/xml/lsearch-0.1.xsd' ) );
743711 }
744712
745713 }
@@ -841,10 +809,12 @@
842810 case 'oai_dc':
843811 $data = $this->renderDublinCore();
844812 break;
845 - case 'lsearch':
846813 case 'mediawiki':
847814 $data = $this->renderMediaWiki();
848815 break;
 816+ case 'lsearch':
 817+ $data = $this->renderLSearch();
 818+ break;
849819 default:
850820 wfDebugDieBacktrace( 'Unsupported metadata format.' );
851821 }
@@ -887,8 +857,24 @@
888858 $out .= $this->_writer->closePage().$this->_writer->closeStream();
889859
890860 return $out;
891 - }
 861+ }
892862
 863+ function renderLSearch() {
 864+ $title = Title::makeTitle( $this->_row->page_namespace, $this->_row->page_title );
 865+
 866+ $out = $this->_writer->openStream().$this->_writer->openPage($this->_row).
 867+ $this->_writer->writeRedirects($this->_row).
 868+ $this->_writer->writeRevision($this->_row);
 869+
 870+ if( $title->getNamespace() == NS_IMAGE ) {
 871+ $out .= $this->renderUpload();
 872+ }
 873+
 874+ $out .= $this->_writer->closePage().$this->_writer->closeStream();
 875+
 876+ return $out;
 877+ }
 878+
893879 function renderUpload() {
894880 $fname = 'WikiOAIRecord::renderUpload';
895881 $db =& wfGetDB( DB_SLAVE );
@@ -957,7 +943,69 @@
958944 } else
959945 return "";
960946 }
 947+}
961948
 949+/**
 950+ * Extends the MW import/export format with the lsearch syntax,
 951+ * i.e. schema lsearch-0.1
 952+ */
 953+class OAILSearchWriter extends OAIDumpWriter {
 954+
 955+ function __construct($resultSet){
 956+ parent::__construct();
 957+ $this->_redirects = array();
 958+ $this->_references = array();
 959+ for($i = 0 ; $i < $resultSet->numRows(); $i++){
 960+ $row = $resultSet->fetchObject();
 961+ if(isset($row->page_title))
 962+ $this->_redirects[$row->up_page][] = $row;
 963+ else
 964+ $this->_references[$row->up_page] = $row;
 965+
 966+ }
 967+ }
 968+
 969+ function openStream() {
 970+ global $wgContLanguageCode;
 971+ $ver = "0.1";
 972+ return wfElement( 'mediawiki', array(
 973+ 'xmlns' => "http://www.mediawiki.org/xml/lsearch-$ver/",
 974+ 'xmlns:xsi' => "http://www.w3.org/2001/XMLSchema-instance",
 975+ 'xsi:schemaLocation' => "http://www.mediawiki.org/xml/lsearch-$ver/ " .
 976+ "http://www.mediawiki.org/xml/lsearch-$ver.xsd",
 977+ 'version' => $ver,
 978+ 'xml:lang' => $wgContLanguageCode ),
 979+ null ) .
 980+ "\n" .
 981+ $this->siteInfo();
 982+ }
 983+
 984+ function openPage( $row ) {
 985+ $out = parent::openPage( $row );
 986+ if(isset($this->_references[$row->up_page]) && isset($this->_references[$row->up_page]->num_page_ref))
 987+ $page_ref = $this->_references[$row->up_page]->num_page_ref;
 988+ else
 989+ $page_ref = 0;
 990+ $out .= ' ' . wfElement( 'references', array(), strval( $page_ref ) ) . "\n";
 991+ return $out;
 992+ }
 993+
 994+ function writeRedirects($row){
 995+ $out = '';
 996+ if(isset($this->_redirects[$row->up_page])){
 997+ foreach($this->_redirects[$row->up_page] as $row){
 998+ $title = Title::makeTitle( $row->page_namespace, $row->page_title );
 999+ $out .= " <redirect>\n";
 1000+ $out .= ' ' . wfElementClean( 'title', array(), $title->getPrefixedText() ) . "\n";
 1001+ if(isset($row->num_page_ref))
 1002+ $out .= ' ' . wfElement( 'references', array(), strval( $row->num_page_ref ) ) . "\n";
 1003+ $out .= " </redirect>\n";
 1004+ }
 1005+ }
 1006+ return $out;
 1007+ }
 1008+
9621009 }
9631010
 1011+
9641012 ?>
Index: trunk/extensions/OAI/lsearch-0.1.xsd
@@ -0,0 +1,171 @@
 2+<?xml version="1.0" encoding="UTF-8" ?>
 3+<!--
 4+ This is an XML Schema description of the format
 5+ used by MediaWiki's Lucene-Search extension.
 6+
 7+ Version 0.1 is based on MediaWiki import/export format 0.3
 8+ (i.e. export-0.3.xsd), with the addition of the references
 9+ property and the list of redirects in page.
 10+
 11+ The canonical URL to the schema document is:
 12+ http://www.mediawiki.org/xml/lsearch-0.1.xsd
 13+
 14+ Use the namespace:
 15+ http://www.mediawiki.org/xml/lsearch-0.1/
 16+-->
 17+<schema xmlns="http://www.w3.org/2001/XMLSchema"
 18+ xmlns:mw="http://www.mediawiki.org/xml/lsearch-0.1/"
 19+ targetNamespace="http://www.mediawiki.org/xml/lsearch-0.1/"
 20+ elementFormDefault="qualified">
 21+
 22+ <annotation>
 23+ <documentation xml:lang="en">
 24+ MediaWiki's page export format
 25+ </documentation>
 26+ </annotation>
 27+
 28+ <!-- Need this to reference xml:lang -->
 29+ <import namespace="http://www.w3.org/XML/1998/namespace"
 30+ schemaLocation="http://www.w3.org/2001/xml.xsd"/>
 31+
 32+ <!-- Our root element -->
 33+ <element name="mediawiki" type="mw:MediaWikiType"/>
 34+
 35+ <complexType name="MediaWikiType">
 36+ <sequence>
 37+ <element name="siteinfo" type="mw:SiteInfoType"
 38+ minOccurs="0" maxOccurs="1"/>
 39+ <element name="page" type="mw:PageType"
 40+ minOccurs="0" maxOccurs="unbounded"/>
 41+ </sequence>
 42+ <attribute name="version" type="string" use="required"/>
 43+ <attribute ref="xml:lang" use="required"/>
 44+ </complexType>
 45+
 46+ <complexType name="SiteInfoType">
 47+ <sequence>
 48+ <element name="sitename" type="string" minOccurs="0" />
 49+ <element name="base" type="anyURI" minOccurs="0" />
 50+ <element name="generator" type="string" minOccurs="0" />
 51+ <element name="case" type="mw:CaseType" minOccurs="0" />
 52+ <element name="namespaces" type="mw:NamespacesType" minOccurs="0" />
 53+ </sequence>
 54+ </complexType>
 55+
 56+ <simpleType name="CaseType">
 57+ <restriction base="NMTOKEN">
 58+ <!-- Cannot have two titles differing only by case of first letter. -->
 59+ <!-- Default behavior through 1.5, $wgCapitalLinks = true -->
 60+ <enumeration value="first-letter" />
 61+
 62+ <!-- Complete title is case-sensitive -->
 63+ <!-- Behavior when $wgCapitalLinks = false -->
 64+ <enumeration value="case-sensitive" />
 65+
 66+ <!-- Cannot have two titles differing only by case. -->
 67+ <!-- Not yet implemented as of MediaWiki 1.5 -->
 68+ <enumeration value="case-insensitive" />
 69+ </restriction>
 70+ </simpleType>
 71+
 72+ <complexType name="NamespacesType">
 73+ <sequence>
 74+ <element name="namespace" type="mw:NamespaceType"
 75+ minOccurs="0" maxOccurs="unbounded" />
 76+ </sequence>
 77+ </complexType>
 78+
 79+ <complexType name="NamespaceType">
 80+ <simpleContent>
 81+ <extension base="string">
 82+ <attribute name="key" type="integer" />
 83+ </extension>
 84+ </simpleContent>
 85+ </complexType>
 86+
 87+ <complexType name="RedirectType">
 88+ <sequence>
 89+ <!-- Title in text form. (Using spaces, not underscores; with namespace ) -->
 90+ <element name="title" type="string"/>
 91+
 92+ <!-- optional page ID number -->
 93+ <element name="id" type="positiveInteger" minOccurs="0"/>
 94+
 95+ <!-- optional: number of pages that link to this page -->
 96+ <element name="references" type="positiveInteger" minOccurs="0"/>
 97+ </sequence>
 98+ </complexType>
 99+
 100+ <complexType name="PageType">
 101+ <sequence>
 102+ <!-- Title in text form. (Using spaces, not underscores; with namespace ) -->
 103+ <element name="title" type="string"/>
 104+
 105+ <!-- optional page ID number -->
 106+ <element name="id" type="positiveInteger" minOccurs="0"/>
 107+
 108+ <!-- optional: number of pages that link to this page (without redirects) -->
 109+ <element name="references" type="positiveInteger" minOccurs="0"/>
 110+
 111+ <!-- 0 or more redirects to this page -->
 112+ <element name="redirect" type="mw:RedirectType" minOccurs="0" maxOccurs="unbounded"/>
 113+
 114+ <!-- comma-separated list of string tokens, if present -->
 115+ <element name="restrictions" type="string" minOccurs="0"/>
 116+
 117+ <!-- Zero or more sets of revision or upload data -->
 118+ <choice minOccurs="0" maxOccurs="unbounded">
 119+ <element name="revision" type="mw:RevisionType" />
 120+ <element name="upload" type="mw:UploadType" />
 121+ </choice>
 122+ </sequence>
 123+ </complexType>
 124+
 125+ <complexType name="RevisionType">
 126+ <sequence>
 127+ <element name="id" type="positiveInteger" minOccurs="0"/>
 128+ <element name="timestamp" type="dateTime"/>
 129+ <element name="contributor" type="mw:ContributorType"/>
 130+ <element name="minor" minOccurs="0" />
 131+ <element name="comment" type="string" minOccurs="0"/>
 132+ <element name="text" type="mw:TextType" />
 133+ </sequence>
 134+ </complexType>
 135+
 136+ <complexType name="TextType">
 137+ <simpleContent>
 138+ <extension base="string">
 139+ <attribute ref="xml:space" use="optional" default="preserve" />
 140+ </extension>
 141+ </simpleContent>
 142+ </complexType>
 143+
 144+ <complexType name="ContributorType">
 145+ <sequence>
 146+ <element name="username" type="string" minOccurs="0"/>
 147+ <element name="id" type="positiveInteger" minOccurs="0" />
 148+
 149+ <element name="ip" type="string" minOccurs="0"/>
 150+ </sequence>
 151+ </complexType>
 152+
 153+ <complexType name="UploadType">
 154+ <sequence>
 155+ <!-- Revision-style data... -->
 156+ <element name="timestamp" type="dateTime"/>
 157+ <element name="contributor" type="mw:ContributorType"/>
 158+ <element name="comment" type="string" minOccurs="0"/>
 159+
 160+ <!-- Filename. (Using underscores, not spaces. No 'Image:' namespace marker.) -->
 161+ <element name="filename" type="string"/>
 162+
 163+ <!-- URI at which this resource can be obtained -->
 164+ <element name="src" type="anyURI"/>
 165+
 166+ <element name="size" type="positiveInteger" />
 167+
 168+ <!-- TODO: add other metadata fields -->
 169+ </sequence>
 170+ </complexType>
 171+
 172+</schema>