r69397 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r69396‎ | r69397 | r69398 >
Date:17:54, 15 July 2010
Author:nimishg
Status:ok
Tags:
Comment:
Parser to take XML page histories and put them into a CSV with information such as reversions and net size change
Modified paths:
  • /trunk/tools/analysis/StreamingXMLHistory.php (added) (history)

Diff [purge]

Index: trunk/tools/analysis/StreamingXMLHistory.php
@@ -0,0 +1,187 @@
 2+#!/usr/bin/php -q
 3+<?php
// Set an explicit default timezone to keep PHP 5.3 happy; strtotime() is
// used later when revision timestamps are converted.
date_default_timezone_set("UTC");

// The script needs exactly two arguments besides its own name:
// the XML history to read and the CSV file to write.
if (count($argv) == 3) {
    $parser = new StreamingXMLHistoryParser($argv[1], $argv[2]);
    $parser->run();
} else {
    print("\n\tUsage: $argv[0] <inputfile> <outputfile>\n");
    exit(-1);
}
 14+
 15+
/**
 * Acceptance flag for a revision that introduced new text (as opposed to a
 * Revert, which restores the text of an earlier revision).
 */
class Edit
{
    /** true when the edit currently counts as accepted, false when rejected. */
    public $isAccepted;

    /**
     * @param mixed $isAccepted Initial acceptance flag; stored exactly as given.
     */
    public function __construct($isAccepted)
    {
        $this->isAccepted = $isAccepted;
    }

    /**
     * Mark this edit as accepted.
     */
    public function accept()
    {
        $this->isAccepted = true;
    }

    /**
     * Mark this edit as rejected.
     */
    public function reject()
    {
        $this->isAccepted = false;
    }
}
 32+
 33+
/**
 * A revision whose text equals that of an earlier revision.
 *
 * The object keeps a reference to the shared list of revision-type objects so
 * that its own status can be pushed onto every entry lying strictly between
 * the revision it restores ($revertToIndex) and itself ($selfIndex).
 */
class Revert
{
    /** Index in $revTypes of the earlier revision whose text is restored. */
    public $revertToIndex;

    /** Index in $revTypes of this revert itself. */
    public $selfIndex;

    /** true = accepted, false = rejected */
    public $isAccepted;

    /** Reference to the caller's list of revision-type objects. */
    public $revTypes;

    /**
     * @param int   $selfIndex     Position of this revert in $revTypes.
     * @param array $revTypes      List of revision-type objects, taken by reference.
     * @param mixed $isAccepted    Initial status of this revert.
     * @param int   $revertToIndex Position of the revision being restored.
     */
    public function __construct($selfIndex, &$revTypes, $isAccepted, $revertToIndex)
    {
        $this->revertToIndex = $revertToIndex;
        $this->isAccepted = $isAccepted;
        $this->selfIndex = $selfIndex;
        // Bound by reference, so entries the caller appends later are visible here.
        $this->revTypes = &$revTypes;
    }

    /**
     * Store a new status and immediately propagate it to the covered entries.
     *
     * @param mixed $status New value for $isAccepted.
     */
    public function setStatus($status)
    {
        $this->isAccepted = $status;
        $this->updateHistory();
    }

    /**
     * Give every entry between the restored revision and this revert
     * (both exclusive) the opposite of this revert's status.
     *
     * Entries are visited from newest to oldest. A nested Revert receives the
     * inverted status through setStatus(), which makes it propagate in turn;
     * any other entry simply has its $isAccepted property overwritten.
     */
    public function updateHistory()
    {
        $position = $this->selfIndex - 1;
        while ($position > $this->revertToIndex) {
            $entry = $this->revTypes[$position];
            if (get_class($entry) === "Revert") {
                $entry->setStatus(!$this->isAccepted);
            } else {
                // an accepted revert rejects what lies in between, and vice versa
                $entry->isAccepted = !$this->isAccepted;
            }
            $position--;
        }
    }
}
 64+
 65+
/**
 * Streams an XML page history and writes one CSV row per <revision> element.
 *
 * Two files are produced:
 *  - the output file given to the constructor, holding per-revision data
 *    (see writeCSVHeader() for the columns);
 *  - "<output file>.REVSTATUS", holding one accepted/rejected status per
 *    revision, written once the whole input has been read.
 *
 * A revision counts as a revert when the md5 of its text equals the md5 of an
 * earlier revision's text; otherwise it is a new edit.
 */
class StreamingXMLHistoryParser
{
    public $inputFileName;
    public $outputFileName;

    // open handle on the main CSV output file
    public $outputFile;

    // md5 hashes of the revision texts, in the order the revisions were read
    public $md5History;

    // revision types: one Edit or Revert object per revision, in the same order
    public $revTypes;

    // size of previous revision
    public $oldSize;

    /**
     * @param string $inputFN  Path of the XML history to read.
     * @param string $outputFN Path of the CSV file to create (truncated if present).
     * @throws RuntimeException When the output file cannot be opened.
     */
    public function __construct($inputFN, $outputFN)
    {
        $this->inputFileName = $inputFN;
        $this->outputFileName = $outputFN;
        $this->outputFile = fopen($this->outputFileName, "w+");
        if ($this->outputFile === false) {
            // Fail here rather than on the first fputcsv() against an invalid handle.
            throw new RuntimeException("Unable to open output file: " . $this->outputFileName);
        }
        $this->md5History = array();
        $this->revTypes = array();
        $this->oldSize = 0;
    }

    /**
     * Write "<output file>.REVSTATUS": a "status" header followed by one line
     * per revision.
     *
     * Each line is "accepted" or "rejected". For a Revert it is prefixed with
     * "status-change-" when the revert targets the immediately preceding
     * revision, and with "Revert-" otherwise.
     *
     * @throws RuntimeException When the status file cannot be opened.
     */
    public function writeRevisionStatus()
    {
        $statusFileName = $this->outputFileName . ".REVSTATUS";
        $csvOutput = fopen($statusFileName, "w+");
        if ($csvOutput === false) {
            throw new RuntimeException("Unable to open status file: " . $statusFileName);
        }
        fputcsv($csvOutput, array("status"));

        foreach ($this->revTypes as $revType) {
            $csvLine = "";
            if ($revType instanceof Revert) {
                if (($revType->selfIndex - $revType->revertToIndex) == 1) {
                    $csvLine .= "status-change-";
                } else {
                    $csvLine .= "Revert-";
                }
            }
            $csvLine .= ($revType->isAccepted) ? "accepted" : "rejected";
            fputcsv($csvOutput, array($csvLine));
        }

        fclose($csvOutput);
    }

    /**
     * Write the column names of the main CSV file.
     *
     * Note that the "Contributor ID" column actually receives the contributor's
     * username, or the IP address when no username is present (see parseRev()).
     */
    public function writeCSVHeader()
    {
        $csvData = array(
            "Rev ID",
            "UNIX Timestamp",
            "Contributor ID",
            "Comment",
            "Revision MD5",
            "new?",
            "edit size",
            "net size change"
        );
        fputcsv($this->outputFile, $csvData);
    }

    /**
     * Read the input file, emit one CSV row per <revision> element and
     * finally write the revision-status file.
     *
     * @throws RuntimeException When the input file cannot be opened.
     */
    public function run()
    {
        $reader = new XMLReader();
        if (!$reader->open($this->inputFileName)) {
            throw new RuntimeException("Unable to open input file: " . $this->inputFileName);
        }
        $this->writeCSVHeader();

        // read each revision
        while ($reader->read()) {
            if ($reader->nodeType == XMLReader::ELEMENT
                && $reader->localName == "revision") {
                $this->parseRev($reader->readOuterXML());
            }
        }
        $reader->close();

        // Make sure every buffered row has reached the file before returning.
        fflush($this->outputFile);
        $this->writeRevisionStatus();
    }

    /**
     * Process one revision: classify it as new edit or revert, record it in
     * $md5History / $revTypes and append its row to the main CSV file.
     *
     * @param string $xmlTEXT Serialized XML of a single <revision> element.
     */
    public function parseRev($xmlTEXT)
    {
        $revision = new SimpleXMLElement($xmlTEXT);
        $text = (string) $revision->text;
        $textSize = strlen($text); // size in bytes
        $md5 = md5($text);

        // Strict comparison is required: with loose comparison two different
        // digests that both look like numbers (e.g. "0e4620..." and
        // "0e8304...") are compared numerically, both evaluate to 0, and a
        // new edit would be mistaken for a revert.
        $revertIndex = array_search($md5, $this->md5History, true);

        if ($revertIndex === false) {
            $isNew = 'yes';
            $this->revTypes[] = new Edit(true);
        } else {
            $isNew = "no";
            // The revert's own index is the slot it is about to occupy.
            $revert = new Revert(count($this->revTypes), $this->revTypes, true, $revertIndex);
            $this->revTypes[] = $revert;
            $revert->updateHistory();
        }
        $this->md5History[] = $md5;

        $contributor = isset($revision->contributor->username)
            ? $revision->contributor->username
            : $revision->contributor->ip;

        // "[" and "]" act as the regex delimiters here, so the pattern itself
        // is \n|\r: every line break in the comment becomes a space.
        $comment = isset($revision->comment)
            ? preg_replace("[\n|\r]", " ", (string) $revision->comment)
            : "";

        $csvData = array(
            (string) $revision->id,
            strtotime((string) $revision->timestamp),
            (string) $contributor,
            $comment,
            $md5,
            $isNew,
            $textSize,
            $textSize - $this->oldSize
        );
        $this->oldSize = $textSize;
        fputcsv($this->outputFile, $csvData);
    }
}
 186+
 187+
 188+
Property changes on: trunk/tools/analysis/StreamingXMLHistory.php
___________________________________________________________________
Added: svn:eol-style
   + native
Added: svn:executable
   + *

Status & tagging log