r13898 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r13897 | r13898 | r13899 >
Date: 16:35, 28 April 2006
Author: hashar
Status: old
Tags:
Comment:
use unix end of lines
Modified paths:
  • /trunk/wiki2xml/php/CREDITS (modified) (history)
  • /trunk/wiki2xml/php/browse_texts.php (modified) (history)
  • /trunk/wiki2xml/php/extension.php (modified) (history)
  • /trunk/wiki2xml/php/xmldump2files.php (modified) (history)

Diff [purge]

Index: trunk/wiki2xml/php/xmldump2files.php
@@ -1,104 +1,104 @@
2 -<?php
3 -
4 -# Change there to your local settings
5 -$dumpfile = "K:\\dewiki-20060327-pages-articles.xml" ;
6 -$basedir = "C:" ;
7 -
8 -#______________________________________________________________________________
9 -# GLOBAL VARIABLES
10 -$dir = "" ;
11 -$namespaces = array () ;
12 -$mem = array () ;
13 -$tags = array () ;
14 -$page_counter = 0 ;
15 -
16 -# FUNCTIONS
17 -
18 -require_once ( "global_functions.php" ) ;
19 -
20 -function store_file ( &$loc , &$text , $mode = "text" ) {
21 - if ( $mode == "text" ) {
22 - if ( !$handle = fopen($loc->fullname.".txt", 'wb') ) {
23 - print "Failed to open {$loc->file}.txt!<br/>" ;
24 - flush () ;
25 - }
26 - fwrite($handle, $text) ;
27 - fclose ( $handle ) ;
28 - } else if ( $mode == "gzip" ) {
29 - if ( !$gz = gzopen($loc->fullname.".gz",'w9') ) {
30 - print "Failed to open {$loc->file}.gz!<br/>" ;
31 - flush () ;
32 - }
33 - gzwrite($gz, $text);
34 - gzclose($gz);
35 - }
36 -}
37 -
38 -function microtime_float()
39 -{
40 - list($usec, $sec) = explode(" ", microtime());
41 - return ((float)$usec + (float)$sec);
42 -}
43 -
 2+<?php
 3+
 4+# Change there to your local settings
 5+$dumpfile = "K:\\dewiki-20060327-pages-articles.xml" ;
 6+$basedir = "C:" ;
 7+
 8+#______________________________________________________________________________
 9+# GLOBAL VARIABLES
 10+$dir = "" ;
 11+$namespaces = array () ;
 12+$mem = array () ;
 13+$tags = array () ;
 14+$page_counter = 0 ;
 15+
 16+# FUNCTIONS
 17+
 18+require_once ( "global_functions.php" ) ;
 19+
 20+function store_file ( &$loc , &$text , $mode = "text" ) {
 21+ if ( $mode == "text" ) {
 22+ if ( !$handle = fopen($loc->fullname.".txt", 'wb') ) {
 23+ print "Failed to open {$loc->file}.txt!<br/>" ;
 24+ flush () ;
 25+ }
 26+ fwrite($handle, $text) ;
 27+ fclose ( $handle ) ;
 28+ } else if ( $mode == "gzip" ) {
 29+ if ( !$gz = gzopen($loc->fullname.".gz",'w9') ) {
 30+ print "Failed to open {$loc->file}.gz!<br/>" ;
 31+ flush () ;
 32+ }
 33+ gzwrite($gz, $text);
 34+ gzclose($gz);
 35+ }
 36+}
 37+
 38+function microtime_float()
 39+{
 40+ list($usec, $sec) = explode(" ", microtime());
 41+ return ((float)$usec + (float)$sec);
 42+}
 43+
4444 # Global functions for parsing
4545
46 -function XML2TXT_START($parser, $name, $attrs) {
47 - global $mem , $tags ;
48 - $mem["name"] = $name ;
49 - $tags[] = $name ;
50 - if ( $name == "NAMESPACE" ) {
51 - $mem['key'] = $attrs["KEY"] ;
52 - } else if ( $name == "TEXT" ) {
53 - $mem['text'] = "" ;
54 - }
 46+function XML2TXT_START($parser, $name, $attrs) {
 47+ global $mem , $tags ;
 48+ $mem["name"] = $name ;
 49+ $tags[] = $name ;
 50+ if ( $name == "NAMESPACE" ) {
 51+ $mem['key'] = $attrs["KEY"] ;
 52+ } else if ( $name == "TEXT" ) {
 53+ $mem['text'] = "" ;
 54+ }
5555 }
5656
5757 function XML2TXT_END($parser, $name) {
58 - global $mem , $namespaces , $tags , $page_counter , $dir ;
59 - if ( $mem['name'] == 'NAMESPACE' ) {
60 - $namespaces[$mem['key']] = $mem['text'] ;
61 - } else if ( $mem['name'] == 'PAGE' ) {
62 - $loc = get_file_location_global ( $dir , $mem['namespace'] , $mem['title'] , true ) ;
63 - store_file ( $loc , $mem['text'] , 'text' ) ;
64 -
65 - $page_counter++ ;
66 - if ( $page_counter % 1000 == 0 ) {
67 - print '.' ;
68 - if ( $page_counter % 50000 == 0 ) print "<br/>" ;
69 - flush () ;
70 - }
71 - }
72 -
73 - array_pop ( $tags ) ;
74 - if ( count ( $tags ) > 0 ) {
75 - $mem['name'] = array_pop ( $tags ) ;
76 - $tags[] = $mem['name'] ;
77 - } else {
78 - $mem['name'] = "" ;
 58+ global $mem , $namespaces , $tags , $page_counter , $dir ;
 59+ if ( $mem['name'] == 'NAMESPACE' ) {
 60+ $namespaces[$mem['key']] = $mem['text'] ;
 61+ } else if ( $mem['name'] == 'PAGE' ) {
 62+ $loc = get_file_location_global ( $dir , $mem['namespace'] , $mem['title'] , true ) ;
 63+ store_file ( $loc , $mem['text'] , 'text' ) ;
 64+
 65+ $page_counter++ ;
 66+ if ( $page_counter % 1000 == 0 ) {
 67+ print '.' ;
 68+ if ( $page_counter % 50000 == 0 ) print "<br/>" ;
 69+ flush () ;
 70+ }
7971 }
 72+
 73+ array_pop ( $tags ) ;
 74+ if ( count ( $tags ) > 0 ) {
 75+ $mem['name'] = array_pop ( $tags ) ;
 76+ $tags[] = $mem['name'] ;
 77+ } else {
 78+ $mem['name'] = "" ;
 79+ }
8080 }
8181
82 -function XML2TXT_DATA ( $parser, $data ) {
83 - global $mem , $namespaces ;
84 - if ( $mem['name'] == 'NAMESPACE' ) {
85 - $mem['text'] = $data ;
86 - } else if ( $mem['name'] == 'TITLE' ) {
87 - $ns = 0 ;
88 - foreach ( $namespaces AS $k => $v ) {
89 - if ( $k <= 0 ) continue ;
90 - if ( substr ( 0 , strlen ( $v ) + 1 ) != $v.":" ) continue ;
91 - $ns = $k ;
92 - $data = substr ( $data , strlen ( $v ) + 1 ) ;
93 - break ;
94 - }
95 - $mem['title'] = $data ;
96 - $mem['namespace'] = $ns ;
97 - } else if ( $mem['name'] == 'TEXT' ) {
98 - $mem['text'] .= $data ;
99 - }
 82+function XML2TXT_DATA ( $parser, $data ) {
 83+ global $mem , $namespaces ;
 84+ if ( $mem['name'] == 'NAMESPACE' ) {
 85+ $mem['text'] = $data ;
 86+ } else if ( $mem['name'] == 'TITLE' ) {
 87+ $ns = 0 ;
 88+ foreach ( $namespaces AS $k => $v ) {
 89+ if ( $k <= 0 ) continue ;
 90+ if ( substr ( 0 , strlen ( $v ) + 1 ) != $v.":" ) continue ;
 91+ $ns = $k ;
 92+ $data = substr ( $data , strlen ( $v ) + 1 ) ;
 93+ break ;
 94+ }
 95+ $mem['title'] = $data ;
 96+ $mem['namespace'] = $ns ;
 97+ } else if ( $mem['name'] == 'TEXT' ) {
 98+ $mem['text'] .= $data ;
 99+ }
100100 }
101101
102 -function scan_xml_file ( $xml_filename ) {
 102+function scan_xml_file ( $xml_filename ) {
103103 global $namespaces , $dir , $page_counter ;
104104 $xml_parser_handle = xml_parser_create();
105105 xml_set_element_handler($xml_parser_handle, "XML2TXT_START", "XML2TXT_END");
@@ -107,45 +107,45 @@
108108 if (!($parse_handle = fopen($xml_filename, 'r'))) {
109109 die("FEHLER: Datei $xml_filename nicht gefunden.");
110110 }
111 -
112 - $t1 = microtime_float() ;
 111+
 112+ $t1 = microtime_float() ;
113113 while ($xml_data = fread($parse_handle, 8192)) {
114114 if (!xml_parse($xml_parser_handle, $xml_data, feof($parse_handle))) {
115115 die(sprintf('XML error: %s at line %d',
116116 xml_error_string(xml_get_error_code($xml_parser_handle)),
117117 xml_get_current_line_number($xml_parser_handle)));
118 - }
119 -
120 -/* if ( $page_counter % 100 == 0 ) {
121 - $t2 = microtime_float() - $t1 ;
122 - $t3 = $t2 * 1000 / $page_counter ;
123 - print $t3 . " sec/1000 pages<br/>" ; flush () ;
 118+ }
 119+
 120+/* if ( $page_counter % 100 == 0 ) {
 121+ $t2 = microtime_float() - $t1 ;
 122+ $t3 = $t2 * 1000 / $page_counter ;
 123+ print $t3 . " sec/1000 pages<br/>" ; flush () ;
124124 }*/
125 - }
126 - $t2 = microtime_float() - $t1 ;
127 - print "Took {$t2} seconds total.<br/>" ; flush () ;
 125+ }
 126+ $t2 = microtime_float() - $t1 ;
 127+ print "Took {$t2} seconds total.<br/>" ; flush () ;
128128
129 - xml_parser_free($xml_parser_handle);
130 -
131 - $handle = fopen($dir."/namespaces.txt", 'wb') ;
132 - foreach ( $namespaces AS $ns => $nst ) {
133 - $t = "{$ns}:{$nst}\n" ;
134 - fwrite($handle, $t) ;
135 - }
136 - fclose ( $handle ) ;
 129+ xml_parser_free($xml_parser_handle);
137130
 131+ $handle = fopen($dir."/namespaces.txt", 'wb') ;
 132+ foreach ( $namespaces AS $ns => $nst ) {
 133+ $t = "{$ns}:{$nst}\n" ;
 134+ fwrite($handle, $t) ;
 135+ }
 136+ fclose ( $handle ) ;
 137+
138138 }
139 -
140 -
141 -# MAIN
142 -
143 -$dir = array_pop ( explode ( "/" , str_replace ( "\\" , "/" , $dumpfile ) ) ) ;
144 -$dir = $basedir . "/" . str_replace ( ".xml" , "" , $dir ) ;
145 -
146 -@set_time_limit ( 0 ) ; # No time limit
147 -#ini_set('user_agent','MSIE 4\.0b2;'); # Fake user agent
148 -header ('Content-type: text/html; charset=utf-8');
149 -@mkdir ( $dir ) ;
150 -scan_xml_file ( $dumpfile ) ;
151 -
152 -?>
 139+
 140+
 141+# MAIN
 142+
 143+$dir = array_pop ( explode ( "/" , str_replace ( "\\" , "/" , $dumpfile ) ) ) ;
 144+$dir = $basedir . "/" . str_replace ( ".xml" , "" , $dir ) ;
 145+
 146+@set_time_limit ( 0 ) ; # No time limit
 147+#ini_set('user_agent','MSIE 4\.0b2;'); # Fake user agent
 148+header ('Content-type: text/html; charset=utf-8');
 149+@mkdir ( $dir ) ;
 150+scan_xml_file ( $dumpfile ) ;
 151+
 152+?>
Index: trunk/wiki2xml/php/CREDITS
@@ -1,8 +1,8 @@
22 wiki2xml is (c) by Magnus Manske 2005-2006 and released under the GPL.
33
44 The following people (in alphabetic order) contributed to this project:
5 -
6 -Magnus Manske <magnus.manske@web.de> Everything Tels didn't do ;-)
75
 6+Magnus Manske <magnus.manske@web.de> Everything Tels didn't do ;-)
 7+
88 Tels <nospam-abuse@bloodgate.com> Linux fixes, OpenOffice output
99 REDME and doc
Index: trunk/wiki2xml/php/extension.php
@@ -1,24 +1,24 @@
2 -<?php
3 -/*
4 -To enable this extension, put all files in this directory into a "wiki2xml" subdirectory of your MediaWiki extensions directory
5 -Also, add
6 - require_once ( "extensions/wiki2xml/extension.php" ) ;
7 -to your LocalSettings.php
8 -The extension can then be accessed as [[Special:Wiki2XML]]
9 -*/
10 -
11 -if( !defined( 'MEDIAWIKI' ) ) die();
12 -
13 -# Integrating into the MediaWiki environment
14 -
15 -$wgExtensionCredits['Wiki2XML'][] = array(
16 - 'name' => 'Wiki2XML',
17 - 'description' => 'An extension to convert wiki markup into XML.',
18 - 'author' => 'Magnus Manske'
19 -);
20 -
21 -$wgExtensionFunctions[] = 'wfWiki2XMLExtension';
22 -
 2+<?php
 3+/*
 4+To enable this extension, put all files in this directory into a "wiki2xml" subdirectory of your MediaWiki extensions directory
 5+Also, add
 6+ require_once ( "extensions/wiki2xml/extension.php" ) ;
 7+to your LocalSettings.php
 8+The extension can then be accessed as [[Special:Wiki2XML]]
 9+*/
 10+
 11+if( !defined( 'MEDIAWIKI' ) ) die();
 12+
 13+# Integrating into the MediaWiki environment
 14+
 15+$wgExtensionCredits['Wiki2XML'][] = array(
 16+ 'name' => 'Wiki2XML',
 17+ 'description' => 'An extension to convert wiki markup into XML.',
 18+ 'author' => 'Magnus Manske'
 19+);
 20+
 21+$wgExtensionFunctions[] = 'wfWiki2XMLExtension';
 22+
2323 # for Special::Version:
2424 $wgExtensionCredits['parserhook'][] = array(
2525 'name' => 'wiki2xml extension',
@@ -26,51 +26,51 @@
2727 'url' => 'http://en.wikipedia.org/wiki/User:Magnus_Manske',
2828 'version' => 'v0.02',
2929 );
30 -
31 -
32 -#_____________________________________________________________________________
33 -
34 -/**
35 -* The special page
36 -*/
37 -function wfWiki2XMLExtension() { # Checked for HTML and MySQL insertion attacks
38 - global $IP, $wgMessageCache;
39 -# wfTasksAddCache();
40 -
41 - // FIXME : i18n
42 - $wgMessageCache->addMessage( 'wiki2xml', 'Wiki2XML' );
43 -
44 - require_once $IP.'/includes/SpecialPage.php';
45 -
46 - class SpecialWiki2XML extends SpecialPage {
47 -
48 - /**
49 - * Constructor
50 - */
51 - function SpecialWiki2XML() { # Checked for HTML and MySQL insertion attacks
52 - SpecialPage::SpecialPage( 'Wiki2XML' );
53 - $this->includable( true );
54 - }
55 -
56 - /**
57 - * Special page main function
58 - */
59 - function execute( $par = null ) { # Checked for HTML and MySQL insertion attacks
60 - global $wgOut, $wgRequest, $wgUser, $wgTitle, $IP;
61 - $fname = 'Special::Tasks:execute';
62 - global $xmlg , $html_named_entities_mapping_mine, $content_provider;
63 - include_once ( "default.php" ) ;
64 - $xmlg['sourcedir'] = $IP.'/extensions/wiki2xml' ;
65 - include_once ( "w2x.php" ) ;
66 -
67 - $this->setHeaders();
68 - $wgOut->addHtml( $out );
69 - }
70 -
71 - } # end of class
72 -
73 - SpecialPage::addPage( new SpecialWiki2XML );
74 -}
75 -
76 -
77 -?>
 30+
 31+
 32+#_____________________________________________________________________________
 33+
 34+/**
 35+* The special page
 36+*/
 37+function wfWiki2XMLExtension() { # Checked for HTML and MySQL insertion attacks
 38+ global $IP, $wgMessageCache;
 39+# wfTasksAddCache();
 40+
 41+ // FIXME : i18n
 42+ $wgMessageCache->addMessage( 'wiki2xml', 'Wiki2XML' );
 43+
 44+ require_once $IP.'/includes/SpecialPage.php';
 45+
 46+ class SpecialWiki2XML extends SpecialPage {
 47+
 48+ /**
 49+ * Constructor
 50+ */
 51+ function SpecialWiki2XML() { # Checked for HTML and MySQL insertion attacks
 52+ SpecialPage::SpecialPage( 'Wiki2XML' );
 53+ $this->includable( true );
 54+ }
 55+
 56+ /**
 57+ * Special page main function
 58+ */
 59+ function execute( $par = null ) { # Checked for HTML and MySQL insertion attacks
 60+ global $wgOut, $wgRequest, $wgUser, $wgTitle, $IP;
 61+ $fname = 'Special::Tasks:execute';
 62+ global $xmlg , $html_named_entities_mapping_mine, $content_provider;
 63+ include_once ( "default.php" ) ;
 64+ $xmlg['sourcedir'] = $IP.'/extensions/wiki2xml' ;
 65+ include_once ( "w2x.php" ) ;
 66+
 67+ $this->setHeaders();
 68+ $wgOut->addHtml( $out );
 69+ }
 70+
 71+ } # end of class
 72+
 73+ SpecialPage::addPage( new SpecialWiki2XML );
 74+}
 75+
 76+
 77+?>
Index: trunk/wiki2xml/php/browse_texts.php
@@ -1,66 +1,67 @@
2 -<?php
3 -
4 -require_once ( "default.php" ) ;
5 -require_once ( "global_functions.php" ) ;
6 -require_once ( "filter_named_entities.php" ) ;
7 -require_once ( "content_provider.php" ) ;
8 -require_once ( "wiki2xml.php" ) ;
9 -require_once ( "xml2xhtml.php" ) ;
10 -require_once ( "mediawiki_converter.php" ) ;
11 -
12 -# FUNCTIONS
13 -
14 -function get_param ( $key , $default = "" ) {
15 - if ( !isset ( $_REQUEST[$key] ) ) return $default ;
16 - return $_REQUEST[$key] ;
17 -}
18 -
19 -# MAIN
20 -
21 -@set_time_limit ( 0 ) ; # No time limit
22 -
23 -$xmlg = array (
24 - 'site_base_url' => "SBU" ,
25 - 'resolvetemplates' => true ,
26 - 'templates' => array () ,
27 - 'namespace_template' => 'Vorlage' ,
28 -) ;
29 -
30 -$content_provider = new ContentProviderTextFile ;
31 -$converter = new MediaWikiConverter ;
32 -
33 -$title = urldecode ( get_param ( 'title' , urlencode ( 'Main Page' ) ) ) ;
34 -$xmlg['page_title'] = $title ;
35 -
36 -$format = strtolower ( get_param ( 'format' , 'xhtml' ) ) ;
37 -$content_provider->basedir = $base_text_dir ;
38 -
39 -$text = $content_provider->get_wiki_text ( $title ) ;
40 -$xml = $converter->article2xml ( $title , $text , $xmlg ) ;
41 -
42 -if ( $format =="xml" ) {
43 - # XML
44 - header('Content-type: text/xml; charset=utf-8');
45 - print "<?xml version='1.0' encoding='UTF-8' ?>\n" ;
46 - print $xml ;
47 -} else if ( $format == "text" ) {
48 - # Plain text
49 - $xmlg['plaintext_markup'] = true ;
50 - $xmlg['plaintext_prelink'] = true ;
51 - $out = $converter->articles2text ( $xml , $xmlg ) ;
52 - $out = str_replace ( "\n" , "<br/>" , $out ) ;
53 - header('Content-type: text/html; charset=utf-8');
54 - print $out ;
55 -} else {
56 - # XHTML
57 - if ( stristr($_SERVER["HTTP_ACCEPT"],"application/xhtml+xml") ) {
58 - header("Content-type: text/html; charset=utf-8"); # Skipping the "strict" part ;-)
59 -# header("Content-type: application/xhtml+xml");
60 - } else {
61 - # Header hack for IE
62 - header("Content-type: text/html; charset=utf-8");
63 - }
64 - print $converter->articles2xhtml ( $xml , $xmlg ) ;
65 -}
66 -
67 -?>
 2+<?php
 3+
 4+require_once ( "default.php" ) ;
 5+require_once ( "global_functions.php" ) ;
 6+require_once ( "filter_named_entities.php" ) ;
 7+require_once ( "content_provider.php" ) ;
 8+require_once ( "wiki2xml.php" ) ;
 9+require_once ( "xml2xhtml.php" ) ;
 10+require_once ( "mediawiki_converter.php" ) ;
 11+
 12+# FUNCTIONS
 13+
 14+function get_param ( $key , $default = "" ) {
 15+ if ( !isset ( $_REQUEST[$key] ) ) return $default ;
 16+ return $_REQUEST[$key] ;
 17+}
 18+
 19+# MAIN
 20+
 21+@set_time_limit ( 0 ) ; # No time limit
 22+
 23+$xmlg = array (
 24+ 'site_base_url' => "SBU" ,
 25+ 'resolvetemplates' => true ,
 26+ 'templates' => array () ,
 27+ 'namespace_template' => 'Vorlage' ,
 28+) ;
 29+
 30+$content_provider = new ContentProviderTextFile ;
 31+$converter = new MediaWikiConverter ;
 32+
 33+$title = urldecode ( get_param ( 'title' , urlencode ( 'Main Page' ) ) ) ;
 34+$xmlg['page_title'] = $title ;
 35+
 36+$format = strtolower ( get_param ( 'format' , 'xhtml' ) ) ;
 37+$content_provider->basedir = $base_text_dir ;
 38+
 39+$text = $content_provider->get_wiki_text ( $title ) ;
 40+$xml = $converter->article2xml ( $title , $text , $xmlg ) ;
 41+
 42+if ( $format =="xml" ) {
 43+ # XML
 44+ header('Content-type: text/xml; charset=utf-8');
 45+ print "<?xml version='1.0' encoding='UTF-8' ?>\n" ;
 46+ print $xml ;
 47+} else if ( $format == "text" ) {
 48+ # Plain text
 49+ $xmlg['plaintext_markup'] = true ;
 50+ $xmlg['plaintext_prelink'] = true ;
 51+ $out = $converter->articles2text ( $xml , $xmlg ) ;
 52+ $out = str_replace ( "\n" , "<br/>" , $out ) ;
 53+ header('Content-type: text/html; charset=utf-8');
 54+ print $out ;
 55+} else {
 56+ # XHTML
 57+ if ( stristr($_SERVER["HTTP_ACCEPT"],"application/xhtml+xml") ) {
 58+ # Skipping the "strict" part ;-)
 59+ header("Content-type: text/html; charset=utf-8");
 60+# header("Content-type: application/xhtml+xml");
 61+ } else {
 62+ # Header hack for IE
 63+ header("Content-type: text/html; charset=utf-8");
 64+ }
 65+ print $converter->articles2xhtml ( $xml , $xmlg ) ;
 66+}
 67+
 68+?>

Status & tagging log