r75936 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r75935 | r75936 | r75937 >
Date:17:01, 3 November 2010
Author:diederik
Status:deferred
Tags:
Comment:
PEP8 compliant.
Modified paths:
  • /trunk/tools/editor_trends/split_xml_file.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/split_xml_file.py
@@ -36,7 +36,6 @@
37 37 pass
38 38
39 39
40 -
41 40 RE_NUMERIC_CHARACTER = re.compile('&#(\d+);')
42 41
43 42
@@ -49,7 +48,7 @@
50 49 return unichr(int(m.group(1)))
51 50 except ValueError:
52 51 '''
53 - There are a few articles that raise a Value Error here, the reason is
 52+ There are a few articles that raise a Value Error here, the reason is
54 53 that I am using a narrow Python build (UCS2) instead of a wide build
55 54 (UCS4). The quick fix is to return an empty string...
56 55 Real solution is to rebuild Python with UCS4 support.....
@@ -58,7 +57,7 @@
59 58
60 59
61 60 def remove_namespace(element, namespace):
62 - '''Remove namespace from the document.'''
 61+ '''Remove namespace from the XML document.'''
63 62 ns = u'{%s}' % namespace
64 63 nsl = len(ns)
65 64 for elem in element.getiterator():
@@ -66,6 +65,7 @@
67 66 elem.tag = elem.tag[nsl:]
68 67 return element
69 68
 69+
70 70 def load_namespace(language):
71 71 file = '%s_ns.json' % language
72 72 fh = utils.create_txt_filehandle(settings.NAMESPACE_LOCATION, file, 'r', settings.ENCODING)
@@ -76,6 +76,9 @@
77 77
78 78
79 79 def build_namespaces_locale(namespaces):
 80+ '''
 81+ Construct a list of all the non-main namespaces
 82+ '''
80 83 ns = []
81 84 for namespace in namespaces:
82 85 value = namespaces[namespace].get(u'*', None)
@@ -89,15 +92,15 @@
90 93 for revision in revisions:
91 94 comment = revision.find('comment')
92 95 timestamp = revision.find('timestamp').text
93 -# text1 = remove_ascii_control_characters(text)
94 -# text2 = remove_numeric_character_references(text)
95 -# text3 = convert_html_entities(text)
96 96 if comment != None and comment.text != None:
97 97 comment.text = function(comment.text)
98 98 return xml
99 99
100 100
101 101 def is_article_main_namespace(elem, namespace):
 102+ '''
 103+ checks whether the article belongs to the main namespace
 104+ '''
102 105 title = elem.find('title').text
103 106 for ns in namespace:
104 107 if title.startswith(ns):
@@ -105,7 +108,6 @@
106 109 return True
107 110
108 111
109 -
110 112 def write_xml_file(element, fh, counter, language):
111 113 '''Get file handle and write xml element to file'''
112 114 size = len(cElementTree.tostring(element))
@@ -120,7 +122,7 @@
121 123
122 124 def create_xml_file_handle(fh, counter, size, language):
123 125 '''Create file handle if none is supplied or if file size > max file size.'''
124 - path = os.path.join(settings.XML_FILE_LOCATION , language, '%s.xml' % counter)
 126+ path = os.path.join(settings.XML_FILE_LOCATION, language, '%s.xml' % counter)
125 127 if not fh:
126 128 counter = 0
127 129 fh = codecs.open(path, 'w', encoding=settings.ENCODING)
@@ -147,14 +149,13 @@
148 150 ns = load_namespace(language)
149 151 ns = build_namespaces_locale(ns)
150 152
151 -
152 153 fh = None
153 154 counter = None
154 155 tag = '{%s}page' % settings.NAME_SPACE
155 -
 156+
156 157 context = cElementTree.iterparse(settings.XML_FILE, events=('start', 'end'))
157 158 context = iter(context)
158 - event, root = context.next() # get the root element of the XML doc
 159+ event, root = context.next() #get the root element of the XML doc
159 160
160 161 for event, elem in context:
161 162 if event == 'end':
@@ -163,10 +164,8 @@
164 165 elem = parse_comments(elem, remove_numeric_character_references)
165 166
166 167 if is_article_main_namespace(elem, ns):
167 - #fh, counter = write_xml_file(elem, fh, counter, language)
168 - pass
 168+ fh, counter = write_xml_file(elem, fh, counter, language)
169 169 root.clear() # when done parsing a section clear the tree to safe memory
170 -
171 170 #elem = parse_comments(elem, convert_html_entities)
172 171 #elem = parse_comments(elem, remove_ascii_control_characters)
173 172 #print cElementTree.tostring(elem)

Status & tagging log