Index: trunk/tools/editor_trends/split_xml_file.py |
— | — | @@ -36,7 +36,6 @@ |
37 | 37 | pass |
38 | 38 | |
39 | 39 | |
40 | | - |
41 | 40 | RE_NUMERIC_CHARACTER = re.compile('&#(\d+);') |
42 | 41 | |
43 | 42 | |
— | — | @@ -49,7 +48,7 @@ |
50 | 49 | return unichr(int(m.group(1))) |
51 | 50 | except ValueError: |
52 | 51 | ''' |
53 | | - There are a few articles that raise a Value Error here, the reason is |
| 52 | + There are a few articles that raise a Value Error here, the reason is |
54 | 53 | that I am using a narrow Python build (UCS2) instead of a wide build |
55 | 54 | (UCS4). The quick fix is to return an empty string... |
56 | 55 | Real solution is to rebuild Python with UCS4 support..... |
— | — | @@ -58,7 +57,7 @@ |
59 | 58 | |
60 | 59 | |
61 | 60 | def remove_namespace(element, namespace): |
62 | | - '''Remove namespace from the document.''' |
| 61 | + '''Remove namespace from the XML document.''' |
63 | 62 | ns = u'{%s}' % namespace |
64 | 63 | nsl = len(ns) |
65 | 64 | for elem in element.getiterator(): |
— | — | @@ -66,6 +65,7 @@ |
67 | 66 | elem.tag = elem.tag[nsl:] |
68 | 67 | return element |
69 | 68 | |
| 69 | + |
70 | 70 | def load_namespace(language): |
71 | 71 | file = '%s_ns.json' % language |
72 | 72 | fh = utils.create_txt_filehandle(settings.NAMESPACE_LOCATION, file, 'r', settings.ENCODING) |
— | — | @@ -76,6 +76,9 @@ |
77 | 77 | |
78 | 78 | |
79 | 79 | def build_namespaces_locale(namespaces): |
| 80 | + ''' |
| 81 | + Construct a list of all the non-main namespaces |
| 82 | + ''' |
80 | 83 | ns = [] |
81 | 84 | for namespace in namespaces: |
82 | 85 | value = namespaces[namespace].get(u'*', None) |
— | — | @@ -89,15 +92,15 @@ |
90 | 93 | for revision in revisions: |
91 | 94 | comment = revision.find('comment') |
92 | 95 | timestamp = revision.find('timestamp').text |
93 | | -# text1 = remove_ascii_control_characters(text) |
94 | | -# text2 = remove_numeric_character_references(text) |
95 | | -# text3 = convert_html_entities(text) |
96 | 96 | if comment != None and comment.text != None: |
97 | 97 | comment.text = function(comment.text) |
98 | 98 | return xml |
99 | 99 | |
100 | 100 | |
101 | 101 | def is_article_main_namespace(elem, namespace): |
| 102 | + ''' |
| 103 | + checks whether the article belongs to the main namespace |
| 104 | + ''' |
102 | 105 | title = elem.find('title').text |
103 | 106 | for ns in namespace: |
104 | 107 | if title.startswith(ns): |
— | — | @@ -105,7 +108,6 @@ |
106 | 109 | return True |
107 | 110 | |
108 | 111 | |
109 | | - |
110 | 112 | def write_xml_file(element, fh, counter, language): |
111 | 113 | '''Get file handle and write xml element to file''' |
112 | 114 | size = len(cElementTree.tostring(element)) |
— | — | @@ -120,7 +122,7 @@ |
121 | 123 | |
122 | 124 | def create_xml_file_handle(fh, counter, size, language): |
123 | 125 | '''Create file handle if none is supplied or if file size > max file size.''' |
124 | | - path = os.path.join(settings.XML_FILE_LOCATION , language, '%s.xml' % counter) |
| 126 | + path = os.path.join(settings.XML_FILE_LOCATION, language, '%s.xml' % counter) |
125 | 127 | if not fh: |
126 | 128 | counter = 0 |
127 | 129 | fh = codecs.open(path, 'w', encoding=settings.ENCODING) |
— | — | @@ -147,14 +149,13 @@ |
148 | 150 | ns = load_namespace(language) |
149 | 151 | ns = build_namespaces_locale(ns) |
150 | 152 | |
151 | | - |
152 | 153 | fh = None |
153 | 154 | counter = None |
154 | 155 | tag = '{%s}page' % settings.NAME_SPACE |
155 | | - |
| 156 | + |
156 | 157 | context = cElementTree.iterparse(settings.XML_FILE, events=('start', 'end')) |
157 | 158 | context = iter(context) |
158 | | - event, root = context.next() # get the root element of the XML doc |
| 159 | + event, root = context.next() #get the root element of the XML doc |
159 | 160 | |
160 | 161 | for event, elem in context: |
161 | 162 | if event == 'end': |
— | — | @@ -163,10 +164,8 @@ |
164 | 165 | elem = parse_comments(elem, remove_numeric_character_references) |
165 | 166 | |
166 | 167 | if is_article_main_namespace(elem, ns): |
167 | | - #fh, counter = write_xml_file(elem, fh, counter, language) |
168 | | - pass |
| 168 | + fh, counter = write_xml_file(elem, fh, counter, language) |
169 | 169 | root.clear() # when done parsing a section clear the tree to safe memory |
170 | | - |
171 | 170 | #elem = parse_comments(elem, convert_html_entities) |
172 | 171 | #elem = parse_comments(elem, remove_ascii_control_characters) |
173 | 172 | #print cElementTree.tostring(elem) |