Index: trunk/tools/editor_trends/etl/variables.py |
— | — | @@ -49,13 +49,14 @@ |
50 | 50 | |
51 | 51 | |
52 | 52 | def extract_revision_text(revision): |
53 | | - rev = revision.find('ns0:text') |
54 | | - if rev != None: |
55 | | - if rev.text == None: |
56 | | - rev = fix_revision_text(revision) |
57 | | - return rev.text.encode('utf-8') |
58 | | - else: |
59 | | - return '' |
| 53 | + return revision.text |
| 54 | +# rev = revision.find('ns0:text') |
| 55 | +# if rev != None: |
| 56 | +# if rev.text == None: |
| 57 | +# rev = fix_revision_text(revision) |
| 58 | +# return rev.text.encode('utf-8') |
| 59 | +# else: |
| 60 | +# return '' |
60 | 61 | |
61 | 62 | |
62 | 63 | def parse_title(title): |
Index: trunk/tools/editor_trends/etl/extracter.py |
— | — | @@ -18,7 +18,7 @@ |
19 | 19 | __date__ = '2011-04-10'
|
20 | 20 | __version__ = '0.1'
|
21 | 21 |
|
22 | | -
|
| 22 | +import itertools
|
23 | 23 | from collections import deque
|
24 | 24 | import sys
|
25 | 25 | import os
|
— | — | @@ -54,6 +54,7 @@ |
55 | 55 | return md5hashes, size
|
56 | 56 |
|
57 | 57 | revision_id = revision.find('%s%s' % (xml_namespace, 'id'))
|
| 58 | + print revision_id
|
58 | 59 | revision_id = variables.extract_revision_id(revision_id)
|
59 | 60 | if revision_id == None:
|
60 | 61 | #revision_id is missing, which is weird
|
— | — | @@ -149,7 +150,7 @@ |
150 | 151 | if event is start:
|
151 | 152 | clear = False
|
152 | 153 | else:
|
153 | | - counts = datacompetition_parse_revision(revision, xml_namespace, bots, counts)
|
| 154 | + counts = datacompetition_parse_revision(elem, xml_namespace, bots, counts)
|
154 | 155 | clear = True
|
155 | 156 | if clear:
|
156 | 157 | elem.clear()
|
— | — | @@ -160,9 +161,6 @@ |
161 | 162 | id = False
|
162 | 163 | parse = False
|
163 | 164 |
|
164 | | - else:
|
165 | | - elem.clear()
|
166 | | -
|
167 | 165 | except SyntaxError, error:
|
168 | 166 | print 'Encountered invalid XML tag. Error message: %s' % error
|
169 | 167 | dump(elem)
|
— | — | @@ -175,6 +173,7 @@ |
176 | 174 | print error
|
177 | 175 |
|
178 | 176 | filename = 'counts_kaggle_%s.csv' % file_id
|
| 177 | + keys = counts.keys()
|
179 | 178 | fh = file_utils.create_txt_filehandle(rts.txt, filename, 'w', 'utf-8')
|
180 | 179 | file_utils.write_dict_to_csv(counts, fh, keys)
|
181 | 180 | fh.close()
|