Index: trunk/tools/editor_trends/etl/variables.py |
— | — | @@ -50,18 +50,11 @@ |
51 | 51 | return True |
52 | 52 | |
53 | 53 | |
54 | | -def extract_revision_text(revision): |
55 | | - dump(revision) |
56 | | - if revision.text == None: |
57 | | - revision.text = fix_revision_text(revision) |
58 | | - return revision.text |
59 | | -# rev = revision.find('ns0:text') |
60 | | -# if rev != None: |
61 | | -# if rev.text == None: |
62 | | -# rev = fix_revision_text(revision) |
63 | | -# return rev.text.encode('utf-8') |
64 | | -# else: |
65 | | -# return '' |
| 54 | +def extract_revision_text(revision, xml_namespace): |
| 55 | + rev_text = revision.find('%s%s' % (xml_namespace, 'text')) |
| 56 | + if rev_text.text == None: |
| 57 | + rev_text.text = fix_revision_text(revision) |
| 58 | + return rev_text.text |
66 | 59 | |
67 | 60 | |
68 | 61 | def parse_title(title): |
— | — | @@ -158,15 +151,14 @@ |
159 | 152 | |
160 | 153 | def fix_revision_text(revision): |
161 | 154 | if revision.text == None: |
162 | | - revision.text = '' |
163 | | - return revision |
| 155 | + return '' |
164 | 156 | |
165 | 157 | |
166 | 158 | def create_md5hash(text): |
167 | 159 | hash = {} |
168 | 160 | if text != None: |
169 | 161 | m = hashlib.md5() |
170 | | - m.update(text) |
| 162 | + m.update(text.encode('utf-8')) |
171 | 163 | #echo m.digest() |
172 | 164 | hash['hash'] = m.hexdigest() |
173 | 165 | else: |
Index: trunk/tools/editor_trends/etl/extracter.py |
— | — | @@ -54,14 +54,13 @@ |
55 | 55 | return md5hashes, size
|
56 | 56 |
|
57 | 57 | revision_id = revision.find('%s%s' % (xml_namespace, 'id'))
|
58 | | - print revision_id
|
59 | 58 | revision_id = variables.extract_revision_id(revision_id)
|
60 | 59 | if revision_id == None:
|
61 | 60 | #revision_id is missing, which is weird
|
62 | 61 | return md5hashes, size
|
63 | 62 |
|
64 | 63 | article['revision_id'] = revision_id
|
65 | | - text = variables.extract_revision_text(revision)
|
| 64 | + text = variables.extract_revision_text(revision, xml_namespace)
|
66 | 65 | article.update(contributor)
|
67 | 66 |
|
68 | 67 | comment = variables.extract_comment_text(revision_id, revision)
|