Index: trunk/tools/editor_trends/etl/extracter.py
@@ -80,17 +80,31 @@
         cache.add(article)
     return md5hashes, size
 
-def setup_parser(rts):
-    bots = bot_detector.retrieve_bots(rts.language.code)
-    include_ns = {3: 'User Talk',
-                  5: 'Wikipedia Talk',
-                  1: 'Talk',
-                  2: 'User',
-                  4: 'Wikipedia'}
 
-    return bots, include_ns
+def datacompetition_parse_revision(revision, xml_namespace, bots, counts):
+    '''
+    This function has as input a single revision from a Wikipedia dump file,
+    the xml_namespace of the Wikipedia dump file, a list of known bots and a
+    counts dictionary that maps editor ids to the number of edits made so far.
+    It returns the counts dictionary, incremented by one for the revision's
+    contributor unless the revision is empty or the editor is anonymous.
+    '''
+    if revision == None:
+        #the entire revision is empty, weird.
+        #dump(revision)
+        return counts
 
+    contributor = revision.find('%s%s' % (xml_namespace, 'contributor'))
+    contributor = variables.parse_contributor(contributor, bots, xml_namespace)
+    if not contributor:
+        #editor is anonymous, ignore
+        return counts
+    else:
+        counts.setdefault(contributor['id'], 0)
+        counts[contributor['id']] += 1
+        return counts
 
+
 def datacompetition_count_edits(fh, rts, file_id):
     '''
     This function counts for every editor the total number of edits that person
@@ -99,7 +113,8 @@
     edits. This function is only to be used to create the prediction dataset
     for the datacompetition.
     '''
-    bots, include_ns = setup_parser(rts)
+    bots = bot_detector.retrieve_bots(rts.language.code)
+    include_ns = {}
 
     start = 'start'; end = 'end'
     context = iterparse(fh, events=(start, end))
@@ -132,8 +147,7 @@
             if event is start:
                 clear = False
             else:
-                print 'IMPLEMENT'
-                #md5hashes, size = parse_revision(elem, article, xml_namespace, cache, bots, md5hashes, size)
+                counts = datacompetition_parse_revision(revision, xml_namespace, bots, counts)
                 cache.count_revisions += 1
                 clear = True
             if clear:
@@ -159,12 +173,17 @@
     file_utils.write_dict_to_csv(counts, fh, keys)
     fh.close()
 
-    filename = 'counts_kaggle_%s.bin' % file_id
-    file_utils.store_object(counts, location, filename)
+    #filename = 'counts_kaggle_%s.bin' % file_id
+    #file_utils.store_object(counts, location, filename)
 
 
 def parse_xml(fh, rts, cache, process_id, file_id):
-    bots, include_ns = setup_parser(rts)
+    bots = bot_detector.retrieve_bots(rts.language.code)
+    include_ns = {3: 'User Talk',
+                  5: 'Wikipedia Talk',
+                  1: 'Talk',
+                  2: 'User',
+                  4: 'Wikipedia'}
 
     start = 'start'; end = 'end'
     context = iterparse(fh, events=(start, end))
@@ -239,7 +258,8 @@
 def stream_raw_xml(input_queue, process_id, lock, rts):
     t0 = datetime.now()
     file_id = 0
-    cache = buffer.CSVBuffer(process_id, rts, lock)
+    if rts.kaggle:
+        cache = buffer.CSVBuffer(process_id, rts, lock)
 
     while True:
         filename = input_queue.get()
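
The core of this change is the per-contributor tally built by datacompetition_parse_revision: every non-anonymous, non-bot revision increments a counter keyed by the editor's id, and datacompetition_count_edits later writes the resulting dictionary to CSV. Below is a minimal, self-contained sketch of that tallying pattern. The XML snippet, the tally_revision helper and the plain contributor/id lookup are simplified stand-ins for the real dump format, variables.parse_contributor and the bot filtering; they are illustrative only and not part of this patch.

# Minimal sketch of the edit-tallying pattern introduced by
# datacompetition_parse_revision. Hypothetical, simplified input: a tiny
# in-memory <page> with three <revision> elements instead of a real dump.
import xml.etree.ElementTree as ElementTree

SAMPLE = '''
<page>
  <revision><contributor><id>42</id></contributor></revision>
  <revision><contributor><ip>127.0.0.1</ip></contributor></revision>
  <revision><contributor><id>42</id></contributor></revision>
</page>
'''

def tally_revision(revision, counts):
    # Empty revision: nothing to count (mirrors the revision == None guard).
    if revision is None:
        return counts
    contributor = revision.find('contributor')
    editor_id = contributor.find('id') if contributor is not None else None
    # Anonymous editors (no <id> element) are ignored, as in the patch.
    if editor_id is None:
        return counts
    counts.setdefault(editor_id.text, 0)
    counts[editor_id.text] += 1
    return counts

counts = {}
for revision in ElementTree.fromstring(SAMPLE).findall('revision'):
    counts = tally_revision(revision, counts)

assert counts == {'42': 2}

Using setdefault keeps the increment a single unconditional statement, whether or not the editor has been seen before.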