Index: trunk/tools/editor_trends/run.bat |
— | — | @@ -1,3 +0,0 @@ |
2 | | -@echo off |
3 | | -python split_xml_file.py |
4 | | -python map_wiki_editors.py |
Index: trunk/tools/editor_trends/analyses/plugins/taxonomy_burnout.py |
— | — | @@ -21,32 +21,39 @@ |
22 | 22 | |
23 | 23 | def taxonomy_burnout(var, editor, **kwargs): |
24 | 24 | new_wikipedian = editor['new_wikipedian'] |
25 | | - edits = editor['monthly_edits'] |
26 | | - cutoff = kwargs.pop('cutoff') |
27 | | - |
| 25 | + edits = editor['edit_count'] |
| 26 | + final_edit = editor['final_edit'] |
| 27 | + cutoff = kwargs.get('cutoff', 149) |
| 28 | + |
28 | 29 | burnout = False |
29 | | - sum = 0.0 |
| 30 | + sum = 0.0 |
30 | 31 | count = 0.0 |
31 | | - |
32 | | - for year in xrange(2001, var.max_year): |
33 | | - year = str(year) |
34 | | - for month in xrange(1, 13): |
35 | | - month = str(month) |
| 32 | + |
| 33 | + if new_wikipedian: |
| 34 | + years = edits.keys() |
| 35 | + for year in years: |
| 36 | + year = str(year) |
| 37 | + months = edits[year].keys() |
| 38 | + for month in months: |
| 39 | + month = str(month) |
| 40 | + try: |
| 41 | + if edits[year][month].get('0', 0) > cutoff: |
| 42 | + burnout = True |
| 43 | + if burnout == True: |
| 44 | + n = edits[year][month].get('0', 0) |
| 45 | + sum += n |
| 46 | + if n > 0: |
| 47 | + count += 1.0 |
| 48 | + except (AttributeError, KeyError): |
| 49 | + print 'Editor %s does not have data for year: %s and month %s. Data: %s' \ |
| 50 | + % (editor['username'], year, month, edits[year]) |
| 51 | + |
| 52 | + if burnout and sum / count > 10: |
| 53 | + avg_edit = sum / count |
| 54 | + |
36 | 55 | try: |
37 | | - if edits[year][month] > 149: |
38 | | - burnout = True |
39 | | - if burnout == True: |
40 | | - sum += edits[year][month] |
41 | | - count += 1.0 |
42 | | - except (AttributeError, KeyError): |
43 | | - print 'Editor %s does not have data for year: %s and month %s' % (editor['username'], year, month) |
44 | | - |
45 | | - if burnout and sum / count > 10: |
46 | | - avg_edit = sum / count |
47 | | - |
48 | | - try: |
49 | | - var.add(new_wikipedian, avg_edit, {'username' : editor['username']}) |
50 | | - except Exception, error: |
51 | | - print 'user: %s error: %s' %(editor['username'], error) |
52 | | - |
| 56 | + var.add(new_wikipedian, avg_edit, {'username' : editor['username']}) |
| 57 | + except Exception, error: |
| 58 | + print 'user: %s error: %s' % (editor['username'].encode('utf-8'), error) |
| 59 | + |
53 | 60 | return var |
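
The rewritten plugin reads monthly counts from editor['edit_count'] rather than 'monthly_edits' and only considers editors with a new_wikipedian date: once a month's main-namespace ('0') count exceeds the cutoff (default 149), that month and every later non-zero month feed an average that, when it exceeds 10, is stored against the editor's new_wikipedian date. A minimal standalone sketch of that accumulation, assuming edit_count is a {year: {month: {namespace: count}}} dict with string keys as the hunk indexes it, leaving out the MongoDB-backed var object, and iterating months in sorted order (which the plugin itself does not guarantee):

    def burnout_average(edit_count, cutoff=149):
        """Average the monthly ns-0 edit counts from the first month that
        exceeds `cutoff` onwards; return None if the cutoff is never hit.
        Illustrative sketch only, not the plugin itself."""
        burnout = False
        total = 0.0
        months_counted = 0.0
        for year in sorted(edit_count, key=int):
            for month in sorted(edit_count[year], key=int):
                n = edit_count[year][month].get('0', 0)
                if n > cutoff:
                    burnout = True            # first month above the threshold
                if burnout and n > 0:
                    total += n                # accumulate from that month on
                    months_counted += 1.0
        if not burnout or months_counted == 0:
            return None
        return total / months_counted

    # Example: two quiet months, then a spike that trips the 149-edit cutoff.
    sample = {'2004': {'1': {'0': 12}, '2': {'0': 30}, '3': {'0': 200}, '4': {'0': 80}}}
    print(burnout_average(sample))            # (200 + 80) / 2 -> 140.0
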
Index: trunk/tools/editor_trends/analyses/analyzer.py |
— | — | @@ -19,6 +19,7 @@ |
20 | 20 | |
21 | 21 | from multiprocessing import JoinableQueue, Manager, RLock, Process |
22 | 22 | from multiprocessing.managers import BaseManager |
| 23 | +from Queue import Empty |
23 | 24 | |
24 | 25 | import sys |
25 | 26 | import cPickle |
Index: trunk/tools/editor_trends/analyses/adhoc/community_graph.py |
— | — | @@ -45,7 +45,7 @@ |
46 | 46 | ids = db.retrieve_distinct_keys(project, collection, 'editor') |
47 | 47 | conn = db.init_mongo_db(project) |
48 | 48 | ids.sort() |
49 | | - fh = file_utils.create_txt_filehandle(settings.dataset_location, '%s_edgelist.csv' % project, 'w', settings.encoding) |
| 49 | + fh = file_utils.create_txt_filehandle(settings.dataset_location, '%s_edgelist.csv' % project, 'w', 'utf-8') |
50 | 50 | for i in ids: |
51 | 51 | author_i = conn[collection].find_one({'editor': i}) |
52 | 52 | article_i = create_articles_set(author_i['edits']) |
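The loop that follows (only its first lines are visible in this hunk) pairs each editor's article set against the others to emit a co-editing edge list. A rough sketch of that idea, with plain sets standing in for the Mongo lookups and create_articles_set, and a hypothetical intersection-size weight on each edge:

    from itertools import combinations

    def coediting_edges(editor_articles):
        """Yield (editor_a, editor_b, shared_count) for every pair of editors
        whose article sets overlap.  editor_articles maps an editor id to a
        set of article ids; this is a stand-in for the script's Mongo-backed
        loop, not its actual implementation."""
        for a, b in combinations(sorted(editor_articles), 2):
            shared = editor_articles[a] & editor_articles[b]
            if shared:
                yield a, b, len(shared)

    # Hypothetical usage: editors 1 and 2 co-edited article 11, editor 3 shares nothing.
    sample = {1: {10, 11}, 2: {11, 12}, 3: {99}}
    for edge in coediting_edges(sample):
        print(edge)                           # (1, 2, 1)
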
Index: trunk/tools/editor_trends/analyses/adhoc/match_talkpage_article.py |
— | — | @@ -50,7 +50,7 @@ |
51 | 51 | location = os.path.join(settings.input_location, language_code, project) |
52 | 52 | fh = file_utils.create_txt_filehandle(location, |
53 | 53 | '%s%s-latest-stub-meta-history.xml' % (language_code, project), |
54 | | - 'r', settings.encoding) |
| 54 | + 'r', 'utf-8') |
55 | 55 | |
56 | 56 | for page, article_size in wikitree.parser.read_input(fh): |
57 | 57 | title = page.find('title') |
Index: trunk/tools/editor_trends/wikitree/parser.py |
— | — | @@ -21,7 +21,9 @@ |
22 | 22 | import cStringIO |
23 | 23 | import codecs |
24 | 24 | import xml.etree.cElementTree as cElementTree |
| 25 | +from lxml import etree |
25 | 26 | import sys |
| 27 | +import gzip |
26 | 28 | |
27 | 29 | if '..' not in sys.path: |
28 | 30 | sys.path.append('..') |
— | — | @@ -64,7 +66,7 @@ |
65 | 67 | d[key] = extract_text(ns) |
66 | 68 | text = ns.text if ns.text != None else '' |
67 | 69 | try: |
68 | | - print key, text.encode(settings.encoding) |
| 70 | + print key, text.encode('utf-8') |
69 | 71 | except UnicodeEncodeError: |
70 | 72 | print key |
71 | 73 | return d |
— | — | @@ -77,7 +79,7 @@ |
78 | 80 | 2) Create a dictionary with the namespaces |
79 | 81 | ''' |
80 | 82 | buffer = cStringIO.StringIO() |
81 | | - wrapper = codecs.getwriter(settings.encoding)(buffer) |
| 83 | + wrapper = codecs.getwriter('utf-8')(buffer) |
82 | 84 | wrapper.write("<?xml version='1.0' encoding='UTF-8' ?>\n") |
83 | 85 | re_version = re.compile('\"\d\.\d\"') |
84 | 86 | for x, raw_data in enumerate(fh): |
— | — | @@ -99,42 +101,71 @@ |
100 | 102 | return namespaces, xml_namespace |
101 | 103 | |
102 | 104 | |
103 | | - |
104 | 105 | def read_input(fh): |
105 | | - buffer = cStringIO.StringIO() |
106 | | - wrapper = codecs.getwriter(settings.encoding)(buffer) |
107 | | - wrapper.write("<?xml version='1.0' encoding='UTF-8' ?>\n") |
108 | | - start_parsing = False |
| 106 | + context = cElementTree.iterparse(fh, events=('end',)) |
| 107 | + context = iter(context) |
109 | 108 | |
| 109 | + article = {} |
| 110 | + article['revisions'] = [] |
| 111 | + id = False |
| 112 | + namespace = '{http://www.mediawiki.org/xml/export-0.4/}' |
| 113 | + |
| 114 | + for event, elem in context: |
| 115 | + if event == 'end' and elem.tag == '%s%s' % (namespace, 'title'): |
| 116 | + article['title'] = elem |
| 117 | + elif event == 'end' and elem.tag == '%s%s' % (namespace, 'revision'): |
| 118 | + article['revisions'].append(elem) |
| 119 | + elif event == 'end' and elem.tag == '%s%s' % (namespace, 'id') and id == False: |
| 120 | + article['id'] = elem |
| 121 | + id = True |
| 122 | + elif event == 'end' and elem.tag == '%s%s' % (namespace, 'page'): |
| 123 | + yield article, 0 |
| 124 | + elem.clear() |
| 125 | + article = {} |
| 126 | + article['revisions'] = [] |
| 127 | + id = False |
| 128 | + elif event == 'end': |
| 129 | + elem.clear() |
| 130 | + |
| 131 | +#def read_input(fh): |
| 132 | +# buffer = cStringIO.StringIO() |
| 133 | +# wrapper = codecs.getwriter('utf-8')(buffer) |
| 134 | +# wrapper.write("<?xml version='1.0' encoding='UTF-8' ?>\n") |
| 135 | +# start_parsing = False |
| 136 | +# |
| 137 | +# for raw_data in fh: |
| 138 | +# if raw_data == '\n': |
| 139 | +# continue |
| 140 | +# if start_parsing == False and raw_data.find('<page>') > -1: |
| 141 | +# start_parsing = True |
| 142 | +# if start_parsing: |
| 143 | +# raw_data = ''.join(raw_data.strip()) |
| 144 | +# wrapper.write(raw_data) |
| 145 | +# if raw_data.find('</page>') > -1: |
| 146 | +# article = wrapper.getvalue() |
| 147 | +# size = len(article) |
| 148 | +# #article.encode('utf-8') |
| 149 | +# article = cElementTree.XML(article) |
| 150 | +# yield article, size |
| 151 | +# ''' |
| 152 | +# #This looks counter intuitive but Python continues with this |
| 153 | +# call after it has finished the yield statement |
| 154 | +# ''' |
| 155 | +# buffer = cStringIO.StringIO() |
| 156 | +# wrapper = codecs.getwriter('utf-8')(buffer) |
| 157 | +# wrapper.write("<?xml version='1.0' encoding='UTF-8' ?>\n") |
| 158 | +# fh.close() |
| 159 | + |
| 160 | + |
| 161 | +def debug(): |
| 162 | + #fh = codecs.open('c:\\wikimedia\\en\\wiki\dewiki-latest-stub-meta-history.xml', 'r', 'utf-8') |
| 163 | + filename = 'c:\\wikimedia\\en\\wiki\\enwiki-latest-stub-meta-history10.xml.gz' |
| 164 | + fh = gzip.GzipFile(filename, 'rb') |
| 165 | + |
110 | 166 | for raw_data in fh: |
111 | | - if raw_data == '\n': |
112 | | - continue |
113 | | - if start_parsing == False and raw_data.find('<page>') > -1: |
114 | | - start_parsing = True |
115 | | - if start_parsing: |
116 | | - raw_data = ''.join(raw_data.strip()) |
117 | | - wrapper.write(raw_data) |
118 | | - if raw_data.find('</page>') > -1: |
119 | | - article = wrapper.getvalue() |
120 | | - size = len(article) |
121 | | - #article.encode(settings.encoding) |
122 | | - article = cElementTree.XML(article) |
123 | | - yield article, size |
124 | | - ''' |
125 | | - #This looks counter intuitive but Python continues with this |
126 | | - call after it has finished the yield statement |
127 | | - ''' |
128 | | - buffer = cStringIO.StringIO() |
129 | | - wrapper = codecs.getwriter(settings.encoding)(buffer) |
130 | | - wrapper.write("<?xml version='1.0' encoding='UTF-8' ?>\n") |
131 | | - fh.close() |
| 167 | + print raw_data |
132 | 168 | |
133 | 169 | |
134 | | -def debug(): |
135 | | - fh = codecs.open('c:\\wikimedia\\en\\wiki\dewiki-latest-stub-meta-history.xml', 'r', 'utf-8') |
136 | | - for article in read_input(fh): |
137 | | - print article |
138 | | - extract_meta_information(fh) |
139 | 170 | fh.close() |
140 | 171 | |
141 | 172 | |
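
The new read_input replaces the hand-rolled <page> buffering with cElementTree.iterparse: it collects the title, the first <id> and all <revision> elements per page, yields the page dict, and clears elements as it goes; note that the export-0.4 namespace is hard-coded, and debug() now reads a gzipped dump directly. A self-contained sketch of the same streaming pattern (the namespace URI follows the hunk, everything else is illustrative):

    import xml.etree.cElementTree as cElementTree

    NS = '{http://www.mediawiki.org/xml/export-0.4/}'   # namespace hard-coded in the new read_input

    def iter_pages(fh):
        """Stream (title, page_id, revision_count) per <page> from a stub dump,
        clearing elements as they are consumed so memory stays flat.  This is
        an illustrative sketch, not the read_input generator above."""
        title, page_id, revisions = None, None, 0
        for event, elem in cElementTree.iterparse(fh, events=('end',)):
            if elem.tag == NS + 'title':
                title = elem.text
            elif elem.tag == NS + 'id' and page_id is None:
                page_id = elem.text                      # first <id> under <page> is the page id
            elif elem.tag == NS + 'revision':
                revisions += 1
            elif elem.tag == NS + 'page':
                yield title, page_id, revisions
                title, page_id, revisions = None, None, 0
            elem.clear()                                 # drop the subtree we just handled

    # Hypothetical usage, mirroring debug() above:
    # import gzip
    # fh = gzip.GzipFile('enwiki-latest-stub-meta-history10.xml.gz', 'rb')
    # for title, page_id, n in iter_pages(fh):
    #     print(title, page_id, n)
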
Index: trunk/tools/editor_trends/etl/store.py |
— | — | @@ -64,7 +64,7 @@ |
65 | 65 | editor_cache.add(prev_editor, 'NEXT') |
66 | 66 | |
67 | 67 | data = self.prepare_data(line) |
68 | | - print editor, data['username'] |
| 68 | + #print editor, data['username'] |
69 | 69 | editor_cache.add(editor, data) |
70 | 70 | prev_editor = editor |
71 | 71 | fh.close() |
— | — | @@ -168,7 +168,7 @@ |
169 | 169 | while True: |
170 | 170 | while ppills > 0: |
171 | 171 | try: |
172 | | - res = result.get(block=True) |
| 172 | + res = result.get(block=False) |
173 | 173 | if res == True: |
174 | 174 | pbar.update(pbar.currval + 1) |
175 | 175 | else: |
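
A non-blocking get() returns immediately and raises Queue.Empty when nothing is waiting, so the surrounding loop has to catch that exception instead of blocking forever (the same exception analyzer.py now imports explicitly). A minimal sketch of the polling pattern with a throwaway queue rather than the actual result queue used here:

    try:
        from Queue import Queue, Empty       # Python 2, matching the analyzer.py import
    except ImportError:
        from queue import Queue, Empty       # Python 3 spelling

    result = Queue()
    result.put(True)

    processed = 0
    while True:
        try:
            res = result.get(block=False)    # returns immediately instead of waiting
        except Empty:
            break                            # nothing queued right now; stop polling
        if res is True:
            processed += 1
    print(processed)                         # 1
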
Index: trunk/tools/editor_trends/etl/extracter.py |
— | — | @@ -237,7 +237,7 @@ |
238 | 238 | for tag in tags: |
239 | 239 | el = revision.find('%s' % tag) |
240 | 240 | if el == None: |
241 | | - #print cElementTree.tostring(revision, settings.encoding) |
| 241 | + #print cElementTree.tostring(revision, 'utf-8') |
242 | 242 | del vars[x] |
243 | 243 | break |
244 | 244 | for function in tags[tag].keys(): |
Index: trunk/tools/editor_trends/etl/transformer.py |
— | — | @@ -210,8 +210,8 @@ |
211 | 211 | |
212 | 212 | def determine_edit_volume(edits, first_year, final_year): |
213 | 213 | ''' |
214 | | - This function counts the number of edits by year by month by namespace for |
215 | | - a particular editor. |
| 214 | + This function counts the number of characters added and removed by year |
| 215 | + by month by namespace for a particular editor. |
216 | 216 | ''' |
217 | 217 | dc = shaper.create_datacontainer(first_year, final_year) |
218 | 218 | dc = shaper.add_months_to_datacontainer(dc, 'dict') |
— | — | @@ -222,6 +222,7 @@ |
223 | 223 | dc[year][month].setdefault(ns, {}) |
224 | 224 | dc[year][month][ns].setdefault('added', 0) |
225 | 225 | dc[year][month][ns].setdefault('removed', 0) |
| 226 | + print edit |
226 | 227 | if edit['delta'] < 0: |
227 | 228 | dc[year][month][ns]['removed'] += edit['delta'] |
228 | 229 | elif edit['delta'] > 0: |
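
Per the corrected docstring, determine_edit_volume tallies characters added and removed (the sign of each edit's delta) per year, month and namespace. A compact sketch of that bookkeeping, assuming each edit carries 'year', 'month', 'ns' and an integer 'delta' as the hunk implies, with plain dicts replacing the shaper-built datacontainer:

    from collections import defaultdict

    def edit_volume(edits):
        """Sum characters added and removed per (year, month, namespace).
        Illustrative stand-in for the transformer's datacontainer logic,
        not the real function."""
        dc = defaultdict(lambda: {'added': 0, 'removed': 0})
        for edit in edits:
            key = (edit['year'], edit['month'], edit['ns'])
            if edit['delta'] > 0:
                dc[key]['added'] += edit['delta']
            elif edit['delta'] < 0:
                dc[key]['removed'] += edit['delta']   # kept negative, as in the hunk
        return dc

    sample = [{'year': '2005', 'month': '7', 'ns': 0, 'delta': 120},
              {'year': '2005', 'month': '7', 'ns': 0, 'delta': -45}]
    print(dict(edit_volume(sample)))   # {('2005', '7', 0): {'added': 120, 'removed': -45}}
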
Index: trunk/tools/editor_trends/classes/settings.py |
— | — | @@ -55,7 +55,6 @@ |
56 | 56 | |
57 | 57 | #Date format as used by Erik Zachte |
58 | 58 | self.date_format = '%Y-%m-%d' |
59 | | - |
60 | 59 | # Timestamp format as generated by the MediaWiki dumps |
61 | 60 | self.timestamp_format = '%Y-%m-%dT%H:%M:%SZ' |
62 | 61 | self.timestamp_server = '%a, %d %b %Y %H:%M:%S %Z' |
— | — | @@ -153,7 +152,6 @@ |
154 | 153 | try: |
155 | 154 | error = False |
156 | 155 | os.makedirs(directory) |
157 | | - |
158 | 156 | except IOError, error: |
159 | 157 | pass |
160 | 158 | except OSError, error: |
— | — | @@ -187,7 +185,6 @@ |
188 | 186 | path = path + program |
189 | 187 | elif self.platform == 'Linux': |
190 | 188 | path = self.detect_linux_program(program) |
191 | | - |
192 | 189 | return path |
193 | 190 | |
194 | 191 | def determine_max_filehandles_open(self): |
Index: trunk/tools/editor_trends/classes/runtime_settings.py |
— | — | @@ -163,7 +163,7 @@ |
164 | 164 | max_length_key = max([len(key) for key in about.keys()]) |
165 | 165 | print 'Final settings after parsing command line arguments:' |
166 | 166 | for ab in about: |
167 | | - print '%s: %s' % (ab.rjust(max_length_key), about[ab].encode(self.encoding)) |
| 167 | + print '%s: %s' % (ab.rjust(max_length_key), about[ab].encode('utf-8')) |
168 | 168 | |
169 | 169 | |
170 | 170 | def get_value(self, key): |
Index: trunk/tools/editor_trends/classes/dataset.py |
— | — | @@ -459,7 +459,7 @@ |
460 | 460 | fh = file_utils.create_txt_filehandle(settings.dataset_location, |
461 | 461 | self.filename, |
462 | 462 | 'w', |
463 | | - settings.encoding) |
| 463 | + 'utf-8') |
464 | 464 | file_utils.write_list_to_csv(headers, fh, recursive=False, newline=True) |
465 | 465 | file_utils.write_list_to_csv(data, fh, recursive=False, newline=True, |
466 | 466 | format=self.format, |
Index: trunk/tools/editor_trends/bots/detector.py |
— | — | @@ -93,7 +93,7 @@ |
94 | 94 | keys = ['name', 'verified', 'projects'] |
95 | 95 | bots = file_utils.create_dict_from_csv_file(settings.csv_location, |
96 | 96 | 'bots_ids.csv', |
97 | | - settings.encoding, |
| 97 | + 'utf-8', |
98 | 98 | keys) |
99 | 99 | mongo = db.init_mongo_db('bots') |
100 | 100 | collection = mongo['ids'] |
— | — | @@ -122,7 +122,7 @@ |
123 | 123 | |
124 | 124 | def write_bot_list_to_csv(bots, keys): |
125 | 125 | fh = file_utils.create_txt_filehandle(settings.csv_location, 'bots_ids.csv', |
126 | | - 'w', settings.encoding) |
| 126 | + 'w', 'utf-8') |
127 | 127 | bot_dict = convert_object_to_dict(bots, exclude=['time', 'written']) |
128 | 128 | for bot in bot_dict: |
129 | 129 | bot = bot_dict[bot] |
— | — | @@ -194,7 +194,7 @@ |
195 | 195 | output_file = 'bots_ids.csv' |
196 | 196 | files = file_utils.retrieve_file_list(input_txt, 'txt', mask=None) |
197 | 197 | input_queue = pc.load_queue(files, poison_pill=True) |
198 | | - bots = read_bots_csv_file(settings.csv_location, 'Bots.csv', settings.encoding, manager=manager) |
| 198 | + bots = read_bots_csv_file(settings.csv_location, 'Bots.csv', 'utf-8', manager=manager) |
199 | 199 | for file in files: |
200 | 200 | tasks.put(consumers.TXTFile(file, input_txt, settings.csv_location, output_file, target, bots=bots, keys=keys)) |
201 | 201 | |
— | — | @@ -229,7 +229,7 @@ |
230 | 230 | keys = bots.keys() |
231 | 231 | for key in keys: |
232 | 232 | try: |
233 | | - print '%s' % key.encode(settings.encoding) |
| 233 | + print '%s' % key.encode('utf-8') |
234 | 234 | except: |
235 | 235 | pass |
236 | 236 | else: |
— | — | @@ -238,7 +238,7 @@ |
239 | 239 | |
240 | 240 | |
241 | 241 | def bot_training_dataset(bots): |
242 | | - fh = file_utils.create_txt_filehandle(settings.csv_location, 'training_bots.csv', 'w', settings.encoding) |
| 242 | + fh = file_utils.create_txt_filehandle(settings.csv_location, 'training_bots.csv', 'w', 'utf-8') |
243 | 243 | keys = bots.keys() |
244 | 244 | for key in keys: |
245 | 245 | bot = bots.get(key) |
Index: trunk/tools/editor_trends/code-snippets/chunker.py |
— | — | @@ -71,7 +71,7 @@ |
72 | 72 | |
73 | 73 | #def load_namespace(language): |
74 | 74 | # file = '%s_ns.json' % language |
75 | | -# fh = file_utils.create_txt_filehandle(settings.namespace_location, file, 'r', settings.encoding) |
| 75 | +# fh = file_utils.create_txt_filehandle(settings.namespace_location, file, 'r', 'utf-8') |
76 | 76 | # ns = json.load(fh) |
77 | 77 | # fh.close() |
78 | 78 | # ns = ns['query']['namespaces'] |
— | — | @@ -146,14 +146,14 @@ |
147 | 147 | if not fh: |
148 | 148 | counter = 0 |
149 | 149 | path = os.path.join(output, '%s.%s' % (counter, format)) |
150 | | - fh = codecs.open(path, 'w', encoding=settings.encoding) |
| 150 | + fh = codecs.open(path, 'w', encoding='utf-8') |
151 | 151 | return fh, counter, False |
152 | 152 | elif (fh.tell() + size) > settings.max_xmlfile_size: |
153 | 153 | print 'Created chunk %s' % (counter + 1) |
154 | 154 | fh.close |
155 | 155 | counter += 1 |
156 | 156 | path = os.path.join(output, '%s.%s' % (counter, format)) |
157 | | - fh = codecs.open(path, 'w', encoding=settings.encoding) |
| 157 | + fh = codecs.open(path, 'w', encoding='utf-8') |
158 | 158 | return fh, counter, True |
159 | 159 | else: |
160 | 160 | return fh, counter, False |
— | — | @@ -248,7 +248,7 @@ |
249 | 249 | root.clear() # when done parsing a section clear the tree to safe memory |
250 | 250 | |
251 | 251 | except SyntaxError: |
252 | | - f = file_utils.create_txt_filehandle(settings.log_location, 'split_xml', 'w', settings.encoding) |
| 252 | + f = file_utils.create_txt_filehandle(settings.log_location, 'split_xml', 'w', 'utf-8') |
253 | 253 | f.write(cElementTree.tostring(elem)) |
254 | 254 | f.close() |
255 | 255 | |
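
Only the encoding argument changes in this hunk, but the surrounding snippet is the chunk-rotation logic: it opens numbered output files and switches to a fresh one whenever fh.tell() plus the size of the next write would exceed settings.max_xmlfile_size (as an aside, the snippet's fh.close lacks parentheses, so the old handle is never actually closed). A simplified, self-contained sketch of that rotation with the size limit and paths as plain parameters:

    import codecs
    import os

    def next_filehandle(fh, counter, size, output, max_size, fmt='xml'):
        """Return (filehandle, counter, rotated).  Opens the first utf-8 chunk
        when no handle exists yet, and rotates to a new chunk when writing
        `size` more bytes would push the current file past `max_size`.
        Simplified sketch of the chunker's behaviour, not the snippet itself."""
        if not fh:
            path = os.path.join(output, '%s.%s' % (counter, fmt))
            return codecs.open(path, 'w', encoding='utf-8'), counter, False
        if fh.tell() + size > max_size:
            fh.close()
            counter += 1
            path = os.path.join(output, '%s.%s' % (counter, fmt))
            return codecs.open(path, 'w', encoding='utf-8'), counter, True
        return fh, counter, False
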
Index: trunk/tools/editor_trends/code-snippets/cohort_charts.py |
— | — | @@ -28,7 +28,7 @@ |
29 | 29 | def prepare_cohort_dataset(dbname, filename): |
30 | 30 | dataset = file_utils.load_object(settings.binary_location, '%s_%s' % (dbname, filename)) |
31 | 31 | filename = filename.replace('.bin', '.txt') |
32 | | - fh = file_utils.create_txt_filehandle(settings.dataset_location, '%s_%s' % (dbname, filename), 'w', settings.encoding) |
| 32 | + fh = file_utils.create_txt_filehandle(settings.dataset_location, '%s_%s' % (dbname, filename), 'w', 'utf-8') |
33 | 33 | |
34 | 34 | years = dataset.keys() |
35 | 35 | years.sort() |
Index: trunk/tools/editor_trends/code-snippets/cohort_confidence_intervals.py |
— | — | @@ -32,7 +32,7 @@ |
33 | 33 | # mongo = db.init_mongo_db(dbname) |
34 | 34 | # editors = mongo['dataset'] |
35 | 35 | # name = dbname + '_edits_by_month.csv' |
36 | | -# fh = file_utils.create_txt_filehandle(settings.dataset_location, name, 'w', settings.encoding) |
| 36 | +# fh = file_utils.create_txt_filehandle(settings.dataset_location, name, 'w', 'utf-8') |
37 | 37 | # x = 0 |
38 | 38 | # vars_to_expand = ['monthly_edits'] |
39 | 39 | # while True: |
Index: trunk/tools/editor_trends/code-snippets/match_talkpage_article_old.py |
— | — | @@ -63,7 +63,7 @@ |
64 | 64 | articles = {} |
65 | 65 | talks = {} |
66 | 66 | for file in files: |
67 | | - fh = file_utils.create_txt_filehandle(input, file, 'r', settings.encoding) |
| 67 | + fh = file_utils.create_txt_filehandle(input, file, 'r', 'utf-8') |
68 | 68 | for line in fh: |
69 | 69 | line = line.strip() |
70 | 70 | id, article = line.split('\t') |
— | — | @@ -118,7 +118,7 @@ |
119 | 119 | def debug_article_to_talk(): |
120 | 120 | input = os.path.join(settings.input_location, 'en', 'wiki', 'chunks', '0.xml') |
121 | 121 | output = os.path.join(settings.input_location, 'en', 'wiki', 'txt', 'test.txt') |
122 | | - f = codecs.open(output, 'w', encoding=settings.encoding) |
| 122 | + f = codecs.open(output, 'w', encoding='utf-8') |
123 | 123 | fh = open(input, 'r') |
124 | 124 | data = xml.read_input(fh) |
125 | 125 | for raw_data in data: |
Index: trunk/tools/editor_trends/code-snippets/exporter.py |
— | — | @@ -86,7 +86,7 @@ |
87 | 87 | editors = mongo[collection + '_dataset'] |
88 | 88 | vars = ['monthly_edits'] |
89 | 89 | name = dbname + '_long_editors.csv' |
90 | | - #fh = file_utils.create_txt_filehandle(settings.dataset_location, name, 'w', settings.encoding) |
| 90 | + #fh = file_utils.create_txt_filehandle(settings.dataset_location, name, 'w', 'utf-8') |
91 | 91 | vars_to_expand = [] |
92 | 92 | keys = dict([(var, 1) for var in vars]) |
93 | 93 | ld = LongDataset(vars) |
— | — | @@ -183,7 +183,7 @@ |
184 | 184 | cohort_charts.prepare_cohort_dataset(dbname, filename) |
185 | 185 | |
186 | 186 | filename = '_cohort_data_forward_histogram.csv' |
187 | | - fh = file_utils.create_txt_filehandle(settings.dataset_location, '%s_%s' % (dbname, filename), 'w', settings.encoding) |
| 187 | + fh = file_utils.create_txt_filehandle(settings.dataset_location, '%s_%s' % (dbname, filename), 'w', 'utf-8') |
188 | 188 | for year in data: |
189 | 189 | for month in data[year]: |
190 | 190 | obs = data[year][month].keys() |
— | — | @@ -260,7 +260,7 @@ |
261 | 261 | mongo = db.init_mongo_db(dbname) |
262 | 262 | editors = mongo[collection + '_dataset'] |
263 | 263 | name = dbname + '_wide_editors.csv' |
264 | | - fh = file_utils.create_txt_filehandle(settings.dataset_location, name, 'a', settings.encoding) |
| 264 | + fh = file_utils.create_txt_filehandle(settings.dataset_location, name, 'a', 'utf-8') |
265 | 265 | x = 0 |
266 | 266 | vars_to_expand = ['edits', 'edits_by_year', 'articles_by_year'] |
267 | 267 | while True: |
Index: trunk/tools/editor_trends/code-snippets/sqlite/sqlite_logic.py |
— | — | @@ -62,9 +62,9 @@ |
63 | 63 | while True: |
64 | 64 | try: |
65 | 65 | chunk = data_queue.get(block=False) |
66 | | - contributor = chunk['contributor'].encode(settings.encoding) |
| 66 | + contributor = chunk['contributor'].encode('utf-8') |
67 | 67 | article = chunk['article'] |
68 | | - timestamp = chunk['timestamp'].encode(settings.encoding) |
| 68 | + timestamp = chunk['timestamp'].encode('utf-8') |
69 | 69 | bot = chunk['bot'] |
70 | 70 | values.append((contributor, article, timestamp, bot)) |
71 | 71 | |
— | — | @@ -96,7 +96,7 @@ |
97 | 97 | db.create_tables(cursor, db_settings.BOT_TABLE) |
98 | 98 | values = [] |
99 | 99 | fields = [field[0] for field in db_settings.BOT_TABLE['bots']] |
100 | | - for line in file_utils.read_data_from_csv('data/csv/StatisticsBots.csv', settings.encoding): |
| 100 | + for line in file_utils.read_data_from_csv('data/csv/StatisticsBots.csv', 'utf-8'): |
101 | 101 | line = line.split(',') |
102 | 102 | row = [] |
103 | 103 | for x, (field, value) in enumerate(zip(fields, line)): |
Index: trunk/tools/editor_trends/code-snippets/count_editors.py |
— | — | @@ -17,7 +17,7 @@ |
18 | 18 | files = file_utils.retrieve_file_list(input, 'txt', mask='merged_final') |
19 | 19 | editors = {} |
20 | 20 | for file in files: |
21 | | - fh = file_utils.create_txt_filehandle(input, file, 'r', settings.encoding) |
| 21 | + fh = file_utils.create_txt_filehandle(input, file, 'r', 'utf-8') |
22 | 22 | for line in fh: |
23 | 23 | author = line.split('\t')[0] |
24 | 24 | if author not in editors: |
Property changes on: trunk/tools/editor_trends |
___________________________________________________________________ |
Modified: svn:ignore |
25 | 25 | - wikistats |
zips |
notes.txt |
*.pyc |
*.xml |
*.db |
*.bin |
*.zip |
*.csv |
datasets |
errors |
.settings |
.project |
.pydevproject |
wiki.cfg |
fabric.py |
fabfile.py |
deployment |
data |
26 | 26 | + wikistats |
zips |
notes.txt |
*.pyc |
*.xml |
*.db |
*.bin |
*.zip |
*.csv |
datasets |
errors |
.settings |
.project |
.pydevproject |
wiki.cfg |
fabric.py |
fabfile.py |
deployment |
data |
libs |