r81920 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r81919 | r81920 | r81921 >
Date:22:10, 10 February 2011
Author:diederik
Status:deferred
Tags:
Comment:
Small fixes
Modified paths:
  • /trunk/tools/editor_trends/etl/sort.py (modified) (history)
  • /trunk/tools/editor_trends/etl/store.py (modified) (history)
  • /trunk/tools/editor_trends/utils/data_converter.py (modified) (history)
  • /trunk/tools/editor_trends/utils/file_utils.py (modified) (history)
  • /trunk/tools/editor_trends/wikitree/parser.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/wikitree/parser.py
@@ -98,17 +98,7 @@
9999 return namespaces, xml_namespace
100100
101101
102 -def read_unicode_text(fh):
103 - data = []
104 - try:
105 - for line in fh:
106 - line = line.strip()
107 - data.append(line)
108 - except UnicodeDecodeError, e:
109 - print e
110102
111 - return data
112 -
113103 def read_input(fh):
114104 buffer = cStringIO.StringIO()
115105 wrapper = codecs.getwriter(settings.encoding)(buffer)
Index: trunk/tools/editor_trends/etl/store.py
@@ -36,7 +36,7 @@
3737 location = os.path.join(settings.input_location, language_code, project)
3838 fh = file_utils.create_txt_filehandle(location, 'articles.csv', 'r', settings.encoding)
3939 headers = ['id', 'title']
40 - data = fh.readlines()
 40+ data = file_utils.read_unicode_text(fh)
4141 fh.close()
4242
4343 dbname = '%s%s' % (language_code, project)
@@ -45,9 +45,12 @@
4646 collection = mongo[collection]
4747
4848 articles = {}
49 - for d in data:
50 - for header in headers:
51 - articles[header] = d
 49+ for x, d in enumerate(data):
 50+ d = d.split('\t')
 51+ x = str(x)
 52+ articles[x] = {}
 53+ for k, v in zip(headers, d):
 54+ articles[x][k] = v
5255
5356 collection.insert(articles)
5457
@@ -128,3 +131,7 @@
129132 tasks.join()
130133
131134
 135+def debug():
 136+ store_articles('wiki', 'cs')
 137+if __name__ == '__main__':
 138+ debug()
Index: trunk/tools/editor_trends/etl/sort.py
@@ -30,7 +30,7 @@
3131
3232 from utils import file_utils
3333 from utils import messages
34 -import wikitree.parser
 34+#import wikitree.parser
3535
3636 def quick_sort(obs):
3737 '''
@@ -124,7 +124,7 @@
125125 settings.encoding)
126126 #print fh
127127 #data = fh.readlines()
128 - data = wikitree.parser.read_unicode_text(fh)
 128+ data = file_utils.read_unicode_text(fh)
129129 fh.close()
130130 data = [d.strip() for d in data]
131131 data = [d.split('\t') for d in data]
Index: trunk/tools/editor_trends/utils/file_utils.py
@@ -133,7 +133,7 @@
134134 return 'wb'
135135
136136
137 -def write_list_to_csv(data, fh, recursive=False, newline=True):
 137+def write_list_to_csv(data, fh, recursive=False, newline=True, format='long'):
138138 '''
139139 @data is a list which can contain other lists that will be written as a
140140 single line to a textfile
@@ -158,7 +158,7 @@
159159 if len(d) == len(data[x]):
160160 fh.write('\n')
161161 elif isinstance(d, dict):
162 - tab = write_dict_to_csv(d, fh, d.keys(), write_key=False, format=format)
 162+ tab = write_dict_to_csv(d, fh, d.keys(), write_key=True, format=format)
163163 else:
164164 fh.write('%s' % d)
165165 tab = True
@@ -182,8 +182,6 @@
183183 fh.write('%s\t%s\n' % (key, d))
184184 elif isinstance(data[key], dict):
185185 write_dict_to_csv(data[key], fh, data[key].keys(), write_key=False, format=format)
186 -# for d in data[key]:
187 -# fh.write('%s\t%s\t%s\n' % (key, d, data[key][d]))
188186 else:
189187 fh.write('%s\n' % (data[key]))
190188 elif format == 'wide':
@@ -191,11 +189,9 @@
192190 if write_key:
193191 fh.write('%s\t' % key)
194192 if isinstance(data[key], list):
195 - #if type(data[key]) == type([]):
196193 for d in data[key]:
197194 fh.write('%s\t')
198195 elif isinstance(data[key], list):
199 - #elif type(data[key]) == type({}):
200196 write_dict_to_csv(data[key], fh, data[key].keys(), write_key=False, format=format)
201197 else:
202198 fh.write('%s\t' % (data[key]))
Index: trunk/tools/editor_trends/utils/data_converter.py
@@ -37,91 +37,97 @@
3838
3939 def convert_dataset_to_lists(ds, caller):
4040 assert ds.format == 'long' or ds.format == 'wide', 'Format should either be long or wide.'
41 - data, all_keys = [], []
 41+ data = []
4242 for var in ds.variables:
4343 if caller == 'django':
4444 var = ds.variables[var]
4545 else:
4646 var = getattr(ds, var)
4747
48 - for date in var['obs'].keys():
49 - datum = convert_seconds_to_date(float(date))
50 - if ds.format == 'long':
 48+ if ds.format == 'long':
 49+ for obs in var.obs.values():
5150 o = []
52 - else:
53 - o = {}
54 - o['date'] = datum
55 -
56 - for obs in var['obs'][date]['data']:
57 - if ds.format == 'long':
58 - if isinstance(var['obs'][date]['data'], dict):
59 - #for subdata in var['obs'][date]['data']:
60 - for k, v in var['obs'][date]['data'][obs]['data'].iteritems():
61 - o.append([datum, obs, k, v])
62 - else:
63 - o.append([datum, obs, var['obs'][date]['data'][obs]])
64 - data.extend(o)
65 - o = []
66 - else:
67 - o[obs] = var['obs'][date]['data'][obs]
68 - if ds.format == 'wide':
 51+ o.append(obs.get_date_range())
 52+ for prop in obs.props:
 53+ o.append(getattr(obs, prop))
 54+ o.append(obs.data)
6955 data.append(o)
70 - if ds.format == 'wide':
71 - #Make sure that each variable / observation combination exists.
72 - all_keys = get_all_keys(data)
73 - data = make_data_rectangular(data, all_keys)
74 - data = sort(data, all_keys)
75 - return data, all_keys
 56+ else:
 57+ '''
 58+ This only works for observations with one variable and time_unit==year
 59+ '''
 60+ props = get_all_props(var)
 61+ for year in xrange(var.min_year, var.max_year):
 62+ for prop in props:
 63+ yaxis = get_all_keys(var, prop)
 64+ o = [0 for y in yaxis]
 65+ for x, y in enumerate(yaxis):
 66+ for obs in var.obs.values():
 67+ if obs.t1.year == year and getattr(obs, prop) == y:
 68+ o[x] += obs.data
 69+ o = [year] + o
 70+ data.append(o)
 71+ return data
7672
7773
78 -def add_headers(ds, all_keys):
 74+def add_headers(ds):
7975 assert ds.format == 'long' or ds.format == 'wide', 'Format should either be long or wide.'
8076 headers = []
8177 if ds.format == 'long':
8278 headers.append('date')
8379 for var in ds:
8480 if ds.format == 'long':
85 - headers.extend([var.time_unit, var.name])
 81+ headers.extend([var.name])
8682 else:
87 - for key in all_keys:
88 - header = '%s_%s' % (key, var.name)
89 - headers.append(header)
 83+ props = get_all_props(var)
 84+ for prop in props:
 85+ all_keys = get_all_keys(var, prop)
 86+ for key in all_keys:
 87+ header = '%s_%s' % (key, var.name)
 88+ headers.append(header)
9089 return headers
9190
9291
93 -def make_data_rectangular(data, all_keys):
94 - for i, d in enumerate(data):
95 - for key in all_keys:
96 - if key not in d:
97 - d[key] = 0
98 - data[i] = d
99 - return data
 92+#def make_data_rectangular(data, all_keys):
 93+# for i, d in enumerate(data):
 94+# for key in all_keys:
 95+# if key not in d:
 96+# d[key] = 0
 97+# data[i] = d
 98+# return data
10099
101100
102 -def get_all_keys(data):
 101+def get_all_props(var):
103102 all_keys = []
104 - for d in data:
105 - for key in d:
106 - if key not in all_keys:
107 - all_keys.append(key)
 103+ for obs in var.obs.values():
 104+ for prop in obs.props:
 105+ if prop not in all_keys:
 106+ all_keys.append(prop)
 107+ return all_keys
 108+
 109+
 110+def get_all_keys(var, prop):
 111+ all_keys = []
 112+ for obs in var.obs.values():
 113+ v = getattr(obs, prop)
 114+ if v not in all_keys:
 115+ all_keys.append(v)
108116 all_keys.sort()
109 - all_keys.insert(0, all_keys[-1])
110 - del all_keys[-1]
111117 return all_keys
112118
113119
114 -def sort(data, all_keys):
115 - dates = [date['date'] for date in data]
116 - dates.sort()
117 - cube = []
118 - for date in dates:
119 - for i, d in enumerate(data):
120 - if d['date'] == date:
121 - raw_data = d
122 - del data[i]
123 - break
124 - obs = []
125 - for key in all_keys:
126 - obs.append(raw_data[key])
127 - cube.append(obs)
128 - return cube
 120+#def sort(data, all_keys):
 121+# dates = [date['date'] for date in data]
 122+# dates.sort()
 123+# cube = []
 124+# for date in dates:
 125+# for i, d in enumerate(data):
 126+# if d['date'] == date:
 127+# raw_data = d
 128+# del data[i]
 129+# break
 130+# obs = []
 131+# for key in all_keys:
 132+# obs.append(raw_data[key])
 133+# cube.append(obs)
 134+# return cube

Status & tagging log