Index: trunk/tools/editor_trends/wikitree/parser.py |
— | — | @@ -98,17 +98,7 @@ |
99 | 99 | return namespaces, xml_namespace |
100 | 100 | |
101 | 101 | |
102 | | -def read_unicode_text(fh): |
103 | | - data = [] |
104 | | - try: |
105 | | - for line in fh: |
106 | | - line = line.strip() |
107 | | - data.append(line) |
108 | | - except UnicodeDecodeError, e: |
109 | | - print e |
110 | 102 | |
111 | | - return data |
112 | | - |
113 | 103 | def read_input(fh): |
114 | 104 | buffer = cStringIO.StringIO() |
115 | 105 | wrapper = codecs.getwriter(settings.encoding)(buffer) |
Index: trunk/tools/editor_trends/etl/store.py |
— | — | @@ -36,7 +36,7 @@ |
37 | 37 | location = os.path.join(settings.input_location, language_code, project) |
38 | 38 | fh = file_utils.create_txt_filehandle(location, 'articles.csv', 'r', settings.encoding) |
39 | 39 | headers = ['id', 'title'] |
40 | | - data = fh.readlines() |
| 40 | + data = file_utils.read_unicode_text(fh) |
41 | 41 | fh.close() |
42 | 42 | |
43 | 43 | dbname = '%s%s' % (language_code, project) |
— | — | @@ -45,9 +45,12 @@ |
46 | 46 | collection = mongo[collection] |
47 | 47 | |
48 | 48 | articles = {} |
49 | | - for d in data: |
50 | | - for header in headers: |
51 | | - articles[header] = d |
| 49 | + for x, d in enumerate(data): |
| 50 | + d = d.split('\t') |
| 51 | + x = str(x) |
| 52 | + articles[x] = {} |
| 53 | + for k, v in zip(headers, d): |
| 54 | + articles[x][k] = v |
52 | 55 | |
53 | 56 | collection.insert(articles) |
54 | 57 | |
— | — | @@ -128,3 +131,7 @@ |
129 | 132 | tasks.join() |
130 | 133 | |
131 | 134 | |
| 135 | +def debug(): |
| 136 | + store_articles('wiki', 'cs') |
| 137 | +if __name__ == '__main__': |
| 138 | + debug() |
Index: trunk/tools/editor_trends/etl/sort.py |
— | — | @@ -30,7 +30,7 @@ |
31 | 31 | |
32 | 32 | from utils import file_utils |
33 | 33 | from utils import messages |
34 | | -import wikitree.parser |
| 34 | +#import wikitree.parser |
35 | 35 | |
36 | 36 | def quick_sort(obs): |
37 | 37 | ''' |
— | — | @@ -124,7 +124,7 @@ |
125 | 125 | settings.encoding) |
126 | 126 | #print fh |
127 | 127 | #data = fh.readlines() |
128 | | - data = wikitree.parser.read_unicode_text(fh) |
| 128 | + data = file_utils.read_unicode_text(fh) |
129 | 129 | fh.close() |
130 | 130 | data = [d.strip() for d in data] |
131 | 131 | data = [d.split('\t') for d in data] |
Index: trunk/tools/editor_trends/utils/file_utils.py |
— | — | @@ -133,7 +133,7 @@ |
134 | 134 | return 'wb' |
135 | 135 | |
136 | 136 | |
137 | | -def write_list_to_csv(data, fh, recursive=False, newline=True): |
| 137 | +def write_list_to_csv(data, fh, recursive=False, newline=True, format='long'): |
138 | 138 | ''' |
139 | 139 | @data is a list which can contain other lists that will be written as a |
140 | 140 | single line to a textfile |
— | — | @@ -158,7 +158,7 @@ |
159 | 159 | if len(d) == len(data[x]): |
160 | 160 | fh.write('\n') |
161 | 161 | elif isinstance(d, dict): |
162 | | - tab = write_dict_to_csv(d, fh, d.keys(), write_key=False, format=format) |
| 162 | + tab = write_dict_to_csv(d, fh, d.keys(), write_key=True, format=format) |
163 | 163 | else: |
164 | 164 | fh.write('%s' % d) |
165 | 165 | tab = True |
— | — | @@ -182,8 +182,6 @@ |
183 | 183 | fh.write('%s\t%s\n' % (key, d)) |
184 | 184 | elif isinstance(data[key], dict): |
185 | 185 | write_dict_to_csv(data[key], fh, data[key].keys(), write_key=False, format=format) |
186 | | -# for d in data[key]: |
187 | | -# fh.write('%s\t%s\t%s\n' % (key, d, data[key][d])) |
188 | 186 | else: |
189 | 187 | fh.write('%s\n' % (data[key])) |
190 | 188 | elif format == 'wide': |
— | — | @@ -191,11 +189,9 @@ |
192 | 190 | if write_key: |
193 | 191 | fh.write('%s\t' % key) |
194 | 192 | if isinstance(data[key], list): |
195 | | - #if type(data[key]) == type([]): |
196 | 193 | for d in data[key]: |
197 | 194 | fh.write('%s\t') |
198 | 195 | elif isinstance(data[key], list): |
199 | | - #elif type(data[key]) == type({}): |
200 | 196 | write_dict_to_csv(data[key], fh, data[key].keys(), write_key=False, format=format) |
201 | 197 | else: |
202 | 198 | fh.write('%s\t' % (data[key])) |
Index: trunk/tools/editor_trends/utils/data_converter.py |
— | — | @@ -37,91 +37,97 @@ |
38 | 38 | |
39 | 39 | def convert_dataset_to_lists(ds, caller): |
40 | 40 | assert ds.format == 'long' or ds.format == 'wide', 'Format should either be long or wide.' |
41 | | - data, all_keys = [], [] |
| 41 | + data = [] |
42 | 42 | for var in ds.variables: |
43 | 43 | if caller == 'django': |
44 | 44 | var = ds.variables[var] |
45 | 45 | else: |
46 | 46 | var = getattr(ds, var) |
47 | 47 | |
48 | | - for date in var['obs'].keys(): |
49 | | - datum = convert_seconds_to_date(float(date)) |
50 | | - if ds.format == 'long': |
| 48 | + if ds.format == 'long': |
| 49 | + for obs in var.obs.values(): |
51 | 50 | o = [] |
52 | | - else: |
53 | | - o = {} |
54 | | - o['date'] = datum |
55 | | - |
56 | | - for obs in var['obs'][date]['data']: |
57 | | - if ds.format == 'long': |
58 | | - if isinstance(var['obs'][date]['data'], dict): |
59 | | - #for subdata in var['obs'][date]['data']: |
60 | | - for k, v in var['obs'][date]['data'][obs]['data'].iteritems(): |
61 | | - o.append([datum, obs, k, v]) |
62 | | - else: |
63 | | - o.append([datum, obs, var['obs'][date]['data'][obs]]) |
64 | | - data.extend(o) |
65 | | - o = [] |
66 | | - else: |
67 | | - o[obs] = var['obs'][date]['data'][obs] |
68 | | - if ds.format == 'wide': |
| 51 | + o.append(obs.get_date_range()) |
| 52 | + for prop in obs.props: |
| 53 | + o.append(getattr(obs, prop)) |
| 54 | + o.append(obs.data) |
69 | 55 | data.append(o) |
70 | | - if ds.format == 'wide': |
71 | | - #Make sure that each variable / observation combination exists. |
72 | | - all_keys = get_all_keys(data) |
73 | | - data = make_data_rectangular(data, all_keys) |
74 | | - data = sort(data, all_keys) |
75 | | - return data, all_keys |
| 56 | + else: |
| 57 | + ''' |
| 58 | + This only works for observations with one variable and time_unit==year |
| 59 | + ''' |
| 60 | + props = get_all_props(var) |
| 61 | + for year in xrange(var.min_year, var.max_year): |
| 62 | + for prop in props: |
| 63 | + yaxis = get_all_keys(var, prop) |
| 64 | + o = [0 for y in yaxis] |
| 65 | + for x, y in enumerate(yaxis): |
| 66 | + for obs in var.obs.values(): |
| 67 | + if obs.t1.year == year and getattr(obs, prop) == y: |
| 68 | + o[x] += obs.data |
| 69 | + o = [year] + o |
| 70 | + data.append(o) |
| 71 | + return data |
76 | 72 | |
77 | 73 | |
78 | | -def add_headers(ds, all_keys): |
| 74 | +def add_headers(ds): |
79 | 75 | assert ds.format == 'long' or ds.format == 'wide', 'Format should either be long or wide.' |
80 | 76 | headers = [] |
81 | 77 | if ds.format == 'long': |
82 | 78 | headers.append('date') |
83 | 79 | for var in ds: |
84 | 80 | if ds.format == 'long': |
85 | | - headers.extend([var.time_unit, var.name]) |
| 81 | + headers.extend([var.name]) |
86 | 82 | else: |
87 | | - for key in all_keys: |
88 | | - header = '%s_%s' % (key, var.name) |
89 | | - headers.append(header) |
| 83 | + props = get_all_props(var) |
| 84 | + for prop in props: |
| 85 | + all_keys = get_all_keys(var, prop) |
| 86 | + for key in all_keys: |
| 87 | + header = '%s_%s' % (key, var.name) |
| 88 | + headers.append(header) |
90 | 89 | return headers |
91 | 90 | |
92 | 91 | |
93 | | -def make_data_rectangular(data, all_keys): |
94 | | - for i, d in enumerate(data): |
95 | | - for key in all_keys: |
96 | | - if key not in d: |
97 | | - d[key] = 0 |
98 | | - data[i] = d |
99 | | - return data |
| 92 | +#def make_data_rectangular(data, all_keys): |
| 93 | +# for i, d in enumerate(data): |
| 94 | +# for key in all_keys: |
| 95 | +# if key not in d: |
| 96 | +# d[key] = 0 |
| 97 | +# data[i] = d |
| 98 | +# return data |
100 | 99 | |
101 | 100 | |
102 | | -def get_all_keys(data): |
| 101 | +def get_all_props(var): |
103 | 102 | all_keys = [] |
104 | | - for d in data: |
105 | | - for key in d: |
106 | | - if key not in all_keys: |
107 | | - all_keys.append(key) |
| 103 | + for obs in var.obs.values(): |
| 104 | + for prop in obs.props: |
| 105 | + if prop not in all_keys: |
| 106 | + all_keys.append(prop) |
| 107 | + return all_keys |
| 108 | + |
| 109 | + |
| 110 | +def get_all_keys(var, prop): |
| 111 | + all_keys = [] |
| 112 | + for obs in var.obs.values(): |
| 113 | + v = getattr(obs, prop) |
| 114 | + if v not in all_keys: |
| 115 | + all_keys.append(v) |
108 | 116 | all_keys.sort() |
109 | | - all_keys.insert(0, all_keys[-1]) |
110 | | - del all_keys[-1] |
111 | 117 | return all_keys |
112 | 118 | |
113 | 119 | |
114 | | -def sort(data, all_keys): |
115 | | - dates = [date['date'] for date in data] |
116 | | - dates.sort() |
117 | | - cube = [] |
118 | | - for date in dates: |
119 | | - for i, d in enumerate(data): |
120 | | - if d['date'] == date: |
121 | | - raw_data = d |
122 | | - del data[i] |
123 | | - break |
124 | | - obs = [] |
125 | | - for key in all_keys: |
126 | | - obs.append(raw_data[key]) |
127 | | - cube.append(obs) |
128 | | - return cube |
| 120 | +#def sort(data, all_keys): |
| 121 | +# dates = [date['date'] for date in data] |
| 122 | +# dates.sort() |
| 123 | +# cube = [] |
| 124 | +# for date in dates: |
| 125 | +# for i, d in enumerate(data): |
| 126 | +# if d['date'] == date: |
| 127 | +# raw_data = d |
| 128 | +# del data[i] |
| 129 | +# break |
| 130 | +# obs = [] |
| 131 | +# for key in all_keys: |
| 132 | +# obs.append(raw_data[key]) |
| 133 | +# cube.append(obs) |
| 134 | +# return cube |