Index: trunk/tools/editor_trends/etl/store.py |
— | — | @@ -107,8 +107,15 @@ |
108 | 108 | for line in fh: |
109 | 109 | line = line.strip() |
110 | 110 | #print line.encode('utf-8') |
111 | | - id, title = line.split('\t') |
112 | | - collection.insert({'id':id, 'title':title}) |
| 111 | + line = line.split('\t') |
| 112 | + title = line[-1] |
| 113 | + id = line[0] |
| 114 | + ns = line[1] |
| 115 | + if len(line) == 4: |
| 116 | + category = line[2] |
| 117 | + collection.insert({'id':id, 'title':title, 'category': category, 'ns': ns}) |
| 118 | + else: |
| 119 | + collection.insert({'id':id, 'title':title, 'ns': ns}) |
113 | 120 | fh.close() |
114 | 121 | print 'Done...' |
115 | 122 | |
Index: trunk/tools/editor_trends/etl/sort.py |
— | — | @@ -43,20 +43,26 @@ |
44 | 44 | if filename == None: |
45 | 45 | self.result.put(None) |
46 | 46 | break |
47 | | - |
| 47 | + elif filename.startswith('comments') or filename.startswith('title'): |
| 48 | + continue |
48 | 49 | fh = file_utils.create_txt_filehandle(self.rts.txt, |
49 | 50 | filename, |
50 | 51 | 'r', |
51 | 52 | self.rts.encoding) |
52 | 53 | data = file_utils.read_unicode_text(fh) |
53 | 54 | fh.close() |
54 | | - data = [d.strip() for d in data] |
55 | | - data = [d.split('\t') for d in data] |
| 55 | + for x, d in enumerate(data): |
| 56 | + d = d.strip().split('\t') |
| 57 | + data[x] = d |
| 58 | + #data = [d.strip() for d in data] |
| 59 | + #data = [d.split('\t') for d in data] |
56 | 60 | sorted_data = mergesort(data) |
57 | 61 | write_sorted_file(sorted_data, filename, self.rts) |
58 | 62 | self.result.put(True) |
59 | 63 | except UnicodeDecodeError, e: |
60 | 64 | print 'Error: %s, (%s)' % (e, filename) |
| 65 | + except MemoryError, e: |
| 66 | + print 'Error: %s, (%s)' % (e, filename) |
61 | 67 | except Empty: |
62 | 68 | pass |
63 | 69 | |