Index: trunk/tools/editor_trends/etl/kaggle.py |
— | — | @@ -0,0 +1,49 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__email__ = 'dvanliere at gmail dot com' |
| 18 | +__date__ = '2011-04-12' |
| 19 | +__version__ = '0.1' |
| 20 | + |
| 21 | +import sys |
| 22 | + |
| 23 | +if '..' not in sys.path: |
| 24 | + sys.path.append('..') |
| 25 | + |
| 26 | +from utils import file_utils |
| 27 | + |
| 28 | + |
def launcher(location='/home/diederik/wikimedia/en/wiki/kaggle_training/'):
    '''
    Concatenate the csv files found in location into a single dataset.csv
    (written to that same directory), leaving out every line whose username
    looks like a bot.

    Files whose name starts with 'comments', 'articles' or 'dataset' are not
    read; the last prefix keeps a dataset.csv from an earlier run from being
    merged into itself.

    Each input line is split on tabs and the fourth field is taken as the
    username. A line is dropped when that username, lower-cased, ends with
    'bot'. Lines with fewer than four fields are dropped as well, because
    they carry no username to inspect.

    location -- directory that holds the input csv files and receives
                dataset.csv; defaults to the path this script was
                originally hard-wired to.
    '''
    skipped_prefixes = ('comments', 'articles', 'dataset')

    files = file_utils.retrieve_file_list(location, extension='csv')
    # Sort so that the files are appended in a reproducible order.
    files.sort()

    dataset = file_utils.create_txt_filehandle(location, 'dataset.csv', 'w', 'utf-8')
    try:
        for filename in files:
            if filename.startswith(skipped_prefixes):
                continue
            fh = file_utils.create_txt_filehandle(location, filename, 'r', 'utf-8')
            # try/finally rather than `with`: the object returned by
            # file_utils is not guaranteed to be a context manager.
            try:
                # Progress output: shows which file is being processed.
                print(fh)
                for line in fh:
                    data = line.split('\t')
                    if len(data) < 4:
                        # Blank or truncated line; previously this raised
                        # IndexError and aborted the whole merge.
                        continue
                    # Strip the line terminator in case the username is the
                    # last field, otherwise endswith('bot') could never match.
                    username = data[3].rstrip('\r\n').lower()
                    if not username.endswith('bot'):
                        dataset.write(line)
            finally:
                fh.close()
    finally:
        dataset.close()


# NOTE: called unconditionally at module level, so importing this file also
# runs the merge. Kept as-is to preserve the existing behaviour.
launcher()
Property changes on: trunk/tools/editor_trends/etl/kaggle.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 51 | + native |