r86816 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r86815‎ | r86816 | r86817 >
Date:14:26, 24 April 2011
Author:diederik
Status:deferred
Tags:
Comment:
Finalize datacompetition dataset
Modified paths:
  • /trunk/tools/editor_trends/etl/kaggle.py (added) (history)

Diff [purge]

Index: trunk/tools/editor_trends/etl/kaggle.py
@@ -0,0 +1,49 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-04-12'
 19+__version__ = '0.1'
 20+
 21+import sys
 22+
 23+if '..' not in sys.path:
 24+ sys.path.append('..')
 25+
 26+from utils import file_utils
 27+
 28+
def launcher(location='/home/diederik/wikimedia/en/wiki/kaggle_training/'):
    '''
    Merge the csv files found in location into a single dataset.csv in that
    same directory, leaving out rows that belong to bot accounts.

    The csv files are processed in sorted filename order. Files whose name
    starts with 'comments', 'articles' or 'dataset' are not read (the last
    prefix presumably keeps the output file from being fed back into itself).

    Rows are split on tabs and the fourth column is treated as the username:
    a row is dropped when its lower-cased username ends with 'bot'. Rows with
    fewer than four columns carry no username column and are dropped as well
    instead of aborting the run with an IndexError.

    location: directory that holds the input csv files and receives
        dataset.csv; defaults to the path that used to be hard-coded.
    '''
    skip_prefixes = ('comments', 'articles', 'dataset')
    files = file_utils.retrieve_file_list(location, extension='csv')
    files.sort()
    dataset = file_utils.create_txt_filehandle(location, 'dataset.csv', 'w',
                                               'utf-8')
    # try/finally guarantees both handles are released even when reading or
    # writing fails halfway through.
    try:
        for filename in files:
            if filename.startswith(skip_prefixes):
                continue
            fh = file_utils.create_txt_filehandle(location, filename, 'r',
                                                  'utf-8')
            try:
                # Progress output; the parenthesised form prints the same
                # text under Python 2 and is also valid Python 3.
                print(fh)
                for line in fh:
                    data = line.split('\t')
                    if len(data) < 4:
                        # No username column available (e.g. a blank line).
                        continue
                    # Strip the line terminator so the suffix test still works
                    # when the username happens to be the last column.
                    username = data[3].rstrip('\r\n').lower()
                    if not username.endswith('bot'):
                        dataset.write(line)
            finally:
                fh.close()
    finally:
        dataset.close()
 49+
 50+launcher()
Property changes on: trunk/tools/editor_trends/etl/kaggle.py
___________________________________________________________________
Added: svn:eol-style
151 + native