Index: trunk/tools/editor_trends/kaggle/training.py |
— | — | @@ -0,0 +1,25 @@ |
| 2 | +import codecs |
| 3 | +import os |
| 4 | + |
| 5 | + |
| 6 | + |
# Build ``training.txt`` from the tab-separated text files in ``location``.
#
# Every record with exactly 13 tab-separated fields is copied to the output,
# one record per line; records with any other field count are skipped.  When
# the user name held in the last field ends with "bot" (case-insensitive),
# field 5 is overwritten with "1" before the record is written.
# NOTE(review): field 5 is presumably the bot flag -- confirm against the
# dataset schema.
location = '/home/diederik/wikimedia/wikilytics/en/wiki/txt'
files = os.listdir(location)

with codecs.open('training.txt', 'w', 'utf-8') as output:
    for filename in files:
        path = os.path.join(location, filename)
        if not os.path.isfile(path):
            # os.listdir() also returns sub-directories, which cannot be
            # opened as data files.
            continue
        # Read bytes so records are split on '\n' only, then decode each
        # record explicitly instead of relying on an implicit or
        # locale-dependent conversion when writing to the UTF-8 output.
        # NOTE(review): the input is assumed to be UTF-8 -- confirm.
        with open(path, 'rb') as fh:
            for raw in fh:
                # strip() removes the line terminator; as in the original it
                # also drops leading/trailing tabs, so records whose first or
                # last field is empty fail the field-count check below.
                line = raw.strip().decode('utf-8').split('\t')
                if len(line) != 13:
                    continue
                username = line[12].lower()
                if username.endswith('bot'):
                    # Must be a string: str.join() rejects non-string items.
                    line[5] = '1'
                # Re-append the newline removed by strip() so each record
                # stays on its own line.
                output.write('\t'.join(line) + '\n')
\ No newline at end of file |
Property changes on: trunk/tools/editor_trends/kaggle/training.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 27 | + native |