r76979 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r76978 | r76979 | r76980 >
Date:22:21, 18 November 2010
Author:diederik
Status:deferred
Tags:
Comment:
Fixed some missed renames
Modified paths:
  • /trunk/tools/editor_trends/etl/construct_datasets.py (modified) (history)
  • /trunk/tools/editor_trends/etl/loader.py (modified) (history)
  • /trunk/tools/editor_trends/etl/optimize_editors.py (modified) (history)
  • /trunk/tools/editor_trends/manage.py (modified) (history)
  • /trunk/tools/editor_trends/utils/process_constructor.py (modified) (history)
  • /trunk/tools/editor_trends/utils/sort.py (modified) (history)
  • /trunk/tools/editor_trends/utils/utils.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/manage.py
@@ -33,9 +33,9 @@
3434 from utils import utils
3535 from utils import dump_downloader
3636 from etl import chunker
37 -import map_wiki_editors
38 -import optimize_editors
39 -import construct_datasets
 37+from etl import extract
 38+from etl import optimize_editors
 39+from etl import construct_datasets
4040 import config
4141
4242
@@ -155,7 +155,7 @@
156156
157157 def mongodb_script_launcher(args, location, filename, project, full_project, language_code, language):
158158 print 'mongodb_script_launcher'
159 - map_wiki_editors.run_parse_editors(project, language_code, location)
 159+ extract.run_parse_editors(project, language_code, location)
160160
161161
162162 def sort_launcher(args, location, filename, project, full_project, language_code):
Index: trunk/tools/editor_trends/etl/optimize_editors.py
@@ -21,7 +21,9 @@
2222 from Queue import Empty
2323 from operator import itemgetter
2424 import datetime
 25+import sys
2526
 27+sys.path.append('..')
2628 import configuration
2729 settings = configuration.Settings()
2830 from database import db
@@ -154,7 +156,7 @@
155157 }
156158 print len(ids)
157159 ids = list(ids)
158 - chunks = dict(0, ids)
 160+ chunks = {0: ids}
159161 pc.build_scaffolding(pc.load_queue, optimize_editors, chunks, False, False, **kwargs)
160162
161163
Index: trunk/tools/editor_trends/etl/construct_datasets.py
@@ -21,9 +21,10 @@
2222 from Queue import Empty
2323 import datetime
2424 from dateutil.relativedelta import *
25 -
 25+import sys
2626 import progressbar
2727
 28+sys.path.append('..')
2829 import configuration
2930 settings = configuration.Settings()
3031 from utils import models, utils
Index: trunk/tools/editor_trends/etl/loader.py
@@ -17,8 +17,9 @@
1818 __date__ = '2010-11-16'
1919 __version__ = '0.1'
2020
21 -
 21+import os
2222 import sys
 23+from Queue import Empty
2324
2425 sys.path.append('..')
2526 import configuration
@@ -26,9 +27,11 @@
2728 from database import db
2829 from database import cache
2930 from utils import utils
 31+from utils import sort
3032 import process_constructor as pc
3133
3234
 35+
3336 def store_editors(input, filename, dbname):
3437 fh = utils.create_txt_filehandle(input, filename, 'r', settings.encoding)
3538 mongo = db.init_mongo_db(dbname)
@@ -77,17 +80,17 @@
7881 chunks = utils.split_list(files, int(x))
7982 '''1st iteration external mergesort'''
8083 for chunk in chunks:
81 - filehandles = [utils.create_txt_filehandle(input, file, 'r', settings.encoding) for file in chunks[chunk]]
82 - filename = merge_sorted_files(output, filehandles, chunk)
83 - filehandles = [fh.close() for fh in filehandles]
 84+ #filehandles = [utils.create_txt_filehandle(input, file, 'r', settings.encoding) for file in chunks[chunk]]
 85+ #filename = sort.merge_sorted_files(output, filehandles, chunk)
 86+ #filehandles = [fh.close() for fh in filehandles]
8487 pass
8588 '''2nd iteration external mergesort, if necessary'''
8689 if len(chunks) > 1:
8790 files = utils.retrieve_file_list(output, 'txt', mask='[merged]')
8891 filehandles = [utils.create_txt_filehandle(output, file, 'r', settings.encoding) for file in files]
89 - filename = merge_sorted_files(output, filehandles, 'final')
 92+ filename = sort.merge_sorted_files(output, filehandles, 'final')
9093 filehandles = [fh.close() for fh in filehandles]
91 - filename = 'merged_final.txt'
 94+ filename = 'merged_final.txt'
9295 store_editors(output, filename, dbname)
9396
9497
@@ -102,8 +105,8 @@
103106 fh.close()
104107 data = [d.replace('\n', '') for d in data]
105108 data = [d.split('\t') for d in data]
106 - sorted_data = mergesort(data)
107 - write_sorted_file(sorted_data, file, output)
 109+ sorted_data = sort.mergesort(data)
 110+ sort.write_sorted_file(sorted_data, file, output)
108111 except Empty:
109112 break
110113
@@ -136,5 +139,5 @@
137140 input = os.path.join(settings.input_location, 'en', 'wiki', 'txt')
138141 output = os.path.join(settings.input_location, 'en', 'wiki', 'sorted')
139142 dbname = 'enwiki'
140 - mergesort_launcher(input, output)
 143+ #mergesort_launcher(input, output)
141144 mergesort_external_launcher(dbname, output, output)
\ No newline at end of file
Index: trunk/tools/editor_trends/utils/utils.py
@@ -331,6 +331,8 @@
332332 all_files = os.listdir(location)
333333 files = []
334334 for file in all_files:
 335+ if file == 'merged_1.txt':
 336+ print 'debug'
335337 file = file.split('.')
336338 if len(file) == 1:
337339 continue
Index: trunk/tools/editor_trends/utils/process_constructor.py
@@ -74,7 +74,7 @@
7575 else:
7676 result_queues[i] = False
7777
78 - if settings.progress_bar:
 78+ if settings.progressbar:
7979 size = sum([input_queues[q].qsize() for q in input_queues])
8080 pbar = progressbar.ProgressBar(maxval=size).start()
8181 kwargs['pbar'] = pbar
Index: trunk/tools/editor_trends/utils/sort.py
@@ -28,9 +28,17 @@
2929 from multiprocessing import Queue
3030 from Queue import Empty
3131 import datetime
 32+import sys
3233
 34+sys.path.append('..')
 35+import configuration
 36+settings = configuration.Settings()
3337
3438
 39+import utils
 40+
 41+
 42+
3543 def quick_sort(obs):
3644 if obs == []:
3745 return []
@@ -68,6 +76,7 @@
6977
7078 def readline(file):
7179 for line in file:
 80+ print file.stream.name
7281 line = line.replace('\n', '')
7382 if line == '':
7483 continue

Status & tagging log