Index: trunk/tools/editor_trends/manage.py |
— | — | @@ -56,7 +56,7 @@ |
57 | 57 | Config launcher is used to reconfigure editor trends toolkit. |
58 | 58 | ''' |
59 | 59 | # settings.load_configuration() |
60 | | -# |
| 60 | + pc = projects.ProjectContainer() |
61 | 61 | if not os.path.exists('wiki.cfg') or properties.force: |
62 | 62 | config = ConfigParser.RawConfigParser() |
63 | 63 | project = None |
— | — | @@ -65,17 +65,17 @@ |
66 | 66 | working_directory = raw_input('Please indicate where you installed Editor Trends Analytics.\nCurrent location is %s\nPress Enter to accept default.\n' % os.getcwd()) |
67 | 67 | input_location = raw_input('Please indicate where to store the Wikipedia dump files.\nDefault is: %s\nPress Enter to accept default.\n' % settings.input_location) |
68 | 68 | |
69 | | - while project not in properties.projects.keys(): |
70 | | - project = raw_input('Please indicate which project you would like to analyze.\nDefault is: %s\nPress Enter to accept default.\n' % properties.projects[properties.short_project].capitalize()) |
71 | | - project = project if len(project) > 0 else properties.short_project |
72 | | - if project not in properties.projects.keys(): |
73 | | - print 'Valid choices for a project are: %s' % ','.join(properties.projects.keys()) |
| 69 | + while project not in pc.projects.keys(): |
| 70 | + project = raw_input('Please indicate which project you would like to analyze.\nDefault is: %s\nPress Enter to accept default.\n' % pc.projects[properties.project.name]) |
| 71 | + project = project if len(project) > 0 else properties.project.name |
| 72 | + if project not in pc.projects.keys(): |
| 73 | + print 'Valid choices for a project are: %s' % ','.join(pc.projects.keys()) |
74 | 74 | |
75 | | - while language not in properties.valid_languages: |
76 | | - language = raw_input('Please indicate which language of project %s you would like to analyze.\nDefault is: %s\nPress Enter to accept default.\n' % (properties.projects[project].capitalize(), properties.language)) |
| 75 | + while language not in properties.project.valid_languages: |
| 76 | + language = raw_input('Please indicate which language of project %s you would like to analyze.\nDefault is: %s\nPress Enter to accept default.\n' % (pc.projects[project], properties.language)) |
77 | 77 | if len(language) == 0: |
78 | | - language = properties.language_code |
79 | | - language = language if language in properties.valid_languages else properties.language |
| 78 | + language = properties.language.code |
| 79 | + language = language if language in properties.project.valid_languages else properties.language |
80 | 80 | |
81 | 81 | input_location = input_location if len(input_location) > 0 else settings.input_location |
82 | 82 | working_directory = working_directory if len(working_directory) > 0 else os.getcwd() |
Index: trunk/tools/editor_trends/etl/store.py |
— | — | @@ -20,6 +20,7 @@ |
21 | 21 | from Queue import Empty |
22 | 22 | import multiprocessing |
23 | 23 | import sys |
| 24 | +import os |
24 | 25 | |
25 | 26 | sys.path.append('..') |
26 | 27 | import configuration |
— | — | @@ -31,6 +32,26 @@ |
32 | 33 | from database import db |
33 | 34 | |
34 | 35 | |
| 36 | +def store_articles(project, language_code): |
| 37 | + location = os.path.join(settings.input_location, language_code, project) |
| 38 | + fh = file_utils.create_txt_filehandle(location, 'articles.csv', 'r', settings.encoding) |
| 39 | + headers = ['id', 'title'] |
| 40 | + data = fh.readlines() |
| 41 | + fh.close() |
| 42 | + |
| 43 | + dbname = '%s%s' % (language_code, project) |
| 44 | + collection = '%s_%s' % (dbname, 'articles') |
| 45 | + mongo = db.init_mongo_db(dbname) |
| 46 | + collection = mongo[collection] |
| 47 | + |
| 48 | + articles = {} |
| 49 | + for d in data: |
| 50 | + for header in headers: |
| 51 | + articles[header] = d |
| 52 | + |
| 53 | + collection.insert(articles) |
| 54 | + |
| 55 | + |
35 | 56 | def store_editors(tasks, dbname, collection, source): |
36 | 57 | ''' |
37 | 58 | This function is called by multiple consumers who each take a sorted file |
Index: trunk/tools/editor_trends/etl/transformer.py |
— | — | @@ -63,7 +63,7 @@ |
64 | 64 | return '%s' % (self.id) |
65 | 65 | |
66 | 66 | def __call__(self): |
67 | | - |
| 67 | + cutoff = 9 |
68 | 68 | editor = self.input_db.find_one({'editor': self.id}) |
69 | 69 | if editor == None: |
70 | 70 | return |
— | — | @@ -74,7 +74,10 @@ |
75 | 75 | monthly_edits = db.stringify_keys(monthly_edits) |
76 | 76 | edits = sort_edits(edits) |
77 | 77 | edit_count = len(edits) |
78 | | - new_wikipedian = edits[9]['date'] |
| 78 | + if len(edits) > cutoff: |
| 79 | + new_wikipedian = edits[cutoff]['date'] |
| 80 | + else: |
| 81 | + new_wikipedian = False |
79 | 82 | first_edit = edits[0]['date'] |
80 | 83 | final_edit = edits[-1]['date'] |
81 | 84 | edits_by_year = determine_edits_by_year(edits, first_year, final_year) |
— | — | @@ -83,7 +86,7 @@ |
84 | 87 | last_edit_by_year = db.stringify_keys(last_edit_by_year) |
85 | 88 | articles_by_year = determine_articles_by_year(edits, first_year, final_year) |
86 | 89 | articles_by_year = db.stringify_keys(articles_by_year) |
87 | | - edits = edits[:10] |
| 90 | + edits = edits[:cutoff] |
88 | 91 | |
89 | 92 | self.output_db.insert({'editor': self.id, |
90 | 93 | 'edits': edits, |
Index: trunk/tools/editor_trends/etl/sort.py |
— | — | @@ -134,7 +134,7 @@ |
135 | 135 | except UnicodeDecodeError, e: |
136 | 136 | print e |
137 | 137 | except Empty: |
138 | | - break |
| 138 | + pass |
139 | 139 | |
140 | 140 | |
141 | 141 | def mergesort_launcher(source, target): |
Index: trunk/tools/editor_trends/classes/runtime_settings.py |
— | — | @@ -52,6 +52,8 @@ |
53 | 53 | if args: |
54 | 54 | self.args = args |
55 | 55 | self.hash = self.secs_since_epoch() |
| 56 | + print self.settings.input_location |
| 57 | + print self.get_value('location') |
56 | 58 | self.base_location = self.settings.input_location if \ |
57 | 59 | self.settings.input_location != None else self.get_value('location') |
58 | 60 | self.project = self.update_project_settings() |
— | — | @@ -84,6 +86,7 @@ |
85 | 87 | self.dump_filename = self.generate_wikidump_filename() |
86 | 88 | self.dump_relative_path = self.set_dump_path() |
87 | 89 | self.dump_absolute_path = self.set_dump_path(absolute=True) |
| 90 | + print self.directories |
88 | 91 | settings.verify_environment(self.directories) |
89 | 92 | |
90 | 93 | def __str__(self): |
— | — | @@ -138,7 +141,7 @@ |
139 | 142 | max_length_key = max([len(key) for key in about.keys()]) |
140 | 143 | print 'Final settings after parsing command line arguments:' |
141 | 144 | for ab in about: |
142 | | - print '%s: %s' % (ab.rjust(max_length_key), about[ab]) |
| 145 | + print '%s: %s' % (ab.rjust(max_length_key), about[ab].encode(self.settings.encoding)) |
143 | 146 | |
144 | 147 | |
145 | 148 | def get_value(self, key): |
Index: trunk/tools/editor_trends/configuration.py |
— | — | @@ -92,7 +92,7 @@ |
93 | 93 | self.working_directory = self.determine_working_directory() |
94 | 94 | self.update_python_path() |
95 | 95 | |
96 | | - self.root = '/' if self.platform != 'Windows' else 'c:\\' |
| 96 | + self.root = os.path.expanduser('~') if self.platform != 'Windows' else 'c:\\' |
97 | 97 | self.max_filehandles = self.determine_max_filehandles_open() |
98 | 98 | self.tab_width = 4 if self.platform == 'Windows' else 8 |
99 | 99 | |
— | — | @@ -158,7 +158,7 @@ |
159 | 159 | try: |
160 | 160 | os.makedirs(directory) |
161 | 161 | except IOError: |
162 | | - raise 'Configuration Error, could not create directory.' |
| 162 | + print 'Configuration Error, could not create directory %s.' % directory |
163 | 163 | |
164 | 164 | def detect_windows_program(self, program): |
165 | 165 | entry = self.windows_register.get(program, None) |
Index: trunk/tools/editor_trends/utils/file_utils.py |
— | — | @@ -226,6 +226,7 @@ |
227 | 227 | print error |
228 | 228 | return res |
229 | 229 | |
| 230 | + |
230 | 231 | def determine_filesize(location, filename): |
231 | 232 | path = os.path.join(location, filename) |
232 | 233 | return os.path.getsize(path) |