Index: trunk/tools/code-utils/read_wfMsgCalls.py |
— | — | @@ -0,0 +1,279 @@ |
| 2 | +# -*- coding: utf-8 -*- |
| 3 | + |
| 4 | +# |
| 5 | +# Two small tools to analyse the keys of the i18n messages of MediaWiki |
| 6 | +# 1/ Get the current keys (and corresponding language and message) from their definition |
| 7 | +# 2/ Get the calls of the wfMsg functions (and corresponding file, line, type (wfMsg, wfMsgForContent, etc.), message key (when possible)) |
| 8 | +# |
| 9 | +# |
| 10 | +# Some details: |
| 11 | +# * this program calls a PHP interpreter, so you need it |
| 12 | +# * the occurrences of wfMsg functions in block comments are removed (about 30 occurrences), I didn’t check for single-line comments |
| 13 | +# * I assumed the message keys match '([a-zA-Z0-9_-]+?)', which seems correct after tests (in any case no wfMsg call is dropped, even if its key is not recognized) |
| 14 | +# * hence keys built from a variable are never computed, since that would require a deeper program analysis; examples are wfMsg( $wgLogNames[$type] ) or (easier?) wfMessage( 'block-log-flags-' . $flag ) |
| 15 | +# * some calls are missed when they are made through call_user_func_array, but in these cases you probably have no chance of getting the associated message key anyway (it is probably a variable) |
| 16 | +# * the results are CSV |
| 17 | +# * the format of the messageStrings file is: 1/ language code (from the name of the file when available); 2/ message key; 3/ content of the message |
| 18 | +# * the format of the wfMsgCalls file is: 1/ path of the file; 2/ line number; 3/ wfMsg type (wfMsg, wfMessage, etc.); 4/ message key (when possible); 5/ complete call of the function |
| 19 | +# |
| 20 | + |
| 21 | + |
| 22 | +# # # # # # # # |
| 23 | +# Parameters # |
| 24 | +# # # # # # # # |
| 25 | + |
# BASE PARAMETERS

# Root of the MediaWiki tree to analyse; every relative path below is
# resolved against this folder.
baseFolder = "mediawiki/repo/phase3"

# CSV file (written in the folder the script is started from) listing the
# calls to the wfMsg* functions found in the code.
wfMsgCallsResultFile = "wfMsgCalls.csv"

# CSV file (written in the folder the script is started from) listing the
# lang / message key / message triples read from the PHP $messages variables.
messageStringsResultFile = "messageStrings.csv"

# False: the message text is stored as a third column (about 30 Mio of output).
# True: only the language and the key are stored (about 10 Mio of output).
lightMessageStrings = False

# Names of the wfMsg functions to look for in the code.
messageFunctions = [
    "wfMsg",
    "wfMessage",
    "wfMessageFallback",
    "wfMsgExt",
    "wfMsgForContent",
    "wfMsgNoTrans",
    "wfMsgForContentNoTrans",
    "wfMsgReal",
    "wfMsgHtml",
    "wfMsgWikiHtml",
    "wfEmptyMsg",
    "wfMsgReplaceArgs",
    "wfMsgGetKey",
]


# MESSAGES FOLDERS AND FILES

# Folders holding the message files, grouped by kind.  Keep the '#' entry
# so that messagesIndividualFiles is processed as well.
messagesFolders = {
    'phase3': [ 'languages/messages' ],
    'extensions': [ ],
    '#': '#',
}

# Message files to skip (compared against the bare file name).
messagesExcludeFiles = []

# Extra message files to read; they must not live in the folders above,
# otherwise they would be read twice.
messagesIndividualFiles = []


# CODE FOLDERS AND FILES

# Folders whose PHP files are scanned recursively.  Keep the '#' entry so
# that codeIndividualFiles is processed as well.
codeFolders = [
    "includes",
    "extensions",
    "skins",
    "languages/classes",
    "#",
]

# Code files to skip (compared against the bare file name).
codeExcludeFiles = []

# Extra code files to scan; they must not live in the folders above,
# otherwise they would be scanned twice.
codeIndividualFiles = [
    "languages/Language.php",
    "languages/LanguageConverter.php",
    "languages/Names.php",
    "resources/Resources.php",
]
| 66 | + |
| 67 | + |
| 68 | + |
| 69 | + |
| 70 | +# # # # # # # # # # # # |
| 71 | +# Read the i18n files # |
| 72 | +# # # # # # # # # # # # |
| 73 | + |
| 74 | +import os, os.path, re, csv, subprocess |
| 75 | + |
| 76 | + |
currentFolder = os.getcwd()
os.chdir( baseFolder )

i18nMessages = []

# Marker the PHP helper substitutes for line breaks inside a message, so that
# every message fits on one output line.  It must stay identical to the
# literal used in phpDumpMessages below.
newlineMarker = '5197361546748612348916973'

# PHP code appended after the require() lines.  It prints one line per
# message: "lang|key text" when $messages is keyed by language, "key text"
# otherwise.  The string is raw so that the "\r\n" escapes are interpreted by
# PHP rather than being turned into literal control characters by Python.
phpDumpMessages = r"""
if( isset( $messages ) && is_array( $messages ) && count( $messages ) > 0 ) {
    if( is_array( current( $messages ) ) ) {
        foreach( $messages as $lang => $msgs )
            foreach( $msgs as $key => $msg )
                echo $lang.'|'.$key.' '.str_replace( array("\r\n", "\n", "\r"), "5197361546748612348916973", $msg )."\n";
    }
    else {
        foreach( $messages as $key => $msg )
            echo $key.' '.str_replace( array("\r\n", "\n", "\r"), "5197361546748612348916973", $msg )."\n";
    }
}
"""


def _readPhpMessages( path ):
    """Run PHP on the message file *path* and return its output lines.

    The PHP interpreter is started through the shell as ``php`` and receives
    the script on its standard input.  ``communicate`` is used so that the
    pipes are closed and the child process is waited for.
    """
    script = '\n'.join( [
        '<?php',
        "require( '" + os.path.join( 'includes', 'Defines.php' ) + "' );",
        "require( '" + path + "' );",
        phpDumpMessages,
    ] ) + '\n'
    p = subprocess.Popen( 'php', shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, close_fds=True )
    output = p.communicate( script )[0]
    return output.splitlines()


# Iterate over folders and files
for messagesFolderType in messagesFolders:

    if messagesFolderType == '#':
        # Individual files are given relative to baseFolder, hence the
        # empty folder name.
        folders = [ '' ]
    elif messagesFolderType == 'extensions':
        # Extensions are searched recursively, skipping Subversion metadata.
        # The result is kept local so the configuration is left untouched.
        folders = []
        for directory in messagesFolders[messagesFolderType]:
            for walked in os.walk( directory ):
                if '.svn' not in walked[0]:
                    folders.append( walked[0] )
    else:
        folders = messagesFolders[messagesFolderType]

    for messagesFolder in folders:

        if messagesFolderType == '#':
            files = messagesIndividualFiles
        else:
            files = os.listdir( messagesFolder )

        for filename in files:

            if not filename.endswith( '.php' ):
                continue

            if messagesFolderType == 'extensions' and not filename.endswith( '.i18n.php' ):
                continue

            if filename in messagesExcludeFiles:
                continue

            # Only the core files MessagesXx.php carry the language code in
            # their name; for any other file the language is left empty
            # unless the PHP output provides it.
            isCoreLanguage = messagesFolderType == 'phase3' and filename.startswith( 'Messages' )
            if isCoreLanguage:
                lang = filename[8:-4]
            else:
                lang = ''

            # Read the PHP $messages variable
            messages = _readPhpMessages( os.path.join( messagesFolder, filename ) )

            if not messages:
                if isCoreLanguage:
                    print( 'Core language '+lang+' doesn’t have a $messages variable in '+filename+' or is empty.' )
                else:
                    print( 'File '+filename+' doesn’t have a $messages variable or is empty.' )
                continue

            # Turn every output line into a [ lang, key (, text) ] row
            for message in messages:

                head, _, text = message.partition( ' ' )
                if '|' in head:
                    # Split on the first '|' only, so the rest of the key is kept
                    msgLang, key = head.split( '|', 1 )
                else:
                    msgLang, key = lang.lower(), head

                i18nMessage = [ msgLang, key ]
                if not lightMessageStrings:
                    i18nMessage.append( text.replace( newlineMarker, '\n' ) )
                i18nMessages.append( i18nMessage )

os.chdir( currentFolder )
with open( messageStringsResultFile, 'w' ) as resultFile:
    writer = csv.writer( resultFile )
    writer.writerows( i18nMessages )
| 165 | + |
| 166 | + |
| 167 | +# # # # # # # # # # # # |
| 168 | +# Read the code files # |
| 169 | +# # # # # # # # # # # # |
| 170 | + |
import bisect

os.chdir( baseFolder )

wfMsgCalls = []

# A call is a function name followed by optional spaces and everything up to
# the first closing parenthesis; the tail is completed afterwards when the
# call contains nested parentheses.
msgFunctions = "(" + '|'.join( messageFunctions ) + r")( *\(.*?\))"
msgFunctionsSoft = "(" + '|'.join( messageFunctions ) + ")"

callRegex = re.compile( msgFunctions, re.S )
nameRegex = re.compile( msgFunctionsSoft )
# A block comment runs to the next '*/', or to the end of the file when it is
# never closed.
blockCommentRegex = re.compile( r'/\*.*?(?:\*/|\Z)', re.S )
# '\r\n' must come first, otherwise a CRLF pair is counted as two line breaks.
newlineRegex = re.compile( r'\r\n|\r|\n' )
# First argument of the call when it is a plain quoted key; the spaces that
# may precede the opening parenthesis are part of the searched text.
keyRegex = re.compile( r"""^ *\(\s*(['"])([a-zA-Z0-9_-]+?)\1\s*(?:,|\))""" )


def _stripCallsInBlockComments( content ):
    """Return *content* with the wfMsg function names removed from block comments.

    Only the names are deleted, so no line break is removed and the line
    numbers computed afterwards are unaffected.  Single-line comments are
    not handled.
    """
    return blockCommentRegex.sub( lambda comment: nameRegex.sub( '', comment.group(0) ), content )


def _lineStarts( content ):
    """Return the sorted offsets at which the lines of *content* begin."""
    return [0] + [ m.end() for m in newlineRegex.finditer( content ) ]


def _completeCall( content, call, pos ):
    """Extend *call*, which ends at offset *pos*, up to its balancing ')'.

    Raises Exception( 'parenthesis expected' ) when the end of *content* is
    reached before the parentheses are balanced.  Parentheses inside string
    literals are counted like any other.
    """
    depth = call.count( '(' ) - call.count( ')' )
    end = pos
    while depth > 0:
        if end >= len( content ):
            raise Exception( 'parenthesis expected' )
        if content[end] == '(':
            depth = depth + 1
        elif content[end] == ')':
            depth = depth - 1
        end = end + 1
    return call + content[pos:end]


# Iterate over folders and files
for folder in codeFolders:

    if folder == '#':
        # Individual files are given relative to baseFolder
        filesets = [ ( '', codeIndividualFiles ) ]
    else:
        # Recursive search, skipping Subversion metadata
        filesets = [ ( walked[0], walked[2] ) for walked in os.walk( folder ) if '.svn' not in walked[0] ]

    for direct, fileset in filesets:

        for filename in fileset:

            if not filename.endswith( ".php" ):
                continue

            if filename in codeExcludeFiles:
                continue

            path = os.path.join( direct, filename )
            with open( path, 'r' ) as fyle:
                content = fyle.read()

            # Remove the false positives in block comments (some could remain in single-line comments)
            content = _stripCallsInBlockComments( content )

            # Offsets of the beginning of lines, used to compute the line numbers
            lineStarts = _lineStarts( content )

            # Iterate to find the wfMsg functions
            for m in callRegex.finditer( content ):

                # Search the key once we recognized the call
                key = ''
                k = keyRegex.search( m.group(2) )
                if k is not None:
                    key = k.group(2)

                # The line number is the count of lines starting at or before the call
                lineNumber = bisect.bisect_right( lineStarts, m.start() )

                # You must have the same number of opening and closing parentheses
                fullCall = _completeCall( content, m.group(0), m.end() )

                wfMsgCalls.append( [ path, lineNumber, m.group(1), key, fullCall ] )

os.chdir( currentFolder )
with open( wfMsgCallsResultFile, 'w' ) as resultFile:
    writer = csv.writer( resultFile )
    writer.writerows( wfMsgCalls )
| 280 | + |
Property changes on: trunk/tools/code-utils/read_wfMsgCalls.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 281 | + native |