Index: trunk/php/luasandbox/ustring.c |
— | — | @@ -0,0 +1,1038 @@ |
| 2 | +#ifdef HAVE_CONFIG_H |
| 3 | +#include "config.h" |
| 4 | +#endif |
| 5 | + |
| 6 | +#include <lua.h> |
| 7 | +#include <lauxlib.h> |
| 8 | + |
| 9 | +#include <unicode/utf.h> |
| 10 | +#include <unicode/uchar.h> |
| 11 | +#include <unicode/ustring.h> |
| 12 | + |
| 13 | +#include "php.h" |
| 14 | +#include "php_luasandbox.h" |
| 15 | +#include "luasandbox_unicode.h" |
| 16 | + |
/**
 * Checks an ICU status code.
 *
 * If errorCode signals a failure, runs cleanupCode and then raises a Lua error
 * whose message contains the ICU error name. Otherwise errorCode is reset to
 * U_ZERO_ERROR, which also discards any ICU warning stored in it.
 *
 * Requires a variable `lua_State *L` in the calling scope. The message is
 * formatted into a local buffer and cleanupCode runs before any Lua API call,
 * so the caller's buffers are released even if pushing the message fails.
 * Must be used as a statement, followed by a semicolon.
 */
#define LUASANDBOX_CHECK_ICU_ERROR(errorCode, cleanupCode) do { \
		if( U_FAILURE(errorCode) ) { \
			char _luasandbox_errmsg[1024]; \
			snprintf( _luasandbox_errmsg, sizeof(_luasandbox_errmsg), \
				"Unicode handling error: %s", u_errorName(errorCode) ); \
			cleanupCode; \
			lua_pushstring( L, _luasandbox_errmsg ); \
			lua_error( L ); \
		} \
		errorCode = U_ZERO_ERROR; \
	} while( 0 )
| 27 | + |
| 28 | +/****************** Prototypes ******************/ |
| 29 | + |
| 30 | +int luasandbox_ustr_create(lua_State * L); |
| 31 | +int luasandbox_ustr_len(lua_State * L); |
| 32 | +int luasandbox_ustr_concat(lua_State * L); |
| 33 | +int luasandbox_ustr_eq(lua_State * L); |
| 34 | +int luasandbox_ustr_index(lua_State * L); |
| 35 | + |
| 36 | +int luasandbox_ustr_ucfirst(lua_State * L); |
| 37 | +int luasandbox_ustr_uc(lua_State * L); |
| 38 | +int luasandbox_ustr_lc(lua_State * L); |
| 39 | +int luasandbox_ustr_tc(lua_State * L); |
| 40 | +int luasandbox_ustr_trim(lua_State * L); |
| 41 | +int luasandbox_ustr_sub(lua_State * L); |
| 42 | +int luasandbox_ustr_pos(lua_State * L); |
| 43 | +int luasandbox_ustr_replace(lua_State * L); |
| 44 | +int luasandbox_ustr_split(lua_State * L); |
| 45 | + |
| 46 | +/****************** Registration of functions ******************/ |
| 47 | + |
| 48 | +static luaL_Reg luasandbox_ustr_functions[] = { |
| 49 | + { "len", luasandbox_ustr_len }, |
| 50 | + { "ucfirst", luasandbox_ustr_ucfirst }, |
| 51 | + { "uc", luasandbox_ustr_uc }, |
| 52 | + { "lc", luasandbox_ustr_lc }, |
| 53 | + { "tc", luasandbox_ustr_tc }, |
| 54 | + { "trim", luasandbox_ustr_trim }, |
| 55 | + { "sub", luasandbox_ustr_sub }, |
| 56 | + { "pos", luasandbox_ustr_pos }, |
| 57 | + { "replace", luasandbox_ustr_replace }, |
| 58 | + { "split", luasandbox_ustr_split }, |
| 59 | + NULL |
| 60 | +}; |
| 61 | + |
| 62 | +/** {{{ luasandbox_install_unicode_functions |
| 63 | + * |
| 64 | + * Installs the unicode module into the global namespace. |
| 65 | + */ |
| 66 | +void luasandbox_install_unicode_functions(lua_State * L) |
| 67 | +{ |
| 68 | + luaL_newmetatable( L, "luasandbox_ustr" ); |
| 69 | + |
| 70 | + lua_pushstring( L, "__len" ); |
| 71 | + lua_pushcfunction( L, luasandbox_ustr_len ); |
| 72 | + lua_rawset( L, -3 ); |
| 73 | + |
| 74 | + lua_pushstring( L, "__concat" ); |
| 75 | + lua_pushcfunction( L, luasandbox_ustr_concat ); |
| 76 | + lua_rawset( L, -3 ); |
| 77 | + |
| 78 | + lua_pushstring( L, "__eq" ); |
| 79 | + lua_pushcfunction( L, luasandbox_ustr_eq ); |
| 80 | + lua_rawset( L, -3 ); |
| 81 | + |
| 82 | + lua_pushstring( L, "__index" ); |
| 83 | + lua_pushcfunction( L, luasandbox_ustr_index ); |
| 84 | + lua_rawset( L, -3 ); |
| 85 | + |
| 86 | + lua_pushcfunction( L, luasandbox_ustr_create ); |
| 87 | + lua_setglobal( L, "u" ); |
| 88 | + |
| 89 | + luaL_register( L, "ustring", luasandbox_ustr_functions ); |
| 90 | +} |
| 91 | +/* }}} */ |
| 92 | + |
| 93 | +/****************** Common functions ******************/ |
| 94 | + |
| 95 | +/** {{{ luasandbox_init_ustr |
| 96 | + * |
| 97 | + * Initializes a ustring header and assigns the metatable to it. |
| 98 | + */ |
| 99 | +luasandbox_ustr_header *luasandbox_init_ustr(lua_State * L, size_t len) |
| 100 | +{ |
| 101 | + luasandbox_ustr_header *result; |
| 102 | + |
| 103 | + result = (luasandbox_ustr_header*) lua_newuserdata( L, sizeof(luasandbox_ustr_header) + len ); |
| 104 | + result->raw_len = len; |
| 105 | + |
| 106 | + luaL_getmetatable( L, "luasandbox_ustr" ); |
| 107 | + lua_setmetatable( L, -2 ); |
| 108 | + |
| 109 | + return result; |
| 110 | +} |
| 111 | +/* }}} */ |
| 112 | + |
| 113 | +/** {{{ luasandbox_push_ustr |
| 114 | + * |
| 115 | + * Constructs the ustring object from a UTF-8 string. Validates the string and |
| 116 | + * raises an error if the string is invalid. |
| 117 | + */ |
| 118 | +luasandbox_ustr_header *luasandbox_push_ustr(lua_State * L, uint8_t *str, size_t len) |
| 119 | +{ |
| 120 | + luasandbox_ustr_header *header; |
| 121 | + int32_t i, cp_len; |
| 122 | + |
| 123 | + // Validate the string + calculate length |
| 124 | + for( i = cp_len = 0; i < len; cp_len++ ) { |
| 125 | + UChar32 cur; |
| 126 | + |
| 127 | + U8_NEXT( str, i, len, cur ); |
| 128 | + if( cur < 0 ) { |
| 129 | + lua_pushstring( L, "Invalid UTF-8 supplied" ); |
| 130 | + lua_error( L ); |
| 131 | + } |
| 132 | + } |
| 133 | + |
| 134 | + header = luasandbox_init_ustr( L, len ); |
| 135 | + header->cp_len = cp_len; |
| 136 | + memcpy( LUASANDBOX_USTR_RAW(header), str, len ); |
| 137 | + |
| 138 | + return header; |
| 139 | +} |
| 140 | +/* }}} */ |
| 141 | + |
| 142 | +/** {{{ luasandbox_isustr |
| 143 | + * |
| 144 | + * Checks if the the object on the stack is a ustring. |
| 145 | + */ |
| 146 | +int luasandbox_isustr(lua_State * L, int idx) |
| 147 | +{ |
| 148 | + int result; |
| 149 | + |
| 150 | + if( lua_type( L, idx ) != LUA_TUSERDATA ) |
| 151 | + return FALSE; |
| 152 | + |
| 153 | + if( !lua_getmetatable( L, idx ) ) |
| 154 | + return FALSE; |
| 155 | + |
| 156 | + luaL_getmetatable( L, "luasandbox_ustr" ); |
| 157 | + |
| 158 | + result = lua_equal( L, -1, -2 ); |
| 159 | + lua_pop( L, 2 ); |
| 160 | + return result; |
| 161 | +} |
| 162 | +/* }}} */ |
| 163 | + |
| 164 | +/** {{{ luasandbox_checkustring |
| 165 | + * |
| 166 | + * Checks whether the specified object on the stack is a ustring |
| 167 | + * or an object which may be converted to it. Returns the pointer |
| 168 | + * to the ustring's header. |
| 169 | + */ |
| 170 | +luasandbox_ustr_header* luasandbox_checkustring(lua_State * L, int idx) |
| 171 | +{ |
| 172 | + if ( lua_type( L, idx ) == LUA_TSTRING || lua_type( L, idx ) == LUA_TNUMBER ) { |
| 173 | + // A usual string. Magically convert it to ustring. |
| 174 | + lua_checkstack( L, 2 ); |
| 175 | + lua_pushvalue( L, idx ); |
| 176 | + luasandbox_ustr_create(L); |
| 177 | + lua_replace( L, idx ); |
| 178 | + lua_pop( L, 1 ); |
| 179 | + } |
| 180 | + |
| 181 | + return luaL_checkudata( L, idx, "luasandbox_ustr" ); |
| 182 | +} |
| 183 | +/* }}} */ |
| 184 | + |
| 185 | +/** {{{ luasandbox_checkustring |
| 186 | + * |
| 187 | + * Returns the pointer to the string itself and sets raw_len |
| 188 | + * to the length of string in bytes. |
| 189 | + */ |
| 190 | +const uint8_t* luasandbox_getustr(lua_State * L, int idx, size_t* raw_len) |
| 191 | +{ |
| 192 | + luasandbox_ustr_header *header; |
| 193 | + header = luasandbox_checkustring( L, idx ); |
| 194 | + *raw_len = header->raw_len; |
| 195 | + return LUASANDBOX_USTR_RAW(header); |
| 196 | +} |
| 197 | +/* }}} */ |
| 198 | + |
| 199 | +/** {{{ luasandbox_ustr_index_to_offset |
| 200 | + * |
| 201 | + * Converts a Lua index (starting with 1) to a C offset (starting with 0). |
| 202 | + * Handles negative indexes as indexes numbered from the end of the string. |
| 203 | + */ |
| 204 | +int32_t luasandbox_ustr_index_to_offset(lua_State * L, luasandbox_ustr_header *str, int32_t idx, int check_limits) |
| 205 | +{ |
| 206 | + if( !idx || check_limits && (idx > str->cp_len || -idx > str->cp_len) ) { |
| 207 | + lua_pushfstring( L, "Trying to access invalid index %d for string with length %d", idx, str->cp_len ); |
| 208 | + lua_error( L ); |
| 209 | + } |
| 210 | + |
| 211 | + if( idx > 0 ) { |
| 212 | + return idx - 1; |
| 213 | + } else { |
| 214 | + return str->cp_len + idx; |
| 215 | + } |
| 216 | +} |
| 217 | +/* }}} */ |
| 218 | + |
| 219 | +/****************** Conversions ******************/ |
| 220 | + |
| 221 | +/** {{{ luasandbox_convert_toUTF16 |
| 222 | + * |
| 223 | + * Converts the specified ustring to UTF-16, and pushes |
| 224 | + * the resulting UTF-16 string on the top of the stack. |
| 225 | + */ |
| 226 | +void luasandbox_convert_toUTF16(lua_State * L, int idx) |
| 227 | +{ |
| 228 | + luasandbox_ustr_header *header; |
| 229 | + UChar *utf16_string; |
| 230 | + int32_t result_len; |
| 231 | + UErrorCode error_code = U_ZERO_ERROR; |
| 232 | + |
| 233 | + header = luasandbox_checkustring( L, idx ); |
| 234 | + |
| 235 | + utf16_string = emalloc( header->raw_len * 2 ); |
| 236 | + u_strFromUTF8( utf16_string, header->raw_len, &result_len, |
| 237 | + LUASANDBOX_USTR_RAW(header), header->raw_len, &error_code ); |
| 238 | + LUASANDBOX_CHECK_ICU_ERROR( error_code, efree( utf16_string ) ); |
| 239 | + |
| 240 | + lua_pushlstring( L, (char*)utf16_string, result_len * 2 ); |
| 241 | + efree( utf16_string ); |
| 242 | +} |
| 243 | +/* }}} */ |
| 244 | + |
| 245 | +/** {{{ luasandbox_convert_fromUTF16 |
| 246 | + * |
| 247 | + * Converts the specified UTF-16 string to UTF-8, and pushes |
| 248 | + * the resulting ustring on the top of the stack. |
| 249 | + */ |
| 250 | +void luasandbox_convert_fromUTF16(lua_State * L, int idx) |
| 251 | +{ |
| 252 | + luasandbox_ustr_header *header; |
| 253 | + uint8_t *utf8_string; |
| 254 | + UChar *utf16_string; |
| 255 | + size_t orig_len; |
| 256 | + int32_t result_len; |
| 257 | + UErrorCode error_code = U_ZERO_ERROR; |
| 258 | + |
| 259 | + utf16_string = (UChar*) lua_tolstring( L, idx, &orig_len ); |
| 260 | + |
| 261 | + utf8_string = emalloc( orig_len ); |
| 262 | + u_strToUTF8( utf8_string, orig_len, &result_len, |
| 263 | + utf16_string, orig_len / 2, &error_code ); |
| 264 | + LUASANDBOX_CHECK_ICU_ERROR( error_code, efree( utf8_string ) ); |
| 265 | + |
| 266 | + luasandbox_push_ustr( L, utf8_string, result_len ); |
| 267 | + efree( utf8_string ); |
| 268 | +} |
| 269 | +/* }}} */ |
| 270 | + |
| 271 | +/****************** Operators ******************/ |
| 272 | + |
| 273 | +/** {{{ luasandbox_ustr_create |
| 274 | + * |
| 275 | + * Initializes the Unicode string from the string on the top of the stack. |
| 276 | + */ |
| 277 | +int luasandbox_ustr_create(lua_State * L) |
| 278 | +{ |
| 279 | + uint8_t *str; |
| 280 | + size_t raw_len = 0; |
| 281 | + |
| 282 | + str = luaL_checklstring( L, -1, &raw_len ); |
| 283 | + luasandbox_push_ustr( L, str, raw_len ); |
| 284 | + return 1; |
| 285 | +} |
| 286 | +/* }}} */ |
| 287 | + |
| 288 | +/** {{{ luasandbox_ustr_len |
| 289 | + * |
| 290 | + * Lua function providing the length of the string. |
| 291 | + */ |
| 292 | +int luasandbox_ustr_len(lua_State * L) |
| 293 | +{ |
| 294 | + luasandbox_ustr_header *header; |
| 295 | + |
| 296 | + header = luaL_checkudata( L, 1, "luasandbox_ustr" ); |
| 297 | + |
| 298 | + lua_pushinteger( L, header->cp_len ); |
| 299 | + return 1; |
| 300 | +} |
| 301 | +/* }}} */ |
| 302 | + |
| 303 | +/** {{{ luasandbox_ustr_concat |
| 304 | + * |
| 305 | + * Lua function handling the concatention operator. |
| 306 | + */ |
| 307 | +int luasandbox_ustr_concat(lua_State * L) |
| 308 | +{ |
| 309 | + luasandbox_ustr_header *s1, *s2, *newhdr; |
| 310 | + int32_t new_len; |
| 311 | + void* newstr; |
| 312 | + |
| 313 | + s1 = luasandbox_checkustring( L, 1 ); |
| 314 | + s2 = luasandbox_checkustring( L, 2 ); |
| 315 | + |
| 316 | + new_len = s1->raw_len + s2->raw_len; |
| 317 | + newhdr = luasandbox_init_ustr( L, new_len ); |
| 318 | + newhdr->cp_len = s1->cp_len + s2->cp_len; |
| 319 | + newstr = LUASANDBOX_USTR_RAW(newhdr); |
| 320 | + memcpy( newstr, LUASANDBOX_USTR_RAW(s1), s1->raw_len ); |
| 321 | + memcpy( newstr + s1->raw_len, LUASANDBOX_USTR_RAW(s2), s2->raw_len ); |
| 322 | + |
| 323 | + return 1; |
| 324 | +} |
| 325 | +/* }}} */ |
| 326 | + |
| 327 | +/** {{{ luasandbox_ustr_eq |
| 328 | + * |
| 329 | + * Lua function providing the equality operator. |
| 330 | + */ |
| 331 | +int luasandbox_ustr_eq(lua_State * L) |
| 332 | +{ |
| 333 | + luasandbox_ustr_header *s1, *s2; |
| 334 | + |
| 335 | + s1 = luasandbox_checkustring( L, 1 ); |
| 336 | + s2 = luasandbox_checkustring( L, 2 ); |
| 337 | + |
| 338 | + if( s1->cp_len != s2->cp_len || s1->raw_len != s2->raw_len ) { |
| 339 | + lua_pushboolean( L, FALSE ); |
| 340 | + return 1; |
| 341 | + } |
| 342 | + |
| 343 | + lua_pushboolean( L, !memcmp( LUASANDBOX_USTR_RAW(s1), LUASANDBOX_USTR_RAW(s2), s1->raw_len ) ); |
| 344 | + return 1; |
| 345 | +} |
| 346 | +/* }}} */ |
| 347 | + |
| 348 | +/** {{{ luasandbox_ustr_index |
| 349 | + * |
| 350 | + * Lua function providing the index operator. |
| 351 | + * Provides access both to class methods and |
| 352 | + * per-position access to string characters. |
| 353 | + */ |
| 354 | +int luasandbox_ustr_index(lua_State * L) |
| 355 | +{ |
| 356 | + luasandbox_ustr_header *str; |
| 357 | + uint8_t *raw; |
| 358 | + |
| 359 | + str = luaL_checkudata( L, 1, "luasandbox_ustr" ); |
| 360 | + raw = LUASANDBOX_USTR_RAW(str); |
| 361 | + |
| 362 | + if( lua_type( L, 2 ) == LUA_TNUMBER ) { |
| 363 | + // If it is a number, treat as accessing string by position |
| 364 | + int32_t i, idx, curidx, offset; |
| 365 | + uint8_t* result_pos; |
| 366 | + UChar32 cur, result; |
| 367 | + |
| 368 | + idx = lua_tointeger( L, 2 ); |
| 369 | + offset = luasandbox_ustr_index_to_offset( L, str, idx, TRUE ); |
| 370 | + |
| 371 | + for( i = curidx = 0; ; curidx++ ) { |
| 372 | + UChar32 tmp; |
| 373 | + |
| 374 | + U8_GET_UNSAFE( raw, i, result ); |
| 375 | + if( curidx == offset ) { |
| 376 | + result_pos = raw + i; |
| 377 | + break; |
| 378 | + } |
| 379 | + U8_NEXT_UNSAFE( raw, i, tmp ); |
| 380 | + } |
| 381 | + |
| 382 | + lua_pushlstring( L, result_pos, U8_LENGTH( result ) ); |
| 383 | + return 1; |
| 384 | + } else { |
| 385 | + // Otherwise treat it as an access to member functions |
| 386 | + lua_getglobal( L, "ustring" ); |
| 387 | + lua_pushvalue( L, 2 ); |
| 388 | + lua_gettable( L, -2 ); |
| 389 | + return 1; |
| 390 | + } |
| 391 | +} |
| 392 | +/* }}} */ |
| 393 | + |
| 394 | +/****************** Library ******************/ |
| 395 | + |
| 396 | +/** {{{ luasandbox_ustr_ucfirst |
| 397 | + * |
| 398 | + * Lua function: |
| 399 | + * ustring ucfirst( ustring str ) |
| 400 | + * Converts the first code point of str to upper case. |
| 401 | + */ |
| 402 | +int luasandbox_ustr_ucfirst(lua_State * L) |
| 403 | +{ |
| 404 | + luasandbox_ustr_header *header; |
| 405 | + uint8_t *utf_string; |
| 406 | + size_t raw_len; |
| 407 | + UChar32 first, newfirst; |
| 408 | + int offset = 0; |
| 409 | + |
| 410 | + header = luaL_checkudata( L, 1, "luasandbox_ustr" ); |
| 411 | + utf_string = LUASANDBOX_USTR_RAW( header ); |
| 412 | + raw_len = header->raw_len; |
| 413 | + |
| 414 | + if( !raw_len ) { |
| 415 | + lua_pushstring( L, "" ); |
| 416 | + return 1; |
| 417 | + } |
| 418 | + |
| 419 | + U8_GET_UNSAFE( utf_string, 0, first ); |
| 420 | + |
| 421 | + newfirst = u_toupper( first ); |
| 422 | + |
| 423 | + // The actions depend upon whether the lengths of symbol match |
| 424 | + if( U8_LENGTH(first) == U8_LENGTH(newfirst) ) { |
| 425 | + // Just replace the symbol |
| 426 | + luasandbox_ustr_header *newstr; |
| 427 | + uint8_t *result; |
| 428 | + |
| 429 | + newstr = lua_newuserdata( L, LUASANDBOX_USTR_TOTALLEN(header) ); |
| 430 | + luaL_getmetatable( L, "luasandbox_ustr" ); |
| 431 | + lua_setmetatable( L, -2 ); |
| 432 | + |
| 433 | + memcpy( newstr, header, LUASANDBOX_USTR_TOTALLEN(header) ); |
| 434 | + result = LUASANDBOX_USTR_RAW(newstr); |
| 435 | + U8_APPEND_UNSAFE( result, offset, newfirst ); |
| 436 | + } else { |
| 437 | + // I have tested this code in cases when len(old) < len(new), |
| 438 | + // but I am unaware of any cases when those lengths do not match. |
| 439 | + // It should have happened with eszett, but since capital eszett is |
| 440 | + // considered substandard, u_toupper does not convert it. |
| 441 | + size_t oldlen = U8_LENGTH(first), |
| 442 | + newlen = U8_LENGTH(newfirst); |
| 443 | + size_t delta = newlen - oldlen; |
| 444 | + |
| 445 | + uint8_t *result; |
| 446 | + size_t new_len; |
| 447 | + |
| 448 | + result = emalloc( raw_len + delta ); |
| 449 | + memcpy( result + newlen, utf_string + oldlen, raw_len - oldlen ); |
| 450 | + U8_APPEND_UNSAFE( result, offset, newfirst ); |
| 451 | + new_len = raw_len + delta; |
| 452 | + |
| 453 | + luasandbox_push_ustr( L, result, new_len ); |
| 454 | + efree( result ); |
| 455 | + } |
| 456 | + |
| 457 | + return 1; |
| 458 | +} |
| 459 | +/* }}} */ |
| 460 | + |
| 461 | +#define LUASANDBOX_UTF8_CHANGE_CASE_TOUPPER 1 |
| 462 | +#define LUASANDBOX_UTF8_CHANGE_CASE_TOLOWER 2 |
| 463 | +#define LUASANDBOX_UTF8_CHANGE_CASE_TOTITLE 3 |
| 464 | + |
| 465 | +/** {{{ luasandbox_ustr_change_case |
| 466 | + * |
| 467 | + * Backend function for uc(), lc() and tc(). Converts string into UTF-16, |
| 468 | + * passes it to ICU function and then converts back to UTF-8. This is required |
| 469 | + * since casing algorithms are rather non-trivial and may be even locale-dependant. |
| 470 | + */ |
| 471 | +static int luasandbox_ustr_change_case(lua_State * L, int action) |
| 472 | +{ |
| 473 | + UChar *utf16_orig, *utf16_result; |
| 474 | + size_t orig_length, x; |
| 475 | + int32_t result_len; |
| 476 | + UErrorCode errorCode = U_ZERO_ERROR; |
| 477 | + |
| 478 | + luasandbox_convert_toUTF16( L, 1 ); |
| 479 | + utf16_orig = (UChar*)lua_tolstring( L, -1, &orig_length ); |
| 480 | + |
| 481 | + utf16_result = emalloc( orig_length * 2 ); |
| 482 | + switch( action ) { |
| 483 | + case LUASANDBOX_UTF8_CHANGE_CASE_TOUPPER: |
| 484 | + result_len = u_strToUpper( utf16_result, orig_length, utf16_orig, orig_length / 2, "", &errorCode ); |
| 485 | + break; |
| 486 | + case LUASANDBOX_UTF8_CHANGE_CASE_TOLOWER: |
| 487 | + result_len = u_strToLower( utf16_result, orig_length, utf16_orig, orig_length / 2, "", &errorCode ); |
| 488 | + break; |
| 489 | + case LUASANDBOX_UTF8_CHANGE_CASE_TOTITLE: |
| 490 | + result_len = u_strToTitle( utf16_result, orig_length, utf16_orig, orig_length / 2, NULL, "", &errorCode ); |
| 491 | + break; |
| 492 | + } |
| 493 | + LUASANDBOX_CHECK_ICU_ERROR( errorCode, efree(utf16_result) ); |
| 494 | + lua_pop( L, 1 ); // Pop UTF-16 string out of the stack |
| 495 | + |
| 496 | + // Back to UTF-8 |
| 497 | + lua_pushlstring( L, utf16_result, result_len * 2 ); |
| 498 | + luasandbox_convert_fromUTF16( L, -1 ); |
| 499 | + lua_replace( L, -2 ); |
| 500 | + efree( utf16_result ); |
| 501 | + |
| 502 | + return 1; |
| 503 | +} |
| 504 | +/* }}} */ |
| 505 | + |
| 506 | +int luasandbox_ustr_uc(lua_State * L) |
| 507 | +{ |
| 508 | + luasandbox_ustr_change_case( L, LUASANDBOX_UTF8_CHANGE_CASE_TOUPPER ); |
| 509 | +} |
| 510 | + |
| 511 | +int luasandbox_ustr_lc(lua_State * L) |
| 512 | +{ |
| 513 | + luasandbox_ustr_change_case( L, LUASANDBOX_UTF8_CHANGE_CASE_TOLOWER ); |
| 514 | +} |
| 515 | + |
| 516 | +int luasandbox_ustr_tc(lua_State * L) |
| 517 | +{ |
| 518 | + luasandbox_ustr_change_case( L, LUASANDBOX_UTF8_CHANGE_CASE_TOTITLE ); |
| 519 | +} |
| 520 | + |
| 521 | +/** {{{ luasandbox_utf8_trim_lua |
| 522 | + * |
| 523 | + * Lua function: |
| 524 | + * ustring trim( ustring str ) |
| 525 | + * Removes all the whitespace from the beginning and end of the string. |
| 526 | + */ |
| 527 | +int luasandbox_ustr_trim(lua_State * L) |
| 528 | +{ |
| 529 | + luasandbox_ustr_header *header, *newheader; |
| 530 | + uint8_t *utf_string, *result; |
| 531 | + size_t new_len; |
| 532 | + UChar32 cur; |
| 533 | + uint32_t i = 0, ltrim_len = 0, rtrim_len = 0, ltrim_len_cp = 0, rtrim_len_cp = 0; |
| 534 | + |
| 535 | + header = luasandbox_checkustring( L, 1 ); |
| 536 | + utf_string = LUASANDBOX_USTR_RAW(header); |
| 537 | + |
| 538 | + // Left side |
| 539 | + while( i < header->raw_len ) { |
| 540 | + U8_NEXT_UNSAFE( utf_string, i, cur ); |
| 541 | + |
| 542 | + if( u_isWhitespace( cur ) || u_isUWhiteSpace( cur ) ) { |
| 543 | + ltrim_len = i; |
| 544 | + ltrim_len_cp++; |
| 545 | + } else { |
| 546 | + break; |
| 547 | + } |
| 548 | + } |
| 549 | + // Right side |
| 550 | + while( i < header->raw_len ) { |
| 551 | + U8_NEXT_UNSAFE( utf_string, i, cur ); |
| 552 | + |
| 553 | + if( u_isWhitespace( cur ) || u_isUWhiteSpace( cur ) ) { |
| 554 | + rtrim_len += U8_LENGTH( cur ); |
| 555 | + rtrim_len_cp++; |
| 556 | + } else { |
| 557 | + rtrim_len = 0; |
| 558 | + rtrim_len_cp = 0; |
| 559 | + } |
| 560 | + } |
| 561 | + |
| 562 | + new_len = header->raw_len - ltrim_len - rtrim_len; |
| 563 | + newheader = luasandbox_init_ustr( L, new_len ); |
| 564 | + newheader->cp_len = header->cp_len - ltrim_len_cp - rtrim_len_cp; |
| 565 | + memcpy( LUASANDBOX_USTR_RAW(newheader), utf_string + ltrim_len, new_len ); |
| 566 | + |
| 567 | + return 1; |
| 568 | +} |
| 569 | +/* }}} */ |
| 570 | + |
| 571 | +/** {{{ luasandbox_ustr_sub |
| 572 | + * |
| 573 | + * Lua function: |
| 574 | + * ustring sub( ustring str, int offset[, int length] ) |
| 575 | + * Returns the substring of str. Starts from the offset, |
| 576 | + * and returns at most length code points. |
| 577 | + */ |
| 578 | +int luasandbox_ustr_sub(lua_State * L) |
| 579 | +{ |
| 580 | + luasandbox_ustr_header *header; |
| 581 | + uint8_t *utf_string, *result; |
| 582 | + size_t len; |
| 583 | + |
| 584 | + int32_t i = 0, idx = 0, target = 0, target_len; |
| 585 | + int32_t target_start, target_end = -1; |
| 586 | + int found = 0; |
| 587 | + UChar32 cur; |
| 588 | + |
| 589 | + header = luasandbox_checkustring( L, 1 ); |
| 590 | + utf_string = LUASANDBOX_USTR_RAW(header); |
| 591 | + target = luaL_checkinteger( L, 2 ); |
| 592 | + if( lua_type( L, 3 ) == LUA_TNUMBER ) { |
| 593 | + target_len = lua_tointeger( L, 3 ); |
| 594 | + } else { |
| 595 | + target_len = -1; |
| 596 | + } |
| 597 | + |
| 598 | + target = luasandbox_ustr_index_to_offset( L, header, target, TRUE ); |
| 599 | + |
| 600 | + // Find the start symbol |
| 601 | + while( i < header->raw_len ) { |
| 602 | + if( idx == target ) { |
| 603 | + found = TRUE; |
| 604 | + break; |
| 605 | + } |
| 606 | + |
| 607 | + U8_NEXT_UNSAFE( utf_string, i, cur ); |
| 608 | + idx++; |
| 609 | + } |
| 610 | + |
| 611 | + // If start symbol index is larger than string size, return null |
| 612 | + if( !found ) { |
| 613 | + lua_pushstring( L, "" ); |
| 614 | + return 1; |
| 615 | + } |
| 616 | + |
| 617 | + target_start = i; |
| 618 | + idx = 0; |
| 619 | + |
| 620 | + // Find the end position |
| 621 | + while( i < header->raw_len ) { |
| 622 | + if( idx == target_len ) { |
| 623 | + target_end = i; |
| 624 | + break; |
| 625 | + } |
| 626 | + |
| 627 | + U8_NEXT_UNSAFE( utf_string, i, cur ); |
| 628 | + idx++; |
| 629 | + } |
| 630 | + |
| 631 | + if( target_end == -1 ) { |
| 632 | + target_end = header->raw_len; |
| 633 | + } |
| 634 | + |
| 635 | + luasandbox_push_ustr( L, utf_string + target_start, target_end - target_start ); |
| 636 | + return 1; |
| 637 | +} |
| 638 | +/* }}} */ |
| 639 | + |
| 640 | +/****************** Substring search and related operators. Beware. ******************/ |
| 641 | + |
| 642 | +typedef struct { |
| 643 | + UChar32* string; // UTF-32 representation of the needle string |
| 644 | + int32_t* table; // KMP table |
| 645 | + int32_t length; // Length of the needle string in code points |
| 646 | + int32_t raw_length; // Length of the needle string in UTF-8 bytes |
| 647 | + int singleCharMode; // Whether the needle string is a single character |
| 648 | +} ustr_needle_string; |
| 649 | + |
/* Values of ustr_search_result.status */
#define UTF8_SEARCH_STATUS_FOUND 1
#define UTF8_SEARCH_STATUS_NOTFOUND 0

/* Outcome of a single substring search. */
typedef struct {
	/* Status of the search (UTF8_SEARCH_STATUS_*) */
	int32_t status;
	/* Index of the match in bytes */
	int32_t raw_index;
	/* Index of the match in code points */
	int32_t cp_index;
} ustr_search_result;
| 658 | + |
| 659 | +/** {{{ luasandbox_ustr_search_prepare |
| 660 | + * |
| 661 | + * Preprocesses the string so a search may be performed on it using KMP algorithm. |
| 662 | + */ |
| 663 | +static ustr_needle_string* luasandbox_ustr_search_prepare(uint8_t* utf_string, int32_t raw_len) |
| 664 | +{ |
| 665 | + ustr_needle_string* str; |
| 666 | + int32_t i, idx; |
| 667 | + UChar32 cur; |
| 668 | + UErrorCode errorCode = U_ZERO_ERROR; |
| 669 | + int32_t cnd = 0; |
| 670 | + |
| 671 | + // Here we use the worst-case allocation |
| 672 | + str = emalloc( sizeof( ustr_needle_string ) ); |
| 673 | + memset( str, 0, sizeof( ustr_needle_string ) ); |
| 674 | + str->string = emalloc( raw_len * 4 ); |
| 675 | + str->raw_length = raw_len; |
| 676 | + |
| 677 | + // Convert UTF-8 to UTF-32 for search purposes |
| 678 | + for( i = idx = 0; i < raw_len; idx++ ) { |
| 679 | + U8_NEXT_UNSAFE( utf_string, i, cur ); |
| 680 | + str->string[idx] = cur; |
| 681 | + } |
| 682 | + str->length = idx; |
| 683 | + |
| 684 | + // KMP cannot handle single character search |
| 685 | + // (or it can, but my implementation cannot) |
| 686 | + // Use special case handler |
| 687 | + str->singleCharMode = str->length == 1; |
| 688 | + if( str->singleCharMode ) |
| 689 | + return str; |
| 690 | + |
| 691 | + // Fill the search prefix table |
| 692 | + str->table = emalloc( str->length * sizeof(int32_t) ); |
| 693 | + str->table[0] = -1; // Yes, UChar32 is a signed type. "U" is for "Unicode", not for "unsigned" |
| 694 | + str->table[1] = 0; |
| 695 | + for( i = 2; i < str->length; i++ ) { |
| 696 | + if( str->string[i - 1] == str->string[cnd] ) { |
| 697 | + cnd++; |
| 698 | + str->table[i] = cnd; |
| 699 | + } else if( cnd > 0 ) { |
| 700 | + cnd = str->table[cnd]; |
| 701 | + i--; |
| 702 | + } else { |
| 703 | + str->table[i] = 0; |
| 704 | + } |
| 705 | + } |
| 706 | + |
| 707 | + return str; |
| 708 | +} |
| 709 | + |
| 710 | +/** {{{ luasandbox_ustr_search_free |
| 711 | + * |
| 712 | + * Frees the memory allocated for the preprocessed needle string. |
| 713 | + */ |
| 714 | +void luasandbox_ustr_search_free(ustr_needle_string *needle) |
| 715 | +{ |
| 716 | + if( needle->table ) |
| 717 | + efree( needle->table ); |
| 718 | + efree( needle->string ); |
| 719 | + efree( needle ); |
| 720 | +} |
| 721 | + |
| 722 | +#define UTF8_SEARCH_OFFSET_NONE 0 |
| 723 | +#define UTF8_SEARCH_OFFSET_RAW 1 |
| 724 | +#define UTF8_SEARCH_OFFSET_CP 2 |
| 725 | + |
| 726 | +/** {{{ luasandbox_ustr_search |
| 727 | + * |
| 728 | + * Performs search of a substring in a string using the Knuth-Morris-Pratt algorithm. |
| 729 | + * Allows different types of start offset. The needle string must be preprocessed. |
| 730 | + */ |
| 731 | +ustr_search_result luasandbox_ustr_search(uint8_t *haystack, int32_t haystack_len, int offset_type, int offset, ustr_needle_string* needle) { |
| 732 | + int i, j, idx; // Raw offset in haystack, CP offset in needle, CP offset in haystack |
| 733 | + UChar32 cur; |
| 734 | + ustr_search_result result; |
| 735 | + |
| 736 | + // Defaults |
| 737 | + result.raw_index = -1; |
| 738 | + result.cp_index = -1; |
| 739 | + |
| 740 | + // If we are given raw offset, start with it |
| 741 | + if( offset_type == UTF8_SEARCH_OFFSET_RAW ) { |
| 742 | + i = offset; |
| 743 | + } else { |
| 744 | + i = 0; |
| 745 | + } |
| 746 | + |
| 747 | + if( needle->singleCharMode ) { |
| 748 | + // Handle special case of single character |
| 749 | + for( idx = 0; i < haystack_len; idx++ ) { |
| 750 | + U8_NEXT_UNSAFE( haystack, i, cur ); |
| 751 | + |
| 752 | + if( offset_type == UTF8_SEARCH_OFFSET_CP && idx < offset ) |
| 753 | + continue; |
| 754 | + |
| 755 | + if( needle->string[0] == cur ) { |
| 756 | + result.status = UTF8_SEARCH_STATUS_FOUND; |
| 757 | + result.cp_index = idx; |
| 758 | + result.raw_index = i - needle->raw_length; |
| 759 | + return result; |
| 760 | + } |
| 761 | + } |
| 762 | + } else { |
| 763 | + // Otherwise use KMP search |
| 764 | + for( j = idx = 0; i < haystack_len; idx++ ) { |
| 765 | + U8_NEXT_UNSAFE( haystack, i, cur ); |
| 766 | + |
| 767 | + if( offset_type == UTF8_SEARCH_OFFSET_CP && idx < offset ) |
| 768 | + continue; |
| 769 | + |
| 770 | + while( j > 0 && needle->string[j] != cur ) { |
| 771 | + j = needle->table[j]; |
| 772 | + } |
| 773 | + if( needle->string[j] == cur ) |
| 774 | + j++; |
| 775 | + if( j == needle->length ) { |
| 776 | + result.status = UTF8_SEARCH_STATUS_FOUND; |
| 777 | + result.cp_index = (idx+1) - needle->length; |
| 778 | + result.raw_index = i - needle->raw_length; |
| 779 | + return result; |
| 780 | + } |
| 781 | + } |
| 782 | + } |
| 783 | + |
| 784 | + result.status = UTF8_SEARCH_STATUS_NOTFOUND; |
| 785 | + return result; |
| 786 | +} |
| 787 | +/* }}} */ |
| 788 | + |
| 789 | +/** {{{ luasandbox_ustr_pos |
| 790 | + * |
| 791 | + * Lua function |
| 792 | + * int pos( ustring haystack, ustring needle[, int offset] ) |
| 793 | + * Searches for a substring in a string. Returns an offset |
| 794 | + * according to Lua conventions (starting with 1). |
| 795 | + */ |
| 796 | +int luasandbox_ustr_pos(lua_State * L) |
| 797 | +{ |
| 798 | + luasandbox_ustr_header *header_haystack, *header_needle; |
| 799 | + uint8_t *haystack, *needle_raw; |
| 800 | + ustr_needle_string *needle; |
| 801 | + int32_t offset; |
| 802 | + ustr_search_result result; |
| 803 | + |
| 804 | + header_haystack = luasandbox_checkustring( L, 1 ); |
| 805 | + header_needle = luasandbox_checkustring( L, 2 ); |
| 806 | + |
| 807 | + haystack = LUASANDBOX_USTR_RAW(header_haystack); |
| 808 | + needle_raw = LUASANDBOX_USTR_RAW(header_needle); |
| 809 | + if( lua_type( L, 3 ) == LUA_TNUMBER ) { |
| 810 | + offset = lua_tointeger( L, 3 ); |
| 811 | + } else { |
| 812 | + offset = 1; |
| 813 | + } |
| 814 | + |
| 815 | + offset = luasandbox_ustr_index_to_offset( L, header_haystack, offset, TRUE ); |
| 816 | + |
| 817 | + if( !header_needle->raw_len ) { |
| 818 | + lua_pushstring( L, "The needle parameter may not be empty" ); |
| 819 | + lua_error( L ); |
| 820 | + } |
| 821 | + |
| 822 | + needle = luasandbox_ustr_search_prepare( needle_raw, header_needle->raw_len ); |
| 823 | + |
| 824 | + result = luasandbox_ustr_search( haystack, header_haystack->raw_len, UTF8_SEARCH_OFFSET_CP, offset, needle ); |
| 825 | + luasandbox_ustr_search_free( needle ); |
| 826 | + |
| 827 | + switch( result.status ) { |
| 828 | + case UTF8_SEARCH_STATUS_FOUND: |
| 829 | + lua_pushinteger( L, result.cp_index + 1 ); |
| 830 | + return 1; |
| 831 | + case UTF8_SEARCH_STATUS_NOTFOUND: |
| 832 | + lua_pushinteger( L, -1 ); |
| 833 | + return 1; |
| 834 | + } |
| 835 | +} |
| 836 | +/* }}} */ |
| 837 | + |
| 838 | +/** {{{ luasandbox_ustr_replace |
| 839 | + * |
| 840 | + * Lua function: |
| 841 | + * replace( ustring haystack, ustring needle, ustring replacement[, int offset[, int limit]] ) |
| 842 | + * Replaces at most limit occurances of needle in haystack with replacement, |
| 843 | + * starting at offset. |
| 844 | + */ |
| 845 | +int luasandbox_ustr_replace(lua_State * L) |
| 846 | +{ |
| 847 | + luasandbox_ustr_header *header_haystack, *header_needle, *header_replacement, *header_result; |
| 848 | + uint8_t *haystack, *needle_raw, *replacement, *result; |
| 849 | + size_t haystack_len, needle_len, replacement_len, result_len; |
| 850 | + ustr_needle_string *needle; |
| 851 | + ustr_search_result cur; |
| 852 | + int32_t i, offset, offset_src, offset_dest, matches_num, limit; |
| 853 | + int32_t *matches; |
| 854 | + int offset_mode; |
| 855 | + |
| 856 | + header_haystack = luasandbox_checkustring( L, 1 ); |
| 857 | + header_needle = luasandbox_checkustring( L, 2 ); |
| 858 | + header_replacement = luasandbox_checkustring( L, 3 ); |
| 859 | + |
| 860 | + haystack = LUASANDBOX_USTR_RAW(header_haystack); |
| 861 | + haystack_len = header_haystack->raw_len; |
| 862 | + needle_raw = LUASANDBOX_USTR_RAW(header_needle); |
| 863 | + needle_len = header_needle->raw_len; |
| 864 | + replacement = LUASANDBOX_USTR_RAW(header_replacement); |
| 865 | + replacement_len = header_replacement->raw_len; |
| 866 | + |
| 867 | + if( lua_type( L, 4 ) == LUA_TNUMBER ) { |
| 868 | + offset = lua_tointeger( L, 4 ); |
| 869 | + offset = luasandbox_ustr_index_to_offset( L, header_haystack, offset, TRUE ); |
| 870 | + offset_mode = UTF8_SEARCH_OFFSET_CP; |
| 871 | + } else { |
| 872 | + offset = 0; |
| 873 | + offset_mode = UTF8_SEARCH_OFFSET_RAW; |
| 874 | + } |
| 875 | + limit = ( lua_type( L, 5 ) == LUA_TNUMBER ) ? |
| 876 | + luaL_checkinteger( L, 5 ) : |
| 877 | + -1; |
| 878 | + |
| 879 | + if( !needle_len ) { |
| 880 | + lua_pushstring( L, "The needle parameter may not be empty" ); |
| 881 | + lua_error( L ); |
| 882 | + } |
| 883 | + |
| 884 | + needle = luasandbox_ustr_search_prepare( needle_raw, needle_len ); |
| 885 | + |
| 886 | + // As usually, just use worst-case scenario for memory allocation |
| 887 | + matches = emalloc( ( haystack_len / needle_len + 1 ) * sizeof(int32_t) ); |
| 888 | + |
| 889 | + // Find all substrings to repalce |
| 890 | + matches_num = 0; |
| 891 | + for(;;) { |
| 892 | + if( limit > 0 && matches_num >= limit ) { |
| 893 | + break; |
| 894 | + } |
| 895 | + |
| 896 | + cur = luasandbox_ustr_search( haystack, haystack_len, offset_mode, offset, needle ); |
| 897 | + |
| 898 | + if( cur.status == UTF8_SEARCH_STATUS_FOUND ) { |
| 899 | + matches[matches_num] = cur.raw_index; |
| 900 | + matches_num++; |
| 901 | + offset = cur.raw_index + needle->raw_length; |
| 902 | + offset_mode = UTF8_SEARCH_OFFSET_RAW; |
| 903 | + } else { |
| 904 | + break; |
| 905 | + } |
| 906 | + } |
| 907 | + luasandbox_ustr_search_free( needle ); |
| 908 | + |
| 909 | + if( !matches_num ) { |
| 910 | + lua_pushvalue( L, 1 ); |
| 911 | + return 1; |
| 912 | + } |
| 913 | + |
| 914 | + // Initialize the resulting string |
| 915 | + result_len = haystack_len + ( replacement_len - needle_len ) * matches_num; |
| 916 | + header_result = luasandbox_init_ustr( L, result_len ); |
| 917 | + header_result->cp_len = header_haystack->cp_len + |
| 918 | + ( header_replacement->raw_len - header_needle->raw_len ) * matches_num; |
| 919 | + result = LUASANDBOX_USTR_RAW(header_result); |
| 920 | + |
| 921 | + // Replace all substrings |
| 922 | + memcpy( result, haystack, matches[i] ); |
| 923 | + offset_src = offset_dest = matches[i]; |
| 924 | + for( i = 0; i < matches_num; i++ ) { |
| 925 | + int32_t postfix_len; |
| 926 | + |
| 927 | + memcpy( result + offset_dest, replacement, replacement_len ); |
| 928 | + offset_src += needle_len; |
| 929 | + offset_dest += replacement_len; |
| 930 | + |
| 931 | + if( i == matches_num - 1 ) { |
| 932 | + postfix_len = haystack_len - offset_src; |
| 933 | + } else { |
| 934 | + postfix_len = matches[i+1] - offset_src; |
| 935 | + } |
| 936 | + |
| 937 | + memcpy( result + offset_dest, haystack + offset_src, postfix_len ); |
| 938 | + offset_src += postfix_len; |
| 939 | + offset_dest += postfix_len; |
| 940 | + } |
| 941 | + |
| 942 | + efree( matches ); |
| 943 | + |
| 944 | + return 1; |
| 945 | +} |
| 946 | +/* }}} */ |
| 947 | + |
| 948 | +/** {{{ luasandbox_ustr_split |
| 949 | + * |
| 950 | + * Lua function: |
| 951 | + * split( ustring haystack, ustring separator[, int limit] ) |
| 952 | + * |
| 953 | + */ |
| 954 | +int luasandbox_ustr_split(lua_State * L) |
| 955 | +{ |
| 956 | + luasandbox_ustr_header *header_haystack, *header_needle; |
| 957 | + uint8_t *haystack, *needle_raw; |
| 958 | + size_t haystack_len, needle_len; |
| 959 | + ustr_needle_string *needle; |
| 960 | + ustr_search_result cur; |
| 961 | + int32_t i, offset, matches_num, limit; |
| 962 | + int32_t *matches; |
| 963 | + |
| 964 | + header_haystack = luasandbox_checkustring( L, 1 ); |
| 965 | + header_needle = luasandbox_checkustring( L, 2 ); |
| 966 | + |
| 967 | + haystack = LUASANDBOX_USTR_RAW(header_haystack); |
| 968 | + needle_raw = LUASANDBOX_USTR_RAW(header_needle); |
| 969 | + haystack_len = header_haystack->raw_len; |
| 970 | + needle_len = header_needle->raw_len; |
| 971 | + |
| 972 | + limit = ( lua_tointeger( L, 3 ) == LUA_TNUMBER ) ? |
| 973 | + luaL_checkinteger( L, 3 ) : |
| 974 | + -1; |
| 975 | + |
| 976 | + if( !needle_len ) { |
| 977 | + lua_pushstring( L, "The needle parameter may not be empty" ); |
| 978 | + lua_error( L ); |
| 979 | + } |
| 980 | + |
| 981 | + needle = luasandbox_ustr_search_prepare( needle_raw, needle_len ); |
| 982 | + if( !needle ) { |
| 983 | + LUASANDBOX_UNICODE_INVALID_FAIL(); |
| 984 | + } |
| 985 | + |
| 986 | + // As usually, just use worst-case scenario for memory allocation |
| 987 | + matches = emalloc( ( haystack_len / needle_len + 1 ) * sizeof(int32_t) ); |
| 988 | + |
| 989 | + // Find all substrings to split |
| 990 | + matches_num = 0; |
| 991 | + offset = 0; |
| 992 | + for(;;) { |
| 993 | + if( limit > 0 && matches_num >= limit ) { |
| 994 | + break; |
| 995 | + } |
| 996 | + |
| 997 | + cur = luasandbox_ustr_search( haystack, haystack_len, UTF8_SEARCH_OFFSET_RAW, offset, needle ); |
| 998 | + |
| 999 | + if( cur.status == UTF8_SEARCH_STATUS_FOUND ) { |
| 1000 | + matches[matches_num] = cur.raw_index; |
| 1001 | + matches_num++; |
| 1002 | + offset = cur.raw_index + needle->raw_length; |
| 1003 | + } else { |
| 1004 | + break; |
| 1005 | + } |
| 1006 | + } |
| 1007 | + luasandbox_ustr_search_free( needle ); |
| 1008 | + |
| 1009 | + lua_createtable( L, matches_num + 1, 0 ); |
| 1010 | + |
| 1011 | + if( !matches_num ) { |
| 1012 | + lua_pushlstring( L, haystack, haystack_len ); |
| 1013 | + lua_rawseti( L, -2, 1 ); |
| 1014 | + return 1; |
| 1015 | + } |
| 1016 | + |
| 1017 | + // Push all matches into the table |
| 1018 | + lua_pushlstring( L, haystack, matches[0] ); |
| 1019 | + lua_rawseti( L, -2, 1 ); |
| 1020 | + offset = matches[0]; |
| 1021 | + for( i = 0; i < matches_num; i++ ) { |
| 1022 | + int32_t bit_len; |
| 1023 | + |
| 1024 | + offset += needle_len; |
| 1025 | + |
| 1026 | + if( i == matches_num - 1 ) { |
| 1027 | + bit_len = haystack_len - offset; |
| 1028 | + } else { |
| 1029 | + bit_len = matches[i+1] - offset; |
| 1030 | + } |
| 1031 | + |
| 1032 | + lua_pushlstring( L, haystack + offset, bit_len ); |
| 1033 | + lua_rawseti( L, -2, i + 2 ); |
| 1034 | + offset += bit_len; |
| 1035 | + } |
| 1036 | + |
| 1037 | + return 1; |
| 1038 | +} |
| 1039 | +/* }}} */ |
Property changes on: trunk/php/luasandbox/ustring.c |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 1040 | + native |
Index: trunk/php/luasandbox/m4/ac_check_icu.m4 |
— | — | @@ -0,0 +1,62 @@ |
| 2 | +dnl @synopsis AC_CHECK_ICU(version, action-if, action-if-not) |
| 3 | +dnl |
| 4 | +dnl @summary check for ICU of sufficient version by looking at icu-config |
| 5 | +dnl |
| 6 | +dnl Defines ICU_LIBS, ICU_CFLAGS, ICU_CXXFLAGS. See icu-config(1) man |
| 7 | +dnl page. |
| 8 | +dnl |
| 9 | +dnl @category InstalledPackages |
| 10 | +dnl @author Akos Maroy <darkeye@tyrell.hu> |
| 11 | +dnl @version 2005-09-20 |
| 12 | +dnl @license AllPermissive |
| 13 | + |
dnl AC_CHECK_ICU: locate icu-config, compare the version it reports against
dnl the requested minimum, and on success fill ICU_CFLAGS, ICU_CXXFLAGS and
dnl ICU_LIBS from its output. The second argument runs on success, the third
dnl on failure; without a third argument a failure aborts configure.
AC_DEFUN([AC_CHECK_ICU], [
  succeeded=no

  dnl Honour a user-supplied ICU_CONFIG, otherwise search the PATH
  if test -z "$ICU_CONFIG"; then
    AC_PATH_PROG(ICU_CONFIG, icu-config, no)
  fi

  if test "$ICU_CONFIG" = "no" ; then
    echo "*** The icu-config script could not be found. Make sure it is"
    echo "*** in your path, and that ICU is properly installed."
    echo "*** Or see http://ibm.com/software/globalization/icu/"
  else
    ICU_VERSION=`$ICU_CONFIG --version`
    AC_MSG_CHECKING(for ICU >= $1)
    dnl Quoted so that an empty version string yields 0 instead of an
    dnl expr syntax error
    VERSION_CHECK=`expr "$ICU_VERSION" \>\= $1`
    if test "$VERSION_CHECK" = "1" ; then
      AC_MSG_RESULT(yes)
      succeeded=yes

      AC_MSG_CHECKING(ICU_CFLAGS)
      ICU_CFLAGS=`$ICU_CONFIG --cflags`
      AC_MSG_RESULT($ICU_CFLAGS)

      AC_MSG_CHECKING(ICU_CXXFLAGS)
      ICU_CXXFLAGS=`$ICU_CONFIG --cxxflags`
      AC_MSG_RESULT($ICU_CXXFLAGS)

      AC_MSG_CHECKING(ICU_LIBS)
      ICU_LIBS=`$ICU_CONFIG --ldflags`
      AC_MSG_RESULT($ICU_LIBS)
    else
      ICU_CFLAGS=""
      ICU_CXXFLAGS=""
      ICU_LIBS=""
      ## If we have a custom action on failure, don't print errors, but
      ## do set a variable so people can do so.
      ifelse([$3], ,echo "can't find ICU >= $1",)
    fi

    AC_SUBST(ICU_CFLAGS)
    AC_SUBST(ICU_CXXFLAGS)
    AC_SUBST(ICU_LIBS)
  fi

  if test $succeeded = yes; then
    ifelse([$2], , :, [$2])
  else
    ifelse([$3], , AC_MSG_ERROR([Library requirements (ICU) not met.]), [$3])
  fi
])
Index: trunk/php/luasandbox/config.m4 |
— | — | @@ -9,6 +9,10 @@ |
10 | 10 | if test "$PHP_LUASANDBOX" != "no"; then |
11 | 11 | dnl Include pkg-config macros definitions: |
12 | 12 | m4_include([m4/pkg.m4]) |
| 13 | + |
 | 14 | + dnl ICU did not support pkg-config until recently; the current WM version
 | 15 | + dnl probably does not support it either
| 16 | + m4_include([m4/ac_check_icu.m4]) |
13 | 17 | PKG_PROG_PKG_CONFIG |
14 | 18 | |
15 | 19 | dnl We need lua or fallback to luajit. |
— | — | @@ -19,12 +23,17 @@ |
20 | 24 | ]) |
21 | 25 | ]) |
22 | 26 | |
| 27 | + AC_CHECK_ICU( [4.0] ) |
| 28 | + |
23 | 29 | dnl LUA_LIBS and LUA_CFLAGS interprets them: |
24 | 30 | PHP_EVAL_INCLINE($LUA_CFLAGS) |
25 | 31 | PHP_EVAL_LIBLINE($LUA_LIBS, LUASANDBOX_SHARED_LIBADD) |
26 | | - |
| 32 | + |
| 33 | + PHP_EVAL_INCLINE($ICU_CFLAGS) |
| 34 | + PHP_EVAL_LIBLINE($ICU_LIBS, LUASANDBOX_SHARED_LIBADD) |
| 35 | + |
27 | 36 | PHP_EVAL_LIBLINE("-lrt", LUASANDBOX_SHARED_LIBADD) |
28 | 37 | |
29 | 38 | PHP_SUBST(LUASANDBOX_SHARED_LIBADD) |
30 | | - PHP_NEW_EXTENSION(luasandbox, alloc.c data_conversion.c library.c luasandbox.c timer.c, $ext_shared) |
| 39 | + PHP_NEW_EXTENSION(luasandbox, alloc.c data_conversion.c library.c luasandbox.c timer.c ustring.c, $ext_shared) |
31 | 40 | fi |
Index: trunk/php/luasandbox/library.c |
— | — | @@ -14,6 +14,7 @@ |
15 | 15 | |
16 | 16 | #include "php.h" |
17 | 17 | #include "php_luasandbox.h" |
| 18 | +#include "luasandbox_unicode.h" |
18 | 19 | |
19 | 20 | static HashTable * luasandbox_lib_get_allowed_globals(TSRMLS_D); |
20 | 21 | |
— | — | @@ -128,6 +129,9 @@ |
129 | 130 | lua_pushcfunction(L, luasandbox_math_randomseed); |
130 | 131 | lua_setfield(L, -2, "randomseed"); |
131 | 132 | lua_pop(L, 1); |
| 133 | + |
| 134 | + // Install string-related functions |
| 135 | + luasandbox_install_unicode_functions(L); |
132 | 136 | } |
133 | 137 | /* }}} */ |
134 | 138 | |
Index: trunk/php/luasandbox/luasandbox_unicode.h |
— | — | @@ -0,0 +1,30 @@ |
| 2 | +#ifndef LUASANDBOX_UNICODE_H |
| 3 | +#define LUASANDBOX_UNICODE_H |
| 4 | + |
| 5 | +#include <stdint.h> |
| 6 | +#include <lua.h> |
| 7 | + |
/**
 * Header describing a Unicode string object.
 * Unicode strings are input and stored as UTF-8; the raw bytes are addressed
 * directly behind this header (see LUASANDBOX_USTR_RAW).
 */
typedef struct {
	size_t raw_len;  // Byte length in UTF-8
	int32_t cp_len;  // Number of code points
} luasandbox_ustr_header;

/* Address of the raw UTF-8 bytes: sizeof(luasandbox_ustr_header) bytes past
 * the header. Uses byte-pointer arithmetic (arithmetic on void* is a GNU
 * extension) and parenthesizes the argument so any pointer expression works. */
#define LUASANDBOX_USTR_RAW(header) ( (uint8_t*)(header) + sizeof(luasandbox_ustr_header) )
/* Combined size in bytes of the header and the raw UTF-8 data. */
#define LUASANDBOX_USTR_TOTALLEN(header) ( sizeof(luasandbox_ustr_header) + (header)->raw_len )
| 18 | + |
| 19 | +void luasandbox_install_unicode_functions(lua_State * L); |
| 20 | + |
| 21 | +luasandbox_ustr_header *luasandbox_init_ustr(lua_State * L, size_t len); |
| 22 | +luasandbox_ustr_header *luasandbox_push_ustr(lua_State * L, uint8_t *str, size_t len); |
| 23 | +int luasandbox_isustr(lua_State * L, int idx); |
| 24 | +luasandbox_ustr_header* luasandbox_checkustring(lua_State * L, int idx); |
| 25 | +const uint8_t* luasandbox_getustr(lua_State * L, int idx, size_t* raw_len); |
| 26 | +int32_t luasandbox_ustr_index_to_offset(lua_State * L, luasandbox_ustr_header *str, int32_t idx, int check_limits); |
| 27 | + |
| 28 | +void luasandbox_convert_toUTF16(lua_State * L, int idx); |
| 29 | +void luasandbox_convert_fromUTF16(lua_State * L, int idx); |
| 30 | + |
| 31 | +#endif |
Property changes on: trunk/php/luasandbox/luasandbox_unicode.h |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 32 | + native |
Added: svn:keywords |
2 | 33 | + Author Date Id Rev URL |
Index: trunk/php/luasandbox/data_conversion.c |
— | — | @@ -10,6 +10,7 @@ |
11 | 11 | |
12 | 12 | #include "php.h" |
13 | 13 | #include "php_luasandbox.h" |
| 14 | +#include "luasandbox_unicode.h" |
14 | 15 | |
15 | 16 | static void luasandbox_lua_to_array(HashTable *ht, lua_State *L, int index, |
16 | 17 | zval * sandbox_zval, HashTable * recursionGuard TSRMLS_DC); |
— | — | @@ -305,6 +306,13 @@ |
306 | 307 | break; |
307 | 308 | } |
308 | 309 | case LUA_TUSERDATA: |
| 310 | + if(luasandbox_isustr(L, index)) { |
| 311 | + const uint8_t *str; |
| 312 | + size_t length; |
| 313 | + str = luasandbox_getustr(L, index, &length); |
| 314 | + ZVAL_STRINGL(z, str, length, 1); |
| 315 | + break; |
| 316 | + } |
309 | 317 | case LUA_TTHREAD: |
310 | 318 | case LUA_TLIGHTUSERDATA: |
311 | 319 | default: |