From b60993014298a4943b7d9660375814393a75519e Mon Sep 17 00:00:00 2001 From: Nuno Cruces Date: Wed, 7 May 2025 12:30:19 +0100 Subject: [PATCH] Refactor #274. --- sqlite3/libc/libc.wasm | Bin 3754 -> 3761 bytes sqlite3/libc/libc.wat | 533 +++++++++++++++++++++-------------------- sqlite3/libc/string.h | 174 +++++++------- 3 files changed, 353 insertions(+), 354 deletions(-) diff --git a/sqlite3/libc/libc.wasm b/sqlite3/libc/libc.wasm index 9352b8eee6c1ccc40a36fee8779059a77875e6c0..e34936036794e6898620413504bd48177ce76ae2 100755 GIT binary patch delta 653 zcmZ`$u}<7T5Z&3?vkwgNNfAQgLNeP#K?RQZdWi7)Kl)5CAMYdB j3=m|I4kwO=yTMuw>}*XzKjA=NkC6z~0fwfZ=eNyYn>}~_ delta 685 zcmY+BJ&P1U5QeL&d$#x7U3Ws|T&x+813wT9O>SDa!$TI4!&LDjkXaCRT{gL3H8bzT zRLm~%WHS>ZbAN(A!B}YV?e<2!WTvL3s^0gh`ZPJ7oLpsozq!UMnS5lCH6KZ{OTJwC za#oAd!HASq?u9Sr=CCG2BTk8F_-S^+6?^N+N!RvYqsfh?L8VCz;>&~77l}wIJ?kPq zh=(K5B0k%*ji>P-Rn00<_GMeK!bA%}Gu?c)@A%RiD|THR?ZVCtw99Ezb{byxF0aLp z=e6(6pUo?QusM6y)%P?iG+Q=Cl6 zhz3e9NouFbcEzTNLlzr`?mQ-Q8@Ao!{T4X*NC@0wk9LYir2P7o3I>_Re}ftAsayvQ zdAhruopm}%PQq7LUBE80mP#DMQC3W|!!LL74xbe=XM{`^>Z~VLfMc#v+0S-(@YaT> z{bPeIhT+z7XNH|n&G$2zoRalRZC-;gzo9H`_^|b`?Z@XY5NA+a4wPWUx&DDdivEE1 kfXWF}{(|B>_}!UH$HPVz$zwj6o3A|2_wu?okIo(1-!g4^oB#j- diff --git a/sqlite3/libc/libc.wat b/sqlite3/libc/libc.wat index 0f86ea1..2c62f2a 100644 --- a/sqlite3/libc/libc.wat +++ b/sqlite3/libc/libc.wat @@ -550,16 +550,17 @@ (memory.size) (i32.const 16) ) - (i32.const 16) + (select + (local.get $0) + (local.get $1) + (i32.gt_u + (local.get $0) + (local.get $1) + ) + ) ) ) - (local.get $0) - ) - ) - (br_if $block - (i32.gt_u - (local.get $1) - (local.get $2) + (i32.const 16) ) ) (loop $label @@ -595,21 +596,21 @@ (i32.const 16) ) ) - (br_if $block - (i32.gt_u - (local.tee $0 - (i32.add - (local.get $0) - (i32.const 16) - ) - ) - (local.get $2) + (local.set $0 + (i32.add + (local.get $0) + (i32.const 16) ) ) (br_if $label - (i32.le_u - (local.get $1) - (local.get $2) + (i32.gt_u + (local.tee $2 + (i32.sub + (local.get $2) + (i32.const 16) + ) + ) + (i32.const 15) ) ) ) @@ -643,7 +644,10 @@ (loop $label1 (if (i32.eqz - (local.get $2) + (i32.and + (local.get $2) + (i32.const 255) + ) ) (then (return @@ -691,87 +695,91 @@ (local $3 i32) (local $4 i32) (local $5 v128) - (block $block1 - (block $block - (br_if $block - (i32.lt_u - (local.tee $3 - (i32.sub - (i32.shl - (memory.size) - (i32.const 16) + (block $block + (if + (i32.ge_u + (local.tee $2 + (select + (local.tee $3 + (i32.sub + (i32.shl + (memory.size) + (i32.const 16) + ) + (select + (local.get $0) + (local.get $1) + (i32.gt_u + (local.get $0) + (local.get $1) + ) + ) ) - (i32.const 16) + ) + (local.get $2) + (i32.gt_u + (local.get $2) + (local.get $3) ) ) - (local.get $0) ) + (i32.const 16) ) - (loop $label - (br_if $block - (i32.gt_u - (local.get $1) - (local.get $3) - ) - ) - (br_if $block - (i32.lt_u - (local.get $2) - (i32.const 16) - ) - ) - (br_if $block1 - (v128.any_true - (v128.xor - (v128.load align=1 - (local.get $1) - ) - (local.tee $5 + (then + (loop $label + (br_if $block + (v128.any_true + (v128.xor (v128.load align=1 - (local.get $0) + (local.get $1) + ) + (local.tee $5 + (v128.load align=1 + (local.get $0) + ) ) ) ) ) - ) - (if - (i32.eqz - (i8x16.all_true - (local.get $5) - ) - ) - (then - (return - (i32.const 0) - ) - ) - ) - (local.set $2 - (i32.sub - (local.get $2) - (i32.const 16) - ) - ) - (local.set $1 - (i32.add - (local.get $1) - (i32.const 16) - ) - ) - (br_if $label - (i32.le_u - (local.tee $0 - (i32.add - (local.get $0) - (i32.const 16) + (if + (i32.eqz + (i8x16.all_true 
+ (local.get $5) ) ) - (local.get $3) + (then + (return + (i32.const 0) + ) + ) + ) + (local.set $1 + (i32.add + (local.get $1) + (i32.const 16) + ) + ) + (local.set $0 + (i32.add + (local.get $0) + (i32.const 16) + ) + ) + (br_if $label + (i32.gt_u + (local.tee $2 + (i32.sub + (local.get $2) + (i32.const 16) + ) + ) + (i32.const 15) + ) ) ) ) ) - (br_if $block1 + (br_if $block (local.get $2) ) (return @@ -1077,7 +1085,7 @@ ) (if (i32.eqz - (local.tee $2 + (local.tee $3 (i32.load8_u (local.get $1) ) @@ -1093,7 +1101,7 @@ ) (i32.const 16) ) - (i32.const 16) + (local.get $0) ) ) (block $block @@ -1116,9 +1124,9 @@ (i32.store8 (local.tee $5 (i32.or - (local.tee $3 + (local.tee $2 (i32.and - (local.get $2) + (local.get $3) (i32.const 15) ) ) @@ -1131,10 +1139,10 @@ ) (i32.shl (i32.const 1) - (local.tee $2 + (local.tee $3 (i32.and (i32.shr_u - (local.get $2) + (local.get $3) (i32.const 4) ) (i32.const 15) @@ -1148,26 +1156,26 @@ (local.get $7) ) (i32.store8 - (local.tee $3 + (local.tee $2 (i32.or - (local.get $3) + (local.get $2) (i32.const 1008) ) ) (i32.or (i32.load8_u - (local.get $3) + (local.get $2) ) (i32.shl (i32.const 1) (i32.sub - (local.get $2) + (local.get $3) (i32.const 8) ) ) ) ) - (local.set $2 + (local.set $3 (i32.load8_u (local.get $1) ) @@ -1189,16 +1197,16 @@ ) ) (br_if $label - (local.get $2) + (local.get $3) ) ) - (local.set $2 + (local.set $3 (local.get $0) ) (if - (i32.le_u - (local.get $0) + (i32.ge_u (local.get $4) + (i32.const 16) ) (then (local.set $1 @@ -1217,10 +1225,7 @@ (i8x16.shr_u (local.tee $8 (v128.load align=1 - (i32.add - (local.get $0) - (local.get $1) - ) + (local.get $3) ) ) (i32.const 4) @@ -1254,7 +1259,7 @@ ) (then (return - (i32.add + (i32.sub (i32.ctz (i32.xor (i8x16.bitmask @@ -1268,20 +1273,24 @@ ) ) ) + (local.set $3 + (i32.add + (local.get $3) + (i32.const 16) + ) + ) (br_if $label1 - (i32.le_u - (local.tee $2 - (i32.add - (local.get $0) - (local.tee $1 - (i32.add - (local.get $1) - (i32.const 16) - ) + (i32.gt_u + (i32.add + (local.get $4) + (local.tee $1 + (i32.sub + (local.get $1) + (i32.const 16) ) ) ) - (local.get $4) + (i32.const 15) ) ) ) @@ -1293,7 +1302,7 @@ (local.get $0) (i32.const -1) ) - (local.get $2) + (local.get $3) ) ) (loop $label2 @@ -1305,7 +1314,7 @@ (i32.lt_s (local.tee $0 (i32.load8_s - (local.get $2) + (local.get $3) ) ) (i32.const 0) @@ -1318,9 +1327,9 @@ (i32.const 1) ) ) - (local.set $2 + (local.set $3 (i32.add - (local.get $2) + (local.get $3) (i32.const 1) ) ) @@ -1351,21 +1360,21 @@ (br $block) ) ) - (local.set $3 + (local.set $2 (local.get $0) ) (if - (i32.le_u - (local.get $0) + (i32.ge_u (local.get $4) + (i32.const 16) ) (then (local.set $6 (i8x16.splat - (local.get $2) + (local.get $3) ) ) - (local.set $1 + (local.set $2 (i32.const 0) ) (loop $label3 @@ -1377,7 +1386,7 @@ (v128.load align=1 (i32.add (local.get $0) - (local.get $1) + (local.get $2) ) ) (local.get $6) @@ -1396,28 +1405,35 @@ (i32.const -1) ) ) - (local.get $1) + (local.get $2) ) ) ) ) + (local.set $2 + (i32.add + (local.get $2) + (i32.const 16) + ) + ) (br_if $label3 - (i32.le_u - (local.tee $3 - (i32.add - (local.get $0) - (local.tee $1 - (i32.add - (local.get $1) - (i32.const 16) - ) - ) + (i32.gt_u + (local.tee $4 + (i32.sub + (local.get $4) + (i32.const 16) ) ) - (local.get $4) + (i32.const 15) ) ) ) + (local.set $2 + (i32.add + (local.get $0) + (local.get $2) + ) + ) ) ) (local.set $1 @@ -1426,7 +1442,7 @@ (local.get $0) (i32.const -1) ) - (local.get $3) + (local.get $2) ) ) (loop $label4 @@ -1441,18 +1457,18 @@ (block (result i32) (local.set 
$scratch_12 (i32.load8_u - (local.get $3) + (local.get $2) ) ) - (local.set $3 + (local.set $2 (i32.add - (local.get $3) + (local.get $2) (i32.const 1) ) ) (local.get $scratch_12) ) - (local.get $2) + (local.get $3) ) ) ) @@ -1584,109 +1600,106 @@ ) (if (i32.ge_u - (local.tee $3 + (local.tee $2 (i32.sub - (block (result i32) - (local.set $scratch - (i32.shl + (i32.shl + (block (result i32) + (local.set $scratch (memory.size) - (i32.const 16) ) - ) - (loop $label1 - (v128.store - (i32.const 1008) - (local.get $6) - ) - (i32.store8 - (i32.or - (local.tee $3 - (i32.and - (local.tee $2 - (i32.load8_u - (local.get $1) + (loop $label1 + (v128.store + (i32.const 1008) + (local.get $6) + ) + (i32.store8 + (i32.or + (local.tee $3 + (i32.and + (local.tee $2 + (i32.load8_u + (local.get $1) + ) ) + (i32.const 15) + ) + ) + (i32.const 1008) + ) + (i32.or + (i32.load8_u + (i32.or + (local.get $3) + (i32.const 1008) + ) + ) + (i32.shl + (i32.const 1) + (i32.sub + (local.tee $5 + (i32.shr_u + (local.get $2) + (i32.const 4) + ) + ) + (i32.const 8) ) - (i32.const 15) ) ) - (i32.const 1008) ) - (i32.or - (i32.load8_u + (v128.store + (i32.const 992) + (local.get $7) + ) + (i32.store8 + (local.tee $3 (i32.or (local.get $3) - (i32.const 1008) + (i32.const 992) ) ) - (i32.shl - (i32.const 1) - (i32.sub - (local.tee $5 - (i32.shr_u - (local.get $2) - (i32.const 4) - ) - ) - (i32.const 8) + (i32.or + (i32.load8_u + (local.get $3) + ) + (i32.shl + (i32.const 1) + (local.get $5) ) ) ) - ) - (v128.store - (i32.const 992) - (local.get $7) - ) - (i32.store8 - (local.tee $3 - (i32.or - (local.get $3) + (local.set $1 + (i32.add + (local.get $1) + (i32.const 1) + ) + ) + (local.set $6 + (v128.load + (i32.const 1008) + ) + ) + (local.set $7 + (v128.load (i32.const 992) ) ) - (i32.or - (i32.load8_u - (local.get $3) - ) - (i32.shl - (i32.const 1) - (local.get $5) - ) + (br_if $label1 + (local.get $2) ) ) - (local.set $1 - (i32.add - (local.get $1) - (i32.const 1) - ) - ) - (local.set $6 - (v128.load - (i32.const 1008) - ) - ) - (local.set $7 - (v128.load - (i32.const 992) - ) - ) - (br_if $label1 - (local.get $2) - ) + (local.get $scratch) ) - (local.get $scratch) + (i32.const 16) + ) + (local.tee $1 + (local.get $0) ) - (i32.const 16) ) ) - (local.tee $1 - (local.get $0) - ) + (i32.const 16) ) (then - (local.set $2 - (i32.const 0) - ) (loop $label2 (if (v128.any_true @@ -1699,10 +1712,7 @@ (i8x16.shr_u (local.tee $8 (v128.load align=1 - (i32.add - (local.get $0) - (local.get $2) - ) + (local.get $1) ) ) (i32.const 4) @@ -1735,43 +1745,44 @@ ) (then (return - (i32.add - (i32.ctz - (i8x16.bitmask - (local.get $8) + (i32.sub + (i32.add + (local.get $1) + (i32.ctz + (i8x16.bitmask + (local.get $8) + ) ) ) - (local.get $2) + (local.get $0) ) ) ) ) + (local.set $1 + (i32.add + (local.get $1) + (i32.const 16) + ) + ) (br_if $label2 - (i32.le_u - (local.tee $1 - (i32.add - (local.get $0) - (local.tee $2 - (i32.add - (local.get $2) - (i32.const 16) - ) - ) + (i32.gt_u + (local.tee $2 + (i32.sub + (local.get $2) + (i32.const 16) ) ) - (local.get $3) + (i32.const 15) ) ) ) ) ) - (local.set $0 - (i32.add - (i32.xor - (local.get $0) - (i32.const -1) - ) + (local.set $2 + (i32.sub (local.get $1) + (i32.const 1) ) ) (loop $label3 @@ -1781,27 +1792,20 @@ (local.get $6) (local.get $7) (i32.lt_s - (local.tee $2 + (local.tee $1 (i32.load8_s - (local.get $1) + (local.tee $2 + (i32.add + (local.get $2) + (i32.const 1) + ) + ) ) ) (i32.const 0) ) ) ) - (local.set $0 - (i32.add - (local.get $0) - (i32.const 1) - ) - ) - (local.set $1 - (i32.add - 
(local.get $1) - (i32.const 1) - ) - ) (br_if $label3 (i32.eqz (i32.and @@ -1809,7 +1813,7 @@ (i32.load8_u (i32.or (i32.and - (local.get $2) + (local.get $1) (i32.const 15) ) (i32.const 976) @@ -1817,7 +1821,7 @@ ) (i32.and (i32.shr_u - (local.get $2) + (local.get $1) (i32.const 4) ) (i32.const 7) @@ -1828,7 +1832,10 @@ ) ) ) - (local.get $0) + (i32.sub + (local.get $2) + (local.get $0) + ) ) (func $memccpy (param $0 i32) (param $1 i32) (param $2 i32) (param $3 i32) (result i32) (memory.copy diff --git a/sqlite3/libc/string.h b/sqlite3/libc/string.h index 3de90f7..71b97c3 100644 --- a/sqlite3/libc/string.h +++ b/sqlite3/libc/string.h @@ -176,17 +176,15 @@ size_t strlen(const char *s) { } static int __strcmp(const char *s1, const char *s2) { - // Set limit to the largest possible valid v128_t pointer. - // Unsigned modular arithmetic gives the correct result - // unless memory size is zero, in which case all pointers are invalid. - const v128_t *const limit = - (v128_t *)(__builtin_wasm_memory_size(0) * PAGESIZE) - 1; + // How many bytes can be read before pointers go out of bounds. + size_t N = __builtin_wasm_memory_size(0) * PAGESIZE - // + (size_t)(s1 > s2 ? s1 : s2); // Unaligned loads handle the case where the strings // have mismatching alignments. const v128_t *w1 = (v128_t *)s1; const v128_t *w2 = (v128_t *)s2; - while (w1 <= limit && w2 <= limit) { + for (; N >= sizeof(v128_t); N -= sizeof(v128_t)) { // Find any single bit difference. if (wasm_v128_any_true(wasm_v128_load(w1) ^ wasm_v128_load(w2))) { // The strings may still be equal, @@ -237,17 +235,16 @@ int strcmp(const char *s1, const char *s2) { __attribute__((weak)) int strncmp(const char *s1, const char *s2, size_t n) { - // Set limit to the largest possible valid v128_t pointer. - // Unsigned modular arithmetic gives the correct result - // unless memory size is zero, in which case all pointers are invalid. - const v128_t *const limit = - (v128_t *)(__builtin_wasm_memory_size(0) * PAGESIZE) - 1; + // How many bytes can be read before pointers go out of bounds. + size_t N = __builtin_wasm_memory_size(0) * PAGESIZE - // + (size_t)(s1 > s2 ? s1 : s2); + if (n > N) n = N; // Unaligned loads handle the case where the strings // have mismatching alignments. const v128_t *w1 = (v128_t *)s1; const v128_t *w2 = (v128_t *)s2; - for (; w1 <= limit && w2 <= limit && n >= sizeof(v128_t); n -= sizeof(v128_t)) { + for (; n >= sizeof(v128_t); n -= sizeof(v128_t)) { // Find any single bit difference. if (wasm_v128_any_true(wasm_v128_load(w1) ^ wasm_v128_load(w2))) { // The strings may still be equal, @@ -332,21 +329,62 @@ char *strrchr(const char *s, int c) { return (char *)memrchr(s, c, strlen(s) + 1); } +// http://0x80.pl/notesen/2018-10-18-simd-byte-lookup.html + +#define _WASM_SIMD128_BITMAP256_T \ + struct { \ + uint8_t l __attribute__((__vector_size__(16), __aligned__(16))); \ + uint8_t h __attribute__((__vector_size__(16), __aligned__(16))); \ + } + +#define _WASM_SIMD128_SETBIT(bitmap, i) \ + ({ \ + uint8_t _c = (uint8_t)(i); \ + uint8_t _hi_nibble = _c >> 4; \ + uint8_t _lo_nibble = _c & 0xf; \ + bitmap.l[_lo_nibble] |= 1 << (_hi_nibble - 0); \ + bitmap.h[_lo_nibble] |= 1 << (_hi_nibble - 8); \ + }) + +#define _WASM_SIMD128_CHKBIT(bitmap, i) \ + ({ \ + uint8_t _c = (uint8_t)(i); \ + uint8_t _hi_nibble = _c >> 4; \ + uint8_t _lo_nibble = _c & 0xf; \ + uint8_t _bitmask = 1 << (_hi_nibble & 0x7); \ + uint8_t _bitset = (_hi_nibble < 8 ? 
bitmap.l : bitmap.h)[_lo_nibble]; \ + _bitmask & _bitset; \ + }) + +#define _WASM_SIMD128_CHKBITS(bitmap, v) \ + ({ \ + v128_t _w = v; \ + v128_t _hi_nibbles = wasm_u8x16_shr(_w, 4); \ + v128_t _lo_nibbles = _w & wasm_u8x16_const_splat(0xf); \ + \ + v128_t _bitmask_lookup = wasm_u8x16_const(1, 2, 4, 8, 16, 32, 64, 128, \ + 1, 2, 4, 8, 16, 32, 64, 128); \ + \ + v128_t _bitmask = wasm_i8x16_swizzle(_bitmask_lookup, _hi_nibbles); \ + v128_t _bitsets = wasm_v128_bitselect( \ + wasm_i8x16_swizzle(bitmap.l, _lo_nibbles), \ + wasm_i8x16_swizzle(bitmap.h, _lo_nibbles), \ + wasm_i8x16_lt(_hi_nibbles, wasm_u8x16_const_splat(8))); \ + \ + wasm_i8x16_eq(_bitsets & _bitmask, _bitmask); \ + }) + __attribute__((weak)) size_t strspn(const char *s, const char *c) { - // Set limit to the largest possible valid v128_t pointer. - // Unsigned modular arithmetic gives the correct result - // unless memory size is zero, in which case all pointers are invalid. - const v128_t *const limit = - (v128_t *)(__builtin_wasm_memory_size(0) * PAGESIZE) - 1; - + // How many bytes can be read before the pointer goes out of bounds. + size_t N = __builtin_wasm_memory_size(0) * PAGESIZE - (size_t)s; const v128_t *w = (v128_t *)s; const char *const a = s; if (!c[0]) return 0; if (!c[1]) { const v128_t wc = wasm_i8x16_splat(*c); - while (w <= limit) { + for (; N >= sizeof(v128_t); N -= sizeof(v128_t)) { const v128_t cmp = wasm_i8x16_eq(wasm_v128_load(w), wc); // Bitmask is slow on AArch64, all_true is much faster. if (!wasm_i8x16_all_true(cmp)) { @@ -361,111 +399,65 @@ size_t strspn(const char *s, const char *c) { return s - a; } - // http://0x80.pl/notesen/2018-10-18-simd-byte-lookup.html - typedef unsigned char u8x16 - __attribute__((__vector_size__(16), __aligned__(16))); + _WASM_SIMD128_BITMAP256_T bitmap = {}; - u8x16 bitmap07 = {}; - u8x16 bitmap8f = {}; for (; *c; c++) { - unsigned lo_nibble = *(unsigned char *)c % 16; - unsigned hi_nibble = *(unsigned char *)c / 16; - bitmap07[lo_nibble] |= 1 << (hi_nibble - 0); - bitmap8f[lo_nibble] |= 1 << (hi_nibble - 8); + _WASM_SIMD128_SETBIT(bitmap, *c); // Terminator IS NOT on the bitmap. } - for (; w <= limit; w++) { - const v128_t lo_nibbles = wasm_v128_load(w) & wasm_u8x16_const_splat(0xf); - const v128_t hi_nibbles = wasm_u8x16_shr(wasm_v128_load(w), 4); - - const v128_t bitmask_lookup = - wasm_u8x16_const(1, 2, 4, 8, 16, 32, 64, 128, // - 1, 2, 4, 8, 16, 32, 64, 128); - - const v128_t bitmask = wasm_i8x16_swizzle(bitmask_lookup, hi_nibbles); - const v128_t bitsets = wasm_v128_bitselect( - wasm_i8x16_swizzle(bitmap07, lo_nibbles), - wasm_i8x16_swizzle(bitmap8f, lo_nibbles), - wasm_i8x16_lt(hi_nibbles, wasm_u8x16_const_splat(8))); - - const v128_t cmp = wasm_i8x16_eq(bitsets & bitmask, bitmask); + for (; N >= sizeof(v128_t); N -= sizeof(v128_t)) { + const v128_t cmp = _WASM_SIMD128_CHKBITS(bitmap, wasm_v128_load(w)); + // Bitmask is slow on AArch64, all_true is much faster. if (!wasm_i8x16_all_true(cmp)) { size_t ctz = __builtin_ctz(~wasm_i8x16_bitmask(cmp)); return (char *)w + ctz - s; } + w++; } // Baseline algorithm. - for (s = (char *)w;; s++) { - const unsigned lo_nibble = *(unsigned char *)s & 0xf; - const unsigned hi_nibble = *(unsigned char *)s >> 4; - const unsigned bitmask = 1 << (hi_nibble & 0x7); - const unsigned bitset = - hi_nibble < 8 ? 
bitmap07[lo_nibble] : bitmap8f[lo_nibble]; - if ((bitset & bitmask) == 0) return s - a; - } + for (s = (char *)w; _WASM_SIMD128_CHKBIT(bitmap, *s); s++); + return s - a; } __attribute__((weak)) size_t strcspn(const char *s, const char *c) { if (!c[0] || !c[1]) return __strchrnul(s, *c) - s; - // Set limit to the largest possible valid v128_t pointer. - // Unsigned modular arithmetic gives the correct result - // unless memory size is zero, in which case all pointers are invalid. - const v128_t *const limit = - (v128_t *)(__builtin_wasm_memory_size(0) * PAGESIZE) - 1; - + // How many bytes can be read before the pointer goes out of bounds. + size_t N = __builtin_wasm_memory_size(0) * PAGESIZE - (size_t)s; const v128_t *w = (v128_t *)s; const char *const a = s; - // http://0x80.pl/notesen/2018-10-18-simd-byte-lookup.html - typedef unsigned char u8x16 - __attribute__((__vector_size__(16), __aligned__(16))); + _WASM_SIMD128_BITMAP256_T bitmap = {}; - u8x16 bitmap07 = {}; - u8x16 bitmap8f = {}; - for (;; c++) { - unsigned lo_nibble = *(unsigned char *)c % 16; - unsigned hi_nibble = *(unsigned char *)c / 16; - bitmap07[lo_nibble] |= 1 << (hi_nibble - 0); - bitmap8f[lo_nibble] |= 1 << (hi_nibble - 8); - if (!*c) break; // Terminator IS on the bitmap. + for (;;) { + _WASM_SIMD128_SETBIT(bitmap, *c); + // Terminator IS on the bitmap. + if (!*c++) break; } - for (; w <= limit; w++) { - const v128_t lo_nibbles = wasm_v128_load(w) & wasm_u8x16_const_splat(0xf); - const v128_t hi_nibbles = wasm_u8x16_shr(wasm_v128_load(w), 4); - - const v128_t bitmask_lookup = - wasm_u8x16_const(1, 2, 4, 8, 16, 32, 64, 128, // - 1, 2, 4, 8, 16, 32, 64, 128); - - const v128_t bitmask = wasm_i8x16_swizzle(bitmask_lookup, hi_nibbles); - const v128_t bitsets = wasm_v128_bitselect( - wasm_i8x16_swizzle(bitmap07, lo_nibbles), - wasm_i8x16_swizzle(bitmap8f, lo_nibbles), - wasm_i8x16_lt(hi_nibbles, wasm_u8x16_const_splat(8))); - - const v128_t cmp = wasm_i8x16_eq(bitsets & bitmask, bitmask); + for (; N >= sizeof(v128_t); N -= sizeof(v128_t)) { + const v128_t cmp = _WASM_SIMD128_CHKBITS(bitmap, wasm_v128_load(w)); + // Bitmask is slow on AArch64, any_true is much faster. if (wasm_v128_any_true(cmp)) { size_t ctz = __builtin_ctz(wasm_i8x16_bitmask(cmp)); return (char *)w + ctz - s; } + w++; } // Baseline algorithm. - for (s = (char *)w;; s++) { - const unsigned lo_nibble = *(unsigned char *)s & 0xf; - const unsigned hi_nibble = *(unsigned char *)s >> 4; - const unsigned bitmask = 1 << (hi_nibble & 0x7); - const unsigned bitset = - hi_nibble < 8 ? bitmap07[lo_nibble] : bitmap8f[lo_nibble]; - if (bitset & bitmask) return s - a; - } + for (s = (char *)w; !_WASM_SIMD128_CHKBIT(bitmap, *s); s++); + return s - a; } +#undef _WASM_SIMD128_SETBIT +#undef _WASM_SIMD128_CHKBIT +#undef _WASM_SIMD128_CHKBITS +#undef _WASM_SIMD128_BITMAP256_T + // Given the above SIMD implementations, // these are best implemented as // small wrappers over those functions.
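
Illustrative sketch, not part of the patch: the _WASM_SIMD128_SETBIT / _WASM_SIMD128_CHKBIT macros introduced above encapsulate the 256-entry byte-lookup bitmap from the linked 0x80.pl article. The scalar model below uses hypothetical names (bitmap256, bitmap256_set, bitmap256_chk, scalar_strspn) purely to show the idea: a byte is split into its low nibble (which 8-bit slot) and high nibble (which bit within that slot), spread across two 16-byte halves.

#include <stddef.h>
#include <stdint.h>

typedef struct {
  uint8_t l[16]; /* bits for high nibbles 0..7  */
  uint8_t h[16]; /* bits for high nibbles 8..15 */
} bitmap256;

static void bitmap256_set(bitmap256 *b, uint8_t c) {
  uint8_t hi = c >> 4, lo = c & 0xf;
  if (hi < 8)
    b->l[lo] |= (uint8_t)(1u << hi);
  else
    b->h[lo] |= (uint8_t)(1u << (hi - 8));
}

static int bitmap256_chk(const bitmap256 *b, uint8_t c) {
  uint8_t hi = c >> 4, lo = c & 0xf;
  uint8_t bits = hi < 8 ? b->l[lo] : b->h[lo];
  return bits & (1u << (hi & 0x7));
}

/* Scalar strspn over the bitmap; the terminator is never set, so the scan
 * stops there. The SIMD path in the patch performs the same membership
 * test 16 bytes at a time. */
static size_t scalar_strspn(const char *s, const char *accept) {
  bitmap256 b = {0};
  for (; *accept; accept++) bitmap256_set(&b, (uint8_t)*accept);
  size_t i = 0;
  while (bitmap256_chk(&b, (uint8_t)s[i])) i++;
  return i;
}

Keying both 16-byte halves by the low nibble is what lets the vectorized _WASM_SIMD128_CHKBITS test 16 input bytes per iteration: two i8x16.swizzle lookups select the candidate bit sets, a bitselect on the high nibble picks the correct half, and a compare against the swizzled bit masks yields one result byte per lane.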