diff --git a/sqlite3/libc/libc.wasm b/sqlite3/libc/libc.wasm index f8b8f94..42dbc39 100755 Binary files a/sqlite3/libc/libc.wasm and b/sqlite3/libc/libc.wasm differ diff --git a/sqlite3/libc/libc.wat b/sqlite3/libc/libc.wat index 9302590..b11ea9f 100644 --- a/sqlite3/libc/libc.wat +++ b/sqlite3/libc/libc.wat @@ -36,103 +36,130 @@ (local.get $0) ) (func $memcmp (param $0 i32) (param $1 i32) (param $2 i32) (result i32) - (local $3 i32) + (local $3 v128) (local $4 i32) - (block $block1 - (block $block + (local $5 i32) + (block $block + (br_if $block + (i32.lt_u + (local.get $2) + (i32.const 16) + ) + ) + (loop $label (if - (i32.ge_u - (local.get $2) - (i32.const 16) - ) - (then - (loop $label - (br_if $block - (v128.any_true - (v128.xor - (v128.load align=1 - (local.get $1) - ) - (v128.load align=1 - (local.get $0) - ) - ) - ) - ) - (local.set $1 - (i32.add - (local.get $1) - (i32.const 16) - ) - ) - (local.set $0 - (i32.add + (i8x16.all_true + (local.tee $3 + (i8x16.eq + (v128.load align=1 (local.get $0) - (i32.const 16) ) - ) - (br_if $label - (i32.gt_u - (local.tee $2 - (i32.sub - (local.get $2) - (i32.const 16) - ) - ) - (i32.const 15) + (v128.load align=1 + (local.get $1) ) ) ) ) - ) - (br_if $block1 - (i32.eqz - (local.get $2) + (then + (local.set $1 + (i32.add + (local.get $1) + (i32.const 16) + ) + ) + (local.set $0 + (i32.add + (local.get $0) + (i32.const 16) + ) + ) + (br_if $label + (i32.gt_u + (local.tee $2 + (i32.sub + (local.get $2) + (i32.const 16) + ) + ) + (i32.const 15) + ) + ) + (br $block) ) ) ) - (loop $label1 - (if - (i32.ne - (local.tee $3 - (i32.load8_u - (local.get $0) - ) - ) - (local.tee $4 - (i32.load8_u - (local.get $1) + (return + (i32.sub + (i32.load8_u + (i32.add + (local.get $0) + (local.tee $2 + (i32.ctz + (i32.xor + (i8x16.bitmask + (local.get $3) + ) + (i32.const -1) + ) + ) ) ) ) - (then - (return - (i32.sub - (local.get $3) - (local.get $4) - ) - ) - ) - ) - (local.set $1 - (i32.add - (local.get $1) - (i32.const 1) - ) - ) - (local.set $0 - (i32.add - (local.get $0) - (i32.const 1) - ) - ) - (br_if $label1 - (local.tee $2 - (i32.sub + (i32.load8_u + (i32.add + (local.get $1) (local.get $2) + ) + ) + ) + ) + ) + (if + (local.get $2) + (then + (loop $label1 + (if + (i32.ne + (local.tee $4 + (i32.load8_u + (local.get $0) + ) + ) + (local.tee $5 + (i32.load8_u + (local.get $1) + ) + ) + ) + (then + (return + (i32.sub + (local.get $4) + (local.get $5) + ) + ) + ) + ) + (local.set $1 + (i32.add + (local.get $1) (i32.const 1) ) ) + (local.set $0 + (i32.add + (local.get $0) + (i32.const 1) + ) + ) + (br_if $label1 + (local.tee $2 + (i32.sub + (local.get $2) + (i32.const 1) + ) + ) + ) ) ) ) @@ -886,7 +913,9 @@ (func $strspn (param $0 i32) (param $1 i32) (result i32) (local $2 i32) (local $3 i32) - (local $4 v128) + (local $4 i32) + (local $5 v128) + (local $6 v128) (local $scratch i32) (if (i32.eqz @@ -902,7 +931,7 @@ ) ) ) - (block $block1 + (block $block (if (i32.eqz (i32.load8_u offset=1 @@ -910,50 +939,75 @@ ) ) (then - (block $block - (br_if $block - (i32.gt_u - (local.tee $1 - (local.get $0) - ) - (local.tee $3 - (i32.sub - (i32.shl - (memory.size) - (i32.const 16) - ) + (if + (i32.ge_u + (local.tee $4 + (i32.sub + (i32.shl + (memory.size) (i32.const 16) ) + (i32.const 16) ) ) - ) - (local.set $4 - (i8x16.splat - (local.get $2) + (local.tee $1 + (local.get $0) ) ) - (loop $label - (br_if $block - (i32.eqz - (i8x16.all_true - (i8x16.eq - (v128.load align=1 - (local.get $1) + (then + (local.set $5 + (i8x16.splat + (local.get $2) + ) + ) + (loop $label + (if + 
(i32.eqz + (i8x16.all_true + (local.tee $6 + (i8x16.eq + (v128.load align=1 + (i32.add + (local.get $0) + (local.get $3) + ) + ) + (local.get $5) + ) + ) + ) + ) + (then + (return + (i32.add + (i32.ctz + (i32.xor + (i8x16.bitmask + (local.get $6) + ) + (i32.const -1) + ) + ) + (local.get $3) ) - (local.get $4) ) ) ) - ) - (br_if $label - (i32.le_u - (local.tee $1 - (i32.add - (local.get $1) - (i32.const 16) + (br_if $label + (i32.le_u + (local.tee $1 + (i32.add + (local.get $0) + (local.tee $3 + (i32.add + (local.get $3) + (i32.const 16) + ) + ) + ) ) + (local.get $4) ) - (local.get $3) ) ) ) @@ -992,7 +1046,7 @@ ) ) ) - (br $block1) + (br $block) ) ) (v128.store @@ -1094,11 +1148,11 @@ (local.set $2 (local.get $0) ) - (block $block2 - (block $block3 - (block $block4 + (block $block1 + (block $block2 + (block $block3 (loop $label3 - (br_if $block2 + (br_if $block1 (i32.eqz (i32.load8_u (i32.add @@ -1110,7 +1164,7 @@ ) ) ) - (br_if $block3 + (br_if $block2 (i32.eqz (i32.load8_u (i32.add @@ -1122,7 +1176,7 @@ ) ) ) - (br_if $block4 + (br_if $block3 (i32.eqz (i32.load8_u (i32.add @@ -1162,7 +1216,7 @@ (i32.const 1) ) ) - (br $block2) + (br $block1) ) (local.set $2 (i32.add @@ -1170,7 +1224,7 @@ (i32.const 2) ) ) - (br $block2) + (br $block1) ) (local.set $2 (i32.add diff --git a/sqlite3/libc/math.h b/sqlite3/libc/math.h index 76128b3..485a29d 100644 --- a/sqlite3/libc/math.h +++ b/sqlite3/libc/math.h @@ -11,9 +11,16 @@ extern "C" { #ifdef __wasm_relaxed_simd__ +// This header assumes "relaxed fused multiply-add" +// is both faster and more precise. + +#define FP_FAST_FMA 1 + __attribute__((weak)) double fma(double x, double y, double z) { - const v128_t wx = wasm_f64x2_splat(x); + // If we get a software implementation from the host, + // this is enough to short circuit it on the 2nd lane. + const v128_t wx = wasm_f64x2_replace_lane(b, 0, x); const v128_t wy = wasm_f64x2_splat(y); const v128_t wz = wasm_f64x2_splat(z); const v128_t wr = wasm_f64x2_relaxed_madd(wx, wy, wz); diff --git a/sqlite3/libc/string.h b/sqlite3/libc/string.h index 0fd8996..b56703f 100644 --- a/sqlite3/libc/string.h +++ b/sqlite3/libc/string.h @@ -39,13 +39,6 @@ void *memmove(void *dest, const void *src, size_t n) { #ifdef __wasm_simd128__ // SIMD versions of some string.h functions. -// -// These assume aligned v128_t loads can't fail, -// and so can't unaligned loads up to the last -// aligned address less than memory size. -// -// These also assume unaligned access is not painfully slow, -// but that bitmask extraction is really slow on AArch64. __attribute__((weak)) int memcmp(const void *v1, const void *v2, size_t n) { @@ -55,9 +48,13 @@ int memcmp(const void *v1, const void *v2, size_t n) { const v128_t *w1 = (v128_t *)v1; const v128_t *w2 = (v128_t *)v2; for (; n >= sizeof(v128_t); n -= sizeof(v128_t)) { - // Find any single bit difference. - if (wasm_v128_any_true(wasm_v128_load(w1) ^ wasm_v128_load(w2))) { - break; + const v128_t cmp = wasm_i8x16_eq(wasm_v128_load(w1), wasm_v128_load(w2)); + // Bitmask is slow on AArch64, all_true is much faster. + if (!wasm_i8x16_all_true(cmp)) { + size_t ctz = __builtin_ctz(~wasm_i8x16_bitmask(cmp)); + const unsigned char *u1 = (unsigned char *)w1 + ctz; + const unsigned char *u2 = (unsigned char *)w2 + ctz; + return *u1 - *u2; } w1++; w2++; @@ -77,7 +74,7 @@ int memcmp(const void *v1, const void *v2, size_t n) { __attribute__((weak)) void *memchr(const void *v, int c, size_t n) { // When n is zero, a function that locates a character finds no occurrence. 
-  // Otherwise, decrement n to ensure __builtin_sub_overflow "overflows"
+  // Otherwise, decrement n to ensure __builtin_sub_overflow overflows
   // when n would go equal-to-or-below zero.
   if (n-- == 0) {
     return NULL;
@@ -98,7 +95,7 @@ void *memchr(const void *v, int c, size_t n) {
     // so we can count trailing zeros.
     int mask = wasm_i8x16_bitmask(cmp) >> align << align;
     // At least one bit will be set, unless we cleared them.
-    // Knowing this helps the compiler.
+    // Knowing this helps the compiler.
     __builtin_assume(mask || align);
     // If the mask is zero because of alignment,
     // it's as if we didn't find anything.
@@ -109,7 +106,7 @@ void *memchr(const void *v, int c, size_t n) {
       return ctz <= n + align ? (char *)w + ctz : NULL;
     }
   }
-  // Decrement n; if it "overflows" we're done.
+  // Decrement n; if it overflows we're done.
  if (__builtin_sub_overflow(n, sizeof(v128_t) - align, &n)) {
     return NULL;
   }
@@ -133,7 +130,7 @@ size_t strlen(const char *s) {
     // so we can count trailing zeros.
     int mask = wasm_i8x16_bitmask(cmp) >> align << align;
     // At least one bit will be set, unless we cleared them.
-    // Knowing this helps the compiler.
+    // Knowing this helps the compiler.
     __builtin_assume(mask || align);
     if (mask) {
       return (char *)w - s + __builtin_ctz(mask);
@@ -181,12 +178,23 @@ static int __strcmp(const char *s1, const char *s2) {
   return 0;
 }
 
+static int __strcmp_s(const char *s1, const char *s2) {
+  const unsigned char *u1 = (unsigned char *)s1;
+  const unsigned char *u2 = (unsigned char *)s2;
+  while (true) {
+    if (*u1 != *u2) return *u1 - *u2;
+    if (*u1 == 0) break;
+    u1++;
+    u2++;
+  }
+  return 0;
+}
+
 __attribute__((weak, always_inline))
 int strcmp(const char *s1, const char *s2) {
-  // Use strncmp when comparing against literal strings.
-  // If the literal is small, the vector search will be skipped.
-  if (__builtin_constant_p(strlen(s2))) {
-    return strncmp(s1, s2, strlen(s2));
+  // Skip the vector search when comparing against small literal strings.
+  if (__builtin_constant_p(strlen(s2)) && strlen(s2) < sizeof(v128_t)) {
+    return __strcmp_s(s1, s2);
   }
   return __strcmp(s1, s2);
 }
@@ -244,7 +252,7 @@ static char *__strchrnul(const char *s, int c) {
     // so we can count trailing zeros.
     int mask = wasm_i8x16_bitmask(cmp) >> align << align;
     // At least one bit will be set, unless we cleared them.
-    // Knowing this helps the compiler.
+    // Knowing this helps the compiler.
     __builtin_assume(mask || align);
     if (mask) {
       return (char *)w + __builtin_ctz(mask);
@@ -277,7 +285,7 @@ char *strchr(const char *s, int c) {
 __attribute__((weak))
 size_t strspn(const char *s, const char *c) {
 #ifndef _REENTRANT
-  static  // Avoid the stack for builds without threads.
+  static  // Avoid the stack for builds without threads.
 #endif
   char byteset[UCHAR_MAX + 1];
   const char *const a = s;
@@ -293,12 +301,16 @@ size_t strspn(const char *s, const char *c) {
     const v128_t *w = (v128_t *)s;
     const v128_t wc = wasm_i8x16_splat(*c);
     while (w <= limit) {
-      if (!wasm_i8x16_all_true(wasm_i8x16_eq(wasm_v128_load(w), wc))) {
-        break;
+      const v128_t cmp = wasm_i8x16_eq(wasm_v128_load(w), wc);
+      // Bitmask is slow on AArch64, all_true is much faster.
+      if (!wasm_i8x16_all_true(cmp)) {
+        size_t ctz = __builtin_ctz(~wasm_i8x16_bitmask(cmp));
+        return (char *)w + ctz - s;
       }
       w++;
     }
+    // Continue byte-by-byte.
s = (char *)w; while (*s == *c) s++; return s - a; @@ -311,20 +323,21 @@ size_t strspn(const char *s, const char *c) { while (*c && (byteset[*(unsigned char *)c] = 1)) c++; while (byteset[*(unsigned char *)s]) s++; -#else +#else // __OPTIMIZE__ // This is faster than memset. + // Going backward helps bounds check elimination. volatile v128_t *w = (v128_t *)byteset; #pragma unroll for (size_t i = sizeof(byteset) / sizeof(v128_t); i--;) w[i] = (v128_t){}; static_assert(sizeof(byteset) % sizeof(v128_t) == 0); - // Keeping byteset[0] = 0 avoids the other loop having to test for it. + // Keeping byteset[0] = 0 avoids the next loop needing that check. while (*c && (byteset[*(unsigned char *)c] = 1)) c++; #pragma unroll 4 while (byteset[*(unsigned char *)s]) s++; -#endif +#endif // __OPTIMIZE__ return s - a; } @@ -332,7 +345,7 @@ size_t strspn(const char *s, const char *c) { __attribute__((weak)) size_t strcspn(const char *s, const char *c) { #ifndef _REENTRANT - static // Avoid the stack for builds without threads. + static // Avoid the stack for builds without threads. #endif char byteset[UCHAR_MAX + 1]; const char *const a = s; @@ -346,24 +359,31 @@ size_t strcspn(const char *s, const char *c) { while ((byteset[*(unsigned char *)c] = 1) && *c) c++; while (!byteset[*(unsigned char *)s]) s++; -#else +#else // __OPTIMIZE__ // This is faster than memset. + // Going backward helps bounds check elimination. volatile v128_t *w = (v128_t *)byteset; #pragma unroll for (size_t i = sizeof(byteset) / sizeof(v128_t); i--;) w[i] = (v128_t){}; static_assert(sizeof(byteset) % sizeof(v128_t) == 0); - // Setting byteset[0] = 1 avoids the other loop having to test for it. + // Setting byteset[0] = 1 avoids the next loop needing that check. while ((byteset[*(unsigned char *)c] = 1) && *c) c++; #pragma unroll 4 while (!byteset[*(unsigned char *)s]) s++; -#endif +#endif // __OPTIMIZE__ return s - a; } +__attribute__((weak, always_inline)) +char *strpbrk(const char *s, const char *b) { + s += strcspn(s, b); + return *s ? (char *)s : 0; +} + #endif // __wasm_simd128__ #ifdef __cplusplus
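
Notes on the techniques above. The examples that follow are illustrative sketches, not part of the patch; names such as memcmp_sketch are made up for clarity.

The new memcmp (and the single-character strspn fast path) tests each 16-byte block with wasm_i8x16_all_true and only extracts a bitmask on the first mismatching block since, as the comments note, i8x16.bitmask is slow on AArch64. The trailing-zero count of the inverted mask is the index of the first differing byte. A minimal standalone version of that pattern, with a plain scalar tail:

#include <stddef.h>
#include <wasm_simd128.h>

// Sketch only: compare n bytes, 16 at a time, and on the first
// mismatching block locate the differing byte with ctz(~bitmask).
static int memcmp_sketch(const void *v1, const void *v2, size_t n) {
  const unsigned char *u1 = v1;
  const unsigned char *u2 = v2;
  while (n >= sizeof(v128_t)) {
    const v128_t cmp = wasm_i8x16_eq(wasm_v128_load(u1), wasm_v128_load(u2));
    if (!wasm_i8x16_all_true(cmp)) {
      // Equal lanes are 0xFF, so their bitmask bits are 1; inverting
      // the mask leaves 1s only on mismatches, and ctz finds the first.
      size_t i = __builtin_ctz(~wasm_i8x16_bitmask(cmp));
      return u1[i] - u2[i];
    }
    u1 += sizeof(v128_t);
    u2 += sizeof(v128_t);
    n -= sizeof(v128_t);
  }
  for (; n; n--, u1++, u2++) {  // scalar tail
    if (*u1 != *u2) return *u1 - *u2;
  }
  return 0;
}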
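
The math.h change computes fma through f64x2.relaxed_madd and keeps only lane 0. Building the x operand so that lane 1 carries a trivial value means that, if the host lowers relaxed_madd to a per-lane software fma, the second lane can take an early-out path. A sketch of that idea, assuming lane 1 is simply zeroed (the patch constructs wx with wasm_f64x2_replace_lane; wasm_f64x2_make is used here for brevity); build with -mrelaxed-simd:

#include <wasm_simd128.h>

// Sketch only: fused multiply-add via relaxed madd, real work in lane 0.
static double fma_sketch(double x, double y, double z) {
  const v128_t wx = wasm_f64x2_make(x, 0);  // lane 1 computes 0 * y + z
  const v128_t wy = wasm_f64x2_splat(y);
  const v128_t wz = wasm_f64x2_splat(z);
  const v128_t wr = wasm_f64x2_relaxed_madd(wx, wy, wz);
  return wasm_f64x2_extract_lane(wr, 0);
}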
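
The scalar strspn/strcspn paths build a 256-entry membership table. The patch clears it with backward volatile v128 stores (faster than memset here, and going backward helps bounds-check elimination) and marks the terminating NUL of c as a member, so the scan loop stops at the end of s without an extra test. A condensed strcspn-style sketch of that table scan:

#include <limits.h>
#include <stddef.h>
#include <wasm_simd128.h>

// Sketch only: length of the initial segment of s with no byte from c.
static size_t strcspn_sketch(const char *s, const char *c) {
  char byteset[UCHAR_MAX + 1];
  const char *const a = s;

  // Clear the table 16 bytes at a time, last block first.
  volatile v128_t *w = (v128_t *)byteset;
  for (size_t i = sizeof(byteset) / sizeof(v128_t); i--;) w[i] = (v128_t){};

  // Mark every byte of c, including its NUL: byteset[0] = 1 means the
  // scan below also stops at the end of s, with no separate check.
  while ((byteset[*(unsigned char *)c] = 1) && *c) c++;
  while (!byteset[*(unsigned char *)s]) s++;
  return s - a;
}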
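
The new strpbrk is strcspn plus one test: skip the longest prefix of s containing no byte of b, then either a member byte or the terminating NUL is under the cursor. A small usage example:

#include <stdio.h>
#include <string.h>

int main(void) {
  const char *path = "libc/string.h";
  // First byte of path that is either '/' or '.'.
  char *sep = strpbrk(path, "/.");
  printf("%s\n", sep ? sep : "(none)");  // prints "/string.h"
  return 0;
}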