diff --git a/sqlite3/libc/build.sh b/sqlite3/libc/build.sh index cff8387..b9082d7 100755 --- a/sqlite3/libc/build.sh +++ b/sqlite3/libc/build.sh @@ -10,10 +10,13 @@ SRCS="${1:-libc.c}" "../tools.sh" trap 'rm -f libc.c libc.tmp' EXIT -echo '#include ' > libc.c -echo '#include ' >> libc.c +cat << EOF > libc.c +#include +#include +EOF "$WASI_SDK/clang" --target=wasm32-wasi -std=c23 -g0 -O2 \ + -Wall -Wextra -Wno-unused-parameter -Wno-unused-function \ -o libc.wasm -I. "$SRCS" \ -mexec-model=reactor \ -msimd128 -mmutable-globals -mmultivalue \ diff --git a/sqlite3/libc/libc.wasm b/sqlite3/libc/libc.wasm index 4358642..0566afe 100755 Binary files a/sqlite3/libc/libc.wasm and b/sqlite3/libc/libc.wasm differ diff --git a/sqlite3/libc/libc.wat b/sqlite3/libc/libc.wat index 9944612..2b7a27d 100644 --- a/sqlite3/libc/libc.wat +++ b/sqlite3/libc/libc.wat @@ -142,42 +142,53 @@ (local $3 i32) (local $4 i32) (local $5 i32) - (local $6 v128) + (local $6 i32) (local $7 v128) - (local.set $4 - (i32.and - (local.get $0) - (i32.const 15) + (local $8 v128) + (local $scratch i32) + (block $block + (br_if $block + (i32.eqz + (local.get $2) + ) ) - ) - (block $block1 - (block $block - (if - (v128.any_true - (local.tee $6 - (i8x16.eq - (v128.load - (local.tee $3 - (i32.and - (local.get $0) - (i32.const -16) - ) - ) - ) + (local.set $4 + (i32.and + (local.get $0) + (i32.const 15) + ) + ) + (block $block2 + (block $block1 + (br_if $block1 + (i32.eqz + (v128.any_true (local.tee $7 - (i8x16.splat - (local.get $1) + (i8x16.eq + (v128.load + (local.tee $3 + (i32.and + (local.get $0) + (i32.const -16) + ) + ) + ) + (local.tee $8 + (i8x16.splat + (local.get $1) + ) + ) ) ) ) ) ) - (then - (br_if $block - (local.tee $1 + (br_if $block1 + (i32.eqz + (local.tee $5 (i32.and (i8x16.bitmask - (local.get $6) + (local.get $7) ) (i32.shl (i32.const -1) @@ -187,44 +198,63 @@ ) ) ) + (local.set $1 + (local.get $2) + ) + (br $block2) ) - (br_if $block1 - (i32.gt_u + (br_if $block + (i32.lt_u + (local.get $2) (local.tee $1 (i32.sub - (i32.add - (local.get $2) - (local.get $4) + (local.get $2) + (local.tee $3 + (i32.sub + (i32.const 16) + (local.get $4) + ) ) - (i32.const 16) ) ) - (local.get $2) + ) + ) + (br_if $block + (i32.eqz + (local.get $1) ) ) (local.set $3 (i32.add - (i32.sub - (local.get $0) - (local.get $4) - ) - (i32.const 16) + (local.get $0) + (local.get $3) ) ) - (block $block2 + (block $block3 (loop $label - (br_if $block2 + (br_if $block3 (v128.any_true - (local.tee $6 + (local.tee $7 (i8x16.eq (v128.load (local.get $3) ) - (local.get $7) + (local.get $8) ) ) ) ) + (br_if $block + (i32.gt_u + (local.tee $0 + (i32.sub + (local.get $1) + (i32.const 16) + ) + ) + (local.get $1) + ) + ) (local.set $3 (i32.add (local.get $3) @@ -232,35 +262,49 @@ ) ) (br_if $label - (i32.ge_u - (local.get $1) - (local.tee $1 - (i32.sub - (local.get $1) - (i32.const 16) + (i32.eqz + (block (result i32) + (local.set $scratch + (i32.eq + (local.get $1) + (i32.const 16) + ) ) + (local.set $1 + (local.get $0) + ) + (local.get $scratch) ) ) ) ) - (br $block1) + (br $block) ) - (local.set $1 + (local.set $5 (i8x16.bitmask - (local.get $6) + (local.get $7) ) ) ) - (local.set $5 - (i32.add - (local.get $3) - (i32.ctz + (local.set $6 + (select + (i32.add + (local.get $3) + (local.tee $0 + (i32.ctz + (local.get $5) + ) + ) + ) + (i32.const 0) + (i32.lt_u + (local.get $0) (local.get $1) ) ) ) ) - (local.get $5) + (local.get $6) ) (func $strlen (param $0 i32) (result i32) (local $1 i32) diff --git a/sqlite3/libc/libc_test.go b/sqlite3/libc/libc_test.go index 9fd4c3a..ba7bbca 100644 --- a/sqlite3/libc/libc_test.go +++ b/sqlite3/libc/libc_test.go @@ -116,6 +116,9 @@ func Benchmark_memchr(b *testing.B) { if got := call(memchr, ptr1, 5, size); got != ptr1+size/2 { b.Fatal(got) } + if got := call(memchr, ptr1, 5, size/2); got != 0 { + b.Fatal(got, ptr1+size/2) + } } func Benchmark_memcmp(b *testing.B) { diff --git a/sqlite3/libc/string.h b/sqlite3/libc/string.h index ea4d5bb..140d701 100644 --- a/sqlite3/libc/string.h +++ b/sqlite3/libc/string.h @@ -45,7 +45,7 @@ void *memmove(void *dest, const void *src, size_t n) { // aligned address less than memory size. // // These also assume unaligned access is not painfully slow, -// but that bitmask extraction is slow on AArch64. +// but that bitmask extraction is really slow on AArch64. __attribute__((weak)) int memcmp(const void *v1, const void *v2, size_t n) { @@ -75,13 +75,14 @@ void *memchr(const void *v, int c, size_t n) { const v128_t *w = (void *)(v - align); const v128_t wc = wasm_i8x16_splat(c); - while (true) { + while (n) { const v128_t cmp = wasm_i8x16_eq(*w, wc); if (wasm_v128_any_true(cmp)) { int mask = wasm_i8x16_bitmask(cmp) >> align << align; __builtin_assume(mask || align); if (mask) { - return (void *)w + __builtin_ctz(mask); + size_t ctz = __builtin_ctz(mask); + return ctz < n ? (void *)w + ctz : NULL; } } if (__builtin_sub_overflow(n, sizeof(v128_t) - align, &n)) { @@ -90,6 +91,7 @@ void *memchr(const void *v, int c, size_t n) { align = 0; w++; } + return NULL; } __attribute__((weak)) @@ -111,12 +113,7 @@ size_t strlen(const char *s) { } } -__attribute__((weak)) -int strcmp(const char *s1, const char *s2) { - if (__builtin_constant_p(__builtin_strlen(s2))) { - return strncmp(s1, s2, __builtin_strlen(s2)); - } - +static int __strcmp(const char *s1, const char *s2) { const v128_t *const limit = (v128_t *)(__builtin_wasm_memory_size(0) * PAGESIZE) - 1; @@ -144,6 +141,14 @@ int strcmp(const char *s1, const char *s2) { return 0; } +__attribute__((weak, always_inline)) +int strcmp(const char *s1, const char *s2) { + if (__builtin_constant_p(strlen(s2))) { + return strncmp(s1, s2, strlen(s2)); + } + return __strcmp(s1, s2); +} + __attribute__((weak)) int strncmp(const char *s1, const char *s2, size_t n) { const v128_t *const limit = @@ -173,12 +178,7 @@ int strncmp(const char *s1, const char *s2, size_t n) { return 0; } -__attribute__((always_inline)) static char *__strchrnul(const char *s, int c) { - if (__builtin_constant_p(c) && (char)c == 0) { - return (char *)s + strlen(s); - } - uintptr_t align = (uintptr_t)s % sizeof(v128_t); const v128_t *w = (void *)(s - align); const v128_t wc = wasm_i8x16_splat(c); @@ -197,13 +197,19 @@ static char *__strchrnul(const char *s, int c) { } } -__attribute__((weak)) +__attribute__((weak, always_inline)) char *strchrnul(const char *s, int c) { + if (__builtin_constant_p(c) && (char)c == 0) { + return (char *)s + strlen(s); + } return __strchrnul(s, c); } -__attribute__((weak)) +__attribute__((weak, always_inline)) char *strchr(const char *s, int c) { + if (__builtin_constant_p(c) && (char)c == 0) { + return (char *)s + strlen(s); + } char *r = __strchrnul(s, c); return *(char *)r == (char)c ? r : NULL; }