From c159bbd88f6721cb8e6a769dddce066cd3b97a10 Mon Sep 17 00:00:00 2001 From: Nuno Cruces Date: Wed, 4 Jun 2025 11:28:24 +0100 Subject: [PATCH] Docs, tweaks. --- sqlite3/libc/README.md | 41 +++++++++++++++++++++++++++++++++++++++++ sqlite3/libc/string.h | 35 ++++++++++++++++++----------------- 2 files changed, 59 insertions(+), 17 deletions(-) create mode 100644 sqlite3/libc/README.md diff --git a/sqlite3/libc/README.md b/sqlite3/libc/README.md new file mode 100644 index 0000000..b30a8a5 --- /dev/null +++ b/sqlite3/libc/README.md @@ -0,0 +1,41 @@ +# Using SIMD for libc + +I found that implementing some libc functions with Wasm SIMD128 can make them significantly faster. + +Rough numbers for [wazero](https://wazero.io/): + + function | speedup +------------ | ----- +`strlen` | 4.1× +`memchr` | 4.1× +`strchr` | 4.0× +`strrchr` | 9.1× +`memcmp` | 13.0× +`strcmp` | 10.4× +`strncmp` | 15.7× +`strcasecmp` | 8.8× +`strncasecmp`| 8.6× +`strspn` | 9.9× +`strcspn` | 9.0× +`memmem` | 2.2× +`strstr` | 5.5× +`strcasestr` | 25.2× + +For functions where musl uses SWAR on a 4-byte `size_t`, +the improvement is around 4×. +This is very close to the expected theoretical improvement, +as we're processing 4× the bytes per cycle (16 _vs._ 4). + +For other functions where there's no algorithmic change, +the improvement is around 8×. +These functions are harder to optimize +(which is why musl doesn't bother with SWAR), +so getting an 8× improvement from processing 16× the bytes seems decent. + +String search is harder to compare, since there are algorithmic changes, +and different needles produce very different numbers. +We use [Quick Search](https://igm.univ-mlv.fr/~lecroq/string/node19.html) for `memmem`, +and [Rabin–Karp](https://igm.univ-mlv.fr/~lecroq/string/node5.html) for `strstr` and `strcasestr`; +musl uses [Two-Way](https://igm.univ-mlv.fr/~lecroq/string/node26.html) for `memmem` and `strstr`, +and [brute force](https://igm.univ-mlv.fr/~lecroq/string/node3.html) for `strcasestr`. 
+Unlike Two-Way, both replacements can go quadratic for long, periodic needles. \ No newline at end of file diff --git a/sqlite3/libc/string.h b/sqlite3/libc/string.h index e1314ef..a040674 100644 --- a/sqlite3/libc/string.h +++ b/sqlite3/libc/string.h @@ -402,8 +402,8 @@ size_t strspn(const char *s, const char *c) { __wasm_v128_bitmap256_t bitmap = {}; for (; *c; c++) { - __wasm_v128_setbit(&bitmap, *c); // Terminator IS NOT on the bitmap. + __wasm_v128_setbit(&bitmap, *c); } for (; N >= sizeof(v128_t); N -= sizeof(v128_t)) { @@ -433,11 +433,10 @@ size_t strcspn(const char *s, const char *c) { __wasm_v128_bitmap256_t bitmap = {}; - for (;;) { - __wasm_v128_setbit(&bitmap, *c); + do { // Terminator IS on the bitmap. - if (!*c++) break; - } + __wasm_v128_setbit(&bitmap, *c); + } while (*c++); for (; N >= sizeof(v128_t); N -= sizeof(v128_t)) { const v128_t cmp = __wasm_v128_chkbits(bitmap, wasm_v128_load(w)); @@ -465,13 +464,13 @@ size_t strcspn(const char *s, const char *c) { // We augment the SIMD algorithm with Quick Search's // bad-character shift. // -// https://www-igm.univ-mlv.fr/~lecroq/string/node14.html -// https://www-igm.univ-mlv.fr/~lecroq/string/node18.html -// https://www-igm.univ-mlv.fr/~lecroq/string/node19.html -// https://www-igm.univ-mlv.fr/~lecroq/string/node22.html +// https://igm.univ-mlv.fr/~lecroq/string/node14.html +// https://igm.univ-mlv.fr/~lecroq/string/node18.html +// https://igm.univ-mlv.fr/~lecroq/string/node19.html +// https://igm.univ-mlv.fr/~lecroq/string/node22.html -static const char *__memmem(const char *haystk, size_t sh, - const char *needle, size_t sn, +static const char *__memmem(const char *haystk, size_t sh, // + const char *needle, size_t sn, // uint8_t bmbc[256]) { // We've handled empty and single character needles. // The needle is not longer than the haystack. 
@@ -490,8 +489,8 @@ static const char *__memmem(const char *haystk, size_t sh, const v128_t lst = wasm_i8x16_splat(needle[i]); // The last haystack offset for which loading blk_lst is safe. - const char *H = (char *)(__builtin_wasm_memory_size(0) * PAGESIZE - i - - sizeof(v128_t)); + const char *H = (char *)(__builtin_wasm_memory_size(0) * PAGESIZE - // + (sizeof(v128_t) + i)); while (haystk <= H) { const v128_t blk_fst = wasm_v128_load((v128_t *)(haystk)); @@ -622,8 +621,8 @@ char *strcasestr(const char *haystk, const char *needle) { const v128_t lstu = wasm_i8x16_splat(toupper(needle[i])); // The last haystk offset for which loading blk_lst is safe. - const char *H = - (char *)(__builtin_wasm_memory_size(0) * PAGESIZE - i - sizeof(v128_t)); + const char *H = (char *)(__builtin_wasm_memory_size(0) * PAGESIZE - // + (sizeof(v128_t) + i)); while (haystk <= H) { const v128_t blk_fst = wasm_v128_load((v128_t *)(haystk)); @@ -680,7 +679,8 @@ char *strcasestr(const char *haystk, const char *needle) { // - strtok __attribute__((weak)) -void *memccpy(void *__restrict dest, const void *__restrict src, int c, size_t n) { +void *memccpy(void *__restrict dest, const void *__restrict src, int c, + size_t n) { const void *m = memchr(src, c, n); if (m != NULL) { n = (char *)m - (char *)src + 1; @@ -717,7 +717,8 @@ static char *__stpcpy(char *__restrict dest, const char *__restrict src) { return dest + slen; } -static char *__stpncpy(char *__restrict dest, const char *__restrict src, size_t n) { +static char *__stpncpy(char *__restrict dest, const char *__restrict src, + size_t n) { size_t strnlen(const char *s, size_t n); size_t slen = strnlen(src, n); memcpy(dest, src, slen);