Docs, tweaks.

This commit is contained in:
Nuno Cruces
2025-06-04 11:28:24 +01:00
parent c90f8205f7
commit c159bbd88f
2 changed files with 59 additions and 17 deletions

41
sqlite3/libc/README.md Normal file
View File

@@ -0,0 +1,41 @@
# Using SIMD for libc
I found that implementing some libc functions with Wasm SIMD128 can make them significantly faster.
Rough numbers for [wazero](https://wazero.io/):
function | speedup
------------ | -----
`strlen` | 4.1×
`memchr` | 4.1×
`strchr` | 4.0×
`strrchr` | 9.1×
`memcmp` | 13.0×
`strcmp` | 10.4×
`strncmp` | 15.7×
`strcasecmp` | 8.8×
`strncasecmp`| 8.6×
`strspn` | 9.9×
`strcspn` | 9.0×
`memmem` | 2.2×
`strstr` | 5.5×
`strcasestr` | 25.2×
For functions where musl uses SWAR on a 4-byte `size_t`,
the improvement is around 4×.
This is very close to the expected theoretical improvement,
as we're processing 4× the bytes per cycle (16 _vs._ 4).
For other functions where there's no algorithmic change,
the improvement is around 8×.
These functions are harder to optimize
(which is why musl doesn't bother with SWAR),
so getting an 8× improvement from processing 16× the bytes seems decent.
String search is harder to compare, since there are algorithmic changes,
and different needles produce very different numbers.
We use [Quick Search](https://igm.univ-mlv.fr/~lecroq/string/node19.html) for `memmem`,
and [Rabin–Karp](https://igm.univ-mlv.fr/~lecroq/string/node5.html) for `strstr` and `strcasestr`;
musl uses [Two Way](https://igm.univ-mlv.fr/~lecroq/string/node26.html) for `memmem` and `strstr`,
and [brute force](https://igm.univ-mlv.fr/~lecroq/string/node3.html) for `strcasestr`.
Unlike Two Way, both Quick Search and Rabin–Karp can go quadratic for long, periodic needles.

View File

@@ -402,8 +402,8 @@ size_t strspn(const char *s, const char *c) {
__wasm_v128_bitmap256_t bitmap = {};
for (; *c; c++) {
__wasm_v128_setbit(&bitmap, *c);
// Terminator IS NOT on the bitmap.
__wasm_v128_setbit(&bitmap, *c);
}
for (; N >= sizeof(v128_t); N -= sizeof(v128_t)) {
@@ -433,11 +433,10 @@ size_t strcspn(const char *s, const char *c) {
__wasm_v128_bitmap256_t bitmap = {};
for (;;) {
__wasm_v128_setbit(&bitmap, *c);
do {
// Terminator IS on the bitmap.
if (!*c++) break;
}
__wasm_v128_setbit(&bitmap, *c);
} while (*c++);
for (; N >= sizeof(v128_t); N -= sizeof(v128_t)) {
const v128_t cmp = __wasm_v128_chkbits(bitmap, wasm_v128_load(w));
@@ -465,13 +464,13 @@ size_t strcspn(const char *s, const char *c) {
// We augment the SIMD algorithm with Quick Search's
// bad-character shift.
//
// https://www-igm.univ-mlv.fr/~lecroq/string/node14.html
// https://www-igm.univ-mlv.fr/~lecroq/string/node18.html
// https://www-igm.univ-mlv.fr/~lecroq/string/node19.html
// https://www-igm.univ-mlv.fr/~lecroq/string/node22.html
// https://igm.univ-mlv.fr/~lecroq/string/node14.html
// https://igm.univ-mlv.fr/~lecroq/string/node18.html
// https://igm.univ-mlv.fr/~lecroq/string/node19.html
// https://igm.univ-mlv.fr/~lecroq/string/node22.html
static const char *__memmem(const char *haystk, size_t sh,
const char *needle, size_t sn,
static const char *__memmem(const char *haystk, size_t sh, //
const char *needle, size_t sn, //
uint8_t bmbc[256]) {
// We've handled empty and single character needles.
// The needle is not longer than the haystack.
@@ -490,8 +489,8 @@ static const char *__memmem(const char *haystk, size_t sh,
const v128_t lst = wasm_i8x16_splat(needle[i]);
// The last haystack offset for which loading blk_lst is safe.
const char *H = (char *)(__builtin_wasm_memory_size(0) * PAGESIZE - i -
sizeof(v128_t));
const char *H = (char *)(__builtin_wasm_memory_size(0) * PAGESIZE - //
(sizeof(v128_t) + i));
while (haystk <= H) {
const v128_t blk_fst = wasm_v128_load((v128_t *)(haystk));
@@ -622,8 +621,8 @@ char *strcasestr(const char *haystk, const char *needle) {
const v128_t lstu = wasm_i8x16_splat(toupper(needle[i]));
// The last haystk offset for which loading blk_lst is safe.
const char *H =
(char *)(__builtin_wasm_memory_size(0) * PAGESIZE - i - sizeof(v128_t));
const char *H = (char *)(__builtin_wasm_memory_size(0) * PAGESIZE - //
(sizeof(v128_t) + i));
while (haystk <= H) {
const v128_t blk_fst = wasm_v128_load((v128_t *)(haystk));
@@ -680,7 +679,8 @@ char *strcasestr(const char *haystk, const char *needle) {
// - strtok
__attribute__((weak))
void *memccpy(void *__restrict dest, const void *__restrict src, int c, size_t n) {
void *memccpy(void *__restrict dest, const void *__restrict src, int c,
size_t n) {
const void *m = memchr(src, c, n);
if (m != NULL) {
n = (char *)m - (char *)src + 1;
@@ -717,7 +717,8 @@ static char *__stpcpy(char *__restrict dest, const char *__restrict src) {
return dest + slen;
}
static char *__stpncpy(char *__restrict dest, const char *__restrict src, size_t n) {
static char *__stpncpy(char *__restrict dest, const char *__restrict src,
size_t n) {
size_t strnlen(const char *s, size_t n);
size_t slen = strnlen(src, n);
memcpy(dest, src, slen);