Docs, tweaks.

2026-01-12 05:59:14 +00:00 · 2025-06-04 11:28:24 +01:00
parent c90f8205f7
commit c159bbd88f
2 changed files with 59 additions and 17 deletions
--- a/sqlite3/libc/README.md
+++ b/sqlite3/libc/README.md
@@ -0,0 +1,41 @@
+# Using SIMD for libc
+
+I found that implementing some libc functions with Wasm SIMD128 can make them significantly faster.
+
+Rough numbers for [wazero](https://wazero.io/):
+
+  function   | speedup
+------------ | -----
+`strlen`     |  4.1×
+`memchr`     |  4.1×
+`strchr`     |  4.0×
+`strrchr`    |  9.1×
+`memcmp`     | 13.0×
+`strcmp`     | 10.4×
+`strncmp`    | 15.7×
+`strcasecmp` |  8.8×
+`strncasecmp`|  8.6×
+`strspn`     |  9.9×
+`strcspn`    |  9.0×
+`memmem`     |  2.2×
+`strstr`     |  5.5×
+`strcasestr` | 25.2×
+
+For functions where musl uses SWAR on a 4-byte `size_t`,
+the improvement is around 4×.
+This is very close to the expected theoretical improvement,
+as we're processing 4× the bytes per cycle (16 _vs._ 4).
+
+For other functions where there's no algorithmic change,
+the improvement is around 8×.
+These functions are harder to optimize
+(which is why musl doesn't bother with SWAR),
+so getting an 8× improvement from processing 16× the bytes seems decent.
+
+String search is harder to compare, since there are algorithmic changes,
+and different needles produce very different numbers.
+We use [Quick Search](https://igm.univ-mlv.fr/~lecroq/string/node19.html) for `memmem`,
+and [Rabin–Karp](https://igm.univ-mlv.fr/~lecroq/string/node5.html) for `strstr` and `strcasestr`;
+musl uses [Two Way](https://igm.univ-mlv.fr/~lecroq/string/node26.html) for `memmem` and `strstr`,
+and [brute force](https://igm.univ-mlv.fr/~lecroq/string/node3.html) for `strcasestr`.
+Unlike Two Way, both replacements can take quadratic time for long, periodic needles.
--- a/sqlite3/libc/string.h
+++ b/sqlite3/libc/string.h
@@ -402,8 +402,8 @@ size_t strspn(const char *s, const char *c) {
  __wasm_v128_bitmap256_t bitmap = {};

  for (; *c; c++) {
-    __wasm_v128_setbit(&bitmap, *c);
    // Terminator IS NOT on the bitmap.
+    __wasm_v128_setbit(&bitmap, *c);
  }

  for (; N >= sizeof(v128_t); N -= sizeof(v128_t)) {
@@ -433,11 +433,10 @@ size_t strcspn(const char *s, const char *c) {

  __wasm_v128_bitmap256_t bitmap = {};

-  for (;;) {
-    __wasm_v128_setbit(&bitmap, *c);
+  do {
    // Terminator IS on the bitmap.
-    if (!*c++) break;
-  }
+    __wasm_v128_setbit(&bitmap, *c);
+  } while (*c++);

  for (; N >= sizeof(v128_t); N -= sizeof(v128_t)) {
    const v128_t cmp = __wasm_v128_chkbits(bitmap, wasm_v128_load(w));
@@ -465,13 +464,13 @@ size_t strcspn(const char *s, const char *c) {
 // We augment the SIMD algorithm with Quick Search's
 // bad-character shift.
 //
-// https://www-igm.univ-mlv.fr/~lecroq/string/node14.html
-// https://www-igm.univ-mlv.fr/~lecroq/string/node18.html
-// https://www-igm.univ-mlv.fr/~lecroq/string/node19.html
-// https://www-igm.univ-mlv.fr/~lecroq/string/node22.html
+// https://igm.univ-mlv.fr/~lecroq/string/node14.html
+// https://igm.univ-mlv.fr/~lecroq/string/node18.html
+// https://igm.univ-mlv.fr/~lecroq/string/node19.html
+// https://igm.univ-mlv.fr/~lecroq/string/node22.html

-static const char *__memmem(const char *haystk, size_t sh,
-                            const char *needle, size_t sn,
+static const char *__memmem(const char *haystk, size_t sh,  //
+                            const char *needle, size_t sn,  //
                            uint8_t bmbc[256]) {
  // We've handled empty and single character needles.
  // The needle is not longer than the haystack.
@@ -490,8 +489,8 @@ static const char *__memmem(const char *haystk, size_t sh,
  const v128_t lst = wasm_i8x16_splat(needle[i]);

  // The last haystack offset for which loading blk_lst is safe.
-  const char *H = (char *)(__builtin_wasm_memory_size(0) * PAGESIZE - i -
-                           sizeof(v128_t));
+  const char *H = (char *)(__builtin_wasm_memory_size(0) * PAGESIZE -  //
+                           (sizeof(v128_t) + i));

  while (haystk <= H) {
    const v128_t blk_fst = wasm_v128_load((v128_t *)(haystk));
@@ -622,8 +621,8 @@ char *strcasestr(const char *haystk, const char *needle) {
  const v128_t lstu = wasm_i8x16_splat(toupper(needle[i]));

  // The last haystk offset for which loading blk_lst is safe.
-  const char *H =
-      (char *)(__builtin_wasm_memory_size(0) * PAGESIZE - i - sizeof(v128_t));
+  const char *H = (char *)(__builtin_wasm_memory_size(0) * PAGESIZE -  //
+                           (sizeof(v128_t) + i));

  while (haystk <= H) {
    const v128_t blk_fst = wasm_v128_load((v128_t *)(haystk));
@@ -680,7 +679,8 @@ char *strcasestr(const char *haystk, const char *needle) {
 //  - strtok

 __attribute__((weak))
-void *memccpy(void *__restrict dest, const void *__restrict src, int c, size_t n) {
+void *memccpy(void *__restrict dest, const void *__restrict src, int c,
+              size_t n) {
  const void *m = memchr(src, c, n);
  if (m != NULL) {
    n = (char *)m - (char *)src + 1;
@@ -717,7 +717,8 @@ static char *__stpcpy(char *__restrict dest, const char *__restrict src) {
  return dest + slen;
 }

-static char *__stpncpy(char *__restrict dest, const char *__restrict src, size_t n) {
+static char *__stpncpy(char *__restrict dest, const char *__restrict src,
+                       size_t n) {
  size_t strnlen(const char *s, size_t n);
  size_t slen = strnlen(src, n);
  memcpy(dest, src, slen);