commit d748d98e39 (parent 13b8642384)
Author: Nuno Cruces
Date: 2025-05-01 12:43:24 +01:00
4 changed files with 233 additions and 152 deletions

Binary file not shown.

View File

@@ -36,103 +36,130 @@
  (local.get $0)
 )
 (func $memcmp (param $0 i32) (param $1 i32) (param $2 i32) (result i32)
-  (local $3 i32)
-  (local $4 i32)
-  (block $block1
-   (block $block
-    (br_if $block
-     (i32.lt_u
-      (local.get $2)
-      (i32.const 16)
-     )
-    )
-    (loop $label
-     (br_if $block
-      (v128.any_true
-       (v128.xor
-        (v128.load align=1
-         (local.get $1)
-        )
-        (v128.load align=1
-         (local.get $0)
-        )
-       )
-      )
-     )
-     (local.set $1
-      (i32.add
-       (local.get $1)
-       (i32.const 16)
-      )
-     )
-     (local.set $0
-      (i32.add
-       (local.get $0)
-       (i32.const 16)
-      )
-     )
-     (br_if $label
-      (i32.gt_u
-       (local.tee $2
-        (i32.sub
-         (local.get $2)
-         (i32.const 16)
-        )
-       )
-       (i32.const 15)
-      )
-     )
-    )
-   )
-   (br_if $block1
-    (i32.eqz
-     (local.get $2)
-    )
-   )
-   (loop $label1
-    (if
-     (i32.ne
-      (local.tee $3
-       (i32.load8_u
-        (local.get $0)
-       )
-      )
-      (local.tee $4
-       (i32.load8_u
-        (local.get $1)
-       )
-      )
-     )
-     (then
-      (return
-       (i32.sub
-        (local.get $3)
-        (local.get $4)
-       )
-      )
-     )
-    )
-    (local.set $1
-     (i32.add
-      (local.get $1)
-      (i32.const 1)
-     )
-    )
-    (local.set $0
-     (i32.add
-      (local.get $0)
-      (i32.const 1)
-     )
-    )
-    (br_if $label1
-     (local.tee $2
-      (i32.sub
-       (local.get $2)
-       (i32.const 1)
-      )
-     )
-    )
-   )
-  )
+  (local $3 v128)
+  (local $4 i32)
+  (local $5 i32)
+  (block $block
+   (if
+    (i32.ge_u
+     (local.get $2)
+     (i32.const 16)
+    )
+    (then
+     (loop $label
+      (if
+       (i8x16.all_true
+        (local.tee $3
+         (i8x16.eq
+          (v128.load align=1
+           (local.get $0)
+          )
+          (v128.load align=1
+           (local.get $1)
+          )
+         )
+        )
+       )
+       (then
+        (local.set $1
+         (i32.add
+          (local.get $1)
+          (i32.const 16)
+         )
+        )
+        (local.set $0
+         (i32.add
+          (local.get $0)
+          (i32.const 16)
+         )
+        )
+        (br_if $label
+         (i32.gt_u
+          (local.tee $2
+           (i32.sub
+            (local.get $2)
+            (i32.const 16)
+           )
+          )
+          (i32.const 15)
+         )
+        )
+        (br $block)
+       )
+      )
+      (return
+       (i32.sub
+        (i32.load8_u
+         (i32.add
+          (local.get $0)
+          (local.tee $2
+           (i32.ctz
+            (i32.xor
+             (i8x16.bitmask
+              (local.get $3)
+             )
+             (i32.const -1)
+            )
+           )
+          )
+         )
+        )
+        (i32.load8_u
+         (i32.add
+          (local.get $1)
+          (local.get $2)
+         )
+        )
+       )
+      )
+     )
+    )
+   )
+  )
+  (if
+   (local.get $2)
+   (then
+    (loop $label1
+     (if
+      (i32.ne
+       (local.tee $4
+        (i32.load8_u
+         (local.get $0)
+        )
+       )
+       (local.tee $5
+        (i32.load8_u
+         (local.get $1)
+        )
+       )
+      )
+      (then
+       (return
+        (i32.sub
+         (local.get $4)
+         (local.get $5)
+        )
+       )
+      )
+     )
+     (local.set $1
+      (i32.add
+       (local.get $1)
+       (i32.const 1)
+      )
+     )
+     (local.set $0
+      (i32.add
+       (local.get $0)
+       (i32.const 1)
+      )
+     )
+     (br_if $label1
+      (local.tee $2
+       (i32.sub
+        (local.get $2)
+        (i32.const 1)
+       )
+      )
+     )
+    )
+   )
+  )
@@ -886,7 +913,9 @@
 (func $strspn (param $0 i32) (param $1 i32) (result i32)
   (local $2 i32)
   (local $3 i32)
-  (local $4 v128)
+  (local $4 i32)
+  (local $5 v128)
+  (local $6 v128)
   (local $scratch i32)
   (if
    (i32.eqz
@@ -902,7 +931,7 @@
     )
    )
   )
-  (block $block1
+  (block $block
    (if
     (i32.eqz
      (i32.load8_u offset=1
@@ -910,50 +939,75 @@
      )
     )
     (then
-     (block $block
-      (br_if $block
-       (i32.gt_u
-        (local.tee $1
-         (local.get $0)
-        )
-        (local.tee $3
-         (i32.sub
-          (i32.shl
-           (memory.size)
-           (i32.const 16)
-          )
-          (i32.const 16)
-         )
-        )
-       )
-      )
-      (local.set $4
-       (i8x16.splat
-        (local.get $2)
-       )
-      )
-      (loop $label
-       (br_if $block
-        (i32.eqz
-         (i8x16.all_true
-          (i8x16.eq
-           (v128.load align=1
-            (local.get $1)
-           )
-           (local.get $4)
-          )
-         )
-        )
-       )
-       (br_if $label
-        (i32.le_u
-         (local.tee $1
-          (i32.add
-           (local.get $1)
-           (i32.const 16)
-          )
-         )
-         (local.get $3)
-        )
-       )
-      )
-     )
+     (if
+      (i32.ge_u
+       (local.tee $4
+        (i32.sub
+         (i32.shl
+          (memory.size)
+          (i32.const 16)
+         )
+         (i32.const 16)
+        )
+       )
+       (local.tee $1
+        (local.get $0)
+       )
+      )
+      (then
+       (local.set $5
+        (i8x16.splat
+         (local.get $2)
+        )
+       )
+       (loop $label
+        (if
+         (i32.eqz
+          (i8x16.all_true
+           (local.tee $6
+            (i8x16.eq
+             (v128.load align=1
+              (i32.add
+               (local.get $0)
+               (local.get $3)
+              )
+             )
+             (local.get $5)
+            )
+           )
+          )
+         )
+         (then
+          (return
+           (i32.add
+            (i32.ctz
+             (i32.xor
+              (i8x16.bitmask
+               (local.get $6)
+              )
+              (i32.const -1)
+             )
+            )
+            (local.get $3)
+           )
+          )
+         )
+        )
+        (br_if $label
+         (i32.le_u
+          (local.tee $1
+           (i32.add
+            (local.get $0)
+            (local.tee $3
+             (i32.add
+              (local.get $3)
+              (i32.const 16)
+             )
+            )
+           )
+          )
+          (local.get $4)
+         )
+        )
+       )
+      )
+     )
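Both the old and new strspn loops bound the vector scan with (i32.sub (i32.shl (memory.size) (i32.const 16)) (i32.const 16)): memory.size is in 64 KiB pages, so the shift converts pages to bytes, and subtracting 16 yields the last address from which a full unaligned v128 load stays in bounds. A minimal C sketch of that bound, with a hypothetical helper name and assuming clang's __builtin_wasm_memory_size:

#include <stdint.h>

// Hypothetical helper, not the commit's code.
static uintptr_t last_safe_v128_load(void) {
    // memory.size is in 64 KiB pages; <<16 converts pages to bytes.
    uintptr_t bytes = (uintptr_t)__builtin_wasm_memory_size(0) << 16;
    // Last address where a 16-byte unaligned load stays in bounds.
    return bytes - 16;
}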
@@ -992,7 +1046,7 @@
      )
     )
    )
-   (br $block1)
+   (br $block)
    )
   )
   (v128.store
@@ -1094,11 +1148,11 @@
  (local.set $2
   (local.get $0)
  )
- (block $block2
-  (block $block3
-   (block $block4
+ (block $block1
+  (block $block2
+   (block $block3
     (loop $label3
-     (br_if $block2
+     (br_if $block1
       (i32.eqz
        (i32.load8_u
         (i32.add
@@ -1110,7 +1164,7 @@
         )
        )
       )
-     (br_if $block3
+     (br_if $block2
       (i32.eqz
        (i32.load8_u
         (i32.add
@@ -1122,7 +1176,7 @@
         )
        )
       )
-     (br_if $block4
+     (br_if $block3
       (i32.eqz
        (i32.load8_u
         (i32.add
@@ -1162,7 +1216,7 @@
       (i32.const 1)
      )
     )
-    (br $block2)
+    (br $block1)
    )
   (local.set $2
    (i32.add
@@ -1170,7 +1224,7 @@
       (i32.const 2)
      )
     )
-    (br $block2)
+    (br $block1)
    )
   (local.set $2
    (i32.add

View File

@@ -11,9 +11,16 @@ extern "C" {
 #ifdef __wasm_relaxed_simd__
 // This header assumes "relaxed fused multiply-add"
 // is both faster and more precise.
+#define FP_FAST_FMA 1
 __attribute__((weak))
 double fma(double x, double y, double z) {
-    const v128_t wx = wasm_f64x2_splat(x);
+    // If we get a software implementation from the host,
+    // this is enough to short circuit it on the 2nd lane.
+    const v128_t wx = wasm_f64x2_replace_lane(b, 0, x);
     const v128_t wy = wasm_f64x2_splat(y);
     const v128_t wz = wasm_f64x2_splat(z);
     const v128_t wr = wasm_f64x2_relaxed_madd(wx, wy, wz);
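For reference, the shape of the surrounding function: one scalar fma computed through the f64x2 relaxed form, reading the result back from lane 0. A minimal sketch under those assumptions, with a hypothetical name and a plain splat instead of the second-lane trick above:

#include <wasm_simd128.h>

// Hypothetical sketch, not the header's code: scalar fma via f64x2
// relaxed madd, which lowers to a hardware FMA where one exists.
static double fma_sketch(double x, double y, double z) {
    const v128_t wx = wasm_f64x2_splat(x);
    const v128_t wy = wasm_f64x2_splat(y);
    const v128_t wz = wasm_f64x2_splat(z);
    const v128_t wr = wasm_f64x2_relaxed_madd(wx, wy, wz);
    return wasm_f64x2_extract_lane(wr, 0); // only lane 0 is needed
}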

View File

@@ -39,13 +39,6 @@ void *memmove(void *dest, const void *src, size_t n) {
 #ifdef __wasm_simd128__
 
 // SIMD versions of some string.h functions.
 //
-// These assume aligned v128_t loads can't fail,
-// and so can't unaligned loads up to the last
-// aligned address less than memory size.
-//
-// These also assume unaligned access is not painfully slow,
-// but that bitmask extraction is really slow on AArch64.
-
 __attribute__((weak))
 int memcmp(const void *v1, const void *v2, size_t n) {
@@ -55,9 +48,13 @@ int memcmp(const void *v1, const void *v2, size_t n) {
     const v128_t *w1 = (v128_t *)v1;
     const v128_t *w2 = (v128_t *)v2;
     for (; n >= sizeof(v128_t); n -= sizeof(v128_t)) {
-        // Find any single bit difference.
-        if (wasm_v128_any_true(wasm_v128_load(w1) ^ wasm_v128_load(w2))) {
-            break;
+        const v128_t cmp = wasm_i8x16_eq(wasm_v128_load(w1), wasm_v128_load(w2));
+        // Bitmask is slow on AArch64, all_true is much faster.
+        if (!wasm_i8x16_all_true(cmp)) {
+            size_t ctz = __builtin_ctz(~wasm_i8x16_bitmask(cmp));
+            const unsigned char *u1 = (unsigned char *)w1 + ctz;
+            const unsigned char *u2 = (unsigned char *)w2 + ctz;
+            return *u1 - *u2;
         }
         w1++;
         w2++;
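The new early exit pinpoints the first mismatching byte: wasm_i8x16_eq sets matching lanes to all-ones, the bitmask packs one bit per lane, and inverting it turns the first mismatch into the lowest set bit. A sketch of just that step, as a hypothetical helper that assumes the two blocks are known to differ:

#include <stddef.h>
#include <wasm_simd128.h>

// Hypothetical helper, not the commit's code: index of the first
// differing byte between two 16-byte blocks that differ somewhere.
static size_t first_diff(const void *p, const void *q) {
    const v128_t cmp = wasm_i8x16_eq(wasm_v128_load(p), wasm_v128_load(q));
    // Equal bytes set their mask bit; invert so the first difference
    // becomes the lowest set bit, then count trailing zeros.
    return (size_t)__builtin_ctz(~wasm_i8x16_bitmask(cmp));
}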
@@ -77,7 +74,7 @@ int memcmp(const void *v1, const void *v2, size_t n) {
 __attribute__((weak))
 void *memchr(const void *v, int c, size_t n) {
     // When n is zero, a function that locates a character finds no occurrence.
-    // Otherwise, decrement n to ensure __builtin_sub_overflow "overflows"
+    // Otherwise, decrement n to ensure __builtin_sub_overflow overflows
     // when n would go equal-to-or-below zero.
     if (n-- == 0) {
         return NULL;
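The decrement turns __builtin_sub_overflow into the loop's end test: the unsigned subtraction wraps exactly when fewer bytes remain than the next step consumes. A self-contained sketch of the counting pattern, as a hypothetical helper taking 16-byte steps:

#include <stddef.h>

// Hypothetical helper: how many 16-byte steps a scan of n bytes takes
// when the count is kept as "bytes remaining minus one".
static size_t scan_steps(size_t n) {
    if (n-- == 0) return 0; // empty buffer: nothing to scan
    size_t steps = 1;
    // Wraparound of the unsigned subtraction ends the scan.
    while (!__builtin_sub_overflow(n, 16, &n)) steps++;
    return steps;
}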
@@ -98,7 +95,7 @@ void *memchr(const void *v, int c, size_t n) {
         // so we can count trailing zeros.
         int mask = wasm_i8x16_bitmask(cmp) >> align << align;
         // At least one bit will be set, unless we cleared them.
         // Knowing this helps the compiler.
         __builtin_assume(mask || align);
         // If the mask is zero because of alignment,
         // it's as if we didn't find anything.
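The shift pair mask >> align << align clears the match bits for the align bytes that sit before the real start of the buffer in the first aligned block, so a trailing-zero count cannot land on them. In isolation, as a hypothetical helper:

#include <stdint.h>
#include <wasm_simd128.h>

// Hypothetical helper: match mask for byte c within one aligned block,
// with matches before the true start (the low `align` bits) cleared.
static int block_mask(const v128_t *w, int c, int align) {
    const v128_t cmp = wasm_i8x16_eq(wasm_v128_load(w), wasm_i8x16_splat((int8_t)c));
    return wasm_i8x16_bitmask(cmp) >> align << align;
}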
@@ -109,7 +106,7 @@ void *memchr(const void *v, int c, size_t n) {
                 return ctz <= n + align ? (char *)w + ctz : NULL;
             }
         }
-        // Decrement n; if it "overflows" we're done.
+        // Decrement n; if it overflows we're done.
         if (__builtin_sub_overflow(n, sizeof(v128_t) - align, &n)) {
             return NULL;
         }
@@ -133,7 +130,7 @@ size_t strlen(const char *s) {
         // so we can count trailing zeros.
         int mask = wasm_i8x16_bitmask(cmp) >> align << align;
         // At least one bit will be set, unless we cleared them.
         // Knowing this helps the compiler.
         __builtin_assume(mask || align);
         if (mask) {
             return (char *)w - s + __builtin_ctz(mask);
@@ -181,12 +178,23 @@ static int __strcmp(const char *s1, const char *s2) {
     return 0;
 }
 
+static int __strcmp_s(const char *s1, const char *s2) {
+    const unsigned char *u1 = (unsigned char *)s1;
+    const unsigned char *u2 = (unsigned char *)s2;
+    while (true) {
+        if (*u1 != *u2) return *u1 - *u2;
+        if (*u1 == 0) break;
+        u1++;
+        u2++;
+    }
+    return 0;
+}
+
 __attribute__((weak, always_inline))
 int strcmp(const char *s1, const char *s2) {
-    // Use strncmp when comparing against literal strings.
-    // If the literal is small, the vector search will be skipped.
-    if (__builtin_constant_p(strlen(s2))) {
-        return strncmp(s1, s2, strlen(s2));
+    // Skip the vector search when comparing against small literal strings.
+    if (__builtin_constant_p(strlen(s2)) && strlen(s2) < sizeof(v128_t)) {
+        return __strcmp_s(s1, s2);
     }
     return __strcmp(s1, s2);
 }
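__builtin_constant_p(strlen(s2)) only folds to true when the compiler can evaluate the length, i.e. for literal arguments, so the scalar path is chosen at compile time and disappears entirely for runtime strings. For example, with an illustrative function name and standard semantics:

#include <string.h>

// With a literal, strlen("hello") folds to 5, the branch above is
// resolved at compile time, and only the scalar comparison survives.
static int is_hello(const char *s) {
    return strcmp(s, "hello") == 0;
}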
@@ -244,7 +252,7 @@ static char *__strchrnul(const char *s, int c) {
         // so we can count trailing zeros.
         int mask = wasm_i8x16_bitmask(cmp) >> align << align;
         // At least one bit will be set, unless we cleared them.
         // Knowing this helps the compiler.
         __builtin_assume(mask || align);
         if (mask) {
             return (char *)w + __builtin_ctz(mask);
@@ -277,7 +285,7 @@ char *strchr(const char *s, int c) {
 __attribute__((weak))
 size_t strspn(const char *s, const char *c) {
 #ifndef _REENTRANT
     static // Avoid the stack for builds without threads.
 #endif
     char byteset[UCHAR_MAX + 1];
     const char *const a = s;
@@ -293,12 +301,16 @@ size_t strspn(const char *s, const char *c) {
         const v128_t *w = (v128_t *)s;
         const v128_t wc = wasm_i8x16_splat(*c);
         while (w <= limit) {
-            if (!wasm_i8x16_all_true(wasm_i8x16_eq(wasm_v128_load(w), wc))) {
-                break;
+            const v128_t cmp = wasm_i8x16_eq(wasm_v128_load(w), wc);
+            // Bitmask is slow on AArch64, all_true is much faster.
+            if (!wasm_i8x16_all_true(cmp)) {
+                size_t ctz = __builtin_ctz(~wasm_i8x16_bitmask(cmp));
+                return (char *)w + ctz - s;
             }
             w++;
         }
-        // Continue byte-by-byte.
         s = (char *)w;
         while (*s == *c) s++;
         return s - a;
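With a one-byte set, strspn reduces to measuring the leading run of that byte, which is exactly what the vector loop above accelerates. Usage sketch, standard semantics:

#include <assert.h>
#include <string.h>

static void strspn_examples(void) {
    assert(strspn("aaab", "a") == 3);     // single-character fast path
    assert(strspn("abc123", "cba") == 3); // general byteset path
}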
@@ -311,20 +323,21 @@ size_t strspn(const char *s, const char *c) {
     while (*c && (byteset[*(unsigned char *)c] = 1)) c++;
     while (byteset[*(unsigned char *)s]) s++;
-#else
+#else // __OPTIMIZE__
     // This is faster than memset.
     // Going backward helps bounds check elimination.
     volatile v128_t *w = (v128_t *)byteset;
 #pragma unroll
     for (size_t i = sizeof(byteset) / sizeof(v128_t); i--;) w[i] = (v128_t){};
     static_assert(sizeof(byteset) % sizeof(v128_t) == 0);
-    // Keeping byteset[0] = 0 avoids the other loop having to test for it.
+    // Keeping byteset[0] = 0 avoids the next loop needing that check.
     while (*c && (byteset[*(unsigned char *)c] = 1)) c++;
 #pragma unroll 4
     while (byteset[*(unsigned char *)s]) s++;
-#endif
+#endif // __OPTIMIZE__
     return s - a;
 }
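The volatile store loop keeps the compiler from collapsing the zeroing loop back into a memset call, and counting down lets the bounds check hoist out. An isolated sketch, as a hypothetical helper:

#include <limits.h>
#include <stddef.h>
#include <wasm_simd128.h>

// Hypothetical standalone version of the clearing trick: sixteen
// v128 stores zero the 256-byte table.
static void clear_byteset(char byteset[UCHAR_MAX + 1]) {
    volatile v128_t *w = (v128_t *)byteset;
    for (size_t i = (UCHAR_MAX + 1) / sizeof(v128_t); i--;) w[i] = (v128_t){};
}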
@@ -332,7 +345,7 @@ size_t strspn(const char *s, const char *c) {
 __attribute__((weak))
 size_t strcspn(const char *s, const char *c) {
 #ifndef _REENTRANT
     static // Avoid the stack for builds without threads.
 #endif
     char byteset[UCHAR_MAX + 1];
     const char *const a = s;
@@ -346,24 +359,31 @@ size_t strcspn(const char *s, const char *c) {
     while ((byteset[*(unsigned char *)c] = 1) && *c) c++;
     while (!byteset[*(unsigned char *)s]) s++;
-#else
+#else // __OPTIMIZE__
     // This is faster than memset.
     // Going backward helps bounds check elimination.
     volatile v128_t *w = (v128_t *)byteset;
 #pragma unroll
     for (size_t i = sizeof(byteset) / sizeof(v128_t); i--;) w[i] = (v128_t){};
     static_assert(sizeof(byteset) % sizeof(v128_t) == 0);
-    // Setting byteset[0] = 1 avoids the other loop having to test for it.
+    // Setting byteset[0] = 1 avoids the next loop needing that check.
     while ((byteset[*(unsigned char *)c] = 1) && *c) c++;
 #pragma unroll 4
     while (!byteset[*(unsigned char *)s]) s++;
-#endif
+#endif // __OPTIMIZE__
     return s - a;
 }
 
+__attribute__((weak, always_inline))
+char *strpbrk(const char *s, const char *b) {
+    s += strcspn(s, b);
+    return *s ? (char *)s : 0;
+}
#endif // __wasm_simd128__
#ifdef __cplusplus
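The new strpbrk falls out of strcspn directly: skip the longest prefix containing none of the set's bytes, and the cursor lands either on a member byte or on the terminator. Usage sketch, standard semantics:

#include <string.h>

// First byte from the set, or NULL when none occurs.
static char *find_delim(const char *s) {
    return strpbrk(s, "=:"); // for "key=value", points at "=value"
}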