diff --git a/sqlite3/libc/libc.wasm b/sqlite3/libc/libc.wasm index f8b8f94..42dbc39 100755 Binary files a/sqlite3/libc/libc.wasm and b/sqlite3/libc/libc.wasm differ diff --git a/sqlite3/libc/libc.wat b/sqlite3/libc/libc.wat index 9302590..b11ea9f 100644 --- a/sqlite3/libc/libc.wat +++ b/sqlite3/libc/libc.wat @@ -36,103 +36,130 @@ (local.get $0) ) (func $memcmp (param $0 i32) (param $1 i32) (param $2 i32) (result i32) - (local $3 i32) + (local $3 v128) (local $4 i32) - (block $block1 - (block $block + (local $5 i32) + (block $block + (br_if $block + (i32.lt_u + (local.get $2) + (i32.const 16) + ) + ) + (loop $label (if - (i32.ge_u - (local.get $2) - (i32.const 16) - ) - (then - (loop $label - (br_if $block - (v128.any_true - (v128.xor - (v128.load align=1 - (local.get $1) - ) - (v128.load align=1 - (local.get $0) - ) - ) - ) - ) - (local.set $1 - (i32.add - (local.get $1) - (i32.const 16) - ) - ) - (local.set $0 - (i32.add + (i8x16.all_true + (local.tee $3 + (i8x16.eq + (v128.load align=1 (local.get $0) - (i32.const 16) ) - ) - (br_if $label - (i32.gt_u - (local.tee $2 - (i32.sub - (local.get $2) - (i32.const 16) - ) - ) - (i32.const 15) + (v128.load align=1 + (local.get $1) ) ) ) ) - ) - (br_if $block1 - (i32.eqz - (local.get $2) + (then + (local.set $1 + (i32.add + (local.get $1) + (i32.const 16) + ) + ) + (local.set $0 + (i32.add + (local.get $0) + (i32.const 16) + ) + ) + (br_if $label + (i32.gt_u + (local.tee $2 + (i32.sub + (local.get $2) + (i32.const 16) + ) + ) + (i32.const 15) + ) + ) + (br $block) ) ) ) - (loop $label1 - (if - (i32.ne - (local.tee $3 - (i32.load8_u - (local.get $0) - ) - ) - (local.tee $4 - (i32.load8_u - (local.get $1) + (return + (i32.sub + (i32.load8_u + (i32.add + (local.get $0) + (local.tee $2 + (i32.ctz + (i32.xor + (i8x16.bitmask + (local.get $3) + ) + (i32.const -1) + ) + ) ) ) ) - (then - (return - (i32.sub - (local.get $3) - (local.get $4) - ) - ) - ) - ) - (local.set $1 - (i32.add - (local.get $1) - (i32.const 1) - ) - ) - (local.set $0 - (i32.add - (local.get $0) - (i32.const 1) - ) - ) - (br_if $label1 - (local.tee $2 - (i32.sub + (i32.load8_u + (i32.add + (local.get $1) (local.get $2) + ) + ) + ) + ) + ) + (if + (local.get $2) + (then + (loop $label1 + (if + (i32.ne + (local.tee $4 + (i32.load8_u + (local.get $0) + ) + ) + (local.tee $5 + (i32.load8_u + (local.get $1) + ) + ) + ) + (then + (return + (i32.sub + (local.get $4) + (local.get $5) + ) + ) + ) + ) + (local.set $1 + (i32.add + (local.get $1) (i32.const 1) ) ) + (local.set $0 + (i32.add + (local.get $0) + (i32.const 1) + ) + ) + (br_if $label1 + (local.tee $2 + (i32.sub + (local.get $2) + (i32.const 1) + ) + ) + ) ) ) ) @@ -886,7 +913,9 @@ (func $strspn (param $0 i32) (param $1 i32) (result i32) (local $2 i32) (local $3 i32) - (local $4 v128) + (local $4 i32) + (local $5 v128) + (local $6 v128) (local $scratch i32) (if (i32.eqz @@ -902,7 +931,7 @@ ) ) ) - (block $block1 + (block $block (if (i32.eqz (i32.load8_u offset=1 @@ -910,50 +939,75 @@ ) ) (then - (block $block - (br_if $block - (i32.gt_u - (local.tee $1 - (local.get $0) - ) - (local.tee $3 - (i32.sub - (i32.shl - (memory.size) - (i32.const 16) - ) + (if + (i32.ge_u + (local.tee $4 + (i32.sub + (i32.shl + (memory.size) (i32.const 16) ) + (i32.const 16) ) ) - ) - (local.set $4 - (i8x16.splat - (local.get $2) + (local.tee $1 + (local.get $0) ) ) - (loop $label - (br_if $block - (i32.eqz - (i8x16.all_true - (i8x16.eq - (v128.load align=1 - (local.get $1) + (then + (local.set $5 + (i8x16.splat + (local.get $2) + ) + ) + (loop $label + (if + 
(i32.eqz + (i8x16.all_true + (local.tee $6 + (i8x16.eq + (v128.load align=1 + (i32.add + (local.get $0) + (local.get $3) + ) + ) + (local.get $5) + ) + ) + ) + ) + (then + (return + (i32.add + (i32.ctz + (i32.xor + (i8x16.bitmask + (local.get $6) + ) + (i32.const -1) + ) + ) + (local.get $3) ) - (local.get $4) ) ) ) - ) - (br_if $label - (i32.le_u - (local.tee $1 - (i32.add - (local.get $1) - (i32.const 16) + (br_if $label + (i32.le_u + (local.tee $1 + (i32.add + (local.get $0) + (local.tee $3 + (i32.add + (local.get $3) + (i32.const 16) + ) + ) + ) ) + (local.get $4) ) - (local.get $3) ) ) ) @@ -992,7 +1046,7 @@ ) ) ) - (br $block1) + (br $block) ) ) (v128.store @@ -1094,11 +1148,11 @@ (local.set $2 (local.get $0) ) - (block $block2 - (block $block3 - (block $block4 + (block $block1 + (block $block2 + (block $block3 (loop $label3 - (br_if $block2 + (br_if $block1 (i32.eqz (i32.load8_u (i32.add @@ -1110,7 +1164,7 @@ ) ) ) - (br_if $block3 + (br_if $block2 (i32.eqz (i32.load8_u (i32.add @@ -1122,7 +1176,7 @@ ) ) ) - (br_if $block4 + (br_if $block3 (i32.eqz (i32.load8_u (i32.add @@ -1162,7 +1216,7 @@ (i32.const 1) ) ) - (br $block2) + (br $block1) ) (local.set $2 (i32.add @@ -1170,7 +1224,7 @@ (i32.const 2) ) ) - (br $block2) + (br $block1) ) (local.set $2 (i32.add diff --git a/sqlite3/libc/math.h b/sqlite3/libc/math.h index 76128b3..485a29d 100644 --- a/sqlite3/libc/math.h +++ b/sqlite3/libc/math.h @@ -11,9 +11,16 @@ extern "C" { #ifdef __wasm_relaxed_simd__ +// This header assumes "relaxed fused multiply-add" +// is both faster and more precise. + +#define FP_FAST_FMA 1 + __attribute__((weak)) double fma(double x, double y, double z) { - const v128_t wx = wasm_f64x2_splat(x); + // If we get a software implementation from the host, + // this is enough to short circuit it on the 2nd lane. + const v128_t wx = wasm_f64x2_replace_lane(b, 0, x); const v128_t wy = wasm_f64x2_splat(y); const v128_t wz = wasm_f64x2_splat(z); const v128_t wr = wasm_f64x2_relaxed_madd(wx, wy, wz); diff --git a/sqlite3/libc/string.h b/sqlite3/libc/string.h index 0fd8996..b56703f 100644 --- a/sqlite3/libc/string.h +++ b/sqlite3/libc/string.h @@ -39,13 +39,6 @@ void *memmove(void *dest, const void *src, size_t n) { #ifdef __wasm_simd128__ // SIMD versions of some string.h functions. -// -// These assume aligned v128_t loads can't fail, -// and so can't unaligned loads up to the last -// aligned address less than memory size. -// -// These also assume unaligned access is not painfully slow, -// but that bitmask extraction is really slow on AArch64. __attribute__((weak)) int memcmp(const void *v1, const void *v2, size_t n) { @@ -55,9 +48,13 @@ int memcmp(const void *v1, const void *v2, size_t n) { const v128_t *w1 = (v128_t *)v1; const v128_t *w2 = (v128_t *)v2; for (; n >= sizeof(v128_t); n -= sizeof(v128_t)) { - // Find any single bit difference. - if (wasm_v128_any_true(wasm_v128_load(w1) ^ wasm_v128_load(w2))) { - break; + const v128_t cmp = wasm_i8x16_eq(wasm_v128_load(w1), wasm_v128_load(w2)); + // Bitmask is slow on AArch64, all_true is much faster. + if (!wasm_i8x16_all_true(cmp)) { + size_t ctz = __builtin_ctz(~wasm_i8x16_bitmask(cmp)); + const unsigned char *u1 = (unsigned char *)w1 + ctz; + const unsigned char *u2 = (unsigned char *)w2 + ctz; + return *u1 - *u2; } w1++; w2++; @@ -77,7 +74,7 @@ int memcmp(const void *v1, const void *v2, size_t n) { __attribute__((weak)) void *memchr(const void *v, int c, size_t n) { // When n is zero, a function that locates a character finds no occurrence. 
-  // Otherwise, decrement n to ensure __builtin_sub_overflow "overflows"
+  // Otherwise, decrement n to ensure __builtin_sub_overflow overflows
   // when n would go equal-to-or-below zero.
   if (n-- == 0) {
     return NULL;
@@ -98,7 +95,7 @@ void *memchr(const void *v, int c, size_t n) {
     // so we can count trailing zeros.
     int mask = wasm_i8x16_bitmask(cmp) >> align << align;
     // At least one bit will be set, unless we cleared them.
-    // Knowing this helps the compiler.
+    // Knowing this helps the compiler.
     __builtin_assume(mask || align);
     // If the mask is zero because of alignment,
     // it's as if we didn't find anything.
@@ -109,7 +106,7 @@ void *memchr(const void *v, int c, size_t n) {
       return ctz <= n + align ? (char *)w + ctz : NULL;
     }
   }
-  // Decrement n; if it "overflows" we're done.
+  // Decrement n; if it overflows we're done.
  if (__builtin_sub_overflow(n, sizeof(v128_t) - align, &n)) {
     return NULL;
   }
@@ -133,7 +130,7 @@ size_t strlen(const char *s) {
     // so we can count trailing zeros.
     int mask = wasm_i8x16_bitmask(cmp) >> align << align;
     // At least one bit will be set, unless we cleared them.
-    // Knowing this helps the compiler.
+    // Knowing this helps the compiler.
     __builtin_assume(mask || align);
     if (mask) {
       return (char *)w - s + __builtin_ctz(mask);
@@ -181,12 +178,23 @@ static int __strcmp(const char *s1, const char *s2) {
   return 0;
 }
 
+static int __strcmp_s(const char *s1, const char *s2) {
+  const unsigned char *u1 = (unsigned char *)s1;
+  const unsigned char *u2 = (unsigned char *)s2;
+  while (true) {
+    if (*u1 != *u2) return *u1 - *u2;
+    if (*u1 == 0) break;
+    u1++;
+    u2++;
+  }
+  return 0;
+}
+
 __attribute__((weak, always_inline))
 int strcmp(const char *s1, const char *s2) {
-  // Use strncmp when comparing against literal strings.
-  // If the literal is small, the vector search will be skipped.
-  if (__builtin_constant_p(strlen(s2))) {
-    return strncmp(s1, s2, strlen(s2));
+  // Skip the vector search when comparing against small literal strings.
+  if (__builtin_constant_p(strlen(s2)) && strlen(s2) < sizeof(v128_t)) {
+    return __strcmp_s(s1, s2);
   }
   return __strcmp(s1, s2);
 }
@@ -244,7 +252,7 @@ static char *__strchrnul(const char *s, int c) {
     // so we can count trailing zeros.
     int mask = wasm_i8x16_bitmask(cmp) >> align << align;
     // At least one bit will be set, unless we cleared them.
-    // Knowing this helps the compiler.
+    // Knowing this helps the compiler.
     __builtin_assume(mask || align);
     if (mask) {
       return (char *)w + __builtin_ctz(mask);
@@ -277,7 +285,7 @@ char *strchr(const char *s, int c) {
 __attribute__((weak))
 size_t strspn(const char *s, const char *c) {
 #ifndef _REENTRANT
-  static  // Avoid the stack for builds without threads.
+  static  // Avoid the stack for builds without threads.
 #endif
   char byteset[UCHAR_MAX + 1];
   const char *const a = s;
@@ -293,12 +301,16 @@ size_t strspn(const char *s, const char *c) {
     const v128_t *w = (v128_t *)s;
     const v128_t wc = wasm_i8x16_splat(*c);
     while (w <= limit) {
-      if (!wasm_i8x16_all_true(wasm_i8x16_eq(wasm_v128_load(w), wc))) {
-        break;
+      const v128_t cmp = wasm_i8x16_eq(wasm_v128_load(w), wc);
+      // Bitmask is slow on AArch64, all_true is much faster.
+      if (!wasm_i8x16_all_true(cmp)) {
+        size_t ctz = __builtin_ctz(~wasm_i8x16_bitmask(cmp));
+        return (char *)w + ctz - s;
       }
       w++;
     }
+    // Continue byte-by-byte.
s = (char *)w; while (*s == *c) s++; return s - a; @@ -311,20 +323,21 @@ size_t strspn(const char *s, const char *c) { while (*c && (byteset[*(unsigned char *)c] = 1)) c++; while (byteset[*(unsigned char *)s]) s++; -#else +#else // __OPTIMIZE__ // This is faster than memset. + // Going backward helps bounds check elimination. volatile v128_t *w = (v128_t *)byteset; #pragma unroll for (size_t i = sizeof(byteset) / sizeof(v128_t); i--;) w[i] = (v128_t){}; static_assert(sizeof(byteset) % sizeof(v128_t) == 0); - // Keeping byteset[0] = 0 avoids the other loop having to test for it. + // Keeping byteset[0] = 0 avoids the next loop needing that check. while (*c && (byteset[*(unsigned char *)c] = 1)) c++; #pragma unroll 4 while (byteset[*(unsigned char *)s]) s++; -#endif +#endif // __OPTIMIZE__ return s - a; } @@ -332,7 +345,7 @@ size_t strspn(const char *s, const char *c) { __attribute__((weak)) size_t strcspn(const char *s, const char *c) { #ifndef _REENTRANT - static // Avoid the stack for builds without threads. + static // Avoid the stack for builds without threads. #endif char byteset[UCHAR_MAX + 1]; const char *const a = s; @@ -346,24 +359,31 @@ size_t strcspn(const char *s, const char *c) { while ((byteset[*(unsigned char *)c] = 1) && *c) c++; while (!byteset[*(unsigned char *)s]) s++; -#else +#else // __OPTIMIZE__ // This is faster than memset. + // Going backward helps bounds check elimination. volatile v128_t *w = (v128_t *)byteset; #pragma unroll for (size_t i = sizeof(byteset) / sizeof(v128_t); i--;) w[i] = (v128_t){}; static_assert(sizeof(byteset) % sizeof(v128_t) == 0); - // Setting byteset[0] = 1 avoids the other loop having to test for it. + // Setting byteset[0] = 1 avoids the next loop needing that check. while ((byteset[*(unsigned char *)c] = 1) && *c) c++; #pragma unroll 4 while (!byteset[*(unsigned char *)s]) s++; -#endif +#endif // __OPTIMIZE__ return s - a; } +__attribute__((weak, always_inline)) +char *strpbrk(const char *s, const char *b) { + s += strcspn(s, b); + return *s ? (char *)s : 0; +} + #endif // __wasm_simd128__ #ifdef __cplusplus
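
Notes on the techniques above. The examples that follow are illustrative sketches, not part of the patch; names such as memcmp_sketch are made up for clarity.

The new memcmp (and the single-character strspn fast path) tests each 16-byte block with wasm_i8x16_all_true and only extracts a bitmask on the first mismatching block since, as the comments note, i8x16.bitmask is slow on AArch64. The trailing-zero count of the inverted mask is the index of the first differing byte. A minimal standalone version of that pattern, with a plain scalar tail:

#include <stddef.h>
#include <wasm_simd128.h>

// Sketch only: compare n bytes, 16 at a time, and on the first
// mismatching block locate the differing byte with ctz(~bitmask).
static int memcmp_sketch(const void *v1, const void *v2, size_t n) {
  const unsigned char *u1 = v1;
  const unsigned char *u2 = v2;
  while (n >= sizeof(v128_t)) {
    const v128_t cmp = wasm_i8x16_eq(wasm_v128_load(u1), wasm_v128_load(u2));
    if (!wasm_i8x16_all_true(cmp)) {
      // Equal lanes are 0xFF, so their bitmask bits are 1; inverting
      // the mask leaves 1s only on mismatches, and ctz finds the first.
      size_t i = __builtin_ctz(~wasm_i8x16_bitmask(cmp));
      return u1[i] - u2[i];
    }
    u1 += sizeof(v128_t);
    u2 += sizeof(v128_t);
    n -= sizeof(v128_t);
  }
  for (; n; n--, u1++, u2++) {  // scalar tail
    if (*u1 != *u2) return *u1 - *u2;
  }
  return 0;
}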
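
The math.h change computes fma through f64x2.relaxed_madd and keeps only lane 0. Building the x operand so that lane 1 carries a trivial value means that, if the host lowers relaxed_madd to a per-lane software fma, the second lane can take an early-out path. A sketch of that idea, assuming lane 1 is simply zeroed (the patch constructs wx with wasm_f64x2_replace_lane; wasm_f64x2_make is used here for brevity); build with -mrelaxed-simd:

#include <wasm_simd128.h>

// Sketch only: fused multiply-add via relaxed madd, real work in lane 0.
static double fma_sketch(double x, double y, double z) {
  const v128_t wx = wasm_f64x2_make(x, 0);  // lane 1 computes 0 * y + z
  const v128_t wy = wasm_f64x2_splat(y);
  const v128_t wz = wasm_f64x2_splat(z);
  const v128_t wr = wasm_f64x2_relaxed_madd(wx, wy, wz);
  return wasm_f64x2_extract_lane(wr, 0);
}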
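
The scalar strspn/strcspn paths build a 256-entry membership table. The patch clears it with backward volatile v128 stores (faster than memset here, and going backward helps bounds-check elimination) and marks the terminating NUL of c as a member, so the scan loop stops at the end of s without an extra test. A condensed strcspn-style sketch of that table scan:

#include <limits.h>
#include <stddef.h>
#include <wasm_simd128.h>

// Sketch only: length of the initial segment of s with no byte from c.
static size_t strcspn_sketch(const char *s, const char *c) {
  char byteset[UCHAR_MAX + 1];
  const char *const a = s;

  // Clear the table 16 bytes at a time, last block first.
  volatile v128_t *w = (v128_t *)byteset;
  for (size_t i = sizeof(byteset) / sizeof(v128_t); i--;) w[i] = (v128_t){};

  // Mark every byte of c, including its NUL: byteset[0] = 1 means the
  // scan below also stops at the end of s, with no separate check.
  while ((byteset[*(unsigned char *)c] = 1) && *c) c++;
  while (!byteset[*(unsigned char *)s]) s++;
  return s - a;
}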
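
The new strpbrk is strcspn plus one test: skip the longest prefix of s containing no byte of b, then either a member byte or the terminating NUL is under the cursor. A small usage example:

#include <stdio.h>
#include <string.h>

int main(void) {
  const char *path = "libc/string.h";
  // First byte of path that is either '/' or '.'.
  char *sep = strpbrk(path, "/.");
  printf("%s\n", sep ? sep : "(none)");  // prints "/string.h"
  return 0;
}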