Mirror of https://github.com/ncruces/go-sqlite3.git (synced 2026-01-20 01:24:15 +00:00)

Compare commits: 10 commits
| Author | SHA1 | Date |
|---|---|---|
| | c780ef16e2 | |
| | b609930142 | |
| | fd165ce724 | |
| | d3973b23e3 | |
| | 320b68e74f | |
| | 2c3850e5d1 | |
| | db7aacff9f | |
| | d748d98e39 | |
| | 13b8642384 | |
| | 29c5c816cb | |
@@ -1,6 +1,6 @@
# Embeddable Wasm build of SQLite

This folder includes an embeddable Wasm build of SQLite 3.49.1 for use with
This folder includes an embeddable Wasm build of SQLite 3.49.2 for use with
[`github.com/ncruces/go-sqlite3`](https://pkg.go.dev/github.com/ncruces/go-sqlite3).

The following optional features are compiled in:
Binary file not shown.
@@ -13,8 +13,8 @@ mkdir -p build/ext/
cp "$ROOT"/sqlite3/*.[ch] build/
cp "$ROOT"/sqlite3/*.patch build/

# https://sqlite.org/src/info/3215186aa9204149
curl -# https://sqlite.org/src/tarball/sqlite.tar.gz?r=3215186a | tar xz
# https://sqlite.org/src/info/9d6517e7cc8bf175
curl -# https://sqlite.org/src/tarball/sqlite.tar.gz?r=9d6517e7 | tar xz

cd sqlite
if [[ "$OSTYPE" == "msys" || "$OSTYPE" == "cygwin" ]]; then

@@ -43,8 +43,8 @@ cd ~-

"$WASI_SDK/clang" --target=wasm32-wasi -std=c23 -g0 -O2 \
-Wall -Wextra -Wno-unused-parameter -Wno-unused-function \
-o bcw2.wasm "build/main.c" \
-I"build" \
-o bcw2.wasm build/main.c \
-I"$ROOT/sqlite3/libc" -I"build" \
-mexec-model=reactor \
-msimd128 -mmutable-globals -mmultivalue \
-mbulk-memory -mreference-types \
@@ -19,7 +19,7 @@ func Test_init(t *testing.T) {
if err != nil {
t.Fatal(err)
}
if version != "3.49.1" {
if version != "3.49.2" {
t.Error(version)
}
}
Binary file not shown.
go.mod (8 lines changed)
@@ -8,16 +8,16 @@ require (
github.com/ncruces/julianday v1.0.0
github.com/ncruces/sort v0.1.5
github.com/tetratelabs/wazero v1.9.0
golang.org/x/crypto v0.37.0
golang.org/x/sys v0.32.0
golang.org/x/crypto v0.38.0
golang.org/x/sys v0.33.0
)

require (
github.com/dchest/siphash v1.2.3 // ext/bloom
github.com/google/uuid v1.6.0 // ext/uuid
github.com/psanford/httpreadat v0.1.0 // example
golang.org/x/sync v0.13.0 // test
golang.org/x/text v0.24.0 // ext/unicode
golang.org/x/sync v0.14.0 // test
golang.org/x/text v0.25.0 // ext/unicode
lukechampine.com/adiantum v1.1.1 // vfs/adiantum
)
go.sum (16 lines changed)
@@ -10,13 +10,13 @@ github.com/psanford/httpreadat v0.1.0 h1:VleW1HS2zO7/4c7c7zNl33fO6oYACSagjJIyMIw
github.com/psanford/httpreadat v0.1.0/go.mod h1:Zg7P+TlBm3bYbyHTKv/EdtSJZn3qwbPwpfZ/I9GKCRE=
github.com/tetratelabs/wazero v1.9.0 h1:IcZ56OuxrtaEz8UYNRHBrUa9bYeX9oVY93KspZZBf/I=
github.com/tetratelabs/wazero v1.9.0/go.mod h1:TSbcXCfFP0L2FGkRPxHphadXPjo1T6W+CseNNY7EkjM=
golang.org/x/crypto v0.37.0 h1:kJNSjF/Xp7kU0iB2Z+9viTPMW4EqqsrywMXLJOOsXSE=
golang.org/x/crypto v0.37.0/go.mod h1:vg+k43peMZ0pUMhYmVAWysMK35e6ioLh3wB8ZCAfbVc=
golang.org/x/sync v0.13.0 h1:AauUjRAJ9OSnvULf/ARrrVywoJDy0YS2AwQ98I37610=
golang.org/x/sync v0.13.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA=
golang.org/x/sys v0.32.0 h1:s77OFDvIQeibCmezSnk/q6iAfkdiQaJi4VzroCFrN20=
golang.org/x/sys v0.32.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
golang.org/x/text v0.24.0 h1:dd5Bzh4yt5KYA8f9CJHCP4FB4D51c2c6JvN37xJJkJ0=
golang.org/x/text v0.24.0/go.mod h1:L8rBsPeo2pSS+xqN0d5u2ikmjtmoJbDBT1b7nHvFCdU=
golang.org/x/crypto v0.38.0 h1:jt+WWG8IZlBnVbomuhg2Mdq0+BBQaHbtqHEFEigjUV8=
golang.org/x/crypto v0.38.0/go.mod h1:MvrbAqul58NNYPKnOra203SB9vpuZW0e+RRZV+Ggqjw=
golang.org/x/sync v0.14.0 h1:woo0S4Yywslg6hp4eUFjTVOyKt0RookbpAHG4c1HmhQ=
golang.org/x/sync v0.14.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA=
golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw=
golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
golang.org/x/text v0.25.0 h1:qVyWApTSYLk/drJRO5mDlNYskwQznZmkpV2c8q9zls4=
golang.org/x/text v0.25.0/go.mod h1:WEdwpYrmk1qmdHvhkSTNPm3app7v4rsT8F2UD6+VHIA=
lukechampine.com/adiantum v1.1.1 h1:4fp6gTxWCqpEbLy40ExiYDDED3oUNWx5cTqBCtPdZqA=
lukechampine.com/adiantum v1.1.1/go.mod h1:LrAYVnTYLnUtE/yMp5bQr0HstAf060YUF8nM0B6+rUw=
@@ -3,7 +3,7 @@ set -euo pipefail

cd -P -- "$(dirname -- "$0")"

curl -#OL "https://sqlite.org/2025/sqlite-amalgamation-3490100.zip"
curl -#OL "https://sqlite.org/2025/sqlite-amalgamation-3490200.zip"
unzip -d . sqlite-amalgamation-*.zip
mv sqlite-amalgamation-*/sqlite3.c .
mv sqlite-amalgamation-*/sqlite3.h .

@@ -19,30 +19,30 @@ rm -rf sqlite-amalgamation-*

mkdir -p ext/
cd ext/
curl -#OL "https://github.com/sqlite/sqlite/raw/version-3.49.1/ext/misc/anycollseq.c"
curl -#OL "https://github.com/sqlite/sqlite/raw/version-3.49.1/ext/misc/base64.c"
curl -#OL "https://github.com/sqlite/sqlite/raw/version-3.49.1/ext/misc/decimal.c"
curl -#OL "https://github.com/sqlite/sqlite/raw/version-3.49.1/ext/misc/ieee754.c"
curl -#OL "https://github.com/sqlite/sqlite/raw/version-3.49.1/ext/misc/regexp.c"
curl -#OL "https://github.com/sqlite/sqlite/raw/version-3.49.1/ext/misc/series.c"
curl -#OL "https://github.com/sqlite/sqlite/raw/version-3.49.1/ext/misc/spellfix.c"
curl -#OL "https://github.com/sqlite/sqlite/raw/version-3.49.1/ext/misc/uint.c"
curl -#OL "https://github.com/sqlite/sqlite/raw/version-3.49.2/ext/misc/anycollseq.c"
curl -#OL "https://github.com/sqlite/sqlite/raw/version-3.49.2/ext/misc/base64.c"
curl -#OL "https://github.com/sqlite/sqlite/raw/version-3.49.2/ext/misc/decimal.c"
curl -#OL "https://github.com/sqlite/sqlite/raw/version-3.49.2/ext/misc/ieee754.c"
curl -#OL "https://github.com/sqlite/sqlite/raw/version-3.49.2/ext/misc/regexp.c"
curl -#OL "https://github.com/sqlite/sqlite/raw/version-3.49.2/ext/misc/series.c"
curl -#OL "https://github.com/sqlite/sqlite/raw/version-3.49.2/ext/misc/spellfix.c"
curl -#OL "https://github.com/sqlite/sqlite/raw/version-3.49.2/ext/misc/uint.c"
cd ~-

cd ../vfs/tests/mptest/testdata/
curl -#OL "https://github.com/sqlite/sqlite/raw/version-3.49.1/mptest/config01.test"
curl -#OL "https://github.com/sqlite/sqlite/raw/version-3.49.1/mptest/config02.test"
curl -#OL "https://github.com/sqlite/sqlite/raw/version-3.49.1/mptest/crash01.test"
curl -#OL "https://github.com/sqlite/sqlite/raw/version-3.49.1/mptest/crash02.subtest"
curl -#OL "https://github.com/sqlite/sqlite/raw/version-3.49.1/mptest/multiwrite01.test"
curl -#OL "https://github.com/sqlite/sqlite/raw/version-3.49.2/mptest/config01.test"
curl -#OL "https://github.com/sqlite/sqlite/raw/version-3.49.2/mptest/config02.test"
curl -#OL "https://github.com/sqlite/sqlite/raw/version-3.49.2/mptest/crash01.test"
curl -#OL "https://github.com/sqlite/sqlite/raw/version-3.49.2/mptest/crash02.subtest"
curl -#OL "https://github.com/sqlite/sqlite/raw/version-3.49.2/mptest/multiwrite01.test"
cd ~-

cd ../vfs/tests/mptest/wasm/
curl -#OL "https://github.com/sqlite/sqlite/raw/version-3.49.1/mptest/mptest.c"
curl -#OL "https://github.com/sqlite/sqlite/raw/version-3.49.2/mptest/mptest.c"
cd ~-

cd ../vfs/tests/speedtest1/wasm/
curl -#OL "https://github.com/sqlite/sqlite/raw/version-3.49.1/test/speedtest1.c"
curl -#OL "https://github.com/sqlite/sqlite/raw/version-3.49.2/test/speedtest1.c"
cd ~-

cat *.patch | patch -p0 --no-backup-if-mismatch
@@ -27,16 +27,25 @@ EOF
-Wl,--stack-first \
-Wl,--import-undefined \
-Wl,--initial-memory=16777216 \
-Wl,--export=memccpy \
-Wl,--export=memchr \
-Wl,--export=memcmp \
-Wl,--export=memcpy \
-Wl,--export=memmove \
-Wl,--export=memrchr \
-Wl,--export=memset \
-Wl,--export=stpcpy \
-Wl,--export=stpncpy \
-Wl,--export=strchr \
-Wl,--export=strchrnul \
-Wl,--export=strcmp \
-Wl,--export=strcpy \
-Wl,--export=strcspn \
-Wl,--export=strlen \
-Wl,--export=strncat \
-Wl,--export=strncmp \
-Wl,--export=strncpy \
-Wl,--export=strrchr \
-Wl,--export=strspn \
-Wl,--export=qsort
Binary file not shown.
File diff suppressed because it is too large
@@ -4,6 +4,7 @@ import (
|
||||
"context"
|
||||
_ "embed"
|
||||
"os"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/tetratelabs/wazero"
|
||||
@@ -31,6 +32,7 @@ var (
|
||||
strchr api.Function
|
||||
strcmp api.Function
|
||||
strspn api.Function
|
||||
strrchr api.Function
|
||||
strncmp api.Function
|
||||
strcspn api.Function
|
||||
stack [8]uint64
|
||||
@@ -63,6 +65,7 @@ func TestMain(m *testing.M) {
|
||||
strchr = mod.ExportedFunction("strchr")
|
||||
strcmp = mod.ExportedFunction("strcmp")
|
||||
strspn = mod.ExportedFunction("strspn")
|
||||
strrchr = mod.ExportedFunction("strrchr")
|
||||
strncmp = mod.ExportedFunction("strncmp")
|
||||
strcspn = mod.ExportedFunction("strcspn")
|
||||
memory, _ = mod.Memory().Read(0, mod.Memory().Size())
|
||||
@@ -139,6 +142,18 @@ func Benchmark_strchr(b *testing.B) {
|
||||
}
|
||||
}
|
||||
|
||||
func Benchmark_strrchr(b *testing.B) {
|
||||
clear(memory)
|
||||
fill(memory[ptr1:ptr1+size/2], 5)
|
||||
fill(memory[ptr1+size/2:ptr1+size-1], 7)
|
||||
|
||||
b.SetBytes(size/2 + 1)
|
||||
b.ResetTimer()
|
||||
for range b.N {
|
||||
call(strrchr, ptr1, 5)
|
||||
}
|
||||
}
|
||||
|
||||
func Benchmark_strcmp(b *testing.B) {
|
||||
clear(memory)
|
||||
fill(memory[ptr1:ptr1+size-1], 7)
|
||||
@@ -195,43 +210,117 @@ func Benchmark_strcspn(b *testing.B) {
|
||||
}
|
||||
}
|
||||
|
||||
func Test_memchr(t *testing.T) {
|
||||
for length := range 64 {
|
||||
for pos := range length + 2 {
|
||||
for alignment := range 24 {
|
||||
clear(memory[:2*page])
|
||||
func Test_memcmp(t *testing.T) {
|
||||
const s1 string = "" +
|
||||
"\x94\x63\x8f\x01\x74\x63\x8f\x01\x54\x63\x8f\x01\x34\x63\x8f\x01" +
|
||||
"\xb4\xf2\x93\x01\x94\xf2\x93\x01\x54\xf1\x93\x01\x34\xf1\x93\x01" +
|
||||
"\x14\xf1\x93\x01\x14\xf2\x93\x01\x34\xf2\x93\x01\x54\xf2\x93\x01" +
|
||||
"\x74\xf2\x93\x01\x74\xf1\x93\x01\xd4\xf2\x93\x01\x94\xf1\x93\x01" +
|
||||
"\xb4\xf1\x93\x01\xd4\xf1\x93\x01\xf4\xf1\x93\x01\xf4\xf2\x93\x01" +
|
||||
"\x14\xf4\x93\x01\xf4\xf3\x93\x01\xd4\xf3\x93\x01\xb4\xf3\x93\x01" +
|
||||
"\x94\xf3\x93\x01\x74\x80\x93\x01\x54\xf3\x93\x01\x34\xf3\x93\x01" +
|
||||
"\x7f\xf3\x93\x01\x00\x01"
|
||||
const s2 string = "" +
|
||||
"\x94\x63\x8f\x01\x74\x63\x8f\x01\x54\x63\x8f\x01\x34\x63\x8f\x01" +
|
||||
"\xb4\xf2\x93\x01\x94\xf2\x93\x01\x54\xf1\x93\x01\x34\xf1\x93\x01" +
|
||||
"\x14\xf1\x93\x01\x14\xf2\x93\x01\x34\xf2\x93\x01\x54\xf2\x93\x01" +
|
||||
"\x74\xf2\x93\x01\x74\xf1\x93\x01\xd4\xf2\x93\x01\x94\xf1\x93\x01" +
|
||||
"\xb4\xf1\x93\x01\xd4\xf1\x93\x01\xf4\xf1\x93\x01\xf4\xf2\x93\x01" +
|
||||
"\xbc\x40\x96\x01\xf4\xf3\x93\x01\xd4\xf3\x93\x01\xb4\xf3\x93\x01" +
|
||||
"\x94\xf3\x93\x01\x74\x7f\x93\x01\x54\xf3\x93\x01\x34\xf3\x93\x01" +
|
||||
"\x80\xf3\x93\x01\x00\x02"
|
||||
|
||||
ptr := (page - 8) + alignment
|
||||
fill(memory[ptr:ptr+max(pos, length)], 5)
|
||||
memory[ptr+pos] = 7
|
||||
p1 := ptr1
|
||||
p2 := len(memory) - len(s2)
|
||||
|
||||
want := 0
|
||||
if pos < length {
|
||||
want = ptr + pos
|
||||
}
|
||||
clear(memory)
|
||||
copy(memory[p1:], s1)
|
||||
copy(memory[p2:], s2)
|
||||
|
||||
got := call(memchr, uint64(ptr), 7, uint64(length))
|
||||
if uint32(got) != uint32(want) {
|
||||
t.Errorf("memchr(%d, %d, %d) = %d, want %d",
|
||||
ptr, 7, uint64(length), uint32(got), uint32(want))
|
||||
}
|
||||
for i := range len(s1) + 1 {
|
||||
for j := range len(s1) - i {
|
||||
want := strings.Compare(s1[i:i+j], s2[i:i+j])
|
||||
got := call(memcmp, uint64(p1+i), uint64(p2+i), uint64(j))
|
||||
if sign(int32(got)) != want {
|
||||
t.Errorf("strcmp(%d, %d, %d) = %d, want %d",
|
||||
ptr1+i, ptr2+i, j, int32(got), want)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
clear(memory)
|
||||
ptr := len(memory) - length
|
||||
fill(memory[ptr:ptr+length], 5)
|
||||
memory[len(memory)-1] = 7
|
||||
func Test_strcmp(t *testing.T) {
|
||||
const s1 string = "" +
|
||||
"\x94\x63\x8f\x01\x74\x63\x8f\x01\x54\x63\x8f\x01\x34\x63\x8f\x01" +
|
||||
"\xb4\xf2\x93\x01\x94\xf2\x93\x01\x54\xf1\x93\x01\x34\xf1\x93\x01" +
|
||||
"\x14\xf1\x93\x01\x14\xf2\x93\x01\x34\xf2\x93\x01\x54\xf2\x93\x01" +
|
||||
"\x74\xf2\x93\x01\x74\xf1\x93\x01\xd4\xf2\x93\x01\x94\xf1\x93\x01" +
|
||||
"\xb4\xf1\x93\x01\xd4\xf1\x93\x01\xf4\xf1\x93\x01\xf4\xf2\x93\x01" +
|
||||
"\x14\xf4\x93\x01\xf4\xf3\x93\x01\xd4\xf3\x93\x01\xb4\xf3\x93\x01" +
|
||||
"\x94\xf3\x93\x01\x74\x80\x93\x01\x54\xf3\x93\x01\x34\xf3\x93\x01" +
|
||||
"\x7f\xf3\x93\x01\x00\x01"
|
||||
const s2 string = "" +
|
||||
"\x94\x63\x8f\x01\x74\x63\x8f\x01\x54\x63\x8f\x01\x34\x63\x8f\x01" +
|
||||
"\xb4\xf2\x93\x01\x94\xf2\x93\x01\x54\xf1\x93\x01\x34\xf1\x93\x01" +
|
||||
"\x14\xf1\x93\x01\x14\xf2\x93\x01\x34\xf2\x93\x01\x54\xf2\x93\x01" +
|
||||
"\x74\xf2\x93\x01\x74\xf1\x93\x01\xd4\xf2\x93\x01\x94\xf1\x93\x01" +
|
||||
"\xb4\xf1\x93\x01\xd4\xf1\x93\x01\xf4\xf1\x93\x01\xf4\xf2\x93\x01" +
|
||||
"\xbc\x40\x96\x01\xf4\xf3\x93\x01\xd4\xf3\x93\x01\xb4\xf3\x93\x01" +
|
||||
"\x94\xf3\x93\x01\x74\x7f\x93\x01\x54\xf3\x93\x01\x34\xf3\x93\x01" +
|
||||
"\x80\xf3\x93\x01\x00\x02"
|
||||
|
||||
want := len(memory) - 1
|
||||
if length == 0 {
|
||||
want = 0
|
||||
p1 := ptr1
|
||||
p2 := len(memory) - len(s2) - 1
|
||||
|
||||
clear(memory)
|
||||
copy(memory[p1:], s1)
|
||||
copy(memory[p2:], s2)
|
||||
|
||||
for i := range len(s1) + 1 {
|
||||
want := strings.Compare(term(s1[i:]), term(s2[i:]))
|
||||
got := call(strcmp, uint64(p1+i), uint64(p2+i))
|
||||
if sign(int32(got)) != want {
|
||||
t.Errorf("strcmp(%d, %d) = %d, want %d",
|
||||
p1+i, ptr2+i, int32(got), want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
got := call(memchr, uint64(ptr), 7, uint64(length))
|
||||
if uint32(got) != uint32(want) {
|
||||
t.Errorf("memchr(%d, %d, %d) = %d, want %d",
|
||||
ptr, 7, uint64(length), uint32(got), uint32(want))
|
||||
func Test_strncmp(t *testing.T) {
|
||||
const s1 string = "" +
|
||||
"\x94\x63\x8f\x01\x74\x63\x8f\x01\x54\x63\x8f\x01\x34\x63\x8f\x01" +
|
||||
"\xb4\xf2\x93\x01\x94\xf2\x93\x01\x54\xf1\x93\x01\x34\xf1\x93\x01" +
|
||||
"\x14\xf1\x93\x01\x14\xf2\x93\x01\x34\xf2\x93\x01\x54\xf2\x93\x01" +
|
||||
"\x74\xf2\x93\x01\x74\xf1\x93\x01\xd4\xf2\x93\x01\x94\xf1\x93\x01" +
|
||||
"\xb4\xf1\x93\x01\xd4\xf1\x93\x01\xf4\xf1\x93\x01\xf4\xf2\x93\x01" +
|
||||
"\x14\xf4\x93\x01\xf4\xf3\x93\x01\xd4\xf3\x93\x01\xb4\xf3\x93\x01" +
|
||||
"\x94\xf3\x93\x01\x74\x80\x93\x01\x54\xf3\x93\x01\x34\xf3\x93\x01" +
|
||||
"\x7f\xf3\x93\x01\x00\x01"
|
||||
const s2 string = "" +
|
||||
"\x94\x63\x8f\x01\x74\x63\x8f\x01\x54\x63\x8f\x01\x34\x63\x8f\x01" +
|
||||
"\xb4\xf2\x93\x01\x94\xf2\x93\x01\x54\xf1\x93\x01\x34\xf1\x93\x01" +
|
||||
"\x14\xf1\x93\x01\x14\xf2\x93\x01\x34\xf2\x93\x01\x54\xf2\x93\x01" +
|
||||
"\x74\xf2\x93\x01\x74\xf1\x93\x01\xd4\xf2\x93\x01\x94\xf1\x93\x01" +
|
||||
"\xb4\xf1\x93\x01\xd4\xf1\x93\x01\xf4\xf1\x93\x01\xf4\xf2\x93\x01" +
|
||||
"\xbc\x40\x96\x01\xf4\xf3\x93\x01\xd4\xf3\x93\x01\xb4\xf3\x93\x01" +
|
||||
"\x94\xf3\x93\x01\x74\x7f\x93\x01\x54\xf3\x93\x01\x34\xf3\x93\x01" +
|
||||
"\x80\xf3\x93\x01\x00\x02"
|
||||
|
||||
p1 := ptr1
|
||||
p2 := len(memory) - len(s2) - 1
|
||||
|
||||
clear(memory)
|
||||
copy(memory[p1:], s1)
|
||||
copy(memory[p2:], s2)
|
||||
|
||||
for i := range len(s1) + 1 {
|
||||
for j := range len(s1) - i + 1 {
|
||||
want := strings.Compare(term(s1[i:i+j]), term(s2[i:i+j]))
|
||||
got := call(strncmp, uint64(p1+i), uint64(p2+i), uint64(j))
|
||||
if sign(int32(got)) != want {
|
||||
t.Errorf("strncmp(%d, %d, %d) = %d, want %d",
|
||||
ptr1+i, ptr2+i, j, int32(got), want)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -239,9 +328,9 @@ func Test_memchr(t *testing.T) {
|
||||
func Test_strlen(t *testing.T) {
|
||||
for length := range 64 {
|
||||
for alignment := range 24 {
|
||||
clear(memory[:2*page])
|
||||
|
||||
ptr := (page - 8) + alignment
|
||||
|
||||
clear(memory[:2*page])
|
||||
fill(memory[ptr:ptr+length], 5)
|
||||
|
||||
got := call(strlen, uint64(ptr))
|
||||
@@ -270,22 +359,62 @@ func Test_strlen(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func Test_strchr(t *testing.T) {
|
||||
func Test_memchr(t *testing.T) {
|
||||
for length := range 64 {
|
||||
for pos := range length + 2 {
|
||||
for alignment := range 24 {
|
||||
clear(memory[:2*page])
|
||||
|
||||
ptr := (page - 8) + alignment
|
||||
fill(memory[ptr:ptr+max(pos, length)], 5)
|
||||
memory[ptr+pos] = 7
|
||||
memory[ptr+length] = 0
|
||||
|
||||
want := 0
|
||||
if pos < length {
|
||||
want = ptr + pos
|
||||
}
|
||||
|
||||
clear(memory[:2*page])
|
||||
fill(memory[ptr:ptr+max(pos, length)], 5)
|
||||
memory[ptr+pos] = 7
|
||||
|
||||
got := call(memchr, uint64(ptr), 7, uint64(length))
|
||||
if uint32(got) != uint32(want) {
|
||||
t.Errorf("memchr(%d, %d, %d) = %d, want %d",
|
||||
ptr, 7, uint64(length), uint32(got), uint32(want))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
clear(memory)
|
||||
ptr := len(memory) - length
|
||||
fill(memory[ptr:ptr+length], 5)
|
||||
memory[len(memory)-1] = 7
|
||||
|
||||
want := len(memory) - 1
|
||||
if length == 0 {
|
||||
want = 0
|
||||
}
|
||||
|
||||
got := call(memchr, uint64(ptr), 7, uint64(length))
|
||||
if uint32(got) != uint32(want) {
|
||||
t.Errorf("memchr(%d, %d, %d) = %d, want %d",
|
||||
ptr, 7, uint64(length), uint32(got), uint32(want))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func Test_strchr(t *testing.T) {
|
||||
for length := range 64 {
|
||||
for pos := range length + 2 {
|
||||
for alignment := range 24 {
|
||||
ptr := (page - 8) + alignment
|
||||
want := 0
|
||||
if pos < length {
|
||||
want = ptr + pos
|
||||
}
|
||||
|
||||
clear(memory[:2*page])
|
||||
fill(memory[ptr:ptr+max(pos, length)], 5)
|
||||
memory[ptr+pos] = 7
|
||||
memory[ptr+pos+1] = 7
|
||||
memory[ptr+length] = 0
|
||||
|
||||
got := call(strchr, uint64(ptr), 7)
|
||||
if uint32(got) != uint32(want) {
|
||||
t.Errorf("strchr(%d, %d) = %d, want %d",
|
||||
@@ -312,21 +441,66 @@ func Test_strchr(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func Test_strrchr(t *testing.T) {
|
||||
for length := range 64 {
|
||||
for pos := range length + 2 {
|
||||
for alignment := range 24 {
|
||||
ptr := (page - 8) + alignment
|
||||
want := 0
|
||||
if pos < length {
|
||||
want = ptr + pos
|
||||
} else if length > 0 {
|
||||
want = ptr
|
||||
}
|
||||
|
||||
clear(memory[:2*page])
|
||||
fill(memory[ptr:ptr+max(pos, length)], 5)
|
||||
memory[ptr] = 7
|
||||
memory[ptr+pos] = 7
|
||||
memory[ptr+length] = 0
|
||||
|
||||
got := call(strrchr, uint64(ptr), 7)
|
||||
if uint32(got) != uint32(want) {
|
||||
t.Errorf("strrchr(%d, %d) = %d, want %d",
|
||||
ptr, 7, uint32(got), uint32(want))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ptr := len(memory) - length
|
||||
want := len(memory) - 2
|
||||
if length <= 1 {
|
||||
continue
|
||||
}
|
||||
|
||||
clear(memory)
|
||||
fill(memory[ptr:ptr+length], 5)
|
||||
memory[ptr] = 7
|
||||
memory[len(memory)-2] = 7
|
||||
memory[len(memory)-1] = 0
|
||||
|
||||
got := call(strrchr, uint64(ptr), 7)
|
||||
if uint32(got) != uint32(want) {
|
||||
t.Errorf("strrchr(%d, %d) = %d, want %d",
|
||||
ptr, 7, uint32(got), uint32(want))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func Test_strspn(t *testing.T) {
|
||||
for length := range 64 {
|
||||
for pos := range length + 2 {
|
||||
for alignment := range 24 {
|
||||
clear(memory[:2*page])
|
||||
|
||||
ptr := (page - 8) + alignment
|
||||
want := min(pos, length)
|
||||
|
||||
clear(memory[:2*page])
|
||||
fill(memory[ptr:ptr+max(pos, length)], 5)
|
||||
memory[ptr+pos] = 7
|
||||
memory[ptr+length] = 0
|
||||
memory[128] = 3
|
||||
memory[129] = 5
|
||||
|
||||
want := min(pos, length)
|
||||
|
||||
got := call(strspn, uint64(ptr), 129)
|
||||
if uint32(got) != uint32(want) {
|
||||
t.Errorf("strspn(%d, %d) = %d, want %d",
|
||||
@@ -341,18 +515,18 @@ func Test_strspn(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
clear(memory)
|
||||
ptr := len(memory) - length
|
||||
fill(memory[ptr:ptr+length], 5)
|
||||
memory[len(memory)-1] = 7
|
||||
memory[128] = 3
|
||||
memory[129] = 5
|
||||
|
||||
want := length - 1
|
||||
if length == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
clear(memory)
|
||||
fill(memory[ptr:ptr+length], 5)
|
||||
memory[len(memory)-1] = 7
|
||||
memory[128] = 3
|
||||
memory[129] = 5
|
||||
|
||||
got := call(strspn, uint64(ptr), 129)
|
||||
if uint32(got) != uint32(want) {
|
||||
t.Errorf("strspn(%d, %d) = %d, want %d",
|
||||
@@ -371,17 +545,16 @@ func Test_strcspn(t *testing.T) {
|
||||
for length := range 64 {
|
||||
for pos := range length + 2 {
|
||||
for alignment := range 24 {
|
||||
clear(memory[:2*page])
|
||||
|
||||
ptr := (page - 8) + alignment
|
||||
want := min(pos, length)
|
||||
|
||||
clear(memory[:2*page])
|
||||
fill(memory[ptr:ptr+max(pos, length)], 5)
|
||||
memory[ptr+pos] = 7
|
||||
memory[ptr+length] = 0
|
||||
memory[128] = 3
|
||||
memory[129] = 7
|
||||
|
||||
want := min(pos, length)
|
||||
|
||||
got := call(strcspn, uint64(ptr), 129)
|
||||
if uint32(got) != uint32(want) {
|
||||
t.Errorf("strcspn(%d, %d) = %d, want %d",
|
||||
@@ -396,18 +569,18 @@ func Test_strcspn(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
clear(memory)
|
||||
ptr := len(memory) - length
|
||||
fill(memory[ptr:ptr+length], 5)
|
||||
memory[len(memory)-1] = 7
|
||||
memory[128] = 3
|
||||
memory[129] = 7
|
||||
|
||||
want := length - 1
|
||||
if length == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
clear(memory)
|
||||
fill(memory[ptr:ptr+length], 5)
|
||||
memory[len(memory)-1] = 7
|
||||
memory[128] = 3
|
||||
memory[129] = 7
|
||||
|
||||
got := call(strcspn, uint64(ptr), 129)
|
||||
if uint32(got) != uint32(want) {
|
||||
t.Errorf("strcspn(%d, %d) = %d, want %d",
|
||||
@@ -427,3 +600,21 @@ func fill(s []byte, v byte) {
|
||||
s[i] = v
|
||||
}
|
||||
}
|
||||
|
||||
func sign(x int32) int {
|
||||
switch {
|
||||
case x > 0:
|
||||
return +1
|
||||
case x < 0:
|
||||
return -1
|
||||
default:
|
||||
return 0
|
||||
}
|
||||
}
|
||||
|
||||
func term(s string) string {
|
||||
if i := strings.IndexByte(s, 0); i >= 0 {
|
||||
return s[:i]
|
||||
}
|
||||
return s
|
||||
}
|
||||
|
||||
sqlite3/libc/math.h (new file, 36 lines)
@@ -0,0 +1,36 @@
#ifndef _WASM_SIMD128_MATH_H
#define _WASM_SIMD128_MATH_H

#include <wasm_simd128.h>

#include_next <math.h> // the system math.h

#ifdef __cplusplus
extern "C" {
#endif

#ifdef __wasm_relaxed_simd__

// This header assumes "relaxed fused multiply-add"
// is both faster and more precise.

#define FP_FAST_FMA 1

__attribute__((weak))
double fma(double x, double y, double z) {
  // If we get a software implementation from the host,
  // this is enough to short circuit it on the 2nd lane.
  const v128_t wx = wasm_f64x2_replace_lane((v128_t){}, 0, x);
  const v128_t wy = wasm_f64x2_splat(y);
  const v128_t wz = wasm_f64x2_splat(z);
  const v128_t wr = wasm_f64x2_relaxed_madd(wx, wy, wz);
  return wasm_f64x2_extract_lane(wr, 0);
}

#endif // __wasm_relaxed_simd__

#ifdef __cplusplus
} // extern "C"
#endif

#endif // _WASM_SIMD128_MATH_H
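A minimal usage sketch for the fma override above (not part of the patch): with this header ahead of the system math.h on the include path, as the build script's -I"$ROOT/sqlite3/libc" flag arranges, every fma call routes through the relaxed-SIMD implementation. The polyeval helper is hypothetical, only there to exercise the call; note also that the snippet's original, undefined `b` operand is read here as a zero vector, which keeps the second lane of the madd trivial.

```c
#include <math.h>
#include <stdio.h>

// Horner evaluation of a polynomial: each step is one fused multiply-add.
static double polyeval(const double *c, int n, double x) {
  double r = c[n - 1];
  for (int i = n - 2; i >= 0; i--) r = fma(r, x, c[i]);
  return r;
}

int main(void) {
  const double c[] = {1.0, 0.5, 0.25};        // 1 + x/2 + x^2/4
  printf("p(2) = %g\n", polyeval(c, 3, 2.0));  // prints 3
#ifdef FP_FAST_FMA
  puts("FP_FAST_FMA is defined: fma is at least as fast as x*y + z");
#endif
  return 0;
}
```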
@@ -17,22 +17,31 @@ extern "C" {
__attribute__((weak))
void qsort(void *base, size_t nel, size_t width,
int (*comp)(const void *, const void *)) {
if (width == 0) return;

// If nel is zero, we're required to do nothing.
// If it's one, the array is already sorted.
size_t wnel = width * nel;
size_t gap = nel;
while (gap > 1) {
// Use 64-bit unsigned arithmetic to avoid intermediate overflow.
// Absent overflow, gap will be strictly less than its previous value.
// Once it is one or zero, set it to one: do a final pass, and stop.
gap = (5ull * gap - 1) / 11;
if (gap == 0) gap = 1;

// It'd be undefined behavior for wnel to overflow a size_t;
// or if width is zero: the base pointer would be invalid.
// Since gap is strictly less than nel, we can assume
// wgap is strictly less than wnel.
size_t wgap = width * gap;
__builtin_assume(wgap < wnel);
for (size_t i = wgap; i < wnel; i += width) {
// Even without overflow flags, the overflow builtin helps the compiler.
for (size_t j = i; !__builtin_sub_overflow(j, wgap, &j);) {
char *a = j + (char *)base;
char *b = a + wgap;
if (comp(a, b) <= 0) break;

// This well known loop is automatically vectorized.
size_t s = width;
do {
char tmp = *a;
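For intuition about the gap sequence in the shell sort above, here is a small standalone sketch (mine, not part of the patch) that prints the gaps produced by gap = (5*gap - 1)/11 for a given element count; forcing the final value to 1 guarantees the last pass is a plain insertion sort, which is what lets the outer loop terminate.

```c
#include <stddef.h>
#include <stdio.h>

// Print the shrink sequence the qsort above would use for nel elements.
// Each gap is strictly smaller than the previous one, and the run ends at 1.
int main(void) {
  size_t nel = 1000;
  size_t gap = nel;
  while (gap > 1) {
    gap = (5ull * gap - 1) / 11;
    if (gap == 0) gap = 1;
    printf("%zu ", gap);   // 454 206 93 42 19 8 3 1 for nel = 1000
  }
  putchar('\n');
  return 0;
}
```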
@@ -25,7 +25,7 @@ void *memset(void *dest, int c, size_t n) {
|
||||
}
|
||||
|
||||
__attribute__((weak))
|
||||
void *memcpy(void *restrict dest, const void *restrict src, size_t n) {
|
||||
void *memcpy(void *__restrict dest, const void *__restrict src, size_t n) {
|
||||
return __builtin_memcpy(dest, src, n);
|
||||
}
|
||||
|
||||
@@ -38,38 +38,46 @@ void *memmove(void *dest, const void *src, size_t n) {
|
||||
|
||||
#ifdef __wasm_simd128__
|
||||
|
||||
// SIMD versions of some string.h functions.
|
||||
//
|
||||
// These assume aligned v128_t loads can't fail,
|
||||
// and so can't unaligned loads up to the last
|
||||
// aligned address less than memory size.
|
||||
//
|
||||
// These also assume unaligned access is not painfully slow,
|
||||
// but that bitmask extraction is really slow on AArch64.
|
||||
// SIMD implementations of string.h functions.
|
||||
|
||||
__attribute__((weak))
|
||||
int memcmp(const void *v1, const void *v2, size_t n) {
|
||||
// memcmp can read up to n bytes from each object.
|
||||
// Use unaligned loads to handle the case where
|
||||
// the objects have mismatching alignments.
|
||||
const v128_t *w1 = v1;
|
||||
const v128_t *w2 = v2;
|
||||
for (; n >= sizeof(v128_t); n -= sizeof(v128_t)) {
|
||||
// Find any single bit difference.
|
||||
if (wasm_v128_any_true(wasm_v128_load(w1) ^ wasm_v128_load(w2))) {
|
||||
break;
|
||||
// Baseline algorithm.
|
||||
if (n < sizeof(v128_t)) {
|
||||
const unsigned char *u1 = (unsigned char *)v1;
|
||||
const unsigned char *u2 = (unsigned char *)v2;
|
||||
while (n--) {
|
||||
if (*u1 != *u2) return *u1 - *u2;
|
||||
u1++;
|
||||
u2++;
|
||||
}
|
||||
w1++;
|
||||
w2++;
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Continue byte-by-byte.
|
||||
const unsigned char *u1 = (void *)w1;
|
||||
const unsigned char *u2 = (void *)w2;
|
||||
while (n--) {
|
||||
if (*u1 != *u2) return *u1 - *u2;
|
||||
u1++;
|
||||
u2++;
|
||||
// memcmp is allowed to read up to n bytes from each object.
|
||||
// Find the first different character in the objects.
|
||||
// Unaligned loads handle the case where the objects
|
||||
// have mismatching alignments.
|
||||
const v128_t *w1 = (v128_t *)v1;
|
||||
const v128_t *w2 = (v128_t *)v2;
|
||||
while (n) {
|
||||
const v128_t cmp = wasm_i8x16_eq(wasm_v128_load(w1), wasm_v128_load(w2));
|
||||
// Bitmask is slow on AArch64, all_true is much faster.
|
||||
if (!wasm_i8x16_all_true(cmp)) {
|
||||
// Find the offset of the first zero bit (little-endian).
|
||||
size_t ctz = __builtin_ctz(~wasm_i8x16_bitmask(cmp));
|
||||
const unsigned char *u1 = (unsigned char *)w1 + ctz;
|
||||
const unsigned char *u2 = (unsigned char *)w2 + ctz;
|
||||
// This may help the compiler if the function is inlined.
|
||||
__builtin_assume(*u1 - *u2 != 0);
|
||||
return *u1 - *u2;
|
||||
}
|
||||
// This makes n a multiple of sizeof(v128_t)
|
||||
// for every iteration except the first.
|
||||
size_t align = (n - 1) % sizeof(v128_t) + 1;
|
||||
w1 = (v128_t *)((char *)w1 + align);
|
||||
w2 = (v128_t *)((char *)w2 + align);
|
||||
n -= align;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
@@ -77,7 +85,7 @@ int memcmp(const void *v1, const void *v2, size_t n) {
|
||||
__attribute__((weak))
|
||||
void *memchr(const void *v, int c, size_t n) {
|
||||
// When n is zero, a function that locates a character finds no occurrence.
|
||||
// Otherwise, decrement n to ensure __builtin_sub_overflow "overflows"
|
||||
// Otherwise, decrement n to ensure sub_overflow overflows
|
||||
// when n would go equal-to-or-below zero.
|
||||
if (n-- == 0) {
|
||||
return NULL;
|
||||
@@ -85,20 +93,20 @@ void *memchr(const void *v, int c, size_t n) {
|
||||
|
||||
// memchr must behave as if it reads characters sequentially
|
||||
// and stops as soon as a match is found.
|
||||
// Aligning ensures loads can't fail.
|
||||
// Aligning ensures loads beyond the first match don't fail.
|
||||
uintptr_t align = (uintptr_t)v % sizeof(v128_t);
|
||||
const v128_t *w = (void *)(v - align);
|
||||
const v128_t *w = (v128_t *)((char *)v - align);
|
||||
const v128_t wc = wasm_i8x16_splat(c);
|
||||
|
||||
while (true) {
|
||||
for (;;) {
|
||||
const v128_t cmp = wasm_i8x16_eq(*w, wc);
|
||||
// Bitmask is slow on AArch64, any_true is much faster.
|
||||
if (wasm_v128_any_true(cmp)) {
|
||||
// Clear the bits corresponding to alignment
|
||||
// Clear the bits corresponding to alignment (little-endian)
|
||||
// so we can count trailing zeros.
|
||||
int mask = wasm_i8x16_bitmask(cmp) >> align << align;
|
||||
// At least one bit will be set, unless we cleared them.
|
||||
// Knowing this helps the compiler.
|
||||
// Knowing this helps the compiler.
|
||||
__builtin_assume(mask || align);
|
||||
// If the mask is zero because of alignment,
|
||||
// it's as if we didn't find anything.
|
||||
@@ -106,10 +114,10 @@ void *memchr(const void *v, int c, size_t n) {
|
||||
// We found a match, unless it is beyond the end of the object.
|
||||
// Recall that we decremented n, so less-than-or-equal-to is correct.
|
||||
size_t ctz = __builtin_ctz(mask);
|
||||
return ctz <= n + align ? (void *)w + ctz : NULL;
|
||||
return ctz <= n + align ? (char *)w + ctz : NULL;
|
||||
}
|
||||
}
|
||||
// Decrement n; if it "overflows" we're done.
|
||||
// Decrement n; if it overflows we're done.
|
||||
if (__builtin_sub_overflow(n, sizeof(v128_t) - align, &n)) {
|
||||
return NULL;
|
||||
}
|
||||
@@ -118,22 +126,45 @@ void *memchr(const void *v, int c, size_t n) {
|
||||
}
|
||||
}
|
||||
|
||||
__attribute__((weak))
|
||||
void *memrchr(const void *v, int c, size_t n) {
|
||||
// memrchr is allowed to read up to n bytes from the object.
|
||||
// Search backward for the last matching character.
|
||||
const v128_t *w = (v128_t *)((char *)v + n);
|
||||
const v128_t wc = wasm_i8x16_splat(c);
|
||||
for (; n >= sizeof(v128_t); n -= sizeof(v128_t)) {
|
||||
const v128_t cmp = wasm_i8x16_eq(wasm_v128_load(--w), wc);
|
||||
// Bitmask is slow on AArch64, any_true is much faster.
|
||||
if (wasm_v128_any_true(cmp)) {
|
||||
size_t clz = __builtin_clz(wasm_i8x16_bitmask(cmp)) - 15;
|
||||
return (char *)(w + 1) - clz;
|
||||
}
|
||||
}
|
||||
|
||||
// Baseline algorithm.
|
||||
const char *a = (char *)w;
|
||||
while (n--) {
|
||||
if (*(--a) == (char)c) return (char *)a;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
__attribute__((weak))
|
||||
size_t strlen(const char *s) {
|
||||
// strlen must stop as soon as it finds the terminator.
|
||||
// Aligning ensures loads can't fail.
|
||||
// Aligning ensures loads beyond the terminator don't fail.
|
||||
uintptr_t align = (uintptr_t)s % sizeof(v128_t);
|
||||
const v128_t *w = (void *)(s - align);
|
||||
const v128_t *w = (v128_t *)(s - align);
|
||||
|
||||
while (true) {
|
||||
for (;;) {
|
||||
// Bitmask is slow on AArch64, all_true is much faster.
|
||||
if (!wasm_i8x16_all_true(*w)) {
|
||||
const v128_t cmp = wasm_i8x16_eq(*w, (v128_t){});
|
||||
// Clear the bits corresponding to alignment
|
||||
// Clear the bits corresponding to alignment (little-endian)
|
||||
// so we can count trailing zeros.
|
||||
int mask = wasm_i8x16_bitmask(cmp) >> align << align;
|
||||
// At least one bit will be set, unless we cleared them.
|
||||
// Knowing this helps the compiler.
|
||||
// Knowing this helps the compiler.
|
||||
__builtin_assume(mask || align);
|
||||
if (mask) {
|
||||
return (char *)w - s + __builtin_ctz(mask);
|
||||
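The memchr, strlen, and strchrnul loops above all rely on the same trick: round the pointer down to a 16-byte boundary so the vector load cannot cross the end of linear memory, then drop the match bits that fall before the real start of the object with `bitmask >> align << align`. A scalar sketch of that masking step (my own illustration, with an assumed address):

```c
#include <stdint.h>
#include <stdio.h>

int main(void) {
  uintptr_t s = 0x1003;          // unaligned start address
  uintptr_t align = s % 16;      // 3 bytes of "pre-roll"
  uintptr_t base = s - align;    // aligned load address, 0x1000

  // Pretend the i8x16 comparison matched lanes 1 and 7 of that load.
  unsigned mask = (1u << 1) | (1u << 7);
  mask = mask >> align << align; // lane 1 is before s, so it is dropped

  if (mask)
    printf("first match at address %#lx\n",
           (unsigned long)(base + __builtin_ctz(mask)));  // 0x1007
  return 0;
}
```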
@@ -145,23 +176,23 @@ size_t strlen(const char *s) {
|
||||
}
|
||||
|
||||
static int __strcmp(const char *s1, const char *s2) {
|
||||
// Set limit to the largest possible valid v128_t pointer.
|
||||
// Unsigned modular arithmetic gives the correct result
|
||||
// unless memory size is zero, in which case all pointers are invalid.
|
||||
const v128_t *const limit =
|
||||
(v128_t *)(__builtin_wasm_memory_size(0) * PAGESIZE) - 1;
|
||||
// How many bytes can be read before pointers go out of bounds.
|
||||
size_t N = __builtin_wasm_memory_size(0) * PAGESIZE - //
|
||||
(size_t)(s1 > s2 ? s1 : s2);
|
||||
|
||||
// Use unaligned loads to handle the case where
|
||||
// the strings have mismatching alignments.
|
||||
const v128_t *w1 = (void *)s1;
|
||||
const v128_t *w2 = (void *)s2;
|
||||
while (w1 <= limit && w2 <= limit) {
|
||||
// Unaligned loads handle the case where the strings
|
||||
// have mismatching alignments.
|
||||
const v128_t *w1 = (v128_t *)s1;
|
||||
const v128_t *w2 = (v128_t *)s2;
|
||||
for (; N >= sizeof(v128_t); N -= sizeof(v128_t)) {
|
||||
// Find any single bit difference.
|
||||
if (wasm_v128_any_true(wasm_v128_load(w1) ^ wasm_v128_load(w2))) {
|
||||
// The strings may still be equal,
|
||||
// if the terminator is found before that difference.
|
||||
break;
|
||||
}
|
||||
// All bytes are equal.
|
||||
// If any byte is zero (on both strings) the strings are equal.
|
||||
// All characters are equal.
|
||||
// If any is a terminator the strings are equal.
|
||||
if (!wasm_i8x16_all_true(wasm_v128_load(w1))) {
|
||||
return 0;
|
||||
}
|
||||
@@ -169,10 +200,22 @@ static int __strcmp(const char *s1, const char *s2) {
|
||||
w2++;
|
||||
}
|
||||
|
||||
// Continue byte-by-byte.
|
||||
const unsigned char *u1 = (void *)w1;
|
||||
const unsigned char *u2 = (void *)w2;
|
||||
while (true) {
|
||||
// Baseline algorithm.
|
||||
const unsigned char *u1 = (unsigned char *)w1;
|
||||
const unsigned char *u2 = (unsigned char *)w2;
|
||||
for (;;) {
|
||||
if (*u1 != *u2) return *u1 - *u2;
|
||||
if (*u1 == 0) break;
|
||||
u1++;
|
||||
u2++;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int __strcmp_s(const char *s1, const char *s2) {
|
||||
const unsigned char *u1 = (unsigned char *)s1;
|
||||
const unsigned char *u2 = (unsigned char *)s2;
|
||||
for (;;) {
|
||||
if (*u1 != *u2) return *u1 - *u2;
|
||||
if (*u1 == 0) break;
|
||||
u1++;
|
||||
@@ -183,33 +226,33 @@ static int __strcmp(const char *s1, const char *s2) {
|
||||
|
||||
__attribute__((weak, always_inline))
|
||||
int strcmp(const char *s1, const char *s2) {
|
||||
// Use strncmp when comparing against literal strings.
|
||||
// If the literal is small, the vector search will be skipped.
|
||||
if (__builtin_constant_p(strlen(s2))) {
|
||||
return strncmp(s1, s2, strlen(s2));
|
||||
// Skip the vector search when comparing against small literal strings.
|
||||
if (__builtin_constant_p(strlen(s2)) && strlen(s2) < sizeof(v128_t)) {
|
||||
return __strcmp_s(s1, s2);
|
||||
}
|
||||
return __strcmp(s1, s2);
|
||||
}
|
||||
|
||||
__attribute__((weak))
|
||||
int strncmp(const char *s1, const char *s2, size_t n) {
|
||||
// Set limit to the largest possible valid v128_t pointer.
|
||||
// Unsigned modular arithmetic gives the correct result
|
||||
// unless memory size is zero, in which case all pointers are invalid.
|
||||
const v128_t *const limit =
|
||||
(v128_t *)(__builtin_wasm_memory_size(0) * PAGESIZE) - 1;
|
||||
// How many bytes can be read before pointers go out of bounds.
|
||||
size_t N = __builtin_wasm_memory_size(0) * PAGESIZE - //
|
||||
(size_t)(s1 > s2 ? s1 : s2);
|
||||
if (n > N) n = N;
|
||||
|
||||
// Use unaligned loads to handle the case where
|
||||
// the strings have mismatching alignments.
|
||||
const v128_t *w1 = (void *)s1;
|
||||
const v128_t *w2 = (void *)s2;
|
||||
for (; w1 <= limit && w2 <= limit && n >= sizeof(v128_t); n -= sizeof(v128_t)) {
|
||||
// Unaligned loads handle the case where the strings
|
||||
// have mismatching alignments.
|
||||
const v128_t *w1 = (v128_t *)s1;
|
||||
const v128_t *w2 = (v128_t *)s2;
|
||||
for (; n >= sizeof(v128_t); n -= sizeof(v128_t)) {
|
||||
// Find any single bit difference.
|
||||
if (wasm_v128_any_true(wasm_v128_load(w1) ^ wasm_v128_load(w2))) {
|
||||
// The strings may still be equal,
|
||||
// if the terminator is found before that difference.
|
||||
break;
|
||||
}
|
||||
// All bytes are equal.
|
||||
// If any byte is zero (on both strings) the strings are equal.
|
||||
// All characters are equal.
|
||||
// If any is a terminator the strings are equal.
|
||||
if (!wasm_i8x16_all_true(wasm_v128_load(w1))) {
|
||||
return 0;
|
||||
}
|
||||
@@ -217,9 +260,9 @@ int strncmp(const char *s1, const char *s2, size_t n) {
|
||||
w2++;
|
||||
}
|
||||
|
||||
// Continue byte-by-byte.
|
||||
const unsigned char *u1 = (void *)w1;
|
||||
const unsigned char *u2 = (void *)w2;
|
||||
// Baseline algorithm.
|
||||
const unsigned char *u1 = (unsigned char *)w1;
|
||||
const unsigned char *u2 = (unsigned char *)w2;
|
||||
while (n--) {
|
||||
if (*u1 != *u2) return *u1 - *u2;
|
||||
if (*u1 == 0) break;
|
||||
@@ -231,20 +274,20 @@ int strncmp(const char *s1, const char *s2, size_t n) {
|
||||
|
||||
static char *__strchrnul(const char *s, int c) {
|
||||
// strchrnul must stop as soon as a match is found.
|
||||
// Aligning ensures loads can't fail.
|
||||
// Aligning ensures loads beyond the first match don't fail.
|
||||
uintptr_t align = (uintptr_t)s % sizeof(v128_t);
|
||||
const v128_t *w = (void *)(s - align);
|
||||
const v128_t *w = (v128_t *)(s - align);
|
||||
const v128_t wc = wasm_i8x16_splat(c);
|
||||
|
||||
while (true) {
|
||||
for (;;) {
|
||||
const v128_t cmp = wasm_i8x16_eq(*w, (v128_t){}) | wasm_i8x16_eq(*w, wc);
|
||||
// Bitmask is slow on AArch64, any_true is much faster.
|
||||
if (wasm_v128_any_true(cmp)) {
|
||||
// Clear the bits corresponding to alignment
|
||||
// Clear the bits corresponding to alignment (little-endian)
|
||||
// so we can count trailing zeros.
|
||||
int mask = wasm_i8x16_bitmask(cmp) >> align << align;
|
||||
// At least one bit will be set, unless we cleared them.
|
||||
// Knowing this helps the compiler.
|
||||
// Knowing this helps the compiler.
|
||||
__builtin_assume(mask || align);
|
||||
if (mask) {
|
||||
return (char *)w + __builtin_ctz(mask);
|
||||
@@ -271,99 +314,221 @@ char *strchr(const char *s, int c) {
|
||||
return (char *)s + strlen(s);
|
||||
}
|
||||
char *r = __strchrnul(s, c);
|
||||
return *(char *)r == (char)c ? r : NULL;
|
||||
return *r == (char)c ? r : NULL;
|
||||
}
|
||||
|
||||
__attribute__((weak, always_inline))
|
||||
char *strrchr(const char *s, int c) {
|
||||
// For finding the terminator, strlen is faster.
|
||||
if (__builtin_constant_p(c) && (char)c == 0) {
|
||||
return (char *)s + strlen(s);
|
||||
}
|
||||
// This could also be implemented in a single pass using strchr,
|
||||
// advancing to the next match until no more matches are found.
|
||||
// That would be suboptimal with lots of consecutive matches.
|
||||
return (char *)memrchr(s, c, strlen(s) + 1);
|
||||
}
|
||||
|
||||
// http://0x80.pl/notesen/2018-10-18-simd-byte-lookup.html
|
||||
|
||||
#define _WASM_SIMD128_BITMAP256_T \
|
||||
struct { \
|
||||
uint8_t l __attribute__((__vector_size__(16), __aligned__(16))); \
|
||||
uint8_t h __attribute__((__vector_size__(16), __aligned__(16))); \
|
||||
}
|
||||
|
||||
#define _WASM_SIMD128_SETBIT(bitmap, i) \
|
||||
({ \
|
||||
uint8_t _c = (uint8_t)(i); \
|
||||
uint8_t _hi_nibble = _c >> 4; \
|
||||
uint8_t _lo_nibble = _c & 0xf; \
|
||||
bitmap.l[_lo_nibble] |= 1 << (_hi_nibble - 0); \
|
||||
bitmap.h[_lo_nibble] |= 1 << (_hi_nibble - 8); \
|
||||
})
|
||||
|
||||
#define _WASM_SIMD128_CHKBIT(bitmap, i) \
|
||||
({ \
|
||||
uint8_t _c = (uint8_t)(i); \
|
||||
uint8_t _hi_nibble = _c >> 4; \
|
||||
uint8_t _lo_nibble = _c & 0xf; \
|
||||
uint8_t _bitmask = 1 << (_hi_nibble & 0x7); \
|
||||
uint8_t _bitset = (_hi_nibble < 8 ? bitmap.l : bitmap.h)[_lo_nibble]; \
|
||||
_bitmask & _bitset; \
|
||||
})
|
||||
|
||||
#define _WASM_SIMD128_CHKBITS(bitmap, v) \
|
||||
({ \
|
||||
v128_t _w = v; \
|
||||
v128_t _hi_nibbles = wasm_u8x16_shr(_w, 4); \
|
||||
v128_t _lo_nibbles = _w & wasm_u8x16_const_splat(0xf); \
|
||||
\
|
||||
v128_t _bitmask_lookup = wasm_u8x16_const(1, 2, 4, 8, 16, 32, 64, 128, \
|
||||
1, 2, 4, 8, 16, 32, 64, 128); \
|
||||
\
|
||||
v128_t _bitmask = wasm_i8x16_swizzle(_bitmask_lookup, _hi_nibbles); \
|
||||
v128_t _bitsets = wasm_v128_bitselect( \
|
||||
wasm_i8x16_swizzle(bitmap.l, _lo_nibbles), \
|
||||
wasm_i8x16_swizzle(bitmap.h, _lo_nibbles), \
|
||||
wasm_i8x16_lt(_hi_nibbles, wasm_u8x16_const_splat(8))); \
|
||||
\
|
||||
wasm_i8x16_eq(_bitsets & _bitmask, _bitmask); \
|
||||
})
|
||||
|
||||
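The _WASM_SIMD128_BITMAP256_T macros above implement the byte-set lookup from the linked 0x80.pl note: a 256-entry membership set is stored as two 16-byte tables indexed by a character's low nibble, and the high nibble selects a bit within the table entry. A scalar model of SETBIT/CHKBIT (mine, not part of the patch) showing the same layout:

```c
#include <stdint.h>
#include <stdio.h>

// Scalar model of _WASM_SIMD128_SETBIT/_CHKBIT: the low half of the byte
// range lives in l[], the high half in h[], both indexed by the low nibble,
// with the high nibble choosing the bit inside the entry.
typedef struct { uint8_t l[16], h[16]; } bitmap256;

static void setbit(bitmap256 *b, uint8_t c) {
  uint8_t hi = c >> 4, lo = c & 0xf;
  if (hi < 8) b->l[lo] |= (uint8_t)(1 << hi);
  else        b->h[lo] |= (uint8_t)(1 << (hi - 8));
}

static int chkbit(const bitmap256 *b, uint8_t c) {
  uint8_t hi = c >> 4, lo = c & 0xf;
  uint8_t set = hi < 8 ? b->l[lo] : b->h[lo];
  return set & (1 << (hi & 0x7));
}

int main(void) {
  bitmap256 b = {{0}, {0}};
  for (const char *c = "abc"; *c; c++) setbit(&b, (uint8_t)*c);
  printf("'b' in set: %d, 'z' in set: %d\n",
         !!chkbit(&b, 'b'), !!chkbit(&b, 'z'));  // 1, 0
  return 0;
}
```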
__attribute__((weak))
|
||||
size_t strspn(const char *s, const char *c) {
|
||||
#ifndef _REENTRANT
|
||||
static // Avoid the stack for builds without threads.
|
||||
#endif
|
||||
char byteset[UCHAR_MAX + 1];
|
||||
// How many bytes can be read before the pointer goes out of bounds.
|
||||
size_t N = __builtin_wasm_memory_size(0) * PAGESIZE - (size_t)s;
|
||||
const v128_t *w = (v128_t *)s;
|
||||
const char *const a = s;
|
||||
|
||||
if (!c[0]) return 0;
|
||||
if (!c[1]) {
|
||||
// Set limit to the largest possible valid v128_t pointer.
|
||||
// Unsigned modular arithmetic gives the correct result
|
||||
// unless memory size is zero, in which case all pointers are invalid.
|
||||
const v128_t *const limit =
|
||||
(v128_t *)(__builtin_wasm_memory_size(0) * PAGESIZE) - 1;
|
||||
|
||||
const v128_t *w = (void *)s;
|
||||
const v128_t wc = wasm_i8x16_splat(*c);
|
||||
while (w <= limit) {
|
||||
if (!wasm_i8x16_all_true(wasm_i8x16_eq(wasm_v128_load(w), wc))) {
|
||||
break;
|
||||
for (; N >= sizeof(v128_t); N -= sizeof(v128_t)) {
|
||||
const v128_t cmp = wasm_i8x16_eq(wasm_v128_load(w), wc);
|
||||
// Bitmask is slow on AArch64, all_true is much faster.
|
||||
if (!wasm_i8x16_all_true(cmp)) {
|
||||
size_t ctz = __builtin_ctz(~wasm_i8x16_bitmask(cmp));
|
||||
return (char *)w + ctz - s;
|
||||
}
|
||||
w++;
|
||||
}
|
||||
|
||||
s = (void *)w;
|
||||
while (*s == *c) s++;
|
||||
// Baseline algorithm.
|
||||
for (s = (char *)w; *s == *c; s++);
|
||||
return s - a;
|
||||
}
|
||||
|
||||
#if !__OPTIMIZE__ || __OPTIMIZE_SIZE__
|
||||
_WASM_SIMD128_BITMAP256_T bitmap = {};
|
||||
|
||||
// Unoptimized version.
|
||||
memset(byteset, 0, sizeof(byteset));
|
||||
while (*c && (byteset[*(unsigned char *)c] = 1)) c++;
|
||||
while (byteset[*(unsigned char *)s]) s++;
|
||||
for (; *c; c++) {
|
||||
_WASM_SIMD128_SETBIT(bitmap, *c);
|
||||
// Terminator IS NOT on the bitmap.
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
// This is faster than memset.
|
||||
volatile v128_t *w = (void *)byteset;
|
||||
#pragma unroll
|
||||
for (size_t i = sizeof(byteset) / sizeof(v128_t); i--;) w[i] = (v128_t){};
|
||||
static_assert(sizeof(byteset) % sizeof(v128_t) == 0);
|
||||
|
||||
// Keeping byteset[0] = 0 avoids the other loop having to test for it.
|
||||
while (*c && (byteset[*(unsigned char *)c] = 1)) c++;
|
||||
#pragma unroll 4
|
||||
while (byteset[*(unsigned char *)s]) s++;
|
||||
|
||||
#endif
|
||||
for (; N >= sizeof(v128_t); N -= sizeof(v128_t)) {
|
||||
const v128_t cmp = _WASM_SIMD128_CHKBITS(bitmap, wasm_v128_load(w));
|
||||
// Bitmask is slow on AArch64, all_true is much faster.
|
||||
if (!wasm_i8x16_all_true(cmp)) {
|
||||
size_t ctz = __builtin_ctz(~wasm_i8x16_bitmask(cmp));
|
||||
return (char *)w + ctz - s;
|
||||
}
|
||||
w++;
|
||||
}
|
||||
|
||||
// Baseline algorithm.
|
||||
for (s = (char *)w; _WASM_SIMD128_CHKBIT(bitmap, *s); s++);
|
||||
return s - a;
|
||||
}
|
||||
|
||||
__attribute__((weak))
|
||||
size_t strcspn(const char *s, const char *c) {
|
||||
#ifndef _REENTRANT
|
||||
static // Avoid the stack for builds without threads.
|
||||
#endif
|
||||
char byteset[UCHAR_MAX + 1];
|
||||
const char *const a = s;
|
||||
|
||||
if (!c[0] || !c[1]) return __strchrnul(s, *c) - s;
|
||||
|
||||
#if !__OPTIMIZE__ || __OPTIMIZE_SIZE__
|
||||
// How many bytes can be read before the pointer goes out of bounds.
|
||||
size_t N = __builtin_wasm_memory_size(0) * PAGESIZE - (size_t)s;
|
||||
const v128_t *w = (v128_t *)s;
|
||||
const char *const a = s;
|
||||
|
||||
// Unoptimized version.
|
||||
memset(byteset, 0, sizeof(byteset));
|
||||
while ((byteset[*(unsigned char *)c] = 1) && *c) c++;
|
||||
while (!byteset[*(unsigned char *)s]) s++;
|
||||
_WASM_SIMD128_BITMAP256_T bitmap = {};
|
||||
|
||||
#else
|
||||
for (;;) {
|
||||
_WASM_SIMD128_SETBIT(bitmap, *c);
|
||||
// Terminator IS on the bitmap.
|
||||
if (!*c++) break;
|
||||
}
|
||||
|
||||
// This is faster than memset.
|
||||
volatile v128_t *w = (void *)byteset;
|
||||
#pragma unroll
|
||||
for (size_t i = sizeof(byteset) / sizeof(v128_t); i--;) w[i] = (v128_t){};
|
||||
static_assert(sizeof(byteset) % sizeof(v128_t) == 0);
|
||||
|
||||
// Setting byteset[0] = 1 avoids the other loop having to test for it.
|
||||
while ((byteset[*(unsigned char *)c] = 1) && *c) c++;
|
||||
#pragma unroll 4
|
||||
while (!byteset[*(unsigned char *)s]) s++;
|
||||
|
||||
#endif
|
||||
for (; N >= sizeof(v128_t); N -= sizeof(v128_t)) {
|
||||
const v128_t cmp = _WASM_SIMD128_CHKBITS(bitmap, wasm_v128_load(w));
|
||||
// Bitmask is slow on AArch64, any_true is much faster.
|
||||
if (wasm_v128_any_true(cmp)) {
|
||||
size_t ctz = __builtin_ctz(wasm_i8x16_bitmask(cmp));
|
||||
return (char *)w + ctz - s;
|
||||
}
|
||||
w++;
|
||||
}
|
||||
|
||||
// Baseline algorithm.
|
||||
for (s = (char *)w; !_WASM_SIMD128_CHKBIT(bitmap, *s); s++);
|
||||
return s - a;
|
||||
}
|
||||
|
||||
#undef _WASM_SIMD128_SETBIT
|
||||
#undef _WASM_SIMD128_CHKBIT
|
||||
#undef _WASM_SIMD128_CHKBITS
|
||||
#undef _WASM_SIMD128_BITMAP256_T
|
||||
|
||||
// Given the above SIMD implementations,
|
||||
// these are best implemented as
|
||||
// small wrappers over those functions.
|
||||
|
||||
// Simple wrappers already in musl:
|
||||
// - mempcpy
|
||||
// - strcat
|
||||
// - strdup
|
||||
// - strndup
|
||||
// - strnlen
|
||||
// - strpbrk
|
||||
// - strsep
|
||||
// - strtok
|
||||
|
||||
__attribute__((weak))
|
||||
void *memccpy(void *__restrict dest, const void *__restrict src, int c, size_t n) {
|
||||
void *memchr(const void *v, int c, size_t n);
|
||||
const void *m = memchr(src, c, n);
|
||||
if (m != NULL) {
|
||||
n = (char *)m - (char *)src + 1;
|
||||
m = (char *)dest + n;
|
||||
}
|
||||
memcpy(dest, src, n);
|
||||
return (void *)m;
|
||||
}
|
||||
|
||||
__attribute__((weak))
|
||||
char *strncat(char *__restrict dest, const char *__restrict src, size_t n) {
|
||||
size_t strnlen(const char *s, size_t n);
|
||||
size_t dlen = strlen(dest);
|
||||
size_t slen = strnlen(src, n);
|
||||
memcpy(dest + dlen, src, slen);
|
||||
dest[dlen + slen] = 0;
|
||||
return dest;
|
||||
}
|
||||
|
||||
static char *__stpcpy(char *__restrict dest, const char *__restrict src) {
|
||||
size_t slen = strlen(src);
|
||||
memcpy(dest, src, slen + 1);
|
||||
return dest + slen;
|
||||
}
|
||||
|
||||
static char *__stpncpy(char *__restrict dest, const char *__restrict src, size_t n) {
|
||||
size_t strnlen(const char *s, size_t n);
|
||||
size_t slen = strnlen(src, n);
|
||||
memcpy(dest, src, slen);
|
||||
memset(dest + slen, 0, n - slen);
|
||||
return dest + slen;
|
||||
}
|
||||
|
||||
__attribute__((weak, always_inline))
|
||||
char *stpcpy(char *__restrict dest, const char *__restrict src) {
|
||||
return __stpcpy(dest, src);
|
||||
}
|
||||
|
||||
char *strcpy(char *__restrict dest, const char *__restrict src) {
|
||||
__stpcpy(dest, src);
|
||||
return dest;
|
||||
}
|
||||
|
||||
__attribute__((weak, always_inline))
|
||||
char *stpncpy(char *__restrict dest, const char *__restrict src, size_t n) {
|
||||
return __stpncpy(dest, src, n);
|
||||
}
|
||||
|
||||
__attribute__((weak, always_inline))
|
||||
char *strncpy(char *__restrict dest, const char *__restrict src, size_t n) {
|
||||
__stpncpy(dest, src, n);
|
||||
return dest;
|
||||
}
|
||||
|
||||
#endif // __wasm_simd128__
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
sqlite3/libc/strings.h (new file, 57 lines)
@@ -0,0 +1,57 @@
#ifndef _WASM_SIMD128_STRINGS_H
#define _WASM_SIMD128_STRINGS_H

#include <stddef.h>
#include <wasm_simd128.h>

#include_next <strings.h> // the system strings.h

#ifdef __cplusplus
extern "C" {
#endif

#ifdef __wasm_simd128__

__attribute__((weak))
int bcmp(const void *v1, const void *v2, size_t n) {
  // bcmp is the same as memcmp but only compares for equality.

  // Baseline algorithm.
  if (n < sizeof(v128_t)) {
    const unsigned char *u1 = (unsigned char *)v1;
    const unsigned char *u2 = (unsigned char *)v2;
    while (n--) {
      if (*u1 != *u2) return 1;
      u1++;
      u2++;
    }
    return 0;
  }

  // bcmp is allowed to read up to n bytes from each object.
  // Unaligned loads handle the case where the objects
  // have mismatching alignments.
  const v128_t *w1 = (v128_t *)v1;
  const v128_t *w2 = (v128_t *)v2;
  while (n) {
    // Find any single bit difference.
    if (wasm_v128_any_true(wasm_v128_load(w1) ^ wasm_v128_load(w2))) {
      return 1;
    }
    // This makes n a multiple of sizeof(v128_t)
    // for every iteration except the first.
    size_t align = (n - 1) % sizeof(v128_t) + 1;
    w1 = (v128_t *)((char *)w1 + align);
    w2 = (v128_t *)((char *)w2 + align);
    n -= align;
  }
  return 0;
}

#endif // __wasm_simd128__

#ifdef __cplusplus
} // extern "C"
#endif

#endif // _WASM_SIMD128_STRINGS_H
Binary file not shown.
Binary file not shown.