Less SIMD.

This commit is contained in:
Nuno Cruces
2025-06-22 11:06:57 +01:00
parent d7aef63844
commit cd6ba43e77
12 changed files with 109 additions and 3469 deletions

Binary file not shown.

Binary file not shown.

View File

@@ -1,41 +0,0 @@
# Using SIMD for libc
I found that implementing some libc functions with Wasm SIMD128 can make them significantly faster.
Rough numbers for [wazero](https://wazero.io/):
function | speedup
------------ | -----
`strlen` | 4.1×
`memchr` | 4.1×
`strchr` | 4.0×
`strrchr` | 9.1×
`memcmp` | 13.0×
`strcmp` | 10.4×
`strncmp` | 15.7×
`strcasecmp` | 8.8×
`strncasecmp`| 8.6×
`strspn` | 9.9×
`strcspn` | 9.0×
`memmem` | 2.2×
`strstr` | 5.5×
`strcasestr` | 25.2×
For functions where musl uses SWAR on a 4-byte `size_t`,
the improvement is around 4×.
This is very close to the expected theoretical improvement,
as we're processing 4× the bytes per cycle (16 _vs._ 4).
For other functions where there's no algorithmic change,
the improvement is around 8×.
These functions are harder to optimize
(which is why musl doesn't bother with SWAR),
so getting an 8× improvement from processing 16× bytes seems decent.
String search is harder to compare, since there are algorithmic changes,
and different needles produce very different numbers.
We use [Quick Search](https://igm.univ-mlv.fr/~lecroq/string/node19.html) for `memmem`,
and [Rabin–Karp](https://igm.univ-mlv.fr/~lecroq/string/node5.html) for `strstr` and `strcasestr`;
musl uses [Two Way](https://igm.univ-mlv.fr/~lecroq/string/node26.html) for `memmem` and `strstr`,
and [brute force](https://igm.univ-mlv.fr/~lecroq/string/node3.html) for `strcasestr`.
Unlike Two Way, which is linear in the worst case, both replacements can go quadratic for long, periodic needles.

View File

@@ -28,31 +28,18 @@ EOF
-Wl,--stack-first \
-Wl,--import-undefined \
-Wl,--initial-memory=16777216 \
-Wl,--export=memccpy \
-Wl,--export=memchr \
-Wl,--export=memcmp \
-Wl,--export=memcpy \
-Wl,--export=memmem \
-Wl,--export=memmove \
-Wl,--export=memrchr \
-Wl,--export=memset \
-Wl,--export=stpcpy \
-Wl,--export=stpncpy \
-Wl,--export=strcasecmp \
-Wl,--export=strcasestr \
-Wl,--export=strchr \
-Wl,--export=strchrnul \
-Wl,--export=strcmp \
-Wl,--export=strcpy \
-Wl,--export=strcspn \
-Wl,--export=strlen \
-Wl,--export=strncasecmp \
-Wl,--export=strncat \
-Wl,--export=strncmp \
-Wl,--export=strncpy \
-Wl,--export=strrchr \
-Wl,--export=strspn \
-Wl,--export=strstr \
-Wl,--export=qsort
"$BINARYEN/wasm-ctor-eval" -g -c _initialize libc.wasm -o libc.tmp

Binary file not shown.

File diff suppressed because it is too large Load Diff

View File

@@ -24,25 +24,18 @@ const (
)
var (
memory []byte
module api.Module
memset api.Function
memcpy api.Function
memchr api.Function
memcmp api.Function
memmem api.Function
strlen api.Function
strchr api.Function
strcmp api.Function
strstr api.Function
strspn api.Function
strrchr api.Function
strncmp api.Function
strcspn api.Function
strcasecmp api.Function
strcasestr api.Function
strncasecmp api.Function
stack [8]uint64
memory []byte
module api.Module
memset api.Function
memcpy api.Function
memchr api.Function
memcmp api.Function
strlen api.Function
strchr api.Function
strspn api.Function
strrchr api.Function
strcspn api.Function
stack [8]uint64
)
func call(fn api.Function, arg ...uint64) uint64 {
@@ -68,18 +61,11 @@ func TestMain(m *testing.M) {
memcpy = mod.ExportedFunction("memcpy")
memchr = mod.ExportedFunction("memchr")
memcmp = mod.ExportedFunction("memcmp")
memmem = mod.ExportedFunction("memmem")
strlen = mod.ExportedFunction("strlen")
strchr = mod.ExportedFunction("strchr")
strcmp = mod.ExportedFunction("strcmp")
strstr = mod.ExportedFunction("strstr")
strspn = mod.ExportedFunction("strspn")
strrchr = mod.ExportedFunction("strrchr")
strncmp = mod.ExportedFunction("strncmp")
strcspn = mod.ExportedFunction("strcspn")
strcasecmp = mod.ExportedFunction("strcasecmp")
strcasestr = mod.ExportedFunction("strcasestr")
strncasecmp = mod.ExportedFunction("strncasecmp")
memory, _ = mod.Memory().Read(0, mod.Memory().Size())
os.Exit(m.Run())
@@ -166,58 +152,6 @@ func Benchmark_memcmp(b *testing.B) {
}
}
// Benchmark_strcmp measures strcmp on two strings of size-1 bytes that
// share their first size/2 bytes and differ from there on.
// The zeroed memory supplies both terminators.
func Benchmark_strcmp(b *testing.B) {
	clear(memory)
	fill(memory[ptr1:ptr1+size-1], 7)
	fill(memory[ptr2:ptr2+size/2], 7)
	fill(memory[ptr2+size/2:ptr2+size-1], 5)

	// Bytes examined per call: the common prefix plus the first mismatch.
	b.SetBytes(size/2 + 1)
	b.ResetTimer()
	for range b.N {
		// strcmp takes exactly two pointers; it has no length parameter.
		call(strcmp, ptr1, ptr2)
	}
}
func Benchmark_strncmp(b *testing.B) {
clear(memory)
fill(memory[ptr1:ptr1+size-1], 7)
fill(memory[ptr2:ptr2+size/2], 7)
fill(memory[ptr2+size/2:ptr2+size-1], 5)
b.SetBytes(size/2 + 1)
b.ResetTimer()
for range b.N {
call(strncmp, ptr1, ptr2, size-1)
}
}
// Benchmark_strcasecmp measures strcasecmp on two strings of size-1 bytes
// that share their first size/2 bytes and differ from there on.
// The zeroed memory supplies both terminators.
func Benchmark_strcasecmp(b *testing.B) {
	clear(memory)
	fill(memory[ptr1:ptr1+size-1], 7)
	fill(memory[ptr2:ptr2+size/2], 7)
	fill(memory[ptr2+size/2:ptr2+size-1], 5)

	// Bytes examined per call: the common prefix plus the first mismatch.
	b.SetBytes(size/2 + 1)
	b.ResetTimer()
	for range b.N {
		// strcasecmp takes exactly two pointers; it has no length parameter.
		call(strcasecmp, ptr1, ptr2)
	}
}
func Benchmark_strncasecmp(b *testing.B) {
clear(memory)
fill(memory[ptr1:ptr1+size-1], 7)
fill(memory[ptr2:ptr2+size/2], 7)
fill(memory[ptr2+size/2:ptr2+size-1], 5)
b.SetBytes(size/2 + 1)
b.ResetTimer()
for range b.N {
call(strncasecmp, ptr1, ptr2, size-1)
}
}
func Benchmark_strspn(b *testing.B) {
clear(memory)
fill(memory[ptr1:ptr1+size/2], 7)
@@ -248,51 +182,6 @@ func Benchmark_strcspn(b *testing.B) {
}
}
// source holds the contents of string.h, embedded at build time.
// The substring-search benchmarks copy it into memory as their haystack.
//
//go:embed string.h
var source string
func Benchmark_memmem(b *testing.B) {
needle := "memcpy(dest, src, slen)"
clear(memory)
copy(memory[ptr1:], source)
copy(memory[ptr2:], needle)
b.SetBytes(int64(len(source)))
b.ResetTimer()
for range b.N {
call(memmem, ptr1, uint64(len(source)), ptr2, uint64(len(needle)))
}
}
// Benchmark_strstr measures strstr searching the embedded header text
// for a fixed needle. Memory is zeroed before the strings are copied in.
func Benchmark_strstr(b *testing.B) {
	const pattern = "memcpy(dest, src, slen)"

	clear(memory)
	copy(memory[ptr1:], source)
	copy(memory[ptr2:], pattern)

	b.SetBytes(int64(len(source)))
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		call(strstr, ptr1, ptr2)
	}
}
// Benchmark_strcasestr measures strcasestr searching the embedded header
// text for a needle whose first word is written in upper case.
func Benchmark_strcasestr(b *testing.B) {
	const pattern = "MEMCPY(dest, src, slen)"

	clear(memory)
	copy(memory[ptr1:], source)
	copy(memory[ptr2:], pattern)

	b.SetBytes(int64(len(source)))
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		call(strcasestr, ptr1, ptr2)
	}
}
func Test_strlen(t *testing.T) {
for length := range 64 {
for alignment := range 24 {
@@ -498,48 +387,6 @@ func Test_memcmp(t *testing.T) {
}
}
// Test_strcmp compares every pair of same-offset suffixes of the two
// comparison fixtures and checks the sign of strcmp's result against
// strings.Compare on the NUL-truncated Go strings.
func Test_strcmp(t *testing.T) {
	const (
		lhs = compareTest1
		rhs = compareTest2
	)
	// Place the second string so that its terminator is the last byte of memory.
	rhsPtr := len(memory) - len(rhs) - 1

	clear(memory)
	copy(memory[ptr1:], lhs)
	copy(memory[rhsPtr:], rhs)

	for off := 0; off <= len(lhs); off++ {
		want := strings.Compare(term(lhs[off:]), term(rhs[off:]))
		if got := call(strcmp, uint64(ptr1+off), uint64(rhsPtr+off)); sign(int32(got)) != want {
			t.Errorf("strcmp(%d, %d) = %d, want %d",
				ptr1+off, rhsPtr+off, int32(got), want)
		}
	}
}
// Test_strncmp checks strncmp for every start offset and every length
// limit that fits in the first fixture, comparing the sign of the result
// with strings.Compare on the equally truncated Go strings.
func Test_strncmp(t *testing.T) {
	const (
		lhs = compareTest1
		rhs = compareTest2
	)
	// Place the second string so that its terminator is the last byte of memory.
	rhsPtr := len(memory) - len(rhs) - 1

	clear(memory)
	copy(memory[ptr1:], lhs)
	copy(memory[rhsPtr:], rhs)

	for off := 0; off <= len(lhs); off++ {
		for n := 0; n <= len(lhs)-off; n++ {
			want := strings.Compare(term(lhs[off:off+n]), term(rhs[off:off+n]))
			got := call(strncmp, uint64(ptr1+off), uint64(rhsPtr+off), uint64(n))
			if sign(int32(got)) == want {
				continue
			}
			t.Errorf("strncmp(%d, %d, %d) = %d, want %d",
				ptr1+off, rhsPtr+off, n, int32(got), want)
		}
	}
}
func Test_strspn(t *testing.T) {
for length := range 64 {
for pos := range length + 2 {
@@ -782,102 +629,6 @@ var searchTests = []searchTest{
{"000000000000000000000000000000000000000000000000000000000000000000000001", "0000000000000000000000000000000000000000000000000000000000000000001", 5},
}
// Test_memmem runs the shared search table, plus cases with embedded NUL
// bytes, through memmem. Each haystack is placed so that it ends exactly
// at the end of memory; a negative expected offset means a null result.
func Test_memmem(t *testing.T) {
	cases := append(searchTests,
		searchTest{"abcABCabc", "A", 3},
		searchTest{"fofofofofofo\x00foffofoobar", "foffof", 13},
		searchTest{"0000000000000000\x000123456789012345678901234567890", "0123456789012345", 17},
	)

	for _, tc := range cases {
		hayPtr := uint64(len(memory) - len(tc.haystk))
		clear(memory)
		copy(memory[hayPtr:], tc.haystk)
		copy(memory[ptr2:], tc.needle)

		want := uint64(0)
		if tc.out >= 0 {
			want = hayPtr + uint64(tc.out)
		}

		got := call(memmem,
			hayPtr, uint64(len(tc.haystk)),
			ptr2, uint64(len(tc.needle)))
		if got != want {
			t.Errorf("memmem(%q, %q) = %d, want %d",
				tc.haystk, tc.needle,
				uint32(got), uint32(want))
		}
	}
}
// Test_strstr runs the shared search table through strstr, plus cases
// where the only match lies beyond an embedded NUL and must not be found.
// Each haystack is placed so its terminator is the last byte of memory.
func Test_strstr(t *testing.T) {
	cases := append(searchTests,
		searchTest{"abcABCabc", "A", 3},
		searchTest{"fofofofofofo\x00foffofoobar", "foffof", -1},
		searchTest{"0000000000000000\x000123456789012345678901234567890", "0123456789012345", -1},
	)

	for _, tc := range cases {
		hayPtr := uint64(len(memory) - len(tc.haystk) - 1)
		clear(memory)
		copy(memory[hayPtr:], tc.haystk)
		copy(memory[ptr2:], tc.needle)

		want := uint64(0)
		if tc.out >= 0 {
			want = hayPtr + uint64(tc.out)
		}

		if got := call(strstr, hayPtr, ptr2); got != want {
			t.Errorf("strstr(%q, %q) = %d, want %d",
				tc.haystk, tc.needle,
				uint32(got), uint32(want))
		}
	}
}
// Test_strcasestr runs the shared search table (minus its first entry)
// through strcasestr, plus case-folding edge cases: letters at both ends
// of the alphabet, and the punctuation just outside the letter ranges,
// which must not be folded together.
func Test_strcasestr(t *testing.T) {
	cases := append(searchTests[1:],
		searchTest{"A", "a", 0},
		searchTest{"a", "A", 0},
		searchTest{"Z", "z", 0},
		searchTest{"z", "Z", 0},
		searchTest{"@", "`", -1},
		searchTest{"`", "@", -1},
		searchTest{"[", "{", -1},
		searchTest{"{", "[", -1},
		searchTest{"abcABCabc", "A", 0},
		searchTest{"fofofofofofofoffofoobarfoo", "FoFFoF", 12},
		searchTest{"fofofofofofofOffOfoobarfoo", "FoFFoF", 12},
		searchTest{"fofofofofofo\x00foffofoobar", "foffof", -1},
		searchTest{"0000000000000000\x000123456789012345678901234567890", "0123456789012345", -1},
	)

	for _, tc := range cases {
		// The haystack's terminator is the last byte of memory.
		hayPtr := uint64(len(memory) - len(tc.haystk) - 1)
		clear(memory)
		copy(memory[hayPtr:], tc.haystk)
		copy(memory[ptr2:], tc.needle)

		want := uint64(0)
		if tc.out >= 0 {
			want = hayPtr + uint64(tc.out)
		}

		if got := call(strcasestr, hayPtr, ptr2); got != want {
			t.Errorf("strcasestr(%q, %q) = %d, want %d",
				tc.haystk, tc.needle,
				uint32(got), uint32(want))
		}
	}
}
func Fuzz_memchr(f *testing.F) {
f.Fuzz(func(t *testing.T, s string, c, i byte) {
if len(s) > 128 || int(i) > len(s) {
@@ -971,120 +722,6 @@ func Fuzz_memcmp(f *testing.F) {
})
}
// Fuzz_strcmp fuzzes strcmp against strings.Compare.
// The corpus is seeded with every pair of same-offset suffixes of the two
// comparison fixtures. Inputs longer than 128 bytes are skipped.
func Fuzz_strcmp(f *testing.F) {
	const s1 = compareTest1
	const s2 = compareTest2

	for i := range len(compareTest1) + 1 {
		f.Add(term(s1[i:]), term(s2[i:]))
	}

	f.Fuzz(func(t *testing.T, s1, s2 string) {
		if len(s1) > 128 || len(s2) > 128 {
			t.SkipNow()
		}
		copy(memory[ptr1:], s1)
		copy(memory[ptr2:], s2)
		// Memory is not cleared between runs, so terminate explicitly.
		memory[ptr1+len(s1)] = 0
		memory[ptr2+len(s2)] = 0

		got := call(strcmp, uint64(ptr1), uint64(ptr2))
		want := strings.Compare(term(s1), term(s2))

		// Only the sign is specified; report both values as signed numbers
		// so a negative result is not shown as a huge unsigned one.
		if sign(int32(got)) != want {
			t.Errorf("strcmp(%q, %q) = %d, want %d",
				s1, s2, int32(got), want)
		}
	})
}
// Fuzz_strncmp fuzzes strncmp against bytes.Compare applied to the first
// n bytes of each string as stored in memory, cut at the terminator.
// Inputs longer than 128 bytes are skipped.
func Fuzz_strncmp(f *testing.F) {
	const s1 = compareTest1
	const s2 = compareTest2

	for i := range len(compareTest1) + 1 {
		f.Add(term(s1[i:]), term(s2[i:]), byte(len(s1)))
	}

	f.Fuzz(func(t *testing.T, s1, s2 string, n byte) {
		if len(s1) > 128 || len(s2) > 128 {
			t.SkipNow()
		}
		copy(memory[ptr1:], s1)
		copy(memory[ptr2:], s2)
		// Memory is not cleared between runs, so terminate explicitly.
		memory[ptr1+len(s1)] = 0
		memory[ptr2+len(s2)] = 0

		got := call(strncmp, uint64(ptr1), uint64(ptr2), uint64(n))
		want := bytes.Compare(
			term(memory[ptr1:][:n]),
			term(memory[ptr2:][:n]))

		// Only the sign is specified; report both values as signed numbers
		// so a negative result is not shown as a huge unsigned one.
		if sign(int32(got)) != want {
			t.Errorf("strncmp(%q, %q, %d) = %d, want %d",
				s1, s2, n, int32(got), want)
		}
	})
}
// Fuzz_strcasecmp fuzzes strcasecmp against bytes.Compare on the
// ASCII-lowercased, NUL-truncated strings.
// Inputs longer than 128 bytes are skipped.
func Fuzz_strcasecmp(f *testing.F) {
	const s1 = compareTest1
	const s2 = compareTest2

	for i := range len(compareTest1) + 1 {
		f.Add(term(s1[i:]), term(s2[i:]))
	}

	f.Fuzz(func(t *testing.T, s1, s2 string) {
		if len(s1) > 128 || len(s2) > 128 {
			t.SkipNow()
		}
		copy(memory[ptr1:], s1)
		copy(memory[ptr2:], s2)
		// Memory is not cleared between runs, so terminate explicitly.
		memory[ptr1+len(s1)] = 0
		memory[ptr2+len(s2)] = 0

		got := call(strcasecmp, uint64(ptr1), uint64(ptr2))
		// lower rewrites memory in place, so it must run after the call.
		want := bytes.Compare(
			lower(term(memory[ptr1:])),
			lower(term(memory[ptr2:])))

		// Only the sign is specified; report both values as signed numbers
		// so a negative result is not shown as a huge unsigned one.
		if sign(int32(got)) != want {
			t.Errorf("strcasecmp(%q, %q) = %d, want %d",
				s1, s2, int32(got), want)
		}
	})
}
// Fuzz_strncasecmp fuzzes strncasecmp against bytes.Compare on the first
// n bytes of each stored string, cut at the terminator and ASCII-lowercased.
// Inputs longer than 128 bytes are skipped.
func Fuzz_strncasecmp(f *testing.F) {
	const s1 = compareTest1
	const s2 = compareTest2

	for i := range len(compareTest1) + 1 {
		f.Add(term(s1[i:]), term(s2[i:]), byte(len(s1)))
	}

	f.Fuzz(func(t *testing.T, s1, s2 string, n byte) {
		if len(s1) > 128 || len(s2) > 128 {
			t.SkipNow()
		}
		copy(memory[ptr1:], s1)
		copy(memory[ptr2:], s2)
		// Memory is not cleared between runs, so terminate explicitly.
		memory[ptr1+len(s1)] = 0
		memory[ptr2+len(s2)] = 0

		got := call(strncasecmp, uint64(ptr1), uint64(ptr2), uint64(n))
		// lower rewrites memory in place, so it must run after the call.
		want := bytes.Compare(
			lower(term(memory[ptr1:][:n])),
			lower(term(memory[ptr2:][:n])))

		// Only the sign is specified; report both values as signed numbers
		// so a negative result is not shown as a huge unsigned one.
		if sign(int32(got)) != want {
			t.Errorf("strncasecmp(%q, %q, %d) = %d, want %d",
				s1, s2, n, int32(got), want)
		}
	})
}
func Fuzz_strspn(f *testing.F) {
for _, t := range searchTests {
f.Add(t.haystk, t.needle)
@@ -1155,129 +792,6 @@ func Fuzz_strcspn(f *testing.F) {
})
}
// Fuzz_memmem fuzzes memmem against strings.Index.
// Seeds come from the shared search table plus cases with embedded NULs.
// Inputs longer than 128 bytes are skipped.
func Fuzz_memmem(f *testing.F) {
	seeds := append(searchTests,
		searchTest{"abcABCabc", "A", 3},
		searchTest{"fofofofofofo\x00foffofoobar", "foffof", 13},
		searchTest{"0000000000000000\x000123456789012345678901234567890", "0123456789012345", 17},
	)
	for _, seed := range seeds {
		f.Add(seed.haystk, seed.needle)
	}

	f.Fuzz(func(t *testing.T, haystk, needle string) {
		if len(haystk) > 128 || len(needle) > 128 {
			t.SkipNow()
		}
		copy(memory[ptr1:], haystk)
		copy(memory[ptr2:], needle)

		got := call(memmem,
			ptr1, uint64(len(haystk)),
			ptr2, uint64(len(needle)))

		// A miss is reported as a null pointer.
		want := 0
		if idx := strings.Index(haystk, needle); idx >= 0 {
			want = ptr1 + idx
		}

		if uint32(got) != uint32(want) {
			t.Errorf("memmem(%q, %q) = %d, want %d",
				haystk, needle, uint32(got), uint32(want))
		}
	})
}
// Fuzz_strstr fuzzes strstr against strings.Index on the NUL-truncated
// haystack and needle. Inputs longer than 128 bytes are skipped.
func Fuzz_strstr(f *testing.F) {
	seeds := append(searchTests,
		searchTest{"abcABCabc", "A", 3},
		searchTest{"fofofofofofo\x00foffofoobar", "foffof", -1},
		searchTest{"0000000000000000\x000123456789012345678901234567890", "0123456789012345", -1},
	)
	for _, seed := range seeds {
		f.Add(seed.haystk, seed.needle)
	}

	f.Fuzz(func(t *testing.T, haystk, needle string) {
		if len(haystk) > 128 || len(needle) > 128 {
			t.SkipNow()
		}
		copy(memory[ptr1:], haystk)
		copy(memory[ptr2:], needle)
		// Memory is not cleared between runs, so terminate explicitly.
		memory[ptr1+len(haystk)] = 0
		memory[ptr2+len(needle)] = 0

		got := call(strstr, uint64(ptr1), uint64(ptr2))

		// A miss is reported as a null pointer.
		want := 0
		if idx := strings.Index(term(haystk), term(needle)); idx >= 0 {
			want = ptr1 + idx
		}

		if uint32(got) != uint32(want) {
			t.Errorf("strstr(%q, %q) = %d, want %d",
				haystk, needle, uint32(got), uint32(want))
		}
	})
}
// Fuzz_strcasestr fuzzes strcasestr against bytes.Index on the
// ASCII-lowercased, NUL-truncated haystack and needle.
// Inputs longer than 128 bytes and empty needles are skipped.
func Fuzz_strcasestr(f *testing.F) {
	seeds := append(searchTests,
		searchTest{"A", "a", 0},
		searchTest{"a", "A", 0},
		searchTest{"Z", "z", 0},
		searchTest{"z", "Z", 0},
		searchTest{"@", "`", -1},
		searchTest{"`", "@", -1},
		searchTest{"[", "{", -1},
		searchTest{"{", "[", -1},
		searchTest{"abcABCabc", "A", 0},
		searchTest{"fofofofofofofoffofoobarfoo", "FoFFoF", 12},
		searchTest{"fofofofofofofOffOfoobarfoo", "FoFFoF", 12},
		searchTest{"fofofofofofo\x00foffofoobar", "foffof", -1},
		searchTest{"0000000000000000\x000123456789012345678901234567890", "0123456789012345", -1},
	)
	for _, seed := range seeds {
		f.Add(seed.haystk, seed.needle)
	}

	f.Fuzz(func(t *testing.T, haystk, needle string) {
		if len(haystk) > 128 || len(needle) > 128 {
			t.SkipNow()
		}
		if len(needle) == 0 {
			t.Skip("musl bug")
		}
		copy(memory[ptr1:], haystk)
		copy(memory[ptr2:], needle)
		// Memory is not cleared between runs, so terminate explicitly.
		memory[ptr1+len(haystk)] = 0
		memory[ptr2+len(needle)] = 0

		got := call(strcasestr, uint64(ptr1), uint64(ptr2))

		// lower rewrites memory in place, so it must run after the call.
		idx := bytes.Index(
			lower(term(memory[ptr1:])),
			lower(term(memory[ptr2:])))

		// A miss is reported as a null pointer.
		want := 0
		if idx >= 0 {
			want = ptr1 + idx
		}

		if uint32(got) != uint32(want) {
			t.Errorf("strcasestr(%q, %q) = %d, want %d",
				haystk, needle, uint32(got), uint32(want))
		}
	})
}
func sign(x int32) int {
switch {
case x > 0:
@@ -1295,15 +809,6 @@ func fill(s []byte, v byte) {
}
}
// lower converts the ASCII upper-case letters of s to lower case.
// It modifies s in place and returns the same slice; all other bytes
// are left untouched.
func lower(s []byte) []byte {
	const caseGap = 'a' - 'A'
	for i := range s {
		if c := s[i]; c >= 'A' && c <= 'Z' {
			s[i] = c + caseGap
		}
	}
	return s
}
func term[T interface{ []byte | string }](s T) T {
for i, c := range []byte(s) {
if c == 0 {

View File

@@ -19,17 +19,17 @@ extern "C" {
// Clang will intrinsify using SIMD for small, constant N.
// For everything else, this helps inlining.
__attribute__((weak))
__attribute__((weak, always_inline))
void *memset(void *dest, int c, size_t n) {
return __builtin_memset(dest, c, n);
}
__attribute__((weak))
__attribute__((weak, always_inline))
void *memcpy(void *__restrict dest, const void *__restrict src, size_t n) {
return __builtin_memcpy(dest, src, n);
}
__attribute__((weak))
__attribute__((weak, always_inline))
void *memmove(void *dest, const void *src, size_t n) {
return __builtin_memmove(dest, src, n);
}
@@ -80,7 +80,7 @@ int memcmp(const void *vl, const void *vr, size_t n) {
return 0;
}
__attribute__((weak))
__attribute__((weak, noinline))
void *memchr(const void *s, int c, size_t n) {
// When n is zero, a function that locates a character finds no occurrence.
// Otherwise, decrement n to ensure sub_overflow overflows
@@ -126,7 +126,7 @@ void *memchr(const void *s, int c, size_t n) {
}
}
__attribute__((weak))
__attribute__((weak, noinline))
void *memrchr(const void *s, int c, size_t n) {
// memrchr is allowed to read up to n bytes from the object.
// Search backward for the last matching character.
@@ -150,7 +150,7 @@ void *memrchr(const void *s, int c, size_t n) {
return NULL;
}
__attribute__((weak))
__attribute__((weak, noinline))
size_t strlen(const char *s) {
// strlen must stop as soon as it finds the terminator.
// Aligning ensures loads beyond the terminator are safe.
@@ -180,93 +180,6 @@ size_t strlen(const char *s) {
}
}
// Scalar strcmp: walks both strings one byte at a time.
// Bytes are compared as unsigned char; the result is the difference of
// the first pair of bytes that differ, or 0 if the strings are equal.
static int __strcmp_s(const char *s1, const char *s2) {
  const unsigned char *a = (const unsigned char *)s1;
  const unsigned char *b = (const unsigned char *)s2;
  // Advance while the bytes agree and the terminator has not been reached.
  while (*a == *b && *a != 0) {
    a++;
    b++;
  }
  // Equal strings stop on the shared terminator, which yields 0.
  return *a - *b;
}
// Vector strcmp: compares s1 and s2 one 16-byte block at a time, then
// lets the scalar __strcmp_s finish from the first block that differs
// (or from wherever full-width loads stop being safe).
// Returns 0 for equal strings, otherwise whatever __strcmp_s returns.
static int __strcmp(const char *s1, const char *s2) {
  // How many bytes can be read before pointers go out of bounds.
  // Measured from the higher of the two pointers, so it bounds both.
  size_t N = __builtin_wasm_memory_size(0) * PAGESIZE - //
             (size_t)(s1 > s2 ? s1 : s2);

  // Unaligned loads handle the case where the strings
  // have mismatching alignments.
  const v128_t *w1 = (v128_t *)s1;
  const v128_t *w2 = (v128_t *)s2;
  for (; N >= sizeof(v128_t); N -= sizeof(v128_t)) {
    // Find any single bit difference.
    if (wasm_v128_any_true(wasm_v128_load(w1) ^ wasm_v128_load(w2))) {
      // The terminator may come before the difference.
      // The scalar loop below sorts out which comes first.
      break;
    }
    // We know all characters are equal.
    // If any is a terminator the strings are equal.
    if (!wasm_i8x16_all_true(wasm_v128_load(w1))) {
      return 0;
    }
    w1++;
    w2++;
  }

  // Scalar comparison of the remainder.
  return __strcmp_s((char *)w1, (char *)w2);
}
// strcmp entry point. Always inlined so that the check below can be
// resolved at each call site: when the compiler can prove s2 is a string
// shorter than one vector, the scalar loop is used directly; every other
// call goes through the vector implementation.
__attribute__((weak, always_inline))
int strcmp(const char *s1, const char *s2) {
  // Skip the vector search when comparing against small literal strings.
  if (__builtin_constant_p(strlen(s2)) && strlen(s2) < sizeof(v128_t)) {
    return __strcmp_s(s1, s2);
  }
  return __strcmp(s1, s2);
}
// strncmp: compares at most n bytes of s1 and s2.
// Works in 16-byte blocks while at least 16 bytes of the budget remain,
// then finishes byte by byte. Returns 0 if the compared prefixes are
// equal, otherwise the difference of the first differing bytes taken
// as unsigned char.
__attribute__((weak))
int strncmp(const char *s1, const char *s2, size_t n) {
  // How many bytes can be read before pointers go out of bounds.
  size_t N = __builtin_wasm_memory_size(0) * PAGESIZE - //
             (size_t)(s1 > s2 ? s1 : s2);
  // Clamp the budget so neither loop below reads past the end of memory.
  if (n > N) n = N;

  // Unaligned loads handle the case where the strings
  // have mismatching alignments.
  const v128_t *w1 = (v128_t *)s1;
  const v128_t *w2 = (v128_t *)s2;
  for (; n >= sizeof(v128_t); n -= sizeof(v128_t)) {
    // Find any single bit difference.
    if (wasm_v128_any_true(wasm_v128_load(w1) ^ wasm_v128_load(w2))) {
      // The terminator may come before the difference.
      break;
    }
    // We know all characters are equal.
    // If any is a terminator the strings are equal.
    if (!wasm_i8x16_all_true(wasm_v128_load(w1))) {
      return 0;
    }
    w1++;
    w2++;
  }

  // Scalar algorithm.
  // Handles the block that differed, or the final partial block;
  // n is whatever remains of the budget at this point.
  const unsigned char *u1 = (unsigned char *)w1;
  const unsigned char *u2 = (unsigned char *)w2;
  while (n--) {
    if (*u1 != *u2) return *u1 - *u2;
    if (*u1 == 0) break;
    u1++;
    u2++;
  }
  return 0;
}
static char *__strchrnul(const char *s, int c) {
// strchrnul must stop as soon as it finds the terminator.
// Aligning ensures loads beyond the terminator are safe.
@@ -371,7 +284,7 @@ static v128_t __wasm_v128_chkbits(__wasm_v128_bitmap256_t bitmap, v128_t v) {
#undef wasm_i8x16_relaxed_swizzle
__attribute__((weak))
__attribute__((weak, noinline))
size_t strspn(const char *s, const char *c) {
// strspn must stop as soon as it finds the terminator.
// Aligning ensures loads beyond the terminator are safe.
@@ -433,7 +346,7 @@ size_t strspn(const char *s, const char *c) {
}
}
__attribute__((weak))
__attribute__((weak, noinline))
size_t strcspn(const char *s, const char *c) {
if (!c[0] || !c[1]) return __strchrnul(s, *c) - s;
@@ -472,215 +385,6 @@ size_t strcspn(const char *s, const char *c) {
}
}
// SIMD-friendly algorithms for substring searching
// http://0x80.pl/notesen/2016-11-28-simd-strfind.html
// For haystacks of known length and large enough needles,
// Boyer-Moore's bad-character rule may be useful,
// as proposed by Horspool, Sunday and Raita.
//
// We augment the SIMD algorithm with Quick Search's
// bad-character shift.
//
// https://igm.univ-mlv.fr/~lecroq/string/node14.html
// https://igm.univ-mlv.fr/~lecroq/string/node18.html
// https://igm.univ-mlv.fr/~lecroq/string/node19.html
// https://igm.univ-mlv.fr/~lecroq/string/node22.html
// Shared substring search used by memmem and strstr.
//
//   haystk, sh  - haystack and its length; sh == SIZE_MAX means the
//                 haystack is NUL-terminated and its length is unknown.
//   needle, sn  - needle and its length (at least 2, at most sh).
//   bmbc        - optional bad-character shift table; may be NULL.
//                 Only consulted when sh is a real length.
//
// Returns a pointer to the match found, or NULL if there is none.
static const char *__memmem(const char *haystk, size_t sh, //
                            const char *needle, size_t sn, //
                            uint8_t bmbc[256]) {
  // We've handled empty and single character needles.
  // The needle is not longer than the haystack.
  __builtin_assume(2 <= sn && sn <= sh);

  // Find the farthest character not equal to the first one.
  // If every character equals the first, fall back to the last one.
  size_t i = sn - 1;
  while (i > 0 && needle[0] == needle[i]) i--;
  if (i == 0) i = sn - 1;

  // Subtracting ensures sub_overflow overflows
  // when we reach the end of the haystack.
  // From here on, sh is the largest offset at which a match can start.
  if (sh != SIZE_MAX) sh -= sn;

  // Each candidate position is filtered on two needle characters:
  // the first one, and the one at index i.
  const v128_t fst = wasm_i8x16_splat(needle[0]);
  const v128_t lst = wasm_i8x16_splat(needle[i]);

  // The last haystack offset for which loading blk_lst is safe.
  const char *H = (char *)(__builtin_wasm_memory_size(0) * PAGESIZE - //
                           (sizeof(v128_t) + i));

  while (haystk <= H) {
    const v128_t blk_fst = wasm_v128_load((v128_t *)(haystk));
    const v128_t blk_lst = wasm_v128_load((v128_t *)(haystk + i));
    const v128_t eq_fst = wasm_i8x16_eq(fst, blk_fst);
    const v128_t eq_lst = wasm_i8x16_eq(lst, blk_lst);

    // Lanes where both filter characters match are candidates.
    const v128_t cmp = eq_fst & eq_lst;
    if (wasm_v128_any_true(cmp)) {
      // The terminator may come before the match.
      // Leave the decision to the scalar loop below.
      if (sh == SIZE_MAX && !wasm_i8x16_all_true(blk_fst)) break;

      // Find the offset of the first one bit (little-endian).
      // Each iteration clears that bit, tries again.
      for (uint32_t mask = wasm_i8x16_bitmask(cmp); mask; mask &= mask - 1) {
        size_t ctz = __builtin_ctz(mask);
        // The match may be after the end of the haystack.
        if (ctz > sh) return NULL;
        // We know the first character matches.
        if (!bcmp(haystk + ctz + 1, needle + 1, sn - 1)) {
          return haystk + ctz;
        }
      }
    }

    size_t skip = sizeof(v128_t);
    if (sh == SIZE_MAX) {
      // Have we reached the end of the haystack?
      if (!wasm_i8x16_all_true(blk_fst)) return NULL;
    } else {
      // Apply the bad-character rule to the character to the right
      // of the rightmost character of the search window.
      // NOTE(review): this byte is read before the end-of-haystack check
      // below - confirm it can never lie outside linear memory.
      if (bmbc) skip += bmbc[(unsigned char)haystk[sn - 1 + sizeof(v128_t)]];
      // Have we reached the end of the haystack?
      if (__builtin_sub_overflow(sh, skip, &sh)) return NULL;
    }
    haystk += skip;
  }

  // Scalar algorithm.
  // Brute-force search over the remaining start offsets; when sh is
  // SIZE_MAX the loop ends on the haystack's terminator instead.
  for (size_t j = 0; j <= sh; j++) {
    for (size_t i = 0;; i++) {
      if (sn == i) return haystk;
      if (sh == SIZE_MAX && !haystk[i]) return NULL;
      if (needle[i] != haystk[i]) break;
    }
    haystk++;
  }
  return NULL;
}
// memmem: finds the sn-byte needle vn inside the sh-byte haystack vh.
// Returns a pointer to the match, vh itself for an empty needle,
// or NULL when there is no match.
__attribute__((weak))
void *memmem(const void *vh, size_t sh, const void *vn, size_t sn) {
  // Return immediately on empty needle.
  if (sn == 0) return (void *)vh;

  // Return immediately when needle is longer than haystack.
  if (sn > sh) return NULL;

  // Skip to the first matching character using memchr,
  // thereby handling single character needles.
  const char *needle = (char *)vn;
  const char *haystk = (char *)memchr(vh, *needle, sh);
  if (!haystk || sn == 1) return (void *)haystk;

  // The haystack got shorter, is the needle now longer than it?
  sh -= haystk - (char *)vh;
  if (sn > sh) return NULL;

  // Is Boyer-Moore's bad-character rule useful?
  // Not for needles shorter than one vector, nor when fewer than one
  // vector's worth of start positions remain: search without a table.
  if (sn < sizeof(v128_t) || sh - sn < sizeof(v128_t)) {
    return (void *)__memmem(haystk, sh, needle, sn, NULL);
  }

  // Compute Boyer-Moore's bad-character shift function.
  // Only the last 255 characters of the needle matter for shifts up to 255,
  // which is good enough for most needles.
  // c is the default shift (capped at 255 so it fits a byte);
  // i is the index of the first needle character that gets its own entry.
  size_t c = sn;
  size_t i = 0;
  if (c >= 255) {
    i = sn - 255;
    c = 255;
  }

  // The table has static storage unless _REENTRANT is defined,
  // in which case each call gets its own copy.
#ifndef _REENTRANT
  static
#endif
  uint8_t bmbc[256];
  memset(bmbc, c, sizeof(bmbc));
  for (; i < sn; i++) {
    // One less than the usual offset because
    // we advance at least one vector at a time.
    bmbc[(unsigned char)needle[i]] = sn - i - 1;
  }

  return (void *)__memmem(haystk, sh, needle, sn, bmbc);
}
// strstr: finds the first occurrence of needle in the NUL-terminated
// haystack. Returns haystk for an empty needle, NULL when not found.
__attribute__((weak))
char *strstr(const char *haystk, const char *needle) {
  // An empty needle matches at the very start.
  if (needle[0] == '\0') return (char *)haystk;

  // Jump to the first occurrence of the needle's first character.
  const char *first = strchr(haystk, needle[0]);
  if (first == NULL) return NULL;

  // A single character needle has already been matched by strchr.
  if (needle[1] == '\0') return (char *)first;

  // SIZE_MAX tells __memmem to stop on the haystack's terminator.
  return (char *)__memmem(first, SIZE_MAX, needle, strlen(needle), NULL);
}
// strcasestr: case-insensitive strstr.
// Candidate positions are filtered in 16-byte blocks on two needle
// characters (each tried in both cases) and confirmed with strncasecmp;
// a scalar search takes over where full-width loads stop being safe.
// Returns haystk for an empty needle, NULL when there is no match.
__attribute__((weak))
char *strcasestr(const char *haystk, const char *needle) {
  // Return immediately on empty needle.
  if (!needle[0]) return (char *)haystk;

  // We've handled empty needles.
  size_t sn = strlen(needle);
  __builtin_assume(sn >= 1);

  // Find the farthest character not equal to the first one.
  // If every character equals the first, fall back to the last one.
  size_t i = sn - 1;
  while (i > 0 && needle[0] == needle[i]) i--;
  if (i == 0) i = sn - 1;

  // Both cases of the two filter characters.
  const v128_t fstl = wasm_i8x16_splat(tolower(needle[0]));
  const v128_t fstu = wasm_i8x16_splat(toupper(needle[0]));
  const v128_t lstl = wasm_i8x16_splat(tolower(needle[i]));
  const v128_t lstu = wasm_i8x16_splat(toupper(needle[i]));

  // The last haystk offset for which loading blk_lst is safe.
  const char *H = (char *)(__builtin_wasm_memory_size(0) * PAGESIZE - //
                           (sizeof(v128_t) + i));

  while (haystk <= H) {
    const v128_t blk_fst = wasm_v128_load((v128_t *)(haystk));
    const v128_t blk_lst = wasm_v128_load((v128_t *)(haystk + i));
    const v128_t eq_fst =
        wasm_i8x16_eq(fstl, blk_fst) | wasm_i8x16_eq(fstu, blk_fst);
    const v128_t eq_lst =
        wasm_i8x16_eq(lstl, blk_lst) | wasm_i8x16_eq(lstu, blk_lst);

    // Lanes where both filter characters match are candidates.
    const v128_t cmp = eq_fst & eq_lst;
    if (wasm_v128_any_true(cmp)) {
      // The terminator may come before the match.
      // Leave the decision to the scalar loop below.
      if (!wasm_i8x16_all_true(blk_fst)) break;

      // Find the offset of the first one bit (little-endian).
      // Each iteration clears that bit, tries again.
      for (uint32_t mask = wasm_i8x16_bitmask(cmp); mask; mask &= mask - 1) {
        size_t ctz = __builtin_ctz(mask);
        // The first character is already known to match.
        if (!strncasecmp(haystk + ctz + 1, needle + 1, sn - 1)) {
          return (char *)haystk + ctz;
        }
      }
    }

    // Have we reached the end of the haystack?
    if (!wasm_i8x16_all_true(blk_fst)) return NULL;
    haystk += sizeof(v128_t);
  }

  // Scalar algorithm.
  // Brute-force search; ends on a match or on the haystack's terminator.
  for (;;) {
    for (size_t i = 0;; i++) {
      if (sn == i) return (char *)haystk;
      if (!haystk[i]) return NULL;
      if (tolower(needle[i]) != tolower(haystk[i])) break;
    }
    haystk++;
  }
  return NULL;
}
// Given the above SIMD implementations,
// these are best implemented as
// small wrappers over those functions.

View File

@@ -1,172 +0,0 @@
#include_next <strings.h> // the system strings.h
#ifndef _WASM_SIMD128_STRINGS_H
#define _WASM_SIMD128_STRINGS_H
#include <ctype.h>
#include <stdint.h>
#include <wasm_simd128.h>
#include <__macro_PAGESIZE.h>
#ifdef __cplusplus
extern "C" {
#endif
#ifdef __wasm_simd128__
#ifdef __OPTIMIZE_SIZE__
// bcmp is the same as memcmp but only compares for equality.
int bcmp(const void *v1, const void *v2, size_t n);
#else // __OPTIMIZE_SIZE__
// bcmp: reports whether the first n bytes of v1 and v2 differ.
// Returns 0 when they are equal and 1 otherwise; unlike memcmp,
// the result carries no ordering information.
__attribute__((weak))
int bcmp(const void *v1, const void *v2, size_t n) {
  // Scalar algorithm.
  // Used when the objects are too short for a single vector load.
  if (n < sizeof(v128_t)) {
    const unsigned char *u1 = (unsigned char *)v1;
    const unsigned char *u2 = (unsigned char *)v2;
    while (n--) {
      if (*u1 != *u2) return 1;
      u1++;
      u2++;
    }
    return 0;
  }

  // bcmp is allowed to read up to n bytes from each object.
  // Unaligned loads handle the case where the objects
  // have mismatching alignments.
  const v128_t *w1 = (v128_t *)v1;
  const v128_t *w2 = (v128_t *)v2;
  while (n) {
    // Find any single bit difference.
    if (wasm_v128_any_true(wasm_v128_load(w1) ^ wasm_v128_load(w2))) {
      return 1;
    }
    // This makes n a multiple of sizeof(v128_t)
    // for every iteration except the first.
    // The first step may be shorter than a vector, so the second load can
    // overlap the first; the last load then ends exactly at byte n.
    size_t align = (n - 1) % sizeof(v128_t) + 1;
    w1 = (v128_t *)((char *)w1 + align);
    w2 = (v128_t *)((char *)w2 + align);
    n -= align;
  }
  return 0;
}
#endif // __OPTIMIZE_SIZE__
// Lower-cases sixteen bytes at once: lanes holding 'A'..'Z' get the
// ('a' - 'A') bit set, every other lane is returned unchanged.
// Relies on the lanes of __i8x16 being signed 8-bit integers with
// wrapping addition, as the comparison against INT8_MAX implies.
__attribute__((always_inline))
static v128_t __tolower8x16(v128_t v) {
  __i8x16 i = v;
  // Shift the range so that 'Z' lands on INT8_MAX:
  // 'A'..'Z' become the 26 largest values, anything above 'Z' wraps around.
  i = i + wasm_i8x16_splat(INT8_MAX - ('Z'));
  // True only in the lanes that held an upper-case letter.
  i = i > wasm_i8x16_splat(INT8_MAX - ('Z' - 'A' + 1));
  // Keep just the case bit in those lanes.
  i = i & wasm_i8x16_splat('a' - 'A');
  return v | i;
}
// Scalar strcasecmp: walks both strings one byte at a time, comparing
// the tolower() of each byte taken as unsigned char. Returns the
// difference of the first pair of folded bytes that differ, or 0 if the
// strings are equal ignoring case.
static int __strcasecmp_s(const char *s1, const char *s2) {
  const unsigned char *a = (const unsigned char *)s1;
  const unsigned char *b = (const unsigned char *)s2;
  int ca, cb;
  // Advance while the folded bytes agree and are not the terminator.
  do {
    ca = tolower(*a++);
    cb = tolower(*b++);
  } while (ca == cb && ca != 0);
  // Equal strings stop on the shared terminator, which yields 0.
  return ca - cb;
}
// Vector strcasecmp: lower-cases and compares s1 and s2 one 16-byte
// block at a time, then lets the scalar __strcasecmp_s finish from the
// first block that differs (or from wherever full-width loads stop
// being safe). Returns 0 for strings equal ignoring case.
static int __strcasecmp(const char *s1, const char *s2) {
  // How many bytes can be read before pointers go out of bounds.
  // Measured from the higher of the two pointers, so it bounds both.
  size_t N = __builtin_wasm_memory_size(0) * PAGESIZE - //
             (size_t)(s1 > s2 ? s1 : s2);

  // Unaligned loads handle the case where the strings
  // have mismatching alignments.
  const v128_t *w1 = (v128_t *)s1;
  const v128_t *w2 = (v128_t *)s2;
  for (; N >= sizeof(v128_t); N -= sizeof(v128_t)) {
    v128_t v1 = __tolower8x16(wasm_v128_load(w1));
    v128_t v2 = __tolower8x16(wasm_v128_load(w2));

    // Find any single bit difference.
    if (wasm_v128_any_true(v1 ^ v2)) {
      // The terminator may come before the difference.
      // The scalar loop below sorts out which comes first.
      break;
    }
    // We know all characters are equal.
    // If any is a terminator the strings are equal.
    if (!wasm_i8x16_all_true(v1)) {
      return 0;
    }
    w1++;
    w2++;
  }

  // Scalar comparison of the remainder.
  return __strcasecmp_s((char *)w1, (char *)w2);
}
// strcasecmp entry point: when the compiler can prove s2 is a string
// shorter than one vector, the scalar loop is used directly; every other
// call goes through the vector implementation.
__attribute__((weak))
int strcasecmp(const char *s1, const char *s2) {
  // Skip the vector search when comparing against small literal strings.
  if (__builtin_constant_p(strlen(s2)) && strlen(s2) < sizeof(v128_t)) {
    return __strcasecmp_s(s1, s2);
  }
  return __strcasecmp(s1, s2);
}
// strncasecmp: compares at most n bytes of s1 and s2, ignoring case.
// Works in 16-byte blocks while at least 16 bytes of the budget remain,
// then finishes byte by byte. Returns 0 if the compared prefixes are
// equal ignoring case, otherwise the difference of the first differing
// bytes after tolower().
__attribute__((weak))
int strncasecmp(const char *s1, const char *s2, size_t n) {
  // How many bytes can be read before pointers go out of bounds.
  size_t N = __builtin_wasm_memory_size(0) * PAGESIZE - //
             (size_t)(s1 > s2 ? s1 : s2);
  // Clamp the budget so neither loop below reads past the end of memory.
  if (n > N) n = N;

  // Unaligned loads handle the case where the strings
  // have mismatching alignments.
  const v128_t *w1 = (v128_t *)s1;
  const v128_t *w2 = (v128_t *)s2;
  for (; n >= sizeof(v128_t); n -= sizeof(v128_t)) {
    v128_t v1 = __tolower8x16(wasm_v128_load(w1));
    v128_t v2 = __tolower8x16(wasm_v128_load(w2));

    // Find any single bit difference.
    if (wasm_v128_any_true(v1 ^ v2)) {
      // The terminator may come before the difference.
      break;
    }
    // We know all characters are equal.
    // If any is a terminator the strings are equal.
    if (!wasm_i8x16_all_true(v1)) {
      return 0;
    }
    w1++;
    w2++;
  }

  // Scalar algorithm.
  // Handles the block that differed, or the final partial block;
  // n is whatever remains of the budget at this point.
  const unsigned char *u1 = (unsigned char *)w1;
  const unsigned char *u2 = (unsigned char *)w2;
  while (n--) {
    int c1 = tolower(*u1);
    int c2 = tolower(*u2);
    if (c1 != c2) return c1 - c2;
    if (c1 == 0) break;
    u1++;
    u2++;
  }
  return 0;
}
#endif // __wasm_simd128__
#ifdef __cplusplus
} // extern "C"
#endif
#endif // _WASM_SIMD128_STRINGS_H

View File

@@ -1,41 +0,0 @@
# Use strcasecmp and strncasecmp.
--- sqlite3.c.orig
+++ sqlite3.c
@@ -35685,35 +35685,15 @@
return sqlite3StrICmp(zLeft, zRight);
}
SQLITE_PRIVATE int sqlite3StrICmp(const char *zLeft, const char *zRight){
- unsigned char *a, *b;
- int c, x;
- a = (unsigned char *)zLeft;
- b = (unsigned char *)zRight;
- for(;;){
- c = *a;
- x = *b;
- if( c==x ){
- if( c==0 ) break;
- }else{
- c = (int)UpperToLower[c] - (int)UpperToLower[x];
- if( c ) break;
- }
- a++;
- b++;
- }
- return c;
+ return strcasecmp(zLeft, zRight);
}
SQLITE_API int sqlite3_strnicmp(const char *zLeft, const char *zRight, int N){
- register unsigned char *a, *b;
if( zLeft==0 ){
return zRight ? -1 : 0;
}else if( zRight==0 ){
return 1;
}
- a = (unsigned char *)zLeft;
- b = (unsigned char *)zRight;
- while( N-- > 0 && *a!=0 && UpperToLower[*a]==UpperToLower[*b]){ a++; b++; }
- return N<0 ? 0 : UpperToLower[*a] - UpperToLower[*b];
+ return strncasecmp(zLeft, zRight, N);
}
/*

Binary file not shown.