Fix memchr.

This commit is contained in:
Nuno Cruces
2025-04-22 01:19:59 +01:00
parent 48379336dc
commit bb87a920f7
5 changed files with 130 additions and 74 deletions

View File

@@ -10,10 +10,13 @@ SRCS="${1:-libc.c}"
"../tools.sh"
trap 'rm -f libc.c libc.tmp' EXIT
echo '#include <string.h>' > libc.c
echo '#include <stdlib.h>' >> libc.c
cat << EOF > libc.c
#include <string.h>
#include <stdlib.h>
EOF
"$WASI_SDK/clang" --target=wasm32-wasi -std=c23 -g0 -O2 \
-Wall -Wextra -Wno-unused-parameter -Wno-unused-function \
-o libc.wasm -I. "$SRCS" \
-mexec-model=reactor \
-msimd128 -mmutable-globals -mmultivalue \

Binary file not shown.

View File

@@ -142,42 +142,53 @@
(local $3 i32)
(local $4 i32)
(local $5 i32)
(local $6 v128)
(local $6 i32)
(local $7 v128)
(local.set $4
(i32.and
(local.get $0)
(i32.const 15)
(local $8 v128)
(local $scratch i32)
(block $block
(br_if $block
(i32.eqz
(local.get $2)
)
)
)
(block $block1
(block $block
(if
(v128.any_true
(local.tee $6
(i8x16.eq
(v128.load
(local.tee $3
(i32.and
(local.get $0)
(i32.const -16)
)
)
)
(local.set $4
(i32.and
(local.get $0)
(i32.const 15)
)
)
(block $block2
(block $block1
(br_if $block1
(i32.eqz
(v128.any_true
(local.tee $7
(i8x16.splat
(local.get $1)
(i8x16.eq
(v128.load
(local.tee $3
(i32.and
(local.get $0)
(i32.const -16)
)
)
)
(local.tee $8
(i8x16.splat
(local.get $1)
)
)
)
)
)
)
)
(then
(br_if $block
(local.tee $1
(br_if $block1
(i32.eqz
(local.tee $5
(i32.and
(i8x16.bitmask
(local.get $6)
(local.get $7)
)
(i32.shl
(i32.const -1)
@@ -187,44 +198,63 @@
)
)
)
(local.set $1
(local.get $2)
)
(br $block2)
)
(br_if $block1
(i32.gt_u
(br_if $block
(i32.lt_u
(local.get $2)
(local.tee $1
(i32.sub
(i32.add
(local.get $2)
(local.get $4)
(local.get $2)
(local.tee $3
(i32.sub
(i32.const 16)
(local.get $4)
)
)
(i32.const 16)
)
)
(local.get $2)
)
)
(br_if $block
(i32.eqz
(local.get $1)
)
)
(local.set $3
(i32.add
(i32.sub
(local.get $0)
(local.get $4)
)
(i32.const 16)
(local.get $0)
(local.get $3)
)
)
(block $block2
(block $block3
(loop $label
(br_if $block2
(br_if $block3
(v128.any_true
(local.tee $6
(local.tee $7
(i8x16.eq
(v128.load
(local.get $3)
)
(local.get $7)
(local.get $8)
)
)
)
)
(br_if $block
(i32.gt_u
(local.tee $0
(i32.sub
(local.get $1)
(i32.const 16)
)
)
(local.get $1)
)
)
(local.set $3
(i32.add
(local.get $3)
@@ -232,35 +262,49 @@
)
)
(br_if $label
(i32.ge_u
(local.get $1)
(local.tee $1
(i32.sub
(local.get $1)
(i32.const 16)
(i32.eqz
(block (result i32)
(local.set $scratch
(i32.eq
(local.get $1)
(i32.const 16)
)
)
(local.set $1
(local.get $0)
)
(local.get $scratch)
)
)
)
)
(br $block1)
(br $block)
)
(local.set $1
(local.set $5
(i8x16.bitmask
(local.get $6)
(local.get $7)
)
)
)
(local.set $5
(i32.add
(local.get $3)
(i32.ctz
(local.set $6
(select
(i32.add
(local.get $3)
(local.tee $0
(i32.ctz
(local.get $5)
)
)
)
(i32.const 0)
(i32.lt_u
(local.get $0)
(local.get $1)
)
)
)
)
(local.get $5)
(local.get $6)
)
(func $strlen (param $0 i32) (result i32)
(local $1 i32)

View File

@@ -116,6 +116,9 @@ func Benchmark_memchr(b *testing.B) {
if got := call(memchr, ptr1, 5, size); got != ptr1+size/2 {
b.Fatal(got)
}
if got := call(memchr, ptr1, 5, size/2); got != 0 {
b.Fatal(got, ptr1+size/2)
}
}
func Benchmark_memcmp(b *testing.B) {

View File

@@ -45,7 +45,7 @@ void *memmove(void *dest, const void *src, size_t n) {
// aligned address less than memory size.
//
// These also assume unaligned access is not painfully slow,
// but that bitmask extraction is slow on AArch64.
// but that bitmask extraction is really slow on AArch64.
__attribute__((weak))
int memcmp(const void *v1, const void *v2, size_t n) {
@@ -75,13 +75,14 @@ void *memchr(const void *v, int c, size_t n) {
const v128_t *w = (void *)(v - align);
const v128_t wc = wasm_i8x16_splat(c);
while (true) {
while (n) {
const v128_t cmp = wasm_i8x16_eq(*w, wc);
if (wasm_v128_any_true(cmp)) {
int mask = wasm_i8x16_bitmask(cmp) >> align << align;
__builtin_assume(mask || align);
if (mask) {
return (void *)w + __builtin_ctz(mask);
size_t ctz = __builtin_ctz(mask);
return ctz < n ? (void *)w + ctz : NULL;
}
}
if (__builtin_sub_overflow(n, sizeof(v128_t) - align, &n)) {
@@ -90,6 +91,7 @@ void *memchr(const void *v, int c, size_t n) {
align = 0;
w++;
}
return NULL;
}
__attribute__((weak))
@@ -111,12 +113,7 @@ size_t strlen(const char *s) {
}
}
__attribute__((weak))
int strcmp(const char *s1, const char *s2) {
if (__builtin_constant_p(__builtin_strlen(s2))) {
return strncmp(s1, s2, __builtin_strlen(s2));
}
static int __strcmp(const char *s1, const char *s2) {
const v128_t *const limit =
(v128_t *)(__builtin_wasm_memory_size(0) * PAGESIZE) - 1;
@@ -144,6 +141,14 @@ int strcmp(const char *s1, const char *s2) {
return 0;
}
// strcmp entry point.
//
// When the compiler can fold strlen(s2) to a constant (for example when
// s2 is a string literal), the call is routed to the bounded strncmp;
// otherwise it goes to __strcmp.
//
// The bound passed to strncmp is strlen(s2) + 1 so that the terminating
// NUL of s2 takes part in the comparison. With a bound of strlen(s2)
// only a prefix of s1 is compared, and strcmp("abcd", "abc") would
// wrongly report equality.
__attribute__((weak, always_inline))
int strcmp(const char *s1, const char *s2) {
  if (__builtin_constant_p(strlen(s2))) {
    return strncmp(s1, s2, strlen(s2) + 1);
  }
  return __strcmp(s1, s2);
}
__attribute__((weak))
int strncmp(const char *s1, const char *s2, size_t n) {
const v128_t *const limit =
@@ -173,12 +178,7 @@ int strncmp(const char *s1, const char *s2, size_t n) {
return 0;
}
__attribute__((always_inline))
static char *__strchrnul(const char *s, int c) {
if (__builtin_constant_p(c) && (char)c == 0) {
return (char *)s + strlen(s);
}
uintptr_t align = (uintptr_t)s % sizeof(v128_t);
const v128_t *w = (void *)(s - align);
const v128_t wc = wasm_i8x16_splat(c);
@@ -197,13 +197,19 @@ static char *__strchrnul(const char *s, int c) {
}
}
__attribute__((weak))
// strchrnul entry point.
//
// If c is a compile-time constant equal to the NUL character, the result
// is s + strlen(s); every other case is delegated to __strchrnul.
__attribute__((weak, always_inline))
char *strchrnul(const char *s, int c) {
  return (__builtin_constant_p(c) && (char)c == 0)
             ? (char *)s + strlen(s)
             : __strchrnul(s, c);
}
__attribute__((weak))
// strchr entry point.
//
// Unless c is a compile-time constant NUL, the search is delegated to
// __strchrnul and its result is returned only when the byte it points at
// equals (char)c; otherwise NULL is returned. For a constant NUL the
// result is s + strlen(s).
__attribute__((weak, always_inline))
char *strchr(const char *s, int c) {
  if (!__builtin_constant_p(c) || (char)c != 0) {
    char *hit = __strchrnul(s, c);
    if (*hit == (char)c) {
      return hit;
    }
    return NULL;
  }
  return (char *)s + strlen(s);
}