commit d748d98e39 (parent 13b8642384)
Author: Nuno Cruces
Date: 2025-05-01 12:43:24 +01:00
4 changed files with 233 additions and 152 deletions

Binary file not shown.

View File

@@ -36,103 +36,130 @@
  (local.get $0)
 )
 (func $memcmp (param $0 i32) (param $1 i32) (param $2 i32) (result i32)
-  (local $3 i32)
-  (local $4 i32)
-  (block $block1
-   (block $block
-    (br_if $block
-     (i32.lt_u
-      (local.get $2)
-      (i32.const 16)
-     )
-    )
-    (loop $label
-     (br_if $block
-      (v128.any_true
-       (v128.xor
-        (v128.load align=1
-         (local.get $1)
-        )
-        (v128.load align=1
-         (local.get $0)
-        )
-       )
-      )
-     )
-     (local.set $1
-      (i32.add
-       (local.get $1)
-       (i32.const 16)
-      )
-     )
-     (local.set $0
-      (i32.add
-       (local.get $0)
-       (i32.const 16)
-      )
-     )
-     (br_if $label
-      (i32.gt_u
-       (local.tee $2
-        (i32.sub
-         (local.get $2)
-         (i32.const 16)
-        )
-       )
-       (i32.const 15)
-      )
-     )
-    )
-   )
-   (br_if $block1
-    (i32.eqz
-     (local.get $2)
-    )
-   )
-   (loop $label1
-    (if
-     (i32.ne
-      (local.tee $3
-       (i32.load8_u
-        (local.get $0)
-       )
-      )
-      (local.tee $4
-       (i32.load8_u
-        (local.get $1)
-       )
-      )
-     )
-     (then
-      (return
-       (i32.sub
-        (local.get $3)
-        (local.get $4)
-       )
-      )
-     )
-    )
-    (local.set $1
-     (i32.add
-      (local.get $1)
-      (i32.const 1)
-     )
-    )
-    (local.set $0
-     (i32.add
-      (local.get $0)
-      (i32.const 1)
-     )
-    )
-    (br_if $label1
-     (local.tee $2
-      (i32.sub
-       (local.get $2)
-       (i32.const 1)
-      )
-     )
-    )
-   )
-  )
+  (local $3 v128)
+  (local $4 i32)
+  (local $5 i32)
+  (block $block
+   (if
+    (i32.ge_u
+     (local.get $2)
+     (i32.const 16)
+    )
+    (then
+     (loop $label
+      (if
+       (i8x16.all_true
+        (local.tee $3
+         (i8x16.eq
+          (v128.load align=1
+           (local.get $0)
+          )
+          (v128.load align=1
+           (local.get $1)
+          )
+         )
+        )
+       )
+       (then
+        (local.set $1
+         (i32.add
+          (local.get $1)
+          (i32.const 16)
+         )
+        )
+        (local.set $0
+         (i32.add
+          (local.get $0)
+          (i32.const 16)
+         )
+        )
+        (br_if $label
+         (i32.gt_u
+          (local.tee $2
+           (i32.sub
+            (local.get $2)
+            (i32.const 16)
+           )
+          )
+          (i32.const 15)
+         )
+        )
+        (br $block)
+       )
+      )
+      (return
+       (i32.sub
+        (i32.load8_u
+         (i32.add
+          (local.get $0)
+          (local.tee $2
+           (i32.ctz
+            (i32.xor
+             (i8x16.bitmask
+              (local.get $3)
+             )
+             (i32.const -1)
+            )
+           )
+          )
+         )
+        )
+        (i32.load8_u
+         (i32.add
+          (local.get $1)
+          (local.get $2)
+         )
+        )
+       )
+      )
+     )
+    )
+   )
+  )
+  (if
+   (local.get $2)
+   (then
+    (loop $label1
+     (if
+      (i32.ne
+       (local.tee $4
+        (i32.load8_u
+         (local.get $0)
+        )
+       )
+       (local.tee $5
+        (i32.load8_u
+         (local.get $1)
+        )
+       )
+      )
+      (then
+       (return
+        (i32.sub
+         (local.get $4)
+         (local.get $5)
+        )
+       )
+      )
+     )
+     (local.set $1
+      (i32.add
+       (local.get $1)
+       (i32.const 1)
+      )
+     )
+     (local.set $0
+      (i32.add
+       (local.get $0)
+       (i32.const 1)
+      )
+     )
+     (br_if $label1
+      (local.tee $2
+       (i32.sub
+        (local.get $2)
+        (i32.const 1)
+       )
+      )
+     )
+    )
+   )
+  )
@@ -886,7 +913,9 @@
 (func $strspn (param $0 i32) (param $1 i32) (result i32)
   (local $2 i32)
   (local $3 i32)
-  (local $4 v128)
+  (local $4 i32)
+  (local $5 v128)
+  (local $6 v128)
   (local $scratch i32)
   (if
    (i32.eqz
@@ -902,7 +931,7 @@
     )
    )
   )
-  (block $block1
+  (block $block
    (if
     (i32.eqz
      (i32.load8_u offset=1
@@ -910,50 +939,75 @@
      )
     )
     (then
-     (block $block
-      (br_if $block
-       (i32.gt_u
-        (local.tee $1
-         (local.get $0)
-        )
-        (local.tee $3
-         (i32.sub
-          (i32.shl
-           (memory.size)
-           (i32.const 16)
-          )
-          (i32.const 16)
-         )
-        )
-       )
-      )
-      (local.set $4
-       (i8x16.splat
-        (local.get $2)
-       )
-      )
-      (loop $label
-       (br_if $block
-        (i32.eqz
-         (i8x16.all_true
-          (i8x16.eq
-           (v128.load align=1
-            (local.get $1)
-           )
-           (local.get $4)
-          )
-         )
-        )
-       )
-       (br_if $label
-        (i32.le_u
-         (local.tee $1
-          (i32.add
-           (local.get $1)
-           (i32.const 16)
-          )
-         )
-         (local.get $3)
-        )
-       )
-      )
-     )
+     (if
+      (i32.ge_u
+       (local.tee $4
+        (i32.sub
+         (i32.shl
+          (memory.size)
+          (i32.const 16)
+         )
+         (i32.const 16)
+        )
+       )
+       (local.tee $1
+        (local.get $0)
+       )
+      )
+      (then
+       (local.set $5
+        (i8x16.splat
+         (local.get $2)
+        )
+       )
+       (loop $label
+        (if
+         (i32.eqz
+          (i8x16.all_true
+           (local.tee $6
+            (i8x16.eq
+             (v128.load align=1
+              (i32.add
+               (local.get $0)
+               (local.get $3)
+              )
+             )
+             (local.get $5)
+            )
+           )
+          )
+         )
+         (then
+          (return
+           (i32.add
+            (i32.ctz
+             (i32.xor
+              (i8x16.bitmask
+               (local.get $6)
+              )
+              (i32.const -1)
+             )
+            )
+            (local.get $3)
+           )
+          )
+         )
+        )
+        (br_if $label
+         (i32.le_u
+          (local.tee $1
+           (i32.add
+            (local.get $0)
+            (local.tee $3
+             (i32.add
+              (local.get $3)
+              (i32.const 16)
+             )
+            )
+           )
+          )
+          (local.get $4)
+         )
+        )
+       )
+      )
+     )
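Both the old and new strspn loops bound the vector scan with (i32.sub (i32.shl (memory.size) (i32.const 16)) (i32.const 16)): memory.size is in 64 KiB pages, so the shift converts pages to bytes, and subtracting 16 yields the last address from which a full unaligned v128 load stays in bounds. A minimal C sketch of that bound, with a hypothetical helper name and assuming clang's __builtin_wasm_memory_size:

#include <stdint.h>

// Hypothetical helper, not the commit's code.
static uintptr_t last_safe_v128_load(void) {
    // memory.size is in 64 KiB pages; <<16 converts pages to bytes.
    uintptr_t bytes = (uintptr_t)__builtin_wasm_memory_size(0) << 16;
    // Last address where a 16-byte unaligned load stays in bounds.
    return bytes - 16;
}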
@@ -992,7 +1046,7 @@
      )
     )
    )
-   (br $block1)
+   (br $block)
    )
   )
   (v128.store
@@ -1094,11 +1148,11 @@
  (local.set $2
   (local.get $0)
  )
- (block $block2
-  (block $block3
-   (block $block4
+ (block $block1
+  (block $block2
+   (block $block3
     (loop $label3
-     (br_if $block2
+     (br_if $block1
       (i32.eqz
        (i32.load8_u
         (i32.add
@@ -1110,7 +1164,7 @@
         )
        )
       )
-     (br_if $block3
+     (br_if $block2
       (i32.eqz
        (i32.load8_u
         (i32.add
@@ -1122,7 +1176,7 @@
         )
        )
       )
-     (br_if $block4
+     (br_if $block3
       (i32.eqz
        (i32.load8_u
         (i32.add
@@ -1162,7 +1216,7 @@
       (i32.const 1)
      )
     )
-    (br $block2)
+    (br $block1)
    )
   (local.set $2
    (i32.add
@@ -1170,7 +1224,7 @@
       (i32.const 2)
      )
     )
-    (br $block2)
+    (br $block1)
    )
   (local.set $2
    (i32.add

View File

@@ -11,9 +11,16 @@ extern "C" {
 #ifdef __wasm_relaxed_simd__
 // This header assumes "relaxed fused multiply-add"
 // is both faster and more precise.
+#define FP_FAST_FMA 1
 __attribute__((weak))
 double fma(double x, double y, double z) {
-    const v128_t wx = wasm_f64x2_splat(x);
+    // If we get a software implementation from the host,
+    // this is enough to short circuit it on the 2nd lane.
+    const v128_t wx = wasm_f64x2_replace_lane(b, 0, x);
     const v128_t wy = wasm_f64x2_splat(y);
     const v128_t wz = wasm_f64x2_splat(z);
     const v128_t wr = wasm_f64x2_relaxed_madd(wx, wy, wz);
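For reference, the shape of the surrounding function: one scalar fma computed through the f64x2 relaxed form, reading the result back from lane 0. A minimal sketch under those assumptions, with a hypothetical name and a plain splat instead of the second-lane trick above:

#include <wasm_simd128.h>

// Hypothetical sketch, not the header's code: scalar fma via f64x2
// relaxed madd, which lowers to a hardware FMA where one exists.
static double fma_sketch(double x, double y, double z) {
    const v128_t wx = wasm_f64x2_splat(x);
    const v128_t wy = wasm_f64x2_splat(y);
    const v128_t wz = wasm_f64x2_splat(z);
    const v128_t wr = wasm_f64x2_relaxed_madd(wx, wy, wz);
    return wasm_f64x2_extract_lane(wr, 0); // only lane 0 is needed
}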

View File

@@ -39,13 +39,6 @@ void *memmove(void *dest, const void *src, size_t n) {
 #ifdef __wasm_simd128__
 
 // SIMD versions of some string.h functions.
 //
-// These assume aligned v128_t loads can't fail,
-// and so can't unaligned loads up to the last
-// aligned address less than memory size.
-//
-// These also assume unaligned access is not painfully slow,
-// but that bitmask extraction is really slow on AArch64.
-
 __attribute__((weak))
 int memcmp(const void *v1, const void *v2, size_t n) {
@@ -55,9 +48,13 @@ int memcmp(const void *v1, const void *v2, size_t n) {
     const v128_t *w1 = (v128_t *)v1;
     const v128_t *w2 = (v128_t *)v2;
     for (; n >= sizeof(v128_t); n -= sizeof(v128_t)) {
-        // Find any single bit difference.
-        if (wasm_v128_any_true(wasm_v128_load(w1) ^ wasm_v128_load(w2))) {
-            break;
+        const v128_t cmp = wasm_i8x16_eq(wasm_v128_load(w1), wasm_v128_load(w2));
+        // Bitmask is slow on AArch64, all_true is much faster.
+        if (!wasm_i8x16_all_true(cmp)) {
+            size_t ctz = __builtin_ctz(~wasm_i8x16_bitmask(cmp));
+            const unsigned char *u1 = (unsigned char *)w1 + ctz;
+            const unsigned char *u2 = (unsigned char *)w2 + ctz;
+            return *u1 - *u2;
         }
         w1++;
         w2++;
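The new early exit pinpoints the first mismatching byte: wasm_i8x16_eq sets matching lanes to all-ones, the bitmask packs one bit per lane, and inverting it turns the first mismatch into the lowest set bit. A sketch of just that step, as a hypothetical helper that assumes the two blocks are known to differ:

#include <stddef.h>
#include <wasm_simd128.h>

// Hypothetical helper, not the commit's code: index of the first
// differing byte between two 16-byte blocks that differ somewhere.
static size_t first_diff(const void *p, const void *q) {
    const v128_t cmp = wasm_i8x16_eq(wasm_v128_load(p), wasm_v128_load(q));
    // Equal bytes set their mask bit; invert so the first difference
    // becomes the lowest set bit, then count trailing zeros.
    return (size_t)__builtin_ctz(~wasm_i8x16_bitmask(cmp));
}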
@@ -77,7 +74,7 @@ int memcmp(const void *v1, const void *v2, size_t n) {
 __attribute__((weak))
 void *memchr(const void *v, int c, size_t n) {
     // When n is zero, a function that locates a character finds no occurrence.
-    // Otherwise, decrement n to ensure __builtin_sub_overflow "overflows"
+    // Otherwise, decrement n to ensure __builtin_sub_overflow overflows
     // when n would go equal-to-or-below zero.
     if (n-- == 0) {
         return NULL;
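The decrement turns __builtin_sub_overflow into the loop's end test: the unsigned subtraction wraps exactly when fewer bytes remain than the next step consumes. A self-contained sketch of the counting pattern, as a hypothetical helper taking 16-byte steps:

#include <stddef.h>

// Hypothetical helper: how many 16-byte steps a scan of n bytes takes
// when the count is kept as "bytes remaining minus one".
static size_t scan_steps(size_t n) {
    if (n-- == 0) return 0; // empty buffer: nothing to scan
    size_t steps = 1;
    // Wraparound of the unsigned subtraction ends the scan.
    while (!__builtin_sub_overflow(n, 16, &n)) steps++;
    return steps;
}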
@@ -98,7 +95,7 @@ void *memchr(const void *v, int c, size_t n) {
         // so we can count trailing zeros.
         int mask = wasm_i8x16_bitmask(cmp) >> align << align;
         // At least one bit will be set, unless we cleared them.
         // Knowing this helps the compiler.
         __builtin_assume(mask || align);
         // If the mask is zero because of alignment,
         // it's as if we didn't find anything.
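The shift pair mask >> align << align clears the match bits for the align bytes that sit before the real start of the buffer in the first aligned block, so a trailing-zero count cannot land on them. In isolation, as a hypothetical helper:

#include <stdint.h>
#include <wasm_simd128.h>

// Hypothetical helper: match mask for byte c within one aligned block,
// with matches before the true start (the low `align` bits) cleared.
static int block_mask(const v128_t *w, int c, int align) {
    const v128_t cmp = wasm_i8x16_eq(wasm_v128_load(w), wasm_i8x16_splat((int8_t)c));
    return wasm_i8x16_bitmask(cmp) >> align << align;
}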
@@ -109,7 +106,7 @@ void *memchr(const void *v, int c, size_t n) {
                 return ctz <= n + align ? (char *)w + ctz : NULL;
             }
         }
-        // Decrement n; if it "overflows" we're done.
+        // Decrement n; if it overflows we're done.
         if (__builtin_sub_overflow(n, sizeof(v128_t) - align, &n)) {
             return NULL;
         }
@@ -133,7 +130,7 @@ size_t strlen(const char *s) {
         // so we can count trailing zeros.
         int mask = wasm_i8x16_bitmask(cmp) >> align << align;
         // At least one bit will be set, unless we cleared them.
         // Knowing this helps the compiler.
         __builtin_assume(mask || align);
         if (mask) {
             return (char *)w - s + __builtin_ctz(mask);
@@ -181,12 +178,23 @@ static int __strcmp(const char *s1, const char *s2) {
     return 0;
 }
 
+static int __strcmp_s(const char *s1, const char *s2) {
+    const unsigned char *u1 = (unsigned char *)s1;
+    const unsigned char *u2 = (unsigned char *)s2;
+    while (true) {
+        if (*u1 != *u2) return *u1 - *u2;
+        if (*u1 == 0) break;
+        u1++;
+        u2++;
+    }
+    return 0;
+}
+
 __attribute__((weak, always_inline))
 int strcmp(const char *s1, const char *s2) {
-    // Use strncmp when comparing against literal strings.
-    // If the literal is small, the vector search will be skipped.
-    if (__builtin_constant_p(strlen(s2))) {
-        return strncmp(s1, s2, strlen(s2));
+    // Skip the vector search when comparing against small literal strings.
+    if (__builtin_constant_p(strlen(s2)) && strlen(s2) < sizeof(v128_t)) {
+        return __strcmp_s(s1, s2);
     }
     return __strcmp(s1, s2);
 }
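__builtin_constant_p(strlen(s2)) only folds to true when the compiler can evaluate the length, i.e. for literal arguments, so the scalar path is chosen at compile time and disappears entirely for runtime strings. For example, with an illustrative function name and standard semantics:

#include <string.h>

// With a literal, strlen("hello") folds to 5, the branch above is
// resolved at compile time, and only the scalar comparison survives.
static int is_hello(const char *s) {
    return strcmp(s, "hello") == 0;
}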
@@ -244,7 +252,7 @@ static char *__strchrnul(const char *s, int c) {
         // so we can count trailing zeros.
         int mask = wasm_i8x16_bitmask(cmp) >> align << align;
         // At least one bit will be set, unless we cleared them.
         // Knowing this helps the compiler.
         __builtin_assume(mask || align);
         if (mask) {
             return (char *)w + __builtin_ctz(mask);
@@ -277,7 +285,7 @@ char *strchr(const char *s, int c) {
 __attribute__((weak))
 size_t strspn(const char *s, const char *c) {
 #ifndef _REENTRANT
     static // Avoid the stack for builds without threads.
 #endif
     char byteset[UCHAR_MAX + 1];
     const char *const a = s;
@@ -293,12 +301,16 @@ size_t strspn(const char *s, const char *c) {
         const v128_t *w = (v128_t *)s;
         const v128_t wc = wasm_i8x16_splat(*c);
         while (w <= limit) {
-            if (!wasm_i8x16_all_true(wasm_i8x16_eq(wasm_v128_load(w), wc))) {
-                break;
+            const v128_t cmp = wasm_i8x16_eq(wasm_v128_load(w), wc);
+            // Bitmask is slow on AArch64, all_true is much faster.
+            if (!wasm_i8x16_all_true(cmp)) {
+                size_t ctz = __builtin_ctz(~wasm_i8x16_bitmask(cmp));
+                return (char *)w + ctz - s;
             }
             w++;
         }
-        // Continue byte-by-byte.
         s = (char *)w;
         while (*s == *c) s++;
         return s - a;
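With a one-byte set, strspn reduces to measuring the leading run of that byte, which is exactly what the vector loop above accelerates. Usage sketch, standard semantics:

#include <assert.h>
#include <string.h>

static void strspn_examples(void) {
    assert(strspn("aaab", "a") == 3);     // single-character fast path
    assert(strspn("abc123", "cba") == 3); // general byteset path
}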
@@ -311,20 +323,21 @@ size_t strspn(const char *s, const char *c) {
     while (*c && (byteset[*(unsigned char *)c] = 1)) c++;
     while (byteset[*(unsigned char *)s]) s++;
-#else
+#else // __OPTIMIZE__
     // This is faster than memset.
     // Going backward helps bounds check elimination.
     volatile v128_t *w = (v128_t *)byteset;
 #pragma unroll
     for (size_t i = sizeof(byteset) / sizeof(v128_t); i--;) w[i] = (v128_t){};
     static_assert(sizeof(byteset) % sizeof(v128_t) == 0);
-    // Keeping byteset[0] = 0 avoids the other loop having to test for it.
+    // Keeping byteset[0] = 0 avoids the next loop needing that check.
     while (*c && (byteset[*(unsigned char *)c] = 1)) c++;
 #pragma unroll 4
     while (byteset[*(unsigned char *)s]) s++;
-#endif
+#endif // __OPTIMIZE__
     return s - a;
 }
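The volatile store loop keeps the compiler from collapsing the zeroing loop back into a memset call, and counting down lets the bounds check hoist out. An isolated sketch, as a hypothetical helper:

#include <limits.h>
#include <stddef.h>
#include <wasm_simd128.h>

// Hypothetical standalone version of the clearing trick: sixteen
// v128 stores zero the 256-byte table.
static void clear_byteset(char byteset[UCHAR_MAX + 1]) {
    volatile v128_t *w = (v128_t *)byteset;
    for (size_t i = (UCHAR_MAX + 1) / sizeof(v128_t); i--;) w[i] = (v128_t){};
}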
@@ -332,7 +345,7 @@ size_t strspn(const char *s, const char *c) {
 __attribute__((weak))
 size_t strcspn(const char *s, const char *c) {
 #ifndef _REENTRANT
     static // Avoid the stack for builds without threads.
 #endif
     char byteset[UCHAR_MAX + 1];
     const char *const a = s;
@@ -346,24 +359,31 @@ size_t strcspn(const char *s, const char *c) {
     while ((byteset[*(unsigned char *)c] = 1) && *c) c++;
     while (!byteset[*(unsigned char *)s]) s++;
-#else
+#else // __OPTIMIZE__
     // This is faster than memset.
     // Going backward helps bounds check elimination.
     volatile v128_t *w = (v128_t *)byteset;
 #pragma unroll
     for (size_t i = sizeof(byteset) / sizeof(v128_t); i--;) w[i] = (v128_t){};
     static_assert(sizeof(byteset) % sizeof(v128_t) == 0);
-    // Setting byteset[0] = 1 avoids the other loop having to test for it.
+    // Setting byteset[0] = 1 avoids the next loop needing that check.
     while ((byteset[*(unsigned char *)c] = 1) && *c) c++;
 #pragma unroll 4
     while (!byteset[*(unsigned char *)s]) s++;
-#endif
+#endif // __OPTIMIZE__
     return s - a;
 }
 
+__attribute__((weak, always_inline))
+char *strpbrk(const char *s, const char *b) {
+    s += strcspn(s, b);
+    return *s ? (char *)s : 0;
+}
#endif // __wasm_simd128__
#ifdef __cplusplus
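The new strpbrk falls out of strcspn directly: skip the longest prefix containing none of the set's bytes, and the cursor lands either on a member byte or on the terminator. Usage sketch, standard semantics:

#include <string.h>

// First byte from the set, or NULL when none occurs.
static char *find_delim(const char *s) {
    return strpbrk(s, "=:"); // for "key=value", points at "=value"
}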