Improve strspn.

This commit is contained in:
Nuno Cruces
2025-04-21 15:51:02 +01:00
parent 251a92fa1a
commit 48379336dc
3 changed files with 509 additions and 323 deletions

View File

@@ -856,7 +856,7 @@
(local $scratch i32)
(if
(i32.eqz
(local.tee $3
(local.tee $2
(i32.load8_u
(local.get $1)
)
@@ -882,7 +882,7 @@
(local.tee $1
(local.get $0)
)
(local.tee $2
(local.tee $3
(i32.sub
(i32.shl
(memory.size)
@@ -895,7 +895,7 @@
)
(local.set $4
(i8x16.splat
(local.get $3)
(local.get $2)
)
)
(loop $label
@@ -919,12 +919,12 @@
(i32.const 16)
)
)
(local.get $2)
(local.get $3)
)
)
)
)
(local.set $2
(local.set $0
(i32.add
(i32.xor
(local.get $0)
@@ -934,28 +934,26 @@
)
)
(loop $label1
(local.set $2
(local.set $0
(i32.add
(local.get $2)
(local.get $0)
(i32.const 1)
)
)
(local.set $3
(i32.load8_u
(local.get $1)
)
)
(local.set $1
(i32.add
(local.get $1)
(i32.const 1)
)
)
(br_if $label1
(i32.eq
(block (result i32)
(local.set $scratch
(i32.load8_u
(local.get $1)
)
)
(local.set $1
(i32.add
(local.get $1)
(i32.const 1)
)
)
(local.get $scratch)
)
(local.get $2)
(local.get $3)
)
)
@@ -963,6 +961,62 @@
(br $block1)
)
)
(v128.store
(i32.const 65792)
(v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000)
)
(v128.store
(i32.const 65776)
(v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000)
)
(v128.store
(i32.const 65760)
(v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000)
)
(v128.store
(i32.const 65744)
(v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000)
)
(v128.store
(i32.const 65728)
(v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000)
)
(v128.store
(i32.const 65712)
(v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000)
)
(v128.store
(i32.const 65696)
(v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000)
)
(v128.store
(i32.const 65680)
(v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000)
)
(v128.store
(i32.const 65664)
(v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000)
)
(v128.store
(i32.const 65648)
(v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000)
)
(v128.store
(i32.const 65632)
(v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000)
)
(v128.store
(i32.const 65616)
(v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000)
)
(v128.store
(i32.const 65600)
(v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000)
)
(v128.store
(i32.const 65584)
(v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000)
)
(v128.store
(i32.const 65568)
(v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000)
@@ -978,30 +1032,17 @@
)
)
(loop $label2
(i32.store
(local.tee $2
(i32.add
(i32.and
(i32.shr_u
(local.get $3)
(i32.const 3)
)
(i32.const 28)
)
(i32.const 65552)
)
)
(i32.or
(i32.load
(i32.store8
(i32.add
(i32.and
(local.get $2)
(i32.const 255)
)
(i32.shl
(i32.const 1)
(local.get $3)
)
(i32.const 65552)
)
(i32.const 1)
)
(local.set $3
(local.set $2
(i32.load8_u
(local.get $1)
)
@@ -1013,239 +1054,390 @@
)
)
(br_if $label2
(local.get $3)
(local.get $2)
)
)
(local.set $2
(local.get $0)
)
(block $block2
(br_if $block2
(i32.eqz
(local.tee $3
(i32.load8_u
(local.get $0)
)
)
)
)
(local.set $1
(local.get $0)
)
(loop $label3
(if
(i32.eqz
(i32.and
(i32.shr_u
(i32.load
(block $block3
(block $block4
(loop $label3
(br_if $block2
(i32.eqz
(i32.load8_u
(i32.add
(i32.and
(i32.shr_u
(local.get $3)
(i32.const 3)
)
(i32.const 28)
(i32.load8_u
(local.get $2)
)
(i32.const 65552)
)
)
(local.get $3)
)
)
(br_if $block3
(i32.eqz
(i32.load8_u
(i32.add
(i32.load8_u offset=1
(local.get $2)
)
(i32.const 65552)
)
)
)
)
(br_if $block4
(i32.eqz
(i32.load8_u
(i32.add
(i32.load8_u offset=2
(local.get $2)
)
(i32.const 65552)
)
)
)
)
(br_if $label3
(i32.load8_u
(i32.add
(block (result i32)
(local.set $scratch
(i32.load8_u offset=3
(local.get $2)
)
)
(local.set $2
(i32.add
(local.get $2)
(i32.const 4)
)
)
(local.get $scratch)
)
(i32.const 65552)
)
)
)
)
(local.set $2
(i32.sub
(local.get $2)
(i32.const 1)
)
)
(then
(local.set $2
(local.get $1)
)
(br $block2)
(br $block2)
)
(local.set $2
(i32.add
(local.get $2)
(i32.const 2)
)
)
(local.set $3
(i32.load8_u offset=1
(local.get $1)
)
)
(local.set $1
(local.tee $2
(i32.add
(local.get $1)
(i32.const 1)
)
)
)
(br_if $label3
(local.get $3)
(br $block2)
)
(local.set $2
(i32.add
(local.get $2)
(i32.const 1)
)
)
)
(local.set $2
(local.set $0
(i32.sub
(local.get $2)
(local.get $0)
)
)
)
(local.get $2)
(local.get $0)
)
(func $strcspn (param $0 i32) (param $1 i32) (result i32)
(local $2 i32)
(local $3 i32)
(local $3 v128)
(local $4 v128)
(local $5 v128)
(block $block
(if
(local.tee $2
(i32.load8_u
(local.get $1)
)
)
(then
(br_if $block
(i32.load8_u offset=1
(local.get $1)
(local $scratch i32)
(block $block1
(block $block2
(block $block3
(block $block
(br_if $block
(i32.eqz
(local.tee $2
(i32.load8_u
(local.get $1)
)
)
)
)
(br_if $block
(i32.eqz
(i32.load8_u offset=1
(local.get $1)
)
)
)
(v128.store
(i32.const 66048)
(v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000)
)
(v128.store
(i32.const 66032)
(v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000)
)
(v128.store
(i32.const 66016)
(v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000)
)
(v128.store
(i32.const 66000)
(v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000)
)
(v128.store
(i32.const 65984)
(v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000)
)
(v128.store
(i32.const 65968)
(v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000)
)
(v128.store
(i32.const 65952)
(v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000)
)
(v128.store
(i32.const 65936)
(v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000)
)
(v128.store
(i32.const 65920)
(v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000)
)
(v128.store
(i32.const 65904)
(v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000)
)
(v128.store
(i32.const 65888)
(v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000)
)
(v128.store
(i32.const 65872)
(v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000)
)
(v128.store
(i32.const 65856)
(v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000)
)
(v128.store
(i32.const 65840)
(v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000)
)
(v128.store
(i32.const 65824)
(v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000)
)
(v128.store
(i32.const 65808)
(v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000)
)
(loop $label
(i32.store8
(i32.add
(local.tee $2
(i32.load8_u
(local.get $1)
)
)
(i32.const 65808)
)
(i32.const 1)
)
(local.set $1
(i32.add
(local.get $1)
(i32.const 1)
)
)
(br_if $label
(local.get $2)
)
)
(local.set $1
(local.get $0)
)
(loop $label1
(br_if $block1
(i32.load8_u
(i32.add
(i32.load8_u
(local.get $1)
)
(i32.const 65808)
)
)
)
(br_if $block2
(i32.load8_u
(i32.add
(i32.load8_u offset=1
(local.get $1)
)
(i32.const 65808)
)
)
)
(br_if $block3
(i32.load8_u
(i32.add
(i32.load8_u offset=2
(local.get $1)
)
(i32.const 65808)
)
)
)
(br_if $label1
(i32.eqz
(i32.load8_u
(i32.add
(block (result i32)
(local.set $scratch
(i32.load8_u offset=3
(local.get $1)
)
)
(local.set $1
(i32.add
(local.get $1)
(i32.const 4)
)
)
(local.get $scratch)
)
(i32.const 65808)
)
)
)
)
)
(return
(i32.sub
(i32.sub
(local.get $1)
(i32.const 1)
)
(local.get $0)
)
)
)
)
)
(block $block1
(if
(v128.any_true
(local.tee $4
(v128.or
(i8x16.eq
(local.tee $4
(v128.load
(local.tee $1
(i32.and
(local.get $0)
(i32.const -16)
(block $block4
(if
(v128.any_true
(local.tee $3
(v128.or
(i8x16.eq
(local.tee $3
(v128.load
(local.tee $1
(i32.and
(local.get $0)
(i32.const -16)
)
)
)
)
(v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000)
)
(i8x16.eq
(local.get $3)
(local.tee $4
(i8x16.splat
(local.get $2)
)
)
)
)
(v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000)
)
(i8x16.eq
(local.get $4)
(local.tee $5
(i8x16.splat
(local.get $2)
)
(then
(br_if $block4
(local.tee $2
(i32.and
(i8x16.bitmask
(local.get $3)
)
(i32.shl
(i32.const -1)
(i32.and
(local.get $0)
(i32.const 15)
)
)
)
)
)
)
)
)
(then
(br_if $block1
(local.tee $2
(i32.and
(i8x16.bitmask
(local.get $4)
)
(i32.shl
(i32.const -1)
(i32.and
(local.get $0)
(i32.const 15)
(loop $label2
(local.set $3
(v128.load offset=16
(local.get $1)
)
)
(local.set $1
(i32.add
(local.get $1)
(i32.const 16)
)
)
(br_if $label2
(i32.eqz
(v128.any_true
(local.tee $3
(v128.or
(i8x16.eq
(local.get $3)
(v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000)
)
(i8x16.eq
(local.get $3)
(local.get $4)
)
)
)
)
)
)
)
(local.set $2
(i8x16.bitmask
(local.get $3)
)
)
)
(return
(i32.sub
(i32.add
(local.get $1)
(i32.ctz
(local.get $2)
)
)
(local.get $0)
)
)
)
(loop $label
(local.set $4
(v128.load offset=16
(local.get $1)
)
)
(local.set $1
(return
(i32.sub
(i32.add
(local.get $1)
(i32.const 16)
(i32.const 2)
)
(local.get $0)
)
(br_if $label
(i32.eqz
(v128.any_true
(local.tee $4
(v128.or
(i8x16.eq
(local.get $4)
(v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000)
)
(i8x16.eq
(local.get $4)
(local.get $5)
)
)
)
)
)
)
)
(local.set $2
(i8x16.bitmask
(local.get $4)
)
)
)
(return
(i32.sub
(i32.add
(local.get $1)
(i32.ctz
(local.get $2)
)
)
(local.get $0)
)
)
)
(v128.store
(i32.const 65600)
(v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000)
)
(v128.store
(i32.const 65584)
(v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000)
)
(local.set $1
(i32.add
(local.get $1)
(i32.const 1)
)
)
(loop $label1
(i32.store
(local.tee $3
(i32.add
(i32.and
(i32.shr_u
(local.get $2)
(i32.const 3)
)
(i32.const 28)
)
(i32.const 65584)
)
)
(i32.or
(i32.load
(local.get $3)
)
(i32.shl
(i32.const 1)
(local.get $2)
)
)
)
(local.set $2
(i32.load8_u
(local.get $1)
)
)
(local.set $1
@@ -1254,64 +1446,6 @@
(i32.const 1)
)
)
(br_if $label1
(local.get $2)
)
)
(if
(local.tee $2
(i32.load8_u
(local.tee $1
(local.get $0)
)
)
)
(then
(loop $label2
(if
(i32.and
(i32.shr_u
(i32.load
(i32.add
(i32.and
(i32.shr_u
(local.get $2)
(i32.const 3)
)
(i32.const 28)
)
(i32.const 65584)
)
)
(local.get $2)
)
(i32.const 1)
)
(then
(return
(i32.sub
(local.get $1)
(local.get $0)
)
)
)
)
(local.set $2
(i32.load8_u offset=1
(local.get $1)
)
)
(local.set $1
(i32.add
(local.get $1)
(i32.const 1)
)
)
(br_if $label2
(local.get $2)
)
)
)
)
(i32.sub
(local.get $1)

View File

@@ -24,12 +24,14 @@ var (
module api.Module
memset api.Function
memcpy api.Function
memcmp api.Function
memchr api.Function
memcmp api.Function
strlen api.Function
strcmp api.Function
strchr api.Function
strcmp api.Function
strspn api.Function
strncmp api.Function
strcspn api.Function
stack [8]uint64
)
@@ -56,7 +58,9 @@ func TestMain(m *testing.M) {
strlen = mod.ExportedFunction("strlen")
strchr = mod.ExportedFunction("strchr")
strcmp = mod.ExportedFunction("strcmp")
strspn = mod.ExportedFunction("strspn")
strncmp = mod.ExportedFunction("strncmp")
strcspn = mod.ExportedFunction("strcspn")
memory, _ = mod.Memory().Read(0, mod.Memory().Size())
os.Exit(m.Run())
@@ -100,9 +104,9 @@ func Benchmark_memcpy(b *testing.B) {
func Benchmark_memchr(b *testing.B) {
clear(memory)
call(memset, ptr1, 7, size)
call(memset, ptr1+size/2, 5, size)
call(memset, ptr1+size/2, 5, size/2)
b.SetBytes(size / 2)
b.SetBytes(size/2 + 1)
b.ResetTimer()
for range b.N {
call(memchr, ptr1, 5, size)
@@ -118,9 +122,9 @@ func Benchmark_memcmp(b *testing.B) {
clear(memory)
call(memset, ptr1, 7, size)
call(memset, ptr2, 7, size)
call(memset, ptr2+size/2, 5, size)
call(memset, ptr2+size/2, 5, size/2)
b.SetBytes(size / 2)
b.SetBytes(size/2 + 1)
b.ResetTimer()
for range b.N {
call(memcmp, ptr1, ptr2, size)
@@ -155,10 +159,10 @@ func Benchmark_strlen(b *testing.B) {
func Benchmark_strchr(b *testing.B) {
clear(memory)
call(memset, ptr1, 7, size)
call(memset, ptr1+size/2, 5, size)
call(memset, ptr1, 7, size-1)
call(memset, ptr1+size/2, 5, size/2-1)
b.SetBytes(size / 2)
b.SetBytes(size/2 + 1)
b.ResetTimer()
for range b.N {
call(strchr, ptr1, 5)
@@ -174,9 +178,9 @@ func Benchmark_strcmp(b *testing.B) {
clear(memory)
call(memset, ptr1, 7, size-1)
call(memset, ptr2, 7, size-1)
call(memset, ptr2+size/2, 5, size)
call(memset, ptr2+size/2, 5, size/2-1)
b.SetBytes(size / 2)
b.SetBytes(size/2 + 1)
b.ResetTimer()
for range b.N {
call(strcmp, ptr1, ptr2, size)
@@ -201,28 +205,76 @@ func Benchmark_strcmp(b *testing.B) {
func Benchmark_strncmp(b *testing.B) {
clear(memory)
call(memset, ptr1, 7, size)
call(memset, ptr2, 7, size)
call(memset, ptr2+size/2, 5, size)
call(memset, ptr1, 7, size-1)
call(memset, ptr2, 7, size-1)
call(memset, ptr2+size/2, 5, size/2-1)
b.SetBytes(size / 2)
b.SetBytes(size/2 + 1)
b.ResetTimer()
for range b.N {
call(strncmp, ptr1, ptr2, size)
call(strncmp, ptr1, ptr2, size-1)
}
b.StopTimer()
// ptr1 > ptr2
if got := int32(call(strncmp, ptr1, ptr2, size)); got <= 0 {
if got := int32(call(strncmp, ptr1, ptr2, size-1)); got <= 0 {
b.Fatal(got)
}
// make ptr1 < ptr2
memory[ptr1+size/2] = 0
if got := int32(call(strncmp, ptr1, ptr2, size)); got >= 0 {
if got := int32(call(strncmp, ptr1, ptr2, size-1)); got >= 0 {
b.Fatal(got)
}
// ptr1[:size/2] == ptr2[:size/2]
if got := int32(call(strncmp, ptr1, ptr2, size/2)); got != 0 {
if got := int32(call(strncmp, ptr1, ptr2, size/2-1)); got != 0 {
b.Fatal(got)
}
}
// Benchmark_strspn times strspn on the subject buffer at ptr1 against the
// accept set stored at ptr2, then checks two expected span lengths.
func Benchmark_strspn(b *testing.B) {
clear(memory)
// Subject: size-1 bytes of 7, then the second half overwritten with 5.
// NOTE(review): presumably the byte left cleared at ptr1+size-1 acts as the
// terminator; `call` and `memset` are defined outside this view - confirm.
call(memset, ptr1, 7, size-1)
call(memset, ptr1+size/2, 5, size/2-1)
// Accept set: bytes 3, 5, 7 and 9. Both fill values (7 and 5) are members.
memory[ptr2+0] = 3
memory[ptr2+1] = 5
memory[ptr2+2] = 7
memory[ptr2+3] = 9
b.SetBytes(size)
// Exclude the setup above from the measurement.
b.ResetTimer()
for range b.N {
call(strspn, ptr1, ptr2)
}
// Exclude the verification below from the measurement.
b.StopTimer()
// Every filled byte is in the accept set, so the expected span is size-1.
if got := int32(call(strspn, ptr1, ptr2)); got != size-1 {
b.Fatal(got)
}
// 11 is not in the accept set, so the expected span now ends at offset size/2.
memory[ptr1+size/2] = 11
if got := int32(call(strspn, ptr1, ptr2)); got != size/2 {
b.Fatal(got)
}
}
// Benchmark_strcspn times strcspn on the subject buffer at ptr1 against the
// reject set stored at ptr2, then checks two expected span lengths.
func Benchmark_strcspn(b *testing.B) {
clear(memory)
// Subject: size-1 bytes of 7, then the second half overwritten with 5.
// NOTE(review): presumably the byte left cleared at ptr1+size-1 acts as the
// terminator; `call` and `memset` are defined outside this view - confirm.
call(memset, ptr1, 7, size-1)
call(memset, ptr1+size/2, 5, size/2-1)
// Reject set: bytes 3 and 9. Neither fill value (7 or 5) is a member.
memory[ptr2+0] = 3
memory[ptr2+1] = 9
b.SetBytes(size)
// Exclude the setup above from the measurement.
b.ResetTimer()
for range b.N {
call(strcspn, ptr1, ptr2)
}
// Exclude the verification below from the measurement.
b.StopTimer()
// No filled byte is in the reject set, so the expected span is size-1.
if got := int32(call(strcspn, ptr1, ptr2)); got != size-1 {
b.Fatal(got)
}
// 3 is in the reject set, so the expected span now ends at offset size/2.
memory[ptr1+size/2] = 3
if got := int32(call(strcspn, ptr1, ptr2)); got != size/2 {
b.Fatal(got)
}
}

View File

@@ -1,6 +1,7 @@
#ifndef _WASM_SIMD128_STRING_H
#define _WASM_SIMD128_STRING_H
#include <limits.h>
#include <stddef.h>
#include <stdint.h>
#include <wasm_simd128.h>
@@ -112,6 +113,10 @@ size_t strlen(const char *s) {
__attribute__((weak))
int strcmp(const char *s1, const char *s2) {
if (__builtin_constant_p(__builtin_strlen(s2))) {
return strncmp(s1, s2, __builtin_strlen(s2));
}
const v128_t *const limit =
(v128_t *)(__builtin_wasm_memory_size(0) * PAGESIZE) - 1;
@@ -203,23 +208,12 @@ char *strchr(const char *s, int c) {
return *(char *)r == (char)c ? r : NULL;
}
#pragma push_macro("STATIC")
#pragma push_macro("BITOP")
// Avoid using the C stack.
#ifndef _REENTRANT
#define STATIC static
#else
#define STATIC
#endif
#define BITOP(a, b, op) \
((a)[(b) / (8 * sizeof(size_t))] op((size_t)1) \
<< ((b) % (8 * sizeof(size_t))))
__attribute__((weak))
size_t strspn(const char *s, const char *c) {
STATIC size_t byteset[32 / sizeof(size_t)];
#ifndef _REENTRANT
static
#endif
char byteset[UCHAR_MAX + 1];
const char *const a = s;
if (!c[0]) return 0;
@@ -241,28 +235,34 @@ size_t strspn(const char *s, const char *c) {
return s - a;
}
memset(byteset, 0, sizeof(byteset));
for (; *c && BITOP(byteset, *(uint8_t *)c, |=); c++);
for (; *s && BITOP(byteset, *(uint8_t *)s, &); s++);
volatile v128_t *w = (void *)byteset;
#pragma unroll
for (size_t i = sizeof(byteset) / sizeof(v128_t); i--;) w[i] = (v128_t){};
while (*c && (byteset[*(uint8_t *)c] = 1)) c++;
#pragma unroll 4
while (byteset[*(uint8_t *)s]) s++;
return s - a;
}
__attribute__((weak))
size_t strcspn(const char *s, const char *c) {
STATIC size_t byteset[32 / sizeof(size_t)];
#ifndef _REENTRANT
static
#endif
char byteset[UCHAR_MAX + 1];
const char *const a = s;
if (!c[0] || !c[1]) return __strchrnul(s, *c) - s;
memset(byteset, 0, sizeof(byteset));
for (; *c && BITOP(byteset, *(uint8_t *)c, |=); c++);
for (; *s && !BITOP(byteset, *(uint8_t *)s, &); s++);
volatile v128_t *w = (void *)byteset;
#pragma unroll
for (size_t i = sizeof(byteset) / sizeof(v128_t); i--;) w[i] = (v128_t){};
while ((byteset[*(uint8_t *)c] = 1) && *c) c++;
#pragma unroll 4
while (!byteset[*(uint8_t *)s]) s++;
return s - a;
}
#pragma pop_macro("BITOP")
#pragma pop_macro("STATIC")
#endif // __wasm_simd128__
#ifdef __cplusplus