Handle repetitive needles.

This commit is contained in:
Nuno Cruces
2025-05-10 01:44:25 +01:00
parent 60ab485b29
commit c2c1aea578
3 changed files with 213 additions and 128 deletions

Binary file not shown.

View File

@@ -2030,12 +2030,12 @@
)
)
)
(block $block1
(block $block2
(if
(i32.eqz
(i32.and
(i32.gt_u
(local.tee $8
(local.tee $9
(i32.sub
(local.get $1)
(local.get $3)
@@ -2050,50 +2050,91 @@
)
)
(then
(local.set $7
(i32.load8_u
(local.get $2)
)
)
(local.set $5
(local.tee $8
(i32.sub
(local.get $3)
(i32.const 1)
)
)
)
(block $block
(loop $label
(br_if $block
(i32.ne
(local.get $7)
(local.tee $6
(i32.load8_u
(i32.add
(local.get $2)
(local.get $5)
)
)
)
)
)
(br_if $label
(local.tee $5
(i32.sub
(local.get $5)
(i32.const 1)
)
)
)
)
(local.set $6
(i32.load8_u
(i32.add
(local.get $2)
(local.get $8)
)
)
)
(local.set $5
(local.get $8)
)
)
(if
(i32.le_u
(local.get $0)
(local.tee $9
(local.tee $10
(i32.sub
(i32.sub
(i32.shl
(memory.size)
(i32.const 16)
)
(local.get $3)
(local.get $5)
)
(i32.const 15)
(i32.const 16)
)
)
)
(then
(local.set $11
(v128.load8_splat
(i32.add
(local.get $2)
(local.tee $6
(i32.sub
(local.get $3)
(i32.const 1)
)
)
)
(i8x16.splat
(local.get $6)
)
)
(local.set $13
(v128.load8_splat
(local.get $2)
(i8x16.splat
(local.get $7)
)
)
(local.set $7
(local.set $9
(i32.add
(local.get $2)
(i32.const 1)
)
)
(loop $label1
(block $block
(br_if $block
(loop $label2
(block $block1
(br_if $block1
(i32.eqz
(v128.any_true
(local.tee $14
@@ -2101,12 +2142,9 @@
(i8x16.eq
(local.get $11)
(v128.load align=1
(i32.sub
(i32.add
(local.get $0)
(local.get $3)
)
(i32.const 1)
(i32.add
(local.get $0)
(local.get $5)
)
)
)
@@ -2121,7 +2159,7 @@
)
)
)
(br_if $block
(br_if $block1
(i32.eqz
(local.tee $4
(i8x16.bitmask
@@ -2130,12 +2168,12 @@
)
)
)
(loop $label
(br_if $block1
(loop $label1
(br_if $block2
(i32.eqz
(call $bcmp
(i32.add
(local.tee $5
(local.tee $6
(i32.add
(local.get $0)
(i32.ctz
@@ -2145,12 +2183,12 @@
)
(i32.const 1)
)
(local.get $7)
(local.get $6)
(local.get $9)
(local.get $8)
)
)
)
(br_if $label
(br_if $label1
(local.tee $4
(i32.and
(i32.sub
@@ -2166,7 +2204,7 @@
(if
(i32.lt_u
(local.get $1)
(local.tee $5
(local.tee $7
(i32.sub
(local.get $1)
(i32.const 16)
@@ -2182,7 +2220,7 @@
(if
(i32.gt_u
(local.get $3)
(local.get $5)
(local.get $7)
)
(then
(return
@@ -2191,9 +2229,9 @@
)
)
(local.set $1
(local.get $5)
(local.get $7)
)
(br_if $label1
(br_if $label2
(i32.le_u
(local.tee $0
(i32.add
@@ -2201,26 +2239,26 @@
(i32.const 16)
)
)
(local.get $9)
(local.get $10)
)
)
)
(local.set $8
(local.set $9
(i32.sub
(local.get $1)
(local.get $7)
(local.get $3)
)
)
)
)
(local.set $1
(local.set $5
(i32.const 0)
)
(loop $label2
(loop $label3
(local.set $4
(i32.const 0)
)
(loop $label3
(loop $label4
(if
(i32.ne
(i32.load8_u
@@ -2243,24 +2281,24 @@
(i32.const 1)
)
)
(local.set $5
(local.set $6
(i32.const 0)
)
(br_if $label2
(br_if $label3
(i32.le_u
(local.tee $1
(local.tee $5
(i32.add
(local.get $1)
(local.get $5)
(i32.const 1)
)
)
(local.get $8)
(local.get $9)
)
)
(br $block1)
(br $block2)
)
)
(br_if $label3
(br_if $label4
(i32.ne
(local.get $3)
(local.tee $4
@@ -2282,7 +2320,7 @@
(i32.const 4112)
(select
(i32.const -1)
(local.tee $7
(local.tee $8
(i32.sub
(local.get $3)
(i32.const 1)
@@ -2290,15 +2328,15 @@
)
(local.tee $5
(i32.gt_s
(local.get $7)
(local.get $8)
(i32.const 254)
)
)
)
(i32.const 256)
)
(block $block2
(br_if $block2
(block $block3
(br_if $block3
(i32.ge_u
(local.tee $6
(select
@@ -2310,7 +2348,7 @@
(local.get $5)
)
)
(local.get $7)
(local.get $8)
)
)
(if
@@ -2364,7 +2402,7 @@
(local.set $6
(i32.add
(local.get $6)
(local.tee $9
(local.tee $7
(i32.and
(local.get $10)
(i32.const -16)
@@ -2374,13 +2412,13 @@
)
(local.set $13
(i32x4.splat
(local.get $7)
(local.get $8)
)
)
(local.set $5
(local.get $9)
(local.get $7)
)
(loop $label4
(loop $label5
(v128.store8_lane 0
(i32.add
(i32x4.extract_lane 0
@@ -2635,7 +2673,7 @@
(v128.const i32x4 0x00000010 0x00000010 0x00000010 0x00000010)
)
)
(br_if $label4
(br_if $label5
(local.tee $5
(i32.sub
(local.get $5)
@@ -2644,9 +2682,9 @@
)
)
)
(br_if $block2
(br_if $block3
(i32.eq
(local.get $9)
(local.get $7)
(local.get $10)
)
)
@@ -2667,7 +2705,7 @@
(i32.const 2)
)
)
(loop $label5
(loop $label6
(i32.store8
(i32.add
(i32.load8_u
@@ -2683,7 +2721,7 @@
(i32.const 1)
)
)
(br_if $label5
(br_if $label6
(i32.ne
(local.tee $5
(i32.sub
@@ -2696,60 +2734,102 @@
)
)
)
(local.set $7
(i32.load8_u
(local.get $2)
)
)
(local.set $5
(local.get $8)
)
(block $block4
(loop $label7
(br_if $block4
(i32.ne
(local.get $7)
(local.tee $6
(i32.load8_u
(i32.add
(local.get $2)
(local.get $5)
)
)
)
)
)
(br_if $label7
(local.tee $5
(i32.sub
(local.get $5)
(i32.const 1)
)
)
)
)
(local.set $6
(i32.load8_u
(i32.add
(local.get $2)
(local.get $8)
)
)
)
(local.set $5
(local.get $8)
)
)
(if
(i32.le_u
(local.get $0)
(local.tee $9
(local.tee $10
(i32.sub
(i32.sub
(i32.shl
(memory.size)
(i32.const 16)
)
(local.get $3)
(local.get $5)
)
(i32.const 15)
(i32.const 16)
)
)
)
(then
(local.set $11
(v128.load8_splat
(i32.add
(local.get $2)
(local.get $7)
)
(i8x16.splat
(local.get $6)
)
)
(local.set $13
(v128.load8_splat
(local.get $2)
(i8x16.splat
(local.get $7)
)
)
(local.set $6
(local.set $7
(i32.add
(local.get $3)
(i32.const 14)
)
)
(local.set $9
(i32.add
(local.get $2)
(i32.const 1)
)
)
(loop $label7
(block $block3
(br_if $block3
(loop $label9
(block $block5
(br_if $block5
(i32.eqz
(v128.any_true
(local.tee $16
(local.tee $14
(v128.and
(i8x16.eq
(local.get $11)
(local.tee $14
(v128.load align=1
(i32.sub
(i32.add
(local.get $0)
(local.get $3)
)
(i32.const 1)
)
(v128.load align=1
(i32.add
(local.get $0)
(local.get $5)
)
)
)
@@ -2764,21 +2844,21 @@
)
)
)
(br_if $block3
(br_if $block5
(i32.eqz
(local.tee $4
(i8x16.bitmask
(local.get $16)
(local.get $14)
)
)
)
)
(loop $label6
(br_if $block1
(loop $label8
(br_if $block2
(i32.eqz
(call $bcmp
(i32.add
(local.tee $5
(local.tee $6
(i32.add
(local.get $0)
(i32.ctz
@@ -2788,12 +2868,12 @@
)
(i32.const 1)
)
(local.get $6)
(local.get $7)
(local.get $9)
(local.get $8)
)
)
)
(br_if $label6
(br_if $label8
(local.tee $4
(i32.and
(i32.sub
@@ -2806,10 +2886,10 @@
)
)
)
(local.set $5
(local.set $6
(i32.const 0)
)
(br_if $block1
(br_if $block2
(i32.lt_u
(local.get $1)
(local.tee $1
@@ -2819,8 +2899,11 @@
(i32.add
(i32.load8_u
(i32.add
(i8x16.extract_lane_s 15
(local.get $14)
(i32.load8_u
(i32.add
(local.get $0)
(local.get $7)
)
)
(i32.const 4112)
)
@@ -2832,13 +2915,13 @@
)
)
)
(br_if $block1
(br_if $block2
(i32.lt_u
(local.get $1)
(local.get $3)
)
)
(br_if $label7
(br_if $label9
(i32.le_u
(local.tee $0
(i32.add
@@ -2846,11 +2929,11 @@
(local.get $4)
)
)
(local.get $9)
(local.get $10)
)
)
)
(local.set $8
(local.set $9
(i32.sub
(local.get $1)
(local.get $3)
@@ -2858,16 +2941,16 @@
)
)
)
(local.set $1
(local.set $5
(i32.const 0)
)
(loop $label9
(loop $label11
(local.set $4
(i32.const 0)
)
(block $block4
(loop $label8
(br_if $block4
(block $block6
(loop $label10
(br_if $block6
(i32.ne
(i32.load8_u
(i32.add
@@ -2883,7 +2966,7 @@
)
)
)
(br_if $label8
(br_if $label10
(i32.ne
(local.get $3)
(local.tee $4
@@ -2905,23 +2988,23 @@
(i32.const 1)
)
)
(local.set $5
(local.set $6
(i32.const 0)
)
(br_if $label9
(br_if $label11
(i32.le_u
(local.tee $1
(local.tee $5
(i32.add
(local.get $1)
(local.get $5)
(i32.const 1)
)
)
(local.get $8)
(local.get $9)
)
)
)
)
(local.get $5)
(local.get $6)
)
(func $strstr (param $0 i32) (param $1 i32) (result i32)
(local $2 i32)

View File

@@ -474,15 +474,21 @@ static const char *__memmem_rabin(const char *haystk, size_t sh,
// The needle is no longer than haystack.
__builtin_assume(2 <= sn && sn <= sh);
// Find the farthest character not equal to the first one.
size_t i = sn - 1;
while (i > 0 && needle[0] == needle[i]) i--;
if (i == 0) i = sn - 1;
const v128_t fst = wasm_i8x16_splat(needle[0]);
const v128_t lst = wasm_i8x16_splat(needle[sn - 1]);
const v128_t lst = wasm_i8x16_splat(needle[i]);
// The last haystk offset for which loading blk_lst is safe.
const char *H = (char *)(__builtin_wasm_memory_size(0) * PAGESIZE - sn + 1 -
const char *H = (char *)(__builtin_wasm_memory_size(0) * PAGESIZE - i -
sizeof(v128_t));
while (haystk <= H) {
const v128_t blk_fst = wasm_v128_load((v128_t *)(haystk));
const v128_t blk_lst = wasm_v128_load((v128_t *)(haystk + sn - 1));
const v128_t blk_lst = wasm_v128_load((v128_t *)(haystk + i));
const v128_t eq_fst = wasm_i8x16_eq(fst, blk_fst);
const v128_t eq_lst = wasm_i8x16_eq(lst, blk_lst);
@@ -492,10 +498,6 @@ static const char *__memmem_rabin(const char *haystk, size_t sh,
// Each iteration clears that bit, tries again.
for (uint32_t mask = wasm_i8x16_bitmask(cmp); mask; mask &= mask - 1) {
size_t ctz = __builtin_ctz(mask);
// This could compare one less byte.
// Since bcmp compares left-to-right (this could change)
// the last byte only matters if we just found the needle.
// Otherwise this may help bcmp vectorize.
if (!bcmp(haystk + ctz + 1, needle + 1, sn - 1)) {
return haystk + ctz;
}
@@ -503,8 +505,9 @@ static const char *__memmem_rabin(const char *haystk, size_t sh,
}
size_t skip = sizeof(v128_t);
// Apply the bad-character rule to the last byte of the haystack.
if (bmbc) skip += bmbc[wasm_i8x16_extract_lane(blk_lst, 15)];
// Apply the bad-character rule to the last checked
// character of the haystack.
if (bmbc) skip += bmbc[(unsigned char)haystk[sn + 14]];
// Have we reached the end of the haystack?
if (__builtin_sub_overflow(sh, skip, &sh)) return NULL;
// Is the needle longer than the haystack?
@@ -641,8 +644,7 @@ static char *__stpcpy(char *__restrict dest, const char *__restrict src) {
return dest + slen;
}
static char *__stpncpy(char *__restrict dest, const char *__restrict src,
size_t n) {
static char *__stpncpy(char *__restrict dest, const char *__restrict src, size_t n) {
size_t strnlen(const char *s, size_t n);
size_t slen = strnlen(src, n);
memcpy(dest, src, slen);