diff --git a/sqlite3/libc/libc.wasm b/sqlite3/libc/libc.wasm index f2f3722..d05849f 100755 Binary files a/sqlite3/libc/libc.wasm and b/sqlite3/libc/libc.wasm differ diff --git a/sqlite3/libc/libc.wat b/sqlite3/libc/libc.wat index 460ac39..12d33e1 100644 --- a/sqlite3/libc/libc.wat +++ b/sqlite3/libc/libc.wat @@ -2030,12 +2030,12 @@ ) ) ) - (block $block1 + (block $block2 (if (i32.eqz (i32.and (i32.gt_u - (local.tee $8 + (local.tee $9 (i32.sub (local.get $1) (local.get $3) @@ -2050,50 +2050,91 @@ ) ) (then + (local.set $7 + (i32.load8_u + (local.get $2) + ) + ) + (local.set $5 + (local.tee $8 + (i32.sub + (local.get $3) + (i32.const 1) + ) + ) + ) + (block $block + (loop $label + (br_if $block + (i32.ne + (local.get $7) + (local.tee $6 + (i32.load8_u + (i32.add + (local.get $2) + (local.get $5) + ) + ) + ) + ) + ) + (br_if $label + (local.tee $5 + (i32.sub + (local.get $5) + (i32.const 1) + ) + ) + ) + ) + (local.set $6 + (i32.load8_u + (i32.add + (local.get $2) + (local.get $8) + ) + ) + ) + (local.set $5 + (local.get $8) + ) + ) (if (i32.le_u (local.get $0) - (local.tee $9 + (local.tee $10 (i32.sub (i32.sub (i32.shl (memory.size) (i32.const 16) ) - (local.get $3) + (local.get $5) ) - (i32.const 15) + (i32.const 16) ) ) ) (then (local.set $11 - (v128.load8_splat - (i32.add - (local.get $2) - (local.tee $6 - (i32.sub - (local.get $3) - (i32.const 1) - ) - ) - ) + (i8x16.splat + (local.get $6) ) ) (local.set $13 - (v128.load8_splat - (local.get $2) + (i8x16.splat + (local.get $7) ) ) - (local.set $7 + (local.set $9 (i32.add (local.get $2) (i32.const 1) ) ) - (loop $label1 - (block $block - (br_if $block + (loop $label2 + (block $block1 + (br_if $block1 (i32.eqz (v128.any_true (local.tee $14 @@ -2101,12 +2142,9 @@ (i8x16.eq (local.get $11) (v128.load align=1 - (i32.sub - (i32.add - (local.get $0) - (local.get $3) - ) - (i32.const 1) + (i32.add + (local.get $0) + (local.get $5) ) ) ) @@ -2121,7 +2159,7 @@ ) ) ) - (br_if $block + (br_if $block1 (i32.eqz (local.tee $4 (i8x16.bitmask @@ -2130,12 +2168,12 @@ ) ) ) - (loop $label - (br_if $block1 + (loop $label1 + (br_if $block2 (i32.eqz (call $bcmp (i32.add - (local.tee $5 + (local.tee $6 (i32.add (local.get $0) (i32.ctz @@ -2145,12 +2183,12 @@ ) (i32.const 1) ) - (local.get $7) - (local.get $6) + (local.get $9) + (local.get $8) ) ) ) - (br_if $label + (br_if $label1 (local.tee $4 (i32.and (i32.sub @@ -2166,7 +2204,7 @@ (if (i32.lt_u (local.get $1) - (local.tee $5 + (local.tee $7 (i32.sub (local.get $1) (i32.const 16) @@ -2182,7 +2220,7 @@ (if (i32.gt_u (local.get $3) - (local.get $5) + (local.get $7) ) (then (return @@ -2191,9 +2229,9 @@ ) ) (local.set $1 - (local.get $5) + (local.get $7) ) - (br_if $label1 + (br_if $label2 (i32.le_u (local.tee $0 (i32.add @@ -2201,26 +2239,26 @@ (i32.const 16) ) ) - (local.get $9) + (local.get $10) ) ) ) - (local.set $8 + (local.set $9 (i32.sub - (local.get $1) + (local.get $7) (local.get $3) ) ) ) ) - (local.set $1 + (local.set $5 (i32.const 0) ) - (loop $label2 + (loop $label3 (local.set $4 (i32.const 0) ) - (loop $label3 + (loop $label4 (if (i32.ne (i32.load8_u @@ -2243,24 +2281,24 @@ (i32.const 1) ) ) - (local.set $5 + (local.set $6 (i32.const 0) ) - (br_if $label2 + (br_if $label3 (i32.le_u - (local.tee $1 + (local.tee $5 (i32.add - (local.get $1) + (local.get $5) (i32.const 1) ) ) - (local.get $8) + (local.get $9) ) ) - (br $block1) + (br $block2) ) ) - (br_if $label3 + (br_if $label4 (i32.ne (local.get $3) (local.tee $4 @@ -2282,7 +2320,7 @@ (i32.const 4112) (select (i32.const -1) - (local.tee $7 + (local.tee $8 (i32.sub (local.get $3) (i32.const 1) @@ -2290,15 +2328,15 @@ ) (local.tee $5 (i32.gt_s - (local.get $7) + (local.get $8) (i32.const 254) ) ) ) (i32.const 256) ) - (block $block2 - (br_if $block2 + (block $block3 + (br_if $block3 (i32.ge_u (local.tee $6 (select @@ -2310,7 +2348,7 @@ (local.get $5) ) ) - (local.get $7) + (local.get $8) ) ) (if @@ -2364,7 +2402,7 @@ (local.set $6 (i32.add (local.get $6) - (local.tee $9 + (local.tee $7 (i32.and (local.get $10) (i32.const -16) @@ -2374,13 +2412,13 @@ ) (local.set $13 (i32x4.splat - (local.get $7) + (local.get $8) ) ) (local.set $5 - (local.get $9) + (local.get $7) ) - (loop $label4 + (loop $label5 (v128.store8_lane 0 (i32.add (i32x4.extract_lane 0 @@ -2635,7 +2673,7 @@ (v128.const i32x4 0x00000010 0x00000010 0x00000010 0x00000010) ) ) - (br_if $label4 + (br_if $label5 (local.tee $5 (i32.sub (local.get $5) @@ -2644,9 +2682,9 @@ ) ) ) - (br_if $block2 + (br_if $block3 (i32.eq - (local.get $9) + (local.get $7) (local.get $10) ) ) @@ -2667,7 +2705,7 @@ (i32.const 2) ) ) - (loop $label5 + (loop $label6 (i32.store8 (i32.add (i32.load8_u @@ -2683,7 +2721,7 @@ (i32.const 1) ) ) - (br_if $label5 + (br_if $label6 (i32.ne (local.tee $5 (i32.sub @@ -2696,60 +2734,102 @@ ) ) ) + (local.set $7 + (i32.load8_u + (local.get $2) + ) + ) + (local.set $5 + (local.get $8) + ) + (block $block4 + (loop $label7 + (br_if $block4 + (i32.ne + (local.get $7) + (local.tee $6 + (i32.load8_u + (i32.add + (local.get $2) + (local.get $5) + ) + ) + ) + ) + ) + (br_if $label7 + (local.tee $5 + (i32.sub + (local.get $5) + (i32.const 1) + ) + ) + ) + ) + (local.set $6 + (i32.load8_u + (i32.add + (local.get $2) + (local.get $8) + ) + ) + ) + (local.set $5 + (local.get $8) + ) + ) (if (i32.le_u (local.get $0) - (local.tee $9 + (local.tee $10 (i32.sub (i32.sub (i32.shl (memory.size) (i32.const 16) ) - (local.get $3) + (local.get $5) ) - (i32.const 15) + (i32.const 16) ) ) ) (then (local.set $11 - (v128.load8_splat - (i32.add - (local.get $2) - (local.get $7) - ) + (i8x16.splat + (local.get $6) ) ) (local.set $13 - (v128.load8_splat - (local.get $2) + (i8x16.splat + (local.get $7) ) ) - (local.set $6 + (local.set $7 + (i32.add + (local.get $3) + (i32.const 14) + ) + ) + (local.set $9 (i32.add (local.get $2) (i32.const 1) ) ) - (loop $label7 - (block $block3 - (br_if $block3 + (loop $label9 + (block $block5 + (br_if $block5 (i32.eqz (v128.any_true - (local.tee $16 + (local.tee $14 (v128.and (i8x16.eq (local.get $11) - (local.tee $14 - (v128.load align=1 - (i32.sub - (i32.add - (local.get $0) - (local.get $3) - ) - (i32.const 1) - ) + (v128.load align=1 + (i32.add + (local.get $0) + (local.get $5) ) ) ) @@ -2764,21 +2844,21 @@ ) ) ) - (br_if $block3 + (br_if $block5 (i32.eqz (local.tee $4 (i8x16.bitmask - (local.get $16) + (local.get $14) ) ) ) ) - (loop $label6 - (br_if $block1 + (loop $label8 + (br_if $block2 (i32.eqz (call $bcmp (i32.add - (local.tee $5 + (local.tee $6 (i32.add (local.get $0) (i32.ctz @@ -2788,12 +2868,12 @@ ) (i32.const 1) ) - (local.get $6) - (local.get $7) + (local.get $9) + (local.get $8) ) ) ) - (br_if $label6 + (br_if $label8 (local.tee $4 (i32.and (i32.sub @@ -2806,10 +2886,10 @@ ) ) ) - (local.set $5 + (local.set $6 (i32.const 0) ) - (br_if $block1 + (br_if $block2 (i32.lt_u (local.get $1) (local.tee $1 @@ -2819,8 +2899,11 @@ (i32.add (i32.load8_u (i32.add - (i8x16.extract_lane_s 15 - (local.get $14) + (i32.load8_u + (i32.add + (local.get $0) + (local.get $7) + ) ) (i32.const 4112) ) @@ -2832,13 +2915,13 @@ ) ) ) - (br_if $block1 + (br_if $block2 (i32.lt_u (local.get $1) (local.get $3) ) ) - (br_if $label7 + (br_if $label9 (i32.le_u (local.tee $0 (i32.add @@ -2846,11 +2929,11 @@ (local.get $4) ) ) - (local.get $9) + (local.get $10) ) ) ) - (local.set $8 + (local.set $9 (i32.sub (local.get $1) (local.get $3) @@ -2858,16 +2941,16 @@ ) ) ) - (local.set $1 + (local.set $5 (i32.const 0) ) - (loop $label9 + (loop $label11 (local.set $4 (i32.const 0) ) - (block $block4 - (loop $label8 - (br_if $block4 + (block $block6 + (loop $label10 + (br_if $block6 (i32.ne (i32.load8_u (i32.add @@ -2883,7 +2966,7 @@ ) ) ) - (br_if $label8 + (br_if $label10 (i32.ne (local.get $3) (local.tee $4 @@ -2905,23 +2988,23 @@ (i32.const 1) ) ) - (local.set $5 + (local.set $6 (i32.const 0) ) - (br_if $label9 + (br_if $label11 (i32.le_u - (local.tee $1 + (local.tee $5 (i32.add - (local.get $1) + (local.get $5) (i32.const 1) ) ) - (local.get $8) + (local.get $9) ) ) ) ) - (local.get $5) + (local.get $6) ) (func $strstr (param $0 i32) (param $1 i32) (result i32) (local $2 i32) diff --git a/sqlite3/libc/string.h b/sqlite3/libc/string.h index c05543a..8613c88 100644 --- a/sqlite3/libc/string.h +++ b/sqlite3/libc/string.h @@ -474,15 +474,21 @@ static const char *__memmem_rabin(const char *haystk, size_t sh, // The needle is no longer than haystack. __builtin_assume(2 <= sn && sn <= sh); + // Find the farthest character not equal to the first one. + size_t i = sn - 1; + while (i > 0 && needle[0] == needle[i]) i--; + if (i == 0) i = sn - 1; + const v128_t fst = wasm_i8x16_splat(needle[0]); - const v128_t lst = wasm_i8x16_splat(needle[sn - 1]); + const v128_t lst = wasm_i8x16_splat(needle[i]); + // The last haystk offset for which loading blk_lst is safe. - const char *H = (char *)(__builtin_wasm_memory_size(0) * PAGESIZE - sn + 1 - + const char *H = (char *)(__builtin_wasm_memory_size(0) * PAGESIZE - i - sizeof(v128_t)); while (haystk <= H) { const v128_t blk_fst = wasm_v128_load((v128_t *)(haystk)); - const v128_t blk_lst = wasm_v128_load((v128_t *)(haystk + sn - 1)); + const v128_t blk_lst = wasm_v128_load((v128_t *)(haystk + i)); const v128_t eq_fst = wasm_i8x16_eq(fst, blk_fst); const v128_t eq_lst = wasm_i8x16_eq(lst, blk_lst); @@ -492,10 +498,6 @@ static const char *__memmem_rabin(const char *haystk, size_t sh, // Each iteration clears that bit, tries again. for (uint32_t mask = wasm_i8x16_bitmask(cmp); mask; mask &= mask - 1) { size_t ctz = __builtin_ctz(mask); - // This could compare one less byte. - // Since bcmp compares left-to-right (this could change) - // the last byte only matters if we just found the needle. - // Otherwise this may help bcmp vectorize. if (!bcmp(haystk + ctz + 1, needle + 1, sn - 1)) { return haystk + ctz; } @@ -503,8 +505,9 @@ static const char *__memmem_rabin(const char *haystk, size_t sh, } size_t skip = sizeof(v128_t); - // Apply the bad-character rule to the last byte of the haystack. - if (bmbc) skip += bmbc[wasm_i8x16_extract_lane(blk_lst, 15)]; + // Apply the bad-character rule to the last checked + // character of the haystack. + if (bmbc) skip += bmbc[(unsigned char)haystk[sn + 14]]; // Have we reached the end of the haystack? if (__builtin_sub_overflow(sh, skip, &sh)) return NULL; // Is the needle longer than the haystack? @@ -641,8 +644,7 @@ static char *__stpcpy(char *__restrict dest, const char *__restrict src) { return dest + slen; } -static char *__stpncpy(char *__restrict dest, const char *__restrict src, - size_t n) { +static char *__stpncpy(char *__restrict dest, const char *__restrict src, size_t n) { size_t strnlen(const char *s, size_t n); size_t slen = strnlen(src, n); memcpy(dest, src, slen);