diff --git a/sqlite3/libc/string.h b/sqlite3/libc/string.h
index 9ef1e41..ab83ded 100644
--- a/sqlite3/libc/string.h
+++ b/sqlite3/libc/string.h
@@ -242,17 +242,21 @@ char *strrchr(const char *s, int c) {
 
 // SIMDized check which bytes are in a set (Geoff Langdale)
 // http://0x80.pl/notesen/2018-10-18-simd-byte-lookup.html
+// This is the same algorithm as truffle from Hyperscan:
+// https://github.com/intel/hyperscan/blob/v5.4.2/src/nfa/truffle.c#L64-L81
+// https://github.com/intel/hyperscan/blob/v5.4.2/src/nfa/trufflecompile.cpp
+
 typedef struct {
   __u8x16 lo;
   __u8x16 hi;
 } __wasm_v128_bitmap256_t;
 
 __attribute__((always_inline))
-static void __wasm_v128_setbit(__wasm_v128_bitmap256_t *bitmap, int i) {
-  uint8_t hi_nibble = (uint8_t)i >> 4;
-  uint8_t lo_nibble = (uint8_t)i & 0xf;
-  bitmap->lo[lo_nibble] |= (uint8_t)((uint32_t)1 << (hi_nibble - 0));
-  bitmap->hi[lo_nibble] |= (uint8_t)((uint32_t)1 << (hi_nibble - 8));
+static void __wasm_v128_setbit(__wasm_v128_bitmap256_t *bitmap, uint8_t i) {
+  uint8_t hi_nibble = i >> 4;
+  uint8_t lo_nibble = i & 0xf;
+  bitmap->lo[lo_nibble] |= (uint8_t)(1u << (hi_nibble - 0));
+  bitmap->hi[lo_nibble] |= (uint8_t)(1u << (hi_nibble - 8));
 }
 
 #ifndef __wasm_relaxed_simd__
@@ -264,18 +268,17 @@ static void __wasm_v128_setbit(__wasm_v128_bitmap256_t *bitmap, int i) {
 __attribute__((always_inline))
 static v128_t __wasm_v128_chkbits(__wasm_v128_bitmap256_t bitmap, v128_t v) {
   v128_t hi_nibbles = wasm_u8x16_shr(v, 4);
-  v128_t bitmask_lookup = wasm_u8x16_const(1, 2, 4, 8, 16, 32, 64, 128,  //
-                                           1, 2, 4, 8, 16, 32, 64, 128);
+  v128_t bitmask_lookup = wasm_u64x2_const_splat(0x8040201008040201);
   v128_t bitmask = wasm_i8x16_relaxed_swizzle(bitmask_lookup, hi_nibbles);
 
   v128_t indices_0_7 = v & wasm_u8x16_const_splat(0x8f);
   v128_t indices_8_15 = indices_0_7 ^ wasm_u8x16_const_splat(0x80);
 
-  v128_t row_0_7 = wasm_i8x16_swizzle(bitmap.lo, indices_0_7);
-  v128_t row_8_15 = wasm_i8x16_swizzle(bitmap.hi, indices_8_15);
+  v128_t row_0_7 = wasm_i8x16_swizzle((v128_t)bitmap.lo, indices_0_7);
+  v128_t row_8_15 = wasm_i8x16_swizzle((v128_t)bitmap.hi, indices_8_15);
 
   v128_t bitsets = row_0_7 | row_8_15;
-  return wasm_i8x16_eq(bitsets & bitmask, bitmask);
+  return bitsets & bitmask;
 }
 
 #undef wasm_i8x16_relaxed_swizzle
@@ -317,17 +320,18 @@ size_t strspn(const char *s, const char *c) {
 
   for (; *c; c++) {
     // Terminator IS NOT on the bitmap.
-    __wasm_v128_setbit(&bitmap, *c);
+    __wasm_v128_setbit(&bitmap, (uint8_t)*c);
   }
 
   for (;;) {
     v128_t v = *(v128_t *)addr;
-    v128_t cmp = __wasm_v128_chkbits(bitmap, v);
+    v128_t found = __wasm_v128_chkbits(bitmap, v);
     // Bitmask is slow on AArch64, all_true is much faster.
-    if (!wasm_i8x16_all_true(cmp)) {
+    if (!wasm_i8x16_all_true(found)) {
+      v128_t cmp = wasm_i8x16_eq(found, (v128_t){});
       // Clear the bits corresponding to align (little-endian)
       // so we can count trailing zeros.
-      int mask = (uint16_t)~wasm_i8x16_bitmask(cmp) >> align << align;
+      int mask = wasm_i8x16_bitmask(cmp) >> align << align;
       // At least one bit will be set, unless align cleared them.
       // Knowing this helps the compiler if it unrolls the loop.
       __builtin_assume(mask || align);
@@ -356,17 +360,18 @@ size_t strcspn(const char *s, const char *c) {
 
   do {
     // Terminator IS on the bitmap.
-    __wasm_v128_setbit(&bitmap, *c);
+    __wasm_v128_setbit(&bitmap, (uint8_t)*c);
   } while (*c++);
 
   for (;;) {
     v128_t v = *(v128_t *)addr;
-    v128_t cmp = __wasm_v128_chkbits(bitmap, v);
+    v128_t found = __wasm_v128_chkbits(bitmap, v);
     // Bitmask is slow on AArch64, any_true is much faster.
-    if (wasm_v128_any_true(cmp)) {
+    if (wasm_v128_any_true(found)) {
+      v128_t cmp = wasm_i8x16_eq(found, (v128_t){});
       // Clear the bits corresponding to align (little-endian)
       // so we can count trailing zeros.
-      int mask = wasm_i8x16_bitmask(cmp) >> align << align;
+      int mask = (uint16_t)~wasm_i8x16_bitmask(cmp) >> align << align;
       // At least one bit will be set, unless align cleared them.
       // Knowing this helps the compiler if it unrolls the loop.
       __builtin_assume(mask || align);
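
Reviewer note: __wasm_v128_chkbits implements Langdale's byte-in-set lookup, the truffle algorithm referenced in the comments above. A minimal scalar sketch of the same 256-entry bitmap follows; it is illustrative only, and the names bitmap256, setbit, and chkbit are invented here, not part of the patch.

#include <stdbool.h>
#include <stdint.h>

// Byte value c maps to row (c & 0xf) and bit (c >> 4): rows for high
// nibbles 0..7 live in lo, rows for high nibbles 8..15 live in hi.
typedef struct {
  uint8_t lo[16];
  uint8_t hi[16];
} bitmap256;

static void setbit(bitmap256 *b, uint8_t c) {
  uint8_t hi_nibble = c >> 4;
  uint8_t lo_nibble = c & 0xf;
  if (hi_nibble < 8)
    b->lo[lo_nibble] |= (uint8_t)(1u << hi_nibble);
  else
    b->hi[lo_nibble] |= (uint8_t)(1u << (hi_nibble - 8));
}

static bool chkbit(const bitmap256 *b, uint8_t c) {
  uint8_t hi_nibble = c >> 4;
  uint8_t lo_nibble = c & 0xf;
  // The SIMD version does both lookups unconditionally: two swizzles
  // whose 0x8f/0x80 index masks zero out the half that does not apply,
  // then ORs the rows and tests the bit with bitsets & bitmask.
  uint8_t row = hi_nibble < 8 ? b->lo[lo_nibble] : b->hi[lo_nibble];
  return row & (uint8_t)(1u << (hi_nibble & 7));
}

The patch's branchless __wasm_v128_setbit folds the if/else above into two unconditional ORs: one of the two shift counts is always out of byte range, so the (uint8_t) cast discards that contribution (the negative count case leans on WebAssembly's modulo shift semantics). Returning the raw bitsets & bitmask vector instead of a full wasm_i8x16_eq also shortens the hot loop: strspn and strcspn now materialize the compare against zero only on the slow path, after all_true/any_true has fired, which is why the bitmask/~bitmask polarity swapped between the two callers.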
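Likewise, a scalar model of the slow-path bookkeeping in strspn/strcspn, under the assumption (not shown in these hunks) that addr is s rounded down to a 16-byte boundary and align = s - addr counts the leading lanes that precede s:

#include <stddef.h>
#include <stdint.h>

// Lane k of wasm_i8x16_bitmask lands in bit k (little-endian), so
// shifting right then left by align clears hits in lanes before s,
// and counting trailing zeros locates the first genuine hit.
static size_t first_hit(uint16_t lanes, unsigned align) {
  unsigned mask = (unsigned)lanes >> align << align;
  if (!mask) return 16;  // no hit in this block; the loop continues
  return (size_t)__builtin_ctz(mask);
}

This also motivates the __builtin_assume(mask || align): when align is zero nothing can be cleared, so at least one bit survives whenever the branch is taken.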