diff --git a/sqlite3/libc/libc.wasm b/sqlite3/libc/libc.wasm index be51968..9352b8e 100755 Binary files a/sqlite3/libc/libc.wasm and b/sqlite3/libc/libc.wasm differ diff --git a/sqlite3/libc/libc.wat b/sqlite3/libc/libc.wat index d988fcd..0f86ea1 100644 --- a/sqlite3/libc/libc.wat +++ b/sqlite3/libc/libc.wat @@ -1060,65 +1060,194 @@ (local $2 i32) (local $3 i32) (local $4 i32) - (local $5 v128) + (local $5 i32) (local $6 v128) + (local $7 v128) + (local $8 v128) + (local $9 v128) + (local $10 v128) (local $scratch i32) - (if - (i32.eqz - (local.tee $2 - (i32.load8_u - (local.get $1) + (local $scratch_12 i32) + (local.set $4 + (i32.sub + (i32.shl + (block (result i32) + (local.set $scratch + (memory.size) + ) + (if + (i32.eqz + (local.tee $2 + (i32.load8_u + (local.get $1) + ) + ) + ) + (then + (return + (i32.const 0) + ) + ) + ) + (local.get $scratch) ) + (i32.const 16) ) - ) - (then - (return - (i32.const 0) - ) + (i32.const 16) ) ) (block $block (if - (i32.eqz - (i32.load8_u offset=1 - (local.get $1) - ) + (i32.load8_u offset=1 + (local.get $1) ) (then - (if - (i32.ge_u - (local.tee $4 - (i32.sub - (i32.shl - (memory.size) - (i32.const 16) + (local.set $1 + (i32.add + (local.get $1) + (i32.const 1) + ) + ) + (loop $label + (v128.store + (i32.const 992) + (local.get $6) + ) + (i32.store8 + (local.tee $5 + (i32.or + (local.tee $3 + (i32.and + (local.get $2) + (i32.const 15) + ) ) - (i32.const 16) + (i32.const 992) ) ) - (local.tee $1 - (local.get $0) + (i32.or + (i32.load8_u + (local.get $5) + ) + (i32.shl + (i32.const 1) + (local.tee $2 + (i32.and + (i32.shr_u + (local.get $2) + (i32.const 4) + ) + (i32.const 15) + ) + ) + ) ) ) - (then - (local.set $5 - (i8x16.splat - (local.get $2) + (v128.store + (i32.const 1008) + (local.get $7) + ) + (i32.store8 + (local.tee $3 + (i32.or + (local.get $3) + (i32.const 1008) ) ) - (loop $label + (i32.or + (i32.load8_u + (local.get $3) + ) + (i32.shl + (i32.const 1) + (i32.sub + (local.get $2) + (i32.const 8) + ) + ) + ) + ) + 
(local.set $2 + (i32.load8_u + (local.get $1) + ) + ) + (local.set $6 + (v128.load + (i32.const 992) + ) + ) + (local.set $7 + (v128.load + (i32.const 1008) + ) + ) + (local.set $1 + (i32.add + (local.get $1) + (i32.const 1) + ) + ) + (br_if $label + (local.get $2) + ) + ) + (local.set $2 + (local.get $0) + ) + (if + (i32.le_u + (local.get $0) + (local.get $4) + ) + (then + (local.set $1 + (i32.const 0) + ) + (loop $label1 (if (i32.eqz (i8x16.all_true - (local.tee $6 + (local.tee $8 (i8x16.eq - (v128.load align=1 - (i32.add - (local.get $0) - (local.get $3) + (v128.and + (local.tee $9 + (i8x16.swizzle + (v128.const i32x4 0x08040201 0x80402010 0x08040201 0x80402010) + (i8x16.shr_u + (local.tee $8 + (v128.load align=1 + (i32.add + (local.get $0) + (local.get $1) + ) + ) + ) + (i32.const 4) + ) + ) + ) + (v128.bitselect + (i8x16.swizzle + (local.get $6) + (local.tee $10 + (v128.and + (local.get $8) + (v128.const i32x4 0x0f0f0f0f 0x0f0f0f0f 0x0f0f0f0f 0x0f0f0f0f) + ) + ) + ) + (i8x16.swizzle + (local.get $7) + (local.get $10) + ) + (i8x16.gt_s + (local.get $8) + (v128.const i32x4 0xffffffff 0xffffffff 0xffffffff 0xffffffff) + ) ) ) - (local.get $5) + (local.get $9) ) ) ) @@ -1129,24 +1258,24 @@ (i32.ctz (i32.xor (i8x16.bitmask - (local.get $6) + (local.get $8) ) (i32.const -1) ) ) - (local.get $3) + (local.get $1) ) ) ) ) - (br_if $label + (br_if $label1 (i32.le_u - (local.tee $1 + (local.tee $2 (i32.add (local.get $0) - (local.tee $3 + (local.tee $1 (i32.add - (local.get $3) + (local.get $1) (i32.const 16) ) ) @@ -1158,25 +1287,29 @@ ) ) ) - (local.set $0 + (local.set $1 (i32.add (i32.xor (local.get $0) (i32.const -1) ) - (local.get $1) + (local.get $2) ) ) - (loop $label1 - (local.set $0 - (i32.add - (local.get $0) - (i32.const 1) - ) - ) - (local.set $3 - (i32.load8_u - (local.get $1) + (loop $label2 + (v128.store + (i32.const 976) + (select + (local.get $7) + (local.get $6) + (i32.lt_s + (local.tee $0 + (i32.load8_s + (local.get $2) + ) + ) + (i32.const 0) + ) ) ) 
(local.set $1 @@ -1185,158 +1318,161 @@ (i32.const 1) ) ) - (br_if $label1 - (i32.eq + (local.set $2 + (i32.add (local.get $2) - (local.get $3) + (i32.const 1) + ) + ) + (br_if $label2 + (i32.and + (i32.shr_u + (i32.load8_u + (i32.or + (i32.and + (local.get $0) + (i32.const 15) + ) + (i32.const 976) + ) + ) + (i32.and + (i32.shr_u + (local.get $0) + (i32.const 4) + ) + (i32.const 7) + ) + ) + (i32.const 1) ) ) ) (br $block) ) ) - (memory.fill - (i32.const 1040) - (i32.const 0) - (i32.const 256) + (local.set $3 + (local.get $0) + ) + (if + (i32.le_u + (local.get $0) + (local.get $4) + ) + (then + (local.set $6 + (i8x16.splat + (local.get $2) + ) + ) + (local.set $1 + (i32.const 0) + ) + (loop $label3 + (if + (i32.eqz + (i8x16.all_true + (local.tee $7 + (i8x16.eq + (v128.load align=1 + (i32.add + (local.get $0) + (local.get $1) + ) + ) + (local.get $6) + ) + ) + ) + ) + (then + (return + (i32.add + (i32.ctz + (i32.xor + (i8x16.bitmask + (local.get $7) + ) + (i32.const -1) + ) + ) + (local.get $1) + ) + ) + ) + ) + (br_if $label3 + (i32.le_u + (local.tee $3 + (i32.add + (local.get $0) + (local.tee $1 + (i32.add + (local.get $1) + (i32.const 16) + ) + ) + ) + ) + (local.get $4) + ) + ) + ) + ) ) (local.set $1 (i32.add - (local.get $1) - (i32.const 1) + (i32.xor + (local.get $0) + (i32.const -1) + ) + (local.get $3) ) ) - (loop $label2 - (i32.store8 - (i32.add - (i32.and - (local.get $2) - (i32.const 255) - ) - (i32.const 1040) - ) - (i32.const 1) - ) - (local.set $2 - (i32.load8_u - (local.get $1) - ) - ) + (loop $label4 (local.set $1 (i32.add (local.get $1) (i32.const 1) ) ) - (br_if $label2 - (local.get $2) - ) - ) - (local.set $2 - (local.get $0) - ) - (block $block1 - (block $block2 - (block $block3 - (loop $label3 - (br_if $block1 - (i32.eqz - (i32.load8_u - (i32.add - (i32.load8_u - (local.get $2) - ) - (i32.const 1040) - ) - ) - ) - ) - (br_if $block2 - (i32.eqz - (i32.load8_u - (i32.add - (i32.load8_u offset=1 - (local.get $2) - ) - (i32.const 1040) - ) - ) - ) 
- ) - (br_if $block3 - (i32.eqz - (i32.load8_u - (i32.add - (i32.load8_u offset=2 - (local.get $2) - ) - (i32.const 1040) - ) - ) - ) - ) - (br_if $label3 + (br_if $label4 + (i32.eq + (block (result i32) + (local.set $scratch_12 (i32.load8_u - (i32.add - (block (result i32) - (local.set $scratch - (i32.load8_u offset=3 - (local.get $2) - ) - ) - (local.set $2 - (i32.add - (local.get $2) - (i32.const 4) - ) - ) - (local.get $scratch) - ) - (i32.const 1040) - ) + (local.get $3) ) ) - ) - (local.set $2 - (i32.sub - (local.get $2) - (i32.const 1) + (local.set $3 + (i32.add + (local.get $3) + (i32.const 1) + ) ) + (local.get $scratch_12) ) - (br $block1) - ) - (local.set $2 - (i32.add - (local.get $2) - (i32.const 2) - ) - ) - (br $block1) - ) - (local.set $2 - (i32.add (local.get $2) - (i32.const 1) ) ) ) - (local.set $0 - (i32.sub - (local.get $2) - (local.get $0) - ) - ) ) - (local.get $0) + (local.get $1) ) (func $strcspn (param $0 i32) (param $1 i32) (result i32) (local $2 i32) - (local $3 v128) - (local $4 v128) + (local $3 i32) + (local $4 i32) + (local $5 i32) + (local $6 v128) + (local $7 v128) + (local $8 v128) + (local $9 v128) + (local $10 v128) (local $scratch i32) (block $block (if - (local.tee $2 + (local.tee $3 (i32.load8_u (local.get $1) ) @@ -1352,12 +1488,12 @@ (block $block1 (if (v128.any_true - (local.tee $3 + (local.tee $6 (v128.or (i8x16.eq - (local.tee $3 + (local.tee $6 (v128.load - (local.tee $1 + (local.tee $2 (i32.and (local.get $0) (i32.const -16) @@ -1368,10 +1504,10 @@ (v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000) ) (i8x16.eq - (local.get $3) - (local.tee $4 + (local.get $6) + (local.tee $7 (i8x16.splat - (local.get $2) + (local.get $3) ) ) ) @@ -1380,10 +1516,10 @@ ) (then (br_if $block1 - (local.tee $2 + (local.tee $1 (i32.and (i8x16.bitmask - (local.get $3) + (local.get $6) ) (i32.shl (i32.const -1) @@ -1398,29 +1534,29 @@ ) ) (loop $label - (local.set $3 + (local.set $6 (v128.load offset=16 - (local.get $1) + 
(local.get $2) ) ) - (local.set $1 + (local.set $2 (i32.add - (local.get $1) + (local.get $2) (i32.const 16) ) ) (br_if $label (i32.eqz (v128.any_true - (local.tee $3 + (local.tee $6 (v128.or (i8x16.eq - (local.get $3) + (local.get $6) (v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000) ) (i8x16.eq - (local.get $3) - (local.get $4) + (local.get $6) + (local.get $7) ) ) ) @@ -1428,143 +1564,271 @@ ) ) ) - (local.set $2 + (local.set $1 (i8x16.bitmask - (local.get $3) + (local.get $6) ) ) ) (return (i32.sub (i32.add - (local.get $1) + (local.get $2) (i32.ctz - (local.get $2) + (local.get $1) ) ) (local.get $0) ) ) ) - (memory.fill - (i32.const 1296) - (i32.const 0) - (i32.const 256) - ) - (loop $label1 - (i32.store8 - (i32.add - (local.tee $2 - (i32.load8_u - (local.get $1) - ) - ) - (i32.const 1296) - ) - (i32.const 1) - ) - (local.set $1 - (i32.add - (local.get $1) - (i32.const 1) - ) - ) - (br_if $label1 - (local.get $2) - ) - ) - (local.set $1 - (local.get $0) - ) - (block $block2 - (block $block3 - (block $block4 - (loop $label2 - (br_if $block2 - (i32.load8_u - (i32.add - (i32.load8_u - (local.get $1) - ) - (i32.const 1296) + (if + (i32.ge_u + (local.tee $3 + (i32.sub + (block (result i32) + (local.set $scratch + (i32.shl + (memory.size) + (i32.const 16) ) ) - ) - (br_if $block3 - (i32.load8_u - (i32.add - (i32.load8_u offset=1 - (local.get $1) - ) - (i32.const 1296) + (loop $label1 + (v128.store + (i32.const 1008) + (local.get $6) ) - ) - ) - (br_if $block4 - (i32.load8_u - (i32.add - (i32.load8_u offset=2 - (local.get $1) - ) - (i32.const 1296) - ) - ) - ) - (br_if $label2 - (i32.eqz - (i32.load8_u - (i32.add - (block (result i32) - (local.set $scratch - (i32.load8_u offset=3 - (local.get $1) + (i32.store8 + (i32.or + (local.tee $3 + (i32.and + (local.tee $2 + (i32.load8_u + (local.get $1) + ) ) + (i32.const 15) ) - (local.set $1 - (i32.add - (local.get $1) + ) + (i32.const 1008) + ) + (i32.or + (i32.load8_u + (i32.or + (local.get $3) + (i32.const 
1008) + ) + ) + (i32.shl + (i32.const 1) + (i32.sub + (local.tee $5 + (i32.shr_u + (local.get $2) + (i32.const 4) + ) + ) + (i32.const 8) + ) + ) + ) + ) + (v128.store + (i32.const 992) + (local.get $7) + ) + (i32.store8 + (local.tee $3 + (i32.or + (local.get $3) + (i32.const 992) + ) + ) + (i32.or + (i32.load8_u + (local.get $3) + ) + (i32.shl + (i32.const 1) + (local.get $5) + ) + ) + ) + (local.set $1 + (i32.add + (local.get $1) + (i32.const 1) + ) + ) + (local.set $6 + (v128.load + (i32.const 1008) + ) + ) + (local.set $7 + (v128.load + (i32.const 992) + ) + ) + (br_if $label1 + (local.get $2) + ) + ) + (local.get $scratch) + ) + (i32.const 16) + ) + ) + (local.tee $1 + (local.get $0) + ) + ) + (then + (local.set $2 + (i32.const 0) + ) + (loop $label2 + (if + (v128.any_true + (local.tee $8 + (i8x16.eq + (v128.and + (local.tee $9 + (i8x16.swizzle + (v128.const i32x4 0x08040201 0x80402010 0x08040201 0x80402010) + (i8x16.shr_u + (local.tee $8 + (v128.load align=1 + (i32.add + (local.get $0) + (local.get $2) + ) + ) + ) (i32.const 4) ) ) - (local.get $scratch) ) - (i32.const 1296) + (v128.bitselect + (i8x16.swizzle + (local.get $7) + (local.tee $10 + (v128.and + (local.get $8) + (v128.const i32x4 0x0f0f0f0f 0x0f0f0f0f 0x0f0f0f0f 0x0f0f0f0f) + ) + ) + ) + (i8x16.swizzle + (local.get $6) + (local.get $10) + ) + (i8x16.gt_s + (local.get $8) + (v128.const i32x4 0xffffffff 0xffffffff 0xffffffff 0xffffffff) + ) + ) ) + (local.get $9) + ) + ) + ) + (then + (return + (i32.add + (i32.ctz + (i8x16.bitmask + (local.get $8) + ) + ) + (local.get $2) ) ) ) ) - (return - (i32.sub - (i32.sub - (local.get $1) - (i32.const 1) + (br_if $label2 + (i32.le_u + (local.tee $1 + (i32.add + (local.get $0) + (local.tee $2 + (i32.add + (local.get $2) + (i32.const 16) + ) + ) + ) ) - (local.get $0) + (local.get $3) ) ) ) - (return - (i32.sub - (i32.add - (local.get $1) - (i32.const 2) + ) + ) + (local.set $0 + (i32.add + (i32.xor + (local.get $0) + (i32.const -1) + ) + (local.get $1) + ) + ) + 
(loop $label3 + (v128.store + (i32.const 976) + (select + (local.get $6) + (local.get $7) + (i32.lt_s + (local.tee $2 + (i32.load8_s + (local.get $1) + ) ) - (local.get $0) + (i32.const 0) ) ) ) + (local.set $0 + (i32.add + (local.get $0) + (i32.const 1) + ) + ) (local.set $1 (i32.add (local.get $1) (i32.const 1) ) ) + (br_if $label3 + (i32.eqz + (i32.and + (i32.shr_u + (i32.load8_u + (i32.or + (i32.and + (local.get $2) + (i32.const 15) + ) + (i32.const 976) + ) + ) + (i32.and + (i32.shr_u + (local.get $2) + (i32.const 4) + ) + (i32.const 7) + ) + ) + (i32.const 1) + ) + ) + ) ) - (i32.sub - (local.get $1) - (local.get $0) - ) + (local.get $0) ) (func $memccpy (param $0 i32) (param $1 i32) (param $2 i32) (param $3 i32) (result i32) (memory.copy diff --git a/sqlite3/libc/string.h b/sqlite3/libc/string.h index 67cc2ea..3de90f7 100644 --- a/sqlite3/libc/string.h +++ b/sqlite3/libc/string.h @@ -334,21 +334,17 @@ char *strrchr(const char *s, int c) { __attribute__((weak)) size_t strspn(const char *s, const char *c) { -#ifndef _REENTRANT - static // Avoid the stack for builds without threads. -#endif - char byteset[UCHAR_MAX + 1]; + // Set limit to the largest possible valid v128_t pointer. + // Unsigned modular arithmetic gives the correct result + // unless memory size is zero, in which case all pointers are invalid. + const v128_t *const limit = + (v128_t *)(__builtin_wasm_memory_size(0) * PAGESIZE) - 1; + + const v128_t *w = (v128_t *)s; const char *const a = s; if (!c[0]) return 0; if (!c[1]) { - // Set limit to the largest possible valid v128_t pointer. - // Unsigned modular arithmetic gives the correct result - // unless memory size is zero, in which case all pointers are invalid. 
-    const v128_t *const limit =
-        (v128_t *)(__builtin_wasm_memory_size(0) * PAGESIZE) - 1;
-
-    const v128_t *w = (v128_t *)s;
     const v128_t wc = wasm_i8x16_splat(*c);
     while (w <= limit) {
       const v128_t cmp = wasm_i8x16_eq(wasm_v128_load(w), wc);
@@ -361,39 +357,129 @@ size_t strspn(const char *s, const char *c) {
     }
 
     // Baseline algorithm.
-    s = (char *)w;
-    while (*s == *c) s++;
+    for (s = (char *)w; *s == *c; s++);
     return s - a;
   }
 
-  memset(byteset, 0, sizeof(byteset));
-  // Keeping byteset[0] = 0 avoids the next loop needing that check.
-  while (*c && (byteset[*(unsigned char *)c] = 1)) c++;
-#if __OPTIMIZE__ && !__OPTIMIZE_SIZE__
-#pragma unroll 4
-#endif
-  while (byteset[*(unsigned char *)s]) s++;
-  return s - a;
+  // http://0x80.pl/notesen/2018-10-18-simd-byte-lookup.html
+  // Membership is kept as two 16-byte bitmaps indexed by low nibble:
+  // bit n of bitmap07[lo] stands for byte (n << 4 | lo), and
+  // bit n of bitmap8f[lo] stands for byte ((n + 8) << 4 | lo).
+  typedef unsigned char u8x16
+      __attribute__((__vector_size__(16), __aligned__(16)));
+
+  u8x16 bitmap07 = {};
+  u8x16 bitmap8f = {};
+  for (; *c; c++) {
+    unsigned lo_nibble = *(unsigned char *)c % 16;
+    unsigned hi_nibble = *(unsigned char *)c / 16;
+    // Shift by hi_nibble itself (always 0..15). Shifting by hi_nibble - 8
+    // is undefined for hi_nibble < 8: the unsigned count wraps past 31.
+    const unsigned bit = 1u << hi_nibble;
+    bitmap07[lo_nibble] |= bit;       // u8 store keeps bits 0..7.
+    bitmap8f[lo_nibble] |= bit >> 8;  // Bits 8..15 move down to 0..7.
+    // Terminator IS NOT on the bitmap.
+  }
+
+  for (; w <= limit; w++) {
+    const v128_t lo_nibbles = wasm_v128_load(w) & wasm_u8x16_const_splat(0xf);
+    const v128_t hi_nibbles = wasm_u8x16_shr(wasm_v128_load(w), 4);
+
+    const v128_t bitmask_lookup =
+        wasm_u8x16_const(1, 2, 4, 8, 16, 32, 64, 128,  //
+                         1, 2, 4, 8, 16, 32, 64, 128);
+
+    const v128_t bitmask = wasm_i8x16_swizzle(bitmask_lookup, hi_nibbles);
+    const v128_t bitsets = wasm_v128_bitselect(
+        wasm_i8x16_swizzle(bitmap07, lo_nibbles),
+        wasm_i8x16_swizzle(bitmap8f, lo_nibbles),
+        wasm_i8x16_lt(hi_nibbles, wasm_u8x16_const_splat(8)));
+
+    // A lane of cmp is all-ones when that input byte is in the set.
+    const v128_t cmp = wasm_i8x16_eq(bitsets & bitmask, bitmask);
+    if (!wasm_i8x16_all_true(cmp)) {
+      size_t ctz = __builtin_ctz(~wasm_i8x16_bitmask(cmp));
+      return (char *)w + ctz - s;
+    }
+  }
+
+  // Baseline algorithm.
+  for (s = (char *)w;; s++) {
+    const unsigned lo_nibble = *(unsigned char *)s & 0xf;
+    const unsigned hi_nibble = *(unsigned char *)s >> 4;
+    const unsigned bitmask = 1 << (hi_nibble & 0x7);
+    const unsigned bitset =
+        hi_nibble < 8 ? bitmap07[lo_nibble] : bitmap8f[lo_nibble];
+    if ((bitset & bitmask) == 0) return s - a;
+  }
 }
 
+// Returns the length of the initial segment of s that contains
+// no byte from c (the terminator of s always ends the segment).
 __attribute__((weak))
 size_t strcspn(const char *s, const char *c) {
-#ifndef _REENTRANT
-  static  // Avoid the stack for builds without threads.
-#endif
-      char byteset[UCHAR_MAX + 1];
-  const char *const a = s;
-
   if (!c[0] || !c[1]) return __strchrnul(s, *c) - s;
 
-  memset(byteset, 0, sizeof(byteset));
-  // Setting byteset[0] = 1 avoids the next loop needing that check.
-  while ((byteset[*(unsigned char *)c] = 1) && *c) c++;
-#if __OPTIMIZE__ && !__OPTIMIZE_SIZE__
-#pragma unroll 4
-#endif
-  while (!byteset[*(unsigned char *)s]) s++;
-  return s - a;
+  // Set limit to the largest possible valid v128_t pointer.
+  // Unsigned modular arithmetic gives the correct result
+  // unless memory size is zero, in which case all pointers are invalid.
+  const v128_t *const limit =
+      (v128_t *)(__builtin_wasm_memory_size(0) * PAGESIZE) - 1;
+
+  const v128_t *w = (v128_t *)s;
+  const char *const a = s;
+
+  // http://0x80.pl/notesen/2018-10-18-simd-byte-lookup.html
+  // Membership is kept as two 16-byte bitmaps indexed by low nibble:
+  // bit n of bitmap07[lo] stands for byte (n << 4 | lo), and
+  // bit n of bitmap8f[lo] stands for byte ((n + 8) << 4 | lo).
+  typedef unsigned char u8x16
+      __attribute__((__vector_size__(16), __aligned__(16)));
+
+  u8x16 bitmap07 = {};
+  u8x16 bitmap8f = {};
+  for (;; c++) {
+    unsigned lo_nibble = *(unsigned char *)c % 16;
+    unsigned hi_nibble = *(unsigned char *)c / 16;
+    // Shift by hi_nibble itself (always 0..15). Shifting by hi_nibble - 8
+    // is undefined for hi_nibble < 8: the unsigned count wraps past 31.
+    const unsigned bit = 1u << hi_nibble;
+    bitmap07[lo_nibble] |= bit;       // u8 store keeps bits 0..7.
+    bitmap8f[lo_nibble] |= bit >> 8;  // Bits 8..15 move down to 0..7.
+    if (!*c) break;  // Terminator IS on the bitmap.
+  }
+
+  for (; w <= limit; w++) {
+    const v128_t lo_nibbles = wasm_v128_load(w) & wasm_u8x16_const_splat(0xf);
+    const v128_t hi_nibbles = wasm_u8x16_shr(wasm_v128_load(w), 4);
+
+    const v128_t bitmask_lookup =
+        wasm_u8x16_const(1, 2, 4, 8, 16, 32, 64, 128,  //
+                         1, 2, 4, 8, 16, 32, 64, 128);
+
+    const v128_t bitmask = wasm_i8x16_swizzle(bitmask_lookup, hi_nibbles);
+    const v128_t bitsets = wasm_v128_bitselect(
+        wasm_i8x16_swizzle(bitmap07, lo_nibbles),
+        wasm_i8x16_swizzle(bitmap8f, lo_nibbles),
+        wasm_i8x16_lt(hi_nibbles, wasm_u8x16_const_splat(8)));
+
+    // A lane of cmp is all-ones when that input byte is in the set.
+    const v128_t cmp = wasm_i8x16_eq(bitsets & bitmask, bitmask);
+    if (wasm_v128_any_true(cmp)) {
+      size_t ctz = __builtin_ctz(wasm_i8x16_bitmask(cmp));
+      return (char *)w + ctz - s;
+    }
+  }
+
+  // Baseline algorithm.
+  for (s = (char *)w;; s++) {
+    const unsigned lo_nibble = *(unsigned char *)s & 0xf;
+    const unsigned hi_nibble = *(unsigned char *)s >> 4;
+    const unsigned bitmask = 1 << (hi_nibble & 0x7);
+    const unsigned bitset =
+        hi_nibble < 8 ? bitmap07[lo_nibble] : bitmap8f[lo_nibble];
+    if (bitset & bitmask) return s - a;
+  }
 }
 
 // Given the above SIMD implementations,