mirror of https://github.com/ncruces/go-sqlite3.git

Fix.
Binary file not shown.
@@ -36,103 +36,130 @@
(local.get $0)
)
(func $memcmp (param $0 i32) (param $1 i32) (param $2 i32) (result i32)
(local $3 i32)
(local $3 v128)
(local $4 i32)
(block $block1
(block $block
(local $5 i32)
(block $block
(br_if $block
(i32.lt_u
(local.get $2)
(i32.const 16)
)
)
(loop $label
(if
(i32.ge_u
(local.get $2)
(i32.const 16)
)
(then
(loop $label
(br_if $block
(v128.any_true
(v128.xor
(v128.load align=1
(local.get $1)
)
(v128.load align=1
(local.get $0)
)
)
)
)
(local.set $1
(i32.add
(local.get $1)
(i32.const 16)
)
)
(local.set $0
(i32.add
(i8x16.all_true
(local.tee $3
(i8x16.eq
(v128.load align=1
(local.get $0)
(i32.const 16)
)
)
(br_if $label
(i32.gt_u
(local.tee $2
(i32.sub
(local.get $2)
(i32.const 16)
)
)
(i32.const 15)
(v128.load align=1
(local.get $1)
)
)
)
)
)
(br_if $block1
(i32.eqz
(local.get $2)
(then
(local.set $1
(i32.add
(local.get $1)
(i32.const 16)
)
)
(local.set $0
(i32.add
(local.get $0)
(i32.const 16)
)
)
(br_if $label
(i32.gt_u
(local.tee $2
(i32.sub
(local.get $2)
(i32.const 16)
)
)
(i32.const 15)
)
)
(br $block)
)
)
)
(loop $label1
(if
(i32.ne
(local.tee $3
(i32.load8_u
(local.get $0)
)
)
(local.tee $4
(i32.load8_u
(local.get $1)
(return
(i32.sub
(i32.load8_u
(i32.add
(local.get $0)
(local.tee $2
(i32.ctz
(i32.xor
(i8x16.bitmask
(local.get $3)
)
(i32.const -1)
)
)
)
)
)
(then
(return
(i32.sub
(local.get $3)
(local.get $4)
)
)
)
)
(local.set $1
(i32.add
(local.get $1)
(i32.const 1)
)
)
(local.set $0
(i32.add
(local.get $0)
(i32.const 1)
)
)
(br_if $label1
(local.tee $2
(i32.sub
(i32.load8_u
(i32.add
(local.get $1)
(local.get $2)
)
)
)
)
)
(if
(local.get $2)
(then
(loop $label1
(if
(i32.ne
(local.tee $4
(i32.load8_u
(local.get $0)
)
)
(local.tee $5
(i32.load8_u
(local.get $1)
)
)
)
(then
(return
(i32.sub
(local.get $4)
(local.get $5)
)
)
)
)
(local.set $1
(i32.add
(local.get $1)
(i32.const 1)
)
)
(local.set $0
(i32.add
(local.get $0)
(i32.const 1)
)
)
(br_if $label1
(local.tee $2
(i32.sub
(local.get $2)
(i32.const 1)
)
)
)
)
)
)
@@ -886,7 +913,9 @@
(func $strspn (param $0 i32) (param $1 i32) (result i32)
(local $2 i32)
(local $3 i32)
(local $4 v128)
(local $4 i32)
(local $5 v128)
(local $6 v128)
(local $scratch i32)
(if
(i32.eqz
@@ -902,7 +931,7 @@
)
)
)
(block $block1
(block $block
(if
(i32.eqz
(i32.load8_u offset=1
@@ -910,50 +939,75 @@
)
)
(then
(block $block
(br_if $block
(i32.gt_u
(local.tee $1
(local.get $0)
)
(local.tee $3
(i32.sub
(i32.shl
(memory.size)
(i32.const 16)
)
(if
(i32.ge_u
(local.tee $4
(i32.sub
(i32.shl
(memory.size)
(i32.const 16)
)
(i32.const 16)
)
)
)
(local.set $4
(i8x16.splat
(local.get $2)
(local.tee $1
(local.get $0)
)
)
(loop $label
(br_if $block
(i32.eqz
(i8x16.all_true
(i8x16.eq
(v128.load align=1
(local.get $1)
(then
(local.set $5
(i8x16.splat
(local.get $2)
)
)
(loop $label
(if
(i32.eqz
(i8x16.all_true
(local.tee $6
(i8x16.eq
(v128.load align=1
(i32.add
(local.get $0)
(local.get $3)
)
)
(local.get $5)
)
)
)
)
(then
(return
(i32.add
(i32.ctz
(i32.xor
(i8x16.bitmask
(local.get $6)
)
(i32.const -1)
)
)
(local.get $3)
)
(local.get $4)
)
)
)
)
(br_if $label
(i32.le_u
(local.tee $1
(i32.add
(local.get $1)
(i32.const 16)
(br_if $label
(i32.le_u
(local.tee $1
(i32.add
(local.get $0)
(local.tee $3
(i32.add
(local.get $3)
(i32.const 16)
)
)
)
)
(local.get $4)
)
(local.get $3)
)
)
)
@@ -992,7 +1046,7 @@
)
)
)
(br $block1)
(br $block)
)
)
(v128.store
@@ -1094,11 +1148,11 @@
(local.set $2
(local.get $0)
)
(block $block2
(block $block3
(block $block4
(block $block1
(block $block2
(block $block3
(loop $label3
(br_if $block2
(br_if $block1
(i32.eqz
(i32.load8_u
(i32.add
@@ -1110,7 +1164,7 @@
)
)
)
(br_if $block3
(br_if $block2
(i32.eqz
(i32.load8_u
(i32.add
@@ -1122,7 +1176,7 @@
)
)
)
(br_if $block4
(br_if $block3
(i32.eqz
(i32.load8_u
(i32.add
@@ -1162,7 +1216,7 @@
(i32.const 1)
)
)
(br $block2)
(br $block1)
)
(local.set $2
(i32.add
@@ -1170,7 +1224,7 @@
(i32.const 2)
)
)
(br $block2)
(br $block1)
)
(local.set $2
(i32.add

@@ -11,9 +11,16 @@ extern "C" {
 
 #ifdef __wasm_relaxed_simd__
 
+// This header assumes "relaxed fused multiply-add"
+// is both faster and more precise.
+
+#define FP_FAST_FMA 1
+
 __attribute__((weak))
 double fma(double x, double y, double z) {
-  const v128_t wx = wasm_f64x2_splat(x);
+  // If we get a software implementation from the host,
+  // this is enough to short circuit it on the 2nd lane.
+  const v128_t wx = wasm_f64x2_replace_lane(b, 0, x);
   const v128_t wy = wasm_f64x2_splat(y);
   const v128_t wz = wasm_f64x2_splat(z);
   const v128_t wr = wasm_f64x2_relaxed_madd(wx, wy, wz);
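For reference, a minimal self-contained sketch of the relaxed-madd pattern this hunk builds on (names are illustrative, assuming clang with -mrelaxed-simd):

#include <wasm_simd128.h>

// Sketch only: the relaxed madd may compile to a true hardware FMA
// (a single rounding), which is what FP_FAST_FMA advertises.
static double fma_sketch(double x, double y, double z) {
  const v128_t wx = wasm_f64x2_splat(x);
  const v128_t wy = wasm_f64x2_splat(y);
  const v128_t wz = wasm_f64x2_splat(z);
  const v128_t wr = wasm_f64x2_relaxed_madd(wx, wy, wz);
  return wasm_f64x2_extract_lane(wr, 0);
}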
@@ -39,13 +39,6 @@ void *memmove(void *dest, const void *src, size_t n) {
 #ifdef __wasm_simd128__
 
 // SIMD versions of some string.h functions.
-//
-// These assume aligned v128_t loads can't fail,
-// and so can't unaligned loads up to the last
-// aligned address less than memory size.
-//
-// These also assume unaligned access is not painfully slow,
-// but that bitmask extraction is really slow on AArch64.
 
 __attribute__((weak))
 int memcmp(const void *v1, const void *v2, size_t n) {
@@ -55,9 +48,13 @@ int memcmp(const void *v1, const void *v2, size_t n) {
   const v128_t *w1 = (v128_t *)v1;
   const v128_t *w2 = (v128_t *)v2;
   for (; n >= sizeof(v128_t); n -= sizeof(v128_t)) {
-    // Find any single bit difference.
-    if (wasm_v128_any_true(wasm_v128_load(w1) ^ wasm_v128_load(w2))) {
-      break;
+    const v128_t cmp = wasm_i8x16_eq(wasm_v128_load(w1), wasm_v128_load(w2));
+    // Bitmask is slow on AArch64, all_true is much faster.
+    if (!wasm_i8x16_all_true(cmp)) {
+      size_t ctz = __builtin_ctz(~wasm_i8x16_bitmask(cmp));
+      const unsigned char *u1 = (unsigned char *)w1 + ctz;
+      const unsigned char *u2 = (unsigned char *)w2 + ctz;
+      return *u1 - *u2;
     }
     w1++;
     w2++;
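A hedged, self-contained sketch of the loop this hunk lands on (names are mine; the real function also deals with the surrounding bounds handling): compare 16 bytes with i8x16.eq, take the cheap all_true exit on equality, and pay for bitmask + ctz only once, on the first mismatching vector.

#include <stddef.h>
#include <wasm_simd128.h>

static int memcmp_sketch(const void *v1, const void *v2, size_t n) {
  const unsigned char *u1 = (const unsigned char *)v1;
  const unsigned char *u2 = (const unsigned char *)v2;
  for (; n >= sizeof(v128_t); n -= sizeof(v128_t)) {
    const v128_t cmp = wasm_i8x16_eq(wasm_v128_load(u1), wasm_v128_load(u2));
    if (!wasm_i8x16_all_true(cmp)) {
      // ~bitmask sets a bit per differing byte; ctz finds the first one.
      size_t ctz = __builtin_ctz(~wasm_i8x16_bitmask(cmp));
      return u1[ctz] - u2[ctz];
    }
    u1 += sizeof(v128_t);
    u2 += sizeof(v128_t);
  }
  for (; n; n--, u1++, u2++)  // sub-vector tail, byte by byte
    if (*u1 != *u2) return *u1 - *u2;
  return 0;
}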
@@ -77,7 +74,7 @@ int memcmp(const void *v1, const void *v2, size_t n) {
 __attribute__((weak))
 void *memchr(const void *v, int c, size_t n) {
   // When n is zero, a function that locates a character finds no occurrence.
-  // Otherwise, decrement n to ensure __builtin_sub_overflow "overflows"
+  // Otherwise, decrement n to ensure __builtin_sub_overflow overflows
   // when n would go equal-to-or-below zero.
   if (n-- == 0) {
     return NULL;
@@ -98,7 +95,7 @@ void *memchr(const void *v, int c, size_t n) {
   // so we can count trailing zeros.
   int mask = wasm_i8x16_bitmask(cmp) >> align << align;
   // At least one bit will be set, unless we cleared them.
   // Knowing this helps the compiler.
   __builtin_assume(mask || align);
   // If the mask is zero because of alignment,
   // it's as if we didn't find anything.
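An aside on the masking trick in this hunk: the first load is aligned down, so it can include bytes before the real start; shifting the bitmask right then left by align clears any match bits for those bytes. A tiny standalone illustration (the values are made up):

#include <stdio.h>

int main(void) {
  int mask = 0x13;  // pretend matches at byte offsets 0, 1, and 4
  int align = 2;    // the first 2 bytes precede the real start
  mask = mask >> align << align;                  // clears offsets 0 and 1
  printf("%#x %d\n", mask, __builtin_ctz(mask));  // prints: 0x10 4
  return 0;
}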
@@ -109,7 +106,7 @@ void *memchr(const void *v, int c, size_t n) {
       return ctz <= n + align ? (char *)w + ctz : NULL;
     }
   }
-  // Decrement n; if it "overflows" we're done.
+  // Decrement n; if it overflows we're done.
   if (__builtin_sub_overflow(n, sizeof(v128_t) - align, &n)) {
     return NULL;
   }
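The loop-exit idiom around __builtin_sub_overflow, sketched standalone (step and counts are made up): n is pre-decremented once so that the borrow from an unsigned subtraction doubles as the "no bytes left" signal.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

int main(void) {
  size_t n = 20;           // bytes to search
  const size_t step = 16;  // one 16-byte vector per iteration
  int chunks = 0;
  if (n-- == 0) return 0;  // zero-length search: nothing to do
  while (true) {
    chunks++;  // ... search one vector-sized chunk here ...
    if (__builtin_sub_overflow(n, step, &n)) break;  // borrow: done
  }
  printf("%d chunks\n", chunks);  // 2 chunks cover all 20 bytes
  return 0;
}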
@@ -133,7 +130,7 @@ size_t strlen(const char *s) {
   // so we can count trailing zeros.
   int mask = wasm_i8x16_bitmask(cmp) >> align << align;
   // At least one bit will be set, unless we cleared them.
   // Knowing this helps the compiler.
   __builtin_assume(mask || align);
   if (mask) {
     return (char *)w - s + __builtin_ctz(mask);
@@ -181,12 +178,23 @@ static int __strcmp(const char *s1, const char *s2) {
   return 0;
 }
 
+static int __strcmp_s(const char *s1, const char *s2) {
+  const unsigned char *u1 = (unsigned char *)s1;
+  const unsigned char *u2 = (unsigned char *)s2;
+  while (true) {
+    if (*u1 != *u2) return *u1 - *u2;
+    if (*u1 == 0) break;
+    u1++;
+    u2++;
+  }
+  return 0;
+}
+
 __attribute__((weak, always_inline))
 int strcmp(const char *s1, const char *s2) {
-  // Use strncmp when comparing against literal strings.
-  // If the literal is small, the vector search will be skipped.
-  if (__builtin_constant_p(strlen(s2))) {
-    return strncmp(s1, s2, strlen(s2));
+  // Skip the vector search when comparing against small literal strings.
+  if (__builtin_constant_p(strlen(s2) && strlen(s2) < sizeof(v128_t))) {
+    return __strcmp_s(s1, s2);
   }
   return __strcmp(s1, s2);
 }
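The dispatch idea behind this hunk, as a hedged standalone sketch (helper names are mine, and the sketch tests __builtin_constant_p of strlen alone, one common way to write it):

#include <string.h>

// Plain byte-by-byte comparison, fine for short strings.
static int byte_strcmp(const char *s1, const char *s2) {
  const unsigned char *u1 = (const unsigned char *)s1;
  const unsigned char *u2 = (const unsigned char *)s2;
  while (*u1 && *u1 == *u2) u1++, u2++;
  return *u1 - *u2;
}

// Stand-in for the SIMD path; the real one compares 16 bytes per step.
static int vector_strcmp(const char *s1, const char *s2) {
  return byte_strcmp(s1, s2);
}

__attribute__((always_inline))
static inline int strcmp_sketch(const char *s1, const char *s2) {
  // With a literal s2, strlen(s2) folds to a constant, so this branch
  // is decided at compile time and the unused path is discarded.
  if (__builtin_constant_p(strlen(s2)) && strlen(s2) < 16)
    return byte_strcmp(s1, s2);
  return vector_strcmp(s1, s2);
}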
@@ -244,7 +252,7 @@ static char *__strchrnul(const char *s, int c) {
   // so we can count trailing zeros.
   int mask = wasm_i8x16_bitmask(cmp) >> align << align;
   // At least one bit will be set, unless we cleared them.
   // Knowing this helps the compiler.
   __builtin_assume(mask || align);
   if (mask) {
     return (char *)w + __builtin_ctz(mask);
@@ -277,7 +285,7 @@ char *strchr(const char *s, int c) {
 __attribute__((weak))
 size_t strspn(const char *s, const char *c) {
 #ifndef _REENTRANT
   static // Avoid the stack for builds without threads.
 #endif
   char byteset[UCHAR_MAX + 1];
   const char *const a = s;
@@ -293,12 +301,16 @@ size_t strspn(const char *s, const char *c) {
   const v128_t *w = (v128_t *)s;
   const v128_t wc = wasm_i8x16_splat(*c);
   while (w <= limit) {
-    if (!wasm_i8x16_all_true(wasm_i8x16_eq(wasm_v128_load(w), wc))) {
-      break;
+    const v128_t cmp = wasm_i8x16_eq(wasm_v128_load(w), wc);
+    // Bitmask is slow on AArch64, all_true is much faster.
+    if (!wasm_i8x16_all_true(cmp)) {
+      size_t ctz = __builtin_ctz(~wasm_i8x16_bitmask(cmp));
+      return (char *)w + ctz - s;
     }
     w++;
   }
 
+  // Continue byte-by-byte.
   s = (char *)w;
   while (*s == *c) s++;
   return s - a;
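A sketch of the single-character fast path above, with the bounds handling reduced to an explicit limit parameter (the real code derives it from memory.size):

#include <stddef.h>
#include <wasm_simd128.h>

// limit is assumed to be the last address from which a full 16-byte
// load is known to be in bounds.
static size_t char_run_sketch(const char *s, char c, const char *limit) {
  const char *const a = s;
  const v128_t wc = wasm_i8x16_splat(c);
  const v128_t *w = (const v128_t *)s;
  while ((const char *)w <= limit) {
    const v128_t cmp = wasm_i8x16_eq(wasm_v128_load(w), wc);
    if (!wasm_i8x16_all_true(cmp)) {
      // The first lane that differs ends the run.
      size_t ctz = __builtin_ctz(~wasm_i8x16_bitmask(cmp));
      return (const char *)w + ctz - a;
    }
    w++;
  }
  // Continue byte-by-byte near the end of memory.
  s = (const char *)w;
  while (*s == c) s++;
  return s - a;
}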
@@ -311,20 +323,21 @@ size_t strspn(const char *s, const char *c) {
   while (*c && (byteset[*(unsigned char *)c] = 1)) c++;
   while (byteset[*(unsigned char *)s]) s++;
 
-#else
+#else // __OPTIMIZE__
 
   // This is faster than memset.
   // Going backward helps bounds check elimination.
   volatile v128_t *w = (v128_t *)byteset;
 #pragma unroll
   for (size_t i = sizeof(byteset) / sizeof(v128_t); i--;) w[i] = (v128_t){};
+  static_assert(sizeof(byteset) % sizeof(v128_t) == 0);
 
-  // Keeping byteset[0] = 0 avoids the other loop having to test for it.
+  // Keeping byteset[0] = 0 avoids the next loop needing that check.
   while (*c && (byteset[*(unsigned char *)c] = 1)) c++;
 #pragma unroll 4
   while (byteset[*(unsigned char *)s]) s++;
 
-#endif
+#endif // __OPTIMIZE__
 
   return s - a;
 }
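The portable byteset technique underneath both branches, as a minimal sketch:

#include <limits.h>
#include <stddef.h>

static size_t strspn_sketch(const char *s, const char *c) {
  char byteset[UCHAR_MAX + 1] = {0};
  const char *const a = s;
  // Mark every byte that appears in c. byteset[0] stays 0, so the
  // scan below also stops at s's terminating NUL without an extra test.
  while (*c) byteset[(unsigned char)*c++] = 1;
  while (byteset[(unsigned char)*s]) s++;
  return s - a;
}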
@@ -332,7 +345,7 @@ size_t strspn(const char *s, const char *c) {
 __attribute__((weak))
 size_t strcspn(const char *s, const char *c) {
 #ifndef _REENTRANT
   static // Avoid the stack for builds without threads.
 #endif
   char byteset[UCHAR_MAX + 1];
   const char *const a = s;
@@ -346,24 +359,31 @@ size_t strcspn(const char *s, const char *c) {
   while ((byteset[*(unsigned char *)c] = 1) && *c) c++;
   while (!byteset[*(unsigned char *)s]) s++;
 
-#else
+#else // __OPTIMIZE__
 
   // This is faster than memset.
   // Going backward helps bounds check elimination.
   volatile v128_t *w = (v128_t *)byteset;
 #pragma unroll
   for (size_t i = sizeof(byteset) / sizeof(v128_t); i--;) w[i] = (v128_t){};
+  static_assert(sizeof(byteset) % sizeof(v128_t) == 0);
 
-  // Setting byteset[0] = 1 avoids the other loop having to test for it.
+  // Setting byteset[0] = 1 avoids the next loop needing that check.
   while ((byteset[*(unsigned char *)c] = 1) && *c) c++;
 #pragma unroll 4
   while (!byteset[*(unsigned char *)s]) s++;
 
-#endif
+#endif // __OPTIMIZE__
 
   return s - a;
 }
 
+__attribute__((weak, always_inline))
+char *strpbrk(const char *s, const char *b) {
+  s += strcspn(s, b);
+  return *s ? (char *)s : 0;
+}
+
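And strpbrk is just strcspn plus a terminator check; a small usage example:

#include <stdio.h>
#include <string.h>

int main(void) {
  const char *s = "key=value";
  const char *p = strpbrk(s, "=:");  // first byte of s from the set "=:"
  printf("%s\n", p ? p : "(none)");  // prints "=value"
  return 0;
}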
 #endif // __wasm_simd128__
 
 #ifdef __cplusplus