mirror of
https://github.com/ncruces/go-sqlite3.git
synced 2026-01-11 21:49:13 +00:00
Size optimized versions.
This commit is contained in:
@@ -40,8 +40,8 @@ void *memmove(void *dest, const void *src, size_t n) {
|
||||
|
||||
// SIMD versions of some string.h functions.
|
||||
//
|
||||
// These assume aligned v128_t reads can't fail,
|
||||
// and so can't unaligned reads up to the last
|
||||
// These assume aligned v128_t loads can't fail,
|
||||
// and so can't unaligned loads up to the last
|
||||
// aligned address less than memory size.
|
||||
//
|
||||
// These also assume unaligned access is not painfully slow,
|
||||
@@ -49,9 +49,13 @@ void *memmove(void *dest, const void *src, size_t n) {
|
||||
|
||||
__attribute__((weak))
|
||||
int memcmp(const void *v1, const void *v2, size_t n) {
|
||||
// memcmp can read up to n bytes from each object.
|
||||
// Using unaligned loads to handle the case where
|
||||
// the objects have mismatching alignments.
|
||||
const v128_t *w1 = v1;
|
||||
const v128_t *w2 = v2;
|
||||
for (; n >= sizeof(v128_t); n -= sizeof(v128_t)) {
|
||||
// Find any single bit difference.
|
||||
if (wasm_v128_any_true(wasm_v128_load(w1) ^ wasm_v128_load(w2))) {
|
||||
break;
|
||||
}
|
||||
@@ -59,8 +63,9 @@ int memcmp(const void *v1, const void *v2, size_t n) {
|
||||
w2++;
|
||||
}
|
||||
|
||||
const uint8_t *u1 = (void *)w1;
|
||||
const uint8_t *u2 = (void *)w2;
|
||||
// Continue byte-by-byte.
|
||||
const unsigned char *u1 = (void *)w1;
|
||||
const unsigned char *u2 = (void *)w2;
|
||||
while (n--) {
|
||||
if (*u1 != *u2) return *u1 - *u2;
|
||||
u1++;
|
||||
@@ -71,24 +76,40 @@ int memcmp(const void *v1, const void *v2, size_t n) {
|
||||
|
||||
__attribute__((weak))
|
||||
void *memchr(const void *v, int c, size_t n) {
|
||||
// When n is zero, a function that locates a character finds no occurrence.
|
||||
// Otherwise, decrement n to ensure __builtin_sub_overflow "overflows"
|
||||
// when n would go equal-to-or-below zero.
|
||||
if (n-- == 0) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// memchr must behave as if it reads characters sequentially
|
||||
// and stops as soon as a match is found.
|
||||
// Aligning ensures loads can't fail.
|
||||
uintptr_t align = (uintptr_t)v % sizeof(v128_t);
|
||||
const v128_t *w = (void *)(v - align);
|
||||
const v128_t wc = wasm_i8x16_splat(c);
|
||||
|
||||
while (true) {
|
||||
const v128_t cmp = wasm_i8x16_eq(*w, wc);
|
||||
// Bitmask is slow on AArch64, any_true is much faster.
|
||||
if (wasm_v128_any_true(cmp)) {
|
||||
// Clear the bits corresponding to alignment
|
||||
// so we can count trailing zeros.
|
||||
int mask = wasm_i8x16_bitmask(cmp) >> align << align;
|
||||
// At least one bit will be set, unless we cleared them.
|
||||
// Knowing this helps the compiler.
|
||||
__builtin_assume(mask || align);
|
||||
// If the mask is zero because of alignment,
|
||||
// it's as if we didn't find anything.
|
||||
if (mask) {
|
||||
// We found a match, unless it is beyond the end of the object.
|
||||
// Recall that we decremented n, so less-than-or-equal-to is correct.
|
||||
size_t ctz = __builtin_ctz(mask);
|
||||
return ctz <= n + align ? (void *)w + ctz : NULL;
|
||||
}
|
||||
}
|
||||
// Decrement n; if it "overflows" we're done.
|
||||
if (__builtin_sub_overflow(n, sizeof(v128_t) - align, &n)) {
|
||||
return NULL;
|
||||
}
|
||||
@@ -99,13 +120,20 @@ void *memchr(const void *v, int c, size_t n) {
|
||||
|
||||
__attribute__((weak))
|
||||
size_t strlen(const char *s) {
|
||||
// strlen must stop as soon as it finds the terminator.
|
||||
// Aligning ensures loads can't fail.
|
||||
uintptr_t align = (uintptr_t)s % sizeof(v128_t);
|
||||
const v128_t *w = (void *)(s - align);
|
||||
|
||||
while (true) {
|
||||
// Bitmask is slow on AArch64, all_true is much faster.
|
||||
if (!wasm_i8x16_all_true(*w)) {
|
||||
const v128_t cmp = wasm_i8x16_eq(*w, (v128_t){});
|
||||
// Clear the bits corresponding to alignment
|
||||
// so we can count trailing zeros.
|
||||
int mask = wasm_i8x16_bitmask(cmp) >> align << align;
|
||||
// At least one bit will be set, unless we cleared them.
|
||||
// Knowing this helps the compiler.
|
||||
__builtin_assume(mask || align);
|
||||
if (mask) {
|
||||
return (char *)w - s + __builtin_ctz(mask);
|
||||
@@ -117,15 +145,23 @@ size_t strlen(const char *s) {
|
||||
}
|
||||
|
||||
static int __strcmp(const char *s1, const char *s2) {
|
||||
// Set limit to the largest possible valid v128_t pointer.
|
||||
// Unsigned modular arithmetic gives the correct result
|
||||
// unless memory size is zero, in which case all pointers are invalid.
|
||||
const v128_t *const limit =
|
||||
(v128_t *)(__builtin_wasm_memory_size(0) * PAGESIZE) - 1;
|
||||
|
||||
// Using unaligned loads to handle the case where
|
||||
// the strings have mismatching alignments.
|
||||
const v128_t *w1 = (void *)s1;
|
||||
const v128_t *w2 = (void *)s2;
|
||||
while (w1 <= limit && w2 <= limit) {
|
||||
// Find any single bit difference.
|
||||
if (wasm_v128_any_true(wasm_v128_load(w1) ^ wasm_v128_load(w2))) {
|
||||
break;
|
||||
}
|
||||
// All bytes are equal.
|
||||
// If any byte is zero (on both strings) the strings are equal.
|
||||
if (!wasm_i8x16_all_true(wasm_v128_load(w1))) {
|
||||
return 0;
|
||||
}
|
||||
@@ -133,8 +169,9 @@ static int __strcmp(const char *s1, const char *s2) {
|
||||
w2++;
|
||||
}
|
||||
|
||||
const uint8_t *u1 = (void *)w1;
|
||||
const uint8_t *u2 = (void *)w2;
|
||||
// Continue byte-by-byte.
|
||||
const unsigned char *u1 = (void *)w1;
|
||||
const unsigned char *u2 = (void *)w2;
|
||||
while (true) {
|
||||
if (*u1 != *u2) return *u1 - *u2;
|
||||
if (*u1 == 0) break;
|
||||
@@ -146,6 +183,8 @@ static int __strcmp(const char *s1, const char *s2) {
|
||||
|
||||
__attribute__((weak, always_inline))
|
||||
int strcmp(const char *s1, const char *s2) {
|
||||
// Use strncmp when comparing against literal strings.
|
||||
// If the literal is small, the vector search will be skipped.
|
||||
if (__builtin_constant_p(strlen(s2))) {
|
||||
return strncmp(s1, s2, strlen(s2));
|
||||
}
|
||||
@@ -154,15 +193,23 @@ int strcmp(const char *s1, const char *s2) {
|
||||
|
||||
__attribute__((weak))
|
||||
int strncmp(const char *s1, const char *s2, size_t n) {
|
||||
// Set limit to the largest possible valid v128_t pointer.
|
||||
// Unsigned modular arithmetic gives the correct result
|
||||
// unless memory size is zero, in which case all pointers are invalid.
|
||||
const v128_t *const limit =
|
||||
(v128_t *)(__builtin_wasm_memory_size(0) * PAGESIZE) - 1;
|
||||
|
||||
// Using unaligned loads to handle the case where
|
||||
// the strings have mismatching alignments.
|
||||
const v128_t *w1 = (void *)s1;
|
||||
const v128_t *w2 = (void *)s2;
|
||||
for (; w1 <= limit && w2 <= limit && n >= sizeof(v128_t); n -= sizeof(v128_t)) {
|
||||
// Find any single bit difference.
|
||||
if (wasm_v128_any_true(wasm_v128_load(w1) ^ wasm_v128_load(w2))) {
|
||||
break;
|
||||
}
|
||||
// All bytes are equal.
|
||||
// If any byte is zero (on both strings) the strings are equal.
|
||||
if (!wasm_i8x16_all_true(wasm_v128_load(w1))) {
|
||||
return 0;
|
||||
}
|
||||
@@ -170,8 +217,9 @@ int strncmp(const char *s1, const char *s2, size_t n) {
|
||||
w2++;
|
||||
}
|
||||
|
||||
const uint8_t *u1 = (void *)w1;
|
||||
const uint8_t *u2 = (void *)w2;
|
||||
// Continue byte-by-byte.
|
||||
const unsigned char *u1 = (void *)w1;
|
||||
const unsigned char *u2 = (void *)w2;
|
||||
while (n--) {
|
||||
if (*u1 != *u2) return *u1 - *u2;
|
||||
if (*u1 == 0) break;
|
||||
@@ -182,14 +230,21 @@ int strncmp(const char *s1, const char *s2, size_t n) {
|
||||
}
|
||||
|
||||
static char *__strchrnul(const char *s, int c) {
|
||||
// strchrnul must stop as soon as a match is found.
|
||||
// Aligning ensures loads can't fail.
|
||||
uintptr_t align = (uintptr_t)s % sizeof(v128_t);
|
||||
const v128_t *w = (void *)(s - align);
|
||||
const v128_t wc = wasm_i8x16_splat(c);
|
||||
|
||||
while (true) {
|
||||
const v128_t cmp = wasm_i8x16_eq(*w, (v128_t){}) | wasm_i8x16_eq(*w, wc);
|
||||
// Bitmask is slow on AArch64, any_true is much faster.
|
||||
if (wasm_v128_any_true(cmp)) {
|
||||
// Clear the bits corresponding to alignment
|
||||
// so we can count trailing zeros.
|
||||
int mask = wasm_i8x16_bitmask(cmp) >> align << align;
|
||||
// At least one bit will be set, unless we cleared them.
|
||||
// Knowing this helps the compiler.
|
||||
__builtin_assume(mask || align);
|
||||
if (mask) {
|
||||
return (char *)w + __builtin_ctz(mask);
|
||||
@@ -202,6 +257,7 @@ static char *__strchrnul(const char *s, int c) {
|
||||
|
||||
__attribute__((weak, always_inline))
|
||||
char *strchrnul(const char *s, int c) {
|
||||
// For finding the terminator, strlen is faster.
|
||||
if (__builtin_constant_p(c) && (char)c == 0) {
|
||||
return (char *)s + strlen(s);
|
||||
}
|
||||
@@ -210,6 +266,7 @@ char *strchrnul(const char *s, int c) {
|
||||
|
||||
__attribute__((weak, always_inline))
|
||||
char *strchr(const char *s, int c) {
|
||||
// For finding the terminator, strlen is faster.
|
||||
if (__builtin_constant_p(c) && (char)c == 0) {
|
||||
return (char *)s + strlen(s);
|
||||
}
|
||||
@@ -220,13 +277,16 @@ char *strchr(const char *s, int c) {
|
||||
__attribute__((weak))
|
||||
size_t strspn(const char *s, const char *c) {
|
||||
#ifndef _REENTRANT
|
||||
static
|
||||
static // Avoid the stack for builds without threads.
|
||||
#endif
|
||||
char byteset[UCHAR_MAX + 1];
|
||||
const char *const a = s;
|
||||
|
||||
if (!c[0]) return 0;
|
||||
if (!c[1]) {
|
||||
// Set limit to the largest possible valid v128_t pointer.
|
||||
// Unsigned modular arithmetic gives the correct result
|
||||
// unless memory size is zero, in which case all pointers are invalid.
|
||||
const v128_t *const limit =
|
||||
(v128_t *)(__builtin_wasm_memory_size(0) * PAGESIZE) - 1;
|
||||
|
||||
@@ -244,33 +304,63 @@ size_t strspn(const char *s, const char *c) {
|
||||
return s - a;
|
||||
}
|
||||
|
||||
volatile v128_t *w = (void *)byteset;
|
||||
#pragma unroll
|
||||
for (size_t i = sizeof(byteset) / sizeof(v128_t); i--;) w[i] = (v128_t){};
|
||||
#if defined(__OPTIMIZE_SIZE__) || !defined(__OPTIMIZE__)
|
||||
|
||||
// Unoptimized version.
|
||||
memset(byteset, 0, sizeof(byteset));
|
||||
while (*c && (byteset[*(unsigned char *)c] = 1)) c++;
|
||||
while (byteset[*(unsigned char *)s]) s++;
|
||||
|
||||
#else
|
||||
|
||||
// This is faster than memset.
|
||||
volatile v128_t *w = (void *)byteset;
|
||||
#pragma unroll
|
||||
for (size_t i = sizeof(byteset) / sizeof(v128_t); i--;) w[i] = (v128_t){};
|
||||
static_assert(sizeof(byteset) % sizeof(v128_t) == 0);
|
||||
|
||||
// Keeping byteset[0] = 0 avoids the other loop having to test for it.
|
||||
while (*c && (byteset[*(unsigned char *)c] = 1)) c++;
|
||||
#pragma unroll 4
|
||||
while (byteset[*(unsigned char *)s]) s++;
|
||||
|
||||
#endif
|
||||
|
||||
while (*c && (byteset[*(uint8_t *)c] = 1)) c++;
|
||||
#pragma unroll 4
|
||||
while (byteset[*(uint8_t *)s]) s++;
|
||||
return s - a;
|
||||
}
|
||||
|
||||
__attribute__((weak))
|
||||
size_t strcspn(const char *s, const char *c) {
|
||||
#ifndef _REENTRANT
|
||||
static
|
||||
static // Avoid the stack for builds without threads.
|
||||
#endif
|
||||
char byteset[UCHAR_MAX + 1];
|
||||
const char *const a = s;
|
||||
|
||||
if (!c[0] || !c[1]) return __strchrnul(s, *c) - s;
|
||||
|
||||
volatile v128_t *w = (void *)byteset;
|
||||
#pragma unroll
|
||||
for (size_t i = sizeof(byteset) / sizeof(v128_t); i--;) w[i] = (v128_t){};
|
||||
#if defined(__OPTIMIZE_SIZE__) || !defined(__OPTIMIZE__)
|
||||
|
||||
// Unoptimized version.
|
||||
memset(byteset, 0, sizeof(byteset));
|
||||
while ((byteset[*(unsigned char *)c] = 1) && *c) c++;
|
||||
while (!byteset[*(unsigned char *)s]) s++;
|
||||
|
||||
#else
|
||||
|
||||
// This is faster than memset.
|
||||
volatile v128_t *w = (void *)byteset;
|
||||
#pragma unroll
|
||||
for (size_t i = sizeof(byteset) / sizeof(v128_t); i--;) w[i] = (v128_t){};
|
||||
static_assert(sizeof(byteset) % sizeof(v128_t) == 0);
|
||||
|
||||
// Setting byteset[0] = 1 avoids the other loop having to test for it.
|
||||
while ((byteset[*(unsigned char *)c] = 1) && *c) c++;
|
||||
#pragma unroll 4
|
||||
while (!byteset[*(unsigned char *)s]) s++;
|
||||
|
||||
#endif
|
||||
|
||||
while ((byteset[*(uint8_t *)c] = 1) && *c) c++;
|
||||
#pragma unroll 4
|
||||
while (!byteset[*(uint8_t *)s]) s++;
|
||||
return s - a;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user