From 9ea7099c243a343c601ce2d5fab34cf9a2f4ee1c Mon Sep 17 00:00:00 2001
From: Nuno Cruces
Date: Thu, 24 Apr 2025 23:17:30 +0100
Subject: [PATCH] Size-optimized versions.

---
 sqlite3/libc/string.h | 134 +++++++++++++++++++++++++++++++++++-------
 1 file changed, 112 insertions(+), 22 deletions(-)

diff --git a/sqlite3/libc/string.h b/sqlite3/libc/string.h
index 2bdcebe..fba3421 100644
--- a/sqlite3/libc/string.h
+++ b/sqlite3/libc/string.h
@@ -40,8 +40,8 @@ void *memmove(void *dest, const void *src, size_t n) {
 
 // SIMD versions of some string.h functions.
 //
-// These assume aligned v128_t reads can't fail,
-// and so can't unaligned reads up to the last
+// These assume aligned v128_t loads can't fail,
+// and neither can unaligned loads up to the last
 // aligned address less than memory size.
 //
 // These also assume unaligned access is not painfully slow,
@@ -49,9 +49,13 @@ void *memmove(void *dest, const void *src, size_t n) {
 
 __attribute__((weak))
 int memcmp(const void *v1, const void *v2, size_t n) {
+  // memcmp can read up to n bytes from each object.
+  // Using unaligned loads handles the case where
+  // the objects have mismatching alignments.
   const v128_t *w1 = v1;
   const v128_t *w2 = v2;
   for (; n >= sizeof(v128_t); n -= sizeof(v128_t)) {
+    // Find any single bit difference.
     if (wasm_v128_any_true(wasm_v128_load(w1) ^ wasm_v128_load(w2))) {
       break;
     }
@@ -59,8 +63,9 @@ int memcmp(const void *v1, const void *v2, size_t n) {
     w2++;
   }
 
-  const uint8_t *u1 = (void *)w1;
-  const uint8_t *u2 = (void *)w2;
+  // Continue byte-by-byte.
+  const unsigned char *u1 = (void *)w1;
+  const unsigned char *u2 = (void *)w2;
   while (n--) {
     if (*u1 != *u2) return *u1 - *u2;
     u1++;
@@ -71,24 +76,40 @@
 
 __attribute__((weak))
 void *memchr(const void *v, int c, size_t n) {
+  // When n is zero, a function that locates a character finds no occurrence.
+  // Otherwise, decrement n to ensure __builtin_sub_overflow "overflows"
+  // when n would reach or drop below zero.
   if (n-- == 0) {
     return NULL;
   }
 
+  // memchr must behave as if it reads characters sequentially
+  // and stops as soon as a match is found.
+  // Aligning ensures loads can't fail.
   uintptr_t align = (uintptr_t)v % sizeof(v128_t);
   const v128_t *w = (void *)(v - align);
   const v128_t wc = wasm_i8x16_splat(c);
 
   while (true) {
     const v128_t cmp = wasm_i8x16_eq(*w, wc);
+    // Bitmask is slow on AArch64; any_true is much faster.
     if (wasm_v128_any_true(cmp)) {
+      // Clear the bits corresponding to alignment
+      // so we can count trailing zeros.
       int mask = wasm_i8x16_bitmask(cmp) >> align << align;
+      // At least one bit will be set, unless we cleared them.
+      // Knowing this helps the compiler.
       __builtin_assume(mask || align);
+      // If the mask is zero because of alignment,
+      // it's as if we didn't find anything.
      if (mask) {
+        // We found a match, unless it is beyond the end of the object.
+        // Recall that we decremented n, so less-than-or-equal-to is correct.
         size_t ctz = __builtin_ctz(mask);
         return ctz <= n + align ? (void *)w + ctz : NULL;
       }
     }
+    // Decrement n; if it "overflows" we're done.
     if (__builtin_sub_overflow(n, sizeof(v128_t) - align, &n)) {
       return NULL;
     }
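
Note: the mask >> align << align expression above is the subtlest part of
memchr. Below is a minimal scalar sketch (plain standard C, no wasm
intrinsics; the function name is illustrative, not part of the patch) of
what it does: lane i of the 16-bit bitmask is set when byte i of the
aligned 16-byte block matched, and the first align lanes lie before the
start of the object, so their bits must be discarded.

    #include <assert.h>
    #include <stdint.h>

    // Shifting right then left by align clears the low align bits,
    // dropping matches that precede the start of the object.
    static int clear_low_lanes(int mask, uintptr_t align) {
      return mask >> align << align;
    }

    int main(void) {
      assert(clear_low_lanes(1 << 2, 3) == 0);       // match before the object: dropped
      assert(clear_low_lanes(1 << 5, 3) == 1 << 5);  // match inside the object: kept
      return 0;
    }
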
@@ -99,13 +120,20 @@
 
 __attribute__((weak))
 size_t strlen(const char *s) {
+  // strlen must stop as soon as it finds the terminator.
+  // Aligning ensures loads can't fail.
   uintptr_t align = (uintptr_t)s % sizeof(v128_t);
   const v128_t *w = (void *)(s - align);
 
   while (true) {
+    // Bitmask is slow on AArch64; all_true is much faster.
     if (!wasm_i8x16_all_true(*w)) {
       const v128_t cmp = wasm_i8x16_eq(*w, (v128_t){});
+      // Clear the bits corresponding to alignment
+      // so we can count trailing zeros.
       int mask = wasm_i8x16_bitmask(cmp) >> align << align;
+      // At least one bit will be set, unless we cleared them.
+      // Knowing this helps the compiler.
       __builtin_assume(mask || align);
       if (mask) {
         return (char *)w - s + __builtin_ctz(mask);
@@ -117,15 +145,23 @@ size_t strlen(const char *s) {
 }
 
 static int __strcmp(const char *s1, const char *s2) {
+  // Set limit to the largest possible valid v128_t pointer.
+  // Unsigned modular arithmetic gives the correct result
+  // unless memory size is zero, in which case all pointers are invalid.
   const v128_t *const limit =
       (v128_t *)(__builtin_wasm_memory_size(0) * PAGESIZE) - 1;
 
+  // Using unaligned loads handles the case where
+  // the strings have mismatching alignments.
   const v128_t *w1 = (void *)s1;
   const v128_t *w2 = (void *)s2;
   while (w1 <= limit && w2 <= limit) {
+    // Find any single bit difference.
     if (wasm_v128_any_true(wasm_v128_load(w1) ^ wasm_v128_load(w2))) {
       break;
     }
+    // All bytes are equal.
+    // If any byte is zero (in both strings), the strings are equal.
     if (!wasm_i8x16_all_true(wasm_v128_load(w1))) {
       return 0;
     }
@@ -133,8 +169,9 @@ static int __strcmp(const char *s1, const char *s2) {
     w2++;
   }
 
-  const uint8_t *u1 = (void *)w1;
-  const uint8_t *u2 = (void *)w2;
+  // Continue byte-by-byte.
+  const unsigned char *u1 = (void *)w1;
+  const unsigned char *u2 = (void *)w2;
   while (true) {
     if (*u1 != *u2) return *u1 - *u2;
     if (*u1 == 0) break;
@@ -146,6 +183,8 @@ static int __strcmp(const char *s1, const char *s2) {
 
 __attribute__((weak, always_inline))
 int strcmp(const char *s1, const char *s2) {
+  // Use strncmp when comparing against literal strings.
+  // If the literal is small, the vector search will be skipped.
   if (__builtin_constant_p(strlen(s2))) {
     return strncmp(s1, s2, strlen(s2));
   }
@@ -154,15 +193,23 @@ int strcmp(const char *s1, const char *s2) {
 
 __attribute__((weak))
 int strncmp(const char *s1, const char *s2, size_t n) {
+  // Set limit to the largest possible valid v128_t pointer.
+  // Unsigned modular arithmetic gives the correct result
+  // unless memory size is zero, in which case all pointers are invalid.
   const v128_t *const limit =
       (v128_t *)(__builtin_wasm_memory_size(0) * PAGESIZE) - 1;
 
+  // Using unaligned loads handles the case where
+  // the strings have mismatching alignments.
   const v128_t *w1 = (void *)s1;
   const v128_t *w2 = (void *)s2;
   for (; w1 <= limit && w2 <= limit && n >= sizeof(v128_t); n -= sizeof(v128_t)) {
+    // Find any single bit difference.
     if (wasm_v128_any_true(wasm_v128_load(w1) ^ wasm_v128_load(w2))) {
       break;
     }
+    // All bytes are equal.
+    // If any byte is zero (in both strings), the strings are equal.
     if (!wasm_i8x16_all_true(wasm_v128_load(w1))) {
       return 0;
     }
@@ -170,8 +217,9 @@ int strncmp(const char *s1, const char *s2, size_t n) {
     w2++;
   }
 
-  const uint8_t *u1 = (void *)w1;
-  const uint8_t *u2 = (void *)w2;
+  // Continue byte-by-byte.
+  const unsigned char *u1 = (void *)w1;
+  const unsigned char *u2 = (void *)w2;
   while (n--) {
     if (*u1 != *u2) return *u1 - *u2;
     if (*u1 == 0) break;
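
Note: the limit used by __strcmp and strncmp above is the pointer form of a
simple bound: an unaligned 16-byte load at byte offset p stays inside a
memory of size bytes iff p <= size - 16. A hypothetical model of that check
in portable C (the function name and 32-bit offsets are assumptions for
illustration):

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    // size - 16 wraps when size == 0, just like
    // (v128_t *)(__builtin_wasm_memory_size(0) * PAGESIZE) - 1 would.
    // The check then passes spuriously, but with zero memory every load
    // traps anyway: all pointers are invalid, as the comments note.
    static bool load16_in_bounds(uint32_t p, uint32_t size) {
      return p <= size - 16;
    }

    int main(void) {
      assert(load16_in_bounds(0, 65536));       // first load of a one-page memory
      assert(load16_in_bounds(65520, 65536));   // last in-bounds 16-byte load
      assert(!load16_in_bounds(65521, 65536));  // would read one byte past the end
      return 0;
    }
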
@@ -182,14 +230,21 @@
 }
 
 static char *__strchrnul(const char *s, int c) {
+  // strchrnul must stop as soon as a match is found.
+  // Aligning ensures loads can't fail.
   uintptr_t align = (uintptr_t)s % sizeof(v128_t);
   const v128_t *w = (void *)(s - align);
   const v128_t wc = wasm_i8x16_splat(c);
 
   while (true) {
     const v128_t cmp = wasm_i8x16_eq(*w, (v128_t){}) | wasm_i8x16_eq(*w, wc);
+    // Bitmask is slow on AArch64; any_true is much faster.
     if (wasm_v128_any_true(cmp)) {
+      // Clear the bits corresponding to alignment
+      // so we can count trailing zeros.
       int mask = wasm_i8x16_bitmask(cmp) >> align << align;
+      // At least one bit will be set, unless we cleared them.
+      // Knowing this helps the compiler.
       __builtin_assume(mask || align);
       if (mask) {
         return (char *)w + __builtin_ctz(mask);
@@ -202,6 +257,7 @@ static char *__strchrnul(const char *s, int c) {
 
 __attribute__((weak, always_inline))
 char *strchrnul(const char *s, int c) {
+  // For finding the terminator, strlen is faster.
   if (__builtin_constant_p(c) && (char)c == 0) {
     return (char *)s + strlen(s);
   }
@@ -210,6 +266,7 @@ char *strchrnul(const char *s, int c) {
 
 __attribute__((weak, always_inline))
 char *strchr(const char *s, int c) {
+  // For finding the terminator, strlen is faster.
   if (__builtin_constant_p(c) && (char)c == 0) {
     return (char *)s + strlen(s);
   }
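
Note: these hunks show only the constant-zero fast paths; the general case
of strchr is expected to go through __strchrnul. A minimal scalar sketch of
that contract (the scalar_* names are hypothetical, not part of the patch):

    #include <stddef.h>

    // strchrnul returns a pointer to the first occurrence of c,
    // or to the terminator when c does not occur.
    static char *scalar_strchrnul(const char *s, int c) {
      while (*s && *s != (char)c) s++;
      return (char *)s;
    }

    // strchr then needs one extra comparison: a terminator hit only
    // counts as a match when c itself is zero.
    static char *scalar_strchr(const char *s, int c) {
      char *r = scalar_strchrnul(s, c);
      return *r == (char)c ? r : NULL;
    }
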
@@ -220,13 +277,16 @@ char *strchr(const char *s, int c) {
 __attribute__((weak))
 size_t strspn(const char *s, const char *c) {
 #ifndef _REENTRANT
-  static
+  static // Avoid the stack for builds without threads.
 #endif
   char byteset[UCHAR_MAX + 1];
   const char *const a = s;
 
   if (!c[0]) return 0;
   if (!c[1]) {
+    // Set limit to the largest possible valid v128_t pointer.
+    // Unsigned modular arithmetic gives the correct result
+    // unless memory size is zero, in which case all pointers are invalid.
     const v128_t *const limit =
         (v128_t *)(__builtin_wasm_memory_size(0) * PAGESIZE) - 1;
 
@@ -244,33 +304,63 @@ size_t strspn(const char *s, const char *c) {
     return s - a;
   }
 
-  volatile v128_t *w = (void *)byteset;
-#pragma unroll
-  for (size_t i = sizeof(byteset) / sizeof(v128_t); i--;) w[i] = (v128_t){};
+#if defined(__OPTIMIZE_SIZE__) || !defined(__OPTIMIZE__)
+
+  // Unoptimized version.
+  memset(byteset, 0, sizeof(byteset));
+  while (*c && (byteset[*(unsigned char *)c] = 1)) c++;
+  while (byteset[*(unsigned char *)s]) s++;
+
+#else
+
+  // This is faster than memset.
+  volatile v128_t *w = (void *)byteset;
+  #pragma unroll
+  for (size_t i = sizeof(byteset) / sizeof(v128_t); i--;) w[i] = (v128_t){};
+  static_assert(sizeof(byteset) % sizeof(v128_t) == 0);
+
+  // Keeping byteset[0] = 0 avoids the other loop having to test for it.
+  while (*c && (byteset[*(unsigned char *)c] = 1)) c++;
+  #pragma unroll 4
+  while (byteset[*(unsigned char *)s]) s++;
+
+#endif
 
-  while (*c && (byteset[*(uint8_t *)c] = 1)) c++;
-#pragma unroll 4
-  while (byteset[*(uint8_t *)s]) s++;
   return s - a;
 }
 
 __attribute__((weak))
 size_t strcspn(const char *s, const char *c) {
 #ifndef _REENTRANT
-  static
+  static // Avoid the stack for builds without threads.
 #endif
   char byteset[UCHAR_MAX + 1];
   const char *const a = s;
 
   if (!c[0] || !c[1]) return __strchrnul(s, *c) - s;
 
-  volatile v128_t *w = (void *)byteset;
-#pragma unroll
-  for (size_t i = sizeof(byteset) / sizeof(v128_t); i--;) w[i] = (v128_t){};
+#if defined(__OPTIMIZE_SIZE__) || !defined(__OPTIMIZE__)
+
+  // Unoptimized version.
+  memset(byteset, 0, sizeof(byteset));
+  while ((byteset[*(unsigned char *)c] = 1) && *c) c++;
+  while (!byteset[*(unsigned char *)s]) s++;
+
+#else
+
+  // This is faster than memset.
+  volatile v128_t *w = (void *)byteset;
+  #pragma unroll
+  for (size_t i = sizeof(byteset) / sizeof(v128_t); i--;) w[i] = (v128_t){};
+  static_assert(sizeof(byteset) % sizeof(v128_t) == 0);
+
+  // Setting byteset[0] = 1 avoids the other loop having to test for it.
+  while ((byteset[*(unsigned char *)c] = 1) && *c) c++;
+  #pragma unroll 4
+  while (!byteset[*(unsigned char *)s]) s++;
+
+#endif
 
-  while ((byteset[*(uint8_t *)c] = 1) && *c) c++;
-#pragma unroll 4
-  while (!byteset[*(uint8_t *)s]) s++;
   return s - a;
 }
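
Note: strspn and strcspn share the byteset technique: build a 256-entry
lookup table from the set string, then scan. This portable sketch mirrors
the unoptimized branch above and shows the terminator trick the comments
describe (the sketch_* names are illustrative; the single-character fast
paths are omitted):

    #include <assert.h>
    #include <stddef.h>

    // strspn keeps byteset[0] == 0, so the scan loop stops at the
    // terminator without testing for it separately.
    static size_t sketch_strspn(const char *s, const char *c) {
      char byteset[256] = {0};
      const char *const a = s;
      while (*c && (byteset[*(unsigned char *)c] = 1)) c++;
      while (byteset[*(unsigned char *)s]) s++;
      return s - a;
    }

    // strcspn sets byteset[0] = 1 instead, so the same scan stops at
    // the terminator, again without a separate test.
    static size_t sketch_strcspn(const char *s, const char *c) {
      char byteset[256] = {0};
      const char *const a = s;
      while ((byteset[*(unsigned char *)c] = 1) && *c) c++;
      while (!byteset[*(unsigned char *)s]) s++;
      return s - a;
    }

    int main(void) {
      assert(sketch_strspn("aabbcc", "ab") == 4);
      assert(sketch_strcspn("aabbcc", "c") == 4);
      return 0;
    }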