#include_next <string.h>  // the system string.h

#ifndef _WASM_SIMD128_STRING_H
#define _WASM_SIMD128_STRING_H

#include <ctype.h>
#include <stdint.h>
#include <strings.h>
#include <wasm_simd128.h>
#include <__macro_PAGESIZE.h>

#ifdef __cplusplus
extern "C" {
#endif

#ifdef __wasm_bulk_memory__

// Use the builtins if compiled with bulk memory operations.
// Clang will intrinsify using SIMD for small, constant N.
// For everything else, this helps inlining.

__attribute__((weak))
void *memset(void *dest, int c, size_t n) {
  return __builtin_memset(dest, c, n);
}

__attribute__((weak))
void *memcpy(void *__restrict dest, const void *__restrict src, size_t n) {
  return __builtin_memcpy(dest, src, n);
}

__attribute__((weak))
void *memmove(void *dest, const void *src, size_t n) {
  return __builtin_memmove(dest, src, n);
}

#endif // __wasm_bulk_memory__

#ifdef __wasm_simd128__

__attribute__((weak))
int memcmp(const void *v1, const void *v2, size_t n) {
  // Scalar algorithm.
  if (n < sizeof(v128_t)) {
    const unsigned char *u1 = (unsigned char *)v1;
    const unsigned char *u2 = (unsigned char *)v2;
    while (n--) {
      if (*u1 != *u2) return *u1 - *u2;
      u1++;
      u2++;
    }
    return 0;
  }

  // memcmp is allowed to read up to n bytes from each object.
  // Find the first different character in the objects.
  // Unaligned loads handle the case where the objects
  // have mismatching alignments.
  const v128_t *w1 = (v128_t *)v1;
  const v128_t *w2 = (v128_t *)v2;
  while (n) {
    const v128_t cmp = wasm_i8x16_eq(wasm_v128_load(w1), wasm_v128_load(w2));
    // Bitmask is slow on AArch64, all_true is much faster.
    if (!wasm_i8x16_all_true(cmp)) {
      // Find the offset of the first zero bit (little-endian).
      size_t ctz = __builtin_ctz(~wasm_i8x16_bitmask(cmp));
      const unsigned char *u1 = (unsigned char *)w1 + ctz;
      const unsigned char *u2 = (unsigned char *)w2 + ctz;
      // This may help the compiler if the function is inlined.
      __builtin_assume(*u1 - *u2 != 0);
      return *u1 - *u2;
    }
    // This makes n a multiple of sizeof(v128_t)
    // for every iteration except the first.
    size_t align = (n - 1) % sizeof(v128_t) + 1;
    w1 = (v128_t *)((char *)w1 + align);
    w2 = (v128_t *)((char *)w2 + align);
    n -= align;
  }
  return 0;
}
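// As a worked example of the realignment step above: with n == 40, the loop
// compares bytes [0,16), then advances by (40-1)%16+1 == 8 so the next loads
// cover [8,24) and [24,40).  Every iteration after the first moves a full
// vector, a few already-equal bytes may be compared twice, and no load ever
// reads past byte n-1.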
__attribute__((weak))
void *memchr(const void *v, int c, size_t n) {
  // When n is zero, a function that locates a character finds no occurrence.
  // Otherwise, decrement n to ensure sub_overflow overflows
  // when n would go equal-to-or-below zero.
  if (!n--) {
    return NULL;
  }

  // memchr must behave as if it reads characters sequentially
  // and stops as soon as a match is found.
  // Aligning ensures loads beyond the first match are safe.
  uintptr_t align = (uintptr_t)v % sizeof(v128_t);
  const v128_t *w = (v128_t *)((char *)v - align);
  const v128_t wc = wasm_i8x16_splat(c);

  for (;;) {
    const v128_t cmp = wasm_i8x16_eq(*w, wc);
    // Bitmask is slow on AArch64, any_true is much faster.
    if (wasm_v128_any_true(cmp)) {
      // Clear the bits corresponding to alignment (little-endian)
      // so we can count trailing zeros.
      int mask = wasm_i8x16_bitmask(cmp) >> align << align;
      // At least one bit will be set, unless we cleared them.
      // Knowing this helps the compiler.
      __builtin_assume(mask || align);
      // If the mask is zero because of alignment,
      // it's as if we didn't find anything.
      if (mask) {
        // Find the offset of the first one bit (little-endian).
        // That's a match, unless it is beyond the end of the object.
        // Recall that we decremented n, so less-than-or-equal-to is correct.
        size_t ctz = __builtin_ctz(mask);
        return ctz <= n + align ? (char *)w + ctz : NULL;
      }
    }
    // Decrement n; if it overflows we're done.
    if (__builtin_sub_overflow(n, sizeof(v128_t) - align, &n)) {
      return NULL;
    }
    align = 0;
    w++;
  }
}

__attribute__((weak))
void *memrchr(const void *v, int c, size_t n) {
  // memrchr is allowed to read up to n bytes from the object.
  // Search backward for the last matching character.
  const v128_t *w = (v128_t *)((char *)v + n);
  const v128_t wc = wasm_i8x16_splat(c);

  for (; n >= sizeof(v128_t); n -= sizeof(v128_t)) {
    const v128_t cmp = wasm_i8x16_eq(wasm_v128_load(--w), wc);
    // Bitmask is slow on AArch64, any_true is much faster.
    if (wasm_v128_any_true(cmp)) {
      // Find the offset of the last one bit (little-endian).
      size_t clz = __builtin_clz(wasm_i8x16_bitmask(cmp)) - 15;
      return (char *)(w + 1) - clz;
    }
  }

  // Scalar algorithm.
  const char *a = (char *)w;
  while (n--) {
    if (*(--a) == (char)c) return (char *)a;
  }
  return NULL;
}

__attribute__((weak))
size_t strlen(const char *s) {
  // strlen must stop as soon as it finds the terminator.
  // Aligning ensures loads beyond the terminator are safe.
  uintptr_t align = (uintptr_t)s % sizeof(v128_t);
  const v128_t *w = (v128_t *)(s - align);

  for (;;) {
    // Bitmask is slow on AArch64, all_true is much faster.
    if (!wasm_i8x16_all_true(*w)) {
      const v128_t cmp = wasm_i8x16_eq(*w, (v128_t){});
      // Clear the bits corresponding to alignment (little-endian)
      // so we can count trailing zeros.
      int mask = wasm_i8x16_bitmask(cmp) >> align << align;
      // At least one bit will be set, unless we cleared them.
      // Knowing this helps the compiler.
      __builtin_assume(mask || align);
      if (mask) {
        // Find the offset of the first one bit (little-endian).
        return (char *)w - s + __builtin_ctz(mask);
      }
    }
    align = 0;
    w++;
  }
}

static int __strcmp_s(const char *s1, const char *s2) {
  // Scalar algorithm.
  const unsigned char *u1 = (unsigned char *)s1;
  const unsigned char *u2 = (unsigned char *)s2;
  for (;;) {
    if (*u1 != *u2) return *u1 - *u2;
    if (*u1 == 0) break;
    u1++;
    u2++;
  }
  return 0;
}

static int __strcmp(const char *s1, const char *s2) {
  // How many bytes can be read before pointers go out of bounds.
  size_t N = __builtin_wasm_memory_size(0) * PAGESIZE -  //
             (size_t)(s1 > s2 ? s1 : s2);

  // Unaligned loads handle the case where the strings
  // have mismatching alignments.
  const v128_t *w1 = (v128_t *)s1;
  const v128_t *w2 = (v128_t *)s2;
  for (; N >= sizeof(v128_t); N -= sizeof(v128_t)) {
    // Find any single bit difference.
    if (wasm_v128_any_true(wasm_v128_load(w1) ^ wasm_v128_load(w2))) {
      // The terminator may come before the difference.
      break;
    }
    // We know all characters are equal.
    // If any is a terminator the strings are equal.
    if (!wasm_i8x16_all_true(wasm_v128_load(w1))) {
      return 0;
    }
    w1++;
    w2++;
  }

  return __strcmp_s((char *)w1, (char *)w2);
}

__attribute__((weak, always_inline))
int strcmp(const char *s1, const char *s2) {
  // Skip the vector search when comparing against small literal strings.
  if (__builtin_constant_p(strlen(s2)) && strlen(s2) < sizeof(v128_t)) {
    return __strcmp_s(s1, s2);
  }
  return __strcmp(s1, s2);
}
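// The alignment trick used by memchr and strlen above (and __strchrnul
// below) works because an aligned v128_t load never crosses a 16-byte
// boundary: if it contains any in-bounds byte it is entirely in bounds.
// As an example, for s == (char *)0x1003 the code loads the vector at
// 0x1000 with align == 3, and `mask >> 3 << 3` clears the bitmask bits of
// lanes 0..2, which belong to bytes before s.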
__attribute__((weak))
int strncmp(const char *s1, const char *s2, size_t n) {
  // How many bytes can be read before pointers go out of bounds.
  size_t N = __builtin_wasm_memory_size(0) * PAGESIZE -  //
             (size_t)(s1 > s2 ? s1 : s2);
  if (n > N) n = N;

  // Unaligned loads handle the case where the strings
  // have mismatching alignments.
  const v128_t *w1 = (v128_t *)s1;
  const v128_t *w2 = (v128_t *)s2;
  for (; n >= sizeof(v128_t); n -= sizeof(v128_t)) {
    // Find any single bit difference.
    if (wasm_v128_any_true(wasm_v128_load(w1) ^ wasm_v128_load(w2))) {
      // The terminator may come before the difference.
      break;
    }
    // We know all characters are equal.
    // If any is a terminator the strings are equal.
    if (!wasm_i8x16_all_true(wasm_v128_load(w1))) {
      return 0;
    }
    w1++;
    w2++;
  }

  // Scalar algorithm.
  const unsigned char *u1 = (unsigned char *)w1;
  const unsigned char *u2 = (unsigned char *)w2;
  while (n--) {
    if (*u1 != *u2) return *u1 - *u2;
    if (*u1 == 0) break;
    u1++;
    u2++;
  }
  return 0;
}

static char *__strchrnul(const char *s, int c) {
  // strchrnul must stop as soon as a match is found.
  // Aligning ensures loads beyond the first match are safe.
  uintptr_t align = (uintptr_t)s % sizeof(v128_t);
  const v128_t *w = (v128_t *)(s - align);
  const v128_t wc = wasm_i8x16_splat(c);

  for (;;) {
    const v128_t cmp = wasm_i8x16_eq(*w, (v128_t){}) | wasm_i8x16_eq(*w, wc);
    // Bitmask is slow on AArch64, any_true is much faster.
    if (wasm_v128_any_true(cmp)) {
      // Clear the bits corresponding to alignment (little-endian)
      // so we can count trailing zeros.
      int mask = wasm_i8x16_bitmask(cmp) >> align << align;
      // At least one bit will be set, unless we cleared them.
      // Knowing this helps the compiler.
      __builtin_assume(mask || align);
      if (mask) {
        // Find the offset of the first one bit (little-endian).
        return (char *)w + __builtin_ctz(mask);
      }
    }
    align = 0;
    w++;
  }
}

__attribute__((weak, always_inline))
char *strchrnul(const char *s, int c) {
  // For finding the terminator, strlen is faster.
  if (__builtin_constant_p(c) && (char)c == 0) {
    return (char *)s + strlen(s);
  }
  return __strchrnul(s, c);
}

__attribute__((weak, always_inline))
char *strchr(const char *s, int c) {
  // For finding the terminator, strlen is faster.
  if (__builtin_constant_p(c) && (char)c == 0) {
    return (char *)s + strlen(s);
  }
  char *r = __strchrnul(s, c);
  return *r == (char)c ? r : NULL;
}

__attribute__((weak, always_inline))
char *strrchr(const char *s, int c) {
  // For finding the terminator, strlen is faster.
  if (__builtin_constant_p(c) && (char)c == 0) {
    return (char *)s + strlen(s);
  }
  // This could also be implemented in a single pass using strchr,
  // advancing to the next match until no more matches are found.
  // That would be suboptimal with lots of consecutive matches.
  return (char *)memrchr(s, c, strlen(s) + 1);
}
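// The always_inline wrappers above lean on constant folding: in a call like
// strchr(s, '\0') the (char)c == 0 test is decided at compile time and the
// call reduces to s + strlen(s), and in strcmp(s, "ok") Clang can typically
// fold strlen("ok") to 2, picking the scalar __strcmp_s path without
// emitting the vector loop.  Non-constant arguments simply fall through to
// the SIMD implementations.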
// SIMDized check which bytes are in a set
// http://0x80.pl/notesen/2018-10-18-simd-byte-lookup.html

typedef struct {
  __u8x16 l;
  __u8x16 h;
} __wasm_v128_bitmap256_t;

__attribute__((always_inline))
static void __wasm_v128_setbit(__wasm_v128_bitmap256_t *bitmap, int i) {
  uint8_t hi_nibble = (uint8_t)i >> 4;
  uint8_t lo_nibble = (uint8_t)i & 0xf;
  bitmap->l[lo_nibble] |= 1 << (hi_nibble - 0);
  bitmap->h[lo_nibble] |= 1 << (hi_nibble - 8);
}

__attribute__((always_inline))
static int __wasm_v128_chkbit(__wasm_v128_bitmap256_t bitmap, int i) {
  uint8_t hi_nibble = (uint8_t)i >> 4;
  uint8_t lo_nibble = (uint8_t)i & 0xf;
  uint8_t bitmask = 1 << (hi_nibble & 0x7);
  uint8_t bitset = (hi_nibble < 8 ? bitmap.l : bitmap.h)[lo_nibble];
  return bitmask & bitset;
}

#ifndef __wasm_relaxed_simd__
#define wasm_i8x16_relaxed_laneselect wasm_v128_bitselect
#define wasm_i8x16_relaxed_swizzle wasm_i8x16_swizzle
#endif // __wasm_relaxed_simd__

__attribute__((always_inline))
static v128_t __wasm_v128_chkbits(__wasm_v128_bitmap256_t bitmap, v128_t v) {
  v128_t hi_nibbles = wasm_u8x16_shr(v, 4);
  v128_t lo_nibbles = v & wasm_u8x16_const_splat(0xf);

  v128_t bitmask_lookup = wasm_u8x16_const(1, 2, 4, 8, 16, 32, 64, 128,  //
                                           1, 2, 4, 8, 16, 32, 64, 128);
  v128_t bitmask = wasm_i8x16_relaxed_swizzle(bitmask_lookup, hi_nibbles);

  v128_t bitsets = wasm_i8x16_relaxed_laneselect(
      wasm_i8x16_relaxed_swizzle(bitmap.l, lo_nibbles),
      wasm_i8x16_relaxed_swizzle(bitmap.h, lo_nibbles),
      wasm_i8x16_lt(hi_nibbles, wasm_u8x16_const_splat(8)));

  return wasm_i8x16_eq(bitsets & bitmask, bitmask);
}

#undef wasm_i8x16_relaxed_laneselect
#undef wasm_i8x16_relaxed_swizzle

__attribute__((weak))
size_t strspn(const char *s, const char *c) {
  // How many bytes can be read before the pointer goes out of bounds.
  size_t N = __builtin_wasm_memory_size(0) * PAGESIZE - (size_t)s;
  const v128_t *w = (v128_t *)s;
  const char *const a = s;

  if (!c[0]) return 0;

  if (!c[1]) {
    const v128_t wc = wasm_i8x16_splat(*c);
    for (; N >= sizeof(v128_t); N -= sizeof(v128_t)) {
      const v128_t cmp = wasm_i8x16_eq(wasm_v128_load(w), wc);
      // Bitmask is slow on AArch64, all_true is much faster.
      if (!wasm_i8x16_all_true(cmp)) {
        // Find the offset of the first zero bit (little-endian).
        size_t ctz = __builtin_ctz(~wasm_i8x16_bitmask(cmp));
        return (char *)w + ctz - s;
      }
      w++;
    }
    // Scalar algorithm.
    for (s = (char *)w; *s == *c; s++);
    return s - a;
  }

  __wasm_v128_bitmap256_t bitmap = {};

  for (; *c; c++) {
    // Terminator IS NOT on the bitmap.
    __wasm_v128_setbit(&bitmap, *c);
  }

  for (; N >= sizeof(v128_t); N -= sizeof(v128_t)) {
    const v128_t cmp = __wasm_v128_chkbits(bitmap, wasm_v128_load(w));
    // Bitmask is slow on AArch64, all_true is much faster.
    if (!wasm_i8x16_all_true(cmp)) {
      // Find the offset of the first zero bit (little-endian).
      size_t ctz = __builtin_ctz(~wasm_i8x16_bitmask(cmp));
      return (char *)w + ctz - s;
    }
    w++;
  }

  // Scalar algorithm.
  for (s = (char *)w; __wasm_v128_chkbit(bitmap, *s); s++);
  return s - a;
}

__attribute__((weak))
size_t strcspn(const char *s, const char *c) {
  if (!c[0] || !c[1]) return __strchrnul(s, *c) - s;

  // How many bytes can be read before the pointer goes out of bounds.
  size_t N = __builtin_wasm_memory_size(0) * PAGESIZE - (size_t)s;
  const v128_t *w = (v128_t *)s;
  const char *const a = s;

  __wasm_v128_bitmap256_t bitmap = {};

  do {
    // Terminator IS on the bitmap.
    __wasm_v128_setbit(&bitmap, *c);
  } while (*c++);

  for (; N >= sizeof(v128_t); N -= sizeof(v128_t)) {
    const v128_t cmp = __wasm_v128_chkbits(bitmap, wasm_v128_load(w));
    // Bitmask is slow on AArch64, any_true is much faster.
    if (wasm_v128_any_true(cmp)) {
      // Find the offset of the first one bit (little-endian).
      size_t ctz = __builtin_ctz(wasm_i8x16_bitmask(cmp));
      return (char *)w + ctz - s;
    }
    w++;
  }

  // Scalar algorithm.
  for (s = (char *)w; !__wasm_v128_chkbit(bitmap, *s); s++);
  return s - a;
}
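// A worked example of the byte-set bitmap used by strspn and strcspn above:
// each byte is split into nibbles, the low nibble selecting a lane and the
// high nibble a bit within it, with bytes 0x00..0x7f kept in .l and
// 0x80..0xff in .h.  So __wasm_v128_setbit(&bitmap, 'A') records 0x41 as
// bit 4 of bitmap.l[1], and __wasm_v128_chkbits recovers it by swizzling
// bitmap.l with the low nibbles and fetching 1 << 4 from bitmask_lookup
// with the high nibble.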
// SIMD-friendly algorithms for substring searching
// http://0x80.pl/notesen/2016-11-28-simd-strfind.html

// For haystacks of known length and large enough needles,
// Boyer-Moore's bad-character rule may be useful,
// as proposed by Horspool, Sunday and Raita.
//
// We augment the SIMD algorithm with Quick Search's
// bad-character shift.
//
// https://igm.univ-mlv.fr/~lecroq/string/node14.html
// https://igm.univ-mlv.fr/~lecroq/string/node18.html
// https://igm.univ-mlv.fr/~lecroq/string/node19.html
// https://igm.univ-mlv.fr/~lecroq/string/node22.html

static const char *__memmem(const char *haystk, size_t sh,  //
                            const char *needle, size_t sn,  //
                            uint8_t bmbc[256]) {
  // We've handled empty and single character needles.
  // The needle is not longer than the haystack.
  __builtin_assume(2 <= sn && sn <= sh);

  // Find the farthest character not equal to the first one.
  size_t i = sn - 1;
  while (i > 0 && needle[0] == needle[i]) i--;
  if (i == 0) i = sn - 1;

  // Subtracting ensures sub_overflow overflows
  // when we reach the end of the haystack.
  if (sh != SIZE_MAX) sh -= sn;

  const v128_t fst = wasm_i8x16_splat(needle[0]);
  const v128_t lst = wasm_i8x16_splat(needle[i]);

  // The last haystack offset for which loading blk_lst is safe.
  const char *H = (char *)(__builtin_wasm_memory_size(0) * PAGESIZE -  //
                           (sizeof(v128_t) + i));

  while (haystk <= H) {
    const v128_t blk_fst = wasm_v128_load((v128_t *)(haystk));
    const v128_t blk_lst = wasm_v128_load((v128_t *)(haystk + i));

    const v128_t eq_fst = wasm_i8x16_eq(fst, blk_fst);
    const v128_t eq_lst = wasm_i8x16_eq(lst, blk_lst);
    const v128_t cmp = eq_fst & eq_lst;

    if (wasm_v128_any_true(cmp)) {
      // The terminator may come before the match.
      if (sh == SIZE_MAX && !wasm_i8x16_all_true(blk_fst)) break;
      // Find the offset of the first one bit (little-endian).
      // Each iteration clears that bit, tries again.
      for (uint32_t mask = wasm_i8x16_bitmask(cmp); mask; mask &= mask - 1) {
        size_t ctz = __builtin_ctz(mask);
        // The match may be after the end of the haystack.
        if (ctz > sh) return NULL;
        // We know the first character matches.
        if (!bcmp(haystk + ctz + 1, needle + 1, sn - 1)) {
          return haystk + ctz;
        }
      }
    }

    size_t skip = sizeof(v128_t);
    if (sh == SIZE_MAX) {
      // Have we reached the end of the haystack?
      if (!wasm_i8x16_all_true(blk_fst)) return NULL;
    } else {
      // Apply the bad-character rule to the character to the right
      // of the rightmost character of the search window.
      if (bmbc) skip += bmbc[(unsigned char)haystk[sn - 1 + sizeof(v128_t)]];
      // Have we reached the end of the haystack?
      if (__builtin_sub_overflow(sh, skip, &sh)) return NULL;
    }
    haystk += skip;
  }

  // Scalar algorithm.
  for (size_t j = 0; j <= sh; j++) {
    for (size_t i = 0;; i++) {
      if (sn == i) return haystk;
      if (sh == SIZE_MAX && !haystk[i]) return NULL;
      if (needle[i] != haystk[i]) break;
    }
    haystk++;
  }
  return NULL;
}
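// A worked example of the bad-character shift consumed above: memmem below
// fills bmbc so that bytes absent from the needle map to sn (capped at 255)
// and needle[i] maps to sn-i-1.  For the needle "needle" this yields
// 'n'->5, 'd'->2, 'l'->1, 'e'->0 and 6 for every other byte, so when the
// byte just right of the search window is, say, 'x' the window skips
// 16+6 bytes, while an 'e' there allows only the usual 16-byte advance.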
__attribute__((weak))
void *memmem(const void *vh, size_t sh, const void *vn, size_t sn) {
  // Return immediately on empty needle.
  if (sn == 0) return (void *)vh;

  // Return immediately when needle is longer than haystack.
  if (sn > sh) return NULL;

  // Skip to the first matching character using memchr,
  // thereby handling single character needles.
  const char *needle = (char *)vn;
  const char *haystk = (char *)memchr(vh, *needle, sh);
  if (!haystk || sn == 1) return (void *)haystk;

  // The haystack got shorter, is the needle now longer than it?
  sh -= haystk - (char *)vh;
  if (sn > sh) return NULL;

  // Is Boyer-Moore's bad-character rule useful?
  if (sn < sizeof(v128_t) || sh - sn < sizeof(v128_t)) {
    return (void *)__memmem(haystk, sh, needle, sn, NULL);
  }

  // Compute Boyer-Moore's bad-character shift function.
  // Only the last 255 characters of the needle matter for shifts up to 255,
  // which is good enough for most needles.
  size_t c = sn;
  size_t i = 0;
  if (c >= 255) {
    i = sn - 255;
    c = 255;
  }
#ifndef _REENTRANT
  static
#endif
      uint8_t bmbc[256];
  memset(bmbc, c, sizeof(bmbc));
  for (; i < sn; i++) {
    // One less than the usual offset because
    // we advance at least one vector at a time.
    bmbc[(unsigned char)needle[i]] = sn - i - 1;
  }

  return (void *)__memmem(haystk, sh, needle, sn, bmbc);
}

__attribute__((weak))
char *strstr(const char *haystk, const char *needle) {
  // Return immediately on empty needle.
  if (!needle[0]) return (char *)haystk;

  // Skip to the first matching character using strchr,
  // thereby handling single character needles.
  haystk = strchr(haystk, *needle);
  if (!haystk || !needle[1]) return (char *)haystk;

  return (char *)__memmem(haystk, SIZE_MAX, needle, strlen(needle), NULL);
}

__attribute__((weak))
char *strcasestr(const char *haystk, const char *needle) {
  // Return immediately on empty needle.
  if (!needle[0]) return (char *)haystk;

  // We've handled empty needles.
  size_t sn = strlen(needle);
  __builtin_assume(sn >= 1);

  // Find the farthest character not equal to the first one.
  size_t i = sn - 1;
  while (i > 0 && needle[0] == needle[i]) i--;
  if (i == 0) i = sn - 1;

  const v128_t fstl = wasm_i8x16_splat(tolower(needle[0]));
  const v128_t fstu = wasm_i8x16_splat(toupper(needle[0]));
  const v128_t lstl = wasm_i8x16_splat(tolower(needle[i]));
  const v128_t lstu = wasm_i8x16_splat(toupper(needle[i]));

  // The last haystk offset for which loading blk_lst is safe.
  const char *H = (char *)(__builtin_wasm_memory_size(0) * PAGESIZE -  //
                           (sizeof(v128_t) + i));

  while (haystk <= H) {
    const v128_t blk_fst = wasm_v128_load((v128_t *)(haystk));
    const v128_t blk_lst = wasm_v128_load((v128_t *)(haystk + i));

    const v128_t eq_fst =
        wasm_i8x16_eq(fstl, blk_fst) | wasm_i8x16_eq(fstu, blk_fst);
    const v128_t eq_lst =
        wasm_i8x16_eq(lstl, blk_lst) | wasm_i8x16_eq(lstu, blk_lst);
    const v128_t cmp = eq_fst & eq_lst;

    if (wasm_v128_any_true(cmp)) {
      // The terminator may come before the match.
      if (!wasm_i8x16_all_true(blk_fst)) break;
      // Find the offset of the first one bit (little-endian).
      // Each iteration clears that bit, tries again.
      for (uint32_t mask = wasm_i8x16_bitmask(cmp); mask; mask &= mask - 1) {
        size_t ctz = __builtin_ctz(mask);
        if (!strncasecmp(haystk + ctz + 1, needle + 1, sn - 1)) {
          return (char *)haystk + ctz;
        }
      }
    }
    // Have we reached the end of the haystack?
    if (!wasm_i8x16_all_true(blk_fst)) return NULL;
    haystk += sizeof(v128_t);
  }

  // Scalar algorithm.
  for (;;) {
    for (size_t i = 0;; i++) {
      if (sn == i) return (char *)haystk;
      if (!haystk[i]) return NULL;
      if (tolower(needle[i]) != tolower(haystk[i])) break;
    }
    haystk++;
  }
  return NULL;
}
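// To illustrate the first/last-character filter shared by __memmem and
// strcasestr: for the needle "needle", i settles on 5, so a position p in a
// 16-byte block is a candidate only if haystk[p] matches 'n' and
// haystk[p+5] matches 'e' (either case in strcasestr); just those positions
// are verified with bcmp or strncasecmp.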
// Given the above SIMD implementations,
// these are best implemented as
// small wrappers over those functions.

// Simple wrappers already in musl:
// - mempcpy
// - strcat
// - strlcat
// - strdup
// - strndup
// - strnlen
// - strpbrk
// - strsep
// - strtok

__attribute__((weak))
void *memccpy(void *__restrict dest, const void *__restrict src, int c,
              size_t n) {
  const void *m = memchr(src, c, n);
  if (m != NULL) {
    n = (char *)m - (char *)src + 1;
    m = (char *)dest + n;
  }
  memcpy(dest, src, n);
  return (void *)m;
}

__attribute__((weak))
size_t strlcpy(char *__restrict dest, const char *__restrict src, size_t n) {
  size_t slen = strlen(src);
  if (n--) {
    if (n > slen) n = slen;
    memcpy(dest, src, n);
    dest[n] = 0;
  }
  return slen;
}

__attribute__((weak))
char *strncat(char *__restrict dest, const char *__restrict src, size_t n) {
  size_t strnlen(const char *s, size_t n);
  size_t dlen = strlen(dest);
  size_t slen = strnlen(src, n);
  memcpy(dest + dlen, src, slen);
  dest[dlen + slen] = 0;
  return dest;
}

static char *__stpcpy(char *__restrict dest, const char *__restrict src) {
  size_t slen = strlen(src);
  memcpy(dest, src, slen + 1);
  return dest + slen;
}

static char *__stpncpy(char *__restrict dest, const char *__restrict src,
                       size_t n) {
  size_t strnlen(const char *s, size_t n);
  size_t slen = strnlen(src, n);
  memcpy(dest, src, slen);
  memset(dest + slen, 0, n - slen);
  return dest + slen;
}

__attribute__((weak, always_inline))
char *stpcpy(char *__restrict dest, const char *__restrict src) {
  return __stpcpy(dest, src);
}

__attribute__((weak, always_inline))
char *strcpy(char *__restrict dest, const char *__restrict src) {
  __stpcpy(dest, src);
  return dest;
}

__attribute__((weak, always_inline))
char *stpncpy(char *__restrict dest, const char *__restrict src, size_t n) {
  return __stpncpy(dest, src, n);
}

__attribute__((weak, always_inline))
char *strncpy(char *__restrict dest, const char *__restrict src, size_t n) {
  __stpncpy(dest, src, n);
  return dest;
}

#endif // __wasm_simd128__

#ifdef __cplusplus
}  // extern "C"
#endif

#endif // _WASM_SIMD128_STRING_H