mirror of
https://github.com/ncruces/go-sqlite3.git
synced 2026-01-12 05:59:14 +00:00
Docs, tweaks.
This commit is contained in:
41
sqlite3/libc/README.md
Normal file
41
sqlite3/libc/README.md
Normal file
@@ -0,0 +1,41 @@
|
||||
# Using SIMD for libc
|
||||
|
||||
I found that implementing some libc functions with Wasm SIMD128 can make them significantly faster.
|
||||
|
||||
Rough numbers for [wazero](https://wazero.io/):
|
||||
|
||||
function | speedup
|
||||
------------ | -----
|
||||
`strlen` | 4.1×
|
||||
`memchr` | 4.1×
|
||||
`strchr` | 4.0×
|
||||
`strrchr` | 9.1×
|
||||
`memcmp` | 13.0×
|
||||
`strcmp` | 10.4×
|
||||
`strncmp` | 15.7×
|
||||
`strcasecmp` | 8.8×
|
||||
`strncasecmp`| 8.6×
|
||||
`strspn` | 9.9×
|
||||
`strcspn` | 9.0×
|
||||
`memmem` | 2.2×
|
||||
`strstr` | 5.5×
|
||||
`strcasestr` | 25.2×
|
||||
|
||||
For functions where musl uses SWAR on a 4-byte `size_t`,
|
||||
the improvement is around 4×.
|
||||
This is very close to the expected theoretical improvement,
|
||||
as we're processing 4× the bytes per cycle (16 _vs._ 4).
|
||||
|
||||
For other functions where there's no algorithmic change,
|
||||
the improvement is around 8×.
|
||||
These functions are harder to optimize
|
||||
(which is why musl doesn't bother with SWAR),
|
||||
so getting an 8× improvement from processing 16× the bytes seems decent.
|
||||
|
||||
String search is harder to compare, since there are algorithmic changes,
|
||||
and different needles produce very different numbers.
|
||||
We use [Quick Search](https://igm.univ-mlv.fr/~lecroq/string/node19.html) for `memmem`,
|
||||
and [Rabin–Karp](https://igm.univ-mlv.fr/~lecroq/string/node5.html) for `strstr` and `strcasestr`;
|
||||
musl uses [Two Way](https://igm.univ-mlv.fr/~lecroq/string/node26.html) for `memmem` and `strstr`,
|
||||
and [brute force](https://igm.univ-mlv.fr/~lecroq/string/node3.html) for `strcasestr`.
|
||||
Unlike Two Way, both replacements can degrade to quadratic time for long, periodic needles.
|
||||
@@ -402,8 +402,8 @@ size_t strspn(const char *s, const char *c) {
|
||||
__wasm_v128_bitmap256_t bitmap = {};
|
||||
|
||||
for (; *c; c++) {
|
||||
__wasm_v128_setbit(&bitmap, *c);
|
||||
// Terminator IS NOT on the bitmap.
|
||||
__wasm_v128_setbit(&bitmap, *c);
|
||||
}
|
||||
|
||||
for (; N >= sizeof(v128_t); N -= sizeof(v128_t)) {
|
||||
@@ -433,11 +433,10 @@ size_t strcspn(const char *s, const char *c) {
|
||||
|
||||
__wasm_v128_bitmap256_t bitmap = {};
|
||||
|
||||
for (;;) {
|
||||
__wasm_v128_setbit(&bitmap, *c);
|
||||
do {
|
||||
// Terminator IS on the bitmap.
|
||||
if (!*c++) break;
|
||||
}
|
||||
__wasm_v128_setbit(&bitmap, *c);
|
||||
} while (*c++);
|
||||
|
||||
for (; N >= sizeof(v128_t); N -= sizeof(v128_t)) {
|
||||
const v128_t cmp = __wasm_v128_chkbits(bitmap, wasm_v128_load(w));
|
||||
@@ -465,13 +464,13 @@ size_t strcspn(const char *s, const char *c) {
|
||||
// We augment the SIMD algorithm with Quick Search's
|
||||
// bad-character shift.
|
||||
//
|
||||
// https://www-igm.univ-mlv.fr/~lecroq/string/node14.html
|
||||
// https://www-igm.univ-mlv.fr/~lecroq/string/node18.html
|
||||
// https://www-igm.univ-mlv.fr/~lecroq/string/node19.html
|
||||
// https://www-igm.univ-mlv.fr/~lecroq/string/node22.html
|
||||
// https://igm.univ-mlv.fr/~lecroq/string/node14.html
|
||||
// https://igm.univ-mlv.fr/~lecroq/string/node18.html
|
||||
// https://igm.univ-mlv.fr/~lecroq/string/node19.html
|
||||
// https://igm.univ-mlv.fr/~lecroq/string/node22.html
|
||||
|
||||
static const char *__memmem(const char *haystk, size_t sh,
|
||||
const char *needle, size_t sn,
|
||||
static const char *__memmem(const char *haystk, size_t sh, //
|
||||
const char *needle, size_t sn, //
|
||||
uint8_t bmbc[256]) {
|
||||
// We've handled empty and single character needles.
|
||||
// The needle is not longer than the haystack.
|
||||
@@ -490,8 +489,8 @@ static const char *__memmem(const char *haystk, size_t sh,
|
||||
const v128_t lst = wasm_i8x16_splat(needle[i]);
|
||||
|
||||
// The last haystack offset for which loading blk_lst is safe.
|
||||
const char *H = (char *)(__builtin_wasm_memory_size(0) * PAGESIZE - i -
|
||||
sizeof(v128_t));
|
||||
const char *H = (char *)(__builtin_wasm_memory_size(0) * PAGESIZE - //
|
||||
(sizeof(v128_t) + i));
|
||||
|
||||
while (haystk <= H) {
|
||||
const v128_t blk_fst = wasm_v128_load((v128_t *)(haystk));
|
||||
@@ -622,8 +621,8 @@ char *strcasestr(const char *haystk, const char *needle) {
|
||||
const v128_t lstu = wasm_i8x16_splat(toupper(needle[i]));
|
||||
|
||||
// The last haystk offset for which loading blk_lst is safe.
|
||||
const char *H =
|
||||
(char *)(__builtin_wasm_memory_size(0) * PAGESIZE - i - sizeof(v128_t));
|
||||
const char *H = (char *)(__builtin_wasm_memory_size(0) * PAGESIZE - //
|
||||
(sizeof(v128_t) + i));
|
||||
|
||||
while (haystk <= H) {
|
||||
const v128_t blk_fst = wasm_v128_load((v128_t *)(haystk));
|
||||
@@ -680,7 +679,8 @@ char *strcasestr(const char *haystk, const char *needle) {
|
||||
// - strtok
|
||||
|
||||
__attribute__((weak))
|
||||
void *memccpy(void *__restrict dest, const void *__restrict src, int c, size_t n) {
|
||||
void *memccpy(void *__restrict dest, const void *__restrict src, int c,
|
||||
size_t n) {
|
||||
const void *m = memchr(src, c, n);
|
||||
if (m != NULL) {
|
||||
n = (char *)m - (char *)src + 1;
|
||||
@@ -717,7 +717,8 @@ static char *__stpcpy(char *__restrict dest, const char *__restrict src) {
|
||||
return dest + slen;
|
||||
}
|
||||
|
||||
static char *__stpncpy(char *__restrict dest, const char *__restrict src, size_t n) {
|
||||
static char *__stpncpy(char *__restrict dest, const char *__restrict src,
|
||||
size_t n) {
|
||||
size_t strnlen(const char *s, size_t n);
|
||||
size_t slen = strnlen(src, n);
|
||||
memcpy(dest, src, slen);
|
||||
|
||||
Reference in New Issue
Block a user