From d7aef63844feb061443d13ee4a5a81eeb3986d99 Mon Sep 17 00:00:00 2001
From: Nuno Cruces
Date: Fri, 20 Jun 2025 12:43:17 +0100
Subject: [PATCH] Naming, volatile.

---
 sqlite3/libc/string.h | 96 +++++++++++++++++++++++--------------------
 1 file changed, 51 insertions(+), 45 deletions(-)

diff --git a/sqlite3/libc/string.h b/sqlite3/libc/string.h
index 8a4a67f..8758393 100644
--- a/sqlite3/libc/string.h
+++ b/sqlite3/libc/string.h
@@ -39,11 +39,11 @@ void *memmove(void *dest, const void *src, size_t n) {
 
 #ifdef __wasm_simd128__
 __attribute__((weak))
-int memcmp(const void *v1, const void *v2, size_t n) {
+int memcmp(const void *vl, const void *vr, size_t n) {
   // Scalar algorithm.
   if (n < sizeof(v128_t)) {
-    const unsigned char *u1 = (unsigned char *)v1;
-    const unsigned char *u2 = (unsigned char *)v2;
+    const unsigned char *u1 = (unsigned char *)vl;
+    const unsigned char *u2 = (unsigned char *)vr;
     while (n--) {
       if (*u1 != *u2) return *u1 - *u2;
       u1++;
@@ -56,16 +56,16 @@ int memcmp(const void *v1, const void *v2, size_t n) {
   // Find the first different character in the objects.
   // Unaligned loads handle the case where the objects
   // have mismatching alignments.
-  const v128_t *w1 = (v128_t *)v1;
-  const v128_t *w2 = (v128_t *)v2;
+  const v128_t *v1 = (v128_t *)vl;
+  const v128_t *v2 = (v128_t *)vr;
   while (n) {
-    const v128_t cmp = wasm_i8x16_eq(wasm_v128_load(w1), wasm_v128_load(w2));
+    const v128_t cmp = wasm_i8x16_eq(wasm_v128_load(v1), wasm_v128_load(v2));
     // Bitmask is slow on AArch64, all_true is much faster.
     if (!wasm_i8x16_all_true(cmp)) {
       // Find the offset of the first zero bit (little-endian).
       size_t ctz = __builtin_ctz(~wasm_i8x16_bitmask(cmp));
-      const unsigned char *u1 = (unsigned char *)w1 + ctz;
-      const unsigned char *u2 = (unsigned char *)w2 + ctz;
+      const unsigned char *u1 = (unsigned char *)v1 + ctz;
+      const unsigned char *u2 = (unsigned char *)v2 + ctz;
       // This may help the compiler if the function is inlined.
       __builtin_assume(*u1 - *u2 != 0);
       return *u1 - *u2;
@@ -73,15 +73,15 @@ int memcmp(const void *v1, const void *v2, size_t n) {
     // This makes n a multiple of sizeof(v128_t)
     // for every iteration except the first.
     size_t align = (n - 1) % sizeof(v128_t) + 1;
-    w1 = (v128_t *)((char *)w1 + align);
-    w2 = (v128_t *)((char *)w2 + align);
+    v1 = (v128_t *)((char *)v1 + align);
+    v2 = (v128_t *)((char *)v2 + align);
     n -= align;
   }
   return 0;
 }
 
 __attribute__((weak))
-void *memchr(const void *v, int c, size_t n) {
+void *memchr(const void *s, int c, size_t n) {
   // When n is zero, a function that locates a character finds no occurrence.
   // Otherwise, decrement n to ensure sub_overflow overflows
   // when n would go equal-to-or-below zero.
@@ -92,12 +92,13 @@ void *memchr(const void *v, int c, size_t n) {
   // memchr must behave as if it reads characters sequentially
   // and stops as soon as a match is found.
   // Aligning ensures loads beyond the first match are safe.
-  uintptr_t align = (uintptr_t)v % sizeof(v128_t);
-  const v128_t *w = (v128_t *)((char *)v - align);
-  const v128_t wc = wasm_i8x16_splat(c);
+  // Volatile avoids compiler tricks around out of bounds loads.
+  uintptr_t align = (uintptr_t)s % sizeof(v128_t);
+  const volatile v128_t *v = (v128_t *)((char *)s - align);
+  const v128_t vc = wasm_i8x16_splat(c);
 
   for (;;) {
-    const v128_t cmp = wasm_i8x16_eq(*w, wc);
+    const v128_t cmp = wasm_i8x16_eq(*v, vc);
     // Bitmask is slow on AArch64, any_true is much faster.
     if (wasm_v128_any_true(cmp)) {
       // Clear the bits corresponding to alignment (little-endian)
@@ -113,7 +114,7 @@ void *memchr(const void *v, int c, size_t n) {
         // That's a match, unless it is beyond the end of the object.
         // Recall that we decremented n, so less-than-or-equal-to is correct.
         size_t ctz = __builtin_ctz(mask);
-        return ctz - align <= n ? (char *)w + ctz : NULL;
+        return ctz - align <= n ? (char *)v + ctz : NULL;
       }
     }
     // Decrement n; if it overflows we're done.
@@ -121,28 +122,28 @@ void *memchr(const void *v, int c, size_t n) {
       return NULL;
     }
     align = 0;
-    w++;
+    v++;
   }
 }
 
 __attribute__((weak))
-void *memrchr(const void *v, int c, size_t n) {
+void *memrchr(const void *s, int c, size_t n) {
   // memrchr is allowed to read up to n bytes from the object.
   // Search backward for the last matching character.
-  const v128_t *w = (v128_t *)((char *)v + n);
-  const v128_t wc = wasm_i8x16_splat(c);
+  const v128_t *v = (v128_t *)((char *)s + n);
+  const v128_t vc = wasm_i8x16_splat(c);
 
   for (; n >= sizeof(v128_t); n -= sizeof(v128_t)) {
-    const v128_t cmp = wasm_i8x16_eq(wasm_v128_load(--w), wc);
+    const v128_t cmp = wasm_i8x16_eq(wasm_v128_load(--v), vc);
     // Bitmask is slow on AArch64, any_true is much faster.
     if (wasm_v128_any_true(cmp)) {
       // Find the offset of the last one bit (little-endian).
       size_t clz = __builtin_clz(wasm_i8x16_bitmask(cmp)) - 15;
-      return (char *)(w + 1) - clz;
+      return (char *)(v + 1) - clz;
     }
   }
   // Scalar algorithm.
-  const char *a = (char *)w;
+  const char *a = (char *)v;
   while (n--) {
     if (*(--a) == (char)c) return (char *)a;
   }
@@ -154,12 +155,13 @@ size_t strlen(const char *s) {
   // strlen must stop as soon as it finds the terminator.
   // Aligning ensures loads beyond the terminator are safe.
   uintptr_t align = (uintptr_t)s % sizeof(v128_t);
-  const v128_t *w = (v128_t *)(s - align);
+  const volatile v128_t *v = (v128_t *)(s - align);
 
   for (;;) {
+    const v128_t vv = *v;
     // Bitmask is slow on AArch64, all_true is much faster.
-    if (!wasm_i8x16_all_true(*w)) {
-      const v128_t cmp = wasm_i8x16_eq(*w, (v128_t){});
+    if (!wasm_i8x16_all_true(vv)) {
+      const v128_t cmp = wasm_i8x16_eq(vv, (v128_t){});
       // Clear the bits corresponding to alignment (little-endian)
       // so we can count trailing zeros.
       int mask = wasm_i8x16_bitmask(cmp) >> align << align;
@@ -170,11 +172,11 @@ size_t strlen(const char *s) {
       // it's as if we didn't find anything.
       if (mask) {
         // Find the offset of the first one bit (little-endian).
-        return (char *)w - s + __builtin_ctz(mask);
+        return (char *)v - s + __builtin_ctz(mask);
       }
     }
     align = 0;
-    w++;
+    v++;
   }
 }
 
@@ -268,12 +270,14 @@ int strncmp(const char *s1, const char *s2, size_t n) {
 static char *__strchrnul(const char *s, int c) {
   // strchrnul must stop as soon as it finds the terminator.
   // Aligning ensures loads beyond the terminator are safe.
+  // Volatile avoids compiler tricks around out of bounds loads.
   uintptr_t align = (uintptr_t)s % sizeof(v128_t);
-  const v128_t *w = (v128_t *)(s - align);
-  const v128_t wc = wasm_i8x16_splat(c);
+  const volatile v128_t *v = (v128_t *)(s - align);
+  const v128_t vc = wasm_i8x16_splat(c);
 
   for (;;) {
-    const v128_t cmp = wasm_i8x16_eq(*w, (v128_t){}) | wasm_i8x16_eq(*w, wc);
+    const v128_t vv = *v;
+    const v128_t cmp = wasm_i8x16_eq(vv, (v128_t){}) | wasm_i8x16_eq(vv, vc);
     // Bitmask is slow on AArch64, any_true is much faster.
     if (wasm_v128_any_true(cmp)) {
       // Clear the bits corresponding to alignment (little-endian)
@@ -286,11 +290,11 @@ static char *__strchrnul(const char *s, int c) {
       // it's as if we didn't find anything.
       if (mask) {
         // Find the offset of the first one bit (little-endian).
-        return (char *)w + __builtin_ctz(mask);
+        return (char *)v + __builtin_ctz(mask);
       }
     }
     align = 0;
-    w++;
+    v++;
   }
 }
 
@@ -371,14 +375,15 @@ __attribute__((weak))
 size_t strspn(const char *s, const char *c) {
   // strspn must stop as soon as it finds the terminator.
   // Aligning ensures loads beyond the terminator are safe.
+  // Volatile avoids compiler tricks around out of bounds loads.
   uintptr_t align = (uintptr_t)s % sizeof(v128_t);
-  const v128_t *w = (v128_t *)(s - align);
+  const volatile v128_t *v = (v128_t *)(s - align);
 
   if (!c[0]) return 0;
   if (!c[1]) {
-    const v128_t wc = wasm_i8x16_splat(*c);
+    const v128_t vc = wasm_i8x16_splat(*c);
     for (;;) {
-      const v128_t cmp = wasm_i8x16_eq(*w, wc);
+      const v128_t cmp = wasm_i8x16_eq(*v, vc);
       // Bitmask is slow on AArch64, all_true is much faster.
       if (!wasm_i8x16_all_true(cmp)) {
         // Clear the bits corresponding to alignment (little-endian)
@@ -391,11 +396,11 @@ size_t strspn(const char *s, const char *c) {
         // it's as if we didn't find anything.
         if (mask) {
           // Find the offset of the first one bit (little-endian).
-          return (char *)w - s + __builtin_ctz(mask);
+          return (char *)v - s + __builtin_ctz(mask);
         }
       }
       align = 0;
-      w++;
+      v++;
     }
   }
 
@@ -407,7 +412,7 @@ size_t strspn(const char *s, const char *c) {
   }
 
   for (;;) {
-    const v128_t cmp = __wasm_v128_chkbits(bitmap, *w);
+    const v128_t cmp = __wasm_v128_chkbits(bitmap, *v);
     // Bitmask is slow on AArch64, all_true is much faster.
     if (!wasm_i8x16_all_true(cmp)) {
       // Clear the bits corresponding to alignment (little-endian)
@@ -420,11 +425,11 @@ size_t strspn(const char *s, const char *c) {
      // it's as if we didn't find anything.
       if (mask) {
         // Find the offset of the first one bit (little-endian).
-        return (char *)w - s + __builtin_ctz(mask);
+        return (char *)v - s + __builtin_ctz(mask);
       }
     }
     align = 0;
-    w++;
+    v++;
   }
 }
 
@@ -434,8 +439,9 @@ size_t strcspn(const char *s, const char *c) {
 
   // strcspn must stop as soon as it finds the terminator.
   // Aligning ensures loads beyond the terminator are safe.
+  // Volatile avoids compiler tricks around out of bounds loads.
   uintptr_t align = (uintptr_t)s % sizeof(v128_t);
-  const v128_t *w = (v128_t *)(s - align);
+  const volatile v128_t *v = (v128_t *)(s - align);
 
   __wasm_v128_bitmap256_t bitmap = {};
 
@@ -445,7 +451,7 @@ size_t strcspn(const char *s, const char *c) {
   } while (*c++);
 
   for (;;) {
-    const v128_t cmp = __wasm_v128_chkbits(bitmap, *w);
+    const v128_t cmp = __wasm_v128_chkbits(bitmap, *v);
     // Bitmask is slow on AArch64, any_true is much faster.
     if (wasm_v128_any_true(cmp)) {
       // Clear the bits corresponding to alignment (little-endian)
@@ -458,11 +464,11 @@ size_t strcspn(const char *s, const char *c) {
       // it's as if we didn't find anything.
       if (mask) {
         // Find the offset of the first one bit (little-endian).
-        return (char *)w - s + __builtin_ctz(mask);
+        return (char *)v - s + __builtin_ctz(mask);
      }
    }
     align = 0;
-    w++;
+    v++;
   }
 }
 
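A note on memcmp's stepping, outside the diff: size_t align = (n - 1) % sizeof(v128_t) + 1 advances both pointers by the tail remainder on the first iteration and by a full vector afterwards; re-comparing the overlapped bytes is harmless because both pointers move in lockstep over bytes already known equal. A minimal standalone sketch of that arithmetic (the constant 16 stands in for sizeof(v128_t); this program is illustrative, not part of the patch):

#include <assert.h>
#include <stddef.h>

int main(void) {
  // With n = 35 the first step is (35 - 1) % 16 + 1 == 3,
  // leaving n == 32, a multiple of 16. Every later step is
  // (n - 1) % 16 + 1 == 16, a full vector, so only the first
  // pair of loads overlaps the next one.
  size_t n = 35;
  size_t step = (n - 1) % 16 + 1;
  assert(step == 3);
  n -= step;
  while (n) {
    step = (n - 1) % 16 + 1;
    assert(step == 16);
    n -= step;
  }
  return 0;
}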
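On the renamed pointers and the new volatile: memchr, strlen, __strchrnul, strspn, and strcspn all round s down to a 16-byte boundary, so the vector load may start before the object but can never cross into another Wasm page; volatile keeps the compiler from reasoning about that deliberate out-of-bounds read and optimizing around it. Match bits that fall before the true start of the object are then discarded with mask >> align << align. A hedged scalar illustration of that masking (the helper name here is ours, not the header's):

#include <assert.h>
#include <stdint.h>

// Clear the low `align` bits of a little-endian 16-bit match mask.
// Those bits describe bytes before the start of the object, pulled
// in by the rounded-down vector load, so they must not count.
static int drop_prefix_matches(int mask, uintptr_t align) {
  return mask >> align << align;
}

int main(void) {
  // Matches at bytes 1 and 5 of the aligned chunk (0x0022): when the
  // object starts at byte 3, the byte-1 match is discarded and only
  // the byte-5 match (0x0020) survives.
  assert(drop_prefix_matches(0x0022, 3) == 0x0020);
  // After the first chunk, align is zeroed and masks pass through.
  assert(drop_prefix_matches(0x0022, 0) == 0x0022);
  return 0;
}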
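strspn and strcspn classify bytes through a 256-bit bitmap (__wasm_v128_bitmap256_t and __wasm_v128_chkbits, defined elsewhere in this header). The idea they vectorize is plain set membership, one bit per byte value; note how strcspn's do { ... } while (*c++) also sets the bit for the terminator, so a single test stops the scan at either a set byte or the end of the string. A scalar analogue under those assumptions (the type and helpers below are ours, not the header's API):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

// One bit per possible byte value: bit c set means byte c is in the set.
typedef struct { uint64_t w[4]; } bitmap256;

static void bitmap_set(bitmap256 *b, unsigned char c) {
  b->w[c >> 6] |= (uint64_t)1 << (c & 63);
}

static int bitmap_test(const bitmap256 *b, unsigned char c) {
  return (b->w[c >> 6] >> (c & 63)) & 1;
}

// Scalar strcspn over the bitmap, mirroring the patch's structure:
// because '\0' is in the bitmap, the scan needs no separate
// end-of-string check.
static size_t scalar_strcspn(const char *s, const char *c) {
  bitmap256 b;
  memset(&b, 0, sizeof(b));
  do {
    bitmap_set(&b, (unsigned char)*c);
  } while (*c++);
  size_t i = 0;
  while (!bitmap_test(&b, (unsigned char)s[i])) i++;
  return i;
}

int main(void) {
  assert(scalar_strcspn("hello world", " ") == 5);
  // No set byte present: the terminator's bit stops the scan.
  assert(scalar_strcspn("hello", " ") == 5);
  return 0;
}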