[ruby/prism] SIMD/SWAR for strpbrk

kddnewton · kddnewton · commit 0666ceabbaf0 · 2026-03-17T11:14:11.000-04:00
ruby/prism@c464b298aa
diff --git a/prism/defines.h b/prism/defines.h
@@ -276,6 +276,18 @@
     #define PRISM_UNLIKELY(x) (x)
 #endif
 
+/**
+ * SIMD / fast-path selection: at most one PRISM_HAS_* macro below is defined.
+ * NOTE(review): _M_ARM64 / _M_X64 skip the __ARM_NEON / __SSSE3__ test; confirm.
+ */
+#if (defined(__aarch64__) && defined(__ARM_NEON)) || defined(_M_ARM64)
+    #define PRISM_HAS_NEON
+#elif (defined(__x86_64__) && defined(__SSSE3__)) || defined(_M_X64)
+    #define PRISM_HAS_SSSE3
+#elif defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+    #define PRISM_HAS_SWAR
+#endif
+
 /**
  * Count trailing zero bits in a 64-bit value. Used by SWAR identifier scanning
  * to find the first non-matching byte in a word.
diff --git a/prism/prism.c b/prism/prism.c
@@ -1783,16 +1783,14 @@ char_is_identifier_utf8(const uint8_t *b, ptrdiff_t n) {
  * Callers must handle any remaining bytes (short tail or non-ASCII/UTF-8)
  * with a byte-at-a-time loop.
  *
- * Up to four optimized implementations are selected at compile time, with a
+ * Up to three optimized implementations are selected at compile time, with a
  * no-op fallback for unsupported platforms:
  *   1. NEON — processes 16 bytes per iteration on aarch64.
- *   2. SSE2 — processes 16 bytes per iteration on x86-64.
- *   3. WASM SIMD — processes 16 bytes per iteration on WebAssembly.
- *   4. SWAR — little-endian fallback, processes 8 bytes per iteration.
- *   5. No-op — returns 0; the caller's byte-at-a-time loop handles everything.
+ *   2. SSSE3 — processes 16 bytes per iteration on x86-64.
+ *   3. SWAR — little-endian fallback, processes 8 bytes per iteration.
  */
 
-#if defined(__aarch64__) && defined(__ARM_NEON)
+#if defined(PRISM_HAS_NEON)
 #include <arm_neon.h>
 
 static inline size_t
@@ -1844,8 +1842,8 @@ scan_identifier_ascii(const uint8_t *start, const uint8_t *end) {
     return (size_t) (cursor - start);
 }
 
-#elif defined(__x86_64__) && defined(__SSE2__)
-#include <emmintrin.h>
+#elif defined(PRISM_HAS_SSSE3)
+#include <tmmintrin.h>
 
 static inline size_t
 scan_identifier_ascii(const uint8_t *start, const uint8_t *end) {
@@ -1886,54 +1884,11 @@ scan_identifier_ascii(const uint8_t *start, const uint8_t *end) {
     return (size_t) (cursor - start);
 }
 
-#elif defined(__wasm_simd128__)
-#include <wasm_simd128.h>
-
-static inline size_t
-scan_identifier_ascii(const uint8_t *start, const uint8_t *end) {
-    const uint8_t *cursor = start;
-
-    while (cursor + 16 <= end) {
-        v128_t v = wasm_v128_load(cursor);
-
-        // Range checks via subtract-and-unsigned-compare: (v - lo) < count
-        // is true iff v is in [lo, lo + count). One subtract + one compare
-        // per range instead of two comparisons + AND.
-
-        // Fold case: OR with 0x20 maps A-Z to a-z.
-        v128_t lowered = wasm_v128_or(v, wasm_u8x16_splat(0x20));
-        v128_t letter = wasm_u8x16_lt(
-            wasm_i8x16_sub(lowered, wasm_u8x16_splat(0x61)),
-            wasm_u8x16_splat(0x1A));
-
-        v128_t digit = wasm_u8x16_lt(
-            wasm_i8x16_sub(v, wasm_u8x16_splat(0x30)),
-            wasm_u8x16_splat(0x0A));
-
-        v128_t underscore = wasm_i8x16_eq(v, wasm_u8x16_splat(0x5F));
-
-        v128_t ident = wasm_v128_or(wasm_v128_or(letter, digit), underscore);
-
-        // Fast path: if all 16 bytes are identifier chars, advance.
-        if (wasm_i8x16_all_true(ident)) {
-            cursor += 16;
-            continue;
-        }
-
-        // Extract bitmask only on the exit path to find the first non-match.
-        uint32_t mask = wasm_i8x16_bitmask(ident);
-        cursor += pm_ctzll((uint64_t) (~mask & 0xFFFF));
-        return (size_t) (cursor - start);
-    }
-
-    return (size_t) (cursor - start);
-}
-
 // The SWAR path uses pm_ctzll to find the first non-matching byte within a
 // word, which only yields the correct byte index on little-endian targets.
 // We gate on a positive little-endian check so that unknown-endianness
 // platforms safely fall through to the no-op fallback.
-#elif defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#elif defined(PRISM_HAS_SWAR)
 
 /**
  * Portable SWAR fallback — processes 8 bytes per iteration.
diff --git a/prism/util/pm_strpbrk.c b/prism/util/pm_strpbrk.c
@@ -29,13 +29,214 @@ pm_strpbrk_explicit_encoding_set(pm_parser_t *parser, uint32_t start, uint32_t l
     parser->explicit_encoding = parser->encoding;
 }
 
+/**
+ * Scan forward through ASCII bytes looking for a byte that is in the given
+ * charset. Returns true if a match was found, storing its offset in *index.
+ * Returns false if no match was found, storing the number of ASCII bytes
+ * consumed in *index (so the caller can skip past them).
+ *
+ * All charset characters must be ASCII (< 0x80). The scanner stops at non-ASCII
+ * bytes, returning control to the caller's encoding-aware loop.
+ *
+ * One of up to three optimized implementations is selected at compile time,
+ * with a no-op fallback for unsupported platforms:
+ *   1. NEON — processes 16 bytes per iteration on aarch64.
+ *   2. SSSE3 — processes 16 bytes per iteration on x86-64.
+ *   3. SWAR — little-endian fallback, processes 8 bytes per iteration.
+ */
+
+#if defined(PRISM_HAS_NEON)
+#include <arm_neon.h>
+
+static inline bool
+scan_strpbrk_ascii(const uint8_t *source, size_t maximum, const uint8_t *charset, size_t *index) {
+    // Build the lookup tables from the charset, read up to (not including) its
+    // NUL terminator. For each charset byte c, bit (1 << (c >> 4)) is OR-ed into
+    // low_arr[c & 0xF] and stored in high_arr[c >> 4]; the uint8_t cast drops
+    // that bit when c >= 0x80, so such bytes can never match in the vector loop.
+    // A source byte s matches iff (low_lut[s & 0xF] & high_lut[s >> 4]) != 0.
+    // table[] is a 256-bit bitmap of the same charset, used by the scalar loop.
+    uint8_t low_arr[16] = { 0 };
+    uint8_t high_arr[16] = { 0 };
+    uint64_t table[4] = { 0 };
+
+    for (const uint8_t *c = charset; *c != '\0'; c++) { // the NUL terminator is not added to the set
+        low_arr[*c & 0x0F] |= (uint8_t) (1 << (*c >> 4));
+        high_arr[*c >> 4] = (uint8_t) (1 << (*c >> 4));
+        table[*c >> 6] |= (uint64_t) 1 << (*c & 0x3F); // word = top 2 bits of c, bit = low 6 bits
+    }
+
+    uint8x16_t low_lut = vld1q_u8(low_arr);
+    uint8x16_t high_lut = vld1q_u8(high_arr);
+    uint8x16_t mask_0f = vdupq_n_u8(0x0F);
+    uint8x16_t mask_80 = vdupq_n_u8(0x80);
+
+    size_t idx = 0;
+
+    while (idx + 16 <= maximum) {
+        uint8x16_t v = vld1q_u8(source + idx);
+
+        // If any byte has the high bit set, the chunk holds non-ASCII data.
+        // Leave the vector loop; the scalar loop below scans up to that byte.
+        if (vmaxvq_u8(vandq_u8(v, mask_80)) != 0) break;
+
+        uint8x16_t lo_class = vqtbl1q_u8(low_lut, vandq_u8(v, mask_0f));
+        uint8x16_t hi_class = vqtbl1q_u8(high_lut, vshrq_n_u8(v, 4));
+        uint8x16_t matched = vtstq_u8(lo_class, hi_class); // per lane: non-zero iff (lo_class & hi_class) != 0
+
+        if (vmaxvq_u8(matched) == 0) { // no lane matched: skip the whole chunk
+            idx += 16;
+            continue;
+        }
+
+        // Find the first matching byte, checking the low 8 lanes before the high 8.
+        uint64_t lo64 = vgetq_lane_u64(vreinterpretq_u64_u8(matched), 0);
+        if (lo64 != 0) {
+            *index = idx + pm_ctzll(lo64) / 8; // 8 bits per lane: bit index -> byte index
+            return true;
+        }
+        uint64_t hi64 = vgetq_lane_u64(vreinterpretq_u64_u8(matched), 1);
+        *index = idx + 8 + pm_ctzll(hi64) / 8;
+        return true;
+    }
+
+    // Scalar loop: the last < 16 bytes, or the ASCII bytes ahead of a non-ASCII one.
+    while (idx < maximum && source[idx] < 0x80) {
+        uint8_t byte = source[idx];
+        if (table[byte >> 6] & ((uint64_t) 1 << (byte & 0x3F))) {
+            *index = idx;
+            return true;
+        }
+        idx++;
+    }
+
+    *index = idx; // number of ASCII bytes consumed without finding a match
+    return false;
+}
+
+#elif defined(PRISM_HAS_SSSE3)
+#include <tmmintrin.h>
+
+static inline bool
+scan_strpbrk_ascii(const uint8_t *source, size_t maximum, const uint8_t *charset, size_t *index) {
+    // Build the nibble lookup tables (vector loop) and the bitmap (scalar loop) in one pass.
+    uint8_t low_arr[16] = { 0 };
+    uint8_t high_arr[16] = { 0 };
+    uint64_t table[4] = { 0 };
+
+    for (const uint8_t *c = charset; *c != '\0'; c++) { // the NUL terminator is not added to the set
+        low_arr[*c & 0x0F] |= (uint8_t) (1 << (*c >> 4));
+        high_arr[*c >> 4] = (uint8_t) (1 << (*c >> 4));
+        table[*c >> 6] |= (uint64_t) 1 << (*c & 0x3F); // word = top 2 bits of c, bit = low 6 bits
+    }
+
+    __m128i low_lut = _mm_loadu_si128((const __m128i *) low_arr);
+    __m128i high_lut = _mm_loadu_si128((const __m128i *) high_arr);
+    __m128i mask_0f = _mm_set1_epi8(0x0F);
+
+    size_t idx = 0;
+
+    while (idx + 16 <= maximum) {
+        __m128i v = _mm_loadu_si128((const __m128i *) (source + idx));
+
+        // A set high bit means non-ASCII: leave the vector loop for the scalar loop below.
+        if (_mm_movemask_epi8(v) != 0) break;
+
+        // Nibble-based classification using pshufb (SSSE3), same as NEON
+        // vqtbl1q_u8. A byte matches iff (low_lut[lo_nib] & high_lut[hi_nib]) != 0.
+        __m128i lo_class = _mm_shuffle_epi8(low_lut, _mm_and_si128(v, mask_0f));
+        __m128i hi_class = _mm_shuffle_epi8(high_lut, _mm_and_si128(_mm_srli_epi16(v, 4), mask_0f)); // 16-bit shift: mask_0f clears bits pulled in from the adjacent byte
+        __m128i matched = _mm_and_si128(lo_class, hi_class);
+
+        // Bit i of mask is set when byte i did NOT match (matched[i] == 0).
+        int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(matched, _mm_setzero_si128()));
+
+        if (mask == 0xFFFF) {
+            // All bytes were zero — no match in this chunk.
+            idx += 16;
+            continue;
+        }
+
+        // The lowest clear bit of mask is the first matching byte.
+        *index = idx + pm_ctzll((uint64_t) (~mask & 0xFFFF));
+        return true;
+    }
+
+    // Scalar loop: the last < 16 bytes, or the ASCII bytes ahead of a non-ASCII one.
+    while (idx < maximum && source[idx] < 0x80) {
+        uint8_t byte = source[idx];
+        if (table[byte >> 6] & ((uint64_t) 1 << (byte & 0x3F))) {
+            *index = idx;
+            return true;
+        }
+        idx++;
+    }
+
+    *index = idx; // number of ASCII bytes consumed without finding a match
+    return false;
+}
+
+#elif defined(PRISM_HAS_SWAR)
+
+static inline bool
+scan_strpbrk_ascii(const uint8_t *source, size_t maximum, const uint8_t *charset, size_t *index) {
+    // Build a 256-bit lookup table (one bit per possible byte value) from the charset.
+    uint64_t table[4] = { 0 };
+    for (const uint8_t *c = charset; *c != '\0'; c++) { // the NUL terminator is not added to the set
+        table[*c >> 6] |= (uint64_t) 1 << (*c & 0x3F); // word = top 2 bits of c, bit = low 6 bits
+    }
+
+    static const uint64_t highs = 0x8080808080808080ULL; // high bit of each of the 8 bytes
+    size_t idx = 0;
+
+    while (idx + 8 <= maximum) {
+        uint64_t word;
+        memcpy(&word, source + idx, 8);
+
+        // Any high bit set means a non-ASCII byte: leave for the scalar loop below.
+        if (word & highs) break;
+
+        // All 8 bytes are ASCII; test each one against the charset bitmap in order.
+        for (size_t j = 0; j < 8; j++) {
+            uint8_t byte = source[idx + j];
+            if (table[byte >> 6] & ((uint64_t) 1 << (byte & 0x3F))) {
+                *index = idx + j;
+                return true;
+            }
+        }
+
+        idx += 8;
+    }
+
+    // Scalar loop: the last < 8 bytes, or the ASCII bytes ahead of a non-ASCII one.
+    while (idx < maximum && source[idx] < 0x80) {
+        uint8_t byte = source[idx];
+        if (table[byte >> 6] & ((uint64_t) 1 << (byte & 0x3F))) {
+            *index = idx;
+            return true;
+        }
+        idx++;
+    }
+
+    *index = idx; // number of ASCII bytes consumed without finding a match
+    return false;
+}
+
+#else
+
+static inline bool
+scan_strpbrk_ascii(PRISM_ATTRIBUTE_UNUSED const uint8_t *source, PRISM_ATTRIBUTE_UNUSED size_t maximum, PRISM_ATTRIBUTE_UNUSED const uint8_t *charset, size_t *index) {
+    *index = 0; // nothing consumed: the caller's loop scans from offset 0
+    return false; // this fallback never reports a match
+}
+
+#endif
+
 /**
  * This is the default path.
  */
 static inline const uint8_t *
-pm_strpbrk_utf8(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
-    size_t index = 0;
-
+pm_strpbrk_utf8(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t index, size_t maximum, bool validate) {
     while (index < maximum) {
         if (strchr((const char *) charset, source[index]) != NULL) {
             return source + index;
@@ -73,9 +274,7 @@ pm_strpbrk_utf8(pm_parser_t *parser, const uint8_t *source, const uint8_t *chars
  * This is the path when the encoding is ASCII-8BIT.
  */
 static inline const uint8_t *
-pm_strpbrk_ascii_8bit(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
-    size_t index = 0;
-
+pm_strpbrk_ascii_8bit(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t index, size_t maximum, bool validate) {
     while (index < maximum) {
         if (strchr((const char *) charset, source[index]) != NULL) {
             return source + index;
@@ -92,8 +291,7 @@ pm_strpbrk_ascii_8bit(pm_parser_t *parser, const uint8_t *source, const uint8_t
  * This is the slow path that does care about the encoding.
  */
 static inline const uint8_t *
-pm_strpbrk_multi_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
-    size_t index = 0;
+pm_strpbrk_multi_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t index, size_t maximum, bool validate) {
     const pm_encoding_t *encoding = parser->encoding;
 
     while (index < maximum) {
@@ -135,8 +333,7 @@ pm_strpbrk_multi_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t
  * the encoding only supports single-byte characters.
  */
 static inline const uint8_t *
-pm_strpbrk_single_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
-    size_t index = 0;
+pm_strpbrk_single_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t index, size_t maximum, bool validate) {
     const pm_encoding_t *encoding = parser->encoding;
 
     while (index < maximum) {
@@ -192,15 +389,19 @@ pm_strpbrk_single_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t
  */
 const uint8_t *
 pm_strpbrk(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length, bool validate) {
-    if (length <= 0) {
-        return NULL;
-    } else if (!parser->encoding_changed) {
-        return pm_strpbrk_utf8(parser, source, charset, (size_t) length, validate);
+    if (length <= 0) return NULL;
+
+    size_t maximum = (size_t) length; // length > 0 is guaranteed by the guard above
+    size_t index = 0; // after the scan: the match offset, or the count of ASCII bytes skipped
+    if (scan_strpbrk_ascii(source, maximum, charset, &index)) return source + index; // fast path found a charset byte
+    // NOTE(review): the scan never matches a NUL source byte; strchr() in pm_strpbrk_utf8/_ascii_8bit matches the terminator — confirm.
+    if (!parser->encoding_changed) {
+        return pm_strpbrk_utf8(parser, source, charset, index, maximum, validate); // resumes at index
     } else if (parser->encoding == PM_ENCODING_ASCII_8BIT_ENTRY) {
-        return pm_strpbrk_ascii_8bit(parser, source, charset, (size_t) length, validate);
+        return pm_strpbrk_ascii_8bit(parser, source, charset, index, maximum, validate);
     } else if (parser->encoding->multibyte) {
-        return pm_strpbrk_multi_byte(parser, source, charset, (size_t) length, validate);
+        return pm_strpbrk_multi_byte(parser, source, charset, index, maximum, validate);
     } else {
-        return pm_strpbrk_single_byte(parser, source, charset, (size_t) length, validate);
+        return pm_strpbrk_single_byte(parser, source, charset, index, maximum, validate);
     }
 }