fix(selector): tokenize utf-8 support

This commit is contained in:
Fabio Bozzo
2024-10-22 11:12:42 +02:00
parent ac1b03f144
commit d3e2ac07fc
2 changed files with 37 additions and 6 deletions

View File

@@ -5,6 +5,7 @@ import (
"regexp"
"strconv"
"strings"
"unicode/utf8"
)
var (
@@ -102,10 +103,10 @@ func tokenize(str string) []string {
ctx := ""
for col < len(str) {
char := string(str[col])
char, size := utf8.DecodeRuneInString(str[col:])
if char == "\"" && string(str[col-1]) != "\\" {
col++
if char == '"' && (col == 0 || str[col-1] != '\\') {
col += size
if ctx == "\"" {
ctx = ""
} else {
@@ -115,17 +116,17 @@ func tokenize(str string) []string {
}
if ctx == "\"" {
col++
col += size
continue
}
if char == "." || char == "[" {
if char == '.' || char == '[' {
if ofs < col {
toks = append(toks, str[ofs:col])
}
ofs = col
}
col++
col += size
}
if ofs < col && ctx != "\"" {

View File

@@ -0,0 +1,30 @@
package selector
import (
"testing"
"github.com/stretchr/testify/require"
)
// TestTokenizeUTF8 verifies that tokenize yields the expected tokens for
// selectors containing multi-byte UTF-8 text: a plain field followed by an
// index, a quoted key, and a quoted key that itself holds escaped quotes.
func TestTokenizeUTF8(t *testing.T) {
	cases := []struct {
		name  string   // subtest name
		input string   // selector passed to tokenize
		want  []string // tokens tokenize is expected to return
	}{
		{
			name:  "simple UTF-8",
			input: ".こんにちは[0]",
			want:  []string{".", "こんにちは", "[0]"},
		},
		{
			name:  "UTF-8 with quotes",
			input: ".こんにちは[\"привет\"]",
			want:  []string{".", "こんにちは", "[\"привет\"]"},
		},
		{
			name:  "UTF-8 with escaped quotes",
			input: ".こんにちは[\"привет \\\"мир\\\"\"]",
			want:  []string{".", "こんにちは", "[\"привет \\\"мир\\\"\"]"},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := tokenize(tc.input)
			require.Equal(t, tc.want, got)
		})
	}
}