From d3e2ac07fc3b75f51627a494330666f157ec32e2 Mon Sep 17 00:00:00 2001 From: Fabio Bozzo Date: Tue, 22 Oct 2024 11:12:42 +0200 Subject: [PATCH] fix(selector): tokenize utf-8 support --- pkg/policy/selector/parsing.go | 13 +++++++------ pkg/policy/selector/parsing_test.go | 30 +++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 6 deletions(-) create mode 100644 pkg/policy/selector/parsing_test.go diff --git a/pkg/policy/selector/parsing.go b/pkg/policy/selector/parsing.go index 16843e2..61bb562 100644 --- a/pkg/policy/selector/parsing.go +++ b/pkg/policy/selector/parsing.go @@ -5,6 +5,7 @@ import ( "regexp" "strconv" "strings" + "unicode/utf8" ) var ( @@ -102,10 +103,10 @@ func tokenize(str string) []string { ctx := "" for col < len(str) { - char := string(str[col]) + char, size := utf8.DecodeRuneInString(str[col:]) - if char == "\"" && string(str[col-1]) != "\\" { - col++ + if char == '"' && (col == 0 || str[col-1] != '\\') { + col += size if ctx == "\"" { ctx = "" } else { @@ -115,17 +116,17 @@ func tokenize(str string) []string { } if ctx == "\"" { - col++ + col += size continue } - if char == "." || char == "[" { + if char == '.' || char == '[' { if ofs < col { toks = append(toks, str[ofs:col]) } ofs = col } - col++ + col += size } if ofs < col && ctx != "\"" { diff --git a/pkg/policy/selector/parsing_test.go b/pkg/policy/selector/parsing_test.go new file mode 100644 index 0000000..223170c --- /dev/null +++ b/pkg/policy/selector/parsing_test.go @@ -0,0 +1,30 @@ +package selector + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestTokenizeUTF8(t *testing.T) { + t.Run("simple UTF-8", func(t *testing.T) { + str := ".こんにちは[0]" + expected := []string{".", "こんにちは", "[0]"} + actual := tokenize(str) + require.Equal(t, expected, actual) + }) + + t.Run("UTF-8 with quotes", func(t *testing.T) { + str := ".こんにちは[\"привет\"]" + expected := []string{".", "こんにちは", "[\"привет\"]"} + actual := tokenize(str) + require.Equal(t, expected, actual) + }) + + t.Run("UTF-8 with escaped quotes", func(t *testing.T) { + str := ".こんにちは[\"привет \\\"мир\\\"\"]" + expected := []string{".", "こんにちは", "[\"привет \\\"мир\\\"\"]"} + actual := tokenize(str) + require.Equal(t, expected, actual) + }) +}