More unicode.

This commit is contained in:
Nuno Cruces
2025-01-16 15:46:49 +00:00
parent a159b548ed
commit ab09da7136
2 changed files with 57 additions and 10 deletions

View File

@@ -5,16 +5,18 @@
// - LIKE and REGEXP operators,
// - collation sequences.
//
// It also provides, from PostgreSQL:
// - unaccent(),
// - initcap().
//
// The implementation is not 100% compatible with the [ICU extension]:
// - upper() and lower() use [strings.ToUpper], [strings.ToLower] and [cases];
// - the LIKE operator follows [strings.EqualFold] rules;
// - the REGEXP operator uses Go [regexp/syntax];
// - collation sequences use [collate].
//
// It also provides (approximately) from PostgreSQL:
// - casefold(),
// - initcap(),
// - normalize(),
// - unaccent().
//
// Expect subtle differences (e.g.) in the handling of Turkish case folding.
//
// [ICU extension]: https://sqlite.org/src/dir/ext/icu
@@ -48,13 +50,13 @@ var RegisterLike = true
// Register registers Unicode aware functions for a database connection.
func Register(db *sqlite3.Conn) error {
const flags = sqlite3.DETERMINISTIC | sqlite3.INNOCUOUS
var errs util.ErrorJoiner
var lkfn sqlite3.ScalarFunction
if RegisterLike {
errs.Join(
db.CreateFunction("like", 2, flags, like),
db.CreateFunction("like", 3, flags, like))
lkfn = like
}
errs.Join(
return errors.Join(
db.CreateFunction("like", 2, flags, lkfn),
db.CreateFunction("like", 3, flags, lkfn),
db.CreateFunction("upper", 1, flags, upper),
db.CreateFunction("upper", 2, flags, upper),
db.CreateFunction("lower", 1, flags, lower),
@@ -62,7 +64,10 @@ func Register(db *sqlite3.Conn) error {
db.CreateFunction("regexp", 2, flags, regex),
db.CreateFunction("initcap", 1, flags, initcap),
db.CreateFunction("initcap", 2, flags, initcap),
db.CreateFunction("casefold", 1, flags, casefold),
db.CreateFunction("unaccent", 1, flags, unaccent),
db.CreateFunction("normalize", 1, flags, normalize),
db.CreateFunction("normalize", 2, flags, normalize),
db.CreateFunction("icu_load_collation", 2, sqlite3.DIRECTONLY,
func(ctx sqlite3.Context, arg ...sqlite3.Value) {
name := arg[1].Text()
@@ -76,7 +81,6 @@ func Register(db *sqlite3.Conn) error {
return // notest
}
}))
return errors.Join(errs...)
}
// RegisterCollation registers a Unicode collation sequence for a database connection.
@@ -154,6 +158,10 @@ func initcap(ctx sqlite3.Context, arg ...sqlite3.Value) {
ctx.ResultRawText(cs.Bytes(arg[0].RawText()))
}
func casefold(ctx sqlite3.Context, arg ...sqlite3.Value) {
ctx.ResultRawText(cases.Fold().Bytes(arg[0].RawText()))
}
func unaccent(ctx sqlite3.Context, arg ...sqlite3.Value) {
unaccent := transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)
res, _, err := transform.Bytes(unaccent, arg[0].RawText())
@@ -164,6 +172,31 @@ func unaccent(ctx sqlite3.Context, arg ...sqlite3.Value) {
}
}
func normalize(ctx sqlite3.Context, arg ...sqlite3.Value) {
form := norm.NFC
if len(arg) > 1 {
switch strings.ToUpper(arg[1].Text()) {
case "NFC":
//
case "NFD":
form = norm.NFD
case "NFKC":
form = norm.NFKC
case "NFKD":
form = norm.NFKD
default:
ctx.ResultError(util.ErrorString("unicode: invalid form"))
return
}
}
res, _, err := transform.Bytes(form, arg[0].RawText())
if err != nil {
ctx.ResultError(err) // notest
} else {
ctx.ResultRawText(res)
}
}
func regex(ctx sqlite3.Context, arg ...sqlite3.Value) {
re, ok := ctx.GetAuxData(0).(*regexp.Regexp)
if !ok {

View File

@@ -49,6 +49,12 @@ func TestRegister(t *testing.T) {
{`upper('Dünyanın İlk Borsası', 'tr-TR')`, "DÜNYANIN İLK BORSASI"},
{`initcap('Kad je hladno Marko nosi džemper')`, "Kad Je Hladno Marko Nosi Džemper"},
{`initcap('Kad je hladno Marko nosi džemper', 'hr-HR')`, "Kad Je Hladno Marko Nosi Džemper"},
{`normalize(X'61cc88')`, "ä"},
{`normalize(X'61cc88', 'NFC' )`, "ä"},
{`normalize(X'61cc88', 'NFKC')`, "ä"},
{`normalize('ä', 'NFD' )`, "\x61\xcc\x88"},
{`normalize('ä', 'NFKD')`, "\x61\xcc\x88"},
{`casefold('Maße')`, "masse"},
{`unaccent('Hôtel')`, "Hotel"},
{`'Hello' REGEXP 'ell'`, "1"},
{`'Hello' REGEXP 'el.'`, "1"},
@@ -208,6 +214,14 @@ func TestRegister_error(t *testing.T) {
t.Errorf("got %v, want sqlite3.ERROR", err)
}
err = db.Exec(`SELECT normalize('', 'NF')`)
if err == nil {
t.Error("want error")
}
if !errors.Is(err, sqlite3.ERROR) {
t.Errorf("got %v, want sqlite3.ERROR", err)
}
err = db.Exec(`SELECT 'hello' REGEXP '\'`)
if err == nil {
t.Error("want error")