Files
sqlite3/ext/unicode/unicode.go

288 lines
7.1 KiB
Go
Raw Permalink Normal View History

2023-07-12 13:29:14 +01:00
// Package unicode provides an alternative to the SQLite ICU extension.
2023-07-11 13:38:29 +01:00
//
2023-08-31 16:30:52 +01:00
// Like the [ICU extension], it provides Unicode aware:
2025-03-24 22:38:22 +00:00
// - upper() and lower() functions
// - LIKE and REGEXP operators
// - collation sequences
2023-07-12 13:29:14 +01:00
//
2025-03-14 11:37:48 +00:00
// Like PostgreSQL, it also provides:
2025-03-24 22:38:22 +00:00
// - initcap()
// - casefold()
// - normalize()
// - unaccent()
2025-01-16 15:46:49 +00:00
//
2025-03-14 11:37:48 +00:00
// The implementations are not 100% compatible:
2025-03-24 22:38:22 +00:00
//   - upper(), lower(), initcap() and casefold() use [strings.ToUpper], [strings.ToLower], [strings.Title] and [cases]
// - normalize(), unaccent() use [transform] and [unicode.Mn]
// - the LIKE operator follows [strings.EqualFold] rules
// - the REGEXP operator uses Go [regexp/syntax]
// - collation sequences use [collate]
2025-03-14 11:37:48 +00:00
//
2023-07-12 13:29:14 +01:00
// Expect subtle differences, e.g. in the handling of Turkish case folding.
2023-08-31 16:30:52 +01:00
//
// [ICU extension]: https://sqlite.org/src/dir/ext/icu
2023-07-11 13:38:29 +01:00
package unicode
import (
"bytes"
2024-07-08 12:06:57 +01:00
"errors"
2023-07-11 13:38:29 +01:00
"regexp"
"strings"
2025-03-14 11:37:48 +00:00
"sync"
2024-09-29 20:01:14 +01:00
"unicode"
2023-07-11 13:38:29 +01:00
"unicode/utf8"
"golang.org/x/text/cases"
"golang.org/x/text/collate"
"golang.org/x/text/language"
2024-09-29 20:01:14 +01:00
"golang.org/x/text/runes"
"golang.org/x/text/transform"
"golang.org/x/text/unicode/norm"
2024-10-18 12:20:32 +01:00
"github.com/ncruces/go-sqlite3"
"github.com/ncruces/go-sqlite3/internal/util"
2023-07-11 13:38:29 +01:00
)
2025-08-13 03:31:12 +01:00
// RegisterLike controls whether [Register] installs a Unicode aware LIKE operator;
// set it to false to skip registering it.
2024-09-30 13:30:50 +01:00
// Overriding the built-in LIKE operator disables the [LIKE optimization].
// [Register] reads this variable each time it is called, so change it
// before registering the extension on a connection.
//
// [LIKE optimization]: https://sqlite.org/optoverview.html#the_like_optimization
var RegisterLike = true
2023-07-11 13:38:29 +01:00
// Register registers Unicode aware functions for a database connection.
2024-07-08 12:06:57 +01:00
func Register(db *sqlite3.Conn) error {
2024-09-30 13:30:50 +01:00
const flags = sqlite3.DETERMINISTIC | sqlite3.INNOCUOUS
2025-01-16 15:46:49 +00:00
var lkfn sqlite3.ScalarFunction
2024-09-30 13:30:50 +01:00
if RegisterLike {
2025-01-16 15:46:49 +00:00
lkfn = like
2024-09-30 13:30:50 +01:00
}
2025-01-16 15:46:49 +00:00
return errors.Join(
db.CreateFunction("like", 2, flags, lkfn),
db.CreateFunction("like", 3, flags, lkfn),
2024-07-08 12:06:57 +01:00
db.CreateFunction("upper", 1, flags, upper),
db.CreateFunction("upper", 2, flags, upper),
db.CreateFunction("lower", 1, flags, lower),
db.CreateFunction("lower", 2, flags, lower),
db.CreateFunction("regexp", 2, flags, regex),
2024-09-29 20:01:14 +01:00
db.CreateFunction("initcap", 1, flags, initcap),
db.CreateFunction("initcap", 2, flags, initcap),
2025-01-16 15:46:49 +00:00
db.CreateFunction("casefold", 1, flags, casefold),
2024-09-29 20:01:14 +01:00
db.CreateFunction("unaccent", 1, flags, unaccent),
2025-01-16 15:46:49 +00:00
db.CreateFunction("normalize", 1, flags, normalize),
db.CreateFunction("normalize", 2, flags, normalize),
2024-07-08 12:06:57 +01:00
db.CreateFunction("icu_load_collation", 2, sqlite3.DIRECTONLY,
func(ctx sqlite3.Context, arg ...sqlite3.Value) {
name := arg[1].Text()
if name == "" {
return
}
2023-07-11 13:38:29 +01:00
2024-09-30 13:30:50 +01:00
err := RegisterCollation(ctx.Conn(), arg[0].Text(), name)
2024-07-08 12:06:57 +01:00
if err != nil {
ctx.ResultError(err)
2024-07-26 13:29:24 +01:00
return // notest
2024-07-08 12:06:57 +01:00
}
}))
2023-07-11 13:38:29 +01:00
}
2023-08-31 16:30:52 +01:00
// RegisterCollation registers a Unicode collation sequence for a database connection.
func RegisterCollation(db *sqlite3.Conn, locale, name string) error {
tag, err := language.Parse(locale)
2023-08-10 13:39:52 +01:00
if err != nil {
return err
}
return db.CreateCollation(name, collate.New(tag).Compare)
}
2024-09-27 15:45:51 +01:00
// RegisterCollationsNeeded registers Unicode collation sequences on demand for a database connection.
//
// The requested collation name is parsed as a language tag with [language.Parse];
// names that do not parse are ignored.
func RegisterCollationsNeeded(db *sqlite3.Conn) error {
	onNeeded := func(conn *sqlite3.Conn, name string) {
		tag, err := language.Parse(name)
		if err != nil {
			return
		}
		// The callback has no way to report a failure, so the error is dropped.
		_ = conn.CreateCollation(name, collate.New(tag).Compare)
	}
	return db.CollationNeeded(onNeeded)
}
2023-07-11 13:38:29 +01:00
func upper(ctx sqlite3.Context, arg ...sqlite3.Value) {
if len(arg) == 1 {
2023-11-10 13:42:11 +00:00
ctx.ResultRawText(bytes.ToUpper(arg[0].RawText()))
2023-07-11 13:38:29 +01:00
return
}
cs, ok := ctx.GetAuxData(1).(cases.Caser)
if !ok {
2023-07-12 13:29:14 +01:00
t, err := language.Parse(arg[1].Text())
2023-07-11 13:38:29 +01:00
if err != nil {
ctx.ResultError(err)
2024-07-26 13:29:24 +01:00
return // notest
2023-07-11 13:38:29 +01:00
}
2025-03-10 14:54:34 +00:00
cs = cases.Upper(t)
ctx.SetAuxData(1, cs)
2023-07-11 13:38:29 +01:00
}
2023-11-10 13:42:11 +00:00
ctx.ResultRawText(cs.Bytes(arg[0].RawText()))
2023-07-11 13:38:29 +01:00
}
func lower(ctx sqlite3.Context, arg ...sqlite3.Value) {
if len(arg) == 1 {
2023-11-10 13:42:11 +00:00
ctx.ResultRawText(bytes.ToLower(arg[0].RawText()))
2023-07-11 13:38:29 +01:00
return
}
cs, ok := ctx.GetAuxData(1).(cases.Caser)
if !ok {
2023-07-12 13:29:14 +01:00
t, err := language.Parse(arg[1].Text())
2023-07-11 13:38:29 +01:00
if err != nil {
ctx.ResultError(err)
2024-07-26 13:29:24 +01:00
return // notest
2023-07-11 13:38:29 +01:00
}
2025-03-10 14:54:34 +00:00
cs = cases.Lower(t)
ctx.SetAuxData(1, cs)
2023-07-11 13:38:29 +01:00
}
2023-11-10 13:42:11 +00:00
ctx.ResultRawText(cs.Bytes(arg[0].RawText()))
2023-07-11 13:38:29 +01:00
}
2024-09-29 20:01:14 +01:00
func initcap(ctx sqlite3.Context, arg ...sqlite3.Value) {
if len(arg) == 1 {
ctx.ResultRawText(bytes.Title(arg[0].RawText()))
return
}
cs, ok := ctx.GetAuxData(1).(cases.Caser)
if !ok {
t, err := language.Parse(arg[1].Text())
if err != nil {
ctx.ResultError(err)
return // notest
}
2025-03-10 14:54:34 +00:00
cs = cases.Title(t)
ctx.SetAuxData(1, cs)
2024-09-29 20:01:14 +01:00
}
ctx.ResultRawText(cs.Bytes(arg[0].RawText()))
}
2025-01-16 15:46:49 +00:00
// casefold implements the SQL casefold() function:
// it returns its argument transformed by [cases.Fold].
func casefold(ctx sqlite3.Context, arg ...sqlite3.Value) {
	folder := cases.Fold()
	ctx.ResultRawText(folder.Bytes(arg[0].RawText()))
}
2025-03-14 11:37:48 +00:00
var unaccentPool = sync.Pool{
New: func() any {
return transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)
},
}
2024-09-29 20:01:14 +01:00
func unaccent(ctx sqlite3.Context, arg ...sqlite3.Value) {
2025-03-14 11:37:48 +00:00
unaccent := unaccentPool.Get().(transform.Transformer)
defer unaccentPool.Put(unaccent)
2024-09-29 20:01:14 +01:00
res, _, err := transform.Bytes(unaccent, arg[0].RawText())
if err != nil {
2024-09-30 13:30:50 +01:00
ctx.ResultError(err) // notest
2024-09-29 20:01:14 +01:00
} else {
ctx.ResultRawText(res)
}
}
2025-01-16 15:46:49 +00:00
// normalize implements the SQL normalize() function.
//
// The optional second argument names the normalization form, case
// insensitively: NFC (the default), NFD, NFKC or NFKD.
// Any other name results in the error "unicode: invalid form".
func normalize(ctx sqlite3.Context, arg ...sqlite3.Value) {
	form := norm.NFC
	if len(arg) > 1 {
		switch name := strings.ToUpper(arg[1].Text()); name {
		case "NFD":
			form = norm.NFD
		case "NFKC":
			form = norm.NFKC
		case "NFKD":
			form = norm.NFKD
		case "NFC":
			// Already the default.
		default:
			ctx.ResultError(util.ErrorString("unicode: invalid form"))
			return
		}
	}

	out, _, err := transform.Bytes(form, arg[0].RawText())
	if err != nil {
		ctx.ResultError(err) // notest
		return
	}
	ctx.ResultRawText(out)
}
2023-07-11 13:38:29 +01:00
func regex(ctx sqlite3.Context, arg ...sqlite3.Value) {
re, ok := ctx.GetAuxData(0).(*regexp.Regexp)
if !ok {
2025-03-10 14:54:34 +00:00
re, ok = arg[0].Pointer().(*regexp.Regexp)
if !ok {
r, err := regexp.Compile(arg[0].Text())
if err != nil {
ctx.ResultError(err)
return // notest
}
re = r
2023-07-11 13:38:29 +01:00
}
2025-03-10 14:54:34 +00:00
ctx.SetAuxData(0, re)
2023-07-11 13:38:29 +01:00
}
2023-11-27 14:57:04 +00:00
ctx.ResultBool(re.Match(arg[1].RawText()))
2023-07-11 13:38:29 +01:00
}
func like(ctx sqlite3.Context, arg ...sqlite3.Value) {
escape := rune(-1)
if len(arg) == 3 {
var size int
2023-11-27 14:57:04 +00:00
b := arg[2].RawText()
2023-07-11 13:38:29 +01:00
escape, size = utf8.DecodeRune(b)
if size != len(b) {
ctx.ResultError(util.ErrorString("ESCAPE expression must be a single character"))
return
}
}
2024-12-19 14:00:46 +00:00
_ = arg[1] // bounds check
2023-07-11 13:38:29 +01:00
type likeData struct {
*regexp.Regexp
escape rune
}
re, ok := ctx.GetAuxData(0).(likeData)
if !ok || re.escape != escape {
2023-07-26 02:02:39 +01:00
re = likeData{
regexp.MustCompile(like2regex(arg[0].Text(), escape)),
escape,
2023-07-11 13:38:29 +01:00
}
ctx.SetAuxData(0, re)
}
2023-11-27 14:57:04 +00:00
ctx.ResultBool(re.Match(arg[1].RawText()))
2023-07-11 13:38:29 +01:00
}
// like2regex translates a SQL LIKE pattern into the source of an equivalent
// Go regular expression.
//
// '_' becomes `.`, '%' becomes `.*`, and every other run of characters is
// quoted with [regexp.QuoteMeta]. The result is anchored with \A and \z and
// carries the (?is) flags: case insensitive, with . matching any character.
//
// escape is the ESCAPE character, or a negative rune when there is none.
// The character that follows an escape is taken literally. The escape
// character takes precedence over the wildcards: with '_' or '%' as the
// escape, that character no longer acts as a wildcard.
func like2regex(pattern string, escape rune) string {
	var re strings.Builder
	re.Grow(len(pattern) + 10)
	re.WriteString(`(?is)\A`) // case insensitive, . matches any character

	start := 0       // start of the pending literal run, or -1 when there is none
	literal := false // the previous rune was an unescaped escape character
	for i, r := range pattern {
		if start < 0 {
			start = i
		}
		if literal {
			// Escaped rune: keep it in the literal run that starts here.
			literal = false
			continue
		}

		var symbol string
		switch r {
		case escape:
			// Checked first, so an escape of '_' or '%' wins over the wildcard.
			literal = true
		case '_':
			symbol = `.`
		case '%':
			symbol = `.*`
		default:
			continue
		}

		// Flush the literal run that precedes this special rune.
		re.WriteString(regexp.QuoteMeta(pattern[start:i]))
		re.WriteString(symbol)
		start = -1
	}
	if start >= 0 {
		re.WriteString(regexp.QuoteMeta(pattern[start:]))
	}
	re.WriteString(`\z`)
	return re.String()
}