Files
sqlite3/ext/stats/stats.go

237 lines
6.6 KiB
Go
Raw Normal View History

// Package stats provides aggregate functions for statistics.
//
// Provided functions:
//   - stddev_pop: population standard deviation
//   - stddev_samp: sample standard deviation
//   - var_pop: population variance
//   - var_samp: sample variance
//   - covar_pop: population covariance
//   - covar_samp: sample covariance
//   - corr: correlation coefficient
//   - regr_r2: correlation coefficient squared
//   - regr_avgx: average of the independent variable
//   - regr_avgy: average of the dependent variable
//   - regr_sxx: sum of the squares of the independent variable
//   - regr_syy: sum of the squares of the dependent variable
//   - regr_sxy: sum of the products of each pair of variables
//   - regr_count: count non-null pairs of variables
//   - regr_slope: slope of the least-squares-fit linear equation
//   - regr_intercept: y-intercept of the least-squares-fit linear equation
//   - regr_json: all regr stats as a JSON object
//   - percentile_disc: discrete quantile
//   - percentile_cont: continuous quantile
//   - percentile: continuous percentile
//   - median: middle value
//   - mode: most frequent value
//   - every: boolean and
//   - some: boolean or
//
// These join the [Built-in Aggregate Functions]:
//   - count: count rows/values
//   - sum: sum values
//   - avg: average value
//   - min: minimum value
//   - max: maximum value
//
// And the [Built-in Window Functions]:
//   - rank: rank of the current row with gaps
//   - dense_rank: rank of the current row without gaps
//   - percent_rank: relative rank of the row
//   - cume_dist: cumulative distribution
//
// See: [ANSI SQL Aggregate Functions]
//
// [Built-in Aggregate Functions]: https://sqlite.org/lang_aggfunc.html
// [Built-in Window Functions]: https://sqlite.org/windowfunctions.html#builtins
// [ANSI SQL Aggregate Functions]: https://www.oreilly.com/library/view/sql-in-a/9780596155322/ch04s02.html
package stats
2024-07-08 12:06:57 +01:00
import (
"errors"
"github.com/ncruces/go-sqlite3"
)
2023-08-31 16:30:52 +01:00
// Register registers statistics functions.
2024-07-08 12:06:57 +01:00
func Register(db *sqlite3.Conn) error {
2024-09-30 13:30:50 +01:00
const flags = sqlite3.DETERMINISTIC | sqlite3.INNOCUOUS
2024-10-16 14:00:22 +01:00
const order = sqlite3.SELFORDER1 | flags
2024-07-08 12:06:57 +01:00
return errors.Join(
db.CreateWindowFunction("var_pop", 1, flags, newVariance(var_pop)),
db.CreateWindowFunction("var_samp", 1, flags, newVariance(var_samp)),
db.CreateWindowFunction("stddev_pop", 1, flags, newVariance(stddev_pop)),
db.CreateWindowFunction("stddev_samp", 1, flags, newVariance(stddev_samp)),
db.CreateWindowFunction("covar_pop", 2, flags, newCovariance(var_pop)),
db.CreateWindowFunction("covar_samp", 2, flags, newCovariance(var_samp)),
db.CreateWindowFunction("corr", 2, flags, newCovariance(corr)),
db.CreateWindowFunction("regr_r2", 2, flags, newCovariance(regr_r2)),
db.CreateWindowFunction("regr_sxx", 2, flags, newCovariance(regr_sxx)),
db.CreateWindowFunction("regr_syy", 2, flags, newCovariance(regr_syy)),
db.CreateWindowFunction("regr_sxy", 2, flags, newCovariance(regr_sxy)),
db.CreateWindowFunction("regr_avgx", 2, flags, newCovariance(regr_avgx)),
db.CreateWindowFunction("regr_avgy", 2, flags, newCovariance(regr_avgy)),
db.CreateWindowFunction("regr_slope", 2, flags, newCovariance(regr_slope)),
db.CreateWindowFunction("regr_intercept", 2, flags, newCovariance(regr_intercept)),
db.CreateWindowFunction("regr_count", 2, flags, newCovariance(regr_count)),
db.CreateWindowFunction("regr_json", 2, flags, newCovariance(regr_json)),
2024-10-16 14:00:22 +01:00
db.CreateWindowFunction("median", 1, order, newPercentile(median)),
db.CreateWindowFunction("percentile", 2, order, newPercentile(percentile_100)),
db.CreateWindowFunction("percentile_cont", 2, order, newPercentile(percentile_cont)),
db.CreateWindowFunction("percentile_disc", 2, order, newPercentile(percentile_disc)),
2024-07-08 12:06:57 +01:00
db.CreateWindowFunction("every", 1, flags, newBoolean(every)),
2025-01-19 01:28:53 +00:00
db.CreateWindowFunction("some", 1, flags, newBoolean(some)),
db.CreateWindowFunction("mode", 1, order, newMode))
2023-08-31 16:30:52 +01:00
}
// Kinds of statistic. A kind is stored in variance.kind or covariance.kind
// and selects what the aggregate's Value method reports.
const (
	// var_pop and var_samp are shared: Register uses them for var_pop and
	// var_samp (variance) and for covar_pop and covar_samp (covariance).
	var_pop = iota
	var_samp

	// Standard deviations; registered for variance only.
	stddev_pop
	stddev_samp

	// Correlation and regression statistics; registered for covariance only.
	corr
	regr_r2
	regr_sxx
	regr_syy
	regr_sxy
	regr_avgx
	regr_avgy
	regr_slope
	regr_intercept
	regr_count
	regr_json
)
2023-09-01 02:26:30 +01:00
func newVariance(kind int) func() sqlite3.AggregateFunction {
return func() sqlite3.AggregateFunction { return &variance{kind: kind} }
2023-08-31 16:30:52 +01:00
}
2023-09-01 02:26:30 +01:00
type variance struct {
2023-08-31 16:30:52 +01:00
kind int
welford
}
2023-09-01 02:26:30 +01:00
func (fn *variance) Value(ctx sqlite3.Context) {
2025-01-20 14:39:36 +00:00
switch fn.n {
case 1:
switch fn.kind {
case var_pop, stddev_pop:
ctx.ResultFloat(0)
}
return
case 0:
return
}
2023-08-31 16:30:52 +01:00
var r float64
2023-09-01 02:26:30 +01:00
switch fn.kind {
2023-08-31 16:30:52 +01:00
case var_pop:
2023-09-01 02:26:30 +01:00
r = fn.var_pop()
2023-08-31 16:30:52 +01:00
case var_samp:
2023-09-01 02:26:30 +01:00
r = fn.var_samp()
2023-08-31 16:30:52 +01:00
case stddev_pop:
2023-09-01 02:26:30 +01:00
r = fn.stddev_pop()
2023-08-31 16:30:52 +01:00
case stddev_samp:
2023-09-01 02:26:30 +01:00
r = fn.stddev_samp()
2023-08-31 16:30:52 +01:00
}
ctx.ResultFloat(r)
}
2023-09-01 02:26:30 +01:00
func (fn *variance) Step(ctx sqlite3.Context, arg ...sqlite3.Value) {
2024-10-01 15:16:06 +01:00
a := arg[0]
f := a.Float()
if f != 0.0 || a.NumericType() != sqlite3.NULL {
fn.enqueue(f)
2023-08-31 16:30:52 +01:00
}
}
2023-09-01 02:26:30 +01:00
func (fn *variance) Inverse(ctx sqlite3.Context, arg ...sqlite3.Value) {
2024-10-01 15:16:06 +01:00
a := arg[0]
f := a.Float()
if f != 0.0 || a.NumericType() != sqlite3.NULL {
fn.dequeue(f)
2023-09-01 02:26:30 +01:00
}
}
// newCovariance returns a constructor for covariance aggregates of the given
// kind. Each call of the returned function yields a new, zeroed *covariance.
func newCovariance(kind int) func() sqlite3.AggregateFunction {
	return func() sqlite3.AggregateFunction {
		agg := new(covariance)
		agg.kind = kind
		return agg
	}
}
// covariance is the aggregate state for the two-argument functions that
// Register creates with newCovariance: covar_pop, covar_samp, corr and the
// regr_* family.
type covariance struct {
	// kind selects which statistic Value reports.
	kind int
	// welford2 is declared elsewhere in the package.
	// NOTE(review): presumably the two-variable counterpart of welford,
	// holding the pair count n and the running sums; confirm there.
	welford2
}
func (fn *covariance) Value(ctx sqlite3.Context) {
2025-01-20 14:39:36 +00:00
if fn.kind == regr_count {
ctx.ResultInt64(fn.regr_count())
return
}
switch fn.n {
case 1:
switch fn.kind {
case var_pop, stddev_pop, regr_sxx, regr_syy, regr_sxy:
ctx.ResultFloat(0)
return
case regr_avgx, regr_avgy:
break
default:
return
}
case 0:
return
}
2023-09-01 02:26:30 +01:00
var r float64
switch fn.kind {
case var_pop:
r = fn.covar_pop()
case var_samp:
r = fn.covar_samp()
2023-09-02 00:48:55 +01:00
case corr:
r = fn.correlation()
2024-01-10 11:07:16 +00:00
case regr_r2:
r = fn.regr_r2()
case regr_sxx:
r = fn.regr_sxx()
case regr_syy:
r = fn.regr_syy()
case regr_sxy:
r = fn.regr_sxy()
2024-01-09 03:20:59 +00:00
case regr_avgx:
r = fn.regr_avgx()
case regr_avgy:
r = fn.regr_avgy()
case regr_slope:
r = fn.regr_slope()
case regr_intercept:
r = fn.regr_intercept()
2024-01-25 11:03:33 +00:00
case regr_json:
2025-01-20 14:39:36 +00:00
var buf [128]byte
ctx.ResultRawText(fn.regr_json(buf[:0]))
2024-01-25 11:03:33 +00:00
return
2023-09-01 02:26:30 +01:00
}
ctx.ResultFloat(r)
}
func (fn *covariance) Step(ctx sqlite3.Context, arg ...sqlite3.Value) {
2024-09-28 10:37:47 +01:00
b, a := arg[1], arg[0] // avoid a bounds check
2024-10-01 15:16:06 +01:00
fa := a.Float()
fb := b.Float()
if true &&
(fa != 0.0 || a.NumericType() != sqlite3.NULL) &&
(fb != 0.0 || b.NumericType() != sqlite3.NULL) {
fn.enqueue(fa, fb)
2023-09-01 02:26:30 +01:00
}
}
func (fn *covariance) Inverse(ctx sqlite3.Context, arg ...sqlite3.Value) {
2024-09-28 10:37:47 +01:00
b, a := arg[1], arg[0] // avoid a bounds check
2024-10-01 15:16:06 +01:00
fa := a.Float()
fb := b.Float()
if true &&
(fa != 0.0 || a.NumericType() != sqlite3.NULL) &&
(fb != 0.0 || b.NumericType() != sqlite3.NULL) {
fn.dequeue(fa, fb)
2023-08-31 16:30:52 +01:00
}
}