feat: add UTF-8 support and base256emoji

This includes fixes for UTF-8 handling as well as the base256emoji encoding (an encoding which actually uses UTF-8).
This commit is contained in:
Jorropo
2022-06-08 07:31:52 +02:00
committed by Rod Vagg
parent 0bd72a8c32
commit df5b7bc6ee
7 changed files with 160 additions and 26 deletions

94
base256emoji.go Normal file
View File

@@ -0,0 +1,94 @@
package multibase
import (
"strconv"
"strings"
"unicode/utf8"
)
var base256emojiTable = [256]rune{
// Curated list, this is just a list of things that are *somewhat* related to our community
'🚀', '🪐', '☄', '🛰', '🌌', // Space
'🌑', '🌒', '🌓', '🌔', '🌕', '🌖', '🌗', '🌘', // Moon
'🌍', '🌏', '🌎', // Our Home, for now (earth)
'☉', '☀', // Our Garden, for now (sol)
'💻', '🖥', '💾', '💿', // Computer
// The rest is completed from https://home.unicode.org/emoji/emoji-frequency/ at the time of creation (December 2021; the data is from 2019), most used first, until we reach 256.
// We exclude modifier-based emojis (such as flags) as they are bigger than a single codepoint.
// Some other emojis were removed ad hoc for various reasons.
'😂', '❤', '😍', '🤣', '😊', '🙏', '💕', '😭', '😘', '👍',
'😅', '👏', '😁', '🔥', '🥰', '💔', '💖', '💙', '😢', '🤔',
'😆', '🙄', '💪', '😉', '☺', '👌', '🤗', '💜', '😔', '😎',
'😇', '🌹', '🤦', '🎉', '💞', '✌', '✨', '🤷', '😱', '😌',
'🌸', '🙌', '😋', '💗', '💚', '😏', '💛', '🙂', '💓', '🤩',
'😄', '😀', '🖤', '😃', '💯', '🙈', '👇', '🎶', '😒', '🤭',
'❣', '😜', '💋', '👀', '😪', '😑', '💥', '🙋', '😞', '😩',
'😡', '🤪', '👊', '🥳', '😥', '🤤', '👉', '💃', '😳', '✋',
'😚', '😝', '😴', '🌟', '😬', '🙃', '🍀', '🌷', '😻', '😓',
'⭐', '✅', '🥺', '🌈', '😈', '🤘', '💦', '✔', '😣', '🏃',
'💐', '☹', '🎊', '💘', '😠', '☝', '😕', '🌺', '🎂', '🌻',
'😐', '🖕', '💝', '🙊', '😹', '🗣', '💫', '💀', '👑', '🎵',
'🤞', '😛', '🔴', '😤', '🌼', '😫', '⚽', '🤙', '☕', '🏆',
'🤫', '👈', '😮', '🙆', '🍻', '🍃', '🐶', '💁', '😲', '🌿',
'🧡', '🎁', '⚡', '🌞', '🎈', '❌', '✊', '👋', '😰', '🤨',
'😶', '🤝', '🚶', '💰', '🍓', '💢', '🤟', '🙁', '🚨', '💨',
'🤬', '✈', '🎀', '🍺', '🤓', '😙', '💟', '🌱', '😖', '👶',
'🥴', '▶', '➡', '❓', '💎', '💸', '⬇', '😨', '🌚', '🦋',
'😷', '🕺', '⚠', '🙅', '😟', '😵', '👎', '🤲', '🤠', '🤧',
'📌', '🔵', '💅', '🧐', '🐾', '🍒', '😗', '🤑', '🌊', '🤯',
'🐷', '☎', '💧', '😯', '💆', '👆', '🎤', '🙇', '🍑', '❄',
'🌴', '💣', '🐸', '💌', '📍', '🥀', '🤢', '👅', '💡', '💩',
'👐', '📸', '👻', '🤐', '🤮', '🎼', '🥵', '🚩', '🍎', '🍊',
'👼', '💍', '📣', '🥂',
}
// base256emojiReverseTable maps every rune of base256emojiTable back to the
// index (byte value) at which it appears in that table. It is filled once by
// init.
var base256emojiReverseTable map[rune]byte

// init builds base256emojiReverseTable from base256emojiTable.
func init() {
	reverse := make(map[rune]byte, len(base256emojiTable))
	for idx := range base256emojiTable {
		reverse[base256emojiTable[idx]] = byte(idx)
	}
	base256emojiReverseTable = reverse
}
// base256emojiEncode converts each byte of in to the rune found at that index
// in base256emojiTable and returns the concatenation of those runes as a
// string. No multibase prefix is added here.
func base256emojiEncode(in []byte) string {
	// First pass: sum the UTF-8 width of every output rune so the builder
	// can be sized once up front.
	size := 0
	for i := range in {
		size += utf8.RuneLen(base256emojiTable[in[i]])
	}

	var sb strings.Builder
	sb.Grow(size)
	// Second pass: emit the runes.
	for i := range in {
		sb.WriteRune(base256emojiTable[in[i]])
	}
	return sb.String()
}
// base256emojiCorruptInputError is the error value carrying a position and
// the rune that could not be handled as base256emoji data.
type base256emojiCorruptInputError struct {
	index int  // reported as the "input byte" position in the message
	char  rune // reported as the offending character in the message
}

// Error renders the position in decimal followed by the offending rune
// wrapped in single quotes.
func (e base256emojiCorruptInputError) Error() string {
	position := strconv.Itoa(e.index)
	return "illegal base256emoji data at input byte " + position + ", char: '" + string(e.char) + "'"
}

// String returns the same text as Error.
func (e base256emojiCorruptInputError) String() string {
	return e.Error()
}
// base256emojiDecode translates every rune of in to its byte value through
// base256emojiReverseTable. The first rune missing from the table aborts the
// decode with a base256emojiCorruptInputError holding that rune and its byte
// offset within in; no partial output is returned in that case.
func base256emojiDecode(in string) ([]byte, error) {
	decoded := make([]byte, 0, utf8.RuneCountInString(in))
	// Ranging over a string yields the byte offset of each rune and decodes
	// invalid UTF-8 as a one-byte utf8.RuneError, the same way
	// utf8.DecodeRuneInString does.
	for offset, r := range in {
		b, known := base256emojiReverseTable[r]
		if !known {
			return nil, base256emojiCorruptInputError{index: offset, char: r}
		}
		decoded = append(decoded, b)
	}
	return decoded, nil
}

26
base256emoji_test.go Normal file
View File

@@ -0,0 +1,26 @@
package multibase
import "testing"
// TestBase256EmojiAlphabet checks that all 256 slots of base256emojiTable hold
// a non-zero rune, i.e. that no entry was left out of the literal.
func TestBase256EmojiAlphabet(t *testing.T) {
	var populated uint
	for idx := range base256emojiTable {
		if base256emojiTable[idx] == rune(0) {
			continue
		}
		populated++
	}
	if populated != 256 {
		t.Errorf("Base256Emoji count is wrong, expected 256, got %d.", populated)
	}
}
// TestBase256EmojiUniq checks that no rune appears more than once in
// base256emojiTable, reporting each repeated rune with the index where the
// repetition was found.
func TestBase256EmojiUniq(t *testing.T) {
	seen := make(map[rune]struct{}, len(base256emojiTable))
	for idx, emoji := range base256emojiTable {
		if _, dup := seen[emoji]; dup {
			t.Errorf("Base256Emoji duplicate %s at index %d.", string(emoji), idx)
		}
		seen[emoji] = struct{}{}
	}
}

View File

@@ -2,6 +2,7 @@ package multibase
import ( import (
"fmt" "fmt"
"unicode/utf8"
) )
// Encoder is a multibase encoding that is verified to be supported and // Encoder is a multibase encoding that is verified to be supported and
@@ -36,8 +37,9 @@ func EncoderByName(str string) (Encoder, error) {
var ok bool var ok bool
if len(str) == 0 { if len(str) == 0 {
return Encoder{-1}, fmt.Errorf("empty multibase encoding") return Encoder{-1}, fmt.Errorf("empty multibase encoding")
} else if len(str) == 1 { } else if utf8.RuneCountInString(str) == 1 {
base = Encoding(str[0]) r, _ := utf8.DecodeRuneInString(str)
base = Encoding(r)
_, ok = EncodingToStr[base] _, ok = EncodingToStr[base]
} else { } else {
base, ok = Encodings[str] base, ok = Encodings[str]

View File

@@ -2,6 +2,7 @@ package multibase
import ( import (
"testing" "testing"
"unicode/utf8"
) )
func TestInvalidCode(t *testing.T) { func TestInvalidCode(t *testing.T) {
@@ -43,9 +44,10 @@ func TestEncoder(t *testing.T) {
} }
// Test that an encoder can be created from the single letter // Test that an encoder can be created from the single letter
// prefix // prefix
_, err = EncoderByName(str[0:1]) r, _ := utf8.DecodeRuneInString(str)
_, err = EncoderByName(string(r))
if err != nil { if err != nil {
t.Fatalf("EncoderByName(%s) failed: %v", str[0:1], err) t.Fatalf("EncoderByName(%s) failed: %v", string(r), err)
} }
} }
} }

View File

@@ -4,6 +4,7 @@ import (
"encoding/base64" "encoding/base64"
"encoding/hex" "encoding/hex"
"fmt" "fmt"
"unicode/utf8"
b58 "github.com/mr-tron/base58/base58" b58 "github.com/mr-tron/base58/base58"
b32 "github.com/multiformats/go-base32" b32 "github.com/multiformats/go-base32"
@@ -38,31 +39,33 @@ const (
Base64url = 'u' Base64url = 'u'
Base64pad = 'M' Base64pad = 'M'
Base64urlPad = 'U' Base64urlPad = 'U'
Base256Emoji = '🚀'
) )
// EncodingToStr is a map of the supported encoding, unsupported encoding // EncodingToStr is a map of the supported encoding, unsupported encoding
// specified in standard are left out // specified in standard are left out
var EncodingToStr = map[Encoding]string{ var EncodingToStr = map[Encoding]string{
0x00: "identity", 0x00: "identity",
'0': "base2", '0': "base2",
'f': "base16", 'f': "base16",
'F': "base16upper", 'F': "base16upper",
'b': "base32", 'b': "base32",
'B': "base32upper", 'B': "base32upper",
'c': "base32pad", 'c': "base32pad",
'C': "base32padupper", 'C': "base32padupper",
'v': "base32hex", 'v': "base32hex",
'V': "base32hexupper", 'V': "base32hexupper",
't': "base32hexpad", 't': "base32hexpad",
'T': "base32hexpadupper", 'T': "base32hexpadupper",
'k': "base36", 'k': "base36",
'K': "base36upper", 'K': "base36upper",
'z': "base58btc", 'z': "base58btc",
'Z': "base58flickr", 'Z': "base58flickr",
'm': "base64", 'm': "base64",
'u': "base64url", 'u': "base64url",
'M': "base64pad", 'M': "base64pad",
'U': "base64urlpad", 'U': "base64urlpad",
Base256Emoji: "base256emoji",
} }
var Encodings = map[string]Encoding{} var Encodings = map[string]Encoding{}
@@ -123,6 +126,8 @@ func Encode(base Encoding, data []byte) (string, error) {
return string(Base64url) + base64.RawURLEncoding.EncodeToString(data), nil return string(Base64url) + base64.RawURLEncoding.EncodeToString(data), nil
case Base64: case Base64:
return string(Base64) + base64.RawStdEncoding.EncodeToString(data), nil return string(Base64) + base64.RawStdEncoding.EncodeToString(data), nil
case Base256Emoji:
return string(Base256Emoji) + base256emojiEncode(data), nil
default: default:
return "", ErrUnsupportedEncoding return "", ErrUnsupportedEncoding
} }
@@ -135,7 +140,8 @@ func Decode(data string) (Encoding, []byte, error) {
return 0, nil, fmt.Errorf("cannot decode multibase for zero length string") return 0, nil, fmt.Errorf("cannot decode multibase for zero length string")
} }
enc := Encoding(data[0]) r, _ := utf8.DecodeRuneInString(data)
enc := Encoding(r)
switch enc { switch enc {
case Identity: case Identity:
@@ -179,6 +185,9 @@ func Decode(data string) (Encoding, []byte, error) {
case Base64url: case Base64url:
bytes, err := base64.RawURLEncoding.DecodeString(data[1:]) bytes, err := base64.RawURLEncoding.DecodeString(data[1:])
return Base64url, bytes, err return Base64url, bytes, err
case Base256Emoji:
bytes, err := base256emojiDecode(data[4:])
return Base256Emoji, bytes, err
default: default:
return -1, nil, ErrUnsupportedEncoding return -1, nil, ErrUnsupportedEncoding
} }

View File

@@ -44,6 +44,7 @@ var encodedSamples = map[Encoding]string{
Base64url: "uRGVjZW50cmFsaXplIGV2ZXJ5dGhpbmchISE", Base64url: "uRGVjZW50cmFsaXplIGV2ZXJ5dGhpbmchISE",
Base64pad: "MRGVjZW50cmFsaXplIGV2ZXJ5dGhpbmchISE=", Base64pad: "MRGVjZW50cmFsaXplIGV2ZXJ5dGhpbmchISE=",
Base64urlPad: "URGVjZW50cmFsaXplIGV2ZXJ5dGhpbmchISE=", Base64urlPad: "URGVjZW50cmFsaXplIGV2ZXJ5dGhpbmchISE=",
Base256Emoji: "🚀💛✋💃✋😻😈🥺🤤🍀🌟💐✋😅✋💦✋🥺🏃😈😴🌟😻😝👏👏👏",
} }
func testEncode(t *testing.T, encoding Encoding, bytes []byte, expected string) { func testEncode(t *testing.T, encoding Encoding, bytes []byte, expected string) {

2
spec

Submodule spec updated: a4b4a4e5e4...cffd1aa308