feat: add UTF-8 support and base256emoji

This includes fixes for UTF-8 handling as well as the new base256emoji encoding (an encoding which actually uses UTF-8).
This commit is contained in:
Jorropo
2022-06-08 07:31:52 +02:00
committed by Rod Vagg
parent 0bd72a8c32
commit df5b7bc6ee
7 changed files with 160 additions and 26 deletions

94
base256emoji.go Normal file
View File

@@ -0,0 +1,94 @@
package multibase
import (
"strconv"
"strings"
"unicode/utf8"
)
// base256emojiTable is the base256emoji alphabet: the rune stored at index i
// is the emoji that represents the byte value i. Because the position of each
// rune is the byte value it stands for, reordering or replacing entries changes
// the encoding.
var base256emojiTable = [256]rune{
// Curated list: these are things that are *somewhat* related to our community.
'🚀', '🪐', '☄', '🛰', '🌌', // Space
'🌑', '🌒', '🌓', '🌔', '🌕', '🌖', '🌗', '🌘', // Moon
'🌍', '🌏', '🌎', // Our Home, for now (earth)
'☉', '☀', // Our Garden, for now (sol)
'💻', '🖥', '💾', '💿', // Computer
// The rest is filled in from https://home.unicode.org/emoji/emoji-frequency/ as it was at the time of creation (December 2021; the data is from 2019), most used first, until we reach 256 entries.
// Modifier-based emojis (such as flags) are excluded because they are longer than a single code point.
// Some other emojis were removed ad hoc for various reasons.
'😂', '❤', '😍', '🤣', '😊', '🙏', '💕', '😭', '😘', '👍',
'😅', '👏', '😁', '🔥', '🥰', '💔', '💖', '💙', '😢', '🤔',
'😆', '🙄', '💪', '😉', '☺', '👌', '🤗', '💜', '😔', '😎',
'😇', '🌹', '🤦', '🎉', '💞', '✌', '✨', '🤷', '😱', '😌',
'🌸', '🙌', '😋', '💗', '💚', '😏', '💛', '🙂', '💓', '🤩',
'😄', '😀', '🖤', '😃', '💯', '🙈', '👇', '🎶', '😒', '🤭',
'❣', '😜', '💋', '👀', '😪', '😑', '💥', '🙋', '😞', '😩',
'😡', '🤪', '👊', '🥳', '😥', '🤤', '👉', '💃', '😳', '✋',
'😚', '😝', '😴', '🌟', '😬', '🙃', '🍀', '🌷', '😻', '😓',
'⭐', '✅', '🥺', '🌈', '😈', '🤘', '💦', '✔', '😣', '🏃',
'💐', '☹', '🎊', '💘', '😠', '☝', '😕', '🌺', '🎂', '🌻',
'😐', '🖕', '💝', '🙊', '😹', '🗣', '💫', '💀', '👑', '🎵',
'🤞', '😛', '🔴', '😤', '🌼', '😫', '⚽', '🤙', '☕', '🏆',
'🤫', '👈', '😮', '🙆', '🍻', '🍃', '🐶', '💁', '😲', '🌿',
'🧡', '🎁', '⚡', '🌞', '🎈', '❌', '✊', '👋', '😰', '🤨',
'😶', '🤝', '🚶', '💰', '🍓', '💢', '🤟', '🙁', '🚨', '💨',
'🤬', '✈', '🎀', '🍺', '🤓', '😙', '💟', '🌱', '😖', '👶',
'🥴', '▶', '➡', '❓', '💎', '💸', '⬇', '😨', '🌚', '🦋',
'😷', '🕺', '⚠', '🙅', '😟', '😵', '👎', '🤲', '🤠', '🤧',
'📌', '🔵', '💅', '🧐', '🐾', '🍒', '😗', '🤑', '🌊', '🤯',
'🐷', '☎', '💧', '😯', '💆', '👆', '🎤', '🙇', '🍑', '❄',
'🌴', '💣', '🐸', '💌', '📍', '🥀', '🤢', '👅', '💡', '💩',
'👐', '📸', '👻', '🤐', '🤮', '🎼', '🥵', '🚩', '🍎', '🍊',
'👼', '💍', '📣', '🥂',
}
// base256emojiReverseTable is the inverse of base256emojiTable: it maps an
// emoji rune back to the byte value it encodes. It is populated by init.
var base256emojiReverseTable map[rune]byte

// init fills base256emojiReverseTable from base256emojiTable, using each
// entry's position in the table as the byte value for its rune.
func init() {
	base256emojiReverseTable = make(map[rune]byte, len(base256emojiTable))
	for idx := range base256emojiTable {
		emoji := base256emojiTable[idx]
		base256emojiReverseTable[emoji] = byte(idx)
	}
}
// base256emojiEncode returns the base256emoji text for in: every input byte is
// replaced by the rune found at that index in base256emojiTable. The result
// does not include any multibase prefix.
func base256emojiEncode(in []byte) string {
	// First pass: compute the exact UTF-8 size so the builder allocates once.
	size := 0
	for i := range in {
		size += utf8.RuneLen(base256emojiTable[in[i]])
	}

	var sb strings.Builder
	sb.Grow(size)
	// Second pass: emit one rune per input byte.
	for i := range in {
		sb.WriteRune(base256emojiTable[in[i]])
	}
	return sb.String()
}
// base256emojiCorruptInputError describes a rune in base256emoji input that
// could not be decoded.
type base256emojiCorruptInputError struct {
	index int  // byte offset in the input where the offending rune starts
	char  rune // the offending rune
}

// Error implements the error interface, reporting the byte offset and the
// offending character.
func (e base256emojiCorruptInputError) Error() string {
	offset := strconv.Itoa(e.index)
	offender := string(e.char)
	return "illegal base256emoji data at input byte " + offset + ", char: '" + offender + "'"
}

// String returns the same text as Error.
func (e base256emojiCorruptInputError) String() string {
	return e.Error()
}
// base256emojiDecode converts base256emoji text (without any multibase prefix)
// back into bytes, producing one byte per rune of in.
//
// If a rune is not present in base256emojiReverseTable, decoding stops and a
// base256emojiCorruptInputError carrying the byte offset of that rune and the
// rune itself is returned together with a nil slice.
func base256emojiDecode(in string) ([]byte, error) {
	decoded := make([]byte, 0, utf8.RuneCountInString(in))
	// Ranging over a string yields the starting byte offset of each rune.
	// Invalid UTF-8 bytes surface as U+FFFD one byte at a time, matching
	// utf8.DecodeRuneInString.
	for offset, r := range in {
		b, known := base256emojiReverseTable[r]
		if !known {
			return nil, base256emojiCorruptInputError{offset, r}
		}
		decoded = append(decoded, b)
	}
	return decoded, nil
}

26
base256emoji_test.go Normal file
View File

@@ -0,0 +1,26 @@
package multibase
import "testing"
// TestBase256EmojiAlphabet checks that base256emojiTable has exactly 256
// non-zero entries, i.e. that no slot of the alphabet was left unset.
func TestBase256EmojiAlphabet(t *testing.T) {
	var populated uint
	for idx := range base256emojiTable {
		if base256emojiTable[idx] == rune(0) {
			continue
		}
		populated++
	}
	if populated == 256 {
		return
	}
	t.Errorf("Base256Emoji count is wrong, expected 256, got %d.", populated)
}
// TestBase256EmojiUniq checks that no rune appears more than once in
// base256emojiTable, reporting every repeated occurrence with its index.
func TestBase256EmojiUniq(t *testing.T) {
	seen := make(map[rune]struct{}, len(base256emojiTable))
	for idx, emoji := range base256emojiTable {
		if _, dup := seen[emoji]; dup {
			t.Errorf("Base256Emoji duplicate %s at index %d.", string(emoji), idx)
		}
		seen[emoji] = struct{}{}
	}
}

View File

@@ -2,6 +2,7 @@ package multibase
import (
"fmt"
"unicode/utf8"
)
// Encoder is a multibase encoding that is verified to be supported and
@@ -36,8 +37,9 @@ func EncoderByName(str string) (Encoder, error) {
var ok bool
if len(str) == 0 {
return Encoder{-1}, fmt.Errorf("empty multibase encoding")
} else if len(str) == 1 {
base = Encoding(str[0])
} else if utf8.RuneCountInString(str) == 1 {
r, _ := utf8.DecodeRuneInString(str)
base = Encoding(r)
_, ok = EncodingToStr[base]
} else {
base, ok = Encodings[str]

View File

@@ -2,6 +2,7 @@ package multibase
import (
"testing"
"unicode/utf8"
)
func TestInvalidCode(t *testing.T) {
@@ -43,9 +44,10 @@ func TestEncoder(t *testing.T) {
}
// Test that an encoder can be created from the single letter
// prefix
_, err = EncoderByName(str[0:1])
r, _ := utf8.DecodeRuneInString(str)
_, err = EncoderByName(string(r))
if err != nil {
t.Fatalf("EncoderByName(%s) failed: %v", str[0:1], err)
t.Fatalf("EncoderByName(%s) failed: %v", string(r), err)
}
}
}

View File

@@ -4,6 +4,7 @@ import (
"encoding/base64"
"encoding/hex"
"fmt"
"unicode/utf8"
b58 "github.com/mr-tron/base58/base58"
b32 "github.com/multiformats/go-base32"
@@ -38,31 +39,33 @@ const (
Base64url = 'u'
Base64pad = 'M'
Base64urlPad = 'U'
Base256Emoji = '🚀'
)
// EncodingToStr is a map of the supported encoding, unsupported encoding
// specified in standard are left out
var EncodingToStr = map[Encoding]string{
0x00: "identity",
'0': "base2",
'f': "base16",
'F': "base16upper",
'b': "base32",
'B': "base32upper",
'c': "base32pad",
'C': "base32padupper",
'v': "base32hex",
'V': "base32hexupper",
't': "base32hexpad",
'T': "base32hexpadupper",
'k': "base36",
'K': "base36upper",
'z': "base58btc",
'Z': "base58flickr",
'm': "base64",
'u': "base64url",
'M': "base64pad",
'U': "base64urlpad",
0x00: "identity",
'0': "base2",
'f': "base16",
'F': "base16upper",
'b': "base32",
'B': "base32upper",
'c': "base32pad",
'C': "base32padupper",
'v': "base32hex",
'V': "base32hexupper",
't': "base32hexpad",
'T': "base32hexpadupper",
'k': "base36",
'K': "base36upper",
'z': "base58btc",
'Z': "base58flickr",
'm': "base64",
'u': "base64url",
'M': "base64pad",
'U': "base64urlpad",
Base256Emoji: "base256emoji",
}
var Encodings = map[string]Encoding{}
@@ -123,6 +126,8 @@ func Encode(base Encoding, data []byte) (string, error) {
return string(Base64url) + base64.RawURLEncoding.EncodeToString(data), nil
case Base64:
return string(Base64) + base64.RawStdEncoding.EncodeToString(data), nil
case Base256Emoji:
return string(Base256Emoji) + base256emojiEncode(data), nil
default:
return "", ErrUnsupportedEncoding
}
@@ -135,7 +140,8 @@ func Decode(data string) (Encoding, []byte, error) {
return 0, nil, fmt.Errorf("cannot decode multibase for zero length string")
}
enc := Encoding(data[0])
r, _ := utf8.DecodeRuneInString(data)
enc := Encoding(r)
switch enc {
case Identity:
@@ -179,6 +185,9 @@ func Decode(data string) (Encoding, []byte, error) {
case Base64url:
bytes, err := base64.RawURLEncoding.DecodeString(data[1:])
return Base64url, bytes, err
case Base256Emoji:
bytes, err := base256emojiDecode(data[4:])
return Base256Emoji, bytes, err
default:
return -1, nil, ErrUnsupportedEncoding
}

View File

@@ -44,6 +44,7 @@ var encodedSamples = map[Encoding]string{
Base64url: "uRGVjZW50cmFsaXplIGV2ZXJ5dGhpbmchISE",
Base64pad: "MRGVjZW50cmFsaXplIGV2ZXJ5dGhpbmchISE=",
Base64urlPad: "URGVjZW50cmFsaXplIGV2ZXJ5dGhpbmchISE=",
Base256Emoji: "🚀💛✋💃✋😻😈🥺🤤🍀🌟💐✋😅✋💦✋🥺🏃😈😴🌟😻😝👏👏👏",
}
func testEncode(t *testing.T, encoding Encoding, bytes []byte, expected string) {

2
spec

Submodule spec updated: a4b4a4e5e4...cffd1aa308