cid impl via struct and via string together.

Added back in some of the parser methods.  (These were previously named "Cast"
and I think that's silly and wrong so I fixed it.)

Functions are named overly-literally with their type (e.g. CidStrParse and
CidStructParse rather than ParseCid or even just Parse) because for this
research package I don't want to bother with many sub-packages.  (Maybe I'll
regret this, but at the moment it seems simpler to hold back on sub-packages.)

Functions that produce Cids are literal with their return types, as well.
Part of the purpose of this research package is going to be to concretely
benchmark exactly how much performance overhead there is to using interfaces
(which will likely cause a lot of boxing and unboxing in practice) -- since we
want to explore where this boxing happens and how much it costs, it's important
that none of our basic implementation functions do the boxing!

The entire set of codec enums came along in this commit.  Ah well; they would
have eventually anyway, I guess.  But it's interesting to note the only thing
that dragged them along so far is the reference to 'DagProtobuf' when
constructing v0 CIDs; otherwise, this enum is quite unused here.
This commit is contained in:
Eric Myhre
2018-08-24 11:43:16 +02:00
parent ff25e9673c
commit c724ad0d22
6 changed files with 371 additions and 3 deletions

View File

@@ -22,6 +22,7 @@ type Cid interface {
Multihash() mh.Multihash // Yields the multihash segment.
String() string // Produces the CID formatted as b58 string.
Bytes() []byte // Produces the CID formatted as raw binary.
Prefix() Prefix // Produces a tuple of non-content metadata.
@@ -29,7 +30,8 @@ type Cid interface {
// - `KeyString() CidString` is gone because we're natively a map key now, you're welcome.
// - `StringOfBase(mbase.Encoding) (string, error)` is skipped, maybe it can come back but maybe it should be a formatter's job.
// - `Equals(o Cid) bool` is gone because it's now `==`, you're welcome.
// - `Bytes() []byte` is gone because I can't imagine where that should be used except again where a formatter should be involved.
// TODO: make a multi-return method for {v,mc,mh} decomposition. CidStr will be able to implement this more efficiently than if one makes a series of the individual getter calls.
}
// Prefix represents all the metadata of a Cid,

View File

@@ -2,23 +2,32 @@ package cid
import (
"encoding/binary"
"fmt"
mbase "github.com/multiformats/go-multibase"
mh "github.com/multiformats/go-multihash"
)
//=================
// def & accessors
//=================
// Compile-time check that CidStr satisfies the Cid interface.
var _ Cid = CidStr("")
// Compile-time check that CidStr is a valid map key type.
var _ map[CidStr]struct{} = nil
// CidStr is a representation of a Cid as a string type containing binary.
//
// Using golang's string type is preferable over byte slices even for binary
// data because golang strings are immutable, usable as map keys,
// trivially comparable with built-in equals operators, etc.
//
// Please do not convert strings or bytes into the CidStr type directly;
// use a parse function (such as CidStrParse), which validates the data and
// yields a CidStr.
type CidStr string
// EmptyCid is a constant for a zero/uninitialized/sentinelvalue cid;
// EmptyCidStr is a constant for a zero/uninitialized/sentinelvalue cid;
// it is declared mainly for readability in checks for sentinel values.
const EmptyCid = CidStr("")
const EmptyCidStr = CidStr("")
func (c CidStr) Version() uint64 {
bytes := []byte(c)
@@ -57,6 +66,21 @@ func (c CidStr) String() string {
}
}
// Bytes returns the CID in its raw binary form.
//
// A version 0 CID is emitted as its multihash alone, while a version 1 CID
// is emitted as the bytes of the CidStr itself. That v0 special case is the
// only reason this method differs from a plain conversion to []byte.
//
// Bytes panics if Version reports anything other than 0 or 1.
func (c CidStr) Bytes() []byte {
	version := c.Version()
	if version == 0 {
		return c.Multihash()
	}
	if version == 1 {
		return []byte(c)
	}
	panic("not possible to reach this point")
}
// Prefix builds and returns a Prefix out of a Cid.
func (c CidStr) Prefix() Prefix {
dec, _ := mh.Decode(c.Multihash()) // assuming we got a valid multiaddr, this will not error
@@ -67,3 +91,71 @@ func (c CidStr) Prefix() Prefix {
Codec: c.Multicodec(),
}
}
//==================================
// parsers & validators & factories
//==================================
// newCidStr assembles a CidStr by concatenating the uvarint encoding of
// version, the uvarint encoding of codecType, and the bytes of mhash.
// None of the arguments are validated here.
func newCidStr(version uint64, codecType uint64, mhash mh.Multihash) CidStr {
	// Reserve room for two maximum-width uvarints followed by the hash;
	// the unused tail is sliced off before the conversion below.
	out := make([]byte, 2*binary.MaxVarintLen64+len(mhash))
	off := binary.PutUvarint(out, version)
	off += binary.PutUvarint(out[off:], codecType)
	if copied := copy(out[off:], mhash); copied != len(mhash) {
		panic("copy hash length is inconsistent")
	}
	return CidStr(out[:off+len(mhash)])
}
// CidStrParse reads a CID from its binary form. On success it returns the
// parsed CidStr; on failure it returns EmptyCidStr together with an error.
//
// A CidV1 buffer is laid out as:
//
//	<version><codec-type><multihash>
//
// CidV0 is supported too: a buffer of exactly 34 bytes whose first two bytes
// are 18 and 32 is treated as a bare binary multihash and is turned into a
// version 0 CidStr with the DagProtobuf codec.
//
// The codec is decoded as a uvarint only to locate the start of the
// multihash; its value is not checked any further.
//
// Any multibase envelope must be removed before calling this function;
// for a multibase-enveloped string, use CidStrDecode instead.
//
// CidStrParse is the inverse of Cid.Bytes().
func CidStrParse(data []byte) (CidStr, error) {
	// Bare-multihash form: this is how a v0 CID appears on the wire.
	looksLikeV0 := len(data) == 34 && data[0] == 18 && data[1] == 32
	if looksLikeV0 {
		hash, err := mh.Cast(data)
		if err != nil {
			return EmptyCidStr, err
		}
		return newCidStr(0, DagProtobuf, hash), nil
	}

	version, versionLen := binary.Uvarint(data)
	if err := uvError(versionLen); err != nil {
		return EmptyCidStr, err
	}
	// version is unsigned, so this rejects everything except 0 and 1.
	if version > 1 {
		return EmptyCidStr, fmt.Errorf("invalid cid version number: %d", version)
	}

	// Only the encoded width of the codec matters here; the value is dropped.
	_, codecLen := binary.Uvarint(data[versionLen:])
	if err := uvError(codecLen); err != nil {
		return EmptyCidStr, err
	}

	headerLen := versionLen + codecLen
	hash, err := mh.Cast(data[headerLen:])
	if err != nil {
		return EmptyCidStr, err
	}

	// REVIEW: if the data is longer than the multihash length expects,
	// the excess is silently dropped here; should it be?
	return CidStr(data[:headerLen+len(hash)]), nil
}

View File

@@ -0,0 +1,162 @@
package cid
import (
"encoding/binary"
"fmt"
mbase "github.com/multiformats/go-multibase"
mh "github.com/multiformats/go-multihash"
)
//=================
// def & accessors
//=================
// Compile-time check that CidStruct satisfies the Cid interface.
var _ Cid = CidStruct{}
// The equivalent map-key check is deliberately left commented out:
//var _ map[CidStruct]struct{} = nil // Will not compile! See struct def docs.
// CidStruct represents a CID in a struct format.
//
// This format complies with the exact same Cid interface as the CidStr
// implementation, but completely pre-parses the Cid metadata.
// CidStruct is a tad quicker in case of repeatedly accessed fields,
// but requires more reshuffling to parse and to serialize.
// CidStruct is not usable as a map key, because it contains a Multihash
// reference, which is a slice, and thus not "comparable" as a primitive.
//
// Beware of zero-valued CidStruct: it is difficult to distinguish an
// incorrectly-initialized "invalid" CidStruct from one representing a v0 cid.
type CidStruct struct {
// version is the CID version number; the zero value reads as version 0.
version uint64
// codec is the multicodec content-type code.
codec uint64
// hash is the multihash segment of the CID.
hash mh.Multihash
}
// EmptyCidStruct is the zero CidStruct, serving as a
// zero/uninitialized/sentinel value; it exists mainly so that checks for
// sentinel values read clearly.
//
// Note: this has to be a var rather than a const, because the compiler does
// not allow const structs.
var EmptyCidStruct CidStruct
// Version returns the CID version number stored in the struct.
func (c CidStruct) Version() uint64 {
return c.version
}
// Multicodec returns the multicodec content-type code stored in the struct.
func (c CidStruct) Multicodec() uint64 {
return c.codec
}
// Multihash returns the multihash segment stored in the struct.
// The stored value is returned as-is, without copying.
func (c CidStruct) Multihash() mh.Multihash {
return c.hash
}
// String renders the CID in its default textual form.
//
// A version 0 CID is printed as the base58 string of its multihash. A
// version 1 CID is printed as a multibase string of Bytes(), currently
// using the Base58BTC encoding.
//
// String panics if the version is neither 0 nor 1.
func (c CidStruct) String() string {
	version := c.Version()
	if version == 0 {
		return c.Multihash().B58String()
	}
	if version != 1 {
		panic("not possible to reach this point")
	}

	encoded, err := mbase.Encode(mbase.Base58BTC, c.Bytes())
	if err != nil {
		panic("should not error with hardcoded mbase: " + err.Error())
	}
	return encoded
}
// Bytes returns the CID in its raw binary form.
//
// A version 0 CID is just its multihash (returned without copying). A
// version 1 CID is the uvarint-encoded version, then the uvarint-encoded
// codec, then the multihash, written into a freshly allocated buffer.
//
// Bytes panics if the version is neither 0 nor 1.
func (c CidStruct) Bytes() []byte {
	if c.version == 0 {
		return []byte(c.hash)
	}
	if c.version != 1 {
		panic("not possible to reach this point")
	}

	// Reserve room for two maximum-width uvarints followed by the hash;
	// the unused tail is sliced off on return.
	hashLen := len(c.hash)
	out := make([]byte, 2*binary.MaxVarintLen64+hashLen)
	off := binary.PutUvarint(out, c.version)
	off += binary.PutUvarint(out[off:], c.codec)
	if copy(out[off:], c.hash) != hashLen {
		panic("copy hash length is inconsistent")
	}
	return out[:off+hashLen]
}
// Prefix collects the non-content metadata of the Cid into a Prefix:
// the version, the codec, and the type and length taken from decoding
// the multihash.
func (c CidStruct) Prefix() Prefix {
	// The error is dropped on the assumption that the stored hash is a
	// valid multihash, in which case decoding does not fail.
	// NOTE(review): if decoding ever did fail, the reads of decoded below
	// would presumably hit a nil value and panic; confirm against go-multihash.
	decoded, _ := mh.Decode(c.hash)

	var p Prefix
	p.MhType = decoded.Code
	p.MhLength = decoded.Length
	p.Version = c.version
	p.Codec = c.codec
	return p
}
//==================================
// parsers & validators & factories
//==================================
// CidStructParse reads a CID from its binary form. On success it returns
// the parsed CidStruct; on failure it returns EmptyCidStruct together with
// an error.
//
// A CidV1 buffer is laid out as:
//
//	<version><codec-type><multihash>
//
// CidV0 is supported too: a buffer of exactly 34 bytes whose first two bytes
// are 18 and 32 is treated as a bare binary multihash and yields a version 0
// CidStruct with the DagProtobuf codec.
//
// The codec is decoded as a uvarint and stored as-is; its value is not
// checked any further.
//
// Any multibase envelope must be removed before calling this function;
// for a multibase-enveloped string, use CidStructDecode instead.
//
// CidStructParse is the inverse of Cid.Bytes().
func CidStructParse(data []byte) (CidStruct, error) {
	// Bare-multihash form: this is how a v0 CID appears on the wire.
	looksLikeV0 := len(data) == 34 && data[0] == 18 && data[1] == 32
	if looksLikeV0 {
		hash, err := mh.Cast(data)
		if err != nil {
			return EmptyCidStruct, err
		}
		return CidStruct{version: 0, codec: DagProtobuf, hash: hash}, nil
	}

	version, versionLen := binary.Uvarint(data)
	if err := uvError(versionLen); err != nil {
		return EmptyCidStruct, err
	}
	// version is unsigned, so this rejects everything except 0 and 1.
	if version > 1 {
		return EmptyCidStruct, fmt.Errorf("invalid cid version number: %d", version)
	}

	codec, codecLen := binary.Uvarint(data[versionLen:])
	if err := uvError(codecLen); err != nil {
		return EmptyCidStruct, err
	}

	// Everything after the two uvarints is handed to the multihash parser.
	hash, err := mh.Cast(data[versionLen+codecLen:])
	if err != nil {
		return EmptyCidStruct, err
	}

	parsed := CidStruct{
		hash:    hash,
		codec:   codec,
		version: version,
	}
	return parsed, nil
}

76
_rsrch/cidiface/enums.go Normal file
View File

@@ -0,0 +1,76 @@
package cid
// These are multicodec-packed content types. They should match
// the codes described in the authoritative document:
// https://github.com/multiformats/multicodec/blob/master/table.csv
const (
Raw = 0x55
DagProtobuf = 0x70
DagCBOR = 0x71
GitRaw = 0x78
EthBlock = 0x90
EthBlockList = 0x91
EthTxTrie = 0x92
EthTx = 0x93
EthTxReceiptTrie = 0x94
EthTxReceipt = 0x95
EthStateTrie = 0x96
EthAccountSnapshot = 0x97
EthStorageTrie = 0x98
BitcoinBlock = 0xb0
BitcoinTx = 0xb1
ZcashBlock = 0xc0
ZcashTx = 0xc1
DecredBlock = 0xe0
DecredTx = 0xe1
)
// Codecs maps the name of a codec to its numeric code.
// Both "v0" and "protobuf" map to DagProtobuf.
var Codecs = map[string]uint64{
"v0": DagProtobuf,
"raw": Raw,
"protobuf": DagProtobuf,
"cbor": DagCBOR,
"git-raw": GitRaw,
"eth-block": EthBlock,
"eth-block-list": EthBlockList,
"eth-tx-trie": EthTxTrie,
"eth-tx": EthTx,
"eth-tx-receipt-trie": EthTxReceiptTrie,
"eth-tx-receipt": EthTxReceipt,
"eth-state-trie": EthStateTrie,
"eth-account-snapshot": EthAccountSnapshot,
"eth-storage-trie": EthStorageTrie,
"bitcoin-block": BitcoinBlock,
"bitcoin-tx": BitcoinTx,
"zcash-block": ZcashBlock,
"zcash-tx": ZcashTx,
"decred-block": DecredBlock,
"decred-tx": DecredTx,
}
// CodecToStr maps the numeric code of a codec to its name.
// DagProtobuf maps to "protobuf"; there is no entry that yields "v0".
var CodecToStr = map[uint64]string{
Raw: "raw",
DagProtobuf: "protobuf",
DagCBOR: "cbor",
GitRaw: "git-raw",
EthBlock: "eth-block",
EthBlockList: "eth-block-list",
EthTxTrie: "eth-tx-trie",
EthTx: "eth-tx",
EthTxReceiptTrie: "eth-tx-receipt-trie",
EthTxReceipt: "eth-tx-receipt",
EthStateTrie: "eth-state-trie",
EthAccountSnapshot: "eth-account-snapshot",
EthStorageTrie: "eth-storage-trie",
BitcoinBlock: "bitcoin-block",
BitcoinTx: "bitcoin-tx",
ZcashBlock: "zcash-block",
ZcashTx: "zcash-tx",
DecredBlock: "decred-block",
DecredTx: "decred-tx",
}

24
_rsrch/cidiface/errors.go Normal file
View File

@@ -0,0 +1,24 @@
package cid
import (
"errors"
)
// Sentinel errors reported by the cid parsing and decoding routines.
var (
	// ErrVarintBuffSmall is reported when the buffer handed to the cid
	// parser ends before a complete varint could be read, i.e. it is too
	// short to hold a valid cid.
	ErrVarintBuffSmall = errors.New("reading varint: buffer too small")

	// ErrVarintTooBig is reported when a varint in the given cid does not
	// fit in 64 bits.
	ErrVarintTooBig = errors.New("reading varint: varint bigger than 64bits and not supported")

	// ErrCidTooShort is reported when the cid passed to decode is not long
	// enough to be a valid Cid.
	ErrCidTooShort = errors.New("cid too short")

	// ErrInvalidEncoding is reported when the selected base encoding is not
	// supported by this Cid version.
	ErrInvalidEncoding = errors.New("invalid base encoding")
)

12
_rsrch/cidiface/misc.go Normal file
View File

@@ -0,0 +1,12 @@
package cid
// uvError translates the byte count returned by binary.Uvarint into an
// error: zero means the buffer was too small (ErrVarintBuffSmall), a
// negative count means the value overflowed 64 bits (ErrVarintTooBig),
// and any positive count is a successful read (nil).
func uvError(read int) error {
	if read == 0 {
		return ErrVarintBuffSmall
	}
	if read < 0 {
		return ErrVarintTooBig
	}
	return nil
}