cid impl via struct and via string together.

Added back in some of the parser methods. (These were previously named "Cast" and I think that's silly and wrong so I fixed it.) Functions are named overly-literally with their type (e.g. CidStrParse and CidStructParse rather than ParseCid or even just Parse) because for this research package I don't want to bother with many sub-packages. (Maybe I'll regret this, but at the moment it seems simpler to hold back on sub-packages.) Functions that produce Cids are literal with their return types, as well. Part of the purpose of this research package is going to be to concretely benchmark exactly how much performance overhead there is to using interfaces (which will likely cause a lot of boxing and unboxing in practice) -- since we want to explore where this boxing happens and how much it costs, it's important that none of our basic implementation functions do the boxing! The entire set of codec enums came along in this commit. Ah well; they would have eventually anyway, I guess. But it's interesting to note the only thing that dragged them along so far is the reference to 'DagProtobuf' when constructing v0 CIDs; otherwise, this enum is quite unused here.
2018-08-24 11:43:16 +02:00
parent ff25e9673c
commit c724ad0d22
6 changed files with 371 additions and 3 deletions
--- a/_rsrch/cidiface/cid.go
+++ b/_rsrch/cidiface/cid.go
@@ -22,6 +22,7 @@ type Cid interface {
 	Multihash() mh.Multihash // Yields the multihash segment.

 	String() string // Produces the CID formatted as b58 string.
+	Bytes() []byte  // Produces the CID formatted as raw binary.

 	Prefix() Prefix // Produces a tuple of non-content metadata.

@@ -29,7 +30,8 @@ type Cid interface {
 	// - `KeyString() CidString` is gone because we're natively a map key now, you're welcome.
 	// - `StringOfBase(mbase.Encoding) (string, error)` is skipped, maybe it can come back but maybe it should be a formatter's job.
 	// - `Equals(o Cid) bool` is gone because it's now `==`, you're welcome.
-	// - `Bytes() []byte` is gone because I can't imagine where that should be used except again where a formatter should be involved.
+
+	// TODO: make a multi-return method for {v,mc,mh} decomposition.  CidStr will be able to implement this more efficiently than if one makes a series of the individual getter calls.
 }

 // Prefix represents all the metadata of a Cid,
--- a/_rsrch/cidiface/cidString.go
+++ b/_rsrch/cidiface/cidString.go
@@ -2,23 +2,32 @@ package cid

 import (
 	"encoding/binary"
+	"fmt"

 	mbase "github.com/multiformats/go-multibase"
 	mh "github.com/multiformats/go-multihash"
 )

+//=================
+// def & accessors
+//=================
+
 var _ Cid = CidStr("")
+var _ map[CidStr]struct{} = nil

 // CidStr is a representation of a Cid as a string type containing binary.
 //
 // Using golang's string type is preferable over byte slices even for binary
 // data because golang strings are immutable, usable as map keys,
 // trivially comparable with built-in equals operators, etc.
+//
+// Please do not cast strings or bytes into the CidStr type directly;
+// use a parse method which validates the data and yields a CidStr.
 type CidStr string

-// EmptyCid is a constant for a zero/uninitialized/sentinelvalue cid;
+// EmptyCidStr is a constant for a zero/uninitialized/sentinel-value cid;
 // it is declared mainly for readability in checks for sentinel values.
-const EmptyCid = CidStr("")
+const EmptyCidStr = CidStr("")

 func (c CidStr) Version() uint64 {
 	bytes := []byte(c)
@@ -57,6 +66,21 @@ func (c CidStr) String() string {
 	}
 }

+// Bytes produces a raw binary format of the CID.
+//
+// (For CidStr, this method is only distinct from casting because of
+// compatibility with v0 CIDs.)
+func (c CidStr) Bytes() []byte {
+	switch c.Version() {
+	case 0:
+		return c.Multihash()
+	case 1:
+		return []byte(c)
+	default:
+		panic("not possible to reach this point")
+	}
+}
+
 // Prefix builds and returns a Prefix out of a Cid.
 func (c CidStr) Prefix() Prefix {
 	dec, _ := mh.Decode(c.Multihash()) // assuming we got a valid multiaddr, this will not error
@@ -67,3 +91,71 @@ func (c CidStr) Prefix() Prefix {
 		Codec:    c.Multicodec(),
 	}
 }
+
+//==================================
+// parsers & validators & factories
+//==================================
+
+func newCidStr(version uint64, codecType uint64, mhash mh.Multihash) CidStr {
+	hashlen := len(mhash)
+	// two varints (at most binary.MaxVarintLen64 bytes each) plus the hash
+	buf := make([]byte, 2*binary.MaxVarintLen64+hashlen)
+	n := binary.PutUvarint(buf, version)
+	n += binary.PutUvarint(buf[n:], codecType)
+	cn := copy(buf[n:], mhash)
+	if cn != hashlen {
+		panic("copy hash length is inconsistent")
+	}
+	return CidStr(buf[:n+hashlen])
+}
+
+// CidStrParse takes a binary byte slice, parses it, and returns either
+// a valid CidStr, or the zero CidStr and an error.
+//
+// For CidV1, the data buffer is in the form:
+//
+//     <version><codec-type><multihash>
+//
+// CidV0 is also supported. In particular, a data buffer that is exactly
+// 34 bytes long and starts with the bytes [18,32...] is treated as a
+// bare binary multihash.
+//
+// The multicodec is decoded as a varint only to locate the multihash;
+// its value is not checked against the table of known codecs.
+//
+// Multibase encoding should already have been unwrapped before parsing;
+// if you have a multibase-enveloped string, use CidStrDecode instead.
+//
+// CidStrParse is the inverse of Cid.Bytes().
+func CidStrParse(data []byte) (CidStr, error) {
+	if len(data) == 34 && data[0] == 18 && data[1] == 32 {
+		h, err := mh.Cast(data)
+		if err != nil {
+			return EmptyCidStr, err
+		}
+		return newCidStr(0, DagProtobuf, h), nil
+	}
+
+	vers, n := binary.Uvarint(data)
+	if err := uvError(n); err != nil {
+		return EmptyCidStr, err
+	}
+
+	if vers != 0 && vers != 1 {
+		return EmptyCidStr, fmt.Errorf("invalid cid version number: %d", vers)
+	}
+
+	_, cn := binary.Uvarint(data[n:])
+	if err := uvError(cn); err != nil {
+		return EmptyCidStr, err
+	}
+
+	rest := data[n+cn:]
+	h, err := mh.Cast(rest)
+	if err != nil {
+		return EmptyCidStr, err
+	}
+
+	// REVIEW: if the data is longer than the mh.len expects, we silently ignore it?  should we?
+	return CidStr(data[0 : n+cn+len(h)]), nil
+}
--- a/_rsrch/cidiface/cidStruct.go
+++ b/_rsrch/cidiface/cidStruct.go
@@ -0,0 +1,162 @@
+package cid
+
+import (
+	"encoding/binary"
+	"fmt"
+
+	mbase "github.com/multiformats/go-multibase"
+	mh "github.com/multiformats/go-multihash"
+)
+
+//=================
+// def & accessors
+//=================
+
+var _ Cid = CidStruct{}
+
+//var _ map[CidStruct]struct{} = nil // Will not compile!  See struct def docs.
+
+// CidStruct represents a CID in a struct format.
+//
+// This format complies with the exact same Cid interface as the CidStr
+// implementation, but completely pre-parses the Cid metadata.
+// CidStruct is a tad quicker in case of repeatedly accessed fields,
+// but requires more reshuffling to parse and to serialize.
+// CidStruct is not usable as a map key, because it contains a Multihash
+// reference, which is a slice, and thus not "comparable" as a primitive.
+//
+// Beware of zero-valued CidStruct: it is difficult to distinguish an
+// incorrectly-initialized "invalid" CidStruct from one representing a v0 cid.
+type CidStruct struct {
+	version uint64
+	codec   uint64
+	hash    mh.Multihash
+}
+
+// EmptyCidStruct is a constant for a zero/uninitialized/sentinel-value cid;
+// it is declared mainly for readability in checks for sentinel values.
+//
+// Note: it's not actually a const; the compiler does not allow const structs.
+var EmptyCidStruct = CidStruct{}
+
+func (c CidStruct) Version() uint64 {
+	return c.version
+}
+
+func (c CidStruct) Multicodec() uint64 {
+	return c.codec
+}
+
+func (c CidStruct) Multihash() mh.Multihash {
+	return c.hash
+}
+
+// String returns the default string representation of a Cid.
+// Currently, Base58 is used as the encoding for the multibase string.
+func (c CidStruct) String() string {
+	switch c.Version() {
+	case 0:
+		return c.Multihash().B58String()
+	case 1:
+		mbstr, err := mbase.Encode(mbase.Base58BTC, c.Bytes())
+		if err != nil {
+			panic("should not error with hardcoded mbase: " + err.Error())
+		}
+		return mbstr
+	default:
+		panic("not possible to reach this point")
+	}
+}
+
+// Bytes produces a raw binary format of the CID.
+func (c CidStruct) Bytes() []byte {
+	switch c.version {
+	case 0:
+		return []byte(c.hash)
+	case 1:
+		// two varints (at most binary.MaxVarintLen64 bytes each) plus the hash
+		buf := make([]byte, 2*binary.MaxVarintLen64+len(c.hash))
+		n := binary.PutUvarint(buf, c.version)
+		n += binary.PutUvarint(buf[n:], c.codec)
+		cn := copy(buf[n:], c.hash)
+		if cn != len(c.hash) {
+			panic("copy hash length is inconsistent")
+		}
+		return buf[:n+len(c.hash)]
+	default:
+		panic("not possible to reach this point")
+	}
+}
+
+// Prefix builds and returns a Prefix out of a Cid.
+func (c CidStruct) Prefix() Prefix {
+	dec, _ := mh.Decode(c.hash) // assuming we got a valid multihash, this will not error
+	return Prefix{
+		MhType:   dec.Code,
+		MhLength: dec.Length,
+		Version:  c.version,
+		Codec:    c.codec,
+	}
+}
+
+//==================================
+// parsers & validators & factories
+//==================================
+
+// CidStructParse takes a binary byte slice, parses it, and returns either
+// a valid CidStruct, or the zero CidStruct and an error.
+//
+// For CidV1, the data buffer is in the form:
+//
+//     <version><codec-type><multihash>
+//
+// CidV0 is also supported. In particular, a data buffer that is exactly
+// 34 bytes long and starts with the bytes [18,32...] is treated as a
+// bare binary multihash.
+//
+// The multicodec is decoded as a varint and stored as-is; its value is
+// not checked against the table of known codecs.
+//
+// Multibase encoding should already have been unwrapped before parsing;
+// if you have a multibase-enveloped string, use CidStructDecode instead.
+//
+// CidStructParse is the inverse of Cid.Bytes().
+func CidStructParse(data []byte) (CidStruct, error) {
+	if len(data) == 34 && data[0] == 18 && data[1] == 32 {
+		h, err := mh.Cast(data)
+		if err != nil {
+			return EmptyCidStruct, err
+		}
+		return CidStruct{
+			codec:   DagProtobuf,
+			version: 0,
+			hash:    h,
+		}, nil
+	}
+
+	vers, n := binary.Uvarint(data)
+	if err := uvError(n); err != nil {
+		return EmptyCidStruct, err
+	}
+
+	if vers != 0 && vers != 1 {
+		return EmptyCidStruct, fmt.Errorf("invalid cid version number: %d", vers)
+	}
+
+	codec, cn := binary.Uvarint(data[n:])
+	if err := uvError(cn); err != nil {
+		return EmptyCidStruct, err
+	}
+
+	rest := data[n+cn:]
+	h, err := mh.Cast(rest)
+	if err != nil {
+		return EmptyCidStruct, err
+	}
+
+	return CidStruct{
+		version: vers,
+		codec:   codec,
+		hash:    h,
+	}, nil
+}
--- a/_rsrch/cidiface/enums.go
+++ b/_rsrch/cidiface/enums.go
@@ -0,0 +1,76 @@
+package cid
+
+// These are multicodec-packed content types. They should match
+// the codes described in the authoritative document:
+// https://github.com/multiformats/multicodec/blob/master/table.csv
+const (
+	Raw = 0x55
+
+	DagProtobuf = 0x70
+	DagCBOR     = 0x71
+
+	GitRaw = 0x78
+
+	EthBlock           = 0x90
+	EthBlockList       = 0x91
+	EthTxTrie          = 0x92
+	EthTx              = 0x93
+	EthTxReceiptTrie   = 0x94
+	EthTxReceipt       = 0x95
+	EthStateTrie       = 0x96
+	EthAccountSnapshot = 0x97
+	EthStorageTrie     = 0x98
+	BitcoinBlock       = 0xb0
+	BitcoinTx          = 0xb1
+	ZcashBlock         = 0xc0
+	ZcashTx            = 0xc1
+	DecredBlock        = 0xe0
+	DecredTx           = 0xe1
+)
+
+// Codecs maps the name of a codec to its type
+var Codecs = map[string]uint64{
+	"v0":                   DagProtobuf,
+	"raw":                  Raw,
+	"protobuf":             DagProtobuf,
+	"cbor":                 DagCBOR,
+	"git-raw":              GitRaw,
+	"eth-block":            EthBlock,
+	"eth-block-list":       EthBlockList,
+	"eth-tx-trie":          EthTxTrie,
+	"eth-tx":               EthTx,
+	"eth-tx-receipt-trie":  EthTxReceiptTrie,
+	"eth-tx-receipt":       EthTxReceipt,
+	"eth-state-trie":       EthStateTrie,
+	"eth-account-snapshot": EthAccountSnapshot,
+	"eth-storage-trie":     EthStorageTrie,
+	"bitcoin-block":        BitcoinBlock,
+	"bitcoin-tx":           BitcoinTx,
+	"zcash-block":          ZcashBlock,
+	"zcash-tx":             ZcashTx,
+	"decred-block":         DecredBlock,
+	"decred-tx":            DecredTx,
+}
+
+// CodecToStr maps the numeric codec to its name
+var CodecToStr = map[uint64]string{
+	Raw:                "raw",
+	DagProtobuf:        "protobuf",
+	DagCBOR:            "cbor",
+	GitRaw:             "git-raw",
+	EthBlock:           "eth-block",
+	EthBlockList:       "eth-block-list",
+	EthTxTrie:          "eth-tx-trie",
+	EthTx:              "eth-tx",
+	EthTxReceiptTrie:   "eth-tx-receipt-trie",
+	EthTxReceipt:       "eth-tx-receipt",
+	EthStateTrie:       "eth-state-trie",
+	EthAccountSnapshot: "eth-account-snapshot",
+	EthStorageTrie:     "eth-storage-trie",
+	BitcoinBlock:       "bitcoin-block",
+	BitcoinTx:          "bitcoin-tx",
+	ZcashBlock:         "zcash-block",
+	ZcashTx:            "zcash-tx",
+	DecredBlock:        "decred-block",
+	DecredTx:           "decred-tx",
+}
--- a/_rsrch/cidiface/errors.go
+++ b/_rsrch/cidiface/errors.go
@@ -0,0 +1,24 @@
+package cid
+
+import (
+	"errors"
+)
+
+var (
+	// ErrVarintBuffSmall means that a buffer passed to the cid parser was not
+	// long enough, or did not contain a valid cid
+	ErrVarintBuffSmall = errors.New("reading varint: buffer too small")
+
+	// ErrVarintTooBig means that the varint in the given cid was above the
+	// limit of 2^64
+	ErrVarintTooBig = errors.New("reading varint: varint bigger than 64bits" +
+		" and not supported")
+
+	// ErrCidTooShort means that the cid passed to decode was not long
+	// enough to be a valid Cid
+	ErrCidTooShort = errors.New("cid too short")
+
+	// ErrInvalidEncoding means that selected encoding is not supported
+	// by this Cid version
+	ErrInvalidEncoding = errors.New("invalid base encoding")
+)
--- a/_rsrch/cidiface/misc.go
+++ b/_rsrch/cidiface/misc.go
@@ -0,0 +1,12 @@
+package cid
+
+func uvError(read int) error {
+	switch {
+	case read == 0:
+		return ErrVarintBuffSmall
+	case read < 0:
+		return ErrVarintTooBig
+	default:
+		return nil
+	}
+}