diff --git a/_rsrch/cidiface/README.md b/_rsrch/cidiface/README.md new file mode 100644 index 0000000..24dd9b7 --- /dev/null +++ b/_rsrch/cidiface/README.md @@ -0,0 +1,144 @@ +What golang Kinds work best to implement CIDs? +============================================== + +There are many possible ways to implement CIDs. This package explores them. + +### criteria + +There's a couple different criteria to consider: + +- We want the best performance when operating on the type (getters, mostly); +- We want to minimize the number of memory allocations we need; +- We want types which can be used as map keys, because this is common. + +The priority of these criteria is open to argument, but it's probably +mapkeys > minalloc > anythingelse. +(Mapkeys and minalloc are also quite entangled, since if we don't pick a +representation that can work natively as a map key, we'll end up needing +a `KeyRepr()` method which gives us something that does work as a map key, +an that will almost certainly involve a malloc itself.) + +### options + +There are quite a few different ways to go: + +- Option A: CIDs as a struct; multihash as bytes. +- Option B: CIDs as a string. +- Option C: CIDs as an interface with multiple implementors. +- Option D: CIDs as a struct; multihash also as a struct or string. +- Option E: CIDs as a struct; content as strings plus offsets. + +The current approach on the master branch is Option A. + +Option D is distinctive from Option A because multihash as bytes transitively +causes the CID struct to be non-comparible and thus not suitable for map keys +as per https://golang.org/ref/spec#KeyType . (It's also a bit more work to +pursue Option D because it's just a bigger splash radius of change; but also, +something we might also want to do soon, because we *do* also have these same +map-key-usability concerns with multihash alone.) + +Option E is distinctive from Option D because Option E would always maintain +the binary format of the cid internally, and so could yield it again without +malloc, while still potentially having faster access to components than +Option B since it wouldn't need to re-parse varints to access later fields. + +Option C is the avoid-choices choice, but note that interfaces are not free; +since "minimize mallocs" is one of our major goals, we cannot use interfaces +whimsically. + +Note there is no proposal for migrating to `type Cid []bytes`, because that +is generally considered to be strictly inferior to `type Cid string`. + + +Discoveries +----------- + +### using interfaces as map keys forgoes a lot of safety checks + +Using interfaces as map keys pushes a bunch of type checking to runtime. +E.g., it's totally valid at compile time to push a type which is non-comparable +into a map key; it will panic at *runtime* instead of failing at compile-time. + +There's also no way to define equality checks between implementors of the +interface: golang will always use its innate concept of comparison for the +concrete types. This means its effectively *never safe* to use two different +concrete implementations of an interface in the same map; you may add elements +which are semantically "equal" in your mind, and end up very confused later +when both impls of the same "equal" object have been stored. + +### sentinel values are possible in any impl, but some are clearer than others + +When using `*Cid`, the nil value is a clear sentinel for 'invalid'; +when using `type Cid string`, the zero value is a clear sentinel; +when using `type Cid struct` per Option A or D... the only valid check is +for a nil multihash field, since version=0 and codec=0 are both valid values. + +### usability as a map key is important + +We already covered this in the criteria section, but for clarity: + +- Option A: ❌ +- Option B: ✔ +- Option C: ~ (caveats, and depends on concrete impl) +- Option D: ✔ +- Option E: ✔ + +### living without offsets requires parsing + +Since CID (and multihash!) are defined using varints, they require parsing; +we can't just jump into the string at a known offset in order to yield e.g. +the multicodec number. + +In order to get to the 'meat' of the CID (the multihash content), we first +must parse: + +- the CID version varint; +- the multicodec varint; +- the multihash type enum varint; +- and the multihash length varint. + +Since there are many applications where we want to jump straight to the +multihash content (for example, when doing CAS sharding -- see the +[disclaimer](https://github.com/multiformats/multihash#disclaimers) about +bias in leading bytes), this overhead may be interesting. + +How much this overhead is significant is hard to say from microbenchmarking; +it depends largely on usage patterns. If these traversals are a significant +timesink, it would be an argument for Option D/E. +If these traversals are *not* a significant timesink, we might be wiser +to keep to Option B, because keeping a struct full of offsets will add several +words of memory usage per CID, and we keep a *lot* of CIDs. + +### interfaces cause boxing which is a significant performance cost + +See `BenchmarkCidMap_CidStr` and friends. + +Long story short: using interfaces *anywhere* will cause the compiler to +implicitly generate boxing and unboxing code (e.g. `runtime.convT2E`); +this is both another function call, and more concerningly, results in +large numbers of unbatchable memory allocations. + +Numbers without context are dangerous, but if you need one: 33%. +It's a big deal. + +This means attempts to "use interfaces, but switch to concrete impls when +performance is important" are a red herring: it doesn't work that way. + +This is not a general inditement against using interfaces -- but +if a situation is at the scale where it's become important to mind whether +or not pointers are a performance impact, then that situation also +is one where you have to think twice before using interfaces. + +### one way or another: let's get rid of that star + +We should switch completely to handling `Cid` and remove `*Cid` completely. +Regardless of whether we do this by migrating to interface, or string +implementations, or simply structs with no pointers... once we get there, +refactoring to any of the *others* can become a no-op from the perspective +of any downstream code that uses CIDs. + +(This means all access via functions, never references to fields -- even if +we were to use a struct implementation. *Pretend* there's a interface, +in other words.) + +There are probably `gofix` incantations which can help us with this migration. diff --git a/_rsrch/cidiface/cid.go b/_rsrch/cidiface/cid.go new file mode 100644 index 0000000..cb4b871 --- /dev/null +++ b/_rsrch/cidiface/cid.go @@ -0,0 +1,48 @@ +package cid + +import ( + mh "github.com/multiformats/go-multihash" +) + +// Cid represents a self-describing content adressed identifier. +// +// A CID is composed of: +// +// - a Version of the CID itself, +// - a Multicodec (indicates the encoding of the referenced content), +// - and a Multihash (which identifies the referenced content). +// +// (Note that the Multihash further contains its own version and hash type +// indicators.) +type Cid interface { + // n.b. 'yields' means "without copy", 'produces' means a malloc. + + Version() uint64 // Yields the version prefix as a uint. + Multicodec() uint64 // Yields the multicodec as a uint. + Multihash() mh.Multihash // Yields the multihash segment. + + String() string // Produces the CID formatted as b58 string. + Bytes() []byte // Produces the CID formatted as raw binary. + + Prefix() Prefix // Produces a tuple of non-content metadata. + + // some change notes: + // - `KeyString() CidString` is gone because we're natively a map key now, you're welcome. + // - `StringOfBase(mbase.Encoding) (string, error)` is skipped, maybe it can come back but maybe it should be a formatter's job. + // - `Equals(o Cid) bool` is gone because it's now `==`, you're welcome. + + // TODO: make a multi-return method for {v,mc,mh} decomposition. CidStr will be able to implement this more efficiently than if one makes a series of the individual getter calls. +} + +// Prefix represents all the metadata of a Cid, +// that is, the Version, the Codec, the Multihash type +// and the Multihash length. It does not contains +// any actual content information. +// NOTE: The use -1 in MhLength to mean default length is deprecated, +// use the V0Builder or V1Builder structures instead +type Prefix struct { + Version uint64 + Codec uint64 + MhType uint64 + MhLength int +} diff --git a/_rsrch/cidiface/cidBoxingBench_test.go b/_rsrch/cidiface/cidBoxingBench_test.go new file mode 100644 index 0000000..920ce1f --- /dev/null +++ b/_rsrch/cidiface/cidBoxingBench_test.go @@ -0,0 +1,71 @@ +package cid + +import ( + "testing" +) + +// BenchmarkCidMap_CidStr estimates how fast it is to insert primitives into a map +// keyed by CidStr (concretely). +// +// We do 100 insertions per benchmark run to make sure the map initialization +// doesn't dominate the results. +// +// Sample results on linux amd64 go1.11beta: +// +// BenchmarkCidMap_CidStr-8 100000 16317 ns/op +// BenchmarkCidMap_CidIface-8 100000 20516 ns/op +// +// With benchmem on: +// +// BenchmarkCidMap_CidStr-8 100000 15579 ns/op 11223 B/op 207 allocs/op +// BenchmarkCidMap_CidIface-8 100000 19500 ns/op 12824 B/op 307 allocs/op +// BenchmarkCidMap_StrPlusHax-8 200000 10451 ns/op 7589 B/op 202 allocs/op +// +// We can see here that the impact of interface boxing is significant: +// it increases the time taken to do the inserts to 133%, largely because +// the implied `runtime.convT2E` calls cause another malloc each. +// +// There are also significant allocations in both cases because +// A) we cannot create a multihash without allocations since they are []byte; +// B) the map has to be grown several times; +// C) something I haven't quite put my finger on yet. +// Ideally we'd drive those down further as well. +// +// Pre-allocating the map reduces allocs by a very small percentage by *count*, +// but reduces the time taken by 66% overall (presumably because when a map +// re-arranges itself, it involves more or less an O(n) copy of the content +// in addition to the alloc itself). This isn't topical to the question of +// whether or not interfaces are a good idea; just for contextualizing. +// +func BenchmarkCidMap_CidStr(b *testing.B) { + for i := 0; i < b.N; i++ { + mp := map[CidStr]int{} + for x := 0; x < 100; x++ { + mp[NewCidStr(0, uint64(x), []byte{})] = x + } + } +} + +// BenchmarkCidMap_CidIface is in the family of BenchmarkCidMap_CidStr: +// it is identical except the map key type is declared as an interface +// (which forces all insertions to be boxed, changing performance). +func BenchmarkCidMap_CidIface(b *testing.B) { + for i := 0; i < b.N; i++ { + mp := map[Cid]int{} + for x := 0; x < 100; x++ { + mp[NewCidStr(0, uint64(x), []byte{})] = x + } + } +} + +// BenchmarkCidMap_CidStrAvoidMapGrowth is in the family of BenchmarkCidMap_CidStr: +// it is identical except the map is created with a size hint that removes +// some allocations (5, in practice, apparently). +func BenchmarkCidMap_CidStrAvoidMapGrowth(b *testing.B) { + for i := 0; i < b.N; i++ { + mp := make(map[CidStr]int, 100) + for x := 0; x < 100; x++ { + mp[NewCidStr(0, uint64(x), []byte{})] = x + } + } +} diff --git a/_rsrch/cidiface/cidString.go b/_rsrch/cidiface/cidString.go new file mode 100644 index 0000000..129fafd --- /dev/null +++ b/_rsrch/cidiface/cidString.go @@ -0,0 +1,161 @@ +package cid + +import ( + "encoding/binary" + "fmt" + + mbase "github.com/multiformats/go-multibase" + mh "github.com/multiformats/go-multihash" +) + +//================= +// def & accessors +//================= + +var _ Cid = CidStr("") +var _ map[CidStr]struct{} = nil + +// CidStr is a representation of a Cid as a string type containing binary. +// +// Using golang's string type is preferable over byte slices even for binary +// data because golang strings are immutable, usable as map keys, +// trivially comparable with built-in equals operators, etc. +// +// Please do not cast strings or bytes into the CidStr type directly; +// use a parse method which validates the data and yields a CidStr. +type CidStr string + +// EmptyCidStr is a constant for a zero/uninitialized/sentinelvalue cid; +// it is declared mainly for readability in checks for sentinel values. +const EmptyCidStr = CidStr("") + +func (c CidStr) Version() uint64 { + bytes := []byte(c) + v, _ := binary.Uvarint(bytes) + return v +} + +func (c CidStr) Multicodec() uint64 { + bytes := []byte(c) + _, n := binary.Uvarint(bytes) // skip version length + codec, _ := binary.Uvarint(bytes[n:]) + return codec +} + +func (c CidStr) Multihash() mh.Multihash { + bytes := []byte(c) + _, n1 := binary.Uvarint(bytes) // skip version length + _, n2 := binary.Uvarint(bytes[n1:]) // skip codec length + return mh.Multihash(bytes[n1+n2:]) // return slice of remainder +} + +// String returns the default string representation of a Cid. +// Currently, Base58 is used as the encoding for the multibase string. +func (c CidStr) String() string { + switch c.Version() { + case 0: + return c.Multihash().B58String() + case 1: + mbstr, err := mbase.Encode(mbase.Base58BTC, []byte(c)) + if err != nil { + panic("should not error with hardcoded mbase: " + err.Error()) + } + return mbstr + default: + panic("not possible to reach this point") + } +} + +// Bytes produces a raw binary format of the CID. +// +// (For CidStr, this method is only distinct from casting because of +// compatibility with v0 CIDs.) +func (c CidStr) Bytes() []byte { + switch c.Version() { + case 0: + return c.Multihash() + case 1: + return []byte(c) + default: + panic("not possible to reach this point") + } +} + +// Prefix builds and returns a Prefix out of a Cid. +func (c CidStr) Prefix() Prefix { + dec, _ := mh.Decode(c.Multihash()) // assuming we got a valid multiaddr, this will not error + return Prefix{ + MhType: dec.Code, + MhLength: dec.Length, + Version: c.Version(), + Codec: c.Multicodec(), + } +} + +//================================== +// parsers & validators & factories +//================================== + +func NewCidStr(version uint64, codecType uint64, mhash mh.Multihash) CidStr { + hashlen := len(mhash) + // two 8 bytes (max) numbers plus hash + buf := make([]byte, 2*binary.MaxVarintLen64+hashlen) + n := binary.PutUvarint(buf, version) + n += binary.PutUvarint(buf[n:], codecType) + cn := copy(buf[n:], mhash) + if cn != hashlen { + panic("copy hash length is inconsistent") + } + return CidStr(buf[:n+hashlen]) +} + +// CidStrParse takes a binary byte slice, parses it, and returns either +// a valid CidStr, or the zero CidStr and an error. +// +// For CidV1, the data buffer is in the form: +// +// +// +// CidV0 are also supported. In particular, data buffers starting +// with length 34 bytes, which starts with bytes [18,32...] are considered +// binary multihashes. +// +// The multicodec bytes are not parsed to verify they're a valid varint; +// no further reification is performed. +// +// Multibase encoding should already have been unwrapped before parsing; +// if you have a multibase-enveloped string, use CidStrDecode instead. +// +// CidStrParse is the inverse of Cid.Bytes(). +func CidStrParse(data []byte) (CidStr, error) { + if len(data) == 34 && data[0] == 18 && data[1] == 32 { + h, err := mh.Cast(data) + if err != nil { + return EmptyCidStr, err + } + return NewCidStr(0, DagProtobuf, h), nil + } + + vers, n := binary.Uvarint(data) + if err := uvError(n); err != nil { + return EmptyCidStr, err + } + + if vers != 0 && vers != 1 { + return EmptyCidStr, fmt.Errorf("invalid cid version number: %d", vers) + } + + _, cn := binary.Uvarint(data[n:]) + if err := uvError(cn); err != nil { + return EmptyCidStr, err + } + + rest := data[n+cn:] + h, err := mh.Cast(rest) + if err != nil { + return EmptyCidStr, err + } + + // REVIEW: if the data is longer than the mh.len expects, we silently ignore it? should we? + return CidStr(data[0 : n+cn+len(h)]), nil +} diff --git a/_rsrch/cidiface/cidStruct.go b/_rsrch/cidiface/cidStruct.go new file mode 100644 index 0000000..2fd5fa7 --- /dev/null +++ b/_rsrch/cidiface/cidStruct.go @@ -0,0 +1,164 @@ +package cid + +import ( + "encoding/binary" + "fmt" + + mbase "github.com/multiformats/go-multibase" + mh "github.com/multiformats/go-multihash" +) + +//================= +// def & accessors +//================= + +var _ Cid = CidStruct{} + +//var _ map[CidStruct]struct{} = nil // Will not compile! See struct def docs. +//var _ map[Cid]struct{} = map[Cid]struct{}{CidStruct{}: struct{}{}} // Legal to compile... +// but you'll get panics: "runtime error: hash of unhashable type cid.CidStruct" + +// CidStruct represents a CID in a struct format. +// +// This format complies with the exact same Cid interface as the CidStr +// implementation, but completely pre-parses the Cid metadata. +// CidStruct is a tad quicker in case of repeatedly accessed fields, +// but requires more reshuffling to parse and to serialize. +// CidStruct is not usable as a map key, because it contains a Multihash +// reference, which is a slice, and thus not "comparable" as a primitive. +// +// Beware of zero-valued CidStruct: it is difficult to distinguish an +// incorrectly-initialized "invalid" CidStruct from one representing a v0 cid. +type CidStruct struct { + version uint64 + codec uint64 + hash mh.Multihash +} + +// EmptyCidStruct is a constant for a zero/uninitialized/sentinelvalue cid; +// it is declared mainly for readability in checks for sentinel values. +// +// Note: it's not actually a const; the compiler does not allow const structs. +var EmptyCidStruct = CidStruct{} + +func (c CidStruct) Version() uint64 { + return c.version +} + +func (c CidStruct) Multicodec() uint64 { + return c.codec +} + +func (c CidStruct) Multihash() mh.Multihash { + return c.hash +} + +// String returns the default string representation of a Cid. +// Currently, Base58 is used as the encoding for the multibase string. +func (c CidStruct) String() string { + switch c.Version() { + case 0: + return c.Multihash().B58String() + case 1: + mbstr, err := mbase.Encode(mbase.Base58BTC, c.Bytes()) + if err != nil { + panic("should not error with hardcoded mbase: " + err.Error()) + } + return mbstr + default: + panic("not possible to reach this point") + } +} + +// Bytes produces a raw binary format of the CID. +func (c CidStruct) Bytes() []byte { + switch c.version { + case 0: + return []byte(c.hash) + case 1: + // two 8 bytes (max) numbers plus hash + buf := make([]byte, 2*binary.MaxVarintLen64+len(c.hash)) + n := binary.PutUvarint(buf, c.version) + n += binary.PutUvarint(buf[n:], c.codec) + cn := copy(buf[n:], c.hash) + if cn != len(c.hash) { + panic("copy hash length is inconsistent") + } + return buf[:n+len(c.hash)] + default: + panic("not possible to reach this point") + } +} + +// Prefix builds and returns a Prefix out of a Cid. +func (c CidStruct) Prefix() Prefix { + dec, _ := mh.Decode(c.hash) // assuming we got a valid multiaddr, this will not error + return Prefix{ + MhType: dec.Code, + MhLength: dec.Length, + Version: c.version, + Codec: c.codec, + } +} + +//================================== +// parsers & validators & factories +//================================== + +// CidStructParse takes a binary byte slice, parses it, and returns either +// a valid CidStruct, or the zero CidStruct and an error. +// +// For CidV1, the data buffer is in the form: +// +// +// +// CidV0 are also supported. In particular, data buffers starting +// with length 34 bytes, which starts with bytes [18,32...] are considered +// binary multihashes. +// +// The multicodec bytes are not parsed to verify they're a valid varint; +// no further reification is performed. +// +// Multibase encoding should already have been unwrapped before parsing; +// if you have a multibase-enveloped string, use CidStructDecode instead. +// +// CidStructParse is the inverse of Cid.Bytes(). +func CidStructParse(data []byte) (CidStruct, error) { + if len(data) == 34 && data[0] == 18 && data[1] == 32 { + h, err := mh.Cast(data) + if err != nil { + return EmptyCidStruct, err + } + return CidStruct{ + codec: DagProtobuf, + version: 0, + hash: h, + }, nil + } + + vers, n := binary.Uvarint(data) + if err := uvError(n); err != nil { + return EmptyCidStruct, err + } + + if vers != 0 && vers != 1 { + return EmptyCidStruct, fmt.Errorf("invalid cid version number: %d", vers) + } + + codec, cn := binary.Uvarint(data[n:]) + if err := uvError(cn); err != nil { + return EmptyCidStruct, err + } + + rest := data[n+cn:] + h, err := mh.Cast(rest) + if err != nil { + return EmptyCidStruct, err + } + + return CidStruct{ + version: vers, + codec: codec, + hash: h, + }, nil +} diff --git a/_rsrch/cidiface/enums.go b/_rsrch/cidiface/enums.go new file mode 100644 index 0000000..53e3d47 --- /dev/null +++ b/_rsrch/cidiface/enums.go @@ -0,0 +1,76 @@ +package cid + +// These are multicodec-packed content types. The should match +// the codes described in the authoritative document: +// https://github.com/multiformats/multicodec/blob/master/table.csv +const ( + Raw = 0x55 + + DagProtobuf = 0x70 + DagCBOR = 0x71 + + GitRaw = 0x78 + + EthBlock = 0x90 + EthBlockList = 0x91 + EthTxTrie = 0x92 + EthTx = 0x93 + EthTxReceiptTrie = 0x94 + EthTxReceipt = 0x95 + EthStateTrie = 0x96 + EthAccountSnapshot = 0x97 + EthStorageTrie = 0x98 + BitcoinBlock = 0xb0 + BitcoinTx = 0xb1 + ZcashBlock = 0xc0 + ZcashTx = 0xc1 + DecredBlock = 0xe0 + DecredTx = 0xe1 +) + +// Codecs maps the name of a codec to its type +var Codecs = map[string]uint64{ + "v0": DagProtobuf, + "raw": Raw, + "protobuf": DagProtobuf, + "cbor": DagCBOR, + "git-raw": GitRaw, + "eth-block": EthBlock, + "eth-block-list": EthBlockList, + "eth-tx-trie": EthTxTrie, + "eth-tx": EthTx, + "eth-tx-receipt-trie": EthTxReceiptTrie, + "eth-tx-receipt": EthTxReceipt, + "eth-state-trie": EthStateTrie, + "eth-account-snapshot": EthAccountSnapshot, + "eth-storage-trie": EthStorageTrie, + "bitcoin-block": BitcoinBlock, + "bitcoin-tx": BitcoinTx, + "zcash-block": ZcashBlock, + "zcash-tx": ZcashTx, + "decred-block": DecredBlock, + "decred-tx": DecredTx, +} + +// CodecToStr maps the numeric codec to its name +var CodecToStr = map[uint64]string{ + Raw: "raw", + DagProtobuf: "protobuf", + DagCBOR: "cbor", + GitRaw: "git-raw", + EthBlock: "eth-block", + EthBlockList: "eth-block-list", + EthTxTrie: "eth-tx-trie", + EthTx: "eth-tx", + EthTxReceiptTrie: "eth-tx-receipt-trie", + EthTxReceipt: "eth-tx-receipt", + EthStateTrie: "eth-state-trie", + EthAccountSnapshot: "eth-account-snapshot", + EthStorageTrie: "eth-storage-trie", + BitcoinBlock: "bitcoin-block", + BitcoinTx: "bitcoin-tx", + ZcashBlock: "zcash-block", + ZcashTx: "zcash-tx", + DecredBlock: "decred-block", + DecredTx: "decred-tx", +} diff --git a/_rsrch/cidiface/errors.go b/_rsrch/cidiface/errors.go new file mode 100644 index 0000000..588c62e --- /dev/null +++ b/_rsrch/cidiface/errors.go @@ -0,0 +1,24 @@ +package cid + +import ( + "errors" +) + +var ( + // ErrVarintBuffSmall means that a buffer passed to the cid parser was not + // long enough, or did not contain an invalid cid + ErrVarintBuffSmall = errors.New("reading varint: buffer too small") + + // ErrVarintTooBig means that the varint in the given cid was above the + // limit of 2^64 + ErrVarintTooBig = errors.New("reading varint: varint bigger than 64bits" + + " and not supported") + + // ErrCidTooShort means that the cid passed to decode was not long + // enough to be a valid Cid + ErrCidTooShort = errors.New("cid too short") + + // ErrInvalidEncoding means that selected encoding is not supported + // by this Cid version + ErrInvalidEncoding = errors.New("invalid base encoding") +) diff --git a/_rsrch/cidiface/misc.go b/_rsrch/cidiface/misc.go new file mode 100644 index 0000000..9a4486a --- /dev/null +++ b/_rsrch/cidiface/misc.go @@ -0,0 +1,12 @@ +package cid + +func uvError(read int) error { + switch { + case read == 0: + return ErrVarintBuffSmall + case read < 0: + return ErrVarintTooBig + default: + return nil + } +}