implement CidFromReader

Also reuse a CidFromBytes test for it, which covers both CIDv0 and CIDv1 inputs. Fixes #126.
2021-07-02 18:35:02 +01:00
2 changed files with 91 additions and 107 deletions
--- a/cid.go
+++ b/cid.go
@@ -694,10 +694,19 @@ type bufByteReader struct {
 	direct   io.ByteReader
 	fallback io.Reader

-	dst []byte
+	consumed int
+	dst      []byte
 }

 func (r *bufByteReader) ReadByte() (byte, error) {
+	// We still have some of the initial bytes to use.
+	if r.consumed < len(r.dst) {
+		b := r.dst[r.consumed]
+		r.consumed++
+		return b, nil
+	}
+	r.consumed++
+
 	// The underlying reader has ReadByte; use it.
 	if br := r.direct; br != nil {
 		b, err := br.ReadByte()
@@ -709,8 +718,6 @@ func (r *bufByteReader) ReadByte() (byte, error) {
 	}

 	// Fall back to a one-byte Read.
-	// TODO: consider reading straight into dst,
-	// once we have benchmarks and if they prove that to be faster.
 	var p [1]byte
 	if _, err := io.ReadFull(r.fallback, p[:]); err != nil {
 		return 0, err
@@ -730,30 +737,38 @@ func CidFromReader(r io.Reader) (int, Cid, error) {
 	// 64 bytes is enough for any CIDv0,
 	// and it's enough for most CIDv1s in practice.
 	// If the digest is too long, we'll allocate more.
-	br := toBufByteReader(r, make([]byte, 0, 64))
+	buf := make([]byte, 0, 64)
+
+	// We read two bytes, to tell if this is a CIDv0 or a CIDv1.
+	if n, err := io.ReadFull(r, buf[:2]); err != nil {
+		return n, Undef, err
+	}
+	buf = buf[:2]
+
+	// If we have a CIDv0, read the rest of the bytes and cast the buffer.
+	if buf[0] == mh.SHA2_256 && buf[1] == 32 {
+		if n, err := io.ReadFull(r, buf[2:34]); err != nil {
+			return len(buf) + n, Undef, err
+		}
+
+		buf = buf[:34]
+		h, err := mh.Cast(buf)
+		if err != nil {
+			return len(buf), Undef, err
+		}
+
+		return len(buf), Cid{string(h)}, nil
+	}

-	// We read the first varint, to tell if this is a CIDv0 or a CIDv1.
 	// The varint package wants a io.ByteReader, so we must wrap our io.Reader.
+	// Note that we already read two bytes, so bufByteReader uses those first.
+	// After those two bytes, bufByteReader appends the read bytes to br.dst.
+	br := toBufByteReader(r, buf[:2])
 	vers, err := varint.ReadUvarint(br)
 	if err != nil {
 		return len(br.dst), Undef, err
 	}

-	// If we have a CIDv0, read the rest of the bytes and cast the buffer.
-	if vers == mh.SHA2_256 {
-		if n, err := io.ReadFull(r, br.dst[1:34]); err != nil {
-			return len(br.dst) + n, Undef, err
-		}
-
-		br.dst = br.dst[:34]
-		h, err := mh.Cast(br.dst)
-		if err != nil {
-			return len(br.dst), Undef, err
-		}
-
-		return len(br.dst), Cid{string(h)}, nil
-	}
-
 	if vers != 1 {
 		return len(br.dst), Undef, fmt.Errorf("expected 1 as the cid version number, got: %d", vers)
 	}
@@ -781,38 +796,29 @@ func CidFromReader(r io.Reader) (int, Cid, error) {
 		return len(br.dst), Undef, err
 	}

+	// Update buf's length.
+	// We're not reading single bytes beyond this point.
+	buf = br.dst
+	br = nil
+
+	// Multihash digest; might be too long, so allocate.
 	// Refuse to make large allocations to prevent OOMs due to bugs.
+	// TODO: reuse buf if it has enough space
 	const maxDigestAlloc = 32 << 20 // 32MiB
 	if mhl > maxDigestAlloc {
-		return len(br.dst), Undef, fmt.Errorf("refusing to allocate %d bytes for a digest", mhl)
+		return len(buf), Undef, fmt.Errorf("refusing to allocate %d bytes for a digest", mhl)
 	}
-
-	// Fine to convert mhl to int, given maxDigestAlloc.
-	prefixLength := len(br.dst)
-	cidLength := prefixLength + int(mhl)
-	if cidLength > cap(br.dst) {
-		// If the multihash digest doesn't fit in our initial 64 bytes,
-		// efficiently extend the slice via append+make.
-		br.dst = append(br.dst, make([]byte, cidLength-cap(br.dst))...)
-	} else {
-		// The multihash digest fits inside our buffer,
-		// so just extend its capacity.
-		br.dst = br.dst[:cidLength]
-	}
-
-	if n, err := io.ReadFull(r, br.dst[prefixLength:cidLength]); err != nil {
-		// We can't use len(br.dst) here,
-		// as we've only read n bytes past prefixLength.
-		return prefixLength + n, Undef, err
+	digest := make([]byte, int(mhl))
+	if n, err := io.ReadFull(r, digest); err != nil {
+		return len(buf) + n, Undef, err
 	}
+	buf = append(buf, digest...)

 	// This simply ensures the multihash is valid.
-	// TODO: consider removing this bit, as it's probably redundant;
-	// for now, it helps ensure consistency with CidFromBytes.
-	_, _, err = mh.MHFromBytes(br.dst[mhStart:])
+	_, _, err = mh.MHFromBytes(buf[mhStart:])
 	if err != nil {
-		return len(br.dst), Undef, err
+		return len(buf), Undef, err
 	}

-	return len(br.dst), Cid{string(br.dst)}, nil
+	return len(buf), Cid{string(buf)}, nil
 }
--- a/cid_test.go
+++ b/cid_test.go
@@ -721,71 +721,49 @@ func TestReadCidsFromBuffer(t *testing.T) {
 	}
 }

-func TestBadCidInput(t *testing.T) {
-	for _, name := range []string{
-		"FromBytes",
-		"FromReader",
-	} {
-		t.Run(name, func(t *testing.T) {
-			usingReader := name == "FromReader"
+func TestBadCidFromBytes(t *testing.T) {
+	l, c, err := CidFromBytes([]byte{mh.SHA2_256, 32, 0x00})
+	if err == nil {
+		t.Fatal("expected not-enough-bytes for V0 CidFromBytes")
+	}
+	if l != 0 {
+		t.Fatal("expected length=0 from bad CidFromBytes")
+	}
+	if c != Undef {
+		t.Fatal("expected Undef CID from bad CidFromBytes")
+	}

-			fromBytes := CidFromBytes
-			if usingReader {
-				fromBytes = func(data []byte) (int, Cid, error) {
-					return CidFromReader(bytes.NewReader(data))
-				}
-			}
+	c, err = Decode("bafkreie5qrjvaw64n4tjm6hbnm7fnqvcssfed4whsjqxzslbd3jwhsk3mm")
+	if err != nil {
+		t.Fatal(err)
+	}
+	byts := make([]byte, c.ByteLen())
+	copy(byts, c.Bytes())
+	byts[1] = 0x80 // bad codec varint
+	byts[2] = 0x00
+	l, c, err = CidFromBytes(byts)
+	if err == nil {
+		t.Fatal("expected not-enough-bytes for V1 CidFromBytes")
+	}
+	if l != 0 {
+		t.Fatal("expected length=0 from bad CidFromBytes")
+	}
+	if c != Undef {
+		t.Fatal("expected Undef CID from bad CidFromBytes")
+	}

-			l, c, err := fromBytes([]byte{mh.SHA2_256, 32, 0x00})
-			if err == nil {
-				t.Fatal("expected not-enough-bytes for V0 CID")
-			}
-			if !usingReader && l != 0 {
-				t.Fatal("expected length==0 from bad CID")
-			} else if usingReader && l == 0 {
-				t.Fatal("expected length!=0 from bad CID")
-			}
-			if c != Undef {
-				t.Fatal("expected Undef CID from bad CID")
-			}
-
-			c, err = Decode("bafkreie5qrjvaw64n4tjm6hbnm7fnqvcssfed4whsjqxzslbd3jwhsk3mm")
-			if err != nil {
-				t.Fatal(err)
-			}
-			byts := make([]byte, c.ByteLen())
-			copy(byts, c.Bytes())
-			byts[1] = 0x80 // bad codec varint
-			byts[2] = 0x00
-			l, c, err = fromBytes(byts)
-			if err == nil {
-				t.Fatal("expected not-enough-bytes for V1 CID")
-			}
-			if !usingReader && l != 0 {
-				t.Fatal("expected length==0 from bad CID")
-			} else if usingReader && l == 0 {
-				t.Fatal("expected length!=0 from bad CID")
-			}
-			if c != Undef {
-				t.Fatal("expected Undef CID from bad CID")
-			}
-
-			copy(byts, c.Bytes())
-			byts[2] = 0x80 // bad multihash varint
-			byts[3] = 0x00
-			l, c, err = fromBytes(byts)
-			if err == nil {
-				t.Fatal("expected not-enough-bytes for V1 CID")
-			}
-			if !usingReader && l != 0 {
-				t.Fatal("expected length==0 from bad CID")
-			} else if usingReader && l == 0 {
-				t.Fatal("expected length!=0 from bad CID")
-			}
-			if c != Undef {
-				t.Fatal("expected Undef CID from bad CidFromBytes")
-			}
-		})
+	copy(byts, c.Bytes())
+	byts[2] = 0x80 // bad multihash varint
+	byts[3] = 0x00
+	l, c, err = CidFromBytes(byts)
+	if err == nil {
+		t.Fatal("expected not-enough-bytes for V1 CidFromBytes")
+	}
+	if l != 0 {
+		t.Fatal("expected length=0 from bad CidFromBytes")
+	}
+	if c != Undef {
+		t.Fatal("expected Undef CID from bad CidFromBytes")
 	}
 }