kitchensink/pkg/based32/based32.go
// Package based32 provides a simplified variant of the standard
// Bech32 human readable binary codec
//
// This codec simplifies the padding algorithm compared to the Bech32 standard
// BIP 0173 by performing all of the check validation with the decoded bits
// instead of separating the pads of each segment.
//
// The format is created entirely with the standard library base32 package,
// which may or may not produce the same result as a Bech32 implementation (we
// are teaching Go here, not cryptocurrency, and the extra rules used by the
// Bech32 standard complicate this tutorial unnecessarily - and, Go Uber
// Alles :)
package based32

import (
	"encoding/base32"
	"github.com/quanterall/kitchensink/pkg/codec"
	"github.com/quanterall/kitchensink/pkg/proto"
	"log"
	"lukechampine.com/blake3"
	"strings"
)

// charset is the set of characters used in the data section of bech32 strings.
// Note that this is ordered, such that for a given charset[i], i is the binary
// value of the character.
const charset = "qpzry9x8gf2tvdw0s3jn54khce6mua7l"
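
// Note that charset[0] is 'q', which is why the Decoder below re-adds a 'q'
// character to stand in for the all-zero first five bits that the Encoder
// trims off.
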
// Codec provides the encoder/decoder implementation created by makeCodec.
//
// This variable is what is sometimes called a "Singleton" in other languages,
// a pattern that in Go is generally avoided unless the value is not constant
// and requires an initialization process.
//
// Variable declarations like this are executed before init() functions, and
// they fit cases such as this one: importing the package signals that the
// programmer intends to use this codec, since otherwise they would create
// their own implementation from the struct type or the interface.
//
// In general, init() functions and singletons are both better avoided unless
// they make sense in the context of the package, because this kind of
// initialization adds to the startup delay of an application, so consider
// carefully before using either.
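//
// A minimal usage sketch (illustrative only; it assumes callers invoke the
// function fields directly, as they are defined further down in this file):
//
//	encoded, err := based32.Codec.Encoder([]byte{1, 2, 3, 4})
//	if err != nil {
//		// handle error
//	}
//	decoded, err := based32.Codec.Decoder(encoded)
//	if err != nil {
//		// handle error
//	}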
var Codec = makeCodec(
"Base32Check",
charset,
"QNTRL",
)
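
// getCheckLen computes the number of check bytes to append to a payload of
// the given length so that the total encoded output, including the one byte
// check length prefix, requires no base32 padding characters.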
func getCheckLen(length int) (checkLen int) {
	// In order to provide a minimum of 1 byte of check to the output, while
	// avoiding the encoder adding padding characters (the default is '='), the
	// total length of the encoded bytes must be rounded up to the nearest
	// multiple of 5, adding 5 if it is already a multiple of 5 (5 bytes is 40
	// bits, which encodes as exactly 8 base32 characters).
	//
	// The first byte of the encoded data contains the check length, because
	// this formula varies depending on the length of the data, so the check
	// length needs to be encoded into the format at the beginning, as it can't
	// go at the end. The formula below therefore accounts for the check bytes
	// plus this one extra length byte.
	//
	// This is a significant divergence from the methods used by these kinds of
	// encoders, because in this tutorial we are not only aiming to produce
	// human readable transcription codes for transaction hashes (usually
	// 256 bit/32 byte) and addresses (usually 160 bit/20 byte), but a general
	// formula that could encode any length of binary data - though presumably
	// no more than 512 bits of data for a double length hash, since such a
	// code would take at least a couple of minutes to correctly transcribe.
	//
	// Though a Go programmer may never do a lot of this kind of algorithm
	// design, it is here especially for those who are inclined towards this
	// kind of low level encoding, which is part of any data encoding for wire,
	// storage, graphic and audio formats, and things like writing GUIs.
	//
	// The following formula ensures that there are always at least 2 and at
	// most 6 check bytes, in addition to the length prefix byte.
	//
	// We add two to the length before the modulus, as there must be 1 byte for
	// the check length and at least 1 byte of check.
	lengthMod := (2 + length) % 5
	// The modulus is subtracted from 5 to produce the complement required to
	// make the correct number of bytes of total data, plus 1 to account for
	// the minimum length of 1.
	checkLen = 5 - lengthMod + 1
	return checkLen
}
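
// As a worked example of the formula in getCheckLen: a 20 byte payload gives
// (2+20)%5 = 2, so checkLen = 5-2+1 = 4, and the Encoder below produces
// 20+4+1 = 25 bytes in total, a multiple of 5, which encodes to exactly 40
// base32 characters with no padding (before the leading 'q' is trimmed and
// the HRP prepended). A 32 byte payload gives (2+32)%5 = 4, so checkLen = 2
// and the total is 35 bytes, or 56 characters.
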
// getCutPoint is made into a function because it is needed more than once.
func getCutPoint(length, checkLen int) int {
	return length - checkLen - 1
}
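
// Note that with the encoded layout used below - a one byte check length
// prefix, then the payload, then the check bytes - calling getCutPoint with
// the total length plus one (as the Decoder does) yields the index of the
// first check byte.
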
// makeCodec generates our custom codec as above, into the exported Codec
// variable
//
// Here we demonstrate the use of closures. In this case, it is an
// initialization, but it can also be used in dynamic generation code, or to use
// the 'builder' pattern to construct larger algorithms out of small modular
// parts.
func makeCodec(
	name string,
	cs string,
	hrp string,
) (cdc *codec.Codec) {
	// Create the codec.Codec struct and put its pointer in the return variable.
	cdc = &codec.Codec{
		Name:    name,
		Charset: cs,
		HRP:     hrp,
	}
	// We need to create the check creation functions first.
	cdc.MakeCheck = func(input []byte, checkLen int) (output []byte) {
		// We use the Blake3 256 bit hash because it is nearly as fast as CRC32
		// but less complicated to use due to the 32 bit integer conversions to
		// bytes required to use the CRC32 algorithm.
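		//
		// Note that blake3.Sum256 returns a fixed size [32]byte array, so the
		// truncation below is always in range for the check lengths that
		// getCheckLen produces (at most 6 bytes).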
		checkArray := blake3.Sum256(input)
		// This truncates the blake3 hash to the prescribed check length.
		return checkArray[:checkLen]
	}
	// Create a base32.Encoding from the provided charset.
	enc := base32.NewEncoding(cdc.Charset)
	cdc.Encoder = func(input []byte) (output string, err error) {
		if len(input) < 1 {
			// Unfortunately there is a minor bug in the Go protobuf/grpc
			// generator that does not set the type of the errors to Error,
			// which is an alias of int32. Thus here we have to cast it to
			// int32 to retrieve the map entry containing the error name.
			//
			// You can see the error in ../proto/based32.pb.go which is what is
			// generated by protoc-gen-go.
			err = proto.Error_ZERO_LENGTH
			return
		}
		// The check length depends on the modulus of the length of the data,
		// in order to avoid padding.
		checkLen := getCheckLen(len(input))
		// The output is longer than the input, so we create a new buffer.
		outputBytes := make([]byte, len(input)+checkLen+1)
		// Add the check length byte to the front.
		outputBytes[0] = byte(checkLen)
		// Then copy the input bytes for the beginning segment.
		copy(outputBytes[1:len(input)+1], input)
		// Then copy the check to the end of the input.
		copy(outputBytes[len(input)+1:], cdc.MakeCheck(input, checkLen))
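		// At this point the buffer layout is:
		//
		//	[ 1 byte check length ][ len(input) payload bytes ][ checkLen check bytes ]
		//
		// and the total length is a multiple of 5 bytes, so the base32
		// encoding below needs no padding characters.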
		// Create the encoding for the output.
		outputString := enc.EncodeToString(outputBytes)
		// We can omit the first character of the encoding because the length
		// prefix never uses the first 5 bits of the first byte, so that
		// character is always 'q' (the zero symbol); the decoder adds it back
		// later.
		trimmedString := outputString[1:]
		// Prefix the output with the Human Readable Part and append the
		// encoded string version of the provided bytes.
		output = cdc.HRP + trimmedString
		return
	}
	cdc.Check = func(input []byte) (err error) {
		// We must do this check or the next statement will cause a bounds
		// check panic. Note that a nil slice and a zero length slice are
		// different things, though they behave the same here, so each gets its
		// own error. The nil case is tested first, because a nil slice also
		// has a length of zero.
		switch {
		case input == nil:
			err = proto.Error_NIL_SLICE
			return
		case len(input) < 1:
			err = proto.Error_ZERO_LENGTH
			return
		}
		// The check length is encoded into the first byte in order to ensure
		// the data is cut correctly to perform the integrity check.
		checkLen := int(input[0])
		// Ensure there are enough bytes in the input to run a check on.
		if len(input) < checkLen+1 {
			err = proto.Error_CHECK_TOO_SHORT
			return
		}
		// Find the index where the check bytes begin. We need this same value
		// twice so it must be made into a variable. The +1 accounts for the
		// check length prefix byte, matching the way the Decoder below calls
		// getCutPoint.
		cutPoint := getCutPoint(len(input)+1, checkLen)
		// Here is an example of a multiple assignment and more use of the
		// slicing operator.
		payload, checksum := input[1:cutPoint], string(input[cutPoint:])
		// A checksum is checked in all cases by taking the data received,
		// applying the checksum generation function to it, and comparing the
		// result to the checksum attached to the received data.
		//
		// Note the casting to string above and here. This makes a copy to the
		// immutable string, which is not optimal for large byte slices, but
		// for this short check value it is a cheap operation on the stack, and
		// an illustration of the interchangeability of []byte and string, with
		// the distinction that a comparison operator is available for strings
		// but not for []byte, so for such cases this conversion is a shortcut
		// method to compare byte slices.
		computedChecksum := string(cdc.MakeCheck(payload, checkLen))
		// Here we assign the result of the comparison to a variable whose name
		// makes the meaning of the following test clear.
		valid := checksum == computedChecksum
		if !valid {
			err = proto.Error_CHECK_FAILED
		}
		return
	}
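	// Note that Check expects the full decoded buffer, including the leading
	// check length byte, exactly as the Decoder below passes it.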
	cdc.Decoder = func(input string) (output []byte, err error) {
		// Other than for human identification, the HRP is also a validity
		// check, so if the string prefix is wrong, the entire value is wrong
		// and won't decode as expected.
		if !strings.HasPrefix(input, cdc.HRP) {
			log.Printf(
				"Provided string has incorrect human readable part: "+
					"found '%s' expected '%s'", input, cdc.HRP,
			)
			err = proto.Error_INCORRECT_HUMAN_READABLE_PART
			return
		}
		// Cut the HRP off the beginning to get the content, and restore the
		// initial zeroed 5 bits with a 'q' character.
		//
		// Be aware the input string will be copied to create the []byte
		// version. Also, because the encoded bytes always have zeroes in the
		// first 5 most significant bits, we must re-add the zero symbol 'q' at
		// the front before feeding the string to the decoder.
		input = "q" + input[len(cdc.HRP):]
		// The length of the base32 string refers to 5 bits per character
		// position, so the correct size of the output bytes, which hold 8 bits
		// per slice index position, is found with the following simple integer
		// math calculation.
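		//
		// For example, an encoded group of 8 base32 characters carries 40
		// bits, so a 40 character string decodes to 40*5/8 = 25 bytes.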
		//
		// This allocation needs to be made first, as the base32 Decode
		// function does not do this allocation automatically, and it would be
		// wasteful to not compute it precisely when the calculation is so
		// simple.
		//
		// If this allocation is omitted, the decoder will panic due to a
		// bounds check error. A nil slice is equivalent to a zero length slice
		// and gives a bounds check error, but in fact the slice has no data at
		// all. Yes, the panic message lies:
		//
		//	panic: runtime error: index out of range [4] with length 0
		//
		// If this assignment isn't made, output is nil by default, not
		// []byte{}, so the panic message is deceptive.
		data := make([]byte, len(input)*5/8)
		var writtenBytes int
		writtenBytes, err = enc.Decode(data, []byte(input))
		if err != nil {
			log.Println(err)
			return
		}
		// The first byte signifies the length of the check at the end.
		checkLen := int(data[0])
		if writtenBytes < checkLen+1 {
			err = proto.Error_CHECK_TOO_SHORT
			return
		}
		// Run the check here; even if it passes, the resulting decoded bytes
		// still need to be trimmed of the check length prefix and the check
		// value (keeping things cleanly separated between the check and decode
		// functions).
		err = cdc.Check(data)
		// There is no point in doing any more if the check fails, as per the
		// contract specified in the interface definition codecer.Codecer.
		if err != nil {
			return
		}
		// Slice off the check length prefix and the check bytes to return the
		// valid input bytes.
		output = data[1:getCutPoint(len(data)+1, checkLen)]
		// If we got to here, the decode was successful.
		return
	}
	// We return the value explicitly to be nice to readers, as the function is
	// not a short and simple one.
	return cdc
}
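
// The following is an illustrative sketch of how the codec could be exercised
// from a test file; the test name, the testing setup, and the use of
// bytes.Equal are assumptions for the example, not part of this package:
//
//	func TestRoundTrip(t *testing.T) {
//		payload := []byte("arbitrary test data")
//		encoded, err := Codec.Encoder(payload)
//		if err != nil {
//			t.Fatal(err)
//		}
//		decoded, err := Codec.Decoder(encoded)
//		if err != nil {
//			t.Fatal(err)
//		}
//		if !bytes.Equal(payload, decoded) {
//			t.Fatal("decoded bytes do not match the original payload")
//		}
//	}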