225 lines
5.4 KiB
Go
225 lines
5.4 KiB
Go
package bitpack
|
|
|
|
import (
|
|
"math"
|
|
)
|
|
|
|
// OffsetArray is an interface representing read-only views of arrays of 64 bits
|
|
// offsets.
|
|
type OffsetArray interface {
|
|
// Returns the value at index i.
|
|
//
|
|
// The method complexity may be anywhere between O(1) and O(N).
|
|
Index(i int) uint64
|
|
// Returns the number of offsets in the array.
|
|
//
|
|
// The method complexity must be O(1).
|
|
Len() int
|
|
}
|
|
|
|
// OffsetArrayLen is a helper function to access the length of an offset array.
|
|
// It is similar to calling Len on the array but handles the special case where
|
|
// the array is nil, in which case it returns zero.
|
|
func OffsetArrayLen(array OffsetArray) int {
|
|
if array != nil {
|
|
return array.Len()
|
|
}
|
|
return 0
|
|
}
|
|
|
|
// NewOffsetArray constructs a new array of offsets from the slice of values
|
|
// passed as argument. The slice is not retained, the returned array always
|
|
// holds a copy of the values.
|
|
//
|
|
// The underlying implementation of the offset array applies a compression
|
|
// mechanism derived from Frame-of-Reference and Delta Encoding to minimize
|
|
// the memory footprint of the array. This compression model works best when
|
|
// the input is made of ordered values, otherwise the deltas between values
|
|
// are likely to be too large to benefit from delta encoding.
|
|
//
|
|
// See https://lemire.me/blog/2012/02/08/effective-compression-using-frame-of-reference-and-delta-coding/
|
|
func NewOffsetArray(values []uint64) OffsetArray {
|
|
if len(values) == 0 {
|
|
return emptyOffsetArray{}
|
|
}
|
|
if len(values) <= smallOffsetArrayCapacity {
|
|
return newSmallOffsetArray(values)
|
|
}
|
|
|
|
maxDelta := uint64(0)
|
|
lastValue := values[0]
|
|
// TODO: the pre-processing we perform here can be optimized using SIMD
|
|
// instructions.
|
|
for _, value := range values[1:] {
|
|
if delta := value - lastValue; delta > maxDelta {
|
|
maxDelta = delta
|
|
}
|
|
lastValue = value
|
|
}
|
|
|
|
switch {
|
|
case maxDelta > math.MaxUint32:
|
|
return newOffsetArray(values)
|
|
case maxDelta > math.MaxUint16:
|
|
return newDeltaArray[uint32](values)
|
|
case maxDelta > math.MaxUint8:
|
|
return newDeltaArray[uint16](values)
|
|
case maxDelta > 15:
|
|
return newDeltaArray[uint8](values)
|
|
default:
|
|
return newDeltaArrayUint4(values)
|
|
}
|
|
}
|
|
|
|
type offsetArray struct {
|
|
values []uint64
|
|
}
|
|
|
|
func newOffsetArray(values []uint64) *offsetArray {
|
|
a := &offsetArray{
|
|
values: make([]uint64, len(values)),
|
|
}
|
|
copy(a.values, values)
|
|
return a
|
|
}
|
|
|
|
func (a *offsetArray) Index(i int) uint64 {
|
|
return a.values[i]
|
|
}
|
|
|
|
func (a *offsetArray) Len() int {
|
|
return len(a.values)
|
|
}
|
|
|
|
type emptyOffsetArray struct{}
|
|
|
|
func (emptyOffsetArray) Index(int) uint64 {
|
|
panic("index out of bounds")
|
|
}
|
|
|
|
func (emptyOffsetArray) Len() int {
|
|
return 0
|
|
}
|
|
|
|
const smallOffsetArrayCapacity = 7
|
|
|
|
type smallOffsetArray struct {
|
|
length int
|
|
values [smallOffsetArrayCapacity]uint64
|
|
}
|
|
|
|
func newSmallOffsetArray(values []uint64) *smallOffsetArray {
|
|
a := &smallOffsetArray{length: len(values)}
|
|
copy(a.values[:], values)
|
|
return a
|
|
}
|
|
|
|
func (a *smallOffsetArray) Index(i int) uint64 {
|
|
if i < 0 || i >= a.length {
|
|
panic("index out of bounds")
|
|
}
|
|
return a.values[i]
|
|
}
|
|
|
|
func (a *smallOffsetArray) Len() int {
|
|
return a.length
|
|
}
|
|
|
|
type uintType interface {
|
|
uint8 | uint16 | uint32 | uint64
|
|
}
|
|
|
|
type deltaArray[T uintType] struct {
|
|
deltas []T
|
|
firstValue uint64
|
|
}
|
|
|
|
func newDeltaArray[T uintType](values []uint64) *deltaArray[T] {
|
|
a := &deltaArray[T]{
|
|
deltas: make([]T, len(values)-1),
|
|
firstValue: values[0],
|
|
}
|
|
lastValue := values[0]
|
|
for i, value := range values[1:] {
|
|
a.deltas[i] = T(value - lastValue)
|
|
lastValue = value
|
|
}
|
|
return a
|
|
}
|
|
|
|
func (a *deltaArray[T]) Index(i int) uint64 {
|
|
if i < 0 || i >= a.Len() {
|
|
panic("index out of bounds")
|
|
}
|
|
value := a.firstValue
|
|
// TODO: computing the prefix sum can be vectorized;
|
|
// see https://en.algorithmica.org/hpc/algorithms/prefix/
|
|
for _, delta := range a.deltas[:i] {
|
|
value += uint64(delta)
|
|
}
|
|
return value
|
|
}
|
|
|
|
func (a *deltaArray[T]) Len() int {
|
|
return len(a.deltas) + 1
|
|
}
|
|
|
|
// deltaArrayUint4 is a specialization of deltaArray which packs 4 bits integers
|
|
// to hold deltas between 0 and 15; based on the analysis of compiling Python,
|
|
// it appeared that most source offset deltas were under 16, so using this
|
|
// data structure cuts by 50% the memory needed compared to deltaArray[uint8].
|
|
//
|
|
// Here is the distribution of source offset deltas for Python 3.13:
|
|
//
|
|
// - <=15 : 10240
|
|
// - <=255 : 9565
|
|
// - <=65535 : 1163
|
|
//
|
|
// Memory profiles showed that using deltaArrayUint4 (compared to deltaArray[T])
|
|
// dropped the memory footprint of source mappings for Python from 6MB to 4.5MB.
|
|
type deltaArrayUint4 struct {
|
|
deltas []byte
|
|
numValues int
|
|
firstValue uint64
|
|
}
|
|
|
|
func newDeltaArrayUint4(values []uint64) *deltaArrayUint4 {
|
|
a := &deltaArrayUint4{
|
|
deltas: make([]byte, len(values)/2+1),
|
|
numValues: len(values),
|
|
firstValue: values[0],
|
|
}
|
|
lastValue := values[0]
|
|
for i, value := range values[1:] {
|
|
a.assign(i, value-lastValue)
|
|
lastValue = value
|
|
}
|
|
return a
|
|
}
|
|
|
|
func (a *deltaArrayUint4) assign(i int, v uint64) {
|
|
index, shift := uint(i)>>1, 4*(uint(i)&1)
|
|
a.deltas[index] &= ^(0xF << shift)
|
|
a.deltas[index] |= byte(v) << shift
|
|
}
|
|
|
|
func (a *deltaArrayUint4) index(i int) uint64 {
|
|
index, shift := uint(i)>>1, 4*(uint(i)&1)
|
|
return uint64((a.deltas[index] >> shift) & 0xF)
|
|
}
|
|
|
|
func (a *deltaArrayUint4) Index(i int) uint64 {
|
|
if i < 0 || i >= a.Len() {
|
|
panic("index out of bounds")
|
|
}
|
|
value := a.firstValue
|
|
for j := 0; j < i; j++ {
|
|
value += a.index(j)
|
|
}
|
|
return value
|
|
}
|
|
|
|
func (a *deltaArrayUint4) Len() int {
|
|
return a.numValues
|
|
}
|