mirror of
https://gitlab.com/pulsechaincom/erigon-pulse.git
synced 2025-01-10 04:51:20 +00:00
ac40ca5269
Mainnet: ``` 447M accounts.0-544.l 45M accounts.0-544.li 133M code.0-544.l 14M code.0-544.li 2.0G storage.0-544.l 197M storage.0-544.li ``` Decided not to use Roaring, because it can only keep the full bitmap in RAM (there is no way to stream it into a file), even though it is more compact (2 GB -> 1.4 GB). We could perhaps shard the large bitmap or use some other trick (storage has 1B keys, so sharding is probably cheap). Maybe in the future.
184 lines
5.7 KiB
Go
184 lines
5.7 KiB
Go
/*
|
|
Copyright 2021 Erigon contributors
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
*/
|
|
|
|
package recsplit
|
|
|
|
import (
|
|
"encoding/binary"
|
|
"io"
|
|
"math/bits"
|
|
"unsafe"
|
|
|
|
"github.com/ledgerwatch/erigon-lib/common/bitutil"
|
|
)
|
|
|
|
// bijMemo lists the optimal Golomb-Rice parameters for leaves.
// NOTE(review): presumably indexed by leaf size (number of keys in the leaf) - confirm
// against the code that reads this table.
var bijMemo = []uint32{
	0, 0, 0, 1, 3, 4, 5, 7, 8, 10,
	11, 12, 14, 15, 16, 18, 19, 21, 22, 23,
	25, 26, 28, 29, 30,
}
|
|
|
|
// GolombRice builds up the Golomb-Rice encoding of a sequence of numbers as a bit stream
// packed into 64-bit words.
type GolombRice struct {
	// data holds the packed bit stream: bit i of the stream is bit i%64 of data[i/64].
	data []uint64
	// bitCount is the number of bits added to the encoding so far (specific to the builder).
	bitCount int
}
|
|
|
|
// appendUnaryAll adds the unary encoding of specified sequence of numbers to the end of the
|
|
// current encoding
|
|
func (g *GolombRice) appendUnaryAll(unary []uint64) {
|
|
bitInc := 0
|
|
for _, u := range unary {
|
|
// Each number u uses u+1 bits for its unary representation
|
|
bitInc += int(u) + 1
|
|
}
|
|
targetSize := (g.bitCount + bitInc + 63) / 64
|
|
for len(g.data) < targetSize {
|
|
g.data = append(g.data, 0)
|
|
}
|
|
|
|
for _, u := range unary {
|
|
g.bitCount += int(u)
|
|
appendPtr := g.bitCount / 64
|
|
g.data[appendPtr] |= uint64(1) << (g.bitCount & 63)
|
|
g.bitCount++
|
|
}
|
|
}
|
|
|
|
// appendFixed encodes the next value using specified Golomb parameter. Since we are using Golomb-Rice encoding,
|
|
// all Golomb parameters are powers of two. Therefore we input log2 of golomb parameter, rather than golomn paramter itself,
|
|
// for convinience
|
|
func (g *GolombRice) appendFixed(v uint64, log2golomb int) {
|
|
if log2golomb == 0 {
|
|
return
|
|
}
|
|
lowerBits := v & ((uint64(1) << log2golomb) - 1) // Extract the part of the number that will be encoded using truncated binary encoding
|
|
usedBits := g.bitCount & 63 // How many bits of the last element of b.data is used by previous value
|
|
targetSize := (g.bitCount + log2golomb + 63) / 64
|
|
//fmt.Printf("g.bitCount = %d, log2golomb = %d, targetSize = %d\n", g.bitCount, log2golomb, targetSize)
|
|
for len(g.data) < targetSize {
|
|
g.data = append(g.data, 0)
|
|
}
|
|
appendPtr := g.bitCount / 64 // The index in b.data corresponding to the last element used by previous value, or if previous values fits perfectly, the index of the next free element
|
|
curWord := g.data[appendPtr]
|
|
curWord |= lowerBits << usedBits // curWord now contains the new value potentially combined with the part of the previous value
|
|
if usedBits+log2golomb > 64 {
|
|
// New value overflows to the next element
|
|
g.data[appendPtr] = curWord
|
|
appendPtr++
|
|
curWord = lowerBits >> (64 - usedBits) // curWord now contains the part of the new value that overflows
|
|
}
|
|
g.data[appendPtr] = curWord
|
|
g.bitCount += log2golomb
|
|
}
|
|
|
|
// Bits returns currrent number of bits in the compact encoding of the hash function representation
|
|
func (g *GolombRice) Bits() int {
|
|
return g.bitCount
|
|
}
|
|
|
|
func (g *GolombRiceReader) ReadReset(bitPos, unaryOffset int) {
|
|
g.currFixedOffset = bitPos
|
|
unaryPos := bitPos + unaryOffset
|
|
g.currPtrUnary = unaryPos / 64
|
|
g.currWindowUnary = g.data[g.currPtrUnary] >> (unaryPos & 63)
|
|
g.currPtrUnary++
|
|
g.validLowerBitsUnary = 64 - (unaryPos & 63)
|
|
}
|
|
|
|
func (g *GolombRiceReader) SkipSubtree(nodes, fixedLen int) {
|
|
if nodes <= 0 {
|
|
panic("nodes <= 0")
|
|
}
|
|
missing := nodes
|
|
var cnt int
|
|
for cnt = bits.OnesCount64(g.currWindowUnary); cnt < missing; cnt = bits.OnesCount64(g.currWindowUnary) {
|
|
g.currWindowUnary = g.data[g.currPtrUnary]
|
|
g.currPtrUnary++
|
|
missing -= cnt
|
|
g.validLowerBitsUnary = 64
|
|
}
|
|
cnt = bitutil.Select64(g.currWindowUnary, missing-1)
|
|
g.currWindowUnary >>= cnt
|
|
g.currWindowUnary >>= 1
|
|
g.validLowerBitsUnary -= cnt + 1
|
|
|
|
g.currFixedOffset += fixedLen
|
|
}
|
|
|
|
func (g *GolombRiceReader) ReadNext(log2golomb int) uint64 {
|
|
var result uint64
|
|
|
|
if g.currWindowUnary == 0 {
|
|
result += uint64(g.validLowerBitsUnary)
|
|
g.currWindowUnary = g.data[g.currPtrUnary]
|
|
g.currPtrUnary++
|
|
g.validLowerBitsUnary = 64
|
|
for g.currWindowUnary == 0 {
|
|
result += 64
|
|
g.currWindowUnary = g.data[g.currPtrUnary]
|
|
g.currPtrUnary++
|
|
}
|
|
}
|
|
|
|
pos := bits.TrailingZeros64(g.currWindowUnary)
|
|
|
|
g.currWindowUnary >>= pos
|
|
g.currWindowUnary >>= 1
|
|
g.validLowerBitsUnary -= pos + 1
|
|
|
|
result += uint64(pos)
|
|
result <<= log2golomb
|
|
|
|
idx64 := g.currFixedOffset >> 6
|
|
var fixed uint64
|
|
shift := g.currFixedOffset & 63
|
|
fixed = g.data[idx64] >> shift
|
|
if shift+log2golomb > 64 {
|
|
fixed |= g.data[idx64+1] << (64 - shift)
|
|
}
|
|
result |= fixed & ((uint64(1) << log2golomb) - 1)
|
|
g.currFixedOffset += log2golomb
|
|
return result
|
|
}
|
|
|
|
// Data returns the binary representation of the Golomb-Rice code that is built
|
|
func (g *GolombRice) Data() []uint64 {
|
|
return g.data
|
|
}
|
|
|
|
// maxDataSize is the length, in bytes, of the array type that Write uses to reinterpret
// the encoded words as a byte slice; Write then reslices that view to the real length.
const maxDataSize = 1<<48 - 1
|
|
|
|
// Write outputs the state of golomb rice encoding into a writer, which can be recovered later by Read
|
|
func (g *GolombRice) Write(w io.Writer) error {
|
|
var numBuf [8]byte
|
|
binary.BigEndian.PutUint64(numBuf[:], uint64(len(g.data)))
|
|
if _, e := w.Write(numBuf[:]); e != nil {
|
|
return e
|
|
}
|
|
p := (*[maxDataSize]byte)(unsafe.Pointer(&g.data[0]))
|
|
b := (*p)[:]
|
|
if _, e := w.Write(b[:len(g.data)*8]); e != nil {
|
|
return e
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// GolombRiceReader reads numbers back from a Golomb-Rice encoded bit stream. It keeps two
// independent read positions in the same data: one for the fixed-width parts and one for
// the unary parts.
type GolombRiceReader struct {
	// data holds the encoded bit stream packed into 64-bit words.
	data []uint64
	// currFixedOffset is the bit offset in data of the next fixed-width part to read.
	currFixedOffset int
	// currWindowUnary holds the not yet consumed bits of the current unary word, shifted
	// so that the next unread bit is at position 0.
	currWindowUnary uint64
	// currPtrUnary is the index in data of the next word to load into currWindowUnary.
	currPtrUnary int
	// validLowerBitsUnary is the number of low bits of currWindowUnary that still belong
	// to the stream; the bits above them are zeros introduced by shifting.
	validLowerBitsUnary int
}
|