/*
   Copyright 2021 Erigon contributors

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/
package recsplit
import (
"encoding/binary"
"fmt"
"math"
"math/bits"
"github.com/ledgerwatch/erigon-lib/etl"
"github.com/spaolacci/murmur3"
)
// RecSplitLogPrefix is the log prefix used when closing/loading the ETL collector.
const RecSplitLogPrefix = "recsplit"

// MaxLeafSize is the maximum allowed value for RecSplitArgs.LeafSize (enforced in NewRecSplit).
const MaxLeafSize = 24
// remix implements David Stafford's (http://zimbry.blogspot.com/2011/09/better-bit-mixing-improving-on.html)
// 13th variant of the 64-bit finalizer function in Austin Appleby's
// MurmurHash3 (https://github.com/aappleby/smhasher).
// Every step (xor-shift, odd-constant multiply) is invertible, so remix is a bijection on uint64.
// It takes a 64-bit integer z and returns a 64-bit integer obtained by mixing the bits of z.
func remix(z uint64) uint64 {
	const mul1 = 0xbf58476d1ce4e5b9
	const mul2 = 0x94d049bb133111eb
	z ^= z >> 30
	z *= mul1
	z ^= z >> 27
	z *= mul2
	z ^= z >> 31
	return z
}
// RecSplit is the implementation of Recursive Split algorithm for constructing perfect hash mapping, described in
// https://arxiv.org/pdf/1910.06416.pdf Emmanuel Esposito, Thomas Mueller Graf, and Sebastiano Vigna.
// Recsplit: Minimal perfect hashing via recursive splitting. In 2020 Proceedings of the Symposium on Algorithm Engineering and Experiments (ALENEX),
// pages 175-185. SIAM, 2020.
type RecSplit struct {
	bucketSize       int             // Target number of keys per initial bucket
	keyExpectedCount uint64          // Number of keys in the hash table
	keysAdded        uint64          // Number of keys actually added to the recSplit (to check the match with keyExpectedCount)
	bucketCount      uint64          // Number of buckets
	hasher           murmur3.Hash128 // Salted hash function to use for splitting into initial buckets and mapping to 64-bit fingerprints
	collector        *etl.Collector  // Spills (bucket, fingerprint) records to disk so key count is not limited by RAM
	built            bool            // Flag indicating that the hash function has been built and no more keys can be added
	currentBucketIdx uint64          // Current bucket being accumulated
	currentBucket    []uint64        // 64-bit fingerprints of keys in the current bucket accumulated before the recsplit is performed for that bucket
	gr               GolombRice      // Helper object to encode the tree of hash function salts using Golomb-Rice code.
	// Helper object to encode the sequence of cumulative number of keys in the buckets
	// and the sequence of cumulative bit offsets of buckets in the Golomb-Rice code.
	ef                 DoubleEliasFano
	bucketSizeAcc      []uint64 // Bucket size accumulator
	bucketPosAcc       []uint64 // Accumulator for position of every bucket in the encoding of the hash function
	leafSize           uint16   // Leaf size for recursive split algorithm
	primaryAggrBound   uint16   // The lower bound for primary key aggregation (computed from leafSize)
	secondaryAggrBound uint16   // The lower bound for secondary key aggregation (computed from leafSize)
	startSeed          []uint64 // Per-recursion-level initial hash seeds (salts)
	golombRice         []uint32 // Lazily extended cache of packed Golomb-Rice parameters / subtree sizes, indexed by key count
	buffer             []uint64 // Scratch space used to reorder fingerprints while splitting a bucket
	count              []uint16 // Scratch space for per-part key counts during a split
	salt               uint32   // Murmur3 hash seed used for converting keys to 64-bit values and assigning to buckets
	collision          bool     // Set when a duplicate 64-bit fingerprint was detected (see Collision / ResetNextSalt)
	tmpDir             string   // Directory used by the ETL collector for spill files
	trace              bool     // Enables debug trace output during construction and lookup
}
// RecSplitArgs collects the construction parameters for NewRecSplit.
type RecSplitArgs struct {
	KeyCount   int    // Total number of keys that will be added before Build
	BucketSize int    // Target number of keys per initial bucket
	Salt       uint32 // Hash seed (salt) for the hash function used for allocating the initial buckets - need to be generated randomly
	LeafSize   uint16 // Leaf size for the recursive split algorithm (must not exceed MaxLeafSize)
	TmpDir     string // Directory for the ETL collector spill files
	// For each level of recursive split, the hash seed (salt) used for that level - need to be
	// generated randomly and be large enough to accommodate all the levels
	StartSeed []uint64
}
// NewRecSplit creates a new RecSplit instance with given number of keys and given bucket size
// Typical bucket size is 100 - 2000, larger bucket sizes result in smaller representations of hash functions, at a cost of slower access
// salt parameters is used to randomise the hash function construction, to ensure that different Erigon instances (nodes)
// are likely to use different hash function, to collision attacks are unlikely to slow down any meaningful number of nodes at the same time
func NewRecSplit ( args RecSplitArgs ) ( * RecSplit , error ) {
bucketCount := ( args . KeyCount + args . BucketSize - 1 ) / args . BucketSize
2021-09-18 21:59:27 +00:00
rs := & RecSplit { bucketSize : args . BucketSize , keyExpectedCount : uint64 ( args . KeyCount ) , bucketCount : uint64 ( bucketCount ) }
2021-09-20 11:14:49 +00:00
rs . salt = args . Salt
2021-09-20 16:39:32 +00:00
rs . hasher = murmur3 . New128WithSeed ( rs . salt )
2021-09-20 11:14:49 +00:00
rs . tmpDir = args . TmpDir
rs . collector = etl . NewCollector ( rs . tmpDir , etl . NewSortableBuffer ( etl . BufferOptimalSize ) )
rs . currentBucket = make ( [ ] uint64 , 0 , args . BucketSize )
2021-09-18 21:59:27 +00:00
rs . bucketSizeAcc = make ( [ ] uint64 , 1 , bucketCount + 1 )
rs . bucketPosAcc = make ( [ ] uint64 , 1 , bucketCount + 1 )
2021-09-13 17:31:09 +00:00
if args . LeafSize > MaxLeafSize {
return nil , fmt . Errorf ( "exceeded max leaf size %d: %d" , MaxLeafSize , args . LeafSize )
}
rs . leafSize = args . LeafSize
2021-09-20 16:39:32 +00:00
rs . primaryAggrBound = rs . leafSize * uint16 ( math . Max ( 2 , math . Ceil ( 0.35 * float64 ( rs . leafSize ) + 1. / 2. ) ) )
2021-09-13 17:31:09 +00:00
if rs . leafSize < 7 {
rs . secondaryAggrBound = rs . primaryAggrBound * 2
} else {
2021-09-20 16:39:32 +00:00
rs . secondaryAggrBound = rs . primaryAggrBound * uint16 ( math . Ceil ( 0.21 * float64 ( rs . leafSize ) + 9. / 10. ) )
2021-09-13 17:31:09 +00:00
}
rs . startSeed = args . StartSeed
2021-09-20 16:39:32 +00:00
rs . count = make ( [ ] uint16 , rs . secondaryAggrBound )
2021-09-13 17:31:09 +00:00
return rs , nil
}
// SetTrace enables or disables debug trace output during construction.
func (rs *RecSplit) SetTrace(trace bool) {
	rs.trace = trace
}
// remap converts the number x, assumed to be uniformly distributed over the range [0..2^64),
// into a number uniformly distributed over the range [0..n). It uses the multiply-shift
// technique: the upper half of the 128-bit product x*n equals floor(x*n / 2^64), which
// avoids the modulo bias and the cost of a division.
func remap(x uint64, n uint64) uint64 {
	upper, _ := bits.Mul64(x, n)
	return upper
}
// mask48 selects the low 48 bits of a 64-bit value.
const mask48 uint64 = (1 << 48) - 1

// remap16 converts the number x, assumed to be uniformly distributed over the range [0..2^64),
// into a number uniformly distributed over the range [0..n), under the assumption that n is
// less than 2^16. Only the low 48 bits of x participate, so the product (48+16 bits) cannot
// overflow a uint64.
func remap16(x uint64, n uint16) uint16 {
	scaled := (x & mask48) * uint64(n)
	return uint16(scaled >> 48)
}
2021-09-20 11:14:49 +00:00
// ResetNextSalt resets the RecSplit and uses the next salt value to try to avoid collisions
// when mapping keys to 64-bit values
func ( rs * RecSplit ) ResetNextSalt ( ) {
rs . collision = false
rs . keysAdded = 0
rs . salt ++
2021-09-20 16:39:32 +00:00
rs . hasher = murmur3 . New128WithSeed ( rs . salt )
2021-09-20 11:14:49 +00:00
rs . collector = etl . NewCollector ( rs . tmpDir , etl . NewSortableBuffer ( etl . BufferOptimalSize ) )
rs . currentBucket = rs . currentBucket [ : 0 ]
rs . bucketSizeAcc = rs . bucketSizeAcc [ : 1 ] // First entry is always zero
rs . bucketPosAcc = rs . bucketPosAcc [ : 0 ] // First entry is always zero
}
2021-09-20 16:39:32 +00:00
func ( rs * RecSplit ) splitParams ( m uint16 ) ( fanout , unit uint16 ) {
2021-09-18 21:59:27 +00:00
if m > rs . secondaryAggrBound { // High-level aggregation (fanout 2)
unit = rs . secondaryAggrBound * ( ( ( m + 1 ) / 2 + rs . secondaryAggrBound - 1 ) / rs . secondaryAggrBound )
fanout = 2
} else if m > rs . primaryAggrBound { // Second-level aggregation
unit = rs . primaryAggrBound
fanout = ( m + rs . primaryAggrBound - 1 ) / rs . primaryAggrBound
} else { // First-level aggregation
unit = rs . leafSize
fanout = ( m + rs . leafSize - 1 ) / rs . leafSize
}
return
}
// computeGolombRice fills table[m] with three packed fields describing the split subtree
// for m keys: bits 27-31 hold the Golomb-Rice parameter (log2 of the Golomb modulus) for
// the splitting salt, bits 16-26 hold the number of nodes in the subtree, and bits 0-15
// hold the total bit length of the Golomb-Rice codes in the subtree. Entries for all
// part sizes produced by the split must already be present in table.
func (rs *RecSplit) computeGolombRice(m uint16, table []uint32) {
	fanout, unit := rs.splitParams(m)
	// k[i] is the number of keys in part i: unit for all parts, remainder in the last
	k := make([]uint16, fanout)
	k[fanout-1] = m
	for i := uint16(0); i < fanout-1; i++ {
		k[i] = unit
		k[fanout-1] -= k[i]
	}
	sqrt_prod := float64(1)
	for i := uint16(0); i < fanout; i++ {
		sqrt_prod *= math.Sqrt(float64(k[i]))
	}
	// p estimates the probability that a random salt splits the m keys into exactly the part sizes k[i]
	p := math.Sqrt(float64(m)) / (math.Pow(2*math.Pi, (float64(fanout)-1.)/2.0) * sqrt_prod)
	golombRiceLength := uint32(math.Ceil(math.Log2(-math.Log((math.Sqrt(5)+1.0)/2.0) / math.Log1p(-p)))) // log2 Golomb modulus
	if golombRiceLength > 0x1F {
		// Parameter must fit in the 5 bits reserved at positions 27-31
		panic("golombRiceLength > 0x1F")
	}
	table[m] = golombRiceLength << 27
	for i := uint16(0); i < fanout; i++ {
		golombRiceLength += table[k[i]] & 0xFFFF
	}
	if golombRiceLength > 0xFFFF {
		// Total code length must fit in the lower 16 bits
		panic("golombRiceLength > 0xFFFF")
	}
	table[m] |= golombRiceLength // Sum of Golomb-Rice code lengths in the subtree, stored in the lower 16 bits
	nodes := uint32(1)
	for i := uint16(0); i < fanout; i++ {
		nodes += (table[k[i]] >> 16) & 0x7FF
	}
	if rs.leafSize >= 3 && nodes > 0x7FF {
		// Node count must fit in the 11 bits at positions 16-26
		panic("rs.leafSize >= 3 && nodes > 0x7FF")
	}
	table[m] |= nodes << 16
}
// golombParam returns the optimal Golomb parameter to use for encoding
// salt for the part of the hash function separating m elements. It is based on
// calculations with assumptions that we draw hash functions at random
2021-09-20 16:39:32 +00:00
func ( rs * RecSplit ) golombParam ( m uint16 ) int {
s := uint16 ( len ( rs . golombRice ) )
2021-09-18 21:59:27 +00:00
for m >= s {
rs . golombRice = append ( rs . golombRice , 0 )
// For the case where bucket is larger than planned
if s == 0 {
rs . golombRice [ 0 ] = ( bijMemo [ 0 ] << 27 ) | bijMemo [ 0 ]
} else if s <= rs . leafSize {
rs . golombRice [ s ] = ( bijMemo [ s ] << 27 ) | ( uint32 ( 1 ) << 16 ) | bijMemo [ s ]
} else {
rs . computeGolombRice ( s , rs . golombRice )
}
s ++
}
return int ( rs . golombRice [ m ] >> 27 )
}
// Add key to the RecSplit. There can be many more keys than what fits in RAM, and RecSplit
// spills data onto disk to accommodate that. The key gets copied by the collector, therefore
// the slice underlying key is not getting accessed by RecSplit after this invocation.
func (rs *RecSplit) AddKey(key []byte) error {
	if rs.built {
		return fmt.Errorf("cannot add keys after perfect hash function had been built")
	}
	rs.hasher.Reset()
	rs.hasher.Write(key) //nolint:errcheck
	hi, lo := rs.hasher.Sum128()
	// The collector record key is 16 bytes: big-endian bucket number (so records sort by
	// bucket) followed by the big-endian 64-bit fingerprint of the key
	var bucketKey [16]byte
	binary.BigEndian.PutUint64(bucketKey[:], remap(hi, rs.bucketCount))
	binary.BigEndian.PutUint64(bucketKey[8:], lo)
	rs.keysAdded++
	return rs.collector.Collect(bucketKey[:], []byte{})
}
// recsplitCurrentBucket applies the recursive split algorithm to the accumulated bucket,
// appends its encoding to the Golomb-Rice code, updates the bucket size and position
// accumulators, and clears the bucket ready for the next one. Returns an error (and sets
// rs.collision) if two keys in the bucket share the same 64-bit fingerprint.
func (rs *RecSplit) recsplitCurrentBucket() error {
	// Extend rs.bucketSizeAcc to accommodate current bucket index + 1
	for len(rs.bucketSizeAcc) <= int(rs.currentBucketIdx)+1 {
		rs.bucketSizeAcc = append(rs.bucketSizeAcc, rs.bucketSizeAcc[len(rs.bucketSizeAcc)-1])
	}
	rs.bucketSizeAcc[int(rs.currentBucketIdx)+1] += uint64(len(rs.currentBucket))
	if len(rs.currentBucket) > 1 {
		// Fingerprints arrive in sorted order (the collector sorts records by its 16-byte
		// key, which ends in the fingerprint), so duplicates are adjacent
		for i, key := range rs.currentBucket[1:] {
			if key == rs.currentBucket[i] {
				rs.collision = true
				return fmt.Errorf("duplicate key %x", key)
			}
		}
		bitPos := rs.gr.bitCount
		// Ensure the scratch buffer can hold the whole bucket for reordering
		if rs.buffer == nil {
			rs.buffer = make([]uint64, len(rs.currentBucket))
		} else {
			for len(rs.buffer) < len(rs.currentBucket) {
				rs.buffer = append(rs.buffer, 0)
			}
		}
		unary := rs.recsplit(0 /* level */, rs.currentBucket, nil /* unary */)
		rs.gr.appendUnaryAll(unary)
		if rs.trace {
			fmt.Printf("recsplitBucket(%d, %d, bitsize = %d)\n", rs.currentBucketIdx, len(rs.currentBucket), rs.gr.bitCount-bitPos)
		}
	}
	// Extend rs.bucketPosAcc to accommodate current bucket index + 1
	for len(rs.bucketPosAcc) <= int(rs.currentBucketIdx)+1 {
		rs.bucketPosAcc = append(rs.bucketPosAcc, rs.bucketPosAcc[len(rs.bucketPosAcc)-1])
	}
	rs.bucketPosAcc[int(rs.currentBucketIdx)+1] = uint64(rs.gr.Bits())
	// clear for the next bucket
	rs.currentBucket = rs.currentBucket[:0]
	return nil
}
// recsplit applies recSplit algorithm to the given bucket. It appends the Golomb-Rice
// encoded splitting/bijection salts to rs.gr, collects their unary parts into the unary
// slice, and returns the (possibly grown) unary slice.
func (rs *RecSplit) recsplit(level int, bucket []uint64, unary []uint64) []uint64 {
	if rs.trace {
		fmt.Printf("recsplit(%d, %d, %x)\n", level, len(bucket), bucket)
	}
	// Pick initial salt for this level of recursive split
	salt := rs.startSeed[level]
	m := uint16(len(bucket))
	if m <= rs.leafSize {
		// No need to build aggregation levels - just find bijection:
		// search for a salt that maps all m fingerprints to distinct slots in [0..m)
		var mask uint32
		for {
			mask = 0
			var fail bool
			for i := uint16(0); !fail && i < m; i++ {
				bit := uint32(1) << remap16(remix(bucket[i]+salt), m)
				if mask&bit != 0 {
					fail = true
				} else {
					mask |= bit
				}
			}
			if !fail {
				break
			}
			salt++
		}
		// Store the salt increment relative to the level's start seed
		salt -= rs.startSeed[level]
		log2golomb := rs.golombParam(m)
		if rs.trace {
			fmt.Printf("encode bij %d with log2golomn %d at p = %d\n", salt, log2golomb, rs.gr.bitCount)
		}
		rs.gr.appendFixed(salt, log2golomb)
		unary = append(unary, salt>>log2golomb)
	} else {
		fanout, unit := rs.splitParams(m)
		count := rs.count
		for {
			// Search for a salt that splits the keys into parts of exactly `unit` keys
			// (the last part gets the remainder by construction)
			for i := uint16(0); i < fanout-1; i++ {
				count[i] = 0
			}
			var fail bool
			for i := uint16(0); i < m; i++ {
				count[remap16(remix(bucket[i]+salt), m)/unit]++
			}
			for i := uint16(0); i < fanout-1; i++ {
				fail = fail || (count[i] != unit)
			}
			if !fail {
				break
			}
			salt++
		}
		// Turn counts into the starting offset of each part
		for i, c := uint16(0), uint16(0); i < fanout; i++ {
			count[i] = c
			c += unit
		}
		// Reorder the bucket (via the scratch buffer) so each part is contiguous
		for i := uint16(0); i < m; i++ {
			j := remap16(remix(bucket[i]+salt), m) / unit
			rs.buffer[count[j]] = bucket[i]
			count[j]++
		}
		copy(bucket, rs.buffer)
		salt -= rs.startSeed[level]
		log2golomb := rs.golombParam(m)
		if rs.trace {
			fmt.Printf("encode fanout %d: %d with log2golomn %d at p = %d\n", fanout, salt, log2golomb, rs.gr.bitCount)
		}
		rs.gr.appendFixed(salt, log2golomb)
		unary = append(unary, salt>>log2golomb)
		// Recurse into each part; parts of size 1 need no encoding
		var i uint16
		for i = 0; i < m-unit; i += unit {
			unary = rs.recsplit(level+1, bucket[i:i+unit], unary)
		}
		if m-i > 1 {
			unary = rs.recsplit(level+1, bucket[i:], unary)
		}
	}
	return unary
}
// loadFunc is required to satisfy the type etl.LoadFunc type, to use with collector.Load.
// It accumulates fingerprints per bucket and runs the recursive split each time the
// bucket number changes.
func (rs *RecSplit) loadFunc(k, v []byte, table etl.CurrentTableReader, next etl.LoadNextFunc) error {
	// k is the BigEndian encoding of the bucket number, and the v is the key that is assigned into that bucket
	bucketIdx := binary.BigEndian.Uint64(k)
	if rs.currentBucketIdx != bucketIdx {
		// Flush the previous bucket, unless this is the very first record
		// (Build marks that state with currentBucketIdx == MaxUint64)
		if rs.currentBucketIdx != math.MaxUint64 {
			if err := rs.recsplitCurrentBucket(); err != nil {
				return err
			}
		}
		rs.currentBucketIdx = bucketIdx
	}
	// Bytes 8-15 of the collector key carry the 64-bit fingerprint (see AddKey)
	rs.currentBucket = append(rs.currentBucket, binary.BigEndian.Uint64(k[8:]))
	return nil
}
// Build has to be called after all the keys have been added, and it initiates the process
// of building the perfect hash function. It returns an error if the number of added keys
// does not match the expected count, or if a fingerprint collision is found (in which
// case Collision() reports true and ResetNextSalt() should be used before retrying).
func (rs *RecSplit) Build() error {
	if rs.built {
		return fmt.Errorf("already built")
	}
	if rs.keysAdded != rs.keyExpectedCount {
		return fmt.Errorf("expected keys %d, got %d", rs.keyExpectedCount, rs.keysAdded)
	}
	rs.currentBucketIdx = math.MaxUint64 // To make sure 0 bucket is detected
	defer rs.collector.Close(RecSplitLogPrefix)
	if err := rs.collector.Load(RecSplitLogPrefix, nil /* db */, "" /* toBucket */, rs.loadFunc, etl.TransformArgs{}); err != nil {
		return err
	}
	// Process the last bucket - loadFunc only flushes a bucket when the next one starts
	if len(rs.currentBucket) > 0 {
		if err := rs.recsplitCurrentBucket(); err != nil {
			return err
		}
	}
	rs.gr.appendFixed(1, 1) // Sentinel (avoids checking for parts of size 1)
	// Construct Elias Fano index
	rs.ef.Build(rs.bucketSizeAcc, rs.bucketPosAcc)
	rs.built = true
	return nil
}
// skipBits returns the total bit length of the Golomb-Rice codes in the split subtree
// for m keys (the lower 16 bits of the cached table entry, see computeGolombRice).
func (rs *RecSplit) skipBits(m uint16) int {
	return int(rs.golombRice[m] & 0xffff)
}
// skipNodes returns the number of nodes in the split subtree for m keys
// (bits 16-26 of the cached table entry, see computeGolombRice).
func (rs *RecSplit) skipNodes(m uint16) int {
	return int(rs.golombRice[m]>>16) & 0x7FF
}
// Lookup maps the given key to an integer in the range [0..keyCount). For keys that were
// added before Build this is the perfect-hash value; for any other key the result is an
// arbitrary value in the range.
func (rs *RecSplit) Lookup(key []byte, trace bool) int {
	rs.hasher.Reset()
	rs.hasher.Write(key) //nolint:errcheck
	bucketHash, fingerprint := rs.hasher.Sum128()
	if trace {
		fmt.Printf("lookup key %x, fingerprint %x\n", key, fingerprint)
	}
	bucket := remap(bucketHash, rs.bucketCount)
	// cumKeys: number of keys in all buckets before this one;
	// bitPos: bit offset of this bucket's subtree within the Golomb-Rice code
	cumKeys, cumKeysNext, bitPos := rs.ef.Get3(bucket)
	m := uint16(cumKeysNext - cumKeys) // Number of keys in this bucket
	if trace {
		fmt.Printf("bucket: %d, m = %d, bitPos = %d, unaryOffset = %d\n", bucket, m, bitPos, rs.skipBits(m))
	}
	rs.gr.ReadReset(int(bitPos), rs.skipBits(m))
	var level int
	var p int
	// Descend the high-level (fanout-2) aggregation levels of the split tree
	for m > rs.secondaryAggrBound { // fanout = 2
		if trace {
			p = rs.gr.currFixedOffset
		}
		d := rs.gr.ReadNext(rs.golombParam(m))
		if trace {
			fmt.Printf("level %d, p = %d, d = %d golomb %d\n", level, p, d, rs.golombParam(m))
		}
		hmod := remap16(remix(fingerprint+rs.startSeed[level]+d), m)
		// split is the number of keys in the left half of this fanout-2 node
		split := (((m+1)/2 + rs.secondaryAggrBound - 1) / rs.secondaryAggrBound) * rs.secondaryAggrBound
		if hmod < split {
			m = split
		} else {
			// Key is in the right half: skip the left subtree's encoding
			rs.gr.SkipSubtree(rs.skipNodes(split), rs.skipBits(split))
			m -= split
			cumKeys += uint64(split)
		}
		level++
	}
	// Secondary aggregation level (parts of size primaryAggrBound)
	if m > rs.primaryAggrBound {
		if trace {
			p = rs.gr.currFixedOffset
		}
		d := rs.gr.ReadNext(rs.golombParam(m))
		if trace {
			fmt.Printf("level %d, p = %d, d = %d golomb %d\n", level, p, d, rs.golombParam(m))
		}
		hmod := remap16(remix(fingerprint+rs.startSeed[level]+d), m)
		part := hmod / rs.primaryAggrBound
		// All parts except possibly the last have exactly primaryAggrBound keys
		if rs.primaryAggrBound < m-part*rs.primaryAggrBound {
			m = rs.primaryAggrBound
		} else {
			m = m - part*rs.primaryAggrBound
		}
		cumKeys += uint64(rs.primaryAggrBound * part)
		if part != 0 {
			// Skip the encodings of all parts to the left of ours
			rs.gr.SkipSubtree(rs.skipNodes(rs.primaryAggrBound)*int(part), rs.skipBits(rs.primaryAggrBound)*int(part))
		}
		level++
	}
	// Primary aggregation level (parts of size leafSize)
	if m > rs.leafSize {
		if trace {
			p = rs.gr.currFixedOffset
		}
		d := rs.gr.ReadNext(rs.golombParam(m))
		if trace {
			fmt.Printf("level %d, p = %d, d = %d, golomb %d\n", level, p, d, rs.golombParam(m))
		}
		hmod := remap16(remix(fingerprint+rs.startSeed[level]+d), m)
		part := hmod / rs.leafSize
		if rs.leafSize < m-part*rs.leafSize {
			m = rs.leafSize
		} else {
			m = m - part*rs.leafSize
		}
		cumKeys += uint64(rs.leafSize * part)
		if part != 0 {
			// Each leaf to the left is a single node
			rs.gr.SkipSubtree(int(part), rs.skipBits(rs.leafSize)*int(part))
		}
		level++
	}
	// Leaf level: read the bijection salt and map the fingerprint to its slot
	if trace {
		p = rs.gr.currFixedOffset
	}
	b := rs.gr.ReadNext(rs.golombParam(m))
	if trace {
		fmt.Printf("level %d, p = %d, b = %d, golomn = %d\n", level, p, b, rs.golombParam(m))
	}
	return int(cumKeys) + int(remap16(remix(fingerprint+rs.startSeed[level]+b), m))
}
// Stats returns the size of golomb rice encoding and elias fano encoding.
// NOTE(review): value receiver copies the whole RecSplit on every call; consider
// *RecSplit for consistency with the other methods — confirm no caller relies on the value method set.
func (rs RecSplit) Stats() (int, int) {
	return len(rs.gr.Data()), len(rs.ef.Data())
}
// Collision returns true if there was a collision detected during mapping of keys
// into 64-bit values.
// RecSplit needs to be reset (ResetNextSalt), re-populated with keys, and rebuilt.
func (rs RecSplit) Collision() bool {
	return rs.collision
}