2021-09-18 22:59:27 +01:00
/*
   Copyright 2021 Erigon contributors

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/
package recsplit
import (
2021-10-16 10:43:41 +01:00
"encoding/binary"
2021-09-20 12:14:49 +01:00
"io"
2021-09-18 22:59:27 +01:00
"math/bits"
2021-10-16 10:43:41 +01:00
"unsafe"
2021-11-05 17:04:17 +07:00
"github.com/ledgerwatch/erigon-lib/common/bitutil"
2021-09-18 22:59:27 +01:00
)
// bijMemo holds the optimal Golomb-Rice parameters for leaves.
// NOTE(review): presumably indexed by leaf size (25 entries, i.e. sizes 0..24) - confirm against callers.
var bijMemo = []uint32{0, 0, 0, 1, 3, 4, 5, 7, 8, 10, 11, 12, 14, 15, 16, 18, 19, 21, 22, 23, 25, 26, 28, 29, 30}
2021-09-18 22:59:27 +01:00
// GolombRice builds up the Golomb-Rice encoding of a sequence of numbers.
// Reading the numbers back from the encoding is done by GolombRiceReader.
type GolombRice struct {
	data     []uint64 // Words holding the encoded bit stream; each word is filled starting from its least significant bit
	bitCount int      // Specific to the builder - number of bits added to the encoding so far
}
// appendUnaryAll adds the unary encoding of specified sequence of numbers to the end of the
// current encoding
2021-09-20 12:14:49 +01:00
func ( g * GolombRice ) appendUnaryAll ( unary [ ] uint64 ) {
2021-09-18 22:59:27 +01:00
bitInc := 0
for _ , u := range unary {
// Each number u uses u+1 bits for its unary representation
bitInc += int ( u ) + 1
}
targetSize := ( g . bitCount + bitInc + 63 ) / 64
for len ( g . data ) < targetSize {
g . data = append ( g . data , 0 )
}
for _ , u := range unary {
g . bitCount += int ( u )
appendPtr := g . bitCount / 64
g . data [ appendPtr ] |= uint64 ( 1 ) << ( g . bitCount & 63 )
g . bitCount ++
}
}
// appendFixed encodes the next value using specified Golomb parameter. Since we are using Golomb-Rice encoding,
// all Golomb parameters are powers of two. Therefore we input log2 of golomb parameter, rather than golomn paramter itself,
// for convinience
2021-09-20 12:14:49 +01:00
func ( g * GolombRice ) appendFixed ( v uint64 , log2golomb int ) {
2021-09-18 22:59:27 +01:00
if log2golomb == 0 {
return
}
2021-09-20 12:14:49 +01:00
lowerBits := v & ( ( uint64 ( 1 ) << log2golomb ) - 1 ) // Extract the part of the number that will be encoded using truncated binary encoding
2021-09-18 22:59:27 +01:00
usedBits := g . bitCount & 63 // How many bits of the last element of b.data is used by previous value
targetSize := ( g . bitCount + log2golomb + 63 ) / 64
//fmt.Printf("g.bitCount = %d, log2golomb = %d, targetSize = %d\n", g.bitCount, log2golomb, targetSize)
for len ( g . data ) < targetSize {
g . data = append ( g . data , 0 )
}
appendPtr := g . bitCount / 64 // The index in b.data corresponding to the last element used by previous value, or if previous values fits perfectly, the index of the next free element
curWord := g . data [ appendPtr ]
2022-03-19 11:38:37 +07:00
curWord |= lowerBits << usedBits // curWord now contains the new value potentially combined with the part of the previous value
2021-09-18 22:59:27 +01:00
if usedBits + log2golomb > 64 {
// New value overflows to the next element
g . data [ appendPtr ] = curWord
appendPtr ++
2022-03-19 11:38:37 +07:00
curWord = lowerBits >> ( 64 - usedBits ) // curWord now contains the part of the new value that overflows
2021-09-18 22:59:27 +01:00
}
g . data [ appendPtr ] = curWord
g . bitCount += log2golomb
}
2022-03-19 11:38:37 +07:00
// Bits returns currrent number of bits in the compact encoding of the hash function representation
2022-10-22 13:19:27 +07:00
func ( g * GolombRice ) Bits ( ) int {
2021-09-18 22:59:27 +01:00
return g . bitCount
}
2022-03-19 11:38:37 +07:00
func ( g * GolombRiceReader ) ReadReset ( bitPos , unaryOffset int ) {
2021-09-18 22:59:27 +01:00
g . currFixedOffset = bitPos
unaryPos := bitPos + unaryOffset
g . currPtrUnary = unaryPos / 64
g . currWindowUnary = g . data [ g . currPtrUnary ] >> ( unaryPos & 63 )
g . currPtrUnary ++
g . validLowerBitsUnary = 64 - ( unaryPos & 63 )
}
2022-03-19 11:38:37 +07:00
func ( g * GolombRiceReader ) SkipSubtree ( nodes , fixedLen int ) {
2021-09-18 22:59:27 +01:00
if nodes <= 0 {
panic ( "nodes <= 0" )
}
missing := nodes
var cnt int
for cnt = bits . OnesCount64 ( g . currWindowUnary ) ; cnt < missing ; cnt = bits . OnesCount64 ( g . currWindowUnary ) {
g . currWindowUnary = g . data [ g . currPtrUnary ]
g . currPtrUnary ++
missing -= cnt
g . validLowerBitsUnary = 64
}
2021-11-05 17:04:17 +07:00
cnt = bitutil . Select64 ( g . currWindowUnary , missing - 1 )
2021-09-18 22:59:27 +01:00
g . currWindowUnary >>= cnt
g . currWindowUnary >>= 1
g . validLowerBitsUnary -= cnt + 1
g . currFixedOffset += fixedLen
}
2021-10-16 10:43:41 +01:00
func ( g * GolombRiceReader ) ReadNext ( log2golomb int ) uint64 {
2021-09-18 22:59:27 +01:00
var result uint64
if g . currWindowUnary == 0 {
result += uint64 ( g . validLowerBitsUnary )
g . currWindowUnary = g . data [ g . currPtrUnary ]
g . currPtrUnary ++
g . validLowerBitsUnary = 64
for g . currWindowUnary == 0 {
result += 64
g . currWindowUnary = g . data [ g . currPtrUnary ]
g . currPtrUnary ++
}
}
pos := bits . TrailingZeros64 ( g . currWindowUnary )
g . currWindowUnary >>= pos
g . currWindowUnary >>= 1
g . validLowerBitsUnary -= pos + 1
result += uint64 ( pos )
result <<= log2golomb
idx64 := g . currFixedOffset >> 6
var fixed uint64
shift := g . currFixedOffset & 63
fixed = g . data [ idx64 ] >> shift
if shift + log2golomb > 64 {
fixed |= g . data [ idx64 + 1 ] << ( 64 - shift )
}
result |= fixed & ( ( uint64 ( 1 ) << log2golomb ) - 1 )
g . currFixedOffset += log2golomb
return result
}
2021-09-20 12:14:49 +01:00
// Data returns the binary representation of the Golomb-Rice code that is built
2022-10-22 13:19:27 +07:00
func ( g * GolombRice ) Data ( ) [ ] uint64 {
2021-09-20 12:14:49 +01:00
return g . data
}
2021-10-16 10:43:41 +01:00
// maxDataSize is the length of the byte-array type that Write casts the data pointer to
// before re-slicing it down to the actual length of the data in bytes.
const maxDataSize = 0xFFFFFFFFFFFF
2021-09-20 12:14:49 +01:00
// Write outputs the state of golomb rice encoding into a writer, which can be recovered later by Read
2021-10-16 10:43:41 +01:00
func ( g * GolombRice ) Write ( w io . Writer ) error {
var numBuf [ 8 ] byte
binary . BigEndian . PutUint64 ( numBuf [ : ] , uint64 ( len ( g . data ) ) )
if _ , e := w . Write ( numBuf [ : ] ) ; e != nil {
return e
}
p := ( * [ maxDataSize ] byte ) ( unsafe . Pointer ( & g . data [ 0 ] ) )
b := ( * p ) [ : ]
2021-11-05 17:04:17 +07:00
if _ , e := w . Write ( b [ : len ( g . data ) * 8 ] ) ; e != nil {
2021-10-16 10:43:41 +01:00
return e
}
2021-09-20 12:14:49 +01:00
return nil
}
2021-10-16 10:43:41 +01:00
// GolombRiceReader reads numbers back from a Golomb-Rice encoding stored as a sequence of
// 64-bit words. It keeps two independent read positions: one for the fixed-width parts and
// one for the unary parts of the encoded numbers.
type GolombRiceReader struct {
	data                []uint64 // Words of the encoded bit stream
	currFixedOffset     int      // Bit offset in data at which the next fixed-width part is read
	currWindowUnary     uint64   // Not yet consumed unary bits of the current word, shifted down to bit 0
	currPtrUnary        int      // Index in data of the next word to load into currWindowUnary
	validLowerBitsUnary int      // Number of not yet consumed bits accounted for in currWindowUnary
}