/* Copyright 2021 Erigon contributors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package eliasfano32 import ( "encoding/binary" "fmt" "io" "math" "math/bits" "unsafe" "github.com/ledgerwatch/erigon-lib/common/bitutil" ) // EliasFano algo overview https://www.antoniomallia.it/sorted-integers-compression-with-elias-fano-encoding.html // P. Elias. Efficient storage and retrieval by content and address of static files. J. ACM, 21(2):246–260, 1974. // Partitioned Elias-Fano Indexes http://groups.di.unipi.it/~ottavian/files/elias_fano_sigir14.pdf const ( log2q uint64 = 8 q uint64 = 1 << log2q qMask uint64 = q - 1 superQ uint64 = 1 << 14 superQMask uint64 = superQ - 1 qPerSuperQ uint64 = superQ / q // 64 superQSize uint64 = 1 + qPerSuperQ/2 // 1 + 64/4 = 17 ) // EliasFano can be used to encode one monotone sequence type EliasFano struct { data []uint64 lowerBits []uint64 upperBits []uint64 jump []uint64 lowerBitsMask uint64 count uint64 u uint64 l uint64 maxOffset uint64 minDelta uint64 i uint64 delta uint64 wordsUpperBits int } func NewEliasFano(count uint64, maxOffset, minDelta uint64) *EliasFano { if minDelta > (1 << 32) { panic(fmt.Sprintf("too big minDelat: %d", minDelta)) } if count == 0 { panic(fmt.Sprintf("too small count: %d", count)) } //fmt.Printf("count=%d,maxOffset=%d,minDelta=%d\n", count, maxOffset, minDelta) ef := &EliasFano{ count: count - 1, maxOffset: maxOffset, minDelta: minDelta, } ef.u = maxOffset - ef.count*ef.minDelta + 1 ef.wordsUpperBits = ef.deriveFields() return ef } func (ef *EliasFano) AddOffset(offset uint64) { //fmt.Printf("0x%x,\n", offset) if ef.l != 0 { set_bits(ef.lowerBits, ef.i*ef.l, int(ef.l), (offset-ef.delta)&ef.lowerBitsMask) } //pos := ((offset - ef.delta) >> ef.l) + ef.i set(ef.upperBits, ((offset-ef.delta)>>ef.l)+ef.i) //fmt.Printf("add:%x, pos=%x, set=%x, res=%x\n", offset, pos, pos/64, uint64(1)<<(pos%64)) ef.i++ ef.delta += ef.minDelta } func (ef EliasFano) jumpSizeWords() int { size := ((ef.count + 1) / superQ) * superQSize // Whole blocks if (ef.count+1)%superQ != 0 { size += 1 + (((ef.count+1)%superQ+q-1)/q+3)/2 // Partial block } return int(size) } func (ef *EliasFano) deriveFields() int { if ef.u/(ef.count+1) == 0 { ef.l = 0 } else { ef.l = 63 ^ uint64(bits.LeadingZeros64(ef.u/(ef.count+1))) // pos of first non-zero bit //fmt.Printf("lllllllll: %d, %d\n", 63^uint64(bits.LeadingZeros64(24/7)), msb(ef.u/(ef.count+1))) } //fmt.Printf("EF: %d, %d,%d\n", ef.count, ef.u, ef.l) ef.lowerBitsMask = (uint64(1) << ef.l) - 1 wordsLowerBits := int(((ef.count+1)*ef.l+63)/64 + 1) wordsUpperBits := int((ef.count + 1 + (ef.u >> ef.l) + 63) / 64) jumpWords := ef.jumpSizeWords() totalWords := wordsLowerBits + wordsUpperBits + jumpWords //fmt.Printf("EF: %d, %d,%d,%d\n", totalWords, wordsLowerBits, wordsUpperBits, jumpWords) if ef.data == nil { ef.data = make([]uint64, totalWords) } else { ef.data = ef.data[:totalWords] } ef.lowerBits = ef.data[:wordsLowerBits] ef.upperBits = ef.data[wordsLowerBits : wordsLowerBits+wordsUpperBits] ef.jump = ef.data[wordsLowerBits+wordsUpperBits:] return wordsUpperBits } // Build construct Elias Fano index for a given sequences func (ef *EliasFano) Build() { for i, c, lastSuperQ := uint64(0), uint64(0), uint64(0); i < uint64(ef.wordsUpperBits); i++ { for b := uint64(0); b < 64; b++ { if ef.upperBits[i]&(uint64(1)<= (1 << 32) { fmt.Printf("ef.l=%x,ef.u=%x\n", ef.l, ef.u) fmt.Printf("offset=%x,lastSuperQ=%x,i=%x,b=%x,c=%x\n", offset, lastSuperQ, i, b, c) fmt.Printf("ef.minDelta=%x\n", ef.minDelta) //fmt.Printf("ef.upperBits=%x\n", ef.upperBits) //fmt.Printf("ef.lowerBits=%x\n", ef.lowerBits) //fmt.Printf("ef.wordsUpperBits=%b\n", ef.wordsUpperBits) panic("") } // c % superQ is the bit index inside the group of 4096 bits jumpSuperQ := (c / superQ) * superQSize jumpInsideSuperQ := (c % superQ) / q idx64 := jumpSuperQ + 1 + (jumpInsideSuperQ >> 1) shift := 32 * (jumpInsideSuperQ % 2) mask := uint64(0xffffffff) << shift ef.jump[idx64] = (ef.jump[idx64] &^ mask) | (offset << shift) } c++ } } } } func (ef EliasFano) get(i uint64) (val uint64, window uint64, sel int, currWord uint64, lower uint64, delta uint64) { lower = i * ef.l idx64 := lower / 64 shift := lower % 64 lower = ef.lowerBits[idx64] >> shift if shift > 0 { lower |= ef.lowerBits[idx64+1] << (64 - shift) } jumpSuperQ := (i / superQ) * superQSize jumpInsideSuperQ := (i % superQ) / q idx64 = jumpSuperQ + 1 + (jumpInsideSuperQ >> 1) shift = 32 * (jumpInsideSuperQ % 2) mask := uint64(0xffffffff) << shift jump := ef.jump[jumpSuperQ] + (ef.jump[idx64]&mask)>>shift currWord = jump / 64 window = ef.upperBits[currWord] & (uint64(0xffffffffffffffff) << (jump % 64)) d := int(i & qMask) for bitCount := bits.OnesCount64(window); bitCount <= d; bitCount = bits.OnesCount64(window) { currWord++ window = ef.upperBits[currWord] d -= bitCount } sel = bitutil.Select64(window, d) delta = i * ef.minDelta val = ((currWord*64+uint64(sel)-i)<>= ef.l valNext = ((currWord*64+uint64(bits.TrailingZeros64(window))-i-1)< 56 { panic(fmt.Sprintf("ef.lCumKeys (%d) * 2 + ef.lPosition (%d) > 56", ef.lCumKeys, ef.lPosition)) } ef.lowerBitsMaskCumKeys = (uint64(1) << ef.lCumKeys) - 1 ef.lowerBitsMaskPosition = (uint64(1) << ef.lPosition) - 1 wordsLowerBits := int(((ef.numBuckets+1)*(ef.lCumKeys+ef.lPosition)+63)/64 + 1) wordsCumKeys := int((ef.numBuckets + 1 + (ef.uCumKeys >> ef.lCumKeys) + 63) / 64) wordsPosition := int((ef.numBuckets + 1 + (ef.uPosition >> ef.lPosition) + 63) / 64) jumpWords := ef.jumpSizeWords() totalWords := wordsLowerBits + wordsCumKeys + wordsPosition + jumpWords if ef.data == nil { ef.data = make([]uint64, totalWords) } else { ef.data = ef.data[:totalWords] } ef.lowerBits = ef.data[:wordsLowerBits] ef.upperBitsCumKeys = ef.data[wordsLowerBits : wordsLowerBits+wordsCumKeys] ef.upperBitsPosition = ef.data[wordsLowerBits+wordsCumKeys : wordsLowerBits+wordsCumKeys+wordsPosition] ef.jump = ef.data[wordsLowerBits+wordsCumKeys+wordsPosition:] return wordsCumKeys, wordsPosition } // Build construct double Elias Fano index for two given sequences func (ef *DoubleEliasFano) Build(cumKeys []uint64, position []uint64) { //fmt.Printf("cumKeys = %d\nposition = %d\n", cumKeys, position) if len(cumKeys) != len(position) { panic("len(cumKeys) != len(position)") } ef.numBuckets = uint64(len(cumKeys) - 1) ef.posMinDelta = math.MaxUint64 ef.cumKeysMinDelta = math.MaxUint64 for i := uint64(1); i <= ef.numBuckets; i++ { if cumKeys[i] < cumKeys[i-1] { panic("cumKeys[i] <= cumKeys[i-1]") } nkeysDelta := cumKeys[i] - cumKeys[i-1] if nkeysDelta < ef.cumKeysMinDelta { ef.cumKeysMinDelta = nkeysDelta } if position[i] < position[i-1] { panic("position[i] < position[i-1]") } bucketBits := position[i] - position[i-1] if bucketBits < ef.posMinDelta { ef.posMinDelta = bucketBits } } //fmt.Printf("cumKeysMinDelta = %d, posMinDelta = %d\n", ef.cumKeysMinDelta, ef.posMinDelta) ef.uPosition = position[ef.numBuckets] - ef.numBuckets*ef.posMinDelta + 1 ef.uCumKeys = cumKeys[ef.numBuckets] - ef.numBuckets*ef.cumKeysMinDelta + 1 // Largest possible encoding of the cumKeys wordsCumKeys, wordsPosition := ef.deriveFields() for i, cumDelta, bitDelta := uint64(0), uint64(0), uint64(0); i <= ef.numBuckets; i, cumDelta, bitDelta = i+1, cumDelta+ef.cumKeysMinDelta, bitDelta+ef.posMinDelta { if ef.lCumKeys != 0 { //fmt.Printf("i=%d, set_bits cum for %d = %b\n", i, cumKeys[i]-cumDelta, (cumKeys[i]-cumDelta)&ef.lowerBitsMaskCumKeys) set_bits(ef.lowerBits, i*(ef.lCumKeys+ef.lPosition), int(ef.lCumKeys), (cumKeys[i]-cumDelta)&ef.lowerBitsMaskCumKeys) //fmt.Printf("loweBits %b\n", ef.lowerBits) } set(ef.upperBitsCumKeys, ((cumKeys[i]-cumDelta)>>ef.lCumKeys)+i) //fmt.Printf("i=%d, set cum for %d = %d\n", i, cumKeys[i]-cumDelta, (cumKeys[i]-cumDelta)>>ef.lCumKeys+i) if ef.lPosition != 0 { //fmt.Printf("i=%d, set_bits pos for %d = %b\n", i, position[i]-bitDelta, (position[i]-bitDelta)&ef.lowerBitsMaskPosition) set_bits(ef.lowerBits, i*(ef.lCumKeys+ef.lPosition)+ef.lCumKeys, int(ef.lPosition), (position[i]-bitDelta)&ef.lowerBitsMaskPosition) //fmt.Printf("lowerBits %b\n", ef.lowerBits) } set(ef.upperBitsPosition, ((position[i]-bitDelta)>>ef.lPosition)+i) //fmt.Printf("i=%d, set pos for %d = %d\n", i, position[i]-bitDelta, (position[i]-bitDelta)>>ef.lPosition+i) } //fmt.Printf("loweBits %b\n", ef.lowerBits) //fmt.Printf("upperBitsCumKeys %b\n", ef.upperBitsCumKeys) //fmt.Printf("upperBitsPosition %b\n", ef.upperBitsPosition) // i iterates over the 64-bit words in the wordCumKeys vector // c iterates over bits in the wordCumKeys // lastSuperQ is the largest multiple of 2^14 (4096) which is no larger than c // c/superQ is the index of the current 4096 block of bits // superQSize is how many words is required to encode one block of 4096 bits. It is 17 words which is 1088 bits for i, c, lastSuperQ := uint64(0), uint64(0), uint64(0); i < uint64(wordsCumKeys); i++ { for b := uint64(0); b < 64; b++ { if ef.upperBitsCumKeys[i]&(uint64(1)<= (1 << 32) { panic("") } // c % superQ is the bit index inside the group of 4096 bits jumpSuperQ := (c / superQ) * (superQSize * 2) jumpInsideSuperQ := 2 * (c % superQ) / q idx64 := jumpSuperQ + 2 + (jumpInsideSuperQ >> 1) shift := 32 * (jumpInsideSuperQ % 2) mask := uint64(0xffffffff) << shift ef.jump[idx64] = (ef.jump[idx64] &^ mask) | (offset << shift) } c++ } } } for i, c, lastSuperQ := uint64(0), uint64(0), uint64(0); i < uint64(wordsPosition); i++ { for b := uint64(0); b < 64; b++ { if ef.upperBitsPosition[i]&(uint64(1)<= (1 << 32) { panic("") } jumpSuperQ := (c / superQ) * (superQSize * 2) jumpInsideSuperQ := 2*((c%superQ)/q) + 1 idx64 := jumpSuperQ + 2 + (jumpInsideSuperQ >> 1) shift := 32 * (jumpInsideSuperQ % 2) mask := uint64(0xffffffff) << shift ef.jump[idx64] = (ef.jump[idx64] &^ mask) | (offset << shift) } c++ } } } //fmt.Printf("jump: %x\n", ef.jump) } // set_bits assumes that bits are set in monotonic order, so that // we can skip the masking for the second word func set_bits(bits []uint64, start uint64, width int, value uint64) { shift := int(start & 63) idx64 := start >> 6 mask := (uint64(1)< 64 { // changes two 64-bit words bits[idx64+1] = value >> (64 - shift) } } func set(bits []uint64, pos uint64) { //bits[pos>>6] |= uint64(1) << (pos & 63) bits[pos/64] |= uint64(1) << (pos % 64) } func (ef DoubleEliasFano) jumpSizeWords() int { size := ((ef.numBuckets + 1) / superQ) * superQSize * 2 // Whole blocks if (ef.numBuckets+1)%superQ != 0 { size += (1 + (((ef.numBuckets+1)%superQ+q-1)/q+3)/2) * 2 // Partial block } return int(size) } // Data returns binary representation of double Ellias-Fano index that has been built func (ef DoubleEliasFano) Data() []uint64 { return ef.data } func (ef DoubleEliasFano) get2(i uint64) (cumKeys uint64, position uint64, windowCumKeys uint64, selectCumKeys int, currWordCumKeys uint64, lower uint64, cumDelta uint64) { posLower := i * (ef.lCumKeys + ef.lPosition) idx64 := posLower / 64 shift := posLower % 64 lower = ef.lowerBits[idx64] >> shift if shift > 0 { lower |= ef.lowerBits[idx64+1] << (64 - shift) } //fmt.Printf("i = %d, posLower = %d, lower = %b\n", i, posLower, lower) jumpSuperQ := (i / superQ) * superQSize * 2 jumpInsideSuperQ := (i % superQ) / q idx16 := 2*(jumpSuperQ+2) + 2*jumpInsideSuperQ idx64 = idx16 / 2 shift = 32 * (idx16 % 2) mask := uint64(0xffffffff) << shift jumpCumKeys := ef.jump[jumpSuperQ] + (ef.jump[idx64]&mask)>>shift idx16++ idx64 = idx16 / 2 shift = 32 * (idx16 % 2) mask = uint64(0xffffffff) << shift jumpPosition := ef.jump[jumpSuperQ+1] + (ef.jump[idx64]&mask)>>shift //fmt.Printf("i = %d, jumpCumKeys = %d, jumpPosition = %d\n", i, jumpCumKeys, jumpPosition) currWordCumKeys = jumpCumKeys / 64 currWordPosition := jumpPosition / 64 windowCumKeys = ef.upperBitsCumKeys[currWordCumKeys] & (uint64(0xffffffffffffffff) << (jumpCumKeys % 64)) windowPosition := ef.upperBitsPosition[currWordPosition] & (uint64(0xffffffffffffffff) << (jumpPosition % 64)) deltaCumKeys := int(i & qMask) deltaPosition := int(i & qMask) for bitCount := bits.OnesCount64(windowCumKeys); bitCount <= deltaCumKeys; bitCount = bits.OnesCount64(windowCumKeys) { //fmt.Printf("i = %d, bitCount cum = %d\n", i, bitCount) currWordCumKeys++ windowCumKeys = ef.upperBitsCumKeys[currWordCumKeys] deltaCumKeys -= bitCount } for bitCount := bits.OnesCount64(windowPosition); bitCount <= deltaPosition; bitCount = bits.OnesCount64(windowPosition) { //fmt.Printf("i = %d, bitCount pos = %d\n", i, bitCount) currWordPosition++ windowPosition = ef.upperBitsPosition[currWordPosition] deltaPosition -= bitCount } selectCumKeys = bitutil.Select64(windowCumKeys, deltaCumKeys) //fmt.Printf("i = %d, select cum in %b for %d = %d\n", i, windowCumKeys, deltaCumKeys, selectCumKeys) cumDelta = i * ef.cumKeysMinDelta cumKeys = ((currWordCumKeys*64+uint64(selectCumKeys)-i)<>= ef.lCumKeys //fmt.Printf("i = %d, lower = %b\n", i, lower) selectPosition := bitutil.Select64(windowPosition, deltaPosition) //fmt.Printf("i = %d, select pos in %b for %d = %d\n", i, windowPosition, deltaPosition, selectPosition) bitDelta := i * ef.posMinDelta position = ((currWordPosition*64+uint64(selectPosition)-i)<>= ef.lPosition cumKeysNext = ((currWordCumKeys*64+uint64(bits.TrailingZeros64(windowCumKeys))-i-1)<