2021-10-16 09:43:41 +00:00
/*
Copyright 2021 Erigon contributors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package compress
import (
2022-01-24 09:18:08 +00:00
"bytes"
2021-10-16 09:43:41 +00:00
"encoding/binary"
2021-11-15 14:19:56 +00:00
"fmt"
2021-10-16 09:43:41 +00:00
"os"
2022-01-24 09:18:08 +00:00
"strings"
2021-10-16 09:43:41 +00:00
"github.com/ledgerwatch/erigon-lib/mmap"
)
2022-01-17 08:50:42 +00:00
// Decompressor provides access to the superstrings in a file produced by a compressor
2021-10-16 09:43:41 +00:00
type Decompressor struct {
compressedFile string
f * os . File
mmapHandle1 [ ] byte // mmap handle for unix (this is used to close mmap)
mmapHandle2 * [ mmap . MaxMapSize ] byte // mmap handle for windows (this is used to close mmap)
data [ ] byte // slice of correct size for the decompressor to work with
dict Dictionary
posDict Dictionary
2022-01-17 08:50:42 +00:00
wordsStart uint64 // Offset of whether the superstrings actually start
2021-12-21 03:45:20 +00:00
count uint64
2022-01-31 22:32:00 +00:00
size int64
2021-10-16 09:43:41 +00:00
}
func NewDecompressor ( compressedFile string ) ( * Decompressor , error ) {
d := & Decompressor {
compressedFile : compressedFile ,
}
var err error
d . f , err = os . Open ( compressedFile )
if err != nil {
return nil , err
}
var stat os . FileInfo
if stat , err = d . f . Stat ( ) ; err != nil {
return nil , err
}
2022-01-31 22:32:00 +00:00
d . size = stat . Size ( )
if d . size < 24 {
2021-11-15 14:19:56 +00:00
return nil , fmt . Errorf ( "compressed file is too short" )
}
2022-01-31 22:32:00 +00:00
if d . mmapHandle1 , d . mmapHandle2 , err = mmap . Mmap ( d . f , int ( d . size ) ) ; err != nil {
2021-10-16 09:43:41 +00:00
return nil , err
}
2022-01-31 22:32:00 +00:00
d . data = d . mmapHandle1 [ : d . size ]
2021-12-21 03:45:20 +00:00
d . count = binary . BigEndian . Uint64 ( d . data [ : 8 ] )
dictSize := binary . BigEndian . Uint64 ( d . data [ 8 : 16 ] )
d . dict . rootOffset = binary . BigEndian . Uint64 ( d . data [ 16 : 24 ] )
d . dict . cutoff = binary . BigEndian . Uint64 ( d . data [ 24 : 32 ] )
d . dict . data = d . data [ 32 : 32 + dictSize ]
pos := 32 + dictSize
2021-10-16 09:43:41 +00:00
dictSize = binary . BigEndian . Uint64 ( d . data [ pos : pos + 8 ] )
d . posDict . rootOffset = binary . BigEndian . Uint64 ( d . data [ pos + 8 : pos + 16 ] )
d . posDict . cutoff = binary . BigEndian . Uint64 ( d . data [ pos + 16 : pos + 24 ] )
d . posDict . data = d . data [ pos + 24 : pos + 24 + dictSize ]
d . wordsStart = pos + 24 + dictSize
return d , nil
}
2022-01-31 22:32:00 +00:00
// Size returns the size of the compressed file in bytes, as recorded when the
// decompressor was created.
func (d *Decompressor) Size() int64 { return d.size }
2021-10-16 09:43:41 +00:00
// Close removes the memory mapping and closes the underlying file.
// Both steps are always attempted, so a failing unmap no longer leaks the file
// descriptor. If both fail, the unmap error is the one returned.
func (d *Decompressor) Close() error {
	unmapErr := mmap.Munmap(d.mmapHandle1, d.mmapHandle2)
	closeErr := d.f.Close()
	if unmapErr != nil {
		return unmapErr
	}
	return closeErr
}
2021-12-31 11:42:43 +00:00
// FilePath returns the path of the compressed file, exactly as it was passed to
// NewDecompressor.
func (d *Decompressor) FilePath() string {
	return d.compressedFile
}
2022-02-01 04:19:11 +00:00
// WithReadAhead runs f while the mapping is advised for sequential access:
// expect reads in sequential order (hence, pages in the given range can be
// aggressively read ahead, and may be freed soon after they are accessed).
// When f returns, the mapping is advised back to random access.
// The results of both advise calls are ignored; the error returned is f's.
func (d *Decompressor) WithReadAhead(f func() error) error {
	handle := d.mmapHandle1
	_ = mmap.MadviseSequential(handle)
	defer mmap.MadviseRandom(handle)
	return f()
}
2021-10-16 09:43:41 +00:00
// Dictionary is a view of one of the two dictionaries stored in the file header.
// A Getter walks it by following offsets encoded as uvarints inside data.
type Dictionary struct {
	data       []byte // serialized dictionary bytes
	rootOffset uint64 // offset in data at which every lookup starts
	cutoff     uint64 // a lookup stops as soon as its current offset is below cutoff
}
2022-01-29 11:12:38 +00:00
// Getter represent "reader" or "interator" that can move accross the data of the decompressor
// The full state of the getter can be captured by saving dataP, b, and mask values.
2021-10-16 09:43:41 +00:00
type Getter struct {
data [ ] byte
dataP uint64
patternDict * Dictionary
posDict * Dictionary
offset uint64
b byte
mask byte
uncovered [ ] int // Buffer for uncovered portions of the word
word [ ] byte
2022-02-09 06:22:45 +00:00
fName string
2021-10-16 09:43:41 +00:00
}
// zero follows the "0" branch of the pattern dictionary entry at g.offset: the
// first uvarint stored there becomes the new offset. It reports whether the new
// offset is below the dictionary cutoff, i.e. whether the walk is finished.
func (g *Getter) zero() bool {
	dict := g.patternDict
	next, _ := binary.Uvarint(dict.data[g.offset:])
	g.offset = next
	return next < dict.cutoff
}
// one follows the "1" branch of the pattern dictionary entry at g.offset: the
// first uvarint stored there is skipped and the second one becomes the new
// offset. It reports whether the new offset is below the dictionary cutoff.
func (g *Getter) one() bool {
	dict := g.patternDict
	_, skip := binary.Uvarint(dict.data[g.offset:])
	next, _ := binary.Uvarint(dict.data[g.offset+uint64(skip):])
	g.offset = next
	return next < dict.cutoff
}
// posZero follows the "0" branch of the position dictionary entry at g.offset:
// the first uvarint stored there becomes the new offset. It reports whether the
// new offset is below the dictionary cutoff, i.e. whether the walk is finished.
func (g *Getter) posZero() bool {
	dict := g.posDict
	next, _ := binary.Uvarint(dict.data[g.offset:])
	g.offset = next
	return next < dict.cutoff
}
// posOne follows the "1" branch of the position dictionary entry at g.offset:
// the first uvarint stored there is skipped and the second one becomes the new
// offset. It reports whether the new offset is below the dictionary cutoff.
func (g *Getter) posOne() bool {
	dict := g.posDict
	_, skip := binary.Uvarint(dict.data[g.offset:])
	next, _ := binary.Uvarint(dict.data[g.offset+uint64(skip):])
	g.offset = next
	return next < dict.cutoff
}
// pattern returns the pattern stored in the pattern dictionary at g.offset.
// The entry is a uvarint length followed by that many bytes. The returned slice
// aliases the dictionary data, it is not a copy.
func (g *Getter) pattern() []byte {
	dict := g.patternDict.data
	length, n := binary.Uvarint(dict[g.offset:])
	start := g.offset + uint64(n)
	return dict[start : start+length]
}
// pos returns the uvarint stored in the position dictionary at g.offset.
func (g *Getter) pos() uint64 {
	value, _ := binary.Uvarint(g.posDict.data[g.offset:])
	return value
}
// nextPos decodes the next position value from the data stream.
// Starting at the root of the position dictionary, it consumes one bit of the
// stream per step (least significant bit of each byte first), following posZero
// for a 0 bit and posOne for a 1 bit until one of them reports that the walk is
// finished; the value found there is returned. If the root offset itself is
// already below the cutoff, no bits are consumed at all.
// When clean is true, any bits left over from the current byte are discarded
// first, so decoding starts at the next whole byte.
func (g *Getter) nextPos(clean bool) uint64 {
	if clean {
		g.mask = 0
	}
	g.offset = g.posDict.rootOffset
	done := g.offset < g.posDict.cutoff
	for !done {
		if g.mask == 0 {
			// current byte exhausted (or discarded): load the next one
			g.mask = 1
			g.b = g.data[g.dataP]
			g.dataP++
		}
		bitIsSet := g.b&g.mask != 0
		g.mask <<= 1 // becomes 0 after the eighth bit, forcing a reload
		if bitIsSet {
			done = g.posOne()
		} else {
			done = g.posZero()
		}
	}
	return g.pos()
}
func ( g * Getter ) nextPattern ( ) [ ] byte {
g . offset = g . patternDict . rootOffset
if g . offset < g . patternDict . cutoff {
return g . pattern ( )
}
2022-01-24 09:18:08 +00:00
2021-10-16 09:43:41 +00:00
for {
if g . mask == 0 {
g . mask = 1
g . b = g . data [ g . dataP ]
g . dataP ++
}
if g . b & g . mask == 0 {
g . mask <<= 1
if g . zero ( ) {
break
}
} else {
g . mask <<= 1
if g . one ( ) {
break
}
}
}
return g . pattern ( )
}
2022-01-09 10:32:56 +00:00
// Count returns the count recorded in the first 8 bytes of the file header,
// converted to int.
// NOTE(review): presumably this is the number of words in the file - confirm against the compressor.
func (d *Decompressor) Count() int {
	return int(d.count)
}
2021-11-19 15:00:55 +00:00
2022-01-17 08:50:42 +00:00
// MakeGetter creates an object that can be used to access superstrings in the decompressor's file
2022-01-24 09:18:08 +00:00
// Getter is not thread-safe, but there can be multiple getters used simultaneously and concurrently
2021-10-16 09:43:41 +00:00
// for the same decompressor
func ( d * Decompressor ) MakeGetter ( ) * Getter {
2022-02-09 06:22:45 +00:00
return & Getter { patternDict : & d . dict , posDict : & d . posDict , data : d . data [ d . wordsStart : ] , uncovered : make ( [ ] int , 0 , 128 ) , fName : d . compressedFile }
2021-10-16 09:43:41 +00:00
}
func ( g * Getter ) Reset ( offset uint64 ) {
g . dataP = offset
2022-02-03 17:58:56 +00:00
g . offset = 0
g . mask = 0
g . b = 0
2021-10-16 09:43:41 +00:00
}
// HasNext reports whether the read position is still inside the words section,
// i.e. whether there is at least one more byte to decode.
func (g *Getter) HasNext() bool {
	return uint64(len(g.data)) > g.dataP
}
// Next extracts a compressed word from current offset in the file
// and appends it to the given buf, returning the result of appending
// After extracting next word, it moves to the beginning of the next one
func ( g * Getter ) Next ( buf [ ] byte ) ( [ ] byte , uint64 ) {
l := g . nextPos ( true )
2021-11-10 12:50:36 +00:00
l -- // because when create huffman tree we do ++ , because 0 is terminator
2021-11-07 07:32:01 +00:00
if l == 0 {
return buf , g . dataP
}
if int ( l ) > len ( g . word ) {
g . word = make ( [ ] byte , l )
}
var pos uint64
var lastPos int
var lastUncovered int
g . uncovered = g . uncovered [ : 0 ]
for pos = g . nextPos ( false /* clean */ ) ; pos != 0 ; pos = g . nextPos ( false ) {
intPos := lastPos + int ( pos ) - 1
lastPos = intPos
pattern := g . nextPattern ( )
2022-01-17 08:50:42 +00:00
if len ( g . word ) < intPos {
2022-02-09 06:22:45 +00:00
panic ( fmt . Sprintf ( "likely .idx is invalid: %s" , g . fName ) )
2022-01-17 08:50:42 +00:00
}
2021-11-07 07:32:01 +00:00
copy ( g . word [ intPos : ] , pattern )
if intPos > lastUncovered {
g . uncovered = append ( g . uncovered , lastUncovered , intPos )
2021-10-16 09:43:41 +00:00
}
2021-11-07 07:32:01 +00:00
lastUncovered = intPos + len ( pattern )
}
if int ( l ) > lastUncovered {
g . uncovered = append ( g . uncovered , lastUncovered , int ( l ) )
}
// Uncovered characters
for i := 0 ; i < len ( g . uncovered ) ; i += 2 {
copy ( g . word [ g . uncovered [ i ] : g . uncovered [ i + 1 ] ] , g . data [ g . dataP : ] )
g . dataP += uint64 ( g . uncovered [ i + 1 ] - g . uncovered [ i ] )
2021-10-16 09:43:41 +00:00
}
2021-11-07 07:32:01 +00:00
buf = append ( buf , g . word [ : l ] ... )
2021-10-16 09:43:41 +00:00
return buf , g . dataP
}
2022-01-24 09:18:08 +00:00
// Skip moves offset to the next word and returns the new offset.
func ( g * Getter ) Skip ( ) uint64 {
l := g . nextPos ( true )
l -- // because when create huffman tree we do ++ , because 0 is terminator
if l == 0 {
return g . dataP
}
wordLen := int ( l )
var add uint64
var pos uint64
var lastPos int
var lastUncovered int
for pos = g . nextPos ( false /* clean */ ) ; pos != 0 ; pos = g . nextPos ( false ) {
intPos := lastPos + int ( pos ) - 1
lastPos = intPos
if wordLen < intPos {
2022-02-09 06:22:45 +00:00
panic ( fmt . Sprintf ( "likely .idx is invalid: %s" , g . fName ) )
2022-01-24 09:18:08 +00:00
}
if intPos > lastUncovered {
add += uint64 ( intPos - lastUncovered )
}
pattern := g . nextPattern ( )
lastUncovered = intPos + len ( pattern )
}
if int ( l ) > lastUncovered {
add += l - uint64 ( lastUncovered )
}
// Uncovered characters
g . dataP += add
return g . dataP
}
// Match returns true and next offset if the word at current offset fully matches the buf
// returns false and current offset otherwise.
func ( g * Getter ) Match ( buf [ ] byte ) ( bool , uint64 ) {
savePos := g . dataP
2022-01-29 11:12:38 +00:00
saveMask := g . mask
saveB := g . b
2022-01-24 09:18:08 +00:00
l := g . nextPos ( true )
l -- // because when create huffman tree we do ++ , because 0 is terminator
2022-01-29 11:12:38 +00:00
lenBuf := len ( buf )
2022-01-24 09:18:08 +00:00
if l == 0 {
2022-01-29 11:12:38 +00:00
if lenBuf != 0 {
g . dataP = savePos
}
return lenBuf == 0 , g . dataP
2022-01-24 09:18:08 +00:00
}
2022-01-29 11:12:38 +00:00
res := true
2022-01-24 09:18:08 +00:00
var pos uint64
var lastPos int
var pattern [ ] byte
2022-01-29 11:12:38 +00:00
preLoopPos := g . dataP
preLoopMask := g . mask
preLoopB := g . b
// In the first pass, we only check patterns
2022-01-24 09:18:08 +00:00
for pos = g . nextPos ( false /* clean */ ) ; pos != 0 ; pos = g . nextPos ( false ) {
intPos := lastPos + int ( pos ) - 1
lastPos = intPos
pattern = g . nextPattern ( )
2022-01-29 11:12:38 +00:00
if res && ( lenBuf < intPos + len ( pattern ) || ! bytes . Equal ( buf [ intPos : intPos + len ( pattern ) ] , pattern ) ) {
2022-01-24 09:18:08 +00:00
res = false
}
2022-01-29 11:12:38 +00:00
}
postLoopPos := g . dataP
postLoopMask := g . mask
postLoopB := g . b
g . dataP = preLoopPos
g . mask = preLoopMask
g . b = preLoopB
// Second pass - we check spaces not covered by the patterns
var lastUncovered int
lastPos = 0
for pos = g . nextPos ( false /* clean */ ) ; pos != 0 ; pos = g . nextPos ( false ) {
intPos := lastPos + int ( pos ) - 1
lastPos = intPos
pattern = g . nextPattern ( )
2022-01-24 09:18:08 +00:00
if intPos > lastUncovered {
dif := uint64 ( intPos - lastUncovered )
2022-01-29 11:12:38 +00:00
if res && ( lenBuf < intPos || ! bytes . Equal ( buf [ lastUncovered : intPos ] , g . data [ postLoopPos : postLoopPos + dif ] ) ) {
2022-01-24 09:18:08 +00:00
res = false
}
2022-01-29 11:12:38 +00:00
postLoopPos += dif
2022-01-24 09:18:08 +00:00
}
lastUncovered = intPos + len ( pattern )
}
if int ( l ) > lastUncovered {
dif := l - uint64 ( lastUncovered )
2022-01-29 11:12:38 +00:00
if res && ( lenBuf < int ( l ) || ! bytes . Equal ( buf [ lastUncovered : l ] , g . data [ postLoopPos : postLoopPos + dif ] ) ) {
2022-01-24 09:18:08 +00:00
res = false
}
2022-01-29 11:12:38 +00:00
postLoopPos += dif
2022-01-24 09:18:08 +00:00
}
2022-01-29 11:12:38 +00:00
if res && lenBuf != int ( l ) {
res = false
}
if res {
g . dataP = postLoopPos
g . mask = postLoopMask
g . b = postLoopB
} else {
2022-01-24 09:18:08 +00:00
g . dataP = savePos
2022-01-29 11:12:38 +00:00
g . mask = saveMask
g . b = saveB
2022-01-24 09:18:08 +00:00
}
return res , g . dataP
}
// MatchPrefix only checks if the word at the current offset has a buf prefix. Does not move offset to the next word.
func ( g * Getter ) MatchPrefix ( buf [ ] byte ) bool {
savePos := g . dataP
defer func ( ) {
g . dataP = savePos
} ( )
l := g . nextPos ( true )
l -- // because when create huffman tree we do ++ , because 0 is terminator
if l == 0 {
return false
}
// count available space for word without actual reallocating memory
wordLen := len ( g . word )
if int ( l ) > wordLen {
wordLen = int ( l )
}
var pos uint64
var lastPos int
var lastUncovered int
var pattern [ ] byte
for pos = g . nextPos ( false /* clean */ ) ; pos != 0 ; pos = g . nextPos ( false ) {
intPos := lastPos + int ( pos ) - 1
lastPos = intPos
if wordLen < intPos {
2022-02-09 06:22:45 +00:00
panic ( fmt . Sprintf ( "likely .idx is invalid: %s" , g . fName ) )
2022-01-24 09:18:08 +00:00
}
pattern = g . nextPattern ( )
if strings . HasPrefix ( string ( pattern ) , string ( buf ) ) {
return true
}
if intPos > lastUncovered {
dif := uint64 ( intPos - lastUncovered )
if strings . HasPrefix ( string ( pattern ) + string ( g . data [ g . dataP : g . dataP + dif ] ) , string ( buf ) ) {
return true
}
}
lastUncovered = intPos + len ( pattern )
}
if int ( l ) > lastUncovered {
dif := l - uint64 ( lastUncovered )
if strings . HasPrefix ( string ( pattern ) + string ( g . data [ g . dataP : g . dataP + dif ] ) , string ( buf ) ) {
return true
}
}
return false
}