2021-10-16 09:43:41 +00:00
/ *
2022-05-06 13:55:11 +00:00
Copyright 2022 Erigon contributors
2021-10-16 09:43:41 +00:00
Licensed under the Apache License , Version 2.0 ( the "License" ) ;
you may not use this file except in compliance with the License .
You may obtain a copy of the License at
http : //www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing , software
distributed under the License is distributed on an "AS IS" BASIS ,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND , either express or implied .
See the License for the specific language governing permissions and
limitations under the License .
* /
package compress
import (
2022-01-24 09:18:08 +00:00
"bytes"
2021-10-16 09:43:41 +00:00
"encoding/binary"
2021-11-15 14:19:56 +00:00
"fmt"
2021-10-16 09:43:41 +00:00
"os"
2022-05-27 01:20:53 +00:00
"github.com/ledgerwatch/erigon-lib/common/dbg"
2021-10-16 09:43:41 +00:00
"github.com/ledgerwatch/erigon-lib/mmap"
)
2022-05-20 04:23:05 +00:00
// codeword is one slot of a patternTable. A slot either resolves to a pattern
// (len > 0, pattern set) or forwards the lookup to a deeper table
// (len == 0, ptr set).
type codeword struct {
	// len is the number of bits in the code; 0 marks a link to a deeper table.
	len byte
	// pattern is the dictionary word this code stands for.
	pattern *word
	// ptr points to the next-level table for codes longer than this table covers.
	ptr *patternTable
}
2022-03-18 09:10:18 +00:00
type patternTable struct {
2022-05-20 04:23:05 +00:00
bitLen int // Number of bits to lookup in the table
patterns [ ] * codeword
2022-03-18 09:10:18 +00:00
}
// posTable is a lookup table from code bits to position values. The three
// slices are parallel: for a given code, lens holds the code length (0 means
// "follow ptrs"), pos the decoded value and ptrs the deeper-level table.
type posTable struct {
	// bitLen is the number of bits consumed by one lookup in this table.
	bitLen int
	// pos holds the decoded value for each code.
	pos []uint64
	// lens holds the code length in bits for each code; 0 marks a link to ptrs.
	lens []byte
	// ptrs holds the deeper-level table for each code, where there is one.
	ptrs []*posTable
}
2022-01-17 08:50:42 +00:00
// Decompressor provides access to the superstrings in a file produced by a compressor
2021-10-16 09:43:41 +00:00
type Decompressor struct {
compressedFile string
f * os . File
mmapHandle1 [ ] byte // mmap handle for unix (this is used to close mmap)
mmapHandle2 * [ mmap . MaxMapSize ] byte // mmap handle for windows (this is used to close mmap)
data [ ] byte // slice of correct size for the decompressor to work with
2022-03-18 09:10:18 +00:00
dict * patternTable
posDict * posTable
2022-01-17 08:50:42 +00:00
wordsStart uint64 // Offset of whether the superstrings actually start
2022-01-31 22:32:00 +00:00
size int64
2022-03-10 07:48:37 +00:00
wordsCount , emptyWordsCount uint64
2021-10-16 09:43:41 +00:00
}
func NewDecompressor ( compressedFile string ) ( * Decompressor , error ) {
d := & Decompressor {
compressedFile : compressedFile ,
}
2022-05-27 01:20:53 +00:00
2021-10-16 09:43:41 +00:00
var err error
2022-05-27 01:20:53 +00:00
defer func ( ) {
if rec := recover ( ) ; rec != nil {
err = fmt . Errorf ( "decompressing file: %s, %+v, trace: %s" , compressedFile , rec , dbg . Stack ( ) )
}
} ( )
2021-10-16 09:43:41 +00:00
d . f , err = os . Open ( compressedFile )
if err != nil {
return nil , err
}
var stat os . FileInfo
if stat , err = d . f . Stat ( ) ; err != nil {
return nil , err
}
2022-01-31 22:32:00 +00:00
d . size = stat . Size ( )
2022-04-13 11:55:15 +00:00
if d . size < 32 {
2022-03-09 17:25:22 +00:00
return nil , fmt . Errorf ( "compressed file is too short: %d" , d . size )
2021-11-15 14:19:56 +00:00
}
2022-01-31 22:32:00 +00:00
if d . mmapHandle1 , d . mmapHandle2 , err = mmap . Mmap ( d . f , int ( d . size ) ) ; err != nil {
2021-10-16 09:43:41 +00:00
return nil , err
}
2022-05-20 04:23:05 +00:00
// read patterns from file
2022-01-31 22:32:00 +00:00
d . data = d . mmapHandle1 [ : d . size ]
2022-03-10 07:48:37 +00:00
d . wordsCount = binary . BigEndian . Uint64 ( d . data [ : 8 ] )
d . emptyWordsCount = binary . BigEndian . Uint64 ( d . data [ 8 : 16 ] )
dictSize := binary . BigEndian . Uint64 ( d . data [ 16 : 24 ] )
2022-04-13 11:55:15 +00:00
data := d . data [ 24 : 24 + dictSize ]
var depths [ ] uint64
var patterns [ ] [ ] byte
var i uint64
var patternMaxDepth uint64
2022-05-20 04:23:05 +00:00
2022-04-13 11:55:15 +00:00
//fmt.Printf("[decomp] dictSize = %d\n", dictSize)
for i < dictSize {
d , ns := binary . Uvarint ( data [ i : ] )
depths = append ( depths , d )
if d > patternMaxDepth {
patternMaxDepth = d
}
i += uint64 ( ns )
l , n := binary . Uvarint ( data [ i : ] )
i += uint64 ( n )
patterns = append ( patterns , data [ i : i + l ] )
//fmt.Printf("depth = %d, pattern = [%x]\n", d, data[i:i+l])
i += l
}
2022-05-20 04:23:05 +00:00
2022-02-20 22:14:06 +00:00
if dictSize > 0 {
2022-03-18 09:10:18 +00:00
var bitLen int
2022-04-13 11:55:15 +00:00
if patternMaxDepth > 9 {
2022-03-18 09:10:18 +00:00
bitLen = 9
} else {
2022-04-13 11:55:15 +00:00
bitLen = int ( patternMaxDepth )
2022-03-18 09:10:18 +00:00
}
//fmt.Printf("pattern maxDepth=%d\n", tree.maxDepth)
tableSize := 1 << bitLen
d . dict = & patternTable {
bitLen : bitLen ,
2022-05-20 04:23:05 +00:00
patterns : make ( [ ] * codeword , tableSize ) ,
2022-03-18 09:10:18 +00:00
}
2022-08-01 05:37:10 +00:00
if _ , err := buildPatternTable ( d . dict , depths , patterns , 0 , 0 , 0 , patternMaxDepth ) ; err != nil {
return nil , err
}
2022-02-20 22:14:06 +00:00
}
2022-05-20 04:23:05 +00:00
// read positions
2022-04-13 11:55:15 +00:00
pos := 24 + dictSize
2021-10-16 09:43:41 +00:00
dictSize = binary . BigEndian . Uint64 ( d . data [ pos : pos + 8 ] )
2022-04-13 11:55:15 +00:00
data = d . data [ pos + 8 : pos + 8 + dictSize ]
var posDepths [ ] uint64
var poss [ ] uint64
var posMaxDepth uint64
//fmt.Printf("[decomp] posDictSize = %d\n", dictSize)
i = 0
for i < dictSize {
d , ns := binary . Uvarint ( data [ i : ] )
posDepths = append ( posDepths , d )
if d > posMaxDepth {
posMaxDepth = d
}
i += uint64 ( ns )
pos , n := binary . Uvarint ( data [ i : ] )
i += uint64 ( n )
poss = append ( poss , pos )
}
2022-05-20 04:23:05 +00:00
2022-02-20 22:14:06 +00:00
if dictSize > 0 {
2022-03-18 09:10:18 +00:00
var bitLen int
2022-04-13 11:55:15 +00:00
if posMaxDepth > 9 {
2022-03-18 09:10:18 +00:00
bitLen = 9
} else {
2022-04-13 11:55:15 +00:00
bitLen = int ( posMaxDepth )
2022-03-18 09:10:18 +00:00
}
//fmt.Printf("pos maxDepth=%d\n", tree.maxDepth)
tableSize := 1 << bitLen
d . posDict = & posTable {
bitLen : bitLen ,
pos : make ( [ ] uint64 , tableSize ) ,
lens : make ( [ ] byte , tableSize ) ,
ptrs : make ( [ ] * posTable , tableSize ) ,
}
2022-04-13 11:55:15 +00:00
buildPosTable ( posDepths , poss , d . posDict , 0 , 0 , 0 , posMaxDepth )
2022-02-20 22:14:06 +00:00
}
2022-04-13 11:55:15 +00:00
d . wordsStart = pos + 8 + dictSize
2021-10-16 09:43:41 +00:00
return d , nil
}
2022-05-20 04:23:05 +00:00
// word is the plain text word associated with a code from the dictionary.
type word []byte
2022-05-17 05:38:48 +00:00
2022-04-13 11:55:15 +00:00
// returns number of depth and patterns comsumed
2022-08-01 05:37:10 +00:00
func buildPatternTable ( table * patternTable , depths [ ] uint64 , patterns [ ] [ ] byte , code uint16 , bits int , depth uint64 , maxDepth uint64 ) ( int , error ) {
2022-04-13 11:55:15 +00:00
if len ( depths ) == 0 {
2022-08-01 05:37:10 +00:00
return 0 , nil
2022-02-20 22:14:06 +00:00
}
2022-04-13 11:55:15 +00:00
if depth == depths [ 0 ] {
2022-05-20 04:23:05 +00:00
pattern := word ( patterns [ 0 ] )
2022-04-13 11:55:15 +00:00
//fmt.Printf("depth=%d, maxDepth=%d, code=[%b], codeLen=%d, pattern=[%x]\n", depth, maxDepth, code, bits, pattern)
2022-05-17 05:38:48 +00:00
2022-05-20 04:23:05 +00:00
codeStep := uint16 ( 1 ) << bits
codeFrom , codeTo := code , code + codeStep
if table . bitLen != bits {
codeTo = code | ( uint16 ( 1 ) << table . bitLen )
}
cw := & codeword { pattern : & pattern , len : byte ( bits ) , ptr : nil }
for c := codeFrom ; c < codeTo ; c += codeStep {
if p := table . patterns [ c ] ; p == nil {
table . patterns [ c ] = cw
} else {
p . pattern , p . len , p . ptr = & pattern , byte ( bits ) , nil
2022-03-18 09:10:18 +00:00
}
}
2022-08-01 05:37:10 +00:00
return 1 , nil
2022-03-18 09:10:18 +00:00
}
2022-04-13 11:55:15 +00:00
if bits == 9 {
2022-03-18 09:10:18 +00:00
var bitLen int
2022-04-13 11:55:15 +00:00
if maxDepth > 9 {
2022-03-18 09:10:18 +00:00
bitLen = 9
} else {
2022-04-13 11:55:15 +00:00
bitLen = int ( maxDepth )
2022-03-18 09:10:18 +00:00
}
tableSize := 1 << bitLen
newTable := & patternTable {
bitLen : bitLen ,
2022-05-20 04:23:05 +00:00
patterns : make ( [ ] * codeword , tableSize ) ,
2022-03-18 09:10:18 +00:00
}
2022-05-20 04:23:05 +00:00
table . patterns [ code ] = & codeword { pattern : nil , len : byte ( 0 ) , ptr : newTable }
return buildPatternTable ( newTable , depths , patterns , 0 , 0 , depth , maxDepth )
2022-03-18 09:10:18 +00:00
}
2022-08-01 05:37:10 +00:00
if maxDepth == 0 {
return 0 , fmt . Errorf ( "invalid snapshot format. decompress.buildPatternTable faced maxDepth underflow" )
}
b0 , err := buildPatternTable ( table , depths , patterns , code , bits + 1 , depth + 1 , maxDepth - 1 )
if err != nil {
return 0 , err
}
b1 , err := buildPatternTable ( table , depths [ b0 : ] , patterns [ b0 : ] , ( uint16 ( 1 ) << bits ) | code , bits + 1 , depth + 1 , maxDepth - 1 )
if err != nil {
return 0 , err
}
return b0 + b1 , nil
2022-03-18 09:10:18 +00:00
}
2022-04-13 11:55:15 +00:00
func buildPosTable ( depths [ ] uint64 , poss [ ] uint64 , table * posTable , code uint16 , bits int , depth uint64 , maxDepth uint64 ) int {
if len ( depths ) == 0 {
return 0
}
if depth == depths [ 0 ] {
p := poss [ 0 ]
//fmt.Printf("depth=%d, maxDepth=%d, code=[%b], codeLen=%d, pos=%d\n", depth, maxDepth, code, bits, p)
2022-08-10 12:08:09 +00:00
if table . bitLen == bits {
2022-04-13 11:55:15 +00:00
table . pos [ code ] = p
table . lens [ code ] = byte ( bits )
2022-03-18 09:10:18 +00:00
table . ptrs [ code ] = nil
} else {
2022-04-13 11:55:15 +00:00
codeStep := uint16 ( 1 ) << bits
2022-03-18 09:10:18 +00:00
codeFrom := code
codeTo := code | ( uint16 ( 1 ) << table . bitLen )
for c := codeFrom ; c < codeTo ; c += codeStep {
2022-04-13 11:55:15 +00:00
table . pos [ c ] = p
table . lens [ c ] = byte ( bits )
2022-03-18 09:10:18 +00:00
table . ptrs [ c ] = nil
}
}
2022-04-13 11:55:15 +00:00
return 1
2022-03-18 09:10:18 +00:00
}
2022-04-13 11:55:15 +00:00
if bits == 9 {
2022-03-18 09:10:18 +00:00
var bitLen int
2022-04-13 11:55:15 +00:00
if maxDepth > 9 {
2022-03-18 09:10:18 +00:00
bitLen = 9
} else {
2022-04-13 11:55:15 +00:00
bitLen = int ( maxDepth )
2022-03-18 09:10:18 +00:00
}
tableSize := 1 << bitLen
newTable := & posTable {
bitLen : bitLen ,
pos : make ( [ ] uint64 , tableSize ) ,
lens : make ( [ ] byte , tableSize ) ,
ptrs : make ( [ ] * posTable , tableSize ) ,
}
table . pos [ code ] = 0
table . lens [ code ] = byte ( 0 )
table . ptrs [ code ] = newTable
2022-04-13 11:55:15 +00:00
return buildPosTable ( depths , poss , newTable , 0 , 0 , depth , maxDepth )
2022-03-18 09:10:18 +00:00
}
2022-04-13 11:55:15 +00:00
b0 := buildPosTable ( depths , poss , table , code , bits + 1 , depth + 1 , maxDepth - 1 )
return b0 + buildPosTable ( depths [ b0 : ] , poss [ b0 : ] , table , ( uint16 ( 1 ) << bits ) | code , bits + 1 , depth + 1 , maxDepth - 1 )
2022-02-20 22:14:06 +00:00
}
2022-01-31 22:32:00 +00:00
// Size returns the size of the compressed file in bytes, as recorded when the
// decompressor was opened.
func (d *Decompressor) Size() int64 {
	size := d.size
	return size
}
2021-10-16 09:43:41 +00:00
// Close releases the memory mapping and closes the underlying file.
//
// Both steps are always attempted, so a failure to unmap does not leak the
// file descriptor. If both fail, the unmap error is returned.
func (d *Decompressor) Close() error {
	unmapErr := mmap.Munmap(d.mmapHandle1, d.mmapHandle2)
	closeErr := d.f.Close()
	if unmapErr != nil {
		return unmapErr
	}
	return closeErr
}
2021-12-31 11:42:43 +00:00
// FilePath returns the path of the compressed file this decompressor was opened with.
func (d *Decompressor) FilePath() string {
	return d.compressedFile
}
2022-08-10 12:00:19 +00:00
// WithReadAhead - Expect read in sequential order. (Hence, pages in the given range can be aggressively read ahead, and may be freed soon after they are accessed.)
2022-02-01 04:19:11 +00:00
func ( d * Decompressor ) WithReadAhead ( f func ( ) error ) error {
_ = mmap . MadviseSequential ( d . mmapHandle1 )
defer mmap . MadviseRandom ( d . mmapHandle1 )
return f ( )
}
2022-01-29 11:12:38 +00:00
// Getter represent "reader" or "interator" that can move accross the data of the decompressor
2022-07-02 18:38:34 +00:00
// The full state of the getter can be captured by saving dataP, and dataBit
2021-10-16 09:43:41 +00:00
type Getter struct {
data [ ] byte
dataP uint64
2022-03-18 09:10:18 +00:00
dataBit int // Value 0..7 - position of the bit
patternDict * patternTable
posDict * posTable
2022-02-09 06:22:45 +00:00
fName string
2022-05-18 07:36:01 +00:00
trace bool
2021-10-16 09:43:41 +00:00
}
2022-05-18 07:36:01 +00:00
// Trace sets the getter's trace flag to t.
func (g *Getter) Trace(t bool) {
	g.trace = t
}
2021-10-16 09:43:41 +00:00
func ( g * Getter ) nextPos ( clean bool ) uint64 {
if clean {
2022-03-18 09:10:18 +00:00
if g . dataBit > 0 {
g . dataP ++
g . dataBit = 0
}
}
table := g . posDict
if table . bitLen == 0 {
return table . pos [ 0 ]
}
var l byte
var pos uint64
for l == 0 {
code := uint16 ( g . data [ g . dataP ] ) >> g . dataBit
if 8 - g . dataBit < table . bitLen && int ( g . dataP ) + 1 < len ( g . data ) {
code |= uint16 ( g . data [ g . dataP + 1 ] ) << ( 8 - g . dataBit )
2021-10-16 09:43:41 +00:00
}
2022-03-18 09:10:18 +00:00
code &= ( uint16 ( 1 ) << table . bitLen ) - 1
l = table . lens [ code ]
if l == 0 {
table = table . ptrs [ code ]
g . dataBit += 9
2021-10-16 09:43:41 +00:00
} else {
2022-03-18 09:10:18 +00:00
g . dataBit += int ( l )
pos = table . pos [ code ]
2021-10-16 09:43:41 +00:00
}
2022-03-18 09:10:18 +00:00
g . dataP += uint64 ( g . dataBit / 8 )
g . dataBit = g . dataBit % 8
2021-10-16 09:43:41 +00:00
}
2022-03-18 09:10:18 +00:00
return pos
2021-10-16 09:43:41 +00:00
}
func ( g * Getter ) nextPattern ( ) [ ] byte {
2022-03-18 09:10:18 +00:00
table := g . patternDict
if table . bitLen == 0 {
2022-05-20 04:23:05 +00:00
return * table . patterns [ 0 ] . pattern
2022-03-18 09:10:18 +00:00
}
var l byte
var pattern [ ] byte
for l == 0 {
code := uint16 ( g . data [ g . dataP ] ) >> g . dataBit
if 8 - g . dataBit < table . bitLen && int ( g . dataP ) + 1 < len ( g . data ) {
code |= uint16 ( g . data [ g . dataP + 1 ] ) << ( 8 - g . dataBit )
2021-10-16 09:43:41 +00:00
}
2022-03-18 09:10:18 +00:00
code &= ( uint16 ( 1 ) << table . bitLen ) - 1
2022-05-20 04:23:05 +00:00
cw := table . patterns [ code ]
l = cw . len
2022-03-18 09:10:18 +00:00
if l == 0 {
2022-05-20 04:23:05 +00:00
table = cw . ptr
2022-03-18 09:10:18 +00:00
g . dataBit += 9
2021-10-16 09:43:41 +00:00
} else {
2022-03-18 09:10:18 +00:00
g . dataBit += int ( l )
2022-05-20 04:23:05 +00:00
pattern = * cw . pattern
2021-10-16 09:43:41 +00:00
}
2022-03-18 09:10:18 +00:00
g . dataP += uint64 ( g . dataBit / 8 )
g . dataBit = g . dataBit % 8
2021-10-16 09:43:41 +00:00
}
2022-03-18 09:10:18 +00:00
return pattern
2021-10-16 09:43:41 +00:00
}
2022-07-23 08:06:52 +00:00
// Size returns the length in bytes of the data this getter iterates over.
func (g *Getter) Size() int {
	n := len(g.data)
	return n
}
2022-03-10 07:48:37 +00:00
// Count returns the number of words recorded in the file header.
func (d *Decompressor) Count() int {
	return int(d.wordsCount)
}
// EmptyWordsCount returns the number of empty words recorded in the file header.
func (d *Decompressor) EmptyWordsCount() int {
	return int(d.emptyWordsCount)
}
2021-11-19 15:00:55 +00:00
2022-01-17 08:50:42 +00:00
// MakeGetter creates an object that can be used to access superstrings in the decompressor's file
2022-01-24 09:18:08 +00:00
// Getter is not thread-safe, but there can be multiple getters used simultaneously and concurrently
2021-10-16 09:43:41 +00:00
// for the same decompressor
func ( d * Decompressor ) MakeGetter ( ) * Getter {
2022-03-09 17:25:22 +00:00
return & Getter { patternDict : d . dict , posDict : d . posDict , data : d . data [ d . wordsStart : ] , fName : d . compressedFile }
2021-10-16 09:43:41 +00:00
}
func ( g * Getter ) Reset ( offset uint64 ) {
g . dataP = offset
2022-03-18 09:10:18 +00:00
g . dataBit = 0
2021-10-16 09:43:41 +00:00
}
// HasNext reports whether the current offset is still inside the getter's data.
func (g *Getter) HasNext() bool {
	end := uint64(len(g.data))
	return end > g.dataP
}
// Next extracts a compressed word from current offset in the file
// and appends it to the given buf, returning the result of appending
// After extracting next word, it moves to the beginning of the next one
func ( g * Getter ) Next ( buf [ ] byte ) ( [ ] byte , uint64 ) {
2022-03-09 17:25:22 +00:00
savePos := g . dataP
2022-05-27 01:20:53 +00:00
wordLen := g . nextPos ( true )
wordLen -- // because when create huffman tree we do ++ , because 0 is terminator
if wordLen == 0 {
2022-03-18 09:10:18 +00:00
if g . dataBit > 0 {
g . dataP ++
g . dataBit = 0
}
2021-11-07 07:32:01 +00:00
return buf , g . dataP
}
2022-03-09 17:25:22 +00:00
bufPos := len ( buf ) // Tracking position in buf where to insert part of the word
lastUncovered := len ( buf )
2022-05-27 01:20:53 +00:00
if len ( buf ) + int ( wordLen ) > cap ( buf ) {
newBuf := make ( [ ] byte , len ( buf ) + int ( wordLen ) )
2022-03-09 17:25:22 +00:00
copy ( newBuf , buf )
buf = newBuf
} else {
// Expand buffer
2022-05-27 01:20:53 +00:00
buf = buf [ : len ( buf ) + int ( wordLen ) ]
2021-11-07 07:32:01 +00:00
}
2022-03-09 17:25:22 +00:00
// Loop below fills in the patterns
for pos := g . nextPos ( false /* clean */ ) ; pos != 0 ; pos = g . nextPos ( false ) {
bufPos += int ( pos ) - 1 // Positions where to insert patterns are encoded relative to one another
2022-05-06 13:55:11 +00:00
copy ( buf [ bufPos : ] , g . nextPattern ( ) )
2022-03-09 17:25:22 +00:00
}
2022-03-18 09:10:18 +00:00
if g . dataBit > 0 {
g . dataP ++
g . dataBit = 0
}
2022-03-09 17:25:22 +00:00
postLoopPos := g . dataP
g . dataP = savePos
2022-03-18 09:10:18 +00:00
g . dataBit = 0
2022-03-09 17:25:22 +00:00
g . nextPos ( true /* clean */ ) // Reset the state of huffman reader
bufPos = lastUncovered // Restore to the beginning of buf
// Loop below fills the data which is not in the patterns
2022-05-20 04:23:05 +00:00
for pos := g . nextPos ( false ) ; pos != 0 ; pos = g . nextPos ( false ) {
2022-03-09 17:25:22 +00:00
bufPos += int ( pos ) - 1 // Positions where to insert patterns are encoded relative to one another
if bufPos > lastUncovered {
dif := uint64 ( bufPos - lastUncovered )
copy ( buf [ lastUncovered : bufPos ] , g . data [ postLoopPos : postLoopPos + dif ] )
postLoopPos += dif
2021-10-16 09:43:41 +00:00
}
2022-03-09 17:25:22 +00:00
lastUncovered = bufPos + len ( g . nextPattern ( ) )
2021-11-07 07:32:01 +00:00
}
2022-05-27 01:20:53 +00:00
if int ( wordLen ) > lastUncovered {
dif := wordLen - uint64 ( lastUncovered )
copy ( buf [ lastUncovered : wordLen ] , g . data [ postLoopPos : postLoopPos + dif ] )
2022-03-09 17:25:22 +00:00
postLoopPos += dif
2021-11-07 07:32:01 +00:00
}
2022-03-09 17:25:22 +00:00
g . dataP = postLoopPos
2022-03-18 09:10:18 +00:00
g . dataBit = 0
2022-03-09 17:25:22 +00:00
return buf , postLoopPos
}
func ( g * Getter ) NextUncompressed ( ) ( [ ] byte , uint64 ) {
2022-05-27 01:20:53 +00:00
wordLen := g . nextPos ( true )
wordLen -- // because when create huffman tree we do ++ , because 0 is terminator
if wordLen == 0 {
2022-03-18 09:10:18 +00:00
if g . dataBit > 0 {
g . dataP ++
g . dataBit = 0
}
2022-03-09 17:25:22 +00:00
return g . data [ g . dataP : g . dataP ] , g . dataP
2021-10-16 09:43:41 +00:00
}
2022-03-13 22:46:17 +00:00
g . nextPos ( false )
2022-03-18 09:10:18 +00:00
if g . dataBit > 0 {
g . dataP ++
g . dataBit = 0
}
2022-03-09 17:25:22 +00:00
pos := g . dataP
2022-05-27 01:20:53 +00:00
g . dataP += wordLen
2022-03-09 17:25:22 +00:00
return g . data [ pos : g . dataP ] , g . dataP
2021-10-16 09:43:41 +00:00
}
2022-01-24 09:18:08 +00:00
// Skip moves offset to the next word and returns the new offset.
func ( g * Getter ) Skip ( ) uint64 {
l := g . nextPos ( true )
l -- // because when create huffman tree we do ++ , because 0 is terminator
if l == 0 {
2022-03-18 09:10:18 +00:00
if g . dataBit > 0 {
g . dataP ++
g . dataBit = 0
}
2022-01-24 09:18:08 +00:00
return g . dataP
}
wordLen := int ( l )
var add uint64
2022-03-09 17:25:22 +00:00
var bufPos int
2022-01-24 09:18:08 +00:00
var lastUncovered int
2022-03-09 17:25:22 +00:00
for pos := g . nextPos ( false /* clean */ ) ; pos != 0 ; pos = g . nextPos ( false ) {
bufPos += int ( pos ) - 1
if wordLen < bufPos {
2022-02-09 06:22:45 +00:00
panic ( fmt . Sprintf ( "likely .idx is invalid: %s" , g . fName ) )
2022-01-24 09:18:08 +00:00
}
2022-03-09 17:25:22 +00:00
if bufPos > lastUncovered {
add += uint64 ( bufPos - lastUncovered )
2022-01-24 09:18:08 +00:00
}
2022-03-09 17:25:22 +00:00
lastUncovered = bufPos + len ( g . nextPattern ( ) )
2022-01-24 09:18:08 +00:00
}
2022-03-18 09:10:18 +00:00
if g . dataBit > 0 {
g . dataP ++
g . dataBit = 0
}
2022-01-24 09:18:08 +00:00
if int ( l ) > lastUncovered {
add += l - uint64 ( lastUncovered )
}
// Uncovered characters
g . dataP += add
return g . dataP
}
2022-06-17 11:39:49 +00:00
// SkipUncompressed moves the offset past the word at the current position
// without returning it, and returns the new offset. It reads the length code
// and one further position code, then jumps over the following wordLen bytes.
func (g *Getter) SkipUncompressed() uint64 {
	// because when create huffman tree we do ++ , because 0 is terminator
	wordLen := g.nextPos(true) - 1
	if wordLen != 0 {
		g.nextPos(false)
	}
	if g.dataBit > 0 {
		g.dataP++
		g.dataBit = 0
	}
	// wordLen is zero for an empty word, so this is a no-op in that case.
	g.dataP += wordLen
	return g.dataP
}
2022-01-24 09:18:08 +00:00
// Match returns true and next offset if the word at current offset fully matches the buf
// returns false and current offset otherwise.
func ( g * Getter ) Match ( buf [ ] byte ) ( bool , uint64 ) {
savePos := g . dataP
2022-05-27 01:20:53 +00:00
wordLen := g . nextPos ( true )
wordLen -- // because when create huffman tree we do ++ , because 0 is terminator
2022-01-29 11:12:38 +00:00
lenBuf := len ( buf )
2022-05-27 01:20:53 +00:00
if wordLen == 0 || int ( wordLen ) != lenBuf {
2022-03-18 09:10:18 +00:00
if g . dataBit > 0 {
g . dataP ++
g . dataBit = 0
}
2022-01-29 11:12:38 +00:00
if lenBuf != 0 {
2022-03-18 09:10:18 +00:00
g . dataP , g . dataBit = savePos , 0
2022-01-29 11:12:38 +00:00
}
2022-05-27 01:20:53 +00:00
return lenBuf == int ( wordLen ) , g . dataP
2022-01-24 09:18:08 +00:00
}
2022-03-09 17:25:22 +00:00
var bufPos int
2022-01-29 11:12:38 +00:00
// In the first pass, we only check patterns
2022-03-09 17:25:22 +00:00
for pos := g . nextPos ( false /* clean */ ) ; pos != 0 ; pos = g . nextPos ( false ) {
bufPos += int ( pos ) - 1
pattern := g . nextPattern ( )
if lenBuf < bufPos + len ( pattern ) || ! bytes . Equal ( buf [ bufPos : bufPos + len ( pattern ) ] , pattern ) {
2022-03-18 09:10:18 +00:00
g . dataP , g . dataBit = savePos , 0
2022-03-09 17:25:22 +00:00
return false , savePos
2022-01-24 09:18:08 +00:00
}
2022-01-29 11:12:38 +00:00
}
2022-03-18 09:10:18 +00:00
if g . dataBit > 0 {
g . dataP ++
g . dataBit = 0
}
2022-01-29 11:12:38 +00:00
postLoopPos := g . dataP
2022-03-18 09:10:18 +00:00
g . dataP , g . dataBit = savePos , 0
2022-03-09 17:25:22 +00:00
g . nextPos ( true /* clean */ ) // Reset the state of huffman decoder
2022-01-29 11:12:38 +00:00
// Second pass - we check spaces not covered by the patterns
var lastUncovered int
2022-03-09 17:25:22 +00:00
bufPos = 0
for pos := g . nextPos ( false /* clean */ ) ; pos != 0 ; pos = g . nextPos ( false ) {
bufPos += int ( pos ) - 1
if bufPos > lastUncovered {
dif := uint64 ( bufPos - lastUncovered )
if lenBuf < bufPos || ! bytes . Equal ( buf [ lastUncovered : bufPos ] , g . data [ postLoopPos : postLoopPos + dif ] ) {
2022-03-18 09:10:18 +00:00
g . dataP , g . dataBit = savePos , 0
2022-03-09 17:25:22 +00:00
return false , savePos
2022-01-24 09:18:08 +00:00
}
2022-01-29 11:12:38 +00:00
postLoopPos += dif
2022-01-24 09:18:08 +00:00
}
2022-03-09 17:25:22 +00:00
lastUncovered = bufPos + len ( g . nextPattern ( ) )
2022-01-24 09:18:08 +00:00
}
2022-05-27 01:20:53 +00:00
if int ( wordLen ) > lastUncovered {
dif := wordLen - uint64 ( lastUncovered )
if lenBuf < int ( wordLen ) || ! bytes . Equal ( buf [ lastUncovered : wordLen ] , g . data [ postLoopPos : postLoopPos + dif ] ) {
2022-03-18 09:10:18 +00:00
g . dataP , g . dataBit = savePos , 0
2022-03-09 17:25:22 +00:00
return false , savePos
2022-01-24 09:18:08 +00:00
}
2022-01-29 11:12:38 +00:00
postLoopPos += dif
2022-01-24 09:18:08 +00:00
}
2022-05-27 01:20:53 +00:00
if lenBuf != int ( wordLen ) {
2022-03-18 09:10:18 +00:00
g . dataP , g . dataBit = savePos , 0
2022-03-09 17:25:22 +00:00
return false , savePos
2022-01-24 09:18:08 +00:00
}
2022-03-18 09:10:18 +00:00
g . dataP , g . dataBit = postLoopPos , 0
2022-03-09 17:25:22 +00:00
return true , postLoopPos
2022-01-24 09:18:08 +00:00
}
// MatchPrefix only checks if the word at the current offset has a buf prefix. Does not move offset to the next word.
2022-05-16 19:59:29 +00:00
func ( g * Getter ) MatchPrefix ( prefix [ ] byte ) bool {
2022-01-24 09:18:08 +00:00
savePos := g . dataP
defer func ( ) {
2022-03-18 09:10:18 +00:00
g . dataP , g . dataBit = savePos , 0
2022-01-24 09:18:08 +00:00
} ( )
2022-03-09 17:25:22 +00:00
2022-05-27 01:20:53 +00:00
wordLen := g . nextPos ( true /* clean */ )
wordLen -- // because when create huffman tree we do ++ , because 0 is terminator
2022-05-16 19:59:29 +00:00
prefixLen := len ( prefix )
2022-05-27 01:20:53 +00:00
if wordLen == 0 || int ( wordLen ) < prefixLen {
2022-03-18 09:10:18 +00:00
if g . dataBit > 0 {
g . dataP ++
g . dataBit = 0
}
2022-05-16 19:59:29 +00:00
if prefixLen != 0 {
2022-03-18 09:10:18 +00:00
g . dataP , g . dataBit = savePos , 0
2022-03-09 17:25:22 +00:00
}
2022-05-27 01:20:53 +00:00
return prefixLen == int ( wordLen )
2022-01-24 09:18:08 +00:00
}
2022-05-18 07:36:01 +00:00
var bufPos int
2022-03-09 17:25:22 +00:00
// In the first pass, we only check patterns
// Only run this loop as far as the prefix goes, there is no need to check further
2022-05-18 07:36:01 +00:00
for pos := g . nextPos ( false /* clean */ ) ; pos != 0 ; pos = g . nextPos ( false ) {
bufPos += int ( pos ) - 1
2022-03-09 17:25:22 +00:00
pattern := g . nextPattern ( )
var comparisonLen int
2022-05-18 07:36:01 +00:00
if prefixLen < bufPos + len ( pattern ) {
comparisonLen = prefixLen - bufPos
2022-03-09 17:25:22 +00:00
} else {
comparisonLen = len ( pattern )
2022-01-24 09:18:08 +00:00
}
2022-05-18 07:36:01 +00:00
if bufPos < prefixLen {
if ! bytes . Equal ( prefix [ bufPos : bufPos + comparisonLen ] , pattern [ : comparisonLen ] ) {
return false
}
2022-01-24 09:18:08 +00:00
}
2022-03-09 17:25:22 +00:00
}
2022-05-18 07:36:01 +00:00
2022-03-18 09:10:18 +00:00
if g . dataBit > 0 {
g . dataP ++
g . dataBit = 0
}
2022-03-09 17:25:22 +00:00
postLoopPos := g . dataP
2022-03-18 09:10:18 +00:00
g . dataP , g . dataBit = savePos , 0
2022-03-09 17:25:22 +00:00
g . nextPos ( true /* clean */ ) // Reset the state of huffman decoder
// Second pass - we check spaces not covered by the patterns
var lastUncovered int
2022-05-18 07:36:01 +00:00
bufPos = 0
2022-05-19 05:27:36 +00:00
for pos := g . nextPos ( false /* clean */ ) ; pos != 0 && lastUncovered < prefixLen ; pos = g . nextPos ( false ) {
2022-05-18 07:36:01 +00:00
bufPos += int ( pos ) - 1
if bufPos > lastUncovered {
dif := uint64 ( bufPos - lastUncovered )
2022-03-09 17:25:22 +00:00
var comparisonLen int
2022-05-16 19:59:29 +00:00
if prefixLen < lastUncovered + int ( dif ) {
comparisonLen = prefixLen - lastUncovered
2022-03-09 17:25:22 +00:00
} else {
comparisonLen = int ( dif )
}
2022-05-19 05:27:36 +00:00
if ! bytes . Equal ( prefix [ lastUncovered : lastUncovered + comparisonLen ] , g . data [ postLoopPos : postLoopPos + uint64 ( comparisonLen ) ] ) {
return false
2022-01-24 09:18:08 +00:00
}
2022-03-09 17:25:22 +00:00
postLoopPos += dif
2022-01-24 09:18:08 +00:00
}
2022-05-18 07:36:01 +00:00
lastUncovered = bufPos + len ( g . nextPattern ( ) )
2022-01-24 09:18:08 +00:00
}
2022-05-27 01:20:53 +00:00
if prefixLen > lastUncovered && int ( wordLen ) > lastUncovered {
dif := wordLen - uint64 ( lastUncovered )
2022-03-09 17:25:22 +00:00
var comparisonLen int
2022-05-27 01:20:53 +00:00
if prefixLen < int ( wordLen ) {
2022-05-16 19:59:29 +00:00
comparisonLen = prefixLen - lastUncovered
2022-03-09 17:25:22 +00:00
} else {
comparisonLen = int ( dif )
}
2022-05-16 19:59:29 +00:00
if ! bytes . Equal ( prefix [ lastUncovered : lastUncovered + comparisonLen ] , g . data [ postLoopPos : postLoopPos + uint64 ( comparisonLen ) ] ) {
2022-03-09 17:25:22 +00:00
return false
2022-01-24 09:18:08 +00:00
}
}
2022-03-09 17:25:22 +00:00
return true
2022-01-24 09:18:08 +00:00
}