2021-10-16 09:43:41 +00:00
/*
   Copyright 2021 Erigon contributors

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/
package compress
import (
2022-01-24 09:18:08 +00:00
"bytes"
2021-10-16 09:43:41 +00:00
"encoding/binary"
2021-11-15 14:19:56 +00:00
"fmt"
2021-10-16 09:43:41 +00:00
"os"
"github.com/ledgerwatch/erigon-lib/mmap"
)
2022-03-18 09:10:18 +00:00
// patternTable is one level of the lookup table used to decode Huffman-coded
// patterns. Entries are indexed by the next bitLen bits of the input.
type patternTable struct {
	// bitLen is the number of input bits that form an index into this table.
	bitLen int
	// patterns holds the pattern corresponding to each entry.
	patterns [][]byte
	// lens holds, per entry, the number of bits in the matching code.
	// A zero length marks an entry that continues in ptrs.
	lens []byte
	// ptrs holds pointers to deeper-level tables.
	ptrs []*patternTable
}
// posTable is one level of the lookup table used to decode Huffman-coded
// position values. Entries are indexed by the next bitLen bits of the input.
type posTable struct {
	// bitLen is the number of input bits that form an index into this table.
	bitLen int
	// pos holds the decoded position value for each entry.
	pos []uint64
	// lens holds, per entry, the number of bits in the matching code.
	// A zero length marks an entry that continues in ptrs.
	lens []byte
	// ptrs holds pointers to deeper-level tables.
	ptrs []*posTable
}
2022-02-20 22:14:06 +00:00
// huffmanNodePos is a node of the Huffman tree over position values.
type huffmanNodePos struct {
	// zero and one are the children reached by a 0 bit and a 1 bit.
	// Both are nil in a leaf.
	zero, one *huffmanNodePos
	// pos is the position value carried by a leaf.
	pos uint64
	// maxDepth is the height of the subtree rooted here; 0 for a leaf.
	maxDepth int
}
// huffmanNodePattern is a node of the Huffman tree over patterns.
type huffmanNodePattern struct {
	// zero and one are the children reached by a 0 bit and a 1 bit.
	// Both are nil in a leaf.
	zero, one *huffmanNodePattern
	// pattern is the byte sequence carried by a leaf.
	pattern []byte
	// maxDepth is the height of the subtree rooted here; 0 for a leaf.
	maxDepth int
}
2022-01-17 08:50:42 +00:00
// Decompressor provides access to the superstrings in a file produced by a compressor
2021-10-16 09:43:41 +00:00
type Decompressor struct {
compressedFile string
f * os . File
mmapHandle1 [ ] byte // mmap handle for unix (this is used to close mmap)
mmapHandle2 * [ mmap . MaxMapSize ] byte // mmap handle for windows (this is used to close mmap)
data [ ] byte // slice of correct size for the decompressor to work with
2022-03-18 09:10:18 +00:00
dict * patternTable
posDict * posTable
2022-01-17 08:50:42 +00:00
wordsStart uint64 // Offset of whether the superstrings actually start
2022-01-31 22:32:00 +00:00
size int64
2022-03-10 07:48:37 +00:00
wordsCount , emptyWordsCount uint64
2021-10-16 09:43:41 +00:00
}
func NewDecompressor ( compressedFile string ) ( * Decompressor , error ) {
d := & Decompressor {
compressedFile : compressedFile ,
}
var err error
d . f , err = os . Open ( compressedFile )
if err != nil {
return nil , err
}
var stat os . FileInfo
if stat , err = d . f . Stat ( ) ; err != nil {
return nil , err
}
2022-01-31 22:32:00 +00:00
d . size = stat . Size ( )
2022-03-10 07:48:37 +00:00
if d . size < 40 {
2022-03-09 17:25:22 +00:00
return nil , fmt . Errorf ( "compressed file is too short: %d" , d . size )
2021-11-15 14:19:56 +00:00
}
2022-01-31 22:32:00 +00:00
if d . mmapHandle1 , d . mmapHandle2 , err = mmap . Mmap ( d . f , int ( d . size ) ) ; err != nil {
2021-10-16 09:43:41 +00:00
return nil , err
}
2022-01-31 22:32:00 +00:00
d . data = d . mmapHandle1 [ : d . size ]
2022-03-10 07:48:37 +00:00
d . wordsCount = binary . BigEndian . Uint64 ( d . data [ : 8 ] )
d . emptyWordsCount = binary . BigEndian . Uint64 ( d . data [ 8 : 16 ] )
dictSize := binary . BigEndian . Uint64 ( d . data [ 16 : 24 ] )
rootOffset := binary . BigEndian . Uint64 ( d . data [ 24 : 32 ] )
cutoff := binary . BigEndian . Uint64 ( d . data [ 32 : 40 ] )
data := d . data [ 40 : 40 + dictSize ]
2022-02-20 22:14:06 +00:00
if dictSize > 0 {
2022-03-18 09:10:18 +00:00
tree := buildHuffmanPattern ( data , rootOffset , cutoff )
var bitLen int
if tree . maxDepth > 9 {
bitLen = 9
} else {
bitLen = tree . maxDepth
}
//fmt.Printf("pattern maxDepth=%d\n", tree.maxDepth)
tableSize := 1 << bitLen
d . dict = & patternTable {
bitLen : bitLen ,
patterns : make ( [ ] [ ] byte , tableSize ) ,
lens : make ( [ ] byte , tableSize ) ,
ptrs : make ( [ ] * patternTable , tableSize ) ,
}
buildPatternTable ( tree , d . dict , 0 , 0 )
2022-02-20 22:14:06 +00:00
}
2022-03-10 07:48:37 +00:00
pos := 40 + dictSize
2021-10-16 09:43:41 +00:00
dictSize = binary . BigEndian . Uint64 ( d . data [ pos : pos + 8 ] )
2022-02-20 22:14:06 +00:00
rootOffset = binary . BigEndian . Uint64 ( d . data [ pos + 8 : pos + 16 ] )
cutoff = binary . BigEndian . Uint64 ( d . data [ pos + 16 : pos + 24 ] )
data = d . data [ pos + 24 : pos + 24 + dictSize ]
if dictSize > 0 {
2022-03-18 09:10:18 +00:00
tree := buildHuffmanPos ( data , rootOffset , cutoff )
var bitLen int
if tree . maxDepth > 9 {
bitLen = 9
} else {
bitLen = tree . maxDepth
}
//fmt.Printf("pos maxDepth=%d\n", tree.maxDepth)
tableSize := 1 << bitLen
d . posDict = & posTable {
bitLen : bitLen ,
pos : make ( [ ] uint64 , tableSize ) ,
lens : make ( [ ] byte , tableSize ) ,
ptrs : make ( [ ] * posTable , tableSize ) ,
}
buildPosTable ( tree , d . posDict , 0 , 0 )
2022-02-20 22:14:06 +00:00
}
2021-10-16 09:43:41 +00:00
d . wordsStart = pos + 24 + dictSize
return d , nil
}
2022-02-20 22:14:06 +00:00
func buildHuffmanPos ( data [ ] byte , offset uint64 , cutoff uint64 ) * huffmanNodePos {
if offset < cutoff {
pos , _ := binary . Uvarint ( data [ offset : ] )
2022-03-18 09:10:18 +00:00
return & huffmanNodePos { pos : pos , maxDepth : 0 }
2022-02-20 22:14:06 +00:00
}
offsetZero , n := binary . Uvarint ( data [ offset : ] )
offsetOne , _ := binary . Uvarint ( data [ offset + uint64 ( n ) : ] )
2022-03-18 09:10:18 +00:00
t0 := buildHuffmanPos ( data , offsetZero , cutoff )
t1 := buildHuffmanPos ( data , offsetOne , cutoff )
var maxDepth int
if t0 . maxDepth > t1 . maxDepth {
maxDepth = t0 . maxDepth + 1
} else {
maxDepth = t1 . maxDepth + 1
}
return & huffmanNodePos { zero : t0 , one : t1 , maxDepth : maxDepth }
2022-02-20 22:14:06 +00:00
}
func buildHuffmanPattern ( data [ ] byte , offset uint64 , cutoff uint64 ) * huffmanNodePattern {
if offset < cutoff {
l , n := binary . Uvarint ( data [ offset : ] )
2022-03-18 09:10:18 +00:00
return & huffmanNodePattern { pattern : data [ offset + uint64 ( n ) : offset + uint64 ( n ) + l ] , maxDepth : 0 }
2022-02-20 22:14:06 +00:00
}
offsetZero , n := binary . Uvarint ( data [ offset : ] )
offsetOne , _ := binary . Uvarint ( data [ offset + uint64 ( n ) : ] )
2022-03-18 09:10:18 +00:00
t0 := buildHuffmanPattern ( data , offsetZero , cutoff )
t1 := buildHuffmanPattern ( data , offsetOne , cutoff )
var maxDepth int
if t0 . maxDepth > t1 . maxDepth {
maxDepth = t0 . maxDepth + 1
} else {
maxDepth = t1 . maxDepth + 1
}
return & huffmanNodePattern { zero : t0 , one : t1 , maxDepth : maxDepth }
}
// buildPatternTable fills table (and nested tables it allocates) from the
// Huffman tree rooted at tree. code is the bit path walked so far, with the
// first bit in the least significant position, and depth is its length.
//
// A leaf whose code is shorter than the table width is written to every slot
// whose low depth bits equal code. A subtree still unresolved after 9 bits is
// placed into a freshly allocated nested table linked through ptrs.
func buildPatternTable(tree *huffmanNodePattern, table *patternTable, code uint16, depth int) {
	isLeaf := tree.zero == nil && tree.one == nil
	switch {
	case isLeaf && table.bitLen == depth:
		// The code uses the full table width: exactly one slot.
		table.patterns[code] = tree.pattern
		table.lens[code] = byte(depth)
		table.ptrs[code] = nil

	case isLeaf:
		// Shorter code: replicate over all values of the unused high bits.
		stride := uint16(1) << depth
		limit := code | (uint16(1) << table.bitLen)
		for slot := code; slot < limit; slot += stride {
			table.patterns[slot] = tree.pattern
			table.lens[slot] = byte(depth)
			table.ptrs[slot] = nil
		}

	case depth == 9:
		// Table width exhausted: continue in a nested table.
		width := tree.maxDepth
		if width > 9 {
			width = 9
		}
		slots := 1 << width
		nested := &patternTable{
			bitLen:   width,
			patterns: make([][]byte, slots),
			lens:     make([]byte, slots),
			ptrs:     make([]*patternTable, slots),
		}
		table.patterns[code] = nil
		table.lens[code] = byte(0)
		table.ptrs[code] = nested
		buildPatternTable(tree, nested, 0, 0)

	default:
		buildPatternTable(tree.zero, table, code, depth+1)
		buildPatternTable(tree.one, table, (uint16(1)<<depth)|code, depth+1)
	}
}
func buildPosTable ( tree * huffmanNodePos , table * posTable , code uint16 , depth int ) {
if tree . zero == nil && tree . one == nil {
if table . bitLen == depth {
table . pos [ code ] = tree . pos
//fmt.Printf(".[%b]%d=>%d\n", code, table.bitLen, tree.pos)
table . lens [ code ] = byte ( depth )
table . ptrs [ code ] = nil
} else {
codeStep := uint16 ( 1 ) << depth
codeFrom := code
codeTo := code | ( uint16 ( 1 ) << table . bitLen )
for c := codeFrom ; c < codeTo ; c += codeStep {
table . pos [ c ] = tree . pos
//fmt.Printf("*[%b]%d=>%d\n", c, table.bitLen, tree.pos)
table . lens [ c ] = byte ( depth )
table . ptrs [ c ] = nil
}
}
return
}
if depth == 9 {
var bitLen int
if tree . maxDepth > 9 {
bitLen = 9
} else {
bitLen = tree . maxDepth
}
tableSize := 1 << bitLen
newTable := & posTable {
bitLen : bitLen ,
pos : make ( [ ] uint64 , tableSize ) ,
lens : make ( [ ] byte , tableSize ) ,
ptrs : make ( [ ] * posTable , tableSize ) ,
}
table . pos [ code ] = 0
table . lens [ code ] = byte ( 0 )
table . ptrs [ code ] = newTable
buildPosTable ( tree , newTable , 0 , 0 )
return
}
buildPosTable ( tree . zero , table , code , depth + 1 )
buildPosTable ( tree . one , table , ( uint16 ( 1 ) << depth ) | code , depth + 1 )
2022-02-20 22:14:06 +00:00
}
2022-01-31 22:32:00 +00:00
// Size returns the stored size, in bytes, of the compressed file.
func (d *Decompressor) Size() int64 {
	fileSize := d.size
	return fileSize
}
2021-10-16 09:43:41 +00:00
// Close releases the memory mapping and closes the underlying file.
//
// Both steps are always attempted, so a failure to unmap does not leave the
// file descriptor open. The unmap error takes precedence in the return value;
// otherwise the error from closing the file (if any) is returned.
func (d *Decompressor) Close() error {
	unmapErr := mmap.Munmap(d.mmapHandle1, d.mmapHandle2)
	closeErr := d.f.Close()
	if unmapErr != nil {
		return unmapErr
	}
	return closeErr
}
2021-12-31 11:42:43 +00:00
// FilePath returns the path of the compressed file this decompressor was created with.
func (d *Decompressor) FilePath() string {
	return d.compressedFile
}
2022-02-01 04:19:11 +00:00
// WithReadAhead runs f while the mapping is advised for sequential access
// (pages in the range can be aggressively read ahead, and may be freed soon
// after they are accessed). The advice is switched back to random access when
// f returns. The result of f is returned unchanged.
func (d *Decompressor) WithReadAhead(f func() error) error {
	mapping := d.mmapHandle1
	// The advice is only a hint, so its result is deliberately ignored.
	_ = mmap.MadviseSequential(mapping)
	defer mmap.MadviseRandom(mapping)
	return f()
}
2022-01-29 11:12:38 +00:00
// Getter represent "reader" or "interator" that can move accross the data of the decompressor
// The full state of the getter can be captured by saving dataP, b, and mask values.
2021-10-16 09:43:41 +00:00
type Getter struct {
data [ ] byte
dataP uint64
2022-03-18 09:10:18 +00:00
dataBit int // Value 0..7 - position of the bit
patternDict * patternTable
posDict * posTable
2022-02-09 06:22:45 +00:00
fName string
2021-10-16 09:43:41 +00:00
}
func ( g * Getter ) nextPos ( clean bool ) uint64 {
if clean {
2022-03-18 09:10:18 +00:00
if g . dataBit > 0 {
g . dataP ++
g . dataBit = 0
}
}
table := g . posDict
if table . bitLen == 0 {
return table . pos [ 0 ]
}
var l byte
var pos uint64
for l == 0 {
code := uint16 ( g . data [ g . dataP ] ) >> g . dataBit
if 8 - g . dataBit < table . bitLen && int ( g . dataP ) + 1 < len ( g . data ) {
code |= uint16 ( g . data [ g . dataP + 1 ] ) << ( 8 - g . dataBit )
2021-10-16 09:43:41 +00:00
}
2022-03-18 09:10:18 +00:00
code &= ( uint16 ( 1 ) << table . bitLen ) - 1
l = table . lens [ code ]
if l == 0 {
table = table . ptrs [ code ]
g . dataBit += 9
2021-10-16 09:43:41 +00:00
} else {
2022-03-18 09:10:18 +00:00
g . dataBit += int ( l )
pos = table . pos [ code ]
2021-10-16 09:43:41 +00:00
}
2022-03-18 09:10:18 +00:00
g . dataP += uint64 ( g . dataBit / 8 )
g . dataBit = g . dataBit % 8
2021-10-16 09:43:41 +00:00
}
2022-03-18 09:10:18 +00:00
return pos
2021-10-16 09:43:41 +00:00
}
func ( g * Getter ) nextPattern ( ) [ ] byte {
2022-03-18 09:10:18 +00:00
table := g . patternDict
if table . bitLen == 0 {
return table . patterns [ 0 ]
}
var l byte
var pattern [ ] byte
for l == 0 {
code := uint16 ( g . data [ g . dataP ] ) >> g . dataBit
if 8 - g . dataBit < table . bitLen && int ( g . dataP ) + 1 < len ( g . data ) {
code |= uint16 ( g . data [ g . dataP + 1 ] ) << ( 8 - g . dataBit )
2021-10-16 09:43:41 +00:00
}
2022-03-18 09:10:18 +00:00
code &= ( uint16 ( 1 ) << table . bitLen ) - 1
l = table . lens [ code ]
if l == 0 {
table = table . ptrs [ code ]
g . dataBit += 9
2021-10-16 09:43:41 +00:00
} else {
2022-03-18 09:10:18 +00:00
g . dataBit += int ( l )
pattern = table . patterns [ code ]
2021-10-16 09:43:41 +00:00
}
2022-03-18 09:10:18 +00:00
g . dataP += uint64 ( g . dataBit / 8 )
g . dataBit = g . dataBit % 8
2021-10-16 09:43:41 +00:00
}
2022-03-18 09:10:18 +00:00
return pattern
2021-10-16 09:43:41 +00:00
}
2022-03-10 07:48:37 +00:00
// Count returns the words count read from the file header.
func (d *Decompressor) Count() int {
	return int(d.wordsCount)
}
// EmptyWordsCount returns the empty words count read from the file header.
func (d *Decompressor) EmptyWordsCount() int {
	return int(d.emptyWordsCount)
}
2021-11-19 15:00:55 +00:00
2022-01-17 08:50:42 +00:00
// MakeGetter creates an object that can be used to access superstrings in the decompressor's file
2022-01-24 09:18:08 +00:00
// Getter is not thread-safe, but there can be multiple getters used simultaneously and concurrently
2021-10-16 09:43:41 +00:00
// for the same decompressor
func ( d * Decompressor ) MakeGetter ( ) * Getter {
2022-03-09 17:25:22 +00:00
return & Getter { patternDict : d . dict , posDict : d . posDict , data : d . data [ d . wordsStart : ] , fName : d . compressedFile }
2021-10-16 09:43:41 +00:00
}
func ( g * Getter ) Reset ( offset uint64 ) {
g . dataP = offset
2022-03-18 09:10:18 +00:00
g . dataBit = 0
2021-10-16 09:43:41 +00:00
}
// HasNext reports whether the current byte offset is still inside the data.
func (g *Getter) HasNext() bool {
	end := uint64(len(g.data))
	return g.dataP < end
}
// Next extracts a compressed word from current offset in the file
// and appends it to the given buf, returning the result of appending
// After extracting next word, it moves to the beginning of the next one
func ( g * Getter ) Next ( buf [ ] byte ) ( [ ] byte , uint64 ) {
2022-03-09 17:25:22 +00:00
savePos := g . dataP
2021-10-16 09:43:41 +00:00
l := g . nextPos ( true )
2021-11-10 12:50:36 +00:00
l -- // because when create huffman tree we do ++ , because 0 is terminator
2021-11-07 07:32:01 +00:00
if l == 0 {
2022-03-18 09:10:18 +00:00
if g . dataBit > 0 {
g . dataP ++
g . dataBit = 0
}
2021-11-07 07:32:01 +00:00
return buf , g . dataP
}
2022-03-09 17:25:22 +00:00
bufPos := len ( buf ) // Tracking position in buf where to insert part of the word
lastUncovered := len ( buf )
if len ( buf ) + int ( l ) > cap ( buf ) {
newBuf := make ( [ ] byte , len ( buf ) + int ( l ) )
copy ( newBuf , buf )
buf = newBuf
} else {
// Expand buffer
buf = buf [ : len ( buf ) + int ( l ) ]
2021-11-07 07:32:01 +00:00
}
2022-03-09 17:25:22 +00:00
// Loop below fills in the patterns
for pos := g . nextPos ( false /* clean */ ) ; pos != 0 ; pos = g . nextPos ( false ) {
bufPos += int ( pos ) - 1 // Positions where to insert patterns are encoded relative to one another
copy ( buf [ bufPos : ] , g . nextPattern ( ) )
}
2022-03-18 09:10:18 +00:00
if g . dataBit > 0 {
g . dataP ++
g . dataBit = 0
}
2022-03-09 17:25:22 +00:00
postLoopPos := g . dataP
g . dataP = savePos
2022-03-18 09:10:18 +00:00
g . dataBit = 0
2022-03-09 17:25:22 +00:00
g . nextPos ( true /* clean */ ) // Reset the state of huffman reader
bufPos = lastUncovered // Restore to the beginning of buf
// Loop below fills the data which is not in the patterns
for pos := g . nextPos ( false /* clean */ ) ; pos != 0 ; pos = g . nextPos ( false ) {
bufPos += int ( pos ) - 1 // Positions where to insert patterns are encoded relative to one another
if bufPos > lastUncovered {
dif := uint64 ( bufPos - lastUncovered )
copy ( buf [ lastUncovered : bufPos ] , g . data [ postLoopPos : postLoopPos + dif ] )
postLoopPos += dif
2021-10-16 09:43:41 +00:00
}
2022-03-09 17:25:22 +00:00
lastUncovered = bufPos + len ( g . nextPattern ( ) )
2021-11-07 07:32:01 +00:00
}
if int ( l ) > lastUncovered {
2022-03-09 17:25:22 +00:00
dif := l - uint64 ( lastUncovered )
copy ( buf [ lastUncovered : l ] , g . data [ postLoopPos : postLoopPos + dif ] )
postLoopPos += dif
2021-11-07 07:32:01 +00:00
}
2022-03-09 17:25:22 +00:00
g . dataP = postLoopPos
2022-03-18 09:10:18 +00:00
g . dataBit = 0
2022-03-09 17:25:22 +00:00
return buf , postLoopPos
}
func ( g * Getter ) NextUncompressed ( ) ( [ ] byte , uint64 ) {
l := g . nextPos ( true )
l -- // because when create huffman tree we do ++ , because 0 is terminator
if l == 0 {
2022-03-18 09:10:18 +00:00
if g . dataBit > 0 {
g . dataP ++
g . dataBit = 0
}
2022-03-09 17:25:22 +00:00
return g . data [ g . dataP : g . dataP ] , g . dataP
2021-10-16 09:43:41 +00:00
}
2022-03-13 22:46:17 +00:00
g . nextPos ( false )
2022-03-18 09:10:18 +00:00
if g . dataBit > 0 {
g . dataP ++
g . dataBit = 0
}
2022-03-09 17:25:22 +00:00
pos := g . dataP
g . dataP += l
return g . data [ pos : g . dataP ] , g . dataP
2021-10-16 09:43:41 +00:00
}
2022-01-24 09:18:08 +00:00
// Skip moves offset to the next word and returns the new offset.
func ( g * Getter ) Skip ( ) uint64 {
l := g . nextPos ( true )
l -- // because when create huffman tree we do ++ , because 0 is terminator
if l == 0 {
2022-03-18 09:10:18 +00:00
if g . dataBit > 0 {
g . dataP ++
g . dataBit = 0
}
2022-01-24 09:18:08 +00:00
return g . dataP
}
wordLen := int ( l )
var add uint64
2022-03-09 17:25:22 +00:00
var bufPos int
2022-01-24 09:18:08 +00:00
var lastUncovered int
2022-03-09 17:25:22 +00:00
for pos := g . nextPos ( false /* clean */ ) ; pos != 0 ; pos = g . nextPos ( false ) {
bufPos += int ( pos ) - 1
if wordLen < bufPos {
2022-02-09 06:22:45 +00:00
panic ( fmt . Sprintf ( "likely .idx is invalid: %s" , g . fName ) )
2022-01-24 09:18:08 +00:00
}
2022-03-09 17:25:22 +00:00
if bufPos > lastUncovered {
add += uint64 ( bufPos - lastUncovered )
2022-01-24 09:18:08 +00:00
}
2022-03-09 17:25:22 +00:00
lastUncovered = bufPos + len ( g . nextPattern ( ) )
2022-01-24 09:18:08 +00:00
}
2022-03-18 09:10:18 +00:00
if g . dataBit > 0 {
g . dataP ++
g . dataBit = 0
}
2022-01-24 09:18:08 +00:00
if int ( l ) > lastUncovered {
add += l - uint64 ( lastUncovered )
}
// Uncovered characters
g . dataP += add
return g . dataP
}
// Match returns true and next offset if the word at current offset fully matches the buf
// returns false and current offset otherwise.
func ( g * Getter ) Match ( buf [ ] byte ) ( bool , uint64 ) {
savePos := g . dataP
l := g . nextPos ( true )
l -- // because when create huffman tree we do ++ , because 0 is terminator
2022-01-29 11:12:38 +00:00
lenBuf := len ( buf )
2022-01-24 09:18:08 +00:00
if l == 0 {
2022-03-18 09:10:18 +00:00
if g . dataBit > 0 {
g . dataP ++
g . dataBit = 0
}
2022-01-29 11:12:38 +00:00
if lenBuf != 0 {
2022-03-18 09:10:18 +00:00
g . dataP , g . dataBit = savePos , 0
2022-01-29 11:12:38 +00:00
}
return lenBuf == 0 , g . dataP
2022-01-24 09:18:08 +00:00
}
2022-03-09 17:25:22 +00:00
var bufPos int
2022-01-29 11:12:38 +00:00
// In the first pass, we only check patterns
2022-03-09 17:25:22 +00:00
for pos := g . nextPos ( false /* clean */ ) ; pos != 0 ; pos = g . nextPos ( false ) {
bufPos += int ( pos ) - 1
pattern := g . nextPattern ( )
if lenBuf < bufPos + len ( pattern ) || ! bytes . Equal ( buf [ bufPos : bufPos + len ( pattern ) ] , pattern ) {
2022-03-18 09:10:18 +00:00
g . dataP , g . dataBit = savePos , 0
2022-03-09 17:25:22 +00:00
return false , savePos
2022-01-24 09:18:08 +00:00
}
2022-01-29 11:12:38 +00:00
}
2022-03-18 09:10:18 +00:00
if g . dataBit > 0 {
g . dataP ++
g . dataBit = 0
}
2022-01-29 11:12:38 +00:00
postLoopPos := g . dataP
2022-03-18 09:10:18 +00:00
g . dataP , g . dataBit = savePos , 0
2022-03-09 17:25:22 +00:00
g . nextPos ( true /* clean */ ) // Reset the state of huffman decoder
2022-01-29 11:12:38 +00:00
// Second pass - we check spaces not covered by the patterns
var lastUncovered int
2022-03-09 17:25:22 +00:00
bufPos = 0
for pos := g . nextPos ( false /* clean */ ) ; pos != 0 ; pos = g . nextPos ( false ) {
bufPos += int ( pos ) - 1
if bufPos > lastUncovered {
dif := uint64 ( bufPos - lastUncovered )
if lenBuf < bufPos || ! bytes . Equal ( buf [ lastUncovered : bufPos ] , g . data [ postLoopPos : postLoopPos + dif ] ) {
2022-03-18 09:10:18 +00:00
g . dataP , g . dataBit = savePos , 0
2022-03-09 17:25:22 +00:00
return false , savePos
2022-01-24 09:18:08 +00:00
}
2022-01-29 11:12:38 +00:00
postLoopPos += dif
2022-01-24 09:18:08 +00:00
}
2022-03-09 17:25:22 +00:00
lastUncovered = bufPos + len ( g . nextPattern ( ) )
2022-01-24 09:18:08 +00:00
}
if int ( l ) > lastUncovered {
dif := l - uint64 ( lastUncovered )
2022-03-09 17:25:22 +00:00
if lenBuf < int ( l ) || ! bytes . Equal ( buf [ lastUncovered : l ] , g . data [ postLoopPos : postLoopPos + dif ] ) {
2022-03-18 09:10:18 +00:00
g . dataP , g . dataBit = savePos , 0
2022-03-09 17:25:22 +00:00
return false , savePos
2022-01-24 09:18:08 +00:00
}
2022-01-29 11:12:38 +00:00
postLoopPos += dif
2022-01-24 09:18:08 +00:00
}
2022-03-09 17:25:22 +00:00
if lenBuf != int ( l ) {
2022-03-18 09:10:18 +00:00
g . dataP , g . dataBit = savePos , 0
2022-03-09 17:25:22 +00:00
return false , savePos
2022-01-24 09:18:08 +00:00
}
2022-03-18 09:10:18 +00:00
g . dataP , g . dataBit = postLoopPos , 0
2022-03-09 17:25:22 +00:00
return true , postLoopPos
2022-01-24 09:18:08 +00:00
}
// MatchPrefix only checks if the word at the current offset has a buf prefix. Does not move offset to the next word.
func ( g * Getter ) MatchPrefix ( buf [ ] byte ) bool {
savePos := g . dataP
defer func ( ) {
2022-03-18 09:10:18 +00:00
g . dataP , g . dataBit = savePos , 0
2022-01-24 09:18:08 +00:00
} ( )
2022-03-09 17:25:22 +00:00
l := g . nextPos ( true /* clean */ )
2022-01-24 09:18:08 +00:00
l -- // because when create huffman tree we do ++ , because 0 is terminator
2022-03-09 17:25:22 +00:00
lenBuf := len ( buf )
2022-01-24 09:18:08 +00:00
if l == 0 {
2022-03-18 09:10:18 +00:00
if g . dataBit > 0 {
g . dataP ++
g . dataBit = 0
}
2022-03-09 17:25:22 +00:00
if lenBuf != 0 {
2022-03-18 09:10:18 +00:00
g . dataP , g . dataBit = savePos , 0
2022-03-09 17:25:22 +00:00
}
return lenBuf == 0
2022-01-24 09:18:08 +00:00
}
2022-03-09 17:25:22 +00:00
var bufPos int
// In the first pass, we only check patterns
// Only run this loop as far as the prefix goes, there is no need to check further
for pos := g . nextPos ( false /* clean */ ) ; pos != 0 && bufPos < lenBuf ; pos = g . nextPos ( false ) {
bufPos += int ( pos ) - 1
pattern := g . nextPattern ( )
var comparisonLen int
if lenBuf < bufPos + len ( pattern ) {
comparisonLen = lenBuf - bufPos
} else {
comparisonLen = len ( pattern )
2022-01-24 09:18:08 +00:00
}
2022-03-09 17:25:22 +00:00
if ! bytes . Equal ( buf [ bufPos : bufPos + comparisonLen ] , pattern [ : comparisonLen ] ) {
return false
2022-01-24 09:18:08 +00:00
}
2022-03-09 17:25:22 +00:00
}
2022-03-18 09:10:18 +00:00
if g . dataBit > 0 {
g . dataP ++
g . dataBit = 0
}
2022-03-09 17:25:22 +00:00
postLoopPos := g . dataP
2022-03-18 09:10:18 +00:00
g . dataP , g . dataBit = savePos , 0
2022-03-09 17:25:22 +00:00
g . nextPos ( true /* clean */ ) // Reset the state of huffman decoder
// Second pass - we check spaces not covered by the patterns
var lastUncovered int
bufPos = 0
for pos := g . nextPos ( false /* clean */ ) ; pos != 0 && lastUncovered < lenBuf ; pos = g . nextPos ( false ) {
bufPos += int ( pos ) - 1
patternLen := len ( g . nextPattern ( ) )
if bufPos > lastUncovered {
dif := uint64 ( bufPos - lastUncovered )
var comparisonLen int
if lenBuf < lastUncovered + int ( dif ) {
comparisonLen = lenBuf - lastUncovered
} else {
comparisonLen = int ( dif )
}
if ! bytes . Equal ( buf [ lastUncovered : lastUncovered + comparisonLen ] , g . data [ postLoopPos : postLoopPos + uint64 ( comparisonLen ) ] ) {
return false
2022-01-24 09:18:08 +00:00
}
2022-03-09 17:25:22 +00:00
postLoopPos += dif
2022-01-24 09:18:08 +00:00
}
2022-03-09 17:25:22 +00:00
lastUncovered = bufPos + patternLen
2022-01-24 09:18:08 +00:00
}
2022-03-09 17:25:22 +00:00
if lenBuf > lastUncovered && int ( l ) > lastUncovered {
2022-01-24 09:18:08 +00:00
dif := l - uint64 ( lastUncovered )
2022-03-09 17:25:22 +00:00
var comparisonLen int
if lenBuf < int ( l ) {
comparisonLen = lenBuf - lastUncovered
} else {
comparisonLen = int ( dif )
}
if ! bytes . Equal ( buf [ lastUncovered : lastUncovered + comparisonLen ] , g . data [ postLoopPos : postLoopPos + uint64 ( comparisonLen ) ] ) {
return false
2022-01-24 09:18:08 +00:00
}
}
2022-03-09 17:25:22 +00:00
return true
2022-01-24 09:18:08 +00:00
}