2021-10-16 09:43:41 +00:00
/ *
2022-05-06 13:55:11 +00:00
Copyright 2022 Erigon contributors
2021-10-16 09:43:41 +00:00
Licensed under the Apache License , Version 2.0 ( the "License" ) ;
you may not use this file except in compliance with the License .
You may obtain a copy of the License at
http : //www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing , software
distributed under the License is distributed on an "AS IS" BASIS ,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND , either express or implied .
See the License for the specific language governing permissions and
limitations under the License .
* /
package compress
import (
2022-01-24 09:18:08 +00:00
"bytes"
2021-10-16 09:43:41 +00:00
"encoding/binary"
2021-11-15 14:19:56 +00:00
"fmt"
2021-10-16 09:43:41 +00:00
"os"
2022-05-27 01:20:53 +00:00
"github.com/ledgerwatch/erigon-lib/common/dbg"
2021-10-16 09:43:41 +00:00
"github.com/ledgerwatch/erigon-lib/mmap"
)
2022-08-22 02:11:56 +00:00
// word is the plain-text byte sequence that a dictionary code stands for.
type word []byte
2022-05-20 04:23:05 +00:00
type codeword struct {
pattern * word // Pattern corresponding to entries
ptr * patternTable // pointer to deeper level tables
2022-08-22 02:11:56 +00:00
next * codeword // points to next word in condensed table
2022-10-21 08:31:23 +00:00
code uint16 // code associated with that word
len byte // Number of bits in the codes
2022-05-20 04:23:05 +00:00
}
2022-03-18 09:10:18 +00:00
type patternTable struct {
2022-08-22 02:11:56 +00:00
head * codeword
2022-10-21 08:31:23 +00:00
patterns [ ] * codeword
bitLen int // Number of bits to lookup in the table
2022-08-22 02:11:56 +00:00
}
func newPatternTable ( bitLen int ) * patternTable {
pt := & patternTable {
2022-08-30 02:50:23 +00:00
bitLen : bitLen ,
2022-08-22 02:11:56 +00:00
}
if bitLen <= condensePatternTableBitThreshold {
pt . patterns = make ( [ ] * codeword , 1 << pt . bitLen )
}
return pt
}
// insertWord registers cw in the table.
//
// For a direct-lookup table, cw is stored in every slot whose low cw.len bits
// equal cw.code (a single slot when cw.len is 0 or equals the table's bitLen).
// A slot that is already occupied is overwritten in place rather than replaced.
//
// For a condensed table, cw is appended to the end of the linked list.
func (pt *patternTable) insertWord(cw *codeword) {
	if pt.bitLen > condensePatternTableBitThreshold {
		// Condensed table: append to the tail of the list.
		if pt.head == nil {
			cw.next = nil
			pt.head = cw
			return
		}
		tail := pt.head
		for tail.next != nil {
			tail = tail.next
		}
		cw.next = nil
		tail.next = cw
		return
	}

	step := uint16(1) << uint16(cw.len)
	first := cw.code
	limit := first + step
	if cw.len > 0 && int(cw.len) != pt.bitLen {
		// Code is shorter than the lookup width: replicate it across all
		// slots that share its low bits.
		limit = first | (uint16(1) << pt.bitLen)
	}
	for c := first; c < limit; c += step {
		slot := pt.patterns[c]
		if slot == nil {
			pt.patterns[c] = cw
			continue
		}
		slot.pattern, slot.len, slot.ptr, slot.code = cw.pattern, cw.len, nil, c
	}
}
func ( pt * patternTable ) condensedTableSearch ( code uint16 ) * codeword {
if pt . bitLen <= condensePatternTableBitThreshold {
return pt . patterns [ code ]
}
var prev * codeword
for cur := pt . head ; cur != nil ; prev , cur = cur , cur . next {
if cur . code == code {
if prev != nil {
prev . next = cur . next
cur . next = pt . head
pt . head = cur
}
return cur
}
d := code - cur . code
if d & 1 != 0 {
continue
}
if checkDistance ( int ( cur . len ) , int ( d ) ) {
if prev != nil {
prev . next = cur . next
cur . next = pt . head
pt . head = cur
}
return cur
}
}
return nil
2022-03-18 09:10:18 +00:00
}
// posTable is a lookup table for position codes. The three slices are
// parallel and indexed by code: a slot with lens[code] == 0 continues in
// ptrs[code]; otherwise pos[code] is the decoded value.
type posTable struct {
	pos    []uint64    // decoded value per code
	lens   []byte      // number of bits in the code; 0 means "follow ptrs"
	ptrs   []*posTable // deeper-level tables
	bitLen int         // number of bits consumed by one lookup in this table
}
2022-01-17 08:50:42 +00:00
// Decompressor provides access to the superstrings in a file produced by a compressor
2021-10-16 09:43:41 +00:00
type Decompressor struct {
2022-10-21 08:31:23 +00:00
f * os . File
mmapHandle2 * [ mmap . MaxMapSize ] byte // mmap handle for windows (this is used to close mmap)
dict * patternTable
posDict * posTable
compressedFile string
mmapHandle1 [ ] byte // mmap handle for unix (this is used to close mmap)
data [ ] byte // slice of correct size for the decompressor to work with
wordsStart uint64 // Offset of whether the superstrings actually start
size int64
wordsCount uint64
emptyWordsCount uint64
2021-10-16 09:43:41 +00:00
}
2022-08-22 02:11:56 +00:00
// condensePatternTableBitThreshold is the largest table bit length that still
// gets a direct-lookup table; tables with a greater bit length are condensed.
// Condensing reduces the size of the decompression tables but makes reads slower.
//
//   - 9 disables condensing entirely (tables larger than 2^9 are never used)
//   - 6 condenses only tables with more than 64 entries
//   - 0 condenses all tables
//
// Condensing tables of size [1 - 64] brings no benefit in terms of performance.
//
// Should be set before calling NewDecompressor.
var condensePatternTableBitThreshold = 9

// SetDecompressionTableCondensity sets the bit-length threshold above which
// pattern tables are condensed.
func SetDecompressionTableCondensity(fromBitSize int) { condensePatternTableBitThreshold = fromBitSize }
2021-10-16 09:43:41 +00:00
func NewDecompressor ( compressedFile string ) ( * Decompressor , error ) {
d := & Decompressor {
compressedFile : compressedFile ,
}
2022-05-27 01:20:53 +00:00
2021-10-16 09:43:41 +00:00
var err error
2022-05-27 01:20:53 +00:00
defer func ( ) {
if rec := recover ( ) ; rec != nil {
err = fmt . Errorf ( "decompressing file: %s, %+v, trace: %s" , compressedFile , rec , dbg . Stack ( ) )
}
} ( )
2021-10-16 09:43:41 +00:00
d . f , err = os . Open ( compressedFile )
if err != nil {
return nil , err
}
var stat os . FileInfo
if stat , err = d . f . Stat ( ) ; err != nil {
return nil , err
}
2022-01-31 22:32:00 +00:00
d . size = stat . Size ( )
2022-04-13 11:55:15 +00:00
if d . size < 32 {
2022-03-09 17:25:22 +00:00
return nil , fmt . Errorf ( "compressed file is too short: %d" , d . size )
2021-11-15 14:19:56 +00:00
}
2022-01-31 22:32:00 +00:00
if d . mmapHandle1 , d . mmapHandle2 , err = mmap . Mmap ( d . f , int ( d . size ) ) ; err != nil {
2021-10-16 09:43:41 +00:00
return nil , err
}
2022-05-20 04:23:05 +00:00
// read patterns from file
2022-01-31 22:32:00 +00:00
d . data = d . mmapHandle1 [ : d . size ]
2022-03-10 07:48:37 +00:00
d . wordsCount = binary . BigEndian . Uint64 ( d . data [ : 8 ] )
d . emptyWordsCount = binary . BigEndian . Uint64 ( d . data [ 8 : 16 ] )
dictSize := binary . BigEndian . Uint64 ( d . data [ 16 : 24 ] )
2022-04-13 11:55:15 +00:00
data := d . data [ 24 : 24 + dictSize ]
2022-08-22 02:11:56 +00:00
2022-04-13 11:55:15 +00:00
var depths [ ] uint64
var patterns [ ] [ ] byte
var i uint64
var patternMaxDepth uint64
2022-05-20 04:23:05 +00:00
2022-04-13 11:55:15 +00:00
for i < dictSize {
d , ns := binary . Uvarint ( data [ i : ] )
2022-08-22 06:04:01 +00:00
if d > 2048 {
return nil , fmt . Errorf ( "dictionary is invalid: patternMaxDepth=%d" , d )
}
2022-04-13 11:55:15 +00:00
depths = append ( depths , d )
if d > patternMaxDepth {
patternMaxDepth = d
}
i += uint64 ( ns )
l , n := binary . Uvarint ( data [ i : ] )
i += uint64 ( n )
patterns = append ( patterns , data [ i : i + l ] )
//fmt.Printf("depth = %d, pattern = [%x]\n", d, data[i:i+l])
i += l
}
2022-05-20 04:23:05 +00:00
2022-02-20 22:14:06 +00:00
if dictSize > 0 {
2022-03-18 09:10:18 +00:00
var bitLen int
2022-04-13 11:55:15 +00:00
if patternMaxDepth > 9 {
2022-03-18 09:10:18 +00:00
bitLen = 9
} else {
2022-04-13 11:55:15 +00:00
bitLen = int ( patternMaxDepth )
2022-03-18 09:10:18 +00:00
}
2022-08-22 02:11:56 +00:00
// fmt.Printf("pattern maxDepth=%d\n", tree.maxDepth)
d . dict = newPatternTable ( bitLen )
buildCondensedPatternTable ( d . dict , depths , patterns , 0 , 0 , 0 , patternMaxDepth )
2022-02-20 22:14:06 +00:00
}
2022-05-20 04:23:05 +00:00
// read positions
2022-04-13 11:55:15 +00:00
pos := 24 + dictSize
2021-10-16 09:43:41 +00:00
dictSize = binary . BigEndian . Uint64 ( d . data [ pos : pos + 8 ] )
2022-04-13 11:55:15 +00:00
data = d . data [ pos + 8 : pos + 8 + dictSize ]
2022-08-22 02:11:56 +00:00
2022-04-13 11:55:15 +00:00
var posDepths [ ] uint64
var poss [ ] uint64
var posMaxDepth uint64
2022-08-22 02:11:56 +00:00
2022-04-13 11:55:15 +00:00
i = 0
for i < dictSize {
d , ns := binary . Uvarint ( data [ i : ] )
2022-08-22 06:04:01 +00:00
if d > 2048 {
return nil , fmt . Errorf ( "dictionary is invalid: posMaxDepth=%d" , d )
}
2022-04-13 11:55:15 +00:00
posDepths = append ( posDepths , d )
if d > posMaxDepth {
posMaxDepth = d
}
i += uint64 ( ns )
pos , n := binary . Uvarint ( data [ i : ] )
i += uint64 ( n )
poss = append ( poss , pos )
}
2022-05-20 04:23:05 +00:00
2022-02-20 22:14:06 +00:00
if dictSize > 0 {
2022-03-18 09:10:18 +00:00
var bitLen int
2022-04-13 11:55:15 +00:00
if posMaxDepth > 9 {
2022-03-18 09:10:18 +00:00
bitLen = 9
} else {
2022-04-13 11:55:15 +00:00
bitLen = int ( posMaxDepth )
2022-03-18 09:10:18 +00:00
}
//fmt.Printf("pos maxDepth=%d\n", tree.maxDepth)
tableSize := 1 << bitLen
d . posDict = & posTable {
bitLen : bitLen ,
pos : make ( [ ] uint64 , tableSize ) ,
lens : make ( [ ] byte , tableSize ) ,
ptrs : make ( [ ] * posTable , tableSize ) ,
}
2022-04-13 11:55:15 +00:00
buildPosTable ( posDepths , poss , d . posDict , 0 , 0 , 0 , posMaxDepth )
2022-02-20 22:14:06 +00:00
}
2022-04-13 11:55:15 +00:00
d . wordsStart = pos + 8 + dictSize
2021-10-16 09:43:41 +00:00
return d , nil
}
2022-08-22 02:11:56 +00:00
func buildCondensedPatternTable ( table * patternTable , depths [ ] uint64 , patterns [ ] [ ] byte , code uint16 , bits int , depth uint64 , maxDepth uint64 ) int {
2022-04-13 11:55:15 +00:00
if len ( depths ) == 0 {
2022-08-22 02:11:56 +00:00
return 0
2022-02-20 22:14:06 +00:00
}
2022-04-13 11:55:15 +00:00
if depth == depths [ 0 ] {
2022-05-20 04:23:05 +00:00
pattern := word ( patterns [ 0 ] )
2022-04-13 11:55:15 +00:00
//fmt.Printf("depth=%d, maxDepth=%d, code=[%b], codeLen=%d, pattern=[%x]\n", depth, maxDepth, code, bits, pattern)
2022-08-22 02:11:56 +00:00
cw := & codeword { code : code , pattern : & pattern , len : byte ( bits ) , ptr : nil }
2022-05-17 05:38:48 +00:00
2022-08-22 02:11:56 +00:00
// table.patterns = append(table.patterns, cw)
table . insertWord ( cw )
return 1
2022-03-18 09:10:18 +00:00
}
2022-04-13 11:55:15 +00:00
if bits == 9 {
2022-03-18 09:10:18 +00:00
var bitLen int
2022-04-13 11:55:15 +00:00
if maxDepth > 9 {
2022-03-18 09:10:18 +00:00
bitLen = 9
} else {
2022-04-13 11:55:15 +00:00
bitLen = int ( maxDepth )
2022-03-18 09:10:18 +00:00
}
2022-08-22 02:11:56 +00:00
newTable := newPatternTable ( bitLen )
cw := & codeword { code : code , pattern : nil , len : byte ( 0 ) , ptr : newTable }
2022-05-20 04:23:05 +00:00
2022-08-22 02:11:56 +00:00
// table.patterns = append(table.patterns, &codeword{code: code, pattern: nil, len: byte(0), ptr: newTable})
table . insertWord ( cw )
return buildCondensedPatternTable ( newTable , depths , patterns , 0 , 0 , depth , maxDepth )
2022-08-01 05:37:10 +00:00
}
2022-08-22 02:11:56 +00:00
b0 := buildCondensedPatternTable ( table , depths , patterns , code , bits + 1 , depth + 1 , maxDepth - 1 )
return b0 + buildCondensedPatternTable ( table , depths [ b0 : ] , patterns [ b0 : ] , ( uint16 ( 1 ) << bits ) | code , bits + 1 , depth + 1 , maxDepth - 1 )
2022-03-18 09:10:18 +00:00
}
2022-04-13 11:55:15 +00:00
func buildPosTable ( depths [ ] uint64 , poss [ ] uint64 , table * posTable , code uint16 , bits int , depth uint64 , maxDepth uint64 ) int {
if len ( depths ) == 0 {
return 0
}
if depth == depths [ 0 ] {
p := poss [ 0 ]
//fmt.Printf("depth=%d, maxDepth=%d, code=[%b], codeLen=%d, pos=%d\n", depth, maxDepth, code, bits, p)
2022-08-10 12:08:09 +00:00
if table . bitLen == bits {
2022-04-13 11:55:15 +00:00
table . pos [ code ] = p
table . lens [ code ] = byte ( bits )
2022-03-18 09:10:18 +00:00
table . ptrs [ code ] = nil
} else {
2022-04-13 11:55:15 +00:00
codeStep := uint16 ( 1 ) << bits
2022-03-18 09:10:18 +00:00
codeFrom := code
codeTo := code | ( uint16 ( 1 ) << table . bitLen )
for c := codeFrom ; c < codeTo ; c += codeStep {
2022-04-13 11:55:15 +00:00
table . pos [ c ] = p
table . lens [ c ] = byte ( bits )
2022-03-18 09:10:18 +00:00
table . ptrs [ c ] = nil
}
}
2022-04-13 11:55:15 +00:00
return 1
2022-03-18 09:10:18 +00:00
}
2022-04-13 11:55:15 +00:00
if bits == 9 {
2022-03-18 09:10:18 +00:00
var bitLen int
2022-04-13 11:55:15 +00:00
if maxDepth > 9 {
2022-03-18 09:10:18 +00:00
bitLen = 9
} else {
2022-04-13 11:55:15 +00:00
bitLen = int ( maxDepth )
2022-03-18 09:10:18 +00:00
}
tableSize := 1 << bitLen
newTable := & posTable {
bitLen : bitLen ,
pos : make ( [ ] uint64 , tableSize ) ,
lens : make ( [ ] byte , tableSize ) ,
ptrs : make ( [ ] * posTable , tableSize ) ,
}
table . pos [ code ] = 0
table . lens [ code ] = byte ( 0 )
table . ptrs [ code ] = newTable
2022-04-13 11:55:15 +00:00
return buildPosTable ( depths , poss , newTable , 0 , 0 , depth , maxDepth )
2022-03-18 09:10:18 +00:00
}
2022-04-13 11:55:15 +00:00
b0 := buildPosTable ( depths , poss , table , code , bits + 1 , depth + 1 , maxDepth - 1 )
return b0 + buildPosTable ( depths [ b0 : ] , poss [ b0 : ] , table , ( uint16 ( 1 ) << bits ) | code , bits + 1 , depth + 1 , maxDepth - 1 )
2022-02-20 22:14:06 +00:00
}
2022-01-31 22:32:00 +00:00
// Size returns the size of the compressed file in bytes, as reported by Stat
// when the decompressor was opened.
func (d *Decompressor) Size() int64 { return d.size }
2021-10-16 09:43:41 +00:00
// Close unmaps the file and closes it. Both steps are always attempted so a
// failing unmap does not leak the file descriptor; if both fail, the unmap
// error is returned.
func (d *Decompressor) Close() error {
	unmapErr := mmap.Munmap(d.mmapHandle1, d.mmapHandle2)
	closeErr := d.f.Close()
	if unmapErr != nil {
		return unmapErr
	}
	return closeErr
}
2021-12-31 11:42:43 +00:00
// FilePath returns the path the decompressor was opened with.
func (d *Decompressor) FilePath() string {
	return d.compressedFile
}
2022-08-10 12:00:19 +00:00
// WithReadAhead - Expect read in sequential order. (Hence, pages in the given range can be aggressively read ahead, and may be freed soon after they are accessed.)
2022-02-01 04:19:11 +00:00
func ( d * Decompressor ) WithReadAhead ( f func ( ) error ) error {
_ = mmap . MadviseSequential ( d . mmapHandle1 )
2022-10-04 09:51:51 +00:00
//_ = mmap.MadviseWillNeed(d.mmapHandle1)
2022-02-01 04:19:11 +00:00
defer mmap . MadviseRandom ( d . mmapHandle1 )
return f ( )
}
2022-09-29 05:14:45 +00:00
// DisableReadAhead switches the mapping's advice back to random access.
// Usage: `defer d.EnableReadAhead().DisableReadAhead()`. Please don't use these
// funcs without `defer`, to avoid leaking the read-ahead setting.
func (d *Decompressor) DisableReadAhead() {
	_ = mmap.MadviseRandom(d.mmapHandle1)
}

// EnableReadAhead advises sequential access for the mapping and returns d so
// the call can be chained with DisableReadAhead.
func (d *Decompressor) EnableReadAhead() *Decompressor {
	_ = mmap.MadviseSequential(d.mmapHandle1)
	return d
}
2022-10-04 09:51:51 +00:00
// EnableMadvNormal applies mmap.MadviseNormal to the mapping, ignoring any
// error, and returns d for chaining.
func (d *Decompressor) EnableMadvNormal() *Decompressor {
	mapped := d.mmapHandle1
	_ = mmap.MadviseNormal(mapped)
	return d
}
// EnableWillNeed applies mmap.MadviseWillNeed to the mapping, ignoring any
// error, and returns d for chaining.
func (d *Decompressor) EnableWillNeed() *Decompressor {
	mapped := d.mmapHandle1
	_ = mmap.MadviseWillNeed(mapped)
	return d
}
2022-09-29 05:14:45 +00:00
2022-01-29 11:12:38 +00:00
// Getter represent "reader" or "interator" that can move accross the data of the decompressor
2022-07-02 18:38:34 +00:00
// The full state of the getter can be captured by saving dataP, and dataBit
2021-10-16 09:43:41 +00:00
type Getter struct {
2022-03-18 09:10:18 +00:00
patternDict * patternTable
posDict * posTable
2022-02-09 06:22:45 +00:00
fName string
2022-10-21 08:31:23 +00:00
data [ ] byte
dataP uint64
dataBit int // Value 0..7 - position of the bit
2022-05-18 07:36:01 +00:00
trace bool
2021-10-16 09:43:41 +00:00
}
2022-05-18 07:36:01 +00:00
// Trace sets the getter's trace flag.
func (g *Getter) Trace(t bool) {
	g.trace = t
}
2021-10-16 09:43:41 +00:00
func ( g * Getter ) nextPos ( clean bool ) uint64 {
if clean {
2022-03-18 09:10:18 +00:00
if g . dataBit > 0 {
g . dataP ++
g . dataBit = 0
}
}
table := g . posDict
if table . bitLen == 0 {
return table . pos [ 0 ]
}
var l byte
var pos uint64
for l == 0 {
code := uint16 ( g . data [ g . dataP ] ) >> g . dataBit
if 8 - g . dataBit < table . bitLen && int ( g . dataP ) + 1 < len ( g . data ) {
code |= uint16 ( g . data [ g . dataP + 1 ] ) << ( 8 - g . dataBit )
2021-10-16 09:43:41 +00:00
}
2022-03-18 09:10:18 +00:00
code &= ( uint16 ( 1 ) << table . bitLen ) - 1
l = table . lens [ code ]
if l == 0 {
table = table . ptrs [ code ]
g . dataBit += 9
2021-10-16 09:43:41 +00:00
} else {
2022-03-18 09:10:18 +00:00
g . dataBit += int ( l )
pos = table . pos [ code ]
2021-10-16 09:43:41 +00:00
}
2022-03-18 09:10:18 +00:00
g . dataP += uint64 ( g . dataBit / 8 )
g . dataBit = g . dataBit % 8
2021-10-16 09:43:41 +00:00
}
2022-03-18 09:10:18 +00:00
return pos
2021-10-16 09:43:41 +00:00
}
func ( g * Getter ) nextPattern ( ) [ ] byte {
2022-03-18 09:10:18 +00:00
table := g . patternDict
2022-08-22 02:11:56 +00:00
2022-03-18 09:10:18 +00:00
if table . bitLen == 0 {
2022-05-20 04:23:05 +00:00
return * table . patterns [ 0 ] . pattern
2022-03-18 09:10:18 +00:00
}
2022-08-22 02:11:56 +00:00
2022-03-18 09:10:18 +00:00
var l byte
var pattern [ ] byte
for l == 0 {
code := uint16 ( g . data [ g . dataP ] ) >> g . dataBit
if 8 - g . dataBit < table . bitLen && int ( g . dataP ) + 1 < len ( g . data ) {
code |= uint16 ( g . data [ g . dataP + 1 ] ) << ( 8 - g . dataBit )
2021-10-16 09:43:41 +00:00
}
2022-03-18 09:10:18 +00:00
code &= ( uint16 ( 1 ) << table . bitLen ) - 1
2022-08-22 02:11:56 +00:00
cw := table . condensedTableSearch ( code )
2022-05-20 04:23:05 +00:00
l = cw . len
2022-03-18 09:10:18 +00:00
if l == 0 {
2022-05-20 04:23:05 +00:00
table = cw . ptr
2022-03-18 09:10:18 +00:00
g . dataBit += 9
2021-10-16 09:43:41 +00:00
} else {
2022-03-18 09:10:18 +00:00
g . dataBit += int ( l )
2022-05-20 04:23:05 +00:00
pattern = * cw . pattern
2021-10-16 09:43:41 +00:00
}
2022-03-18 09:10:18 +00:00
g . dataP += uint64 ( g . dataBit / 8 )
g . dataBit = g . dataBit % 8
2021-10-16 09:43:41 +00:00
}
2022-03-18 09:10:18 +00:00
return pattern
2021-10-16 09:43:41 +00:00
}
2022-08-22 02:11:56 +00:00
// condensedWordDistances holds, per code length, the code distances accepted
// by checkDistance. It is computed once at package initialisation.
var condensedWordDistances = buildCondensedWordDistances()

// checkDistance reports whether d is one of the distances listed in
// condensedWordDistances for a code of the given length (power).
func checkDistance(power int, d int) bool {
	allowed := condensedWordDistances[power]
	for i := 0; i < len(allowed); i++ {
		if allowed[i] == d {
			return true
		}
	}
	return false
}
// buildCondensedWordDistances returns a 10-element table indexed by code
// length. Entry i (1 <= i <= 9) lists every positive multiple of 2^i that is
// below 512, in ascending order; entry 0 is left nil.
func buildCondensedWordDistances() [][]int {
	table := make([][]int, 10)
	for power := 1; power < 10; power++ {
		step := 1 << power
		row := make([]int, 0)
		for dist := step; dist < 512; dist += step {
			row = append(row, dist)
		}
		table[power] = row
	}
	return table
}
2022-07-23 08:06:52 +00:00
// Size returns the length in bytes of the data the getter iterates over.
func (g *Getter) Size() int { return len(g.data) }
2022-03-10 07:48:37 +00:00
// Count returns the words count read from the file header.
func (d *Decompressor) Count() int {
	return int(d.wordsCount)
}

// EmptyWordsCount returns the empty words count read from the file header.
func (d *Decompressor) EmptyWordsCount() int {
	return int(d.emptyWordsCount)
}
2021-11-19 15:00:55 +00:00
2022-01-17 08:50:42 +00:00
// MakeGetter creates an object that can be used to access superstrings in the decompressor's file
2022-01-24 09:18:08 +00:00
// Getter is not thread-safe, but there can be multiple getters used simultaneously and concurrently
2021-10-16 09:43:41 +00:00
// for the same decompressor
func ( d * Decompressor ) MakeGetter ( ) * Getter {
2022-08-22 02:11:56 +00:00
return & Getter {
posDict : d . posDict ,
data : d . data [ d . wordsStart : ] ,
patternDict : d . dict ,
fName : d . compressedFile ,
}
2021-10-16 09:43:41 +00:00
}
func ( g * Getter ) Reset ( offset uint64 ) {
g . dataP = offset
2022-03-18 09:10:18 +00:00
g . dataBit = 0
2021-10-16 09:43:41 +00:00
}
// HasNext reports whether the current offset is still inside the data.
func (g *Getter) HasNext() bool {
	remaining := uint64(len(g.data))
	return g.dataP < remaining
}
// Next extracts a compressed word from current offset in the file
// and appends it to the given buf, returning the result of appending
// After extracting next word, it moves to the beginning of the next one
func ( g * Getter ) Next ( buf [ ] byte ) ( [ ] byte , uint64 ) {
2022-10-20 05:51:12 +00:00
defer func ( ) {
if rec := recover ( ) ; rec != nil {
panic ( fmt . Sprintf ( "file: %s, %s, %s" , g . fName , rec , dbg . Stack ( ) ) )
}
} ( )
2022-03-09 17:25:22 +00:00
savePos := g . dataP
2022-05-27 01:20:53 +00:00
wordLen := g . nextPos ( true )
wordLen -- // because when create huffman tree we do ++ , because 0 is terminator
if wordLen == 0 {
2022-03-18 09:10:18 +00:00
if g . dataBit > 0 {
g . dataP ++
g . dataBit = 0
}
2021-11-07 07:32:01 +00:00
return buf , g . dataP
}
2022-03-09 17:25:22 +00:00
bufPos := len ( buf ) // Tracking position in buf where to insert part of the word
lastUncovered := len ( buf )
2022-05-27 01:20:53 +00:00
if len ( buf ) + int ( wordLen ) > cap ( buf ) {
newBuf := make ( [ ] byte , len ( buf ) + int ( wordLen ) )
2022-03-09 17:25:22 +00:00
copy ( newBuf , buf )
buf = newBuf
} else {
// Expand buffer
2022-05-27 01:20:53 +00:00
buf = buf [ : len ( buf ) + int ( wordLen ) ]
2021-11-07 07:32:01 +00:00
}
2022-03-09 17:25:22 +00:00
// Loop below fills in the patterns
for pos := g . nextPos ( false /* clean */ ) ; pos != 0 ; pos = g . nextPos ( false ) {
bufPos += int ( pos ) - 1 // Positions where to insert patterns are encoded relative to one another
2022-08-22 02:11:56 +00:00
pt := g . nextPattern ( )
copy ( buf [ bufPos : ] , pt )
2022-03-09 17:25:22 +00:00
}
2022-03-18 09:10:18 +00:00
if g . dataBit > 0 {
g . dataP ++
g . dataBit = 0
}
2022-03-09 17:25:22 +00:00
postLoopPos := g . dataP
g . dataP = savePos
2022-03-18 09:10:18 +00:00
g . dataBit = 0
2022-03-09 17:25:22 +00:00
g . nextPos ( true /* clean */ ) // Reset the state of huffman reader
bufPos = lastUncovered // Restore to the beginning of buf
// Loop below fills the data which is not in the patterns
2022-05-20 04:23:05 +00:00
for pos := g . nextPos ( false ) ; pos != 0 ; pos = g . nextPos ( false ) {
2022-03-09 17:25:22 +00:00
bufPos += int ( pos ) - 1 // Positions where to insert patterns are encoded relative to one another
if bufPos > lastUncovered {
dif := uint64 ( bufPos - lastUncovered )
copy ( buf [ lastUncovered : bufPos ] , g . data [ postLoopPos : postLoopPos + dif ] )
postLoopPos += dif
2021-10-16 09:43:41 +00:00
}
2022-03-09 17:25:22 +00:00
lastUncovered = bufPos + len ( g . nextPattern ( ) )
2021-11-07 07:32:01 +00:00
}
2022-05-27 01:20:53 +00:00
if int ( wordLen ) > lastUncovered {
dif := wordLen - uint64 ( lastUncovered )
copy ( buf [ lastUncovered : wordLen ] , g . data [ postLoopPos : postLoopPos + dif ] )
2022-03-09 17:25:22 +00:00
postLoopPos += dif
2021-11-07 07:32:01 +00:00
}
2022-03-09 17:25:22 +00:00
g . dataP = postLoopPos
2022-03-18 09:10:18 +00:00
g . dataBit = 0
2022-03-09 17:25:22 +00:00
return buf , postLoopPos
}
func ( g * Getter ) NextUncompressed ( ) ( [ ] byte , uint64 ) {
2022-10-20 05:51:12 +00:00
defer func ( ) {
if rec := recover ( ) ; rec != nil {
panic ( fmt . Sprintf ( "file: %s, %s, %s" , g . fName , rec , dbg . Stack ( ) ) )
}
} ( )
2022-05-27 01:20:53 +00:00
wordLen := g . nextPos ( true )
wordLen -- // because when create huffman tree we do ++ , because 0 is terminator
if wordLen == 0 {
2022-03-18 09:10:18 +00:00
if g . dataBit > 0 {
g . dataP ++
g . dataBit = 0
}
2022-03-09 17:25:22 +00:00
return g . data [ g . dataP : g . dataP ] , g . dataP
2021-10-16 09:43:41 +00:00
}
2022-03-13 22:46:17 +00:00
g . nextPos ( false )
2022-03-18 09:10:18 +00:00
if g . dataBit > 0 {
g . dataP ++
g . dataBit = 0
}
2022-03-09 17:25:22 +00:00
pos := g . dataP
2022-05-27 01:20:53 +00:00
g . dataP += wordLen
2022-03-09 17:25:22 +00:00
return g . data [ pos : g . dataP ] , g . dataP
2021-10-16 09:43:41 +00:00
}
2022-01-24 09:18:08 +00:00
// Skip moves offset to the next word and returns the new offset.
func ( g * Getter ) Skip ( ) uint64 {
l := g . nextPos ( true )
l -- // because when create huffman tree we do ++ , because 0 is terminator
if l == 0 {
2022-03-18 09:10:18 +00:00
if g . dataBit > 0 {
g . dataP ++
g . dataBit = 0
}
2022-01-24 09:18:08 +00:00
return g . dataP
}
wordLen := int ( l )
var add uint64
2022-03-09 17:25:22 +00:00
var bufPos int
2022-01-24 09:18:08 +00:00
var lastUncovered int
2022-03-09 17:25:22 +00:00
for pos := g . nextPos ( false /* clean */ ) ; pos != 0 ; pos = g . nextPos ( false ) {
bufPos += int ( pos ) - 1
if wordLen < bufPos {
2022-02-09 06:22:45 +00:00
panic ( fmt . Sprintf ( "likely .idx is invalid: %s" , g . fName ) )
2022-01-24 09:18:08 +00:00
}
2022-03-09 17:25:22 +00:00
if bufPos > lastUncovered {
add += uint64 ( bufPos - lastUncovered )
2022-01-24 09:18:08 +00:00
}
2022-03-09 17:25:22 +00:00
lastUncovered = bufPos + len ( g . nextPattern ( ) )
2022-01-24 09:18:08 +00:00
}
2022-03-18 09:10:18 +00:00
if g . dataBit > 0 {
g . dataP ++
g . dataBit = 0
}
2022-01-24 09:18:08 +00:00
if int ( l ) > lastUncovered {
add += l - uint64 ( lastUncovered )
}
// Uncovered characters
g . dataP += add
return g . dataP
}
2022-06-17 11:39:49 +00:00
// SkipUncompressed moves the offset past the word at the current position
// and returns the new offset. Like NextUncompressed, it reads the word length,
// skips one further position code and then jumps over wordLen raw bytes.
func (g *Getter) SkipUncompressed() uint64 {
	size := g.nextPos(true)
	size-- // because when create huffman tree we do ++ , because 0 is terminator
	if size != 0 {
		g.nextPos(false)
	}
	if g.dataBit > 0 {
		g.dataP++
		g.dataBit = 0
	}
	if size == 0 {
		return g.dataP
	}
	g.dataP += size
	return g.dataP
}
2022-01-24 09:18:08 +00:00
// Match returns true and next offset if the word at current offset fully matches the buf
// returns false and current offset otherwise.
func ( g * Getter ) Match ( buf [ ] byte ) ( bool , uint64 ) {
savePos := g . dataP
2022-05-27 01:20:53 +00:00
wordLen := g . nextPos ( true )
wordLen -- // because when create huffman tree we do ++ , because 0 is terminator
2022-01-29 11:12:38 +00:00
lenBuf := len ( buf )
2022-05-27 01:20:53 +00:00
if wordLen == 0 || int ( wordLen ) != lenBuf {
2022-03-18 09:10:18 +00:00
if g . dataBit > 0 {
g . dataP ++
g . dataBit = 0
}
2022-01-29 11:12:38 +00:00
if lenBuf != 0 {
2022-03-18 09:10:18 +00:00
g . dataP , g . dataBit = savePos , 0
2022-01-29 11:12:38 +00:00
}
2022-05-27 01:20:53 +00:00
return lenBuf == int ( wordLen ) , g . dataP
2022-01-24 09:18:08 +00:00
}
2022-03-09 17:25:22 +00:00
var bufPos int
2022-01-29 11:12:38 +00:00
// In the first pass, we only check patterns
2022-03-09 17:25:22 +00:00
for pos := g . nextPos ( false /* clean */ ) ; pos != 0 ; pos = g . nextPos ( false ) {
bufPos += int ( pos ) - 1
pattern := g . nextPattern ( )
if lenBuf < bufPos + len ( pattern ) || ! bytes . Equal ( buf [ bufPos : bufPos + len ( pattern ) ] , pattern ) {
2022-03-18 09:10:18 +00:00
g . dataP , g . dataBit = savePos , 0
2022-03-09 17:25:22 +00:00
return false , savePos
2022-01-24 09:18:08 +00:00
}
2022-01-29 11:12:38 +00:00
}
2022-03-18 09:10:18 +00:00
if g . dataBit > 0 {
g . dataP ++
g . dataBit = 0
}
2022-01-29 11:12:38 +00:00
postLoopPos := g . dataP
2022-03-18 09:10:18 +00:00
g . dataP , g . dataBit = savePos , 0
2022-03-09 17:25:22 +00:00
g . nextPos ( true /* clean */ ) // Reset the state of huffman decoder
2022-01-29 11:12:38 +00:00
// Second pass - we check spaces not covered by the patterns
var lastUncovered int
2022-03-09 17:25:22 +00:00
bufPos = 0
for pos := g . nextPos ( false /* clean */ ) ; pos != 0 ; pos = g . nextPos ( false ) {
bufPos += int ( pos ) - 1
if bufPos > lastUncovered {
dif := uint64 ( bufPos - lastUncovered )
if lenBuf < bufPos || ! bytes . Equal ( buf [ lastUncovered : bufPos ] , g . data [ postLoopPos : postLoopPos + dif ] ) {
2022-03-18 09:10:18 +00:00
g . dataP , g . dataBit = savePos , 0
2022-03-09 17:25:22 +00:00
return false , savePos
2022-01-24 09:18:08 +00:00
}
2022-01-29 11:12:38 +00:00
postLoopPos += dif
2022-01-24 09:18:08 +00:00
}
2022-03-09 17:25:22 +00:00
lastUncovered = bufPos + len ( g . nextPattern ( ) )
2022-01-24 09:18:08 +00:00
}
2022-05-27 01:20:53 +00:00
if int ( wordLen ) > lastUncovered {
dif := wordLen - uint64 ( lastUncovered )
if lenBuf < int ( wordLen ) || ! bytes . Equal ( buf [ lastUncovered : wordLen ] , g . data [ postLoopPos : postLoopPos + dif ] ) {
2022-03-18 09:10:18 +00:00
g . dataP , g . dataBit = savePos , 0
2022-03-09 17:25:22 +00:00
return false , savePos
2022-01-24 09:18:08 +00:00
}
2022-01-29 11:12:38 +00:00
postLoopPos += dif
2022-01-24 09:18:08 +00:00
}
2022-05-27 01:20:53 +00:00
if lenBuf != int ( wordLen ) {
2022-03-18 09:10:18 +00:00
g . dataP , g . dataBit = savePos , 0
2022-03-09 17:25:22 +00:00
return false , savePos
2022-01-24 09:18:08 +00:00
}
2022-03-18 09:10:18 +00:00
g . dataP , g . dataBit = postLoopPos , 0
2022-03-09 17:25:22 +00:00
return true , postLoopPos
2022-01-24 09:18:08 +00:00
}
// MatchPrefix only checks if the word at the current offset has a buf prefix. Does not move offset to the next word.
2022-05-16 19:59:29 +00:00
func ( g * Getter ) MatchPrefix ( prefix [ ] byte ) bool {
2022-01-24 09:18:08 +00:00
savePos := g . dataP
defer func ( ) {
2022-03-18 09:10:18 +00:00
g . dataP , g . dataBit = savePos , 0
2022-01-24 09:18:08 +00:00
} ( )
2022-03-09 17:25:22 +00:00
2022-05-27 01:20:53 +00:00
wordLen := g . nextPos ( true /* clean */ )
wordLen -- // because when create huffman tree we do ++ , because 0 is terminator
2022-05-16 19:59:29 +00:00
prefixLen := len ( prefix )
2022-05-27 01:20:53 +00:00
if wordLen == 0 || int ( wordLen ) < prefixLen {
2022-03-18 09:10:18 +00:00
if g . dataBit > 0 {
g . dataP ++
g . dataBit = 0
}
2022-05-16 19:59:29 +00:00
if prefixLen != 0 {
2022-03-18 09:10:18 +00:00
g . dataP , g . dataBit = savePos , 0
2022-03-09 17:25:22 +00:00
}
2022-05-27 01:20:53 +00:00
return prefixLen == int ( wordLen )
2022-01-24 09:18:08 +00:00
}
2022-05-18 07:36:01 +00:00
var bufPos int
2022-03-09 17:25:22 +00:00
// In the first pass, we only check patterns
// Only run this loop as far as the prefix goes, there is no need to check further
2022-05-18 07:36:01 +00:00
for pos := g . nextPos ( false /* clean */ ) ; pos != 0 ; pos = g . nextPos ( false ) {
bufPos += int ( pos ) - 1
2022-03-09 17:25:22 +00:00
pattern := g . nextPattern ( )
var comparisonLen int
2022-05-18 07:36:01 +00:00
if prefixLen < bufPos + len ( pattern ) {
comparisonLen = prefixLen - bufPos
2022-03-09 17:25:22 +00:00
} else {
comparisonLen = len ( pattern )
2022-01-24 09:18:08 +00:00
}
2022-05-18 07:36:01 +00:00
if bufPos < prefixLen {
if ! bytes . Equal ( prefix [ bufPos : bufPos + comparisonLen ] , pattern [ : comparisonLen ] ) {
return false
}
2022-01-24 09:18:08 +00:00
}
2022-03-09 17:25:22 +00:00
}
2022-05-18 07:36:01 +00:00
2022-03-18 09:10:18 +00:00
if g . dataBit > 0 {
g . dataP ++
g . dataBit = 0
}
2022-03-09 17:25:22 +00:00
postLoopPos := g . dataP
2022-03-18 09:10:18 +00:00
g . dataP , g . dataBit = savePos , 0
2022-03-09 17:25:22 +00:00
g . nextPos ( true /* clean */ ) // Reset the state of huffman decoder
// Second pass - we check spaces not covered by the patterns
var lastUncovered int
2022-05-18 07:36:01 +00:00
bufPos = 0
2022-05-19 05:27:36 +00:00
for pos := g . nextPos ( false /* clean */ ) ; pos != 0 && lastUncovered < prefixLen ; pos = g . nextPos ( false ) {
2022-05-18 07:36:01 +00:00
bufPos += int ( pos ) - 1
if bufPos > lastUncovered {
dif := uint64 ( bufPos - lastUncovered )
2022-03-09 17:25:22 +00:00
var comparisonLen int
2022-05-16 19:59:29 +00:00
if prefixLen < lastUncovered + int ( dif ) {
comparisonLen = prefixLen - lastUncovered
2022-03-09 17:25:22 +00:00
} else {
comparisonLen = int ( dif )
}
2022-05-19 05:27:36 +00:00
if ! bytes . Equal ( prefix [ lastUncovered : lastUncovered + comparisonLen ] , g . data [ postLoopPos : postLoopPos + uint64 ( comparisonLen ) ] ) {
return false
2022-01-24 09:18:08 +00:00
}
2022-03-09 17:25:22 +00:00
postLoopPos += dif
2022-01-24 09:18:08 +00:00
}
2022-05-18 07:36:01 +00:00
lastUncovered = bufPos + len ( g . nextPattern ( ) )
2022-01-24 09:18:08 +00:00
}
2022-05-27 01:20:53 +00:00
if prefixLen > lastUncovered && int ( wordLen ) > lastUncovered {
dif := wordLen - uint64 ( lastUncovered )
2022-03-09 17:25:22 +00:00
var comparisonLen int
2022-05-27 01:20:53 +00:00
if prefixLen < int ( wordLen ) {
2022-05-16 19:59:29 +00:00
comparisonLen = prefixLen - lastUncovered
2022-03-09 17:25:22 +00:00
} else {
comparisonLen = int ( dif )
}
2022-05-16 19:59:29 +00:00
if ! bytes . Equal ( prefix [ lastUncovered : lastUncovered + comparisonLen ] , g . data [ postLoopPos : postLoopPos + uint64 ( comparisonLen ) ] ) {
2022-03-09 17:25:22 +00:00
return false
2022-01-24 09:18:08 +00:00
}
}
2022-03-09 17:25:22 +00:00
return true
2022-01-24 09:18:08 +00:00
}