mirror of
https://gitlab.com/pulsechaincom/erigon-pulse.git
synced 2024-12-27 22:28:21 +00:00
reduced memory footprint on building huffman table (#459)
This commit is contained in:
parent
59d810650c
commit
6de4ac4ba9
@ -25,11 +25,15 @@ import (
|
||||
"github.com/ledgerwatch/erigon-lib/mmap"
|
||||
)
|
||||
|
||||
// codeword is one slot of a patternTable. A slot either carries a pattern
// together with the bit length of its code, or (with len == 0 and a nil
// pattern) points at a deeper-level table through ptr.
type codeword struct {
|
||||
len byte // Number of bits in the code; 0 when the slot only links to a deeper table via ptr
|
||||
pattern *word // Pattern corresponding to this entry; nil when the slot only links to a deeper table
|
||||
ptr *patternTable // pointer to the deeper level table; nil when the slot carries a pattern
|
||||
}
|
||||
|
||||
// patternTable is one level of the lookup table used to decode patterns.
// A table is allocated with 1 << bitLen slots and indexed by a code.
//
// NOTE(review): bitLen and patterns each appear twice in this span, which
// looks like the removed field set (patterns/lens/ptrs as parallel slices)
// and the added field set (a single []*codeword) shown together — confirm
// which set is current before relying on this listing.
type patternTable struct {
|
||||
bitLen int // Number of bits to lookup in the table
|
||||
patterns []*word // Patterns corresponding to entries
|
||||
lens []byte // Number of bits in the codes
|
||||
ptrs []*patternTable // pointers to deeper level tables
|
||||
bitLen int // Number of bits to lookup in the table
|
||||
// One codeword per slot; each codeword bundles the pattern, its code length
// and the pointer to a deeper table.
patterns []*codeword
|
||||
}
|
||||
|
||||
type posTable struct {
|
||||
@ -74,6 +78,8 @@ func NewDecompressor(compressedFile string) (*Decompressor, error) {
|
||||
if d.mmapHandle1, d.mmapHandle2, err = mmap.Mmap(d.f, int(d.size)); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// read patterns from file
|
||||
d.data = d.mmapHandle1[:d.size]
|
||||
d.wordsCount = binary.BigEndian.Uint64(d.data[:8])
|
||||
d.emptyWordsCount = binary.BigEndian.Uint64(d.data[8:16])
|
||||
@ -83,6 +89,7 @@ func NewDecompressor(compressedFile string) (*Decompressor, error) {
|
||||
var patterns [][]byte
|
||||
var i uint64
|
||||
var patternMaxDepth uint64
|
||||
|
||||
//fmt.Printf("[decomp] dictSize = %d\n", dictSize)
|
||||
for i < dictSize {
|
||||
d, ns := binary.Uvarint(data[i:])
|
||||
@ -97,6 +104,7 @@ func NewDecompressor(compressedFile string) (*Decompressor, error) {
|
||||
//fmt.Printf("depth = %d, pattern = [%x]\n", d, data[i:i+l])
|
||||
i += l
|
||||
}
|
||||
|
||||
if dictSize > 0 {
|
||||
var bitLen int
|
||||
if patternMaxDepth > 9 {
|
||||
@ -108,12 +116,12 @@ func NewDecompressor(compressedFile string) (*Decompressor, error) {
|
||||
tableSize := 1 << bitLen
|
||||
d.dict = &patternTable{
|
||||
bitLen: bitLen,
|
||||
patterns: make([]*word, tableSize),
|
||||
lens: make([]byte, tableSize),
|
||||
ptrs: make([]*patternTable, tableSize),
|
||||
patterns: make([]*codeword, tableSize),
|
||||
}
|
||||
buildPatternTable(depths, patterns, d.dict, 0, 0, 0, patternMaxDepth)
|
||||
buildPatternTable(d.dict, depths, patterns, 0, 0, 0, patternMaxDepth)
|
||||
}
|
||||
|
||||
// read positions
|
||||
pos := 24 + dictSize
|
||||
dictSize = binary.BigEndian.Uint64(d.data[pos : pos+8])
|
||||
data = d.data[pos+8 : pos+8+dictSize]
|
||||
@ -133,6 +141,7 @@ func NewDecompressor(compressedFile string) (*Decompressor, error) {
|
||||
i += uint64(n)
|
||||
poss = append(poss, pos)
|
||||
}
|
||||
|
||||
if dictSize > 0 {
|
||||
var bitLen int
|
||||
if posMaxDepth > 9 {
|
||||
@ -154,30 +163,29 @@ func NewDecompressor(compressedFile string) (*Decompressor, error) {
|
||||
return d, nil
|
||||
}
|
||||
|
||||
type word []byte
|
||||
type word []byte // plain text word associated with code from dictionary
|
||||
|
||||
// returns the number of depths and patterns consumed
|
||||
func buildPatternTable(depths []uint64, patterns [][]byte, table *patternTable, code uint16, bits int, depth uint64, maxDepth uint64) int {
|
||||
func buildPatternTable(table *patternTable, depths []uint64, patterns [][]byte, code uint16, bits int, depth uint64, maxDepth uint64) int {
|
||||
if len(depths) == 0 {
|
||||
return 0
|
||||
}
|
||||
if depth == depths[0] {
|
||||
pattern := word(make([]byte, len(patterns[0])))
|
||||
copy(pattern, patterns[0])
|
||||
pattern := word(patterns[0])
|
||||
//fmt.Printf("depth=%d, maxDepth=%d, code=[%b], codeLen=%d, pattern=[%x]\n", depth, maxDepth, code, bits, pattern)
|
||||
if table.bitLen == int(bits) {
|
||||
table.patterns[code] = &pattern
|
||||
table.lens[code] = byte(bits)
|
||||
table.ptrs[code] = nil
|
||||
} else {
|
||||
codeStep := uint16(1) << bits
|
||||
codeFrom := code
|
||||
codeTo := code | (uint16(1) << table.bitLen)
|
||||
|
||||
for c := codeFrom; c < codeTo; c += codeStep {
|
||||
table.patterns[c] = &pattern
|
||||
table.lens[c] = byte(bits)
|
||||
table.ptrs[c] = nil
|
||||
codeStep := uint16(1) << bits
|
||||
codeFrom, codeTo := code, code+codeStep
|
||||
if table.bitLen != bits {
|
||||
codeTo = code | (uint16(1) << table.bitLen)
|
||||
}
|
||||
|
||||
cw := &codeword{pattern: &pattern, len: byte(bits), ptr: nil}
|
||||
for c := codeFrom; c < codeTo; c += codeStep {
|
||||
if p := table.patterns[c]; p == nil {
|
||||
table.patterns[c] = cw
|
||||
} else {
|
||||
p.pattern, p.len, p.ptr = &pattern, byte(bits), nil
|
||||
}
|
||||
}
|
||||
return 1
|
||||
@ -192,17 +200,14 @@ func buildPatternTable(depths []uint64, patterns [][]byte, table *patternTable,
|
||||
tableSize := 1 << bitLen
|
||||
newTable := &patternTable{
|
||||
bitLen: bitLen,
|
||||
patterns: make([]*word, tableSize),
|
||||
lens: make([]byte, tableSize),
|
||||
ptrs: make([]*patternTable, tableSize),
|
||||
patterns: make([]*codeword, tableSize),
|
||||
}
|
||||
table.patterns[code] = nil
|
||||
table.lens[code] = byte(0)
|
||||
table.ptrs[code] = newTable
|
||||
return buildPatternTable(depths, patterns, newTable, 0, 0, depth, maxDepth)
|
||||
|
||||
table.patterns[code] = &codeword{pattern: nil, len: byte(0), ptr: newTable}
|
||||
return buildPatternTable(newTable, depths, patterns, 0, 0, depth, maxDepth)
|
||||
}
|
||||
b0 := buildPatternTable(depths, patterns, table, code, bits+1, depth+1, maxDepth-1)
|
||||
return b0 + buildPatternTable(depths[b0:], patterns[b0:], table, (uint16(1)<<bits)|code, bits+1, depth+1, maxDepth-1)
|
||||
b0 := buildPatternTable(table, depths, patterns, code, bits+1, depth+1, maxDepth-1)
|
||||
return b0 + buildPatternTable(table, depths[b0:], patterns[b0:], (uint16(1)<<bits)|code, bits+1, depth+1, maxDepth-1)
|
||||
}
|
||||
|
||||
func buildPosTable(depths []uint64, poss []uint64, table *posTable, code uint16, bits int, depth uint64, maxDepth uint64) int {
|
||||
@ -324,7 +329,7 @@ func (g *Getter) nextPos(clean bool) uint64 {
|
||||
func (g *Getter) nextPattern() []byte {
|
||||
table := g.patternDict
|
||||
if table.bitLen == 0 {
|
||||
return *table.patterns[0]
|
||||
return *table.patterns[0].pattern
|
||||
}
|
||||
var l byte
|
||||
var pattern []byte
|
||||
@ -334,13 +339,14 @@ func (g *Getter) nextPattern() []byte {
|
||||
code |= uint16(g.data[g.dataP+1]) << (8 - g.dataBit)
|
||||
}
|
||||
code &= (uint16(1) << table.bitLen) - 1
|
||||
l = table.lens[code]
|
||||
cw := table.patterns[code]
|
||||
l = cw.len
|
||||
if l == 0 {
|
||||
table = table.ptrs[code]
|
||||
table = cw.ptr
|
||||
g.dataBit += 9
|
||||
} else {
|
||||
g.dataBit += int(l)
|
||||
pattern = *table.patterns[code]
|
||||
pattern = *cw.pattern
|
||||
}
|
||||
g.dataP += uint64(g.dataBit / 8)
|
||||
g.dataBit = g.dataBit % 8
|
||||
@ -372,9 +378,9 @@ func (g *Getter) HasNext() bool {
|
||||
// After extracting next word, it moves to the beginning of the next one
|
||||
func (g *Getter) Next(buf []byte) ([]byte, uint64) {
|
||||
savePos := g.dataP
|
||||
l := g.nextPos(true)
|
||||
l-- // because when create huffman tree we do ++ , because 0 is terminator
|
||||
if l == 0 {
|
||||
nextPos := g.nextPos(true)
|
||||
nextPos-- // because when create huffman tree we do ++ , because 0 is terminator
|
||||
if nextPos == 0 {
|
||||
if g.dataBit > 0 {
|
||||
g.dataP++
|
||||
g.dataBit = 0
|
||||
@ -383,13 +389,13 @@ func (g *Getter) Next(buf []byte) ([]byte, uint64) {
|
||||
}
|
||||
bufPos := len(buf) // Tracking position in buf where to insert part of the word
|
||||
lastUncovered := len(buf)
|
||||
if len(buf)+int(l) > cap(buf) {
|
||||
newBuf := make([]byte, len(buf)+int(l))
|
||||
if len(buf)+int(nextPos) > cap(buf) {
|
||||
newBuf := make([]byte, len(buf)+int(nextPos))
|
||||
copy(newBuf, buf)
|
||||
buf = newBuf
|
||||
} else {
|
||||
// Expand buffer
|
||||
buf = buf[:len(buf)+int(l)]
|
||||
buf = buf[:len(buf)+int(nextPos)]
|
||||
}
|
||||
// Loop below fills in the patterns
|
||||
for pos := g.nextPos(false /* clean */); pos != 0; pos = g.nextPos(false) {
|
||||
@ -406,7 +412,7 @@ func (g *Getter) Next(buf []byte) ([]byte, uint64) {
|
||||
g.nextPos(true /* clean */) // Reset the state of huffman reader
|
||||
bufPos = lastUncovered // Restore to the beginning of buf
|
||||
// Loop below fills the data which is not in the patterns
|
||||
for pos := g.nextPos(false /* clean */); pos != 0; pos = g.nextPos(false) {
|
||||
for pos := g.nextPos(false); pos != 0; pos = g.nextPos(false) {
|
||||
bufPos += int(pos) - 1 // Positions where to insert patterns are encoded relative to one another
|
||||
if bufPos > lastUncovered {
|
||||
dif := uint64(bufPos - lastUncovered)
|
||||
@ -415,9 +421,9 @@ func (g *Getter) Next(buf []byte) ([]byte, uint64) {
|
||||
}
|
||||
lastUncovered = bufPos + len(g.nextPattern())
|
||||
}
|
||||
if int(l) > lastUncovered {
|
||||
dif := l - uint64(lastUncovered)
|
||||
copy(buf[lastUncovered:l], g.data[postLoopPos:postLoopPos+dif])
|
||||
if int(nextPos) > lastUncovered {
|
||||
dif := nextPos - uint64(lastUncovered)
|
||||
copy(buf[lastUncovered:nextPos], g.data[postLoopPos:postLoopPos+dif])
|
||||
postLoopPos += dif
|
||||
}
|
||||
g.dataP = postLoopPos
|
||||
|
@ -17,7 +17,11 @@
|
||||
package compress
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func BenchmarkDecompressNext(b *testing.B) {
|
||||
@ -67,3 +71,28 @@ func BenchmarkDecompressMatchPrefix(b *testing.B) {
|
||||
_ = g.MatchPrefix([]byte("longlongword"))
|
||||
}
|
||||
}
|
||||
|
||||
// BenchmarkDecompressTorrent opens a transactions segment file from the
// current directory with NewDecompressor and calls getter.Next(nil) t.N times.
// It is skipped unconditionally by the t.Skip() call below, so the rest of the
// body only runs if that call is removed.
func BenchmarkDecompressTorrent(t *testing.B) {
|
||||
// Skips the benchmark unconditionally.
t.Skip()
|
||||
|
||||
// Alternative input files, kept commented out for switching by hand.
// fpath := "./v1-000500-001000-transactions.seg"
|
||||
// fpath := "./v1-004000-004500-transactions.seg"
|
||||
// fpath := "./v1-005500-006000-transactions.seg"
|
||||
fpath := "./v1-006000-006500-transactions.seg"
|
||||
// Stat the file first so a missing file fails here, and print its name and size.
st, err := os.Stat(fpath)
|
||||
require.NoError(t, err)
|
||||
fmt.Printf("stat: %+v %dbytes\n", st.Name(), st.Size())
|
||||
|
||||
d, err := NewDecompressor(fpath)
|
||||
require.NoError(t, err)
|
||||
defer d.Close()
|
||||
|
||||
getter := d.MakeGetter()
|
||||
|
||||
// NOTE(review): Next is called t.N times with no HasNext check — confirm
// the file always holds at least t.N words.
for i := 0; i < t.N; i++ {
|
||||
_, sz := getter.Next(nil)
|
||||
// Abort the benchmark when the second value returned by Next is zero.
if sz == 0 {
|
||||
t.Fatal("sz == 0")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -20,11 +20,13 @@ import (
|
||||
"bytes"
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/ledgerwatch/log/v3"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func prepareLoremDict(t *testing.T) *Decompressor {
|
||||
@ -207,3 +209,24 @@ consequat Duis aute irure dolor in reprehenderit in voluptate velit esse cillum
|
||||
Excepteur sint occaecat cupidatat non proident sunt in culpa qui officia deserunt mollit anim id est laborum`
|
||||
|
||||
var loremStrings = strings.Split(lorem, " ")
|
||||
|
||||
// TestDecompressTorrentWIthSwitch opens a transactions segment file from the
// current directory with NewDecompressor and iterates it with a getter while
// HasNext reports true, requiring the second value returned by Next to be
// non-zero each time. It is skipped unconditionally by the t.Skip() call below.
func TestDecompressTorrentWIthSwitch(t *testing.T) {
|
||||
// Skips the test unconditionally.
t.Skip()
|
||||
|
||||
fpath := "./v1-006000-006500-transactions.seg"
|
||||
// Alternative input file, kept commented out for switching by hand.
// fpath := "./v1-000500-001000-transactions.seg"
|
||||
// Stat the file first so a missing file fails here, and print its name and size.
st, err := os.Stat(fpath)
|
||||
require.NoError(t, err)
|
||||
fmt.Printf("stat: %+v %dbytes\n", st.Name(), st.Size())
|
||||
|
||||
d, err := NewDecompressor(fpath)
|
||||
require.NoError(t, err)
|
||||
defer d.Close()
|
||||
|
||||
getter := d.MakeGetter()
|
||||
// Walk the file until HasNext reports false.
for getter.HasNext() {
|
||||
aux, sz := getter.Next(nil)
|
||||
require.NotZero(t, sz)
|
||||
// The returned bytes are not inspected; only sz is checked.
_ = aux
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user