mirror of
https://gitlab.com/pulsechaincom/erigon-pulse.git
synced 2024-12-28 14:47:16 +00:00
reduced memory footprint on building huffman table (#459)
This commit is contained in:
parent
59d810650c
commit
6de4ac4ba9
@ -25,11 +25,15 @@ import (
|
|||||||
"github.com/ledgerwatch/erigon-lib/mmap"
|
"github.com/ledgerwatch/erigon-lib/mmap"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
type codeword struct {
|
||||||
|
len byte // Number of bits in the codes
|
||||||
|
pattern *word // Pattern corresponding to entries
|
||||||
|
ptr *patternTable // pointer to deeper level tables
|
||||||
|
}
|
||||||
|
|
||||||
type patternTable struct {
|
type patternTable struct {
|
||||||
bitLen int // Number of bits to lookup in the table
|
bitLen int // Number of bits to lookup in the table
|
||||||
patterns []*word // Patterns corresponding to entries
|
patterns []*codeword
|
||||||
lens []byte // Number of bits in the codes
|
|
||||||
ptrs []*patternTable // pointers to deeper level tables
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type posTable struct {
|
type posTable struct {
|
||||||
@ -74,6 +78,8 @@ func NewDecompressor(compressedFile string) (*Decompressor, error) {
|
|||||||
if d.mmapHandle1, d.mmapHandle2, err = mmap.Mmap(d.f, int(d.size)); err != nil {
|
if d.mmapHandle1, d.mmapHandle2, err = mmap.Mmap(d.f, int(d.size)); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// read patterns from file
|
||||||
d.data = d.mmapHandle1[:d.size]
|
d.data = d.mmapHandle1[:d.size]
|
||||||
d.wordsCount = binary.BigEndian.Uint64(d.data[:8])
|
d.wordsCount = binary.BigEndian.Uint64(d.data[:8])
|
||||||
d.emptyWordsCount = binary.BigEndian.Uint64(d.data[8:16])
|
d.emptyWordsCount = binary.BigEndian.Uint64(d.data[8:16])
|
||||||
@ -83,6 +89,7 @@ func NewDecompressor(compressedFile string) (*Decompressor, error) {
|
|||||||
var patterns [][]byte
|
var patterns [][]byte
|
||||||
var i uint64
|
var i uint64
|
||||||
var patternMaxDepth uint64
|
var patternMaxDepth uint64
|
||||||
|
|
||||||
//fmt.Printf("[decomp] dictSize = %d\n", dictSize)
|
//fmt.Printf("[decomp] dictSize = %d\n", dictSize)
|
||||||
for i < dictSize {
|
for i < dictSize {
|
||||||
d, ns := binary.Uvarint(data[i:])
|
d, ns := binary.Uvarint(data[i:])
|
||||||
@ -97,6 +104,7 @@ func NewDecompressor(compressedFile string) (*Decompressor, error) {
|
|||||||
//fmt.Printf("depth = %d, pattern = [%x]\n", d, data[i:i+l])
|
//fmt.Printf("depth = %d, pattern = [%x]\n", d, data[i:i+l])
|
||||||
i += l
|
i += l
|
||||||
}
|
}
|
||||||
|
|
||||||
if dictSize > 0 {
|
if dictSize > 0 {
|
||||||
var bitLen int
|
var bitLen int
|
||||||
if patternMaxDepth > 9 {
|
if patternMaxDepth > 9 {
|
||||||
@ -108,12 +116,12 @@ func NewDecompressor(compressedFile string) (*Decompressor, error) {
|
|||||||
tableSize := 1 << bitLen
|
tableSize := 1 << bitLen
|
||||||
d.dict = &patternTable{
|
d.dict = &patternTable{
|
||||||
bitLen: bitLen,
|
bitLen: bitLen,
|
||||||
patterns: make([]*word, tableSize),
|
patterns: make([]*codeword, tableSize),
|
||||||
lens: make([]byte, tableSize),
|
|
||||||
ptrs: make([]*patternTable, tableSize),
|
|
||||||
}
|
}
|
||||||
buildPatternTable(depths, patterns, d.dict, 0, 0, 0, patternMaxDepth)
|
buildPatternTable(d.dict, depths, patterns, 0, 0, 0, patternMaxDepth)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// read positions
|
||||||
pos := 24 + dictSize
|
pos := 24 + dictSize
|
||||||
dictSize = binary.BigEndian.Uint64(d.data[pos : pos+8])
|
dictSize = binary.BigEndian.Uint64(d.data[pos : pos+8])
|
||||||
data = d.data[pos+8 : pos+8+dictSize]
|
data = d.data[pos+8 : pos+8+dictSize]
|
||||||
@ -133,6 +141,7 @@ func NewDecompressor(compressedFile string) (*Decompressor, error) {
|
|||||||
i += uint64(n)
|
i += uint64(n)
|
||||||
poss = append(poss, pos)
|
poss = append(poss, pos)
|
||||||
}
|
}
|
||||||
|
|
||||||
if dictSize > 0 {
|
if dictSize > 0 {
|
||||||
var bitLen int
|
var bitLen int
|
||||||
if posMaxDepth > 9 {
|
if posMaxDepth > 9 {
|
||||||
@ -154,30 +163,29 @@ func NewDecompressor(compressedFile string) (*Decompressor, error) {
|
|||||||
return d, nil
|
return d, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
type word []byte
|
type word []byte // plain text word associated with code from dictionary
|
||||||
|
|
||||||
// returns the number of depths and patterns consumed
|
// returns the number of depths and patterns consumed
|
||||||
func buildPatternTable(depths []uint64, patterns [][]byte, table *patternTable, code uint16, bits int, depth uint64, maxDepth uint64) int {
|
func buildPatternTable(table *patternTable, depths []uint64, patterns [][]byte, code uint16, bits int, depth uint64, maxDepth uint64) int {
|
||||||
if len(depths) == 0 {
|
if len(depths) == 0 {
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
if depth == depths[0] {
|
if depth == depths[0] {
|
||||||
pattern := word(make([]byte, len(patterns[0])))
|
pattern := word(patterns[0])
|
||||||
copy(pattern, patterns[0])
|
|
||||||
//fmt.Printf("depth=%d, maxDepth=%d, code=[%b], codeLen=%d, pattern=[%x]\n", depth, maxDepth, code, bits, pattern)
|
//fmt.Printf("depth=%d, maxDepth=%d, code=[%b], codeLen=%d, pattern=[%x]\n", depth, maxDepth, code, bits, pattern)
|
||||||
if table.bitLen == int(bits) {
|
|
||||||
table.patterns[code] = &pattern
|
|
||||||
table.lens[code] = byte(bits)
|
|
||||||
table.ptrs[code] = nil
|
|
||||||
} else {
|
|
||||||
codeStep := uint16(1) << bits
|
|
||||||
codeFrom := code
|
|
||||||
codeTo := code | (uint16(1) << table.bitLen)
|
|
||||||
|
|
||||||
for c := codeFrom; c < codeTo; c += codeStep {
|
codeStep := uint16(1) << bits
|
||||||
table.patterns[c] = &pattern
|
codeFrom, codeTo := code, code+codeStep
|
||||||
table.lens[c] = byte(bits)
|
if table.bitLen != bits {
|
||||||
table.ptrs[c] = nil
|
codeTo = code | (uint16(1) << table.bitLen)
|
||||||
|
}
|
||||||
|
|
||||||
|
cw := &codeword{pattern: &pattern, len: byte(bits), ptr: nil}
|
||||||
|
for c := codeFrom; c < codeTo; c += codeStep {
|
||||||
|
if p := table.patterns[c]; p == nil {
|
||||||
|
table.patterns[c] = cw
|
||||||
|
} else {
|
||||||
|
p.pattern, p.len, p.ptr = &pattern, byte(bits), nil
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return 1
|
return 1
|
||||||
@ -192,17 +200,14 @@ func buildPatternTable(depths []uint64, patterns [][]byte, table *patternTable,
|
|||||||
tableSize := 1 << bitLen
|
tableSize := 1 << bitLen
|
||||||
newTable := &patternTable{
|
newTable := &patternTable{
|
||||||
bitLen: bitLen,
|
bitLen: bitLen,
|
||||||
patterns: make([]*word, tableSize),
|
patterns: make([]*codeword, tableSize),
|
||||||
lens: make([]byte, tableSize),
|
|
||||||
ptrs: make([]*patternTable, tableSize),
|
|
||||||
}
|
}
|
||||||
table.patterns[code] = nil
|
|
||||||
table.lens[code] = byte(0)
|
table.patterns[code] = &codeword{pattern: nil, len: byte(0), ptr: newTable}
|
||||||
table.ptrs[code] = newTable
|
return buildPatternTable(newTable, depths, patterns, 0, 0, depth, maxDepth)
|
||||||
return buildPatternTable(depths, patterns, newTable, 0, 0, depth, maxDepth)
|
|
||||||
}
|
}
|
||||||
b0 := buildPatternTable(depths, patterns, table, code, bits+1, depth+1, maxDepth-1)
|
b0 := buildPatternTable(table, depths, patterns, code, bits+1, depth+1, maxDepth-1)
|
||||||
return b0 + buildPatternTable(depths[b0:], patterns[b0:], table, (uint16(1)<<bits)|code, bits+1, depth+1, maxDepth-1)
|
return b0 + buildPatternTable(table, depths[b0:], patterns[b0:], (uint16(1)<<bits)|code, bits+1, depth+1, maxDepth-1)
|
||||||
}
|
}
|
||||||
|
|
||||||
func buildPosTable(depths []uint64, poss []uint64, table *posTable, code uint16, bits int, depth uint64, maxDepth uint64) int {
|
func buildPosTable(depths []uint64, poss []uint64, table *posTable, code uint16, bits int, depth uint64, maxDepth uint64) int {
|
||||||
@ -324,7 +329,7 @@ func (g *Getter) nextPos(clean bool) uint64 {
|
|||||||
func (g *Getter) nextPattern() []byte {
|
func (g *Getter) nextPattern() []byte {
|
||||||
table := g.patternDict
|
table := g.patternDict
|
||||||
if table.bitLen == 0 {
|
if table.bitLen == 0 {
|
||||||
return *table.patterns[0]
|
return *table.patterns[0].pattern
|
||||||
}
|
}
|
||||||
var l byte
|
var l byte
|
||||||
var pattern []byte
|
var pattern []byte
|
||||||
@ -334,13 +339,14 @@ func (g *Getter) nextPattern() []byte {
|
|||||||
code |= uint16(g.data[g.dataP+1]) << (8 - g.dataBit)
|
code |= uint16(g.data[g.dataP+1]) << (8 - g.dataBit)
|
||||||
}
|
}
|
||||||
code &= (uint16(1) << table.bitLen) - 1
|
code &= (uint16(1) << table.bitLen) - 1
|
||||||
l = table.lens[code]
|
cw := table.patterns[code]
|
||||||
|
l = cw.len
|
||||||
if l == 0 {
|
if l == 0 {
|
||||||
table = table.ptrs[code]
|
table = cw.ptr
|
||||||
g.dataBit += 9
|
g.dataBit += 9
|
||||||
} else {
|
} else {
|
||||||
g.dataBit += int(l)
|
g.dataBit += int(l)
|
||||||
pattern = *table.patterns[code]
|
pattern = *cw.pattern
|
||||||
}
|
}
|
||||||
g.dataP += uint64(g.dataBit / 8)
|
g.dataP += uint64(g.dataBit / 8)
|
||||||
g.dataBit = g.dataBit % 8
|
g.dataBit = g.dataBit % 8
|
||||||
@ -372,9 +378,9 @@ func (g *Getter) HasNext() bool {
|
|||||||
// After extracting next word, it moves to the beginning of the next one
|
// After extracting next word, it moves to the beginning of the next one
|
||||||
func (g *Getter) Next(buf []byte) ([]byte, uint64) {
|
func (g *Getter) Next(buf []byte) ([]byte, uint64) {
|
||||||
savePos := g.dataP
|
savePos := g.dataP
|
||||||
l := g.nextPos(true)
|
nextPos := g.nextPos(true)
|
||||||
l-- // because when create huffman tree we do ++ , because 0 is terminator
|
nextPos-- // because when create huffman tree we do ++ , because 0 is terminator
|
||||||
if l == 0 {
|
if nextPos == 0 {
|
||||||
if g.dataBit > 0 {
|
if g.dataBit > 0 {
|
||||||
g.dataP++
|
g.dataP++
|
||||||
g.dataBit = 0
|
g.dataBit = 0
|
||||||
@ -383,13 +389,13 @@ func (g *Getter) Next(buf []byte) ([]byte, uint64) {
|
|||||||
}
|
}
|
||||||
bufPos := len(buf) // Tracking position in buf where to insert part of the word
|
bufPos := len(buf) // Tracking position in buf where to insert part of the word
|
||||||
lastUncovered := len(buf)
|
lastUncovered := len(buf)
|
||||||
if len(buf)+int(l) > cap(buf) {
|
if len(buf)+int(nextPos) > cap(buf) {
|
||||||
newBuf := make([]byte, len(buf)+int(l))
|
newBuf := make([]byte, len(buf)+int(nextPos))
|
||||||
copy(newBuf, buf)
|
copy(newBuf, buf)
|
||||||
buf = newBuf
|
buf = newBuf
|
||||||
} else {
|
} else {
|
||||||
// Expand buffer
|
// Expand buffer
|
||||||
buf = buf[:len(buf)+int(l)]
|
buf = buf[:len(buf)+int(nextPos)]
|
||||||
}
|
}
|
||||||
// Loop below fills in the patterns
|
// Loop below fills in the patterns
|
||||||
for pos := g.nextPos(false /* clean */); pos != 0; pos = g.nextPos(false) {
|
for pos := g.nextPos(false /* clean */); pos != 0; pos = g.nextPos(false) {
|
||||||
@ -406,7 +412,7 @@ func (g *Getter) Next(buf []byte) ([]byte, uint64) {
|
|||||||
g.nextPos(true /* clean */) // Reset the state of huffman reader
|
g.nextPos(true /* clean */) // Reset the state of huffman reader
|
||||||
bufPos = lastUncovered // Restore to the beginning of buf
|
bufPos = lastUncovered // Restore to the beginning of buf
|
||||||
// Loop below fills the data which is not in the patterns
|
// Loop below fills the data which is not in the patterns
|
||||||
for pos := g.nextPos(false /* clean */); pos != 0; pos = g.nextPos(false) {
|
for pos := g.nextPos(false); pos != 0; pos = g.nextPos(false) {
|
||||||
bufPos += int(pos) - 1 // Positions where to insert patterns are encoded relative to one another
|
bufPos += int(pos) - 1 // Positions where to insert patterns are encoded relative to one another
|
||||||
if bufPos > lastUncovered {
|
if bufPos > lastUncovered {
|
||||||
dif := uint64(bufPos - lastUncovered)
|
dif := uint64(bufPos - lastUncovered)
|
||||||
@ -415,9 +421,9 @@ func (g *Getter) Next(buf []byte) ([]byte, uint64) {
|
|||||||
}
|
}
|
||||||
lastUncovered = bufPos + len(g.nextPattern())
|
lastUncovered = bufPos + len(g.nextPattern())
|
||||||
}
|
}
|
||||||
if int(l) > lastUncovered {
|
if int(nextPos) > lastUncovered {
|
||||||
dif := l - uint64(lastUncovered)
|
dif := nextPos - uint64(lastUncovered)
|
||||||
copy(buf[lastUncovered:l], g.data[postLoopPos:postLoopPos+dif])
|
copy(buf[lastUncovered:nextPos], g.data[postLoopPos:postLoopPos+dif])
|
||||||
postLoopPos += dif
|
postLoopPos += dif
|
||||||
}
|
}
|
||||||
g.dataP = postLoopPos
|
g.dataP = postLoopPos
|
||||||
|
@ -17,7 +17,11 @@
|
|||||||
package compress
|
package compress
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
|
"github.com/stretchr/testify/require"
|
||||||
)
|
)
|
||||||
|
|
||||||
func BenchmarkDecompressNext(b *testing.B) {
|
func BenchmarkDecompressNext(b *testing.B) {
|
||||||
@ -67,3 +71,28 @@ func BenchmarkDecompressMatchPrefix(b *testing.B) {
|
|||||||
_ = g.MatchPrefix([]byte("longlongword"))
|
_ = g.MatchPrefix([]byte("longlongword"))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func BenchmarkDecompressTorrent(t *testing.B) {
|
||||||
|
t.Skip()
|
||||||
|
|
||||||
|
// fpath := "./v1-000500-001000-transactions.seg"
|
||||||
|
// fpath := "./v1-004000-004500-transactions.seg"
|
||||||
|
// fpath := "./v1-005500-006000-transactions.seg"
|
||||||
|
fpath := "./v1-006000-006500-transactions.seg"
|
||||||
|
st, err := os.Stat(fpath)
|
||||||
|
require.NoError(t, err)
|
||||||
|
fmt.Printf("stat: %+v %dbytes\n", st.Name(), st.Size())
|
||||||
|
|
||||||
|
d, err := NewDecompressor(fpath)
|
||||||
|
require.NoError(t, err)
|
||||||
|
defer d.Close()
|
||||||
|
|
||||||
|
getter := d.MakeGetter()
|
||||||
|
|
||||||
|
for i := 0; i < t.N; i++ {
|
||||||
|
_, sz := getter.Next(nil)
|
||||||
|
if sz == 0 {
|
||||||
|
t.Fatal("sz == 0")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -20,11 +20,13 @@ import (
|
|||||||
"bytes"
|
"bytes"
|
||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
"github.com/ledgerwatch/log/v3"
|
"github.com/ledgerwatch/log/v3"
|
||||||
|
"github.com/stretchr/testify/require"
|
||||||
)
|
)
|
||||||
|
|
||||||
func prepareLoremDict(t *testing.T) *Decompressor {
|
func prepareLoremDict(t *testing.T) *Decompressor {
|
||||||
@ -207,3 +209,24 @@ consequat Duis aute irure dolor in reprehenderit in voluptate velit esse cillum
|
|||||||
Excepteur sint occaecat cupidatat non proident sunt in culpa qui officia deserunt mollit anim id est laborum`
|
Excepteur sint occaecat cupidatat non proident sunt in culpa qui officia deserunt mollit anim id est laborum`
|
||||||
|
|
||||||
var loremStrings = strings.Split(lorem, " ")
|
var loremStrings = strings.Split(lorem, " ")
|
||||||
|
|
||||||
|
func TestDecompressTorrentWIthSwitch(t *testing.T) {
|
||||||
|
t.Skip()
|
||||||
|
|
||||||
|
fpath := "./v1-006000-006500-transactions.seg"
|
||||||
|
// fpath := "./v1-000500-001000-transactions.seg"
|
||||||
|
st, err := os.Stat(fpath)
|
||||||
|
require.NoError(t, err)
|
||||||
|
fmt.Printf("stat: %+v %dbytes\n", st.Name(), st.Size())
|
||||||
|
|
||||||
|
d, err := NewDecompressor(fpath)
|
||||||
|
require.NoError(t, err)
|
||||||
|
defer d.Close()
|
||||||
|
|
||||||
|
getter := d.MakeGetter()
|
||||||
|
for getter.HasNext() {
|
||||||
|
aux, sz := getter.Next(nil)
|
||||||
|
require.NotZero(t, sz)
|
||||||
|
_ = aux
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user