reduced memory footprint on building huffman table (#459)

This commit is contained in:
Artem Tsebrovskiy 2022-05-20 07:23:05 +03:00 committed by GitHub
parent 59d810650c
commit 6de4ac4ba9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 105 additions and 47 deletions

View File

@ -25,11 +25,15 @@ import (
"github.com/ledgerwatch/erigon-lib/mmap"
)
// codeword is a single entry of a pattern lookup table. It either carries a
// decoded pattern (len > 0) or, when len == 0, points to a deeper-level table
// that must be consulted with the next bits of the code.
type codeword struct {
len byte // Number of bits in the codes (0 means: descend via ptr)
pattern *word // Pattern corresponding to entries
ptr *patternTable // pointer to deeper level tables
}
// patternTable maps a bitLen-wide window of code bits to codewords; entries
// with len == 0 chain to deeper tables via codeword.ptr.
// NOTE(review): this span is a diff rendering — both the pre-change field set
// (patterns []*word / lens / ptrs) and the post-change one
// (patterns []*codeword) appear below; only one set exists in the real file.
type patternTable struct {
bitLen int // Number of bits to lookup in the table
patterns []*word // Patterns corresponding to entries
lens []byte // Number of bits in the codes
ptrs []*patternTable // pointers to deeper level tables
bitLen int // Number of bits to lookup in the table
patterns []*codeword // one codeword per possible bitLen-bit prefix
}
type posTable struct {
@ -74,6 +78,8 @@ func NewDecompressor(compressedFile string) (*Decompressor, error) {
if d.mmapHandle1, d.mmapHandle2, err = mmap.Mmap(d.f, int(d.size)); err != nil {
return nil, err
}
// read patterns from file
d.data = d.mmapHandle1[:d.size]
d.wordsCount = binary.BigEndian.Uint64(d.data[:8])
d.emptyWordsCount = binary.BigEndian.Uint64(d.data[8:16])
@ -83,6 +89,7 @@ func NewDecompressor(compressedFile string) (*Decompressor, error) {
var patterns [][]byte
var i uint64
var patternMaxDepth uint64
//fmt.Printf("[decomp] dictSize = %d\n", dictSize)
for i < dictSize {
d, ns := binary.Uvarint(data[i:])
@ -97,6 +104,7 @@ func NewDecompressor(compressedFile string) (*Decompressor, error) {
//fmt.Printf("depth = %d, pattern = [%x]\n", d, data[i:i+l])
i += l
}
if dictSize > 0 {
var bitLen int
if patternMaxDepth > 9 {
@ -108,12 +116,12 @@ func NewDecompressor(compressedFile string) (*Decompressor, error) {
tableSize := 1 << bitLen
d.dict = &patternTable{
bitLen: bitLen,
patterns: make([]*word, tableSize),
lens: make([]byte, tableSize),
ptrs: make([]*patternTable, tableSize),
patterns: make([]*codeword, tableSize),
}
buildPatternTable(depths, patterns, d.dict, 0, 0, 0, patternMaxDepth)
buildPatternTable(d.dict, depths, patterns, 0, 0, 0, patternMaxDepth)
}
// read positions
pos := 24 + dictSize
dictSize = binary.BigEndian.Uint64(d.data[pos : pos+8])
data = d.data[pos+8 : pos+8+dictSize]
@ -133,6 +141,7 @@ func NewDecompressor(compressedFile string) (*Decompressor, error) {
i += uint64(n)
poss = append(poss, pos)
}
if dictSize > 0 {
var bitLen int
if posMaxDepth > 9 {
@ -154,30 +163,29 @@ func NewDecompressor(compressedFile string) (*Decompressor, error) {
return d, nil
}
type word []byte
type word []byte // plain text word associated with code from dictionary
// returns the number of depths and patterns consumed
func buildPatternTable(depths []uint64, patterns [][]byte, table *patternTable, code uint16, bits int, depth uint64, maxDepth uint64) int {
func buildPatternTable(table *patternTable, depths []uint64, patterns [][]byte, code uint16, bits int, depth uint64, maxDepth uint64) int {
if len(depths) == 0 {
return 0
}
if depth == depths[0] {
pattern := word(make([]byte, len(patterns[0])))
copy(pattern, patterns[0])
pattern := word(patterns[0])
//fmt.Printf("depth=%d, maxDepth=%d, code=[%b], codeLen=%d, pattern=[%x]\n", depth, maxDepth, code, bits, pattern)
if table.bitLen == int(bits) {
table.patterns[code] = &pattern
table.lens[code] = byte(bits)
table.ptrs[code] = nil
} else {
codeStep := uint16(1) << bits
codeFrom := code
codeTo := code | (uint16(1) << table.bitLen)
for c := codeFrom; c < codeTo; c += codeStep {
table.patterns[c] = &pattern
table.lens[c] = byte(bits)
table.ptrs[c] = nil
codeStep := uint16(1) << bits
codeFrom, codeTo := code, code+codeStep
if table.bitLen != bits {
codeTo = code | (uint16(1) << table.bitLen)
}
cw := &codeword{pattern: &pattern, len: byte(bits), ptr: nil}
for c := codeFrom; c < codeTo; c += codeStep {
if p := table.patterns[c]; p == nil {
table.patterns[c] = cw
} else {
p.pattern, p.len, p.ptr = &pattern, byte(bits), nil
}
}
return 1
@ -192,17 +200,14 @@ func buildPatternTable(depths []uint64, patterns [][]byte, table *patternTable,
tableSize := 1 << bitLen
newTable := &patternTable{
bitLen: bitLen,
patterns: make([]*word, tableSize),
lens: make([]byte, tableSize),
ptrs: make([]*patternTable, tableSize),
patterns: make([]*codeword, tableSize),
}
table.patterns[code] = nil
table.lens[code] = byte(0)
table.ptrs[code] = newTable
return buildPatternTable(depths, patterns, newTable, 0, 0, depth, maxDepth)
table.patterns[code] = &codeword{pattern: nil, len: byte(0), ptr: newTable}
return buildPatternTable(newTable, depths, patterns, 0, 0, depth, maxDepth)
}
b0 := buildPatternTable(depths, patterns, table, code, bits+1, depth+1, maxDepth-1)
return b0 + buildPatternTable(depths[b0:], patterns[b0:], table, (uint16(1)<<bits)|code, bits+1, depth+1, maxDepth-1)
b0 := buildPatternTable(table, depths, patterns, code, bits+1, depth+1, maxDepth-1)
return b0 + buildPatternTable(table, depths[b0:], patterns[b0:], (uint16(1)<<bits)|code, bits+1, depth+1, maxDepth-1)
}
func buildPosTable(depths []uint64, poss []uint64, table *posTable, code uint16, bits int, depth uint64, maxDepth uint64) int {
@ -324,7 +329,7 @@ func (g *Getter) nextPos(clean bool) uint64 {
func (g *Getter) nextPattern() []byte {
table := g.patternDict
if table.bitLen == 0 {
return *table.patterns[0]
return *table.patterns[0].pattern
}
var l byte
var pattern []byte
@ -334,13 +339,14 @@ func (g *Getter) nextPattern() []byte {
code |= uint16(g.data[g.dataP+1]) << (8 - g.dataBit)
}
code &= (uint16(1) << table.bitLen) - 1
l = table.lens[code]
cw := table.patterns[code]
l = cw.len
if l == 0 {
table = table.ptrs[code]
table = cw.ptr
g.dataBit += 9
} else {
g.dataBit += int(l)
pattern = *table.patterns[code]
pattern = *cw.pattern
}
g.dataP += uint64(g.dataBit / 8)
g.dataBit = g.dataBit % 8
@ -372,9 +378,9 @@ func (g *Getter) HasNext() bool {
// After extracting next word, it moves to the beginning of the next one
func (g *Getter) Next(buf []byte) ([]byte, uint64) {
savePos := g.dataP
l := g.nextPos(true)
l-- // because when create huffman tree we do ++ , because 0 is terminator
if l == 0 {
nextPos := g.nextPos(true)
nextPos-- // because when create huffman tree we do ++ , because 0 is terminator
if nextPos == 0 {
if g.dataBit > 0 {
g.dataP++
g.dataBit = 0
@ -383,13 +389,13 @@ func (g *Getter) Next(buf []byte) ([]byte, uint64) {
}
bufPos := len(buf) // Tracking position in buf where to insert part of the word
lastUncovered := len(buf)
if len(buf)+int(l) > cap(buf) {
newBuf := make([]byte, len(buf)+int(l))
if len(buf)+int(nextPos) > cap(buf) {
newBuf := make([]byte, len(buf)+int(nextPos))
copy(newBuf, buf)
buf = newBuf
} else {
// Expand buffer
buf = buf[:len(buf)+int(l)]
buf = buf[:len(buf)+int(nextPos)]
}
// Loop below fills in the patterns
for pos := g.nextPos(false /* clean */); pos != 0; pos = g.nextPos(false) {
@ -406,7 +412,7 @@ func (g *Getter) Next(buf []byte) ([]byte, uint64) {
g.nextPos(true /* clean */) // Reset the state of huffman reader
bufPos = lastUncovered // Restore to the beginning of buf
// Loop below fills the data which is not in the patterns
for pos := g.nextPos(false /* clean */); pos != 0; pos = g.nextPos(false) {
for pos := g.nextPos(false); pos != 0; pos = g.nextPos(false) {
bufPos += int(pos) - 1 // Positions where to insert patterns are encoded relative to one another
if bufPos > lastUncovered {
dif := uint64(bufPos - lastUncovered)
@ -415,9 +421,9 @@ func (g *Getter) Next(buf []byte) ([]byte, uint64) {
}
lastUncovered = bufPos + len(g.nextPattern())
}
if int(l) > lastUncovered {
dif := l - uint64(lastUncovered)
copy(buf[lastUncovered:l], g.data[postLoopPos:postLoopPos+dif])
if int(nextPos) > lastUncovered {
dif := nextPos - uint64(lastUncovered)
copy(buf[lastUncovered:nextPos], g.data[postLoopPos:postLoopPos+dif])
postLoopPos += dif
}
g.dataP = postLoopPos

View File

@ -17,7 +17,11 @@
package compress
import (
"fmt"
"os"
"testing"
"github.com/stretchr/testify/require"
)
func BenchmarkDecompressNext(b *testing.B) {
@ -67,3 +71,28 @@ func BenchmarkDecompressMatchPrefix(b *testing.B) {
_ = g.MatchPrefix([]byte("longlongword"))
}
}
// BenchmarkDecompressTorrent measures sequential word extraction from a
// pre-built .seg snapshot. Skipped by default: the fixture file is not part
// of the repository and must be placed next to the package to run this.
func BenchmarkDecompressTorrent(t *testing.B) {
	t.Skip()

	// Alternative fixtures:
	// "./v1-000500-001000-transactions.seg"
	// "./v1-004000-004500-transactions.seg"
	// "./v1-005500-006000-transactions.seg"
	fpath := "./v1-006000-006500-transactions.seg"

	fi, err := os.Stat(fpath)
	require.NoError(t, err)
	fmt.Printf("stat: %+v %dbytes\n", fi.Name(), fi.Size())

	decomp, err := NewDecompressor(fpath)
	require.NoError(t, err)
	defer decomp.Close()

	g := decomp.MakeGetter()
	for n := 0; n < t.N; n++ {
		if _, sz := g.Next(nil); sz == 0 {
			t.Fatal("sz == 0")
		}
	}
}

View File

@ -20,11 +20,13 @@ import (
"bytes"
"context"
"fmt"
"os"
"path/filepath"
"strings"
"testing"
"github.com/ledgerwatch/log/v3"
"github.com/stretchr/testify/require"
)
func prepareLoremDict(t *testing.T) *Decompressor {
@ -207,3 +209,24 @@ consequat Duis aute irure dolor in reprehenderit in voluptate velit esse cillum
Excepteur sint occaecat cupidatat non proident sunt in culpa qui officia deserunt mollit anim id est laborum`
var loremStrings = strings.Split(lorem, " ")
// TestDecompressTorrentWIthSwitch walks an entire pre-built .seg snapshot and
// checks that every decompressed word reports a non-zero size. Skipped by
// default because the fixture file is not checked into the repository.
func TestDecompressTorrentWIthSwitch(t *testing.T) {
	t.Skip()

	fpath := "./v1-006000-006500-transactions.seg"
	// Alternative fixture: "./v1-000500-001000-transactions.seg"

	fi, err := os.Stat(fpath)
	require.NoError(t, err)
	fmt.Printf("stat: %+v %dbytes\n", fi.Name(), fi.Size())

	decomp, err := NewDecompressor(fpath)
	require.NoError(t, err)
	defer decomp.Close()

	for g := decomp.MakeGetter(); g.HasNext(); {
		word, sz := g.Next(nil)
		require.NotZero(t, sz)
		_ = word
	}
}