From 6de4ac4ba96325808e2e5c0f92ef1733623edef0 Mon Sep 17 00:00:00 2001 From: Artem Tsebrovskiy Date: Fri, 20 May 2022 07:23:05 +0300 Subject: [PATCH] reduced memory footprint on building huffman table (#459) --- compress/decompress.go | 100 ++++++++++++++++-------------- compress/decompress_bench_test.go | 29 +++++++++ compress/decompress_test.go | 23 +++++++ 3 files changed, 105 insertions(+), 47 deletions(-) diff --git a/compress/decompress.go b/compress/decompress.go index fc52e979c..13bb077f4 100644 --- a/compress/decompress.go +++ b/compress/decompress.go @@ -25,11 +25,15 @@ import ( "github.com/ledgerwatch/erigon-lib/mmap" ) +type codeword struct { + len byte // Number of bits in the codes + pattern *word // Pattern corresponding to entries + ptr *patternTable // pointer to deeper level tables +} + type patternTable struct { - bitLen int // Number of bits to lookup in the table - patterns []*word // Patterns corresponding to entries - lens []byte // Number of bits in the codes - ptrs []*patternTable // pointers to deeper level tables + bitLen int // Number of bits to lookup in the table + patterns []*codeword } type posTable struct { @@ -74,6 +78,8 @@ func NewDecompressor(compressedFile string) (*Decompressor, error) { if d.mmapHandle1, d.mmapHandle2, err = mmap.Mmap(d.f, int(d.size)); err != nil { return nil, err } + + // read patterns from file d.data = d.mmapHandle1[:d.size] d.wordsCount = binary.BigEndian.Uint64(d.data[:8]) d.emptyWordsCount = binary.BigEndian.Uint64(d.data[8:16]) @@ -83,6 +89,7 @@ func NewDecompressor(compressedFile string) (*Decompressor, error) { var patterns [][]byte var i uint64 var patternMaxDepth uint64 + //fmt.Printf("[decomp] dictSize = %d\n", dictSize) for i < dictSize { d, ns := binary.Uvarint(data[i:]) @@ -97,6 +104,7 @@ func NewDecompressor(compressedFile string) (*Decompressor, error) { //fmt.Printf("depth = %d, pattern = [%x]\n", d, data[i:i+l]) i += l } + if dictSize > 0 { var bitLen int if patternMaxDepth > 9 { @@ -108,12 +116,12 @@ func NewDecompressor(compressedFile string) (*Decompressor, error) { tableSize := 1 << bitLen d.dict = &patternTable{ bitLen: bitLen, - patterns: make([]*word, tableSize), - lens: make([]byte, tableSize), - ptrs: make([]*patternTable, tableSize), + patterns: make([]*codeword, tableSize), } - buildPatternTable(depths, patterns, d.dict, 0, 0, 0, patternMaxDepth) + buildPatternTable(d.dict, depths, patterns, 0, 0, 0, patternMaxDepth) } + + // read positions pos := 24 + dictSize dictSize = binary.BigEndian.Uint64(d.data[pos : pos+8]) data = d.data[pos+8 : pos+8+dictSize] @@ -133,6 +141,7 @@ func NewDecompressor(compressedFile string) (*Decompressor, error) { i += uint64(n) poss = append(poss, pos) } + if dictSize > 0 { var bitLen int if posMaxDepth > 9 { @@ -154,30 +163,29 @@ func NewDecompressor(compressedFile string) (*Decompressor, error) { return d, nil } -type word []byte +type word []byte // plain text word associated with code from dictionary // returns number of depth and patterns comsumed -func buildPatternTable(depths []uint64, patterns [][]byte, table *patternTable, code uint16, bits int, depth uint64, maxDepth uint64) int { +func buildPatternTable(table *patternTable, depths []uint64, patterns [][]byte, code uint16, bits int, depth uint64, maxDepth uint64) int { if len(depths) == 0 { return 0 } if depth == depths[0] { - pattern := word(make([]byte, len(patterns[0]))) - copy(pattern, patterns[0]) + pattern := word(patterns[0]) //fmt.Printf("depth=%d, maxDepth=%d, code=[%b], codeLen=%d, pattern=[%x]\n", depth, maxDepth, code, bits, pattern) - if table.bitLen == int(bits) { - table.patterns[code] = &pattern - table.lens[code] = byte(bits) - table.ptrs[code] = nil - } else { - codeStep := uint16(1) << bits - codeFrom := code - codeTo := code | (uint16(1) << table.bitLen) - for c := codeFrom; c < codeTo; c += codeStep { - table.patterns[c] = &pattern - table.lens[c] = byte(bits) - table.ptrs[c] = nil + codeStep := uint16(1) << bits + codeFrom, codeTo := code, code+codeStep + if table.bitLen != bits { + codeTo = code | (uint16(1) << table.bitLen) + } + + cw := &codeword{pattern: &pattern, len: byte(bits), ptr: nil} + for c := codeFrom; c < codeTo; c += codeStep { + if p := table.patterns[c]; p == nil { + table.patterns[c] = cw + } else { + p.pattern, p.len, p.ptr = &pattern, byte(bits), nil } } return 1 @@ -192,17 +200,14 @@ func buildPatternTable(depths []uint64, patterns [][]byte, table *patternTable, tableSize := 1 << bitLen newTable := &patternTable{ bitLen: bitLen, - patterns: make([]*word, tableSize), - lens: make([]byte, tableSize), - ptrs: make([]*patternTable, tableSize), + patterns: make([]*codeword, tableSize), } - table.patterns[code] = nil - table.lens[code] = byte(0) - table.ptrs[code] = newTable - return buildPatternTable(depths, patterns, newTable, 0, 0, depth, maxDepth) + + table.patterns[code] = &codeword{pattern: nil, len: byte(0), ptr: newTable} + return buildPatternTable(newTable, depths, patterns, 0, 0, depth, maxDepth) } - b0 := buildPatternTable(depths, patterns, table, code, bits+1, depth+1, maxDepth-1) - return b0 + buildPatternTable(depths[b0:], patterns[b0:], table, (uint16(1)< 0 { g.dataP++ g.dataBit = 0 @@ -383,13 +389,13 @@ func (g *Getter) Next(buf []byte) ([]byte, uint64) { } bufPos := len(buf) // Tracking position in buf where to insert part of the word lastUncovered := len(buf) - if len(buf)+int(l) > cap(buf) { - newBuf := make([]byte, len(buf)+int(l)) + if len(buf)+int(nextPos) > cap(buf) { + newBuf := make([]byte, len(buf)+int(nextPos)) copy(newBuf, buf) buf = newBuf } else { // Expand buffer - buf = buf[:len(buf)+int(l)] + buf = buf[:len(buf)+int(nextPos)] } // Loop below fills in the patterns for pos := g.nextPos(false /* clean */); pos != 0; pos = g.nextPos(false) { @@ -406,7 +412,7 @@ func (g *Getter) Next(buf []byte) ([]byte, uint64) { g.nextPos(true /* clean */) // Reset the state of huffman reader bufPos = lastUncovered // Restore to the beginning of buf // Loop below fills the data which is not in the patterns - for pos := g.nextPos(false /* clean */); pos != 0; pos = g.nextPos(false) { + for pos := g.nextPos(false); pos != 0; pos = g.nextPos(false) { bufPos += int(pos) - 1 // Positions where to insert patterns are encoded relative to one another if bufPos > lastUncovered { dif := uint64(bufPos - lastUncovered) @@ -415,9 +421,9 @@ func (g *Getter) Next(buf []byte) ([]byte, uint64) { } lastUncovered = bufPos + len(g.nextPattern()) } - if int(l) > lastUncovered { - dif := l - uint64(lastUncovered) - copy(buf[lastUncovered:l], g.data[postLoopPos:postLoopPos+dif]) + if int(nextPos) > lastUncovered { + dif := nextPos - uint64(lastUncovered) + copy(buf[lastUncovered:nextPos], g.data[postLoopPos:postLoopPos+dif]) postLoopPos += dif } g.dataP = postLoopPos diff --git a/compress/decompress_bench_test.go b/compress/decompress_bench_test.go index a58494e5b..9ae6c9dc0 100644 --- a/compress/decompress_bench_test.go +++ b/compress/decompress_bench_test.go @@ -17,7 +17,11 @@ package compress import ( + "fmt" + "os" "testing" + + "github.com/stretchr/testify/require" ) func BenchmarkDecompressNext(b *testing.B) { @@ -67,3 +71,28 @@ func BenchmarkDecompressMatchPrefix(b *testing.B) { _ = g.MatchPrefix([]byte("longlongword")) } } + +func BenchmarkDecompressTorrent(t *testing.B) { + t.Skip() + + // fpath := "./v1-000500-001000-transactions.seg" + // fpath := "./v1-004000-004500-transactions.seg" + // fpath := "./v1-005500-006000-transactions.seg" + fpath := "./v1-006000-006500-transactions.seg" + st, err := os.Stat(fpath) + require.NoError(t, err) + fmt.Printf("stat: %+v %dbytes\n", st.Name(), st.Size()) + + d, err := NewDecompressor(fpath) + require.NoError(t, err) + defer d.Close() + + getter := d.MakeGetter() + + for i := 0; i < t.N; i++ { + _, sz := getter.Next(nil) + if sz == 0 { + t.Fatal("sz == 0") + } + } +} diff --git a/compress/decompress_test.go b/compress/decompress_test.go index 0f7731328..5b01ef123 100644 --- a/compress/decompress_test.go +++ b/compress/decompress_test.go @@ -20,11 +20,13 @@ import ( "bytes" "context" "fmt" + "os" "path/filepath" "strings" "testing" "github.com/ledgerwatch/log/v3" + "github.com/stretchr/testify/require" ) func prepareLoremDict(t *testing.T) *Decompressor { @@ -207,3 +209,24 @@ consequat Duis aute irure dolor in reprehenderit in voluptate velit esse cillum Excepteur sint occaecat cupidatat non proident sunt in culpa qui officia deserunt mollit anim id est laborum` var loremStrings = strings.Split(lorem, " ") + +func TestDecompressTorrentWIthSwitch(t *testing.T) { + t.Skip() + + fpath := "./v1-006000-006500-transactions.seg" + // fpath := "./v1-000500-001000-transactions.seg" + st, err := os.Stat(fpath) + require.NoError(t, err) + fmt.Printf("stat: %+v %dbytes\n", st.Name(), st.Size()) + + d, err := NewDecompressor(fpath) + require.NoError(t, err) + defer d.Close() + + getter := d.MakeGetter() + for getter.HasNext() { + aux, sz := getter.Next(nil) + require.NotZero(t, sz) + _ = aux + } +}