reduced memory footprint on building huffman table (#459)

This commit is contained in:
Artem Tsebrovskiy 2022-05-20 07:23:05 +03:00 committed by GitHub
parent 59d810650c
commit 6de4ac4ba9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 105 additions and 47 deletions

View File

@ -25,11 +25,15 @@ import (
"github.com/ledgerwatch/erigon-lib/mmap"
)
// codeword is a single entry of a pattern lookup table. It either carries a
// decoded pattern (len > 0) or, when len == 0, points to a deeper-level table
// that must be consulted with the next bits of the code.
type codeword struct {
len byte // Number of bits in the codes (0 means: descend via ptr)
pattern *word // Pattern corresponding to entries
ptr *patternTable // pointer to deeper level tables
}
// patternTable maps a bitLen-wide window of code bits to codewords; entries
// with len == 0 chain to deeper tables via codeword.ptr.
// NOTE(review): this span is a diff rendering — both the pre-change field set
// (patterns []*word / lens / ptrs) and the post-change one
// (patterns []*codeword) appear below; only one set exists in the real file.
type patternTable struct {
bitLen int // Number of bits to lookup in the table
patterns []*word // Patterns corresponding to entries
lens []byte // Number of bits in the codes
ptrs []*patternTable // pointers to deeper level tables
bitLen int // Number of bits to lookup in the table
patterns []*codeword // one codeword per possible bitLen-bit prefix
}
type posTable struct {
@ -74,6 +78,8 @@ func NewDecompressor(compressedFile string) (*Decompressor, error) {
if d.mmapHandle1, d.mmapHandle2, err = mmap.Mmap(d.f, int(d.size)); err != nil {
return nil, err
}
// read patterns from file
d.data = d.mmapHandle1[:d.size]
d.wordsCount = binary.BigEndian.Uint64(d.data[:8])
d.emptyWordsCount = binary.BigEndian.Uint64(d.data[8:16])
@ -83,6 +89,7 @@ func NewDecompressor(compressedFile string) (*Decompressor, error) {
var patterns [][]byte
var i uint64
var patternMaxDepth uint64
//fmt.Printf("[decomp] dictSize = %d\n", dictSize)
for i < dictSize {
d, ns := binary.Uvarint(data[i:])
@ -97,6 +104,7 @@ func NewDecompressor(compressedFile string) (*Decompressor, error) {
//fmt.Printf("depth = %d, pattern = [%x]\n", d, data[i:i+l])
i += l
}
if dictSize > 0 {
var bitLen int
if patternMaxDepth > 9 {
@ -108,12 +116,12 @@ func NewDecompressor(compressedFile string) (*Decompressor, error) {
tableSize := 1 << bitLen
d.dict = &patternTable{
bitLen: bitLen,
patterns: make([]*word, tableSize),
lens: make([]byte, tableSize),
ptrs: make([]*patternTable, tableSize),
patterns: make([]*codeword, tableSize),
}
buildPatternTable(depths, patterns, d.dict, 0, 0, 0, patternMaxDepth)
buildPatternTable(d.dict, depths, patterns, 0, 0, 0, patternMaxDepth)
}
// read positions
pos := 24 + dictSize
dictSize = binary.BigEndian.Uint64(d.data[pos : pos+8])
data = d.data[pos+8 : pos+8+dictSize]
@ -133,6 +141,7 @@ func NewDecompressor(compressedFile string) (*Decompressor, error) {
i += uint64(n)
poss = append(poss, pos)
}
if dictSize > 0 {
var bitLen int
if posMaxDepth > 9 {
@ -154,30 +163,29 @@ func NewDecompressor(compressedFile string) (*Decompressor, error) {
return d, nil
}
type word []byte
type word []byte // plain text word associated with code from dictionary
// returns the number of depths and patterns consumed
func buildPatternTable(depths []uint64, patterns [][]byte, table *patternTable, code uint16, bits int, depth uint64, maxDepth uint64) int {
func buildPatternTable(table *patternTable, depths []uint64, patterns [][]byte, code uint16, bits int, depth uint64, maxDepth uint64) int {
if len(depths) == 0 {
return 0
}
if depth == depths[0] {
pattern := word(make([]byte, len(patterns[0])))
copy(pattern, patterns[0])
pattern := word(patterns[0])
//fmt.Printf("depth=%d, maxDepth=%d, code=[%b], codeLen=%d, pattern=[%x]\n", depth, maxDepth, code, bits, pattern)
if table.bitLen == int(bits) {
table.patterns[code] = &pattern
table.lens[code] = byte(bits)
table.ptrs[code] = nil
} else {
codeStep := uint16(1) << bits
codeFrom := code
codeTo := code | (uint16(1) << table.bitLen)
for c := codeFrom; c < codeTo; c += codeStep {
table.patterns[c] = &pattern
table.lens[c] = byte(bits)
table.ptrs[c] = nil
codeStep := uint16(1) << bits
codeFrom, codeTo := code, code+codeStep
if table.bitLen != bits {
codeTo = code | (uint16(1) << table.bitLen)
}
cw := &codeword{pattern: &pattern, len: byte(bits), ptr: nil}
for c := codeFrom; c < codeTo; c += codeStep {
if p := table.patterns[c]; p == nil {
table.patterns[c] = cw
} else {
p.pattern, p.len, p.ptr = &pattern, byte(bits), nil
}
}
return 1
@ -192,17 +200,14 @@ func buildPatternTable(depths []uint64, patterns [][]byte, table *patternTable,
tableSize := 1 << bitLen
newTable := &patternTable{
bitLen: bitLen,
patterns: make([]*word, tableSize),
lens: make([]byte, tableSize),
ptrs: make([]*patternTable, tableSize),
patterns: make([]*codeword, tableSize),
}
table.patterns[code] = nil
table.lens[code] = byte(0)
table.ptrs[code] = newTable
return buildPatternTable(depths, patterns, newTable, 0, 0, depth, maxDepth)
table.patterns[code] = &codeword{pattern: nil, len: byte(0), ptr: newTable}
return buildPatternTable(newTable, depths, patterns, 0, 0, depth, maxDepth)
}
b0 := buildPatternTable(depths, patterns, table, code, bits+1, depth+1, maxDepth-1)
return b0 + buildPatternTable(depths[b0:], patterns[b0:], table, (uint16(1)<<bits)|code, bits+1, depth+1, maxDepth-1)
b0 := buildPatternTable(table, depths, patterns, code, bits+1, depth+1, maxDepth-1)
return b0 + buildPatternTable(table, depths[b0:], patterns[b0:], (uint16(1)<<bits)|code, bits+1, depth+1, maxDepth-1)
}
func buildPosTable(depths []uint64, poss []uint64, table *posTable, code uint16, bits int, depth uint64, maxDepth uint64) int {
@ -324,7 +329,7 @@ func (g *Getter) nextPos(clean bool) uint64 {
func (g *Getter) nextPattern() []byte {
table := g.patternDict
if table.bitLen == 0 {
return *table.patterns[0]
return *table.patterns[0].pattern
}
var l byte
var pattern []byte
@ -334,13 +339,14 @@ func (g *Getter) nextPattern() []byte {
code |= uint16(g.data[g.dataP+1]) << (8 - g.dataBit)
}
code &= (uint16(1) << table.bitLen) - 1
l = table.lens[code]
cw := table.patterns[code]
l = cw.len
if l == 0 {
table = table.ptrs[code]
table = cw.ptr
g.dataBit += 9
} else {
g.dataBit += int(l)
pattern = *table.patterns[code]
pattern = *cw.pattern
}
g.dataP += uint64(g.dataBit / 8)
g.dataBit = g.dataBit % 8
@ -372,9 +378,9 @@ func (g *Getter) HasNext() bool {
// After extracting next word, it moves to the beginning of the next one
func (g *Getter) Next(buf []byte) ([]byte, uint64) {
savePos := g.dataP
l := g.nextPos(true)
l-- // because when create huffman tree we do ++ , because 0 is terminator
if l == 0 {
nextPos := g.nextPos(true)
nextPos-- // because when create huffman tree we do ++ , because 0 is terminator
if nextPos == 0 {
if g.dataBit > 0 {
g.dataP++
g.dataBit = 0
@ -383,13 +389,13 @@ func (g *Getter) Next(buf []byte) ([]byte, uint64) {
}
bufPos := len(buf) // Tracking position in buf where to insert part of the word
lastUncovered := len(buf)
if len(buf)+int(l) > cap(buf) {
newBuf := make([]byte, len(buf)+int(l))
if len(buf)+int(nextPos) > cap(buf) {
newBuf := make([]byte, len(buf)+int(nextPos))
copy(newBuf, buf)
buf = newBuf
} else {
// Expand buffer
buf = buf[:len(buf)+int(l)]
buf = buf[:len(buf)+int(nextPos)]
}
// Loop below fills in the patterns
for pos := g.nextPos(false /* clean */); pos != 0; pos = g.nextPos(false) {
@ -406,7 +412,7 @@ func (g *Getter) Next(buf []byte) ([]byte, uint64) {
g.nextPos(true /* clean */) // Reset the state of huffman reader
bufPos = lastUncovered // Restore to the beginning of buf
// Loop below fills the data which is not in the patterns
for pos := g.nextPos(false /* clean */); pos != 0; pos = g.nextPos(false) {
for pos := g.nextPos(false); pos != 0; pos = g.nextPos(false) {
bufPos += int(pos) - 1 // Positions where to insert patterns are encoded relative to one another
if bufPos > lastUncovered {
dif := uint64(bufPos - lastUncovered)
@ -415,9 +421,9 @@ func (g *Getter) Next(buf []byte) ([]byte, uint64) {
}
lastUncovered = bufPos + len(g.nextPattern())
}
if int(l) > lastUncovered {
dif := l - uint64(lastUncovered)
copy(buf[lastUncovered:l], g.data[postLoopPos:postLoopPos+dif])
if int(nextPos) > lastUncovered {
dif := nextPos - uint64(lastUncovered)
copy(buf[lastUncovered:nextPos], g.data[postLoopPos:postLoopPos+dif])
postLoopPos += dif
}
g.dataP = postLoopPos

View File

@ -17,7 +17,11 @@
package compress
import (
"fmt"
"os"
"testing"
"github.com/stretchr/testify/require"
)
func BenchmarkDecompressNext(b *testing.B) {
@ -67,3 +71,28 @@ func BenchmarkDecompressMatchPrefix(b *testing.B) {
_ = g.MatchPrefix([]byte("longlongword"))
}
}
// BenchmarkDecompressTorrent measures sequential word extraction from a
// pre-built .seg snapshot. Skipped by default: the fixture file is not part
// of the repository and must be placed next to the package to run this.
func BenchmarkDecompressTorrent(t *testing.B) {
	t.Skip()

	// Alternative fixtures:
	// "./v1-000500-001000-transactions.seg"
	// "./v1-004000-004500-transactions.seg"
	// "./v1-005500-006000-transactions.seg"
	fpath := "./v1-006000-006500-transactions.seg"

	fi, err := os.Stat(fpath)
	require.NoError(t, err)
	fmt.Printf("stat: %+v %dbytes\n", fi.Name(), fi.Size())

	decomp, err := NewDecompressor(fpath)
	require.NoError(t, err)
	defer decomp.Close()

	g := decomp.MakeGetter()
	for n := 0; n < t.N; n++ {
		if _, sz := g.Next(nil); sz == 0 {
			t.Fatal("sz == 0")
		}
	}
}

View File

@ -20,11 +20,13 @@ import (
"bytes"
"context"
"fmt"
"os"
"path/filepath"
"strings"
"testing"
"github.com/ledgerwatch/log/v3"
"github.com/stretchr/testify/require"
)
func prepareLoremDict(t *testing.T) *Decompressor {
@ -207,3 +209,24 @@ consequat Duis aute irure dolor in reprehenderit in voluptate velit esse cillum
Excepteur sint occaecat cupidatat non proident sunt in culpa qui officia deserunt mollit anim id est laborum`
var loremStrings = strings.Split(lorem, " ")
// TestDecompressTorrentWIthSwitch walks an entire pre-built .seg snapshot and
// checks that every decompressed word reports a non-zero size. Skipped by
// default because the fixture file is not checked into the repository.
func TestDecompressTorrentWIthSwitch(t *testing.T) {
	t.Skip()

	fpath := "./v1-006000-006500-transactions.seg"
	// Alternative fixture: "./v1-000500-001000-transactions.seg"

	fi, err := os.Stat(fpath)
	require.NoError(t, err)
	fmt.Printf("stat: %+v %dbytes\n", fi.Name(), fi.Size())

	decomp, err := NewDecompressor(fpath)
	require.NoError(t, err)
	defer decomp.Close()

	for g := decomp.MakeGetter(); g.HasNext(); {
		word, sz := g.Next(nil)
		require.NotZero(t, sz)
		_ = word
	}
}