From 82753a6cda4ae96802ec6f9ca5988fb6ef6a8f89 Mon Sep 17 00:00:00 2001 From: Alex Sharov Date: Tue, 21 Dec 2021 10:48:34 +0700 Subject: [PATCH] add words count in .seg (breaking change in snapshot format) (#3140) --- cmd/hack/hack.go | 22 ++++++++++++++-------- go.mod | 2 +- go.sum | 4 ++-- 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/cmd/hack/hack.go b/cmd/hack/hack.go index c7a41d1bf..6c6c98828 100644 --- a/cmd/hack/hack.go +++ b/cmd/hack/hack.go @@ -2171,19 +2171,19 @@ func reducedict(name string, segmentFileName string) error { wg.Add(1) go reduceDictWorker(ch, &wg, &pt, collector, inputSize, outputSize, posMap) } - i := 0 + var wordsCount uint64 if err := snapshotsync.ReadSimpleFile(name+".dat", func(v []byte) error { input := make([]byte, 8+int(len(v))) - binary.BigEndian.PutUint64(input, uint64(i)) + binary.BigEndian.PutUint64(input, wordsCount) copy(input[8:], v) ch <- input - i++ + wordsCount++ select { default: case <-logEvery.C: var m runtime.MemStats runtime.ReadMemStats(&m) - log.Info("Replacement preprocessing", "processed", fmt.Sprintf("%dK", i/1_000), "input", common.StorageSize(inputSize.Load()), "output", common.StorageSize(outputSize.Load()), "alloc", common.StorageSize(m.Alloc), "sys", common.StorageSize(m.Sys)) + log.Info("Replacement preprocessing", "processed", fmt.Sprintf("%dK", wordsCount/1_000), "input", common.StorageSize(inputSize.Load()), "output", common.StorageSize(outputSize.Load()), "alloc", common.StorageSize(m.Alloc), "sys", common.StorageSize(m.Sys)) } return nil }); err != nil { @@ -2191,6 +2191,7 @@ func reducedict(name string, segmentFileName string) error { } close(ch) wg.Wait() + var m runtime.MemStats runtime.ReadMemStats(&m) log.Info("Done", "input", common.StorageSize(inputSize.Load()), "output", common.StorageSize(outputSize.Load()), "alloc", common.StorageSize(m.Alloc), "sys", common.StorageSize(m.Sys)) @@ -2223,7 +2224,7 @@ func reducedict(name string, segmentFileName string) error { offset += 
uint64(n + len(p.w)) } patternCutoff := offset // All offsets below this will be considered patterns - i = 0 + i := 0 log.Info("Effective dictionary", "size", patternList.Len()) // Build Huffman tree for codes var codeHeap PatternHeap @@ -2284,17 +2285,22 @@ func reducedict(name string, segmentFileName string) error { return err } cw := bufio.NewWriterSize(cf, etl.BufIOSize) - // First, output dictionary + // 1-st, output words count + binary.BigEndian.PutUint64(numBuf, wordsCount) // Words count + if _, err = cw.Write(numBuf[:8]); err != nil { + return err + } + // 2-nd, output dictionary binary.BigEndian.PutUint64(numBuf, offset) // Dictionary size if _, err = cw.Write(numBuf[:8]); err != nil { return err } - // Secondly, output directory root + // 3-rd, output directory root binary.BigEndian.PutUint64(numBuf, root.offset) if _, err = cw.Write(numBuf[:8]); err != nil { return err } - // Thirdly, output pattern cutoff offset + // 4-th, output pattern cutoff offset binary.BigEndian.PutUint64(numBuf, patternCutoff) if _, err = cw.Write(numBuf[:8]); err != nil { return err diff --git a/go.mod b/go.mod index 1b84228ff..7c7c10ef0 100644 --- a/go.mod +++ b/go.mod @@ -37,7 +37,7 @@ require ( github.com/json-iterator/go v1.1.12 github.com/julienschmidt/httprouter v1.3.0 github.com/kevinburke/go-bindata v3.21.0+incompatible - github.com/ledgerwatch/erigon-lib v0.0.0-20211217093546-8d06531e4ed3 + github.com/ledgerwatch/erigon-lib v0.0.0-20211221034520-583c3f9b5b6f github.com/ledgerwatch/log/v3 v3.4.0 github.com/ledgerwatch/secp256k1 v1.0.0 github.com/logrusorgru/aurora/v3 v3.0.0 diff --git a/go.sum b/go.sum index 2ef60e92e..5eb9a4e59 100644 --- a/go.sum +++ b/go.sum @@ -617,8 +617,8 @@ github.com/kylelemons/godebug v0.0.0-20170224010052-a616ab194758 h1:0D5M2HQSGD3P github.com/kylelemons/godebug v0.0.0-20170224010052-a616ab194758/go.mod h1:B69LEHPfb2qLo0BaaOLcbitczOKLWTsrBG9LczfCD4k= github.com/leanovate/gopter v0.2.9 h1:fQjYxZaynp97ozCzfOyOuAGOU4aU/z37zf/tOujFk7c= 
github.com/leanovate/gopter v0.2.9/go.mod h1:U2L/78B+KVFIx2VmW6onHJQzXtFb+p5y3y2Sh+Jxxv8= -github.com/ledgerwatch/erigon-lib v0.0.0-20211217093546-8d06531e4ed3 h1:8qDZvisP+6pFiVFd20BTD2y8/rYAe4go//HdBnk6CX8= -github.com/ledgerwatch/erigon-lib v0.0.0-20211217093546-8d06531e4ed3/go.mod h1:lyGP3i0x4CeabdKZ4beycD5xZfHWZwJsAX+70OfGj4Y= +github.com/ledgerwatch/erigon-lib v0.0.0-20211221034520-583c3f9b5b6f h1:MCIljelbCsLcgMzNTsrRg2Nu5DFyNlLxf5ZSWdy3CiM= +github.com/ledgerwatch/erigon-lib v0.0.0-20211221034520-583c3f9b5b6f/go.mod h1:lyGP3i0x4CeabdKZ4beycD5xZfHWZwJsAX+70OfGj4Y= github.com/ledgerwatch/log/v3 v3.4.0 h1:SEIOcv5a2zkG3PmoT5jeTU9m/0nEUv0BJS5bzsjwKCI= github.com/ledgerwatch/log/v3 v3.4.0/go.mod h1:VXcz6Ssn6XEeU92dCMc39/g1F0OYAjw1Mt+dGP5DjXY= github.com/ledgerwatch/secp256k1 v1.0.0 h1:Usvz87YoTG0uePIV8woOof5cQnLXGYa162rFf3YnwaQ=