add words count in .seg (breaking change in snapshot format) (#3140)

This commit is contained in:
Alex Sharov 2021-12-21 10:48:34 +07:00 committed by GitHub
parent 502e933029
commit 82753a6cda
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 17 additions and 11 deletions

View File

@ -2171,19 +2171,19 @@ func reducedict(name string, segmentFileName string) error {
wg.Add(1) wg.Add(1)
go reduceDictWorker(ch, &wg, &pt, collector, inputSize, outputSize, posMap) go reduceDictWorker(ch, &wg, &pt, collector, inputSize, outputSize, posMap)
} }
i := 0 var wordsCount uint64
if err := snapshotsync.ReadSimpleFile(name+".dat", func(v []byte) error { if err := snapshotsync.ReadSimpleFile(name+".dat", func(v []byte) error {
input := make([]byte, 8+int(len(v))) input := make([]byte, 8+int(len(v)))
binary.BigEndian.PutUint64(input, uint64(i)) binary.BigEndian.PutUint64(input, wordsCount)
copy(input[8:], v) copy(input[8:], v)
ch <- input ch <- input
i++ wordsCount++
select { select {
default: default:
case <-logEvery.C: case <-logEvery.C:
var m runtime.MemStats var m runtime.MemStats
runtime.ReadMemStats(&m) runtime.ReadMemStats(&m)
log.Info("Replacement preprocessing", "processed", fmt.Sprintf("%dK", i/1_000), "input", common.StorageSize(inputSize.Load()), "output", common.StorageSize(outputSize.Load()), "alloc", common.StorageSize(m.Alloc), "sys", common.StorageSize(m.Sys)) log.Info("Replacement preprocessing", "processed", fmt.Sprintf("%dK", wordsCount/1_000), "input", common.StorageSize(inputSize.Load()), "output", common.StorageSize(outputSize.Load()), "alloc", common.StorageSize(m.Alloc), "sys", common.StorageSize(m.Sys))
} }
return nil return nil
}); err != nil { }); err != nil {
@ -2191,6 +2191,7 @@ func reducedict(name string, segmentFileName string) error {
} }
close(ch) close(ch)
wg.Wait() wg.Wait()
var m runtime.MemStats var m runtime.MemStats
runtime.ReadMemStats(&m) runtime.ReadMemStats(&m)
log.Info("Done", "input", common.StorageSize(inputSize.Load()), "output", common.StorageSize(outputSize.Load()), "alloc", common.StorageSize(m.Alloc), "sys", common.StorageSize(m.Sys)) log.Info("Done", "input", common.StorageSize(inputSize.Load()), "output", common.StorageSize(outputSize.Load()), "alloc", common.StorageSize(m.Alloc), "sys", common.StorageSize(m.Sys))
@ -2223,7 +2224,7 @@ func reducedict(name string, segmentFileName string) error {
offset += uint64(n + len(p.w)) offset += uint64(n + len(p.w))
} }
patternCutoff := offset // All offsets below this will be considered patterns patternCutoff := offset // All offsets below this will be considered patterns
i = 0 i := 0
log.Info("Effective dictionary", "size", patternList.Len()) log.Info("Effective dictionary", "size", patternList.Len())
// Build Huffman tree for codes // Build Huffman tree for codes
var codeHeap PatternHeap var codeHeap PatternHeap
@ -2284,17 +2285,22 @@ func reducedict(name string, segmentFileName string) error {
return err return err
} }
cw := bufio.NewWriterSize(cf, etl.BufIOSize) cw := bufio.NewWriterSize(cf, etl.BufIOSize)
// First, output dictionary // 1-st, output words count
binary.BigEndian.PutUint64(numBuf, wordsCount) // Words count
if _, err = cw.Write(numBuf[:8]); err != nil {
return err
}
// 2-nd, output dictionary
binary.BigEndian.PutUint64(numBuf, offset) // Dictionary size binary.BigEndian.PutUint64(numBuf, offset) // Dictionary size
if _, err = cw.Write(numBuf[:8]); err != nil { if _, err = cw.Write(numBuf[:8]); err != nil {
return err return err
} }
// Secondly, output directory root // 3-rd, output directory root
binary.BigEndian.PutUint64(numBuf, root.offset) binary.BigEndian.PutUint64(numBuf, root.offset)
if _, err = cw.Write(numBuf[:8]); err != nil { if _, err = cw.Write(numBuf[:8]); err != nil {
return err return err
} }
// Thirdly, output pattern cutoff offset // 4-th, output pattern cutoff offset
binary.BigEndian.PutUint64(numBuf, patternCutoff) binary.BigEndian.PutUint64(numBuf, patternCutoff)
if _, err = cw.Write(numBuf[:8]); err != nil { if _, err = cw.Write(numBuf[:8]); err != nil {
return err return err

2
go.mod
View File

@ -37,7 +37,7 @@ require (
github.com/json-iterator/go v1.1.12 github.com/json-iterator/go v1.1.12
github.com/julienschmidt/httprouter v1.3.0 github.com/julienschmidt/httprouter v1.3.0
github.com/kevinburke/go-bindata v3.21.0+incompatible github.com/kevinburke/go-bindata v3.21.0+incompatible
github.com/ledgerwatch/erigon-lib v0.0.0-20211217093546-8d06531e4ed3 github.com/ledgerwatch/erigon-lib v0.0.0-20211221034520-583c3f9b5b6f
github.com/ledgerwatch/log/v3 v3.4.0 github.com/ledgerwatch/log/v3 v3.4.0
github.com/ledgerwatch/secp256k1 v1.0.0 github.com/ledgerwatch/secp256k1 v1.0.0
github.com/logrusorgru/aurora/v3 v3.0.0 github.com/logrusorgru/aurora/v3 v3.0.0

4
go.sum
View File

@ -617,8 +617,8 @@ github.com/kylelemons/godebug v0.0.0-20170224010052-a616ab194758 h1:0D5M2HQSGD3P
github.com/kylelemons/godebug v0.0.0-20170224010052-a616ab194758/go.mod h1:B69LEHPfb2qLo0BaaOLcbitczOKLWTsrBG9LczfCD4k= github.com/kylelemons/godebug v0.0.0-20170224010052-a616ab194758/go.mod h1:B69LEHPfb2qLo0BaaOLcbitczOKLWTsrBG9LczfCD4k=
github.com/leanovate/gopter v0.2.9 h1:fQjYxZaynp97ozCzfOyOuAGOU4aU/z37zf/tOujFk7c= github.com/leanovate/gopter v0.2.9 h1:fQjYxZaynp97ozCzfOyOuAGOU4aU/z37zf/tOujFk7c=
github.com/leanovate/gopter v0.2.9/go.mod h1:U2L/78B+KVFIx2VmW6onHJQzXtFb+p5y3y2Sh+Jxxv8= github.com/leanovate/gopter v0.2.9/go.mod h1:U2L/78B+KVFIx2VmW6onHJQzXtFb+p5y3y2Sh+Jxxv8=
github.com/ledgerwatch/erigon-lib v0.0.0-20211217093546-8d06531e4ed3 h1:8qDZvisP+6pFiVFd20BTD2y8/rYAe4go//HdBnk6CX8= github.com/ledgerwatch/erigon-lib v0.0.0-20211221034520-583c3f9b5b6f h1:MCIljelbCsLcgMzNTsrRg2Nu5DFyNlLxf5ZSWdy3CiM=
github.com/ledgerwatch/erigon-lib v0.0.0-20211217093546-8d06531e4ed3/go.mod h1:lyGP3i0x4CeabdKZ4beycD5xZfHWZwJsAX+70OfGj4Y= github.com/ledgerwatch/erigon-lib v0.0.0-20211221034520-583c3f9b5b6f/go.mod h1:lyGP3i0x4CeabdKZ4beycD5xZfHWZwJsAX+70OfGj4Y=
github.com/ledgerwatch/log/v3 v3.4.0 h1:SEIOcv5a2zkG3PmoT5jeTU9m/0nEUv0BJS5bzsjwKCI= github.com/ledgerwatch/log/v3 v3.4.0 h1:SEIOcv5a2zkG3PmoT5jeTU9m/0nEUv0BJS5bzsjwKCI=
github.com/ledgerwatch/log/v3 v3.4.0/go.mod h1:VXcz6Ssn6XEeU92dCMc39/g1F0OYAjw1Mt+dGP5DjXY= github.com/ledgerwatch/log/v3 v3.4.0/go.mod h1:VXcz6Ssn6XEeU92dCMc39/g1F0OYAjw1Mt+dGP5DjXY=
github.com/ledgerwatch/secp256k1 v1.0.0 h1:Usvz87YoTG0uePIV8woOof5cQnLXGYa162rFf3YnwaQ= github.com/ledgerwatch/secp256k1 v1.0.0 h1:Usvz87YoTG0uePIV8woOof5cQnLXGYa162rFf3YnwaQ=