add words count in .seg (breaking change in snapshot format) (#3140)

This commit is contained in:
Alex Sharov 2021-12-21 10:48:34 +07:00 committed by GitHub
parent 502e933029
commit 82753a6cda
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 17 additions and 11 deletions

View File

@ -2171,19 +2171,19 @@ func reducedict(name string, segmentFileName string) error {
wg.Add(1)
go reduceDictWorker(ch, &wg, &pt, collector, inputSize, outputSize, posMap)
}
i := 0
var wordsCount uint64
if err := snapshotsync.ReadSimpleFile(name+".dat", func(v []byte) error {
input := make([]byte, 8+int(len(v)))
binary.BigEndian.PutUint64(input, uint64(i))
binary.BigEndian.PutUint64(input, wordsCount)
copy(input[8:], v)
ch <- input
i++
wordsCount++
select {
default:
case <-logEvery.C:
var m runtime.MemStats
runtime.ReadMemStats(&m)
log.Info("Replacement preprocessing", "processed", fmt.Sprintf("%dK", i/1_000), "input", common.StorageSize(inputSize.Load()), "output", common.StorageSize(outputSize.Load()), "alloc", common.StorageSize(m.Alloc), "sys", common.StorageSize(m.Sys))
log.Info("Replacement preprocessing", "processed", fmt.Sprintf("%dK", wordsCount/1_000), "input", common.StorageSize(inputSize.Load()), "output", common.StorageSize(outputSize.Load()), "alloc", common.StorageSize(m.Alloc), "sys", common.StorageSize(m.Sys))
}
return nil
}); err != nil {
@ -2191,6 +2191,7 @@ func reducedict(name string, segmentFileName string) error {
}
close(ch)
wg.Wait()
var m runtime.MemStats
runtime.ReadMemStats(&m)
log.Info("Done", "input", common.StorageSize(inputSize.Load()), "output", common.StorageSize(outputSize.Load()), "alloc", common.StorageSize(m.Alloc), "sys", common.StorageSize(m.Sys))
@ -2223,7 +2224,7 @@ func reducedict(name string, segmentFileName string) error {
offset += uint64(n + len(p.w))
}
patternCutoff := offset // All offsets below this will be considered patterns
i = 0
i := 0
log.Info("Effective dictionary", "size", patternList.Len())
// Build Huffman tree for codes
var codeHeap PatternHeap
@ -2284,17 +2285,22 @@ func reducedict(name string, segmentFileName string) error {
return err
}
cw := bufio.NewWriterSize(cf, etl.BufIOSize)
// First, output dictionary
// 1-st, output words count
binary.BigEndian.PutUint64(numBuf, wordsCount) // Words count
if _, err = cw.Write(numBuf[:8]); err != nil {
return err
}
// 2-nd, output dictionary size
binary.BigEndian.PutUint64(numBuf, offset) // Dictionary size
if _, err = cw.Write(numBuf[:8]); err != nil {
return err
}
// Secondly, output directory root
// 3-rd, output directory root
binary.BigEndian.PutUint64(numBuf, root.offset)
if _, err = cw.Write(numBuf[:8]); err != nil {
return err
}
// Thirdly, output pattern cutoff offset
// 4-th, output pattern cutoff offset
binary.BigEndian.PutUint64(numBuf, patternCutoff)
if _, err = cw.Write(numBuf[:8]); err != nil {
return err

2
go.mod
View File

@ -37,7 +37,7 @@ require (
github.com/json-iterator/go v1.1.12
github.com/julienschmidt/httprouter v1.3.0
github.com/kevinburke/go-bindata v3.21.0+incompatible
github.com/ledgerwatch/erigon-lib v0.0.0-20211217093546-8d06531e4ed3
github.com/ledgerwatch/erigon-lib v0.0.0-20211221034520-583c3f9b5b6f
github.com/ledgerwatch/log/v3 v3.4.0
github.com/ledgerwatch/secp256k1 v1.0.0
github.com/logrusorgru/aurora/v3 v3.0.0

4
go.sum
View File

@ -617,8 +617,8 @@ github.com/kylelemons/godebug v0.0.0-20170224010052-a616ab194758 h1:0D5M2HQSGD3P
github.com/kylelemons/godebug v0.0.0-20170224010052-a616ab194758/go.mod h1:B69LEHPfb2qLo0BaaOLcbitczOKLWTsrBG9LczfCD4k=
github.com/leanovate/gopter v0.2.9 h1:fQjYxZaynp97ozCzfOyOuAGOU4aU/z37zf/tOujFk7c=
github.com/leanovate/gopter v0.2.9/go.mod h1:U2L/78B+KVFIx2VmW6onHJQzXtFb+p5y3y2Sh+Jxxv8=
github.com/ledgerwatch/erigon-lib v0.0.0-20211217093546-8d06531e4ed3 h1:8qDZvisP+6pFiVFd20BTD2y8/rYAe4go//HdBnk6CX8=
github.com/ledgerwatch/erigon-lib v0.0.0-20211217093546-8d06531e4ed3/go.mod h1:lyGP3i0x4CeabdKZ4beycD5xZfHWZwJsAX+70OfGj4Y=
github.com/ledgerwatch/erigon-lib v0.0.0-20211221034520-583c3f9b5b6f h1:MCIljelbCsLcgMzNTsrRg2Nu5DFyNlLxf5ZSWdy3CiM=
github.com/ledgerwatch/erigon-lib v0.0.0-20211221034520-583c3f9b5b6f/go.mod h1:lyGP3i0x4CeabdKZ4beycD5xZfHWZwJsAX+70OfGj4Y=
github.com/ledgerwatch/log/v3 v3.4.0 h1:SEIOcv5a2zkG3PmoT5jeTU9m/0nEUv0BJS5bzsjwKCI=
github.com/ledgerwatch/log/v3 v3.4.0/go.mod h1:VXcz6Ssn6XEeU92dCMc39/g1F0OYAjw1Mt+dGP5DjXY=
github.com/ledgerwatch/secp256k1 v1.0.0 h1:Usvz87YoTG0uePIV8woOof5cQnLXGYa162rFf3YnwaQ=