// erigon-pulse/compress/parallel_compress.go (979 lines, 28 KiB, Go)

/*
Copyright 2021 Erigon contributors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
2022-01-06 07:13:03 +00:00
package compress
import (
"bufio"
"container/heap"
"context"
"encoding/binary"
"errors"
"fmt"
"io"
"os"
"sync"
"sync/atomic"
"time"
"github.com/ledgerwatch/erigon-lib/common"
2022-01-06 07:13:03 +00:00
"github.com/ledgerwatch/erigon-lib/etl"
"github.com/ledgerwatch/erigon-lib/patricia"
"github.com/ledgerwatch/erigon-lib/sais"
2022-01-06 07:13:03 +00:00
"github.com/ledgerwatch/log/v3"
atomic2 "go.uber.org/atomic"
"golang.org/x/exp/slices"
2022-01-06 07:13:03 +00:00
)
// MinPatternScore is the minimum score (per superstring) a pattern must reach
// to be considered for inclusion into the dictionary.
const MinPatternScore = 1024
func optimiseCluster(trace bool, input []byte, mf2 *patricia.MatchFinder2, output []byte, uncovered []int, patterns []int, cellRing *Ring, posMap map[uint64]uint64) ([]byte, []int, []int) {
matches := mf2.FindLongestMatches(input)
2022-01-06 07:13:03 +00:00
if len(matches) == 0 {
output = append(output, 0) // Encoding of 0 in VarUint is 1 zero byte
2022-01-06 07:13:03 +00:00
output = append(output, input...)
return output, patterns, uncovered
}
if trace {
fmt.Printf("Cluster | input = %x\n", input)
for _, match := range matches {
fmt.Printf(" [%x %d-%d]", input[match.Start:match.End], match.Start, match.End)
}
}
cellRing.Reset()
patterns = append(patterns[:0], 0, 0) // Sentinel entry - no meaning
lastF := matches[len(matches)-1]
for j := lastF.Start; j < lastF.End; j++ {
d := cellRing.PushBack()
d.optimStart = j + 1
d.coverStart = len(input)
d.compression = 0
d.patternIdx = 0
d.score = 0
}
// Starting from the last match
for i := len(matches); i > 0; i-- {
f := matches[i-1]
p := f.Val.(*Pattern)
firstCell := cellRing.Get(0)
maxCompression := firstCell.compression
maxScore := firstCell.score
maxCell := firstCell
var maxInclude bool
for e := 0; e < cellRing.Len(); e++ {
cell := cellRing.Get(e)
comp := cell.compression - 4
if cell.coverStart >= f.End {
comp += f.End - f.Start
} else {
comp += cell.coverStart - f.Start
}
score := cell.score + p.score
if comp > maxCompression || (comp == maxCompression && score > maxScore) {
maxCompression = comp
maxScore = score
maxInclude = true
maxCell = cell
} else if cell.optimStart > f.End {
2022-01-06 07:13:03 +00:00
cellRing.Truncate(e)
break
}
}
d := cellRing.PushFront()
d.optimStart = f.Start
d.score = maxScore
d.compression = maxCompression
if maxInclude {
if trace {
fmt.Printf("[include] cell for %d: with patterns", f.Start)
fmt.Printf(" [%x %d-%d]", input[f.Start:f.End], f.Start, f.End)
patternIdx := maxCell.patternIdx
for patternIdx != 0 {
pattern := patterns[patternIdx]
fmt.Printf(" [%x %d-%d]", input[matches[pattern].Start:matches[pattern].End], matches[pattern].Start, matches[pattern].End)
patternIdx = patterns[patternIdx+1]
}
fmt.Printf("\n\n")
}
d.coverStart = f.Start
d.patternIdx = len(patterns)
patterns = append(patterns, i-1, maxCell.patternIdx)
} else {
if trace {
fmt.Printf("cell for %d: with patterns", f.Start)
patternIdx := maxCell.patternIdx
for patternIdx != 0 {
pattern := patterns[patternIdx]
fmt.Printf(" [%x %d-%d]", input[matches[pattern].Start:matches[pattern].End], matches[pattern].Start, matches[pattern].End)
patternIdx = patterns[patternIdx+1]
}
fmt.Printf("\n\n")
}
d.coverStart = maxCell.coverStart
d.patternIdx = maxCell.patternIdx
}
}
optimCell := cellRing.Get(0)
if trace {
fmt.Printf("optimal =")
}
// Count number of patterns
var patternCount uint64
patternIdx := optimCell.patternIdx
for patternIdx != 0 {
patternCount++
patternIdx = patterns[patternIdx+1]
}
var numBuf [binary.MaxVarintLen64]byte
p := binary.PutUvarint(numBuf[:], patternCount)
2022-01-06 07:13:03 +00:00
output = append(output, numBuf[:p]...)
patternIdx = optimCell.patternIdx
lastStart := 0
var lastUncovered int
uncovered = uncovered[:0]
for patternIdx != 0 {
pattern := patterns[patternIdx]
p := matches[pattern].Val.(*Pattern)
if trace {
fmt.Printf(" [%x %d-%d]", input[matches[pattern].Start:matches[pattern].End], matches[pattern].Start, matches[pattern].End)
}
if matches[pattern].Start > lastUncovered {
uncovered = append(uncovered, lastUncovered, matches[pattern].Start)
}
lastUncovered = matches[pattern].End
// Starting position
posMap[uint64(matches[pattern].Start-lastStart+1)]++
lastStart = matches[pattern].Start
n := binary.PutUvarint(numBuf[:], uint64(matches[pattern].Start))
2022-01-06 07:13:03 +00:00
output = append(output, numBuf[:n]...)
// Code
n = binary.PutUvarint(numBuf[:], p.code)
2022-01-06 07:13:03 +00:00
output = append(output, numBuf[:n]...)
atomic.AddUint64(&p.uses, 1)
patternIdx = patterns[patternIdx+1]
}
if len(input) > lastUncovered {
uncovered = append(uncovered, lastUncovered, len(input))
}
if trace {
fmt.Printf("\n\n")
}
// Add uncoded input
for i := 0; i < len(uncovered); i += 2 {
output = append(output, input[uncovered[i]:uncovered[i+1]]...)
}
return output, patterns, uncovered
}
func reduceDictWorker(trace bool, inputCh chan *CompressionWord, outCh chan *CompressionWord, completion *sync.WaitGroup, trie *patricia.PatriciaTree, inputSize, outputSize *atomic2.Uint64, posMap map[uint64]uint64) {
2022-01-06 07:13:03 +00:00
defer completion.Done()
var output = make([]byte, 0, 256)
var uncovered = make([]int, 256)
var patterns = make([]int, 0, 256)
cellRing := NewRing()
mf2 := patricia.NewMatchFinder2(trie)
var numBuf [binary.MaxVarintLen64]byte
for compW := range inputCh {
wordLen := uint64(len(compW.word))
n := binary.PutUvarint(numBuf[:], wordLen)
output = append(output[:0], numBuf[:n]...) // Prepend with the encoding of length
output, patterns, uncovered = optimiseCluster(trace, compW.word, mf2, output, uncovered, patterns, cellRing, posMap)
compW.word = append(compW.word[:0], output...)
outCh <- compW
inputSize.Add(1 + wordLen)
2022-01-06 07:13:03 +00:00
outputSize.Add(uint64(len(output)))
posMap[wordLen+1]++
2022-01-06 07:13:03 +00:00
posMap[0]++
}
}
// CompressionWord holds a word to be compressed and, once it has been
// processed, the result of the compression (the word buffer is overwritten in
// place). To allow multiple words to be processed concurrently, the order
// field is used to collect all the words after processing without disrupting
// their original order.
type CompressionWord struct {
	word  []byte // input word before processing, encoded word afterwards
	order uint64 // sequence number of the word in the input stream
}
// CompressionQueue is a priority queue of processed words keyed by their
// order field. Its methods provide what container/heap needs, so the word
// with the smallest order ends up at index 0.
type CompressionQueue []*CompressionWord

// Len reports how many words are currently queued.
func (cq CompressionQueue) Len() int {
	return len(cq)
}

// Less reports whether the word at index i precedes the word at index j in
// the original input order.
func (cq CompressionQueue) Less(i, j int) bool {
	left, right := cq[i], cq[j]
	return left.order < right.order
}

// Swap exchanges the words at indices i and j.
func (cq *CompressionQueue) Swap(i, j int) {
	items := *cq
	items[i], items[j] = items[j], items[i]
}

// Push appends x, which must be a *CompressionWord, to the end of the queue.
func (cq *CompressionQueue) Push(x interface{}) {
	word := x.(*CompressionWord)
	*cq = append(*cq, word)
}

// Pop removes the last element of the queue and returns it.
func (cq *CompressionQueue) Pop() interface{} {
	items := *cq
	last := len(items) - 1
	word := items[last]
	items[last] = nil // clear the slot so the backing array does not retain the word
	*cq = items[:last]
	return word
}
// reducedict reduces the dictionary by trying the substitutions and counting frequency for each word
func reducedict(ctx context.Context, trace bool, logPrefix, segmentFilePath string, datFile *DecompressedFile, workers int, dictBuilder *DictionaryBuilder, lvl log.Lvl) error {
2022-10-25 09:06:40 +00:00
logEvery := time.NewTicker(60 * time.Second)
2022-01-06 07:13:03 +00:00
defer logEvery.Stop()
// DictionaryBuilder is for sorting words by their frequency (to assign codes)
var pt patricia.PatriciaTree
code2pattern := make([]*Pattern, 0, 256)
dictBuilder.ForEach(func(score uint64, word []byte) {
2022-01-06 07:13:03 +00:00
p := &Pattern{
score: score,
uses: 0,
code: uint64(len(code2pattern)),
codeBits: 0,
word: word,
}
pt.Insert(word, p)
code2pattern = append(code2pattern, p)
})
dictBuilder.Close()
log.Log(lvl, fmt.Sprintf("[%s] dictionary file parsed", logPrefix), "entries", len(code2pattern))
ch := make(chan *CompressionWord, 10_000)
2022-01-06 07:13:03 +00:00
inputSize, outputSize := atomic2.NewUint64(0), atomic2.NewUint64(0)
2022-01-06 07:13:03 +00:00
var collectors []*etl.Collector
defer func() {
for _, c := range collectors {
c.Close()
}
}()
out := make(chan *CompressionWord, 1024)
var compressionQueue CompressionQueue
heap.Init(&compressionQueue)
queueLimit := 128 * 1024
// For the case of workers == 1
var output = make([]byte, 0, 256)
var uncovered = make([]int, 256)
var patterns = make([]int, 0, 256)
cellRing := NewRing()
mf2 := patricia.NewMatchFinder2(&pt)
2022-01-06 07:13:03 +00:00
var posMaps []map[uint64]uint64
uncompPosMap := make(map[uint64]uint64) // For the uncompressed words
posMaps = append(posMaps, uncompPosMap)
var wg sync.WaitGroup
if workers > 1 {
for i := 0; i < workers; i++ {
posMap := make(map[uint64]uint64)
posMaps = append(posMaps, posMap)
wg.Add(1)
go reduceDictWorker(trace, ch, out, &wg, &pt, inputSize, outputSize, posMap)
}
}
t := time.Now()
var err error
intermediatePath := segmentFilePath + ".tmp"
defer os.Remove(intermediatePath)
var intermediateFile *os.File
if intermediateFile, err = os.Create(intermediatePath); err != nil {
return fmt.Errorf("create intermediate file: %w", err)
}
defer intermediateFile.Close()
intermediateW := bufio.NewWriterSize(intermediateFile, 8*etl.BufIOSize)
var inCount, outCount, emptyWordsCount uint64 // Counters words sent to compression and returned for compression
var numBuf [binary.MaxVarintLen64]byte
2022-10-04 09:51:51 +00:00
totalWords := datFile.count
if err = datFile.ForEach(func(v []byte, compression bool) error {
2022-03-12 09:34:58 +00:00
select {
case <-ctx.Done():
return ctx.Err()
default:
}
if workers > 1 {
// take processed words in non-blocking way and push them to the queue
outer:
for {
select {
case compW := <-out:
heap.Push(&compressionQueue, compW)
default:
break outer
}
}
// take processed words in blocking way until either:
// 1. compressionQueue is below the limit so that new words can be allocated
// 2. there is word in order on top of the queue which can be written down and reused
for compressionQueue.Len() >= queueLimit && compressionQueue[0].order < outCount {
// Blocking wait to receive some outputs until the top of queue can be processed
compW := <-out
heap.Push(&compressionQueue, compW)
}
var compW *CompressionWord
// Either take the word from the top, write it down and reuse for the next unprocessed word
// Or allocate new word
if compressionQueue.Len() > 0 && compressionQueue[0].order == outCount {
compW = heap.Pop(&compressionQueue).(*CompressionWord)
outCount++
// Write to intermediate file
if _, e := intermediateW.Write(compW.word); e != nil {
return e
}
// Reuse compW for the next word
} else {
compW = &CompressionWord{}
}
compW.order = inCount
if len(v) == 0 {
// Empty word, cannot be compressed
compW.word = append(compW.word[:0], 0)
uncompPosMap[1]++
uncompPosMap[0]++
heap.Push(&compressionQueue, compW) // Push to the queue directly, bypassing compression
} else if compression {
compW.word = append(compW.word[:0], v...)
ch <- compW // Send for compression
} else {
// Prepend word with encoding of length + zero byte, which indicates no patterns to be found in this word
wordLen := uint64(len(v))
n := binary.PutUvarint(numBuf[:], wordLen)
uncompPosMap[wordLen+1]++
uncompPosMap[0]++
compW.word = append(append(append(compW.word[:0], numBuf[:n]...), 0), v...)
heap.Push(&compressionQueue, compW) // Push to the queue directly, bypassing compression
}
} else {
outCount++
wordLen := uint64(len(v))
n := binary.PutUvarint(numBuf[:], wordLen)
if _, e := intermediateW.Write(numBuf[:n]); e != nil {
return e
}
if wordLen > 0 {
if compression {
output, patterns, uncovered = optimiseCluster(trace, v, mf2, output[:0], uncovered, patterns, cellRing, uncompPosMap)
if _, e := intermediateW.Write(output); e != nil {
return e
}
outputSize.Add(uint64(len(output)))
} else {
if e := intermediateW.WriteByte(0); e != nil {
return e
}
if _, e := intermediateW.Write(v); e != nil {
return e
}
outputSize.Add(1 + uint64(len(v)))
}
}
inputSize.Add(1 + wordLen)
uncompPosMap[wordLen+1]++
uncompPosMap[0]++
}
inCount++
if len(v) == 0 {
emptyWordsCount++
}
2022-01-06 07:13:03 +00:00
select {
case <-logEvery.C:
log.Log(lvl, fmt.Sprintf("[%s] Replacement preprocessing", logPrefix), "processed", fmt.Sprintf("%.2f%%", 100*float64(outCount)/float64(totalWords)), "ch", len(ch), "workers", workers)
2022-10-04 09:51:51 +00:00
default:
2022-01-06 07:13:03 +00:00
}
return nil
}); err != nil {
return err
}
close(ch)
// Drain the out queue if necessary
if inCount > outCount {
for compressionQueue.Len() > 0 && compressionQueue[0].order == outCount {
compW := heap.Pop(&compressionQueue).(*CompressionWord)
outCount++
if outCount == inCount {
close(out)
}
// Write to intermediate file
if _, e := intermediateW.Write(compW.word); e != nil {
return e
}
}
for compW := range out {
heap.Push(&compressionQueue, compW)
for compressionQueue.Len() > 0 && compressionQueue[0].order == outCount {
compW = heap.Pop(&compressionQueue).(*CompressionWord)
outCount++
if outCount == inCount {
close(out)
}
// Write to intermediate file
if _, e := intermediateW.Write(compW.word); e != nil {
return e
}
}
}
}
if err = intermediateW.Flush(); err != nil {
return err
}
2022-01-06 07:13:03 +00:00
wg.Wait()
log.Log(lvl, fmt.Sprintf("[%s] Replacement preprocessing", logPrefix), "took", time.Since(t))
if _, err = intermediateFile.Seek(0, 0); err != nil {
return fmt.Errorf("return to the start of intermediate file: %w", err)
}
2022-01-06 07:13:03 +00:00
//var m runtime.MemStats
//common.ReadMemStats(&m)
//log.Info(fmt.Sprintf("[%s] Dictionary build done", logPrefix), "input", common.ByteCount(inputSize.Load()), "output", common.ByteCount(outputSize.Load()), "alloc", common.ByteCount(m.Alloc), "sys", common.ByteCount(m.Sys))
2022-01-06 07:13:03 +00:00
posMap := make(map[uint64]uint64)
for _, m := range posMaps {
for l, c := range m {
posMap[l] += c
}
}
//fmt.Printf("posMap = %v\n", posMap)
var patternList PatternList
distribution := make([]int, maxPatternLen+1)
2022-01-06 07:13:03 +00:00
for _, p := range code2pattern {
if p.uses > 0 {
patternList = append(patternList, p)
distribution[len(p.word)]++
2022-01-06 07:13:03 +00:00
}
}
2022-07-18 10:12:39 +00:00
slices.SortFunc(patternList, patternListLess)
logCtx := make([]interface{}, 0, 8)
logCtx = append(logCtx, "patternList.Len", patternList.Len())
2022-01-06 07:13:03 +00:00
i := 0
// Build Huffman tree for codes
var codeHeap PatternHeap
heap.Init(&codeHeap)
tieBreaker := uint64(0)
for codeHeap.Len()+(patternList.Len()-i) > 1 {
// New node
h := &PatternHuff{
tieBreaker: tieBreaker,
}
if codeHeap.Len() > 0 && (i >= patternList.Len() || codeHeap[0].uses < patternList[i].uses) {
// Take h0 from the heap
h.h0 = heap.Pop(&codeHeap).(*PatternHuff)
h.h0.AddZero()
h.uses += h.h0.uses
} else {
// Take p0 from the list
h.p0 = patternList[i]
h.p0.code = 0
h.p0.codeBits = 1
h.uses += h.p0.uses
i++
}
if codeHeap.Len() > 0 && (i >= patternList.Len() || codeHeap[0].uses < patternList[i].uses) {
// Take h1 from the heap
h.h1 = heap.Pop(&codeHeap).(*PatternHuff)
h.h1.AddOne()
h.uses += h.h1.uses
} else {
// Take p1 from the list
h.p1 = patternList[i]
h.p1.code = 1
h.p1.codeBits = 1
h.uses += h.p1.uses
i++
}
tieBreaker++
heap.Push(&codeHeap, h)
}
if codeHeap.Len() > 0 {
root := heap.Pop(&codeHeap).(*PatternHuff)
root.SetDepth(0)
}
// Calculate total size of the dictionary
var patternsSize uint64
for _, p := range patternList {
ns := binary.PutUvarint(numBuf[:], uint64(p.depth)) // Length of the word's depth
n := binary.PutUvarint(numBuf[:], uint64(len(p.word))) // Length of the word's length
patternsSize += uint64(ns + n + len(p.word))
2022-01-06 07:13:03 +00:00
}
logCtx = append(logCtx, "patternsSize", common.ByteCount(patternsSize))
for i, n := range distribution {
if n == 0 {
continue
}
logCtx = append(logCtx, fmt.Sprintf("%d", i), fmt.Sprintf("%d", n))
}
log.Log(lvl, fmt.Sprintf("[%s] Effective dictionary", logPrefix), logCtx...)
2022-01-06 07:13:03 +00:00
var cf *os.File
if cf, err = os.Create(segmentFilePath); err != nil {
2022-01-06 07:13:03 +00:00
return err
}
cw := bufio.NewWriterSize(cf, 2*etl.BufIOSize)
// 1-st, output amount of words - just a useful metadata
binary.BigEndian.PutUint64(numBuf[:], inCount) // Dictionary size
2022-01-06 07:13:03 +00:00
if _, err = cw.Write(numBuf[:8]); err != nil {
return err
}
binary.BigEndian.PutUint64(numBuf[:], emptyWordsCount)
if _, err = cw.Write(numBuf[:8]); err != nil {
return err
}
// 2-nd, output dictionary size
binary.BigEndian.PutUint64(numBuf[:], patternsSize) // Dictionary size
2022-01-06 07:13:03 +00:00
if _, err = cw.Write(numBuf[:8]); err != nil {
return err
}
//fmt.Printf("patternsSize = %d\n", patternsSize)
2022-01-06 07:13:03 +00:00
// Write all the pattens
2022-07-18 10:12:39 +00:00
slices.SortFunc(patternList, patternListLess)
2022-01-06 07:13:03 +00:00
for _, p := range patternList {
ns := binary.PutUvarint(numBuf[:], uint64(p.depth))
if _, err = cw.Write(numBuf[:ns]); err != nil {
2022-01-06 07:13:03 +00:00
return err
}
n := binary.PutUvarint(numBuf[:], uint64(len(p.word)))
2022-01-06 07:13:03 +00:00
if _, err = cw.Write(numBuf[:n]); err != nil {
return err
}
if _, err = cw.Write(p.word); err != nil {
2022-01-06 07:13:03 +00:00
return err
}
//fmt.Printf("[comp] depth=%d, code=[%b], codeLen=%d pattern=[%x]\n", p.depth, p.code, p.codeBits, p.word)
2022-01-06 07:13:03 +00:00
}
var positionList PositionList
pos2code := make(map[uint64]*Position)
for pos, uses := range posMap {
p := &Position{pos: pos, uses: uses, code: pos, codeBits: 0}
2022-01-06 07:13:03 +00:00
positionList = append(positionList, p)
pos2code[pos] = p
}
2022-07-18 10:12:39 +00:00
slices.SortFunc(positionList, positionListLess)
2022-01-06 07:13:03 +00:00
i = 0
// Build Huffman tree for codes
var posHeap PositionHeap
heap.Init(&posHeap)
tieBreaker = uint64(0)
for posHeap.Len()+(positionList.Len()-i) > 1 {
// New node
h := &PositionHuff{
tieBreaker: tieBreaker,
}
if posHeap.Len() > 0 && (i >= positionList.Len() || posHeap[0].uses < positionList[i].uses) {
// Take h0 from the heap
h.h0 = heap.Pop(&posHeap).(*PositionHuff)
h.h0.AddZero()
h.uses += h.h0.uses
} else {
// Take p0 from the list
h.p0 = positionList[i]
h.p0.code = 0
h.p0.codeBits = 1
h.uses += h.p0.uses
i++
}
if posHeap.Len() > 0 && (i >= positionList.Len() || posHeap[0].uses < positionList[i].uses) {
// Take h1 from the heap
h.h1 = heap.Pop(&posHeap).(*PositionHuff)
h.h1.AddOne()
h.uses += h.h1.uses
} else {
// Take p1 from the list
h.p1 = positionList[i]
h.p1.code = 1
h.p1.codeBits = 1
h.uses += h.p1.uses
i++
}
tieBreaker++
heap.Push(&posHeap, h)
}
2022-01-18 05:55:20 +00:00
if posHeap.Len() > 0 {
posRoot := heap.Pop(&posHeap).(*PositionHuff)
posRoot.SetDepth(0)
2022-01-18 05:55:20 +00:00
}
// Calculate the size of pos dictionary
var posSize uint64
for _, p := range positionList {
ns := binary.PutUvarint(numBuf[:], uint64(p.depth)) // Length of the position's depth
n := binary.PutUvarint(numBuf[:], p.pos)
posSize += uint64(ns + n)
2022-01-06 07:13:03 +00:00
}
// First, output dictionary size
binary.BigEndian.PutUint64(numBuf[:], posSize) // Dictionary size
2022-01-06 07:13:03 +00:00
if _, err = cw.Write(numBuf[:8]); err != nil {
return err
}
//fmt.Printf("posSize = %d\n", posSize)
2022-01-06 07:13:03 +00:00
// Write all the positions
2022-07-18 10:12:39 +00:00
slices.SortFunc(positionList, positionListLess)
2022-01-06 07:13:03 +00:00
for _, p := range positionList {
ns := binary.PutUvarint(numBuf[:], uint64(p.depth))
if _, err = cw.Write(numBuf[:ns]); err != nil {
2022-01-06 07:13:03 +00:00
return err
}
n := binary.PutUvarint(numBuf[:], p.pos)
2022-01-06 07:13:03 +00:00
if _, err = cw.Write(numBuf[:n]); err != nil {
return err
}
//fmt.Printf("[comp] depth=%d, code=[%b], codeLen=%d pos=%d\n", p.depth, p.code, p.codeBits, p.pos)
2022-01-06 07:13:03 +00:00
}
log.Log(lvl, fmt.Sprintf("[%s] Positional dictionary", logPrefix), "positionList.len", positionList.Len(), "posSize", common.ByteCount(posSize))
// Re-encode all the words with the use of optimised (via Huffman coding) dictionaries
2022-01-06 07:13:03 +00:00
wc := 0
var hc HuffmanCoder
hc.w = cw
r := bufio.NewReaderSize(intermediateFile, 2*etl.BufIOSize)
var l uint64
var e error
for l, e = binary.ReadUvarint(r); e == nil; l, e = binary.ReadUvarint(r) {
2022-01-06 07:13:03 +00:00
posCode := pos2code[l+1]
2022-01-18 05:55:20 +00:00
if posCode != nil {
if e = hc.encode(posCode.code, posCode.codeBits); e != nil {
return e
}
2022-01-06 07:13:03 +00:00
}
if l == 0 {
if e = hc.flush(); e != nil {
return e
}
} else {
2022-01-06 07:13:03 +00:00
var pNum uint64 // Number of patterns
if pNum, e = binary.ReadUvarint(r); e != nil {
return e
}
// Now reading patterns one by one
var lastPos uint64
var lastUncovered int
var uncoveredCount int
for i := 0; i < int(pNum); i++ {
var pos uint64 // Starting position for pattern
if pos, e = binary.ReadUvarint(r); e != nil {
return e
}
posCode = pos2code[pos-lastPos+1]
lastPos = pos
2022-01-18 05:55:20 +00:00
if posCode != nil {
if e = hc.encode(posCode.code, posCode.codeBits); e != nil {
return e
}
2022-01-06 07:13:03 +00:00
}
var code uint64 // Code of the pattern
if code, e = binary.ReadUvarint(r); e != nil {
return e
}
patternCode := code2pattern[code]
if int(pos) > lastUncovered {
uncoveredCount += int(pos) - lastUncovered
}
lastUncovered = int(pos) + len(patternCode.word)
2022-01-18 05:55:20 +00:00
if patternCode != nil {
if e = hc.encode(patternCode.code, patternCode.codeBits); e != nil {
return e
}
2022-01-06 07:13:03 +00:00
}
}
if int(l) > lastUncovered {
uncoveredCount += int(l) - lastUncovered
}
// Terminating position and flush
posCode = pos2code[0]
if e = hc.encode(posCode.code, posCode.codeBits); e != nil {
return e
}
if e = hc.flush(); e != nil {
return e
}
// Copy uncovered characters
if uncoveredCount > 0 {
if _, e = io.CopyN(cw, r, int64(uncoveredCount)); e != nil {
return e
}
}
}
wc++
2022-10-04 09:51:51 +00:00
select {
case <-logEvery.C:
log.Log(lvl, fmt.Sprintf("[%s] Compressed", logPrefix), "processed", fmt.Sprintf("%.2f%%", 100*float64(wc)/float64(totalWords)))
default:
2022-01-06 07:13:03 +00:00
}
}
if e != nil && !errors.Is(e, io.EOF) {
return e
}
if err = intermediateFile.Close(); err != nil {
2022-01-06 07:13:03 +00:00
return err
}
if err = cw.Flush(); err != nil {
return err
}
if err = cf.Close(); err != nil {
return err
}
2022-01-06 07:13:03 +00:00
return nil
}
// processSuperstring is the worker that processes one superstring and puts results
// into the collector, using lock to mutual exclusion. At the end (when the input channel is closed),
// it notifies the waitgroup before exiting, so that the caller known when all work is done
// No error channels for now
func processSuperstring(superstringCh chan []byte, dictCollector *etl.Collector, minPatternScore uint64, completion *sync.WaitGroup) {
defer completion.Done()
2022-03-12 08:33:01 +00:00
dictVal := make([]byte, 8)
dictKey := make([]byte, maxPatternLen)
var lcp, sa, inv []int32
2022-01-06 07:13:03 +00:00
for superstring := range superstringCh {
if cap(sa) < len(superstring) {
sa = make([]int32, len(superstring))
} else {
sa = sa[:len(superstring)]
}
2022-01-06 07:13:03 +00:00
//log.Info("Superstring", "len", len(superstring))
//start := time.Now()
2022-03-12 08:33:01 +00:00
if err := sais.Sais(superstring, sa); err != nil {
panic(err)
}
2022-01-06 07:13:03 +00:00
//log.Info("Suffix array built", "in", time.Since(start))
// filter out suffixes that start with odd positions
n := len(sa) / 2
2022-03-12 08:33:01 +00:00
filtered := sa[:n]
//filtered := make([]int32, n)
2022-01-06 07:13:03 +00:00
var j int
for i := 0; i < len(sa); i++ {
if sa[i]&1 == 0 {
filtered[j] = sa[i] >> 1
j++
}
}
// Now create an inverted array
if cap(inv) < n {
inv = make([]int32, n)
} else {
inv = inv[:n]
}
for i := 0; i < n; i++ {
inv[filtered[i]] = int32(i)
2022-01-06 07:13:03 +00:00
}
//log.Info("Inverted array done")
var k int
2022-01-06 07:13:03 +00:00
// Process all suffixes one by one starting from
// first suffix in txt[]
if cap(lcp) < n {
lcp = make([]int32, n)
} else {
lcp = lcp[:n]
}
for i := 0; i < n; i++ {
/* If the current suffix is at n-1, then we dont
have next substring to consider. So lcp is not
defined for this substring, we put zero. */
if inv[i] == int32(n-1) {
k = 0
continue
}
2022-01-06 07:13:03 +00:00
/* j contains index of the next substring to
be considered to compare with the present
substring, i.e., next string in suffix array */
j := int(filtered[inv[i]+1])
2022-01-06 07:13:03 +00:00
// Directly start matching from k'th index as
// at-least k-1 characters will match
for i+k < n && j+k < n && superstring[(i+k)*2] != 0 && superstring[(j+k)*2] != 0 && superstring[(i+k)*2+1] == superstring[(j+k)*2+1] {
k++
}
lcp[inv[i]] = int32(k) // lcp for the present suffix.
// Deleting the starting character from the string.
if k > 0 {
k--
}
}
2022-01-06 07:13:03 +00:00
//log.Info("Kasai algorithm finished")
// Checking LCP array
if ASSERT {
for i := 0; i < n-1; i++ {
var prefixLen int
p1 := int(filtered[i])
p2 := int(filtered[i+1])
for p1+prefixLen < n &&
p2+prefixLen < n &&
superstring[(p1+prefixLen)*2] != 0 &&
superstring[(p2+prefixLen)*2] != 0 &&
superstring[(p1+prefixLen)*2+1] == superstring[(p2+prefixLen)*2+1] {
prefixLen++
}
if prefixLen != int(lcp[i]) {
log.Error("Mismatch", "prefixLen", prefixLen, "lcp[i]", lcp[i], "i", i)
break
}
l := int(lcp[i]) // Length of potential dictionary word
if l < 2 {
continue
}
}
}
//log.Info("LCP array checked")
// Walk over LCP array and compute the scores of the strings
var b = inv
2022-01-06 07:13:03 +00:00
j = 0
for i := 0; i < n-1; i++ {
// Only when there is a drop in LCP value
if lcp[i+1] >= lcp[i] {
j = i
continue
}
prevSkipped := false
for l := int(lcp[i]); l > int(lcp[i+1]) && l >= minPatternLen; l-- {
if l > maxPatternLen ||
l > 20 && (l&(l-1)) != 0 { // is power of 2
prevSkipped = true
2022-01-06 07:13:03 +00:00
continue
}
2022-01-06 07:13:03 +00:00
// Go back
2022-03-19 04:38:37 +00:00
var isNew bool
2022-01-06 07:13:03 +00:00
for j > 0 && int(lcp[j-1]) >= l {
j--
2022-03-19 04:38:37 +00:00
isNew = true
2022-01-06 07:13:03 +00:00
}
2022-03-19 04:38:37 +00:00
if !isNew && !prevSkipped {
2022-01-06 07:13:03 +00:00
break
}
2022-01-06 07:13:03 +00:00
window := i - j + 2
copy(b, filtered[j:i+2])
slices.Sort(b[:window])
2022-01-06 07:13:03 +00:00
repeats := 1
lastK := 0
for k := 1; k < window; k++ {
if b[k] >= b[lastK]+int32(l) {
2022-01-06 07:13:03 +00:00
repeats++
lastK = k
}
}
2022-04-17 00:59:29 +00:00
if (l < 8 || l > 64) && repeats < int(minPatternScore) {
prevSkipped = true
continue
}
score := uint64(repeats * (l))
2022-01-12 03:46:26 +00:00
if score < minPatternScore {
prevSkipped = true
2022-01-12 03:46:26 +00:00
continue
}
dictKey = dictKey[:l]
2022-01-12 03:46:26 +00:00
for s := 0; s < l; s++ {
dictKey[s] = superstring[(int(filtered[i])+s)*2+1]
2022-01-12 03:46:26 +00:00
}
2022-03-12 08:33:01 +00:00
binary.BigEndian.PutUint64(dictVal, score)
if err := dictCollector.Collect(dictKey, dictVal); err != nil {
2022-01-12 03:46:26 +00:00
log.Error("processSuperstring", "collect", err)
2022-01-06 07:13:03 +00:00
}
2022-03-19 04:38:37 +00:00
prevSkipped = false //nolint
break
2022-01-06 07:13:03 +00:00
}
}
}
}
func DictionaryBuilderFromCollectors(ctx context.Context, logPrefix, tmpDir string, collectors []*etl.Collector, lvl log.Lvl) (*DictionaryBuilder, error) {
dictCollector := etl.NewCollector(logPrefix+"_collectDict", tmpDir, etl.NewSortableBuffer(etl.BufferOptimalSize))
2022-01-06 07:13:03 +00:00
defer dictCollector.Close()
dictCollector.LogLvl(lvl)
dictAggregator := &DictAggregator{collector: dictCollector, dist: map[int]int{}}
2022-01-06 07:13:03 +00:00
for _, collector := range collectors {
if err := collector.Load(nil, "", dictAggregator.aggLoadFunc, etl.TransformArgs{Quit: ctx.Done()}); err != nil {
return nil, err
}
collector.Close()
}
if err := dictAggregator.finish(); err != nil {
return nil, err
}
db := &DictionaryBuilder{limit: maxDictPatterns} // Only collect 1m words with highest scores
if err := dictCollector.Load(nil, "", db.loadFunc, etl.TransformArgs{Quit: ctx.Done()}); err != nil {
return nil, err
}
db.finish()
2022-07-18 10:12:39 +00:00
db.Sort()
2022-01-06 07:13:03 +00:00
return db, nil
}
func PersistDictrionary(fileName string, db *DictionaryBuilder) error {
df, err := os.Create(fileName)
if err != nil {
return err
}
w := bufio.NewWriterSize(df, 2*etl.BufIOSize)
2022-01-06 07:13:03 +00:00
db.ForEach(func(score uint64, word []byte) { fmt.Fprintf(w, "%d %x\n", score, word) })
if err = w.Flush(); err != nil {
return err
}
if err := df.Sync(); err != nil {
return err
}
return df.Close()
}
func ReadSimpleFile(fileName string, walker func(v []byte) error) error {
2022-01-06 07:13:03 +00:00
// Read keys from the file and generate superstring (with extra byte 0x1 prepended to each character, and with 0x0 0x0 pair inserted between keys and values)
// We only consider values with length > 2, because smaller values are not compressible without going into bits
f, err := os.Open(fileName)
if err != nil {
return err
}
defer f.Close()
r := bufio.NewReaderSize(f, etl.BufIOSize)
buf := make([]byte, 4096)
2022-01-18 05:55:20 +00:00
for l, e := binary.ReadUvarint(r); ; l, e = binary.ReadUvarint(r) {
if e != nil {
if errors.Is(e, io.EOF) {
break
}
return e
}
2022-01-06 07:13:03 +00:00
if len(buf) < int(l) {
buf = make([]byte, l)
}
if _, e = io.ReadFull(r, buf[:l]); e != nil {
return e
}
if err := walker(buf[:l]); err != nil {
return err
}
}
return nil
}