From f31a76baeaf749811fb9d1231d60fbfae1cfc715 Mon Sep 17 00:00:00 2001 From: Alex Sharov Date: Wed, 10 Nov 2021 22:10:01 +0700 Subject: [PATCH] Compress: add maxPatternLen=64 (#2943) --- cmd/hack/hack.go | 40 ++-------------------------------------- go.mod | 2 +- go.sum | 4 ++-- 3 files changed, 5 insertions(+), 41 deletions(-) diff --git a/cmd/hack/hack.go b/cmd/hack/hack.go index 150fcb02a..38cda0c1f 100644 --- a/cmd/hack/hack.go +++ b/cmd/hack/hack.go @@ -1483,7 +1483,6 @@ func processSuperstring(superstringCh chan []byte, dictCollector *etl.Collector, for i+k < n && j+k < n && superstring[(i+k)*2] != 0 && superstring[(j+k)*2] != 0 && superstring[(i+k)*2+1] == superstring[(j+k)*2+1] { k++ } - lcp[inv[i]] = int32(k) // lcp for the present suffix. // Deleting the starting character from the string. @@ -1532,7 +1531,7 @@ func processSuperstring(superstringCh chan []byte, dictCollector *etl.Collector, continue } for l := int(lcp[i]); l > int(lcp[i+1]); l-- { - if l < minPatternLen { + if l < minPatternLen || l > maxPatternLen { continue } // Go back @@ -1557,7 +1556,6 @@ func processSuperstring(superstringCh chan []byte, dictCollector *etl.Collector, } score := uint64(repeats * int(l-4)) if score > minPatternScore { - // Dictionary key is the concatenation of the score and the dictionary word (to later aggregate the scores from multiple chunks) dictKey := make([]byte, l) for s := 0; s < l; s++ { dictKey[s] = superstring[(filtered[i]+s)*2+1] @@ -1574,41 +1572,6 @@ func processSuperstring(superstringCh chan []byte, dictCollector *etl.Collector, completion.Done() } -type DictAggregator struct { - lastWord []byte - lastWordScore uint64 - collector *etl.Collector -} - -func (da *DictAggregator) processWord(word []byte, score uint64) error { - var scoreBuf [8]byte - binary.BigEndian.PutUint64(scoreBuf[:], score) - return da.collector.Collect(word, scoreBuf[:]) -} - -func (da *DictAggregator) aggLoadFunc(k, v []byte, table etl.CurrentTableReader, next etl.LoadNextFunc) error { - score := binary.BigEndian.Uint64(v) - if bytes.Equal(k, da.lastWord) { - da.lastWordScore += score - } else { - if da.lastWord != nil { - if err := da.processWord(da.lastWord, da.lastWordScore); err != nil { - return err - } - } - da.lastWord = common.CopyBytes(k) - da.lastWordScore = score - } - return nil -} - -func (da *DictAggregator) finish() error { - if da.lastWord != nil { - return da.processWord(da.lastWord, da.lastWordScore) - } - return nil -} - const CompressLogPrefix = "compress" // superstringLimit limits how large can one "superstring" get before it is processed @@ -1618,6 +1581,7 @@ const superstringLimit = 16 * 1024 * 1024 // minPatternLen is minimum length of pattern we consider to be included into the dictionary const minPatternLen = 5 +const maxPatternLen = 64 // minPatternScore is minimum score (per superstring) required to consider including pattern into the dictionary const minPatternScore = 1024 diff --git a/go.mod b/go.mod index 9a380c8e9..86b45d9dc 100644 --- a/go.mod +++ b/go.mod @@ -35,7 +35,7 @@ require ( github.com/json-iterator/go v1.1.12 github.com/julienschmidt/httprouter v1.3.0 github.com/kevinburke/go-bindata v3.21.0+incompatible - github.com/ledgerwatch/erigon-lib v0.0.0-20211109030232-5677f0c2bd53 + github.com/ledgerwatch/erigon-lib v0.0.0-20211110145339-ab66a02f5284 github.com/ledgerwatch/log/v3 v3.4.0 github.com/ledgerwatch/secp256k1 v1.0.0 github.com/logrusorgru/aurora/v3 v3.0.0 diff --git a/go.sum b/go.sum index d09d0c036..7bfe55c24 100644 --- a/go.sum +++ b/go.sum @@ -596,8 +596,8 @@ github.com/kylelemons/godebug v0.0.0-20170224010052-a616ab194758 h1:0D5M2HQSGD3P github.com/kylelemons/godebug v0.0.0-20170224010052-a616ab194758/go.mod h1:B69LEHPfb2qLo0BaaOLcbitczOKLWTsrBG9LczfCD4k= github.com/leanovate/gopter v0.2.9 h1:fQjYxZaynp97ozCzfOyOuAGOU4aU/z37zf/tOujFk7c= github.com/leanovate/gopter v0.2.9/go.mod h1:U2L/78B+KVFIx2VmW6onHJQzXtFb+p5y3y2Sh+Jxxv8= -github.com/ledgerwatch/erigon-lib v0.0.0-20211109030232-5677f0c2bd53 h1:SRxoOSlbv2o1qzU5Cqx59ZVBKh7x8baEF2taRySNubk= -github.com/ledgerwatch/erigon-lib v0.0.0-20211109030232-5677f0c2bd53/go.mod h1:CuEZROm43MykZT5CjCj02jw0FOwaDl8Nh+PZkTEGopg= +github.com/ledgerwatch/erigon-lib v0.0.0-20211110145339-ab66a02f5284 h1:YV9tJXU5LB7UaVI6CLoSVxk56MaPQGZoaXjzu7me9Sg= +github.com/ledgerwatch/erigon-lib v0.0.0-20211110145339-ab66a02f5284/go.mod h1:CuEZROm43MykZT5CjCj02jw0FOwaDl8Nh+PZkTEGopg= github.com/ledgerwatch/log/v3 v3.4.0 h1:SEIOcv5a2zkG3PmoT5jeTU9m/0nEUv0BJS5bzsjwKCI= github.com/ledgerwatch/log/v3 v3.4.0/go.mod h1:VXcz6Ssn6XEeU92dCMc39/g1F0OYAjw1Mt+dGP5DjXY= github.com/ledgerwatch/secp256k1 v1.0.0 h1:Usvz87YoTG0uePIV8woOof5cQnLXGYa162rFf3YnwaQ=