/*
   Copyright 2021 Erigon contributors

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package compress

import (
	"bufio"
	"bytes"
	"container/heap"
	"context"
	"encoding/binary"
	"errors"
	"fmt"
	"io"
	"math/bits"
	"os"
	"path/filepath"
	"sync"
	"time"

	"github.com/c2h5oh/datasize"
	"github.com/ledgerwatch/erigon-lib/common"
	"github.com/ledgerwatch/erigon-lib/common/cmp"
	dir2 "github.com/ledgerwatch/erigon-lib/common/dir"
	"github.com/ledgerwatch/erigon-lib/etl"
	"github.com/ledgerwatch/log/v3"
	"golang.org/x/exp/slices"
)

// Compressor is the main operating type for performing per-word compression.
// After creating a Compressor, one needs to add words to it using the `AddWord` function.
// To add a word without compression, the `AddUncompressedWord` function needs to be used instead.
// The Compressor only tracks which words are compressed and which are not until the compressed
// file is created. After that, the user of the file needs to know when to call
// the `Next` or `NextUncompressed` function on the decompressor.
// Finally, the `Compress` function needs to be called to perform the compression
// and eventually create the output file.
type Compressor struct {
	ctx              context.Context
	wg               *sync.WaitGroup
	superstrings     chan []byte
	uncompressedFile *DecompressedFile
	tmpDir           string // temporary directory to use for ETL when building dictionary
	logPrefix        string
	outputFile       string // file where to output the dictionary and compressed data
	tmpOutFilePath   string // temporary file (outputFile + ".tmp") that is renamed to outputFile on success
	suffixCollectors []*etl.Collector
	// Buffer for the current "superstring" - a transformation of the input words where each byte
	// of a word, say b, is turned into 2 bytes, 0x01 and b, and two zero bytes 0x00 0x00 are
	// inserted after each word. This allows using an ordinary (single-string) suffix sorting
	// algorithm instead of a generalised (multi-string) suffix sorting algorithm.
	superstring      []byte
	wordsCount       uint64
	superstringCount uint64
	superstringLen   int
	workers          int
	Ratio            CompressionRatio
	lvl              log.Lvl
	trace            bool
	logger           log.Logger
	noFsync          bool // fsync is enabled by default, but tests can manually disable
}

func NewCompressor(ctx context.Context, logPrefix, outputFile, tmpDir string, minPatternScore uint64, workers int, lvl log.Lvl, logger log.Logger) (*Compressor, error) {
	dir2.MustExist(tmpDir)
	dir, fileName := filepath.Split(outputFile)
	tmpOutFilePath := filepath.Join(dir, fileName) + ".tmp"
	// uncompressedFile - it's the intermediate .idt file, outputFile is the final .seg (or .dat) file.
	// tmpOutFilePath - it's the ".seg.tmp" (".idt.tmp") file which will be renamed to the .seg file if everything succeeds.
	// This allows the .seg file to be created atomically (the downloader will not see partially ready/non-ready .seg files).
	// I didn't create the ".seg.tmp" file in tmpDir, because I think tmpDir and the snapshot dir may be mounted to different drives.
	uncompressedPath := filepath.Join(tmpDir, fileName) + ".idt"
	uncompressedFile, err := NewUncompressedFile(uncompressedPath)
	if err != nil {
		return nil, err
	}

	// Collector for dictionary superstrings (sorted by their score)
	superstrings := make(chan []byte, workers*2)
	wg := &sync.WaitGroup{}
	wg.Add(workers)
	suffixCollectors := make([]*etl.Collector, workers)
	for i := 0; i < workers; i++ {
		collector := etl.NewCollector(logPrefix+"_dict", tmpDir, etl.NewSortableBuffer(etl.BufferOptimalSize/2), logger)
		collector.LogLvl(lvl)

		suffixCollectors[i] = collector
		go processSuperstring(ctx, superstrings, collector, minPatternScore, wg, logger)
	}

	return &Compressor{
		uncompressedFile: uncompressedFile,
		tmpOutFilePath:   tmpOutFilePath,
		outputFile:       outputFile,
		tmpDir:           tmpDir,
		logPrefix:        logPrefix,
		workers:          workers,
		ctx:              ctx,
		superstrings:     superstrings,
		suffixCollectors: suffixCollectors,
		lvl:              lvl,
		wg:               wg,
		logger:           logger,
	}, nil
}
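
// compressorUsageSketch demonstrates the intended lifecycle of a Compressor
// (a minimal illustrative sketch, not part of the original API; the output
// path and parameters here are hypothetical): create it, feed words via
// AddWord, then Compress and Close.
// nolint
func compressorUsageSketch(ctx context.Context, logger log.Logger) error {
	outFile := filepath.Join(os.TempDir(), "sketch.seg") // hypothetical output path
	c, err := NewCompressor(ctx, "sketch", outFile, os.TempDir(), 1024, 1, log.LvlDebug, logger)
	if err != nil {
		return err
	}
	defer c.Close()
	for _, w := range [][]byte{[]byte("first word"), []byte("second word")} {
		if err := c.AddWord(w); err != nil {
			return err
		}
	}
	return c.Compress() // writes outFile atomically via the ".tmp" + rename scheme
}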

func (c *Compressor) Close() {
	c.uncompressedFile.Close()
	for _, collector := range c.suffixCollectors {
		collector.Close()
	}
	c.suffixCollectors = nil
}

func (c *Compressor) SetTrace(trace bool) { c.trace = trace }
func (c *Compressor) Workers() int        { return c.workers }

func (c *Compressor) Count() int { return int(c.wordsCount) }

func (c *Compressor) AddWord(word []byte) error {
	select {
	case <-c.ctx.Done():
		return c.ctx.Err()
	default:
	}

	c.wordsCount++
	l := 2*len(word) + 2
	if c.superstringLen+l > superstringLimit {
		if c.superstringCount%samplingFactor == 0 {
			c.superstrings <- c.superstring
		}
		c.superstringCount++
		c.superstring = make([]byte, 0, 1024*1024)
		c.superstringLen = 0
	}
	c.superstringLen += l

	if c.superstringCount%samplingFactor == 0 {
		for _, a := range word {
			c.superstring = append(c.superstring, 1, a)
		}
		c.superstring = append(c.superstring, 0, 0)
	}

	return c.uncompressedFile.Append(word)
}
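
// superstringEncodingSketch illustrates the word-to-superstring transformation
// performed in AddWord above (an illustrative helper, not part of the original
// API): for the word "ab" it returns {0x01, 'a', 0x01, 'b', 0x00, 0x00}.
// nolint
func superstringEncodingSketch(word []byte) []byte {
	superstring := make([]byte, 0, 2*len(word)+2)
	for _, a := range word {
		superstring = append(superstring, 1, a) // 0x01 marks a real word byte
	}
	return append(superstring, 0, 0) // 0x00 0x00 terminates the word
}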

func (c *Compressor) AddUncompressedWord(word []byte) error {
	select {
	case <-c.ctx.Done():
		return c.ctx.Err()
	default:
	}

	c.wordsCount++
	return c.uncompressedFile.AppendUncompressed(word)
}

func (c *Compressor) Compress() error {
	c.uncompressedFile.w.Flush()
	logEvery := time.NewTicker(20 * time.Second)
	defer logEvery.Stop()
	if len(c.superstring) > 0 {
		c.superstrings <- c.superstring
	}
	close(c.superstrings)
	c.wg.Wait()

	if c.lvl < log.LvlTrace {
		c.logger.Log(c.lvl, fmt.Sprintf("[%s] BuildDict start", c.logPrefix), "workers", c.workers)
	}
	t := time.Now()
	db, err := DictionaryBuilderFromCollectors(c.ctx, compressLogPrefix, c.tmpDir, c.suffixCollectors, c.lvl, c.logger)
	if err != nil {
		return err
	}
	if c.trace {
		_, fileName := filepath.Split(c.outputFile)
		if err := PersistDictrionary(filepath.Join(c.tmpDir, fileName)+".dictionary.txt", db); err != nil {
			return err
		}
	}
	defer os.Remove(c.tmpOutFilePath)
	if c.lvl < log.LvlTrace {
		c.logger.Log(c.lvl, fmt.Sprintf("[%s] BuildDict", c.logPrefix), "took", time.Since(t))
	}

	cf, err := os.Create(c.tmpOutFilePath)
	if err != nil {
		return err
	}
	defer cf.Close()
	t = time.Now()
	if err := reducedict(c.ctx, c.trace, c.logPrefix, c.tmpOutFilePath, cf, c.uncompressedFile, c.workers, db, c.lvl, c.logger); err != nil {
		return err
	}
	if err = c.fsync(cf); err != nil {
		return err
	}
	if err = cf.Close(); err != nil {
		return err
	}
	if err := os.Rename(c.tmpOutFilePath, c.outputFile); err != nil {
		return fmt.Errorf("renaming: %w", err)
	}

	c.Ratio, err = Ratio(c.uncompressedFile.filePath, c.outputFile)
	if err != nil {
		return fmt.Errorf("ratio: %w", err)
	}

	_, fName := filepath.Split(c.outputFile)
	if c.lvl < log.LvlTrace {
		c.logger.Log(c.lvl, fmt.Sprintf("[%s] Compress", c.logPrefix), "took", time.Since(t), "ratio", c.Ratio, "file", fName)
	}
	return nil
}

func (c *Compressor) DisableFsync() { c.noFsync = true }

// fsync - other processes/goroutines must see only "fully-complete" (valid) files. No partial writes.
// To achieve this: write to a .tmp file, then `rename` when the file is ready.
// The machine may power off right after `rename`, which means `fsync` must happen before `rename`.
func (c *Compressor) fsync(f *os.File) error {
	if c.noFsync {
		return nil
	}
	if err := f.Sync(); err != nil {
		c.logger.Warn("couldn't fsync", "err", err, "file", c.tmpOutFilePath)
		return err
	}
	return nil
}

// superstringLimit limits how large one "superstring" can get before it is processed.
// CompressorSequential allocates 7 bytes for each unit of superstringLimit. For example,
// a superstringLimit of 16m will result in 112Mb being allocated for various arrays.
const superstringLimit = 16 * 1024 * 1024

// minPatternLen is the minimum length of a pattern we consider for inclusion into the dictionary
const minPatternLen = 5
const maxPatternLen = 128

// maxDictPatterns is the maximum number of patterns allowed in the initial (not reduced) dictionary.
// Large values increase memory consumption of the dictionary reduction phase.
/*
   Experiments on a 74Gb uncompressed file (bsc 012500-013000-transactions.seg)
   Ram        - needed just to open the compressed file (Huffman tables, etc...)
   dec_speed  - loop with `word, _ = g.Next(word[:0])`
   skip_speed - loop with `g.Skip()`

   | DictSize | Ram  | file_size | dec_speed | skip_speed |
   |----------|------|-----------|-----------|------------|
   | 1M       | 70Mb | 35871Mb   | 4m06s     | 1m58s      |
   | 512K     | 42Mb | 36496Mb   | 3m49s     | 1m51s      |
   | 256K     | 21Mb | 37100Mb   | 3m44s     | 1m48s      |
   | 128K     | 11Mb | 37782Mb   | 3m25s     | 1m44s      |
   | 64K      | 7Mb  | 38597Mb   | 3m16s     | 1m34s      |
   | 32K      | 5Mb  | 39626Mb   | 3m0s      | 1m29s      |
*/
const maxDictPatterns = 64 * 1024

// samplingFactor - skip superstrings if `superstringNumber % samplingFactor != 0`
const samplingFactor = 4
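// With samplingFactor = 4, only every 4th superstring contributes to dictionary
// building (see AddWord), trading some dictionary quality for faster builds.
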
// nolint
const compressLogPrefix = "compress"

type DictionaryBuilder struct {
	lastWord      []byte
	items         []*Pattern
	limit         int
	lastWordScore uint64
}

func (db *DictionaryBuilder) Reset(limit int) {
	db.limit = limit
	db.items = db.items[:0]
}

func (db *DictionaryBuilder) Len() int { return len(db.items) }
func (db *DictionaryBuilder) Less(i, j int) bool {
	if db.items[i].score == db.items[j].score {
		return bytes.Compare(db.items[i].word, db.items[j].word) < 0
	}
	return db.items[i].score < db.items[j].score
}

func dictionaryBuilderCmp(i, j *Pattern) int {
	if i.score == j.score {
		return bytes.Compare(i.word, j.word)
	}
	return cmp.Compare(i.score, j.score)
}

func (db *DictionaryBuilder) Swap(i, j int) {
	db.items[i], db.items[j] = db.items[j], db.items[i]
}
func (db *DictionaryBuilder) Sort() { slices.SortFunc(db.items, dictionaryBuilderCmp) }

func (db *DictionaryBuilder) Push(x interface{}) {
	db.items = append(db.items, x.(*Pattern))
}
func (db *DictionaryBuilder) Pop() interface{} {
	old := db.items
	n := len(old)
	x := old[n-1]
	old[n-1] = nil
	db.items = old[0 : n-1]
	return x
}

func (db *DictionaryBuilder) processWord(chars []byte, score uint64) {
	heap.Push(db, &Pattern{word: common.Copy(chars), score: score})
	if db.Len() > db.limit {
		// Remove the element with the smallest score
		heap.Pop(db)
	}
}
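
// Because DictionaryBuilder is a min-heap ordered by score, pushing past the
// limit and popping the root retains the `limit` highest-scoring patterns seen so far.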

func (db *DictionaryBuilder) loadFunc(k, v []byte, table etl.CurrentTableReader, next etl.LoadNextFunc) error {
	score := binary.BigEndian.Uint64(v)
	if bytes.Equal(k, db.lastWord) {
		db.lastWordScore += score
	} else {
		if db.lastWord != nil {
			db.processWord(db.lastWord, db.lastWordScore)
		}
		db.lastWord = append(db.lastWord[:0], k...)
		db.lastWordScore = score
	}
	return nil
}

func (db *DictionaryBuilder) finish() {
	if db.lastWord != nil {
		db.processWord(db.lastWord, db.lastWordScore)
	}
}

func (db *DictionaryBuilder) ForEach(f func(score uint64, word []byte)) {
	for i := db.Len(); i > 0; i-- {
		f(db.items[i-1].score, db.items[i-1].word)
	}
}

func (db *DictionaryBuilder) Close() {
	db.items = nil
	db.lastWord = nil
}

// Pattern is the representation of a pattern that is searched for in the superstrings to compress them.
// Patterns are stored in a patricia tree and contain the pattern score (calculated during
// the initial dictionary building), frequency of usage, and code.
type Pattern struct {
	word     []byte // Pattern characters
	score    uint64 // Score assigned to the pattern during dictionary building
	uses     uint64 // How many times this pattern has been used during search and optimisation
	code     uint64 // Allocated numerical code
	codeBits int    // Number of bits in the code
	depth    int    // Depth of the pattern in the huffman tree (for encoding in the file)
}

// PatternList is a sorted list of patterns for the purpose of
// building the Huffman tree to determine efficient coding.
// Patterns with the least usage come first; we use the numerical code
// as a tie breaker to make sure the resulting Huffman code is canonical.
type PatternList []*Pattern

func (pl PatternList) Len() int { return len(pl) }
func patternListCmp(i, j *Pattern) int {
	if i.uses == j.uses {
		return cmp.Compare(bits.Reverse64(i.code), bits.Reverse64(j.code))
	}
	return cmp.Compare(i.uses, j.uses)
}

// PatternHuff is an intermediate node in a Huffman tree of patterns.
// It has two children, each of which may either be another intermediate node (h0 or h1)
// or a leaf node, which is a Pattern (p0 or p1).
type PatternHuff struct {
	p0         *Pattern
	p1         *Pattern
	h0         *PatternHuff
	h1         *PatternHuff
	uses       uint64
	tieBreaker uint64
}

func (h *PatternHuff) AddZero() {
	if h.p0 != nil {
		h.p0.code <<= 1
		h.p0.codeBits++
	} else {
		h.h0.AddZero()
	}
	if h.p1 != nil {
		h.p1.code <<= 1
		h.p1.codeBits++
	} else {
		h.h1.AddZero()
	}
}

func (h *PatternHuff) AddOne() {
	if h.p0 != nil {
		h.p0.code <<= 1
		h.p0.code++
		h.p0.codeBits++
	} else {
		h.h0.AddOne()
	}
	if h.p1 != nil {
		h.p1.code <<= 1
		h.p1.code++
		h.p1.codeBits++
	} else {
		h.h1.AddOne()
	}
}
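
// AddZero and AddOne walk an entire subtree, shifting every leaf code left by
// one bit and setting the new lowest bit to 0 or 1 respectively; merging two
// heap nodes under a new parent thereby extends all code lengths below it by one bit.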

func (h *PatternHuff) SetDepth(depth int) {
	if h.p0 != nil {
		h.p0.depth = depth + 1
		h.p0.uses = 0
	}
	if h.p1 != nil {
		h.p1.depth = depth + 1
		h.p1.uses = 0
	}
	if h.h0 != nil {
		h.h0.SetDepth(depth + 1)
	}
	if h.h1 != nil {
		h.h1.SetDepth(depth + 1)
	}
}

// PatternHeap is a priority queue of patterns for the purpose of building
// the Huffman tree to determine efficient coding. Patterns with the least usage
// have the highest priority. We use a tie-breaker to make sure
// the resulting Huffman code is canonical.
type PatternHeap []*PatternHuff

func (ph PatternHeap) Len() int {
	return len(ph)
}
func (ph PatternHeap) Less(i, j int) bool {
	if ph[i].uses == ph[j].uses {
		return ph[i].tieBreaker < ph[j].tieBreaker
	}
	return ph[i].uses < ph[j].uses
}
func (ph *PatternHeap) Swap(i, j int) {
	(*ph)[i], (*ph)[j] = (*ph)[j], (*ph)[i]
}
func (ph *PatternHeap) Push(x interface{}) {
	*ph = append(*ph, x.(*PatternHuff))
}
func (ph *PatternHeap) Pop() interface{} {
	old := *ph
	n := len(old)
	x := old[n-1]
	old[n-1] = nil
	*ph = old[0 : n-1]
	return x
}

type Position struct {
	uses     uint64
	pos      uint64
	code     uint64
	codeBits int
	depth    int // Depth of the position in the huffman tree (for encoding in the file)
}

type PositionHuff struct {
	p0         *Position
	p1         *Position
	h0         *PositionHuff
	h1         *PositionHuff
	uses       uint64
	tieBreaker uint64
}

func (h *PositionHuff) AddZero() {
	if h.p0 != nil {
		h.p0.code <<= 1
		h.p0.codeBits++
	} else {
		h.h0.AddZero()
	}
	if h.p1 != nil {
		h.p1.code <<= 1
		h.p1.codeBits++
	} else {
		h.h1.AddZero()
	}
}

func (h *PositionHuff) AddOne() {
	if h.p0 != nil {
		h.p0.code <<= 1
		h.p0.code++
		h.p0.codeBits++
	} else {
		h.h0.AddOne()
	}
	if h.p1 != nil {
		h.p1.code <<= 1
		h.p1.code++
		h.p1.codeBits++
	} else {
		h.h1.AddOne()
	}
}

func (h *PositionHuff) SetDepth(depth int) {
	if h.p0 != nil {
		h.p0.depth = depth + 1
		h.p0.uses = 0
	}
	if h.p1 != nil {
		h.p1.depth = depth + 1
		h.p1.uses = 0
	}
	if h.h0 != nil {
		h.h0.SetDepth(depth + 1)
	}
	if h.h1 != nil {
		h.h1.SetDepth(depth + 1)
	}
}

type PositionList []*Position

func (pl PositionList) Len() int { return len(pl) }

func positionListCmp(i, j *Position) int {
	if i.uses == j.uses {
		return cmp.Compare(bits.Reverse64(i.code), bits.Reverse64(j.code))
	}
	return cmp.Compare(i.uses, j.uses)
}

type PositionHeap []*PositionHuff

func (ph PositionHeap) Len() int {
	return len(ph)
}
func (ph PositionHeap) Less(i, j int) bool {
	return ph.Compare(i, j) < 0
}
func (ph PositionHeap) Compare(i, j int) int {
	if ph[i].uses == ph[j].uses {
		return cmp.Compare(ph[i].tieBreaker, ph[j].tieBreaker)
	}
	return cmp.Compare(ph[i].uses, ph[j].uses)
}
func (ph *PositionHeap) Swap(i, j int) {
	(*ph)[i], (*ph)[j] = (*ph)[j], (*ph)[i]
}
func (ph *PositionHeap) Push(x interface{}) {
	*ph = append(*ph, x.(*PositionHuff))
}
func (ph *PositionHeap) Pop() interface{} {
	old := *ph
	n := len(old)
	x := old[n-1]
	old[n-1] = nil
	*ph = old[0 : n-1]
	return x
}

type HuffmanCoder struct {
	w          *bufio.Writer
	outputBits int
	outputByte byte
}

func (hf *HuffmanCoder) encode(code uint64, codeBits int) error {
	for codeBits > 0 {
		var bitsUsed int
		if hf.outputBits+codeBits > 8 {
			bitsUsed = 8 - hf.outputBits
		} else {
			bitsUsed = codeBits
		}
		mask := (uint64(1) << bitsUsed) - 1
		hf.outputByte |= byte((code & mask) << hf.outputBits)
		code >>= bitsUsed
		codeBits -= bitsUsed
		hf.outputBits += bitsUsed
		if hf.outputBits == 8 {
			if e := hf.w.WriteByte(hf.outputByte); e != nil {
				return e
			}
			hf.outputBits = 0
			hf.outputByte = 0
		}
	}
	return nil
}

func (hf *HuffmanCoder) flush() error {
	if hf.outputBits > 0 {
		if e := hf.w.WriteByte(hf.outputByte); e != nil {
			return e
		}
		hf.outputBits = 0
		hf.outputByte = 0
	}
	return nil
}
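
// huffmanCoderSketch shows how HuffmanCoder packs variable-length codes
// LSB-first into bytes (an illustrative sketch, not part of the original API):
// two 3-bit codes and one 4-bit code make 10 bits, which flush as two bytes.
// nolint
func huffmanCoderSketch(w *bufio.Writer) error {
	hf := &HuffmanCoder{w: w}
	codes := []struct {
		code     uint64
		codeBits int
	}{{0b101, 3}, {0b011, 3}, {0b1001, 4}}
	for _, c := range codes {
		if err := hf.encode(c.code, c.codeBits); err != nil {
			return err
		}
	}
	return hf.flush() // writes the final partial byte, zero-padded in the high bits
}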

// DynamicCell represents the result of dynamic programming for a certain starting position
type DynamicCell struct {
	optimStart  int
	coverStart  int
	compression int
	score       uint64
	patternIdx  int // offset of the last element in the pattern slice
}

type Ring struct {
	cells             []DynamicCell
	head, tail, count int
}

func NewRing() *Ring {
	return &Ring{
		cells: make([]DynamicCell, 16),
		head:  0,
		tail:  0,
		count: 0,
	}
}

func (r *Ring) Reset() {
	r.count = 0
	r.head = 0
	r.tail = 0
}

func (r *Ring) ensureSize() {
	if r.count < len(r.cells) {
		return
	}
	newcells := make([]DynamicCell, r.count*2)
	if r.tail > r.head {
		copy(newcells, r.cells[r.head:r.tail])
	} else {
		n := copy(newcells, r.cells[r.head:])
		copy(newcells[n:], r.cells[:r.tail])
	}
	r.head = 0
	r.tail = r.count
	r.cells = newcells
}

func (r *Ring) PushFront() *DynamicCell {
	r.ensureSize()
	if r.head == 0 {
		r.head = len(r.cells)
	}
	r.head--
	r.count++
	return &r.cells[r.head]
}

func (r *Ring) PushBack() *DynamicCell {
	r.ensureSize()
	if r.tail == len(r.cells) {
		r.tail = 0
	}
	result := &r.cells[r.tail]
	r.tail++
	r.count++
	return result
}

func (r Ring) Len() int {
	return r.count
}

func (r *Ring) Get(i int) *DynamicCell {
	if i < 0 || i >= r.count {
		return nil
	}
	return &r.cells[(r.head+i)&(len(r.cells)-1)]
}

// Truncate removes all items starting from i
func (r *Ring) Truncate(i int) {
	r.count = i
	r.tail = (r.head + i) & (len(r.cells) - 1)
}
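
// ringSketch shows basic Ring behaviour (an illustrative sketch, not part of
// the original API). The capacity starts at 16 and ensureSize always doubles
// it, so len(r.cells) stays a power of two and the `& (len(r.cells) - 1)`
// masking in Get/Truncate is a cheap modulo.
// nolint
func ringSketch() {
	r := NewRing()
	for i := 0; i < 3; i++ {
		r.PushBack().optimStart = i // cells come back in insertion order via Get
	}
	first := r.Get(0) // first.optimStart == 0
	_ = first
	r.Truncate(1) // drop everything after index 0; r.Len() == 1
}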

type DictAggregator struct {
	collector     *etl.Collector
	dist          map[int]int
	lastWord      []byte
	lastWordScore uint64
}

func (da *DictAggregator) processWord(word []byte, score uint64) error {
	var scoreBuf [8]byte
	binary.BigEndian.PutUint64(scoreBuf[:], score)
	return da.collector.Collect(word, scoreBuf[:])
}

func (da *DictAggregator) Load(loadFunc etl.LoadFunc, args etl.TransformArgs) error {
	defer da.collector.Close()
	return da.collector.Load(nil, "", loadFunc, args)
}

func (da *DictAggregator) aggLoadFunc(k, v []byte, table etl.CurrentTableReader, next etl.LoadNextFunc) error {
	if _, ok := da.dist[len(k)]; !ok {
		da.dist[len(k)] = 0
	}
	da.dist[len(k)]++

	score := binary.BigEndian.Uint64(v)
	if bytes.Equal(k, da.lastWord) {
		da.lastWordScore += score
	} else {
		if da.lastWord != nil {
			if err := da.processWord(da.lastWord, da.lastWordScore); err != nil {
				return err
			}
		}
		da.lastWord = append(da.lastWord[:0], k...)
		da.lastWordScore = score
	}
	return nil
}

func (da *DictAggregator) finish() error {
	if da.lastWord != nil {
		return da.processWord(da.lastWord, da.lastWordScore)
	}
	return nil
}

type CompressionRatio float64

func (r CompressionRatio) String() string { return fmt.Sprintf("%.2f", r) }
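
// Ratio returns the size of f1 divided by the size of f2; for example, a ratio
// of 2.00 means f2 is half the size of f1.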
func Ratio(f1, f2 string) (CompressionRatio, error) {
	s1, err := os.Stat(f1)
	if err != nil {
		return 0, err
	}
	s2, err := os.Stat(f2)
	if err != nil {
		return 0, err
	}
	return CompressionRatio(float64(s1.Size()) / float64(s2.Size())), nil
}

// DecompressedFile - .dat file format - a simple format for a temporary data store
type DecompressedFile struct {
	f        *os.File
	w        *bufio.Writer
	filePath string
	buf      []byte
	count    uint64
}

func NewUncompressedFile(filePath string) (*DecompressedFile, error) {
	f, err := os.Create(filePath)
	if err != nil {
		return nil, err
	}
	w := bufio.NewWriterSize(f, 2*etl.BufIOSize)
	return &DecompressedFile{filePath: filePath, f: f, w: w, buf: make([]byte, 128)}, nil
}

func (f *DecompressedFile) Close() {
	f.w.Flush()
	f.f.Close()
	os.Remove(f.filePath)
}

func (f *DecompressedFile) Append(v []byte) error {
	f.count++
	// For compressed words, the length prefix is shifted to make the lowest bit zero
	n := binary.PutUvarint(f.buf, 2*uint64(len(v)))
	if _, e := f.w.Write(f.buf[:n]); e != nil {
		return e
	}
	if len(v) > 0 {
		if _, e := f.w.Write(v); e != nil {
			return e
		}
	}
	return nil
}

func (f *DecompressedFile) AppendUncompressed(v []byte) error {
	f.count++
	// For uncompressed words, the length prefix is shifted to make the lowest bit one
	n := binary.PutUvarint(f.buf, 2*uint64(len(v))+1)
	if _, e := f.w.Write(f.buf[:n]); e != nil {
		return e
	}
	if len(v) > 0 {
		if _, e := f.w.Write(v); e != nil {
			return e
		}
	}
	return nil
}
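
// decompressedFileSketch demonstrates the length-prefix encoding used by the
// intermediate file (an illustrative sketch with a hypothetical path): a 5-byte
// word gets the prefix uvarint(10) when compressed and uvarint(11) when
// uncompressed - the lowest bit carries the "uncompressed" flag read back by ForEach.
// nolint
func decompressedFileSketch(path string) error {
	f, err := NewUncompressedFile(path)
	if err != nil {
		return err
	}
	defer f.Close() // Close flushes, closes and removes the file
	if err := f.Append([]byte("hello")); err != nil { // prefix 2*5 = 10
		return err
	}
	if err := f.AppendUncompressed([]byte("world")); err != nil { // prefix 2*5+1 = 11
		return err
	}
	f.w.Flush()
	return f.ForEach(func(v []byte, compressed bool) error {
		fmt.Printf("word=%q compressed=%t\n", v, compressed)
		return nil
	})
}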

// ForEach reads all words back from the intermediate file, calling the walker
// with each word and a flag indicating whether the word is subject to compression.
func (f *DecompressedFile) ForEach(walker func(v []byte, compressed bool) error) error {
	_, err := f.f.Seek(0, 0)
	if err != nil {
		return err
	}
	r := bufio.NewReaderSize(f.f, int(8*datasize.MB))
	buf := make([]byte, 16*1024)
	l, e := binary.ReadUvarint(r)
	for ; e == nil; l, e = binary.ReadUvarint(r) {
		// extract the lowest bit of the length prefix as the "uncompressed" flag and shift to obtain the correct length
		compressed := (l & 1) == 0
		l >>= 1
		if len(buf) < int(l) {
			buf = make([]byte, l)
		}
		if _, e = io.ReadFull(r, buf[:l]); e != nil {
			return e
		}
		if err := walker(buf[:l], compressed); err != nil {
			return err
		}
	}
	if e != nil && !errors.Is(e, io.EOF) {
		return e
	}
	return nil
}