/*
   Copyright 2021 Erigon contributors
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at
       http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package compress

import (
	"bufio"
	"bytes"
	"container/heap"
	"context"
	"encoding/binary"
	"errors"
	"fmt"
	"io"
	"io/ioutil"
	"os"
	"path/filepath"
	"sort"
	"sync"
	"time"

	"github.com/flanglet/kanzi-go/transform"
	"github.com/ledgerwatch/erigon-lib/common"
	"github.com/ledgerwatch/erigon-lib/etl"
	"github.com/ledgerwatch/erigon-lib/patricia"
	"github.com/ledgerwatch/log/v3"
)

const ASSERT = false

// Compressor is the main operating type for performing per-word compression.
// After creating a Compressor, one needs to add words to it using the `AddWord` function.
// After that, the `Compress` function needs to be called to perform the compression
// and eventually create the output file.
type Compressor struct {
	uncompressedFile           *DecompressedFile
	outputFile, tmpOutFilePath string // File where to output the dictionary and compressed data
	tmpDir                     string // Temporary directory to use for ETL when building the dictionary
	minPatternScore            uint64 // Minimum score (per superstring) required to consider including a pattern in the dictionary
	workers                    int
	// Buffer for the "superstring" - a transformation of words where each byte of a word, say b,
	// is turned into 2 bytes, 0x01 and b, and two zero bytes 0x00 0x00 are inserted after each word.
	// This is needed to be able to use an ordinary (single-string) suffix sorting algorithm
	// instead of a generalised (multi-string) suffix sorting algorithm.
	superstring []byte
	wordsCount  uint64
	ctx         context.Context
	logPrefix   string
	Ratio       CompressionRatio
	trace       bool
}
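
// For example, the word "cat" contributes the superstring fragment
// {0x01, 'c', 0x01, 'a', 0x01, 't', 0x00, 0x00}: the 0x01 markers keep every word
// byte at an odd index of the superstring, and the 0x00 0x00 terminator separates
// words, so a suffix starting inside one word never matches past that word's end.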

func NewCompressor(ctx context.Context, logPrefix, outputFile, tmpDir string, minPatternScore uint64, workers int) (*Compressor, error) {
	dir, fileName := filepath.Split(outputFile)
	tmpOutFilePath := filepath.Join(dir, fileName) + ".tmp"
	ext := filepath.Ext(fileName)
	// uncompressedFile is the intermediate .idt file; outputFile is the final .seg (or .dat) file.
	// tmpOutFilePath is the ".seg.tmp" (".dat.tmp") file, which will be renamed to the .seg file if everything succeeds.
	// This allows the .seg file to be created atomically (the downloader will not see partially ready/non-ready .seg files).
	// The ".seg.tmp" file is not created in tmpDir, because tmpDir and the snapshot dir may be mounted to different drives.
	uncompressedPath := filepath.Join(tmpDir, fileName[:len(fileName)-len(ext)]) + ".idt"
	uncompressedFile, err := NewUncompressedFile(uncompressedPath)
	if err != nil {
		return nil, err
	}
	return &Compressor{
		uncompressedFile: uncompressedFile,
		tmpOutFilePath:   tmpOutFilePath,
		outputFile:       outputFile,
		tmpDir:           tmpDir,
		logPrefix:        logPrefix,
		minPatternScore:  minPatternScore,
		workers:          workers,
		ctx:              ctx,
	}, nil
}
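
// A minimal usage sketch (error handling elided; the file names and parameter
// values here are illustrative, not prescribed):
//
//	c, err := NewCompressor(ctx, "snapshots", "headers.seg", "/tmp", 1024, 2)
//	if err != nil { /* handle */ }
//	defer c.Close()
//	for _, word := range words {
//		if err := c.AddWord(word); err != nil { /* handle */ }
//	}
//	if err := c.Compress(); err != nil { /* handle */ }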

func (c *Compressor) Close() {
	c.uncompressedFile.Close()
}

func (c *Compressor) SetTrace(trace bool) {
	c.trace = trace
}

func (c *Compressor) AddWord(word []byte) error {
	c.wordsCount++
	return c.uncompressedFile.Append(word)
}

func (c *Compressor) Compress() error {
	c.uncompressedFile.w.Flush()
	logEvery := time.NewTicker(20 * time.Second)
	defer logEvery.Stop()
	// Collectors for dictionary words (sorted by their score)
	superstrings := make(chan []byte, c.workers)
	wg := &sync.WaitGroup{}
	wg.Add(c.workers)
	suffixCollectors := make([]*etl.Collector, c.workers)
	defer func() {
		for _, collector := range suffixCollectors {
			collector.Close()
		}
	}()
	for i := 0; i < c.workers; i++ {
		//nolint
		collector := etl.NewCollector(compressLogPrefix, c.tmpDir, etl.NewSortableBuffer(etl.BufferOptimalSize))
		suffixCollectors[i] = collector
		go processSuperstring(superstrings, collector, c.minPatternScore, wg)
	}
	i := 0
	c.superstring = nil
	if err := c.uncompressedFile.ForEach(func(word []byte) error {
		if len(c.superstring)+2*len(word)+2 > superstringLimit {
			superstrings <- c.superstring
			c.superstring = nil
		}
		for _, a := range word {
			c.superstring = append(c.superstring, 1, a)
		}
		c.superstring = append(c.superstring, 0, 0)
		i++
		select {
		default:
		case <-c.ctx.Done():
			return c.ctx.Err()
		case <-logEvery.C:
			log.Info(fmt.Sprintf("[%s] Dictionary preprocessing", c.logPrefix), "processed", fmt.Sprintf("%.2f%%", 100*float64(i)/float64(c.uncompressedFile.count)))
		}
		return nil
	}); err != nil {
		return err
	}
	if len(c.superstring) > 0 {
		superstrings <- c.superstring
	}
	close(superstrings)
	wg.Wait()
	db, err := DictionaryBuilderFromCollectors(c.ctx, compressLogPrefix, c.tmpDir, suffixCollectors)
	if err != nil {
		return err
	}
	_, fileName := filepath.Split(c.outputFile)
	if c.trace {
		if err := PersistDictrionary(filepath.Join(c.tmpDir, fileName)+".dictionary.txt", db); err != nil {
			return err
		}
	}

	defer os.Remove(c.tmpOutFilePath)
	if err := reducedict(c.trace, c.logPrefix, c.tmpOutFilePath, c.tmpDir, c.uncompressedFile, c.workers, db); err != nil {
		return err
	}

	if err := os.Rename(c.tmpOutFilePath, c.outputFile); err != nil {
		return fmt.Errorf("renaming: %w", err)
	}
	c.Ratio, err = Ratio(c.uncompressedFile.filePath, c.outputFile)
	if err != nil {
		return fmt.Errorf("ratio: %w", err)
	}
	return nil
}

type CompressorSequential struct {
	outputFile      string // File where to output the dictionary and compressed data
	tmpDir          string // Temporary directory to use for ETL when building the dictionary
	minPatternScore uint64 // Minimum score (per superstring) required to consider including a pattern in the dictionary
	// Buffer for the "superstring" - a transformation of words where each byte of a word, say b,
	// is turned into 2 bytes, 0x01 and b, and two zero bytes 0x00 0x00 are inserted after each word.
	// This is needed to be able to use an ordinary (single-string) suffix sorting algorithm
	// instead of a generalised (multi-string) suffix sorting algorithm.
	superstring []byte
	divsufsort  *transform.DivSufSort       // Instance of DivSufSort - algorithm for building the suffix array of the superstring
	suffixarray []int32                     // Suffix array - output of the divsufsort algorithm
	lcp         []int32                     // LCP array (Longest Common Prefix)
	collector   *etl.Collector              // Collector used to handle very large sets of superstrings
	numBuf      [binary.MaxVarintLen64]byte // Buffer for producing varint serialisation
	collectBuf  []byte                      // Buffer for forming the key to call the collector
	dictBuilder DictionaryBuilder           // Priority queue that selects dictionary patterns with the highest scores, and then sorts them by score
	pt          patricia.PatriciaTree       // Patricia tree of dictionary patterns
	mf          patricia.MatchFinder        // Match finder to use together with the patricia tree (it stores the search context and buffers matches)
	ring        *Ring                       // Cyclic ring for the dynamic programming algorithm determining the optimal coverage of a word by dictionary patterns
	wordFile    *os.File                    // Temporary file to keep words in for the second pass
	wordW       *bufio.Writer               // Buffered writer for the temporary file
	interFile   *os.File                    // File to write intermediate compression to
	interW      *bufio.Writer               // Buffered writer associated with interFile
	patterns    []int                       // Buffer of pattern ids (used in the dynamic programming algorithm to remember patterns corresponding to dynamic cells)
	uncovered   []int                       // Buffer of intervals that are not covered by patterns
	posMap      map[uint64]uint64           // Counter of uses for each position within a compressed word (for building the huffman code for positions)
	wordsCount  uint64
}

// superstringLimit limits how large one "superstring" can get before it is processed.
// CompressorSequential allocates 7 bytes for each unit of superstringLimit: 1 byte for the
// superstring itself, 4 bytes for the suffix array (int32), and 2 bytes for the LCP array
// (int32 over half the positions). For example, a superstringLimit of 16m will result in
// 112Mb being allocated for these arrays
const superstringLimit = 16 * 1024 * 1024

// minPatternLen is the minimum length of a pattern we consider for inclusion in the dictionary
const minPatternLen = 5
const maxPatternLen = 4096

// maxDictPatterns is the maximum number of patterns allowed in the initial (non-reduced) dictionary.
// Large values increase memory consumption of the dictionary reduction phase
const maxDictPatterns = 512 * 1024

//nolint
const compressLogPrefix = "compress"

type DictionaryBuilder struct {
	limit         int
	lastWord      []byte
	lastWordScore uint64
	items         []*Pattern
}

func (db *DictionaryBuilder) Reset(limit int) {
	db.limit = limit
	db.items = db.items[:0]
}

func (db DictionaryBuilder) Len() int {
	return len(db.items)
}

func (db DictionaryBuilder) Less(i, j int) bool {
	if db.items[i].score == db.items[j].score {
		return bytes.Compare(db.items[i].word, db.items[j].word) < 0
	}
	return db.items[i].score < db.items[j].score
}

func (db *DictionaryBuilder) Swap(i, j int) {
	db.items[i], db.items[j] = db.items[j], db.items[i]
}

func (db *DictionaryBuilder) Push(x interface{}) {
	db.items = append(db.items, x.(*Pattern))
}

func (db *DictionaryBuilder) Pop() interface{} {
	old := db.items
	n := len(old)
	x := old[n-1]
	db.items = old[0 : n-1]
	return x
}

func (db *DictionaryBuilder) processWord(chars []byte, score uint64) {
	heap.Push(db, &Pattern{word: common.Copy(chars), score: score})
	if db.Len() > db.limit {
		// Remove the element with the smallest score
		heap.Pop(db)
	}
}

func (db *DictionaryBuilder) loadFunc(k, v []byte, table etl.CurrentTableReader, next etl.LoadNextFunc) error {
	score := binary.BigEndian.Uint64(v)
	if bytes.Equal(k, db.lastWord) {
		db.lastWordScore += score
	} else {
		if db.lastWord != nil {
			db.processWord(db.lastWord, db.lastWordScore)
		}
		db.lastWord = append(db.lastWord[:0], k...)
		db.lastWordScore = score
	}
	return nil
}

func (db *DictionaryBuilder) finish() {
	if db.lastWord != nil {
		db.processWord(db.lastWord, db.lastWordScore)
	}
}

func (db *DictionaryBuilder) ForEach(f func(score uint64, word []byte)) {
	for i := db.Len(); i > 0; i-- {
		f(db.items[i-1].score, db.items[i-1].word)
	}
}

func (db *DictionaryBuilder) Close() {
	db.items = nil
	db.lastWord = nil
}

// Pattern is a representation of a pattern that is searched for in the superstrings to compress them.
// Patterns are stored in a patricia tree and contain the pattern score (calculated during
// the initial dictionary building), frequency of usage, and code
type Pattern struct {
	score    uint64 // Score assigned to the pattern during dictionary building
	uses     uint64 // How many times this pattern has been used during search and optimisation
	code     uint64 // Allocated numerical code
	codeBits int    // Number of bits in the code
	word     []byte // Pattern characters
	offset   uint64 // Offset of this pattern in the dictionary representation
}

// PatternList is a sorted list of patterns for the purpose of
// building a Huffman tree to determine efficient coding.
// Patterns with the least usage come first; we use the numerical code
// as a tie breaker to make sure the resulting Huffman code is canonical
type PatternList []*Pattern

func (pl PatternList) Len() int {
	return len(pl)
}

func (pl PatternList) Less(i, j int) bool {
	if pl[i].uses == pl[j].uses {
		return pl[i].code < pl[j].code
	}
	return pl[i].uses < pl[j].uses
}

func (pl *PatternList) Swap(i, j int) {
	(*pl)[i], (*pl)[j] = (*pl)[j], (*pl)[i]
}

// PatternHuff is an intermediate node in a Huffman tree of patterns.
// It has two children, each of which may either be another intermediate node (h0 or h1)
// or a leaf node, which is a Pattern (p0 or p1).
type PatternHuff struct {
	uses       uint64
	tieBreaker uint64
	p0, p1     *Pattern
	h0, h1     *PatternHuff
	offset     uint64 // Offset of this Huffman tree node in the dictionary representation
}

func (h *PatternHuff) AddZero() {
	if h.p0 != nil {
		h.p0.code <<= 1
		h.p0.codeBits++
	} else {
		h.h0.AddZero()
	}
	if h.p1 != nil {
		h.p1.code <<= 1
		h.p1.codeBits++
	} else {
		h.h1.AddZero()
	}
}

func (h *PatternHuff) AddOne() {
	if h.p0 != nil {
		h.p0.code <<= 1
		h.p0.code++
		h.p0.codeBits++
	} else {
		h.h0.AddOne()
	}
	if h.p1 != nil {
		h.p1.code <<= 1
		h.p1.code++
		h.p1.codeBits++
	} else {
		h.h1.AddOne()
	}
}
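
// Note on code construction: the tree is built bottom-up, and every time a subtree
// becomes the 0-child (AddZero) or 1-child (AddOne) of a new node, the codes of all
// patterns in that subtree are shifted left by one bit and the new bit is appended
// in the least significant position. The bit decided nearest the root therefore ends
// up as the lowest bit of the final code.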

// PatternHeap is a priority queue of patterns for the purpose of building
// a Huffman tree to determine efficient coding. Patterns with the least usage
// have the highest priority. We use a tie breaker to make sure
// the resulting Huffman code is canonical
type PatternHeap []*PatternHuff

func (ph PatternHeap) Len() int {
	return len(ph)
}

func (ph PatternHeap) Less(i, j int) bool {
	if ph[i].uses == ph[j].uses {
		return ph[i].tieBreaker < ph[j].tieBreaker
	}
	return ph[i].uses < ph[j].uses
}

func (ph *PatternHeap) Swap(i, j int) {
	(*ph)[i], (*ph)[j] = (*ph)[j], (*ph)[i]
}

func (ph *PatternHeap) Push(x interface{}) {
	*ph = append(*ph, x.(*PatternHuff))
}

func (ph *PatternHeap) Pop() interface{} {
	old := *ph
	n := len(old)
	x := old[n-1]
	*ph = old[0 : n-1]
	return x
}

type Position struct {
	pos      uint64
	uses     uint64
	code     uint64
	codeBits int
	offset   uint64
}

type PositionHuff struct {
	uses       uint64
	tieBreaker uint64
	p0, p1     *Position
	h0, h1     *PositionHuff
	offset     uint64
}

func (h *PositionHuff) AddZero() {
	if h.p0 != nil {
		h.p0.code <<= 1
		h.p0.codeBits++
	} else {
		h.h0.AddZero()
	}
	if h.p1 != nil {
		h.p1.code <<= 1
		h.p1.codeBits++
	} else {
		h.h1.AddZero()
	}
}

func (h *PositionHuff) AddOne() {
	if h.p0 != nil {
		h.p0.code <<= 1
		h.p0.code++
		h.p0.codeBits++
	} else {
		h.h0.AddOne()
	}
	if h.p1 != nil {
		h.p1.code <<= 1
		h.p1.code++
		h.p1.codeBits++
	} else {
		h.h1.AddOne()
	}
}

type PositionList []*Position

func (pl PositionList) Len() int {
	return len(pl)
}

func (pl PositionList) Less(i, j int) bool {
	if pl[i].uses == pl[j].uses {
		return pl[i].pos < pl[j].pos
	}
	return pl[i].uses < pl[j].uses
}

func (pl *PositionList) Swap(i, j int) {
	(*pl)[i], (*pl)[j] = (*pl)[j], (*pl)[i]
}

type PositionHeap []*PositionHuff

func (ph PositionHeap) Len() int {
	return len(ph)
}

func (ph PositionHeap) Less(i, j int) bool {
	if ph[i].uses == ph[j].uses {
		return ph[i].tieBreaker < ph[j].tieBreaker
	}
	return ph[i].uses < ph[j].uses
}

func (ph *PositionHeap) Swap(i, j int) {
	(*ph)[i], (*ph)[j] = (*ph)[j], (*ph)[i]
}

func (ph *PositionHeap) Push(x interface{}) {
	*ph = append(*ph, x.(*PositionHuff))
}

func (ph *PositionHeap) Pop() interface{} {
	old := *ph
	n := len(old)
	x := old[n-1]
	*ph = old[0 : n-1]
	return x
}

type HuffmanCoder struct {
	w          *bufio.Writer
	outputBits int
	outputByte byte
}

func (hf *HuffmanCoder) encode(code uint64, codeBits int) error {
	for codeBits > 0 {
		var bitsUsed int
		if hf.outputBits+codeBits > 8 {
			bitsUsed = 8 - hf.outputBits
		} else {
			bitsUsed = codeBits
		}
		mask := (uint64(1) << bitsUsed) - 1
		hf.outputByte |= byte((code & mask) << hf.outputBits)
		code >>= bitsUsed
		codeBits -= bitsUsed
		hf.outputBits += bitsUsed
		if hf.outputBits == 8 {
			if e := hf.w.WriteByte(hf.outputByte); e != nil {
				return e
			}
			hf.outputBits = 0
			hf.outputByte = 0
		}
	}
	return nil
}

func (hf *HuffmanCoder) flush() error {
	if hf.outputBits > 0 {
		if e := hf.w.WriteByte(hf.outputByte); e != nil {
			return e
		}
		hf.outputBits = 0
		hf.outputByte = 0
	}
	return nil
}
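
// A small worked example of the bit packing above (values are illustrative): encoding
// code 0b101 with codeBits=3 into an empty coder leaves outputByte=0b00000101 and
// outputBits=3; encoding 0b11011 with codeBits=5 next fills the byte to 0b11011101,
// which is written out, and the coder is empty again. flush() emits any partially
// filled byte, so it must be called at the end of each bit stream.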

// DynamicCell represents the result of dynamic programming for a certain starting position
type DynamicCell struct {
	optimStart  int
	coverStart  int
	compression int
	score       uint64
	patternIdx  int // offset of the last element in the pattern slice
}

type Ring struct {
	cells             []DynamicCell
	head, tail, count int
}

func NewRing() *Ring {
	return &Ring{
		cells: make([]DynamicCell, 16),
		head:  0,
		tail:  0,
		count: 0,
	}
}

func (r *Ring) Reset() {
	r.count = 0
	r.head = 0
	r.tail = 0
}

func (r *Ring) ensureSize() {
	if r.count < len(r.cells) {
		return
	}
	newcells := make([]DynamicCell, r.count*2)
	if r.tail > r.head {
		copy(newcells, r.cells[r.head:r.tail])
	} else {
		n := copy(newcells, r.cells[r.head:])
		copy(newcells[n:], r.cells[:r.tail])
	}
	r.head = 0
	r.tail = r.count
	r.cells = newcells
}

func (r *Ring) PushFront() *DynamicCell {
	r.ensureSize()
	if r.head == 0 {
		r.head = len(r.cells)
	}
	r.head--
	r.count++
	return &r.cells[r.head]
}

func (r *Ring) PushBack() *DynamicCell {
	r.ensureSize()
	if r.tail == len(r.cells) {
		r.tail = 0
	}
	result := &r.cells[r.tail]
	r.tail++
	r.count++
	return result
}

func (r Ring) Len() int {
	return r.count
}

func (r *Ring) Get(i int) *DynamicCell {
	if i < 0 || i >= r.count {
		return nil
	}
	return &r.cells[(r.head+i)&(len(r.cells)-1)]
}

// Truncate removes all items starting from i
func (r *Ring) Truncate(i int) {
	r.count = i
	r.tail = (r.head + i) & (len(r.cells) - 1)
}
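
// Note: Get and Truncate compute indices with a bitmask, which is only valid while
// len(r.cells) is a power of two. The invariant holds because the ring starts with
// 16 cells and ensureSize only ever doubles the slice.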

func NewCompressorSequential(logPrefix, outputFile string, tmpDir string, minPatternScore uint64) (*CompressorSequential, error) {
	c := &CompressorSequential{
		minPatternScore: minPatternScore,
		outputFile:      outputFile,
		tmpDir:          tmpDir,
		superstring:     make([]byte, 0, superstringLimit), // Allocate enough, so we never need to resize
		suffixarray:     make([]int32, superstringLimit),
		lcp:             make([]int32, superstringLimit/2),
		collectBuf:      make([]byte, 8, 256),
		ring:            NewRing(),
		patterns:        make([]int, 0, 32),
		uncovered:       make([]int, 0, 32),
		posMap:          make(map[uint64]uint64),
	}
	var err error
	if c.divsufsort, err = transform.NewDivSufSort(); err != nil {
		return nil, err
	}
	if c.wordFile, err = ioutil.TempFile(c.tmpDir, "superstrings-"); err != nil {
		return nil, err
	}
	c.wordW = bufio.NewWriterSize(c.wordFile, etl.BufIOSize)
	c.collector = etl.NewCollector(logPrefix, tmpDir, etl.NewSortableBuffer(etl.BufferOptimalSize))
	return c, nil
}

// AddWord needs to be called repeatedly to provide all the words to compress
func (c *CompressorSequential) AddWord(word []byte) error {
	c.wordsCount++
	if len(c.superstring)+2*len(word)+2 > superstringLimit {
		// Adding this word would make the superstring go over the limit
		if err := c.processSuperstring(); err != nil {
			return fmt.Errorf("buildDictNextWord: error processing superstring: %w", err)
		}
	}
	for _, b := range word {
		c.superstring = append(c.superstring, 1, b)
	}
	c.superstring = append(c.superstring, 0, 0)
	n := binary.PutUvarint(c.numBuf[:], uint64(len(word)))
	if _, err := c.wordW.Write(c.numBuf[:n]); err != nil {
		return err
	}
	if len(word) > 0 {
		if _, err := c.wordW.Write(word); err != nil {
			return err
		}
	}
	return nil
}
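
// Compress (below) runs the three sequential phases: building the pattern dictionary
// from the accumulated superstrings (buildDictionary), matching dictionary patterns
// against every word (findMatches), and Huffman-optimising pattern and position codes
// while writing the output file (optimiseCodes).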
func (c *CompressorSequential) Compress() error {
	if c.wordW != nil {
		if err := c.wordW.Flush(); err != nil {
			return err
		}
	}
	if err := c.buildDictionary(); err != nil {
		return err
	}
	if err := c.findMatches(); err != nil {
		return err
	}
	if err := c.optimiseCodes(); err != nil {
		return err
	}
	return nil
}

func (c *CompressorSequential) Close() {
	c.collector.Close()
	c.wordFile.Close()
	c.interFile.Close()
}
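
// findMatches performs the second pass over the words. For each word, it looks up the
// longest dictionary pattern matches via the patricia tree, then runs a dynamic
// programming pass over the Ring of DynamicCell values to choose the subset of matches
// that maximises compressed-size savings (using pattern score as a tie breaker), and
// writes the chosen patterns plus the uncovered bytes to the intermediate file.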
func (c *CompressorSequential) findMatches() error {
	// Build the patricia tree out of the patterns in the dictionary, for further matching in individual words.
	// Allocate temporary initial codes to the patterns so that patterns with higher scores get smaller codes.
	// This helps reduce the size of the intermediate compression
	for i, p := range c.dictBuilder.items {
		p.code = uint64(len(c.dictBuilder.items) - i - 1)
		c.pt.Insert(p.word, p)
	}
	var err error
	if c.interFile, err = ioutil.TempFile(c.tmpDir, "inter-compress-"); err != nil {
		return err
	}
	c.interW = bufio.NewWriterSize(c.interFile, etl.BufIOSize)
	if _, err := c.wordFile.Seek(0, 0); err != nil {
		return err
	}
	defer os.Remove(c.wordFile.Name())
	defer c.wordFile.Close()
	r := bufio.NewReaderSize(c.wordFile, etl.BufIOSize)
	var readBuf []byte
	l, e := binary.ReadUvarint(r)
	for ; e == nil; l, e = binary.ReadUvarint(r) {
		c.posMap[l+1]++
		c.posMap[0]++
		if int(l) > len(readBuf) {
			readBuf = make([]byte, l)
		}
		if _, e := io.ReadFull(r, readBuf[:l]); e != nil {
			return e
		}
		word := readBuf[:l]
		// Encode the length of the word as a varint for the intermediate compression
		n := binary.PutUvarint(c.numBuf[:], uint64(len(word)))
		if _, err := c.interW.Write(c.numBuf[:n]); err != nil {
			return err
		}
		if len(word) > 0 {
			matches := c.mf.FindLongestMatches(&c.pt, word)
			if len(matches) == 0 {
				n = binary.PutUvarint(c.numBuf[:], 0)
				if _, err := c.interW.Write(c.numBuf[:n]); err != nil {
					return err
				}
				if _, err := c.interW.Write(word); err != nil {
					return err
				}
				continue
			}
			c.ring.Reset()
			c.patterns = append(c.patterns[:0], 0, 0) // Sentinel entry - no meaning
			lastF := matches[len(matches)-1]
			for j := lastF.Start; j < lastF.End; j++ {
				d := c.ring.PushBack()
				d.optimStart = j + 1
				d.coverStart = len(word)
				d.compression = 0
				d.patternIdx = 0
				d.score = 0
			}
			// Starting from the last match
			for i := len(matches); i > 0; i-- {
				f := matches[i-1]
				p := f.Val.(*Pattern)
				firstCell := c.ring.Get(0)
				maxCompression := firstCell.compression
				maxScore := firstCell.score
				maxCell := firstCell
				var maxInclude bool
				for e := 0; e < c.ring.Len(); e++ {
					cell := c.ring.Get(e)
					comp := cell.compression - 4
					if cell.coverStart >= f.End {
						comp += f.End - f.Start
					} else {
						comp += cell.coverStart - f.Start
					}
					score := cell.score + p.score
					if comp > maxCompression || (comp == maxCompression && score > maxScore) {
						maxCompression = comp
						maxScore = score
						maxInclude = true
						maxCell = cell
					} else if cell.optimStart > f.End {
						c.ring.Truncate(e)
						break
					}
				}
				d := c.ring.PushFront()
				d.optimStart = f.Start
				d.score = maxScore
				d.compression = maxCompression
				if maxInclude {
					d.coverStart = f.Start
					d.patternIdx = len(c.patterns)
					c.patterns = append(c.patterns, i-1, maxCell.patternIdx)
				} else {
					d.coverStart = maxCell.coverStart
					d.patternIdx = maxCell.patternIdx
				}
			}
			optimCell := c.ring.Get(0)
			// Count the number of patterns
			var patternCount uint64
			patternIdx := optimCell.patternIdx
			for patternIdx != 0 {
				patternCount++
				patternIdx = c.patterns[patternIdx+1]
			}
			n = binary.PutUvarint(c.numBuf[:], patternCount)
			if _, err := c.interW.Write(c.numBuf[:n]); err != nil {
				return err
			}
			patternIdx = optimCell.patternIdx
			lastStart := 0
			var lastUncovered int
			c.uncovered = c.uncovered[:0]
			for patternIdx != 0 {
				pattern := c.patterns[patternIdx]
				p := matches[pattern].Val.(*Pattern)
				if matches[pattern].Start > lastUncovered {
					c.uncovered = append(c.uncovered, lastUncovered, matches[pattern].Start)
				}
				lastUncovered = matches[pattern].End
				// Starting position
				c.posMap[uint64(matches[pattern].Start-lastStart+1)]++
				lastStart = matches[pattern].Start
				n = binary.PutUvarint(c.numBuf[:], uint64(matches[pattern].Start))
				if _, err := c.interW.Write(c.numBuf[:n]); err != nil {
					return err
				}
				// Code
				n = binary.PutUvarint(c.numBuf[:], p.code)
				if _, err := c.interW.Write(c.numBuf[:n]); err != nil {
					return err
				}
				p.uses++
				patternIdx = c.patterns[patternIdx+1]
			}
			if len(word) > lastUncovered {
				c.uncovered = append(c.uncovered, lastUncovered, len(word))
			}
			// Add uncoded input
			for i := 0; i < len(c.uncovered); i += 2 {
				if _, err := c.interW.Write(word[c.uncovered[i]:c.uncovered[i+1]]); err != nil {
					return err
				}
			}
		}
	}
	if e != nil && !errors.Is(e, io.EOF) {
		return e
	}
	if err = c.interW.Flush(); err != nil {
		return err
	}
	return nil
}
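
// As can be read off the writes below, the resulting file is laid out roughly as:
//
//	[8 bytes] words count
//	[8 bytes] pattern dictionary size
//	[8 bytes] offset of the pattern Huffman tree root
//	[8 bytes] pattern cutoff offset
//	varint-length-prefixed pattern words, then varint offsets of Huffman node children
//	[8 bytes each] position dictionary size, root offset, and cutoff, followed by the
//	analogous position sections
//	Huffman-coded words, each followed by its uncovered (raw) bytes
//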
// optimiseCodes optimises the coding for patterns and positions
func (c *CompressorSequential) optimiseCodes() error {
	if _, err := c.interFile.Seek(0, 0); err != nil {
		return err
	}
	defer os.Remove(c.interFile.Name())
	defer c.interFile.Close()
	// Select patterns with non-zero use and sort them by increasing frequency of use (in preparation for building the Huffman tree)
	var patternList PatternList
	for _, p := range c.dictBuilder.items {
		if p.uses > 0 {
			patternList = append(patternList, p)
		}
	}
	sort.Sort(&patternList)

	// Calculate offsets of the dictionary patterns and the total size
	var offset uint64
	for _, p := range patternList {
		p.offset = offset
		n := binary.PutUvarint(c.numBuf[:], uint64(len(p.word)))
		offset += uint64(n + len(p.word))
	}
	patternCutoff := offset // All offsets below this will be considered patterns
	i := 0                  // Will be going over the patternList
	// Build the Huffman tree for pattern codes
	var codeHeap PatternHeap
	heap.Init(&codeHeap)
	tieBreaker := uint64(0)
	var huffs []*PatternHuff // To be used to output the dictionary
	for codeHeap.Len()+(patternList.Len()-i) > 1 {
		// New node
		h := &PatternHuff{
			tieBreaker: tieBreaker,
			offset:     offset,
		}
		if codeHeap.Len() > 0 && (i >= patternList.Len() || codeHeap[0].uses < patternList[i].uses) {
			// Take h0 from the heap
			h.h0 = heap.Pop(&codeHeap).(*PatternHuff)
			h.h0.AddZero()
			h.uses += h.h0.uses
			n := binary.PutUvarint(c.numBuf[:], h.h0.offset)
			offset += uint64(n)
		} else {
			// Take p0 from the list
			h.p0 = patternList[i]
			h.p0.code = 0
			h.p0.codeBits = 1
			h.uses += h.p0.uses
			n := binary.PutUvarint(c.numBuf[:], h.p0.offset)
			offset += uint64(n)
			i++
		}
		if codeHeap.Len() > 0 && (i >= patternList.Len() || codeHeap[0].uses < patternList[i].uses) {
			// Take h1 from the heap
			h.h1 = heap.Pop(&codeHeap).(*PatternHuff)
			h.h1.AddOne()
			h.uses += h.h1.uses
			n := binary.PutUvarint(c.numBuf[:], h.h1.offset)
			offset += uint64(n)
		} else {
			// Take p1 from the list
			h.p1 = patternList[i]
			h.p1.code = 1
			h.p1.codeBits = 1
			h.uses += h.p1.uses
			n := binary.PutUvarint(c.numBuf[:], h.p1.offset)
			offset += uint64(n)
			i++
		}
		tieBreaker++
		heap.Push(&codeHeap, h)
		huffs = append(huffs, h)
	}
	var root *PatternHuff
	if codeHeap.Len() > 0 {
		root = heap.Pop(&codeHeap).(*PatternHuff) // Root node of the Huffman tree
	}
	// Start writing to the result file
	cf, err := os.Create(c.outputFile)
	if err != nil {
		return err
	}
	defer cf.Close()
	defer cf.Sync()
	cw := bufio.NewWriterSize(cf, etl.BufIOSize)
	defer cw.Flush()
	// 1st, output the number of words in the file
	binary.BigEndian.PutUint64(c.numBuf[:], c.wordsCount)
	if _, err = cw.Write(c.numBuf[:8]); err != nil {
		return err
	}
	// 2nd, output the dictionary size
	binary.BigEndian.PutUint64(c.numBuf[:], offset) // Dictionary size
	if _, err = cw.Write(c.numBuf[:8]); err != nil {
		return err
	}
	// 3rd, output the dictionary root
	if root == nil {
		binary.BigEndian.PutUint64(c.numBuf[:], 0)
	} else {
		binary.BigEndian.PutUint64(c.numBuf[:], root.offset)
	}
	if _, err = cw.Write(c.numBuf[:8]); err != nil {
		return err
	}
	// 4th, output the pattern cutoff offset
	binary.BigEndian.PutUint64(c.numBuf[:], patternCutoff)
	if _, err = cw.Write(c.numBuf[:8]); err != nil {
		return err
	}
	// Write all the patterns
	for _, p := range patternList {
		n := binary.PutUvarint(c.numBuf[:], uint64(len(p.word)))
		if _, err = cw.Write(c.numBuf[:n]); err != nil {
			return err
		}
		if _, err = cw.Write(p.word); err != nil {
			return err
		}
	}
	// Write all the Huffman nodes
	for _, h := range huffs {
		var n int
		if h.h0 != nil {
			n = binary.PutUvarint(c.numBuf[:], h.h0.offset)
		} else {
			n = binary.PutUvarint(c.numBuf[:], h.p0.offset)
		}
		if _, err = cw.Write(c.numBuf[:n]); err != nil {
			return err
		}
		if h.h1 != nil {
			n = binary.PutUvarint(c.numBuf[:], h.h1.offset)
		} else {
			n = binary.PutUvarint(c.numBuf[:], h.p1.offset)
		}
		if _, err = cw.Write(c.numBuf[:n]); err != nil {
			return err
		}
	}
	var positionList PositionList
	pos2code := make(map[uint64]*Position)
	for pos, uses := range c.posMap {
		p := &Position{pos: pos, uses: uses, code: 0, codeBits: 0, offset: 0}
		positionList = append(positionList, p)
		pos2code[pos] = p
	}
	sort.Sort(&positionList)
	// Calculate offsets of the dictionary positions and the total size
	offset = 0
	for _, p := range positionList {
		p.offset = offset
		n := binary.PutUvarint(c.numBuf[:], p.pos)
		offset += uint64(n)
	}
	positionCutoff := offset // All offsets below this will be considered positions
	i = 0                    // Will be going over the positionList
	// Build the Huffman tree for position codes
	var posHeap PositionHeap
	heap.Init(&posHeap)
	tieBreaker = uint64(0)
	var posHuffs []*PositionHuff // To be used to output the dictionary
	for posHeap.Len()+(positionList.Len()-i) > 1 {
		// New node
		h := &PositionHuff{
			tieBreaker: tieBreaker,
			offset:     offset,
		}
		if posHeap.Len() > 0 && (i >= positionList.Len() || posHeap[0].uses < positionList[i].uses) {
			// Take h0 from the heap
			h.h0 = heap.Pop(&posHeap).(*PositionHuff)
			h.h0.AddZero()
			h.uses += h.h0.uses
			n := binary.PutUvarint(c.numBuf[:], h.h0.offset)
			offset += uint64(n)
		} else {
			// Take p0 from the list
			h.p0 = positionList[i]
			h.p0.code = 0
			h.p0.codeBits = 1
			h.uses += h.p0.uses
			n := binary.PutUvarint(c.numBuf[:], h.p0.offset)
			offset += uint64(n)
			i++
		}
		if posHeap.Len() > 0 && (i >= positionList.Len() || posHeap[0].uses < positionList[i].uses) {
			// Take h1 from the heap
			h.h1 = heap.Pop(&posHeap).(*PositionHuff)
			h.h1.AddOne()
			h.uses += h.h1.uses
			n := binary.PutUvarint(c.numBuf[:], h.h1.offset)
			offset += uint64(n)
		} else {
			// Take p1 from the list
			h.p1 = positionList[i]
			h.p1.code = 1
			h.p1.codeBits = 1
			h.uses += h.p1.uses
			n := binary.PutUvarint(c.numBuf[:], h.p1.offset)
			offset += uint64(n)
			i++
		}
		tieBreaker++
		heap.Push(&posHeap, h)
		posHuffs = append(posHuffs, h)
	}
	var posRoot *PositionHuff
	if posHeap.Len() > 0 {
		posRoot = heap.Pop(&posHeap).(*PositionHuff)
	}
	// First, output the position dictionary size
	binary.BigEndian.PutUint64(c.numBuf[:], offset) // Dictionary size
	if _, err = cw.Write(c.numBuf[:8]); err != nil {
		return err
	}
	// Secondly, output the position dictionary root
	if posRoot == nil {
		binary.BigEndian.PutUint64(c.numBuf[:], 0)
	} else {
		binary.BigEndian.PutUint64(c.numBuf[:], posRoot.offset)
	}
	if _, err = cw.Write(c.numBuf[:8]); err != nil {
		return err
	}
	// Thirdly, output the position cutoff offset
	binary.BigEndian.PutUint64(c.numBuf[:], positionCutoff)
	if _, err = cw.Write(c.numBuf[:8]); err != nil {
		return err
	}
	// Write all the positions
	for _, p := range positionList {
		n := binary.PutUvarint(c.numBuf[:], p.pos)
		if _, err = cw.Write(c.numBuf[:n]); err != nil {
			return err
		}
	}
	// Write all the Huffman nodes
	for _, h := range posHuffs {
		var n int
		if h.h0 != nil {
			n = binary.PutUvarint(c.numBuf[:], h.h0.offset)
		} else {
			n = binary.PutUvarint(c.numBuf[:], h.p0.offset)
		}
		if _, err = cw.Write(c.numBuf[:n]); err != nil {
			return err
		}
		if h.h1 != nil {
			n = binary.PutUvarint(c.numBuf[:], h.h1.offset)
		} else {
			n = binary.PutUvarint(c.numBuf[:], h.p1.offset)
		}
		if _, err = cw.Write(c.numBuf[:n]); err != nil {
			return err
		}
	}
	r := bufio.NewReaderSize(c.interFile, etl.BufIOSize)
	var hc HuffmanCoder
	hc.w = cw
	l, e := binary.ReadUvarint(r)
	for ; e == nil; l, e = binary.ReadUvarint(r) {
		posCode := pos2code[l+1]
		if posCode != nil {
			if e = hc.encode(posCode.code, posCode.codeBits); e != nil {
				return e
			}
		}
		if l == 0 {
			if e = hc.flush(); e != nil {
				return e
			}
		} else {
			var pNum uint64 // Number of patterns
			if pNum, e = binary.ReadUvarint(r); e != nil {
				return e
			}
			// Now reading patterns one by one
			var lastPos uint64
			var lastUncovered int
			var uncoveredCount int
			for i := 0; i < int(pNum); i++ {
				var pos uint64 // Starting position of the pattern
				if pos, e = binary.ReadUvarint(r); e != nil {
					return e
				}
				posCode = pos2code[pos-lastPos+1]
				lastPos = pos
				if posCode != nil {
					if e = hc.encode(posCode.code, posCode.codeBits); e != nil {
						return e
					}
				}
				var code uint64 // Code of the pattern
				if code, e = binary.ReadUvarint(r); e != nil {
					return e
				}
				patternCode := c.dictBuilder.items[len(c.dictBuilder.items)-1-int(code)]
				if int(pos) > lastUncovered {
					uncoveredCount += int(pos) - lastUncovered
				}
				lastUncovered = int(pos) + len(patternCode.word)
				if e = hc.encode(patternCode.code, patternCode.codeBits); e != nil {
					return e
				}
			}
			if int(l) > lastUncovered {
				uncoveredCount += int(l) - lastUncovered
			}
			// Terminating position and flush
			posCode = pos2code[0]
			if posCode != nil {
				if e = hc.encode(posCode.code, posCode.codeBits); e != nil {
					return e
				}
			}
			if e = hc.flush(); e != nil {
				return e
			}
			// Copy the uncovered characters
			if uncoveredCount > 0 {
				if _, e = io.CopyN(cw, r, int64(uncoveredCount)); e != nil {
					return e
				}
			}
		}
	}
	if e != nil && !errors.Is(e, io.EOF) {
		return e
	}
	return nil
}

func (c *CompressorSequential) buildDictionary() error {
	if len(c.superstring) > 0 {
		// Process any residual superstring
		if err := c.processSuperstring(); err != nil {
			return fmt.Errorf("buildDictionary: error processing superstring: %w", err)
		}
	}
	c.dictBuilder.Reset(maxDictPatterns)
	if err := c.collector.Load(nil, "", c.dictBuilder.loadFunc, etl.TransformArgs{}); err != nil {
		return err
	}
	c.dictBuilder.finish()
	c.collector.Close()
	// Sort the dictionary inside the dictionary builder in the order of increasing scores
	sort.Sort(&c.dictBuilder)
	return nil
}
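
// processSuperstring extracts dictionary candidates from one superstring: it builds the
// suffix array with divsufsort, filters out suffixes starting at odd offsets (those
// begin mid-pair, on a data byte rather than a 0x01 marker), builds the inverse array,
// computes the LCP array with Kasai's algorithm, and then walks the LCP array scoring
// repeated substrings. A substring of length l with `repeats` non-overlapping
// occurrences gets score repeats*(l-4); e.g. a 10-byte pattern seen 8 times scores 48,
// and is collected only if that clears minPatternScore.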
func (c *CompressorSequential) processSuperstring() error {
	c.divsufsort.ComputeSuffixArray(c.superstring, c.suffixarray[:len(c.superstring)])
	// Filter out suffixes that start at odd positions - we reuse the first half of c.suffixarray for that,
	// because it won't be used after filtration
	n := len(c.superstring) / 2
	saFiltered := c.suffixarray[:n]
	j := 0
	for _, s := range c.suffixarray[:len(c.superstring)] {
		if (s & 1) == 0 {
			saFiltered[j] = s >> 1
			j++
		}
	}
	// Now create an inverted array - we reuse the second half of c.suffixarray for that
	saInverted := c.suffixarray[n : 2*n]
	for i := 0; i < n; i++ {
		saInverted[saFiltered[i]] = int32(i)
	}
	// Create the LCP array (Kasai's algorithm)
	var k int
	// Process all suffixes one by one starting from the
	// first suffix in the superstring
	for i := 0; i < n; i++ {
		/* If the current suffix is at n-1, then we don't
		have a next substring to consider. So lcp is not
		defined for this substring, we put zero. */
		if saInverted[i] == int32(n-1) {
			k = 0
			continue
		}
		/* j contains the index of the next substring to
		be considered when comparing with the present
		substring, i.e., the next string in the suffix array */
		j := int(saFiltered[saInverted[i]+1])
		// Directly start matching from the k'th index as
		// at least k-1 characters will match
		for i+k < n && j+k < n && c.superstring[(i+k)*2] != 0 && c.superstring[(j+k)*2] != 0 && c.superstring[(i+k)*2+1] == c.superstring[(j+k)*2+1] {
			k++
		}
		c.lcp[saInverted[i]] = int32(k) // lcp for the present suffix
		// Deleting the starting character from the string
		if k > 0 {
			k--
		}
	}
	// Walk over the LCP array and compute the scores of the strings
	b := saInverted
	j = 0
	for i := 0; i < n-1; i++ {
		// Only when there is a drop in the LCP value
		if c.lcp[i+1] >= c.lcp[i] {
			j = i
			continue
		}
		for l := c.lcp[i]; l > c.lcp[i+1]; l-- {
			if l < minPatternLen || l > maxPatternLen {
				continue
			}
			// Go back
			var new bool
			for j > 0 && c.lcp[j-1] >= l {
				j--
				new = true
			}
			if !new {
				break
			}
			window := i - j + 2
			copy(b, saFiltered[j:i+2])
			sort.Slice(b[:window], func(i1, i2 int) bool { return b[i1] < b[i2] })
			repeats := 1
			lastK := 0
			for k := 1; k < window; k++ {
				if b[k] >= b[lastK]+l {
					repeats++
					lastK = k
				}
			}
			score := uint64(repeats * int(l-4))
			if score >= c.minPatternScore {
				// The dictionary key is the concatenation of the score and the dictionary word (to later aggregate the scores from multiple chunks)
				c.collectBuf = c.collectBuf[:8]
				for s := int32(0); s < l; s++ {
					c.collectBuf = append(c.collectBuf, c.superstring[(saFiltered[i]+s)*2+1])
				}
				binary.BigEndian.PutUint64(c.collectBuf[:8], score)
				if err := c.collector.Collect(c.collectBuf[8:], c.collectBuf[:8]); err != nil { // The key will be copied by the Collect function
					return fmt.Errorf("collecting %x with score %d: %w", c.collectBuf[8:], score, err)
				}
			}
		}
	}
	c.superstring = c.superstring[:0]
	return nil
}

type DictAggregator struct {
	lastWord      []byte
	lastWordScore uint64
	collector     *etl.Collector
	dist          map[int]int
}

func (da *DictAggregator) processWord(word []byte, score uint64) error {
	var scoreBuf [8]byte
	binary.BigEndian.PutUint64(scoreBuf[:], score)
	return da.collector.Collect(word, scoreBuf[:])
}

func (da *DictAggregator) Load(loadFunc etl.LoadFunc, args etl.TransformArgs) error {
	defer da.collector.Close()
	return da.collector.Load(nil, "", loadFunc, args)
}

func (da *DictAggregator) aggLoadFunc(k, v []byte, table etl.CurrentTableReader, next etl.LoadNextFunc) error {
	if _, ok := da.dist[len(k)]; !ok {
		da.dist[len(k)] = 0
	}
	da.dist[len(k)]++
	score := binary.BigEndian.Uint64(v)
	if bytes.Equal(k, da.lastWord) {
		da.lastWordScore += score
	} else {
		if da.lastWord != nil {
			if err := da.processWord(da.lastWord, da.lastWordScore); err != nil {
				return err
			}
		}
		da.lastWord = append(da.lastWord[:0], k...)
		da.lastWordScore = score
	}
	return nil
}

func (da *DictAggregator) finish() error {
	if da.lastWord != nil {
		return da.processWord(da.lastWord, da.lastWordScore)
	}
	return nil
}

type CompressionRatio float64

func (r CompressionRatio) String() string { return fmt.Sprintf("%.2f", r) }

func Ratio(f1, f2 string) (CompressionRatio, error) {
	s1, err := os.Stat(f1)
	if err != nil {
		return 0, err
	}
	s2, err := os.Stat(f2)
	if err != nil {
		return 0, err
	}
	return CompressionRatio(float64(s1.Size()) / float64(s2.Size())), nil
}
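
// For example, if the intermediate file is 100 MiB and the resulting .seg file is
// 40 MiB, Ratio returns 2.50: the ratio is the uncompressed size divided by the
// compressed size, so higher means better compression (sizes here are illustrative).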

// DecompressedFile - .dat file format - a simple format for a temporary data store
type DecompressedFile struct {
	filePath string
	f        *os.File
	w        *bufio.Writer
	count    uint64
	buf      []byte
}

func NewUncompressedFile(filePath string) (*DecompressedFile, error) {
	f, err := os.Create(filePath)
	if err != nil {
		return nil, err
	}
	w := bufio.NewWriterSize(f, etl.BufIOSize)
	return &DecompressedFile{filePath: filePath, f: f, w: w, buf: make([]byte, 128)}, nil
}

func (f *DecompressedFile) Close() {
	f.w.Flush()
	//f.f.Sync()
	f.f.Close()
	os.Remove(f.filePath)
}

func (f *DecompressedFile) Append(v []byte) error {
	f.count++
	n := binary.PutUvarint(f.buf, uint64(len(v)))
	if _, e := f.w.Write(f.buf[:n]); e != nil {
		return e
	}
	if len(v) > 0 {
		if _, e := f.w.Write(v); e != nil {
			return e
		}
	}
	return nil
}
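
// On disk, the file written by Append is simply a sequence of varint-length-prefixed
// values:
//
//	uvarint(len(v1)) v1 uvarint(len(v2)) v2 ...
//
// An empty value is encoded as the single byte 0x00 with no payload.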

// ForEach reads values from the file and passes them to the walker; the caller uses it to
// generate the superstring (with an extra byte 0x01 prepended to each character, and with
// a 0x00 0x00 pair inserted between words). We only consider values with length > 2,
// because smaller values are not compressible without going into bits
func (f *DecompressedFile) ForEach(walker func(v []byte) error) error {
	_, err := f.f.Seek(0, 0)
	if err != nil {
		return err
	}
	r := bufio.NewReaderSize(f.f, etl.BufIOSize)
	buf := make([]byte, 4096)
	l, e := binary.ReadUvarint(r)
	for ; e == nil; l, e = binary.ReadUvarint(r) {
		if len(buf) < int(l) {
			buf = make([]byte, l)
		}
		if _, e = io.ReadFull(r, buf[:l]); e != nil {
			return e
		}
		if err := walker(buf[:l]); err != nil {
			return err
		}
	}
	if e != nil && !errors.Is(e, io.EOF) {
		return e
	}
	return nil
}