erigon-pulse/common/etl/etl.go

189 lines
5.0 KiB
Go
Raw Normal View History

package etl
import (
"bytes"
"fmt"
"io"
"reflect"
"runtime"
"time"
2020-08-05 15:33:58 +00:00
"github.com/c2h5oh/datasize"
"github.com/ledgerwatch/erigon/common"
"github.com/ledgerwatch/erigon/common/dbutils"
"github.com/ledgerwatch/erigon/ethdb"
"github.com/ledgerwatch/erigon/log"
"github.com/ugorji/go/codec"
)
// cbor is the package-level CBOR codec handle (zero-value configuration).
// Its users are not visible in this part of the file.
var cbor codec.CborHandle
// Decoder is the minimal decoding contract used by this package: a decoder
// that can be pointed at an input stream and then asked to decode values.
// NOTE(review): the method set mirrors the ugorji codec decoder — presumably
// that is the intended implementation; confirm against the data providers.
type Decoder interface {
	// Reset hands the decoder the reader it should consume from.
	Reset(reader io.Reader)
	// Decode decodes into v and reports any failure.
	Decode(v interface{}) error
}
// CurrentTableReader is a read-only lookup: it returns the value stored
// under key, or an error. It is handed to load functions (see
// IdentityLoadFunc's signature).
type CurrentTableReader interface {
	Get(key []byte) ([]byte, error)
}
// ExtractNextFunc is the continuation an ExtractFunc calls to emit an entry:
// the original source key, followed by the key/value pair to emit.
type ExtractNextFunc func(originalK, k, v []byte) error

// ExtractFunc is invoked for every key/value pair visited during extraction;
// it emits zero or more entries by calling next.
type ExtractFunc func(k, v []byte, next ExtractNextFunc) error
// NextKey generates the possible next key w/o changing the key length.
// for [0x01, 0x01, 0x01] it will generate [0x01, 0x01, 0x02], etc
//
// The key is treated as a big-endian counter and incremented by one on a
// private copy; the caller's slice is never modified.
//
// An error is returned when key is empty or when every byte is 0xFF (the
// increment would need an extra byte). In both error cases the original key
// is returned unchanged alongside the error.
func NextKey(key []byte) ([]byte, error) {
	if len(key) == 0 {
		return key, fmt.Errorf("could not apply NextKey for the empty key")
	}
	nextKey := make([]byte, len(key))
	copy(nextKey, key)
	// Walk from the least significant (last) byte towards the first,
	// propagating the carry.
	for i := len(nextKey) - 1; i >= 0; i-- {
		if nextKey[i] < 0xFF {
			nextKey[i]++
			return nextKey, nil
		}
		// The byte is 0xFF: it wraps to zero and the carry moves left.
		nextKey[i] = 0
	}
	return key, fmt.Errorf("overflow while applying NextKey")
}
// LoadCommitHandler is a callback called each time a new batch is being
// loaded from files into a DB
// * `db`: the ethdb.Putter handed to the callback (its call site is outside this section)
// * `key`: last committed key to the database (use etl.NextKey helper to use in LoadStartKey)
// * `isDone`: true, if everything is processed
type LoadCommitHandler func(db ethdb.Putter, key []byte, isDone bool) error
// AdditionalLogArguments produces extra log arguments for the key/value pair
// currently being processed. During extraction the returned slice is appended
// to the periodic progress log line in place of the default "current key"
// field.
type AdditionalLogArguments func(k, v []byte) []interface{}
// TransformArgs bundles the optional settings accepted by Transform.
type TransformArgs struct {
// ExtractStartKey is forwarded to ethdb.Walk as the key to start extraction from.
ExtractStartKey []byte
// ExtractEndKey, when non-nil, stops extraction at the first key that
// compares greater than it (bytes.Compare), so the end key itself is included.
ExtractEndKey []byte
// FixedBits is forwarded unchanged to ethdb.Walk during extraction.
FixedBits int
// BufferType selects the buffer implementation via getBufferByType.
BufferType int
// BufferSize, when > 0, overrides BufferOptimalSize as the buffer size.
BufferSize int
// Quit is polled (common.Stopped) for every visited entry during extraction;
// a non-nil result aborts the extraction with that error.
Quit <-chan struct{}
// LogDetailsExtract, when non-nil, supplies the fields of the periodic
// extraction progress log instead of the default "current key" field.
LogDetailsExtract AdditionalLogArguments
// LogDetailsLoad is not read in this section; presumably the load-phase
// counterpart of LogDetailsExtract, consumed by Collector.Load — TODO confirm.
LogDetailsLoad AdditionalLogArguments
// Comparator is not read in this section; it travels to Collector.Load as
// part of args — presumably a custom key ordering, TODO confirm.
Comparator dbutils.CmpFunc
}
func Transform(
logPrefix string,
2021-04-02 11:22:25 +00:00
db ethdb.RwTx,
2020-08-10 23:55:32 +00:00
fromBucket string,
toBucket string,
tmpdir string,
extractFunc ExtractFunc,
loadFunc LoadFunc,
args TransformArgs,
) error {
bufferSize := BufferOptimalSize
if args.BufferSize > 0 {
2020-10-27 15:53:49 +00:00
bufferSize = datasize.ByteSize(args.BufferSize)
}
buffer := getBufferByType(args.BufferType, bufferSize)
collector := NewCollector(tmpdir, buffer)
t := time.Now()
if err := extractBucketIntoFiles(logPrefix, db, fromBucket, args.ExtractStartKey, args.ExtractEndKey, args.FixedBits, collector, extractFunc, args.Quit, args.LogDetailsExtract); err != nil {
disposeProviders(logPrefix, collector.dataProviders)
return err
}
log.Debug(fmt.Sprintf("[%s] Extraction finished", logPrefix), "it took", time.Since(t))
defer func(t time.Time) {
log.Debug(fmt.Sprintf("[%s] Collection finished", logPrefix), "it took", time.Since(t))
}(time.Now())
return collector.Load(logPrefix, db, toBucket, loadFunc, args)
}
func extractBucketIntoFiles(
logPrefix string,
2021-04-02 11:22:25 +00:00
db ethdb.Tx,
2020-08-10 23:55:32 +00:00
bucket string,
startkey []byte,
endkey []byte,
fixedBits int,
collector *Collector,
extractFunc ExtractFunc,
quit <-chan struct{},
additionalLogArguments AdditionalLogArguments,
) error {
logEvery := time.NewTicker(30 * time.Second)
defer logEvery.Stop()
var m runtime.MemStats
2021-04-02 11:22:25 +00:00
c, err := db.Cursor(bucket)
if err != nil {
return err
}
defer c.Close()
if err := ethdb.Walk(c, startkey, fixedBits, func(k, v []byte) (bool, error) {
if err := common.Stopped(quit); err != nil {
return false, err
}
select {
default:
case <-logEvery.C:
logArs := []interface{}{"from", bucket}
if additionalLogArguments != nil {
logArs = append(logArs, additionalLogArguments(k, v)...)
} else {
logArs = append(logArs, "current key", makeCurrentKeyStr(k))
}
runtime.ReadMemStats(&m)
2021-07-03 07:44:23 +00:00
logArs = append(logArs, "alloc", common.StorageSize(m.Alloc), "sys", common.StorageSize(m.Sys))
log.Info(fmt.Sprintf("[%s] ETL [1/2] Extracting", logPrefix), logArs...)
}
if endkey != nil && bytes.Compare(k, endkey) > 0 {
return false, nil
}
if err := extractFunc(k, v, collector.extractNextFunc); err != nil {
return false, err
}
return true, nil
}); err != nil {
return err
}
return collector.flushBuffer(nil, true)
}
func disposeProviders(logPrefix string, providers []dataProvider) {
2020-08-05 15:33:58 +00:00
totalSize := uint64(0)
for _, p := range providers {
2020-08-05 15:33:58 +00:00
providerSize, err := p.Dispose()
if err != nil {
log.Warn(fmt.Sprintf("[%s] promoting hashed state, error while disposing provider", logPrefix), "provider", p, "err", err)
}
2020-08-05 15:33:58 +00:00
totalSize += providerSize
}
if totalSize > 0 {
log.Info(fmt.Sprintf("[%s] etl: temp files removed successfully", logPrefix), "total size", datasize.ByteSize(totalSize).HumanReadable())
}
}
type currentTableReader struct {
2021-04-02 11:22:25 +00:00
getter ethdb.Tx
2020-08-10 23:55:32 +00:00
bucket string
}
func (s *currentTableReader) Get(key []byte) ([]byte, error) {
2021-04-02 11:22:25 +00:00
return s.getter.GetOne(s.bucket, key)
}
// IdentityLoadFunc loads entries as they are, without transformation: the
// incoming key is passed on as both the original key and the key to store,
// with the value untouched. The table reader argument is ignored.
var IdentityLoadFunc LoadFunc = func(key []byte, val []byte, _ CurrentTableReader, emit LoadNextFunc) error {
	return emit(key, key, val)
}
// isIdentityLoadFunc reports whether f is nil or is IdentityLoadFunc itself.
// Go funcs cannot be compared with ==, so identity is decided by comparing
// the code pointers obtained through reflection.
func isIdentityLoadFunc(f LoadFunc) bool {
	if f == nil {
		return true
	}
	identityPtr := reflect.ValueOf(IdentityLoadFunc).Pointer()
	candidatePtr := reflect.ValueOf(f).Pointer()
	return identityPtr == candidatePtr
}