2020-05-30 07:00:35 +00:00
|
|
|
package etl
|
|
|
|
|
|
|
|
import (
|
|
|
|
"bytes"
|
|
|
|
"fmt"
|
2020-06-13 15:03:38 +00:00
|
|
|
"io"
|
2020-08-17 06:45:52 +00:00
|
|
|
"reflect"
|
2020-08-21 06:30:30 +00:00
|
|
|
"runtime"
|
2020-06-13 15:03:38 +00:00
|
|
|
"time"
|
|
|
|
|
2020-08-05 15:33:58 +00:00
|
|
|
"github.com/c2h5oh/datasize"
|
2021-05-20 18:25:53 +00:00
|
|
|
"github.com/ledgerwatch/erigon/common"
|
|
|
|
"github.com/ledgerwatch/erigon/common/dbutils"
|
|
|
|
"github.com/ledgerwatch/erigon/ethdb"
|
|
|
|
"github.com/ledgerwatch/erigon/log"
|
2020-06-10 20:07:14 +00:00
|
|
|
"github.com/ugorji/go/codec"
|
2020-05-30 07:00:35 +00:00
|
|
|
)
|
|
|
|
|
2020-05-31 07:32:33 +00:00
|
|
|
var (
	// cbor is the package-wide codec handle used for CBOR
	// encoding/decoding (shared configuration, safe to reuse).
	cbor codec.CborHandle
)
|
2020-05-30 07:00:35 +00:00
|
|
|
|
|
|
|
// Decoder is a streaming decoder that can be re-pointed at a new input,
// allowing a single instance to be reused across multiple readers.
type Decoder interface {
	// Reset makes the decoder read from a new underlying reader.
	Reset(reader io.Reader)
	// Decode reads the next value from the stream into the given target.
	Decode(interface{}) error
}
|
|
|
|
|
2020-10-19 13:11:01 +00:00
|
|
|
// CurrentTableReader provides read access to the current state of a table,
// so a LoadFunc can inspect the existing value of a key before overwriting it.
type CurrentTableReader interface {
	// Get returns the value currently stored under the given key.
	Get([]byte) ([]byte, error)
}
|
|
|
|
|
2020-06-10 20:07:14 +00:00
|
|
|
// ExtractNextFunc is the callback an ExtractFunc uses to emit a transformed
// (k, v) pair into the collector; originalK is the source key the pair was
// derived from.
type ExtractNextFunc func(originalK, k []byte, v []byte) error

// ExtractFunc transforms a single source (k, v) entry, emitting zero or more
// derived pairs through next.
type ExtractFunc func(k []byte, v []byte, next ExtractNextFunc) error
|
|
|
|
|
2020-06-01 14:14:40 +00:00
|
|
|
// NextKey generates the possible next key w/o changing the key length.
// for [0x01, 0x01, 0x01] it will generate [0x01, 0x01, 0x02], etc
//
// It never mutates its argument. On overflow (all bytes are 0xFF) or for an
// empty key it returns the original key together with a non-nil error.
func NextKey(key []byte) ([]byte, error) {
	if len(key) == 0 {
		return key, fmt.Errorf("could not apply NextKey for the empty key")
	}
	// Work on a copy so the caller's slice is left untouched.
	nextKey := make([]byte, len(key))
	copy(nextKey, key)
	// Increment from the least significant byte, carrying 0xFF -> 0x00.
	for i := len(key) - 1; i >= 0; i-- {
		if nextKey[i] < 0xFF {
			nextKey[i]++
			return nextKey, nil
		}
		// byte is 0xFF: wrap to zero and carry into the next position
		// (the earlier `< 0xFF` check makes a second comparison redundant).
		nextKey[i] = 0
	}
	// Every byte was 0xFF - the successor would not fit in the same length.
	return key, fmt.Errorf("overflow while applying NextKey")
}
|
|
|
|
|
|
|
|
// LoadCommitHandler is a callback called each time a new batch is being
// loaded from files into a DB
// * `key`: last committed key to the database (use etl.NextKey helper to use in LoadStartKey)
// * `isDone`: true, if everything is processed
type LoadCommitHandler func(db ethdb.Putter, key []byte, isDone bool) error

// AdditionalLogArguments produces extra key/value pairs to append to the
// periodic progress log lines, given the entry currently being processed.
type AdditionalLogArguments func(k, v []byte) (additionalLogArguments []interface{})
|
2020-06-01 14:14:40 +00:00
|
|
|
|
|
|
|
// TransformArgs bundles the optional parameters of Transform.
type TransformArgs struct {
	ExtractStartKey []byte // first key to extract from; nil starts at the beginning of the bucket
	ExtractEndKey   []byte // extraction stops once a key greater than this is seen; nil means no upper bound
	FixedBits       int    // number of fixed leading bits of the key prefix, forwarded to ethdb.Walk
	BufferType      int    // which buffer implementation getBufferByType should construct
	BufferSize      int    // buffer size in bytes; values <= 0 fall back to BufferOptimalSize
	Quit            <-chan struct{} // closed to request cancellation of the transform

	LogDetailsExtract AdditionalLogArguments // extra log fields for the extraction phase; may be nil
	LogDetailsLoad    AdditionalLogArguments // extra log fields for the load phase; may be nil

	Comparator dbutils.CmpFunc // optional custom key comparator used while loading
}
|
2020-05-30 07:00:35 +00:00
|
|
|
|
2020-05-31 12:23:34 +00:00
|
|
|
func Transform(
|
2020-10-21 17:01:40 +00:00
|
|
|
logPrefix string,
|
2021-04-02 11:22:25 +00:00
|
|
|
db ethdb.RwTx,
|
2020-08-10 23:55:32 +00:00
|
|
|
fromBucket string,
|
|
|
|
toBucket string,
|
2020-10-23 11:18:45 +00:00
|
|
|
tmpdir string,
|
2020-05-31 12:23:34 +00:00
|
|
|
extractFunc ExtractFunc,
|
|
|
|
loadFunc LoadFunc,
|
2020-06-01 14:14:40 +00:00
|
|
|
args TransformArgs,
|
2020-05-31 12:23:34 +00:00
|
|
|
) error {
|
2020-06-10 20:07:14 +00:00
|
|
|
bufferSize := BufferOptimalSize
|
|
|
|
if args.BufferSize > 0 {
|
2020-10-27 15:53:49 +00:00
|
|
|
bufferSize = datasize.ByteSize(args.BufferSize)
|
2020-05-30 07:00:35 +00:00
|
|
|
}
|
2020-06-10 20:07:14 +00:00
|
|
|
buffer := getBufferByType(args.BufferType, bufferSize)
|
2020-10-23 11:18:45 +00:00
|
|
|
collector := NewCollector(tmpdir, buffer)
|
2020-06-10 20:07:14 +00:00
|
|
|
|
|
|
|
t := time.Now()
|
2020-10-21 17:01:40 +00:00
|
|
|
if err := extractBucketIntoFiles(logPrefix, db, fromBucket, args.ExtractStartKey, args.ExtractEndKey, args.FixedBits, collector, extractFunc, args.Quit, args.LogDetailsExtract); err != nil {
|
|
|
|
disposeProviders(logPrefix, collector.dataProviders)
|
2020-06-28 06:10:27 +00:00
|
|
|
return err
|
2020-06-10 20:07:14 +00:00
|
|
|
}
|
2020-10-21 17:01:40 +00:00
|
|
|
log.Debug(fmt.Sprintf("[%s] Extraction finished", logPrefix), "it took", time.Since(t))
|
2020-06-10 20:07:14 +00:00
|
|
|
|
2020-10-21 17:01:40 +00:00
|
|
|
defer func(t time.Time) {
|
|
|
|
log.Debug(fmt.Sprintf("[%s] Collection finished", logPrefix), "it took", time.Since(t))
|
|
|
|
}(time.Now())
|
|
|
|
return collector.Load(logPrefix, db, toBucket, loadFunc, args)
|
2020-05-31 12:23:34 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// extractBucketIntoFiles walks `bucket` starting at `startkey` (with
// `fixedBits` of the key prefix fixed), feeds each (k, v) pair to extractFunc,
// and finally flushes the collector's remaining buffer to a sorted temp file.
// Walking stops early when a key greater than `endkey` is reached (if endkey
// is non-nil) or when `quit` signals cancellation. Progress is logged every
// 30 seconds, with extra fields from additionalLogArguments when provided.
func extractBucketIntoFiles(
	logPrefix string,
	db ethdb.Tx,
	bucket string,
	startkey []byte,
	endkey []byte,
	fixedBits int,
	collector *Collector,
	extractFunc ExtractFunc,
	quit <-chan struct{},
	additionalLogArguments AdditionalLogArguments,
) error {
	logEvery := time.NewTicker(30 * time.Second)
	defer logEvery.Stop()
	// Reused across log ticks to avoid re-allocating the stats struct.
	var m runtime.MemStats

	c, err := db.Cursor(bucket)
	if err != nil {
		return err
	}
	defer c.Close()
	if err := ethdb.Walk(c, startkey, fixedBits, func(k, v []byte) (bool, error) {
		// Honor cancellation before doing any work on this entry.
		if err := common.Stopped(quit); err != nil {
			return false, err
		}

		// Non-blocking check: emit a progress line only when the ticker fires.
		select {
		default:
		case <-logEvery.C:
			logArs := []interface{}{"from", bucket}
			if additionalLogArguments != nil {
				logArs = append(logArs, additionalLogArguments(k, v)...)
			} else {
				logArs = append(logArs, "current key", makeCurrentKeyStr(k))
			}

			runtime.ReadMemStats(&m)
			logArs = append(logArs, "alloc", common.StorageSize(m.Alloc), "sys", common.StorageSize(m.Sys))
			log.Info(fmt.Sprintf("[%s] ETL [1/2] Extracting", logPrefix), logArs...)
		}
		// endkey is inclusive: stop only once the key is strictly greater.
		if endkey != nil && bytes.Compare(k, endkey) > 0 {
			return false, nil
		}
		if err := extractFunc(k, v, collector.extractNextFunc); err != nil {
			return false, err
		}
		return true, nil
	}); err != nil {
		return err
	}
	// Flush whatever remains in the buffer to disk (allEnough=true).
	return collector.flushBuffer(nil, true)
}
|
2020-10-21 17:01:40 +00:00
|
|
|
func disposeProviders(logPrefix string, providers []dataProvider) {
|
2020-08-05 15:33:58 +00:00
|
|
|
totalSize := uint64(0)
|
2020-05-31 07:32:33 +00:00
|
|
|
for _, p := range providers {
|
2020-08-05 15:33:58 +00:00
|
|
|
providerSize, err := p.Dispose()
|
2020-05-30 07:00:35 +00:00
|
|
|
if err != nil {
|
2020-10-21 17:01:40 +00:00
|
|
|
log.Warn(fmt.Sprintf("[%s] promoting hashed state, error while disposing provider", logPrefix), "provider", p, "err", err)
|
2020-05-30 07:00:35 +00:00
|
|
|
}
|
2020-08-05 15:33:58 +00:00
|
|
|
totalSize += providerSize
|
2020-05-30 07:00:35 +00:00
|
|
|
}
|
2020-08-06 07:33:09 +00:00
|
|
|
if totalSize > 0 {
|
2020-10-21 17:01:40 +00:00
|
|
|
log.Info(fmt.Sprintf("[%s] etl: temp files removed successfully", logPrefix), "total size", datasize.ByteSize(totalSize).HumanReadable())
|
2020-08-06 07:33:09 +00:00
|
|
|
}
|
2020-05-30 07:00:35 +00:00
|
|
|
}
|
|
|
|
|
2020-10-19 13:11:01 +00:00
|
|
|
// currentTableReader implements CurrentTableReader on top of a database
// transaction and a fixed bucket name.
type currentTableReader struct {
	getter ethdb.Tx // transaction used for lookups
	bucket string   // bucket all Get calls read from
}
|
|
|
|
|
2020-10-19 13:11:01 +00:00
|
|
|
// Get returns the value currently stored under key in the reader's bucket.
func (s *currentTableReader) Get(key []byte) ([]byte, error) {
	return s.getter.GetOne(s.bucket, key)
}
|
2020-05-30 13:44:54 +00:00
|
|
|
|
2020-05-31 12:23:34 +00:00
|
|
|
// IdentityLoadFunc loads entries as they are, without transformation
var IdentityLoadFunc LoadFunc = func(k []byte, value []byte, _ CurrentTableReader, next LoadNextFunc) error {
	// The key is passed both as originalK and k: nothing was re-keyed.
	return next(k, k, value)
}
|
2020-08-17 06:45:52 +00:00
|
|
|
|
|
|
|
func isIdentityLoadFunc(f LoadFunc) bool {
|
|
|
|
return f == nil || reflect.ValueOf(IdentityLoadFunc).Pointer() == reflect.ValueOf(f).Pointer()
|
|
|
|
}
|