2020-06-10 20:07:14 +00:00
|
|
|
package etl
|
|
|
|
|
|
|
|
import (
|
2020-08-17 06:45:52 +00:00
|
|
|
"bytes"
|
2020-06-10 20:07:14 +00:00
|
|
|
"container/heap"
|
|
|
|
"fmt"
|
2020-07-07 04:00:25 +00:00
|
|
|
"io"
|
2020-10-17 13:00:12 +00:00
|
|
|
"io/ioutil"
|
|
|
|
"os"
|
|
|
|
"path/filepath"
|
2020-07-07 04:00:25 +00:00
|
|
|
"runtime"
|
2020-08-17 06:45:52 +00:00
|
|
|
"time"
|
2020-07-07 04:00:25 +00:00
|
|
|
|
2021-05-20 18:25:53 +00:00
|
|
|
"github.com/ledgerwatch/erigon/common"
|
|
|
|
"github.com/ledgerwatch/erigon/common/dbutils"
|
|
|
|
"github.com/ledgerwatch/erigon/ethdb"
|
|
|
|
"github.com/ledgerwatch/erigon/log"
|
2020-06-10 20:07:14 +00:00
|
|
|
"github.com/ugorji/go/codec"
|
|
|
|
)
|
|
|
|
|
2020-10-23 11:18:45 +00:00
|
|
|
// TmpDirName is the directory name "etl-temp".
// NOTE(review): presumably the sub-directory callers create for ETL temporary
// files; it is not referenced in this part of the file, so confirm against callers.
const TmpDirName = "etl-temp"
|
|
|
|
|
2020-07-01 14:56:56 +00:00
|
|
|
// LoadNextFunc is the callback handed to a LoadFunc for passing an entry on.
// It receives the original key together with the key and value to store.
// The implementation built in loadFilesIntoBucket writes k/v through a cursor
// on the target bucket and treats an empty v as a deletion of k.
type LoadNextFunc func(originalK, k, v []byte) error
|
2020-10-19 13:11:01 +00:00
|
|
|
// LoadFunc is called by the loading loop once per collected entry. It receives
// the entry's key and value, a reader for the target table, and the next
// callback it should use to pass entries on.
type LoadFunc func(k []byte, value []byte, table CurrentTableReader, next LoadNextFunc) error
|
2020-06-10 20:07:14 +00:00
|
|
|
|
|
|
|
// Collector performs the job of ETL Transform, but can also be used without "E" (Extract) part
|
|
|
|
// as a Collect Transform Load
|
|
|
|
type Collector struct {
|
|
|
|
extractNextFunc ExtractNextFunc
|
|
|
|
flushBuffer func([]byte, bool) error
|
|
|
|
dataProviders []dataProvider
|
|
|
|
allFlushed bool
|
2020-10-25 08:38:55 +00:00
|
|
|
autoClean bool
|
2020-10-17 13:00:12 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// NewCollectorFromFiles creates collector from existing files (left over from previous unsuccessful loading)
|
2020-10-23 11:18:45 +00:00
|
|
|
func NewCollectorFromFiles(tmpdir string) (*Collector, error) {
|
|
|
|
if _, err := os.Stat(tmpdir); os.IsNotExist(err) {
|
2020-10-17 13:00:12 +00:00
|
|
|
return nil, nil
|
|
|
|
}
|
2020-10-23 11:18:45 +00:00
|
|
|
fileInfos, err := ioutil.ReadDir(tmpdir)
|
2020-10-17 13:00:12 +00:00
|
|
|
if err != nil {
|
2020-10-23 11:18:45 +00:00
|
|
|
return nil, fmt.Errorf("collector from files - reading directory %s: %w", tmpdir, err)
|
2020-10-17 13:00:12 +00:00
|
|
|
}
|
|
|
|
if len(fileInfos) == 0 {
|
|
|
|
return nil, nil
|
|
|
|
}
|
|
|
|
dataProviders := make([]dataProvider, len(fileInfos))
|
|
|
|
for i, fileInfo := range fileInfos {
|
|
|
|
var dataProvider fileDataProvider
|
2020-10-23 11:18:45 +00:00
|
|
|
dataProvider.file, err = os.Open(filepath.Join(tmpdir, fileInfo.Name()))
|
2020-10-17 13:00:12 +00:00
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("collector from files - opening file %s: %w", fileInfo.Name(), err)
|
|
|
|
}
|
|
|
|
dataProviders[i] = &dataProvider
|
|
|
|
}
|
2020-10-25 08:38:55 +00:00
|
|
|
return &Collector{dataProviders: dataProviders, allFlushed: true, autoClean: false}, nil
|
2020-10-17 13:00:12 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// NewCriticalCollector does not clean up temporary files if loading has failed
|
2020-10-23 11:18:45 +00:00
|
|
|
func NewCriticalCollector(tmpdir string, sortableBuffer Buffer) *Collector {
|
|
|
|
c := NewCollector(tmpdir, sortableBuffer)
|
2020-10-25 08:38:55 +00:00
|
|
|
c.autoClean = false
|
2020-10-17 13:00:12 +00:00
|
|
|
return c
|
2020-06-10 20:07:14 +00:00
|
|
|
}
|
|
|
|
|
2020-10-23 11:18:45 +00:00
|
|
|
func NewCollector(tmpdir string, sortableBuffer Buffer) *Collector {
|
2020-10-25 08:38:55 +00:00
|
|
|
c := &Collector{autoClean: true}
|
2020-06-10 20:07:14 +00:00
|
|
|
encoder := codec.NewEncoder(nil, &cbor)
|
|
|
|
|
|
|
|
c.flushBuffer = func(currentKey []byte, canStoreInRam bool) error {
|
|
|
|
if sortableBuffer.Len() == 0 {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
var provider dataProvider
|
|
|
|
var err error
|
|
|
|
sortableBuffer.Sort()
|
|
|
|
if canStoreInRam && len(c.dataProviders) == 0 {
|
|
|
|
provider = KeepInRAM(sortableBuffer)
|
|
|
|
c.allFlushed = true
|
|
|
|
} else {
|
2020-10-23 11:18:45 +00:00
|
|
|
provider, err = FlushToDisk(encoder, currentKey, sortableBuffer, tmpdir)
|
2020-06-10 20:07:14 +00:00
|
|
|
}
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
if provider != nil {
|
|
|
|
c.dataProviders = append(c.dataProviders, provider)
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
c.extractNextFunc = func(originalK, k []byte, v []byte) error {
|
|
|
|
sortableBuffer.Put(common.CopyBytes(k), common.CopyBytes(v))
|
|
|
|
if sortableBuffer.CheckFlushSize() {
|
|
|
|
if err := c.flushBuffer(originalK, false); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
return c
|
|
|
|
}
|
|
|
|
|
|
|
|
func (c *Collector) Collect(k, v []byte) error {
|
|
|
|
return c.extractNextFunc(k, k, v)
|
|
|
|
}
|
|
|
|
|
2021-04-02 11:22:25 +00:00
|
|
|
func (c *Collector) Load(logPrefix string, db ethdb.RwTx, toBucket string, loadFunc LoadFunc, args TransformArgs) error {
|
2020-10-19 19:20:18 +00:00
|
|
|
defer func() {
|
2020-10-25 08:38:55 +00:00
|
|
|
if c.autoClean {
|
|
|
|
c.Close(logPrefix)
|
2020-10-19 19:20:18 +00:00
|
|
|
}
|
|
|
|
}()
|
2020-06-10 20:07:14 +00:00
|
|
|
if !c.allFlushed {
|
2021-03-29 03:58:45 +00:00
|
|
|
if e := c.flushBuffer(nil, true); e != nil {
|
|
|
|
return e
|
2020-06-10 20:07:14 +00:00
|
|
|
}
|
|
|
|
}
|
2021-03-29 03:58:45 +00:00
|
|
|
if err := loadFilesIntoBucket(logPrefix, db, toBucket, c.dataProviders, loadFunc, args); err != nil {
|
2020-10-17 13:00:12 +00:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
return nil
|
2020-06-10 20:07:14 +00:00
|
|
|
}
|
|
|
|
|
2020-10-25 08:38:55 +00:00
|
|
|
func (c *Collector) Close(logPrefix string) {
|
|
|
|
disposeProviders(logPrefix, c.dataProviders)
|
|
|
|
}
|
|
|
|
|
2021-04-02 11:22:25 +00:00
|
|
|
func loadFilesIntoBucket(logPrefix string, db ethdb.RwTx, bucket string, providers []dataProvider, loadFunc LoadFunc, args TransformArgs) error {
|
2020-06-10 20:07:14 +00:00
|
|
|
decoder := codec.NewDecoder(nil, &cbor)
|
|
|
|
var m runtime.MemStats
|
2020-09-10 12:35:58 +00:00
|
|
|
|
|
|
|
h := &Heap{comparator: args.Comparator}
|
2020-06-10 20:07:14 +00:00
|
|
|
heap.Init(h)
|
|
|
|
for i, provider := range providers {
|
|
|
|
if key, value, err := provider.Next(decoder); err == nil {
|
|
|
|
he := HeapElem{key, i, value}
|
|
|
|
heap.Push(h, he)
|
|
|
|
} else /* we must have at least one entry per file */ {
|
2020-10-21 17:01:40 +00:00
|
|
|
eee := fmt.Errorf("%s: error reading first readers: n=%d current=%d provider=%s err=%v",
|
|
|
|
logPrefix, len(providers), i, provider, err)
|
2020-06-10 20:07:14 +00:00
|
|
|
panic(eee)
|
|
|
|
}
|
|
|
|
}
|
2021-04-03 00:54:02 +00:00
|
|
|
var c ethdb.RwCursor
|
2021-04-02 11:22:25 +00:00
|
|
|
|
|
|
|
currentTable := ¤tTableReader{db, bucket}
|
2020-08-17 06:45:52 +00:00
|
|
|
haveSortingGuaranties := isIdentityLoadFunc(loadFunc) // user-defined loadFunc may change ordering
|
|
|
|
var lastKey []byte
|
|
|
|
if bucket != "" { // passing empty bucket name is valid case for etl when DB modification is not expected
|
2021-04-03 00:54:02 +00:00
|
|
|
var err error
|
|
|
|
c, err = db.RwCursor(bucket)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2020-08-17 06:45:52 +00:00
|
|
|
var errLast error
|
2021-04-02 11:22:25 +00:00
|
|
|
lastKey, _, errLast = c.Last()
|
2020-08-17 06:45:52 +00:00
|
|
|
if errLast != nil {
|
|
|
|
return errLast
|
|
|
|
}
|
|
|
|
}
|
|
|
|
var canUseAppend bool
|
2020-11-28 14:24:47 +00:00
|
|
|
isDupSort := dbutils.BucketsConfigs[bucket].Flags&dbutils.DupSort != 0 && !dbutils.BucketsConfigs[bucket].AutoDupSortKeysConversion
|
2020-06-10 20:07:14 +00:00
|
|
|
|
2020-08-22 10:12:33 +00:00
|
|
|
logEvery := time.NewTicker(30 * time.Second)
|
|
|
|
defer logEvery.Stop()
|
|
|
|
|
2020-08-17 06:45:52 +00:00
|
|
|
i := 0
|
2020-07-01 14:56:56 +00:00
|
|
|
loadNextFunc := func(originalK, k, v []byte) error {
|
2020-08-17 06:45:52 +00:00
|
|
|
if i == 0 {
|
|
|
|
isEndOfBucket := lastKey == nil || bytes.Compare(lastKey, k) == -1
|
|
|
|
canUseAppend = haveSortingGuaranties && isEndOfBucket
|
|
|
|
}
|
|
|
|
i++
|
2020-08-22 10:12:33 +00:00
|
|
|
|
|
|
|
select {
|
|
|
|
default:
|
|
|
|
case <-logEvery.C:
|
|
|
|
logArs := []interface{}{"into", bucket}
|
|
|
|
if args.LogDetailsLoad != nil {
|
|
|
|
logArs = append(logArs, args.LogDetailsLoad(k, v)...)
|
|
|
|
} else {
|
|
|
|
logArs = append(logArs, "current key", makeCurrentKeyStr(k))
|
|
|
|
}
|
|
|
|
|
2020-08-17 06:45:52 +00:00
|
|
|
runtime.ReadMemStats(&m)
|
2021-07-03 07:44:23 +00:00
|
|
|
logArs = append(logArs, "alloc", common.StorageSize(m.Alloc), "sys", common.StorageSize(m.Sys))
|
2020-10-21 17:01:40 +00:00
|
|
|
log.Info(fmt.Sprintf("[%s] ETL [2/2] Loading", logPrefix), logArs...)
|
2020-08-22 10:12:33 +00:00
|
|
|
}
|
2020-08-17 06:45:52 +00:00
|
|
|
|
|
|
|
if canUseAppend && len(v) == 0 {
|
|
|
|
return nil // nothing to delete after end of bucket
|
|
|
|
}
|
2020-06-10 20:07:14 +00:00
|
|
|
if len(v) == 0 {
|
2021-04-02 11:22:25 +00:00
|
|
|
if err := c.Delete(k, nil); err != nil {
|
2020-06-10 20:07:14 +00:00
|
|
|
return err
|
|
|
|
}
|
2020-08-17 06:45:52 +00:00
|
|
|
return nil
|
2020-06-10 20:07:14 +00:00
|
|
|
}
|
2020-08-17 06:45:52 +00:00
|
|
|
if canUseAppend {
|
2020-11-28 14:24:47 +00:00
|
|
|
if isDupSort {
|
2021-06-19 20:29:02 +00:00
|
|
|
if err := c.(ethdb.RwCursorDupSort).AppendDup(k, v); err != nil {
|
|
|
|
return fmt.Errorf("%s: bucket: %s, appendDup: k=%x, %w", logPrefix, bucket, k, err)
|
2020-11-28 14:24:47 +00:00
|
|
|
}
|
|
|
|
} else {
|
2021-04-02 11:22:25 +00:00
|
|
|
if err := c.Append(k, v); err != nil {
|
2021-06-19 20:29:02 +00:00
|
|
|
return fmt.Errorf("%s: bucket: %s, append: k=%x, v=%x, %w", logPrefix, bucket, k, v, err)
|
2020-11-28 14:24:47 +00:00
|
|
|
}
|
2020-06-18 18:14:10 +00:00
|
|
|
}
|
2020-11-28 14:24:47 +00:00
|
|
|
|
2020-08-17 06:45:52 +00:00
|
|
|
return nil
|
|
|
|
}
|
2021-04-02 11:22:25 +00:00
|
|
|
if err := c.Put(k, v); err != nil {
|
2020-10-21 17:01:40 +00:00
|
|
|
return fmt.Errorf("%s: put: k=%x, %w", logPrefix, k, err)
|
2020-06-10 20:07:14 +00:00
|
|
|
}
|
2020-06-28 06:10:27 +00:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
// Main loading loop
|
|
|
|
for h.Len() > 0 {
|
|
|
|
if err := common.Stopped(args.Quit); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
element := (heap.Pop(h)).(HeapElem)
|
|
|
|
provider := providers[element.TimeIdx]
|
2020-10-19 13:11:01 +00:00
|
|
|
err := loadFunc(element.Key, element.Value, currentTable, loadNextFunc)
|
2020-06-28 06:10:27 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
2020-06-10 20:07:14 +00:00
|
|
|
}
|
2020-06-28 06:10:27 +00:00
|
|
|
if element.Key, element.Value, err = provider.Next(decoder); err == nil {
|
|
|
|
heap.Push(h, element)
|
|
|
|
} else if err != io.EOF {
|
2020-10-21 17:01:40 +00:00
|
|
|
return fmt.Errorf("%s: error while reading next element from disk: %v", logPrefix, err)
|
2020-06-28 06:10:27 +00:00
|
|
|
}
|
|
|
|
}
|
2020-08-17 06:45:52 +00:00
|
|
|
|
2020-06-28 06:10:27 +00:00
|
|
|
runtime.ReadMemStats(&m)
|
2020-07-10 21:37:34 +00:00
|
|
|
log.Debug(
|
2020-10-21 17:01:40 +00:00
|
|
|
fmt.Sprintf("[%s] Committed batch", logPrefix),
|
2020-08-10 23:55:32 +00:00
|
|
|
"bucket", bucket,
|
2020-08-22 10:12:33 +00:00
|
|
|
"records", i,
|
2020-06-28 06:10:27 +00:00
|
|
|
"current key", makeCurrentKeyStr(nil),
|
2021-07-03 07:44:23 +00:00
|
|
|
"alloc", common.StorageSize(m.Alloc), "sys", common.StorageSize(m.Sys))
|
2020-06-10 20:07:14 +00:00
|
|
|
|
2020-06-28 06:10:27 +00:00
|
|
|
return nil
|
2020-06-10 20:07:14 +00:00
|
|
|
}
|
2020-06-21 14:13:17 +00:00
|
|
|
|
|
|
|
// makeCurrentKeyStr renders a key for log output:
//   - a nil key becomes "final";
//   - a key shorter than 4 bytes is printed in full as hex;
//   - a key of at least 8 bytes whose first four bytes are all zero is printed
//     in full as hex (leading zeroes alone would carry little information);
//   - any other key is shortened to the hex of its first 4 bytes plus "...".
func makeCurrentKeyStr(k []byte) string {
	switch {
	case k == nil:
		return "final"
	case len(k) < 4:
		return fmt.Sprintf("%x", k)
	case len(k) >= 8 && k[0] == 0 && k[1] == 0 && k[2] == 0 && k[3] == 0:
		// if key has leading zeroes, show a bit more info
		return fmt.Sprintf("%x", k)
	default:
		return fmt.Sprintf("%x...", k[:4])
	}
}
|