2020-05-30 07:00:35 +00:00
|
|
|
package etl
|
|
|
|
|
|
|
|
import (
|
|
|
|
"bytes"
|
|
|
|
"container/heap"
|
|
|
|
"fmt"
|
|
|
|
"io"
|
|
|
|
"runtime"
|
|
|
|
|
2020-05-30 13:44:54 +00:00
|
|
|
"github.com/ugorji/go/codec"
|
|
|
|
|
2020-05-30 07:00:35 +00:00
|
|
|
"github.com/ledgerwatch/turbo-geth/common"
|
|
|
|
"github.com/ledgerwatch/turbo-geth/ethdb"
|
|
|
|
"github.com/ledgerwatch/turbo-geth/log"
|
|
|
|
)
|
|
|
|
|
2020-05-31 07:32:33 +00:00
|
|
|
var (
	// cbor is the shared CBOR handle used by every encoder/decoder in this package.
	cbor codec.CborHandle

	// bufferOptimalSize is the flush threshold (in bytes) for the in-memory sort buffer.
	bufferOptimalSize = 256 * 1024 * 1024 /* 256 mb | var because we want to sometimes change it from tests */
)
|
2020-05-30 07:00:35 +00:00
|
|
|
|
|
|
|
// Decoder decodes a serialized value into the destination passed to Decode.
type Decoder interface {
	// Decode deserializes the next value into the provided destination.
	Decode(interface{}) error
}
|
|
|
|
|
|
|
|
// State gives a LoadFunc read access to the destination bucket and a way
// to observe cancellation.
type State interface {
	// Get returns the value currently stored under the given key.
	Get([]byte) ([]byte, error)
	// Stopped returns a non-nil error once processing has been cancelled.
	Stopped() error
}
|
|
|
|
|
|
|
|
// ExtractNextFunc is the callback an ExtractFunc uses to emit a key/value pair
// into the ETL pipeline; v is CBOR-encoded before being buffered.
type ExtractNextFunc func(k []byte, v interface{}) error

// ExtractFunc transforms one source key/value pair, emitting zero or more
// pairs through next.
type ExtractFunc func(k []byte, v []byte, next ExtractNextFunc) error

// LoadNextFunc is the callback a LoadFunc uses to write a key/value pair to
// the destination bucket.
type LoadNextFunc func(k []byte, v []byte) error

// LoadFunc consumes one extracted entry (key plus a decoder positioned over
// its encoded value) and writes results through next, consulting state as needed.
type LoadFunc func(k []byte, valueDecoder Decoder, state State, next LoadNextFunc) error
|
|
|
|
|
|
|
|
func Transform(
|
|
|
|
db ethdb.Database,
|
|
|
|
fromBucket []byte,
|
|
|
|
toBucket []byte,
|
|
|
|
datadir string,
|
|
|
|
startkey []byte,
|
|
|
|
extractFunc ExtractFunc,
|
|
|
|
loadFunc LoadFunc,
|
2020-05-30 13:44:54 +00:00
|
|
|
quit chan struct{},
|
2020-05-30 07:00:35 +00:00
|
|
|
) error {
|
2020-05-31 07:32:33 +00:00
|
|
|
dataProviders, err := extractBucketIntoFiles(db, fromBucket, startkey, datadir, extractFunc, quit)
|
2020-05-30 07:00:35 +00:00
|
|
|
|
|
|
|
defer func() {
|
2020-05-31 07:32:33 +00:00
|
|
|
disposeProviders(dataProviders)
|
2020-05-30 07:00:35 +00:00
|
|
|
}()
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2020-05-31 07:32:33 +00:00
|
|
|
return loadFilesIntoBucket(db, toBucket, dataProviders, loadFunc, quit)
|
2020-05-30 07:00:35 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func extractBucketIntoFiles(
|
|
|
|
db ethdb.Database,
|
|
|
|
bucket []byte,
|
|
|
|
startkey []byte,
|
|
|
|
datadir string,
|
|
|
|
extractFunc ExtractFunc,
|
2020-05-30 13:44:54 +00:00
|
|
|
quit chan struct{},
|
2020-05-31 07:32:33 +00:00
|
|
|
) ([]dataProvider, error) {
|
2020-05-30 07:00:35 +00:00
|
|
|
buffer := bytes.NewBuffer(make([]byte, 0))
|
|
|
|
encoder := codec.NewEncoder(nil, &cbor)
|
2020-05-31 07:32:33 +00:00
|
|
|
providers := make([]dataProvider, 0)
|
2020-05-30 07:00:35 +00:00
|
|
|
|
|
|
|
sortableBuffer := newSortableBuffer()
|
|
|
|
|
2020-05-31 07:32:33 +00:00
|
|
|
flushBuffer := func(canStoreInRam bool) error {
|
|
|
|
if sortableBuffer.Len() == 0 {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
var provider dataProvider
|
|
|
|
var err error
|
|
|
|
if canStoreInRam && len(providers) == 0 {
|
|
|
|
provider = KeepInRAM(sortableBuffer)
|
|
|
|
} else {
|
|
|
|
provider, err = FlushToDisk(sortableBuffer, datadir)
|
|
|
|
}
|
2020-05-30 07:00:35 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2020-05-31 07:32:33 +00:00
|
|
|
if provider != nil {
|
|
|
|
providers = append(providers, provider)
|
2020-05-30 07:00:35 +00:00
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
extractNextFunc := func(k []byte, v interface{}) error {
|
|
|
|
buffer.Reset()
|
|
|
|
encoder.Reset(buffer)
|
|
|
|
err := encoder.Encode(v)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
encodedValue := buffer.Bytes()
|
|
|
|
sortableBuffer.Put(common.CopyBytes(k), common.CopyBytes(encodedValue))
|
|
|
|
if sortableBuffer.Size() >= sortableBuffer.OptimalSize {
|
2020-05-31 07:32:33 +00:00
|
|
|
err = flushBuffer(false)
|
2020-05-30 07:00:35 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2020-05-31 07:32:33 +00:00
|
|
|
err := db.Walk(bucket, startkey, len(startkey), func(k, v []byte) (bool, error) {
|
2020-05-30 13:44:54 +00:00
|
|
|
if err := common.Stopped(quit); err != nil {
|
|
|
|
return false, err
|
|
|
|
}
|
2020-05-30 07:00:35 +00:00
|
|
|
err := extractFunc(k, v, extractNextFunc)
|
|
|
|
return true, err
|
|
|
|
})
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
2020-05-31 07:32:33 +00:00
|
|
|
err = flushBuffer(true)
|
2020-05-30 07:00:35 +00:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2020-05-31 07:32:33 +00:00
|
|
|
return providers, nil
|
2020-05-30 07:00:35 +00:00
|
|
|
}
|
|
|
|
|
2020-05-31 07:32:33 +00:00
|
|
|
func loadFilesIntoBucket(db ethdb.Database, bucket []byte, providers []dataProvider, loadFunc LoadFunc, quit chan struct{}) error {
|
2020-05-30 07:00:35 +00:00
|
|
|
decoder := codec.NewDecoder(nil, &cbor)
|
|
|
|
var m runtime.MemStats
|
|
|
|
h := &Heap{}
|
|
|
|
heap.Init(h)
|
2020-05-31 07:32:33 +00:00
|
|
|
for i, provider := range providers {
|
|
|
|
if key, value, err := provider.Next(decoder); err == nil {
|
2020-05-30 07:00:35 +00:00
|
|
|
he := HeapElem{key, i, value}
|
|
|
|
heap.Push(h, he)
|
|
|
|
} else /* we must have at least one entry per file */ {
|
2020-05-31 07:32:33 +00:00
|
|
|
eee := fmt.Errorf("error reading first readers: n=%d current=%d provider=%s err=%v",
|
|
|
|
len(providers), i, provider, err)
|
|
|
|
panic(eee)
|
2020-05-30 07:00:35 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
batch := db.NewBatch()
|
2020-05-30 13:44:54 +00:00
|
|
|
state := &bucketState{batch, bucket, quit}
|
2020-05-30 07:00:35 +00:00
|
|
|
|
|
|
|
loadNextFunc := func(k, v []byte) error {
|
|
|
|
if err := batch.Put(bucket, k, v); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
batchSize := batch.BatchSize()
|
|
|
|
if batchSize > batch.IdealBatchSize() {
|
|
|
|
if _, err := batch.Commit(); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
runtime.ReadMemStats(&m)
|
|
|
|
log.Info(
|
|
|
|
"Commited hashed state",
|
|
|
|
"bucket", string(bucket),
|
|
|
|
"size", common.StorageSize(batchSize),
|
|
|
|
"hashedKey", fmt.Sprintf("%x...", k[:4]),
|
|
|
|
"alloc", common.StorageSize(m.Alloc), "sys", common.StorageSize(m.Sys), "numGC", int(m.NumGC))
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
for h.Len() > 0 {
|
2020-05-30 13:44:54 +00:00
|
|
|
if err := common.Stopped(quit); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2020-05-30 07:00:35 +00:00
|
|
|
element := (heap.Pop(h)).(HeapElem)
|
2020-05-31 07:32:33 +00:00
|
|
|
provider := providers[element.TimeIdx]
|
2020-05-31 06:57:47 +00:00
|
|
|
decoder.ResetBytes(element.Value)
|
|
|
|
err := loadFunc(element.Key, decoder, state, loadNextFunc)
|
2020-05-30 07:00:35 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2020-05-31 07:32:33 +00:00
|
|
|
if element.Key, element.Value, err = provider.Next(decoder); err == nil {
|
2020-05-30 07:00:35 +00:00
|
|
|
heap.Push(h, element)
|
|
|
|
} else if err != io.EOF {
|
|
|
|
return fmt.Errorf("error while reading next element from disk: %v", err)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
_, err := batch.Commit()
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2020-05-31 07:32:33 +00:00
|
|
|
func disposeProviders(providers []dataProvider) {
|
|
|
|
for _, p := range providers {
|
|
|
|
err := p.Dispose()
|
2020-05-30 07:00:35 +00:00
|
|
|
if err != nil {
|
2020-05-31 07:32:33 +00:00
|
|
|
log.Warn("promoting hashed state, error while disposing provider", "provier", p, "err", err)
|
2020-05-30 07:00:35 +00:00
|
|
|
}
|
2020-05-31 07:32:33 +00:00
|
|
|
|
2020-05-30 07:00:35 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// sortableBufferEntry is a single key/value pair held by a sortableBuffer;
// the value is the CBOR-encoded form produced during extraction.
type sortableBufferEntry struct {
	key   []byte
	value []byte
}
|
|
|
|
|
|
|
|
// sortableBuffer accumulates key/value pairs in memory and implements
// sort.Interface (Len/Less/Swap) so they can be sorted by key before being
// turned into a data provider.
type sortableBuffer struct {
	// entries holds the accumulated key/value pairs.
	entries []sortableBufferEntry
	// size is the running total of key+value bytes across entries.
	size int
	// OptimalSize is the byte threshold at which callers should flush the buffer.
	OptimalSize int
	// encoder is a CBOR encoder owned by this buffer (created in newSortableBuffer).
	encoder *codec.Encoder
}
|
|
|
|
|
|
|
|
func (b *sortableBuffer) Put(k, v []byte) {
|
|
|
|
b.size += len(k)
|
|
|
|
b.size += len(v)
|
|
|
|
b.entries = append(b.entries, sortableBufferEntry{k, v})
|
|
|
|
}
|
|
|
|
|
|
|
|
// Size returns the total number of key and value bytes currently buffered.
func (b *sortableBuffer) Size() int {
	return b.size
}
|
|
|
|
|
|
|
|
// Len returns the number of buffered entries (part of sort.Interface).
func (b *sortableBuffer) Len() int {
	return len(b.entries)
}
|
|
|
|
|
|
|
|
// Less reports whether entry i's key sorts before entry j's key in
// lexicographic byte order (part of sort.Interface).
func (b *sortableBuffer) Less(i, j int) bool {
	return bytes.Compare(b.entries[i].key, b.entries[j].key) < 0
}
|
|
|
|
|
|
|
|
// Swap exchanges entries i and j (part of sort.Interface).
func (b *sortableBuffer) Swap(i, j int) {
	b.entries[i], b.entries[j] = b.entries[j], b.entries[i]
}
|
|
|
|
|
2020-05-31 07:32:33 +00:00
|
|
|
// Get returns the entry at index i.
func (b *sortableBuffer) Get(i int) sortableBufferEntry {
	return b.entries[i]
}
|
|
|
|
|
|
|
|
func newSortableBuffer() *sortableBuffer {
|
|
|
|
return &sortableBuffer{
|
|
|
|
entries: make([]sortableBufferEntry, 0),
|
|
|
|
size: 0,
|
2020-05-31 07:32:33 +00:00
|
|
|
OptimalSize: bufferOptimalSize,
|
2020-05-30 07:00:35 +00:00
|
|
|
encoder: codec.NewEncoder(nil, &cbor),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// bucketState implements State on top of a database getter, scoped to one
// bucket, with cancellation signalled through quit.
type bucketState struct {
	getter ethdb.Getter
	bucket []byte
	quit chan struct{}
}
|
|
|
|
|
|
|
|
// Get returns the value stored under key in this state's bucket.
func (s *bucketState) Get(key []byte) ([]byte, error) {
	return s.getter.Get(s.bucket, key)
}
|
2020-05-30 13:44:54 +00:00
|
|
|
|
|
|
|
// Stopped delegates the cancellation check to common.Stopped on the quit channel.
func (s *bucketState) Stopped() error {
	return common.Stopped(s.quit)
}
|