package initialsync

import (
	"context"
	"errors"
	"time"

	"github.com/libp2p/go-libp2p-core/peer"
	eth "github.com/prysmaticlabs/ethereumapis/eth/v1alpha1"
	"github.com/prysmaticlabs/prysm/beacon-chain/core/helpers"
	"github.com/prysmaticlabs/prysm/beacon-chain/db"
	"github.com/prysmaticlabs/prysm/beacon-chain/p2p"
	beaconsync "github.com/prysmaticlabs/prysm/beacon-chain/sync"
	"github.com/prysmaticlabs/prysm/shared/featureconfig"
	"github.com/sirupsen/logrus"
)

const (
	// queueStopCallTimeout is the time allowed for the queue to release resources when quitting.
	queueStopCallTimeout = 1 * time.Second
	// pollingInterval defines how often the state machine checks for new events.
	pollingInterval = 200 * time.Millisecond
	// staleEpochTimeout is a period after which an epoch's state is considered stale.
	staleEpochTimeout = 1 * time.Second
	// skippedMachineTimeout is a period after which a skipped machine is considered stuck
	// and is reset (if the machine is the last one, all machines are reset and a search for
	// a skipped slot or backtracking takes place).
	skippedMachineTimeout = 10 * staleEpochTimeout
	// lookaheadSteps is a limit on how many forward steps are loaded into the queue.
	// Each step is managed by an assigned finite state machine.
	lookaheadSteps = 8
	// noRequiredPeersErrMaxRetries defines the number of retries when no required peers are found.
	noRequiredPeersErrMaxRetries = 1000
	// noRequiredPeersErrRefreshInterval defines the interval for which the queue is paused before
	// making the next attempt to obtain data.
	noRequiredPeersErrRefreshInterval = 15 * time.Second
	// maxResetAttempts is the number of times a stale FSM is reset before backtracking is triggered.
	maxResetAttempts = 4
)
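
// Worked example (illustrative): the queue keeps at most lookaheadSteps
// machines, each covering blocksPerRequest consecutive slots, where
// blocksPerRequest is taken from blocksFetcher.blocksPerSecond (see loop below).
// If, hypothetically, blocksPerRequest were 64, the queue would look
// lookaheadSteps * 64 = 8 * 64 = 512 slots ahead of the current head slot.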

var (
	errQueueCtxIsDone             = errors.New("queue's context is done, reinitialize")
	errQueueTakesTooLongToStop    = errors.New("queue takes too long to stop")
	errInvalidInitialState        = errors.New("invalid initial state")
	errInputNotFetchRequestParams = errors.New("input data is not type *fetchRequestParams")
	errNoRequiredPeers            = errors.New("no peers with required blocks are found")
)

const (
	modeStopOnFinalizedEpoch syncMode = iota
	modeNonConstrained
)

// syncMode specifies the sync mode type.
type syncMode uint8

// blocksQueueConfig is a config to set up the block queue service.
type blocksQueueConfig struct {
	blocksFetcher       *blocksFetcher
	chain               blockchainService
	startSlot           uint64
	highestExpectedSlot uint64
	p2p                 p2p.P2P
	db                  db.ReadOnlyDatabase
	mode                syncMode
}

// blocksQueue is a priority queue that serves as an intermediary between block fetchers (producers)
// and the block processing goroutine (consumer). The consumer can rely on the order of incoming blocks.
type blocksQueue struct {
	ctx                 context.Context
	cancel              context.CancelFunc
	smm                 *stateMachineManager
	blocksFetcher       *blocksFetcher
	chain               blockchainService
	highestExpectedSlot uint64
	mode                syncMode
	exitConditions      struct {
		noRequiredPeersErrRetries int
	}
	fetchedData chan *blocksQueueFetchedData // output channel for ready blocks
	staleEpochs map[uint64]uint8             // counter to keep track of stale FSMs
	quit        chan struct{}                // termination notifier
}

// blocksQueueFetchedData is a data container that is returned from the queue on each step.
type blocksQueueFetchedData struct {
	pid    peer.ID
	blocks []*eth.SignedBeaconBlock
}

// newBlocksQueue creates an initialized priority queue.
func newBlocksQueue(ctx context.Context, cfg *blocksQueueConfig) *blocksQueue {
	ctx, cancel := context.WithCancel(ctx)

	blocksFetcher := cfg.blocksFetcher
	if blocksFetcher == nil {
		blocksFetcher = newBlocksFetcher(ctx, &blocksFetcherConfig{
			chain: cfg.chain,
			p2p:   cfg.p2p,
			db:    cfg.db,
		})
	}
	highestExpectedSlot := cfg.highestExpectedSlot
	if highestExpectedSlot <= cfg.startSlot {
		if cfg.mode == modeStopOnFinalizedEpoch {
			highestExpectedSlot = blocksFetcher.bestFinalizedSlot()
		} else {
			highestExpectedSlot = blocksFetcher.bestNonFinalizedSlot()
		}
	}

	// Override the fetcher's sync mode.
	blocksFetcher.mode = cfg.mode

	queue := &blocksQueue{
		ctx:                 ctx,
		cancel:              cancel,
		highestExpectedSlot: highestExpectedSlot,
		blocksFetcher:       blocksFetcher,
		chain:               cfg.chain,
		mode:                cfg.mode,
		fetchedData:         make(chan *blocksQueueFetchedData, 1),
		quit:                make(chan struct{}),
		staleEpochs:         make(map[uint64]uint8),
	}

	// Configure state machines.
	queue.smm = newStateMachineManager()
	queue.smm.addEventHandler(eventTick, stateNew, queue.onScheduleEvent(ctx))
	queue.smm.addEventHandler(eventDataReceived, stateScheduled, queue.onDataReceivedEvent(ctx))
	queue.smm.addEventHandler(eventTick, stateDataParsed, queue.onReadyToSendEvent(ctx))
	queue.smm.addEventHandler(eventTick, stateSkipped, queue.onProcessSkippedEvent(ctx))
	queue.smm.addEventHandler(eventTick, stateSent, queue.onCheckStaleEvent(ctx))

	return queue
}
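
// Usage sketch (illustrative only; chainService, p2pService and beaconDB are
// hypothetical handles for this package's blockchainService, p2p.P2P and
// db.ReadOnlyDatabase dependencies):
//
//	queue := newBlocksQueue(ctx, &blocksQueueConfig{
//		chain:     chainService,
//		p2p:       p2pService,
//		db:        beaconDB,
//		startSlot: chainService.HeadSlot(),
//		mode:      modeStopOnFinalizedEpoch,
//	})
//	if err := queue.start(); err != nil {
//		return err
//	}
//	for data := range queue.fetchedData {
//		// Process data.blocks received from data.pid.
//	}
//	if err := queue.stop(); err != nil {
//		log.WithError(err).Debug("Error stopping queue")
//	}
//
// The handlers registered above imply the following happy-path trace for a
// single machine: stateNew -(eventTick)-> stateScheduled -(eventDataReceived)->
// stateDataParsed -(eventTick)-> stateSent; stateSkipped is reached when no
// blocks are available or after staleEpochTimeout.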

// start boots up the queue processing.
func (q *blocksQueue) start() error {
	select {
	case <-q.ctx.Done():
		return errQueueCtxIsDone
	default:
		go q.loop()
		return nil
	}
}

// stop terminates all queue operations.
func (q *blocksQueue) stop() error {
	q.cancel()
	select {
	case <-q.quit:
		return nil
	case <-time.After(queueStopCallTimeout):
		return errQueueTakesTooLongToStop
	}
}

// loop is the main queue loop.
func (q *blocksQueue) loop() {
	defer close(q.quit)

	defer func() {
		q.blocksFetcher.stop()
		close(q.fetchedData)
	}()

	if err := q.blocksFetcher.start(); err != nil {
		log.WithError(err).Debug("Can not start blocks provider")
	}

	// Define initial state machines.
	startSlot := q.chain.HeadSlot()
	blocksPerRequest := q.blocksFetcher.blocksPerSecond
	for i := startSlot; i < startSlot+blocksPerRequest*lookaheadSteps; i += blocksPerRequest {
		q.smm.addStateMachine(i)
	}

	ticker := time.NewTicker(pollingInterval)
	defer ticker.Stop()
	for {
		// Check the highest expected slot when we approach the chain's head slot.
		if q.chain.HeadSlot() >= q.highestExpectedSlot {
			// By the time initial sync is complete, the highest slot may have increased; re-check.
			if q.mode == modeStopOnFinalizedEpoch {
				if q.highestExpectedSlot < q.blocksFetcher.bestFinalizedSlot() {
					q.highestExpectedSlot = q.blocksFetcher.bestFinalizedSlot()
					continue
				}
			} else {
				if q.highestExpectedSlot < q.blocksFetcher.bestNonFinalizedSlot() {
					q.highestExpectedSlot = q.blocksFetcher.bestNonFinalizedSlot()
					continue
				}
			}
			log.WithField("slot", q.highestExpectedSlot).Debug("Highest expected slot reached")
			q.cancel()
		}

		log.WithFields(logrus.Fields{
			"highestExpectedSlot": q.highestExpectedSlot,
			"headSlot":            q.chain.HeadSlot(),
			"state":               q.smm.String(),
			"staleEpoch":          q.staleEpochs,
		}).Trace("tick")

		select {
		case <-ticker.C:
			for _, key := range q.smm.keys {
				fsm := q.smm.machines[key]
				if err := fsm.trigger(eventTick, nil); err != nil {
					log.WithFields(logrus.Fields{
						"highestExpectedSlot":       q.highestExpectedSlot,
						"noRequiredPeersErrRetries": q.exitConditions.noRequiredPeersErrRetries,
						"event":                     eventTick,
						"epoch":                     helpers.SlotToEpoch(fsm.start),
						"start":                     fsm.start,
						"error":                     err.Error(),
					}).Debug("Can not trigger event")
					if errors.Is(err, errNoRequiredPeers) {
						forceExit := q.exitConditions.noRequiredPeersErrRetries > noRequiredPeersErrMaxRetries
						if q.mode == modeStopOnFinalizedEpoch || forceExit {
							q.cancel()
						} else {
							q.exitConditions.noRequiredPeersErrRetries++
							log.Debug("Waiting for finalized peers")
							time.Sleep(noRequiredPeersErrRefreshInterval)
						}
						continue
					}
				}
				// Do garbage collection, and advance the sliding window forward.
				if q.chain.HeadSlot() >= fsm.start+blocksPerRequest-1 {
					highestStartSlot, err := q.smm.highestStartSlot()
					if err != nil {
						log.WithError(err).Debug("Cannot obtain highest epoch state number")
						continue
					}
					if err := q.smm.removeStateMachine(fsm.start); err != nil {
						log.WithError(err).Debug("Can not remove state machine")
					}
					if len(q.smm.machines) < lookaheadSteps {
						q.smm.addStateMachine(highestStartSlot + blocksPerRequest)
					}
				}
			}
		case response, ok := <-q.blocksFetcher.requestResponses():
			if !ok {
				log.Debug("Fetcher closed output channel")
				q.cancel()
				return
			}
			// Update the state of the epoch for which data is received.
			if fsm, ok := q.smm.findStateMachine(response.start); ok {
				if err := fsm.trigger(eventDataReceived, response); err != nil {
					log.WithFields(logrus.Fields{
						"event": eventDataReceived,
						"epoch": helpers.SlotToEpoch(fsm.start),
						"error": err.Error(),
					}).Debug("Can not process event")
					fsm.setState(stateNew)
					continue
				}
			}
		case <-q.ctx.Done():
			log.Debug("Context closed, exiting goroutine (blocks queue)")
			return
		}
	}
}
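
// Sliding-window arithmetic (illustrative): a machine with start slot S is
// responsible for slots [S, S+blocksPerRequest-1]. If, hypothetically,
// blocksPerRequest were 64 and S were 128, the machine would cover slots
// 128..191; once HeadSlot() reaches slot 191 the machine is garbage-collected,
// and if fewer than lookaheadSteps machines remain, a new machine is added
// at highestStartSlot + blocksPerRequest.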

// onScheduleEvent is an event called on newly arrived epochs. Transitions a machine to the scheduled state.
func (q *blocksQueue) onScheduleEvent(ctx context.Context) eventHandlerFn {
	return func(m *stateMachine, in interface{}) (stateID, error) {
		if m.state != stateNew {
			return m.state, errInvalidInitialState
		}
		if m.start > q.highestExpectedSlot {
			m.setState(stateSkipped)
			return m.state, errSlotIsTooHigh
		}
		blocksPerRequest := q.blocksFetcher.blocksPerSecond
		if err := q.blocksFetcher.scheduleRequest(ctx, m.start, blocksPerRequest); err != nil {
			return m.state, err
		}
		return stateScheduled, nil
	}
}

// onDataReceivedEvent is an event called when data is received from the fetcher.
func (q *blocksQueue) onDataReceivedEvent(ctx context.Context) eventHandlerFn {
	return func(m *stateMachine, in interface{}) (stateID, error) {
		if ctx.Err() != nil {
			return m.state, ctx.Err()
		}
		if m.state != stateScheduled {
			return m.state, errInvalidInitialState
		}
		response, ok := in.(*fetchRequestResponse)
		if !ok {
			return m.state, errInputNotFetchRequestParams
		}
		if response.err != nil {
			switch response.err {
			case errSlotIsTooHigh:
				// Current window is already too big, re-request previous epochs.
				for _, fsm := range q.smm.machines {
					if fsm.start < response.start && fsm.state == stateSkipped {
						fsm.setState(stateNew)
					}
				}
			case beaconsync.ErrInvalidFetchedData:
				// Peer returned invalid data, penalize. The responding peer comes from the
				// response (m.pid is only assigned on the success path below).
				q.blocksFetcher.p2p.Peers().Scorers().BadResponsesScorer().Increment(response.pid)
				log.WithField("pid", response.pid).Debug("Peer is penalized for invalid blocks")
			}
			return m.state, response.err
		}
		m.pid = response.pid
		m.blocks = response.blocks
		return stateDataParsed, nil
	}
}

// onReadyToSendEvent is an event called to allow epochs with available blocks to send them downstream.
func (q *blocksQueue) onReadyToSendEvent(ctx context.Context) eventHandlerFn {
	return func(m *stateMachine, in interface{}) (stateID, error) {
		if ctx.Err() != nil {
			return m.state, ctx.Err()
		}
		if m.state != stateDataParsed {
			return m.state, errInvalidInitialState
		}

		if len(m.blocks) == 0 {
			return stateSkipped, nil
		}

		send := func() (stateID, error) {
			data := &blocksQueueFetchedData{
				pid:    m.pid,
				blocks: m.blocks,
			}
			select {
			case <-ctx.Done():
				return m.state, ctx.Err()
			case q.fetchedData <- data:
			}
			return stateSent, nil
		}

		// Make sure that we send epochs in the correct order.
		// If the machine is the first one (has the lowest start block), send.
		if m.isFirst() {
			return send()
		}

		// Make sure that the previous epochs are already processed.
		for _, fsm := range q.smm.machines {
			// Review only previous slots.
			if fsm.start < m.start {
				switch fsm.state {
				case stateNew, stateScheduled, stateDataParsed:
					return m.state, nil
				}
			}
		}

		return send()
	}
}
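
// Ordering example (illustrative): with machines started at slots 64, 128 and
// 192, a machine at 128 that is in stateDataParsed will not send its blocks
// until the machine at 64 has left the stateNew/stateScheduled/stateDataParsed
// states (i.e. has been sent or skipped), so the consumer always receives
// batches in ascending slot order.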

// onProcessSkippedEvent is an event triggered on skipped machines, allowing handlers to
// extend the lookahead window, in cases where progress is not possible otherwise.
func (q *blocksQueue) onProcessSkippedEvent(ctx context.Context) eventHandlerFn {
	return func(m *stateMachine, in interface{}) (stateID, error) {
		if ctx.Err() != nil {
			return m.state, ctx.Err()
		}
		if m.state != stateSkipped {
			return m.state, errInvalidInitialState
		}

		// Only the highest epoch with skipped state can trigger extension.
		if !m.isLast() {
			// When a state machine stays in the skipped state for too long, reset it.
			if time.Since(m.updated) > skippedMachineTimeout {
				return stateNew, nil
			}
			return m.state, nil
		}

		// Make sure that all machines are in the skipped state, i.e. the manager cannot progress
		// without a reset or without moving the last machine's start block forward (in an attempt
		// to find the next non-skipped block).
		if !q.smm.allMachinesInState(stateSkipped) {
			return m.state, nil
		}

		// Check if we have enough peers to progress, or sync needs to halt (due to no peers available).
		bestFinalizedSlot := q.blocksFetcher.bestFinalizedSlot()
		if q.mode == modeStopOnFinalizedEpoch {
			if bestFinalizedSlot <= q.chain.HeadSlot() {
				return stateSkipped, errNoRequiredPeers
			}
		} else {
			if q.blocksFetcher.bestNonFinalizedSlot() <= q.chain.HeadSlot() {
				return stateSkipped, errNoRequiredPeers
			}
		}

		// All machines are skipped, so the FSMs need a reset.
		startSlot := q.chain.HeadSlot() + 1
		if featureconfig.Get().EnableSyncBacktracking && q.mode == modeNonConstrained && startSlot > bestFinalizedSlot {
			q.staleEpochs[helpers.SlotToEpoch(startSlot)]++
			// If FSMs have been reset enough times, try to explore alternative forks.
			if q.staleEpochs[helpers.SlotToEpoch(startSlot)] >= maxResetAttempts {
				delete(q.staleEpochs, helpers.SlotToEpoch(startSlot))
				fork, err := q.blocksFetcher.findFork(ctx, startSlot)
				if err == nil {
					return stateSkipped, q.resetFromFork(ctx, fork)
				}
				log.WithFields(logrus.Fields{
					"epoch": helpers.SlotToEpoch(startSlot),
					"error": err.Error(),
				}).Debug("Can not explore alternative branches")
			}
		}
		return stateSkipped, q.resetFromSlot(ctx, startSlot)
	}
}
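
// Backtracking trigger (illustrative): in modeNonConstrained with the
// EnableSyncBacktracking feature flag set, once the same stale epoch has been
// reset maxResetAttempts (4) times past the best finalized slot, findFork is
// consulted; if an alternative branch is found, the machines are re-seeded
// from that fork via resetFromFork instead of resetFromSlot.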

// onCheckStaleEvent is an event that allows marking stale epochs,
// so that they can be re-processed.
func (q *blocksQueue) onCheckStaleEvent(ctx context.Context) eventHandlerFn {
	return func(m *stateMachine, in interface{}) (stateID, error) {
		if ctx.Err() != nil {
			return m.state, ctx.Err()
		}
		if m.state != stateSent {
			return m.state, errInvalidInitialState
		}

		// Break out immediately if the machine's epoch is not stale.
		if time.Since(m.updated) < staleEpochTimeout {
			return m.state, nil
		}

		return stateSkipped, nil
	}
}