prysm-pulse/beacon-chain/sync/initial-sync/blocks_fetcher.go
Victor Farazdagi c26a492225
Init sync optimizations (#5284)
* fix naming slot -> epoch
* better handling of long periods w/o finality
* fixes issue with pointer going too far ahead
2020-04-02 06:54:05 +03:00

510 lines
14 KiB
Go

package initialsync
import (
"bytes"
"context"
"fmt"
"io"
"math"
"math/rand"
"sort"
"sync"
"time"
"github.com/kevinms/leakybucket-go"
"github.com/libp2p/go-libp2p-core/peer"
"github.com/pkg/errors"
eth "github.com/prysmaticlabs/ethereumapis/eth/v1alpha1"
"github.com/prysmaticlabs/prysm/beacon-chain/blockchain"
"github.com/prysmaticlabs/prysm/beacon-chain/core/helpers"
"github.com/prysmaticlabs/prysm/beacon-chain/flags"
"github.com/prysmaticlabs/prysm/beacon-chain/p2p"
prysmsync "github.com/prysmaticlabs/prysm/beacon-chain/sync"
p2ppb "github.com/prysmaticlabs/prysm/proto/beacon/p2p/v1"
"github.com/prysmaticlabs/prysm/shared/mathutil"
"github.com/prysmaticlabs/prysm/shared/params"
"github.com/sirupsen/logrus"
"go.opencensus.io/trace"
)
const (
	// maxPendingRequests limits how many concurrent fetch requests one can initiate.
	// It is also the buffer size of the fetcher's request and response channels.
	maxPendingRequests = 8
	// peersPercentagePerRequest caps the fraction of the available peers that is
	// used to serve a single request (0.75 means 75% of the peers).
	peersPercentagePerRequest = 0.75
)
// Sentinel errors reported by the blocks fetcher.
var (
	// errNoPeersAvailable is returned when there is no peer to send a request to.
	errNoPeersAvailable = errors.New("no peers available, waiting for reconnect")
	// errFetcherCtxIsDone is returned when the fetcher's own context has been cancelled.
	errFetcherCtxIsDone = errors.New("fetcher's context is done, reinitialize")
	// errSlotIsTooHigh is returned when the requested start slot lies beyond the
	// start slot of the epoch following the best finalized epoch known from peers.
	errSlotIsTooHigh = errors.New("slot is higher than the finalized slot")
)
// blocksFetcherConfig is a config to setup the block fetcher.
// Both fields are copied into the fetcher by newBlocksFetcher.
type blocksFetcherConfig struct {
	// headFetcher is queried for the current head slot.
	headFetcher blockchain.HeadFetcher
	// p2p provides peer information and is used to send block requests.
	p2p p2p.P2P
}
// blocksFetcher is a service to fetch chain data from peers.
// On an incoming request, the requested block range is evenly divided
// among available peers (for fair network load distribution).
type blocksFetcher struct {
	// Mutex is held by requestBlocks while it consults and updates the rate limiter.
	sync.Mutex
	// ctx is the fetcher-wide context; cancel terminates it (see stop).
	ctx    context.Context
	cancel context.CancelFunc
	// headFetcher is queried for the current head slot.
	headFetcher blockchain.HeadFetcher
	// p2p provides peer information and is used to send block requests.
	p2p p2p.P2P
	// rateLimiter tracks, per peer, how many blocks have been requested.
	rateLimiter *leakybucket.Collector
	// fetchRequests is the queue of scheduled requests consumed by loop.
	fetchRequests chan *fetchRequestParams
	// fetchResponses carries results to the consumer; loop closes it on exit.
	fetchResponses chan *fetchRequestResponse
	quit           chan struct{} // termination notifier
}
// fetchRequestParams holds parameters necessary to schedule a fetch request.
// Instances are created by scheduleRequest and consumed by loop.
type fetchRequestParams struct {
	ctx   context.Context // if provided, it is used instead of global fetcher's context
	start uint64          // starting slot
	count uint64          // how many slots to receive (fetcher may return fewer slots)
}
// fetchRequestResponse is a combined type to hold results of both successful executions and errors.
// Valid usage pattern will be to check whether result's `err` is nil, before using `blocks`.
type fetchRequestResponse struct {
	// start and count echo the parameters of the request this response answers.
	start, count uint64
	// blocks holds the fetched blocks; it is only populated when err is nil.
	blocks []*eth.SignedBeaconBlock
	// err is the failure reason, or nil on success.
	err error
	// peers lists the peers considered for the request; it is only populated when err is nil.
	peers []peer.ID
}
// newBlocksFetcher returns a fetcher that is fully wired but not yet running;
// call start to launch its processing loop.
//
// The fetcher gets its own cancellable context derived from ctx, a per-peer
// rate limiter, and request/response channels buffered to maxPendingRequests.
func newBlocksFetcher(ctx context.Context, cfg *blocksFetcherConfig) *blocksFetcher {
	fetcherCtx, cancelFn := context.WithCancel(ctx)

	fetcher := &blocksFetcher{
		ctx:         fetcherCtx,
		cancel:      cancelFn,
		headFetcher: cfg.headFetcher,
		p2p:         cfg.p2p,
	}

	// Rate and capacity are both set to allowedBlocksPerSecond; empty buckets are kept.
	fetcher.rateLimiter = leakybucket.NewCollector(allowedBlocksPerSecond, allowedBlocksPerSecond, false)

	fetcher.fetchRequests = make(chan *fetchRequestParams, maxPendingRequests)
	fetcher.fetchResponses = make(chan *fetchRequestResponse, maxPendingRequests)
	fetcher.quit = make(chan struct{})

	return fetcher
}
// start launches the fetcher's main loop in a background goroutine.
// It returns errFetcherCtxIsDone, without starting anything, when the
// fetcher's context has already been cancelled.
func (f *blocksFetcher) start() error {
	// Non-blocking probe of the fetcher's context.
	select {
	case <-f.ctx.Done():
		return errFetcherCtxIsDone
	default:
	}

	go f.loop()
	return nil
}
// stop terminates all fetcher operations: it cancels the fetcher's context and
// then blocks until loop() has signalled its exit by closing the quit channel.
// NOTE(review): quit is only closed by loop(), so this call appears to block
// indefinitely if loop() was never started — confirm callers always pair it
// with a successful start().
func (f *blocksFetcher) stop() {
	f.cancel()
	<-f.quit // make sure that loop() is done
}
// requestResponses exposes a channel into which fetcher pushes generated request responses.
// The returned channel is receive-only; loop() closes it once all in-flight
// request handlers have finished.
func (f *blocksFetcher) requestResponses() <-chan *fetchRequestResponse {
	return f.fetchResponses
}
// loop is the fetcher's main event loop. It waits for peers, picks up scheduled
// requests, and serves each of them in its own goroutine, publishing the outcome
// on the responses channel. It exits when the fetcher's context is cancelled.
//
// On exit it first waits for all in-flight handlers, then closes the responses
// channel, and finally closes quit to notify stop().
func (f *blocksFetcher) loop() {
	defer close(f.quit)

	// Tracks handler goroutines so resources are released only after they finish.
	var pending sync.WaitGroup
	defer func() {
		pending.Wait()
		close(f.fetchResponses)
	}()

	// serve handles one request and delivers the response, unless the fetcher
	// is shut down before the response can be delivered.
	serve := func(req *fetchRequestParams) {
		defer pending.Done()
		resp := f.handleRequest(req.ctx, req.start, req.count)
		select {
		case <-f.ctx.Done():
		case f.fetchResponses <- resp:
		}
	}

	for {
		// Make sure there are available peers before processing requests.
		if _, err := f.waitForMinimumPeers(f.ctx); err != nil {
			log.Error(err)
		}

		select {
		case <-f.ctx.Done():
			log.Debug("Context closed, exiting goroutine (blocks fetcher)")
			return
		case req := <-f.fetchRequests:
			pending.Add(1)
			go serve(req)
		}
	}
}
// scheduleRequest enqueues a fetch request for count slots beginning at start.
//
// It returns the error of ctx when ctx is already done, and errFetcherCtxIsDone
// when the fetcher is shut down before the request could be queued. The call
// blocks while the request queue is full.
func (f *blocksFetcher) scheduleRequest(ctx context.Context, start, count uint64) error {
	if err := ctx.Err(); err != nil {
		return err
	}

	select {
	case f.fetchRequests <- &fetchRequestParams{ctx: ctx, start: start, count: count}:
		return nil
	case <-f.ctx.Done():
		return errFetcherCtxIsDone
	}
}
// handleRequest serves a single fetch request and always returns a non-nil
// response that echoes start and count. On failure only the response's err is
// set; on success blocks and peers are filled in.
//
// Failure cases: ctx is already done, no suitable peers are known
// (errNoPeersAvailable), start lies past the first slot of the epoch following
// the best finalized epoch (errSlotIsTooHigh), or block collection failed.
func (f *blocksFetcher) handleRequest(ctx context.Context, start, count uint64) *fetchRequestResponse {
	ctx, span := trace.StartSpan(ctx, "initialsync.handleRequest")
	defer span.End()

	result := &fetchRequestResponse{
		start:  start,
		count:  count,
		blocks: []*eth.SignedBeaconBlock{},
		peers:  []peer.ID{},
	}
	// fail records err on the response and hands the response back.
	fail := func(err error) *fetchRequestResponse {
		result.err = err
		return result
	}

	if err := ctx.Err(); err != nil {
		return fail(err)
	}

	headEpoch := helpers.SlotToEpoch(f.headFetcher.HeadSlot())
	root, finalizedEpoch, peers := f.p2p.Peers().BestFinalized(params.BeaconConfig().MaxPeersToSync, headEpoch)
	if len(peers) == 0 {
		return fail(errNoPeersAvailable)
	}

	// Refuse requests that start beyond what peers report as finalized, so that
	// callers cannot keep advancing past it indefinitely.
	if start > helpers.StartSlot(finalizedEpoch+1) {
		return fail(errSlotIsTooHigh)
	}

	blocks, err := f.collectPeerResponses(ctx, root, finalizedEpoch, start, 1, count, peers)
	if err != nil {
		return fail(err)
	}

	result.blocks, result.peers = blocks, peers
	return result
}
// collectPeerResponses orchestrates block fetching from the available peers.
// In each request a range of blocks is to be requested from multiple peers.
// Example:
//   - number of peers = 4
//   - range of block slots is 64...128
//     Four requests will be spread across the peers using step argument to distribute the load
//     i.e. the first peer is asked for block 64, 68, 72... while the second peer is asked for
//     65, 69, 73... and so on for other peers.
//
// The peers list is first narrowed down by selectPeers. The per-peer block count
// is capped by allowedBlocksPerSecond, so fewer than count blocks may be returned.
//
// On success the blocks of all peers are merged and returned sorted by slot.
// The first error reported by any peer aborts the whole collection; the
// remaining in-flight peer requests are then cancelled. errNoPeersAvailable is
// returned when no peer is left after selection, and errSlotIsTooHigh when start
// lies past the first slot of the epoch following finalizedEpoch.
func (f *blocksFetcher) collectPeerResponses(
	ctx context.Context,
	root []byte,
	finalizedEpoch, start, step, count uint64,
	peers []peer.ID,
) ([]*eth.SignedBeaconBlock, error) {
	ctx, span := trace.StartSpan(ctx, "initialsync.collectPeerResponses")
	defer span.End()

	if ctx.Err() != nil {
		return nil, ctx.Err()
	}

	peers = f.selectPeers(peers)
	if len(peers) == 0 {
		return nil, errNoPeersAvailable
	}

	// Reject out-of-range requests before any goroutine is started: bailing out
	// after the wait group has been armed would leave the closer goroutine
	// below blocked forever.
	highestFinalizedSlot := helpers.StartSlot(finalizedEpoch + 1)
	if start > highestFinalizedSlot {
		return nil, errSlotIsTooHigh
	}

	// Workers communicate over unbuffered channels, so once this function
	// returns (e.g. on the first error) nobody receives anymore. Cancelling the
	// derived context on return releases every worker that is still pending.
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	p2pRequests := new(sync.WaitGroup)
	errChan := make(chan error)
	blocksChan := make(chan []*eth.SignedBeaconBlock)

	p2pRequests.Add(len(peers))
	go func() {
		// Closing blocksChan tells the collecting loop that all peers are done.
		p2pRequests.Wait()
		close(blocksChan)
	}()

	// Spread load evenly among available peers.
	perPeerCount := mathutil.Min(count/uint64(len(peers)), allowedBlocksPerSecond)
	remainder := int(count % uint64(len(peers)))
	for i, pid := range peers {
		// Each peer starts at its own offset and strides over the slots of the other peers.
		peerStart, peerStep := start+uint64(i)*step, step*uint64(len(peers))

		// If the count is not evenly divisible by the number of peers, the first
		// `remainder` peers are asked for one extra block each.
		peerCount := perPeerCount
		if i < remainder {
			peerCount++
		}
		// Asking for no blocks may cause the client to hang.
		if peerCount == 0 {
			p2pRequests.Done()
			continue
		}

		go func(ctx context.Context, pid peer.ID, start, step, count uint64) {
			defer p2pRequests.Done()

			blocks, err := f.requestBeaconBlocksByRange(ctx, pid, root, start, step, count)
			if err != nil {
				// Report the failure unless collection has already ended;
				// either way there are no blocks to deliver.
				select {
				case <-ctx.Done():
				case errChan <- err:
				}
				return
			}
			select {
			case <-ctx.Done():
			case blocksChan <- blocks:
			}
		}(ctx, pid, peerStart, peerStep, peerCount)
	}

	var unionRespBlocks []*eth.SignedBeaconBlock
	for {
		select {
		case <-ctx.Done():
			return nil, ctx.Err()
		case err := <-errChan:
			return nil, err
		case resp, ok := <-blocksChan:
			if !ok {
				// All peers have responded: order the merged result by slot.
				sort.Slice(unionRespBlocks, func(i, j int) bool {
					return unionRespBlocks[i].Block.Slot < unionRespBlocks[j].Block.Slot
				})
				return unionRespBlocks, nil
			}
			unionRespBlocks = append(unionRespBlocks, resp...)
		}
	}
}
// requestBeaconBlocksByRange issues a BeaconBlocksByRange request to the given
// peer and, whenever the request fails, retries it with another randomly
// selected peer.
//
// Retrying continues until a request succeeds, ctx is done, the finalized root
// reported by peers no longer matches root, or no fail over peer is available.
func (f *blocksFetcher) requestBeaconBlocksByRange(
	ctx context.Context,
	pid peer.ID,
	root []byte,
	start, step, count uint64,
) ([]*eth.SignedBeaconBlock, error) {
	for {
		if err := ctx.Err(); err != nil {
			return nil, err
		}

		blocks, respErr := f.requestBlocks(ctx, &p2ppb.BeaconBlocksByRangeRequest{
			HeadBlockRoot: root,
			StartSlot:     start,
			Count:         count,
			Step:          step,
		}, pid)
		if respErr == nil {
			return blocks, nil
		}

		// Fail over to some other, randomly selected, peer.
		headEpoch := helpers.SlotToEpoch(f.headFetcher.HeadSlot())
		currentRoot, _, candidates := f.p2p.Peers().BestFinalized(params.BeaconConfig().MaxPeersToSync, headEpoch)
		if !bytes.Equal(root, currentRoot) {
			return nil, errors.Errorf("can not resend, root mismatch: %x:%x", root, currentRoot)
		}
		replacement, err := selectFailOverPeer(pid, candidates)
		if err != nil {
			return nil, err
		}

		log.WithError(respErr).WithFields(logrus.Fields{
			"numPeers":   len(candidates),
			"failedPeer": pid.Pretty(),
			"newPeer":    replacement.Pretty(),
		}).Debug("Request failed, trying to forward request to another peer")

		pid = replacement
	}
}
// requestBlocks sends a BeaconBlocksByRangeRequest to the given peer and reads
// the chunked response stream until EOF, returning every block received.
//
// Before sending, the per-peer rate limiter is consulted under the fetcher's
// lock: when the peer's remaining allowance is below the requested count, the
// call sleeps until the peer's bucket has drained. Any send or read error
// discards the blocks read so far.
func (f *blocksFetcher) requestBlocks(
	ctx context.Context,
	req *p2ppb.BeaconBlocksByRangeRequest,
	pid peer.ID,
) ([]*eth.SignedBeaconBlock, error) {
	bucket := pid.String()
	wanted := int64(req.Count)

	f.Lock()
	if f.rateLimiter.Remaining(bucket) < wanted {
		log.WithField("peer", pid).Debug("Slowing down for rate limit")
		time.Sleep(f.rateLimiter.TillEmpty(bucket))
	}
	f.rateLimiter.Add(bucket, wanted)
	requestInfo := logrus.Fields{
		"peer":  pid,
		"start": req.StartSlot,
		"count": req.Count,
		"step":  req.Step,
		"head":  fmt.Sprintf("%#x", req.HeadBlockRoot),
	}
	log.WithFields(requestInfo).Debug("Requesting blocks")
	f.Unlock()

	stream, err := f.p2p.Send(ctx, req, pid)
	if err != nil {
		return nil, err
	}
	defer stream.Close()

	received := make([]*eth.SignedBeaconBlock, 0, req.Count)
	for {
		blk, err := prysmsync.ReadChunkedBlock(stream, f.p2p)
		switch {
		case err == io.EOF:
			// End of stream: everything read so far is the response.
			return received, nil
		case err != nil:
			return nil, err
		}
		received = append(received, blk)
	}
}
// selectFailOverPeer randomly selects a fail over peer from the list of
// available peers, never returning excludedPID.
//
// The peers slice is treated as read-only: candidates are copied into a fresh
// slice, so the caller's list is neither reordered nor overwritten.
// errNoPeersAvailable is returned when no peer other than excludedPID exists.
func selectFailOverPeer(excludedPID peer.ID, peers []peer.ID) (peer.ID, error) {
	candidates := make([]peer.ID, 0, len(peers))
	for _, pid := range peers {
		if pid != excludedPID {
			candidates = append(candidates, pid)
		}
	}
	if len(candidates) == 0 {
		return "", errNoPeersAvailable
	}

	// Seed with nanosecond resolution so that fail overs happening within the
	// same second do not all land on the same peer.
	randGenerator := rand.New(rand.NewSource(time.Now().UnixNano()))
	return candidates[randGenerator.Intn(len(candidates))], nil
}
// waitForMinimumPeers polls the peer set until enough suitable peers are
// available and returns them.
//
// The required number of peers is the smaller of the MaxPeersToSync config
// value and the MinimumSyncPeers flag. Between polls the function waits for
// handshakePollingInterval; the wait is abandoned as soon as ctx is done, in
// which case the context's error is returned.
func (f *blocksFetcher) waitForMinimumPeers(ctx context.Context) ([]peer.ID, error) {
	required := params.BeaconConfig().MaxPeersToSync
	if flags.Get().MinimumSyncPeers < required {
		required = flags.Get().MinimumSyncPeers
	}
	for {
		if ctx.Err() != nil {
			return nil, ctx.Err()
		}
		headEpoch := helpers.SlotToEpoch(f.headFetcher.HeadSlot())
		_, _, peers := f.p2p.Peers().BestFinalized(params.BeaconConfig().MaxPeersToSync, headEpoch)
		if len(peers) >= required {
			return peers, nil
		}
		log.WithFields(logrus.Fields{
			"suitable": len(peers),
			"required": required}).Info("Waiting for enough suitable peers before syncing")

		// Wait for the next poll, but do not delay shutdown: a plain sleep
		// would keep the caller blocked after cancellation.
		timer := time.NewTimer(handshakePollingInterval)
		select {
		case <-ctx.Done():
			timer.Stop()
			return nil, ctx.Err()
		case <-timer.C:
		}
	}
}
// selectPeers shuffles the given peers in place and returns a prefix of the
// shuffled list to be used for a single request.
//
// The prefix length is peersPercentagePerRequest of the list (rounded), raised
// to the minimum number of sync peers when that is larger, and never more than
// the list itself. An empty list is returned as is.
func (f *blocksFetcher) selectPeers(peers []peer.ID) []peer.ID {
	total := len(peers)
	if total == 0 {
		return peers
	}

	// Randomize the order so that a bad peer cannot repeatedly stall sync
	// with invalid blocks.
	rng := rand.New(rand.NewSource(time.Now().Unix()))
	rng.Shuffle(total, func(a, b int) {
		peers[a], peers[b] = peers[b], peers[a]
	})

	// Minimum number of peers: the smaller of the config value and the flag.
	minPeers := params.BeaconConfig().MaxPeersToSync
	if configured := flags.Get().MinimumSyncPeers; configured < minPeers {
		minPeers = configured
	}

	share := uint64(math.Round(float64(total) * peersPercentagePerRequest))
	cutoff := mathutil.Min(mathutil.Max(share, uint64(minPeers)), uint64(total))
	return peers[:cutoff]
}
// nonSkippedSlotAfter checks slots after the given one in an attempt to find a
// non-empty future slot.
//
// Starting right after slot, batches of blockBatchSize slots are requested from
// a randomly chosen peer. The slot of the first block in the first non-empty
// batch is returned. The search stops once slot moves past the first slot of
// the epoch following the best finalized epoch; in that case the slot reached
// so far is returned with a nil error.
//
// errNoPeersAvailable is returned when no suitable peers are known. When a
// request fails, its error is returned together with the slot reached so far.
func (f *blocksFetcher) nonSkippedSlotAfter(ctx context.Context, slot uint64) (uint64, error) {
	headEpoch := helpers.SlotToEpoch(f.headFetcher.HeadSlot())
	root, epoch, peers := f.p2p.Peers().BestFinalized(params.BeaconConfig().MaxPeersToSync, headEpoch)
	if len(peers) == 0 {
		return 0, errNoPeersAvailable
	}

	// nextPID reshuffles the peers and picks the first one, so that consecutive
	// batches are not necessarily served by the same peer.
	randGenerator := rand.New(rand.NewSource(time.Now().Unix()))
	nextPID := func() peer.ID {
		randGenerator.Shuffle(len(peers), func(i, j int) {
			peers[i], peers[j] = peers[j], peers[i]
		})
		return peers[0]
	}

	upperBound := helpers.StartSlot(epoch + 1)
	for slot <= upperBound {
		req := &p2ppb.BeaconBlocksByRangeRequest{
			HeadBlockRoot: root,
			StartSlot:     slot + 1,
			Count:         blockBatchSize,
			Step:          1,
		}
		blocks, err := f.requestBlocks(ctx, req, nextPID())
		if err != nil {
			return slot, err
		}
		if len(blocks) > 0 {
			return blocks[0].Block.Slot, nil
		}
		// Nothing in this batch: move on to the next one.
		slot += blockBatchSize
	}
	return slot, nil
}