erigon-pulse/cl/sentinel/sentinel.go
Mark Holt 509a7af26a
Discovery zero refresh timer (#8661)
This fixes an issue where the mumbai testnet nodes struggle to find
peers. Before this fix in general test peer numbers are typically around
20 in total between eth66, eth67 and eth68. For new peers some can
struggle to find even a single peer after days of operation.

These are the numbers after 12 hours of running on a node which
previously could not find any peers: eth66=13, eth67=76, eth68=91.

The root cause of this issue is the following:

- A significant number of mumbai peers around the boot node return
network ids which are different from those currently available in the
DHT
- The available nodes are all consequently busy and return 'too many
peers' for long periods

These issues cause a significant number of discovery timeouts, some of
the queries will never receive a response.

This causes the discovery read loop to enter a channel deadlock - which
means that no responses are processed, nor timeouts fired. This causes
the discovery process in the node to stop. From then on it just
re-requests handshakes from a relatively small number of peers.

This check in fixes this situation with the following changes:

- Remove the deadlock by running the timer in a separate go-routine so
it can run independently of the main request processing.
- Allow the discovery process matcher to match on port if no id match
can be established on initial ping. This allows subsequent node
validation to proceed and if the node proves to be valid via the
remainder of the look-up and handshake process it is used as a valid
peer.
- Completely unsolicited responses, i.e. those which come from a
completely unknown ip:port combination continue to be ignored.
-
2023-11-07 08:48:58 +00:00

346 lines
8.5 KiB
Go

/*
Copyright 2022 Erigon-Lightclient contributors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package sentinel
import (
"context"
"crypto/ecdsa"
"fmt"
"net"
"net/http"
"time"
"github.com/go-chi/chi/v5"
"github.com/ledgerwatch/erigon/cl/sentinel/handlers"
"github.com/ledgerwatch/erigon/cl/sentinel/handshake"
"github.com/ledgerwatch/erigon/cl/sentinel/httpreqresp"
"github.com/ledgerwatch/erigon/cl/sentinel/peers"
"github.com/ledgerwatch/erigon/cl/cltypes"
"github.com/ledgerwatch/erigon/cl/persistence"
"github.com/ledgerwatch/erigon/crypto"
"github.com/ledgerwatch/erigon/p2p/discover"
"github.com/ledgerwatch/erigon/p2p/enode"
"github.com/ledgerwatch/erigon/p2p/enr"
"github.com/ledgerwatch/log/v3"
"github.com/libp2p/go-libp2p"
pubsub "github.com/libp2p/go-libp2p-pubsub"
"github.com/libp2p/go-libp2p/core/host"
"github.com/libp2p/go-libp2p/core/network"
"github.com/libp2p/go-libp2p/core/peer"
rcmgr "github.com/libp2p/go-libp2p/p2p/host/resource-manager"
rcmgrObs "github.com/libp2p/go-libp2p/p2p/host/resource-manager/obs"
)
// GossipSub overlay (mesh) parameters.
const (
	gossipSubD   = 8  // target peer count for a stable topic mesh
	gossipSubDlo = 6  // low watermark for a stable topic mesh
	gossipSubDhi = 12 // high watermark for a stable topic mesh
)

// GossipSub message-cache and gossip parameters.
const (
	gossipSubMcacheLen    = 6   // windows for which full messages are retained to answer `IWANT`
	gossipSubMcacheGossip = 3   // windows that are gossiped about
	gossipSubSeenTTL      = 550 // heartbeat intervals for which message IDs are remembered
)

// gossipSubHeartbeatInterval is the period between two gossipsub heartbeats.
const gossipSubHeartbeatInterval = 700 * time.Millisecond

// decayToZero is the terminal value used when decaying a value: the value at
// which a decaying quantity is treated as having reached its end.
const decayToZero = 0.01
// Sentinel bundles the consensus-layer p2p state used by this package: the
// discv5 listener, the libp2p host, the gossipsub instance and the peer pool.
// It is built by New and activated by Start.
type Sentinel struct {
// started is consulted by Start to warn when the sentinel is already running.
started bool
listener *discover.UDPv5 // this is us in the network.
// ctx is handed to the node database, discovery, handlers and gossip code.
ctx context.Context
// host is the libp2p host created in New.
host host.Host
cfg *SentinelConfig
// peers is the peer pool; entries are removed when a connection drops.
peers *peers.Pool
// httpApi is the router exposing the req/resp handler (see ReqRespHandler).
httpApi http.Handler
// metadataV2 is populated in createListener from the local node's sequence number.
metadataV2 *cltypes.Metadata
handshaker *handshake.HandShaker
// db is passed to the consensus stream handlers.
db persistence.RawBeaconBlockChain
// discoverConfig carries the discovery private key and the parsed bootnodes.
discoverConfig discover.Config
pubsub *pubsub.PubSub
// subManager is created in Start; it is nil before that.
subManager *GossipManager
// metrics is set to true by New.
metrics bool
// listenForPeersDoneCh receives a value from Stop.
// NOTE(review): the channel is not created anywhere in this part of the
// file - presumably listenForPeers initialises it; confirm.
listenForPeersDoneCh chan struct{}
logger log.Logger
}
// createLocalNode opens the node database under tmpDir and builds the local
// discovery node for privKey.
//
// The node record is populated with the IP, UDP port and TCP port entries,
// ipAddr/udpPort are also installed as the fallback endpoint, and finally
// s.setupENR is applied to the node. An error is returned only when the node
// database cannot be opened.
func (s *Sentinel) createLocalNode(
	privKey *ecdsa.PrivateKey,
	ipAddr net.IP,
	udpPort, tcpPort int,
	tmpDir string,
) (*enode.LocalNode, error) {
	nodeDB, err := enode.OpenDB(s.ctx, "", tmpDir)
	if err != nil {
		return nil, fmt.Errorf("could not open node's peer database: %w", err)
	}
	node := enode.NewLocalNode(nodeDB, privKey, s.logger)

	// Record entries, in the same order as before: ip, udp, tcp.
	node.Set(enr.IP(ipAddr))
	node.Set(enr.UDP(udpPort))
	node.Set(enr.TCP(tcpPort))

	// Fallback endpoint.
	node.SetFallbackIP(ipAddr)
	node.SetFallbackUDP(udpPort)

	s.setupENR(node)
	return node, nil
}
// SetStatus forwards status to the handshaker, replacing the status it holds.
func (s *Sentinel) SetStatus(status *cltypes.Status) {
	hs := s.handshaker
	hs.SetStatus(status)
}
// createListener binds the discovery UDP socket and starts the discv5 listener
// for this node.
//
// Besides creating the listener it builds the local node record, initialises
// s.metadataV2 from the record's sequence number and starts the consensus
// stream handlers.
//
// Only IPv4 addresses in s.cfg.IpAddr are accepted; any other value, including
// one that does not parse, yields an error. The socket is bound to the
// wildcard address on s.cfg.Port. If a step after the bind fails, the socket
// is closed again so that the port is not leaked.
func (s *Sentinel) createListener() (*discover.UDPv5, error) {
	var (
		ipAddr  = s.cfg.IpAddr
		port    = s.cfg.Port
		discCfg = s.discoverConfig
	)

	ip := net.ParseIP(ipAddr)
	if ip.To4() == nil {
		return nil, fmt.Errorf("IPV4 address not provided instead %s was provided", ipAddr)
	}

	var bindIP net.IP
	var networkVersion string

	// check for our network version
	switch {
	// 16 byte but no 4 byte representation: udp6.
	// NOTE: unreachable while the IPv4-only guard above is in place; kept so
	// that lifting the guard is enough to enable IPv6.
	case ip.To16() != nil && ip.To4() == nil:
		bindIP = net.IPv6zero
		networkVersion = "udp6"
	// a 4 byte representation exists: udp4
	case ip.To4() != nil:
		bindIP = net.IPv4zero
		networkVersion = "udp4"
	default:
		return nil, fmt.Errorf("bad ip address provided, %s was provided", ipAddr)
	}

	udpAddr := &net.UDPAddr{
		IP:   bindIP,
		Port: port,
	}
	conn, err := net.ListenUDP(networkVersion, udpAddr)
	if err != nil {
		return nil, err
	}

	localNode, err := s.createLocalNode(discCfg.PrivateKey, ip, port, int(s.cfg.TCPPort), s.cfg.TmpDir)
	if err != nil {
		// Best-effort cleanup; the original error is the one worth reporting.
		_ = conn.Close()
		return nil, err
	}

	// TODO: Set up proper attestation number
	s.metadataV2 = &cltypes.Metadata{
		SeqNumber: localNode.Seq(),
		Attnets:   0,
		Syncnets:  new(uint64),
	}

	// Start stream handlers
	handlers.NewConsensusHandlers(s.ctx, s.db, s.host, s.peers, s.cfg.BeaconConfig, s.cfg.GenesisConfig, s.metadataV2).Start()

	// Named "listener" rather than "net" so the net package is not shadowed.
	listener, err := discover.ListenV5(s.ctx, "any", conn, localNode, discCfg)
	if err != nil {
		// Best-effort cleanup; the original error is the one worth reporting.
		_ = conn.Close()
		return nil, err
	}
	return listener, nil
}
// New constructs a Sentinel from cfg. Discovery is not started here; the
// discovery listener is only created by Start.
//
// It performs the following steps, returning the first error encountered:
//   - parses cfg.NetworkConfig.BootNodes into enode records,
//   - generates a fresh private key for discovery,
//   - creates the libp2p host with the options from buildOptions, a resource
//     manager (auto-scaled default limits, stats trace reporter) and the
//     connection gater,
//   - creates the peer pool, the req/resp HTTP router and the handshaker,
//   - creates the gossipsub instance.
//
// As a side effect the package-level pubsub.TimeCacheDuration is set to
// gossipSubSeenTTL heartbeat intervals. If gossipsub cannot be created the
// already-created libp2p host is closed before returning.
func New(
	ctx context.Context,
	cfg *SentinelConfig,
	db persistence.RawBeaconBlockChain,
	logger log.Logger,
) (*Sentinel, error) {
	s := &Sentinel{
		ctx:     ctx,
		cfg:     cfg,
		db:      db,
		metrics: true,
		logger:  logger,
	}

	// Setup discovery
	enodes := make([]*enode.Node, len(cfg.NetworkConfig.BootNodes))
	for i, bootnode := range cfg.NetworkConfig.BootNodes {
		newNode, err := enode.Parse(enode.ValidSchemes, bootnode)
		if err != nil {
			return nil, err
		}
		enodes[i] = newNode
	}
	privateKey, err := crypto.GenerateKey()
	if err != nil {
		return nil, err
	}
	s.discoverConfig = discover.Config{
		PrivateKey: privateKey,
		Bootnodes:  enodes,
	}

	opts, err := buildOptions(cfg, s)
	if err != nil {
		return nil, err
	}
	str, err := rcmgrObs.NewStatsTraceReporter()
	if err != nil {
		return nil, err
	}
	rmgr, err := rcmgr.NewResourceManager(rcmgr.NewFixedLimiter(rcmgr.DefaultLimits.AutoScale()), rcmgr.WithTraceReporter(str))
	if err != nil {
		return nil, err
	}
	opts = append(opts, libp2p.ResourceManager(rmgr))

	gater, err := NewGater(cfg)
	if err != nil {
		return nil, err
	}
	opts = append(opts, libp2p.ConnectionGater(gater))

	// Named "h" rather than "host" so the imported host package is not shadowed.
	h, err := libp2p.New(opts...)
	if err != nil {
		return nil, err
	}
	s.host = h
	s.peers = peers.NewPool()

	mux := chi.NewRouter()
	// mux := httpreqresp.NewRequestHandler(host)
	mux.Get("/", httpreqresp.NewRequestHandler(h))
	s.httpApi = mux

	s.handshaker = handshake.New(ctx, cfg.GenesisConfig, cfg.BeaconConfig, s.httpApi)

	// Same value as before (550 heartbeats), expressed through the constant
	// instead of a repeated literal.
	pubsub.TimeCacheDuration = gossipSubSeenTTL * gossipSubHeartbeatInterval
	s.pubsub, err = pubsub.NewGossipSub(s.ctx, s.host, s.pubsubOptions()...)
	if err != nil {
		// The half-built Sentinel is discarded, so release the host it owns.
		_ = h.Close()
		return nil, fmt.Errorf("[Sentinel] failed to subscribe to gossip err=%w", err)
	}
	return s, nil
}
// ReqRespHandler exposes the HTTP handler that New registered for req/resp
// traffic.
func (s *Sentinel) ReqRespHandler() http.Handler {
	handler := s.httpApi
	return handler
}
// RecvGossip returns the gossip manager's receive channel of pubsub messages.
// The gossip manager is created in Start.
func (s *Sentinel) RecvGossip() <-chan *pubsub.Message {
	manager := s.subManager
	return manager.Recv()
}
// Start brings the sentinel online: it creates the discovery listener,
// connects to the bootnodes, registers connection notifications on the libp2p
// network, creates the gossip manager and launches listenForPeers in its own
// goroutine.
//
// Calling Start on a sentinel that has already been started logs a warning and
// returns nil without doing anything else, so the listener is never bound
// twice. An error is returned when the listener cannot be created or the
// bootnodes cannot be reached; in that case the sentinel is not marked as
// started.
func (s *Sentinel) Start() error {
	if s.started {
		s.logger.Warn("[Sentinel] already running")
		return nil
	}

	var err error
	s.listener, err = s.createListener()
	if err != nil {
		return fmt.Errorf("failed creating sentinel listener err=%w", err)
	}

	if err := s.connectToBootnodes(); err != nil {
		return fmt.Errorf("failed to connect to bootnodes err=%w", err)
	}

	// Configuring handshake
	s.host.Network().Notify(&network.NotifyBundle{
		ConnectedF: s.onConnection,
		// Drop the peer from the pool as soon as its connection goes away.
		DisconnectedF: func(n network.Network, c network.Conn) {
			peerId := c.RemotePeer()
			s.peers.RemovePeer(peerId)
		},
	})
	s.subManager = NewGossipManager(s.ctx)

	// Mark as running only once every fallible step has succeeded.
	s.started = true
	go s.listenForPeers()
	return nil
}
// Stop shuts the sentinel down: it signals the peer-listening loop through
// listenForPeersDoneCh, then closes the discovery listener, the gossip manager
// and the libp2p host, in that order.
//
// Components that were never created (for example because Start was not
// called or failed early) are skipped. Without these guards a send on the nil
// channel would block forever and the nil listener / gossip manager would be
// dereferenced.
func (s *Sentinel) Stop() {
	if s.listenForPeersDoneCh != nil {
		s.listenForPeersDoneCh <- struct{}{}
	}
	if s.listener != nil {
		s.listener.Close()
	}
	if s.subManager != nil {
		s.subManager.Close()
	}
	s.host.Close()
}
// String returns the textual form of the discovery listener's own node record.
// The listener is created in Start.
func (s *Sentinel) String() string {
	self := s.listener.Self()
	return self.String()
}
// HasTooManyPeers reports whether the current peer count has reached
// peers.DefaultMaxPeers.
func (s *Sentinel) HasTooManyPeers() bool {
	count := s.GetPeersCount()
	return count >= peers.DefaultMaxPeers
}
// GetPeersCount returns the number of peers reported by the libp2p host's
// network.
//
// A variant that counted the peers of the beacon-block topic subscription
// instead is currently disabled:
//
//	sub := s.subManager.GetMatchingSubscription(string(BeaconBlockTopic))
//	if sub == nil { /* fall back to the host network */ }
//	return len(sub.topic.ListPeers())
func (s *Sentinel) GetPeersCount() int {
	connected := s.host.Network().Peers()
	return len(connected)
}
// Host returns the libp2p host owned by this sentinel.
func (s *Sentinel) Host() host.Host {
	h := s.host
	return h
}
// Peers returns the sentinel's peer pool.
func (s *Sentinel) Peers() *peers.Pool {
	pool := s.peers
	return pool
}
// GossipManager returns the gossip manager created by Start; it is nil before
// Start has created it.
func (s *Sentinel) GossipManager() *GossipManager {
	manager := s.subManager
	return manager
}
// Config returns the configuration the sentinel was constructed with.
func (s *Sentinel) Config() *SentinelConfig {
	cfg := s.cfg
	return cfg
}
// Status returns the status currently held by the handshaker.
func (s *Sentinel) Status() *cltypes.Status {
	hs := s.handshaker
	return hs.Status()
}
// PeersList returns the peerstore address info of every peer reported by the
// libp2p host's network, in the order the network lists them. The result is
// never nil; it is empty when the network reports no peers.
func (s *Sentinel) PeersList() []peer.AddrInfo {
	nw := s.host.Network()
	pids := nw.Peers()

	infos := make([]peer.AddrInfo, len(pids))
	for i := range pids {
		infos[i] = nw.Peerstore().PeerInfo(pids[i])
	}
	return infos
}