From 2de26b20f80579db6c794d5f25f6326783973372 Mon Sep 17 00:00:00 2001 From: Paul Hauner Date: Wed, 10 Aug 2022 07:52:57 +0000 Subject: [PATCH] Don't return errors on HTTP API for already-known messages (#3341) ## Issue Addressed - Resolves #3266 ## Proposed Changes Return 200 OK rather than an error when a block, attestation or sync message is already known. Presently, we will log and return an error which causes a BN to go "offline" from the VCs perspective which causes the fallback mechanism to do work to try and avoid and upcheck offline nodes. This can be observed as instability in the `vc_beacon_nodes_available_count` metric. The current behaviour also causes scary logs for the user. There's nothing to *actually* be concerned about when we see duplicate messages, this can happen on fallback systems (see code comments). ## Additional Info NA --- beacon_node/http_api/src/lib.rs | 53 +++++++++++++++++++++ beacon_node/http_api/src/publish_blocks.rs | 25 +++++++++- beacon_node/http_api/src/sync_committees.rs | 30 +++++++++++- 3 files changed, 105 insertions(+), 3 deletions(-) diff --git a/beacon_node/http_api/src/lib.rs b/beacon_node/http_api/src/lib.rs index ba1dd01cc..ec34dd066 100644 --- a/beacon_node/http_api/src/lib.rs +++ b/beacon_node/http_api/src/lib.rs @@ -1168,12 +1168,46 @@ pub fn serve( blocking_json_task(move || { let seen_timestamp = timestamp_now(); let mut failures = Vec::new(); + let mut num_already_known = 0; for (index, attestation) in attestations.as_slice().iter().enumerate() { let attestation = match chain .verify_unaggregated_attestation_for_gossip(attestation, None) { Ok(attestation) => attestation, + Err(AttnError::PriorAttestationKnown { .. }) => { + num_already_known += 1; + + // Skip to the next attestation since an attestation for this + // validator is already known in this epoch. + // + // There's little value for the network in validating a second + // attestation for another validator since it is either: + // + // 1. A duplicate. 
+ // 2. Slashable. + // 3. Invalid. + // + // We are likely to get duplicates in the case where a VC is using + // fallback BNs. If the first BN actually publishes some/all of a + // batch of attestations but fails to respond in a timely fashion, + // the VC is likely to try publishing the attestations on another + // BN. That second BN may have already seen the attestations from + // the first BN and therefore indicate that the attestations are + // "already seen". An attestation that has already been seen has + // been published on the network so there's no actual error from + // the perspective of the user. + // + // It's better to prevent slashable attestations from ever + // appearing on the network than trying to slash validators, + // especially those validators connected to the local API. + // + // There might be *some* value in determining that this attestation + // is invalid, but since a valid attestation already exists it + // appears that this validator is capable of producing valid + // attestations and there's no immediate cause for concern. + continue; + } Err(e) => { error!(log, "Failure verifying attestation for gossip"; @@ -1240,6 +1274,15 @@ pub fn serve( )); } } + + if num_already_known > 0 { + debug!( + log, + "Some unagg attestations already known"; + "count" => num_already_known + ); + } + if failures.is_empty() { Ok(()) } else { @@ -2234,6 +2277,16 @@ pub fn serve( // identical aggregates, especially if they're using the same beacon // node. Err(AttnError::AttestationAlreadyKnown(_)) => continue, + // If we've already seen this aggregator produce an aggregate, just + // skip this one. + // + // We're likely to see this with VCs that use fallback BNs. The first + // BN might time-out *after* publishing the aggregate and then the + // second BN will indicate it's already seen the aggregate. + // + // There's no actual error for the user or the network since the + // aggregate has been successfully published by some other node. 
+ Err(AttnError::AggregatorAlreadyKnown(_)) => continue, Err(e) => { error!(log, "Failure verifying aggregate and proofs"; diff --git a/beacon_node/http_api/src/publish_blocks.rs b/beacon_node/http_api/src/publish_blocks.rs index b282e6f49..60ca8f232 100644 --- a/beacon_node/http_api/src/publish_blocks.rs +++ b/beacon_node/http_api/src/publish_blocks.rs @@ -1,9 +1,9 @@ use crate::metrics; use beacon_chain::validator_monitor::{get_block_delay_ms, timestamp_now}; -use beacon_chain::{BeaconChain, BeaconChainTypes, CountUnrealized}; +use beacon_chain::{BeaconChain, BeaconChainTypes, BlockError, CountUnrealized}; use lighthouse_network::PubsubMessage; use network::NetworkMessage; -use slog::{crit, error, info, Logger}; +use slog::{crit, error, info, warn, Logger}; use slot_clock::SlotClock; use std::sync::Arc; use tokio::sync::mpsc::UnboundedSender; @@ -86,6 +86,27 @@ pub async fn publish_block( Ok(()) } + Err(BlockError::BlockIsAlreadyKnown) => { + info!( + log, + "Block from HTTP API already known"; + "block" => ?block.canonical_root(), + "slot" => block.slot(), + ); + Ok(()) + } + Err(BlockError::RepeatProposal { proposer, slot }) => { + warn!( + log, + "Block ignored due to repeat proposal"; + "msg" => "this can happen when a VC uses fallback BNs. 
\ + whilst this is not necessarily an error, it can indicate issues with a BN \ + or between the VC and BN.", + "slot" => slot, + "proposer" => proposer, + ); + Ok(()) + } Err(e) => { let msg = format!("{:?}", e); error!( diff --git a/beacon_node/http_api/src/sync_committees.rs b/beacon_node/http_api/src/sync_committees.rs index 77becef7d..a6acf308f 100644 --- a/beacon_node/http_api/src/sync_committees.rs +++ b/beacon_node/http_api/src/sync_committees.rs @@ -11,7 +11,7 @@ use beacon_chain::{ use eth2::types::{self as api_types}; use lighthouse_network::PubsubMessage; use network::NetworkMessage; -use slog::{error, warn, Logger}; +use slog::{debug, error, warn, Logger}; use slot_clock::SlotClock; use std::cmp::max; use std::collections::HashMap; @@ -189,6 +189,24 @@ pub fn process_sync_committee_signatures( verified_for_pool = Some(verified); } + // If this validator has already published a sync message, just ignore this message + // without returning an error. + // + // This is likely to happen when a VC uses fallback BNs. If the first BN publishes + // the message and then fails to respond in a timely fashion then the VC will move + // to the second BN. The BN will then report that this message has already been + // seen, which is not actually an error as far as the network or user are concerned. + Err(SyncVerificationError::PriorSyncCommitteeMessageKnown { + validator_index, + slot, + }) => { + debug!( + log, + "Ignoring already-known sync message"; + "slot" => slot, + "validator_index" => validator_index, + ); + } Err(e) => { error!( log, @@ -283,6 +301,16 @@ pub fn process_signed_contribution_and_proofs( // If we already know the contribution, don't broadcast it or attempt to // further verify it. Return success. Err(SyncVerificationError::SyncContributionAlreadyKnown(_)) => continue, + // If we've already seen this aggregator produce an aggregate, just + // skip this one. + // + // We're likely to see this with VCs that use fallback BNs. 
The first + // BN might time-out *after* publishing the aggregate and then the + // second BN will indicate it's already seen the aggregate. + // + // There's no actual error for the user or the network since the + // aggregate has been successfully published by some other node. + Err(SyncVerificationError::AggregatorAlreadyKnown(_)) => continue, Err(e) => { error!( log,