From 2de26b20f80579db6c794d5f25f6326783973372 Mon Sep 17 00:00:00 2001 From: Paul Hauner Date: Wed, 10 Aug 2022 07:52:57 +0000 Subject: [PATCH] Don't return errors on HTTP API for already-known messages (#3341) ## Issue Addressed - Resolves #3266 ## Proposed Changes Return 200 OK rather than an error when a block, attestation or sync message is already known. Presently, we will log and return an error which causes a BN to go "offline" from the VCs perspective which causes the fallback mechanism to do work to try and avoid and upcheck offline nodes. This can be observed as instability in the `vc_beacon_nodes_available_count` metric. The current behaviour also causes scary logs for the user. There's nothing to *actually* be concerned about when we see duplicate messages, this can happen on fallback systems (see code comments). ## Additional Info NA --- beacon_node/http_api/src/lib.rs | 53 +++++++++++++++++++++ beacon_node/http_api/src/publish_blocks.rs | 25 +++++++++- beacon_node/http_api/src/sync_committees.rs | 30 +++++++++++- 3 files changed, 105 insertions(+), 3 deletions(-) diff --git a/beacon_node/http_api/src/lib.rs b/beacon_node/http_api/src/lib.rs index ba1dd01cc..ec34dd066 100644 --- a/beacon_node/http_api/src/lib.rs +++ b/beacon_node/http_api/src/lib.rs @@ -1168,12 +1168,46 @@ pub fn serve( blocking_json_task(move || { let seen_timestamp = timestamp_now(); let mut failures = Vec::new(); + let mut num_already_known = 0; for (index, attestation) in attestations.as_slice().iter().enumerate() { let attestation = match chain .verify_unaggregated_attestation_for_gossip(attestation, None) { Ok(attestation) => attestation, + Err(AttnError::PriorAttestationKnown { .. }) => { + num_already_known += 1; + + // Skip to the next attestation since an attestation for this + // validator is already known in this epoch. + // + // There's little value for the network in validating a second + // attestation for another validator since it is either: + // + // 1. A duplicate. 
+ // 2. Slashable. + // 3. Invalid. + // + // We are likely to get duplicates in the case where a VC is using + // fallback BNs. If the first BN actually publishes some/all of a + // batch of attestations but fails to respond in a timely fashion, + // the VC is likely to try publishing the attestations on another + // BN. That second BN may have already seen the attestations from + // the first BN and therefore indicate that the attestations are + // "already seen". An attestation that has already been seen has + // been published on the network so there's no actual error from + // the perspective of the user. + // + // It's better to prevent slashable attestations from ever + // appearing on the network than trying to slash validators, + // especially those validators connected to the local API. + // + // There might be *some* value in determining that this attestation + // is invalid, but since a valid attestation already exists it + // appears that this validator is capable of producing valid + // attestations and there's no immediate cause for concern. + continue; + } Err(e) => { error!(log, "Failure verifying attestation for gossip"; @@ -1240,6 +1274,15 @@ pub fn serve( )); } } + + if num_already_known > 0 { + debug!( + log, + "Some unagg attestations already known"; + "count" => num_already_known + ); + } + if failures.is_empty() { Ok(()) } else { @@ -2234,6 +2277,16 @@ pub fn serve( // identical aggregates, especially if they're using the same beacon // node. Err(AttnError::AttestationAlreadyKnown(_)) => continue, + // If we've already seen this aggregator produce an aggregate, just + // skip this one. + // + // We're likely to see this with VCs that use fallback BNs. The first + // BN might time-out *after* publishing the aggregate and then the + // second BN will indicate it's already seen the aggregate. + // + // There's no actual error for the user or the network since the + // aggregate has been successfully published by some other node. 
+ Err(AttnError::AggregatorAlreadyKnown(_)) => continue, Err(e) => { error!(log, "Failure verifying aggregate and proofs"; diff --git a/beacon_node/http_api/src/publish_blocks.rs b/beacon_node/http_api/src/publish_blocks.rs index b282e6f49..60ca8f232 100644 --- a/beacon_node/http_api/src/publish_blocks.rs +++ b/beacon_node/http_api/src/publish_blocks.rs @@ -1,9 +1,9 @@ use crate::metrics; use beacon_chain::validator_monitor::{get_block_delay_ms, timestamp_now}; -use beacon_chain::{BeaconChain, BeaconChainTypes, CountUnrealized}; +use beacon_chain::{BeaconChain, BeaconChainTypes, BlockError, CountUnrealized}; use lighthouse_network::PubsubMessage; use network::NetworkMessage; -use slog::{crit, error, info, Logger}; +use slog::{crit, error, info, warn, Logger}; use slot_clock::SlotClock; use std::sync::Arc; use tokio::sync::mpsc::UnboundedSender; @@ -86,6 +86,27 @@ pub async fn publish_block( Ok(()) } + Err(BlockError::BlockIsAlreadyKnown) => { + info!( + log, + "Block from HTTP API already known"; + "block" => ?block.canonical_root(), + "slot" => block.slot(), + ); + Ok(()) + } + Err(BlockError::RepeatProposal { proposer, slot }) => { + warn!( + log, + "Block ignored due to repeat proposal"; + "msg" => "this can happen when a VC uses fallback BNs. 
\ + whilst this is not necessarily an error, it can indicate issues with a BN \ + or between the VC and BN.", + "slot" => slot, + "proposer" => proposer, + ); + Ok(()) + } Err(e) => { let msg = format!("{:?}", e); error!( diff --git a/beacon_node/http_api/src/sync_committees.rs b/beacon_node/http_api/src/sync_committees.rs index 77becef7d..a6acf308f 100644 --- a/beacon_node/http_api/src/sync_committees.rs +++ b/beacon_node/http_api/src/sync_committees.rs @@ -11,7 +11,7 @@ use beacon_chain::{ use eth2::types::{self as api_types}; use lighthouse_network::PubsubMessage; use network::NetworkMessage; -use slog::{error, warn, Logger}; +use slog::{debug, error, warn, Logger}; use slot_clock::SlotClock; use std::cmp::max; use std::collections::HashMap; @@ -189,6 +189,24 @@ pub fn process_sync_committee_signatures( verified_for_pool = Some(verified); } + // If this validator has already published a sync message, just ignore this message + // without returning an error. + // + // This is likely to happen when a VC uses fallback BNs. If the first BN publishes + // the message and then fails to respond in a timely fashion then the VC will move + // to the second BN. The BN will then report that this message has already been + // seen, which is not actually an error as far as the network or user are concerned. + Err(SyncVerificationError::PriorSyncCommitteeMessageKnown { + validator_index, + slot, + }) => { + debug!( + log, + "Ignoring already-known sync message"; + "slot" => slot, + "validator_index" => validator_index, + ); + } Err(e) => { error!( log, @@ -283,6 +301,16 @@ pub fn process_signed_contribution_and_proofs( // If we already know the contribution, don't broadcast it or attempt to // further verify it. Return success. Err(SyncVerificationError::SyncContributionAlreadyKnown(_)) => continue, + // If we've already seen this aggregator produce an aggregate, just + // skip this one. + // + // We're likely to see this with VCs that use fallback BNs. 
The first + // BN might time-out *after* publishing the aggregate and then the + // second BN will indicate it's already seen the aggregate. + // + // There's no actual error for the user or the network since the + // aggregate has been successfully published by some other node. + Err(SyncVerificationError::AggregatorAlreadyKnown(_)) => continue, Err(e) => { error!( log,