From 42da392edcac20bc528f91ae53fd7ff7b0ce808d Mon Sep 17 00:00:00 2001 From: realbigsean Date: Tue, 31 Oct 2023 21:04:18 +0000 Subject: [PATCH] fix deneb sync bug (#4869) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Issue Addressed I observed our forward sync on devnet 9 would stall when we would hit this log: ``` 250425:Oct 19 00:54:17.133 WARN Blocks and blobs request for range received invalid data, error: KzgCommitmentMismatch, batch_id: 4338, peer_id: 16Uiu2HAmHbmkEQFDrJfNuy1aYyAfHkNUwSD9FN7EVAqGJ8YTF9Mh, service: sync, module: network::sync::manager:1036 ``` ## Proposed Changes `range_sync_block_and_blob_response` [here](https://github.com/sigp/lighthouse/blob/1cb02a13a53d0e603ad5920c03832e5779c3df61/beacon_node/network/src/sync/manager.rs#L1013) removes the request from the sync manager. later, however if there's an error, `inject_error` [here](https://github.com/sigp/lighthouse/blob/1cb02a13a53d0e603ad5920c03832e5779c3df61/beacon_node/network/src/sync/manager.rs#L1055) expects the request to exist so we can handle retry logic. So this PR just re-inserts the request (withthout any accumulated blobs or blocks) when we hit an error here. The issue is unique to block+blob sync because the error here is only possible from mismatches between blocks + blobs after we've downloaded both, there's no equivalent error in block sync Co-authored-by: realbigsean --- beacon_node/network/src/sync/manager.rs | 16 ++++++++++++++++ .../network/src/sync/network_context.rs | 18 ++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index 90fa16929..c1e9cde3f 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -43,6 +43,7 @@ use crate::service::NetworkMessage; use crate::status::ToStatusMessage; use crate::sync::block_lookups::common::{Current, Parent}; use crate::sync::block_lookups::{BlobRequestState, BlockRequestState}; +use crate::sync::network_context::BlocksAndBlobsByRangeRequest; use crate::sync::range_sync::ByRangeRequestType; use beacon_chain::block_verification_types::AsBlock; use beacon_chain::block_verification_types::RpcBlock; @@ -1031,6 +1032,14 @@ impl SyncManager { } } Err(e) => { + // Re-insert the request so we can retry + let new_req = BlocksAndBlobsByRangeRequest { + chain_id, + batch_id: resp.batch_id, + block_blob_info: <_>::default(), + }; + self.network + .insert_range_blocks_and_blobs_request(id, new_req); // inform range that the request needs to be treated as failed // With time we will want to downgrade this log warn!( @@ -1090,6 +1099,13 @@ impl SyncManager { } } Err(e) => { + // Re-insert the request so we can retry + self.network.insert_backfill_blocks_and_blobs_requests( + id, + resp.batch_id, + <_>::default(), + ); + // inform backfill that the request needs to be treated as failed // With time we will want to downgrade this log warn!( diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index ab18c0abd..c01bc3e42 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -591,4 +591,22 @@ impl SyncNetworkContext { ByRangeRequestType::Blocks } } + + pub fn insert_range_blocks_and_blobs_request( + &mut self, + id: Id, + request: BlocksAndBlobsByRangeRequest, + ) { + self.range_blocks_and_blobs_requests.insert(id, request); + } + + pub fn insert_backfill_blocks_and_blobs_requests( + &mut self, + id: Id, + batch_id: BatchId, + request: BlocksAndBlobsRequestInfo, + ) { + self.backfill_blocks_and_blobs_requests + .insert(id, (batch_id, request)); + } }