From e206d3f8975bd98cc86d14055dca40f996bacc60 Mon Sep 17 00:00:00 2001
From: rjl493456442
Date: Fri, 8 Dec 2023 21:28:23 +0800
Subject: [PATCH] trie: remove inconsistent trie nodes during sync in path
 mode (#28595)

This fixes a database corruption issue that could occur during state healing.
If sync is aborted while certain modifications have already been committed,
and a reorg occurs afterwards, the database is left containing incorrect trie
nodes stored by path. These nodes need to be detected and deleted in order to
obtain a complete and fully correct state after state healing.

---------

Co-authored-by: Felix Lange
---
 ethdb/dbtest/testsuite.go |   6 +-
 trie/sync.go              | 223 ++++++++++++++++++++++++--------------
 trie/sync_test.go         | 125 ++++++++++++++++++++-
 3 files changed, 270 insertions(+), 84 deletions(-)

diff --git a/ethdb/dbtest/testsuite.go b/ethdb/dbtest/testsuite.go
index 0d3d5f5aa..29bd24364 100644
--- a/ethdb/dbtest/testsuite.go
+++ b/ethdb/dbtest/testsuite.go
@@ -273,9 +273,13 @@ func TestDatabaseSuite(t *testing.T, New func() ethdb.KeyValueStore) {
 		b.Put([]byte("5"), nil)
 		b.Delete([]byte("1"))
 		b.Put([]byte("6"), nil)
-		b.Delete([]byte("3"))
+
+		b.Delete([]byte("3")) // delete then put
 		b.Put([]byte("3"), nil)
+		b.Put([]byte("7"), nil) // put then delete
+		b.Delete([]byte("7"))
+
 		if err := b.Write(); err != nil {
 			t.Fatal(err)
 		}
diff --git a/trie/sync.go b/trie/sync.go
index 8eaed9f21..589d28364 100644
--- a/trie/sync.go
+++ b/trie/sync.go
@@ -116,10 +116,9 @@ type LeafCallback func(keys [][]byte, path []byte, leaf []byte, parent common.Ha
 // nodeRequest represents a scheduled or already in-flight trie node retrieval request.
 type nodeRequest struct {
-	hash    common.Hash // Hash of the trie node to retrieve
-	path    []byte      // Merkle path leading to this node for prioritization
-	data    []byte      // Data content of the node, cached until all subtrees complete
-	deletes [][]byte    // List of internal path segments for trie nodes to delete
+	hash common.Hash // Hash of the trie node to retrieve
+	path []byte      // Merkle path leading to this node for prioritization
+	data []byte      // Data content of the node, cached until all subtrees complete

 	parent *nodeRequest // Parent state node referencing this entry
 	deps   int          // Number of dependencies before allowed to commit this node
@@ -146,38 +145,85 @@ type CodeSyncResult struct {
 	Data []byte // Data content of the retrieved bytecode
 }

+// nodeOp represents an operation on a trie node. It can either represent the
+// deletion of a specific node or the write of a retrieved node for persisting.
+type nodeOp struct {
+	owner common.Hash // identifier of the trie (empty for account trie)
+	path  []byte      // path from the root to the specified node
+	blob  []byte      // the content of the node (nil for deletion)
+	hash  common.Hash // hash of the node content (empty for node deletion)
+}
+
+// isDelete indicates if the operation is a database deletion.
+func (op *nodeOp) isDelete() bool {
+	return len(op.blob) == 0
+}
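The membatch below deliberately keeps nodes as an ordered `[]nodeOp` slice rather than the old per-kind maps: the same path can first be queued for deletion (a stale pre-existing node) and later receive a write once the fresh node arrives, so replay order matters. That is also why the testsuite change above pins down delete-then-put and put-then-delete ordering at the ethdb batch level. A minimal, self-contained sketch of the ordering property (hypothetical paths, and an in-memory map standing in for the database):

```go
package main

import "fmt"

// nodeOp mirrors the struct added by this patch: an empty blob marks a deletion.
type nodeOp struct {
	path []byte
	blob []byte
}

func main() {
	// Delete-then-write on the same path must leave the fresh write in place.
	ops := []nodeOp{
		{path: []byte("abc")},                  // delete the stale node first
		{path: []byte("abc"), blob: []byte{1}}, // then persist the fresh one
	}
	db := map[string][]byte{"abc": {0xff}} // pre-existing stale entry
	for _, op := range ops {
		if len(op.blob) == 0 {
			delete(db, string(op.path))
		} else {
			db[string(op.path)] = op.blob
		}
	}
	fmt.Printf("final value: %x\n", db["abc"]) // 01, not ff
}
```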
 // syncMemBatch is an in-memory buffer of successfully downloaded but not yet
 // persisted data items.
 type syncMemBatch struct {
-	nodes   map[string][]byte      // In-memory membatch of recently completed nodes
-	hashes  map[string]common.Hash // Hashes of recently completed nodes
-	deletes map[string]struct{}    // List of paths for trie node to delete
-	codes   map[common.Hash][]byte // In-memory membatch of recently completed codes
-	size    uint64                 // Estimated batch-size of in-memory data.
+	scheme string                 // State scheme identifier
+	codes  map[common.Hash][]byte // In-memory batch of recently completed codes
+	nodes  []nodeOp               // In-memory batch of recently completed/deleted nodes
+	size   uint64                 // Estimated batch-size of in-memory data.
 }

 // newSyncMemBatch allocates a new memory-buffer for not-yet persisted trie nodes.
-func newSyncMemBatch() *syncMemBatch {
+func newSyncMemBatch(scheme string) *syncMemBatch {
 	return &syncMemBatch{
-		nodes:   make(map[string][]byte),
-		hashes:  make(map[string]common.Hash),
-		deletes: make(map[string]struct{}),
-		codes:   make(map[common.Hash][]byte),
+		scheme: scheme,
+		codes:  make(map[common.Hash][]byte),
 	}
 }

-// hasNode reports the trie node with specific path is already cached.
-func (batch *syncMemBatch) hasNode(path []byte) bool {
-	_, ok := batch.nodes[string(path)]
-	return ok
-}
-
 // hasCode reports whether the contract code with the specific hash is already cached.
 func (batch *syncMemBatch) hasCode(hash common.Hash) bool {
 	_, ok := batch.codes[hash]
 	return ok
 }

+// addCode caches a contract code database write operation.
+func (batch *syncMemBatch) addCode(hash common.Hash, code []byte) {
+	batch.codes[hash] = code
+	batch.size += common.HashLength + uint64(len(code))
+}
+
+// addNode caches a node database write operation.
+func (batch *syncMemBatch) addNode(owner common.Hash, path []byte, blob []byte, hash common.Hash) {
+	if batch.scheme == rawdb.PathScheme {
+		if owner == (common.Hash{}) {
+			batch.size += uint64(len(path) + len(blob))
+		} else {
+			batch.size += common.HashLength + uint64(len(path)+len(blob))
+		}
+	} else {
+		batch.size += common.HashLength + uint64(len(blob))
+	}
+	batch.nodes = append(batch.nodes, nodeOp{
+		owner: owner,
+		path:  path,
+		blob:  blob,
+		hash:  hash,
+	})
+}
+
+// delNode caches a node database delete operation.
+func (batch *syncMemBatch) delNode(owner common.Hash, path []byte) {
+	if batch.scheme != rawdb.PathScheme {
+		log.Error("Unexpected node deletion", "owner", owner, "path", path, "scheme", batch.scheme)
+		return // deletion is not supported in hash mode.
+	}
+	if owner == (common.Hash{}) {
+		batch.size += uint64(len(path))
+	} else {
+		batch.size += common.HashLength + uint64(len(path))
+	}
+	batch.nodes = append(batch.nodes, nodeOp{
+		owner: owner,
+		path:  path,
+	})
+}
+
 // Sync is the main state trie synchronisation scheduler, which provides yet
 // unknown trie hashes to retrieve, accepts node data associated with said hashes
 // and reconstructs the trie step by step until all is done.
@@ -196,7 +242,7 @@ func NewSync(root common.Hash, database ethdb.KeyValueReader, callback LeafCallb
 	ts := &Sync{
 		scheme:   scheme,
 		database: database,
-		membatch: newSyncMemBatch(),
+		membatch: newSyncMemBatch(scheme),
 		nodeReqs: make(map[string]*nodeRequest),
 		codeReqs: make(map[common.Hash]*codeRequest),
 		queue:    prque.New[int64, any](nil), // Ugh, can contain both string and hash, whyyy
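For orientation, this is roughly how a caller is expected to drive the scheduler: drain Missing, feed the retrieved blobs through ProcessNode, then flush with Commit into a single database batch. A sketch under stated assumptions — fetchNode is a hypothetical stand-in for the network retrieval layer, and error handling is reduced to log.Fatal:

```go
package sketch

import (
	"log"

	"github.com/ethereum/go-ethereum/common"
	"github.com/ethereum/go-ethereum/ethdb"
	"github.com/ethereum/go-ethereum/trie"
)

// driveSync loops until the scheduler reports nothing missing. fetchNode is a
// hypothetical callback standing in for the network layer.
func driveSync(sched *trie.Sync, db ethdb.Database, fetchNode func(path string, hash common.Hash) []byte) {
	for {
		paths, hashes, _ := sched.Missing(0) // 0 = no cap on returned work items
		if len(paths) == 0 {
			return
		}
		for i, path := range paths {
			res := trie.NodeSyncResult{Path: path, Data: fetchNode(path, hashes[i])}
			if err := sched.ProcessNode(res); err != nil {
				log.Fatal(err)
			}
		}
		// All completed nodes (and queued deletions) land in one atomic batch.
		batch := db.NewBatch()
		if err := sched.Commit(batch); err != nil {
			log.Fatal(err)
		}
		if err := batch.Write(); err != nil {
			log.Fatal(err)
		}
	}
}
```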
@@ -210,16 +256,17 @@
 // parent for completion tracking. The given path is a unique node path in
 // hex format and contains the full parent path if it's a layered trie node.
 func (s *Sync) AddSubTrie(root common.Hash, path []byte, parent common.Hash, parentPath []byte, callback LeafCallback) {
-	// Short circuit if the trie is empty or already known
 	if root == types.EmptyRootHash {
 		return
 	}
-	if s.membatch.hasNode(path) {
-		return
-	}
 	owner, inner := ResolvePath(path)
-	if rawdb.HasTrieNode(s.database, owner, inner, root, s.scheme) {
+	exist, inconsistent := s.hasNode(owner, inner, root)
+	if exist {
+		// The entire subtrie is already present in the database.
 		return
+	} else if inconsistent {
+		// There is a pre-existing node with the wrong hash in DB, remove it.
+		s.membatch.delNode(owner, inner)
 	}
 	// Assemble the new sub-trie sync request
 	req := &nodeRequest{
@@ -371,39 +418,42 @@ func (s *Sync) ProcessNode(result NodeSyncResult) error {
 }

 // Commit flushes the data stored in the internal membatch out to persistent
-// storage, returning any occurred error.
+// storage, returning any error that occurred. The whole data set will be
+// flushed in one atomic database batch.
 func (s *Sync) Commit(dbw ethdb.Batch) error {
 	// Flush the pending node writes into database batch.
 	var (
 		account int
 		storage int
 	)
-	for path, value := range s.membatch.nodes {
-		owner, inner := ResolvePath([]byte(path))
-		if owner == (common.Hash{}) {
-			account += 1
+	for _, op := range s.membatch.nodes {
+		if op.isDelete() {
+			// Node deletion is only supported in path mode.
+			if op.owner == (common.Hash{}) {
+				rawdb.DeleteAccountTrieNode(dbw, op.path)
+			} else {
+				rawdb.DeleteStorageTrieNode(dbw, op.owner, op.path)
+			}
+			deletionGauge.Inc(1)
 		} else {
-			storage += 1
+			if op.owner == (common.Hash{}) {
+				account += 1
+			} else {
+				storage += 1
+			}
+			rawdb.WriteTrieNode(dbw, op.owner, op.path, op.hash, op.blob, s.scheme)
 		}
-		rawdb.WriteTrieNode(dbw, owner, inner, s.membatch.hashes[path], value, s.scheme)
 	}
 	accountNodeSyncedGauge.Inc(int64(account))
 	storageNodeSyncedGauge.Inc(int64(storage))
-	// Flush the pending node deletes into the database batch.
-	// Please note that each written and deleted node has a
-	// unique path, ensuring no duplication occurs.
-	for path := range s.membatch.deletes {
-		owner, inner := ResolvePath([]byte(path))
-		rawdb.DeleteTrieNode(dbw, owner, inner, common.Hash{} /* unused */, s.scheme)
-	}
 	// Flush the pending code writes into database batch.
 	for hash, value := range s.membatch.codes {
 		rawdb.WriteCode(dbw, hash, value)
 	}
 	codeSyncedGauge.Inc(int64(len(s.membatch.codes)))
-	s.membatch = newSyncMemBatch() // reset the batch
+	s.membatch = newSyncMemBatch(s.scheme) // reset the batch
 	return nil
 }
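Both AddSubTrie above and the child check below rely on the same three-way lookup outcome. A compact sketch of that contract — triage is a hypothetical helper mirroring the logic of the hasNode method added later in this patch:

```go
package main

import (
	"fmt"

	"github.com/ethereum/go-ethereum/common"
)

// triage mirrors hasNode's contract: a node is "inconsistent" when something
// is stored at the path but its hash differs from the expected one.
func triage(dbHash, want common.Hash, blob []byte) (exists, inconsistent bool) {
	exists = dbHash == want
	inconsistent = !exists && len(blob) != 0
	return exists, inconsistent
}

func main() {
	a, b := common.Hash{0x1}, common.Hash{0x2}
	fmt.Println(triage(a, a, []byte{0xde}))    // true false  -> skip, subtrie complete
	fmt.Println(triage(a, b, []byte{0xde}))    // false true  -> delete stale node, re-fetch
	fmt.Println(triage(common.Hash{}, b, nil)) // false false -> simply fetch
}
```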
@@ -476,12 +526,15 @@ func (s *Sync) children(req *nodeRequest, object node) ([]*nodeRequest, error) {
 		// Mark all internal nodes between shortNode and its **in disk**
 		// child as invalid. This is essential in the case of path mode
 		// scheme; otherwise, state healing might overwrite existing child
 		// nodes silently while leaving a dangling parent node within the
-		// range of this internal path on disk. This would break the
-		// guarantee for state healing.
+		// range of this internal path on disk. The persistent state would
+		// then end up in the weird situation that nodes on the same path
+		// are inconsistent even though they are all present in the database.
+		// This property would break the guarantee of state healing.
 		//
 		// While it's possible for this shortNode to overwrite a previously
 		// existing full node, the other branches of the fullNode can be
-		// retained as they remain untouched and complete.
+		// retained, as they are no longer accessible via the new shortNode
+		// while their whole sub-trie remains untouched and complete.
 		//
 		// This step is only necessary for path mode, as there is no deletion
 		// in hash mode at all.
@@ -498,8 +551,7 @@ func (s *Sync) children(req *nodeRequest, object node) ([]*nodeRequest, error) {
 				exists = rawdb.ExistsStorageTrieNode(s.database, owner, append(inner, key[:i]...))
 			}
 			if exists {
-				req.deletes = append(req.deletes, key[:i])
-				deletionGauge.Inc(1)
+				s.membatch.delNode(owner, append(inner, key[:i]...))
 				log.Debug("Detected dangling node", "owner", owner, "path", append(inner, key[:i]...))
 			}
 		}
@@ -521,6 +573,7 @@ func (s *Sync) children(req *nodeRequest, object node) ([]*nodeRequest, error) {
 	var (
 		missing = make(chan *nodeRequest, len(children))
 		pending sync.WaitGroup
+		batchMu sync.Mutex
 	)
 	for _, child := range children {
 		// Notify any external watcher of a new key/value node
 		if req.callback != nil {
 			if node, ok := (child.node).(valueNode); ok {
 				var paths [][]byte
 				if len(child.path) == 2*common.HashLength {
 					paths = append(paths, hexToKeybytes(child.path))
 				} else if len(child.path) == 4*common.HashLength {
 					paths = append(paths, hexToKeybytes(child.path[:2*common.HashLength]))
 					paths = append(paths, hexToKeybytes(child.path[2*common.HashLength:]))
 				}
 			}
 		}
-		// If the child references another node, resolve or schedule
+		// If the child references another node, resolve or schedule.
+		// We check all children concurrently.
 		if node, ok := (child.node).(hashNode); ok {
-			// Try to resolve the node from the local database
-			if s.membatch.hasNode(child.path) {
-				continue
-			}
-			// Check the presence of children concurrently
+			path := child.path
+			hash := common.BytesToHash(node)
 			pending.Add(1)
-			go func(child childNode) {
+			go func() {
 				defer pending.Done()
-
-				// If database says duplicate, then at least the trie node is present
-				// and we hold the assumption that it's NOT legacy contract code.
-				var (
-					chash        = common.BytesToHash(node)
-					owner, inner = ResolvePath(child.path)
-				)
-				if rawdb.HasTrieNode(s.database, owner, inner, chash, s.scheme) {
+				owner, inner := ResolvePath(path)
+				exist, inconsistent := s.hasNode(owner, inner, hash)
+				if exist {
 					return
+				} else if inconsistent {
+					// There is a pre-existing node with the wrong hash in DB, remove it.
+					batchMu.Lock()
+					s.membatch.delNode(owner, inner)
+					batchMu.Unlock()
 				}
 				// Locally unknown node, schedule for retrieval
 				missing <- &nodeRequest{
-					path:     child.path,
-					hash:     chash,
+					path:     path,
+					hash:     hash,
 					parent:   req,
 					callback: req.callback,
 				}
-			}(child)
+			}()
 		}
 	}
 	pending.Wait()
@@ -587,21 +638,10 @@
 // committed themselves.
 func (s *Sync) commitNodeRequest(req *nodeRequest) error {
 	// Write the node content to the membatch
-	s.membatch.nodes[string(req.path)] = req.data
-	s.membatch.hashes[string(req.path)] = req.hash
+	owner, path := ResolvePath(req.path)
+	s.membatch.addNode(owner, path, req.data, req.hash)

-	// The size tracking refers to the db-batch, not the in-memory data.
-	if s.scheme == rawdb.PathScheme {
-		s.membatch.size += uint64(len(req.path) + len(req.data))
-	} else {
-		s.membatch.size += common.HashLength + uint64(len(req.data))
-	}
-	// Delete the internal nodes which are marked as invalid
-	for _, segment := range req.deletes {
-		path := append(req.path, segment...)
-		s.membatch.deletes[string(path)] = struct{}{}
-		s.membatch.size += uint64(len(path))
-	}
+
+	// Remove the completed node request
 	delete(s.nodeReqs, string(req.path))
 	s.fetches[len(req.path)]--
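Note how the children loop above copies child.path and the hash into locals before spawning the goroutine (avoiding loop-variable capture, still a hazard before Go 1.22) and serializes membatch access with batchMu: appends to a shared slice are not goroutine-safe, while channel sends already are. A stripped-down, runnable sketch of the same pattern with a hypothetical consistency check:

```go
package main

import (
	"fmt"
	"sync"
)

func main() {
	var (
		wg      sync.WaitGroup
		mu      sync.Mutex // guards ops, like batchMu guards the membatch
		ops     [][]byte   // stands in for membatch.nodes
		missing = make(chan []byte, 16)
	)
	for _, path := range [][]byte{{1}, {2}, {3}} {
		path := path // capture a per-iteration copy, as the patch does
		wg.Add(1)
		go func() {
			defer wg.Done()
			inconsistent := len(path)%2 == 1 // hypothetical check result
			if inconsistent {
				mu.Lock()
				ops = append(ops, path) // queue deletion under the lock
				mu.Unlock()
			}
			missing <- path // channel sends need no extra locking
		}()
	}
	wg.Wait()
	close(missing)
	for p := range missing {
		fmt.Printf("schedule %x\n", p)
	}
	fmt.Println("queued deletions:", len(ops))
}
```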
@@ -622,8 +662,9 @@
 // committed themselves.
 func (s *Sync) commitCodeRequest(req *codeRequest) error {
 	// Write the code content to the membatch
-	s.membatch.codes[req.hash] = req.data
-	s.membatch.size += common.HashLength + uint64(len(req.data))
+	s.membatch.addCode(req.hash, req.data)
+
+	// Remove the completed code request
 	delete(s.codeReqs, req.hash)
 	s.fetches[len(req.path)]--
@@ -639,6 +680,28 @@ func (s *Sync) commitCodeRequest(req *codeRequest) error {
 	return nil
 }

+// hasNode reports whether the specified trie node is present in the database.
+// 'exists' is true when the node exists in the database and matches the given
+// node hash. The 'inconsistent' return value is true when the node exists but
+// does not match the expected hash.
+func (s *Sync) hasNode(owner common.Hash, path []byte, hash common.Hash) (exists bool, inconsistent bool) {
+	// If running with the hash scheme, check the presence using the node hash.
+	if s.scheme == rawdb.HashScheme {
+		return rawdb.HasLegacyTrieNode(s.database, hash), false
+	}
+	// If running with the path scheme, check the presence using the node path.
+	var blob []byte
+	var dbHash common.Hash
+	if owner == (common.Hash{}) {
+		blob, dbHash = rawdb.ReadAccountTrieNode(s.database, path)
+	} else {
+		blob, dbHash = rawdb.ReadStorageTrieNode(s.database, owner, path)
+	}
+	exists = hash == dbHash
+	inconsistent = !exists && len(blob) != 0
+	return exists, inconsistent
+}
+
 // ResolvePath resolves the provided composite node path by splitting off the
 // account trie path if it is present.
 func ResolvePath(path []byte) (common.Hash, []byte) {
diff --git a/trie/sync_test.go b/trie/sync_test.go
index 5edfb32a3..585181b48 100644
--- a/trie/sync_test.go
+++ b/trie/sync_test.go
@@ -684,8 +684,11 @@ func testSyncOrdering(t *testing.T, scheme string) {
 		}
 	}
 }
-
 func syncWith(t *testing.T, root common.Hash, db ethdb.Database, srcDb *Database) {
+	syncWithHookWriter(t, root, db, srcDb, nil)
+}
+
+func syncWithHookWriter(t *testing.T, root common.Hash, db ethdb.Database, srcDb *Database, hookWriter ethdb.KeyValueWriter) {
 	// Create a destination trie and sync with the scheduler
 	sched := NewSync(root, db, nil, srcDb.Scheme())

@@ -723,8 +726,11 @@ func syncWith(t *testing.T, root common.Hash, db ethdb.Database, srcDb *Database
 		if err := sched.Commit(batch); err != nil {
 			t.Fatalf("failed to commit data: %v", err)
 		}
-		batch.Write()
-
+		if hookWriter != nil {
+			batch.Replay(hookWriter)
+		} else {
+			batch.Write()
+		}
 		paths, nodes, _ = sched.Missing(0)
 		elements = elements[:0]
 		for i := 0; i < len(paths); i++ {
@@ -894,3 +900,116 @@ func testPivotMove(t *testing.T, scheme string, tiny bool) {
 	syncWith(t, rootC, destDisk, srcTrieDB)
 	checkTrieContents(t, destDisk, scheme, srcTrie.Hash().Bytes(), stateC, true)
 }
+
+func TestSyncAbort(t *testing.T) {
+	testSyncAbort(t, rawdb.PathScheme)
+	testSyncAbort(t, rawdb.HashScheme)
+}
+
+type hookWriter struct {
+	db     ethdb.KeyValueStore
+	filter func(key []byte, value []byte) bool
+}
+
+// Put inserts the given value into the key-value data store.
+func (w *hookWriter) Put(key []byte, value []byte) error {
+	if w.filter != nil && w.filter(key, value) {
+		return nil
+	}
+	return w.db.Put(key, value)
+}
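Since hookWriter is handed to batch.Replay as an ethdb.KeyValueWriter, a common Go idiom (not part of this patch) would be to assert the interface at compile time next to the type declaration:

```go
// Fails to compile if hookWriter ever stops implementing ethdb.KeyValueWriter.
var _ ethdb.KeyValueWriter = (*hookWriter)(nil)
```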
+// Delete removes the key from the key-value data store.
+func (w *hookWriter) Delete(key []byte) error {
+	return w.db.Delete(key)
+}
+
+func testSyncAbort(t *testing.T, scheme string) {
+	var (
+		srcDisk    = rawdb.NewMemoryDatabase()
+		srcTrieDB  = newTestDatabase(srcDisk, scheme)
+		srcTrie, _ = New(TrieID(types.EmptyRootHash), srcTrieDB)
+
+		deleteFn = func(key []byte, tr *Trie, states map[string][]byte) {
+			tr.Delete(key)
+			delete(states, string(key))
+		}
+		writeFn = func(key []byte, val []byte, tr *Trie, states map[string][]byte) {
+			if val == nil {
+				val = randBytes(32)
+			}
+			tr.Update(key, val)
+			states[string(key)] = common.CopyBytes(val)
+		}
+		copyStates = func(states map[string][]byte) map[string][]byte {
+			cpy := make(map[string][]byte)
+			for k, v := range states {
+				cpy[k] = v
+			}
+			return cpy
+		}
+	)
+	var (
+		stateA = make(map[string][]byte)
+		key    = randBytes(32)
+		val    = randBytes(32)
+	)
+	for i := 0; i < 256; i++ {
+		writeFn(randBytes(32), nil, srcTrie, stateA)
+	}
+	writeFn(key, val, srcTrie, stateA)
+
+	rootA, nodesA, _ := srcTrie.Commit(false)
+	if err := srcTrieDB.Update(rootA, types.EmptyRootHash, 0, trienode.NewWithNodeSet(nodesA), nil); err != nil {
+		panic(err)
+	}
+	if err := srcTrieDB.Commit(rootA, false); err != nil {
+		panic(err)
+	}
+	// Create a destination trie and sync with the scheduler
+	destDisk := rawdb.NewMemoryDatabase()
+	syncWith(t, rootA, destDisk, srcTrieDB)
+	checkTrieContents(t, destDisk, scheme, srcTrie.Hash().Bytes(), stateA, true)
+
+	// Delete the element from the trie
+	stateB := copyStates(stateA)
+	srcTrie, _ = New(TrieID(rootA), srcTrieDB)
+	deleteFn(key, srcTrie, stateB)
+
+	rootB, nodesB, _ := srcTrie.Commit(false)
+	if err := srcTrieDB.Update(rootB, rootA, 0, trienode.NewWithNodeSet(nodesB), nil); err != nil {
+		panic(err)
+	}
+	if err := srcTrieDB.Commit(rootB, false); err != nil {
+		panic(err)
+	}
+
+	// Sync the new state, but never persist the new root node. Before the
+	// fix in #28595, the stale root node would still be left in the database,
+	// breaking the next healing cycle.
+	syncWithHookWriter(t, rootB, destDisk, srcTrieDB, &hookWriter{db: destDisk, filter: func(key []byte, value []byte) bool {
+		if scheme == rawdb.HashScheme {
+			return false
+		}
+		if len(value) == 0 {
+			return false
+		}
+		ok, path := rawdb.ResolveAccountTrieNodeKey(key)
+		return ok && len(path) == 0
+	}})
+
+	// Add the element back to expand the trie
+	stateC := copyStates(stateB)
+	srcTrie, _ = New(TrieID(rootB), srcTrieDB)
+
+	writeFn(key, val, srcTrie, stateC)
+	rootC, nodesC, _ := srcTrie.Commit(false)
+	if err := srcTrieDB.Update(rootC, rootB, 0, trienode.NewWithNodeSet(nodesC), nil); err != nil {
+		panic(err)
+	}
+	if err := srcTrieDB.Commit(rootC, false); err != nil {
+		panic(err)
+	}
+	syncWith(t, rootC, destDisk, srcTrieDB)
+	checkTrieContents(t, destDisk, scheme, srcTrie.Hash().Bytes(), stateC, true)
+}
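The test persists through batch.Replay rather than batch.Write so the hookWriter can veto individual writes — that is how an aborted sync (a never-persisted root) is simulated. A minimal standalone sketch of that pattern, with hypothetical keys and an in-memory database:

```go
package main

import (
	"fmt"

	"github.com/ethereum/go-ethereum/core/rawdb"
	"github.com/ethereum/go-ethereum/ethdb"
)

// dropWriter forwards everything except keys matched by skip, mimicking the
// hookWriter in the test above.
type dropWriter struct {
	db   ethdb.KeyValueStore
	skip func(key []byte) bool
}

func (w *dropWriter) Put(key, value []byte) error {
	if w.skip(key) {
		return nil // swallow the write, as if sync aborted before it landed
	}
	return w.db.Put(key, value)
}

func (w *dropWriter) Delete(key []byte) error { return w.db.Delete(key) }

func main() {
	db := rawdb.NewMemoryDatabase()
	batch := db.NewBatch()
	batch.Put([]byte("root"), []byte{1})
	batch.Put([]byte("child"), []byte{2})

	// Replay the batch through the filter instead of calling batch.Write().
	w := &dropWriter{db: db, skip: func(k []byte) bool { return string(k) == "root" }}
	if err := batch.Replay(w); err != nil {
		panic(err)
	}
	has, _ := db.Has([]byte("root"))
	fmt.Println("root persisted:", has) // false: the simulated abort dropped it
}
```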