From d926bf2c7e3182d694c15829a37a0ca7331cd03c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Szil=C3=A1gyi?= <peterke@gmail.com> Date: Thu, 21 Jun 2018 12:28:05 +0300 Subject: [PATCH] trie: cache collapsed tries node, not rlp blobs (#16876) The current trie memory database/cache that we do pruning on stores trie nodes as binary rlp encoded blobs, and also stores the node relationships/references for GC purposes. However, most of the trie nodes (everything apart from a value node) is in essence just a collection of references. This PR switches out the RLP encoded trie blobs with the collapsed-but-not-serialized trie nodes. This permits most of the references to be recovered from within the node data structure, avoiding the need to track them a second time (expensive memory wise). --- core/blockchain.go | 4 +- core/blockchain_test.go | 4 +- core/state/statedb.go | 2 +- eth/api_tracer.go | 6 +- trie/database.go | 274 ++++++++++++++++++++++++++++++++++------ trie/hasher.go | 30 +---- trie/node.go | 15 ++- trie/trie.go | 8 +- 8 files changed, 268 insertions(+), 75 deletions(-) diff --git a/core/blockchain.go b/core/blockchain.go index 34832252a..0b50e3f37 100644 --- a/core/blockchain.go +++ b/core/blockchain.go @@ -672,7 +672,7 @@ func (bc *BlockChain) Stop() { } } for !bc.triegc.Empty() { - triedb.Dereference(bc.triegc.PopItem().(common.Hash), common.Hash{}) + triedb.Dereference(bc.triegc.PopItem().(common.Hash)) } if size, _ := triedb.Size(); size != 0 { log.Error("Dangling trie nodes after full cleanup") @@ -947,7 +947,7 @@ func (bc *BlockChain) WriteBlockWithState(block *types.Block, receipts []*types. bc.triegc.Push(root, number) break } - triedb.Dereference(root.(common.Hash), common.Hash{}) + triedb.Dereference(root.(common.Hash)) } } } diff --git a/core/blockchain_test.go b/core/blockchain_test.go index f409bb7b0..687209bfa 100644 --- a/core/blockchain_test.go +++ b/core/blockchain_test.go @@ -1313,8 +1313,8 @@ func TestTrieForkGC(t *testing.T) { } // Dereference all the recent tries and ensure no past trie is left in for i := 0; i < triesInMemory; i++ { - chain.stateCache.TrieDB().Dereference(blocks[len(blocks)-1-i].Root(), common.Hash{}) - chain.stateCache.TrieDB().Dereference(forks[len(blocks)-1-i].Root(), common.Hash{}) + chain.stateCache.TrieDB().Dereference(blocks[len(blocks)-1-i].Root()) + chain.stateCache.TrieDB().Dereference(forks[len(blocks)-1-i].Root()) } if len(chain.stateCache.TrieDB().Nodes()) > 0 { t.Fatalf("stale tries still alive after garbase collection") diff --git a/core/state/statedb.go b/core/state/statedb.go index ffea761d9..92d394ae3 100644 --- a/core/state/statedb.go +++ b/core/state/statedb.go @@ -596,7 +596,7 @@ func (s *StateDB) Commit(deleteEmptyObjects bool) (root common.Hash, err error) case isDirty: // Write any contract code associated with the state object if stateObject.code != nil && stateObject.dirtyCode { - s.db.TrieDB().Insert(common.BytesToHash(stateObject.CodeHash()), stateObject.code) + s.db.TrieDB().InsertBlob(common.BytesToHash(stateObject.CodeHash()), stateObject.code) stateObject.dirtyCode = false } // Write any storage changes in the state object to its storage trie. diff --git a/eth/api_tracer.go b/eth/api_tracer.go index 61f5c71d6..623e5ed1b 100644 --- a/eth/api_tracer.go +++ b/eth/api_tracer.go @@ -297,7 +297,7 @@ func (api *PrivateDebugAPI) traceChain(ctx context.Context, start, end *types.Bl database.TrieDB().Reference(root, common.Hash{}) } // Dereference all past tries we ourselves are done working with - database.TrieDB().Dereference(proot, common.Hash{}) + database.TrieDB().Dereference(proot) proot = root // TODO(karalabe): Do we need the preimages? Won't they accumulate too much? @@ -320,7 +320,7 @@ func (api *PrivateDebugAPI) traceChain(ctx context.Context, start, end *types.Bl done[uint64(result.Block)] = result // Dereference any paret tries held in memory by this task - database.TrieDB().Dereference(res.rootref, common.Hash{}) + database.TrieDB().Dereference(res.rootref) // Stream completed traces to the user, aborting on the first error for result, ok := done[next]; ok; result, ok = done[next] { @@ -526,7 +526,7 @@ func (api *PrivateDebugAPI) computeStateDB(block *types.Block, reexec uint64) (* return nil, err } database.TrieDB().Reference(root, common.Hash{}) - database.TrieDB().Dereference(proot, common.Hash{}) + database.TrieDB().Dereference(proot) proot = root } nodes, imgs := database.TrieDB().Size() diff --git a/trie/database.go b/trie/database.go index 468c139df..88c6e9cd6 100644 --- a/trie/database.go +++ b/trie/database.go @@ -17,6 +17,8 @@ package trie import ( + "fmt" + "io" "sync" "time" @@ -24,6 +26,7 @@ import ( "github.com/ethereum/go-ethereum/ethdb" "github.com/ethereum/go-ethereum/log" "github.com/ethereum/go-ethereum/metrics" + "github.com/ethereum/go-ethereum/rlp" ) var ( @@ -82,25 +85,188 @@ type Database struct { lock sync.RWMutex } +// rawNode is a simple binary blob used to differentiate between collapsed trie +// nodes and already encoded RLP binary blobs (while at the same time store them +// in the same cache fields). +type rawNode []byte + +func (n rawNode) canUnload(uint16, uint16) bool { panic("this should never end up in a live trie") } +func (n rawNode) cache() (hashNode, bool) { panic("this should never end up in a live trie") } +func (n rawNode) fstring(ind string) string { panic("this should never end up in a live trie") } + +// rawFullNode represents only the useful data content of a full node, with the +// caches and flags stripped out to minimize its data storage. This type honors +// the same RLP encoding as the original parent. +type rawFullNode [17]node + +func (n rawFullNode) canUnload(uint16, uint16) bool { panic("this should never end up in a live trie") } +func (n rawFullNode) cache() (hashNode, bool) { panic("this should never end up in a live trie") } +func (n rawFullNode) fstring(ind string) string { panic("this should never end up in a live trie") } + +func (n rawFullNode) EncodeRLP(w io.Writer) error { + var nodes [17]node + + for i, child := range n { + if child != nil { + nodes[i] = child + } else { + nodes[i] = nilValueNode + } + } + return rlp.Encode(w, nodes) +} + +// rawShortNode represents only the useful data content of a short node, with the +// caches and flags stripped out to minimize its data storage. This type honors +// the same RLP encoding as the original parent. +type rawShortNode struct { + Key []byte + Val node +} + +func (n rawShortNode) canUnload(uint16, uint16) bool { panic("this should never end up in a live trie") } +func (n rawShortNode) cache() (hashNode, bool) { panic("this should never end up in a live trie") } +func (n rawShortNode) fstring(ind string) string { panic("this should never end up in a live trie") } + // cachedNode is all the information we know about a single cached node in the // memory database write layer. type cachedNode struct { - blob []byte // Cached data block of the trie node - parents int // Number of live nodes referencing this one - children map[common.Hash]int // Children referenced by this nodes + node node // Cached collapsed trie node, or raw rlp data + size uint16 // Byte size of the useful cached data + + parents uint16 // Number of live nodes referencing this one + children map[common.Hash]uint16 // External children referenced by this node flushPrev common.Hash // Previous node in the flush-list flushNext common.Hash // Next node in the flush-list } +// rlp returns the raw rlp encoded blob of the cached node, either directly from +// the cache, or by regenerating it from the collapsed node. +func (n *cachedNode) rlp() []byte { + if node, ok := n.node.(rawNode); ok { + return node + } + blob, err := rlp.EncodeToBytes(n.node) + if err != nil { + panic(err) + } + return blob +} + +// obj returns the decoded and expanded trie node, either directly from the cache, +// or by regenerating it from the rlp encoded blob. +func (n *cachedNode) obj(hash common.Hash, cachegen uint16) node { + if node, ok := n.node.(rawNode); ok { + return mustDecodeNode(hash[:], node, cachegen) + } + return expandNode(hash[:], n.node, cachegen) +} + +// childs returns all the tracked children of this node, both the implicit ones +// from inside the node as well as the explicit ones from outside the node. +func (n *cachedNode) childs() []common.Hash { + children := make([]common.Hash, 0, 16) + for child := range n.children { + children = append(children, child) + } + if _, ok := n.node.(rawNode); !ok { + gatherChildren(n.node, &children) + } + return children +} + +// gatherChildren traverses the node hierarchy of a collapsed storage node and +// retrieves all the hashnode children. +func gatherChildren(n node, children *[]common.Hash) { + switch n := n.(type) { + case *rawShortNode: + gatherChildren(n.Val, children) + + case rawFullNode: + for i := 0; i < 16; i++ { + gatherChildren(n[i], children) + } + case hashNode: + *children = append(*children, common.BytesToHash(n)) + + case valueNode, nil: + + default: + panic(fmt.Sprintf("unknown node type: %T", n)) + } +} + +// simplifyNode traverses the hierarchy of an expanded memory node and discards +// all the internal caches, returning a node that only contains the raw data. +func simplifyNode(n node) node { + switch n := n.(type) { + case *shortNode: + // Short nodes discard the flags and cascade + return &rawShortNode{Key: n.Key, Val: simplifyNode(n.Val)} + + case *fullNode: + // Full nodes discard the flags and cascade + node := rawFullNode(n.Children) + for i := 0; i < len(node); i++ { + if node[i] != nil { + node[i] = simplifyNode(node[i]) + } + } + return node + + case valueNode, hashNode, rawNode: + return n + + default: + panic(fmt.Sprintf("unknown node type: %T", n)) + } +} + +// expandNode traverses the node hierarchy of a collapsed storage node and converts +// all fields and keys into expanded memory form. +func expandNode(hash hashNode, n node, cachegen uint16) node { + switch n := n.(type) { + case *rawShortNode: + // Short nodes need key and child expansion + return &shortNode{ + Key: compactToHex(n.Key), + Val: expandNode(nil, n.Val, cachegen), + flags: nodeFlag{ + hash: hash, + gen: cachegen, + }, + } + + case rawFullNode: + // Full nodes need child expansion + node := &fullNode{ + flags: nodeFlag{ + hash: hash, + gen: cachegen, + }, + } + for i := 0; i < len(node.Children); i++ { + if n[i] != nil { + node.Children[i] = expandNode(nil, n[i], cachegen) + } + } + return node + + case valueNode, hashNode: + return n + + default: + panic(fmt.Sprintf("unknown node type: %T", n)) + } +} + // NewDatabase creates a new trie database to store ephemeral trie content before // its written out to disk or garbage collected. func NewDatabase(diskdb ethdb.Database) *Database { return &Database{ - diskdb: diskdb, - nodes: map[common.Hash]*cachedNode{ - {}: {children: make(map[common.Hash]int)}, - }, + diskdb: diskdb, + nodes: map[common.Hash]*cachedNode{{}: {}}, preimages: make(map[common.Hash][]byte), } } @@ -110,33 +276,46 @@ func (db *Database) DiskDB() DatabaseReader { return db.diskdb } -// Insert writes a new trie node to the memory database if it's yet unknown. The -// method will make a copy of the slice. -func (db *Database) Insert(hash common.Hash, blob []byte) { +// InsertBlob writes a new reference tracked blob to the memory database if it's +// yet unknown. This method should only be used for non-trie nodes that require +// reference counting, since trie nodes are garbage collected directly through +// their embedded children. +func (db *Database) InsertBlob(hash common.Hash, blob []byte) { db.lock.Lock() defer db.lock.Unlock() - db.insert(hash, blob) + db.insert(hash, blob, rawNode(blob)) } -// insert is the private locked version of Insert. -func (db *Database) insert(hash common.Hash, blob []byte) { +// insert inserts a collapsed trie node into the memory database. This method is +// a more generic version of InsertBlob, supporting both raw blob insertions as +// well ex trie node insertions. The blob must always be specified to allow proper +// size tracking. +func (db *Database) insert(hash common.Hash, blob []byte, node node) { // If the node's already cached, skip if _, ok := db.nodes[hash]; ok { return } - db.nodes[hash] = &cachedNode{ - blob: common.CopyBytes(blob), - children: make(map[common.Hash]int), + // Create the cached entry for this node + entry := &cachedNode{ + node: simplifyNode(node), + size: uint16(len(blob)), flushPrev: db.newest, } + for _, child := range entry.childs() { + if c := db.nodes[child]; c != nil { + c.parents++ + } + } + db.nodes[hash] = entry + // Update the flush-list endpoints if db.oldest == (common.Hash{}) { db.oldest, db.newest = hash, hash } else { db.nodes[db.newest].flushNext, db.newest = hash, hash } - db.nodesSize += common.StorageSize(common.HashLength + len(blob)) + db.nodesSize += common.StorageSize(common.HashLength + entry.size) } // insertPreimage writes a new trie node pre-image to the memory database if it's @@ -151,8 +330,27 @@ func (db *Database) insertPreimage(hash common.Hash, preimage []byte) { db.preimagesSize += common.StorageSize(common.HashLength + len(preimage)) } -// Node retrieves a cached trie node from memory. If it cannot be found cached, -// the method queries the persistent database for the content. +// node retrieves a cached trie node from memory, or returns nil if none can be +// found in the memory cache. +func (db *Database) node(hash common.Hash, cachegen uint16) node { + // Retrieve the node from cache if available + db.lock.RLock() + node := db.nodes[hash] + db.lock.RUnlock() + + if node != nil { + return node.obj(hash, cachegen) + } + // Content unavailable in memory, attempt to retrieve from disk + enc, err := db.diskdb.Get(hash[:]) + if err != nil || enc == nil { + return nil + } + return mustDecodeNode(hash[:], enc, cachegen) +} + +// Node retrieves an encoded cached trie node from memory. If it cannot be found +// cached, the method queries the persistent database for the content. func (db *Database) Node(hash common.Hash) ([]byte, error) { // Retrieve the node from cache if available db.lock.RLock() @@ -160,7 +358,7 @@ func (db *Database) Node(hash common.Hash) ([]byte, error) { db.lock.RUnlock() if node != nil { - return node.blob, nil + return node.rlp(), nil } // Content unavailable in memory, attempt to retrieve from disk return db.diskdb.Get(hash[:]) @@ -222,20 +420,22 @@ func (db *Database) reference(child common.Hash, parent common.Hash) { return } // If the reference already exists, only duplicate for roots - if _, ok = db.nodes[parent].children[child]; ok && parent != (common.Hash{}) { + if db.nodes[parent].children == nil { + db.nodes[parent].children = make(map[common.Hash]uint16) + } else if _, ok = db.nodes[parent].children[child]; ok && parent != (common.Hash{}) { return } node.parents++ db.nodes[parent].children[child]++ } -// Dereference removes an existing reference from a parent node to a child node. -func (db *Database) Dereference(child common.Hash, parent common.Hash) { +// Dereference removes an existing reference from a root node. +func (db *Database) Dereference(root common.Hash) { db.lock.Lock() defer db.lock.Unlock() nodes, storage, start := len(db.nodes), db.nodesSize, time.Now() - db.dereference(child, parent) + db.dereference(root, common.Hash{}) db.gcnodes += uint64(nodes - len(db.nodes)) db.gcsize += storage - db.nodesSize @@ -254,9 +454,11 @@ func (db *Database) dereference(child common.Hash, parent common.Hash) { // Dereference the parent-child node := db.nodes[parent] - node.children[child]-- - if node.children[child] == 0 { - delete(node.children, child) + if node.children != nil && node.children[child] > 0 { + node.children[child]-- + if node.children[child] == 0 { + delete(node.children, child) + } } // If the child does not exist, it's a previously committed node. node, ok := db.nodes[child] @@ -274,11 +476,11 @@ func (db *Database) dereference(child common.Hash, parent common.Hash) { db.nodes[node.flushNext].flushPrev = node.flushPrev } // Dereference all children and delete the node - for hash := range node.children { + for _, hash := range node.childs() { db.dereference(hash, child) } delete(db.nodes, child) - db.nodesSize -= common.StorageSize(common.HashLength + len(node.blob)) + db.nodesSize -= common.StorageSize(common.HashLength + int(node.size)) } } @@ -323,7 +525,7 @@ func (db *Database) Cap(limit common.StorageSize) error { for size > limit && oldest != (common.Hash{}) { // Fetch the oldest referenced node and push into the batch node := db.nodes[oldest] - if err := batch.Put(oldest[:], node.blob); err != nil { + if err := batch.Put(oldest[:], node.rlp()); err != nil { db.lock.RUnlock() return err } @@ -340,7 +542,7 @@ func (db *Database) Cap(limit common.StorageSize) error { // is the total size, including both the useful cached data (hash -> blob), as // well as the flushlist metadata (2*hash). When flushing items from the cache, // we need to reduce both. - size -= common.StorageSize(3*common.HashLength + len(node.blob)) + size -= common.StorageSize(3*common.HashLength + int(node.size)) oldest = node.flushNext } // Flush out any remainder data from the last batch @@ -364,7 +566,7 @@ func (db *Database) Cap(limit common.StorageSize) error { delete(db.nodes, db.oldest) db.oldest = node.flushNext - db.nodesSize -= common.StorageSize(common.HashLength + len(node.blob)) + db.nodesSize -= common.StorageSize(common.HashLength + int(node.size)) } if db.oldest != (common.Hash{}) { db.nodes[db.oldest].flushPrev = common.Hash{} @@ -460,12 +662,12 @@ func (db *Database) commit(hash common.Hash, batch ethdb.Batch) error { if !ok { return nil } - for child := range node.children { + for _, child := range node.childs() { if err := db.commit(child, batch); err != nil { return err } } - if err := batch.Put(hash[:], node.blob); err != nil { + if err := batch.Put(hash[:], node.rlp()); err != nil { return err } // If we've reached an optimal batch size, commit and start over @@ -496,11 +698,11 @@ func (db *Database) uncache(hash common.Hash) { db.nodes[node.flushNext].flushPrev = node.flushPrev } // Uncache the node's subtries and remove the node itself too - for child := range node.children { + for _, child := range node.childs() { db.uncache(child) } delete(db.nodes, hash) - db.nodesSize -= common.StorageSize(common.HashLength + len(node.blob)) + db.nodesSize -= common.StorageSize(common.HashLength + int(node.size)) } // Size returns the current storage size of the memory cache in front of the diff --git a/trie/hasher.go b/trie/hasher.go index 47c6dd8f9..7b1d7793f 100644 --- a/trie/hasher.go +++ b/trie/hasher.go @@ -137,9 +137,6 @@ func (h *hasher) hashChildren(original node, db *Database) (node, node, error) { return original, original, err } } - if collapsed.Val == nil { - collapsed.Val = valueNode(nil) // Ensure that nil children are encoded as empty strings. - } return collapsed, cached, nil case *fullNode: @@ -152,14 +149,9 @@ func (h *hasher) hashChildren(original node, db *Database) (node, node, error) { if err != nil { return original, original, err } - } else { - collapsed.Children[i] = valueNode(nil) // Ensure that nil children are encoded as empty strings. } } cached.Children[16] = n.Children[16] - if collapsed.Children[16] == nil { - collapsed.Children[16] = valueNode(nil) - } return collapsed, cached, nil default: @@ -192,34 +184,22 @@ func (h *hasher) store(n node, db *Database, force bool) (node, error) { if db != nil { // We are pooling the trie nodes into an intermediate memory cache - db.lock.Lock() hash := common.BytesToHash(hash) - db.insert(hash, h.tmp) - // Track all direct parent->child node references - switch n := n.(type) { - case *shortNode: - if child, ok := n.Val.(hashNode); ok { - db.reference(common.BytesToHash(child), hash) - } - case *fullNode: - for i := 0; i < 16; i++ { - if child, ok := n.Children[i].(hashNode); ok { - db.reference(common.BytesToHash(child), hash) - } - } - } + + db.lock.Lock() + db.insert(hash, h.tmp, n) db.lock.Unlock() // Track external references from account->storage trie if h.onleaf != nil { switch n := n.(type) { case *shortNode: - if child, ok := n.Val.(valueNode); ok && child != nil { + if child, ok := n.Val.(valueNode); ok { h.onleaf(child, hash) } case *fullNode: for i := 0; i < 16; i++ { - if child, ok := n.Children[i].(valueNode); ok && child != nil { + if child, ok := n.Children[i].(valueNode); ok { h.onleaf(child, hash) } } diff --git a/trie/node.go b/trie/node.go index 02815042c..a06f1b389 100644 --- a/trie/node.go +++ b/trie/node.go @@ -47,9 +47,22 @@ type ( valueNode []byte ) +// nilValueNode is used when collapsing internal trie nodes for hashing, since +// unset children need to serialize correctly. +var nilValueNode = valueNode(nil) + // EncodeRLP encodes a full node into the consensus RLP format. func (n *fullNode) EncodeRLP(w io.Writer) error { - return rlp.Encode(w, n.Children) + var nodes [17]node + + for i, child := range n.Children { + if child != nil { + nodes[i] = child + } else { + nodes[i] = nilValueNode + } + } + return rlp.Encode(w, nodes) } func (n *fullNode) copy() *fullNode { copy := *n; return © } diff --git a/trie/trie.go b/trie/trie.go index 30543c549..4284e30ad 100644 --- a/trie/trie.go +++ b/trie/trie.go @@ -433,12 +433,10 @@ func (t *Trie) resolveHash(n hashNode, prefix []byte) (node, error) { cacheMissCounter.Inc(1) hash := common.BytesToHash(n) - - enc, err := t.db.Node(hash) - if err != nil || enc == nil { - return nil, &MissingNodeError{NodeHash: hash, Path: prefix} + if node := t.db.node(hash, t.cachegen); node != nil { + return node, nil } - return mustDecodeNode(n, enc, t.cachegen), nil + return nil, &MissingNodeError{NodeHash: hash, Path: prefix} } // Root returns the root hash of the trie.