trie: cache collapsed tries node, not rlp blobs (#16876)

The current trie memory database/cache that we do pruning on stores
trie nodes as binary rlp encoded blobs, and also stores the node
relationships/references for GC purposes. However, most of the trie
nodes (everything apart from a value node) is in essence just a
collection of references.

This PR switches out the RLP encoded trie blobs with the
collapsed-but-not-serialized trie nodes. This permits most of the
references to be recovered from within the node data structure,
avoiding the need to track them a second time (expensive memory wise).
This commit is contained in:
Péter Szilágyi 2018-06-21 12:28:05 +03:00 committed by Felix Lange
parent 8db8d074e2
commit d926bf2c7e
8 changed files with 268 additions and 75 deletions

@ -672,7 +672,7 @@ func (bc *BlockChain) Stop() {
}
}
for !bc.triegc.Empty() {
triedb.Dereference(bc.triegc.PopItem().(common.Hash), common.Hash{})
triedb.Dereference(bc.triegc.PopItem().(common.Hash))
}
if size, _ := triedb.Size(); size != 0 {
log.Error("Dangling trie nodes after full cleanup")
@ -947,7 +947,7 @@ func (bc *BlockChain) WriteBlockWithState(block *types.Block, receipts []*types.
bc.triegc.Push(root, number)
break
}
triedb.Dereference(root.(common.Hash), common.Hash{})
triedb.Dereference(root.(common.Hash))
}
}
}

@ -1313,8 +1313,8 @@ func TestTrieForkGC(t *testing.T) {
}
// Dereference all the recent tries and ensure no past trie is left in
for i := 0; i < triesInMemory; i++ {
chain.stateCache.TrieDB().Dereference(blocks[len(blocks)-1-i].Root(), common.Hash{})
chain.stateCache.TrieDB().Dereference(forks[len(blocks)-1-i].Root(), common.Hash{})
chain.stateCache.TrieDB().Dereference(blocks[len(blocks)-1-i].Root())
chain.stateCache.TrieDB().Dereference(forks[len(blocks)-1-i].Root())
}
if len(chain.stateCache.TrieDB().Nodes()) > 0 {
t.Fatalf("stale tries still alive after garbase collection")

@ -596,7 +596,7 @@ func (s *StateDB) Commit(deleteEmptyObjects bool) (root common.Hash, err error)
case isDirty:
// Write any contract code associated with the state object
if stateObject.code != nil && stateObject.dirtyCode {
s.db.TrieDB().Insert(common.BytesToHash(stateObject.CodeHash()), stateObject.code)
s.db.TrieDB().InsertBlob(common.BytesToHash(stateObject.CodeHash()), stateObject.code)
stateObject.dirtyCode = false
}
// Write any storage changes in the state object to its storage trie.

@ -297,7 +297,7 @@ func (api *PrivateDebugAPI) traceChain(ctx context.Context, start, end *types.Bl
database.TrieDB().Reference(root, common.Hash{})
}
// Dereference all past tries we ourselves are done working with
database.TrieDB().Dereference(proot, common.Hash{})
database.TrieDB().Dereference(proot)
proot = root
// TODO(karalabe): Do we need the preimages? Won't they accumulate too much?
@ -320,7 +320,7 @@ func (api *PrivateDebugAPI) traceChain(ctx context.Context, start, end *types.Bl
done[uint64(result.Block)] = result
// Dereference any paret tries held in memory by this task
database.TrieDB().Dereference(res.rootref, common.Hash{})
database.TrieDB().Dereference(res.rootref)
// Stream completed traces to the user, aborting on the first error
for result, ok := done[next]; ok; result, ok = done[next] {
@ -526,7 +526,7 @@ func (api *PrivateDebugAPI) computeStateDB(block *types.Block, reexec uint64) (*
return nil, err
}
database.TrieDB().Reference(root, common.Hash{})
database.TrieDB().Dereference(proot, common.Hash{})
database.TrieDB().Dereference(proot)
proot = root
}
nodes, imgs := database.TrieDB().Size()

@ -17,6 +17,8 @@
package trie
import (
"fmt"
"io"
"sync"
"time"
@ -24,6 +26,7 @@ import (
"github.com/ethereum/go-ethereum/ethdb"
"github.com/ethereum/go-ethereum/log"
"github.com/ethereum/go-ethereum/metrics"
"github.com/ethereum/go-ethereum/rlp"
)
var (
@ -82,25 +85,188 @@ type Database struct {
lock sync.RWMutex
}
// rawNode is a simple binary blob used to differentiate between collapsed trie
// nodes and already encoded RLP binary blobs (while at the same time store them
// in the same cache fields).
type rawNode []byte
func (n rawNode) canUnload(uint16, uint16) bool { panic("this should never end up in a live trie") }
func (n rawNode) cache() (hashNode, bool) { panic("this should never end up in a live trie") }
func (n rawNode) fstring(ind string) string { panic("this should never end up in a live trie") }
// rawFullNode represents only the useful data content of a full node, with the
// caches and flags stripped out to minimize its data storage. This type honors
// the same RLP encoding as the original parent.
type rawFullNode [17]node
func (n rawFullNode) canUnload(uint16, uint16) bool { panic("this should never end up in a live trie") }
func (n rawFullNode) cache() (hashNode, bool) { panic("this should never end up in a live trie") }
func (n rawFullNode) fstring(ind string) string { panic("this should never end up in a live trie") }
func (n rawFullNode) EncodeRLP(w io.Writer) error {
var nodes [17]node
for i, child := range n {
if child != nil {
nodes[i] = child
} else {
nodes[i] = nilValueNode
}
}
return rlp.Encode(w, nodes)
}
// rawShortNode represents only the useful data content of a short node, with the
// caches and flags stripped out to minimize its data storage. This type honors
// the same RLP encoding as the original parent.
type rawShortNode struct {
Key []byte
Val node
}
func (n rawShortNode) canUnload(uint16, uint16) bool { panic("this should never end up in a live trie") }
func (n rawShortNode) cache() (hashNode, bool) { panic("this should never end up in a live trie") }
func (n rawShortNode) fstring(ind string) string { panic("this should never end up in a live trie") }
// cachedNode is all the information we know about a single cached node in the
// memory database write layer.
type cachedNode struct {
blob []byte // Cached data block of the trie node
parents int // Number of live nodes referencing this one
children map[common.Hash]int // Children referenced by this nodes
node node // Cached collapsed trie node, or raw rlp data
size uint16 // Byte size of the useful cached data
parents uint16 // Number of live nodes referencing this one
children map[common.Hash]uint16 // External children referenced by this node
flushPrev common.Hash // Previous node in the flush-list
flushNext common.Hash // Next node in the flush-list
}
// rlp returns the raw rlp encoded blob of the cached node, either directly from
// the cache, or by regenerating it from the collapsed node.
func (n *cachedNode) rlp() []byte {
if node, ok := n.node.(rawNode); ok {
return node
}
blob, err := rlp.EncodeToBytes(n.node)
if err != nil {
panic(err)
}
return blob
}
// obj returns the decoded and expanded trie node, either directly from the cache,
// or by regenerating it from the rlp encoded blob.
func (n *cachedNode) obj(hash common.Hash, cachegen uint16) node {
if node, ok := n.node.(rawNode); ok {
return mustDecodeNode(hash[:], node, cachegen)
}
return expandNode(hash[:], n.node, cachegen)
}
// childs returns all the tracked children of this node, both the implicit ones
// from inside the node as well as the explicit ones from outside the node.
func (n *cachedNode) childs() []common.Hash {
children := make([]common.Hash, 0, 16)
for child := range n.children {
children = append(children, child)
}
if _, ok := n.node.(rawNode); !ok {
gatherChildren(n.node, &children)
}
return children
}
// gatherChildren traverses the node hierarchy of a collapsed storage node and
// retrieves all the hashnode children.
func gatherChildren(n node, children *[]common.Hash) {
switch n := n.(type) {
case *rawShortNode:
gatherChildren(n.Val, children)
case rawFullNode:
for i := 0; i < 16; i++ {
gatherChildren(n[i], children)
}
case hashNode:
*children = append(*children, common.BytesToHash(n))
case valueNode, nil:
default:
panic(fmt.Sprintf("unknown node type: %T", n))
}
}
// simplifyNode traverses the hierarchy of an expanded memory node and discards
// all the internal caches, returning a node that only contains the raw data.
func simplifyNode(n node) node {
switch n := n.(type) {
case *shortNode:
// Short nodes discard the flags and cascade
return &rawShortNode{Key: n.Key, Val: simplifyNode(n.Val)}
case *fullNode:
// Full nodes discard the flags and cascade
node := rawFullNode(n.Children)
for i := 0; i < len(node); i++ {
if node[i] != nil {
node[i] = simplifyNode(node[i])
}
}
return node
case valueNode, hashNode, rawNode:
return n
default:
panic(fmt.Sprintf("unknown node type: %T", n))
}
}
// expandNode traverses the node hierarchy of a collapsed storage node and converts
// all fields and keys into expanded memory form.
func expandNode(hash hashNode, n node, cachegen uint16) node {
switch n := n.(type) {
case *rawShortNode:
// Short nodes need key and child expansion
return &shortNode{
Key: compactToHex(n.Key),
Val: expandNode(nil, n.Val, cachegen),
flags: nodeFlag{
hash: hash,
gen: cachegen,
},
}
case rawFullNode:
// Full nodes need child expansion
node := &fullNode{
flags: nodeFlag{
hash: hash,
gen: cachegen,
},
}
for i := 0; i < len(node.Children); i++ {
if n[i] != nil {
node.Children[i] = expandNode(nil, n[i], cachegen)
}
}
return node
case valueNode, hashNode:
return n
default:
panic(fmt.Sprintf("unknown node type: %T", n))
}
}
// NewDatabase creates a new trie database to store ephemeral trie content before
// its written out to disk or garbage collected.
func NewDatabase(diskdb ethdb.Database) *Database {
return &Database{
diskdb: diskdb,
nodes: map[common.Hash]*cachedNode{
{}: {children: make(map[common.Hash]int)},
},
nodes: map[common.Hash]*cachedNode{{}: {}},
preimages: make(map[common.Hash][]byte),
}
}
@ -110,33 +276,46 @@ func (db *Database) DiskDB() DatabaseReader {
return db.diskdb
}
// Insert writes a new trie node to the memory database if it's yet unknown. The
// method will make a copy of the slice.
func (db *Database) Insert(hash common.Hash, blob []byte) {
// InsertBlob writes a new reference tracked blob to the memory database if it's
// yet unknown. This method should only be used for non-trie nodes that require
// reference counting, since trie nodes are garbage collected directly through
// their embedded children.
func (db *Database) InsertBlob(hash common.Hash, blob []byte) {
db.lock.Lock()
defer db.lock.Unlock()
db.insert(hash, blob)
db.insert(hash, blob, rawNode(blob))
}
// insert is the private locked version of Insert.
func (db *Database) insert(hash common.Hash, blob []byte) {
// insert inserts a collapsed trie node into the memory database. This method is
// a more generic version of InsertBlob, supporting both raw blob insertions as
// well ex trie node insertions. The blob must always be specified to allow proper
// size tracking.
func (db *Database) insert(hash common.Hash, blob []byte, node node) {
// If the node's already cached, skip
if _, ok := db.nodes[hash]; ok {
return
}
db.nodes[hash] = &cachedNode{
blob: common.CopyBytes(blob),
children: make(map[common.Hash]int),
// Create the cached entry for this node
entry := &cachedNode{
node: simplifyNode(node),
size: uint16(len(blob)),
flushPrev: db.newest,
}
for _, child := range entry.childs() {
if c := db.nodes[child]; c != nil {
c.parents++
}
}
db.nodes[hash] = entry
// Update the flush-list endpoints
if db.oldest == (common.Hash{}) {
db.oldest, db.newest = hash, hash
} else {
db.nodes[db.newest].flushNext, db.newest = hash, hash
}
db.nodesSize += common.StorageSize(common.HashLength + len(blob))
db.nodesSize += common.StorageSize(common.HashLength + entry.size)
}
// insertPreimage writes a new trie node pre-image to the memory database if it's
@ -151,8 +330,27 @@ func (db *Database) insertPreimage(hash common.Hash, preimage []byte) {
db.preimagesSize += common.StorageSize(common.HashLength + len(preimage))
}
// Node retrieves a cached trie node from memory. If it cannot be found cached,
// the method queries the persistent database for the content.
// node retrieves a cached trie node from memory, or returns nil if none can be
// found in the memory cache.
func (db *Database) node(hash common.Hash, cachegen uint16) node {
// Retrieve the node from cache if available
db.lock.RLock()
node := db.nodes[hash]
db.lock.RUnlock()
if node != nil {
return node.obj(hash, cachegen)
}
// Content unavailable in memory, attempt to retrieve from disk
enc, err := db.diskdb.Get(hash[:])
if err != nil || enc == nil {
return nil
}
return mustDecodeNode(hash[:], enc, cachegen)
}
// Node retrieves an encoded cached trie node from memory. If it cannot be found
// cached, the method queries the persistent database for the content.
func (db *Database) Node(hash common.Hash) ([]byte, error) {
// Retrieve the node from cache if available
db.lock.RLock()
@ -160,7 +358,7 @@ func (db *Database) Node(hash common.Hash) ([]byte, error) {
db.lock.RUnlock()
if node != nil {
return node.blob, nil
return node.rlp(), nil
}
// Content unavailable in memory, attempt to retrieve from disk
return db.diskdb.Get(hash[:])
@ -222,20 +420,22 @@ func (db *Database) reference(child common.Hash, parent common.Hash) {
return
}
// If the reference already exists, only duplicate for roots
if _, ok = db.nodes[parent].children[child]; ok && parent != (common.Hash{}) {
if db.nodes[parent].children == nil {
db.nodes[parent].children = make(map[common.Hash]uint16)
} else if _, ok = db.nodes[parent].children[child]; ok && parent != (common.Hash{}) {
return
}
node.parents++
db.nodes[parent].children[child]++
}
// Dereference removes an existing reference from a parent node to a child node.
func (db *Database) Dereference(child common.Hash, parent common.Hash) {
// Dereference removes an existing reference from a root node.
func (db *Database) Dereference(root common.Hash) {
db.lock.Lock()
defer db.lock.Unlock()
nodes, storage, start := len(db.nodes), db.nodesSize, time.Now()
db.dereference(child, parent)
db.dereference(root, common.Hash{})
db.gcnodes += uint64(nodes - len(db.nodes))
db.gcsize += storage - db.nodesSize
@ -254,10 +454,12 @@ func (db *Database) dereference(child common.Hash, parent common.Hash) {
// Dereference the parent-child
node := db.nodes[parent]
if node.children != nil && node.children[child] > 0 {
node.children[child]--
if node.children[child] == 0 {
delete(node.children, child)
}
}
// If the child does not exist, it's a previously committed node.
node, ok := db.nodes[child]
if !ok {
@ -274,11 +476,11 @@ func (db *Database) dereference(child common.Hash, parent common.Hash) {
db.nodes[node.flushNext].flushPrev = node.flushPrev
}
// Dereference all children and delete the node
for hash := range node.children {
for _, hash := range node.childs() {
db.dereference(hash, child)
}
delete(db.nodes, child)
db.nodesSize -= common.StorageSize(common.HashLength + len(node.blob))
db.nodesSize -= common.StorageSize(common.HashLength + int(node.size))
}
}
@ -323,7 +525,7 @@ func (db *Database) Cap(limit common.StorageSize) error {
for size > limit && oldest != (common.Hash{}) {
// Fetch the oldest referenced node and push into the batch
node := db.nodes[oldest]
if err := batch.Put(oldest[:], node.blob); err != nil {
if err := batch.Put(oldest[:], node.rlp()); err != nil {
db.lock.RUnlock()
return err
}
@ -340,7 +542,7 @@ func (db *Database) Cap(limit common.StorageSize) error {
// is the total size, including both the useful cached data (hash -> blob), as
// well as the flushlist metadata (2*hash). When flushing items from the cache,
// we need to reduce both.
size -= common.StorageSize(3*common.HashLength + len(node.blob))
size -= common.StorageSize(3*common.HashLength + int(node.size))
oldest = node.flushNext
}
// Flush out any remainder data from the last batch
@ -364,7 +566,7 @@ func (db *Database) Cap(limit common.StorageSize) error {
delete(db.nodes, db.oldest)
db.oldest = node.flushNext
db.nodesSize -= common.StorageSize(common.HashLength + len(node.blob))
db.nodesSize -= common.StorageSize(common.HashLength + int(node.size))
}
if db.oldest != (common.Hash{}) {
db.nodes[db.oldest].flushPrev = common.Hash{}
@ -460,12 +662,12 @@ func (db *Database) commit(hash common.Hash, batch ethdb.Batch) error {
if !ok {
return nil
}
for child := range node.children {
for _, child := range node.childs() {
if err := db.commit(child, batch); err != nil {
return err
}
}
if err := batch.Put(hash[:], node.blob); err != nil {
if err := batch.Put(hash[:], node.rlp()); err != nil {
return err
}
// If we've reached an optimal batch size, commit and start over
@ -496,11 +698,11 @@ func (db *Database) uncache(hash common.Hash) {
db.nodes[node.flushNext].flushPrev = node.flushPrev
}
// Uncache the node's subtries and remove the node itself too
for child := range node.children {
for _, child := range node.childs() {
db.uncache(child)
}
delete(db.nodes, hash)
db.nodesSize -= common.StorageSize(common.HashLength + len(node.blob))
db.nodesSize -= common.StorageSize(common.HashLength + int(node.size))
}
// Size returns the current storage size of the memory cache in front of the

@ -137,9 +137,6 @@ func (h *hasher) hashChildren(original node, db *Database) (node, node, error) {
return original, original, err
}
}
if collapsed.Val == nil {
collapsed.Val = valueNode(nil) // Ensure that nil children are encoded as empty strings.
}
return collapsed, cached, nil
case *fullNode:
@ -152,14 +149,9 @@ func (h *hasher) hashChildren(original node, db *Database) (node, node, error) {
if err != nil {
return original, original, err
}
} else {
collapsed.Children[i] = valueNode(nil) // Ensure that nil children are encoded as empty strings.
}
}
cached.Children[16] = n.Children[16]
if collapsed.Children[16] == nil {
collapsed.Children[16] = valueNode(nil)
}
return collapsed, cached, nil
default:
@ -192,34 +184,22 @@ func (h *hasher) store(n node, db *Database, force bool) (node, error) {
if db != nil {
// We are pooling the trie nodes into an intermediate memory cache
db.lock.Lock()
hash := common.BytesToHash(hash)
db.insert(hash, h.tmp)
// Track all direct parent->child node references
switch n := n.(type) {
case *shortNode:
if child, ok := n.Val.(hashNode); ok {
db.reference(common.BytesToHash(child), hash)
}
case *fullNode:
for i := 0; i < 16; i++ {
if child, ok := n.Children[i].(hashNode); ok {
db.reference(common.BytesToHash(child), hash)
}
}
}
db.lock.Lock()
db.insert(hash, h.tmp, n)
db.lock.Unlock()
// Track external references from account->storage trie
if h.onleaf != nil {
switch n := n.(type) {
case *shortNode:
if child, ok := n.Val.(valueNode); ok && child != nil {
if child, ok := n.Val.(valueNode); ok {
h.onleaf(child, hash)
}
case *fullNode:
for i := 0; i < 16; i++ {
if child, ok := n.Children[i].(valueNode); ok && child != nil {
if child, ok := n.Children[i].(valueNode); ok {
h.onleaf(child, hash)
}
}

@ -47,9 +47,22 @@ type (
valueNode []byte
)
// nilValueNode is used when collapsing internal trie nodes for hashing, since
// unset children need to serialize correctly.
var nilValueNode = valueNode(nil)
// EncodeRLP encodes a full node into the consensus RLP format.
func (n *fullNode) EncodeRLP(w io.Writer) error {
return rlp.Encode(w, n.Children)
var nodes [17]node
for i, child := range n.Children {
if child != nil {
nodes[i] = child
} else {
nodes[i] = nilValueNode
}
}
return rlp.Encode(w, nodes)
}
func (n *fullNode) copy() *fullNode { copy := *n; return &copy }

@ -433,12 +433,10 @@ func (t *Trie) resolveHash(n hashNode, prefix []byte) (node, error) {
cacheMissCounter.Inc(1)
hash := common.BytesToHash(n)
enc, err := t.db.Node(hash)
if err != nil || enc == nil {
return nil, &MissingNodeError{NodeHash: hash, Path: prefix}
if node := t.db.node(hash, t.cachegen); node != nil {
return node, nil
}
return mustDecodeNode(n, enc, t.cachegen), nil
return nil, &MissingNodeError{NodeHash: hash, Path: prefix}
}
// Root returns the root hash of the trie.